diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..6ad36f4 --- /dev/null +++ b/.gitignore @@ -0,0 +1,40 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +data/ +test.csv \ No newline at end of file diff --git a/README.md b/README.md index 24f361b..a673941 100644 --- a/README.md +++ b/README.md @@ -10,50 +10,58 @@ `FundaScaper` provides the easiest way to perform web scraping on Funda, the Dutch housing website. You can find houses either for sale or for rent, and access historical data from the past few years. Please note: + 1. Scraping this website is ONLY allowed for personal use (as per Funda's Terms and Conditions). 2. Any commercial use of this Python package is prohibited. The author holds no liability for any misuse of the package. ## Install + ### Install with pip: + ``` pip install funda-scraper ``` + ### Clone the repository: + ``` git clone https://github.com/whchien/funda-scraper.git cd funda-scraper export PYTHONPATH=${PWD} -python funda_scraper/scrape.py --area amsterdam --want_to rent --page_start 1 --n_pages 3 --save +python funda_scraper/scrape.py --area amsterdam --want_to rent --page_start 1 --number_of_pages 3 --save ``` -## Quickstart +## Quickstart + ``` from funda_scraper import FundaScraper scraper = FundaScraper( - area="amsterdam", - want_to="rent", - find_past=False, - page_start=1, - n_pages=3, - min_price=500, + area="amsterdam", + want_to="rent", + find_sold=False, + page_start=1, + number_of_pages=3, + min_price=500, max_price=2000 ) df = scraper.run(raw_data=False, save=True, filepath="test.csv") df.head() ``` -![image](static/example_df.png) +![image](static/example_df.png) -* Note for Windows Users: Please add `if __name__ == "__main__":` before your script. +- Note for Windows Users: Please add `if __name__ == "__main__":` before your script. ## Customizing Your Scraping + You can pass several arguments to `FundaScraper()` for customized scraping: + - `area`: Specify the city or specific area you want to look for, e.g. Amsterdam, Utrecht, Rotterdam, etc. - `want_to`: Choose either `buy` or `rent` to find houses either for sale or for rent. -- `find_past`: Set to `True` to find historical data; the default is `False`. -- `page_start`: Indicate which page to start scraping from; the default is `1`. -- `n_pages`: Indicate how many pages to scrape; the default is `1`. +- `find_sold`: Set to `True` to find historical data; the default is `False`. +- `page_start`: Indicate which page to start scraping from; the default is `1`. +- `number_of_pages`: Indicate how many pages to scrape; the default is `1`. - `min_price`: Indicate the lowest budget amount. - `max_price`: Indicate the highest budget amount. - `min_floor_area`: Indicate the minimum floor area. @@ -62,14 +70,14 @@ You can pass several arguments to `FundaScraper()` for customized scraping: - `property_type`: Specify the desired property type(s). - `sort`: Specify sorting criteria. 
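For example, several of these filters can be combined in a single call; the values below are purely illustrative and follow the same pattern as the quickstart above:

```
from funda_scraper import FundaScraper

scraper = FundaScraper(
    area="utrecht",
    want_to="buy",
    find_sold=False,
    page_start=1,
    number_of_pages=2,
    min_price=200000,
    max_price=500000,
    property_type="apartment",
    sort="price_up",
)
df = scraper.run(raw_data=False, save=True, filepath="utrecht_buy.csv")
```

Any argument you leave out simply falls back to its default value.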
- The scraped raw result contains following information: + - url - price - address - description - listed_since -- zip_code +- zip_code - size - year_built - living_area @@ -95,10 +103,8 @@ The scraped raw result contains following information: To fetch the data without preprocessing, specify `scraper.run(raw_data=True)`. -*Note*: Information regarding listing dates is no longer available since Q4 2023. Funda requires users to log in to see this information. - +_Note_: Information regarding listing dates is no longer available since Q4 2023. Funda requires users to log in to see this information. ## More information -Check the [example notebook](https://colab.research.google.com/drive/1hNzJJRWxD59lrbeDpfY1OUpBz0NktmfW?usp=sharing) for further details. If you find this project helpful, please give it a [star](https://github.com/whchien/funda-scraper). - +Check the [example notebook](https://colab.research.google.com/drive/1hNzJJRWxD59lrbeDpfY1OUpBz0NktmfW?usp=sharing) for further details. If you find this project helpful, please give it a [star](https://github.com/whchien/funda-scraper). diff --git a/funda_scraper/__init__.py b/funda_scraper/__init__.py index d48a695..6d23c1d 100644 --- a/funda_scraper/__init__.py +++ b/funda_scraper/__init__.py @@ -1,5 +1,6 @@ """Access the directory in python""" from funda_scraper.scrape import FundaScraper +from funda_scraper.extract import DataExtractor -__all__ = ["FundaScraper"] +__all__ = ["FundaScraper", "DataExtractor"] diff --git a/funda_scraper/config/config.yaml b/funda_scraper/config/config.yaml index 789662c..e369dcd 100644 --- a/funda_scraper/config/config.yaml +++ b/funda_scraper/config/config.yaml @@ -6,11 +6,12 @@ keep_cols: - date_sold - ym_sold - year_sold -# - term_days + # - term_days selling_data: - url - house_id - city + - neighborhood_name - house_type - building_type - price @@ -27,34 +28,36 @@ keep_cols: # - date_list # - ym_list # - year_list - - descrip - - photo + - description + # - photos css_selector: url: none - price: ".object-header__price" + price: "section.mt-6 > div:nth-child(2) > dl:nth-child(2) > dd:nth-child(2) > span:nth-child(1)" address: ".object-header__title" descrip: ".object-description-body" - listed_since: ".fd-align-items-center:nth-child(6) span" - zip_code: ".object-header__subtitle" + listed_since: "section.mt-6 > div:nth-child(2) > dl:nth-child(2) > dd:nth-child(6) > span:nth-child(1)" + zip_code: "span.text-neutral-40:nth-child(2)" size: ".object-kenmerken-list:nth-child(8) .fd-align-items-center:nth-child(5) span" year: ".fd-align-items-center~ .fd-align-items-center .fd-m-right-xs" - living_area: ".object-kenmerken-list:nth-child(8) .fd-align-items-center:nth-child(2) span" - kind_of_house: ".object-kenmerken-list:nth-child(5) .fd-align-items-center:nth-child(2) span" - building_type: ".object-kenmerken-list:nth-child(5) .fd-align-items-center:nth-child(4) span" - num_of_rooms: ".object-kenmerken-list:nth-child(11) .fd-align-items-center:nth-child(2)" - num_of_bathrooms: ".object-kenmerken-list:nth-child(11) .fd-align-items-center:nth-child(4)" + year_of_construction: ".mt-6.md\\:mt-7 > div:nth-of-type(2) > dl > dd:nth-of-type(3) span" + living_area: "section.mt-6 > div:nth-child(4) > dl:nth-child(2) > dd:nth-child(4)" + kind_of_house: "section.mt-6 > div:nth-of-type(2) > dl > dd:nth-of-type(1) span" + building_type: "section.mt-6 > div:nth-child(3) > dl:nth-child(2) > dd:nth-child(4) > span:nth-child(1)" + num_of_rooms: "section.mt-6 > div:nth-of-type(4) > dl > dd:nth-of-type(1) span" + 
num_of_bathrooms: "section.mt-6 > div:nth-of-type(4) > dl > dd:nth-of-type(2) span" layout: ".object-kenmerken-list:nth-child(11)" - energy_label: ".energielabel" - insulation: ".object-kenmerken-list:nth-child(14) .fd-align-items-center:nth-child(4)" - heating: ".object-kenmerken-list:nth-child(14) .fd-align-items-center:nth-child(6)" - ownership: ".object-kenmerken-list:nth-child(17) .fd-align-items-center:nth-child(4)" + energy_label: ".gap-6 > div:nth-child(1)" + insulation: ".mt-6.md\\:mt-7 > div:nth-of-type(5) > dl > dd:nth-of-type(2) span" + heating: "section.mt-6 > div:nth-of-type(5) > dl > dd:nth-of-type(3) span" + ownership: "section.mt-6 > div:nth-of-type(6) > dl > dd:nth-of-type(2) span" exteriors: ".object-kenmerken-list:nth-child(19)" - parking: ".object-kenmerken-list:nth-child(24)" - neighborhood_name: ".fd-display-inline--bp-m" + parking: "section.mt-6 > div:nth-of-type(5) > dl > dd:nth-of-type(1) span" + neighborhood_name: ".object-header__container > a" date_list: "dd:nth-child(2)" date_sold: "dd:nth-child(4)" term: "dd:nth-child(6)" price_sold: ".object-header__price--historic" - last_ask_price: ".object-kenmerken-list:nth-child(2) .fd-align-items-center:nth-child(2)" - last_ask_price_m2: ".object-kenmerken-list__asking-price" + last_ask_price: "section.mt-6 > div:nth-child(2) > dl:nth-child(2) > dd:nth-child(2) > span:nth-child(1)" + last_ask_price_m2: "section.mt-6 > div:nth-child(2) > dl:nth-child(2) > dd:nth-child(4) > span:nth-child(1)" photo: ".media-viewer-overview__section-list-item--photo img[data-lazy]" + photos: "main > div:nth-child(1) ul > li:nth-child(1) > a > span:nth-child(3)" diff --git a/funda_scraper/extract.py b/funda_scraper/extract.py new file mode 100644 index 0000000..5df2ea2 --- /dev/null +++ b/funda_scraper/extract.py @@ -0,0 +1,168 @@ + +import json +import pandas as pd +from bs4 import BeautifulSoup +from typing import List +from tqdm import tqdm +import traceback + +from funda_scraper.config.core import config +from funda_scraper.preprocess import preprocess_data +from funda_scraper.utils import logger +from funda_scraper.filerepository import FileRepository +from funda_scraper.searchrequest import SearchRequest +from funda_scraper.property import Property + +class DataExtractor(object): + + def __init__(self): + self.selectors = config.css_selector + self.raw_df = pd.DataFrame() + self.clean_df = pd.DataFrame() + self.file_repo = FileRepository() + + + def extract_data(self, search_request: SearchRequest, run_id: str, clean_data: bool) -> pd.DataFrame: + + detail_pages = self.file_repo.get_detail_pages(run_id) + + houses: list[Property] = [] + + houses_with_processing_errors = [] + + for page in tqdm(detail_pages, desc = "Processing detail pages.."): + try: + content = detail_pages[page] + house = self.extract_data_from_detail_page(content, search_request) + houses.append(house) + except Exception as e: + logger.error(f"An error occurred while processing house: {e}; skipping this house") + logger.error("Traceback:", exc_info=True) + houses_with_processing_errors.append(page) + + logger.info(f"*** All scraping done: {len(houses)} results ***") + logger.info(f"There were {len(houses_with_processing_errors)} houses that could not be processed") + for error_house in houses_with_processing_errors: + # TODO: move these to a separate errors folder or so + logger.info(f"Error: {error_house}") + + # It may be more intuitive to manipulate the Property objects instead of dataframes, but let's keep the dataframes approach for now + # Note that we are 
omitting the photos field, which is an array field, and including the photos_string property instead
+        df = pd.DataFrame([
+            {**{k: v for k, v in vars(house).items() if k != 'photos'}, 'photos': house.photos_string}
+            for house in houses
+        ])
+
+        if not search_request.find_sold:
+            df = df.drop(["term", "price_sold", "date_sold"], axis=1)
+
+        self.raw_df = df
+
+        if not clean_data:
+            df = self.raw_df
+        else:
+            logger.info("*** Cleaning data ***")
+            df = preprocess_data(df = self.raw_df, is_past = search_request.find_sold)
+            self.clean_df = df
+
+        self.file_repo.save_result_file(df, run_id)
+
+        return df
+
+
+    def extract_data_from_detail_page(self, page: str, search_request: SearchRequest) -> Property:
+        soup = BeautifulSoup(page, "lxml")
+
+        script_tag = soup.find_all("script", {"type": "application/ld+json"})[0]
+        json_data = json.loads(script_tag.contents[0])
+
+        url = json_data["url"]
+        description = json_data["description"]
+        address = json_data["address"]["streetAddress"]
+        city = json_data["address"]["addressLocality"]
+        price = f"{json_data['offers']['priceCurrency']} {json_data['offers']['price']}"
+
+        # Get the value according to respective CSS selectors
+        if search_request.to_buy:
+            if search_request.find_sold:
+                list_since_selector = self.selectors.date_list
+            else:
+                list_since_selector = self.selectors.listed_since
+        else:
+            if search_request.find_sold:
+                list_since_selector = ".fd-align-items-center:nth-child(9) span"
+            else:
+                list_since_selector = ".fd-align-items-center:nth-child(7) span"
+
+        house = Property()
+        house.url = url
+        house.price = price
+        house.address = address
+        house.city = city
+        house.description = description
+        house.listed_since = self.get_value_from_css(soup, list_since_selector)
+        house.zip_code = self.get_value_from_css(soup, self.selectors.zip_code)
+        house.size = self.get_value_from_css(soup, self.selectors.size)
+        house.year_of_construction = self.get_value_from_css(soup, self.selectors.year_of_construction)
+        house.living_area = self.get_value_from_css(soup, self.selectors.living_area)
+        house.house_type = self.get_value_from_css(soup, self.selectors.kind_of_house)
+        house.building_type = self.get_value_from_css(soup, self.selectors.building_type)
+        house.number_of_rooms = self.get_value_from_css(soup, self.selectors.num_of_rooms)
+        house.number_of_bathrooms = self.get_value_from_css(soup, self.selectors.num_of_bathrooms)
+        house.layout = self.get_value_from_css(soup, self.selectors.layout)
+        house.energy_label = self.get_value_from_css(soup, self.selectors.energy_label)
+        house.insulation = self.get_value_from_css(soup, self.selectors.insulation)
+        house.heating = self.get_value_from_css(soup, self.selectors.heating)
+        house.ownership = self.get_value_from_css(soup, self.selectors.ownership)
+        house.exteriors = self.get_value_from_css(soup, self.selectors.exteriors)
+        house.parking = self.get_value_from_css(soup, self.selectors.parking)
+        house.neighborhood_name = self.get_value_from_css(soup, self.selectors.neighborhood_name)
+        house.date_list = self.get_value_from_css(soup, self.selectors.date_list)
+        house.date_sold = self.get_value_from_css(soup, self.selectors.date_sold)
+        house.term = self.get_value_from_css(soup, self.selectors.term)
+        house.price_sold = self.get_value_from_css(soup, self.selectors.price_sold)
+        house.last_ask_price = self.get_value_from_css(soup, self.selectors.last_ask_price)
+        house.last_ask_price_m2 = self.get_value_from_css(soup, self.selectors.last_ask_price_m2).split("\r")[0]
+        house.photos = self.get_photos(soup, house.url)
+
+        for key, value in house.__dict__.items():
+            formatted_value = self.format_string(value)
+            setattr(house, key, formatted_value)
+
+        return house
+
+    @staticmethod
+    def get_value_from_css(soup: BeautifulSoup, selector: str) -> str:
+        """Extracts data from HTML using a CSS selector."""
+        result = soup.select(selector)
+        if len(result) > 0:
+            result = result[0].text
+        else:
+            result = "na"
+        return result
+
+    def format_string(self, value):
+        if isinstance(value, str):
+            return value.replace("\n", "").replace("\r", "").strip()
+        else:
+            return value
+
+    def get_photos(self, soup: BeautifulSoup, url: str) -> List[str]:
+        number_of_photos = 0
+        try:
+            number_of_photos = int(self.get_value_from_css(soup, self.selectors.photos))
+        except (ValueError, TypeError):
+            number_of_photos = 0
+
+        photos: List[str] = []
+
+        if number_of_photos > 0:
+            for i in range(1, number_of_photos + 1):
+                photo_url = f"{url}media/foto/{i}"
+                photos.append(photo_url)
+
+        return photos
+
+
diff --git a/funda_scraper/filerepository.py b/funda_scraper/filerepository.py
new file mode 100644
index 0000000..a8e6ec9
--- /dev/null
+++ b/funda_scraper/filerepository.py
@@ -0,0 +1,85 @@
+import os
+import pandas as pd
+
+from typing import Dict, List
+from funda_scraper.utils import logger
+
+
+class FileRepository(object):
+    DATA_DIR = "data"
+    LISTPAGES_DIR = 'listpages'
+    DETAILPAGES_DIR = 'detailpages'
+
+    def __init__(self) -> None:
+        self._ensure_dir(self.DATA_DIR)
+
+    def _ensure_dir(self, dir_name: str):
+        if not os.path.exists(dir_name):
+            os.makedirs(dir_name)
+
+    def get_list_pages(self, run_id: str) -> List[str]:
+        pages = []
+
+        list_pages_dir = self._get_list_pages_dir_name(run_id)
+
+        for f in os.listdir(list_pages_dir):
+            file_path = os.path.join(list_pages_dir, f)
+
+            if os.path.isfile(file_path):
+                with open(file_path, 'r') as file:
+                    content = file.read()
+                    pages.append(content)
+
+        return pages
+
+    def get_detail_pages(self, run_id: str) -> Dict[str, str]:
+        pages = {}
+
+        detail_pages_dir = self._get_detail_pages_dir_name(run_id)
+
+        for f in os.listdir(detail_pages_dir):
+            file_path = os.path.join(detail_pages_dir, f)
+
+            if os.path.isfile(file_path):
+                with open(file_path, 'r') as file:
+                    content = file.read()
+                    pages[f] = content
+
+        return pages
+
+    def save_list_page(self, content: str, index: int, run_id: str):
+        list_pages_dir = self._get_list_pages_dir_name(run_id)
+        self._ensure_dir(list_pages_dir)
+
+        file_path = os.path.join(list_pages_dir, f"listpage_{index}.html")
+
+        with open(file_path, 'w') as file:
+            file.write(content)
+
+    def save_detail_page(self, content: str, index: int, run_id: str):
+        detail_pages_dir = self._get_detail_pages_dir_name(run_id)
+        self._ensure_dir(detail_pages_dir)
+
+        file_path = os.path.join(detail_pages_dir, f"detailpage_{index}.html")
+
+        with open(file_path, 'w') as file:
+            file.write(content)
+
+    def save_result_file(self, df: pd.DataFrame, run_id: str):
+        """Saves the scraped data to a CSV file."""
+        file_path = os.path.join(self.DATA_DIR, run_id, "result.csv")
+
+        df.to_csv(file_path, index=False)
+        logger.info(f"*** File saved: {file_path}. ***")
***") + + def _get_list_pages_dir_name(self, run_id: str): + return os.path.join(self.DATA_DIR, run_id, self.LISTPAGES_DIR) + + def _get_detail_pages_dir_name(self, run_id: str): + return os.path.join(self.DATA_DIR, run_id, self.DETAILPAGES_DIR) + + + + + + diff --git a/funda_scraper/preprocess.py b/funda_scraper/preprocess.py index 96b8cf1..18fd9a6 100644 --- a/funda_scraper/preprocess.py +++ b/funda_scraper/preprocess.py @@ -159,6 +159,7 @@ def preprocess_data( """ df = df.dropna() + if not is_past: keep_cols = config.keep_cols.selling_data else: @@ -168,9 +169,9 @@ def preprocess_data( keep_cols.extend(keep_extra_cols) # Info - df["house_id"] = df["url"].apply(lambda x: int(x.split("/")[-2].split("-")[1])) - df["house_type"] = df["url"].apply(lambda x: x.split("/")[-2].split("-")[0]) - df = df[df["house_type"].isin(["appartement", "huis"])] + df["house_id"] = df["url"].apply(lambda x: int(x.split("/")[-2])) + #df["house_type"] = df["url"].apply(lambda x: x.split("/")[-2].split("-")[0]) + #df = df[df["house_type"].isin(["appartement", "huis"])] # Price price_col = "price_sold" if is_past else "price" @@ -184,13 +185,13 @@ def preprocess_data( df["zip"] = df["zip_code"].apply(lambda x: x[:4]) # House layout - df["room"] = df["num_of_rooms"].apply(find_n_room) - df["bedroom"] = df["num_of_rooms"].apply(find_n_bedroom) - df["bathroom"] = df["num_of_bathrooms"].apply(find_n_bathroom) + df["room"] = df["number_of_rooms"].apply(find_n_room) + df["bedroom"] = df["number_of_rooms"].apply(find_n_bedroom) + df["bathroom"] = df["number_of_bathrooms"].apply(find_n_bathroom) df["energy_label"] = df["energy_label"].apply(clean_energy_label) # Time - df["year_built"] = df["year"].apply(clean_year).astype(int) + df["year_built"] = df["year_of_construction"].apply(clean_year).astype(int) df["house_age"] = datetime.now().year - df["year_built"] if is_past: diff --git a/funda_scraper/property.py b/funda_scraper/property.py new file mode 100644 index 0000000..1c4ef0b --- /dev/null +++ b/funda_scraper/property.py @@ -0,0 +1,41 @@ +from dataclasses import dataclass, field + +@dataclass +class Property(): + url: str = None + price: str = None + address = None + city = None + description = None + listed_since= None + zip_code = None + size = None + year_of_construction = None + living_area = None + house_type = None + building_type = None + number_of_rooms = None + number_of_bathrooms = None + layout = None + energy_label = None + insulation = None + heating = None + ownership = None + exteriors = None + parking = None + neighborhood_name = None + date_list = None + date_sold = None + term = None + price_sold = None + last_ask_price = None + last_ask_price_m2 = None + photos: list[str] = field(default_factory=list) + + @property + def photos_string(self) -> bool: + if not self.photos or len(self.photos) == 0: + return "" + else: + return "|".join(self.photos) + diff --git a/funda_scraper/scrape.py b/funda_scraper/scrape.py index 17f8383..cf27990 100644 --- a/funda_scraper/scrape.py +++ b/funda_scraper/scrape.py @@ -1,12 +1,11 @@ """Main funda scraper module""" import argparse -import datetime import json import multiprocessing as mp -import os +import uuid from collections import OrderedDict -from typing import List, Optional +from typing import List from urllib.parse import urlparse, urlunparse import pandas as pd @@ -16,8 +15,9 @@ from tqdm.contrib.concurrent import process_map from funda_scraper.config.core import config -from funda_scraper.preprocess import clean_date_format, preprocess_data from 
funda_scraper.utils import logger +from funda_scraper.extract import DataExtractor +from funda_scraper.filerepository import FileRepository class FundaScraper(object): @@ -25,124 +25,69 @@ class FundaScraper(object): A class used to scrape real estate data from the Funda website. """ - def __init__( - self, - area: str, - want_to: str, - page_start: int = 1, - n_pages: int = 1, - find_past: bool = False, - min_price: Optional[int] = None, - max_price: Optional[int] = None, - days_since: Optional[int] = None, - property_type: Optional[str] = None, - min_floor_area: Optional[str] = None, - max_floor_area: Optional[str] = None, - sort: Optional[str] = None, - ): + def __init__(self, search_request): """ - :param area: The area to search for properties, formatted for URL compatibility. - :param want_to: Specifies whether the user wants to buy or rent properties. - :param page_start: The starting page number for the search. - :param n_pages: The number of pages to scrape. - :param find_past: Flag to indicate whether to find past listings. - :param min_price: The minimum price for the property search. - :param max_price: The maximum price for the property search. - :param days_since: The maximum number of days since the listing was published. - :param property_type: The type of property to search for. - :param min_floor_area: The minimum floor area for the property search. - :param max_floor_area: The maximum floor area for the property search. - :param sort: The sorting criterion for the search results. + :param search_request: The parameters for the search """ - # Init attributes - self.area = area.lower().replace(" ", "-") - self.property_type = property_type - self.want_to = want_to - self.find_past = find_past - self.page_start = max(page_start, 1) - self.n_pages = max(n_pages, 1) - self.page_end = self.page_start + self.n_pages - 1 - self.min_price = min_price - self.max_price = max_price - self.days_since = days_since - self.min_floor_area = min_floor_area - self.max_floor_area = max_floor_area - self.sort = sort - - # Instantiate along the way + self.search_request = search_request + self.links: List[str] = [] self.raw_df = pd.DataFrame() self.clean_df = pd.DataFrame() self.base_url = config.base_url - self.selectors = config.css_selector + + self.run_id = str(uuid.uuid1()) + + self.file_repo = FileRepository() + self.data_extractor = DataExtractor() + def __repr__(self): - return ( - f"FundaScraper(area={self.area}, " - f"want_to={self.want_to}, " - f"n_pages={self.n_pages}, " - f"page_start={self.page_start}, " - f"find_past={self.find_past}, " - f"min_price={self.min_price}, " - f"max_price={self.max_price}, " - f"days_since={self.days_since}, " - f"min_floor_area={self.min_floor_area}, " - f"max_floor_area={self.max_floor_area}, " - f"find_past={self.find_past})" - f"min_price={self.min_price})" - f"max_price={self.max_price})" - f"days_since={self.days_since})" - f"sort={self.sort})" - ) + return str(self.search_request) - @property - def to_buy(self) -> bool: - """Determines if the search is for buying or renting properties.""" - if self.want_to.lower() in ["buy", "koop", "b", "k"]: - return True - elif self.want_to.lower() in ["rent", "huur", "r", "h"]: - return False - else: - raise ValueError("'want_to' must be either 'buy' or 'rent'.") - - @property - def check_days_since(self) -> int: - """Validates the 'days_since' attribute.""" - if self.find_past: - raise ValueError("'days_since' can only be specified when find_past=False.") - - if self.days_since in [None, 1, 3, 5, 10, 
30]: - return self.days_since - else: - raise ValueError("'days_since' must be either None, 1, 3, 5, 10 or 30.") - - @property - def check_sort(self) -> str: - """Validates the 'sort' attribute.""" - if self.sort in [ - None, - "relevancy", - "date_down", - "date_up", - "price_up", - "price_down", - "floor_area_down", - "plot_area_down", - "city_up" "postal_code_up", - ]: - return self.sort - else: - raise ValueError( - "'sort' must be either None, 'relevancy', 'date_down', 'date_up', 'price_up', 'price_down', " - "'floor_area_down', 'plot_area_down', 'city_up' or 'postal_code_up'. " - ) - @staticmethod - def _check_dir() -> None: - """Ensures the existence of the directory for storing data.""" - if not os.path.exists("data"): - os.makedirs("data") + def _get_list_pages(self, page_start: int = None, number_of_pages: int = None) -> None: + + page_start = self.search_request.page_start if page_start is None else page_start + number_of_pages = self.search_request.number_of_pages if number_of_pages is None else number_of_pages + + main_url = self._build_main_query_url() + + for i in tqdm(range(page_start, page_start + number_of_pages)): + url = f"{main_url}&search_result={i}" + response = requests.get(url, headers = config.header) + self.file_repo.save_list_page(response.text, i, self.run_id) + + return + + + def _get_detail_pages(self): + urls = [] + + list_pages = self.file_repo.get_list_pages(self.run_id) + + for page in list_pages: + soup = BeautifulSoup(page, "lxml") + script_tag = soup.find_all("script", {"type": "application/ld+json"})[0] + json_data = json.loads(script_tag.contents[0]) + item_list = [item["url"] for item in json_data["itemListElement"]] + urls += item_list + + urls = self.remove_duplicates(urls) + fixed_urls = [self.fix_link(url) for url in urls] + + pools = mp.cpu_count() + content = process_map(self.scrape_one_link, fixed_urls, max_workers=pools) + + for i, c in enumerate(content): + self.file_repo.save_detail_page(c, i, self.run_id) + + + def scrape_one_link(self, link: str) -> str: + response = requests.get(link, headers=config.header) + return response.text + @staticmethod def _get_links_from_one_parent(url: str) -> List[str]: @@ -155,52 +100,13 @@ def _get_links_from_one_parent(url: str) -> List[str]: urls = [item["url"] for item in json_data["itemListElement"]] return urls - def reset( - self, - area: Optional[str] = None, - property_type: Optional[str] = None, - want_to: Optional[str] = None, - page_start: Optional[int] = None, - n_pages: Optional[int] = None, - find_past: Optional[bool] = None, - min_price: Optional[int] = None, - max_price: Optional[int] = None, - days_since: Optional[int] = None, - min_floor_area: Optional[str] = None, - max_floor_area: Optional[str] = None, - sort: Optional[str] = None, - ) -> None: - """Resets or initializes the search parameters.""" - if area is not None: - self.area = area - if property_type is not None: - self.property_type = property_type - if want_to is not None: - self.want_to = want_to - if page_start is not None: - self.page_start = max(page_start, 1) - if n_pages is not None: - self.n_pages = max(n_pages, 1) - if find_past is not None: - self.find_past = find_past - if min_price is not None: - self.min_price = min_price - if max_price is not None: - self.max_price = max_price - if days_since is not None: - self.days_since = days_since - if min_floor_area is not None: - self.min_floor_area = min_floor_area - if max_floor_area is not None: - self.max_floor_area = max_floor_area - if sort is not None: - self.sort = 
sort @staticmethod def remove_duplicates(lst: List[str]) -> List[str]: """Removes duplicate links from a list.""" return list(OrderedDict.fromkeys(lst)) + @staticmethod def fix_link(link: str) -> str: """Fixes a given property link to ensure proper URL formatting.""" @@ -216,210 +122,67 @@ def fix_link(link: str) -> str: ) return fixed_link - def fetch_all_links(self, page_start: int = None, n_pages: int = None) -> None: - """Collects all available property links across multiple pages.""" - - page_start = self.page_start if page_start is None else page_start - n_pages = self.n_pages if n_pages is None else n_pages - - logger.info("*** Phase 1: Fetch all the available links from all pages *** ") - urls = [] - main_url = self._build_main_query_url() - - for i in tqdm(range(page_start, page_start + n_pages)): - try: - item_list = self._get_links_from_one_parent( - f"{main_url}&search_result={i}" - ) - urls += item_list - except IndexError: - self.page_end = i - logger.info(f"*** The last available page is {self.page_end} ***") - break - - urls = self.remove_duplicates(urls) - fixed_urls = [self.fix_link(url) for url in urls] - - logger.info( - f"*** Got all the urls. {len(fixed_urls)} houses found from {self.page_start} to {self.page_end} ***" - ) - self.links = fixed_urls def _build_main_query_url(self) -> str: """Constructs the main query URL for the search.""" - query = "koop" if self.to_buy else "huur" + query = "koop" if self.search_request.to_buy else "huur" main_url = ( - f"{self.base_url}/zoeken/{query}?selected_area=%5B%22{self.area}%22%5D" + f"{self.base_url}/zoeken/{query}?selected_area=%5B%22{self.search_request.area}%22%5D" ) - if self.property_type: - property_types = self.property_type.split(",") + if self.search_request.property_type: + property_types = self.search_request.property_type.split(",") formatted_property_types = [ "%22" + prop_type + "%22" for prop_type in property_types ] main_url += f"&object_type=%5B{','.join(formatted_property_types)}%5D" - if self.find_past: + if self.search_request.find_sold: main_url = f'{main_url}&availability=%5B"unavailable"%5D' - if self.min_price is not None or self.max_price is not None: - min_price = "" if self.min_price is None else self.min_price - max_price = "" if self.max_price is None else self.max_price + if self.search_request.min_price is not None or self.search_request.max_price is not None: + min_price = "" if self.search_request.min_price is None else self.search_request.min_price + max_price = "" if self.search_request.max_price is None else self.search_request.max_price main_url = f"{main_url}&price=%22{min_price}-{max_price}%22" - if self.days_since is not None: - main_url = f"{main_url}&publication_date={self.check_days_since}" + if self.search_request.days_since is not None: + main_url = f"{main_url}&publication_date={self.search_request.check_days_since}" - if self.min_floor_area or self.max_floor_area: - min_floor_area = "" if self.min_floor_area is None else self.min_floor_area - max_floor_area = "" if self.max_floor_area is None else self.max_floor_area + if self.search_request.min_floor_area or self.search_request.max_floor_area: + min_floor_area = "" if self.search_request.min_floor_area is None else self.search_request.min_floor_area + max_floor_area = "" if self.search_request.max_floor_area is None else self.search_request.max_floor_area main_url = f"{main_url}&floor_area=%22{min_floor_area}-{max_floor_area}%22" - if self.sort is not None: - main_url = f"{main_url}&sort=%22{self.check_sort}%22" + if 
self.search_request.sort is not None: + main_url = f"{main_url}&sort=%22{self.search_request.sort_by}%22" logger.info(f"*** Main URL: {main_url} ***") return main_url - @staticmethod - def get_value_from_css(soup: BeautifulSoup, selector: str) -> str: - """Extracts data from HTML using a CSS selector.""" - result = soup.select(selector) - if len(result) > 0: - result = result[0].text - else: - result = "na" - return result - - def scrape_one_link(self, link: str) -> List[str]: - """Scrapes data from a single property link.""" - - # Initialize for each page - response = requests.get(link, headers=config.header) - soup = BeautifulSoup(response.text, "lxml") - # Get the value according to respective CSS selectors - if self.to_buy: - if self.find_past: - list_since_selector = self.selectors.date_list - else: - list_since_selector = self.selectors.listed_since - else: - if self.find_past: - list_since_selector = ".fd-align-items-center:nth-child(9) span" - else: - list_since_selector = ".fd-align-items-center:nth-child(7) span" - - result = [ - link, - self.get_value_from_css(soup, self.selectors.price), - self.get_value_from_css(soup, self.selectors.address), - self.get_value_from_css(soup, self.selectors.descrip), - self.get_value_from_css(soup, list_since_selector), - self.get_value_from_css(soup, self.selectors.zip_code), - self.get_value_from_css(soup, self.selectors.size), - self.get_value_from_css(soup, self.selectors.year), - self.get_value_from_css(soup, self.selectors.living_area), - self.get_value_from_css(soup, self.selectors.kind_of_house), - self.get_value_from_css(soup, self.selectors.building_type), - self.get_value_from_css(soup, self.selectors.num_of_rooms), - self.get_value_from_css(soup, self.selectors.num_of_bathrooms), - self.get_value_from_css(soup, self.selectors.layout), - self.get_value_from_css(soup, self.selectors.energy_label), - self.get_value_from_css(soup, self.selectors.insulation), - self.get_value_from_css(soup, self.selectors.heating), - self.get_value_from_css(soup, self.selectors.ownership), - self.get_value_from_css(soup, self.selectors.exteriors), - self.get_value_from_css(soup, self.selectors.parking), - self.get_value_from_css(soup, self.selectors.neighborhood_name), - self.get_value_from_css(soup, self.selectors.date_list), - self.get_value_from_css(soup, self.selectors.date_sold), - self.get_value_from_css(soup, self.selectors.term), - self.get_value_from_css(soup, self.selectors.price_sold), - self.get_value_from_css(soup, self.selectors.last_ask_price), - self.get_value_from_css(soup, self.selectors.last_ask_price_m2).split("\r")[ - 0 - ], - ] - - # Deal with list_since_selector especially, since its CSS varies sometimes - if clean_date_format(result[4]) == "na": - for i in range(6, 16): - selector = f".fd-align-items-center:nth-child({i}) span" - update_list_since = self.get_value_from_css(soup, selector) - if clean_date_format(update_list_since) == "na": - pass - else: - result[4] = update_list_since - - photos_list = [ - p.get("data-lazy-srcset") for p in soup.select(self.selectors.photo) - ] - photos_string = ", ".join(photos_list) - - # Clean up the retried result from one page - result = [r.replace("\n", "").replace("\r", "").strip() for r in result] - result.append(photos_string) - return result - - def scrape_pages(self) -> None: - """Scrapes data from all collected property links.""" - - logger.info("*** Phase 2: Start scraping from individual links ***") - df = pd.DataFrame({key: [] for key in self.selectors.keys()}) - - # Scrape pages 
with multiprocessing to improve efficiency - # TODO: use asyncio instead - pools = mp.cpu_count() - content = process_map(self.scrape_one_link, self.links, max_workers=pools) + def _get_pages(self): + self._get_list_pages() + self._get_detail_pages() - for i, c in enumerate(content): - df.loc[len(df)] = c - - df["city"] = df["url"].map(lambda x: x.split("/")[4]) - df["log_id"] = datetime.datetime.now().strftime("%Y%m-%d%H-%M%S") - if not self.find_past: - df = df.drop(["term", "price_sold", "date_sold"], axis=1) - logger.info(f"*** All scraping done: {df.shape[0]} results ***") - self.raw_df = df - - def save_csv(self, df: pd.DataFrame, filepath: str = None) -> None: - """Saves the scraped data to a CSV file.""" - if filepath is None: - self._check_dir() - date = str(datetime.datetime.now().date()).replace("-", "") - status = "unavailable" if self.find_past else "unavailable" - want_to = "buy" if self.to_buy else "rent" - filepath = f"./data/houseprice_{date}_{self.area}_{want_to}_{status}_{len(self.links)}.csv" - df.to_csv(filepath, index=False) - logger.info(f"*** File saved: {filepath}. ***") - - def run( - self, raw_data: bool = False, save: bool = False, filepath: str = None - ) -> pd.DataFrame: + + def run(self, clean_data: bool = False) -> pd.DataFrame: """ - Runs the full scraping process, optionally saving the results to a CSV file. + Runs the full scraping process, saving the results to a CSV file. - :param raw_data: if true, the data won't be pre-processed - :param save: if true, the data will be saved as a csv file - :param filepath: the name for the file + :param clean_data: if true, the data won't be pre-processed :return: the (pre-processed) dataframe from scraping """ - self.fetch_all_links() - self.scrape_pages() + logger.info(f"Started scraping, run_id: {self.run_id}") - if raw_data: - df = self.raw_df - else: - logger.info("*** Cleaning data ***") - df = preprocess_data(df=self.raw_df, is_past=self.find_past) - self.clean_df = df + logger.info("Fetching pages..") + self._get_pages() - if save: - self.save_csv(df, filepath) + logger.info("Extracting data from the html pages") + df = self.data_extractor.extract_data(self.search_request, self.run_id, clean_data) logger.info("*** Done! 
***") + return df @@ -439,7 +202,7 @@ def run( choices=["rent", "buy"], ) parser.add_argument( - "--find_past", + "--find_sold", action="store_true", help="Indicate whether you want to use historical data", ) @@ -447,7 +210,7 @@ def run( "--page_start", type=int, help="Specify which page to start scraping", default=1 ) parser.add_argument( - "--n_pages", type=int, help="Specify how many pages to scrape", default=1 + "--number_of_pages", type=int, help="Specify how many pages to scrape", default=1 ) parser.add_argument( "--min_price", type=int, help="Specify the min price", default=None @@ -493,9 +256,9 @@ def run( scraper = FundaScraper( area=args.area, want_to=args.want_to, - find_past=args.find_past, + find_sold=args.find_sold, page_start=args.page_start, - n_pages=args.n_pages, + number_of_pages=args.number_of_pages, min_price=args.min_price, max_price=args.max_price, days_since=args.days_since, diff --git a/funda_scraper/searchrequest.py b/funda_scraper/searchrequest.py new file mode 100644 index 0000000..b1207ad --- /dev/null +++ b/funda_scraper/searchrequest.py @@ -0,0 +1,148 @@ +from typing import List, Optional + +class SearchRequest(object): + + def __init__( + self, + area: str, + want_to: str, + page_start: int = 1, + number_of_pages: int = 1, + find_sold: bool = False, + min_price: Optional[int] = None, + max_price: Optional[int] = None, + days_since: Optional[int] = None, + property_type: Optional[str] = None, + min_floor_area: Optional[str] = None, + max_floor_area: Optional[str] = None, + sort: Optional[str] = None, + ): + """ + + :param area: The area to search for properties, formatted for URL compatibility. + :param want_to: Specifies whether the user wants to buy or rent properties. + :param page_start: The starting page number for the search. + :param number_of_pages: The number of pages to scrape. + :param find_sold: Flag to indicate whether to find past listings. + :param min_price: The minimum price for the property search. + :param max_price: The maximum price for the property search. + :param days_since: The maximum number of days since the listing was published. + :param property_type: The type of property to search for. + :param min_floor_area: The minimum floor area for the property search. + :param max_floor_area: The maximum floor area for the property search. + :param sort: The sorting criterion for the search results. 
+ """ + # Init attributes + self.area = area.lower().replace(" ", "-") + self.property_type = property_type + self.want_to = want_to + self.find_sold = find_sold + self.page_start = max(page_start, 1) + self.number_of_pages = max(number_of_pages, 1) + self.page_end = self.page_start + self.number_of_pages - 1 + self.min_price = min_price + self.max_price = max_price + self.days_since = days_since + self.min_floor_area = min_floor_area + self.max_floor_area = max_floor_area + self.sort = sort + + def __repr__(self): + return ( + f"FundaScraper(area={self.area}, " + f"want_to={self.want_to}, " + f"number_of_pages={self.number_of_pages}, " + f"page_start={self.page_start}, " + f"find_sold={self.find_sold}, " + f"min_price={self.min_price}, " + f"max_price={self.max_price}, " + f"days_since={self.days_since}, " + f"min_floor_area={self.min_floor_area}, " + f"max_floor_area={self.max_floor_area}, " + f"find_sold={self.find_sold})" + f"min_price={self.min_price})" + f"max_price={self.max_price})" + f"days_since={self.days_since})" + f"sort={self.sort})" + ) + + @property + def to_buy(self) -> bool: + """Determines if the search is for buying or renting properties.""" + if self.want_to.lower() in ["buy", "koop", "b", "k"]: + return True + elif self.want_to.lower() in ["rent", "huur", "r", "h"]: + return False + else: + raise ValueError("'want_to' must be either 'buy' or 'rent'.") + + @property + def check_days_since(self) -> int: + """Validates the 'days_since' attribute.""" + if self.find_sold: + raise ValueError("'days_since' can only be specified when find_sold=False.") + + if self.days_since in [None, 1, 3, 5, 10, 30]: + return self.days_since + else: + raise ValueError("'days_since' must be either None, 1, 3, 5, 10 or 30.") + + @property + def sort_by(self) -> str: + """Validates the 'sort' attribute.""" + if self.sort in [ + None, + "relevancy", + "date_down", + "date_up", + "price_up", + "price_down", + "floor_area_down", + "plot_area_down", + "city_up", + "postal_code_up" + ]: + return self.sort + else: + return None + + def reset( + self, + area: Optional[str] = None, + property_type: Optional[str] = None, + want_to: Optional[str] = None, + page_start: Optional[int] = None, + number_of_pages: Optional[int] = None, + find_sold: Optional[bool] = None, + min_price: Optional[int] = None, + max_price: Optional[int] = None, + days_since: Optional[int] = None, + min_floor_area: Optional[str] = None, + max_floor_area: Optional[str] = None, + sort: Optional[str] = None, + ) -> None: + """Resets or initializes the search parameters.""" + if area is not None: + self.area = area + if property_type is not None: + self.property_type = property_type + if want_to is not None: + self.want_to = want_to + if page_start is not None: + self.page_start = max(page_start, 1) + if number_of_pages is not None: + self.number_of_pages = max(number_of_pages, 1) + if find_sold is not None: + self.find_sold = find_sold + if min_price is not None: + self.min_price = min_price + if max_price is not None: + self.max_price = max_price + if days_since is not None: + self.days_since = days_since + if min_floor_area is not None: + self.min_floor_area = min_floor_area + if max_floor_area is not None: + self.max_floor_area = max_floor_area + if sort is not None: + self.sort = sort \ No newline at end of file diff --git a/main.py b/main.py new file mode 100644 index 0000000..c0c6763 --- /dev/null +++ b/main.py @@ -0,0 +1,23 @@ +from funda_scraper.scrape import FundaScraper +from funda_scraper.extract import DataExtractor +from 
funda_scraper.searchrequest import SearchRequest + +if __name__ == "__main__": + + search_params = SearchRequest( + area = "Amsterdam", + want_to = "buy", + find_sold = False, + page_start = 90, + number_of_pages = 150, + # min_price=500, + # max_price=2000 + ) + + scraper = FundaScraper(search_params) + df = scraper.run(clean_data = True) + #df.head() + + # It's also possible to to extraction separately from fetching the html pages + #data_extractor = DataExtractor() + #data_extractor.extract_data(search_params, run_id = "196a5756-8643-11ef-840d-a0510ba6104e", clean_data = True) diff --git a/tests/test_scrape.py b/tests/test_scrape.py index bf0aa29..5adcf8a 100644 --- a/tests/test_scrape.py +++ b/tests/test_scrape.py @@ -11,8 +11,8 @@ def scraper(self): area="amsterdam", want_to="buy", page_start=1, - n_pages=1, - find_past=False, + number_of_pages=1, + find_sold=False, min_price=100000, max_price=500000, days_since=None, @@ -34,9 +34,9 @@ def test_check_sort(self, scraper): assert scraper.check_sort == "price_down" def test_reset(self, scraper): - scraper.reset(area="rotterdam", n_pages=2) + scraper.reset(area="rotterdam", number_of_pages=2) assert scraper.area == "rotterdam" - assert scraper.n_pages == 2 + assert scraper.number_of_pages == 2 def test_fix_link(self, scraper): link = "https://www.funda.nl/detail/koop/den-haag/appartement-address-333/88888888/" @@ -49,7 +49,7 @@ def test_fix_link(self, scraper): def test_rent(): scraper = FundaScraper( - area="amsterdam", want_to="rent", find_past=False, page_start=1, n_pages=1 + area="amsterdam", want_to="rent", find_sold=False, page_start=1, number_of_pages=1 ) df = scraper.run(raw_data=True) assert len(scraper.links) == 15 @@ -63,7 +63,7 @@ def test_rent(): def test_rent_past(): scraper = FundaScraper( - area="amsterdam", want_to="rent", find_past=True, page_start=1, n_pages=1 + area="amsterdam", want_to="rent", find_sold=True, page_start=1, number_of_pages=1 ) df = scraper.run(raw_data=True) assert len(scraper.links) == 15 @@ -77,7 +77,7 @@ def test_rent_past(): def test_buy(): scraper = FundaScraper( - area="amsterdam", want_to="buy", find_past=False, page_start=1, n_pages=1 + area="amsterdam", want_to="buy", find_sold=False, page_start=1, number_of_pages=1 ) df = scraper.run(raw_data=True) assert len(scraper.links) == 15 @@ -91,7 +91,7 @@ def test_buy(): def test_buy_past(): scraper = FundaScraper( - area="amsterdam", want_to="buy", find_past=True, page_start=1, n_pages=1 + area="amsterdam", want_to="buy", find_sold=True, page_start=1, number_of_pages=1 ) df = scraper.run(raw_data=True) assert len(scraper.links) == 15 @@ -108,9 +108,9 @@ def test_buy_house(): area="amsterdam", property_type="house", want_to="buy", - find_past=False, + find_sold=False, page_start=1, - n_pages=1, + number_of_pages=1, ) df = scraper.run(raw_data=True) assert len(scraper.links) == 15 @@ -128,9 +128,9 @@ def test_buy_apartment(): area="amsterdam", property_type="apartment", want_to="buy", - find_past=False, + find_sold=False, page_start=1, - n_pages=1, + number_of_pages=1, ) df = scraper.run(raw_data=True) assert len(scraper.links) == 15 @@ -148,9 +148,9 @@ def test_buy_mixed(): area="amsterdam", property_type="apartment,house", want_to="buy", - find_past=False, + find_sold=False, page_start=1, - n_pages=1, + number_of_pages=1, ) df = scraper.run(raw_data=True) assert len(scraper.links) == 15
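Because fetching and extraction are now separate steps, a finished run can be re-processed from the HTML already saved under `data/<run_id>/` without contacting Funda again. A minimal sketch, following the commented-out lines in `main.py` (the `run_id` below is a placeholder for an existing run folder under `data/`):

```
from funda_scraper.extract import DataExtractor
from funda_scraper.searchrequest import SearchRequest

# Re-extract a previous run: detail pages are read from data/<run_id>/detailpages
search_params = SearchRequest(area="amsterdam", want_to="buy")
extractor = DataExtractor()
df = extractor.extract_data(search_params, run_id="<existing-run-id>", clean_data=True)
print(df.head())
```

The resulting DataFrame is also written to `data/<run_id>/result.csv` by `FileRepository.save_result_file`.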