From d3a6dc25745949c50b851f2309012f69a03edfe7 Mon Sep 17 00:00:00 2001
From: Mustafa Karakaya
Date: Sat, 28 Sep 2024 15:51:01 +0200
Subject: [PATCH 01/17] add main.py and corrected config

---
 main.py | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)
 create mode 100644 main.py

diff --git a/main.py b/main.py
new file mode 100644
index 0000000..60ae7b1
--- /dev/null
+++ b/main.py
@@ -0,0 +1,19 @@
+from funda_scraper.scrape import FundaScraper
+
+if __name__ == "__main__":
+    print('there u go')
+
+    scraper = FundaScraper(
+        area="amsterdam",
+        want_to="buy",
+        find_past=False,
+        page_start=1,
+        n_pages=3,
+        # min_price=500,
+        # max_price=2000
+    )
+    df = scraper.run(raw_data=True, save=True, filepath="test.csv")
+    df.head()
+
+
+
From 86f7a2b5fba0072f4626a22627a25f24483374a4 Mon Sep 17 00:00:00 2001
From: Mustafa Karakaya
Date: Sat, 28 Sep 2024 16:58:36 +0200
Subject: [PATCH 02/17] start separating fetching data from extraction

---
 .gitignore                       | 39 +++++++++++++++++
 funda_scraper/config/config.yaml | 19 ++++----
 funda_scraper/extract.py         |  0
 funda_scraper/scrape.py          | 74 ++++++++++++++++++++++++++++++++
 main.py                          |  3 +-
 5 files changed, 125 insertions(+), 10 deletions(-)
 create mode 100644 .gitignore
 create mode 100644 funda_scraper/extract.py

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..73a43a2
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,39 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +data/ \ No newline at end of file diff --git a/funda_scraper/config/config.yaml b/funda_scraper/config/config.yaml index 789662c..4a8e6f3 100644 --- a/funda_scraper/config/config.yaml +++ b/funda_scraper/config/config.yaml @@ -31,23 +31,23 @@ keep_cols: - photo css_selector: url: none - price: ".object-header__price" + price: "section.mt-6 > div:nth-child(2) > dl:nth-child(2) > dd:nth-child(2) > span:nth-child(1)" address: ".object-header__title" descrip: ".object-description-body" - listed_since: ".fd-align-items-center:nth-child(6) span" - zip_code: ".object-header__subtitle" + listed_since: "section.mt-6 > div:nth-child(2) > dl:nth-child(2) > dd:nth-child(6) > span:nth-child(1)" + zip_code: "span.text-neutral-40:nth-child(2)" size: ".object-kenmerken-list:nth-child(8) .fd-align-items-center:nth-child(5) span" year: ".fd-align-items-center~ .fd-align-items-center .fd-m-right-xs" - living_area: ".object-kenmerken-list:nth-child(8) .fd-align-items-center:nth-child(2) span" + living_area: "section.mt-6 > div:nth-child(4) > dl:nth-child(2) > dd:nth-child(4)" kind_of_house: ".object-kenmerken-list:nth-child(5) .fd-align-items-center:nth-child(2) span" - building_type: ".object-kenmerken-list:nth-child(5) .fd-align-items-center:nth-child(4) span" + building_type: "section.mt-6 > div:nth-child(3) > dl:nth-child(2) > dd:nth-child(4) > span:nth-child(1)" num_of_rooms: ".object-kenmerken-list:nth-child(11) .fd-align-items-center:nth-child(2)" num_of_bathrooms: ".object-kenmerken-list:nth-child(11) .fd-align-items-center:nth-child(4)" layout: ".object-kenmerken-list:nth-child(11)" - energy_label: ".energielabel" + energy_label: ".gap-6 > div:nth-child(1)" insulation: ".object-kenmerken-list:nth-child(14) .fd-align-items-center:nth-child(4)" heating: ".object-kenmerken-list:nth-child(14) .fd-align-items-center:nth-child(6)" - ownership: ".object-kenmerken-list:nth-child(17) .fd-align-items-center:nth-child(4)" + ownership: "section.mt-6 > div:nth-child(7) > dl:nth-child(2) > dd:nth-child(4)" exteriors: ".object-kenmerken-list:nth-child(19)" parking: ".object-kenmerken-list:nth-child(24)" neighborhood_name: ".fd-display-inline--bp-m" @@ -55,6 +55,7 @@ css_selector: date_sold: "dd:nth-child(4)" term: "dd:nth-child(6)" price_sold: ".object-header__price--historic" - last_ask_price: ".object-kenmerken-list:nth-child(2) .fd-align-items-center:nth-child(2)" - last_ask_price_m2: ".object-kenmerken-list__asking-price" + last_ask_price: "section.mt-6 > div:nth-child(2) > dl:nth-child(2) > dd:nth-child(2) > span:nth-child(1)" + last_ask_price_m2: "section.mt-6 > div:nth-child(2) > dl:nth-child(2) > dd:nth-child(4) > span:nth-child(1)" photo: ".media-viewer-overview__section-list-item--photo img[data-lazy]" + diff --git a/funda_scraper/extract.py b/funda_scraper/extract.py new file mode 100644 index 0000000..e69de29 diff --git a/funda_scraper/scrape.py b/funda_scraper/scrape.py index 17f8383..01ecd8a 100644 --- a/funda_scraper/scrape.py +++ b/funda_scraper/scrape.py @@ -138,12 +138,70 @@ def check_sort(self) -> str: "'floor_area_down', 'plot_area_down', 'city_up' or 'postal_code_up'. 
" ) + def _ensure_dir(self, dir_name: str): + if not os.path.exists(dir_name): + os.makedirs(dir_name) + @staticmethod def _check_dir() -> None: """Ensures the existence of the directory for storing data.""" if not os.path.exists("data"): os.makedirs("data") + def _get_list_pages(self, page_start: int = None, n_pages: int = None) -> None: + self._ensure_dir('data/listpages') + + page_start = self.page_start if page_start is None else page_start + n_pages = self.n_pages if n_pages is None else n_pages + + main_url = self._build_main_query_url() + + for i in tqdm(range(page_start, page_start + n_pages)): + url = f"{main_url}&search_result={i}" + response = requests.get(url, headers = config.header) + + with open(f'./data/listpages/listpage_{i}.html', 'w') as file: + file.write(response.text) + + return + + def _get_detail_pages(self): + listpages_dir = 'data/listpages' + self._ensure_dir(listpages_dir) + self._ensure_dir('data/detailpages') + + urls = [] + + for f in os.listdir(listpages_dir): + file_path = os.path.join(listpages_dir, f) + + if os.path.isfile(file_path): + with open(file_path, 'r') as file: + content = file.read() + soup = BeautifulSoup(content, "lxml") + + script_tag = soup.find_all("script", {"type": "application/ld+json"})[0] + json_data = json.loads(script_tag.contents[0]) + item_list = [item["url"] for item in json_data["itemListElement"]] + urls += item_list + + + urls = self.remove_duplicates(urls) + fixed_urls = [self.fix_link(url) for url in urls] + + pools = mp.cpu_count() + content = process_map(self.scrape_one_link2, fixed_urls, max_workers=pools) + + for i, c in enumerate(content): + with open(f'./data/detailpages/detailpage_{i}.html', 'w') as file: + file.write(c) + + + def scrape_one_link2(self, link: str) -> str: + response = requests.get(link, headers=config.header) + return response.text + + @staticmethod def _get_links_from_one_parent(url: str) -> List[str]: """Scrapes all available property links from a single Funda search page.""" @@ -292,11 +350,23 @@ def get_value_from_css(soup: BeautifulSoup, selector: str) -> str: result = "na" return result + def save_file(self, file_name: str): + self._check_dir() + + + def scrape_one_link(self, link: str) -> List[str]: """Scrapes data from a single property link.""" # Initialize for each page response = requests.get(link, headers=config.header) + + with open('./data/test.html', 'w') as file: + file.write(response.content) + + return + + soup = BeautifulSoup(response.text, "lxml") # Get the value according to respective CSS selectors @@ -406,6 +476,10 @@ def run( :param filepath: the name for the file :return: the (pre-processed) dataframe from scraping """ + self._get_list_pages() + self._get_detail_pages() + return + self.fetch_all_links() self.scrape_pages() diff --git a/main.py b/main.py index 60ae7b1..6dfa564 100644 --- a/main.py +++ b/main.py @@ -12,7 +12,8 @@ # min_price=500, # max_price=2000 ) - df = scraper.run(raw_data=True, save=True, filepath="test.csv") + #df = scraper.run(raw_data=True, save=True, filepath="test.csv") + df = scraper.run(raw_data=True, save=True) df.head() From f2ed46b6e7a780836bec604e0d9eb050ca77ef5a Mon Sep 17 00:00:00 2001 From: Mustafa Karakaya Date: Sat, 28 Sep 2024 20:06:47 +0200 Subject: [PATCH 03/17] further work on refactoring to object-oriented setup --- .gitignore | 3 +- funda_scraper/__init__.py | 3 +- funda_scraper/extract.py | 155 ++++++++++++++++++++++++++++++++ funda_scraper/filerepository.py | 47 ++++++++++ main.py | 31 ++++--- 5 files changed, 224 insertions(+), 15 
deletions(-) create mode 100644 funda_scraper/filerepository.py diff --git a/.gitignore b/.gitignore index 73a43a2..6ad36f4 100644 --- a/.gitignore +++ b/.gitignore @@ -36,4 +36,5 @@ MANIFEST pip-log.txt pip-delete-this-directory.txt -data/ \ No newline at end of file +data/ +test.csv \ No newline at end of file diff --git a/funda_scraper/__init__.py b/funda_scraper/__init__.py index d48a695..6d23c1d 100644 --- a/funda_scraper/__init__.py +++ b/funda_scraper/__init__.py @@ -1,5 +1,6 @@ """Access the directory in python""" from funda_scraper.scrape import FundaScraper +from funda_scraper.extract import DataExtractor -__all__ = ["FundaScraper"] +__all__ = ["FundaScraper", "DataExtractor"] diff --git a/funda_scraper/extract.py b/funda_scraper/extract.py index e69de29..d04ea10 100644 --- a/funda_scraper/extract.py +++ b/funda_scraper/extract.py @@ -0,0 +1,155 @@ + +import argparse +import datetime +import json +import multiprocessing as mp +import os +from collections import OrderedDict +from typing import List, Optional +from urllib.parse import urlparse, urlunparse + +import pandas as pd +import requests +from bs4 import BeautifulSoup +from tqdm import tqdm +from tqdm.contrib.concurrent import process_map + +from funda_scraper.config.core import config +from funda_scraper.preprocess import clean_date_format, preprocess_data +from funda_scraper.utils import logger + +from funda_scraper.filerepository import FileRepository + + +class DataExtractor(object): + + def __init__(self): + self.selectors = config.css_selector + self.raw_df = pd.DataFrame() + self.clean_df = pd.DataFrame() + + def extract_data(self, to_buy: bool, find_past: bool, raw_data: bool, save: bool, file_path: str): + file_repo = FileRepository() + + df = pd.DataFrame({key: [] for key in self.selectors.keys()}) + detail_pages = file_repo.get_detail_pages() + + for page in detail_pages: + page_data = self.extract_data_from_page(page, to_buy, find_past) + df.loc[len(df)] = page_data + + df["city"] = df["url"].map(lambda x: x.split("/")[4]) + df["log_id"] = datetime.datetime.now().strftime("%Y%m-%d%H-%M%S") + if not find_past: + df = df.drop(["term", "price_sold", "date_sold"], axis=1) + logger.info(f"*** All scraping done: {df.shape[0]} results ***") + self.raw_df = df + + if raw_data: + df = self.raw_df + else: + logger.info("*** Cleaning data ***") + df = preprocess_data(df=self.raw_df, is_past=self.find_past) + self.clean_df = df + + print(df) + + if save: + self.save_csv(df, file_path) + + + def extract_data_from_page(self, page: str, to_buy: bool, find_past: bool): + soup = BeautifulSoup(page, "lxml") + + script_tag = soup.find_all("script", {"type": "application/ld+json"})[0] + json_data = json.loads(script_tag.contents[0]) + + link = json_data["url"] + + # Get the value according to respective CSS selectors + if to_buy: + if find_past: + list_since_selector = self.selectors.date_list + else: + list_since_selector = self.selectors.listed_since + else: + if find_past: + list_since_selector = ".fd-align-items-center:nth-child(9) span" + else: + list_since_selector = ".fd-align-items-center:nth-child(7) span" + + result = [ + link, + self.get_value_from_css(soup, self.selectors.price), + self.get_value_from_css(soup, self.selectors.address), + self.get_value_from_css(soup, self.selectors.descrip), + self.get_value_from_css(soup, list_since_selector), + self.get_value_from_css(soup, self.selectors.zip_code), + self.get_value_from_css(soup, self.selectors.size), + self.get_value_from_css(soup, self.selectors.year), + 
self.get_value_from_css(soup, self.selectors.living_area), + self.get_value_from_css(soup, self.selectors.kind_of_house), + self.get_value_from_css(soup, self.selectors.building_type), + self.get_value_from_css(soup, self.selectors.num_of_rooms), + self.get_value_from_css(soup, self.selectors.num_of_bathrooms), + self.get_value_from_css(soup, self.selectors.layout), + self.get_value_from_css(soup, self.selectors.energy_label), + self.get_value_from_css(soup, self.selectors.insulation), + self.get_value_from_css(soup, self.selectors.heating), + self.get_value_from_css(soup, self.selectors.ownership), + self.get_value_from_css(soup, self.selectors.exteriors), + self.get_value_from_css(soup, self.selectors.parking), + self.get_value_from_css(soup, self.selectors.neighborhood_name), + self.get_value_from_css(soup, self.selectors.date_list), + self.get_value_from_css(soup, self.selectors.date_sold), + self.get_value_from_css(soup, self.selectors.term), + self.get_value_from_css(soup, self.selectors.price_sold), + self.get_value_from_css(soup, self.selectors.last_ask_price), + self.get_value_from_css(soup, self.selectors.last_ask_price_m2).split("\r")[ + 0 + ], + ] + + # Deal with list_since_selector especially, since its CSS varies sometimes + if clean_date_format(result[4]) == "na": + for i in range(6, 16): + selector = f".fd-align-items-center:nth-child({i}) span" + update_list_since = self.get_value_from_css(soup, selector) + if clean_date_format(update_list_since) == "na": + pass + else: + result[4] = update_list_since + + photos_list = [ + p.get("data-lazy-srcset") for p in soup.select(self.selectors.photo) + ] + photos_string = ", ".join(photos_list) + + # Clean up the retried result from one page + result = [r.replace("\n", "").replace("\r", "").strip() for r in result] + result.append(photos_string) + return result + + + def save_csv(self, df: pd.DataFrame, filepath: str = None) -> None: + """Saves the scraped data to a CSV file.""" + if filepath is None: + self._check_dir() + date = str(datetime.datetime.now().date()).replace("-", "") + status = "unavailable" if self.find_past else "unavailable" + want_to = "buy" if self.to_buy else "rent" + filepath = f"./data/houseprice_{date}_{self.area}_{want_to}_{status}_{len(self.links)}.csv" + df.to_csv(filepath, index=False) + logger.info(f"*** File saved: {filepath}. 
***") + + + @staticmethod + def get_value_from_css(soup: BeautifulSoup, selector: str) -> str: + """Extracts data from HTML using a CSS selector.""" + result = soup.select(selector) + if len(result) > 0: + result = result[0].text + else: + result = "na" + return result + diff --git a/funda_scraper/filerepository.py b/funda_scraper/filerepository.py new file mode 100644 index 0000000..5fe9e44 --- /dev/null +++ b/funda_scraper/filerepository.py @@ -0,0 +1,47 @@ +import argparse +import datetime +import json +import multiprocessing as mp +import os +from collections import OrderedDict +from typing import List, Optional +from urllib.parse import urlparse, urlunparse + +import pandas as pd +import requests +from bs4 import BeautifulSoup +from tqdm import tqdm +from tqdm.contrib.concurrent import process_map + +from funda_scraper.config.core import config +from funda_scraper.preprocess import clean_date_format, preprocess_data +from funda_scraper.utils import logger + + +class FileRepository(object): + LISTPAGES_DIR = 'data/listpages' + DETAILPAGES_DIR = 'data/detailpages' + + def __init__(self) -> None: + self._ensure_dir(self.LISTPAGES_DIR) + self._ensure_dir(self.DETAILPAGES_DIR) + + def _ensure_dir(self, dir_name: str): + if not os.path.exists(dir_name): + os.makedirs(dir_name) + + def get_detail_pages(self) -> List[str]: + pages = [] + + for f in os.listdir(self.DETAILPAGES_DIR): + file_path = os.path.join(self.DETAILPAGES_DIR, f) + + if os.path.isfile(file_path): + with open(file_path, 'r') as file: + content = file.read() + pages.append(content) + + return pages + + + diff --git a/main.py b/main.py index 6dfa564..0cb125f 100644 --- a/main.py +++ b/main.py @@ -1,20 +1,25 @@ from funda_scraper.scrape import FundaScraper +from funda_scraper.extract import DataExtractor if __name__ == "__main__": - print('there u go') + print('there u go again') - scraper = FundaScraper( - area="amsterdam", - want_to="buy", - find_past=False, - page_start=1, - n_pages=3, - # min_price=500, - # max_price=2000 - ) - #df = scraper.run(raw_data=True, save=True, filepath="test.csv") - df = scraper.run(raw_data=True, save=True) - df.head() + # scraper = FundaScraper( + # area="amsterdam", + # want_to="buy", + # find_past=False, + # page_start=1, + # n_pages=3, + # # min_price=500, + # # max_price=2000 + # ) + # #df = scraper.run(raw_data=True, save=True, filepath="test.csv") + # df = scraper.run(raw_data=True, save=True) + # df.head() + + data_extractor = DataExtractor() + + data_extractor.extract_data(to_buy = True, find_past = False, raw_data = True, save = True, file_path = 'test.csv') From f00f98c9eaa207d8827eafcb083dcb9335385226 Mon Sep 17 00:00:00 2001 From: Mustafa Karakaya Date: Sat, 28 Sep 2024 20:39:52 +0200 Subject: [PATCH 04/17] further work on object-oriented setup --- funda_scraper/extract.py | 8 +- funda_scraper/filerepository.py | 23 ++++ funda_scraper/scrape.py | 222 +++----------------------------- main.py | 27 ++-- 4 files changed, 61 insertions(+), 219 deletions(-) diff --git a/funda_scraper/extract.py b/funda_scraper/extract.py index d04ea10..3480a1b 100644 --- a/funda_scraper/extract.py +++ b/funda_scraper/extract.py @@ -27,12 +27,12 @@ def __init__(self): self.selectors = config.css_selector self.raw_df = pd.DataFrame() self.clean_df = pd.DataFrame() + self.file_repo = FileRepository() - def extract_data(self, to_buy: bool, find_past: bool, raw_data: bool, save: bool, file_path: str): - file_repo = FileRepository() + def extract_data(self, to_buy: bool, find_past: bool, raw_data: bool, save: 
bool, file_path: str) -> pd.DataFrame: df = pd.DataFrame({key: [] for key in self.selectors.keys()}) - detail_pages = file_repo.get_detail_pages() + detail_pages = self.file_repo.get_detail_pages() for page in detail_pages: page_data = self.extract_data_from_page(page, to_buy, find_past) @@ -57,6 +57,8 @@ def extract_data(self, to_buy: bool, find_past: bool, raw_data: bool, save: bool if save: self.save_csv(df, file_path) + return df + def extract_data_from_page(self, page: str, to_buy: bool, find_past: bool): soup = BeautifulSoup(page, "lxml") diff --git a/funda_scraper/filerepository.py b/funda_scraper/filerepository.py index 5fe9e44..43aec36 100644 --- a/funda_scraper/filerepository.py +++ b/funda_scraper/filerepository.py @@ -30,6 +30,19 @@ def _ensure_dir(self, dir_name: str): if not os.path.exists(dir_name): os.makedirs(dir_name) + def get_list_pages(self) -> List[str]: + pages = [] + + for f in os.listdir(self.LISTPAGES_DIR): + file_path = os.path.join(self.LISTPAGES_DIR, f) + + if os.path.isfile(file_path): + with open(file_path, 'r') as file: + content = file.read() + pages.append(content) + + return pages + def get_detail_pages(self) -> List[str]: pages = [] @@ -43,5 +56,15 @@ def get_detail_pages(self) -> List[str]: return pages + def save_list_page(self, content: str, index: int): + with open(f'./data/listpages/listpage_{index}.html', 'w') as file: + file.write(content) + + def save_detail_page(self, content: str, index: int): + with open(f'./data/detailpages/detailpage_{index}.html', 'w') as file: + file.write(content) + + + diff --git a/funda_scraper/scrape.py b/funda_scraper/scrape.py index 01ecd8a..c23741e 100644 --- a/funda_scraper/scrape.py +++ b/funda_scraper/scrape.py @@ -18,6 +18,8 @@ from funda_scraper.config.core import config from funda_scraper.preprocess import clean_date_format, preprocess_data from funda_scraper.utils import logger +from funda_scraper.extract import DataExtractor +from funda_scraper.filerepository import FileRepository class FundaScraper(object): @@ -77,6 +79,9 @@ def __init__( self.base_url = config.base_url self.selectors = config.css_selector + self.file_repo = FileRepository() + self.data_extractor = DataExtractor() + def __repr__(self): return ( f"FundaScraper(area={self.area}, " @@ -138,18 +143,7 @@ def check_sort(self) -> str: "'floor_area_down', 'plot_area_down', 'city_up' or 'postal_code_up'. 
" ) - def _ensure_dir(self, dir_name: str): - if not os.path.exists(dir_name): - os.makedirs(dir_name) - - @staticmethod - def _check_dir() -> None: - """Ensures the existence of the directory for storing data.""" - if not os.path.exists("data"): - os.makedirs("data") - def _get_list_pages(self, page_start: int = None, n_pages: int = None) -> None: - self._ensure_dir('data/listpages') page_start = self.page_start if page_start is None else page_start n_pages = self.n_pages if n_pages is None else n_pages @@ -160,44 +154,33 @@ def _get_list_pages(self, page_start: int = None, n_pages: int = None) -> None: url = f"{main_url}&search_result={i}" response = requests.get(url, headers = config.header) - with open(f'./data/listpages/listpage_{i}.html', 'w') as file: - file.write(response.text) + self.file_repo.save_list_page(response.text, i) return def _get_detail_pages(self): - listpages_dir = 'data/listpages' - self._ensure_dir(listpages_dir) - self._ensure_dir('data/detailpages') - urls = [] - for f in os.listdir(listpages_dir): - file_path = os.path.join(listpages_dir, f) - - if os.path.isfile(file_path): - with open(file_path, 'r') as file: - content = file.read() - soup = BeautifulSoup(content, "lxml") - - script_tag = soup.find_all("script", {"type": "application/ld+json"})[0] - json_data = json.loads(script_tag.contents[0]) - item_list = [item["url"] for item in json_data["itemListElement"]] - urls += item_list + list_pages = self.file_repo.get_list_pages() + for page in list_pages: + soup = BeautifulSoup(page, "lxml") + script_tag = soup.find_all("script", {"type": "application/ld+json"})[0] + json_data = json.loads(script_tag.contents[0]) + item_list = [item["url"] for item in json_data["itemListElement"]] + urls += item_list urls = self.remove_duplicates(urls) fixed_urls = [self.fix_link(url) for url in urls] pools = mp.cpu_count() - content = process_map(self.scrape_one_link2, fixed_urls, max_workers=pools) + content = process_map(self.scrape_one_link, fixed_urls, max_workers=pools) for i, c in enumerate(content): - with open(f'./data/detailpages/detailpage_{i}.html', 'w') as file: - file.write(c) + self.file_repo.save_detail_page(c, i) - def scrape_one_link2(self, link: str) -> str: + def scrape_one_link(self, link: str) -> str: response = requests.get(link, headers=config.header) return response.text @@ -274,35 +257,6 @@ def fix_link(link: str) -> str: ) return fixed_link - def fetch_all_links(self, page_start: int = None, n_pages: int = None) -> None: - """Collects all available property links across multiple pages.""" - - page_start = self.page_start if page_start is None else page_start - n_pages = self.n_pages if n_pages is None else n_pages - - logger.info("*** Phase 1: Fetch all the available links from all pages *** ") - urls = [] - main_url = self._build_main_query_url() - - for i in tqdm(range(page_start, page_start + n_pages)): - try: - item_list = self._get_links_from_one_parent( - f"{main_url}&search_result={i}" - ) - urls += item_list - except IndexError: - self.page_end = i - logger.info(f"*** The last available page is {self.page_end} ***") - break - - urls = self.remove_duplicates(urls) - fixed_urls = [self.fix_link(url) for url in urls] - - logger.info( - f"*** Got all the urls. 
{len(fixed_urls)} houses found from {self.page_start} to {self.page_end} ***" - ) - self.links = fixed_urls - def _build_main_query_url(self) -> str: """Constructs the main query URL for the search.""" query = "koop" if self.to_buy else "huur" @@ -340,134 +294,8 @@ def _build_main_query_url(self) -> str: logger.info(f"*** Main URL: {main_url} ***") return main_url - @staticmethod - def get_value_from_css(soup: BeautifulSoup, selector: str) -> str: - """Extracts data from HTML using a CSS selector.""" - result = soup.select(selector) - if len(result) > 0: - result = result[0].text - else: - result = "na" - return result - - def save_file(self, file_name: str): - self._check_dir() - - - - def scrape_one_link(self, link: str) -> List[str]: - """Scrapes data from a single property link.""" - - # Initialize for each page - response = requests.get(link, headers=config.header) - - with open('./data/test.html', 'w') as file: - file.write(response.content) - - return - - soup = BeautifulSoup(response.text, "lxml") - - # Get the value according to respective CSS selectors - if self.to_buy: - if self.find_past: - list_since_selector = self.selectors.date_list - else: - list_since_selector = self.selectors.listed_since - else: - if self.find_past: - list_since_selector = ".fd-align-items-center:nth-child(9) span" - else: - list_since_selector = ".fd-align-items-center:nth-child(7) span" - - result = [ - link, - self.get_value_from_css(soup, self.selectors.price), - self.get_value_from_css(soup, self.selectors.address), - self.get_value_from_css(soup, self.selectors.descrip), - self.get_value_from_css(soup, list_since_selector), - self.get_value_from_css(soup, self.selectors.zip_code), - self.get_value_from_css(soup, self.selectors.size), - self.get_value_from_css(soup, self.selectors.year), - self.get_value_from_css(soup, self.selectors.living_area), - self.get_value_from_css(soup, self.selectors.kind_of_house), - self.get_value_from_css(soup, self.selectors.building_type), - self.get_value_from_css(soup, self.selectors.num_of_rooms), - self.get_value_from_css(soup, self.selectors.num_of_bathrooms), - self.get_value_from_css(soup, self.selectors.layout), - self.get_value_from_css(soup, self.selectors.energy_label), - self.get_value_from_css(soup, self.selectors.insulation), - self.get_value_from_css(soup, self.selectors.heating), - self.get_value_from_css(soup, self.selectors.ownership), - self.get_value_from_css(soup, self.selectors.exteriors), - self.get_value_from_css(soup, self.selectors.parking), - self.get_value_from_css(soup, self.selectors.neighborhood_name), - self.get_value_from_css(soup, self.selectors.date_list), - self.get_value_from_css(soup, self.selectors.date_sold), - self.get_value_from_css(soup, self.selectors.term), - self.get_value_from_css(soup, self.selectors.price_sold), - self.get_value_from_css(soup, self.selectors.last_ask_price), - self.get_value_from_css(soup, self.selectors.last_ask_price_m2).split("\r")[ - 0 - ], - ] - - # Deal with list_since_selector especially, since its CSS varies sometimes - if clean_date_format(result[4]) == "na": - for i in range(6, 16): - selector = f".fd-align-items-center:nth-child({i}) span" - update_list_since = self.get_value_from_css(soup, selector) - if clean_date_format(update_list_since) == "na": - pass - else: - result[4] = update_list_since - - photos_list = [ - p.get("data-lazy-srcset") for p in soup.select(self.selectors.photo) - ] - photos_string = ", ".join(photos_list) - - # Clean up the retried result from one page - result = 
[r.replace("\n", "").replace("\r", "").strip() for r in result] - result.append(photos_string) - return result - - def scrape_pages(self) -> None: - """Scrapes data from all collected property links.""" - - logger.info("*** Phase 2: Start scraping from individual links ***") - df = pd.DataFrame({key: [] for key in self.selectors.keys()}) - - # Scrape pages with multiprocessing to improve efficiency - # TODO: use asyncio instead - pools = mp.cpu_count() - content = process_map(self.scrape_one_link, self.links, max_workers=pools) - - for i, c in enumerate(content): - df.loc[len(df)] = c - - df["city"] = df["url"].map(lambda x: x.split("/")[4]) - df["log_id"] = datetime.datetime.now().strftime("%Y%m-%d%H-%M%S") - if not self.find_past: - df = df.drop(["term", "price_sold", "date_sold"], axis=1) - logger.info(f"*** All scraping done: {df.shape[0]} results ***") - self.raw_df = df - - def save_csv(self, df: pd.DataFrame, filepath: str = None) -> None: - """Saves the scraped data to a CSV file.""" - if filepath is None: - self._check_dir() - date = str(datetime.datetime.now().date()).replace("-", "") - status = "unavailable" if self.find_past else "unavailable" - want_to = "buy" if self.to_buy else "rent" - filepath = f"./data/houseprice_{date}_{self.area}_{want_to}_{status}_{len(self.links)}.csv" - df.to_csv(filepath, index=False) - logger.info(f"*** File saved: {filepath}. ***") - - def run( - self, raw_data: bool = False, save: bool = False, filepath: str = None - ) -> pd.DataFrame: + def run(self, raw_data: bool = False, save: bool = False, filepath: str = None) -> pd.DataFrame: """ Runs the full scraping process, optionally saving the results to a CSV file. @@ -478,22 +306,12 @@ def run( """ self._get_list_pages() self._get_detail_pages() - return - - self.fetch_all_links() - self.scrape_pages() - if raw_data: - df = self.raw_df - else: - logger.info("*** Cleaning data ***") - df = preprocess_data(df=self.raw_df, is_past=self.find_past) - self.clean_df = df - - if save: - self.save_csv(df, filepath) + df = self.data_extractor.extract_data(to_buy = self.to_buy, find_past = self.find_past, raw_data = raw_data + , save = save, file_path = filepath) logger.info("*** Done! 
***") + return df diff --git a/main.py b/main.py index 0cb125f..821abf0 100644 --- a/main.py +++ b/main.py @@ -4,22 +4,21 @@ if __name__ == "__main__": print('there u go again') - # scraper = FundaScraper( - # area="amsterdam", - # want_to="buy", - # find_past=False, - # page_start=1, - # n_pages=3, - # # min_price=500, - # # max_price=2000 - # ) + scraper = FundaScraper( + area="amsterdam", + want_to="buy", + find_past=False, + page_start=1, + n_pages=3, + # min_price=500, + # max_price=2000 + ) # #df = scraper.run(raw_data=True, save=True, filepath="test.csv") - # df = scraper.run(raw_data=True, save=True) - # df.head() + df = scraper.run(raw_data = True, save = True, filepath = "test.csv") + df.head() - data_extractor = DataExtractor() - - data_extractor.extract_data(to_buy = True, find_past = False, raw_data = True, save = True, file_path = 'test.csv') + # data_extractor = DataExtractor() + # data_extractor.extract_data(to_buy = True, find_past = False, raw_data = True, save = True, file_path = 'test.csv') From a339e7a8e75944eca9ea0cead660c22064d44fd0 Mon Sep 17 00:00:00 2001 From: Mustafa Karakaya Date: Sun, 29 Sep 2024 07:40:52 +0200 Subject: [PATCH 05/17] introduced run_id to separate files of different runs --- funda_scraper/extract.py | 5 +-- funda_scraper/filerepository.py | 62 ++++++++++++++++++++------------- funda_scraper/scrape.py | 17 +++++---- 3 files changed, 51 insertions(+), 33 deletions(-) diff --git a/funda_scraper/extract.py b/funda_scraper/extract.py index 3480a1b..588c71f 100644 --- a/funda_scraper/extract.py +++ b/funda_scraper/extract.py @@ -4,6 +4,7 @@ import json import multiprocessing as mp import os +import uuid from collections import OrderedDict from typing import List, Optional from urllib.parse import urlparse, urlunparse @@ -29,10 +30,10 @@ def __init__(self): self.clean_df = pd.DataFrame() self.file_repo = FileRepository() - def extract_data(self, to_buy: bool, find_past: bool, raw_data: bool, save: bool, file_path: str) -> pd.DataFrame: + def extract_data(self, to_buy: bool, find_past: bool, raw_data: bool, save: bool, file_path: str, run_id: str) -> pd.DataFrame: df = pd.DataFrame({key: [] for key in self.selectors.keys()}) - detail_pages = self.file_repo.get_detail_pages() + detail_pages = self.file_repo.get_detail_pages(run_id) for page in detail_pages: page_data = self.extract_data_from_page(page, to_buy, find_past) diff --git a/funda_scraper/filerepository.py b/funda_scraper/filerepository.py index 43aec36..29bd5a3 100644 --- a/funda_scraper/filerepository.py +++ b/funda_scraper/filerepository.py @@ -1,40 +1,33 @@ -import argparse -import datetime -import json -import multiprocessing as mp import os -from collections import OrderedDict -from typing import List, Optional from urllib.parse import urlparse, urlunparse - -import pandas as pd -import requests -from bs4 import BeautifulSoup -from tqdm import tqdm from tqdm.contrib.concurrent import process_map +from typing import List, Optional + from funda_scraper.config.core import config from funda_scraper.preprocess import clean_date_format, preprocess_data from funda_scraper.utils import logger class FileRepository(object): - LISTPAGES_DIR = 'data/listpages' - DETAILPAGES_DIR = 'data/detailpages' + DATA_DIR = "data" + LISTPAGES_DIR = 'listpages' + DETAILPAGES_DIR = 'detailpages' def __init__(self) -> None: - self._ensure_dir(self.LISTPAGES_DIR) - self._ensure_dir(self.DETAILPAGES_DIR) + self._ensure_dir(self.DATA_DIR) def _ensure_dir(self, dir_name: str): if not os.path.exists(dir_name): 
os.makedirs(dir_name) - def get_list_pages(self) -> List[str]: + def get_list_pages(self, run_id: str) -> List[str]: pages = [] - for f in os.listdir(self.LISTPAGES_DIR): - file_path = os.path.join(self.LISTPAGES_DIR, f) + list_pages_dir = self._get_list_pages_dir_name(run_id) + + for f in os.listdir(list_pages_dir): + file_path = os.path.join(list_pages_dir, f) if os.path.isfile(file_path): with open(file_path, 'r') as file: @@ -43,11 +36,13 @@ def get_list_pages(self) -> List[str]: return pages - def get_detail_pages(self) -> List[str]: + def get_detail_pages(self, run_id: str) -> List[str]: pages = [] - for f in os.listdir(self.DETAILPAGES_DIR): - file_path = os.path.join(self.DETAILPAGES_DIR, f) + detail_pages_dir = self._get_detail_pages_dir_name(run_id) + + for f in os.listdir(detail_pages_dir): + file_path = os.path.join(detail_pages_dir, f) if os.path.isfile(file_path): with open(file_path, 'r') as file: @@ -56,14 +51,31 @@ def get_detail_pages(self) -> List[str]: return pages - def save_list_page(self, content: str, index: int): - with open(f'./data/listpages/listpage_{index}.html', 'w') as file: + def save_list_page(self, content: str, index: int, run_id: str): + list_pages_dir = self._get_list_pages_dir_name(run_id) + self._ensure_dir(list_pages_dir) + + file_path = os.path.join(list_pages_dir, f"listpage_{index}.html") + + with open(file_path, 'w') as file: file.write(content) - def save_detail_page(self, content: str, index: int): - with open(f'./data/detailpages/detailpage_{index}.html', 'w') as file: + def save_detail_page(self, content: str, index: int, run_id: str): + detail_pages_dir = self._get_detail_pages_dir_name(run_id) + self._ensure_dir(detail_pages_dir) + + file_path = os.path.join(detail_pages_dir, f"detailpage_{index}.html") + + with open(file_path, 'w') as file: file.write(content) + def _get_list_pages_dir_name(self, run_id: str): + return os.path.join(self.DATA_DIR, run_id, self.LISTPAGES_DIR) + + def _get_detail_pages_dir_name(self, run_id: str): + return os.path.join(self.DATA_DIR, run_id, self.DETAILPAGES_DIR) + + diff --git a/funda_scraper/scrape.py b/funda_scraper/scrape.py index c23741e..dc636f5 100644 --- a/funda_scraper/scrape.py +++ b/funda_scraper/scrape.py @@ -5,6 +5,7 @@ import json import multiprocessing as mp import os +import uuid from collections import OrderedDict from typing import List, Optional from urllib.parse import urlparse, urlunparse @@ -79,6 +80,8 @@ def __init__( self.base_url = config.base_url self.selectors = config.css_selector + self.run_id = str(uuid.uuid1()) + self.file_repo = FileRepository() self.data_extractor = DataExtractor() @@ -154,14 +157,14 @@ def _get_list_pages(self, page_start: int = None, n_pages: int = None) -> None: url = f"{main_url}&search_result={i}" response = requests.get(url, headers = config.header) - self.file_repo.save_list_page(response.text, i) + self.file_repo.save_list_page(response.text, i, self.run_id) return def _get_detail_pages(self): urls = [] - list_pages = self.file_repo.get_list_pages() + list_pages = self.file_repo.get_list_pages(self.run_id) for page in list_pages: soup = BeautifulSoup(page, "lxml") @@ -177,7 +180,7 @@ def _get_detail_pages(self): content = process_map(self.scrape_one_link, fixed_urls, max_workers=pools) for i, c in enumerate(content): - self.file_repo.save_detail_page(c, i) + self.file_repo.save_detail_page(c, i, self.run_id) def scrape_one_link(self, link: str) -> str: @@ -294,6 +297,9 @@ def _build_main_query_url(self) -> str: logger.info(f"*** Main URL: {main_url} 
***")
         return main_url
 
+    def _get_pages(self):
+        self._get_list_pages()
+        self._get_detail_pages()
 
     def run(self, raw_data: bool = False, save: bool = False, filepath: str = None) -> pd.DataFrame:
         """
@@ -304,11 +310,10 @@ def run(self, raw_data: bool = False, save: bool = False, filepath: str = None)
         :param filepath: the name for the file
         :return: the (pre-processed) dataframe from scraping
         """
-        self._get_list_pages()
-        self._get_detail_pages()
+        self._get_pages()
 
         df = self.data_extractor.extract_data(to_buy = self.to_buy, find_past = self.find_past, raw_data = raw_data
-                                              , save = save, file_path = filepath)
+                                              , save = save, file_path = filepath, run_id = self.run_id)
 
         logger.info("*** Done! ***")

From dd25067655d25336ca1a70d4bca76a410b4267cd Mon Sep 17 00:00:00 2001
From: Mustafa Karakaya
Date: Sun, 29 Sep 2024 13:51:50 +0200
Subject: [PATCH 06/17] save resultfile in same folder as the html files

---
 funda_scraper/extract.py        | 17 +----------------
 funda_scraper/filerepository.py |  8 ++++++++
 2 files changed, 9 insertions(+), 16 deletions(-)

diff --git a/funda_scraper/extract.py b/funda_scraper/extract.py
index 588c71f..4433f26 100644
--- a/funda_scraper/extract.py
+++ b/funda_scraper/extract.py
@@ -53,10 +53,8 @@ def extract_data(self, to_buy: bool, find_past: bool, raw_data: bool, save: bool
             df = preprocess_data(df=self.raw_df, is_past=self.find_past)
             self.clean_df = df
 
-        print(df)
-
-        if save:
-            self.save_csv(df, file_path)
+        if save:
+            self.file_repo.save_result_file(df, run_id)
 
         return df
 
@@ -133,19 +131,6 @@ def extract_data_from_page(self, page: str, to_buy: bool, find_past: bool):
         result.append(photos_string)
         return result
 
-
-    def save_csv(self, df: pd.DataFrame, filepath: str = None) -> None:
-        """Saves the scraped data to a CSV file."""
-        if filepath is None:
-            self._check_dir()
-            date = str(datetime.datetime.now().date()).replace("-", "")
-            status = "unavailable" if self.find_past else "unavailable"
-            want_to = "buy" if self.to_buy else "rent"
-            filepath = f"./data/houseprice_{date}_{self.area}_{want_to}_{status}_{len(self.links)}.csv"
-        df.to_csv(filepath, index=False)
-        logger.info(f"*** File saved: {filepath}. ***")
-
-
     @staticmethod
     def get_value_from_css(soup: BeautifulSoup, selector: str) -> str:
         """Extracts data from HTML using a CSS selector."""

diff --git a/funda_scraper/filerepository.py b/funda_scraper/filerepository.py
index 29bd5a3..4313ca8 100644
--- a/funda_scraper/filerepository.py
+++ b/funda_scraper/filerepository.py
@@ -1,4 +1,5 @@
 import os
+import pandas as pd
 from urllib.parse import urlparse, urlunparse
 
 from tqdm.contrib.concurrent import process_map
@@ -69,6 +70,13 @@ def save_detail_page(self, content: str, index: int, run_id: str):
         file_path = os.path.join(detail_pages_dir, f"detailpage_{index}.html")
 
         with open(file_path, 'w') as file:
             file.write(content)
 
+    def save_result_file(self, df: pd.DataFrame, run_id: str):
+        """Saves the scraped data to a CSV file."""
+        file_path = os.path.join(self.DATA_DIR, run_id, "result.csv")
+
+        df.to_csv(file_path, index=False)
+        logger.info(f"*** File saved: {file_path}. ***")
+
     def _get_list_pages_dir_name(self, run_id: str):
         return os.path.join(self.DATA_DIR, run_id, self.LISTPAGES_DIR)

From 69bb2ff9fc1ff79043358efa50f015969da7e51a Mon Sep 17 00:00:00 2001
From: Mustafa Karakaya
Date: Sun, 29 Sep 2024 14:42:24 +0200
Subject: [PATCH 07/17] use params object instead of passing separate params

---
 funda_scraper/extract.py       |  23 +++--
 funda_scraper/scrape.py        | 175 +++++----------------------
 funda_scraper/searchrequest.py | 149 ++++++++++++++++++++++
 main.py                        |  29 +++---
 4 files changed, 200 insertions(+), 176 deletions(-)
 create mode 100644 funda_scraper/searchrequest.py

diff --git a/funda_scraper/extract.py b/funda_scraper/extract.py
index 4433f26..667cad3 100644
--- a/funda_scraper/extract.py
+++ b/funda_scraper/extract.py
@@ -18,8 +18,8 @@
 from funda_scraper.config.core import config
 from funda_scraper.preprocess import clean_date_format, preprocess_data
 from funda_scraper.utils import logger
-
-from funda_scraper.filerepository import FileRepository
+from funda_scraper.filerepository import FileRepository
+from funda_scraper.searchrequest import SearchRequest
 
 
 class DataExtractor(object):
@@ -30,36 +30,35 @@ def __init__(self):
         self.selectors = config.css_selector
         self.raw_df = pd.DataFrame()
         self.clean_df = pd.DataFrame()
         self.file_repo = FileRepository()
 
-    def extract_data(self, to_buy: bool, find_past: bool, raw_data: bool, save: bool, file_path: str, run_id: str) -> pd.DataFrame:
+    def extract_data(self, search_request: SearchRequest, run_id: str, clean_data: bool) -> pd.DataFrame:
 
         df = pd.DataFrame({key: [] for key in self.selectors.keys()})
         detail_pages = self.file_repo.get_detail_pages(run_id)
 
         for page in detail_pages:
-            page_data = self.extract_data_from_page(page, to_buy, find_past)
+            page_data = self.extract_data_from_page(page, search_request)
             df.loc[len(df)] = page_data
 
         df["city"] = df["url"].map(lambda x: x.split("/")[4])
         df["log_id"] = datetime.datetime.now().strftime("%Y%m-%d%H-%M%S")
-        if not find_past:
+        if not search_request.find_past:
             df = df.drop(["term", "price_sold", "date_sold"], axis=1)
         logger.info(f"*** All scraping done: {df.shape[0]} results ***")
         self.raw_df = df
 
-        if raw_data:
+        if not clean_data:
             df = self.raw_df
         else:
             logger.info("*** Cleaning data ***")
-            df = preprocess_data(df=self.raw_df, is_past=self.find_past)
+            df = preprocess_data(df = self.raw_df, is_past = search_request.find_past)
             self.clean_df = df
 
-        if save:
-            self.file_repo.save_result_file(df, run_id)
+        self.file_repo.save_result_file(df, run_id)
 
         return df
 
-    def extract_data_from_page(self, page: str, to_buy: bool, find_past: bool):
+    def extract_data_from_page(self, page: str, search_request: SearchRequest):
         soup = BeautifulSoup(page, "lxml")
 
         script_tag = soup.find_all("script", {"type": "application/ld+json"})[0]
@@ -68,13 +67,13 @@ def extract_data_from_page(self, page: str, to_buy: bool, find_past: bool):
         link = json_data["url"]
 
         # Get the value according to respective CSS selectors
-        if to_buy:
-            if find_past:
+        if search_request.to_buy:
+            if search_request.find_past:
                 list_since_selector = self.selectors.date_list
             else:
                 list_since_selector = self.selectors.listed_since
         else:
-            if find_past:
+            if search_request.find_past:
                 list_since_selector = ".fd-align-items-center:nth-child(9) span"
             else:
                 list_since_selector = ".fd-align-items-center:nth-child(7) span"

diff --git a/funda_scraper/scrape.py b/funda_scraper/scrape.py
index dc636f5..0dcc0b6 100644
--- a/funda_scraper/scrape.py
+++ b/funda_scraper/scrape.py
@@ -21,6 +21,7 @@
 from funda_scraper.config.core import config
 from funda_scraper.preprocess import clean_date_format, preprocess_data
 from funda_scraper.utils import logger
 from funda_scraper.extract import DataExtractor
 from funda_scraper.filerepository import FileRepository
+from funda_scraper.searchrequest import SearchRequest class FundaScraper(object): @@ -28,21 +29,7 @@ class FundaScraper(object): A class used to scrape real estate data from the Funda website. """ - def __init__( - self, - area: str, - want_to: str, - page_start: int = 1, - n_pages: int = 1, - find_past: bool = False, - min_price: Optional[int] = None, - max_price: Optional[int] = None, - days_since: Optional[int] = None, - property_type: Optional[str] = None, - min_floor_area: Optional[str] = None, - max_floor_area: Optional[str] = None, - sort: Optional[str] = None, - ): + def __init__(self, search_request): """ :param area: The area to search for properties, formatted for URL compatibility. @@ -58,109 +45,39 @@ def __init__( :param max_floor_area: The maximum floor area for the property search. :param sort: The sorting criterion for the search results. """ - # Init attributes - self.area = area.lower().replace(" ", "-") - self.property_type = property_type - self.want_to = want_to - self.find_past = find_past - self.page_start = max(page_start, 1) - self.n_pages = max(n_pages, 1) - self.page_end = self.page_start + self.n_pages - 1 - self.min_price = min_price - self.max_price = max_price - self.days_since = days_since - self.min_floor_area = min_floor_area - self.max_floor_area = max_floor_area - self.sort = sort + self.search_request = search_request # Instantiate along the way self.links: List[str] = [] self.raw_df = pd.DataFrame() self.clean_df = pd.DataFrame() self.base_url = config.base_url - self.selectors = config.css_selector self.run_id = str(uuid.uuid1()) self.file_repo = FileRepository() self.data_extractor = DataExtractor() + def __repr__(self): - return ( - f"FundaScraper(area={self.area}, " - f"want_to={self.want_to}, " - f"n_pages={self.n_pages}, " - f"page_start={self.page_start}, " - f"find_past={self.find_past}, " - f"min_price={self.min_price}, " - f"max_price={self.max_price}, " - f"days_since={self.days_since}, " - f"min_floor_area={self.min_floor_area}, " - f"max_floor_area={self.max_floor_area}, " - f"find_past={self.find_past})" - f"min_price={self.min_price})" - f"max_price={self.max_price})" - f"days_since={self.days_since})" - f"sort={self.sort})" - ) + return str(self.search_request) - @property - def to_buy(self) -> bool: - """Determines if the search is for buying or renting properties.""" - if self.want_to.lower() in ["buy", "koop", "b", "k"]: - return True - elif self.want_to.lower() in ["rent", "huur", "r", "h"]: - return False - else: - raise ValueError("'want_to' must be either 'buy' or 'rent'.") - - @property - def check_days_since(self) -> int: - """Validates the 'days_since' attribute.""" - if self.find_past: - raise ValueError("'days_since' can only be specified when find_past=False.") - - if self.days_since in [None, 1, 3, 5, 10, 30]: - return self.days_since - else: - raise ValueError("'days_since' must be either None, 1, 3, 5, 10 or 30.") - - @property - def check_sort(self) -> str: - """Validates the 'sort' attribute.""" - if self.sort in [ - None, - "relevancy", - "date_down", - "date_up", - "price_up", - "price_down", - "floor_area_down", - "plot_area_down", - "city_up" "postal_code_up", - ]: - return self.sort - else: - raise ValueError( - "'sort' must be either None, 'relevancy', 'date_down', 'date_up', 'price_up', 'price_down', " - "'floor_area_down', 'plot_area_down', 'city_up' or 'postal_code_up'. 
" - ) def _get_list_pages(self, page_start: int = None, n_pages: int = None) -> None: - page_start = self.page_start if page_start is None else page_start - n_pages = self.n_pages if n_pages is None else n_pages + page_start = self.search_request.page_start if page_start is None else page_start + n_pages = self.search_request.n_pages if n_pages is None else n_pages main_url = self._build_main_query_url() for i in tqdm(range(page_start, page_start + n_pages)): url = f"{main_url}&search_result={i}" response = requests.get(url, headers = config.header) - self.file_repo.save_list_page(response.text, i, self.run_id) return + def _get_detail_pages(self): urls = [] @@ -199,47 +116,6 @@ def _get_links_from_one_parent(url: str) -> List[str]: urls = [item["url"] for item in json_data["itemListElement"]] return urls - def reset( - self, - area: Optional[str] = None, - property_type: Optional[str] = None, - want_to: Optional[str] = None, - page_start: Optional[int] = None, - n_pages: Optional[int] = None, - find_past: Optional[bool] = None, - min_price: Optional[int] = None, - max_price: Optional[int] = None, - days_since: Optional[int] = None, - min_floor_area: Optional[str] = None, - max_floor_area: Optional[str] = None, - sort: Optional[str] = None, - ) -> None: - """Resets or initializes the search parameters.""" - if area is not None: - self.area = area - if property_type is not None: - self.property_type = property_type - if want_to is not None: - self.want_to = want_to - if page_start is not None: - self.page_start = max(page_start, 1) - if n_pages is not None: - self.n_pages = max(n_pages, 1) - if find_past is not None: - self.find_past = find_past - if min_price is not None: - self.min_price = min_price - if max_price is not None: - self.max_price = max_price - if days_since is not None: - self.days_since = days_since - if min_floor_area is not None: - self.min_floor_area = min_floor_area - if max_floor_area is not None: - self.max_floor_area = max_floor_area - if sort is not None: - self.sort = sort - @staticmethod def remove_duplicates(lst: List[str]) -> List[str]: """Removes duplicate links from a list.""" @@ -262,37 +138,37 @@ def fix_link(link: str) -> str: def _build_main_query_url(self) -> str: """Constructs the main query URL for the search.""" - query = "koop" if self.to_buy else "huur" + query = "koop" if self.search_request.to_buy else "huur" main_url = ( - f"{self.base_url}/zoeken/{query}?selected_area=%5B%22{self.area}%22%5D" + f"{self.base_url}/zoeken/{query}?selected_area=%5B%22{self.search_request.area}%22%5D" ) - if self.property_type: - property_types = self.property_type.split(",") + if self.search_request.property_type: + property_types = self.search_request.property_type.split(",") formatted_property_types = [ "%22" + prop_type + "%22" for prop_type in property_types ] main_url += f"&object_type=%5B{','.join(formatted_property_types)}%5D" - if self.find_past: + if self.search_request.find_past: main_url = f'{main_url}&availability=%5B"unavailable"%5D' - if self.min_price is not None or self.max_price is not None: - min_price = "" if self.min_price is None else self.min_price - max_price = "" if self.max_price is None else self.max_price + if self.search_request.min_price is not None or self.search_request.max_price is not None: + min_price = "" if self.search_request.min_price is None else self.search_request.min_price + max_price = "" if self.search_request.max_price is None else self.search_request.max_price main_url = f"{main_url}&price=%22{min_price}-{max_price}%22" - 
if self.days_since is not None: - main_url = f"{main_url}&publication_date={self.check_days_since}" + if self.search_request.days_since is not None: + main_url = f"{main_url}&publication_date={self.search_request.check_days_since}" - if self.min_floor_area or self.max_floor_area: - min_floor_area = "" if self.min_floor_area is None else self.min_floor_area - max_floor_area = "" if self.max_floor_area is None else self.max_floor_area + if self.search_request.min_floor_area or self.search_request.max_floor_area: + min_floor_area = "" if self.search_request.min_floor_area is None else self.search_request.min_floor_area + max_floor_area = "" if self.search_request.max_floor_area is None else self.search_request.max_floor_area main_url = f"{main_url}&floor_area=%22{min_floor_area}-{max_floor_area}%22" - if self.sort is not None: - main_url = f"{main_url}&sort=%22{self.check_sort}%22" + if self.search_request.sort is not None: + main_url = f"{main_url}&sort=%22{self.search_request.sort_by}%22" logger.info(f"*** Main URL: {main_url} ***") return main_url @@ -301,7 +177,7 @@ def _get_pages(self): self._get_list_pages() self._get_detail_pages() - def run(self, raw_data: bool = False, save: bool = False, filepath: str = None) -> pd.DataFrame: + def run(self, clean_data: bool = False) -> pd.DataFrame: """ Runs the full scraping process, optionally saving the results to a CSV file. @@ -312,8 +188,7 @@ def run(self, raw_data: bool = False, save: bool = False, filepath: str = None) """ self._get_pages() - df = self.data_extractor.extract_data(to_buy = self.to_buy, find_past = self.find_past, raw_data = raw_data - , save = save, file_path = filepath, run_id = self.run_id) + df = self.data_extractor.extract_data(self.search_request, self.run_id, clean_data) logger.info("*** Done! ***") diff --git a/funda_scraper/searchrequest.py b/funda_scraper/searchrequest.py new file mode 100644 index 0000000..189e404 --- /dev/null +++ b/funda_scraper/searchrequest.py @@ -0,0 +1,149 @@ +from collections import OrderedDict +from typing import List, Optional + +class SearchRequest(object): + + def __init__( + self, + area: str, + want_to: str, + page_start: int = 1, + n_pages: int = 1, + find_past: bool = False, + min_price: Optional[int] = None, + max_price: Optional[int] = None, + days_since: Optional[int] = None, + property_type: Optional[str] = None, + min_floor_area: Optional[str] = None, + max_floor_area: Optional[str] = None, + sort: Optional[str] = None, + ): + """ + + :param area: The area to search for properties, formatted for URL compatibility. + :param want_to: Specifies whether the user wants to buy or rent properties. + :param page_start: The starting page number for the search. + :param n_pages: The number of pages to scrape. + :param find_past: Flag to indicate whether to find past listings. + :param min_price: The minimum price for the property search. + :param max_price: The maximum price for the property search. + :param days_since: The maximum number of days since the listing was published. + :param property_type: The type of property to search for. + :param min_floor_area: The minimum floor area for the property search. + :param max_floor_area: The maximum floor area for the property search. + :param sort: The sorting criterion for the search results. 
+ """ + # Init attributes + self.area = area.lower().replace(" ", "-") + self.property_type = property_type + self.want_to = want_to + self.find_past = find_past + self.page_start = max(page_start, 1) + self.n_pages = max(n_pages, 1) + self.page_end = self.page_start + self.n_pages - 1 + self.min_price = min_price + self.max_price = max_price + self.days_since = days_since + self.min_floor_area = min_floor_area + self.max_floor_area = max_floor_area + self.sort = sort + + def __repr__(self): + return ( + f"FundaScraper(area={self.area}, " + f"want_to={self.want_to}, " + f"n_pages={self.n_pages}, " + f"page_start={self.page_start}, " + f"find_past={self.find_past}, " + f"min_price={self.min_price}, " + f"max_price={self.max_price}, " + f"days_since={self.days_since}, " + f"min_floor_area={self.min_floor_area}, " + f"max_floor_area={self.max_floor_area}, " + f"find_past={self.find_past})" + f"min_price={self.min_price})" + f"max_price={self.max_price})" + f"days_since={self.days_since})" + f"sort={self.sort})" + ) + + @property + def to_buy(self) -> bool: + """Determines if the search is for buying or renting properties.""" + if self.want_to.lower() in ["buy", "koop", "b", "k"]: + return True + elif self.want_to.lower() in ["rent", "huur", "r", "h"]: + return False + else: + raise ValueError("'want_to' must be either 'buy' or 'rent'.") + + @property + def check_days_since(self) -> int: + """Validates the 'days_since' attribute.""" + if self.find_past: + raise ValueError("'days_since' can only be specified when find_past=False.") + + if self.days_since in [None, 1, 3, 5, 10, 30]: + return self.days_since + else: + raise ValueError("'days_since' must be either None, 1, 3, 5, 10 or 30.") + + @property + def sort_by(self) -> str: + """Validates the 'sort' attribute.""" + if self.sort in [ + None, + "relevancy", + "date_down", + "date_up", + "price_up", + "price_down", + "floor_area_down", + "plot_area_down", + "city_up", + "postal_code_up" + ]: + return self.sort + else: + return None + + def reset( + self, + area: Optional[str] = None, + property_type: Optional[str] = None, + want_to: Optional[str] = None, + page_start: Optional[int] = None, + n_pages: Optional[int] = None, + find_past: Optional[bool] = None, + min_price: Optional[int] = None, + max_price: Optional[int] = None, + days_since: Optional[int] = None, + min_floor_area: Optional[str] = None, + max_floor_area: Optional[str] = None, + sort: Optional[str] = None, + ) -> None: + """Resets or initializes the search parameters.""" + if area is not None: + self.area = area + if property_type is not None: + self.property_type = property_type + if want_to is not None: + self.want_to = want_to + if page_start is not None: + self.page_start = max(page_start, 1) + if n_pages is not None: + self.n_pages = max(n_pages, 1) + if find_past is not None: + self.find_past = find_past + if min_price is not None: + self.min_price = min_price + if max_price is not None: + self.max_price = max_price + if days_since is not None: + self.days_since = days_since + if min_floor_area is not None: + self.min_floor_area = min_floor_area + if max_floor_area is not None: + self.max_floor_area = max_floor_area + if sort is not None: + self.sort = sort \ No newline at end of file diff --git a/main.py b/main.py index 821abf0..7be6d1e 100644 --- a/main.py +++ b/main.py @@ -1,24 +1,25 @@ from funda_scraper.scrape import FundaScraper from funda_scraper.extract import DataExtractor +from funda_scraper.searchrequest import SearchRequest if __name__ == "__main__": - 
print('there u go again') - scraper = FundaScraper( - area="amsterdam", - want_to="buy", - find_past=False, - page_start=1, - n_pages=3, - # min_price=500, - # max_price=2000 - ) - # #df = scraper.run(raw_data=True, save=True, filepath="test.csv") - df = scraper.run(raw_data = True, save = True, filepath = "test.csv") + search_params = SearchRequest( + area = "Amsterdam", + want_to = "buy", + find_past = False, + page_start = 1, + n_pages = 3, + # min_price=500, + # max_price=2000 + ) + + scraper = FundaScraper(search_params) + df = scraper.run(clean_data = False) df.head() - # data_extractor = DataExtractor() - # data_extractor.extract_data(to_buy = True, find_past = False, raw_data = True, save = True, file_path = 'test.csv') + #data_extractor = DataExtractor() + #data_extractor.extract_data(search_params, run_id = "14431b3e-7e59-11ef-a3d4-a0510ba6104e", clean_data = False) From a1b04fe9b7edeac0276660cc09cdf4e202ecfe30 Mon Sep 17 00:00:00 2001 From: Mustafa Karakaya Date: Sun, 29 Sep 2024 15:16:47 +0200 Subject: [PATCH 08/17] rename n_pages to number_of_pages --- README.md | 42 +++++++++++++++++++--------------- funda_scraper/scrape.py | 39 ++++++++++++++----------------- funda_scraper/searchrequest.py | 16 ++++++------- main.py | 2 +- tests/test_scrape.py | 20 ++++++++-------- 5 files changed, 60 insertions(+), 59 deletions(-) diff --git a/README.md b/README.md index 24f361b..4ddc7c6 100644 --- a/README.md +++ b/README.md @@ -10,50 +10,58 @@ `FundaScaper` provides the easiest way to perform web scraping on Funda, the Dutch housing website. You can find houses either for sale or for rent, and access historical data from the past few years. Please note: + 1. Scraping this website is ONLY allowed for personal use (as per Funda's Terms and Conditions). 2. Any commercial use of this Python package is prohibited. The author holds no liability for any misuse of the package. ## Install + ### Install with pip: + ``` pip install funda-scraper ``` + ### Clone the repository: + ``` git clone https://github.com/whchien/funda-scraper.git cd funda-scraper export PYTHONPATH=${PWD} -python funda_scraper/scrape.py --area amsterdam --want_to rent --page_start 1 --n_pages 3 --save +python funda_scraper/scrape.py --area amsterdam --want_to rent --page_start 1 --number_of_pages 3 --save ``` -## Quickstart +## Quickstart + ``` from funda_scraper import FundaScraper scraper = FundaScraper( - area="amsterdam", - want_to="rent", - find_past=False, - page_start=1, - n_pages=3, - min_price=500, + area="amsterdam", + want_to="rent", + find_past=False, + page_start=1, + number_of_pages=3, + min_price=500, max_price=2000 ) df = scraper.run(raw_data=False, save=True, filepath="test.csv") df.head() ``` -![image](static/example_df.png) +![image](static/example_df.png) -* Note for Windows Users: Please add `if __name__ == "__main__":` before your script. +- Note for Windows Users: Please add `if __name__ == "__main__":` before your script. ## Customizing Your Scraping + You can pass several arguments to `FundaScraper()` for customized scraping: + - `area`: Specify the city or specific area you want to look for, e.g. Amsterdam, Utrecht, Rotterdam, etc. - `want_to`: Choose either `buy` or `rent` to find houses either for sale or for rent. - `find_past`: Set to `True` to find historical data; the default is `False`. -- `page_start`: Indicate which page to start scraping from; the default is `1`. -- `n_pages`: Indicate how many pages to scrape; the default is `1`. 
+- `page_start`: Indicate which page to start scraping from; the default is `1`. +- `number_of_pages`: Indicate how many pages to scrape; the default is `1`. - `min_price`: Indicate the lowest budget amount. - `max_price`: Indicate the highest budget amount. - `min_floor_area`: Indicate the minimum floor area. @@ -62,14 +70,14 @@ You can pass several arguments to `FundaScraper()` for customized scraping: - `property_type`: Specify the desired property type(s). - `sort`: Specify sorting criteria. - The scraped raw result contains following information: + - url - price - address - description - listed_since -- zip_code +- zip_code - size - year_built - living_area @@ -95,10 +103,8 @@ The scraped raw result contains following information: To fetch the data without preprocessing, specify `scraper.run(raw_data=True)`. -*Note*: Information regarding listing dates is no longer available since Q4 2023. Funda requires users to log in to see this information. - +_Note_: Information regarding listing dates is no longer available since Q4 2023. Funda requires users to log in to see this information. ## More information -Check the [example notebook](https://colab.research.google.com/drive/1hNzJJRWxD59lrbeDpfY1OUpBz0NktmfW?usp=sharing) for further details. If you find this project helpful, please give it a [star](https://github.com/whchien/funda-scraper). - +Check the [example notebook](https://colab.research.google.com/drive/1hNzJJRWxD59lrbeDpfY1OUpBz0NktmfW?usp=sharing) for further details. If you find this project helpful, please give it a [star](https://github.com/whchien/funda-scraper). diff --git a/funda_scraper/scrape.py b/funda_scraper/scrape.py index 0dcc0b6..e6cb176 100644 --- a/funda_scraper/scrape.py +++ b/funda_scraper/scrape.py @@ -32,22 +32,10 @@ class FundaScraper(object): def __init__(self, search_request): """ - :param area: The area to search for properties, formatted for URL compatibility. - :param want_to: Specifies whether the user wants to buy or rent properties. - :param page_start: The starting page number for the search. - :param n_pages: The number of pages to scrape. - :param find_past: Flag to indicate whether to find past listings. - :param min_price: The minimum price for the property search. - :param max_price: The maximum price for the property search. - :param days_since: The maximum number of days since the listing was published. - :param property_type: The type of property to search for. - :param min_floor_area: The minimum floor area for the property search. - :param max_floor_area: The maximum floor area for the property search. - :param sort: The sorting criterion for the search results. 
+        :param search_request: The parameters for the search
         """
         self.search_request = search_request
 
-        # Instantiate along the way
         self.links: List[str] = []
         self.raw_df = pd.DataFrame()
         self.clean_df = pd.DataFrame()
@@ -63,14 +51,14 @@ def __repr__(self):
         return str(self.search_request)
 
 
-    def _get_list_pages(self, page_start: int = None, n_pages: int = None) -> None:
+    def _get_list_pages(self, page_start: int = None, number_of_pages: int = None) -> None:
 
         page_start = self.search_request.page_start if page_start is None else page_start
-        n_pages = self.search_request.n_pages if n_pages is None else n_pages
+        number_of_pages = self.search_request.number_of_pages if number_of_pages is None else number_of_pages
 
         main_url = self._build_main_query_url()
 
-        for i in tqdm(range(page_start, page_start + n_pages)):
+        for i in tqdm(range(page_start, page_start + number_of_pages)):
             url = f"{main_url}&search_result={i}"
             response = requests.get(url, headers = config.header)
             self.file_repo.save_list_page(response.text, i, self.run_id)
@@ -116,11 +104,13 @@ def _get_links_from_one_parent(url: str) -> List[str]:
         urls = [item["url"] for item in json_data["itemListElement"]]
         return urls
 
+
     @staticmethod
     def remove_duplicates(lst: List[str]) -> List[str]:
         """Removes duplicate links from a list."""
         return list(OrderedDict.fromkeys(lst))
 
+
    @staticmethod
     def fix_link(link: str) -> str:
         """Fixes a given property link to ensure proper URL formatting."""
@@ -136,6 +126,7 @@ def fix_link(link: str) -> str:
         )
         return fixed_link
 
+
     def _build_main_query_url(self) -> str:
         """Constructs the main query URL for the search."""
         query = "koop" if self.search_request.to_buy else "huur"
@@ -173,21 +164,25 @@ def _build_main_query_url(self) -> str:
         logger.info(f"*** Main URL: {main_url} ***")
         return main_url
 
+
     def _get_pages(self):
         self._get_list_pages()
         self._get_detail_pages()
 
+
     def run(self, clean_data: bool = False) -> pd.DataFrame:
         """
-        Runs the full scraping process, optionally saving the results to a CSV file.
+        Runs the full scraping process, saving the results to a CSV file.
 
-        :param raw_data: if true, the data won't be pre-processed
-        :param save: if true, the data will be saved as a csv file
-        :param filepath: the name for the file
+        :param clean_data: if true, the scraped data will be pre-processed (cleaned) before it is returned
         :return: the (pre-processed) dataframe from scraping
         """
+        logger.info(f"Started scraping, run_id: {self.run_id}")
+
+        logger.info("Fetching pages..")
         self._get_pages()
 
+        logger.info("Extracting data from the html pages")
         df = self.data_extractor.extract_data(self.search_request, self.run_id, clean_data)
 
         logger.info("*** Done!
***") @@ -219,7 +214,7 @@ def run(self, clean_data: bool = False) -> pd.DataFrame: "--page_start", type=int, help="Specify which page to start scraping", default=1 ) parser.add_argument( - "--n_pages", type=int, help="Specify how many pages to scrape", default=1 + "--number_of_pages", type=int, help="Specify how many pages to scrape", default=1 ) parser.add_argument( "--min_price", type=int, help="Specify the min price", default=None @@ -267,7 +262,7 @@ def run(self, clean_data: bool = False) -> pd.DataFrame: want_to=args.want_to, find_past=args.find_past, page_start=args.page_start, - n_pages=args.n_pages, + number_of_pages=args.number_of_pages, min_price=args.min_price, max_price=args.max_price, days_since=args.days_since, diff --git a/funda_scraper/searchrequest.py b/funda_scraper/searchrequest.py index 189e404..1167bd7 100644 --- a/funda_scraper/searchrequest.py +++ b/funda_scraper/searchrequest.py @@ -8,7 +8,7 @@ def __init__( area: str, want_to: str, page_start: int = 1, - n_pages: int = 1, + number_of_pages: int = 1, find_past: bool = False, min_price: Optional[int] = None, max_price: Optional[int] = None, @@ -23,7 +23,7 @@ def __init__( :param area: The area to search for properties, formatted for URL compatibility. :param want_to: Specifies whether the user wants to buy or rent properties. :param page_start: The starting page number for the search. - :param n_pages: The number of pages to scrape. + :param number_of_pages: The number of pages to scrape. :param find_past: Flag to indicate whether to find past listings. :param min_price: The minimum price for the property search. :param max_price: The maximum price for the property search. @@ -39,8 +39,8 @@ def __init__( self.want_to = want_to self.find_past = find_past self.page_start = max(page_start, 1) - self.n_pages = max(n_pages, 1) - self.page_end = self.page_start + self.n_pages - 1 + self.number_of_pages = max(number_of_pages, 1) + self.page_end = self.page_start + self.number_of_pages - 1 self.min_price = min_price self.max_price = max_price self.days_since = days_since @@ -52,7 +52,7 @@ def __repr__(self): return ( f"FundaScraper(area={self.area}, " f"want_to={self.want_to}, " - f"n_pages={self.n_pages}, " + f"number_of_pages={self.number_of_pages}, " f"page_start={self.page_start}, " f"find_past={self.find_past}, " f"min_price={self.min_price}, " @@ -113,7 +113,7 @@ def reset( property_type: Optional[str] = None, want_to: Optional[str] = None, page_start: Optional[int] = None, - n_pages: Optional[int] = None, + number_of_pages: Optional[int] = None, find_past: Optional[bool] = None, min_price: Optional[int] = None, max_price: Optional[int] = None, @@ -131,8 +131,8 @@ def reset( self.want_to = want_to if page_start is not None: self.page_start = max(page_start, 1) - if n_pages is not None: - self.n_pages = max(n_pages, 1) + if number_of_pages is not None: + self.number_of_pages = max(number_of_pages, 1) if find_past is not None: self.find_past = find_past if min_price is not None: diff --git a/main.py b/main.py index 7be6d1e..a5caf71 100644 --- a/main.py +++ b/main.py @@ -9,7 +9,7 @@ want_to = "buy", find_past = False, page_start = 1, - n_pages = 3, + number_of_pages = 3, # min_price=500, # max_price=2000 ) diff --git a/tests/test_scrape.py b/tests/test_scrape.py index bf0aa29..05252d0 100644 --- a/tests/test_scrape.py +++ b/tests/test_scrape.py @@ -11,7 +11,7 @@ def scraper(self): area="amsterdam", want_to="buy", page_start=1, - n_pages=1, + number_of_pages=1, find_past=False, min_price=100000, max_price=500000, @@ -34,9 
+34,9 @@ def test_check_sort(self, scraper): assert scraper.check_sort == "price_down" def test_reset(self, scraper): - scraper.reset(area="rotterdam", n_pages=2) + scraper.reset(area="rotterdam", number_of_pages=2) assert scraper.area == "rotterdam" - assert scraper.n_pages == 2 + assert scraper.number_of_pages == 2 def test_fix_link(self, scraper): link = "https://www.funda.nl/detail/koop/den-haag/appartement-address-333/88888888/" @@ -49,7 +49,7 @@ def test_fix_link(self, scraper): def test_rent(): scraper = FundaScraper( - area="amsterdam", want_to="rent", find_past=False, page_start=1, n_pages=1 + area="amsterdam", want_to="rent", find_past=False, page_start=1, number_of_pages=1 ) df = scraper.run(raw_data=True) assert len(scraper.links) == 15 @@ -63,7 +63,7 @@ def test_rent(): def test_rent_past(): scraper = FundaScraper( - area="amsterdam", want_to="rent", find_past=True, page_start=1, n_pages=1 + area="amsterdam", want_to="rent", find_past=True, page_start=1, number_of_pages=1 ) df = scraper.run(raw_data=True) assert len(scraper.links) == 15 @@ -77,7 +77,7 @@ def test_rent_past(): def test_buy(): scraper = FundaScraper( - area="amsterdam", want_to="buy", find_past=False, page_start=1, n_pages=1 + area="amsterdam", want_to="buy", find_past=False, page_start=1, number_of_pages=1 ) df = scraper.run(raw_data=True) assert len(scraper.links) == 15 @@ -91,7 +91,7 @@ def test_buy(): def test_buy_past(): scraper = FundaScraper( - area="amsterdam", want_to="buy", find_past=True, page_start=1, n_pages=1 + area="amsterdam", want_to="buy", find_past=True, page_start=1, number_of_pages=1 ) df = scraper.run(raw_data=True) assert len(scraper.links) == 15 @@ -110,7 +110,7 @@ def test_buy_house(): want_to="buy", find_past=False, page_start=1, - n_pages=1, + number_of_pages=1, ) df = scraper.run(raw_data=True) assert len(scraper.links) == 15 @@ -130,7 +130,7 @@ def test_buy_apartment(): want_to="buy", find_past=False, page_start=1, - n_pages=1, + number_of_pages=1, ) df = scraper.run(raw_data=True) assert len(scraper.links) == 15 @@ -150,7 +150,7 @@ def test_buy_mixed(): want_to="buy", find_past=False, page_start=1, - n_pages=1, + number_of_pages=1, ) df = scraper.run(raw_data=True) assert len(scraper.links) == 15 From 40a80ac9de7378eedb2eb1769be26eaa3efe0371 Mon Sep 17 00:00:00 2001 From: Mustafa Karakaya Date: Sun, 29 Sep 2024 20:21:49 +0200 Subject: [PATCH 09/17] create Property class for houses --- funda_scraper/extract.py | 131 +++++++++++++++++++++----------------- funda_scraper/property.py | 34 ++++++++++ main.py | 10 +-- 3 files changed, 111 insertions(+), 64 deletions(-) create mode 100644 funda_scraper/property.py diff --git a/funda_scraper/extract.py b/funda_scraper/extract.py index 667cad3..783617f 100644 --- a/funda_scraper/extract.py +++ b/funda_scraper/extract.py @@ -20,6 +20,7 @@ from funda_scraper.utils import logger from funda_scraper.filerepository import FileRepository from funda_scraper.searchrequest import SearchRequest +from funda_scraper.property import Property class DataExtractor(object): @@ -30,20 +31,26 @@ def __init__(self): self.clean_df = pd.DataFrame() self.file_repo = FileRepository() + def extract_data(self, search_request: SearchRequest, run_id: str, clean_data: bool) -> pd.DataFrame: - df = pd.DataFrame({key: [] for key in self.selectors.keys()}) detail_pages = self.file_repo.get_detail_pages(run_id) + houses: list[Property] = [] + for page in detail_pages: - page_data = self.extract_data_from_page(page, search_request) - df.loc[len(df)] = page_data - - df["city"] 
= df["url"].map(lambda x: x.split("/")[4]) - df["log_id"] = datetime.datetime.now().strftime("%Y%m-%d%H-%M%S") - if not search_request.find_past: - df = df.drop(["term", "price_sold", "date_sold"], axis=1) - logger.info(f"*** All scraping done: {df.shape[0]} results ***") + house = self.extract_data_from_detail_page(page, search_request) + houses.append(house) + + #df["log_id"] = datetime.datetime.now().strftime("%Y%m-%d%H-%M%S") + + # if not search_request.find_past: + # df = df.drop(["term", "price_sold", "date_sold"], axis=1) + + logger.info(f"*** All scraping done: {len(houses)} results ***") + + df = pd.DataFrame([vars(house) for house in houses]) + self.raw_df = df if not clean_data: @@ -58,13 +65,17 @@ def extract_data(self, search_request: SearchRequest, run_id: str, clean_data: b return df - def extract_data_from_page(self, page: str, search_request: SearchRequest): + def extract_data_from_detail_page(self, page: str, search_request: SearchRequest) -> Property: soup = BeautifulSoup(page, "lxml") script_tag = soup.find_all("script", {"type": "application/ld+json"})[0] json_data = json.loads(script_tag.contents[0]) link = json_data["url"] + description = json_data["description"] + address = f"{json_data["address"]["streetAddress"]}" + city = json_data["address"]["addressLocality"] + price = f"{json_data["offers"]["priceCurrency"]} {json_data["offers"]["price"]}" # Get the value according to respective CSS selectors if search_request.to_buy: @@ -78,57 +89,51 @@ def extract_data_from_page(self, page: str, search_request: SearchRequest): else: list_since_selector = ".fd-align-items-center:nth-child(7) span" - result = [ - link, - self.get_value_from_css(soup, self.selectors.price), - self.get_value_from_css(soup, self.selectors.address), - self.get_value_from_css(soup, self.selectors.descrip), - self.get_value_from_css(soup, list_since_selector), - self.get_value_from_css(soup, self.selectors.zip_code), - self.get_value_from_css(soup, self.selectors.size), - self.get_value_from_css(soup, self.selectors.year), - self.get_value_from_css(soup, self.selectors.living_area), - self.get_value_from_css(soup, self.selectors.kind_of_house), - self.get_value_from_css(soup, self.selectors.building_type), - self.get_value_from_css(soup, self.selectors.num_of_rooms), - self.get_value_from_css(soup, self.selectors.num_of_bathrooms), - self.get_value_from_css(soup, self.selectors.layout), - self.get_value_from_css(soup, self.selectors.energy_label), - self.get_value_from_css(soup, self.selectors.insulation), - self.get_value_from_css(soup, self.selectors.heating), - self.get_value_from_css(soup, self.selectors.ownership), - self.get_value_from_css(soup, self.selectors.exteriors), - self.get_value_from_css(soup, self.selectors.parking), - self.get_value_from_css(soup, self.selectors.neighborhood_name), - self.get_value_from_css(soup, self.selectors.date_list), - self.get_value_from_css(soup, self.selectors.date_sold), - self.get_value_from_css(soup, self.selectors.term), - self.get_value_from_css(soup, self.selectors.price_sold), - self.get_value_from_css(soup, self.selectors.last_ask_price), - self.get_value_from_css(soup, self.selectors.last_ask_price_m2).split("\r")[ - 0 - ], - ] + house = Property() + house.link = link + house.price = price + house.address = address + house.city = city + house.description = description + house.zip_code = self.get_value_from_css(soup, self.selectors.zip_code) + house.size = self.get_value_from_css(soup, self.selectors.size) + house.year_of_construction = 
self.get_value_from_css(soup, self.selectors.year)
+        house.living_area = self.get_value_from_css(soup, self.selectors.living_area)
+        house.kind_of_house = self.get_value_from_css(soup, self.selectors.kind_of_house)
+        house.building_type = self.get_value_from_css(soup, self.selectors.building_type)
+        house.number_of_rooms = self.get_value_from_css(soup, self.selectors.num_of_rooms)
+        house.number_of_bathrooms = self.get_value_from_css(soup, self.selectors.num_of_bathrooms)
+        house.layout = self.get_value_from_css(soup, self.selectors.layout)
+        house.energy_label = self.get_value_from_css(soup, self.selectors.energy_label)
+        house.insulation = self.get_value_from_css(soup, self.selectors.insulation)
+        house.heating = self.get_value_from_css(soup, self.selectors.heating)
+        house.ownership = self.get_value_from_css(soup, self.selectors.ownership)
+        house.exteriors = self.get_value_from_css(soup, self.selectors.exteriors)
+        house.parking = self.get_value_from_css(soup, self.selectors.parking)
+        house.neighborhood_name = self.get_value_from_css(soup, self.selectors.neighborhood_name)
+        house.date_list = self.get_value_from_css(soup, self.selectors.date_list)
+        house.date_sold = self.get_value_from_css(soup, self.selectors.date_sold)
+        house.term = self.get_value_from_css(soup, self.selectors.term)
+        house.price_sold = self.get_value_from_css(soup, self.selectors.price_sold)
+        house.last_ask_price = self.get_value_from_css(soup, self.selectors.last_ask_price)
+        house.last_ask_price_m2 = self.get_value_from_css(soup, self.selectors.last_ask_price_m2).split("\r")[0]
+        house.photos = [p.get("data-lazy-srcset") for p in soup.select(self.selectors.photo)]
 
         # Deal with list_since_selector especially, since its CSS varies sometimes
-        if clean_date_format(result[4]) == "na":
-            for i in range(6, 16):
-                selector = f".fd-align-items-center:nth-child({i}) span"
-                update_list_since = self.get_value_from_css(soup, selector)
-                if clean_date_format(update_list_since) == "na":
-                    pass
-                else:
-                    result[4] = update_list_since
-
-        photos_list = [
-            p.get("data-lazy-srcset") for p in soup.select(self.selectors.photo)
-        ]
-        photos_string = ", ".join(photos_list)
-
-        # Clean up the retried result from one page
-        result = [r.replace("\n", "").replace("\r", "").strip() for r in result]
-        result.append(photos_string)
-        return result
+
+        for key, value in house.__dict__.items():
+            formatted_value = self.format_string(value)
+            setattr(house, key, formatted_value)
+
+        return house
 
     @staticmethod
     def get_value_from_css(soup: BeautifulSoup, selector: str) -> str:
@@ -140,3 +145,11 @@ def get_value_from_css(soup: BeautifulSoup, selector: str) -> str:
             result = "na"
 
         return result
+
+    def format_string(self, value):
+        if isinstance(value, str):
+            return value.replace("\n", "").replace("\r", "").strip()
+        else:
+            return value
+
+
diff --git a/funda_scraper/property.py b/funda_scraper/property.py
new file mode 100644
index 0000000..901843d
--- /dev/null
+++ b/funda_scraper/property.py
@@ -0,0 +1,34 @@
+from dataclasses import dataclass, field
+
+@dataclass
+class Property():
+    link: str = None
+    price: str = None
+    address = None
+    city = None
+    description = None
+    listed_since = None
+    zip_code = None
+    size = None
+    year_of_construction
= None + living_area = None + kind_of_house = None + building_type = None + number_of_rooms = None + number_of_bathrooms = None + layout = None + energy_label = None + insulation = None + heating = None + ownership = None + exteriors = None + parking = None + neighborhood_name = None + date_list = None + date_sold = None + term = None + price_sold = None + last_ask_price = None + last_ask_price_m2 = None + photos: list[str] = field(default_factory=list) + diff --git a/main.py b/main.py index a5caf71..3be841d 100644 --- a/main.py +++ b/main.py @@ -14,12 +14,12 @@ # max_price=2000 ) - scraper = FundaScraper(search_params) - df = scraper.run(clean_data = False) - df.head() + # scraper = FundaScraper(search_params) + # df = scraper.run(clean_data = False) + # df.head() - #data_extractor = DataExtractor() - #data_extractor.extract_data(search_params, run_id = "14431b3e-7e59-11ef-a3d4-a0510ba6104e", clean_data = False) + data_extractor = DataExtractor() + data_extractor.extract_data(search_params, run_id = "14431b3e-7e59-11ef-a3d4-a0510ba6104e", clean_data = False) From d4741c5909479a3b3f1425bb7fc0ed43badc4a68 Mon Sep 17 00:00:00 2001 From: Mustafa Karakaya Date: Mon, 30 Sep 2024 16:31:47 +0200 Subject: [PATCH 10/17] fix some css selectors --- funda_scraper/config/config.yaml | 20 ++++++++++---------- funda_scraper/extract.py | 3 ++- 2 files changed, 12 insertions(+), 11 deletions(-) diff --git a/funda_scraper/config/config.yaml b/funda_scraper/config/config.yaml index 4a8e6f3..c5c0ee7 100644 --- a/funda_scraper/config/config.yaml +++ b/funda_scraper/config/config.yaml @@ -6,7 +6,7 @@ keep_cols: - date_sold - ym_sold - year_sold -# - term_days + # - term_days selling_data: - url - house_id @@ -38,19 +38,20 @@ css_selector: zip_code: "span.text-neutral-40:nth-child(2)" size: ".object-kenmerken-list:nth-child(8) .fd-align-items-center:nth-child(5) span" year: ".fd-align-items-center~ .fd-align-items-center .fd-m-right-xs" + year_of_construction: ".mt-6.md\\:mt-7 > div:nth-of-type(2) > dl > dd:nth-of-type(3) span" living_area: "section.mt-6 > div:nth-child(4) > dl:nth-child(2) > dd:nth-child(4)" - kind_of_house: ".object-kenmerken-list:nth-child(5) .fd-align-items-center:nth-child(2) span" + kind_of_house: ".mt-6.md\\:mt-7 > div:nth-of-type(2) > dl > dd:nth-of-type(1) span" building_type: "section.mt-6 > div:nth-child(3) > dl:nth-child(2) > dd:nth-child(4) > span:nth-child(1)" - num_of_rooms: ".object-kenmerken-list:nth-child(11) .fd-align-items-center:nth-child(2)" - num_of_bathrooms: ".object-kenmerken-list:nth-child(11) .fd-align-items-center:nth-child(4)" + num_of_rooms: ".mt-6.md\\:mt-7 > div:nth-of-type(4) > dl > dd:nth-of-type(1) span" + num_of_bathrooms: ".mt-6.md\\:mt-7 > div:nth-of-type(4) > dl > dd:nth-of-type(2) span" layout: ".object-kenmerken-list:nth-child(11)" energy_label: ".gap-6 > div:nth-child(1)" - insulation: ".object-kenmerken-list:nth-child(14) .fd-align-items-center:nth-child(4)" - heating: ".object-kenmerken-list:nth-child(14) .fd-align-items-center:nth-child(6)" - ownership: "section.mt-6 > div:nth-child(7) > dl:nth-child(2) > dd:nth-child(4)" + insulation: ".mt-6.md\\:mt-7 > div:nth-of-type(5) > dl > dd:nth-of-type(2) span" + heating: ".mt-6.md\\:mt-7 > div:nth-of-type(5) > dl > dd:nth-of-type(3) span" + ownership: ".mt-6.md\\:mt-7 > div:nth-of-type(6) > dl > dd:nth-of-type(2) span" exteriors: ".object-kenmerken-list:nth-child(19)" - parking: ".object-kenmerken-list:nth-child(24)" - neighborhood_name: ".fd-display-inline--bp-m" + parking: ".mt-6.md\\:mt-7 > 
div:nth-of-type(5) > dl > dd:nth-of-type(1) span" + neighborhood_name: ".object-header__container > a" date_list: "dd:nth-child(2)" date_sold: "dd:nth-child(4)" term: "dd:nth-child(6)" @@ -58,4 +59,3 @@ css_selector: last_ask_price: "section.mt-6 > div:nth-child(2) > dl:nth-child(2) > dd:nth-child(2) > span:nth-child(1)" last_ask_price_m2: "section.mt-6 > div:nth-child(2) > dl:nth-child(2) > dd:nth-child(4) > span:nth-child(1)" photo: ".media-viewer-overview__section-list-item--photo img[data-lazy]" - diff --git a/funda_scraper/extract.py b/funda_scraper/extract.py index 783617f..1e8043b 100644 --- a/funda_scraper/extract.py +++ b/funda_scraper/extract.py @@ -49,6 +49,7 @@ def extract_data(self, search_request: SearchRequest, run_id: str, clean_data: b logger.info(f"*** All scraping done: {len(houses)} results ***") + # It may be more intuitive to manipulate the Property objects instead of dataframes, but let's keep the dataframes approach for now df = pd.DataFrame([vars(house) for house in houses]) self.raw_df = df @@ -97,7 +98,7 @@ def extract_data_from_detail_page(self, page: str, search_request: SearchRequest house.description = description house.zip_code = self.get_value_from_css(soup, self.selectors.zip_code) house.size = self.get_value_from_css(soup, self.selectors.size) - house.year_of_construction = self.get_value_from_css(soup, self.selectors.year) + house.year_of_construction = self.get_value_from_css(soup, self.selectors.year_of_construction) house.living_area = self.get_value_from_css(soup, self.selectors.living_area) house.kind_of_house = self.get_value_from_css(soup, self.selectors.kind_of_house) house.building_type = self.get_value_from_css(soup, self.selectors.building_type) From 67397c5ca48e95cc2459c9839f3c7f4c1c191621 Mon Sep 17 00:00:00 2001 From: Mustafa Karakaya Date: Mon, 30 Sep 2024 16:34:33 +0200 Subject: [PATCH 11/17] simplify some css selectors --- funda_scraper/config/config.yaml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/funda_scraper/config/config.yaml b/funda_scraper/config/config.yaml index c5c0ee7..411ff16 100644 --- a/funda_scraper/config/config.yaml +++ b/funda_scraper/config/config.yaml @@ -40,17 +40,17 @@ css_selector: year: ".fd-align-items-center~ .fd-align-items-center .fd-m-right-xs" year_of_construction: ".mt-6.md\\:mt-7 > div:nth-of-type(2) > dl > dd:nth-of-type(3) span" living_area: "section.mt-6 > div:nth-child(4) > dl:nth-child(2) > dd:nth-child(4)" - kind_of_house: ".mt-6.md\\:mt-7 > div:nth-of-type(2) > dl > dd:nth-of-type(1) span" + kind_of_house: "section.mt-6 > div:nth-of-type(2) > dl > dd:nth-of-type(1) span" building_type: "section.mt-6 > div:nth-child(3) > dl:nth-child(2) > dd:nth-child(4) > span:nth-child(1)" - num_of_rooms: ".mt-6.md\\:mt-7 > div:nth-of-type(4) > dl > dd:nth-of-type(1) span" - num_of_bathrooms: ".mt-6.md\\:mt-7 > div:nth-of-type(4) > dl > dd:nth-of-type(2) span" + num_of_rooms: "section.mt-6 > div:nth-of-type(4) > dl > dd:nth-of-type(1) span" + num_of_bathrooms: "section.mt-6 > div:nth-of-type(4) > dl > dd:nth-of-type(2) span" layout: ".object-kenmerken-list:nth-child(11)" energy_label: ".gap-6 > div:nth-child(1)" insulation: ".mt-6.md\\:mt-7 > div:nth-of-type(5) > dl > dd:nth-of-type(2) span" - heating: ".mt-6.md\\:mt-7 > div:nth-of-type(5) > dl > dd:nth-of-type(3) span" - ownership: ".mt-6.md\\:mt-7 > div:nth-of-type(6) > dl > dd:nth-of-type(2) span" + heating: "section.mt-6 > div:nth-of-type(5) > dl > dd:nth-of-type(3) span" + ownership: "section.mt-6 > div:nth-of-type(6) > dl > 
dd:nth-of-type(2) span" exteriors: ".object-kenmerken-list:nth-child(19)" - parking: ".mt-6.md\\:mt-7 > div:nth-of-type(5) > dl > dd:nth-of-type(1) span" + parking: "section.mt-6 > div:nth-of-type(5) > dl > dd:nth-of-type(1) span" neighborhood_name: ".object-header__container > a" date_list: "dd:nth-child(2)" date_sold: "dd:nth-child(4)" From 5f9cce0c35578c4acee6d98662f2a91730180410 Mon Sep 17 00:00:00 2001 From: Mustafa Karakaya Date: Mon, 30 Sep 2024 17:03:30 +0200 Subject: [PATCH 12/17] fix pre process --- funda_scraper/config/config.yaml | 4 ++-- funda_scraper/extract.py | 6 +++--- funda_scraper/preprocess.py | 15 ++++++++------- funda_scraper/property.py | 4 ++-- main.py | 2 +- 5 files changed, 16 insertions(+), 15 deletions(-) diff --git a/funda_scraper/config/config.yaml b/funda_scraper/config/config.yaml index 411ff16..8c72e8c 100644 --- a/funda_scraper/config/config.yaml +++ b/funda_scraper/config/config.yaml @@ -27,8 +27,8 @@ keep_cols: # - date_list # - ym_list # - year_list - - descrip - - photo + - description + - photos css_selector: url: none price: "section.mt-6 > div:nth-child(2) > dl:nth-child(2) > dd:nth-child(2) > span:nth-child(1)" diff --git a/funda_scraper/extract.py b/funda_scraper/extract.py index 1e8043b..0a348ac 100644 --- a/funda_scraper/extract.py +++ b/funda_scraper/extract.py @@ -72,7 +72,7 @@ def extract_data_from_detail_page(self, page: str, search_request: SearchRequest script_tag = soup.find_all("script", {"type": "application/ld+json"})[0] json_data = json.loads(script_tag.contents[0]) - link = json_data["url"] + url = json_data["url"] description = json_data["description"] address = f"{json_data["address"]["streetAddress"]}" city = json_data["address"]["addressLocality"] @@ -91,7 +91,7 @@ def extract_data_from_detail_page(self, page: str, search_request: SearchRequest list_since_selector = ".fd-align-items-center:nth-child(7) span" house = Property() - house.link = link + house.url = url house.price = price house.address = address house.city = city @@ -100,7 +100,7 @@ def extract_data_from_detail_page(self, page: str, search_request: SearchRequest house.size = self.get_value_from_css(soup, self.selectors.size) house.year_of_construction = self.get_value_from_css(soup, self.selectors.year_of_construction) house.living_area = self.get_value_from_css(soup, self.selectors.living_area) - house.kind_of_house = self.get_value_from_css(soup, self.selectors.kind_of_house) + house.house_type = self.get_value_from_css(soup, self.selectors.kind_of_house) house.building_type = self.get_value_from_css(soup, self.selectors.building_type) house.number_of_rooms = self.get_value_from_css(soup, self.selectors.num_of_rooms) house.number_of_bathrooms = self.get_value_from_css(soup, self.selectors.num_of_bathrooms) diff --git a/funda_scraper/preprocess.py b/funda_scraper/preprocess.py index 96b8cf1..18fd9a6 100644 --- a/funda_scraper/preprocess.py +++ b/funda_scraper/preprocess.py @@ -159,6 +159,7 @@ def preprocess_data( """ df = df.dropna() + if not is_past: keep_cols = config.keep_cols.selling_data else: @@ -168,9 +169,9 @@ def preprocess_data( keep_cols.extend(keep_extra_cols) # Info - df["house_id"] = df["url"].apply(lambda x: int(x.split("/")[-2].split("-")[1])) - df["house_type"] = df["url"].apply(lambda x: x.split("/")[-2].split("-")[0]) - df = df[df["house_type"].isin(["appartement", "huis"])] + df["house_id"] = df["url"].apply(lambda x: int(x.split("/")[-2])) + #df["house_type"] = df["url"].apply(lambda x: x.split("/")[-2].split("-")[0]) + #df = 
df[df["house_type"].isin(["appartement", "huis"])] # Price price_col = "price_sold" if is_past else "price" @@ -184,13 +185,13 @@ def preprocess_data( df["zip"] = df["zip_code"].apply(lambda x: x[:4]) # House layout - df["room"] = df["num_of_rooms"].apply(find_n_room) - df["bedroom"] = df["num_of_rooms"].apply(find_n_bedroom) - df["bathroom"] = df["num_of_bathrooms"].apply(find_n_bathroom) + df["room"] = df["number_of_rooms"].apply(find_n_room) + df["bedroom"] = df["number_of_rooms"].apply(find_n_bedroom) + df["bathroom"] = df["number_of_bathrooms"].apply(find_n_bathroom) df["energy_label"] = df["energy_label"].apply(clean_energy_label) # Time - df["year_built"] = df["year"].apply(clean_year).astype(int) + df["year_built"] = df["year_of_construction"].apply(clean_year).astype(int) df["house_age"] = datetime.now().year - df["year_built"] if is_past: diff --git a/funda_scraper/property.py b/funda_scraper/property.py index 901843d..78c2da5 100644 --- a/funda_scraper/property.py +++ b/funda_scraper/property.py @@ -2,7 +2,7 @@ @dataclass class Property(): - link: str = None + url: str = None price: str = None address = None city = None @@ -12,7 +12,7 @@ class Property(): size = None year_of_construction = None living_area = None - kind_of_house = None + house_type = None building_type = None number_of_rooms = None number_of_bathrooms = None diff --git a/main.py b/main.py index 3be841d..8d4f1b4 100644 --- a/main.py +++ b/main.py @@ -19,7 +19,7 @@ # df.head() data_extractor = DataExtractor() - data_extractor.extract_data(search_params, run_id = "14431b3e-7e59-11ef-a3d4-a0510ba6104e", clean_data = False) + data_extractor.extract_data(search_params, run_id = "14431b3e-7e59-11ef-a3d4-a0510ba6104e", clean_data = True) From 9cb1e0371935c0fd91e66e354008e2c7a539f788 Mon Sep 17 00:00:00 2001 From: Mustafa Karakaya Date: Tue, 1 Oct 2024 09:37:40 +0200 Subject: [PATCH 13/17] fix extracting of photos --- funda_scraper/config/config.yaml | 1 + funda_scraper/extract.py | 28 ++++++++++++++++++++++++---- funda_scraper/property.py | 7 +++++++ main.py | 2 +- 4 files changed, 33 insertions(+), 5 deletions(-) diff --git a/funda_scraper/config/config.yaml b/funda_scraper/config/config.yaml index 8c72e8c..595fdb0 100644 --- a/funda_scraper/config/config.yaml +++ b/funda_scraper/config/config.yaml @@ -59,3 +59,4 @@ css_selector: last_ask_price: "section.mt-6 > div:nth-child(2) > dl:nth-child(2) > dd:nth-child(2) > span:nth-child(1)" last_ask_price_m2: "section.mt-6 > div:nth-child(2) > dl:nth-child(2) > dd:nth-child(4) > span:nth-child(1)" photo: ".media-viewer-overview__section-list-item--photo img[data-lazy]" + photos: "main > div:nth-child(1) ul > li:nth-child(1) > a > span:nth-child(3)" diff --git a/funda_scraper/extract.py b/funda_scraper/extract.py index 0a348ac..03e7c65 100644 --- a/funda_scraper/extract.py +++ b/funda_scraper/extract.py @@ -42,15 +42,17 @@ def extract_data(self, search_request: SearchRequest, run_id: str, clean_data: b house = self.extract_data_from_detail_page(page, search_request) houses.append(house) - #df["log_id"] = datetime.datetime.now().strftime("%Y%m-%d%H-%M%S") - # if not search_request.find_past: # df = df.drop(["term", "price_sold", "date_sold"], axis=1) logger.info(f"*** All scraping done: {len(houses)} results ***") # It may be more intuitive to manipulate the Property objects instead of dataframes, but let's keep the dataframes approach for now - df = pd.DataFrame([vars(house) for house in houses]) + # Note that we are omitting the photos field, which is an array field, and 
include the photos_string property + df = pd.DataFrame([ + {**{k: v for k, v in vars(house).items() if k != 'photos'}, 'photos': house.photos_string} + for house in houses + ]) self.raw_df = df @@ -118,7 +120,7 @@ def extract_data_from_detail_page(self, page: str, search_request: SearchRequest house.price_sold = self.get_value_from_css(soup, self.selectors.price_sold) house.last_ask_price = self.get_value_from_css(soup, self.selectors.last_ask_price) house.last_ask_price_m2 = self.get_value_from_css(soup, self.selectors.last_ask_price_m2).split("\r")[0] - house.photos = [p.get("data-lazy-srcset") for p in soup.select(self.selectors.photo)] + house.photos = self.get_photos(soup, house.url) # Deal with list_since_selector especially, since its CSS varies sometimes # if clean_date_format(result[4]) == "na": @@ -153,4 +155,22 @@ def format_string(self, value): else: return value + def get_photos(self, soup: BeautifulSoup, url: str) -> List[str]: + number_of_photos = 0 + try: + number_of_photos = int(self.get_value_from_css(soup, self.selectors.photos)) + except: + number_of_photos = 0 + + photos: List[str] = [] + + if (number_of_photos > 0): + for i in range(1, number_of_photos + 1): + photo_url = f"{url}media/foto/{i}" + photos.append(photo_url) + + return photos + + + diff --git a/funda_scraper/property.py b/funda_scraper/property.py index 78c2da5..1c4ef0b 100644 --- a/funda_scraper/property.py +++ b/funda_scraper/property.py @@ -32,3 +32,10 @@ class Property(): last_ask_price_m2 = None photos: list[str] = field(default_factory=list) + @property + def photos_string(self) -> bool: + if not self.photos or len(self.photos) == 0: + return "" + else: + return "|".join(self.photos) + diff --git a/main.py b/main.py index 8d4f1b4..9514900 100644 --- a/main.py +++ b/main.py @@ -19,7 +19,7 @@ # df.head() data_extractor = DataExtractor() - data_extractor.extract_data(search_params, run_id = "14431b3e-7e59-11ef-a3d4-a0510ba6104e", clean_data = True) + data_extractor.extract_data(search_params, run_id = "53861c47-7e64-11ef-921b-a0510ba6104e", clean_data = False) From 3a6b44411a3d1f7662a544fa2793f61afa07e145 Mon Sep 17 00:00:00 2001 From: Mustafa Karakaya Date: Tue, 1 Oct 2024 10:47:46 +0200 Subject: [PATCH 14/17] rename find_past to find_sold --- README.md | 4 ++-- funda_scraper/config/config.yaml | 2 +- funda_scraper/extract.py | 22 ++++++---------------- funda_scraper/scrape.py | 6 +++--- funda_scraper/searchrequest.py | 20 ++++++++++---------- main.py | 4 ++-- tests/test_scrape.py | 16 ++++++++-------- 7 files changed, 32 insertions(+), 42 deletions(-) diff --git a/README.md b/README.md index 4ddc7c6..a673941 100644 --- a/README.md +++ b/README.md @@ -39,7 +39,7 @@ from funda_scraper import FundaScraper scraper = FundaScraper( area="amsterdam", want_to="rent", - find_past=False, + find_sold=False, page_start=1, number_of_pages=3, min_price=500, @@ -59,7 +59,7 @@ You can pass several arguments to `FundaScraper()` for customized scraping: - `area`: Specify the city or specific area you want to look for, e.g. Amsterdam, Utrecht, Rotterdam, etc. - `want_to`: Choose either `buy` or `rent` to find houses either for sale or for rent. -- `find_past`: Set to `True` to find historical data; the default is `False`. +- `find_sold`: Set to `True` to find historical data; the default is `False`. - `page_start`: Indicate which page to start scraping from; the default is `1`. - `number_of_pages`: Indicate how many pages to scrape; the default is `1`. - `min_price`: Indicate the lowest budget amount. 
diff --git a/funda_scraper/config/config.yaml b/funda_scraper/config/config.yaml index 595fdb0..b28822d 100644 --- a/funda_scraper/config/config.yaml +++ b/funda_scraper/config/config.yaml @@ -28,7 +28,7 @@ keep_cols: # - ym_list # - year_list - description - - photos + # - photos css_selector: url: none price: "section.mt-6 > div:nth-child(2) > dl:nth-child(2) > dd:nth-child(2) > span:nth-child(1)" diff --git a/funda_scraper/extract.py b/funda_scraper/extract.py index 03e7c65..649a7d7 100644 --- a/funda_scraper/extract.py +++ b/funda_scraper/extract.py @@ -42,9 +42,6 @@ def extract_data(self, search_request: SearchRequest, run_id: str, clean_data: b house = self.extract_data_from_detail_page(page, search_request) houses.append(house) - # if not search_request.find_past: - # df = df.drop(["term", "price_sold", "date_sold"], axis=1) - logger.info(f"*** All scraping done: {len(houses)} results ***") # It may be more intuitive to manipulate the Property objects instead of dataframes, but let's keep the dataframes approach for now @@ -54,13 +51,16 @@ def extract_data(self, search_request: SearchRequest, run_id: str, clean_data: b for house in houses ]) + if not search_request.find_sold: + df = df.drop(["term", "price_sold", "date_sold"], axis=1) + self.raw_df = df if not clean_data: df = self.raw_df else: logger.info("*** Cleaning data ***") - df = preprocess_data(df = self.raw_df, is_past = search_request.find_past) + df = preprocess_data(df = self.raw_df, is_past = search_request.find_sold) self.clean_df = df self.file_repo.save_result_file(df, run_id) @@ -82,12 +82,12 @@ def extract_data_from_detail_page(self, page: str, search_request: SearchRequest # Get the value according to respective CSS selectors if search_request.to_buy: - if search_request.find_past: + if search_request.find_sold: list_since_selector = self.selectors.date_list else: list_since_selector = self.selectors.listed_since else: - if search_request.find_past: + if search_request.find_sold: list_since_selector = ".fd-align-items-center:nth-child(9) span" else: list_since_selector = ".fd-align-items-center:nth-child(7) span" @@ -122,16 +122,6 @@ def extract_data_from_detail_page(self, page: str, search_request: SearchRequest house.last_ask_price_m2 = self.get_value_from_css(soup, self.selectors.last_ask_price_m2).split("\r")[0] house.photos = self.get_photos(soup, house.url) - # Deal with list_since_selector especially, since its CSS varies sometimes - # if clean_date_format(result[4]) == "na": - # for i in range(6, 16): - # selector = f".fd-align-items-center:nth-child({i}) span" - # update_list_since = self.get_value_from_css(soup, selector) - # if clean_date_format(update_list_since) == "na": - # pass - # else: - # result[4] = update_list_since - for key, value in house.__dict__.items(): formatted_value = self.format_string(value) setattr(house, key, formatted_value) diff --git a/funda_scraper/scrape.py b/funda_scraper/scrape.py index e6cb176..3d79736 100644 --- a/funda_scraper/scrape.py +++ b/funda_scraper/scrape.py @@ -142,7 +142,7 @@ def _build_main_query_url(self) -> str: ] main_url += f"&object_type=%5B{','.join(formatted_property_types)}%5D" - if self.search_request.find_past: + if self.search_request.find_sold: main_url = f'{main_url}&availability=%5B"unavailable"%5D' if self.search_request.min_price is not None or self.search_request.max_price is not None: @@ -206,7 +206,7 @@ def run(self, clean_data: bool = False) -> pd.DataFrame: choices=["rent", "buy"], ) parser.add_argument( - "--find_past", + "--find_sold", 
action="store_true", help="Indicate whether you want to use historical data", ) @@ -260,7 +260,7 @@ def run(self, clean_data: bool = False) -> pd.DataFrame: scraper = FundaScraper( area=args.area, want_to=args.want_to, - find_past=args.find_past, + find_sold=args.find_sold, page_start=args.page_start, number_of_pages=args.number_of_pages, min_price=args.min_price, diff --git a/funda_scraper/searchrequest.py b/funda_scraper/searchrequest.py index 1167bd7..7dc7e31 100644 --- a/funda_scraper/searchrequest.py +++ b/funda_scraper/searchrequest.py @@ -9,7 +9,7 @@ def __init__( want_to: str, page_start: int = 1, number_of_pages: int = 1, - find_past: bool = False, + find_sold: bool = False, min_price: Optional[int] = None, max_price: Optional[int] = None, days_since: Optional[int] = None, @@ -24,7 +24,7 @@ def __init__( :param want_to: Specifies whether the user wants to buy or rent properties. :param page_start: The starting page number for the search. :param number_of_pages: The number of pages to scrape. - :param find_past: Flag to indicate whether to find past listings. + :param find_sold: Flag to indicate whether to find past listings. :param min_price: The minimum price for the property search. :param max_price: The maximum price for the property search. :param days_since: The maximum number of days since the listing was published. @@ -37,7 +37,7 @@ def __init__( self.area = area.lower().replace(" ", "-") self.property_type = property_type self.want_to = want_to - self.find_past = find_past + self.find_sold = find_sold self.page_start = max(page_start, 1) self.number_of_pages = max(number_of_pages, 1) self.page_end = self.page_start + self.number_of_pages - 1 @@ -54,13 +54,13 @@ def __repr__(self): f"want_to={self.want_to}, " f"number_of_pages={self.number_of_pages}, " f"page_start={self.page_start}, " - f"find_past={self.find_past}, " + f"find_sold={self.find_sold}, " f"min_price={self.min_price}, " f"max_price={self.max_price}, " f"days_since={self.days_since}, " f"min_floor_area={self.min_floor_area}, " f"max_floor_area={self.max_floor_area}, " - f"find_past={self.find_past})" + f"find_sold={self.find_sold})" f"min_price={self.min_price})" f"max_price={self.max_price})" f"days_since={self.days_since})" @@ -80,8 +80,8 @@ def to_buy(self) -> bool: @property def check_days_since(self) -> int: """Validates the 'days_since' attribute.""" - if self.find_past: - raise ValueError("'days_since' can only be specified when find_past=False.") + if self.find_sold: + raise ValueError("'days_since' can only be specified when find_sold=False.") if self.days_since in [None, 1, 3, 5, 10, 30]: return self.days_since @@ -114,7 +114,7 @@ def reset( want_to: Optional[str] = None, page_start: Optional[int] = None, number_of_pages: Optional[int] = None, - find_past: Optional[bool] = None, + find_sold: Optional[bool] = None, min_price: Optional[int] = None, max_price: Optional[int] = None, days_since: Optional[int] = None, @@ -133,8 +133,8 @@ def reset( self.page_start = max(page_start, 1) if number_of_pages is not None: self.number_of_pages = max(number_of_pages, 1) - if find_past is not None: - self.find_past = find_past + if find_sold is not None: + self.find_sold = find_sold if min_price is not None: self.min_price = min_price if max_price is not None: diff --git a/main.py b/main.py index 9514900..05e326e 100644 --- a/main.py +++ b/main.py @@ -7,7 +7,7 @@ search_params = SearchRequest( area = "Amsterdam", want_to = "buy", - find_past = False, + find_sold = False, page_start = 1, number_of_pages = 3, # 
min_price=500, @@ -19,7 +19,7 @@ # df.head() data_extractor = DataExtractor() - data_extractor.extract_data(search_params, run_id = "53861c47-7e64-11ef-921b-a0510ba6104e", clean_data = False) + data_extractor.extract_data(search_params, run_id = "53861c47-7e64-11ef-921b-a0510ba6104e", clean_data = True) diff --git a/tests/test_scrape.py b/tests/test_scrape.py index 05252d0..5adcf8a 100644 --- a/tests/test_scrape.py +++ b/tests/test_scrape.py @@ -12,7 +12,7 @@ def scraper(self): want_to="buy", page_start=1, number_of_pages=1, - find_past=False, + find_sold=False, min_price=100000, max_price=500000, days_since=None, @@ -49,7 +49,7 @@ def test_fix_link(self, scraper): def test_rent(): scraper = FundaScraper( - area="amsterdam", want_to="rent", find_past=False, page_start=1, number_of_pages=1 + area="amsterdam", want_to="rent", find_sold=False, page_start=1, number_of_pages=1 ) df = scraper.run(raw_data=True) assert len(scraper.links) == 15 @@ -63,7 +63,7 @@ def test_rent(): def test_rent_past(): scraper = FundaScraper( - area="amsterdam", want_to="rent", find_past=True, page_start=1, number_of_pages=1 + area="amsterdam", want_to="rent", find_sold=True, page_start=1, number_of_pages=1 ) df = scraper.run(raw_data=True) assert len(scraper.links) == 15 @@ -77,7 +77,7 @@ def test_rent_past(): def test_buy(): scraper = FundaScraper( - area="amsterdam", want_to="buy", find_past=False, page_start=1, number_of_pages=1 + area="amsterdam", want_to="buy", find_sold=False, page_start=1, number_of_pages=1 ) df = scraper.run(raw_data=True) assert len(scraper.links) == 15 @@ -91,7 +91,7 @@ def test_buy(): def test_buy_past(): scraper = FundaScraper( - area="amsterdam", want_to="buy", find_past=True, page_start=1, number_of_pages=1 + area="amsterdam", want_to="buy", find_sold=True, page_start=1, number_of_pages=1 ) df = scraper.run(raw_data=True) assert len(scraper.links) == 15 @@ -108,7 +108,7 @@ def test_buy_house(): area="amsterdam", property_type="house", want_to="buy", - find_past=False, + find_sold=False, page_start=1, number_of_pages=1, ) @@ -128,7 +128,7 @@ def test_buy_apartment(): area="amsterdam", property_type="apartment", want_to="buy", - find_past=False, + find_sold=False, page_start=1, number_of_pages=1, ) @@ -148,7 +148,7 @@ def test_buy_mixed(): area="amsterdam", property_type="apartment,house", want_to="buy", - find_past=False, + find_sold=False, page_start=1, number_of_pages=1, ) From ab4849a082a8c5bb69a8bbc35c213a954107fa98 Mon Sep 17 00:00:00 2001 From: Mustafa Karakaya Date: Tue, 1 Oct 2024 10:57:07 +0200 Subject: [PATCH 15/17] clean up import statements --- funda_scraper/extract.py | 16 ++-------------- funda_scraper/filerepository.py | 7 +------ funda_scraper/scrape.py | 6 +----- funda_scraper/searchrequest.py | 1 - main.py | 13 +++++++------ 5 files changed, 11 insertions(+), 32 deletions(-) diff --git a/funda_scraper/extract.py b/funda_scraper/extract.py index 649a7d7..b29863e 100644 --- a/funda_scraper/extract.py +++ b/funda_scraper/extract.py @@ -1,28 +1,16 @@ -import argparse -import datetime import json -import multiprocessing as mp -import os -import uuid -from collections import OrderedDict -from typing import List, Optional -from urllib.parse import urlparse, urlunparse - import pandas as pd -import requests from bs4 import BeautifulSoup -from tqdm import tqdm -from tqdm.contrib.concurrent import process_map +from typing import List from funda_scraper.config.core import config -from funda_scraper.preprocess import clean_date_format, preprocess_data +from 
funda_scraper.preprocess import preprocess_data from funda_scraper.utils import logger from funda_scraper.filerepository import FileRepository from funda_scraper.searchrequest import SearchRequest from funda_scraper.property import Property - class DataExtractor(object): def __init__(self): diff --git a/funda_scraper/filerepository.py b/funda_scraper/filerepository.py index 4313ca8..e2eb86e 100644 --- a/funda_scraper/filerepository.py +++ b/funda_scraper/filerepository.py @@ -1,12 +1,7 @@ import os import pandas as pd -from urllib.parse import urlparse, urlunparse -from tqdm.contrib.concurrent import process_map -from typing import List, Optional - -from funda_scraper.config.core import config -from funda_scraper.preprocess import clean_date_format, preprocess_data +from typing import List from funda_scraper.utils import logger diff --git a/funda_scraper/scrape.py b/funda_scraper/scrape.py index 3d79736..cf27990 100644 --- a/funda_scraper/scrape.py +++ b/funda_scraper/scrape.py @@ -1,13 +1,11 @@ """Main funda scraper module""" import argparse -import datetime import json import multiprocessing as mp -import os import uuid from collections import OrderedDict -from typing import List, Optional +from typing import List from urllib.parse import urlparse, urlunparse import pandas as pd @@ -17,11 +15,9 @@ from tqdm.contrib.concurrent import process_map from funda_scraper.config.core import config -from funda_scraper.preprocess import clean_date_format, preprocess_data from funda_scraper.utils import logger from funda_scraper.extract import DataExtractor from funda_scraper.filerepository import FileRepository -from funda_scraper.searchrequest import SearchRequest class FundaScraper(object): diff --git a/funda_scraper/searchrequest.py b/funda_scraper/searchrequest.py index 7dc7e31..b1207ad 100644 --- a/funda_scraper/searchrequest.py +++ b/funda_scraper/searchrequest.py @@ -1,4 +1,3 @@ -from collections import OrderedDict from typing import List, Optional class SearchRequest(object): diff --git a/main.py b/main.py index 05e326e..09017ad 100644 --- a/main.py +++ b/main.py @@ -9,17 +9,18 @@ want_to = "buy", find_sold = False, page_start = 1, - number_of_pages = 3, + number_of_pages = 5, # min_price=500, # max_price=2000 ) - # scraper = FundaScraper(search_params) - # df = scraper.run(clean_data = False) - # df.head() + scraper = FundaScraper(search_params) + df = scraper.run(clean_data = True) + df.head() - data_extractor = DataExtractor() - data_extractor.extract_data(search_params, run_id = "53861c47-7e64-11ef-921b-a0510ba6104e", clean_data = True) + # It's also possible to to extraction separately from fetching the html pages + # data_extractor = DataExtractor() + # data_extractor.extract_data(search_params, run_id = "7a46181a-7fd2-11ef-8dbf-a0510ba6104e", clean_data = True) From b399530c77d70d1703da81d0c144b4e19fd3606f Mon Sep 17 00:00:00 2001 From: Mustafa Karakaya Date: Wed, 9 Oct 2024 16:01:23 +0200 Subject: [PATCH 16/17] added some error handling --- funda_scraper/config/config.yaml | 1 + funda_scraper/extract.py | 16 +++++++++++++--- main.py | 15 ++++++--------- 3 files changed, 20 insertions(+), 12 deletions(-) diff --git a/funda_scraper/config/config.yaml b/funda_scraper/config/config.yaml index b28822d..e369dcd 100644 --- a/funda_scraper/config/config.yaml +++ b/funda_scraper/config/config.yaml @@ -11,6 +11,7 @@ keep_cols: - url - house_id - city + - neighborhood_name - house_type - building_type - price diff --git a/funda_scraper/extract.py b/funda_scraper/extract.py index 
b29863e..c79fbd4 100644 --- a/funda_scraper/extract.py +++ b/funda_scraper/extract.py @@ -3,6 +3,8 @@ import pandas as pd from bs4 import BeautifulSoup from typing import List +from tqdm import tqdm +import traceback from funda_scraper.config.core import config from funda_scraper.preprocess import preprocess_data @@ -26,11 +28,19 @@ def extract_data(self, search_request: SearchRequest, run_id: str, clean_data: b houses: list[Property] = [] - for page in detail_pages: - house = self.extract_data_from_detail_page(page, search_request) - houses.append(house) + houses_with_processing_errors = 0 + + for page in tqdm(detail_pages, desc = "Processing detail pages.."): + try: + house = self.extract_data_from_detail_page(page, search_request) + houses.append(house) + except Exception as e: + logger.error(f"An error occurred while processing house: {e}; skipping this house") + logger.error("Traceback:", exc_info=True) + houses_with_processing_errors += 1 logger.info(f"*** All scraping done: {len(houses)} results ***") + logger.info(f"There were {houses_with_processing_errors} houses that could not be processed") # It may be more intuitive to manipulate the Property objects instead of dataframes, but let's keep the dataframes approach for now # Note that we are omitting the photos field, which is an array field, and include the photos_string property diff --git a/main.py b/main.py index 09017ad..fe7a99c 100644 --- a/main.py +++ b/main.py @@ -9,18 +9,15 @@ want_to = "buy", find_sold = False, page_start = 1, - number_of_pages = 5, + number_of_pages = 100, # min_price=500, # max_price=2000 ) - scraper = FundaScraper(search_params) - df = scraper.run(clean_data = True) - df.head() + #scraper = FundaScraper(search_params) + #df = scraper.run(clean_data = True) + #df.head() # It's also possible to to extraction separately from fetching the html pages - # data_extractor = DataExtractor() - # data_extractor.extract_data(search_params, run_id = "7a46181a-7fd2-11ef-8dbf-a0510ba6104e", clean_data = True) - - - + data_extractor = DataExtractor() + data_extractor.extract_data(search_params, run_id = "196a5756-8643-11ef-840d-a0510ba6104e", clean_data = True) From c10d8fa1c7562dc1ae7f55fa80e87797ecb8d630 Mon Sep 17 00:00:00 2001 From: Mustafa Karakaya Date: Wed, 9 Oct 2024 16:38:42 +0200 Subject: [PATCH 17/17] use dictionaries instead of arrays for house list --- funda_scraper/extract.py | 12 ++++++++---- funda_scraper/filerepository.py | 4 ++-- main.py | 12 ++++++------ 3 files changed, 16 insertions(+), 12 deletions(-) diff --git a/funda_scraper/extract.py b/funda_scraper/extract.py index c79fbd4..5df2ea2 100644 --- a/funda_scraper/extract.py +++ b/funda_scraper/extract.py @@ -28,19 +28,23 @@ def extract_data(self, search_request: SearchRequest, run_id: str, clean_data: b houses: list[Property] = [] - houses_with_processing_errors = 0 + houses_with_processing_errors = [] for page in tqdm(detail_pages, desc = "Processing detail pages.."): try: - house = self.extract_data_from_detail_page(page, search_request) + content = detail_pages[page] + house = self.extract_data_from_detail_page(content, search_request) houses.append(house) except Exception as e: logger.error(f"An error occurred while processing house: {e}; skipping this house") logger.error("Traceback:", exc_info=True) - houses_with_processing_errors += 1 + houses_with_processing_errors.append(page) logger.info(f"*** All scraping done: {len(houses)} results ***") - logger.info(f"There were {houses_with_processing_errors} houses that could not be processed") 
+ logger.info(f"There were {len(houses_with_processing_errors)} houses that could not be processed") + for error_house in houses_with_processing_errors: + # TODO: move these to a separate errors folder or so + logger.info(f"Error: {error_house}") # It may be more intuitive to manipulate the Property objects instead of dataframes, but let's keep the dataframes approach for now # Note that we are omitting the photos field, which is an array field, and include the photos_string property diff --git a/funda_scraper/filerepository.py b/funda_scraper/filerepository.py index e2eb86e..a8e6ec9 100644 --- a/funda_scraper/filerepository.py +++ b/funda_scraper/filerepository.py @@ -33,7 +33,7 @@ def get_list_pages(self, run_id: str) -> List[str]: return pages def get_detail_pages(self, run_id: str) -> List[str]: - pages = [] + pages = {} detail_pages_dir = self._get_detail_pages_dir_name(run_id) @@ -43,7 +43,7 @@ def get_detail_pages(self, run_id: str) -> List[str]: if os.path.isfile(file_path): with open(file_path, 'r') as file: content = file.read() - pages.append(content) + pages[f] = content return pages diff --git a/main.py b/main.py index fe7a99c..c0c6763 100644 --- a/main.py +++ b/main.py @@ -8,16 +8,16 @@ area = "Amsterdam", want_to = "buy", find_sold = False, - page_start = 1, - number_of_pages = 100, + page_start = 90, + number_of_pages = 150, # min_price=500, # max_price=2000 ) - #scraper = FundaScraper(search_params) - #df = scraper.run(clean_data = True) + scraper = FundaScraper(search_params) + df = scraper.run(clean_data = True) #df.head() # It's also possible to to extraction separately from fetching the html pages - data_extractor = DataExtractor() - data_extractor.extract_data(search_params, run_id = "196a5756-8643-11ef-840d-a0510ba6104e", clean_data = True) + #data_extractor = DataExtractor() + #data_extractor.extract_data(search_params, run_id = "196a5756-8643-11ef-840d-a0510ba6104e", clean_data = True)
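
For reference, below is a minimal usage sketch of the API as it stands after this final patch, combining `SearchRequest`, `FundaScraper`, and the standalone `DataExtractor` path that `main.py` exercises. It is illustrative only and not part of the patches themselves; the `run_id` value is a placeholder for a run whose list and detail pages were already fetched and saved.

```python
from funda_scraper.scrape import FundaScraper
from funda_scraper.extract import DataExtractor
from funda_scraper.searchrequest import SearchRequest

if __name__ == "__main__":
    # One SearchRequest describes the query; both entry points consume it.
    search_params = SearchRequest(
        area="amsterdam",
        want_to="buy",
        find_sold=False,
        page_start=1,
        number_of_pages=3,
    )

    # Full pipeline: fetch list and detail pages, then extract and clean the data.
    scraper = FundaScraper(search_params)
    df = scraper.run(clean_data=True)
    print(df.head())

    # Extraction only, reusing detail pages saved under an earlier run
    # ("<existing-run-id>" is a placeholder, not a value from the patches).
    extractor = DataExtractor()
    raw_df = extractor.extract_data(search_params, run_id="<existing-run-id>", clean_data=False)
```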