From 62ed261ba715cf6d9c8282fb2e7f24bc3dfbe80d Mon Sep 17 00:00:00 2001 From: lethuan007 <111433077+lethuan007@users.noreply.github.com> Date: Mon, 14 Oct 2024 22:26:11 +0200 Subject: [PATCH 1/7] Update scrape.py Added min/max plot_area Added function for multiple areas Added 'property type functionality' --- funda_scraper/scrape.py | 48 +++++++++++++++++++++++++++++++++++++++-- 1 file changed, 46 insertions(+), 2 deletions(-) diff --git a/funda_scraper/scrape.py b/funda_scraper/scrape.py index 17f8383..6586e5d 100644 --- a/funda_scraper/scrape.py +++ b/funda_scraper/scrape.py @@ -3,6 +3,7 @@ import argparse import datetime import json +import time import multiprocessing as mp import os from collections import OrderedDict @@ -38,11 +39,13 @@ def __init__( property_type: Optional[str] = None, min_floor_area: Optional[str] = None, max_floor_area: Optional[str] = None, + min_plot_area: Optional[str] = None, + max_plot_area: Optional[str] = none, sort: Optional[str] = None, ): """ - :param area: The area to search for properties, formatted for URL compatibility. + :param area: The area to search for properties, this can be a comma-seperated list, formatted for URL compatibility. :param want_to: Specifies whether the user wants to buy or rent properties. :param page_start: The starting page number for the search. :param n_pages: The number of pages to scrape. @@ -53,10 +56,12 @@ def __init__( :param property_type: The type of property to search for. :param min_floor_area: The minimum floor area for the property search. :param max_floor_area: The maximum floor area for the property search. + :param min_plot_area: The minimum plot area for the property search. + :param max_plot_area: The maximum plot area for the property search. :param sort: The sorting criterion for the search results. 
""" # Init attributes - self.area = area.lower().replace(" ", "-") + self.area = area.lower().replace(" ", "-").replace(",","\",\"") #added functionality to add multiple cities, seperated by ', ' self.property_type = property_type self.want_to = want_to self.find_past = find_past @@ -68,6 +73,8 @@ def __init__( self.days_since = days_since self.min_floor_area = min_floor_area self.max_floor_area = max_floor_area + self.min_plot_area = min_plot_area + self.max_plot_area = max_plot_area self.sort = sort # Instantiate along the way @@ -89,6 +96,8 @@ def __repr__(self): f"days_since={self.days_since}, " f"min_floor_area={self.min_floor_area}, " f"max_floor_area={self.max_floor_area}, " + f"min_plot_area={self.min_plot_area}, " + f"max_plot_area={self.max_plot_area}, " f"find_past={self.find_past})" f"min_price={self.min_price})" f"max_price={self.max_price})" @@ -168,6 +177,8 @@ def reset( days_since: Optional[int] = None, min_floor_area: Optional[str] = None, max_floor_area: Optional[str] = None, + min_plot_area: Optional[str] = None, + max_plot_area: Optional[str] = None, sort: Optional[str] = None, ) -> None: """Resets or initializes the search parameters.""" @@ -193,6 +204,10 @@ def reset( self.min_floor_area = min_floor_area if max_floor_area is not None: self.max_floor_area = max_floor_area + if min_plot_area is not None: + self.min_plot_area = min_plot_area + if max_plot_area is not None: + self.max_plot_area = max_plot_area if sort is not None: self.sort = sort @@ -232,6 +247,7 @@ def fetch_all_links(self, page_start: int = None, n_pages: int = None) -> None: f"{main_url}&search_result={i}" ) urls += item_list + time.sleep(.5) except IndexError: self.page_end = i logger.info(f"*** The last available page is {self.page_end} ***") @@ -270,6 +286,11 @@ def _build_main_query_url(self) -> str: if self.days_since is not None: main_url = f"{main_url}&publication_date={self.check_days_since}" + + if self.min_plot_area or self.max_plot_area: + min_plot_area = "" if 
self.min_plot_area is None else self.min_plot_area + max_plot_area = "" if self.max_plot_area is None else self.max_plot_area + main_url = f"{main_url}&plot_area=%22{min_plot_area}-{max_plot_area}%22" if self.min_floor_area or self.max_floor_area: min_floor_area = "" if self.min_floor_area is None else self.min_floor_area @@ -455,12 +476,30 @@ def run( parser.add_argument( "--max_price", type=int, help="Specify the max price", default=None ) + parser.add_argument( + "--max_plot_area", type=int, help="Specify the max plot area", default=None + ) + parser.add_argument( + "--max_floor_area", type=int, help="Specify the max floor area", default=None + ) + parser.add_argument( + "--min_plot_area", type=int, help="Specify the min plot area", default=None + ) + parser.add_argument( + "--min_floor_area", type=int, help="Specify the min floor area", default=None + ) parser.add_argument( "--days_since", type=int, help="Specify the days since publication", default=None, ) + parser.add_argument( + "--property_type", + type=str, + help="Specify the type of property(house, land, appartment)", + default="house", + ) parser.add_argument( "--sort", type=str, @@ -498,6 +537,11 @@ def run( n_pages=args.n_pages, min_price=args.min_price, max_price=args.max_price, + min_plot_area=args.min_plot_area, + max_plot_area=args.max_plot_area, + min_floor_area=args.min_floor_area, + max_floor_area=args.max_floor_area, + property_type=args.property_type, days_since=args.days_since, sort=args.sort, ) From 1ec474c4675e37e0116172312e1b0c2b8f8e7bf7 Mon Sep 17 00:00:00 2001 From: insitive-jws <111644923+insitive-jws@users.noreply.github.com> Date: Thu, 17 Oct 2024 18:28:15 +0200 Subject: [PATCH 2/7] Added additional fields based on json, added scraping of opendata for db, ppm2.5 and no2, as well as additional data like average neighbourhood price / m2, %families. 
--- funda_scraper/config/config.yaml | 227 +++++++++++++++++++++++++------ 1 file changed, 185 insertions(+), 42 deletions(-) diff --git a/funda_scraper/config/config.yaml b/funda_scraper/config/config.yaml index 789662c..1528f07 100644 --- a/funda_scraper/config/config.yaml +++ b/funda_scraper/config/config.yaml @@ -9,52 +9,195 @@ keep_cols: # - term_days selling_data: - url - - house_id - - city - - house_type - - building_type - price - price_m2 - - room - - bedroom - - bathroom + - price_m2grond + - status + - acceptance + - listed_since + - house_type + - building_type + - year_built + - building_roofing + - building_details + - size - living_area + - property_area + - balcony_size + - other_interior_area + - other_exterior_area + - exteriors + - rooms + - bathroom + - bedroom + - toilets + - stories + - layout - energy_label - - zip + - insulation + - heating + - heatedwater + - heatingCV + - heatingAge + - solarpanels + - heatpump + - lowenergy + - street - address - - year_built - - house_age - # - date_list - # - ym_list - # - year_list + - zip_code + - city - descrip - - photo + - ownership + - cadastralarea + - location + - garden + - gardensize + - gardenorientation + - balcony + - parking + - parkingownproperty + - enclosedparking + - neighborhood_name + - latitude + - longitude + - monument + - monumentstatus + - DIYhome + - leasehold + - buurtid + - photos + - neighbourhood_price_m2 + - neighbourhood_families + - sound + - ppm25 + - no2 css_selector: - url: none - price: ".object-header__price" - address: ".object-header__title" - descrip: ".object-description-body" - listed_since: ".fd-align-items-center:nth-child(6) span" - zip_code: ".object-header__subtitle" - size: ".object-kenmerken-list:nth-child(8) .fd-align-items-center:nth-child(5) span" - year: ".fd-align-items-center~ .fd-align-items-center .fd-m-right-xs" - living_area: ".object-kenmerken-list:nth-child(8) .fd-align-items-center:nth-child(2) span" - kind_of_house: 
".object-kenmerken-list:nth-child(5) .fd-align-items-center:nth-child(2) span" - building_type: ".object-kenmerken-list:nth-child(5) .fd-align-items-center:nth-child(4) span" - num_of_rooms: ".object-kenmerken-list:nth-child(11) .fd-align-items-center:nth-child(2)" - num_of_bathrooms: ".object-kenmerken-list:nth-child(11) .fd-align-items-center:nth-child(4)" - layout: ".object-kenmerken-list:nth-child(11)" - energy_label: ".energielabel" - insulation: ".object-kenmerken-list:nth-child(14) .fd-align-items-center:nth-child(4)" - heating: ".object-kenmerken-list:nth-child(14) .fd-align-items-center:nth-child(6)" - ownership: ".object-kenmerken-list:nth-child(17) .fd-align-items-center:nth-child(4)" - exteriors: ".object-kenmerken-list:nth-child(19)" - parking: ".object-kenmerken-list:nth-child(24)" - neighborhood_name: ".fd-display-inline--bp-m" - date_list: "dd:nth-child(2)" - date_sold: "dd:nth-child(4)" - term: "dd:nth-child(6)" - price_sold: ".object-header__price--historic" - last_ask_price: ".object-kenmerken-list:nth-child(2) .fd-align-items-center:nth-child(2)" - last_ask_price_m2: ".object-kenmerken-list__asking-price" - photo: ".media-viewer-overview__section-list-item--photo img[data-lazy]" + url: "ContentUrl" + price: + - "Id" + - + - "overdracht-huurprijs" + - "overdracht-vraagprijs" + price_m2: + - "Id" + - "overdracht-vraagprijsperm2" + status: + - "Id" + - "overdracht-status" + acceptance: + - "Label" + - "Aanvaarding" + listed_since: + - "Label" + - + - "Aangeboden sinds" + house_type: + - "Id" + - "bouw-soortobject" + building_type: + - "Id" + - "bouw-soortbouw" + year: + - "Label" + - + - "Bouwjaar" + - "Bouwperiode" + building_roofing: + - "Id" + - "bouw-dak" + building_details: + - "Id" + - "bouw-bijzonderheden" + size: + - "Id" + - "afmetingen-inhoud" + living_area: "woonoppervlakte" + property_area: "PerceelOppervlakteSubTitle" + balcony_size: + - "Id" + - "afmetingen-gebruiksoppervlakte-gebouwgebondenbuitenruimte" + other_interior_area: + - 
"Id" + - "afmetingen-gebruiksoppervlakte-overigeinpandigeruimte" + other_exterior_area: + - "Label" + - "Externe bergruimte" + exteriors: + - "Id" + - "afmetingen-gebruiksoppervlakte-externebergruimte" + num_of_rooms: "aantalkamers" + num_of_bathrooms: + - "Id" + - "indeling-totalbathroom" + num_of_bedrooms: "NumberOfBedrooms" + stories: + - "Id" + - "indeling-totalstories" + layout: + - "Id" + - "indeling-generalfacilities" + energy_label: "EnergyLabel" + insulation: + - "Id" + - "energie-isolatie" + heating: + - "Id" + - "energie-verwarming" + heatedwater: + - "Id" + - "energie-warmwater" + heatingCV: + - "Id" + - "energie-cv" + heatingAge: "cvketel" + solarpanels: "zonnepanelen" + heatpump: "warmtepomp" + lowenergy: "energiezuinig" + street: "AddressTitle" + address: "AddressSubTitle" + zip_code: "postcode" + city: "plaats" + descrip: "Aanbiedingstekst" + ownership: + - "Id" + - "cadastral-ownershipsituation" + cadastralarea: + - "Id" + - "cadastral-area" + location: + - "Id" + - "buitenruimte-ligging" + garden: + - "Id" + - "buitenruimte-tuin" + gardensize: + - "Id" + - "buitenruimte-hoofdtuin" + gardenorientation: + - "Id" + - "buitenruimte-liggingtuin" + balcony: + - "Id" + - "buitenruimte-balkonterras" + parking: + - "Label" + - "Soort parkeergelegenheid" + parkingownproperty: "parkeergelegenheidopeigenterrein" + enclosedparking: "parkeergelegenheidopeigenterrein" + neighborhood_name: "BuurtName" + latitude: "Latitude" + longitude: "Longitude" + monument: "rijksmonument" + monumentstatus: "monumentalestatus" + DIYhome: "kluswoning" + leasehold: "erfpacht" + buurtid: "BuurtIdentifier" + photos: + term: + price_sold: + date_sold: + neighbourhood_price_m2: + neighbourhood_families: + sound: + ppm25: + no2: \ No newline at end of file From 2e3193778878bd4d4116474ad9fb61b3e1c92822 Mon Sep 17 00:00:00 2001 From: insitive-jws <111644923+insitive-jws@users.noreply.github.com> Date: Thu, 17 Oct 2024 18:28:43 +0200 Subject: [PATCH 3/7] updated preprocessing to handle 
all additional fields --- funda_scraper/preprocess.py | 42 +++++++++++++++++++++++++++++++++---- 1 file changed, 38 insertions(+), 4 deletions(-) diff --git a/funda_scraper/preprocess.py b/funda_scraper/preprocess.py index 96b8cf1..06a9a5c 100644 --- a/funda_scraper/preprocess.py +++ b/funda_scraper/preprocess.py @@ -5,6 +5,7 @@ from typing import List, Union import pandas as pd +import numpy as np from dateutil.parser import parse from funda_scraper.config.core import config @@ -32,7 +33,7 @@ def clean_year(x: str) -> int: return 0 -def clean_living_area(x: str) -> int: +def clean_m2(x: str) -> int: """Clean the 'living_area' and transform from string to integer""" try: return int(str(x).replace(",", "").split(" m²")[0]) @@ -41,6 +42,24 @@ def clean_living_area(x: str) -> int: except IndexError: return 0 +def clean_garden(x: str) -> int: + """Clean the 'living_area' and transform from string to integer""" + try: + return str(x).replace(" m²", " m2") + except ValueError: + return 0 + except IndexError: + return 0 + +def clean_m3(x: str) -> int: + """Clean the 'living_area' and transform from string to integer""" + try: + return int(str(x).replace(",", "").split(" m³")[0]) + except ValueError: + return 0 + except IndexError: + return 0 + def find_keyword_from_regex(x: str, pattern: str) -> int: result = re.findall(pattern, x) @@ -69,6 +88,10 @@ def find_n_bathroom(x: str) -> int: pattern = r"(\d{1,2}\s{1}badkamers{0,1})|(\d{1,2}\s{1}bathrooms{0,1})" return find_keyword_from_regex(x, pattern) +def find_n_toilets(x: str) -> int: + """Find the number of bathrooms from a string""" + pattern = r"(\d{1,2}\s{1}apart{0,1})|(\d{1,2}\s{1}seperate{0,1})" + return find_keyword_from_regex(x, pattern) def map_dutch_month(x: str) -> str: """Map the month from Dutch to English.""" @@ -176,17 +199,28 @@ def preprocess_data( price_col = "price_sold" if is_past else "price" df["price"] = df[price_col].apply(clean_price) df = df[df["price"] != 0] - df["living_area"] = 
df["living_area"].apply(clean_living_area) + df["size"] = df["size"].apply(clean_m3) + df["balcony_size"] = df["balcony_size"].apply(clean_m2) + df["other_interior_area"] = df["other_interior_area"].apply(clean_m2) + df["other_exterior_area"] = df["other_exterior_area"].apply(clean_m2) + df["cadastralarea"] = df["cadastralarea"].apply(clean_m2) + df["exteriors"] = df["exteriors"].apply(clean_m2) + df["gardensize"] = df["gardensize"].apply(clean_garden) + df['property_area'] = df['property_area'].apply(clean_m2) + df["living_area"] = df["living_area"].apply(clean_m2) df = df[df["living_area"] != 0] df["price_m2"] = round(df.price / df.living_area, 1) + df["price_m2grond"] = round(df.price / df.property_area,1) + df["price_m2grond"] = df["price_m2grond"].replace(np.inf, 0) # Location - df["zip"] = df["zip_code"].apply(lambda x: x[:4]) + df["zip"] = df["zip_code"].apply(lambda x: x[:7]) # House layout - df["room"] = df["num_of_rooms"].apply(find_n_room) + df["rooms"] = df["num_of_rooms"].apply(find_n_room) df["bedroom"] = df["num_of_rooms"].apply(find_n_bedroom) df["bathroom"] = df["num_of_bathrooms"].apply(find_n_bathroom) + df["toilets"] = df["num_of_bathrooms"].apply(find_n_toilets) df["energy_label"] = df["energy_label"].apply(clean_energy_label) # Time From f78ef6afdfa4bad178c3baa0b38d4c37bc76b814 Mon Sep 17 00:00:00 2001 From: insitive-jws <111644923+insitive-jws@users.noreply.github.com> Date: Fri, 18 Oct 2024 13:04:02 +0200 Subject: [PATCH 4/7] Added extraction of garden width, and garden depth, as well as garden_size --- funda_scraper/preprocess.py | 69 ++++++++++++++++++++----------------- 1 file changed, 37 insertions(+), 32 deletions(-) diff --git a/funda_scraper/preprocess.py b/funda_scraper/preprocess.py index 06a9a5c..d294c51 100644 --- a/funda_scraper/preprocess.py +++ b/funda_scraper/preprocess.py @@ -10,7 +10,6 @@ from funda_scraper.config.core import config - def clean_price(x: str) -> int: """Clean the 'price' and transform from string to 
integer.""" try: @@ -20,7 +19,6 @@ def clean_price(x: str) -> int: except IndexError: return 0 - def clean_year(x: str) -> int: """Clean the 'year' and transform from string to integer""" if len(x) == 4: @@ -32,34 +30,43 @@ def clean_year(x: str) -> int: else: return 0 - def clean_m2(x: str) -> int: - """Clean the 'living_area' and transform from string to integer""" + """Clean the 'm2' and transform from string to integer""" try: - return int(str(x).replace(",", "").split(" m²")[0]) + return int(str(x).replace(".", "").split(" m²")[0]) except ValueError: return 0 except IndexError: return 0 def clean_garden(x: str) -> int: - """Clean the 'living_area' and transform from string to integer""" + """Clean the 'gardensize' and transform to readable format m2""" try: - return str(x).replace(" m²", " m2") + pattern = r"(\d{1,5}\s{1}m2{0,1})" + return find_keyword_from_regex(str(x).replace(" m²", " m2"), pattern) except ValueError: return 0 except IndexError: return 0 def clean_m3(x: str) -> int: - """Clean the 'living_area' and transform from string to integer""" + """Clean the 'm3' and transform from string to integer""" try: - return int(str(x).replace(",", "").split(" m³")[0]) + return int(str(x).replace(".", "").split(" m³")[0]) except ValueError: return 0 except IndexError: return 0 +def find_garden_depth(x: str) -> float: + """Find the number of bathrooms from a string""" + pattern = r"(\d{1,2},\d{1,2}\s{1}meter diep{0,1})|(\d{1,2},\d{1,2}\s{1}metre deep{0,1})" + return float_find_keyword_from_regex(x, pattern) + +def find_garden_width(x: str) -> float: + """Find the number of bathrooms from a string""" + pattern = r"(\d{1,2},\d{1,2}\s{1}meter breed{0,1})|(\d{1,2},\d{1,2}\s{1}metre wide{0,1})" + return float_find_keyword_from_regex(x, pattern) def find_keyword_from_regex(x: str, pattern: str) -> int: result = re.findall(pattern, x) @@ -70,18 +77,15 @@ def find_keyword_from_regex(x: str, pattern: str) -> int: x = 0 return int(x) - -def find_n_room(x: str) -> int: - 
"""Find the number of rooms from a string""" - pattern = r"(\d{1,2}\s{1}kamers{0,1})|(\d{1,2}\s{1}rooms{0,1})" - return find_keyword_from_regex(x, pattern) - - -def find_n_bedroom(x: str) -> int: - """Find the number of bedrooms from a string""" - pattern = r"(\d{1,2}\s{1}slaapkamers{0,1})|(\d{1,2}\s{1}bedrooms{0,1})" - return find_keyword_from_regex(x, pattern) - +def float_find_keyword_from_regex(x: str, pattern: str) -> float: + result = re.findall(pattern, x) + if len(result) > 0: + result = "".join(result[0]) + x = result.split(" ")[0] + x = x.replace(",", ".") + else: + x = 0 + return float(x) def find_n_bathroom(x: str) -> int: """Find the number of bathrooms from a string""" @@ -110,7 +114,6 @@ def map_dutch_month(x: str) -> str: x = x.replace(k, v) return x - def clean_energy_label(x: str) -> str: """Clean the energy labels.""" try: @@ -121,7 +124,6 @@ def clean_energy_label(x: str) -> str: except IndexError: return x - def clean_date_format(x: str) -> Union[datetime, str]: """Transform the date from string to datetime object.""" @@ -195,19 +197,22 @@ def preprocess_data( df["house_type"] = df["url"].apply(lambda x: x.split("/")[-2].split("-")[0]) df = df[df["house_type"].isin(["appartement", "huis"])] - # Price - price_col = "price_sold" if is_past else "price" - df["price"] = df[price_col].apply(clean_price) - df = df[df["price"] != 0] - df["size"] = df["size"].apply(clean_m3) + # Areas + df["volume"] = df["volume"].apply(clean_m3) df["balcony_size"] = df["balcony_size"].apply(clean_m2) df["other_interior_area"] = df["other_interior_area"].apply(clean_m2) df["other_exterior_area"] = df["other_exterior_area"].apply(clean_m2) df["cadastralarea"] = df["cadastralarea"].apply(clean_m2) df["exteriors"] = df["exteriors"].apply(clean_m2) - df["gardensize"] = df["gardensize"].apply(clean_garden) df['property_area'] = df['property_area'].apply(clean_m2) df["living_area"] = df["living_area"].apply(clean_m2) + df["garden_width"] = 
df["garden_size"].apply(find_garden_width) # first before cleaning garden_size + df["garden_depth"] = df["garden_size"].apply(find_garden_depth) # first before cleaning garden_size + df["garden_size"] = df["garden_size"].apply(clean_garden) + # Price + price_col = "price_sold" if is_past else "price" + df["price"] = df[price_col].apply(clean_price) + df = df[df["price"] != 0] df = df[df["living_area"] != 0] df["price_m2"] = round(df.price / df.living_area, 1) df["price_m2grond"] = round(df.price / df.property_area,1) @@ -217,14 +222,14 @@ def preprocess_data( df["zip"] = df["zip_code"].apply(lambda x: x[:7]) # House layout - df["rooms"] = df["num_of_rooms"].apply(find_n_room) - df["bedroom"] = df["num_of_rooms"].apply(find_n_bedroom) + df["rooms"] = df["num_of_rooms"] + df["bedroom"] = df["num_of_bedrooms"] df["bathroom"] = df["num_of_bathrooms"].apply(find_n_bathroom) df["toilets"] = df["num_of_bathrooms"].apply(find_n_toilets) df["energy_label"] = df["energy_label"].apply(clean_energy_label) # Time - df["year_built"] = df["year"].apply(clean_year).astype(int) + df["year_built"] = df["year_built"].apply(clean_year).astype(int) df["house_age"] = datetime.now().year - df["year_built"] if is_past: From c8a8eca1d28d37e04a172fa91b32f8e3c375a8b1 Mon Sep 17 00:00:00 2001 From: insitive-jws <111644923+insitive-jws@users.noreply.github.com> Date: Fri, 18 Oct 2024 13:04:56 +0200 Subject: [PATCH 5/7] Added extraction of garden width, and garden depth, as well as garden_size --- funda_scraper/config/config.yaml | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/funda_scraper/config/config.yaml b/funda_scraper/config/config.yaml index 1528f07..a897011 100644 --- a/funda_scraper/config/config.yaml +++ b/funda_scraper/config/config.yaml @@ -64,13 +64,14 @@ keep_cols: - monumentstatus - DIYhome - leasehold - - buurtid + - NeighourhoodId - photos - neighbourhood_price_m2 - neighbourhood_families - sound - ppm25 - no2 + - log_id css_selector: url: 
"ContentUrl" price: @@ -97,7 +98,7 @@ css_selector: building_type: - "Id" - "bouw-soortbouw" - year: + year_built: - "Label" - - "Bouwjaar" @@ -191,13 +192,13 @@ css_selector: monumentstatus: "monumentalestatus" DIYhome: "kluswoning" leasehold: "erfpacht" - buurtid: "BuurtIdentifier" - photos: term: price_sold: - date_sold: + date_sold: + NeighourhoodId: "BuurtIdentifier" + photos: neighbourhood_price_m2: neighbourhood_families: sound: ppm25: - no2: \ No newline at end of file + no2: From 387d938d16e301f4851063ff0a249762885ce90b Mon Sep 17 00:00:00 2001 From: insitive-jws <111644923+insitive-jws@users.noreply.github.com> Date: Fri, 18 Oct 2024 13:13:18 +0200 Subject: [PATCH 6/7] Updated Scrape with following changes: - seperated Search-request in seperate class for readability - changed to json retreival and extraction of data! - save each json used in /data/id/*.json - added option to download photos to (date/id/photo) - added the polution (sound/ppm2/no2) - added Neighbourhood data from json - added most fields from funda-json - usage for json selectors: - 1 element -> finds the content of the key - 2 elements -> finds the content of the 'Value' key by searching in the (1) Key, (2) Values. e.g. Id, ["overdracht-huurprijs", "overdracht-vraagprijs"], searches for 'Id' containing either of "overdracht-huurprijs", "overdracht-vraagprijs", and returns the content of 'Value' within that element. 
--- .gitignore | 2 + funda_scraper/scrape.py | 434 +++++++++++++++++++++++++--------------- 2 files changed, 276 insertions(+), 160 deletions(-) create mode 100644 .gitignore diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..0c38f5c --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +.vscode/launch.json +data \ No newline at end of file diff --git a/funda_scraper/scrape.py b/funda_scraper/scrape.py index 6586e5d..e6845d3 100644 --- a/funda_scraper/scrape.py +++ b/funda_scraper/scrape.py @@ -4,6 +4,7 @@ import datetime import json import time +import re import multiprocessing as mp import os from collections import OrderedDict @@ -20,12 +21,7 @@ from funda_scraper.preprocess import clean_date_format, preprocess_data from funda_scraper.utils import logger - -class FundaScraper(object): - """ - A class used to scrape real estate data from the Funda website. - """ - +class SearchRequest(object): def __init__( self, area: str, @@ -33,6 +29,7 @@ def __init__( page_start: int = 1, n_pages: int = 1, find_past: bool = False, + download_photos: bool = False, min_price: Optional[int] = None, max_price: Optional[int] = None, days_since: Optional[int] = None, @@ -40,7 +37,7 @@ def __init__( min_floor_area: Optional[str] = None, max_floor_area: Optional[str] = None, min_plot_area: Optional[str] = None, - max_plot_area: Optional[str] = none, + max_plot_area: Optional[str] = None, sort: Optional[str] = None, ): """ @@ -65,6 +62,7 @@ def __init__( self.property_type = property_type self.want_to = want_to self.find_past = find_past + self.download_photos = download_photos self.page_start = max(page_start, 1) self.n_pages = max(n_pages, 1) self.page_end = self.page_start + self.n_pages - 1 @@ -76,35 +74,6 @@ def __init__( self.min_plot_area = min_plot_area self.max_plot_area = max_plot_area self.sort = sort - - # Instantiate along the way - self.links: List[str] = [] - self.raw_df = pd.DataFrame() - self.clean_df = pd.DataFrame() - self.base_url = config.base_url - 
self.selectors = config.css_selector - - def __repr__(self): - return ( - f"FundaScraper(area={self.area}, " - f"want_to={self.want_to}, " - f"n_pages={self.n_pages}, " - f"page_start={self.page_start}, " - f"find_past={self.find_past}, " - f"min_price={self.min_price}, " - f"max_price={self.max_price}, " - f"days_since={self.days_since}, " - f"min_floor_area={self.min_floor_area}, " - f"max_floor_area={self.max_floor_area}, " - f"min_plot_area={self.min_plot_area}, " - f"max_plot_area={self.max_plot_area}, " - f"find_past={self.find_past})" - f"min_price={self.min_price})" - f"max_price={self.max_price})" - f"days_since={self.days_since})" - f"sort={self.sort})" - ) - @property def to_buy(self) -> bool: """Determines if the search is for buying or renting properties.""" @@ -138,7 +107,8 @@ def check_sort(self) -> str: "price_down", "floor_area_down", "plot_area_down", - "city_up" "postal_code_up", + "city_up", + "postal_code_up", ]: return self.sort else: @@ -147,23 +117,6 @@ def check_sort(self) -> str: "'floor_area_down', 'plot_area_down', 'city_up' or 'postal_code_up'. 
" ) - @staticmethod - def _check_dir() -> None: - """Ensures the existence of the directory for storing data.""" - if not os.path.exists("data"): - os.makedirs("data") - - @staticmethod - def _get_links_from_one_parent(url: str) -> List[str]: - """Scrapes all available property links from a single Funda search page.""" - response = requests.get(url, headers=config.header) - soup = BeautifulSoup(response.text, "lxml") - - script_tag = soup.find_all("script", {"type": "application/ld+json"})[0] - json_data = json.loads(script_tag.contents[0]) - urls = [item["url"] for item in json_data["itemListElement"]] - return urls - def reset( self, area: Optional[str] = None, @@ -171,6 +124,7 @@ def reset( want_to: Optional[str] = None, page_start: Optional[int] = None, n_pages: Optional[int] = None, + download_photos: Optional[bool] = None, find_past: Optional[bool] = None, min_price: Optional[int] = None, max_price: Optional[int] = None, @@ -192,6 +146,8 @@ def reset( self.page_start = max(page_start, 1) if n_pages is not None: self.n_pages = max(n_pages, 1) + if download_photos is not None: + self.download_photos = download_photos if find_past is not None: self.find_past = find_past if min_price is not None: @@ -211,34 +167,86 @@ def reset( if sort is not None: self.sort = sort + +class FundaScraper(object): + """ + A class used to scrape real estate data from the Funda website. + """ + + def __init__( + self, + search_request: SearchRequest + ): + """ + + :param area: The area to search for properties, this can be a comma-seperated list, formatted for URL compatibility. + :param want_to: Specifies whether the user wants to buy or rent properties. + :param page_start: The starting page number for the search. + :param n_pages: The number of pages to scrape. + :param find_past: Flag to indicate whether to find past listings. + :param min_price: The minimum price for the property search. + :param max_price: The maximum price for the property search. 
+ :param days_since: The maximum number of days since the listing was published. + :param property_type: The type of property to search for. + :param min_floor_area: The minimum floor area for the property search. + :param max_floor_area: The maximum floor area for the property search. + :param min_plot_area: The minimum plot area for the property search. + :param max_plot_area: The maximum plot area for the property search. + :param sort: The sorting criterion for the search results. + """ + # Init attributes + self.search_request = search_request + + # Instantiate along the way + self.links: List[str] = [] + self.raw_df = pd.DataFrame() + self.clean_df = pd.DataFrame() + self.base_url = config.base_url + self.selectors = config.css_selector + + def __repr__(self): + return str(self.search_request) + + @staticmethod + def _check_dir() -> None: + date = str(datetime.datetime.now().date()).replace("-", "") + """Ensures the existence of the directory for storing data.""" + if not os.path.exists("data"): + os.makedirs("data") + if not os.path.exists(f"data/{date}"): + os.makedirs(f"data/{date}") + + @staticmethod + def _get_links_from_one_parent(url: str) -> List[str]: + """Scrapes all available property links from a single Funda search page.""" + response = requests.get(url, headers=config.header) + soup = BeautifulSoup(response.text, "lxml") + script_tag = soup.find_all("script", string=lambda t: t and "window.__NUXT__" in t)[0] + script_content = script_tag.string + ids = re.findall(r',_id:"(\d+)"', script_content) + return ids + @staticmethod def remove_duplicates(lst: List[str]) -> List[str]: """Removes duplicate links from a list.""" return list(OrderedDict.fromkeys(lst)) @staticmethod - def fix_link(link: str) -> str: - """Fixes a given property link to ensure proper URL formatting.""" - link_url = urlparse(link) - link_path = link_url.path.split("/") - property_id = link_path.pop(5) - property_address = link_path.pop(4).split("-") - link_path = link_path[2:4] 
- property_address.insert(1, property_id) - link_path.extend(["-".join(property_address), "?old_ldp=true"]) - fixed_link = urlunparse( - (link_url.scheme, link_url.netloc, "/".join(link_path), "", "", "") + def gen_link(link: str) -> str: + """Generates a given property link.""" + gen_url = ( + f"https://listing-detail-page.funda.io/api/v1/listing/nl/{link}" ) - return fixed_link + return gen_url def fetch_all_links(self, page_start: int = None, n_pages: int = None) -> None: """Collects all available property links across multiple pages.""" - page_start = self.page_start if page_start is None else page_start - n_pages = self.n_pages if n_pages is None else n_pages + page_start = self.search_request.page_start if page_start is None else page_start + n_pages = self.search_request.n_pages if n_pages is None else n_pages logger.info("*** Phase 1: Fetch all the available links from all pages *** ") - urls = [] + ids = [] main_url = self._build_main_query_url() for i in tqdm(range(page_start, page_start + n_pages)): @@ -246,142 +254,238 @@ def fetch_all_links(self, page_start: int = None, n_pages: int = None) -> None: item_list = self._get_links_from_one_parent( f"{main_url}&search_result={i}" ) - urls += item_list - time.sleep(.5) + ids += item_list + time.sleep(.2) except IndexError: self.page_end = i logger.info(f"*** The last available page is {self.page_end} ***") break - urls = self.remove_duplicates(urls) - fixed_urls = [self.fix_link(url) for url in urls] + ids = self.remove_duplicates(ids) + gen_urls = [self.gen_link(id) for id in ids] logger.info( - f"*** Got all the urls. {len(fixed_urls)} houses found from {self.page_start} to {self.page_end} ***" + f"*** Got all the urls. 
{len(gen_urls)} houses found from {self.search_request.page_start} to {self.search_request.page_end} ***" ) - self.links = fixed_urls + self.links = gen_urls def _build_main_query_url(self) -> str: """Constructs the main query URL for the search.""" - query = "koop" if self.to_buy else "huur" + query = "koop" if self.search_request.to_buy else "huur" main_url = ( - f"{self.base_url}/zoeken/{query}?selected_area=%5B%22{self.area}%22%5D" + f"{self.base_url}/zoeken/{query}?selected_area=%5B%22{self.search_request.area}%22%5D" ) - if self.property_type: - property_types = self.property_type.split(",") + if self.search_request.property_type: + property_types = self.search_request.property_type.split(",") formatted_property_types = [ "%22" + prop_type + "%22" for prop_type in property_types ] main_url += f"&object_type=%5B{','.join(formatted_property_types)}%5D" - if self.find_past: + if self.search_request.find_past: main_url = f'{main_url}&availability=%5B"unavailable"%5D' - if self.min_price is not None or self.max_price is not None: - min_price = "" if self.min_price is None else self.min_price - max_price = "" if self.max_price is None else self.max_price + if self.search_request.min_price is not None or self.search_request.max_price is not None: + min_price = "" if self.search_request.min_price is None else self.search_request.min_price + max_price = "" if self.search_request.max_price is None else self.search_request.max_price main_url = f"{main_url}&price=%22{min_price}-{max_price}%22" - if self.days_since is not None: - main_url = f"{main_url}&publication_date={self.check_days_since}" + if self.search_request.days_since is not None: + main_url = f"{main_url}&publication_date={self.search_request.check_days_since}" - if self.min_plot_area or self.max_plot_area: - min_plot_area = "" if self.min_plot_area is None else self.min_plot_area - max_plot_area = "" if self.max_plot_area is None else self.max_plot_area + if self.search_request.min_plot_area or 
self.search_request.max_plot_area: + min_plot_area = "" if self.search_request.min_plot_area is None else self.search_request.min_plot_area + max_plot_area = "" if self.search_request.max_plot_area is None else self.search_request.max_plot_area main_url = f"{main_url}&plot_area=%22{min_plot_area}-{max_plot_area}%22" - if self.min_floor_area or self.max_floor_area: - min_floor_area = "" if self.min_floor_area is None else self.min_floor_area - max_floor_area = "" if self.max_floor_area is None else self.max_floor_area + if self.search_request.min_floor_area or self.search_request.max_floor_area: + min_floor_area = "" if self.search_request.min_floor_area is None else self.search_request.min_floor_area + max_floor_area = "" if self.search_request.max_floor_area is None else self.search_request.max_floor_area main_url = f"{main_url}&floor_area=%22{min_floor_area}-{max_floor_area}%22" - if self.sort is not None: - main_url = f"{main_url}&sort=%22{self.check_sort}%22" + if self.search_request.sort is not None: + main_url = f"{main_url}&sort=%22{self.search_request.check_sort}%22" logger.info(f"*** Main URL: {main_url} ***") return main_url +# @staticmethod +# def get_value_from_css(soup: BeautifulSoup, selector: str) -> str: +# """Extracts data from HTML using a CSS selector.""" +# result = soup.select(selector) +# if len(result) > 0: +# result = result[0].text +# else: +# result = "na" +# return result + @staticmethod - def get_value_from_css(soup: BeautifulSoup, selector: str) -> str: - """Extracts data from HTML using a CSS selector.""" - result = soup.select(selector) - if len(result) > 0: - result = result[0].text - else: - result = "na" - return result + def extract_value(data, target_key, target_values=None): + if isinstance(data, dict): + for key, value in data.items(): + if key == target_key and (target_values is None or (isinstance(target_values, list) and any(value == tv for tv in target_values)) or value == target_values): + return data.get("Value", value) + 
if isinstance(value, (dict, list)): + result = FundaScraper.extract_value(value, target_key, target_values) + if result != "na": + return result + elif isinstance(data, list): + for item in data: + result = FundaScraper.extract_value(item, target_key, target_values) + if result != "na": + return result + return "na" + def scrape_one_link(self, link: str) -> List[str]: """Scrapes data from a single property link.""" # Initialize for each page + date = str(datetime.datetime.now().date()).replace("-", "") response = requests.get(link, headers=config.header) - soup = BeautifulSoup(response.text, "lxml") + json_data = response.json() + id = link.rsplit('/', 1)[1] + os.makedirs(f"data/{date}/{id}",exist_ok=True) + with open(f"data/{date}/{id}/{id}.json", 'w') as json_file: + json.dump(json_data, json_file, indent=4) + - # Get the value according to respective CSS selectors - if self.to_buy: - if self.find_past: + # Get the value according to respective json selectors + if self.search_request.to_buy: + if self.search_request.find_past: list_since_selector = self.selectors.date_list else: list_since_selector = self.selectors.listed_since else: - if self.find_past: + if self.search_request.find_past: list_since_selector = ".fd-align-items-center:nth-child(9) span" else: list_since_selector = ".fd-align-items-center:nth-child(7) span" result = [ - link, - self.get_value_from_css(soup, self.selectors.price), - self.get_value_from_css(soup, self.selectors.address), - self.get_value_from_css(soup, self.selectors.descrip), - self.get_value_from_css(soup, list_since_selector), - self.get_value_from_css(soup, self.selectors.zip_code), - self.get_value_from_css(soup, self.selectors.size), - self.get_value_from_css(soup, self.selectors.year), - self.get_value_from_css(soup, self.selectors.living_area), - self.get_value_from_css(soup, self.selectors.kind_of_house), - self.get_value_from_css(soup, self.selectors.building_type), - self.get_value_from_css(soup, 
self.selectors.num_of_rooms), - self.get_value_from_css(soup, self.selectors.num_of_bathrooms), - self.get_value_from_css(soup, self.selectors.layout), - self.get_value_from_css(soup, self.selectors.energy_label), - self.get_value_from_css(soup, self.selectors.insulation), - self.get_value_from_css(soup, self.selectors.heating), - self.get_value_from_css(soup, self.selectors.ownership), - self.get_value_from_css(soup, self.selectors.exteriors), - self.get_value_from_css(soup, self.selectors.parking), - self.get_value_from_css(soup, self.selectors.neighborhood_name), - self.get_value_from_css(soup, self.selectors.date_list), - self.get_value_from_css(soup, self.selectors.date_sold), - self.get_value_from_css(soup, self.selectors.term), - self.get_value_from_css(soup, self.selectors.price_sold), - self.get_value_from_css(soup, self.selectors.last_ask_price), - self.get_value_from_css(soup, self.selectors.last_ask_price_m2).split("\r")[ - 0 - ], - ] - - # Deal with list_since_selector especially, since its CSS varies sometimes - if clean_date_format(result[4]) == "na": - for i in range(6, 16): - selector = f".fd-align-items-center:nth-child({i}) span" - update_list_since = self.get_value_from_css(soup, selector) - if clean_date_format(update_list_since) == "na": - pass - else: - result[4] = update_list_since - + self.extract_value(json_data, self.selectors.url), + self.extract_value(json_data, *self.selectors.price), + self.extract_value(json_data, *self.selectors.price_m2), + self.extract_value(json_data, *self.selectors.status), + self.extract_value(json_data, *self.selectors.acceptance), + self.extract_value(json_data, *self.selectors.listed_since), + self.extract_value(json_data, *self.selectors.house_type), + self.extract_value(json_data, *self.selectors.building_type), + self.extract_value(json_data, self.selectors.year_built), + self.extract_value(json_data, *self.selectors.building_roofing), + self.extract_value(json_data, *self.selectors.building_details), + 
self.extract_value(json_data, *self.selectors.volume), + self.extract_value(json_data, self.selectors.living_area), + self.extract_value(json_data, self.selectors.property_area), + self.extract_value(json_data, *self.selectors.balcony_size), + self.extract_value(json_data, *self.selectors.other_interior_area), + self.extract_value(json_data, *self.selectors.other_exterior_area), + self.extract_value(json_data, *self.selectors.exteriors), + self.extract_value(json_data, self.selectors.num_of_rooms), + self.extract_value(json_data, *self.selectors.num_of_bathrooms), + self.extract_value(json_data, self.selectors.num_of_bedrooms), + self.extract_value(json_data, *self.selectors.stories), + self.extract_value(json_data, *self.selectors.layout), + self.extract_value(json_data, self.selectors.energy_label), + self.extract_value(json_data, *self.selectors.insulation), + self.extract_value(json_data, *self.selectors.heating), + self.extract_value(json_data, *self.selectors.heatedwater), + self.extract_value(json_data, *self.selectors.heatingCV), + self.extract_value(json_data, self.selectors.heatingAge), + self.extract_value(json_data, self.selectors.solarpanels), + self.extract_value(json_data, self.selectors.heatpump), + self.extract_value(json_data, self.selectors.lowenergy), + self.extract_value(json_data, self.selectors.street), + self.extract_value(json_data, self.selectors.address), + self.extract_value(json_data, self.selectors.zip_code), + self.extract_value(json_data, self.selectors.city), + self.extract_value(json_data, self.selectors.descrip), + self.extract_value(json_data, *self.selectors.ownership), + self.extract_value(json_data, *self.selectors.cadastralarea), + self.extract_value(json_data, *self.selectors.location), + self.extract_value(json_data, *self.selectors.garden), + self.extract_value(json_data, *self.selectors.garden_size), + self.extract_value(json_data, *self.selectors.gardenorientation), + self.extract_value(json_data, 
*self.selectors.balcony), + self.extract_value(json_data, *self.selectors.parking), + self.extract_value(json_data, self.selectors.parkingownproperty), + self.extract_value(json_data, self.selectors.enclosedparking), + self.extract_value(json_data, self.selectors.neighborhood_name), + self.extract_value(json_data, self.selectors.latitude), + self.extract_value(json_data, self.selectors.longitude), + self.extract_value(json_data, self.selectors.monument), + self.extract_value(json_data, self.selectors.monumentstatus), + self.extract_value(json_data, self.selectors.DIYhome), + self.extract_value(json_data, self.selectors.leasehold), + self.extract_value(json_data, self.selectors.term), + self.extract_value(json_data, self.selectors.price_sold), + self.extract_value(json_data, self.selectors.date_sold) + ] + NeighourhoodId = self.extract_value(json_data,self.selectors.NeighourhoodId) + hood_pricem2_link = f"https://marketinsights.funda.io/v2/LocalInsights/preview/{NeighourhoodId}" + hood_data = requests.get(hood_pricem2_link, headers=config.header).json() + hood_pricem2 = hood_data["averageAskingPricePerM2"] + hood_families = hood_data["familiesWithChildren"] + with open(f"data/{date}/{id}/{id}_neighbourhood.json", 'w') as json_file: + json.dump(hood_data, json_file, indent=4) + photos_list = [ - p.get("data-lazy-srcset") for p in soup.select(self.selectors.photo) + photo["PhotoUrl"] for photo in json_data["Media"]["Photos"] ] photos_string = ", ".join(photos_list) - + if self.search_request.download_photos: + for i, url in enumerate(photos_list): + response = requests.get(url) + if response.status_code == 200: + with open(f'data/{date}/{id}/{id}_photo_{i+1}.jpg', 'wb') as file: + file.write(response.content) + else: + print(f'Failed to download {url}') + else: + None + + # Clean up the retried result from one page - result = [r.replace("\n", "").replace("\r", "").strip() for r in result] + result = [r.replace("\n", "").replace("\r", "").strip() if r is not None and 
not isinstance(r, float) else r for r in result] + result.append(NeighourhoodId) result.append(photos_string) + result.append(hood_pricem2) + result.append(hood_families) + + # Adding surroundings data from open API + # first get RDY and RDX + lat=self.extract_value(json_data, self.selectors.latitude) + lon=self.extract_value(json_data, self.selectors.longitude) + rd_link = f"https://api.pdok.nl/bzk/locatieserver/search/v3_1/free?lat={lat}&lon={lon}&fl=id%20identificatie%20bron%20type%20straatnaam%20huisnummer%20huisletter%20huisnummertoevoeging%20postcode%20centroide_ll%20centroide_rd%20score&fq=type%3A%28adres%29&bq=type%3Aadres%5E1&start=0&rows=1&sort=score%20desc&wt=json" + rd_data = requests.get(rd_link, headers=config.header).json() + centroid = rd_data["response"]["docs"][0]["centroide_rd"] + rdy = int(650000 - float(centroid.split()[1].strip(')'))) + rdx = int(float(centroid.split()[0].strip('POINT('))) + with open(f"data/{date}/{id}/{id}_rd.json", 'w') as json_file: + json.dump(rd_data, json_file, indent=4) + + #now get 'polution', 'sound-levels', 'no2' + sound_link = f"https://data.rivm.nl/geo/alo/wms?SERVICE=WMS&VERSION=1.3.0&REQUEST=GetFeatureInfo&LAYERS=rivm_20220601_Geluid_lden_allebronnen_2020&QUERY_LAYERS=rivm_20220601_Geluid_lden_allebronnen_2020&BBOX=0,300000,300000,650000&WIDTH=300000&HEIGHT=350000&FEATURE_COUNT=1&INFO_FORMAT=application/json&CRS=EPSG:28992&i={rdx}&j={rdy}" + ppm25_link = f"https://data.rivm.nl/geo/alo/wms?SERVICE=WMS&VERSION=1.3.0&REQUEST=GetFeatureInfo&LAYERS=rivm_nsl_20240401_gm_PM252022&QUERY_LAYERS=rivm_nsl_20240401_gm_PM252022&BBOX=0,300000,300000,650000&WIDTH=300000&HEIGHT=350000&FEATURE_COUNT=1&INFO_FORMAT=application/json&CRS=EPSG:28992&i={rdx}&j={rdy}" + no2_link = 
f"https://data.rivm.nl/geo/alo/wms?SERVICE=WMS&VERSION=1.3.0&REQUEST=GetFeatureInfo&LAYERS=rivm_nsl_20240401_gm_NO22022&QUERY_LAYERS=rivm_nsl_20240401_gm_NO22022&BBOX=0,300000,300000,650000&WIDTH=300000&HEIGHT=350000&FEATURE_COUNT=1&INFO_FORMAT=application/json&CRS=EPSG:28992&i={rdx}&j={rdy}" + sound_data = requests.get(sound_link, headers=config.header).json() + ppm25_data = requests.get(ppm25_link, headers=config.header).json() + no2_data = requests.get(no2_link, headers=config.header).json() + sound = round(sound_data["features"][0]["properties"]["GRAY_INDEX"],2) + ppm25 = round(ppm25_data["features"][0]["properties"]["GRAY_INDEX"],2) + no2 = round(no2_data["features"][0]["properties"]["GRAY_INDEX"],2) + with open(f"data/{date}/{id}/{id}_ppm25.json", 'w') as json_file: + json.dump(ppm25_data, json_file, indent=4) + with open(f"data/{date}/{id}/{id}_no2.json", 'w') as json_file: + json.dump(no2_data, json_file, indent=4) + with open(f"data/{date}/{id}/{id}_sound.json", 'w') as json_file: + json.dump(sound_data, json_file, indent=4) + result.append(sound) + result.append(ppm25) + result.append(no2) + return result def scrape_pages(self) -> None: @@ -393,14 +497,13 @@ def scrape_pages(self) -> None: # Scrape pages with multiprocessing to improve efficiency # TODO: use asyncio instead pools = mp.cpu_count() - content = process_map(self.scrape_one_link, self.links, max_workers=pools) + content = process_map(self.scrape_one_link, self.links, max_workers=3) for i, c in enumerate(content): df.loc[len(df)] = c - df["city"] = df["url"].map(lambda x: x.split("/")[4]) df["log_id"] = datetime.datetime.now().strftime("%Y%m-%d%H-%M%S") - if not self.find_past: + if not self.search_request.find_past: df = df.drop(["term", "price_sold", "date_sold"], axis=1) logger.info(f"*** All scraping done: {df.shape[0]} results ***") self.raw_df = df @@ -410,14 +513,15 @@ def save_csv(self, df: pd.DataFrame, filepath: str = None) -> None: if filepath is None: self._check_dir() date = 
str(datetime.datetime.now().date()).replace("-", "")
-            status = "unavailable" if self.find_past else "unavailable"
-            want_to = "buy" if self.to_buy else "rent"
-            filepath = f"./data/houseprice_{date}_{self.area}_{want_to}_{status}_{len(self.links)}.csv"
+            status = "unavailable" if self.search_request.find_past else "available"
+            area = self.search_request.area.replace("\",\"", "")[:20]
+            want_to = "buy" if self.search_request.to_buy else "rent"
+            filepath = f"./data/houseprice_{date}_{area}_{want_to}_{status}_{len(self.links)}.csv"
         df.to_csv(filepath, index=False)
         logger.info(f"*** File saved: {filepath}. ***")
 
     def run(
-        self, raw_data: bool = False, save: bool = False, filepath: str = None
+        self, raw_data: bool = False, save: bool = False, download_photos: bool = False, filepath: str = None
     ) -> pd.DataFrame:
         """
         Runs the full scraping process, optionally saving the results to a CSV file.
@@ -434,7 +538,7 @@ def run(
             df = self.raw_df
         else:
             logger.info("*** Cleaning data ***")
-            df = preprocess_data(df=self.raw_df, is_past=self.find_past)
+            df = preprocess_data(df=self.raw_df, is_past=self.search_request.find_past)
 
         self.clean_df = df
         if save:
@@ -456,7 +560,7 @@ def run(
         "--want_to",
         type=str,
         help="Specify you want to 'rent' or 'buy'",
-        default="rent",
+        default="buy",
         choices=["rent", "buy"],
     )
     parser.add_argument(
@@ -526,10 +630,17 @@ def run(
         "--save",
         action="store_true",
         help="Indicate whether you want to save the data",
+        default=True,
+    )
+
+    parser.add_argument(
+        "--download_photos",
+        action="store_true",
+        help="Indicate whether you want to download the listing photos",
     )
     args = parser.parse_args()
-    scraper = FundaScraper(
+    requestsargs = SearchRequest(
         area=args.area,
         want_to=args.want_to,
         find_past=args.find_past,
@@ -543,7 +654,10 @@ def run(
         max_floor_area=args.max_floor_area,
         property_type=args.property_type,
         days_since=args.days_since,
+        download_photos=args.download_photos,
         sort=args.sort,
     )
+    print(requestsargs.to_buy)
+    scraper = 
FundaScraper(requestsargs) df = scraper.run(raw_data=args.raw_data, save=args.save) print(df.head()) From 6975b9b3e21db1bfe040fd9cce283428ecf076cb Mon Sep 17 00:00:00 2001 From: insitive-jws <111644923+insitive-jws@users.noreply.github.com> Date: Fri, 18 Oct 2024 13:16:27 +0200 Subject: [PATCH 7/7] Added garden width / garden depth to index --- funda_scraper/config/config.yaml | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/funda_scraper/config/config.yaml b/funda_scraper/config/config.yaml index a897011..4df7ffc 100644 --- a/funda_scraper/config/config.yaml +++ b/funda_scraper/config/config.yaml @@ -20,7 +20,7 @@ keep_cols: - year_built - building_roofing - building_details - - size + - volume - living_area - property_area - balcony_size @@ -51,7 +51,9 @@ keep_cols: - cadastralarea - location - garden - - gardensize + - garden_size + - garden_width + - garden_depth - gardenorientation - balcony - parking @@ -109,7 +111,7 @@ css_selector: building_details: - "Id" - "bouw-bijzonderheden" - size: + volume: - "Id" - "afmetingen-inhoud" living_area: "woonoppervlakte" @@ -171,7 +173,7 @@ css_selector: garden: - "Id" - "buitenruimte-tuin" - gardensize: + garden_size: - "Id" - "buitenruimte-hoofdtuin" gardenorientation: @@ -201,4 +203,4 @@ css_selector: neighbourhood_families: sound: ppm25: - no2: + no2: \ No newline at end of file