Refactor to OOP approach #54

Open · wants to merge 17 commits into main
40 changes: 40 additions & 0 deletions .gitignore
@@ -0,0 +1,40 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

data/
test.csv
44 changes: 25 additions & 19 deletions README.md
@@ -10,50 +10,58 @@
`FundaScraper` provides the easiest way to perform web scraping on Funda, the Dutch housing website. You can find houses either for sale or for rent, and access historical data from the past few years.

Please note:

1. Scraping this website is ONLY allowed for personal use (as per Funda's Terms and Conditions).
2. Any commercial use of this Python package is prohibited. The author holds no liability for any misuse of the package.

## Install

### Install with pip:

```
pip install funda-scraper
```

### Clone the repository:

```
git clone https://github.com/whchien/funda-scraper.git
cd funda-scraper
export PYTHONPATH=${PWD}
python funda_scraper/scrape.py --area amsterdam --want_to rent --page_start 1 --n_pages 3 --save
python funda_scraper/scrape.py --area amsterdam --want_to rent --page_start 1 --number_of_pages 3 --save
```

## Quickstart

```
from funda_scraper import FundaScraper

scraper = FundaScraper(
area="amsterdam",
want_to="rent",
find_past=False,
page_start=1,
n_pages=3,
min_price=500,
area="amsterdam",
want_to="rent",
find_sold=False,
page_start=1,
number_of_pages=3,
min_price=500,
max_price=2000
)
df = scraper.run(raw_data=False, save=True, filepath="test.csv")
df.head()
```
![image](static/example_df.png)

* Note for Windows Users: Please add `if __name__ == "__main__":` before your script.
- Note for Windows Users: Please add `if __name__ == "__main__":` before your script.

## Customizing Your Scraping

You can pass several arguments to `FundaScraper()` for customized scraping (see the sketch after this list):

- `area`: Specify the city or area to search, e.g. Amsterdam, Utrecht, or Rotterdam.
- `want_to`: Choose either `buy` or `rent` to find houses either for sale or for rent.
- `find_past`: Set to `True` to find historical data; the default is `False`.
- `page_start`: Indicate which page to start scraping from; the default is `1`.
- `n_pages`: Indicate how many pages to scrape; the default is `1`.
- `find_sold`: Set to `True` to find historical data; the default is `False`.
- `page_start`: Indicate which page to start scraping from; the default is `1`.
- `number_of_pages`: Indicate how many pages to scrape; the default is `1`.
- `min_price`: Indicate the lowest budget amount.
- `max_price`: Indicate the highest budget amount.
- `min_floor_area`: Indicate the minimum floor area.
@@ -62,14 +70,14 @@ You can pass several arguments to `FundaScraper()` for customized scraping:
- `property_type`: Specify the desired property type(s).
- `sort`: Specify sorting criteria.
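
For instance, a more customized search might look like the sketch below. The values for `property_type` and `sort` are illustrative placeholders (check Funda's own filter options for the accepted strings), and the floor-area unit is assumed to be square meters:

```
from funda_scraper import FundaScraper

scraper = FundaScraper(
    area="utrecht",
    want_to="buy",
    find_sold=False,
    page_start=1,
    number_of_pages=2,
    min_price=300000,
    max_price=500000,
    min_floor_area=75,        # assumed unit: square meters
    property_type="house",    # hypothetical value
    sort="price_down",        # hypothetical sort key
)
df = scraper.run(raw_data=False)
```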


The scraped raw result contains the following information:

- url
- price
- address
- description
- listed_since
- zip_code
- size
- year_built
- living_area
@@ -95,10 +103,8 @@ The scraped raw result contains the following information:

To fetch the data without preprocessing, specify `scraper.run(raw_data=True)`.
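
For example, reusing the `scraper` instance from the Quickstart above:

```
clean_df = scraper.run(raw_data=False, save=True, filepath="test.csv")
raw_df = scraper.run(raw_data=True)  # unprocessed columns, as listed above
```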

*Note*: Information regarding listing dates is no longer available since Q4 2023. Funda requires users to log in to see this information.

_Note_: Information regarding listing dates is no longer available since Q4 2023. Funda requires users to log in to see this information.

## More information

Check the [example notebook](https://colab.research.google.com/drive/1hNzJJRWxD59lrbeDpfY1OUpBz0NktmfW?usp=sharing) for further details. If you find this project helpful, please give it a [star](https://github.com/whchien/funda-scraper).
3 changes: 2 additions & 1 deletion funda_scraper/__init__.py
@@ -1,5 +1,6 @@
"""Access the directory in python"""

from funda_scraper.scrape import FundaScraper
from funda_scraper.extract import DataExtractor

__all__ = ["FundaScraper"]
__all__ = ["FundaScraper", "DataExtractor"]
41 changes: 22 additions & 19 deletions funda_scraper/config/config.yaml
@@ -6,11 +6,12 @@ keep_cols:
- date_sold
- ym_sold
- year_sold
# - term_days
selling_data:
- url
- house_id
- city
- neighborhood_name
- house_type
- building_type
- price
@@ -27,34 +28,36 @@
# - date_list
# - ym_list
# - year_list
- descrip
- photo
- description
# - photos
css_selector:
url: none
price: ".object-header__price"
price: "section.mt-6 > div:nth-child(2) > dl:nth-child(2) > dd:nth-child(2) > span:nth-child(1)"
address: ".object-header__title"
descrip: ".object-description-body"
listed_since: ".fd-align-items-center:nth-child(6) span"
zip_code: ".object-header__subtitle"
listed_since: "section.mt-6 > div:nth-child(2) > dl:nth-child(2) > dd:nth-child(6) > span:nth-child(1)"
zip_code: "span.text-neutral-40:nth-child(2)"
size: ".object-kenmerken-list:nth-child(8) .fd-align-items-center:nth-child(5) span"
year: ".fd-align-items-center~ .fd-align-items-center .fd-m-right-xs"
living_area: ".object-kenmerken-list:nth-child(8) .fd-align-items-center:nth-child(2) span"
kind_of_house: ".object-kenmerken-list:nth-child(5) .fd-align-items-center:nth-child(2) span"
building_type: ".object-kenmerken-list:nth-child(5) .fd-align-items-center:nth-child(4) span"
num_of_rooms: ".object-kenmerken-list:nth-child(11) .fd-align-items-center:nth-child(2)"
num_of_bathrooms: ".object-kenmerken-list:nth-child(11) .fd-align-items-center:nth-child(4)"
year_of_construction: ".mt-6.md\\:mt-7 > div:nth-of-type(2) > dl > dd:nth-of-type(3) span"
living_area: "section.mt-6 > div:nth-child(4) > dl:nth-child(2) > dd:nth-child(4)"
kind_of_house: "section.mt-6 > div:nth-of-type(2) > dl > dd:nth-of-type(1) span"
building_type: "section.mt-6 > div:nth-child(3) > dl:nth-child(2) > dd:nth-child(4) > span:nth-child(1)"
num_of_rooms: "section.mt-6 > div:nth-of-type(4) > dl > dd:nth-of-type(1) span"
num_of_bathrooms: "section.mt-6 > div:nth-of-type(4) > dl > dd:nth-of-type(2) span"
layout: ".object-kenmerken-list:nth-child(11)"
energy_label: ".energielabel"
insulation: ".object-kenmerken-list:nth-child(14) .fd-align-items-center:nth-child(4)"
heating: ".object-kenmerken-list:nth-child(14) .fd-align-items-center:nth-child(6)"
ownership: ".object-kenmerken-list:nth-child(17) .fd-align-items-center:nth-child(4)"
energy_label: ".gap-6 > div:nth-child(1)"
insulation: ".mt-6.md\\:mt-7 > div:nth-of-type(5) > dl > dd:nth-of-type(2) span"
heating: "section.mt-6 > div:nth-of-type(5) > dl > dd:nth-of-type(3) span"
ownership: "section.mt-6 > div:nth-of-type(6) > dl > dd:nth-of-type(2) span"
exteriors: ".object-kenmerken-list:nth-child(19)"
parking: ".object-kenmerken-list:nth-child(24)"
neighborhood_name: ".fd-display-inline--bp-m"
parking: "section.mt-6 > div:nth-of-type(5) > dl > dd:nth-of-type(1) span"
neighborhood_name: ".object-header__container > a"
date_list: "dd:nth-child(2)"
date_sold: "dd:nth-child(4)"
term: "dd:nth-child(6)"
price_sold: ".object-header__price--historic"
last_ask_price: ".object-kenmerken-list:nth-child(2) .fd-align-items-center:nth-child(2)"
last_ask_price_m2: ".object-kenmerken-list__asking-price"
last_ask_price: "section.mt-6 > div:nth-child(2) > dl:nth-child(2) > dd:nth-child(2) > span:nth-child(1)"
last_ask_price_m2: "section.mt-6 > div:nth-child(2) > dl:nth-child(2) > dd:nth-child(4) > span:nth-child(1)"
photo: ".media-viewer-overview__section-list-item--photo img[data-lazy]"
photos: "main > div:nth-child(1) ul > li:nth-child(1) > a > span:nth-child(3)"
168 changes: 168 additions & 0 deletions funda_scraper/extract.py
@@ -0,0 +1,168 @@

import json
from typing import List

import pandas as pd
from bs4 import BeautifulSoup
from tqdm import tqdm

from funda_scraper.config.core import config
from funda_scraper.preprocess import preprocess_data
from funda_scraper.utils import logger
from funda_scraper.filerepository import FileRepository
from funda_scraper.searchrequest import SearchRequest
from funda_scraper.property import Property

class DataExtractor:

def __init__(self):
self.selectors = config.css_selector
self.raw_df = pd.DataFrame()
self.clean_df = pd.DataFrame()
self.file_repo = FileRepository()


def extract_data(self, search_request: SearchRequest, run_id: str, clean_data: bool) -> pd.DataFrame:

detail_pages = self.file_repo.get_detail_pages(run_id)

houses: list[Property] = []

houses_with_processing_errors = []

        for page in tqdm(detail_pages, desc="Processing detail pages..."):
try:
content = detail_pages[page]
house = self.extract_data_from_detail_page(content, search_request)
houses.append(house)
except Exception as e:
logger.error(f"An error occurred while processing house: {e}; skipping this house")
logger.error("Traceback:", exc_info=True)
houses_with_processing_errors.append(page)

logger.info(f"*** All scraping done: {len(houses)} results ***")
logger.info(f"There were {len(houses_with_processing_errors)} houses that could not be processed")
for error_house in houses_with_processing_errors:
# TODO: move these to a separate errors folder or so
logger.info(f"Error: {error_house}")

# It may be more intuitive to manipulate the Property objects instead of dataframes, but let's keep the dataframes approach for now
# Note that we are omitting the photos field, which is an array field, and include the photos_string property
df = pd.DataFrame([
{**{k: v for k, v in vars(house).items() if k != 'photos'}, 'photos': house.photos_string}
for house in houses
])

if not search_request.find_sold:
df = df.drop(["term", "price_sold", "date_sold"], axis=1)

self.raw_df = df

if not clean_data:
df = self.raw_df
else:
logger.info("*** Cleaning data ***")
df = preprocess_data(df = self.raw_df, is_past = search_request.find_sold)
self.clean_df = df

self.file_repo.save_result_file(df, run_id)

return df


def extract_data_from_detail_page(self, page: str, search_request: SearchRequest) -> Property:
soup = BeautifulSoup(page, "lxml")

script_tag = soup.find_all("script", {"type": "application/ld+json"})[0]
json_data = json.loads(script_tag.contents[0])

url = json_data["url"]
description = json_data["description"]
address = f"{json_data["address"]["streetAddress"]}"
city = json_data["address"]["addressLocality"]
price = f"{json_data["offers"]["priceCurrency"]} {json_data["offers"]["price"]}"

# Get the value according to respective CSS selectors
if search_request.to_buy:
if search_request.find_sold:
list_since_selector = self.selectors.date_list
else:
list_since_selector = self.selectors.listed_since
else:
if search_request.find_sold:
list_since_selector = ".fd-align-items-center:nth-child(9) span"
else:
list_since_selector = ".fd-align-items-center:nth-child(7) span"

house = Property()
house.url = url
house.price = price
house.address = address
house.city = city
house.description = description
house.zip_code = self.get_value_from_css(soup, self.selectors.zip_code)
house.size = self.get_value_from_css(soup, self.selectors.size)
house.year_of_construction = self.get_value_from_css(soup, self.selectors.year_of_construction)
house.living_area = self.get_value_from_css(soup, self.selectors.living_area)
house.house_type = self.get_value_from_css(soup, self.selectors.kind_of_house)
house.building_type = self.get_value_from_css(soup, self.selectors.building_type)
house.number_of_rooms = self.get_value_from_css(soup, self.selectors.num_of_rooms)
house.number_of_bathrooms = self.get_value_from_css(soup, self.selectors.num_of_bathrooms)
        house.layout = self.get_value_from_css(soup, self.selectors.layout)
house.energy_label = self.get_value_from_css(soup, self.selectors.energy_label)
house.insulation = self.get_value_from_css(soup, self.selectors.insulation)
house.heating = self.get_value_from_css(soup, self.selectors.heating)
house.ownership = self.get_value_from_css(soup, self.selectors.ownership)
house.exteriors = self.get_value_from_css(soup, self.selectors.exteriors)
house.parking = self.get_value_from_css(soup, self.selectors.parking)
house.neighborhood_name = self.get_value_from_css(soup, self.selectors.neighborhood_name)
house.date_list = self.get_value_from_css(soup, self.selectors.date_list)
house.date_sold = self.get_value_from_css(soup, self.selectors.date_sold)
house.term = self.get_value_from_css(soup, self.selectors.term)
house.price_sold = self.get_value_from_css(soup, self.selectors.price_sold)
house.last_ask_price = self.get_value_from_css(soup, self.selectors.last_ask_price)
house.last_ask_price_m2 = self.get_value_from_css(soup, self.selectors.last_ask_price_m2).split("\r")[0]
house.photos = self.get_photos(soup, house.url)

for key, value in house.__dict__.items():
formatted_value = self.format_string(value)
setattr(house, key, formatted_value)

return house

@staticmethod
def get_value_from_css(soup: BeautifulSoup, selector: str) -> str:
"""Extracts data from HTML using a CSS selector."""
result = soup.select(selector)
if len(result) > 0:
result = result[0].text
else:
result = "na"
return result


    def format_string(self, value):
        # Only strings need whitespace cleanup; other values pass through unchanged.
        if isinstance(value, str):
            return value.replace("\n", "").replace("\r", "").strip()
        else:
            return value

    def get_photos(self, soup: BeautifulSoup, url: str) -> List[str]:
        # The selector yields the photo count shown on the page; fall back to 0
        # when the element is missing or its text is not a number.
        try:
            number_of_photos = int(self.get_value_from_css(soup, self.selectors.photos))
        except (ValueError, TypeError):
            number_of_photos = 0

        photos: List[str] = []
        for i in range(1, number_of_photos + 1):
            photos.append(f"{url}media/foto/{i}")

        return photos
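
A usage sketch for the new class, assuming a scraper run has already stored detail pages for the given `run_id` via `FileRepository`, and that `SearchRequest` carries the fields referenced above (`to_buy`, `find_sold`); its actual constructor signature is not part of this diff:

```
from funda_scraper.extract import DataExtractor
from funda_scraper.searchrequest import SearchRequest

# Hypothetical construction; the real SearchRequest signature may differ.
request = SearchRequest(area="amsterdam", want_to="buy", find_sold=False)

extractor = DataExtractor()
df = extractor.extract_data(search_request=request, run_id="20240101-120000", clean_data=True)
print(df.head())
```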



