Merge pull request #287 from perfectly-preserved-pie/dev

New lists: Big ass change

perfectly-preserved-pie authored Nov 12, 2024
2 parents 2269821 + 8b17506 commit 13c02c0

Showing 12 changed files with 818 additions and 519 deletions.
1 change: 1 addition & 0 deletions .gitignore

@@ -5,6 +5,7 @@ __pycache__/larentals.cpython-310.pyc
 *.csv
 *.pyc
 *.xlsx
+.venv/
 env
 hdf
 larentals-checkpoint.py
35 changes: 22 additions & 13 deletions Dockerfile

@@ -1,22 +1,31 @@
 FROM python:3.11-slim
 
-COPY requirements.txt .
+# Set the working directory
+WORKDIR /app
 
-# Install curl
-RUN apt-get update && apt-get install -y curl
+# Switch to root user to install dependencies
+USER root
 
-# Using uv to install packages because it's fast as fuck boiiii
-# https://www.youtube.com/watch?v=6E7ZGCfruaw
-# https://ryxcommar.com/2024/02/15/how-to-cut-your-python-docker-builds-in-half-with-uv/
-ADD --chmod=655 https://astral.sh/uv/install.sh /install.sh
-RUN /install.sh && rm /install.sh
-RUN /root/.cargo/bin/uv pip install --system --no-cache -r requirements.txt
+# Create the nonroot user and set permissions
+RUN adduser --disabled-password --gecos "" nonroot && chown -R nonroot /app
 
-COPY . ./
+# Copy everything into the working directory
+COPY . /app
+
+# Copy uv binary directly from the UV container image
+COPY --from=ghcr.io/astral-sh/uv:latest /uv /bin/uv
+
+# Install dependencies directly into the system environment using uv
+RUN uv pip install --system --no-cache-dir -r requirements.txt
+
+# Switch back to non-root user
+USER nonroot
+
+# Install curl (if needed, uncomment this line)
+# RUN apt-get update && apt-get install -y curl
 
 # Run the app using gunicorn.
 # Expose the port gunicorn is listening on (80).
 # Set the number of workers to 10.
-# Preload the app to avoid the overhead of loading the app for each worker. See https://www.joelsleppy.com/blog/gunicorn-application-preloading/
-# Set the app to be the server variable in app.py.
-CMD ["gunicorn", "-b", "0.0.0.0:80", "-k", "gevent", "--workers=10", "--preload", "app:server"]
+# Preload the app to avoid the overhead of loading the app for each worker.
CMD ["gunicorn", "-b", "0.0.0.0:80", "-k", "gevent", "--workers=10", "--preload", "app:server"]
Binary file modified assets/datasets/lease.parquet
Binary file not shown.
Binary file not shown.
14 changes: 7 additions & 7 deletions assets/javascript/popup.js

@@ -22,7 +22,7 @@ window.dash_props = Object.assign({}, window.dash_props, {
         return `
           <tr>
             <th style="text-align:left;padding:8px;border-bottom:1px solid #ddd;">Listing ID (MLS#)</th>
-            <td style="padding:8px;border-bottom:1px solid #ddd;">Not Available</td>
+            <td style="padding:8px;border-bottom:1px solid #ddd;">${data.mls_number}</td>
           </tr>
         `;
       }
@@ -47,9 +47,9 @@
       const listingUrlBlock = getListingUrlBlock(data);
 
       // Conditionally include the property image row if the image URL is available
-      const imageRow = data.image_url ? `
+      const imageRow = data.mls_photo ? `
         <a href="${data.listing_url}" target="_blank" referrerPolicy="noreferrer">
-          <img src="${data.image_url}" alt="Property Image" style="width:100%;height:auto;">
+          <img src="${data.mls_photo}" alt="Property Image" style="width:100%;height:auto;">
         </a>
       ` : '';
 
@@ -64,7 +64,7 @@
       <div>
         ${imageRow}
         <div style="text-align: center;">
-          <h5>${data.address}</h5>
+          <h5>${data.full_street_address}</h5>
         </div>
         <table style="width:100%;border-collapse:collapse;">
           <tr>
@@ -106,11 +106,11 @@
           </tr>
           <tr>
             <th style="text-align:left;padding:8px;border-bottom:1px solid #ddd;">Bedrooms/Bathrooms</th>
-            <td style="padding:8px;border-bottom:1px solid #ddd;">${data.bedrooms}/${data.bathrooms}</td>
+            <td style="padding:8px;border-bottom:1px solid #ddd;">${data.bedrooms}/${data.total_bathrooms}</td>
           </tr>
           <tr>
-            <th style="text-align:left;padding:8px;border-bottom:1px solid #ddd;">Garage Spaces</th>
-            <td style="padding:8px;border-bottom:1px solid #ddd;">${data.garage_spaces || "Unknown"}</td>
+            <th style="text-align:left;padding:8px;border-bottom:1px solid #ddd;">Parking Spaces</th>
+            <td style="padding:8px;border-bottom:1px solid #ddd;">${data.parking_spaces || "Unknown"}</td>
           </tr>
           <tr>
             <th style="text-align:left;padding:8px;border-bottom:1px solid #ddd;">Pets Allowed?</th>
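For reference, the template now reads renamed fields from the listing data. A hedged sketch of the per-listing record it expects, seen from the Python side (field names come from the template above; the sample values are invented):

# Hypothetical per-listing record consumed by popup.js; values are illustrative only.
popup_data = {
    "mls_number": "OC24123456",                  # previously rendered as "Not Available"
    "full_street_address": "123 Main St, Los Angeles, CA 90012",  # was data.address
    "mls_photo": "https://ik.imagekit.io/demo/OC24123456.jpg",    # was data.image_url
    "listing_url": "https://www.bhhscalifornia.com/for-lease/OC24123456-t_q;/",
    "bedrooms": 2,
    "total_bathrooms": 2,                        # was data.bathrooms
    "parking_spaces": 1,                         # was data.garage_spaces
}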
123 changes: 91 additions & 32 deletions functions/dataframe_utils.py

@@ -1,5 +1,5 @@
 from aiolimiter import AsyncLimiter
-from functions.webscraping_utils import check_expired_listing
+from functions.mls_image_processing_utils import imagekit_transform
+from functions.webscraping_utils import check_expired_listing_bhhs, check_expired_listing_theagency, webscrape_bhhs, fetch_the_agency_data
 from loguru import logger
 import asyncio
 import pandas as pd
@@ -8,40 +8,99 @@
 # Initialize logging
 logger.add(sys.stderr, format="{time} {level} {message}", filter="my_module", level="INFO")
 
-async def remove_expired_listings(df: pd.DataFrame, limiter: AsyncLimiter) -> pd.DataFrame:
+def remove_inactive_listings(df: pd.DataFrame) -> pd.DataFrame:
     """
-    Asynchronously checks each listing URL in the DataFrame to determine if it has expired,
-    and removes rows with expired listings, applying rate limiting. Also counts the number of expired listings removed.
+    Checks each listing to determine whether it has expired or been sold, and removes inactive listings.
+    If 'bhhscalifornia.com' is in the 'listing_url', it checks for an expired BHHS listing.
+    If 'theagencyre.com' is in the 'listing_url', it checks for a sold listing on The Agency.
 
     Parameters:
     df (pd.DataFrame): The DataFrame containing listing URLs and MLS numbers.
-    limiter (AsyncLimiter): The rate limiter to control request frequency.
 
     Returns:
-    pd.DataFrame: The DataFrame with expired listings removed.
+    pd.DataFrame: The DataFrame with inactive listings removed.
     """
-    async def check_and_mark_expired(row):
-        async with limiter:
-            expired = await check_expired_listing(row.listing_url, row.mls_number)
-            return (row.Index, expired)
-
-    # Gather tasks for all rows that need to be checked
-    tasks = [check_and_mark_expired(row) for row in df[df.listing_url.notnull()].itertuples()]
-    results = await asyncio.gather(*tasks)
-
-    # Determine indexes of rows to drop (where listing has expired)
-    indexes_to_drop = [index for index, expired in results if expired]
-
-    # Counter for expired listings
-    expired_count = len(indexes_to_drop)
-
-    # Log success messages for dropped listings and the count of expired listings
-    for index in indexes_to_drop:
-        mls_number = df.loc[index, 'mls_number']
-        logger.success(f"Removed {mls_number} (Index: {index}) from the dataframe because the listing has expired.")
-
-    logger.info(f"Total expired listings removed: {expired_count}")
-
-    # Drop the rows from the DataFrame and return the modified DataFrame
-    df_dropped_expired = df.drop(indexes_to_drop)
-    return df_dropped_expired
+    indexes_to_drop = []
+
+    for row in df.itertuples():
+        listing_url = str(getattr(row, 'listing_url', ''))
+        mls_number = str(getattr(row, 'mls_number', ''))
+
+        # Check whether the listing has expired on BHHS
+        if 'bhhscalifornia.com' in listing_url:
+            is_expired = check_expired_listing_bhhs(listing_url, mls_number)
+            if is_expired:
+                indexes_to_drop.append(row.Index)
+                logger.success(f"Removed MLS {mls_number} (Index: {row.Index}) from the DataFrame because the listing has expired on BHHS.")
+        # Check whether the listing has been sold on The Agency
+        elif 'theagencyre.com' in listing_url:
+            is_sold = check_expired_listing_theagency(listing_url, mls_number)
+            if is_sold:
+                indexes_to_drop.append(row.Index)
+                logger.success(f"Removed MLS {mls_number} (Index: {row.Index}) from the DataFrame because the listing is no longer active on The Agency.")
+
+    inactive_count = len(indexes_to_drop)
+    logger.info(f"Total inactive listings removed: {inactive_count}")
+
+    df_active = df.drop(indexes_to_drop)
+    return df_active.reset_index(drop=True)
+
+def update_dataframe_with_listing_data(
+    df: pd.DataFrame, imagekit_instance
+) -> pd.DataFrame:
+    """
+    Updates the DataFrame with listing date, MLS photo, and listing URL by scraping BHHS and using The Agency's API.
+
+    Parameters:
+    df (pd.DataFrame): The DataFrame to update.
+    imagekit_instance: The ImageKit instance for image transformations.
+
+    Returns:
+    pd.DataFrame: The updated DataFrame.
+    """
+    for row in df.itertuples():
+        mls_number = row.mls_number
+        try:
+            webscrape = webscrape_bhhs(
+                url=f"https://www.bhhscalifornia.com/for-lease/{mls_number}-t_q;/",
+                row_index=row.Index,
+                mls_number=mls_number,
+                total_rows=len(df)
+            )
+
+            if not all(webscrape):
+                logger.warning(f"BHHS did not return complete data for MLS {mls_number}. Trying The Agency.")
+                agency_data = fetch_the_agency_data(
+                    mls_number,
+                    row_index=row.Index,
+                    total_rows=len(df),
+                    full_street_address=row.full_street_address
+                )
+
+                if agency_data and any(agency_data):
+                    listed_date, listing_url, mls_photo = agency_data
+                    if listed_date:
+                        df.at[row.Index, 'listed_date'] = listed_date
+                    if listing_url:
+                        df.at[row.Index, 'listing_url'] = listing_url
+                    if mls_photo:
+                        df.at[row.Index, 'mls_photo'] = imagekit_transform(
+                            mls_photo,
+                            mls_number,
+                            imagekit_instance=imagekit_instance
+                        )
+                    else:
+                        logger.warning(f"No photo URL found for MLS {mls_number} from The Agency.")
+                else:
+                    logger.warning(f"The Agency returned no data for MLS {mls_number}.")
+            else:
+                df.at[row.Index, 'listed_date'] = webscrape[0]
+                df.at[row.Index, 'mls_photo'] = imagekit_transform(
+                    webscrape[1],
+                    mls_number,
+                    imagekit_instance=imagekit_instance
+                )
+                df.at[row.Index, 'listing_url'] = webscrape[2]
+        except Exception as e:
+            logger.error(f"Error processing MLS {mls_number} at index {row.Index}: {e}")
+    return df
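Taken together, a plausible wiring of the two rewritten helpers might look like the sketch below. The ImageKit credentials and parquet path are placeholders; only the two function signatures come from this diff:

# Hypothetical pipeline sketch; credentials and paths are placeholders.
import pandas as pd
from imagekitio import ImageKit
from functions.dataframe_utils import remove_inactive_listings, update_dataframe_with_listing_data

imagekit = ImageKit(
    private_key="private_xxxxxxxx",
    public_key="public_xxxxxxxx",
    url_endpoint="https://ik.imagekit.io/your_imagekit_id",
)

df = pd.read_parquet("assets/datasets/lease.parquet")
df = remove_inactive_listings(df)   # now synchronous: no AsyncLimiter or asyncio.run() needed
df = update_dataframe_with_listing_data(df, imagekit_instance=imagekit)
df.to_parquet("assets/datasets/lease.parquet")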
50 changes: 25 additions & 25 deletions functions/geocoding_utils.py

@@ -66,39 +66,39 @@ def fetch_missing_city(address: str, geolocator: GoogleV3) -> Optional[str]:
 
     return city
 
-def return_postalcode(address: str, geolocator: GoogleV3) -> Optional[Union[int, type(pd.NA)]]:
+def return_zip_code(address: str, geolocator: GoogleV3) -> Optional[str]:
     """
-    Fetches the postal code for a given short address using forward and reverse geocoding.
+    Fetches the postal code for a given address using geocoding.
 
     Parameters:
-    address (str): The short address.
-    geolocator (GoogleV3): An instance of a GoogleV3 geocoding class.
+    address (str): The full street address.
+    geolocator (GoogleV3): An instance of the GoogleV3 geocoding class.
 
     Returns:
-    Optional[Union[int, type(pd.NA)]]: The postal code as an integer, or pd.NA if unsuccessful.
+    Optional[str]: The postal code as a string, or None if unsuccessful.
     """
     # Initialize postalcode variable
     postalcode = None
 
     try:
-        geocode_info = geolocator.geocode(address, components={'administrative_area': 'CA', 'country': 'US'})
-        components = geolocator.geocode(f"{geocode_info.latitude}, {geocode_info.longitude}").raw['address_components']
-
-        # Create a dataframe from the list of dictionaries
-        components_df = pd.DataFrame(components)
-
-        # Iterate through rows to find the postal code
-        for row in components_df.itertuples():
-            if row.types == ['postal_code']:
-                postalcode = int(row.long_name)
-
-        logger.info(f"Fetched postal code {postalcode} for {address}.")
-    except AttributeError:
-        logger.warning(f"Geocoding returned no results for {address}.")
-        return pd.NA
+        geocode_info = geolocator.geocode(
+            address, components={'administrative_area': 'CA', 'country': 'US'}
+        )
+        if geocode_info:
+            raw = geocode_info.raw['address_components']
+            # Find the 'postal_code' component
+            postalcode = next(
+                (addr['long_name'] for addr in raw if 'postal_code' in addr['types']),
+                None
+            )
+            if postalcode:
+                logger.info(f"Fetched zip code ({postalcode}) for {address}.")
+            else:
+                logger.warning(f"No postal code found in geocoding results for {address}.")
+        else:
+            logger.warning(f"Geocoding returned no results for {address}.")
     except Exception as e:
-        logger.warning(f"Couldn't fetch postal code for {address} because {e}.")
-        return pd.NA
+        logger.warning(f"Couldn't fetch zip code for {address} because of {e}.")
+        postalcode = None
 
     return postalcode
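A minimal usage sketch for the renamed helper, assuming a geopy GoogleV3 geolocator (the API key and address below are placeholders):

# Hypothetical usage; requires a valid Google Maps API key.
from geopy.geocoders import GoogleV3
from functions.geocoding_utils import return_zip_code

geolocator = GoogleV3(api_key="YOUR_GOOGLE_MAPS_API_KEY")
zip_code = return_zip_code("1600 Amphitheatre Parkway, Mountain View, CA", geolocator)
print(zip_code)  # e.g. "94043", or None if geocoding fails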
