Merge pull request #287 from perfectly-preserved-pie/dev

New lists: Big ass change

perfectly-preserved-pie authored Nov 12, 2024
2 parents 2269821 + 8b17506 commit 13c02c0

Showing 12 changed files with 818 additions and 519 deletions.
1 change: 1 addition & 0 deletions .gitignore

@@ -5,6 +5,7 @@ __pycache__/larentals.cpython-310.pyc
 *.csv
 *.pyc
 *.xlsx
+.venv/
 env
 hdf
 larentals-checkpoint.py
35 changes: 22 additions & 13 deletions Dockerfile

@@ -1,22 +1,31 @@
 FROM python:3.11-slim
 
-COPY requirements.txt .
+# Set the working directory
+WORKDIR /app
 
-# Install curl
-RUN apt-get update && apt-get install -y curl
+# Switch to root user to install dependencies
+USER root
 
-# Using uv to install packages because it's fast as fuck boiiii
-# https://www.youtube.com/watch?v=6E7ZGCfruaw
-# https://ryxcommar.com/2024/02/15/how-to-cut-your-python-docker-builds-in-half-with-uv/
-ADD --chmod=655 https://astral.sh/uv/install.sh /install.sh
-RUN /install.sh && rm /install.sh
-RUN /root/.cargo/bin/uv pip install --system --no-cache -r requirements.txt
+# Create the nonroot user and set permissions
+RUN adduser --disabled-password --gecos "" nonroot && chown -R nonroot /app
 
-COPY . ./
+# Copy everything into the working directory
+COPY . /app
+
+# Copy uv binary directly from the UV container image
+COPY --from=ghcr.io/astral-sh/uv:latest /uv /bin/uv
+
+# Install dependencies directly into the system environment using uv
+RUN uv pip install --system --no-cache-dir -r requirements.txt
+
+# Switch back to non-root user
+USER nonroot
+
+# Install curl (if needed, uncomment this line)
+# RUN apt-get update && apt-get install -y curl
 
 # Run the app using gunicorn.
 # Expose the port gunicorn is listening on (80).
 # Set the number of workers to 10.
-# Preload the app to avoid the overhead of loading the app for each worker. See https://www.joelsleppy.com/blog/gunicorn-application-preloading/
-# Set the app to be the server variable in app.py.
-CMD ["gunicorn", "-b", "0.0.0.0:80", "-k", "gevent", "--workers=10", "--preload", "app:server"]
+# Preload the app to avoid the overhead of loading the app for each worker.
CMD ["gunicorn", "-b", "0.0.0.0:80", "-k", "gevent", "--workers=10", "--preload", "app:server"]
Binary file modified assets/datasets/lease.parquet
Binary file not shown.
Binary file not shown.
14 changes: 7 additions & 7 deletions assets/javascript/popup.js

@@ -22,7 +22,7 @@ window.dash_props = Object.assign({}, window.dash_props, {
         return `
           <tr>
             <th style="text-align:left;padding:8px;border-bottom:1px solid #ddd;">Listing ID (MLS#)</th>
-            <td style="padding:8px;border-bottom:1px solid #ddd;">Not Available</td>
+            <td style="padding:8px;border-bottom:1px solid #ddd;">${data.mls_number}</td>
           </tr>
         `;
       }
@@ -47,9 +47,9 @@
       const listingUrlBlock = getListingUrlBlock(data);
 
       // Conditionally include the property image row if the image URL is available
-      const imageRow = data.image_url ? `
+      const imageRow = data.mls_photo ? `
         <a href="${data.listing_url}" target="_blank" referrerPolicy="noreferrer">
-          <img src="${data.image_url}" alt="Property Image" style="width:100%;height:auto;">
+          <img src="${data.mls_photo}" alt="Property Image" style="width:100%;height:auto;">
         </a>
       ` : '';
 
@@ -64,7 +64,7 @@
       <div>
         ${imageRow}
         <div style="text-align: center;">
-          <h5>${data.address}</h5>
+          <h5>${data.full_street_address}</h5>
         </div>
         <table style="width:100%;border-collapse:collapse;">
           <tr>
@@ -106,11 +106,11 @@
           </tr>
           <tr>
             <th style="text-align:left;padding:8px;border-bottom:1px solid #ddd;">Bedrooms/Bathrooms</th>
-            <td style="padding:8px;border-bottom:1px solid #ddd;">${data.bedrooms}/${data.bathrooms}</td>
+            <td style="padding:8px;border-bottom:1px solid #ddd;">${data.bedrooms}/${data.total_bathrooms}</td>
           </tr>
           <tr>
-            <th style="text-align:left;padding:8px;border-bottom:1px solid #ddd;">Garage Spaces</th>
-            <td style="padding:8px;border-bottom:1px solid #ddd;">${data.garage_spaces || "Unknown"}</td>
+            <th style="text-align:left;padding:8px;border-bottom:1px solid #ddd;">Parking Spaces</th>
+            <td style="padding:8px;border-bottom:1px solid #ddd;">${data.parking_spaces || "Unknown"}</td>
           </tr>
           <tr>
             <th style="text-align:left;padding:8px;border-bottom:1px solid #ddd;">Pets Allowed?</th>
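For reference, the template now reads renamed fields from the listing data. A hedged sketch of the per-listing record it expects, seen from the Python side (field names come from the template above; the sample values are invented):

# Hypothetical per-listing record consumed by popup.js; values are illustrative only.
popup_data = {
    "mls_number": "OC24123456",                  # previously rendered as "Not Available"
    "full_street_address": "123 Main St, Los Angeles, CA 90012",  # was data.address
    "mls_photo": "https://ik.imagekit.io/demo/OC24123456.jpg",    # was data.image_url
    "listing_url": "https://www.bhhscalifornia.com/for-lease/OC24123456-t_q;/",
    "bedrooms": 2,
    "total_bathrooms": 2,                        # was data.bathrooms
    "parking_spaces": 1,                         # was data.garage_spaces
}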
123 changes: 91 additions & 32 deletions functions/dataframe_utils.py

@@ -1,5 +1,5 @@
 from aiolimiter import AsyncLimiter
-from functions.webscraping_utils import check_expired_listing
+from functions.mls_image_processing_utils import imagekit_transform
+from functions.webscraping_utils import check_expired_listing_bhhs, check_expired_listing_theagency, webscrape_bhhs, fetch_the_agency_data
 from loguru import logger
 import asyncio
 import pandas as pd
@@ -8,40 +8,99 @@
 # Initialize logging
 logger.add(sys.stderr, format="{time} {level} {message}", filter="my_module", level="INFO")
 
-async def remove_expired_listings(df: pd.DataFrame, limiter: AsyncLimiter) -> pd.DataFrame:
+def remove_inactive_listings(df: pd.DataFrame) -> pd.DataFrame:
     """
-    Asynchronously checks each listing URL in the DataFrame to determine if it has expired,
-    and removes rows with expired listings, applying rate limiting. Also counts the number of expired listings removed.
+    Checks each listing to determine whether it has expired or been sold, and removes inactive listings.
+    If 'bhhscalifornia.com' is in the 'listing_url', it checks for an expired BHHS listing.
+    If 'theagencyre.com' is in the 'listing_url', it checks for a sold listing on The Agency.
 
     Parameters:
     df (pd.DataFrame): The DataFrame containing listing URLs and MLS numbers.
-    limiter (AsyncLimiter): The rate limiter to control request frequency.
 
     Returns:
-    pd.DataFrame: The DataFrame with expired listings removed.
+    pd.DataFrame: The DataFrame with inactive listings removed.
     """
-    async def check_and_mark_expired(row):
-        async with limiter:
-            expired = await check_expired_listing(row.listing_url, row.mls_number)
-            return (row.Index, expired)
-
-    # Gather tasks for all rows that need to be checked
-    tasks = [check_and_mark_expired(row) for row in df[df.listing_url.notnull()].itertuples()]
-    results = await asyncio.gather(*tasks)
-
-    # Determine indexes of rows to drop (where listing has expired)
-    indexes_to_drop = [index for index, expired in results if expired]
-
-    # Counter for expired listings
-    expired_count = len(indexes_to_drop)
-
-    # Log success messages for dropped listings and the count of expired listings
-    for index in indexes_to_drop:
-        mls_number = df.loc[index, 'mls_number']
-        logger.success(f"Removed {mls_number} (Index: {index}) from the dataframe because the listing has expired.")
-
-    logger.info(f"Total expired listings removed: {expired_count}")
-
-    # Drop the rows from the DataFrame and return the modified DataFrame
-    df_dropped_expired = df.drop(indexes_to_drop)
-    return df_dropped_expired
+    indexes_to_drop = []
+
+    for row in df.itertuples():
+        listing_url = str(getattr(row, 'listing_url', ''))
+        mls_number = str(getattr(row, 'mls_number', ''))
+
+        # Check whether the listing has expired on BHHS
+        if 'bhhscalifornia.com' in listing_url:
+            is_expired = check_expired_listing_bhhs(listing_url, mls_number)
+            if is_expired:
+                indexes_to_drop.append(row.Index)
+                logger.success(f"Removed MLS {mls_number} (Index: {row.Index}) from the DataFrame because the listing has expired on BHHS.")
+        # Check whether the listing has been sold on The Agency
+        elif 'theagencyre.com' in listing_url:
+            is_sold = check_expired_listing_theagency(listing_url, mls_number)
+            if is_sold:
+                indexes_to_drop.append(row.Index)
+                logger.success(f"Removed MLS {mls_number} (Index: {row.Index}) from the DataFrame because the listing is no longer active on The Agency.")
+
+    inactive_count = len(indexes_to_drop)
+    logger.info(f"Total inactive listings removed: {inactive_count}")
+
+    df_active = df.drop(indexes_to_drop)
+    return df_active.reset_index(drop=True)
+
+def update_dataframe_with_listing_data(
+    df: pd.DataFrame, imagekit_instance
+) -> pd.DataFrame:
+    """
+    Updates the DataFrame with listing date, MLS photo, and listing URL by scraping BHHS and using The Agency's API.
+
+    Parameters:
+    df (pd.DataFrame): The DataFrame to update.
+    imagekit_instance: The ImageKit instance for image transformations.
+
+    Returns:
+    pd.DataFrame: The updated DataFrame.
+    """
+    for row in df.itertuples():
+        mls_number = row.mls_number
+        try:
+            webscrape = webscrape_bhhs(
+                url=f"https://www.bhhscalifornia.com/for-lease/{mls_number}-t_q;/",
+                row_index=row.Index,
+                mls_number=mls_number,
+                total_rows=len(df)
+            )
+
+            if not all(webscrape):
+                logger.warning(f"BHHS did not return complete data for MLS {mls_number}. Trying The Agency.")
+                agency_data = fetch_the_agency_data(
+                    mls_number,
+                    row_index=row.Index,
+                    total_rows=len(df),
+                    full_street_address=row.full_street_address
+                )
+
+                if agency_data and any(agency_data):
+                    listed_date, listing_url, mls_photo = agency_data
+                    if listed_date:
+                        df.at[row.Index, 'listed_date'] = listed_date
+                    if listing_url:
+                        df.at[row.Index, 'listing_url'] = listing_url
+                    if mls_photo:
+                        df.at[row.Index, 'mls_photo'] = imagekit_transform(
+                            mls_photo,
+                            mls_number,
+                            imagekit_instance=imagekit_instance
+                        )
+                    else:
+                        logger.warning(f"No photo URL found for MLS {mls_number} from The Agency.")
+                else:
+                    logger.warning(f"The Agency returned no data for MLS {mls_number}.")
+            else:
+                df.at[row.Index, 'listed_date'] = webscrape[0]
+                df.at[row.Index, 'mls_photo'] = imagekit_transform(
+                    webscrape[1],
+                    mls_number,
+                    imagekit_instance=imagekit_instance
+                )
+                df.at[row.Index, 'listing_url'] = webscrape[2]
+        except Exception as e:
+            logger.error(f"Error processing MLS {mls_number} at index {row.Index}: {e}")
+    return df
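Taken together, a plausible wiring of the two rewritten helpers might look like the sketch below. The ImageKit credentials and parquet path are placeholders; only the two function signatures come from this diff:

# Hypothetical pipeline sketch; credentials and paths are placeholders.
import pandas as pd
from imagekitio import ImageKit
from functions.dataframe_utils import remove_inactive_listings, update_dataframe_with_listing_data

imagekit = ImageKit(
    private_key="private_xxxxxxxx",
    public_key="public_xxxxxxxx",
    url_endpoint="https://ik.imagekit.io/your_imagekit_id",
)

df = pd.read_parquet("assets/datasets/lease.parquet")
df = remove_inactive_listings(df)   # now synchronous: no AsyncLimiter or asyncio.run() needed
df = update_dataframe_with_listing_data(df, imagekit_instance=imagekit)
df.to_parquet("assets/datasets/lease.parquet")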
50 changes: 25 additions & 25 deletions functions/geocoding_utils.py

@@ -66,39 +66,39 @@ def fetch_missing_city(address: str, geolocator: GoogleV3) -> Optional[str]:
 
     return city
 
-def return_postalcode(address: str, geolocator: GoogleV3) -> Optional[Union[int, type(pd.NA)]]:
+def return_zip_code(address: str, geolocator: GoogleV3) -> Optional[str]:
     """
-    Fetches the postal code for a given short address using forward and reverse geocoding.
+    Fetches the postal code for a given address using geocoding.
 
     Parameters:
-    address (str): The short address.
-    geolocator (GoogleV3): An instance of a GoogleV3 geocoding class.
+    address (str): The full street address.
+    geolocator (GoogleV3): An instance of the GoogleV3 geocoding class.
 
     Returns:
-    Optional[Union[int, type(pd.NA)]]: The postal code as an integer, or pd.NA if unsuccessful.
+    Optional[str]: The postal code as a string, or None if unsuccessful.
     """
     # Initialize postalcode variable
     postalcode = None
 
     try:
-        geocode_info = geolocator.geocode(address, components={'administrative_area': 'CA', 'country': 'US'})
-        components = geolocator.geocode(f"{geocode_info.latitude}, {geocode_info.longitude}").raw['address_components']
-
-        # Create a dataframe from the list of dictionaries
-        components_df = pd.DataFrame(components)
-
-        # Iterate through rows to find the postal code
-        for row in components_df.itertuples():
-            if row.types == ['postal_code']:
-                postalcode = int(row.long_name)
-
-        logger.info(f"Fetched postal code {postalcode} for {address}.")
-    except AttributeError:
-        logger.warning(f"Geocoding returned no results for {address}.")
-        return pd.NA
+        geocode_info = geolocator.geocode(
+            address, components={'administrative_area': 'CA', 'country': 'US'}
+        )
+        if geocode_info:
+            raw = geocode_info.raw['address_components']
+            # Find the 'postal_code' component
+            postalcode = next(
+                (addr['long_name'] for addr in raw if 'postal_code' in addr['types']),
+                None
+            )
+            if postalcode:
+                logger.info(f"Fetched zip code ({postalcode}) for {address}.")
+            else:
+                logger.warning(f"No postal code found in geocoding results for {address}.")
+        else:
+            logger.warning(f"Geocoding returned no results for {address}.")
     except Exception as e:
-        logger.warning(f"Couldn't fetch postal code for {address} because {e}.")
-        return pd.NA
+        logger.warning(f"Couldn't fetch zip code for {address} because of {e}.")
+        postalcode = None
 
     return postalcode
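A minimal usage sketch for the renamed helper, assuming a geopy GoogleV3 geolocator (the API key and address below are placeholders):

# Hypothetical usage; requires a valid Google Maps API key.
from geopy.geocoders import GoogleV3
from functions.geocoding_utils import return_zip_code

geolocator = GoogleV3(api_key="YOUR_GOOGLE_MAPS_API_KEY")
zip_code = return_zip_code("1600 Amphitheatre Parkway, Mountain View, CA", geolocator)
print(zip_code)  # e.g. "94043", or None if geocoding fails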
