Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Scrape schedule versions #35

Closed
79 changes: 18 additions & 61 deletions data_analysis/compare_scheduled_and_rt.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,14 +11,18 @@
from tqdm import tqdm
from dotenv import load_dotenv

import static_gtfs_analysis

import data_analysis.static_gtfs_analysis as static_gtfs_analysis
from scrape_data.scrape_schedule_versions import create_schedule_list

load_dotenv()

BUCKET_PUBLIC = os.getenv('BUCKET_PUBLIC', 'chn-ghost-buses-public')
logger = logging.getLogger()
logging.basicConfig(level=logging.INFO, format='%(asctime)s %(levelname)s: %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p')
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s %(levelname)s: %(message)s',
datefmt='%m/%d/%Y %I:%M:%S %p'
)

BASE_PATH = S3Path(f"/{BUCKET_PUBLIC}")

Expand Down Expand Up @@ -96,7 +100,7 @@ def sum_trips_by_rt_by_freq(
rt_df: pd.DataFrame,
sched_df: pd.DataFrame,
agg_info: AggInfo,
holidays: List[str] = ["2022-05-30", "2022-07-04", "2022-09-05", "2022-11-24", "2022-12-25"]) -> Tuple[pd.DataFrame, pd.DataFrame]:
holidays: List[str] = ["2022-05-30", "2022-07-04", "2022-09-05", "2022-11-24", "2022-12-25"]) -> Tuple[pd.DataFrame, pd.DataFrame]:
"""Calculate ratio of trips to scheduled trips per route
per specified frequency.

Expand Down Expand Up @@ -172,7 +176,7 @@ def combine_real_time_rt_comparison(
schedule_data_list: List[dict],
agg_info: AggInfo,
holidays: List[str] = ["2022-05-31", "2022-07-04", "2022-09-05", "2022-11-24", "2022-12-25"],
save: bool = True) -> Tuple[pd.DataFrame, pd.DataFrame]:
save: bool = True) -> Tuple[pd.DataFrame, pd.DataFrame]:
"""Generate a combined DataFrame with the realtime route comparisons

Args:
Expand Down Expand Up @@ -222,7 +226,7 @@ def combine_real_time_rt_comparison(
for day in date_pbar:
date_str = day.to_date_string()
pbar.set_description(
f"Processing {date_str} at"
f" Processing {date_str} at "
f"{pendulum.now().to_datetime_string()}"
)

Expand Down Expand Up @@ -266,7 +270,7 @@ def combine_real_time_rt_comparison(
outpath,
index=False,
)
logger.info(f"Processing {feed['schedule_version']}")
logger.info(f" Processing version {feed['schedule_version']}")
combined_grouped = pd.concat([combined_grouped, compare_by_day_type])
combined_long = pd.concat([combined_long, compare_freq_by_rte])

Expand All @@ -275,7 +279,7 @@ def combine_real_time_rt_comparison(

def build_summary(
combined_df: pd.DataFrame,
save: bool = True) -> pd.DataFrame:
save: bool = True) -> pd.DataFrame:
"""Create a summary by route and day type

Args:
Expand Down Expand Up @@ -317,58 +321,12 @@ def main(freq: str = 'D') -> Tuple[List[dict],pd.DataFrame, pd.DataFrame]:
Args:
freq (str): Frequency of aggregation. Defaults to Daily.
Returns:
pd.DataFrame: A DataFrame of every day in the specified data with scheduled and
observed count of trips.
pd.DataFrame: A DataFrame of every day in the specified data with
scheduled and observed count of trips.
pd.DataFrame: A DataFrame summary across
versioned schedule comparisons.
"""

schedule_feeds = [{'schedule_version': '20220507',
'feed_start_date': '2022-05-20',
'feed_end_date': '2022-06-02'},
{'schedule_version': '20220603',
'feed_start_date': '2022-06-04',
'feed_end_date': '2022-06-07'},
{'schedule_version': '20220608',
'feed_start_date': '2022-06-09',
'feed_end_date': '2022-07-08'},
{'schedule_version': '20220709',
'feed_start_date': '2022-07-10',
'feed_end_date': '2022-07-17'},
{'schedule_version': '20220718',
'feed_start_date': '2022-07-19',
'feed_end_date': '2022-07-29'},
{'schedule_version': '20220730',
'feed_start_date': '2022-07-31',
'feed_end_date': '2022-08-10'},
{'schedule_version': '20220811',
'feed_start_date': '2022-08-12',
'feed_end_date': '2022-08-12'},
{'schedule_version': '20220813',
'feed_start_date': '2022-08-14',
'feed_end_date': '2022-08-16'},
{'schedule_version': '20220817',
'feed_start_date': '2022-08-18',
'feed_end_date': '2022-09-07'},
{'schedule_version': '20220908',
'feed_start_date': '2022-09-09',
'feed_end_date': '2022-09-17'},
{'schedule_version': '20220918',
'feed_start_date': '2022-09-19',
'feed_end_date': '2022-09-28'},
{'schedule_version': '20220929',
'feed_start_date': '2022-09-30',
'feed_end_date': '2022-10-06'},
{'schedule_version': '20221007',
'feed_start_date': '2022-10-08',
'feed_end_date': '2022-10-11'},
{'schedule_version': '20221012',
'feed_start_date': '2022-10-13',
'feed_end_date': '2022-10-19'},
{'schedule_version': '20221020',
'feed_start_date': '2022-10-21',
'feed_end_date': '2022-10-21'}
]
schedule_feeds = create_schedule_list(month=5, year=2022)

schedule_data_list = []
pbar = tqdm(schedule_feeds)
Expand All @@ -378,19 +336,19 @@ def main(freq: str = 'D') -> Tuple[List[dict],pd.DataFrame, pd.DataFrame]:
f"Generating daily schedule data for "
f"schedule version {schedule_version}"
)
logging.info(
logger.info(
f"\nDownloading zip file for schedule version "
f"{schedule_version}"
)
CTA_GTFS = static_gtfs_analysis.download_zip(schedule_version)
logging.info("\nExtracting data")
logger.info("\nExtracting data")
data = static_gtfs_analysis.GTFSFeed.extract_data(
CTA_GTFS,
version_id=schedule_version
)
data = static_gtfs_analysis.format_dates_hours(data)

logging.info("\nSummarizing trip data")
logger.info("\nSummarizing trip data")
trip_summary = static_gtfs_analysis.make_trip_summary(data,
pendulum.from_format(feed['feed_start_date'], 'YYYY-MM-DD'),
pendulum.from_format(feed['feed_end_date'], 'YYYY-MM-DD'))
Expand All @@ -404,7 +362,6 @@ def main(freq: str = 'D') -> Tuple[List[dict],pd.DataFrame, pd.DataFrame]:
{"schedule_version": schedule_version,
"data": route_daily_summary}
)

agg_info = AggInfo(freq=freq)
combined_long, combined_grouped = combine_real_time_rt_comparison(
schedule_feeds,
Expand Down
4 changes: 3 additions & 1 deletion scrape_data/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,2 +1,4 @@
pendulum==2.1.2
requests==2.26.0
requests==2.26.0
beautifulsoup4==4.11.1
lxml==4.9.1
209 changes: 209 additions & 0 deletions scrape_data/scrape_schedule_versions.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,209 @@
from typing import List, Tuple

from bs4 import BeautifulSoup
import requests
import pendulum
import logging
import calendar
import pandas as pd

# Module-wide logger. getLogger() without a name returns the root logger;
# the two calls below ask for INFO-level output when this module is imported.
logger = logging.getLogger()
logging.basicConfig(level=logging.INFO)
logger.setLevel(logging.INFO)

# Site whose CTA feed listing is scraped for schedule versions.
BASE_URL = "https://transitfeeds.com"


def check_latest_rt_data_date() -> str:
    """Return the most recent date assumed to have real-time bus data.

    From 11:00 onwards (America/Chicago) the answer is yesterday; before
    that hour it is the day before yesterday.
    NOTE(review): presumably the previous day's data is only available
    from about 11:00 Chicago time - confirm against the scraper schedule.

    Returns:
        str: The date in YYYY-MM-DD format.
    """
    chicago = "America/Chicago"
    if pendulum.now(chicago).hour < 11:
        latest_day = pendulum.now(chicago).subtract(days=2).date()
    else:
        latest_day = pendulum.yesterday(chicago).date()
    return latest_day.format("YYYY-MM-DD")


def fetch_schedule_versions(month: int, year: int) -> List[pendulum.date]:
    """Get the schedule versions from transitfeeds.com from the most recent
    to specified month and year (inclusive). In case there are
    multiple schedules for a given month and year pair,
    all schedules will be fetched, including those that continue onto the
    next page of the listing.

    The listing is read page by page in display order (newest first).
    Scanning stops at the first row that follows the requested month, at
    the first row older than the requested month, or when a page has no
    rows left.

    Args:
        month (int): The month of interest
        year (int): The year of interest

    Returns:
        List[pendulum.date]: A list of unique schedule versions, sorted
            from oldest to newest.

    Raises:
        requests.HTTPError: If a listing page answers with an error status.
    """
    version_texts = []
    page = 1
    found = False
    finished = False
    while not finished:
        logger.info(f" Searching page {page}")
        url = BASE_URL + f"/p/chicago-transit-authority/165?p={page}"
        # Bound the wait and fail loudly on an HTTP error instead of
        # trying to parse an error page.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, "lxml")

        # Dates are in the first column of the first table on the page.
        tables = soup.find_all("table")
        rows = []
        if tables and tables[0].tbody is not None:
            rows = tables[0].tbody.find_all("tr")
        if not rows:
            # Past the end of the listing: stop instead of looping forever.
            logger.info(f" No schedule rows on page {page}. Stopping search.")
            break

        for row in rows:
            first_col = row.find_all("td")[0]
            date_text = first_col.text.strip()
            date = pendulum.parse(date_text, strict=False)
            # Find schedules up to and including the specified date.
            if date.month == month and date.year == year:
                logger.info(
                    f" Found schedule for"
                    f" {calendar.month_name[date.month]} {date.year}"
                )
                logger.info(
                    f" Adding schedule for {calendar.month_name[date.month]}"
                    f" {date.day}, {date.year}"
                )
                found = True
            elif found or (date.year, date.month) < (year, month):
                # Either the requested month has been fully read, or the
                # listing has moved past it without containing it.
                finished = True
                break
            version_texts.append(date_text)
        # Only leave the loop on a terminating row, never merely because a
        # page ended: the requested month may continue on the next page.
        page += 1

    # Check for duplicates. The presence of duplicates could mean
    # that the schedule was not in-effect.
    # See https://github.com/chihacknight/chn-ghost-buses/issues/30
    seen = set()
    duplicates = set()
    for date_text in version_texts:
        if date_text in seen:
            duplicates.add(date_text)
        seen.add(date_text)
    if duplicates:
        logger.info(
            f" The duplicate schedule versions are"
            f" {duplicates}. Check whether these were in-effect."
        )

    # Keep the first occurrence of duplicates. "First" means first in the
    # order of display on Transitfeeds, which is reverse chronological, e.g.
    #   [..., '14 September 2021', '7 September 2021', '1 September 2021',
    #    '1 September 2021', '1 September 2021', '2 August 2021', ...]
    # so the first occurrence is the latest version posted for that date.
    # This will be the schedule that was left on the CTA website
    # the longest.
    unique_texts = list(dict.fromkeys(version_texts))

    return sorted(
        pendulum.parse(date_text, strict=False).date()
        for date_text in unique_texts
    )


def modify_data_collection_start(date_list: List[pendulum.date]) -> List[pendulum.date]:
    """Move schedule version 2022-05-07 to 2022-05-19 so that its date
    range begins with the start of data collection on May 20, 2022

    The input list is left untouched; a new list is returned.

    Args:
        date_list (List[pendulum.date]): A list of dates in pendulum format

    Returns:
        List[pendulum.date]: A new list of dates in pendulum format where the
            start date for schedule version 2022-05-07
            is 2022-05-19. This will ensure that the date
            ranges are valid i.e. starting with 2022-05-20 up to the day
            before the next schedule version. All other dates are
            returned unchanged and in the same order.
    """
    # For schedule version 20220507, set the date to be May 19th 2022,
    # one day before the start of data collection. This will mean that
    # the start date will fall on 2022-05-20 in calculate_version_date_ranges.
    # replace() keeps the element's own date type.
    return [
        date.replace(day=19)
        if (date.year, date.month, date.day) == (2022, 5, 7)
        else date
        for date in date_list
    ]


def calculate_version_date_ranges(
    month: int, year: int, start2022: bool = True
) -> Tuple[List[pendulum.date], List[Tuple[pendulum.date, pendulum.date]]]:
    """Get the start and end dates for each schedule version from the most
    recent version to the version specified by the month and year

    Args:
        month (int): month of interest
        year (int): year of interest
        start2022 (bool, optional): Whether to modify the
            start date of version 20220507 to reflect the start of
            real-time bus data collection. Defaults to True.

    Returns:
        Tuple[List[pendulum.date], List[Tuple[pendulum.date, pendulum.date]]]:
            A list of schedule versions and list of tuples for the
            start and end dates corresponding to those versions. Both
            lists are empty when no schedule version was found.
    """
    schedule_list = fetch_schedule_versions(month=month, year=year)
    if not schedule_list:
        logger.warning(" No schedule versions found. Returning no date ranges.")
        return [], []
    if start2022:
        schedule_list = modify_data_collection_start(schedule_list)

    # Every version except the newest runs from the day after its own date
    # through the day before the next version's date.
    start_end_list = [
        (version.add(days=1), next_version.subtract(days=1))
        for version, next_version in zip(schedule_list, schedule_list[1:])
    ]

    # Handle the current schedule version by setting the end date as the latest
    # available date for data. The helper returns a string, so convert it to
    # a date to keep every tuple element the same type.
    latest_rt_date = pendulum.from_format(
        check_latest_rt_data_date(), "YYYY-MM-DD"
    ).date()
    # NOTE(review): if the newest version date is not earlier than the latest
    # real-time date, this range starts after it ends - confirm that the
    # downstream comparison tolerates that.
    start_end_list.append((schedule_list[-1].add(days=1), latest_rt_date))
    return schedule_list, start_end_list


def create_schedule_list_dict(
    schedule_list: List[pendulum.date],
    start_end_list: List[Tuple[pendulum.date, pendulum.date]],
) -> List[dict]:
    """Pair every schedule version with its date range as a dictionary.

    Versions and ranges are matched by position; pairing stops at the end
    of the shorter list.

    Args:
        schedule_list (List[pendulum.date]): A list of schedule versions from
            transitfeeds.com
        start_end_list (List[pendulum.date]): A list of start and end dates
            for each version

    Returns:
        List[dict]: One dictionary per version with the keys
            "schedule_version" (YYYYMMDD), "feed_start_date" and
            "feed_end_date" (both YYYY-MM-DD).
    """
    shifted_version = pendulum.date(2022, 5, 19)
    true_version = pendulum.date(2022, 5, 7)

    def to_entry(version, date_range):
        start_date, end_date = date_range
        # Changing back the starting version to 20220507
        version_id = true_version if version == shifted_version else version
        return {
            "schedule_version": version_id.format("YYYYMMDD"),
            "feed_start_date": start_date.format("YYYY-MM-DD"),
            "feed_end_date": end_date.format("YYYY-MM-DD"),
        }

    return [
        to_entry(version, date_range)
        for version, date_range in zip(schedule_list, start_end_list)
    ]


def create_schedule_list(month: int, year: int, start2022: bool = True) -> List[dict]:
    """Build the per-version list of start and end dates in one call.

    Computes the schedule versions and their date ranges, then converts
    the two lists into dictionaries.

    Args:
        month (int): month of interest
        year (int): year of interest
        start2022 (bool, optional): Whether to modify the
            start date of version 20220507 to reflect the start of
            real-time bus data collection. Defaults to True.

    Returns:
        List[dict]: A list of dictionaries with the start and end dates
            corresponding to each schedule version.
    """
    versions_and_ranges = calculate_version_date_ranges(
        month=month,
        year=year,
        start2022=start2022,
    )
    # The tuple is (schedule versions, start/end ranges), which is the
    # positional order create_schedule_list_dict expects.
    return create_schedule_list_dict(*versions_and_ranges)