chihacknight · dcjohnson24 · Oct 23, 2022 · Oct 24, 2022 · Oct 24, 2022 · Oct 25, 2022
diff --git a/data_analysis/compare_scheduled_and_rt.py b/data_analysis/compare_scheduled_and_rt.py
@@ -11,14 +11,18 @@
 from tqdm import tqdm
 from dotenv import load_dotenv
 
-import static_gtfs_analysis
-
+import data_analysis.static_gtfs_analysis as static_gtfs_analysis
+from scrape_data.scrape_schedule_versions import create_schedule_list
 
 load_dotenv()
 
 BUCKET_PUBLIC = os.getenv('BUCKET_PUBLIC', 'chn-ghost-buses-public')
 logger = logging.getLogger()
-logging.basicConfig(level=logging.INFO, format='%(asctime)s %(levelname)s: %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p')
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s %(levelname)s: %(message)s',
+    datefmt='%m/%d/%Y %I:%M:%S %p'
+)
 
 BASE_PATH = S3Path(f"/{BUCKET_PUBLIC}")
 
@@ -96,7 +100,7 @@ def sum_trips_by_rt_by_freq(
     rt_df: pd.DataFrame,
     sched_df: pd.DataFrame,
     agg_info: AggInfo,
-    holidays: List[str] = ["2022-05-30", "2022-07-04", "2022-09-05", "2022-11-24", "2022-12-25"]) -> Tuple[pd.DataFrame, pd.DataFrame]:
+        holidays: List[str] = ["2022-05-30", "2022-07-04", "2022-09-05", "2022-11-24", "2022-12-25"]) -> Tuple[pd.DataFrame, pd.DataFrame]:
     """Calculate ratio of trips to scheduled trips per route
        per specified frequency.
 
@@ -172,7 +176,7 @@ def combine_real_time_rt_comparison(
     schedule_data_list: List[dict],
     agg_info: AggInfo,
     holidays: List[str] = ["2022-05-31", "2022-07-04", "2022-09-05", "2022-11-24", "2022-12-25"],
-    save: bool = True) -> Tuple[pd.DataFrame, pd.DataFrame]:
+        save: bool = True) -> Tuple[pd.DataFrame, pd.DataFrame]:
     """Generate a combined DataFrame with the realtime route comparisons
 
     Args:
@@ -222,7 +226,7 @@ def combine_real_time_rt_comparison(
         for day in date_pbar:
             date_str = day.to_date_string()
             pbar.set_description(
-                f"Processing {date_str} at"
+                f" Processing {date_str} at "
                 f"{pendulum.now().to_datetime_string()}"
             )
 
@@ -266,7 +270,7 @@ def combine_real_time_rt_comparison(
                 outpath,
                 index=False,
             )
-        logger.info(f"Processing {feed['schedule_version']}")
+        logger.info(f" Processing version {feed['schedule_version']}")
         combined_grouped = pd.concat([combined_grouped, compare_by_day_type])
         combined_long = pd.concat([combined_long, compare_freq_by_rte])
 
@@ -275,7 +279,7 @@ def combine_real_time_rt_comparison(
 
 def build_summary(
     combined_df: pd.DataFrame,
-    save: bool = True) -> pd.DataFrame:
+        save: bool = True) -> pd.DataFrame:
     """Create a summary by route and day type
 
     Args:
@@ -317,58 +321,12 @@ def main(freq: str = 'D') -> Tuple[List[dict],pd.DataFrame, pd.DataFrame]:
     Args:
         freq (str): Frequency of aggregation. Defaults to Daily.
     Returns:
-        pd.DataFrame: A DataFrame of every day in the specified data with scheduled and
-            observed count of trips. 
+        pd.DataFrame: A DataFrame of every day in the specified data with
+        scheduled and observed count of trips.
         pd.DataFrame: A DataFrame summary across
             versioned schedule comparisons.
     """
-
-    schedule_feeds =  [{'schedule_version': '20220507',
-        'feed_start_date': '2022-05-20',
-        'feed_end_date': '2022-06-02'},
-        {'schedule_version': '20220603',
-        'feed_start_date': '2022-06-04',
-        'feed_end_date': '2022-06-07'},
-        {'schedule_version': '20220608',
-        'feed_start_date': '2022-06-09',
-        'feed_end_date': '2022-07-08'},
-        {'schedule_version': '20220709',
-        'feed_start_date': '2022-07-10',
-        'feed_end_date': '2022-07-17'},
-        {'schedule_version': '20220718',
-        'feed_start_date': '2022-07-19',
-        'feed_end_date': '2022-07-29'},
-        {'schedule_version': '20220730',
-        'feed_start_date': '2022-07-31',
-        'feed_end_date': '2022-08-10'},
-        {'schedule_version': '20220811',
-        'feed_start_date': '2022-08-12',
-        'feed_end_date': '2022-08-12'},
-        {'schedule_version': '20220813',
-        'feed_start_date': '2022-08-14',
-        'feed_end_date': '2022-08-16'},
-        {'schedule_version': '20220817',
-        'feed_start_date': '2022-08-18',
-        'feed_end_date': '2022-09-07'},
-        {'schedule_version': '20220908',
-        'feed_start_date': '2022-09-09',
-        'feed_end_date': '2022-09-17'},
-        {'schedule_version': '20220918',
-        'feed_start_date': '2022-09-19',
-        'feed_end_date': '2022-09-28'},
-        {'schedule_version': '20220929',
-        'feed_start_date': '2022-09-30',
-        'feed_end_date': '2022-10-06'},
-        {'schedule_version': '20221007',
-        'feed_start_date': '2022-10-08',
-        'feed_end_date': '2022-10-11'},
-        {'schedule_version': '20221012',
-        'feed_start_date': '2022-10-13',
-        'feed_end_date': '2022-10-19'},
-        {'schedule_version': '20221020',
-        'feed_start_date': '2022-10-21',
-        'feed_end_date': '2022-10-21'}
-    ]
+    schedule_feeds = create_schedule_list(month=5, year=2022)
 
     schedule_data_list = []
     pbar = tqdm(schedule_feeds)
@@ -378,19 +336,19 @@ def main(freq: str = 'D') -> Tuple[List[dict],pd.DataFrame, pd.DataFrame]:
             f"Generating daily schedule data for "
             f"schedule version {schedule_version}"
         )
-        logging.info(
+        logger.info(
             f"\nDownloading zip file for schedule version "
             f"{schedule_version}"
         )
         CTA_GTFS = static_gtfs_analysis.download_zip(schedule_version)
-        logging.info("\nExtracting data")
+        logger.info("\nExtracting data")
         data = static_gtfs_analysis.GTFSFeed.extract_data(
             CTA_GTFS,
             version_id=schedule_version
         )
         data = static_gtfs_analysis.format_dates_hours(data)
 
-        logging.info("\nSummarizing trip data")
+        logger.info("\nSummarizing trip data")
         trip_summary = static_gtfs_analysis.make_trip_summary(data, 
             pendulum.from_format(feed['feed_start_date'], 'YYYY-MM-DD'), 
             pendulum.from_format(feed['feed_end_date'], 'YYYY-MM-DD'))
@@ -404,7 +362,6 @@ def main(freq: str = 'D') -> Tuple[List[dict],pd.DataFrame, pd.DataFrame]:
             {"schedule_version": schedule_version,
              "data": route_daily_summary}
         )
-
     agg_info = AggInfo(freq=freq)
     combined_long, combined_grouped = combine_real_time_rt_comparison(
         schedule_feeds,

diff --git a/scrape_data/requirements.txt b/scrape_data/requirements.txt
@@ -1,2 +1,4 @@
 pendulum==2.1.2
-requests==2.26.0
+requests==2.26.0
+beautifulsoup4==4.11.1
+lxml==4.9.1
diff --git a/scrape_data/scrape_schedule_versions.py b/scrape_data/scrape_schedule_versions.py
@@ -0,0 +1,214 @@
+from typing import List, Tuple
+
+from bs4 import BeautifulSoup
+import requests
+import pendulum
+import logging
+import calendar
+import pandas as pd
+
+# getLogger() without a name returns the root logger.
+logger = logging.getLogger()
+# basicConfig() does nothing when the root logger already has handlers;
+# setLevel() below applies the INFO level either way.
+logging.basicConfig(level=logging.INFO)
+logger.setLevel(logging.INFO)
+
+# Site whose schedule-version listing pages are scraped in this module.
+BASE_URL = "https://transitfeeds.com"
+
+
+def check_latest_rt_data_date() -> str:
+    """Return the latest date for which real-time bus data is expected.
+
+    The result is yesterday (Chicago time) when the current Chicago hour
+    is 11 or later, and two days ago otherwise.
+
+    Returns:
+        str: The date formatted as YYYY-MM-DD.
+    """
+    chicago_tz = "America/Chicago"
+    # NOTE(review): the 11:00 cutoff presumably matches the time at which
+    # the previous day's data becomes available -- confirm with the
+    # scraping pipeline.
+    if pendulum.now(chicago_tz).hour < 11:
+        latest_day = pendulum.now(chicago_tz).subtract(days=2)
+    else:
+        latest_day = pendulum.yesterday(chicago_tz)
+    return latest_day.date().format("YYYY-MM-DD")
+
+
+def fetch_schedule_versions(month: int, year: int) -> List[pendulum.date]:
+    """Get the schedule versions from transitfeeds.com from the most recent
+       back to the specified month and year (inclusive). In case there are
+       multiple schedules for a given month and year pair,
+       all schedules will be fetched.
+
+    The listing is read page by page and is expected to be ordered from
+    newest to oldest. Scanning stops at the first version that is older
+    than the requested month, so a month without any schedule version
+    simply yields every newer version instead of scanning forever.
+
+    Args:
+        month (int): The month of interest
+        year (int): The year of interest
+
+    Returns:
+        List[pendulum.date]: A list of unique schedule versions, sorted
+            from oldest to newest. Empty if every listed version is older
+            than the requested month.
+
+    Raises:
+        requests.HTTPError: If a listing page answers with an error status.
+    """
+    target = (year, month)
+    # Maps the date text shown on the site to its parsed date. Dicts keep
+    # insertion order, so the first occurrence of a repeated date is kept.
+    # The listing runs from newest to oldest, e.g.
+    # [...'14 September 2021', '7 September 2021', '1 September 2021',
+    #  '1 September 2021', '1 September 2021', '2 August 2021',..]
+    # so the kept entry is the one that appears first on the TransitFeeds
+    # site, which is the version that was left on the CTA website the
+    # longest.
+    versions = {}
+    duplicates = set()
+    page = 1
+    passed_target = False
+    while not passed_target:
+        logger.info(f" Searching page {page}")
+        url = BASE_URL + f"/p/chicago-transit-authority/165?p={page}"
+        # A timeout keeps a stalled connection from hanging the script.
+        response = requests.get(url, timeout=30)
+        response.raise_for_status()
+        soup = BeautifulSoup(response.content, "lxml")
+        # The schedule versions are listed in the first table of the page.
+        tables = soup.find_all("table")
+        table_body = tables[0].tbody if tables else None
+        rows = table_body.find_all("tr") if table_body is not None else []
+        if not rows:
+            # Ran out of listing pages before reaching an older version.
+            logger.warning(
+                f" No schedule versions listed on page {page}."
+                f" Stopping the search for {year}-{month:02d}."
+            )
+            break
+        for row in rows:
+            cells = row.find_all("td")
+            if not cells:
+                continue
+            # The first column holds the date of the schedule version.
+            date_text = cells[0].text.strip()
+            version_date = pendulum.parse(date_text, strict=False)
+            version_month = (version_date.year, version_date.month)
+            if version_month < target:
+                # Everything from here on is older than requested.
+                passed_target = True
+                break
+            if version_month == target:
+                logger.info(
+                    f" Found schedule for"
+                    f" {calendar.month_name[version_date.month]}"
+                    f" {version_date.year}"
+                )
+                logger.info(
+                    f" Adding schedule for"
+                    f" {calendar.month_name[version_date.month]}"
+                    f" {version_date.day}, {version_date.year}"
+                )
+            if date_text in versions:
+                duplicates.add(date_text)
+            else:
+                versions[date_text] = version_date.date()
+        page += 1
+
+    # The presence of duplicates could mean that the schedule was not
+    # in-effect.
+    # See https://github.com/chihacknight/chn-ghost-buses/issues/30
+    if duplicates:
+        logger.info(
+            f" The duplicate schedule versions are"
+            f" {duplicates}. Check whether these were in-effect."
+        )
+
+    return sorted(versions.values())
+
+
+def modify_data_collection_start(date_list: List[pendulum.date]) -> List[pendulum.date]:
+    """Replace schedule version 2022-05-07 with 2022-05-19, the day before
+        data collection started on May 20, 2022.
+
+    Args:
+        date_list (List[pendulum.date]): A list of dates in pendulum format.
+            The list is modified in place.
+
+    Returns:
+        List[pendulum.date]: The same list, where any 2022-05-07 entry has
+            been replaced by 2022-05-19. This will ensure that the date
+            ranges are valid i.e. starting with 2022-05-20 up to the day
+            before the next schedule version.
+    """
+    # calculate_version_date_ranges starts every range one day after the
+    # version date, so 2022-05-19 produces a start date of 2022-05-20.
+    for position, version_date in enumerate(date_list):
+        as_ymd = (version_date.year, version_date.month, version_date.day)
+        if as_ymd == (2022, 5, 7):
+            date_list[position] = pendulum.date(2022, 5, 19)
+
+    return date_list
+
+
+def calculate_version_date_ranges(
+    month: int, year: int, start2022: bool = True
+) -> Tuple[List[pendulum.date], List[Tuple[pendulum.date, pendulum.date]]]:
+    """Get the start and end dates for each schedule version from the most
+        recent version to the version specified by the month and year
+
+    Args:
+        month (int): month of interest
+        year (int): year of interest
+        start2022 (bool, optional): Whether to modify the
+            start date of version 20220507 to reflect the start of
+            real-time bus data collection. Defaults to True.
+
+    Returns:
+        Tuple[List[pendulum.date], List[Tuple[pendulum.date, pendulum.date]]]:
+            A list of schedule versions and list of tuples for the
+            start and end dates corresponding to those versions. Both
+            lists are empty-safe: when no schedule version is found, the
+            list of date ranges is empty.
+    """
+    schedule_list = fetch_schedule_versions(month=month, year=year)
+    if start2022:
+        schedule_list = modify_data_collection_start(schedule_list)
+
+    if not schedule_list:
+        # No versions to pair up; avoids an IndexError on schedule_list[-1].
+        return schedule_list, []
+
+    # Each range runs from the day after a version's date through the day
+    # before the next version's date.
+    start_end_list = [
+        (current.add(days=1), upcoming.subtract(days=1))
+        for current, upcoming in zip(schedule_list, schedule_list[1:])
+    ]
+
+    # Handle the current schedule version by setting the end date as the
+    # latest available date for data. The helper returns a YYYY-MM-DD
+    # string, so it is parsed into a date to match the other tuples.
+    latest_rt_date = pendulum.from_format(
+        check_latest_rt_data_date(), "YYYY-MM-DD"
+    ).date()
+    # NOTE(review): if the newest version is more recent than the latest
+    # available data, this end date falls before the start date.
+    start_end_list.append((schedule_list[-1].add(days=1), latest_rt_date))
+    return schedule_list, start_end_list
+
+
+def create_schedule_list_dict(
+    schedule_list: List[pendulum.date],
+    start_end_list: List[Tuple[pendulum.date, pendulum.date]],
+) -> List[dict]:
+    """Build one dictionary per schedule version with the keys
+       schedule_version, feed_start_date, and feed_end_date
+
+    Args:
+        schedule_list (List[pendulum.date]): A list of schedule versions from
+            transitfeeds.com
+        start_end_list (List[Tuple[pendulum.date, pendulum.date]]): A list
+            of (start, end) date pairs, one for each version
+
+    Returns:
+        List[dict]: A list of dictionaries with the start and end dates
+            corresponding to each schedule version. The version is
+            formatted as YYYYMMDD and the dates as YYYY-MM-DD.
+    """
+    collection_start_proxy = pendulum.date(2022, 5, 19)
+    first_version = pendulum.date(2022, 5, 7)
+    return [
+        {
+            # Changing back the starting version to 20220507
+            "schedule_version": (
+                first_version if version == collection_start_proxy else version
+            ).format("YYYYMMDD"),
+            "feed_start_date": start_date.format("YYYY-MM-DD"),
+            "feed_end_date": end_date.format("YYYY-MM-DD"),
+        }
+        for version, (start_date, end_date) in zip(schedule_list, start_end_list)
+    ]
+
+
+def create_schedule_list(month: int, year: int, start2022: bool = True) -> List[dict]:
+    """Build the list of schedule-version dictionaries, each holding the
+       version with its start and end dates.
+
+    Args:
+        month (int): month of interest
+        year (int): year of interest
+        start2022 (bool, optional): Whether to modify the
+            start date of version 20220507 to reflect the start of
+            real-time bus data collection. Defaults to True.
+
+    Returns:
+        List[dict]: A list of dictionaries with the start and end dates
+            corresponding to each schedule version.
+    """
+    # The helper returns (versions, date ranges), which is exactly the
+    # positional order create_schedule_list_dict expects.
+    versions_and_ranges = calculate_version_date_ranges(
+        month=month, year=year, start2022=start2022
+    )
+    return create_schedule_list_dict(*versions_and_ranges)