From fbd0d114f40abb880a5b81783f4c816408a5a1de Mon Sep 17 00:00:00 2001 From: Ben Sully Date: Thu, 21 Nov 2024 13:25:22 +0000 Subject: [PATCH] fix: support holidays on non-UTC time zones Prior to this commit, when determining whether a given holiday's features should be 0 or 1 for a given timestamp, we checked whether each _day_ included in the holiday's lower-upper windows included the timestamp, and set the value to 1 if so. However, when rounding the holiday's timestamps down to 'day' we assumed that the holiday started and ended at midnight UTC, which won't be the case for certain holidays (i.e. anything outside of UTC). This commit does three things: 1. adds the 'Holiday::with_utc_offset()' method which allows a holiday to use non-UTC-aligned days when its timestamps are being floored 2. rather than adding a separate feature for each time a holiday's lower/upper window are found to contain a timestamp, reuse the same feature for each offset, which is what the Python Prophet implementation does. Really this part should be moved to a separate bugfix PR... 3. switches the lower and upper windows to be u32 instead of i32, to reflect the fact that they should never really be negative (it is quite confusing that the original Prophet expects lower windows to always be negative and upper windows always positive). --- crates/augurs-prophet/src/features.rs | 63 +++++- crates/augurs-prophet/src/prophet/prep.rs | 229 +++++++++++++++++++--- js/augurs-prophet-js/src/lib.rs | 17 +- 3 files changed, 277 insertions(+), 32 deletions(-) diff --git a/crates/augurs-prophet/src/features.rs b/crates/augurs-prophet/src/features.rs index bde803b9..854e9a86 100644 --- a/crates/augurs-prophet/src/features.rs +++ b/crates/augurs-prophet/src/features.rs @@ -1,7 +1,9 @@ //! Features used by Prophet, such as seasonality, regressors and holidays. use std::num::NonZeroU32; -use crate::{positive_float::PositiveFloat, Error, TimestampSeconds}; +use crate::{ + positive_float::PositiveFloat, prophet::prep::ONE_DAY_IN_SECONDS_INT, Error, TimestampSeconds, +}; /// The mode of a seasonality, regressor, or holiday. #[derive(Debug, Default, Clone, Copy, PartialEq, Eq)] @@ -17,9 +19,10 @@ pub enum FeatureMode { #[derive(Debug, Clone)] pub struct Holiday { pub(crate) ds: Vec, - pub(crate) lower_window: Option>, - pub(crate) upper_window: Option>, + pub(crate) lower_window: Option>, + pub(crate) upper_window: Option>, pub(crate) prior_scale: Option, + pub(crate) utc_offset: TimestampSeconds, } impl Holiday { @@ -30,6 +33,7 @@ impl Holiday { lower_window: None, upper_window: None, prior_scale: None, + utc_offset: 0, } } @@ -37,9 +41,9 @@ impl Holiday { /// /// The lower window is the number of days before the holiday /// that it is observed. For example, if the holiday is on - /// 2023-01-01 and the lower window is -1, then the holiday will + /// 2023-01-01 and the lower window is 1, then the holiday will /// _also_ be observed on 2022-12-31. - pub fn with_lower_window(mut self, lower_window: Vec) -> Result { + pub fn with_lower_window(mut self, lower_window: Vec) -> Result { if self.ds.len() != lower_window.len() { return Err(Error::MismatchedLengths { a_name: "ds".to_string(), @@ -58,7 +62,7 @@ impl Holiday { /// that it is observed. For example, if the holiday is on /// 2023-01-01 and the upper window is 1, then the holiday will /// _also_ be observed on 2023-01-02. - pub fn with_upper_window(mut self, upper_window: Vec) -> Result { + pub fn with_upper_window(mut self, upper_window: Vec) -> Result { if self.ds.len() != upper_window.len() { return Err(Error::MismatchedLengths { a_name: "ds".to_string(), @@ -76,6 +80,25 @@ impl Holiday { self.prior_scale = Some(prior_scale); self } + + /// Set the UTC offset for the holiday, in seconds. + /// + /// The UTC offset is used when deciding whether a timestamp is + /// on the holiday. + /// + /// Defaults to 0. + pub fn with_utc_offset(mut self, utc_offset: TimestampSeconds) -> Self { + self.utc_offset = utc_offset; + self + } + + /// Return the Unix timestamp of the given date, rounded down to the nearest day, + /// adjusted by the holiday's UTC offset. + pub(crate) fn floor_day(&self, ds: TimestampSeconds) -> TimestampSeconds { + let remainder = (ds + self.utc_offset) % ONE_DAY_IN_SECONDS_INT; + // Adjust the date to the holiday's UTC offset. + ds - remainder + } } /// Whether or not to standardize a regressor. @@ -232,3 +255,31 @@ impl Seasonality { self } } + +#[cfg(test)] +mod test { + use crate::features::Holiday; + + #[test] + fn holiday_floor_day_no_offset() { + let holiday = Holiday::new(vec![]); + assert_eq!(holiday.floor_day(1732147200), 1732147200); + assert_eq!(holiday.floor_day(1732189701), 1732147200); + } + + #[test] + fn holiday_floor_day_positive_offset() { + let offset = 60 * 60 * 4; + let holiday = Holiday::new(vec![]).with_utc_offset(offset); + assert_eq!(holiday.floor_day(1732132800), 1732132800); + assert_eq!(holiday.floor_day(1732132801), 1732132800); + } + + #[test] + fn holiday_floor_day_negative_offset() { + let offset = -60 * 60 * 3; + let holiday = Holiday::new(vec![]).with_utc_offset(offset); + assert_eq!(holiday.floor_day(1732158000), 1732158000); + assert_eq!(holiday.floor_day(1732165200), 1732158000); + } +} diff --git a/crates/augurs-prophet/src/prophet/prep.rs b/crates/augurs-prophet/src/prophet/prep.rs index 5893c878..000826c0 100644 --- a/crates/augurs-prophet/src/prophet/prep.rs +++ b/crates/augurs-prophet/src/prophet/prep.rs @@ -16,7 +16,7 @@ use crate::{ const ONE_YEAR_IN_SECONDS: f64 = 365.25 * 24.0 * 60.0 * 60.0; const ONE_WEEK_IN_SECONDS: f64 = 7.0 * 24.0 * 60.0 * 60.0; const ONE_DAY_IN_SECONDS: f64 = 24.0 * 60.0 * 60.0; -const ONE_DAY_IN_SECONDS_INT: i64 = 24 * 60 * 60; +pub(crate) const ONE_DAY_IN_SECONDS_INT: i64 = 24 * 60 * 60; #[derive(Debug, Clone, Default)] pub(super) struct Scales { @@ -121,7 +121,7 @@ impl ComponentColumns { } /// The name of a feature column in the `X` matrix passed to Stan. -#[derive(Debug, Clone, PartialEq, Eq)] +#[derive(Debug, Clone, PartialEq, Eq, Hash)] pub(super) enum FeatureName { /// A seasonality feature. Seasonality { @@ -660,51 +660,71 @@ impl Prophet { ) -> HashSet { let mut holiday_names = HashSet::with_capacity(holidays.len()); for (name, holiday) in holidays { + // Keep track of holiday columns here. Use a Vec and a HashMap to + // preserve order. + // For each day surrounding the holiday (decided by the lower and upper windows), + // plus the holiday itself, we want to create a new feature which is 0.0 for all + // days except that day, and 1.0 for that day. + let mut this_holiday_feature_names = Vec::new(); + let mut this_holiday_features: HashMap> = HashMap::new(); + // Default to a window of 0 days either side. let lower = holiday .lower_window .as_ref() - .map(|x| Box::new(x.iter().copied()) as Box>) + .map(|x| { + Box::new(x.iter().copied().map(|x| x as i32)) as Box> + }) .unwrap_or_else(|| Box::new(std::iter::repeat(0))); let upper = holiday .upper_window .as_ref() - .map(|x| Box::new(x.iter().copied()) as Box>) + .map(|x| { + Box::new(x.iter().copied().map(|x| x as i32)) as Box> + }) .unwrap_or_else(|| Box::new(std::iter::repeat(0))); - for (dt, lower, upper) in izip!(holiday.ds, lower, upper) { + for (dt, lower, upper) in izip!(&holiday.ds, lower, upper) { // Round down the original timestamps to the nearest day. - let remainder = dt % ONE_DAY_IN_SECONDS_INT; - let dt_date = dt - remainder; + let dt_date = holiday.floor_day(*dt); // Check each of the possible offsets allowed by the lower/upper windows. - for offset in lower..=upper { + // We know that the lower window is always positive since it was originally + // a u32, so we can use `-lower..upper`. + for offset in -lower..=upper { let offset_seconds = offset as i64 * ONE_DAY_IN_SECONDS as i64; let occurrence = dt_date + offset_seconds; let col_name = FeatureName::Holiday { name: name.clone(), _offset: offset, }; - let mut col = vec![0.0; ds.len()]; + let col = this_holiday_features + .entry(col_name.clone()) + .or_insert_with(|| { + this_holiday_feature_names.push(col_name); + vec![0.0; ds.len()] + }); // Get the indices of the ds column that are 'on holiday'. - // Set the value of the holiday column 1.0 for those dates. - for loc in ds - .iter() - .positions(|x| (x - (x % ONE_DAY_IN_SECONDS_INT)) == occurrence) - { + // Set the value of the holiday column to 1.0 for those dates. + for loc in ds.iter().positions(|&x| holiday.floor_day(x) == occurrence) { col[loc] = 1.0; } - // Add the holiday column to the features frame, and add a corresponding - // prior scale. - features.push(col_name, col); - prior_scales.push( - holiday - .prior_scale - .unwrap_or(self.opts.holidays_prior_scale), - ); } } + // Add the holiday columns to the features frame, and add a corresponding + // prior scale. + for col_name in this_holiday_feature_names { + features.push( + col_name.clone(), + this_holiday_features.remove(&col_name).unwrap(), + ); + prior_scales.push( + holiday + .prior_scale + .unwrap_or(self.opts.holidays_prior_scale), + ); + } holiday_names.insert(name.clone()); modes.insert( self.opts @@ -1078,9 +1098,19 @@ mod test { use super::*; use augurs_testing::assert_approx_eq; - use chrono::NaiveDate; + use chrono::{FixedOffset, NaiveDate, TimeZone, Utc}; use pretty_assertions::assert_eq; + macro_rules! concat_all { + ($($x:expr),+ $(,)?) => {{ + let mut result = Vec::new(); + $( + result.extend($x.iter().cloned()); + )+ + result + }}; + } + #[test] fn setup_dataframe() { let (data, _) = train_test_split(daily_univariate_ts(), 0.5); @@ -1197,6 +1227,159 @@ mod test { ); } + #[test] + fn make_holiday_features() { + // Create some hourly data between 2024-01-01 and 2024-01-07. + let start = Utc.with_ymd_and_hms(2024, 1, 1, 0, 0, 0).unwrap(); + let end = Utc.with_ymd_and_hms(2024, 1, 7, 0, 0, 0).unwrap(); + let ds = std::iter::successors(Some(start), |d| { + d.checked_add_signed(chrono::Duration::hours(1)) + }) + .take_while(|d| *d < end) + .map(|d| d.timestamp()) + .collect_vec(); + // Create two holidays: one in UTC on 2024-01-02 and 2024-01-04; + // one in UTC-3 on the same dates. + // The holidays may appear more than once since the data is hourly, + // and this shouldn't affect the results. + // Ignore windows for now. + let non_utc_tz = FixedOffset::west_opt(3600 * 3).unwrap(); + let holidays: HashMap = [ + ( + "UTC holiday".to_string(), + Holiday::new(vec![ + Utc.with_ymd_and_hms(2024, 1, 2, 0, 0, 0) + .unwrap() + .timestamp(), + Utc.with_ymd_and_hms(2024, 1, 2, 12, 0, 0) + .unwrap() + .timestamp(), + Utc.with_ymd_and_hms(2024, 1, 4, 0, 0, 0) + .unwrap() + .timestamp(), + ]), + ), + ( + "Non-UTC holiday".to_string(), + Holiday::new(vec![ + non_utc_tz + .with_ymd_and_hms(2024, 1, 2, 0, 0, 0) + .unwrap() + .timestamp(), + non_utc_tz + .with_ymd_and_hms(2024, 1, 2, 12, 0, 0) + .unwrap() + .timestamp(), + non_utc_tz + .with_ymd_and_hms(2024, 1, 4, 0, 0, 0) + .unwrap() + .timestamp(), + ]) + .with_utc_offset(-3 * 3600), + ), + ( + "Non-UTC holiday with windows".to_string(), + Holiday::new(vec![ + non_utc_tz + .with_ymd_and_hms(2024, 1, 2, 0, 0, 0) + .unwrap() + .timestamp(), + non_utc_tz + .with_ymd_and_hms(2024, 1, 2, 12, 0, 0) + .unwrap() + .timestamp(), + non_utc_tz + .with_ymd_and_hms(2024, 1, 4, 0, 0, 0) + .unwrap() + .timestamp(), + ]) + .with_lower_window(vec![1; 3]) + .unwrap() + .with_upper_window(vec![1; 3]) + .unwrap() + .with_utc_offset(-3 * 3600), + ), + ] + .into(); + let opts = ProphetOptions { + holidays: holidays.clone(), + ..Default::default() + }; + let prophet = Prophet::new(opts, MockOptimizer::new()); + let mut features_frame = FeaturesFrame::new(); + let mut prior_scales = Vec::new(); + let mut modes = Modes::default(); + + let holiday_names = prophet.make_holiday_features( + &ds, + holidays, + &mut features_frame, + &mut prior_scales, + &mut modes, + ); + assert_eq!( + holiday_names, + HashSet::from([ + "UTC holiday".to_string(), + "Non-UTC holiday".to_string(), + "Non-UTC holiday with windows".to_string() + ]) + ); + + assert_eq!(features_frame.names.len(), 5); + let utc_idx = features_frame + .names + .iter() + .position(|x| matches!(x, FeatureName::Holiday { name, .. } if name == "UTC holiday")) + .unwrap(); + assert_eq!( + features_frame.data[utc_idx], + concat_all!( + &[0.0; 24], // 2024-01-01 - off holiday + &[1.0; 24], // 2024-01-02 - on holiday + &[0.0; 24], // 2024-01-03 - off holiday + &[1.0; 24], // 2024-01-04 - on holiday + &[0.0; 48], // 2024-01-05 and 2024-01-06 - off holiday + ), + ); + let non_utc_idx = features_frame + .names + .iter() + .position( + |x| matches!(x, FeatureName::Holiday { name, .. } if name == "Non-UTC holiday"), + ) + .unwrap(); + assert_eq!( + features_frame.data[non_utc_idx], + concat_all!( + &[0.0; 24], // 2024-01-01 - off holiday + &[0.0; 3], // first 3 hours of 2024-01-02 in UTC are off holiday + &[1.0; 24], // rest of 2024-01-02 in UTC, and first 3 hours of the next day, are on holiday + &[0.0; 24], // continue the cycle... + &[1.0; 24], + &[0.0; 21 + 24], + ), + ); + + let non_utc_lower_window_idx = features_frame + .names + .iter() + .position( + |x| matches!(x, FeatureName::Holiday { name, _offset: -1 } if name == "Non-UTC holiday with windows"), + ) + .unwrap(); + assert_eq!( + features_frame.data[non_utc_lower_window_idx], + concat_all!( + &[0.0; 3], // first 3 hours of 2024-01-01 in UTC - off holiday + &[1.0; 24], // rest of 2024-01-01 and start of 2024-01-02 are on holiday + &[0.0; 24], // continue the cycle + &[1.0; 24], + &[0.0; 21 + 48], + ), + ); + } + #[test] fn regressor_column_matrix() { let holiday_dates = ["2012-10-09", "2013-10-09"] diff --git a/js/augurs-prophet-js/src/lib.rs b/js/augurs-prophet-js/src/lib.rs index 633c9e96..fe979630 100644 --- a/js/augurs-prophet-js/src/lib.rs +++ b/js/augurs-prophet-js/src/lib.rs @@ -1230,10 +1230,10 @@ pub struct Holiday { /// /// The lower window is the number of days before the holiday /// that it is observed. For example, if the holiday is on - /// 2023-01-01 and the lower window is -1, then the holiday will + /// 2023-01-01 and the lower window is 1, then the holiday will /// _also_ be observed on 2022-12-31. #[tsify(optional)] - pub lower_window: Option>, + pub lower_window: Option>, /// The upper window for the holiday. /// @@ -1242,11 +1242,19 @@ pub struct Holiday { /// 2023-01-01 and the upper window is 1, then the holiday will /// _also_ be observed on 2023-01-02. #[tsify(optional)] - pub upper_window: Option>, + pub upper_window: Option>, /// The prior scale for the holiday. #[tsify(optional)] pub prior_scale: Option, + + /// The UTC offset for the holiday, in seconds. + /// + /// The UTC offset is used when deciding whether a timestamp is + /// on the holiday. + #[tsify(optional)] + #[tsify(type = "TimestampSeconds | undefined")] + pub utc_offset_seconds: Option, } impl TryFrom for augurs_prophet::Holiday { @@ -1263,6 +1271,9 @@ impl TryFrom for augurs_prophet::Holiday { if let Some(prior_scale) = value.prior_scale { holiday = holiday.with_prior_scale(prior_scale.try_into()?); } + if let Some(utc_offset_seconds) = value.utc_offset_seconds { + holiday = holiday.with_utc_offset(utc_offset_seconds); + } Ok(holiday) } }