From 09e31f20437e6ff2ef1cf4b806159e2ca21ca039 Mon Sep 17 00:00:00 2001 From: remyogasawara Date: Mon, 21 Aug 2023 17:10:19 -0700 Subject: [PATCH] condense code --- .../transformers/preprocessing/decomposer.py | 31 +- .../preprocessing/stl_decomposer.py | 26 +- .../decomposer_tests/test_decomposer.py | 266 ++++++++---------- .../decomposer_tests/test_stl_decomposer.py | 265 +++++++---------- evalml/tests/conftest.py | 27 +- 5 files changed, 279 insertions(+), 336 deletions(-) diff --git a/evalml/pipelines/components/transformers/preprocessing/decomposer.py b/evalml/pipelines/components/transformers/preprocessing/decomposer.py index 06a7d30bfc..53d2260b7b 100644 --- a/evalml/pipelines/components/transformers/preprocessing/decomposer.py +++ b/evalml/pipelines/components/transformers/preprocessing/decomposer.py @@ -190,6 +190,34 @@ def _detrend_on_fly(X, y): relative_maxima = _get_rel_max_from_acf(y_detrended) return relative_maxima + # def set_period( + # self, + # X: pd.DataFrame, + # y: pd.Series, + # acf_threshold: float = 0.01, + # rel_max_order: int = 5, + # ): + # """Function to set the component's seasonal period based on the target's seasonality. + + # Args: + # X (pandas.DataFrame): The feature data of the time series problem. + # y (pandas.Series): The target data of a time series problem. + # acf_threshold (float) : The threshold for the autocorrelation function to determine the period. Any values below + # the threshold are considered to be 0 and will not be considered for the period. Defaults to 0.01. + # rel_max_order (int) : The order of the relative maximum to determine the period. Defaults to 5. + + # """ + # self.periods = {} + # if len(y.columns) == 1: + # self.period = self.determine_periodicity(X, y, acf_threshold, rel_max_order) + # self.update_parameters({"period": self.period}) + # self.periods[id] = self.period + # return + # else: + # for id in y.columns: + # self.periods[id] = self.determine_periodicity(X, y[id], acf_threshold, rel_max_order) + # self.update_parameters({"periods": self.periods}) + def set_period( self, X: pd.DataFrame, @@ -356,9 +384,6 @@ def plot_decomposition( fig, axs = plt.subplots(4) fig.set_size_inches(18.5, 14.5) - for ax in axs: - ax.cla() - if len(y.columns) > 1: results = decomposition_results[id] else: diff --git a/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py b/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py index d0be69088f..33061d38b2 100644 --- a/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py +++ b/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py @@ -45,12 +45,14 @@ def __init__( series_id: str = None, degree: int = 1, # Currently unused. period: int = None, + periods: dict = None, seasonal_smoother: int = 7, random_seed: int = 0, **kwargs, ): self.logger = logging.getLogger(__name__) self.series_id = series_id + self.periods = periods # Programmatically adjust seasonal_smoother to fit underlying STL requirements, # that seasonal_smoother must be odd. if seasonal_smoother % 2 == 0: @@ -64,6 +66,7 @@ def __init__( parameters = { "degree": degree, "period": period, + "periods": periods, "seasonal_smoother": seasonal_smoother, "time_index": time_index, "series_id": series_id, @@ -189,10 +192,22 @@ def fit( self.frequency = y.index.freqstr or pd.infer_freq(y.index) # Iterate through each id group self.seasonals = {} - self.periods = {} self.seasonalities = {} self.trends = {} self.residuals = {} + self.periods = {} + + # # Determine the period of the seasonal component + # # Set the period if it is single series and period is given + # if self.period is not None and len(y.columns) == 1: + # self.periods = {0: self.period} + # # Set periods if it is single series and period is + # if self.periods is None or self.period is None: + # self.set_period(X, y) + + # if self.period is None: + # self.set_period(X, y) + for id in y.columns: series_y = y[id] @@ -347,14 +362,13 @@ def inverse_transform( if isinstance(y_t, pd.Series): y_t = y_t.to_frame() + index = self._choose_proper_index(y_t) y = [] for id in y_t.columns: y_in_sample = pd.Series([]) y_out_of_sample = pd.Series([]) series_y = y_t[id] - index = self._choose_proper_index(series_y) - if len(y_t.columns) > 1: old_trend = self.trends[id] old_seasonal = self.seasonals[id] @@ -454,13 +468,9 @@ def get_trend_dataframe(self, X, y): def _decompose_target(X, y, fh, trend, seasonal, residual, period, id): """Function to generate a single DataFrame with trend, seasonality and residual components.""" - if len(y.index) == len(trend.index) and all( + if len(y.index) != len(trend.index) or not all( y.index == trend.index, ): - trend = trend - seasonal = seasonal - residual = residual - else: # TODO: Do a better job cloning. decomposer = STLDecomposer( seasonal_smoother=self.seasonal_smoother, diff --git a/evalml/tests/component_tests/decomposer_tests/test_decomposer.py b/evalml/tests/component_tests/decomposer_tests/test_decomposer.py index a20e310cfc..f4e26f4361 100644 --- a/evalml/tests/component_tests/decomposer_tests/test_decomposer.py +++ b/evalml/tests/component_tests/decomposer_tests/test_decomposer.py @@ -72,21 +72,23 @@ def test_decomposer_plot_decomposition( decomposer_child_class, y_has_time_index, generate_seasonal_data, - generate_multiseries_seasonal_data, variateness, ): + if variateness == "multivariate" and isinstance( + decomposer_child_class(), + PolynomialDecomposer, + ): + pytest.skip( + "Skipping Decomposer because multiseries is not implemented for Polynomial Decomposer", + ) + step = 0.01 period = 9 - if variateness == "univariate": - X, y = generate_seasonal_data(real_or_synthetic="synthetic")(period, step) - elif variateness == "multivariate": - if isinstance(decomposer_child_class(), PolynomialDecomposer): - pytest.skip( - "Skipping Decomposer because multiseries is not implemented for Polynomial Decomposer", - ) - X, y = generate_multiseries_seasonal_data(real_or_synthetic="synthetic")( - period, - ) + X, y = generate_seasonal_data( + real_or_synthetic="synthetic", + univariate_or_multivariate=variateness, + )(period, step) + if y_has_time_index == "y_has_time_index": y = y.set_axis(X.index) @@ -138,7 +140,7 @@ def test_decomposer_plot_decomposition( def test_decomposer_uses_time_index( decomposer_child_class, ts_data, - multiseries_ts_data_unstacked, + ts_multiseries_data, variateness, X_has_time_index, X_num_time_columns, @@ -152,10 +154,7 @@ def test_decomposer_uses_time_index( pytest.skip( "Skipping Decomposer because multiseries is not implemented for Polynomial Decomposer", ) - X, y = multiseries_ts_data_unstacked - X.index = X["date"] - y = y.set_axis(X.index) - X.ww.init() + X, _, y = ts_multiseries_data() time_index_col_name = "date" assert isinstance(X.index, pd.DatetimeIndex) @@ -453,7 +452,7 @@ def test_decomposer_projected_seasonality_integer_and_datetime( def test_decomposer_get_trend_dataframe_raises_errors( decomposer_child_class, ts_data, - multiseries_ts_data_unstacked, + ts_multiseries_data, variateness, ): if variateness == "univariate": @@ -463,12 +462,7 @@ def test_decomposer_get_trend_dataframe_raises_errors( pytest.skip( "Skipping Decomposer because multiseries is not implemented for Polynomial Decomposer", ) - X, y = multiseries_ts_data_unstacked - dts = pd.date_range("01-01-2000", periods=len(X)) - datetime_index = pd.DatetimeIndex(dts) - X.index = datetime_index - y.index = datetime_index - X["date"] = dts + X, _, y = ts_multiseries_data() dec = decomposer_child_class() dec.fit_transform(X, y) @@ -629,6 +623,7 @@ def test_decomposer_determine_periodicity_nullable_type_incompatibility( def test_decomposer_get_trend_dataframe_error_not_fit( decomposer_child_class, ts_data, + ts_multiseries_data, multiseries_ts_data_unstacked, variateness, fit_before_decompose, @@ -640,10 +635,8 @@ def test_decomposer_get_trend_dataframe_error_not_fit( pytest.skip( "Skipping Decomposer because multiseries is not implemented for Polynomial Decomposer", ) - X, y = multiseries_ts_data_unstacked - X.index = X["date"] - X.index.freq = "D" - + X, _, y = ts_multiseries_data() + # X, y = multiseries_ts_data_unstacked dec = decomposer_child_class(time_index="date") if fit_before_decompose: dec.fit_transform(X, y) @@ -669,7 +662,7 @@ def test_decomposer_get_trend_dataframe_error_not_fit( def test_decomposer_transform_returns_same_when_y_none( decomposer_child_class, ts_data, - multiseries_ts_data_unstacked, + ts_multiseries_data, variateness, ): if variateness == "univariate": @@ -679,7 +672,7 @@ def test_decomposer_transform_returns_same_when_y_none( pytest.skip( "Skipping Decomposer because multiseries is not implemented for Polynomial Decomposer", ) - X, y = multiseries_ts_data_unstacked + X, _, y = ts_multiseries_data() dec = decomposer_child_class().fit(X, y) X_t, y_t = dec.transform(X, None) @@ -701,7 +694,7 @@ def test_decomposer_transform_returns_same_when_y_none( def test_decomposer_raises_value_error_target_is_none( decomposer_child_class, ts_data, - multiseries_ts_data_unstacked, + ts_multiseries_data, variateness, ): if variateness == "univariate": @@ -711,7 +704,7 @@ def test_decomposer_raises_value_error_target_is_none( pytest.skip( "Skipping Decomposer because multiseries is not implemented for Polynomial Decomposer", ) - X, y = multiseries_ts_data_unstacked + X, _, y = ts_multiseries_data() with pytest.raises(ValueError, match="cannot be None for Decomposer!"): decomposer_child_class(degree=3).fit_transform(X, None) @@ -739,7 +732,7 @@ def test_decomposer_raises_value_error_target_is_none( def test_decomposer_bad_target_index( decomposer_child_class, ts_data, - multiseries_ts_data_unstacked, + ts_multiseries_data, variateness, ): if variateness == "univariate": @@ -749,7 +742,7 @@ def test_decomposer_bad_target_index( pytest.skip( "Skipping Decomposer because multiseries is not implemented for Polynomial Decomposer", ) - X, y = multiseries_ts_data_unstacked + X, _, y = ts_multiseries_data() dec = decomposer_child_class() y.index = pd.CategoricalIndex(["cat_index" for x in range(len(y))]) @@ -786,48 +779,52 @@ def test_decomposer_bad_target_index( def test_decomposer_fit_transform_out_of_sample( decomposer_child_class, variateness, - generate_multiseries_seasonal_data, generate_seasonal_data, transformer_fit_on_data, ): + if variateness == "multivariate" and isinstance( + decomposer_child_class(), + PolynomialDecomposer, + ): + pytest.skip( + "Skipping Decomposer because multiseries is not implemented for Polynomial Decomposer", + ) + # Generate 10 periods (the default) of synthetic seasonal data period = 7 - if variateness == "univariate": - X, y = generate_seasonal_data(real_or_synthetic="synthetic")( - period=period, - freq_str="D", - set_time_index=True, - seasonal_scale=0.05, # Increasing this value causes the decomposer to miscalculate trend - ) - subset_y = y[2 * period : 7 * period] - elif variateness == "multivariate": - if isinstance(decomposer_child_class(), PolynomialDecomposer): - pytest.skip( - "Skipping Decomposer because multiseries is not implemented for Polynomial Decomposer", - ) - X, y = generate_multiseries_seasonal_data(real_or_synthetic="synthetic")( - period=period, - freq_str="D", - set_time_index=True, - seasonal_scale=0.05, # Increasing this value causes the decomposer to miscalculate trend - ) - subset_y = y.loc[y.index[2 * period : 7 * period]] + X, y = generate_seasonal_data( + real_or_synthetic="synthetic", + univariate_or_multivariate=variateness, + )( + period=period, + freq_str="D", + set_time_index=True, + seasonal_scale=0.05, # Increasing this value causes the decomposer to miscalculate trend + ) + subset_y = y.loc[y.index[2 * period : 7 * period]] subset_X = X[2 * period : 7 * period] decomposer = decomposer_child_class(period=period) decomposer.fit(subset_X, subset_y) if transformer_fit_on_data == "in-sample": - if variateness == "univariate": - output_X, output_y = decomposer.transform(subset_X, subset_y) - pd.testing.assert_series_equal( - pd.Series(np.zeros(len(output_y))).set_axis(subset_y.index), - output_y, - check_dtype=False, - check_names=False, - atol=0.2, - ) + output_X, output_y = decomposer.transform(subset_X, subset_y) + if variateness == "multivariate": + assert_function = pd.testing.assert_frame_equal + y_expected = y_expected = pd.DataFrame( + [np.zeros(len(output_y)), np.zeros(len(output_y))], + ).T.set_axis(subset_y.index) + else: + assert_function = pd.testing.assert_series_equal + y_expected = pd.Series(np.zeros(len(output_y))).set_axis(subset_y.index) + assert_function( + y_expected, + output_y, + check_dtype=False, + check_names=False, + atol=0.2, + ) if transformer_fit_on_data != "in-sample": y_new = build_test_target( @@ -846,25 +843,23 @@ def test_decomposer_fit_transform_out_of_sample( ): output_X, output_inverse_y = decomposer.transform(None, y_new) else: - if variateness == "univariate": - output_X, output_y_t = decomposer.transform(None, y[y_new.index]) - pd.testing.assert_series_equal( - pd.Series(np.zeros(len(output_y_t))).set_axis(y_new.index), - output_y_t, - check_exact=False, - atol=0.1, # STLDecomposer is within atol=5.0e-4 - ) - elif variateness == "multivariate": + output_X, output_y_t = decomposer.transform(None, y.loc[y_new.index]) + if variateness == "multivariate": + assert_function = pd.testing.assert_frame_equal y_new = pd.DataFrame([y_new, y_new]).T - output_X, output_y_t = decomposer.transform(None, y.loc[y_new.index]) - pd.testing.assert_frame_equal( - pd.DataFrame( - [np.zeros(len(output_y_t)), np.zeros(len(output_y_t))], - ).T.set_axis(y_new.index), - output_y_t, - check_exact=False, - atol=0.1, # STLDecomposer is within atol=5.0e-4 - ) + y_expected = pd.DataFrame( + [np.zeros(len(output_y_t)), np.zeros(len(output_y_t))], + ).T.set_axis(y_new.index) + else: + assert_function = pd.testing.assert_series_equal + y_expected = pd.Series(np.zeros(len(output_y_t))).set_axis(y_new.index) + + assert_function( + y_expected, + output_y_t, + check_exact=False, + atol=0.1, # STLDecomposer is within atol=5.0e-4 + ) @pytest.mark.parametrize( @@ -895,60 +890,50 @@ def test_decomposer_inverse_transform( decomposer_child_class, index_type, generate_seasonal_data, - generate_multiseries_seasonal_data, variateness, transformer_fit_on_data, ): + if variateness == "multivariate" and isinstance( + decomposer_child_class(), + PolynomialDecomposer, + ): + pytest.skip( + "Skipping Decomposer because multiseries is not implemented for Polynomial Decomposer", + ) + # Generate 10 periods (the default) of synthetic seasonal data period = 7 - if variateness == "univariate": - X, y = generate_seasonal_data(real_or_synthetic="synthetic")( - period=period, - freq_str="D", - set_time_index=True, - seasonal_scale=0.05, # Increasing this value causes the decomposer to miscalculate trend - ) - if index_type == "integer_index": - y = y.reset_index(drop=True) - subset_y = y[: 5 * period] - elif variateness == "multivariate": - if isinstance(decomposer_child_class(), PolynomialDecomposer): - pytest.skip( - "Skipping Decomposer because multiseries is not implemented for Polynomial Decomposer", - ) - X, y = generate_multiseries_seasonal_data(real_or_synthetic="synthetic")( - period=period, - freq_str="D", - set_time_index=True, - seasonal_scale=0.05, # Increasing this value causes the decomposer to miscalculate trend - ) - if index_type == "integer_index": - y = y.reset_index(drop=True) - subset_y = y.loc[y.index[: 5 * period]] + X, y = generate_seasonal_data( + real_or_synthetic="synthetic", + univariate_or_multivariate=variateness, + )( + period=period, + freq_str="D", + set_time_index=True, + seasonal_scale=0.05, # Increasing this value causes the decomposer to miscalculate trend + ) + if index_type == "integer_index": + y = y.reset_index(drop=True) subset_X = X[: 5 * period] + subset_y = y.loc[y.index[: 5 * period]] decomposer = decomposer_child_class(period=period) output_X, output_y = decomposer.fit_transform(subset_X, subset_y) if transformer_fit_on_data == "in-sample": output_inverse_y = decomposer.inverse_transform(output_y) - if isinstance(decomposer, STLDecomposer) and variateness == "multivariate": - pd.testing.assert_frame_equal( - pd.DataFrame(subset_y), - output_inverse_y, - check_dtype=False, - ) - elif ( - isinstance(decomposer, PolynomialDecomposer) - or isinstance(decomposer, STLDecomposer) - and variateness == "univariate" - ): - pd.testing.assert_series_equal( - pd.Series(subset_y), - output_inverse_y, - check_dtype=False, - ) + if variateness == "multivariate": + assert_function = pd.testing.assert_frame_equal + y_expected = pd.DataFrame(subset_y) + else: + assert_function = pd.testing.assert_series_equal + y_expected = pd.Series(subset_y) + assert_function( + y_expected, + output_inverse_y, + check_dtype=False, + ) if transformer_fit_on_data != "in-sample": y_t_new = build_test_target( @@ -972,25 +957,20 @@ def test_decomposer_inverse_transform( output_inverse_y = decomposer.inverse_transform(y_t_new) # Because output_inverse_y.index is int32 and y[y_t_new.index].index is int64 in windows, # we need to test the indices equivalence separately. - if isinstance(decomposer, STLDecomposer) and variateness == "multivariate": - pd.testing.assert_frame_equal( - pd.DataFrame(y.loc[y_t_new.index]), - output_inverse_y, - check_exact=False, - rtol=1.0e-1, - ) - elif ( - isinstance(decomposer, PolynomialDecomposer) - or isinstance(decomposer, STLDecomposer) - and variateness == "univariate" - ): - pd.testing.assert_series_equal( - pd.Series(y[y_t_new.index]), - output_inverse_y, - check_exact=False, - check_index=False, - rtol=1.0e-1, - ) + + if variateness == "multivariate": + assert_function = pd.testing.assert_frame_equal + y_expected = pd.DataFrame(y.loc[y_t_new.index]) + else: + assert_function = pd.testing.assert_series_equal + y_expected = pd.Series(y[y_t_new.index]) + assert_function( + y_expected, + output_inverse_y, + check_exact=False, + rtol=1.0e-1, + ) + pd.testing.assert_index_equal( y.loc[y_t_new.index].index, output_inverse_y.index, @@ -1040,7 +1020,7 @@ def test_decomposer_doesnt_modify_target_index( def test_decomposer_monthly_begin_data( decomposer_child_class, ts_data, - multiseries_ts_data_unstacked, + ts_multiseries_data, variateness, ): if variateness == "univariate": @@ -1050,7 +1030,7 @@ def test_decomposer_monthly_begin_data( pytest.skip( "Skipping Decomposer because multiseries is not implemented for Polynomial Decomposer", ) - X, y = multiseries_ts_data_unstacked + X, _, y = ts_multiseries_data() dts = pd.date_range("01-01-2000", periods=len(X), freq="MS") datetime_index = pd.DatetimeIndex(dts) diff --git a/evalml/tests/component_tests/decomposer_tests/test_stl_decomposer.py b/evalml/tests/component_tests/decomposer_tests/test_stl_decomposer.py index f9936f718b..52b9cf532c 100644 --- a/evalml/tests/component_tests/decomposer_tests/test_stl_decomposer.py +++ b/evalml/tests/component_tests/decomposer_tests/test_stl_decomposer.py @@ -17,6 +17,7 @@ def test_stl_decomposer_init(): assert decomp.parameters == { "degree": 3, "period": None, + "periods": None, "seasonal_smoother": 7, "time_index": "dates", "series_id": None, @@ -28,6 +29,7 @@ def test_stl_decomposer_multiseries_init(): assert decomp.parameters == { "degree": 3, "period": None, + "periods": None, "seasonal_smoother": 7, "time_index": "dates", "series_id": "ids", @@ -52,13 +54,13 @@ def test_stl_decomposer_auto_sets_seasonal_smoother_to_odd(): def test_stl_raises_warning_high_smoother( caplog, ts_data, - multiseries_ts_data_unstacked, + ts_multiseries_data, variateness, ): if variateness == "univariate": X, _, y = ts_data() elif variateness == "multivariate": - X, y = multiseries_ts_data_unstacked + X, _, y = ts_multiseries_data() stl = STLDecomposer(seasonal_smoother=101) stl.fit(X, y) assert "STLDecomposer may perform poorly" in caplog.text @@ -119,39 +121,42 @@ def test_stl_fit_transform_in_sample( freq, trend_degree, generate_seasonal_data, - generate_multiseries_seasonal_data, variateness, ): - if variateness == "univariate": - X, y = generate_seasonal_data(real_or_synthetic="synthetic")( - period, - freq_str=freq, - trend_degree=trend_degree, - ) - elif variateness == "multivariate": - X, y = generate_multiseries_seasonal_data(real_or_synthetic="synthetic")( - period, - freq_str=freq, - trend_degree=trend_degree, - ) + X, y = generate_seasonal_data( + real_or_synthetic="synthetic", + univariate_or_multivariate=variateness, + )( + period, + freq_str=freq, + trend_degree=trend_degree, + ) stl = STLDecomposer(period=period) X_t, y_t = stl.fit_transform(X, y) - if variateness == "univariate": + # If y_t is a pd.Series, give it columns + if isinstance(y_t, pd.Series): + y_t = y_t.to_frame() + if isinstance(y, pd.Series): + y = y.to_frame() + # Get the expected answer + for id in y_t.columns: + y_t_series = y_t[id] + y_series = y[id] # Get the expected answer lin_reg = LinearRegression(fit_intercept=True) features = PolynomialFeatures(degree=trend_degree).fit_transform( np.arange(X.shape[0]).reshape(-1, 1), ) - lin_reg.fit(features, y) + lin_reg.fit(features, y_series) expected_trend = lin_reg.predict(features) # Check to make sure STL detrended/deseasoned pd.testing.assert_series_equal( - pd.Series(np.zeros(len(y_t))), - y_t, + pd.Series(np.zeros(len(y_t_series))), + y_t_series, check_exact=False, check_index=False, check_names=False, @@ -166,36 +171,6 @@ def test_stl_fit_transform_in_sample( check_names=False, atol=0.3, ) - elif variateness == "multivariate": - # Get the expected answer - for id in y.columns: - # Check to make sure STL detrended/deseasoned - y_t_series = y_t[id] - pd.testing.assert_series_equal( - pd.Series(np.zeros(len(y_t_series))), - y_t_series, - check_exact=False, - check_index=False, - check_names=False, - atol=0.1, - ) - y_series = y[id] - lin_reg = LinearRegression(fit_intercept=True) - features = PolynomialFeatures(degree=trend_degree).fit_transform( - np.arange(X.shape[0]).reshape(-1, 1), - ) - lin_reg.fit(features, y_series) - expected_trend = lin_reg.predict(features) - # Check the trend to make sure STL worked properly - pd.testing.assert_series_equal( - pd.Series(expected_trend), - pd.Series(stl.trends[id]), - check_exact=False, - check_index=False, - check_names=False, - atol=0.3, - ) - # Verify the X is not changed pd.testing.assert_frame_equal(X, X_t) @@ -223,50 +198,37 @@ def test_stl_fit_transform_in_sample( def test_stl_decomposer_inverse_transform( index_type, generate_seasonal_data, - generate_multiseries_seasonal_data, variateness, transformer_fit_on_data, ): # Generate 10 periods (the default) of synthetic seasonal data period = 7 - if variateness == "univariate": - X, y = generate_seasonal_data(real_or_synthetic="synthetic")( - period=period, - freq_str="D", - set_time_index=True, - ) - if index_type == "integer_index": - y = y.reset_index(drop=True) - subset_X = X[: 5 * period] - subset_y = y[: 5 * period] - elif variateness == "multivariate": - X, y = generate_multiseries_seasonal_data(real_or_synthetic="synthetic")( - period=period, - freq_str="D", - set_time_index=True, - ) - if index_type == "integer_index": - y = y.reset_index(drop=True) - subset_y = y.loc[y.index[: 5 * period]] + X, y = generate_seasonal_data( + real_or_synthetic="synthetic", + univariate_or_multivariate=variateness, + )( + period=period, + freq_str="D", + set_time_index=True, + ) + if index_type == "integer_index": + y = y.reset_index(drop=True) subset_X = X[: 5 * period] + subset_y = y.loc[y.index[: 5 * period]] + decomposer = STLDecomposer(period=period) output_X, output_y = decomposer.fit_transform(subset_X, subset_y) if transformer_fit_on_data == "in-sample": output_inverse_y = decomposer.inverse_transform(output_y) - if variateness == "univariate": - pd.testing.assert_series_equal( - subset_y, - output_inverse_y, - check_dtype=False, - ) - elif variateness == "multivariate": - pd.testing.assert_frame_equal( - pd.DataFrame(subset_y), - output_inverse_y, - check_dtype=False, - ) + if variateness == "multivariate": + assert_function = pd.testing.assert_frame_equal + y_expected = pd.DataFrame(subset_y) + else: + assert_function = pd.testing.assert_series_equal + y_expected = subset_y + assert_function(y_expected, output_inverse_y, check_dtype=False) if transformer_fit_on_data != "in-sample": y_t_new = build_test_target( @@ -291,20 +253,19 @@ def test_stl_decomposer_inverse_transform( # we need to test the indices equivalence separately. output_inverse_y = decomposer.inverse_transform(y_t_new) - if variateness == "univariate": - pd.testing.assert_series_equal( - y[y_t_new.index], - output_inverse_y, - check_index=False, - rtol=1.0e-2, - ) - elif variateness == "multivariate": - pd.testing.assert_frame_equal( - pd.DataFrame(y.loc[y_t_new.index]), - output_inverse_y, - check_exact=False, - rtol=1.0e-1, - ) + if variateness == "multivariate": + assert_function = pd.testing.assert_frame_equal + y_expected = pd.DataFrame(y.loc[y_t_new.index]) + else: + assert_function = pd.testing.assert_series_equal + y_expected = y[y_t_new.index] + assert_function( + y_expected, + output_inverse_y, + check_exact=False, + rtol=1.0e-1, + ) + pd.testing.assert_index_equal( y.loc[y_t_new.index].index, output_inverse_y.index, @@ -334,29 +295,21 @@ def test_stl_decomposer_inverse_transform( @pytest.mark.parametrize("fit_before_decompose", [True, False]) def test_stl_decomposer_get_trend_dataframe( generate_seasonal_data, - generate_multiseries_seasonal_data, transformer_fit_on_data, fit_before_decompose, variateness, ): period = 7 - - if variateness == "univariate": - X, y = generate_seasonal_data(real_or_synthetic="synthetic")( - period=period, - freq_str="D", - set_time_index=True, - ) - subset_y = y[: 5 * period] - elif variateness == "multivariate": - X, y = generate_multiseries_seasonal_data(real_or_synthetic="synthetic")( - period=period, - freq_str="D", - set_time_index=True, - ) - subset_y = y.loc[y.index[: 5 * period]] - + X, y = generate_seasonal_data( + real_or_synthetic="synthetic", + univariate_or_multivariate=variateness, + )( + period=period, + freq_str="D", + set_time_index=True, + ) subset_X = X[: 5 * period] + subset_y = y.loc[y.index[: 5 * period]] if transformer_fit_on_data == "in-sample": dec = STLDecomposer() @@ -446,19 +399,15 @@ def test_stl_decomposer_get_trend_dataframe( ) def test_stl_decomposer_get_trend_dataframe_sets_time_index_internally( generate_seasonal_data, - generate_multiseries_seasonal_data, variateness, ): - if variateness == "univariate": - X, y = generate_seasonal_data(real_or_synthetic="synthetic")( - period=7, - set_time_index=False, - ) - elif variateness == "multivariate": - X, y = generate_multiseries_seasonal_data(real_or_synthetic="synthetic")( - period=7, - set_time_index=False, - ) + X, y = generate_seasonal_data( + real_or_synthetic="synthetic", + univariate_or_multivariate=variateness, + )( + period=7, + set_time_index=False, + ) assert not isinstance(y.index, pd.DatetimeIndex) @@ -493,22 +442,19 @@ def test_stl_decomposer_get_trend_dataframe_sets_time_index_internally( def test_unsupported_frequencies( bad_frequency, generate_seasonal_data, - generate_multiseries_seasonal_data, variateness, ): """This test exists to highlight that even though the underlying statsmodels STL component won't work for minute or annual frequencies, we can still run these frequencies with automatic period detection. """ - if variateness == "univariate": - X, y = generate_seasonal_data(real_or_synthetic="synthetic")( - period=7, - freq_str=bad_frequency, - ) - elif variateness == "multivariate": - X, y = generate_multiseries_seasonal_data(real_or_synthetic="synthetic")( - period=7, - freq_str=bad_frequency, - ) + X, y = generate_seasonal_data( + real_or_synthetic="synthetic", + univariate_or_multivariate=variateness, + )( + period=7, + freq_str=bad_frequency, + ) + stl = STLDecomposer() X_t, y_t = stl.fit_transform(X, y) assert stl.period is not None @@ -523,19 +469,15 @@ def test_unsupported_frequencies( ) def test_stl_decomposer_doesnt_modify_target_index( generate_seasonal_data, - generate_multiseries_seasonal_data, variateness, ): - if variateness == "univariate": - X, y = generate_seasonal_data(real_or_synthetic="synthetic")( - period=7, - set_time_index=False, - ) - elif variateness == "multivariate": - X, y = generate_multiseries_seasonal_data(real_or_synthetic="synthetic")( - period=7, - set_time_index=False, - ) + X, y = generate_seasonal_data( + real_or_synthetic="synthetic", + univariate_or_multivariate=variateness, + )( + period=7, + set_time_index=False, + ) original_X_index = X.index original_y_index = y.index @@ -566,28 +508,21 @@ def test_stl_decomposer_get_trend_prediction_intervals( set_coverage, index_type, generate_seasonal_data, - generate_multiseries_seasonal_data, variateness, ): coverage = [0.75, 0.85, 0.95] if set_coverage else None period = 7 - if variateness == "univariate": - X, y = generate_seasonal_data(real_or_synthetic="synthetic")( - period=period, - freq_str="D", - set_time_index=True, - ) - y_train = y[: 15 * period] - y_validate = y[15 * period :] - elif variateness == "multivariate": - X, y = generate_multiseries_seasonal_data(real_or_synthetic="synthetic")( - period=period, - freq_str="D", - set_time_index=True, - ) - y_train = y.loc[y.index[: 15 * period]] - y_validate = y.loc[y.index[15 * period :]] + X, y = generate_seasonal_data( + real_or_synthetic="synthetic", + univariate_or_multivariate=variateness, + )( + period=period, + freq_str="D", + set_time_index=True, + ) X_train = X[: 15 * period] + y_train = y.loc[y.index[: 15 * period]] + y_validate = y.loc[y.index[15 * period :]] stl = STLDecomposer() stl.fit(X_train, y_train) @@ -626,15 +561,13 @@ def assert_pred_interval_coverage(pred_interval): ) def test_stl_decomposer_plot_decomposition( ts_data, - multiseries_ts_data_unstacked, + ts_multiseries_data, variateness, ): if variateness == "univariate": X, _, y = ts_data() elif variateness == "multivariate": - X, y = multiseries_ts_data_unstacked - X.index = X["date"] - X.index.freq = "D" + X, _, y = ts_multiseries_data() dec = STLDecomposer(time_index="date") dec.fit_transform(X, y) diff --git a/evalml/tests/conftest.py b/evalml/tests/conftest.py index 29b6fed30e..6306844a94 100644 --- a/evalml/tests/conftest.py +++ b/evalml/tests/conftest.py @@ -2520,21 +2520,9 @@ def generate_synthetic_data( y = y.set_axis(dts) return X, y - def _return_proper_func(real_or_synthetic): - if real_or_synthetic == "synthetic": - return generate_synthetic_data - elif real_or_synthetic == "real": - return generate_real_data - - return _return_proper_func - - -@pytest.fixture -def generate_multiseries_seasonal_data(): - """Function that returns data with a linear trend and a seasonal signal with specified period for multiseries.""" - - def generate_synthetic_data( + def generate_multiseries_synthetic_data( period, + step=None, num_periods=20, scale=1, seasonal_scale=1, @@ -2589,9 +2577,16 @@ def generate_synthetic_data( y_ms = pd.DataFrame(y_ms_list).T return X, y_ms - def _return_proper_func(real_or_synthetic): - if real_or_synthetic == "synthetic": + def _return_proper_func(real_or_synthetic, univariate_or_multivariate="univariate"): + if ( + real_or_synthetic == "synthetic" + and univariate_or_multivariate == "univariate" + ): return generate_synthetic_data + elif real_or_synthetic == "real" and univariate_or_multivariate == "univariate": + return generate_real_data + if univariate_or_multivariate == "multivariate": + return generate_multiseries_synthetic_data return _return_proper_func