From 81abfcaccd3073a626f0aab09e27e5c3ae9af822 Mon Sep 17 00:00:00 2001 From: remyogasawara <67338690+remyogasawara@users.noreply.github.com> Date: Fri, 8 Sep 2023 13:40:07 -0700 Subject: [PATCH] Add STLDecomposer to multiseries pipelines (#4299) * add decomposer to tests * Remove nan values * handle series and df * fix stl graph * fix condition for adding decomposer --------- Co-authored-by: christopherbunn --- docs/source/release_notes.rst | 1 + evalml/pipelines/component_graph.py | 4 +- .../preprocessing/stl_decomposer.py | 1 + evalml/pipelines/time_series_pipeline_base.py | 6 +++ evalml/pipelines/utils.py | 43 +++++++++++-------- .../automl_tests/test_default_algorithm.py | 4 +- .../automl_tests/test_iterative_algorithm.py | 7 ++- .../pipeline_tests/test_pipeline_utils.py | 2 +- 8 files changed, 45 insertions(+), 23 deletions(-) diff --git a/docs/source/release_notes.rst b/docs/source/release_notes.rst index 06a28a2e69..3f5b6707ec 100644 --- a/docs/source/release_notes.rst +++ b/docs/source/release_notes.rst @@ -5,6 +5,7 @@ Release Notes * Extended STLDecomposer to Support Multiseries :pr:`4253` * Extended TimeSeriesImputer to handle multiseries :pr:`4291` * Added datacheck to check for mismatched series length in multiseries :pr:`4296` + * Added STLDecomposer to multiseries pipelines :pr:`4299` * Fixes * Changes * Documentation Changes diff --git a/evalml/pipelines/component_graph.py b/evalml/pipelines/component_graph.py index 48e83807c1..0f3f4e5810 100644 --- a/evalml/pipelines/component_graph.py +++ b/evalml/pipelines/component_graph.py @@ -802,14 +802,16 @@ def graph(self, name=None, graph_format=None): for component_name, component_class in self.component_instances.items(): label = "%s\l" % (component_name) # noqa: W605 if isinstance(component_class, ComponentBase): + # Reformat labels for nodes: cast values as strings, reformat floats to 2 decimal points and remove brackets from dictionary values so Digraph can parse it parameters = "\\l".join( [ key + " : " + "{:0.2f}".format(val) if (isinstance(val, float)) - else key + " : " + str(val) + else key + " : " + str(val).replace("{", "").replace("}", "") for key, val in component_class.parameters.items() ], ) # noqa: W605 + label = "%s |%s\l" % (component_name, parameters) # noqa: W605 graph.node(component_name, shape="record", label=label, nodesep="0.03") diff --git a/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py b/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py index b4bcfdd029..503be35da3 100644 --- a/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py +++ b/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py @@ -442,6 +442,7 @@ def inverse_transform( y.append(y_series) y_df = pd.DataFrame(y).T y_df.index = original_index + y_df.columns = y_t.columns return y_df def get_trend_dataframe(self, X, y): diff --git a/evalml/pipelines/time_series_pipeline_base.py b/evalml/pipelines/time_series_pipeline_base.py index 3badb6dc09..37f163cd90 100644 --- a/evalml/pipelines/time_series_pipeline_base.py +++ b/evalml/pipelines/time_series_pipeline_base.py @@ -265,6 +265,12 @@ def predict_in_sample( calculating_residuals=calculating_residuals, ) predictions = self._estimator_predict(features) + if isinstance(predictions, pd.Series): + predictions = predictions.rename(self.input_target_name) + elif isinstance(predictions, pd.DataFrame): + predictions = predictions.ww.rename( + dict(zip(predictions.columns, y.columns)), + ) if len(predictions) == len(y): predictions.index = y.index predictions = self.inverse_transform(predictions) diff --git a/evalml/pipelines/utils.py b/evalml/pipelines/utils.py index dbc51abee8..e23998096d 100644 --- a/evalml/pipelines/utils.py +++ b/evalml/pipelines/utils.py @@ -233,21 +233,27 @@ def _get_time_series_featurizer(X, y, problem_type, estimator_class, sampler_nam def _get_decomposer(X, y, problem_type, estimator_class, sampler_name=None): components = [] if is_time_series(problem_type) and is_regression(problem_type): - time_index = get_time_index(X, y, None) - # If the time index frequency is uninferrable, STL will fail - if time_index.freq is None: - return components - freq = time_index.freq.name - if STLDecomposer.is_freq_valid(freq): - # Make sure there's a seasonal period - order = 3 if "Q" in freq else 5 - seasonal_period = STLDecomposer.determine_periodicity( - X, - y, - rel_max_order=order, - ) - if seasonal_period is not None and seasonal_period <= DECOMPOSER_PERIOD_CAP: - components.append(STLDecomposer) + if is_multiseries(problem_type): + components.append(STLDecomposer) + else: + time_index = get_time_index(X, y, None) + # If the time index frequency is uninferrable, STL will fail + if time_index.freq is None: + return components + freq = time_index.freq.name + if STLDecomposer.is_freq_valid(freq): + # Make sure there's a seasonal period + order = 3 if "Q" in freq else 5 + seasonal_period = STLDecomposer.determine_periodicity( + X, + y, + rel_max_order=order, + ) + if ( + seasonal_period is not None + and seasonal_period <= DECOMPOSER_PERIOD_CAP + ): + components.append(STLDecomposer) return components @@ -292,9 +298,12 @@ def _get_preprocessing_components( list[Transformer]: A list of applicable preprocessing components to use with the estimator. """ if is_multiseries(problem_type): - return [] + if include_decomposer: + components_functions = [_get_decomposer] + else: + return [] - if is_time_series(problem_type): + elif is_time_series(problem_type): components_functions = [ _get_label_encoder, _get_drop_all_null, diff --git a/evalml/tests/automl_tests/test_default_algorithm.py b/evalml/tests/automl_tests/test_default_algorithm.py index b21cc452cb..31b8a166f7 100644 --- a/evalml/tests/automl_tests/test_default_algorithm.py +++ b/evalml/tests/automl_tests/test_default_algorithm.py @@ -670,7 +670,7 @@ def test_default_algorithm_multiseries_time_series( ) first_batch = algo.next_batch() - assert len(first_batch) == 1 + assert len(first_batch) == 2 pipeline = first_batch[0] assert pipeline.model_family == ModelFamily.VARMAX assert pipeline.parameters["pipeline"] == search_parameters["pipeline"] @@ -679,7 +679,7 @@ def test_default_algorithm_multiseries_time_series( long_explore = algo.next_batch() long_estimators = set([pipeline.estimator.name for pipeline in long_explore]) - assert len(long_explore) == 50 + assert len(long_explore) == 100 assert len(long_estimators) == 1 diff --git a/evalml/tests/automl_tests/test_iterative_algorithm.py b/evalml/tests/automl_tests/test_iterative_algorithm.py index f5ed9b73ac..3030c09909 100644 --- a/evalml/tests/automl_tests/test_iterative_algorithm.py +++ b/evalml/tests/automl_tests/test_iterative_algorithm.py @@ -18,11 +18,12 @@ DateTimeFeaturizer, EmailFeaturizer, NaturalLanguageFeaturizer, + STLDecomposer, TimeSeriesFeaturizer, URLFeaturizer, ) from evalml.pipelines.components.utils import get_estimators -from evalml.pipelines.utils import make_pipeline +from evalml.pipelines.utils import is_regression, make_pipeline from evalml.problem_types import ProblemTypes, is_multiseries, is_time_series @@ -97,6 +98,7 @@ def test_iterative_algorithm_init( assert algo.batch_number == 0 assert algo.default_max_batches == 1 estimators = get_estimators(problem_type) + decomposer = [STLDecomposer] if is_regression(problem_type) else [] assert len(algo.allowed_pipelines) == len( [ make_pipeline( @@ -107,7 +109,8 @@ def test_iterative_algorithm_init( parameters=search_parameters, ) for estimator in estimators - ], + ] + + decomposer, ) diff --git a/evalml/tests/pipeline_tests/test_pipeline_utils.py b/evalml/tests/pipeline_tests/test_pipeline_utils.py index 92eb95cc0e..db6de1a9d0 100644 --- a/evalml/tests/pipeline_tests/test_pipeline_utils.py +++ b/evalml/tests/pipeline_tests/test_pipeline_utils.py @@ -170,7 +170,7 @@ def test_make_pipeline( if is_time_series(problem_type): if is_multiseries(problem_type): - expected_components = dfs + [estimator_class] + expected_components = dfs + decomposer + [estimator_class] else: expected_components = ( dfs