From 41f8e875a3c86cf6a96c05d746673a13621961b1 Mon Sep 17 00:00:00 2001 From: Becca McBrayer Date: Thu, 17 Aug 2023 09:43:26 -0400 Subject: [PATCH] Swap is_multiseries logic to problem type (#4278) * Add multiseries time series regression as problem type * Completely revamp to multiseries based on problem type --- .../automl_algorithm/default_algorithm.py | 18 +----- .../automl_algorithm/iterative_algorithm.py | 6 -- evalml/automl/automl_search.py | 17 +++--- evalml/automl/utils.py | 1 + evalml/objectives/regression_objective.py | 8 ++- evalml/pipelines/components/component_base.py | 1 - ...tiseries_time_series_baseline_regressor.py | 5 +- .../estimators/regressors/varmax_regressor.py | 5 +- evalml/pipelines/components/utils.py | 16 +---- .../multiseries_regression_pipeline.py | 4 +- evalml/pipelines/utils.py | 36 +++++------ evalml/problem_types/__init__.py | 1 + evalml/problem_types/problem_types.py | 3 + evalml/problem_types/utils.py | 17 ++++++ .../parallel_tests/test_automl_dask.py | 14 ++++- evalml/tests/automl_tests/test_automl.py | 61 +++++++++++-------- .../tests/automl_tests/test_automl_utils.py | 22 +++---- .../automl_tests/test_default_algorithm.py | 12 ++-- .../automl_tests/test_iterative_algorithm.py | 14 +++-- .../tests/component_tests/test_components.py | 41 +++++++------ .../test_multiseries_baseline_regressor.py | 5 +- evalml/tests/component_tests/test_utils.py | 1 + .../component_tests/test_varmax_regressor.py | 2 +- evalml/tests/conftest.py | 7 +-- .../integration_tests/test_nullable_types.py | 12 +++- .../pipeline_tests/test_pipeline_utils.py | 21 +++---- evalml/tests/pipeline_tests/test_pipelines.py | 2 + .../problem_type_tests/test_problem_types.py | 11 +++- 28 files changed, 192 insertions(+), 171 deletions(-) diff --git a/evalml/automl/automl_algorithm/default_algorithm.py b/evalml/automl/automl_algorithm/default_algorithm.py index 5042e5cccf..bb6f7591b5 100644 --- a/evalml/automl/automl_algorithm/default_algorithm.py +++ b/evalml/automl/automl_algorithm/default_algorithm.py @@ -25,7 +25,7 @@ _make_pipeline_from_multiple_graphs, make_pipeline, ) -from evalml.problem_types import is_regression, is_time_series +from evalml.problem_types import is_multiseries, is_regression, is_time_series from evalml.utils import infer_feature_types from evalml.utils.logger import get_logger @@ -81,7 +81,6 @@ class DefaultAlgorithm(AutoMLAlgorithm): model families. Run evalml.pipelines.components.utils.allowed_model_families("binary") to see options. Change `binary` to `multiclass` or `regression` depending on the problem type. excluded_model_families (list[ModelFamily]): A list of model families to exclude from the estimators used when building pipelines. For default algorithm, this only excludes estimators in the non-naive batches. - is_multiseries (bool): Whether or not the problem is a multiseries time series problem. Defaults to False. """ def __init__( @@ -106,7 +105,6 @@ def __init__( run_feature_selection=True, verbose=False, exclude_featurizers=None, - is_multiseries=False, ): super().__init__( allowed_pipelines=[], @@ -140,7 +138,6 @@ def __init__( self.run_feature_selection = run_feature_selection self.ensembling = ensembling self.exclude_featurizers = exclude_featurizers or [] - self.is_multiseries = is_multiseries if allowed_model_families is not None and excluded_model_families is not None: raise ValueError( @@ -173,7 +170,7 @@ def default_max_batches(self): """Returns the number of max batches AutoMLSearch should run by default.""" if self.ensembling: return 3 - elif self.is_multiseries: + elif is_multiseries(self.problem_type): return 1 else: return 2 @@ -222,7 +219,6 @@ def _non_naive_estimators(self): self.problem_type, model_families=self.allowed_model_families, excluded_model_families=self.excluded_model_families, - is_multiseries=self.is_multiseries, ) if est not in self._naive_estimators() ] @@ -271,7 +267,6 @@ def _create_naive_pipelines(self, use_features=False): ), features=self.features, exclude_featurizers=self.exclude_featurizers, - is_multiseries=self.is_multiseries, ) for estimator in estimators ] @@ -300,7 +295,6 @@ def _add_without_pipelines(self, pipelines, estimators, feature_selector=[]): features=self.features, exclude_featurizers=self.exclude_featurizers, include_decomposer=False, - is_multiseries=self.is_multiseries, ) for estimator in estimators ] @@ -440,7 +434,6 @@ def _make_pipelines_helper(self, estimators): ), features=self.features, exclude_featurizers=self.exclude_featurizers, - is_multiseries=self.is_multiseries, ) for estimator in estimators ] @@ -484,7 +477,7 @@ def next_batch(self): # Skip the naive batch for multiseries time series batch = ( self._batch_number - if not self.is_multiseries + if not is_multiseries(self.problem_type) else self._batch_number + 1 ) if batch == 0: @@ -679,7 +672,6 @@ def _make_split_pipeline(self, estimator, pipeline_name=None): extra_components_before=[SelectColumns], use_estimator=False, exclude_featurizers=self.exclude_featurizers, - is_multiseries=self.is_multiseries, ) numeric_pipeline = make_pipeline( @@ -693,7 +685,6 @@ def _make_split_pipeline(self, estimator, pipeline_name=None): extra_components_after=[SelectColumns], use_estimator=False, exclude_featurizers=self.exclude_featurizers, - is_multiseries=self.is_multiseries, ) pre_pipeline_components = ( {"DFS Transformer": ["DFS Transformer", "X", "y"]} @@ -745,7 +736,6 @@ def _make_split_pipeline(self, estimator, pipeline_name=None): extra_components_before=[SelectColumns], features=self.features, exclude_featurizers=self.exclude_featurizers, - is_multiseries=self.is_multiseries, ) return categorical_pipeline elif self.run_feature_selection: @@ -762,7 +752,6 @@ def _make_split_pipeline(self, estimator, pipeline_name=None): extra_components_after=[SelectColumns], features=self.features, exclude_featurizers=self.exclude_featurizers, - is_multiseries=self.is_multiseries, ) return numeric_pipeline @@ -774,6 +763,5 @@ def _make_split_pipeline(self, estimator, pipeline_name=None): self.problem_type, sampler_name=self.sampler_name, exclude_featurizers=self.exclude_featurizers, - is_multiseries=self.is_multiseries, ) return pipeline diff --git a/evalml/automl/automl_algorithm/iterative_algorithm.py b/evalml/automl/automl_algorithm/iterative_algorithm.py index ae8dc098f3..cd09d679db 100644 --- a/evalml/automl/automl_algorithm/iterative_algorithm.py +++ b/evalml/automl/automl_algorithm/iterative_algorithm.py @@ -69,7 +69,6 @@ class IterativeAlgorithm(AutoMLAlgorithm): verbose (boolean): Whether or not to display logging information regarding pipeline building. Defaults to False. exclude_featurizers (list[str]): A list of featurizer components to exclude from the pipelines built by IterativeAlgorithm. Valid options are "DatetimeFeaturizer", "EmailFeaturizer", "URLFeaturizer", "NaturalLanguageFeaturizer", "TimeSeriesFeaturizer" - is_multiseries (bool): Whether or not the problem is a multiseries time series problem. Defaults to False. """ def __init__( @@ -96,7 +95,6 @@ def __init__( features=None, verbose=False, exclude_featurizers=None, - is_multiseries=False, ): self.X = infer_feature_types(X) self.y = infer_feature_types(y) @@ -131,7 +129,6 @@ def __init__( self.features = features self._set_additional_pipeline_params() self.exclude_featurizers = exclude_featurizers - self.is_multiseries = is_multiseries super().__init__( allowed_pipelines=self.allowed_pipelines, @@ -159,7 +156,6 @@ def _create_pipelines(self): self.problem_type, model_families=self.allowed_model_families, excluded_model_families=self.excluded_model_families, - is_multiseries=self.is_multiseries, ) allowed_estimators = self._filter_estimators( allowed_estimators, @@ -188,7 +184,6 @@ def _create_pipelines(self): ).get("known_in_advance", None), features=self.features, exclude_featurizers=self.exclude_featurizers, - is_multiseries=self.is_multiseries, ) for estimator in allowed_estimators ] @@ -212,7 +207,6 @@ def _create_pipelines(self): features=self.features, exclude_featurizers=self.exclude_featurizers, include_decomposer=False, - is_multiseries=self.is_multiseries, ) for estimator in allowed_estimators ] diff --git a/evalml/automl/automl_search.py b/evalml/automl/automl_search.py index ed4a01d03f..f4fdc5df50 100644 --- a/evalml/automl/automl_search.py +++ b/evalml/automl/automl_search.py @@ -65,6 +65,7 @@ handle_problem_types, is_binary, is_classification, + is_multiseries, is_time_series, ) from evalml.tuners import SKOptTuner @@ -625,10 +626,6 @@ def __init__( self.problem_configuration = self._validate_problem_configuration( problem_configuration, ) - self.is_multiseries = ( - is_time_series(self.problem_type) - and self.problem_configuration.get("series_id") is not None - ) self._train_best_pipeline = train_best_pipeline self._best_pipeline = None self._searched = False @@ -657,7 +654,7 @@ def __init__( ) # For multiseries problems, we need to mke sure that the data is primarily ordered by the time_index rather than the series_id - if self.is_multiseries: + if is_multiseries(self.problem_type): time_index = self.problem_configuration.get("time_index") series_id = self.problem_configuration.get("series_id") X_train = X_train.sort_values([time_index, series_id]) @@ -946,7 +943,6 @@ def _is_imbalanced(X, y, problem_type): features=features, verbose=self.verbose, exclude_featurizers=self.exclude_featurizers, - is_multiseries=self.is_multiseries, ) elif automl_algorithm == "default": self.automl_algorithm = DefaultAlgorithm( @@ -967,7 +963,6 @@ def _is_imbalanced(X, y, problem_type): verbose=self.verbose, n_jobs=self.n_jobs, exclude_featurizers=self.exclude_featurizers, - is_multiseries=self.is_multiseries, ) else: raise ValueError("Please specify a valid automl algorithm.") @@ -1068,6 +1063,13 @@ def _validate_problem_configuration(self, problem_configuration=None): is_valid, msg = contains_all_ts_parameters(problem_configuration) if not is_valid: raise ValueError(msg) + if ( + is_multiseries(self.problem_type) + and "series_id" not in problem_configuration + ): + raise ValueError( + "Must provide 'series_id' column in problem_configuration for multiseries time series problems.", + ) return problem_configuration or {} def _handle_keyboard_interrupt(self): @@ -1380,7 +1382,6 @@ def _get_baseline_pipeline(self): forecast_horizon, time_index, exclude_timeseries_featurizer, - self.is_multiseries, series_id, ) return baseline diff --git a/evalml/automl/utils.py b/evalml/automl/utils.py index cfaedf6832..56331807b4 100644 --- a/evalml/automl/utils.py +++ b/evalml/automl/utils.py @@ -49,6 +49,7 @@ def get_default_primary_search_objective(problem_type): "time series regression": "MedianAE", "time series binary": "Log Loss Binary", "time series multiclass": "Log Loss Multiclass", + "multiseries time series regression": "MedianAE", }[problem_type.value] return get_objective(objective_name, return_instance=True) diff --git a/evalml/objectives/regression_objective.py b/evalml/objectives/regression_objective.py index 4d76902631..a118d14b86 100644 --- a/evalml/objectives/regression_objective.py +++ b/evalml/objectives/regression_objective.py @@ -6,5 +6,9 @@ class RegressionObjective(ObjectiveBase): """Base class for all regression objectives.""" - problem_types = [ProblemTypes.REGRESSION, ProblemTypes.TIME_SERIES_REGRESSION] - """[ProblemTypes.REGRESSION, ProblemTypes.TIME_SERIES_REGRESSION]""" + problem_types = [ + ProblemTypes.REGRESSION, + ProblemTypes.TIME_SERIES_REGRESSION, + ProblemTypes.MULTISERIES_TIME_SERIES_REGRESSION, + ] + """[ProblemTypes.REGRESSION, ProblemTypes.TIME_SERIES_REGRESSION, ProblemTypes.MULTISERIES_TIME_SERIES_REGRESSION]""" diff --git a/evalml/pipelines/components/component_base.py b/evalml/pipelines/components/component_base.py index 5c00a053e1..12b6603bb4 100644 --- a/evalml/pipelines/components/component_base.py +++ b/evalml/pipelines/components/component_base.py @@ -31,7 +31,6 @@ class ComponentBase(ABC, metaclass=ComponentBaseMeta): # Referring to the pandas nullable dtypes; not just woodwork logical types _integer_nullable_incompatibilities = [] _boolean_nullable_incompatibilities = [] - is_multiseries = False def __init__(self, parameters=None, component_obj=None, random_seed=0, **kwargs): """Base class for all components. diff --git a/evalml/pipelines/components/estimators/regressors/multiseries_time_series_baseline_regressor.py b/evalml/pipelines/components/estimators/regressors/multiseries_time_series_baseline_regressor.py index 1ca88cd6bb..80be329341 100644 --- a/evalml/pipelines/components/estimators/regressors/multiseries_time_series_baseline_regressor.py +++ b/evalml/pipelines/components/estimators/regressors/multiseries_time_series_baseline_regressor.py @@ -25,12 +25,11 @@ class MultiseriesTimeSeriesBaselineRegressor(Estimator): """{}""" model_family = ModelFamily.BASELINE """ModelFamily.BASELINE""" - is_multiseries = True supported_problem_types = [ - ProblemTypes.TIME_SERIES_REGRESSION, + ProblemTypes.MULTISERIES_TIME_SERIES_REGRESSION, ] """[ - ProblemTypes.TIME_SERIES_REGRESSION, + ProblemTypes.MULTISERIES_TIME_SERIES_REGRESSION, ]""" def __init__(self, gap=1, forecast_horizon=1, random_seed=0, **kwargs): diff --git a/evalml/pipelines/components/estimators/regressors/varmax_regressor.py b/evalml/pipelines/components/estimators/regressors/varmax_regressor.py index df72ff72da..0c8c57d27c 100644 --- a/evalml/pipelines/components/estimators/regressors/varmax_regressor.py +++ b/evalml/pipelines/components/estimators/regressors/varmax_regressor.py @@ -48,10 +48,9 @@ class VARMAXRegressor(Estimator): "trend": Categorical(['n', 'c', 't', 'ct']), }""" model_family = ModelFamily.VARMAX - is_multiseries = True """ModelFamily.VARMAX""" - supported_problem_types = [ProblemTypes.TIME_SERIES_REGRESSION] - """[ProblemTypes.TIME_SERIES_REGRESSION]""" + supported_problem_types = [ProblemTypes.MULTISERIES_TIME_SERIES_REGRESSION] + """[ProblemTypes.MULTISERIES_TIME_SERIES_REGRESSION]""" def __init__( self, diff --git a/evalml/pipelines/components/utils.py b/evalml/pipelines/components/utils.py index c8e48e510c..b8dcb833c2 100644 --- a/evalml/pipelines/components/utils.py +++ b/evalml/pipelines/components/utils.py @@ -14,7 +14,7 @@ from evalml.pipelines.components.component_base import ComponentBase from evalml.pipelines.components.estimators.estimator import Estimator from evalml.pipelines.components.transformers.transformer import Transformer -from evalml.problem_types import ProblemTypes, handle_problem_types, is_time_series +from evalml.problem_types import ProblemTypes, handle_problem_types from evalml.utils import get_importable_subclasses @@ -56,18 +56,10 @@ def allowed_model_families(problem_type): return list(set([e.model_family for e in estimators])) -def _filter_multiseries_estimators(estimators, is_multiseries): - if is_multiseries: - return [estimator for estimator in estimators if estimator.is_multiseries] - else: - return [estimator for estimator in estimators if not estimator.is_multiseries] - - def get_estimators( problem_type, model_families=None, excluded_model_families=None, - is_multiseries=False, ): """Returns the estimators allowed for a particular problem type. @@ -77,7 +69,6 @@ def get_estimators( problem_type (ProblemTypes or str): Problem type to filter for. model_families (list[ModelFamily] or list[str]): Model families to filter for. excluded_model_families (list[ModelFamily]): A list of model families to exclude from the results. - is_multiseries (bool): Whether to return only estimators that support multiseries data. Returns: list[class]: A list of estimator subclasses. @@ -124,11 +115,6 @@ def get_estimators( if estimator_class.model_family not in model_families: continue estimator_classes.append(estimator_class) - if is_time_series(problem_type): - estimator_classes = _filter_multiseries_estimators( - estimator_classes, - is_multiseries, - ) return estimator_classes diff --git a/evalml/pipelines/multiseries_regression_pipeline.py b/evalml/pipelines/multiseries_regression_pipeline.py index 6b45653482..6ddc6ac9d4 100644 --- a/evalml/pipelines/multiseries_regression_pipeline.py +++ b/evalml/pipelines/multiseries_regression_pipeline.py @@ -21,9 +21,9 @@ class MultiseriesRegressionPipeline(TimeSeriesRegressionPipeline): """ - problem_type = ProblemTypes.TIME_SERIES_REGRESSION + problem_type = ProblemTypes.MULTISERIES_TIME_SERIES_REGRESSION - """ProblemTypes.TIME_SERIES_REGRESSION""" + """ProblemTypes.MULTISERIES_TIME_SERIES_REGRESSION""" def __init__( self, diff --git a/evalml/pipelines/utils.py b/evalml/pipelines/utils.py index 335322ad1b..7e61999f25 100644 --- a/evalml/pipelines/utils.py +++ b/evalml/pipelines/utils.py @@ -67,6 +67,7 @@ ProblemTypes, handle_problem_types, is_classification, + is_multiseries, is_regression, is_time_series, ) @@ -273,7 +274,6 @@ def _get_preprocessing_components( sampler_name=None, exclude_featurizers=None, include_decomposer=True, - is_multiseries=False, ): """Given input data, target data and an estimator class, construct a recommended preprocessing chain to be combined with the estimator and trained on the provided data. @@ -287,12 +287,11 @@ def _get_preprocessing_components( Valid options are "DatetimeFeaturizer", "EmailFeaturizer", "URLFeaturizer", "NaturalLanguageFeaturizer", "TimeSeriesFeaturizer" include_decomposer (bool): For time series regression problems, whether or not to include a decomposer in the generated pipeline. Defaults to True. - is_multiseries (bool): Whether or not the problem is a multiseries time series problem. Defaults to False. Returns: list[Transformer]: A list of applicable preprocessing components to use with the estimator. """ - if is_multiseries: + if is_multiseries(problem_type): return [] if is_time_series(problem_type): @@ -354,7 +353,7 @@ def _get_preprocessing_components( return components -def _get_pipeline_base_class(problem_type, is_multiseries=False): +def _get_pipeline_base_class(problem_type): """Returns pipeline base class for problem_type.""" problem_type = handle_problem_types(problem_type) if problem_type == ProblemTypes.BINARY: @@ -364,13 +363,13 @@ def _get_pipeline_base_class(problem_type, is_multiseries=False): elif problem_type == ProblemTypes.REGRESSION: return RegressionPipeline elif problem_type == ProblemTypes.TIME_SERIES_REGRESSION: - if is_multiseries: - return MultiseriesRegressionPipeline return TimeSeriesRegressionPipeline elif problem_type == ProblemTypes.TIME_SERIES_BINARY: return TimeSeriesBinaryClassificationPipeline - else: + elif problem_type == ProblemTypes.TIME_SERIES_MULTICLASS: return TimeSeriesMulticlassClassificationPipeline + else: + return MultiseriesRegressionPipeline def _make_pipeline_time_series( @@ -384,7 +383,6 @@ def _make_pipeline_time_series( exclude_featurizers=None, include_decomposer=True, features=False, - is_multiseries=False, ): """Make a pipeline for time series problems. @@ -407,7 +405,6 @@ def _make_pipeline_time_series( include_decomposer (bool): For time series regression problems, whether or not to include a decomposer in the generated pipeline. Defaults to True. features (bool): Whether to add a DFSTransformer component to this pipeline. - is_multiseries (bool): Whether or not the problem is a multiseries time series problem. Defaults to False. Returns: PipelineBase: TimeSeriesPipeline @@ -428,7 +425,6 @@ def _make_pipeline_time_series( sampler_name, exclude_featurizers, include_decomposer, - is_multiseries, ) dfs_transformer = [DFSTransformer] if features else [] @@ -448,7 +444,7 @@ def _make_pipeline_time_series( component_graph = PipelineBase._make_component_dict_from_component_list( preprocessing_components, ) - base_class = _get_pipeline_base_class(problem_type, is_multiseries) + base_class = _get_pipeline_base_class(problem_type) pipeline = base_class(component_graph, parameters=parameters) if X_known_in_advance is not None: # We can't specify a time series problem type because then the known-in-advance @@ -518,7 +514,6 @@ def make_pipeline( features=False, exclude_featurizers=None, include_decomposer=True, - is_multiseries=False, ): """Given input data, target data, an estimator class and the problem type, generates a pipeline class with a preprocessing chain which was recommended based on the inputs. The pipeline will be a subclass of the appropriate pipeline base class for the specified problem_type. @@ -540,7 +535,6 @@ def make_pipeline( Valid options are "DatetimeFeaturizer", "EmailFeaturizer", "URLFeaturizer", "NaturalLanguageFeaturizer", "TimeSeriesFeaturizer" include_decomposer (bool): For time series regression problems, whether or not to include a decomposer in the generated pipeline. Defaults to True. - is_multiseries (bool): Whether or not the problem is a multiseries time series problem. Defaults to False. Returns: PipelineBase object: PipelineBase instance with dynamically generated preprocessing components and specified estimator. @@ -553,7 +547,7 @@ def make_pipeline( if estimator: problem_type = handle_problem_types(problem_type) - if estimator not in get_estimators(problem_type, is_multiseries=is_multiseries): + if estimator not in get_estimators(problem_type): raise ValueError( f"{estimator.name} is not a valid estimator for problem type", ) @@ -574,7 +568,6 @@ def make_pipeline( exclude_featurizers, include_decomposer, features, - is_multiseries, ) else: preprocessing_components = _get_preprocessing_components( @@ -1218,7 +1211,6 @@ def make_timeseries_baseline_pipeline( forecast_horizon, time_index, exclude_featurizer=False, - is_multiseries=False, series_id=None, ): """Make a baseline pipeline for time series regression problems. @@ -1230,7 +1222,6 @@ def make_timeseries_baseline_pipeline( time_index (str): Column name of time_index parameter. exclude_featurizer (bool): Whether or not to exclude the TimeSeriesFeaturizer from the baseline graph. Defaults to False. - is_multiseries (bool): Whether or not the problem is a multiseries time series problem. Defaults to False. series_id (str): Column name of series_id parameter. Only used for multiseries time series. Defaults to None. Returns: @@ -1250,13 +1241,14 @@ def make_timeseries_baseline_pipeline( TimeSeriesBinaryClassificationPipeline, "Time Series Baseline Binary Pipeline", ), + ProblemTypes.MULTISERIES_TIME_SERIES_REGRESSION: ( + MultiseriesRegressionPipeline, + "Multiseries Time Series Baseline Pipeline", + ), }[problem_type] - if is_multiseries: - pipeline_class = MultiseriesRegressionPipeline - pipeline_name = "Multiseries Time Series Baseline Pipeline" baseline_estimator_name = ( "Multiseries Time Series Baseline Regressor" - if is_multiseries + if is_multiseries(problem_type) else "Time Series Baseline Estimator" ) component_graph = [baseline_estimator_name] @@ -1272,7 +1264,7 @@ def make_timeseries_baseline_pipeline( "forecast_horizon": forecast_horizon, }, } - if is_multiseries: + if is_multiseries(problem_type): parameters["pipeline"]["series_id"] = series_id if not exclude_featurizer: component_graph = ["Time Series Featurizer"] + component_graph diff --git a/evalml/problem_types/__init__.py b/evalml/problem_types/__init__.py index 5b589c909e..866f326164 100644 --- a/evalml/problem_types/__init__.py +++ b/evalml/problem_types/__init__.py @@ -6,6 +6,7 @@ is_regression, is_binary, is_multiclass, + is_multiseries, is_classification, is_time_series, ) diff --git a/evalml/problem_types/problem_types.py b/evalml/problem_types/problem_types.py index fe17fd73c6..4f4b182bf0 100644 --- a/evalml/problem_types/problem_types.py +++ b/evalml/problem_types/problem_types.py @@ -19,6 +19,8 @@ class ProblemTypes(Enum): """Time series binary classification problem.""" TIME_SERIES_MULTICLASS = "time series multiclass" """Time series multiclass classification problem.""" + MULTISERIES_TIME_SERIES_REGRESSION = "multiseries time series regression" + """Multiseries time series regression problem.""" def __str__(self): """String representation of the ProblemTypes enum.""" @@ -29,6 +31,7 @@ def __str__(self): ProblemTypes.TIME_SERIES_REGRESSION.name: "time series regression", ProblemTypes.TIME_SERIES_BINARY.name: "time series binary", ProblemTypes.TIME_SERIES_MULTICLASS.name: "time series multiclass", + ProblemTypes.MULTISERIES_TIME_SERIES_REGRESSION.name: "multiseries time series regression", } return problem_type_dict[self.name] diff --git a/evalml/problem_types/utils.py b/evalml/problem_types/utils.py index 0fa1552012..7f34aba059 100644 --- a/evalml/problem_types/utils.py +++ b/evalml/problem_types/utils.py @@ -87,6 +87,7 @@ def is_regression(problem_type): return handle_problem_types(problem_type) in [ ProblemTypes.REGRESSION, ProblemTypes.TIME_SERIES_REGRESSION, + ProblemTypes.MULTISERIES_TIME_SERIES_REGRESSION, ] @@ -165,4 +166,20 @@ def is_time_series(problem_type): ProblemTypes.TIME_SERIES_BINARY, ProblemTypes.TIME_SERIES_MULTICLASS, ProblemTypes.TIME_SERIES_REGRESSION, + ProblemTypes.MULTISERIES_TIME_SERIES_REGRESSION, ] + + +def is_multiseries(problem_type): + """Determines if the provided problem_type is a multiseries time series problem type. + + Args: + problem_type (str or ProblemTypes): type of supervised learning problem. See evalml.problem_types.ProblemType.all_problem_types for a full list. + + Returns: + bool: Whether or not the provided problem_type is a multiseries time series problem type. + """ + return ( + handle_problem_types(problem_type) + == ProblemTypes.MULTISERIES_TIME_SERIES_REGRESSION + ) diff --git a/evalml/tests/automl_tests/parallel_tests/test_automl_dask.py b/evalml/tests/automl_tests/parallel_tests/test_automl_dask.py index c3e370e3d0..f873f44b69 100644 --- a/evalml/tests/automl_tests/parallel_tests/test_automl_dask.py +++ b/evalml/tests/automl_tests/parallel_tests/test_automl_dask.py @@ -6,7 +6,13 @@ from evalml.automl.automl_algorithm import IterativeAlgorithm from evalml.automl.callbacks import raise_error_callback from evalml.automl.engine import CFEngine, DaskEngine, SequentialEngine -from evalml.problem_types import ProblemTypes, is_binary, is_multiclass, is_time_series +from evalml.problem_types import ( + ProblemTypes, + is_binary, + is_multiclass, + is_multiseries, + is_time_series, +) from evalml.tests.automl_tests.dask_test_utils import ( DaskPipelineFast, DaskPipelineSlow, @@ -285,9 +291,12 @@ def test_score_pipelines_passes_X_train_y_train( engine_str, X_y_based_on_pipeline_or_problem_type, ts_data, + multiseries_ts_data_stacked, AutoMLTestEnv, ): - if is_time_series(problem_type): + if is_multiseries(problem_type): + X, y = multiseries_ts_data_stacked + elif is_time_series(problem_type): X, _, y = ts_data(problem_type=problem_type) else: X, y = X_y_based_on_pipeline_or_problem_type(problem_type) @@ -310,6 +319,7 @@ def test_score_pipelines_passes_X_train_y_train( "gap": 0, "forecast_horizon": 1, "max_delay": 1, + "series_id": "series_id" if is_multiseries(problem_type) else None, }, engine=engine_str, ) diff --git a/evalml/tests/automl_tests/test_automl.py b/evalml/tests/automl_tests/test_automl.py index 6a06d58f9a..e2fb3738e4 100644 --- a/evalml/tests/automl_tests/test_automl.py +++ b/evalml/tests/automl_tests/test_automl.py @@ -85,6 +85,7 @@ ProblemTypes, handle_problem_types, is_classification, + is_multiseries, is_time_series, ) from evalml.tests.automl_tests.parallel_tests.test_automl_dask import engine_strs @@ -2284,22 +2285,28 @@ def fit(self, *args, **kwargs): ) -@pytest.mark.parametrize("is_multiseries", [True, False]) +@pytest.mark.parametrize( + "problem_type", + [ + ProblemTypes.TIME_SERIES_REGRESSION, + ProblemTypes.MULTISERIES_TIME_SERIES_REGRESSION, + ], +) def test_time_series_regression_with_parameters( - is_multiseries, + problem_type, ts_data, multiseries_ts_data_stacked, ): X, _, y = ts_data() X.index.name = "date" - if is_multiseries: + if is_multiseries(problem_type): X, y = multiseries_ts_data_stacked problem_configuration = { "time_index": "date", "gap": 1, "max_delay": 0, "forecast_horizon": 2, - "series_id": "series_id" if is_multiseries else None, + "series_id": "series_id" if is_multiseries(problem_type) else None, } automl = AutoMLSearch( X_train=X, @@ -4016,7 +4023,7 @@ def test_automl_baseline_pipeline_predictions_and_scores(problem_type): [ problem_type for problem_type in ProblemTypes.all_problem_types - if is_time_series(problem_type) + if is_time_series(problem_type) and not is_multiseries(problem_type) ], ) def test_automl_baseline_pipeline_predictions_and_scores_time_series(problem_type): @@ -4083,7 +4090,7 @@ def test_automl_multiseries_baseline_generation(multiseries_ts_data_stacked): automl = AutoMLSearch( X, y, - problem_type="time series regression", + problem_type="multiseries time series regression", problem_configuration={ "time_index": "date", "gap": 0, @@ -4224,20 +4231,17 @@ def test_automl_drop_unknown_columns(columns, AutoMLTestEnv, X_y_binary, caplog) ProblemTypes.TIME_SERIES_REGRESSION, ProblemTypes.TIME_SERIES_BINARY, ProblemTypes.TIME_SERIES_MULTICLASS, + ProblemTypes.MULTISERIES_TIME_SERIES_REGRESSION, ], ) -@pytest.mark.parametrize("is_multiseries", [False, True]) def test_data_splitter_gives_pipelines_same_data( automl_type, - is_multiseries, AutoMLTestEnv, X_y_binary, X_y_multi, X_y_regression, multiseries_ts_data_stacked, ): - if is_multiseries and automl_type != ProblemTypes.TIME_SERIES_REGRESSION: - pytest.skip("Multiseries only supported for time series regression") problem_configuration = None if automl_type == ProblemTypes.BINARY: X, y = X_y_binary @@ -4249,20 +4253,25 @@ def test_data_splitter_gives_pipelines_same_data( problem_configuration = { "gap": 1, "max_delay": 1, - "time_index": 0 if not is_multiseries else "date", + "time_index": 0, "forecast_horizon": 2, - "series_id": "series_id" if is_multiseries else None, } - if is_multiseries: - X, _ = multiseries_ts_data_stacked - # Can't use range() to generate y data for VARMAX, as the y columns will be linearly dependent - y = pd.Series( - (random.randint(0, 100) for _ in range(len(X))), - name="target", - ) - else: - X, y = X_y_regression - X.index = pd.DatetimeIndex(pd.date_range("01-01-2022", periods=len(X))) + X, y = X_y_regression + X.index = pd.DatetimeIndex(pd.date_range("01-01-2022", periods=len(X))) + elif automl_type == ProblemTypes.MULTISERIES_TIME_SERIES_REGRESSION: + problem_configuration = { + "gap": 1, + "max_delay": 1, + "time_index": "date", + "forecast_horizon": 2, + "series_id": "series_id", + } + X, _ = multiseries_ts_data_stacked + # Can't use range() to generate y data for VARMAX, as the y columns will be linearly dependent + y = pd.Series( + (random.randint(0, 100) for _ in range(len(X))), + name="target", + ) else: problem_configuration = { "gap": 1, @@ -4281,7 +4290,7 @@ def test_data_splitter_gives_pipelines_same_data( problem_configuration=problem_configuration, ) n_splits = automl.data_splitter.n_splits - env = AutoMLTestEnv(automl_type, is_multiseries=is_multiseries) + env = AutoMLTestEnv(automl_type) with env.test_context(score_return_value={automl.objective.name: 1.0}): automl.search() n_pipelines_evaluated = len(automl.results["pipeline_results"]) @@ -5183,15 +5192,17 @@ def test_exclude_featurizers( problem_type, input_type, get_test_data_from_configuration, + multiseries_ts_data_stacked, AutoMLTestEnv, ): parameters = {} if is_time_series(problem_type): parameters = { - "time_index": "dates", + "time_index": "date" if is_multiseries(problem_type) else "dates", "gap": 1, "max_delay": 1, "forecast_horizon": 1, + "series_id": "series_id" if is_multiseries(problem_type) else None, } X, y = get_test_data_from_configuration( @@ -5199,6 +5210,8 @@ def test_exclude_featurizers( problem_type, column_names=["dates", "text", "email", "url"], ) + if is_multiseries(problem_type): + X, y = multiseries_ts_data_stacked automl = AutoMLSearch( X_train=X, diff --git a/evalml/tests/automl_tests/test_automl_utils.py b/evalml/tests/automl_tests/test_automl_utils.py index b879e00168..1f3b85ec7c 100644 --- a/evalml/tests/automl_tests/test_automl_utils.py +++ b/evalml/tests/automl_tests/test_automl_utils.py @@ -20,7 +20,7 @@ RegressionPipeline, ) from evalml.preprocessing.data_splitters import TimeSeriesSplit, TrainingValidationSplit -from evalml.problem_types import ProblemTypes +from evalml.problem_types import ProblemTypes, is_multiseries, is_time_series from evalml.utils.woodwork_utils import infer_feature_types @@ -70,32 +70,23 @@ def test_get_default_primary_search_objective(): @pytest.mark.parametrize("problem_type", ProblemTypes.all_problem_types) @pytest.mark.parametrize("large_data", [False, True]) -@pytest.mark.parametrize("is_multiseries", [False, True]) -def test_make_data_splitter_default(problem_type, large_data, is_multiseries): - if is_multiseries and problem_type != ProblemTypes.TIME_SERIES_REGRESSION: - pytest.skip( - "Multiseries data is only supported for time series regression problems", - ) +def test_make_data_splitter_default(problem_type, large_data): n = 10 if large_data: n = _LARGE_DATA_ROW_THRESHOLD + 1 X = pd.DataFrame({"col_0": list(range(n)), "target": list(range(n))}) - if is_multiseries: + if is_multiseries(problem_type): X["series_id"] = pd.Series(range(n)) % 2 y = X.pop("target") problem_configuration = None - if problem_type in [ - ProblemTypes.TIME_SERIES_REGRESSION, - ProblemTypes.TIME_SERIES_BINARY, - ProblemTypes.TIME_SERIES_MULTICLASS, - ]: + if is_time_series(problem_type): problem_configuration = { "gap": 1, "max_delay": 7, "time_index": "foo", "forecast_horizon": 4, - "series_id": "series_id" if is_multiseries else None, + "series_id": "series_id" if is_multiseries(problem_type) else None, } data_splitter = make_data_splitter( @@ -135,6 +126,7 @@ def test_make_data_splitter_default(problem_type, large_data, is_multiseries): ProblemTypes.TIME_SERIES_REGRESSION, ProblemTypes.TIME_SERIES_BINARY, ProblemTypes.TIME_SERIES_MULTICLASS, + ProblemTypes.MULTISERIES_TIME_SERIES_REGRESSION, ]: assert isinstance(data_splitter, TimeSeriesSplit) assert data_splitter.n_splits == 3 @@ -143,7 +135,7 @@ def test_make_data_splitter_default(problem_type, large_data, is_multiseries): assert data_splitter.forecast_horizon == 4 assert data_splitter.time_index == "foo" assert data_splitter.is_cv - if is_multiseries: + if is_multiseries(problem_type): assert data_splitter._splitter.test_size == 8 else: assert data_splitter._splitter.test_size == 4 diff --git a/evalml/tests/automl_tests/test_default_algorithm.py b/evalml/tests/automl_tests/test_default_algorithm.py index eb91e2275f..b21cc452cb 100644 --- a/evalml/tests/automl_tests/test_default_algorithm.py +++ b/evalml/tests/automl_tests/test_default_algorithm.py @@ -23,7 +23,7 @@ TimeSeriesFeaturizer, URLFeaturizer, ) -from evalml.problem_types import ProblemTypes, is_time_series +from evalml.problem_types import ProblemTypes, is_multiseries, is_time_series def test_default_algorithm_init(X_y_binary): @@ -64,12 +64,10 @@ def test_default_algorithm_init(X_y_binary): algo = DefaultAlgorithm( X, y, - ProblemTypes.TIME_SERIES_REGRESSION, + ProblemTypes.MULTISERIES_TIME_SERIES_REGRESSION, sampler_name, verbose=True, - is_multiseries=True, ) - assert algo.is_multiseries is True assert algo.default_max_batches == 1 @@ -649,7 +647,7 @@ def test_default_algorithm_multiseries_time_series( multiseries_ts_data_stacked, ): X, y = multiseries_ts_data_stacked - problem_type = "time series regression" + problem_type = "multiseries time series regression" sampler_name = None search_parameters = { @@ -669,7 +667,6 @@ def test_default_algorithm_multiseries_time_series( problem_type, sampler_name, search_parameters=search_parameters, - is_multiseries=True, ) first_batch = algo.next_batch() @@ -856,6 +853,7 @@ def test_default_algorithm_accept_features( "max_delay": 3, "delay_features": False, "forecast_horizon": 10, + "series_id": "series_id" if is_multiseries(problem_type) else None, } algo = DefaultAlgorithm( @@ -1039,6 +1037,8 @@ def test_exclude_featurizers_default_algorithm( "max_delay": 1, "forecast_horizon": 3, } + if is_multiseries(problem_type): + parameters["series_id"] = "series_id" X, y = get_test_data_from_configuration( input_type, diff --git a/evalml/tests/automl_tests/test_iterative_algorithm.py b/evalml/tests/automl_tests/test_iterative_algorithm.py index 9a4bc343a6..f5ed9b73ac 100644 --- a/evalml/tests/automl_tests/test_iterative_algorithm.py +++ b/evalml/tests/automl_tests/test_iterative_algorithm.py @@ -23,7 +23,7 @@ ) from evalml.pipelines.components.utils import get_estimators from evalml.pipelines.utils import make_pipeline -from evalml.problem_types import ProblemTypes, is_time_series +from evalml.problem_types import ProblemTypes, is_multiseries, is_time_series @pytest.fixture @@ -65,14 +65,16 @@ def __init__( return _method -@pytest.mark.parametrize("problem_type", ["binary", "time series regression"]) +@pytest.mark.parametrize( + "problem_type", + ["binary", "multiseries time series regression"], +) def test_iterative_algorithm_init( problem_type, X_y_binary, multiseries_ts_data_stacked, ): X, y = X_y_binary if problem_type == "binary" else multiseries_ts_data_stacked - is_multiseries = problem_type == "time series regression" search_parameters = { "pipeline": { @@ -89,13 +91,12 @@ def test_iterative_algorithm_init( X=X, y=y, problem_type=problem_type, - is_multiseries=is_multiseries, search_parameters=search_parameters, ) assert algo.pipeline_number == 0 assert algo.batch_number == 0 assert algo.default_max_batches == 1 - estimators = get_estimators(problem_type, is_multiseries=is_multiseries) + estimators = get_estimators(problem_type) assert len(algo.allowed_pipelines) == len( [ make_pipeline( @@ -104,7 +105,6 @@ def test_iterative_algorithm_init( estimator, problem_type, parameters=search_parameters, - is_multiseries=is_multiseries, ) for estimator in estimators ], @@ -1103,6 +1103,8 @@ def test_exclude_featurizers_iterative_algorithm( "max_delay": 1, "forecast_horizon": 3, } + if is_multiseries(problem_type): + parameters["series_id"] = "series_id" X, y = get_test_data_from_configuration( input_type, diff --git a/evalml/tests/component_tests/test_components.py b/evalml/tests/component_tests/test_components.py index a00554ac57..0d8650eb71 100644 --- a/evalml/tests/component_tests/test_components.py +++ b/evalml/tests/component_tests/test_components.py @@ -1227,10 +1227,12 @@ def test_all_estimators_check_fit( ProblemTypes.TIME_SERIES_REGRESSION in component_class.supported_problem_types ): - if component_class.is_multiseries: - X, _, y = ts_multiseries_data() - else: - X, _, y = ts_data() + X, _, y = ts_data() + elif ( + ProblemTypes.MULTISERIES_TIME_SERIES_REGRESSION + in component_class.supported_problem_types + ): + X, _, y = ts_multiseries_data() else: X, y = X_y_binary @@ -1366,9 +1368,13 @@ def test_serialization( PolynomialDecomposer, STLDecomposer, ] + requires_multiseries_data = [ + MultiseriesTimeSeriesBaselineRegressor, + VARMAXRegressor, + ] component = helper_functions.safe_init_component_with_njobs_1(component_class) - if component.is_multiseries: + if component_class in requires_multiseries_data: component = component_class(time_index="date") X, _, y = ts_multiseries_data() elif component_class in requires_time_index: @@ -1740,16 +1746,16 @@ def test_estimator_fit_respects_custom_indices( if ProblemTypes.REGRESSION in supported_problem_types: X, y = X_y_regression elif ProblemTypes.TIME_SERIES_REGRESSION in supported_problem_types: - if estimator_class.is_multiseries: - X, _, y = ts_multiseries_data( - train_features_index_dt=False, - train_target_index_dt=False, - ) - else: - X, _, y = ts_data( - train_features_index_dt=False, - train_target_index_dt=False, - ) + X, _, y = ts_data( + train_features_index_dt=False, + train_target_index_dt=False, + ) + ts_problem = True + elif ProblemTypes.MULTISERIES_TIME_SERIES_REGRESSION in supported_problem_types: + X, _, y = ts_multiseries_data( + train_features_index_dt=False, + train_target_index_dt=False, + ) ts_problem = True else: X, y = X_y_binary @@ -1956,11 +1962,12 @@ def test_components_support_nullable_types( VARMAXRegressor, ] requires_all_numeric = [PCA, LinearDiscriminantAnalysis] + requires_multiseries_data = [VARMAXRegressor] component = helper_functions.safe_init_component_with_njobs_1(component_class) - if component_class.is_multiseries or component_class in requires_time_index: + if component_class in requires_time_index: component = component_class(time_index="date") - if component_class.is_multiseries: + if component_class in requires_multiseries_data: X, _, y = ts_multiseries_data( train_features_index_dt=False, train_target_index_dt=False, diff --git a/evalml/tests/component_tests/test_multiseries_baseline_regressor.py b/evalml/tests/component_tests/test_multiseries_baseline_regressor.py index 1e958fb507..1b5d1a0e9f 100644 --- a/evalml/tests/component_tests/test_multiseries_baseline_regressor.py +++ b/evalml/tests/component_tests/test_multiseries_baseline_regressor.py @@ -6,12 +6,15 @@ MultiseriesTimeSeriesBaselineRegressor, TimeSeriesFeaturizer, ) +from evalml.problem_types import ProblemTypes def test_multiseries_time_series_baseline_regressor_init(): baseline = MultiseriesTimeSeriesBaselineRegressor() assert baseline.model_family == ModelFamily.BASELINE - assert baseline.is_multiseries + assert baseline.supported_problem_types == [ + ProblemTypes.MULTISERIES_TIME_SERIES_REGRESSION, + ] assert baseline.start_delay == 2 baseline = MultiseriesTimeSeriesBaselineRegressor(gap=2, forecast_horizon=5) diff --git a/evalml/tests/component_tests/test_utils.py b/evalml/tests/component_tests/test_utils.py index 9434cbb9d1..552ba00d26 100644 --- a/evalml/tests/component_tests/test_utils.py +++ b/evalml/tests/component_tests/test_utils.py @@ -186,6 +186,7 @@ def test_scikit_learn_wrapper(X_y_binary, X_y_multi, X_y_regression): ProblemTypes.TIME_SERIES_REGRESSION, ProblemTypes.TIME_SERIES_MULTICLASS, ProblemTypes.TIME_SERIES_BINARY, + ProblemTypes.MULTISERIES_TIME_SERIES_REGRESSION, ]: continue diff --git a/evalml/tests/component_tests/test_varmax_regressor.py b/evalml/tests/component_tests/test_varmax_regressor.py index 9392fb3168..9f4acd067f 100644 --- a/evalml/tests/component_tests/test_varmax_regressor.py +++ b/evalml/tests/component_tests/test_varmax_regressor.py @@ -20,7 +20,7 @@ def test_model_family(): def test_problem_types(): assert set(VARMAXRegressor.supported_problem_types) == { - ProblemTypes.TIME_SERIES_REGRESSION, + ProblemTypes.MULTISERIES_TIME_SERIES_REGRESSION, } diff --git a/evalml/tests/conftest.py b/evalml/tests/conftest.py index 8a3aeea108..882209d52c 100644 --- a/evalml/tests/conftest.py +++ b/evalml/tests/conftest.py @@ -1921,7 +1921,7 @@ class _AutoMLTestEnv: >>> # env.mock_score.assert_called_once() """ - def __init__(self, problem_type, is_multiseries=False): + def __init__(self, problem_type): """Create a test environment. Args: @@ -1940,10 +1940,8 @@ def __init__(self, problem_type, is_multiseries=False): Set to None until the first computation is run in the test environment. mock_optimize_threshold (MagicMock): MagicMock corresponding to the BinaryClassificationObjective.optimize_threshold for the latest automl computation. Set to None until the first computation is run in the test environment. - is_multiseries (bool): Whether the problem type is a multiseries time series problem. """ self.problem_type = handle_problem_types(problem_type) - self.is_multiseries = is_multiseries self._mock_fit = None self._mock_tell = None self._mock_score = None @@ -1955,8 +1953,6 @@ def __init__(self, problem_type, is_multiseries=False): @property def _pipeline_class(self): - if self.is_multiseries: - return "evalml.pipelines.MultiseriesRegressionPipeline" return { ProblemTypes.REGRESSION: "evalml.pipelines.RegressionPipeline", ProblemTypes.BINARY: "evalml.pipelines.BinaryClassificationPipeline", @@ -1964,6 +1960,7 @@ def _pipeline_class(self): ProblemTypes.TIME_SERIES_REGRESSION: "evalml.pipelines.TimeSeriesRegressionPipeline", ProblemTypes.TIME_SERIES_MULTICLASS: "evalml.pipelines.TimeSeriesMulticlassClassificationPipeline", ProblemTypes.TIME_SERIES_BINARY: "evalml.pipelines.TimeSeriesBinaryClassificationPipeline", + ProblemTypes.MULTISERIES_TIME_SERIES_REGRESSION: "evalml.pipelines.MultiseriesRegressionPipeline", }[self.problem_type] def _patch_method(self, method, side_effect, return_value, pipeline_class_str=None): diff --git a/evalml/tests/integration_tests/test_nullable_types.py b/evalml/tests/integration_tests/test_nullable_types.py index 437311ad17..57b7e8c505 100644 --- a/evalml/tests/integration_tests/test_nullable_types.py +++ b/evalml/tests/integration_tests/test_nullable_types.py @@ -6,7 +6,7 @@ from evalml.pipelines import RegressionPipeline from evalml.pipelines.components import EmailFeaturizer, Imputer, URLFeaturizer from evalml.pipelines.components.transformers import ReplaceNullableTypes -from evalml.problem_types import ProblemTypes, is_time_series +from evalml.problem_types import ProblemTypes, is_multiseries, is_time_series @pytest.mark.parametrize("input_type", ["pd", "ww"]) @@ -41,6 +41,7 @@ def test_nullable_types_builds_pipelines( "gap": 1, "max_delay": 1, "forecast_horizon": 3, + "series_id": "series_id" if is_multiseries(problem_type) else None, } X, y = get_test_data_from_configuration( @@ -49,6 +50,8 @@ def test_nullable_types_builds_pipelines( column_names=column_names, nullable_target=True if "nullable target" in test_description else False, ) + if is_multiseries(problem_type): + X["series_id"] = pd.Series([0] * len(X)) automl = AutoMLSearch( X_train=X, @@ -60,8 +63,9 @@ def test_nullable_types_builds_pipelines( if automl_algorithm == "iterative": pipelines = [pl.name for pl in automl.allowed_pipelines] elif automl_algorithm == "default": + n_batches = 1 if is_multiseries(problem_type) else 2 # TODO: Upon resolution of GH Issue #3186, increase the num of batches. - for _ in range(2): + for _ in range(n_batches): pipelines = [pl.name for pl in automl.automl_algorithm.next_batch()] # A check to make sure we actually retrieve constructed pipelines from the algo. @@ -129,6 +133,7 @@ def test_automl_search_with_nullable_types( elif ( problem_type == ProblemTypes.REGRESSION or problem_type == ProblemTypes.TIME_SERIES_REGRESSION + or problem_type == ProblemTypes.MULTISERIES_TIME_SERIES_REGRESSION ): y = nullable_type_target(ltype="IntegerNullable", has_nans=False) @@ -139,7 +144,10 @@ def test_automl_search_with_nullable_types( "gap": 1, "max_delay": 1, "forecast_horizon": 3, + "series_id": "series_id" if is_multiseries(problem_type) else None, } + if is_multiseries(problem_type): + X["series_id"] = pd.Series([0] * len(X)) automl = AutoMLSearch( X_train=X, diff --git a/evalml/tests/pipeline_tests/test_pipeline_utils.py b/evalml/tests/pipeline_tests/test_pipeline_utils.py index 576284bae2..cbc963aa57 100644 --- a/evalml/tests/pipeline_tests/test_pipeline_utils.py +++ b/evalml/tests/pipeline_tests/test_pipeline_utils.py @@ -58,7 +58,7 @@ stack_X, unstack_multiseries, ) -from evalml.problem_types import ProblemTypes, is_time_series +from evalml.problem_types import ProblemTypes, is_multiseries, is_time_series @pytest.mark.parametrize("input_type", ["pd", "ww"]) @@ -87,28 +87,21 @@ ("nullable_types", ["numerical", "int_null", "bool_null", "age_null"]), ], ) -@pytest.mark.parametrize("is_multiseries", [False, True]) def test_make_pipeline( problem_type, input_type, features, test_description, column_names, - is_multiseries, get_test_data_from_configuration, multiseries_ts_data_stacked, ): - if is_multiseries and problem_type != ProblemTypes.TIME_SERIES_REGRESSION: - pytest.skip("Multiseries only supported for time series regression") X, y = get_test_data_from_configuration( input_type, problem_type, column_names=column_names, ) - estimators = get_estimators( - problem_type=problem_type, - is_multiseries=is_multiseries, - ) + estimators = get_estimators(problem_type=problem_type) pipeline_class = _get_pipeline_base_class(problem_type) for estimator_class in estimators: if problem_type in estimator_class.supported_problem_types: @@ -120,7 +113,9 @@ def test_make_pipeline( "gap": 1, "max_delay": 1, "forecast_horizon": 3, - "series_id": "series_id" if is_multiseries else None, + "series_id": "series_id" + if is_multiseries(problem_type) + else None, }, } @@ -131,7 +126,6 @@ def test_make_pipeline( problem_type, parameters, features=features, - is_multiseries=is_multiseries, ) assert isinstance(pipeline, pipeline_class) label_encoder = [LabelEncoder] if is_classification(problem_type) else [] @@ -175,7 +169,7 @@ def test_make_pipeline( ) if is_time_series(problem_type): - if is_multiseries: + if is_multiseries(problem_type): expected_components = dfs + [estimator_class] else: expected_components = ( @@ -626,8 +620,7 @@ def test_get_estimators(): assert ( len( get_estimators( - problem_type=ProblemTypes.TIME_SERIES_REGRESSION, - is_multiseries=True, + problem_type=ProblemTypes.MULTISERIES_TIME_SERIES_REGRESSION, ), ) == 1 diff --git a/evalml/tests/pipeline_tests/test_pipelines.py b/evalml/tests/pipeline_tests/test_pipelines.py index d0bd2a54de..96a9b0f973 100644 --- a/evalml/tests/pipeline_tests/test_pipelines.py +++ b/evalml/tests/pipeline_tests/test_pipelines.py @@ -2037,6 +2037,8 @@ def test_predict_has_input_target_name( time_series_binary_classification_pipeline_class, time_series_multiclass_classification_pipeline_class, ): + if problem_type == ProblemTypes.MULTISERIES_TIME_SERIES_REGRESSION: + pytest.skip("Multiseries time series regression case tested elsewhere") if problem_type == ProblemTypes.BINARY: X, y = X_y_binary clf = logistic_regression_binary_pipeline diff --git a/evalml/tests/problem_type_tests/test_problem_types.py b/evalml/tests/problem_type_tests/test_problem_types.py index 4f06cce49b..d848392a4f 100644 --- a/evalml/tests/problem_type_tests/test_problem_types.py +++ b/evalml/tests/problem_type_tests/test_problem_types.py @@ -24,6 +24,7 @@ def correct_problem_types(): ProblemTypes.TIME_SERIES_REGRESSION, ProblemTypes.TIME_SERIES_BINARY, ProblemTypes.TIME_SERIES_MULTICLASS, + ProblemTypes.MULTISERIES_TIME_SERIES_REGRESSION, ] yield correct_problem_types @@ -36,6 +37,7 @@ def test_handle_string(correct_problem_types): ProblemTypes.TIME_SERIES_REGRESSION, "time series binary", "time series multiclass", + ProblemTypes.MULTISERIES_TIME_SERIES_REGRESSION, ] for problem_type in zip(problem_types, correct_problem_types): assert handle_problem_types(problem_type[0]) == problem_type[1] @@ -154,6 +156,7 @@ def test_all_problem_types(): ProblemTypes.TIME_SERIES_REGRESSION, ProblemTypes.TIME_SERIES_BINARY, ProblemTypes.TIME_SERIES_MULTICLASS, + ProblemTypes.MULTISERIES_TIME_SERIES_REGRESSION, ] assert ProblemTypes.all_problem_types == expected @@ -161,7 +164,12 @@ def test_all_problem_types(): @pytest.mark.parametrize("problem_type", ProblemTypes.all_problem_types) def test_type_checks(problem_type): assert is_regression(problem_type) == ( - problem_type in [ProblemTypes.REGRESSION, ProblemTypes.TIME_SERIES_REGRESSION] + problem_type + in [ + ProblemTypes.REGRESSION, + ProblemTypes.TIME_SERIES_REGRESSION, + ProblemTypes.MULTISERIES_TIME_SERIES_REGRESSION, + ] ) assert is_binary(problem_type) == ( problem_type in [ProblemTypes.BINARY, ProblemTypes.TIME_SERIES_BINARY] @@ -184,5 +192,6 @@ def test_type_checks(problem_type): ProblemTypes.TIME_SERIES_BINARY, ProblemTypes.TIME_SERIES_MULTICLASS, ProblemTypes.TIME_SERIES_REGRESSION, + ProblemTypes.MULTISERIES_TIME_SERIES_REGRESSION, ] )