Integrate multiseries into AutoMLSearch (#4270)

* Integration into search, with tests
* Swap is_multiseries logic to problem type (#4278)
alteryx · Aug 21, 2023 · 7781c77 · 7781c77
1 parent 24ba211
commit 7781c77
Show file tree

Hide file tree

Showing 31 changed files with 467 additions and 103 deletions.
diff --git a/docs/source/release_notes.rst b/docs/source/release_notes.rst
@@ -2,6 +2,8 @@ Release Notes
 -------------
 **Future Releases**
     * Enhancements
+        * Added support for prediction intervals for VARMAX regressor :pr:`4267`
+        * Integrated multiseries time series into AutoMLSearch :pr:`4270`
     * Fixes
     * Changes
         * Updated ``ARIMARegressor`` to be compatible with sktime v0.22.0 and beyond :pr:`4283`
@@ -21,7 +23,6 @@ Release Notes
         * Added stacking and unstacking utility functions to work with multiseries data :pr:`4250`
         * Added multiseries regression pipeline class :pr:`4256`
         * Added multiseries VARMAX regressor :pr:`4238`
-        * Added support for prediction intervals for VARMAX regressor :pr:`4267`
     * Fixes
         * Added support for pandas 2 :pr:`4216`
         * Fixed bug where time series pipelines would fail due to MASE needing `y_train` when scoring :pr:`4258`

diff --git a/evalml/automl/automl_algorithm/default_algorithm.py b/evalml/automl/automl_algorithm/default_algorithm.py
@@ -25,7 +25,7 @@
     _make_pipeline_from_multiple_graphs,
     make_pipeline,
 )
-from evalml.problem_types import is_regression, is_time_series
+from evalml.problem_types import is_multiseries, is_regression, is_time_series
 from evalml.utils import infer_feature_types
 from evalml.utils.logger import get_logger
 
@@ -170,6 +170,8 @@ def default_max_batches(self):
         """Returns the number of max batches AutoMLSearch should run by default."""
         if self.ensembling:
             return 3
+        elif is_multiseries(self.problem_type):
+            return 1
         else:
             return 2
 
@@ -472,11 +474,17 @@ def next_batch(self):
                 )
         # this logic needs to be updated once time series also supports ensembling
         elif is_time_series(self.problem_type):
-            if self._batch_number == 0:
+            # Skip the naive batch for multiseries time series
+            batch = (
+                self._batch_number
+                if not is_multiseries(self.problem_type)
+                else self._batch_number + 1
+            )
+            if batch == 0:
                 next_batch = self._create_naive_pipelines()
-            elif self._batch_number == 1:
+            elif batch == 1:
                 next_batch = self._create_fast_final()
-            elif self.batch_number == 2:
+            elif batch == 2:
                 next_batch = self._create_long_exploration(n=self.top_n)
             else:
                 next_batch = self._create_n_pipelines(

diff --git a/evalml/automl/automl_search.py b/evalml/automl/automl_search.py
@@ -65,6 +65,7 @@
     handle_problem_types,
     is_binary,
     is_classification,
+    is_multiseries,
     is_time_series,
 )
 from evalml.tuners import SKOptTuner
@@ -403,6 +404,7 @@ class AutoMLSearch:
 
         problem_configuration (dict, None): Additional parameters needed to configure the search. For example,
             in time series problems, values should be passed in for the time_index, gap, forecast_horizon, and max_delay variables.
+            For multiseries time series problems, the values passed in should also include the name of a series_id column.
 
         train_best_pipeline (boolean): Whether or not to train the best pipeline before returning it. Defaults to True.
 
@@ -651,6 +653,14 @@ def __init__(
                     f"Dataset size is too small to create holdout set. Minimum dataset size is {self._HOLDOUT_SET_MIN_ROWS} rows, X_train has {len(X_train)} rows. Holdout set evaluation is disabled.",
                 )
 
+        # For multiseries problems, we need to make sure that the data is primarily ordered by the time_index rather than the series_id
+        if is_multiseries(self.problem_type):
+            time_index = self.problem_configuration.get("time_index")
+            series_id = self.problem_configuration.get("series_id")
+            X_train = X_train.sort_values([time_index, series_id])
+            y_train = y_train[X_train.index].reset_index(drop=True)
+            X_train = X_train.reset_index(drop=True)
+
         # Set holdout data in AutoML search if provided as parameter
         self.X_train = infer_feature_types(X_train)
         self.y_train = infer_feature_types(y_train)
@@ -1053,6 +1063,13 @@ def _validate_problem_configuration(self, problem_configuration=None):
             is_valid, msg = contains_all_ts_parameters(problem_configuration)
             if not is_valid:
                 raise ValueError(msg)
+            if (
+                is_multiseries(self.problem_type)
+                and "series_id" not in problem_configuration
+            ):
+                raise ValueError(
+                    "Must provide 'series_id' column in problem_configuration for multiseries time series problems.",
+                )
         return problem_configuration or {}
 
     def _handle_keyboard_interrupt(self):
@@ -1355,6 +1372,7 @@ def _get_baseline_pipeline(self):
             gap = self.problem_configuration["gap"]
             forecast_horizon = self.problem_configuration["forecast_horizon"]
             time_index = self.problem_configuration["time_index"]
+            series_id = self.problem_configuration.get("series_id", None)
             exclude_timeseries_featurizer = (
                 "TimeSeriesFeaturizer" in self.exclude_featurizers
             )
@@ -1364,6 +1382,7 @@ def _get_baseline_pipeline(self):
                 forecast_horizon,
                 time_index,
                 exclude_timeseries_featurizer,
+                series_id,
             )
         return baseline
 

diff --git a/evalml/automl/utils.py b/evalml/automl/utils.py
@@ -49,6 +49,7 @@ def get_default_primary_search_objective(problem_type):
         "time series regression": "MedianAE",
         "time series binary": "Log Loss Binary",
         "time series multiclass": "Log Loss Multiclass",
+        "multiseries time series regression": "MedianAE",
     }[problem_type.value]
     return get_objective(objective_name, return_instance=True)
 
@@ -87,12 +88,14 @@ def make_data_splitter(
             raise ValueError(
                 "problem_configuration is required for time series problem types",
             )
+        series_id = problem_configuration.get("series_id")
         return TimeSeriesSplit(
             n_splits=n_splits,
             gap=problem_configuration.get("gap"),
             max_delay=problem_configuration.get("max_delay"),
             time_index=problem_configuration.get("time_index"),
             forecast_horizon=problem_configuration.get("forecast_horizon"),
+            n_series=len(X[series_id].unique()) if series_id is not None else None,
         )
     if X.shape[0] > _LARGE_DATA_ROW_THRESHOLD:
         return TrainingValidationSplit(

diff --git a/evalml/objectives/regression_objective.py b/evalml/objectives/regression_objective.py
@@ -6,5 +6,9 @@
 class RegressionObjective(ObjectiveBase):
     """Base class for all regression objectives."""
 
-    problem_types = [ProblemTypes.REGRESSION, ProblemTypes.TIME_SERIES_REGRESSION]
-    """[ProblemTypes.REGRESSION, ProblemTypes.TIME_SERIES_REGRESSION]"""
+    problem_types = [
+        ProblemTypes.REGRESSION,
+        ProblemTypes.TIME_SERIES_REGRESSION,
+        ProblemTypes.MULTISERIES_TIME_SERIES_REGRESSION,
+    ]
+    """[ProblemTypes.REGRESSION, ProblemTypes.TIME_SERIES_REGRESSION, ProblemTypes.MULTISERIES_TIME_SERIES_REGRESSION]"""
diff --git a/evalml/pipelines/components/component_base.py b/evalml/pipelines/components/component_base.py
@@ -31,7 +31,6 @@ class ComponentBase(ABC, metaclass=ComponentBaseMeta):
     # Referring to the pandas nullable dtypes; not just woodwork logical types
     _integer_nullable_incompatibilities = []
     _boolean_nullable_incompatibilities = []
-    is_multiseries = False
 
     def __init__(self, parameters=None, component_obj=None, random_seed=0, **kwargs):
         """Base class for all components.

diff --git a/evalml/pipelines/components/estimators/regressors/multiseries_time_series_baseline_regressor.py b/evalml/pipelines/components/estimators/regressors/multiseries_time_series_baseline_regressor.py
@@ -25,12 +25,11 @@ class MultiseriesTimeSeriesBaselineRegressor(Estimator):
     """{}"""
     model_family = ModelFamily.BASELINE
     """ModelFamily.BASELINE"""
-    is_multiseries = True
     supported_problem_types = [
-        ProblemTypes.TIME_SERIES_REGRESSION,
+        ProblemTypes.MULTISERIES_TIME_SERIES_REGRESSION,
     ]
     """[
-        ProblemTypes.TIME_SERIES_REGRESSION,
+        ProblemTypes.MULTISERIES_TIME_SERIES_REGRESSION,
     ]"""
 
     def __init__(self, gap=1, forecast_horizon=1, random_seed=0, **kwargs):

diff --git a/evalml/pipelines/components/estimators/regressors/varmax_regressor.py b/evalml/pipelines/components/estimators/regressors/varmax_regressor.py
@@ -48,10 +48,9 @@ class VARMAXRegressor(Estimator):
         "trend": Categorical(['n', 'c', 't', 'ct']),
     }"""
     model_family = ModelFamily.VARMAX
-    is_multiseries = True
     """ModelFamily.VARMAX"""
-    supported_problem_types = [ProblemTypes.TIME_SERIES_REGRESSION]
-    """[ProblemTypes.TIME_SERIES_REGRESSION]"""
+    supported_problem_types = [ProblemTypes.MULTISERIES_TIME_SERIES_REGRESSION]
+    """[ProblemTypes.MULTISERIES_TIME_SERIES_REGRESSION]"""
 
     def __init__(
         self,
@@ -61,7 +60,7 @@ def __init__(
         trend: Optional[str] = "c",
         random_seed: Union[int, float] = 0,
         maxiter: int = 10,
-        use_covariates: bool = True,
+        use_covariates: bool = False,
         **kwargs,
     ):
         self.preds_95_upper = None
@@ -84,6 +83,7 @@ def __init__(
 
         parameters["use_covariates"] = use_covariates
         parameters["time_index"] = time_index
+        parameters.update({"p": p, "q": q})
 
         self.use_covariates = use_covariates
         self.time_index = time_index
@@ -133,13 +133,13 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.DataFrame] = None):
 
         if y is None:
             raise ValueError("VARMAX Regressor requires y as input.")
+        y = convert_bool_to_double(y, include_ints=True)
 
         if X is not None and self.use_covariates:
             self.last_X_index = X.index[-1]
             X = X.ww.select(exclude=["Datetime"])
 
             X = convert_bool_to_double(X)
-            y = convert_bool_to_double(y)
             X, y = match_indices(X, y)
 
             if not X.empty:

diff --git a/evalml/pipelines/components/utils.py b/evalml/pipelines/components/utils.py
@@ -56,7 +56,11 @@ def allowed_model_families(problem_type):
     return list(set([e.model_family for e in estimators]))
 
 
-def get_estimators(problem_type, model_families=None, excluded_model_families=None):
+def get_estimators(
+    problem_type,
+    model_families=None,
+    excluded_model_families=None,
+):
     """Returns the estimators allowed for a particular problem type.
 
     Can also optionally filter by a list of model types.
@@ -515,20 +519,27 @@ def match_indices(
     return X, y
 
 
-def convert_bool_to_double(data: pd.DataFrame):
-    """Converts all boolean columns in dataframe to doubles.
+def convert_bool_to_double(
+    data: pd.DataFrame,
+    include_ints: bool = False,
+) -> pd.DataFrame:
+    """Converts all boolean columns in dataframe to doubles. If include_ints, converts all integer columns to doubles as well.
 
     Args:
         data (pd.DataFrame): Input dataframe.
+        include_ints (bool): If True, converts all integer columns to doubles as well. Defaults to False.
 
     Returns:
         pd.DataFrame: Input dataframe with all boolean-valued columns converted to doubles.
     """
     data_ = data.ww.copy()
+    relevant_dtypes = ["Boolean"]
+    if include_ints:
+        relevant_dtypes.append("Integer")
     data_.ww.set_types(
         {
             col: "Double"
-            for col in data.ww.select(["Boolean"], return_schema=True).columns
+            for col in data.ww.select(relevant_dtypes, return_schema=True).columns
         },
     )
     return data_
diff --git a/evalml/pipelines/multiseries_regression_pipeline.py b/evalml/pipelines/multiseries_regression_pipeline.py
@@ -21,9 +21,9 @@ class MultiseriesRegressionPipeline(TimeSeriesRegressionPipeline):
 
     """
 
-    problem_type = ProblemTypes.TIME_SERIES_REGRESSION
+    problem_type = ProblemTypes.MULTISERIES_TIME_SERIES_REGRESSION
 
-    """ProblemTypes.TIME_SERIES_REGRESSION"""
+    """ProblemTypes.MULTISERIES_TIME_SERIES_REGRESSION"""
 
     def __init__(
         self,

diff --git a/evalml/pipelines/utils.py b/evalml/pipelines/utils.py
@@ -12,6 +12,7 @@
 from evalml.model_family import ModelFamily
 from evalml.pipelines import (
     ComponentGraph,
+    MultiseriesRegressionPipeline,
     TimeSeriesBinaryClassificationPipeline,
     TimeSeriesMulticlassClassificationPipeline,
     TimeSeriesRegressionPipeline,
@@ -66,6 +67,7 @@
     ProblemTypes,
     handle_problem_types,
     is_classification,
+    is_multiseries,
     is_regression,
     is_time_series,
 )
@@ -289,6 +291,9 @@ def _get_preprocessing_components(
     Returns:
         list[Transformer]: A list of applicable preprocessing components to use with the estimator.
     """
+    if is_multiseries(problem_type):
+        return []
+
     if is_time_series(problem_type):
         components_functions = [
             _get_label_encoder,
@@ -361,8 +366,10 @@ def _get_pipeline_base_class(problem_type):
         return TimeSeriesRegressionPipeline
     elif problem_type == ProblemTypes.TIME_SERIES_BINARY:
         return TimeSeriesBinaryClassificationPipeline
-    else:
+    elif problem_type == ProblemTypes.TIME_SERIES_MULTICLASS:
         return TimeSeriesMulticlassClassificationPipeline
+    else:
+        return MultiseriesRegressionPipeline
 
 
 def _make_pipeline_time_series(
@@ -1204,6 +1211,7 @@ def make_timeseries_baseline_pipeline(
     forecast_horizon,
     time_index,
     exclude_featurizer=False,
+    series_id=None,
 ):
     """Make a baseline pipeline for time series regression problems.
 
@@ -1214,6 +1222,7 @@ def make_timeseries_baseline_pipeline(
         time_index (str): Column name of time_index parameter.
         exclude_featurizer (bool): Whether or not to exclude the TimeSeriesFeaturizer from
             the baseline graph. Defaults to False.
+        series_id (str): Column name of series_id parameter. Only used for multiseries time series. Defaults to None.
 
     Returns:
         TimeSeriesPipelineBase, a time series pipeline corresponding to the problem type.
@@ -1232,20 +1241,31 @@ def make_timeseries_baseline_pipeline(
             TimeSeriesBinaryClassificationPipeline,
             "Time Series Baseline Binary Pipeline",
         ),
+        ProblemTypes.MULTISERIES_TIME_SERIES_REGRESSION: (
+            MultiseriesRegressionPipeline,
+            "Multiseries Time Series Baseline Pipeline",
+        ),
     }[problem_type]
-    component_graph = ["Time Series Baseline Estimator"]
+    baseline_estimator_name = (
+        "Multiseries Time Series Baseline Regressor"
+        if is_multiseries(problem_type)
+        else "Time Series Baseline Estimator"
+    )
+    component_graph = [baseline_estimator_name]
     parameters = {
         "pipeline": {
             "time_index": time_index,
             "gap": gap,
             "max_delay": 0,
             "forecast_horizon": forecast_horizon,
         },
-        "Time Series Baseline Estimator": {
+        baseline_estimator_name: {
             "gap": gap,
             "forecast_horizon": forecast_horizon,
         },
     }
+    if is_multiseries(problem_type):
+        parameters["pipeline"]["series_id"] = series_id
     if not exclude_featurizer:
         component_graph = ["Time Series Featurizer"] + component_graph
         parameters["Time Series Featurizer"] = {

diff --git a/evalml/preprocessing/data_splitters/time_series_split.py b/evalml/preprocessing/data_splitters/time_series_split.py
@@ -59,16 +59,23 @@ def __init__(
         gap=0,
         forecast_horizon=None,
         time_index=None,
+        n_series=None,
         n_splits=3,
     ):
         self.max_delay = max_delay
         self.gap = gap
         self.forecast_horizon = forecast_horizon if forecast_horizon else 1
         self.time_index = time_index
         self.n_splits = n_splits
+        self.n_series = n_series
+
+        test_size = forecast_horizon
+        if self.n_series is not None:
+            test_size = forecast_horizon * self.n_series
+
         self._splitter = SkTimeSeriesSplit(
             n_splits=n_splits,
-            test_size=forecast_horizon,
+            test_size=test_size,
         )
 
     def get_n_splits(self, X=None, y=None, groups=None):

diff --git a/evalml/problem_types/__init__.py b/evalml/problem_types/__init__.py
@@ -6,6 +6,7 @@
     is_regression,
     is_binary,
     is_multiclass,
+    is_multiseries,
     is_classification,
     is_time_series,
 )