Merge branch 'main' into 4244_extend_stldecomp_for_multiseries

alteryx · Aug 22, 2023 · 8e379c1 · 8e379c1
2 parents 837fc79 + de64082
commit 8e379c1
Show file tree

Hide file tree

Showing 35 changed files with 793 additions and 166 deletions.
diff --git a/docs/source/release_notes.rst b/docs/source/release_notes.rst
@@ -2,10 +2,14 @@ Release Notes
 -------------
 **Future Releases**
     * Enhancements
+        * Added support for prediction intervals for VARMAX regressor :pr:`4267`
+        * Integrated multiseries time series into AutoMLSearch :pr:`4270`
         * Extended STLDecomposer to Support Multiseries :pr:`4253`
     * Fixes
+        * Fixed error when stacking data with no exogenous variables :pr:`4275`
     * Changes
         * Updated ``ARIMARegressor`` to be compatible with sktime v0.22.0 and beyond :pr:`4283`
+        * Updated ``graph_prediction_vs_actual_over_time()`` to be compatible with multiseries time series :pr:`4284`
     * Documentation Changes
         * Removed erroneous warnings from Data Checks User Guide page and removed ``tqdm`` warning in all notebooks :pr:`4274`
     * Testing Changes
@@ -22,7 +26,6 @@ Release Notes
         * Added stacking and unstacking utility functions to work with multiseries data :pr:`4250`
         * Added multiseries regression pipeline class :pr:`4256`
         * Added multiseries VARMAX regressor :pr:`4238`
-        * Added support for prediction intervals for VARMAX regressor :pr:`4267`
     * Fixes
         * Added support for pandas 2 :pr:`4216`
         * Fixed bug where time series pipelines would fail due to MASE needing ``y_train`` when scoring :pr:`4258`

diff --git a/evalml/automl/automl_algorithm/default_algorithm.py b/evalml/automl/automl_algorithm/default_algorithm.py
@@ -25,7 +25,7 @@
     _make_pipeline_from_multiple_graphs,
     make_pipeline,
 )
-from evalml.problem_types import is_regression, is_time_series
+from evalml.problem_types import is_multiseries, is_regression, is_time_series
 from evalml.utils import infer_feature_types
 from evalml.utils.logger import get_logger
 
@@ -170,6 +170,8 @@ def default_max_batches(self):
         """Returns the number of max batches AutoMLSearch should run by default."""
         if self.ensembling:
             return 3
+        elif is_multiseries(self.problem_type):
+            return 1
         else:
             return 2
 
@@ -472,11 +474,17 @@ def next_batch(self):
                 )
         # this logic needs to be updated once time series also supports ensembling
         elif is_time_series(self.problem_type):
-            if self._batch_number == 0:
+            # Skip the naive batch for multiseries time series
+            batch = (
+                self._batch_number
+                if not is_multiseries(self.problem_type)
+                else self._batch_number + 1
+            )
+            if batch == 0:
                 next_batch = self._create_naive_pipelines()
-            elif self._batch_number == 1:
+            elif batch == 1:
                 next_batch = self._create_fast_final()
-            elif self.batch_number == 2:
+            elif batch == 2:
                 next_batch = self._create_long_exploration(n=self.top_n)
             else:
                 next_batch = self._create_n_pipelines(

diff --git a/evalml/automl/automl_search.py b/evalml/automl/automl_search.py
@@ -65,6 +65,7 @@
     handle_problem_types,
     is_binary,
     is_classification,
+    is_multiseries,
     is_time_series,
 )
 from evalml.tuners import SKOptTuner
@@ -403,6 +404,7 @@ class AutoMLSearch:
 
         problem_configuration (dict, None): Additional parameters needed to configure the search. For example,
             in time series problems, values should be passed in for the time_index, gap, forecast_horizon, and max_delay variables.
+            For multiseries time series problems, a value for series_id (the name of the column that identifies each series) must also be passed in.
 
         train_best_pipeline (boolean): Whether or not to train the best pipeline before returning it. Defaults to True.
 
@@ -651,6 +653,14 @@ def __init__(
                     f"Dataset size is too small to create holdout set. Minimum dataset size is {self._HOLDOUT_SET_MIN_ROWS} rows, X_train has {len(X_train)} rows. Holdout set evaluation is disabled.",
                 )
 
+        # For multiseries problems, we need to make sure that the data is primarily ordered by the time_index rather than the series_id
+        if is_multiseries(self.problem_type):
+            time_index = self.problem_configuration.get("time_index")
+            series_id = self.problem_configuration.get("series_id")
+            X_train = X_train.sort_values([time_index, series_id])
+            y_train = y_train[X_train.index].reset_index(drop=True)
+            X_train = X_train.reset_index(drop=True)
+
         # Set holdout data in AutoML search if provided as parameter
         self.X_train = infer_feature_types(X_train)
         self.y_train = infer_feature_types(y_train)
@@ -1053,6 +1063,13 @@ def _validate_problem_configuration(self, problem_configuration=None):
             is_valid, msg = contains_all_ts_parameters(problem_configuration)
             if not is_valid:
                 raise ValueError(msg)
+            if (
+                is_multiseries(self.problem_type)
+                and "series_id" not in problem_configuration
+            ):
+                raise ValueError(
+                    "Must provide 'series_id' column in problem_configuration for multiseries time series problems.",
+                )
         return problem_configuration or {}
 
     def _handle_keyboard_interrupt(self):
@@ -1355,6 +1372,7 @@ def _get_baseline_pipeline(self):
             gap = self.problem_configuration["gap"]
             forecast_horizon = self.problem_configuration["forecast_horizon"]
             time_index = self.problem_configuration["time_index"]
+            series_id = self.problem_configuration.get("series_id", None)
             exclude_timeseries_featurizer = (
                 "TimeSeriesFeaturizer" in self.exclude_featurizers
             )
@@ -1364,6 +1382,7 @@ def _get_baseline_pipeline(self):
                 forecast_horizon,
                 time_index,
                 exclude_timeseries_featurizer,
+                series_id,
             )
         return baseline
 

diff --git a/evalml/automl/utils.py b/evalml/automl/utils.py
@@ -49,6 +49,7 @@ def get_default_primary_search_objective(problem_type):
         "time series regression": "MedianAE",
         "time series binary": "Log Loss Binary",
         "time series multiclass": "Log Loss Multiclass",
+        "multiseries time series regression": "MedianAE",
     }[problem_type.value]
     return get_objective(objective_name, return_instance=True)
 
@@ -87,12 +88,14 @@ def make_data_splitter(
             raise ValueError(
                 "problem_configuration is required for time series problem types",
             )
+        series_id = problem_configuration.get("series_id")
         return TimeSeriesSplit(
             n_splits=n_splits,
             gap=problem_configuration.get("gap"),
             max_delay=problem_configuration.get("max_delay"),
             time_index=problem_configuration.get("time_index"),
             forecast_horizon=problem_configuration.get("forecast_horizon"),
+            n_series=len(X[series_id].unique()) if series_id is not None else None,
         )
     if X.shape[0] > _LARGE_DATA_ROW_THRESHOLD:
         return TrainingValidationSplit(

diff --git a/evalml/model_understanding/visualizations.py b/evalml/model_understanding/visualizations.py
@@ -12,6 +12,7 @@
 from evalml.model_family import ModelFamily
 from evalml.objectives.utils import get_objective
 from evalml.problem_types import ProblemTypes
+from evalml.problem_types.utils import is_multiseries
 from evalml.utils import import_or_raise, infer_feature_types, jupyter_check
 
 
@@ -373,25 +374,44 @@ def get_prediction_vs_actual_over_time_data(pipeline, X, y, X_train, y_train, da
     dates = infer_feature_types(dates)
     prediction = pipeline.predict_in_sample(X, y, X_train=X_train, y_train=y_train)
 
-    return pd.DataFrame(
-        {
-            "dates": dates.reset_index(drop=True),
-            "target": y.reset_index(drop=True),
-            "prediction": prediction.reset_index(drop=True),
-        },
-    )
+    if is_multiseries(pipeline.problem_type):
+        return pd.DataFrame(
+            {
+                "dates": dates.reset_index(drop=True),
+                "target": y.reset_index(drop=True),
+                "prediction": prediction.reset_index(drop=True),
+                "series_id": X[pipeline.series_id].reset_index(drop=True),
+            },
+        )
+    else:
+        return pd.DataFrame(
+            {
+                "dates": dates.reset_index(drop=True),
+                "target": y.reset_index(drop=True),
+                "prediction": prediction.reset_index(drop=True),
+            },
+        )
 
 
-def graph_prediction_vs_actual_over_time(pipeline, X, y, X_train, y_train, dates):
+def graph_prediction_vs_actual_over_time(
+    pipeline,
+    X,
+    y,
+    X_train,
+    y_train,
+    dates,
+    single_series=None,
+):
     """Plot the target values and predictions against time on the x-axis.
 
     Args:
         pipeline (TimeSeriesRegressionPipeline): Fitted time series regression pipeline.
-        X (pd.DataFrame): Features used to generate new predictions.
-        y (pd.Series): Target values to compare predictions against.
+        X (pd.DataFrame): Features used to generate new predictions. If problem is multiseries, X should be stacked.
+        y (pd.Series): Target values to compare predictions against. If problem is multiseries, y should be stacked.
         X_train (pd.DataFrame): Data the pipeline was trained on.
         y_train (pd.Series): Target values for training data.
         dates (pd.Series): Dates corresponding to target values and predictions.
+        single_series (str): Series id value of the one series to plot. Only used for multiseries problems; if None, every series in the dataset is plotted. Defaults to None.
 
     Returns:
         plotly.Figure: Showing the prediction vs actual over time.
@@ -403,8 +423,15 @@ def graph_prediction_vs_actual_over_time(pipeline, X, y, X_train, y_train, dates
         "plotly.graph_objects",
         error_msg="Cannot find dependency plotly.graph_objects",
     )
+    subplots = import_or_raise(
+        "plotly.subplots",
+        error_msg="Cannot find dependency plotly.subplots",
+    )
 
-    if pipeline.problem_type != ProblemTypes.TIME_SERIES_REGRESSION:
+    if (
+        pipeline.problem_type != ProblemTypes.TIME_SERIES_REGRESSION
+        and pipeline.problem_type != ProblemTypes.MULTISERIES_TIME_SERIES_REGRESSION
+    ):
         raise ValueError(
             "graph_prediction_vs_actual_over_time only supports time series regression pipelines! "
             f"Received {str(pipeline.problem_type)}.",
@@ -419,30 +446,76 @@ def graph_prediction_vs_actual_over_time(pipeline, X, y, X_train, y_train, dates
         dates,
     )
 
-    data = [
-        _go.Scatter(
-            x=data["dates"],
-            y=data["target"],
-            mode="lines+markers",
-            name="Target",
-            line=dict(color="#1f77b4"),
-        ),
-        _go.Scatter(
-            x=data["dates"],
-            y=data["prediction"],
-            mode="lines+markers",
-            name="Prediction",
-            line=dict(color="#d62728"),
-        ),
-    ]
-    # Let plotly pick the best date format.
-    layout = _go.Layout(
-        title={"text": "Prediction vs Target over time"},
-        xaxis={"title": "Time"},
-        yaxis={"title": "Target Values and Predictions"},
-    )
+    fig = None
+    if is_multiseries(pipeline.problem_type):
+        id_list = (
+            [single_series] if single_series is not None else data["series_id"].unique()
+        )
+        fig = subplots.make_subplots(
+            rows=len(id_list),
+            cols=1,
+            subplot_titles=[f"Series: {id}" for id in id_list],
+        )
+        for curr_count, id in enumerate(id_list):
+            curr_df = data[data["series_id"] == id]
+            fig.append_trace(
+                _go.Scatter(
+                    x=curr_df["dates"],
+                    y=curr_df["target"],
+                    mode="lines+markers",
+                    name=f"Series {id}: Target",
+                ),
+                row=curr_count + 1,
+                col=1,
+            )
+            fig.append_trace(
+                _go.Scatter(
+                    x=curr_df["dates"],
+                    y=curr_df["prediction"],
+                    mode="lines+markers",
+                    name=f"Series {id}: Prediction",
+                ),
+                row=curr_count + 1,
+                col=1,
+            )
+            fig.update_xaxes(title_text="Time")
+            fig.update_yaxes(title_text=y.name)
+        if single_series is not None:
+            fig.update_layout(
+                title_text=f"Graph for Series {single_series}",
+            )
+        else:
+            fig.update_layout(
+                height=600 + (len(id_list)) * 200,
+                width=1500,
+                title_text="Graph for Multiseries",
+            )
+    else:
+        data = [
+            _go.Scatter(
+                x=data["dates"],
+                y=data["target"],
+                mode="lines+markers",
+                name="Target",
+                line=dict(color="#1f77b4"),
+            ),
+            _go.Scatter(
+                x=data["dates"],
+                y=data["prediction"],
+                mode="lines+markers",
+                name="Prediction",
+                line=dict(color="#d62728"),
+            ),
+        ]
+        # Let plotly pick the best date format.
+        layout = _go.Layout(
+            title={"text": "Prediction vs Target over time"},
+            xaxis={"title": "Time"},
+            yaxis={"title": "Target Values and Predictions"},
+        )
 
-    return _go.Figure(data=data, layout=layout)
+        fig = _go.Figure(data=data, layout=layout)
+    return fig
 
 
 def get_linear_coefficients(estimator, features=None):

diff --git a/evalml/objectives/regression_objective.py b/evalml/objectives/regression_objective.py
@@ -6,5 +6,9 @@
 class RegressionObjective(ObjectiveBase):
     """Base class for all regression objectives."""
 
-    problem_types = [ProblemTypes.REGRESSION, ProblemTypes.TIME_SERIES_REGRESSION]
-    """[ProblemTypes.REGRESSION, ProblemTypes.TIME_SERIES_REGRESSION]"""
+    problem_types = [
+        ProblemTypes.REGRESSION,
+        ProblemTypes.TIME_SERIES_REGRESSION,
+        ProblemTypes.MULTISERIES_TIME_SERIES_REGRESSION,
+    ]
+    """[ProblemTypes.REGRESSION, ProblemTypes.TIME_SERIES_REGRESSION, ProblemTypes.MULTISERIES_TIME_SERIES_REGRESSION]"""
diff --git a/evalml/pipelines/components/component_base.py b/evalml/pipelines/components/component_base.py
@@ -31,7 +31,6 @@ class ComponentBase(ABC, metaclass=ComponentBaseMeta):
     # Referring to the pandas nullable dtypes; not just woodwork logical types
     _integer_nullable_incompatibilities = []
     _boolean_nullable_incompatibilities = []
-    is_multiseries = False
 
     def __init__(self, parameters=None, component_obj=None, random_seed=0, **kwargs):
         """Base class for all components.

diff --git a/.../pipelines/components/estimators/regressors/multiseries_time_series_baseline_regressor.py b/.../pipelines/components/estimators/regressors/multiseries_time_series_baseline_regressor.py
@@ -25,12 +25,11 @@ class MultiseriesTimeSeriesBaselineRegressor(Estimator):
     """{}"""
     model_family = ModelFamily.BASELINE
     """ModelFamily.BASELINE"""
-    is_multiseries = True
     supported_problem_types = [
-        ProblemTypes.TIME_SERIES_REGRESSION,
+        ProblemTypes.MULTISERIES_TIME_SERIES_REGRESSION,
     ]
     """[
-        ProblemTypes.TIME_SERIES_REGRESSION,
+        ProblemTypes.MULTISERIES_TIME_SERIES_REGRESSION,
     ]"""
 
     def __init__(self, gap=1, forecast_horizon=1, random_seed=0, **kwargs):