diff --git a/docs/source/release_notes.rst b/docs/source/release_notes.rst index 27106f6a2c..e4d547d078 100644 --- a/docs/source/release_notes.rst +++ b/docs/source/release_notes.rst @@ -2,10 +2,14 @@ Release Notes ------------- **Future Releases** * Enhancements + * Added support for prediction intervals for VARMAX regressor :pr:`4267` + * Integrated multiseries time series into AutoMLSearch :pr:`4270` * Extended STLDecomposer to Support Multiseries :pr:`4253` * Fixes + * Fixed error when stacking data with no exogenous variables :pr:`4275` * Changes * Updated ``ARIMARegressor`` to be compatible with sktime v0.22.0 and beyond :pr:`4283` + * Updated ``graph_prediction_vs_actual_over_time()`` to be compatible with multiseries time series :pr:`4284` * Documentation Changes * Removed erroneous warnings from Data Checks User Guide page and removed ``tqdm`` warning in all notebooks :pr:`4274` * Testing Changes @@ -22,7 +26,6 @@ Release Notes * Added stacking and unstacking utility functions to work with multiseries data :pr:`4250` * Added multiseries regression pipeline class :pr:`4256` * Added multiseries VARMAX regressor :pr:`4238` - * Added support for prediction intervals for VARMAX regressor :pr:`4267` * Fixes * Added support for pandas 2 :pr:`4216` * Fixed bug where time series pipelines would fail due to MASE needing `y_train` when scoring :pr:`4258` diff --git a/evalml/automl/automl_algorithm/default_algorithm.py b/evalml/automl/automl_algorithm/default_algorithm.py index fecada082c..bb6f7591b5 100644 --- a/evalml/automl/automl_algorithm/default_algorithm.py +++ b/evalml/automl/automl_algorithm/default_algorithm.py @@ -25,7 +25,7 @@ _make_pipeline_from_multiple_graphs, make_pipeline, ) -from evalml.problem_types import is_regression, is_time_series +from evalml.problem_types import is_multiseries, is_regression, is_time_series from evalml.utils import infer_feature_types from evalml.utils.logger import get_logger @@ -170,6 +170,8 @@ def default_max_batches(self): """Returns the number of max batches AutoMLSearch should run by default.""" if self.ensembling: return 3 + elif is_multiseries(self.problem_type): + return 1 else: return 2 @@ -472,11 +474,17 @@ def next_batch(self): ) # this logic needs to be updated once time series also supports ensembling elif is_time_series(self.problem_type): - if self._batch_number == 0: + # Skip the naive batch for multiseries time series + batch = ( + self._batch_number + if not is_multiseries(self.problem_type) + else self._batch_number + 1 + ) + if batch == 0: next_batch = self._create_naive_pipelines() - elif self._batch_number == 1: + elif batch == 1: next_batch = self._create_fast_final() - elif self.batch_number == 2: + elif batch == 2: next_batch = self._create_long_exploration(n=self.top_n) else: next_batch = self._create_n_pipelines( diff --git a/evalml/automl/automl_search.py b/evalml/automl/automl_search.py index 7a310a319b..b3bdc0bbfe 100644 --- a/evalml/automl/automl_search.py +++ b/evalml/automl/automl_search.py @@ -65,6 +65,7 @@ handle_problem_types, is_binary, is_classification, + is_multiseries, is_time_series, ) from evalml.tuners import SKOptTuner @@ -403,6 +404,7 @@ class AutoMLSearch: problem_configuration (dict, None): Additional parameters needed to configure the search. For example, in time series problems, values should be passed in for the time_index, gap, forecast_horizon, and max_delay variables. + For multiseries time series problems, the values passed in should also include the name of a series_id column. 
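# --- Illustrative sketch, not part of the patch above ---
# A minimal example of the multiseries configuration described in this docstring, using a
# tiny assumed synthetic stacked dataset; the "date"/"series_id" column names and all
# parameter values here are only placeholders.
import pandas as pd
from evalml.automl import AutoMLSearch

dates = pd.date_range("2021-01-01", periods=20)
X = pd.DataFrame({
    "date": list(dates) * 2,
    "series_id": ["a"] * 20 + ["b"] * 20,  # stacked long format: one row per (date, series) pair
})
y = pd.Series(range(40), name="target")

automl = AutoMLSearch(
    X_train=X,
    y_train=y,
    problem_type="multiseries time series regression",
    problem_configuration={
        "time_index": "date",
        "gap": 1,
        "max_delay": 0,
        "forecast_horizon": 2,
        "series_id": "series_id",  # required for multiseries problems; omitting it raises a ValueError
    },
)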
train_best_pipeline (boolean): Whether or not to train the best pipeline before returning it. Defaults to True. @@ -651,6 +653,14 @@ def __init__( f"Dataset size is too small to create holdout set. Minimum dataset size is {self._HOLDOUT_SET_MIN_ROWS} rows, X_train has {len(X_train)} rows. Holdout set evaluation is disabled.", ) + # For multiseries problems, we need to make sure that the data is primarily ordered by the time_index rather than the series_id + if is_multiseries(self.problem_type): + time_index = self.problem_configuration.get("time_index") + series_id = self.problem_configuration.get("series_id") + X_train = X_train.sort_values([time_index, series_id]) + y_train = y_train[X_train.index].reset_index(drop=True) + X_train = X_train.reset_index(drop=True) + # Set holdout data in AutoML search if provided as parameter self.X_train = infer_feature_types(X_train) self.y_train = infer_feature_types(y_train) @@ -1053,6 +1063,13 @@ def _validate_problem_configuration(self, problem_configuration=None): is_valid, msg = contains_all_ts_parameters(problem_configuration) if not is_valid: raise ValueError(msg) + if ( + is_multiseries(self.problem_type) + and "series_id" not in problem_configuration + ): + raise ValueError( + "Must provide 'series_id' column in problem_configuration for multiseries time series problems.", + ) return problem_configuration or {} def _handle_keyboard_interrupt(self): @@ -1355,6 +1372,7 @@ def _get_baseline_pipeline(self): gap = self.problem_configuration["gap"] forecast_horizon = self.problem_configuration["forecast_horizon"] time_index = self.problem_configuration["time_index"] + series_id = self.problem_configuration.get("series_id", None) exclude_timeseries_featurizer = ( "TimeSeriesFeaturizer" in self.exclude_featurizers ) @@ -1364,6 +1382,7 @@ def _get_baseline_pipeline(self): forecast_horizon, time_index, exclude_timeseries_featurizer, + series_id, ) return baseline diff --git a/evalml/automl/utils.py b/evalml/automl/utils.py index a35fd4fa0f..56331807b4 100644 --- a/evalml/automl/utils.py +++ b/evalml/automl/utils.py @@ -49,6 +49,7 @@ def get_default_primary_search_objective(problem_type): "time series regression": "MedianAE", "time series binary": "Log Loss Binary", "time series multiclass": "Log Loss Multiclass", + "multiseries time series regression": "MedianAE", }[problem_type.value] return get_objective(objective_name, return_instance=True) @@ -87,12 +88,14 @@ def make_data_splitter( raise ValueError( "problem_configuration is required for time series problem types", ) + series_id = problem_configuration.get("series_id") return TimeSeriesSplit( n_splits=n_splits, gap=problem_configuration.get("gap"), max_delay=problem_configuration.get("max_delay"), time_index=problem_configuration.get("time_index"), forecast_horizon=problem_configuration.get("forecast_horizon"), + n_series=len(X[series_id].unique()) if series_id is not None else None, ) if X.shape[0] > _LARGE_DATA_ROW_THRESHOLD: return TrainingValidationSplit( diff --git a/evalml/model_understanding/visualizations.py b/evalml/model_understanding/visualizations.py index 6eb74b58d9..2f9520e501 100644 --- a/evalml/model_understanding/visualizations.py +++ b/evalml/model_understanding/visualizations.py @@ -12,6 +12,7 @@ from evalml.model_family import ModelFamily from evalml.objectives.utils import get_objective from evalml.problem_types import ProblemTypes +from evalml.problem_types.utils import is_multiseries from evalml.utils import import_or_raise, infer_feature_types, jupyter_check @@ -373,25 
+374,44 @@ def get_prediction_vs_actual_over_time_data(pipeline, X, y, X_train, y_train, da dates = infer_feature_types(dates) prediction = pipeline.predict_in_sample(X, y, X_train=X_train, y_train=y_train) - return pd.DataFrame( - { - "dates": dates.reset_index(drop=True), - "target": y.reset_index(drop=True), - "prediction": prediction.reset_index(drop=True), - }, - ) + if is_multiseries(pipeline.problem_type): + return pd.DataFrame( + { + "dates": dates.reset_index(drop=True), + "target": y.reset_index(drop=True), + "prediction": prediction.reset_index(drop=True), + "series_id": X[pipeline.series_id].reset_index(drop=True), + }, + ) + else: + return pd.DataFrame( + { + "dates": dates.reset_index(drop=True), + "target": y.reset_index(drop=True), + "prediction": prediction.reset_index(drop=True), + }, + ) -def graph_prediction_vs_actual_over_time(pipeline, X, y, X_train, y_train, dates): +def graph_prediction_vs_actual_over_time( + pipeline, + X, + y, + X_train, + y_train, + dates, + single_series=None, +): """Plot the target values and predictions against time on the x-axis. Args: pipeline (TimeSeriesRegressionPipeline): Fitted time series regression pipeline. - X (pd.DataFrame): Features used to generate new predictions. - y (pd.Series): Target values to compare predictions against. + X (pd.DataFrame): Features used to generate new predictions. If problem is multiseries, X should be stacked. + y (pd.Series): Target values to compare predictions against. If problem is multiseries, y should be stacked. X_train (pd.DataFrame): Data the pipeline was trained on. y_train (pd.Series): Target values for training data. dates (pd.Series): Dates corresponding to target values and predictions. + single_series (str): A single series id value to plot just one series in a multiseries dataset. Defaults to None. Returns: plotly.Figure: Showing the prediction vs actual over time. @@ -403,8 +423,15 @@ def graph_prediction_vs_actual_over_time(pipeline, X, y, X_train, y_train, dates "plotly.graph_objects", error_msg="Cannot find dependency plotly.graph_objects", ) + subplots = import_or_raise( + "plotly.subplots", + error_msg="Cannot find dependency plotly.subplots", + ) - if pipeline.problem_type != ProblemTypes.TIME_SERIES_REGRESSION: + if ( + pipeline.problem_type != ProblemTypes.TIME_SERIES_REGRESSION + and pipeline.problem_type != ProblemTypes.MULTISERIES_TIME_SERIES_REGRESSION + ): raise ValueError( "graph_prediction_vs_actual_over_time only supports time series regression pipelines! " f"Received {str(pipeline.problem_type)}.", @@ -419,30 +446,76 @@ def graph_prediction_vs_actual_over_time(pipeline, X, y, X_train, y_train, dates dates, ) - data = [ - _go.Scatter( - x=data["dates"], - y=data["target"], - mode="lines+markers", - name="Target", - line=dict(color="#1f77b4"), - ), - _go.Scatter( - x=data["dates"], - y=data["prediction"], - mode="lines+markers", - name="Prediction", - line=dict(color="#d62728"), - ), - ] - # Let plotly pick the best date format. 
- layout = _go.Layout( - title={"text": "Prediction vs Target over time"}, - xaxis={"title": "Time"}, - yaxis={"title": "Target Values and Predictions"}, - ) + fig = None + if is_multiseries(pipeline.problem_type): + id_list = ( + [single_series] if single_series is not None else data["series_id"].unique() + ) + fig = subplots.make_subplots( + rows=len(id_list), + cols=1, + subplot_titles=[f"Series: {id}" for id in id_list], + ) + for curr_count, id in enumerate(id_list): + curr_df = data[data["series_id"] == id] + fig.append_trace( + _go.Scatter( + x=curr_df["dates"], + y=curr_df["target"], + mode="lines+markers", + name=f"Series {id}: Target", + ), + row=curr_count + 1, + col=1, + ) + fig.append_trace( + _go.Scatter( + x=curr_df["dates"], + y=curr_df["prediction"], + mode="lines+markers", + name=f"Series {id}: Prediction", + ), + row=curr_count + 1, + col=1, + ) + fig.update_xaxes(title_text="Time") + fig.update_yaxes(title_text=y.name) + if single_series is not None: + fig.update_layout( + title_text=f"Graph for Series {single_series}", + ) + else: + fig.update_layout( + height=600 + (len(id_list)) * 200, + width=1500, + title_text="Graph for Multiseries", + ) + else: + data = [ + _go.Scatter( + x=data["dates"], + y=data["target"], + mode="lines+markers", + name="Target", + line=dict(color="#1f77b4"), + ), + _go.Scatter( + x=data["dates"], + y=data["prediction"], + mode="lines+markers", + name="Prediction", + line=dict(color="#d62728"), + ), + ] + # Let plotly pick the best date format. + layout = _go.Layout( + title={"text": "Prediction vs Target over time"}, + xaxis={"title": "Time"}, + yaxis={"title": "Target Values and Predictions"}, + ) - return _go.Figure(data=data, layout=layout) + fig = _go.Figure(data=data, layout=layout) + return fig def get_linear_coefficients(estimator, features=None): diff --git a/evalml/objectives/regression_objective.py b/evalml/objectives/regression_objective.py index 4d76902631..a118d14b86 100644 --- a/evalml/objectives/regression_objective.py +++ b/evalml/objectives/regression_objective.py @@ -6,5 +6,9 @@ class RegressionObjective(ObjectiveBase): """Base class for all regression objectives.""" - problem_types = [ProblemTypes.REGRESSION, ProblemTypes.TIME_SERIES_REGRESSION] - """[ProblemTypes.REGRESSION, ProblemTypes.TIME_SERIES_REGRESSION]""" + problem_types = [ + ProblemTypes.REGRESSION, + ProblemTypes.TIME_SERIES_REGRESSION, + ProblemTypes.MULTISERIES_TIME_SERIES_REGRESSION, + ] + """[ProblemTypes.REGRESSION, ProblemTypes.TIME_SERIES_REGRESSION, ProblemTypes.MULTISERIES_TIME_SERIES_REGRESSION]""" diff --git a/evalml/pipelines/components/component_base.py b/evalml/pipelines/components/component_base.py index 5c00a053e1..12b6603bb4 100644 --- a/evalml/pipelines/components/component_base.py +++ b/evalml/pipelines/components/component_base.py @@ -31,7 +31,6 @@ class ComponentBase(ABC, metaclass=ComponentBaseMeta): # Referring to the pandas nullable dtypes; not just woodwork logical types _integer_nullable_incompatibilities = [] _boolean_nullable_incompatibilities = [] - is_multiseries = False def __init__(self, parameters=None, component_obj=None, random_seed=0, **kwargs): """Base class for all components. 
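# --- Illustrative sketch, not part of the patch above ---
# Plotting predictions for a multiseries pipeline, assuming `pipeline` is an already fitted
# MultiseriesRegressionPipeline and `X`, `y`, `X_train`, `y_train` are stacked multiseries
# data (the tests further below show a full setup); these names are placeholders.
from evalml.model_understanding import graph_prediction_vs_actual_over_time

# One subplot per series id:
fig_all = graph_prediction_vs_actual_over_time(pipeline, X, y, X_train, y_train, X["date"])

# Or a single series, using the new keyword argument:
fig_one = graph_prediction_vs_actual_over_time(
    pipeline, X, y, X_train, y_train, X["date"], single_series="0",
)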
diff --git a/evalml/pipelines/components/estimators/regressors/multiseries_time_series_baseline_regressor.py b/evalml/pipelines/components/estimators/regressors/multiseries_time_series_baseline_regressor.py index 1ca88cd6bb..80be329341 100644 --- a/evalml/pipelines/components/estimators/regressors/multiseries_time_series_baseline_regressor.py +++ b/evalml/pipelines/components/estimators/regressors/multiseries_time_series_baseline_regressor.py @@ -25,12 +25,11 @@ class MultiseriesTimeSeriesBaselineRegressor(Estimator): """{}""" model_family = ModelFamily.BASELINE """ModelFamily.BASELINE""" - is_multiseries = True supported_problem_types = [ - ProblemTypes.TIME_SERIES_REGRESSION, + ProblemTypes.MULTISERIES_TIME_SERIES_REGRESSION, ] """[ - ProblemTypes.TIME_SERIES_REGRESSION, + ProblemTypes.MULTISERIES_TIME_SERIES_REGRESSION, ]""" def __init__(self, gap=1, forecast_horizon=1, random_seed=0, **kwargs): diff --git a/evalml/pipelines/components/estimators/regressors/varmax_regressor.py b/evalml/pipelines/components/estimators/regressors/varmax_regressor.py index ecca230042..0c8c57d27c 100644 --- a/evalml/pipelines/components/estimators/regressors/varmax_regressor.py +++ b/evalml/pipelines/components/estimators/regressors/varmax_regressor.py @@ -48,10 +48,9 @@ class VARMAXRegressor(Estimator): "trend": Categorical(['n', 'c', 't', 'ct']), }""" model_family = ModelFamily.VARMAX - is_multiseries = True """ModelFamily.VARMAX""" - supported_problem_types = [ProblemTypes.TIME_SERIES_REGRESSION] - """[ProblemTypes.TIME_SERIES_REGRESSION]""" + supported_problem_types = [ProblemTypes.MULTISERIES_TIME_SERIES_REGRESSION] + """[ProblemTypes.MULTISERIES_TIME_SERIES_REGRESSION]""" def __init__( self, @@ -61,7 +60,7 @@ def __init__( trend: Optional[str] = "c", random_seed: Union[int, float] = 0, maxiter: int = 10, - use_covariates: bool = True, + use_covariates: bool = False, **kwargs, ): self.preds_95_upper = None @@ -84,6 +83,7 @@ def __init__( parameters["use_covariates"] = use_covariates parameters["time_index"] = time_index + parameters.update({"p": p, "q": q}) self.use_covariates = use_covariates self.time_index = time_index @@ -133,13 +133,13 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.DataFrame] = None): if y is None: raise ValueError("VARMAX Regressor requires y as input.") + y = convert_bool_to_double(y, include_ints=True) if X is not None and self.use_covariates: self.last_X_index = X.index[-1] X = X.ww.select(exclude=["Datetime"]) X = convert_bool_to_double(X) - y = convert_bool_to_double(y) X, y = match_indices(X, y) if not X.empty: diff --git a/evalml/pipelines/components/utils.py b/evalml/pipelines/components/utils.py index 5c83218d5f..b8dcb833c2 100644 --- a/evalml/pipelines/components/utils.py +++ b/evalml/pipelines/components/utils.py @@ -56,7 +56,11 @@ def allowed_model_families(problem_type): return list(set([e.model_family for e in estimators])) -def get_estimators(problem_type, model_families=None, excluded_model_families=None): +def get_estimators( + problem_type, + model_families=None, + excluded_model_families=None, +): """Returns the estimators allowed for a particular problem type. Can also optionally filter by a list of model types. @@ -515,20 +519,27 @@ def match_indices( return X, y -def convert_bool_to_double(data: pd.DataFrame): - """Converts all boolean columns in dataframe to doubles. +def convert_bool_to_double( + data: pd.DataFrame, + include_ints: bool = False, +) -> pd.DataFrame: + """Converts all boolean columns in dataframe to doubles. 
If include_ints, converts all integer columns to doubles as well. Args: data (pd.DataFrame): Input dataframe. + include_ints (bool): If True, converts all integer columns to doubles as well. Defaults to False. Returns: pd.DataFrame: Input dataframe with all boolean-valued columns converted to doubles. """ data_ = data.ww.copy() + relevant_dtypes = ["Boolean"] + if include_ints: + relevant_dtypes.append("Integer") data_.ww.set_types( { col: "Double" - for col in data.ww.select(["Boolean"], return_schema=True).columns + for col in data.ww.select(relevant_dtypes, return_schema=True).columns }, ) return data_ diff --git a/evalml/pipelines/multiseries_regression_pipeline.py b/evalml/pipelines/multiseries_regression_pipeline.py index 6b45653482..6ddc6ac9d4 100644 --- a/evalml/pipelines/multiseries_regression_pipeline.py +++ b/evalml/pipelines/multiseries_regression_pipeline.py @@ -21,9 +21,9 @@ class MultiseriesRegressionPipeline(TimeSeriesRegressionPipeline): """ - problem_type = ProblemTypes.TIME_SERIES_REGRESSION + problem_type = ProblemTypes.MULTISERIES_TIME_SERIES_REGRESSION - """ProblemTypes.TIME_SERIES_REGRESSION""" + """ProblemTypes.MULTISERIES_TIME_SERIES_REGRESSION""" def __init__( self, diff --git a/evalml/pipelines/utils.py b/evalml/pipelines/utils.py index f2b8dcd94e..dbc51abee8 100644 --- a/evalml/pipelines/utils.py +++ b/evalml/pipelines/utils.py @@ -12,6 +12,7 @@ from evalml.model_family import ModelFamily from evalml.pipelines import ( ComponentGraph, + MultiseriesRegressionPipeline, TimeSeriesBinaryClassificationPipeline, TimeSeriesMulticlassClassificationPipeline, TimeSeriesRegressionPipeline, @@ -66,6 +67,7 @@ ProblemTypes, handle_problem_types, is_classification, + is_multiseries, is_regression, is_time_series, ) @@ -289,6 +291,9 @@ def _get_preprocessing_components( Returns: list[Transformer]: A list of applicable preprocessing components to use with the estimator. """ + if is_multiseries(problem_type): + return [] + if is_time_series(problem_type): components_functions = [ _get_label_encoder, @@ -361,8 +366,10 @@ def _get_pipeline_base_class(problem_type): return TimeSeriesRegressionPipeline elif problem_type == ProblemTypes.TIME_SERIES_BINARY: return TimeSeriesBinaryClassificationPipeline - else: + elif problem_type == ProblemTypes.TIME_SERIES_MULTICLASS: return TimeSeriesMulticlassClassificationPipeline + else: + return MultiseriesRegressionPipeline def _make_pipeline_time_series( @@ -1204,6 +1211,7 @@ def make_timeseries_baseline_pipeline( forecast_horizon, time_index, exclude_featurizer=False, + series_id=None, ): """Make a baseline pipeline for time series regression problems. @@ -1214,6 +1222,7 @@ def make_timeseries_baseline_pipeline( time_index (str): Column name of time_index parameter. exclude_featurizer (bool): Whether or not to exclude the TimeSeriesFeaturizer from the baseline graph. Defaults to False. + series_id (str): Column name of series_id parameter. Only used for multiseries time series. Defaults to None. Returns: TimeSeriesPipelineBase, a time series pipeline corresponding to the problem type. 
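# --- Illustrative sketch, not part of the patch above ---
# A minimal example of the include_ints flag added to convert_bool_to_double (assumed toy
# data): integer columns are only promoted to doubles when include_ints=True.
import pandas as pd
from evalml.pipelines.components.utils import convert_bool_to_double

df = pd.DataFrame({"flag": [True, False, True], "count": [1, 2, 3]})
df.ww.init()  # the helper selects columns by woodwork logical type

only_bools = convert_bool_to_double(df)               # "flag" -> double, "count" unchanged
both = convert_bool_to_double(df, include_ints=True)  # "flag" and "count" -> double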
@@ -1232,8 +1241,17 @@ def make_timeseries_baseline_pipeline( TimeSeriesBinaryClassificationPipeline, "Time Series Baseline Binary Pipeline", ), + ProblemTypes.MULTISERIES_TIME_SERIES_REGRESSION: ( + MultiseriesRegressionPipeline, + "Multiseries Time Series Baseline Pipeline", + ), }[problem_type] - component_graph = ["Time Series Baseline Estimator"] + baseline_estimator_name = ( + "Multiseries Time Series Baseline Regressor" + if is_multiseries(problem_type) + else "Time Series Baseline Estimator" + ) + component_graph = [baseline_estimator_name] parameters = { "pipeline": { "time_index": time_index, @@ -1241,11 +1259,13 @@ def make_timeseries_baseline_pipeline( "max_delay": 0, "forecast_horizon": forecast_horizon, }, - "Time Series Baseline Estimator": { + baseline_estimator_name: { "gap": gap, "forecast_horizon": forecast_horizon, }, } + if is_multiseries(problem_type): + parameters["pipeline"]["series_id"] = series_id if not exclude_featurizer: component_graph = ["Time Series Featurizer"] + component_graph parameters["Time Series Featurizer"] = { @@ -1397,8 +1417,11 @@ def unstack_multiseries( X_unstacked_cols.append(new_column) # Concatenate all the single series to reform dataframes - X_unstacked = pd.concat(X_unstacked_cols, axis=1) y_unstacked = pd.concat(y_unstacked_cols, axis=1) + if len(X_unstacked_cols) == 0: + X_unstacked = pd.DataFrame(index=y_unstacked.index) + else: + X_unstacked = pd.concat(X_unstacked_cols, axis=1) # Reset the axes now that they've been unstacked, keep time info in X X_unstacked = X_unstacked.reset_index() @@ -1457,7 +1480,7 @@ def stack_data(data, include_series_id=False, series_id_name=None, starting_inde return stacked_series -def stack_X(X, series_id_name, time_index, starting_index=None): +def stack_X(X, series_id_name, time_index, starting_index=None, series_id_values=None): """Restacks the unstacked features into a single DataFrame. Args: @@ -1466,37 +1489,61 @@ def stack_X(X, series_id_name, time_index, starting_index=None): time_index (str): The name of the time index column. starting_index (int): The starting index to use for the stacked DataFrame. If None, the starting index will match that of the input data. Defaults to None. + series_id_values (set, list): The unique values of a series ID, used to generate the index. If None, values will + be generated from X column values. Required if X only has time index values and no exogenous values. + Defaults to None. Returns: pd.DataFrame: The restacked features. 
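# --- Illustrative sketch, not part of the patch above ---
# Using the new series_id_values argument (assumed toy data): when the unstacked frame only
# carries the time index, the series ids cannot be recovered from column names and must be
# passed in explicitly.
import pandas as pd
from evalml.pipelines.utils import stack_X

X_unstacked = pd.DataFrame({"date": pd.date_range("2021-01-01", periods=3)})
X_stacked = stack_X(
    X_unstacked,
    series_id_name="series_id",
    time_index="date",
    series_id_values={"a", "b"},  # without this, stack_X raises a ValueError here
)
# X_stacked now has 6 rows: each date repeated once per series id.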
""" original_columns = set() - series_ids = set() - for col in X.columns: - if col == time_index: - continue - separated_name = col.split("_") - original_columns.add("_".join(separated_name[:-1])) - series_ids.add(separated_name[-1]) - - restacked_X = [] - - for i, original_col in enumerate(original_columns): - # Only include the series id once (for the first column) - include_series_id = i == 0 - subset_X = [col for col in X.columns if original_col in col] - restacked_X.append( - stack_data( - X[subset_X], - include_series_id=include_series_id, - series_id_name=series_id_name, - starting_index=starting_index, - ), + series_ids = series_id_values or set() + if series_id_values is None: + for col in X.columns: + if col == time_index: + continue + separated_name = col.split("_") + original_columns.add("_".join(separated_name[:-1])) + series_ids.add(separated_name[-1]) + + if len(series_ids) == 0: + raise ValueError( + "Series ID values need to be passed in X column values or as a set with the `series_id_values` parameter.", ) - restacked_X = pd.concat(restacked_X, axis=1) time_index_col = X[time_index].repeat(len(series_ids)).reset_index(drop=True) - time_index_col.index = restacked_X.index - restacked_X[time_index] = time_index_col + + if len(original_columns) == 0: + start_index = starting_index or X.index[0] + stacked_index = pd.RangeIndex( + start=start_index, + stop=start_index + len(time_index_col), + ) + time_index_col.index = stacked_index + restacked_X = pd.DataFrame( + { + time_index: time_index_col, + series_id_name: sorted(list(series_ids)) * len(X), + }, + index=stacked_index, + ) + else: + restacked_X = [] + for i, original_col in enumerate(original_columns): + # Only include the series id once (for the first column) + include_series_id = i == 0 + subset_X = [col for col in X.columns if original_col in col] + restacked_X.append( + stack_data( + X[subset_X], + include_series_id=include_series_id, + series_id_name=series_id_name, + starting_index=starting_index, + ), + ) + + restacked_X = pd.concat(restacked_X, axis=1) + time_index_col.index = restacked_X.index + restacked_X[time_index] = time_index_col return restacked_X diff --git a/evalml/preprocessing/data_splitters/time_series_split.py b/evalml/preprocessing/data_splitters/time_series_split.py index 391b07672e..5605d17a02 100644 --- a/evalml/preprocessing/data_splitters/time_series_split.py +++ b/evalml/preprocessing/data_splitters/time_series_split.py @@ -59,6 +59,7 @@ def __init__( gap=0, forecast_horizon=None, time_index=None, + n_series=None, n_splits=3, ): self.max_delay = max_delay @@ -66,9 +67,15 @@ def __init__( self.forecast_horizon = forecast_horizon if forecast_horizon else 1 self.time_index = time_index self.n_splits = n_splits + self.n_series = n_series + + test_size = forecast_horizon + if self.n_series is not None: + test_size = forecast_horizon * self.n_series + self._splitter = SkTimeSeriesSplit( n_splits=n_splits, - test_size=forecast_horizon, + test_size=test_size, ) def get_n_splits(self, X=None, y=None, groups=None): diff --git a/evalml/preprocessing/utils.py b/evalml/preprocessing/utils.py index 6b3d656d4c..6e7c203611 100644 --- a/evalml/preprocessing/utils.py +++ b/evalml/preprocessing/utils.py @@ -72,12 +72,22 @@ def split_multiseries_data(X, y, series_id, time_index, **kwargs): X_unstacked, y_unstacked, problem_type="time series regression", **kwargs ) - X_train = stack_X(X_train_unstacked, series_id, time_index) + # Get unique series value from X if there is only the time_index column + # Otherwise, 
this information is generated in `stack_X` from the column values + series_id_values = set(X[series_id]) if len(X_unstacked.columns) == 1 else None + + X_train = stack_X( + X_train_unstacked, + series_id, + time_index, + series_id_values=series_id_values, + ) X_holdout = stack_X( X_holdout_unstacked, series_id, time_index, starting_index=X_train.index[-1] + 1, + series_id_values=series_id_values, ) y_train = stack_data(y_train_unstacked) y_holdout = stack_data(y_holdout_unstacked, starting_index=y_train.index[-1] + 1) diff --git a/evalml/problem_types/__init__.py b/evalml/problem_types/__init__.py index 5b589c909e..866f326164 100644 --- a/evalml/problem_types/__init__.py +++ b/evalml/problem_types/__init__.py @@ -6,6 +6,7 @@ is_regression, is_binary, is_multiclass, + is_multiseries, is_classification, is_time_series, ) diff --git a/evalml/problem_types/problem_types.py b/evalml/problem_types/problem_types.py index fe17fd73c6..4f4b182bf0 100644 --- a/evalml/problem_types/problem_types.py +++ b/evalml/problem_types/problem_types.py @@ -19,6 +19,8 @@ class ProblemTypes(Enum): """Time series binary classification problem.""" TIME_SERIES_MULTICLASS = "time series multiclass" """Time series multiclass classification problem.""" + MULTISERIES_TIME_SERIES_REGRESSION = "multiseries time series regression" + """Multiseries time series regression problem.""" def __str__(self): """String representation of the ProblemTypes enum.""" @@ -29,6 +31,7 @@ def __str__(self): ProblemTypes.TIME_SERIES_REGRESSION.name: "time series regression", ProblemTypes.TIME_SERIES_BINARY.name: "time series binary", ProblemTypes.TIME_SERIES_MULTICLASS.name: "time series multiclass", + ProblemTypes.MULTISERIES_TIME_SERIES_REGRESSION.name: "multiseries time series regression", } return problem_type_dict[self.name] diff --git a/evalml/problem_types/utils.py b/evalml/problem_types/utils.py index 0fa1552012..7f34aba059 100644 --- a/evalml/problem_types/utils.py +++ b/evalml/problem_types/utils.py @@ -87,6 +87,7 @@ def is_regression(problem_type): return handle_problem_types(problem_type) in [ ProblemTypes.REGRESSION, ProblemTypes.TIME_SERIES_REGRESSION, + ProblemTypes.MULTISERIES_TIME_SERIES_REGRESSION, ] @@ -165,4 +166,20 @@ def is_time_series(problem_type): ProblemTypes.TIME_SERIES_BINARY, ProblemTypes.TIME_SERIES_MULTICLASS, ProblemTypes.TIME_SERIES_REGRESSION, + ProblemTypes.MULTISERIES_TIME_SERIES_REGRESSION, ] + + +def is_multiseries(problem_type): + """Determines if the provided problem_type is a multiseries time series problem type. + + Args: + problem_type (str or ProblemTypes): type of supervised learning problem. See evalml.problem_types.ProblemType.all_problem_types for a full list. + + Returns: + bool: Whether or not the provided problem_type is a multiseries time series problem type. 
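# --- Illustrative sketch, not part of the patch above ---
# The new problem type and helper in action (values taken from the changes above):
from evalml.problem_types import ProblemTypes, is_multiseries, is_time_series

pt = ProblemTypes.MULTISERIES_TIME_SERIES_REGRESSION
assert str(pt) == "multiseries time series regression"
assert is_multiseries(pt)
assert is_multiseries("multiseries time series regression")    # strings are handled too
assert is_time_series(pt)                                       # still counts as time series
assert not is_multiseries(ProblemTypes.TIME_SERIES_REGRESSION)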
+ """ + return ( + handle_problem_types(problem_type) + == ProblemTypes.MULTISERIES_TIME_SERIES_REGRESSION + ) diff --git a/evalml/tests/automl_tests/parallel_tests/test_automl_dask.py b/evalml/tests/automl_tests/parallel_tests/test_automl_dask.py index c3e370e3d0..f873f44b69 100644 --- a/evalml/tests/automl_tests/parallel_tests/test_automl_dask.py +++ b/evalml/tests/automl_tests/parallel_tests/test_automl_dask.py @@ -6,7 +6,13 @@ from evalml.automl.automl_algorithm import IterativeAlgorithm from evalml.automl.callbacks import raise_error_callback from evalml.automl.engine import CFEngine, DaskEngine, SequentialEngine -from evalml.problem_types import ProblemTypes, is_binary, is_multiclass, is_time_series +from evalml.problem_types import ( + ProblemTypes, + is_binary, + is_multiclass, + is_multiseries, + is_time_series, +) from evalml.tests.automl_tests.dask_test_utils import ( DaskPipelineFast, DaskPipelineSlow, @@ -285,9 +291,12 @@ def test_score_pipelines_passes_X_train_y_train( engine_str, X_y_based_on_pipeline_or_problem_type, ts_data, + multiseries_ts_data_stacked, AutoMLTestEnv, ): - if is_time_series(problem_type): + if is_multiseries(problem_type): + X, y = multiseries_ts_data_stacked + elif is_time_series(problem_type): X, _, y = ts_data(problem_type=problem_type) else: X, y = X_y_based_on_pipeline_or_problem_type(problem_type) @@ -310,6 +319,7 @@ def test_score_pipelines_passes_X_train_y_train( "gap": 0, "forecast_horizon": 1, "max_delay": 1, + "series_id": "series_id" if is_multiseries(problem_type) else None, }, engine=engine_str, ) diff --git a/evalml/tests/automl_tests/test_automl.py b/evalml/tests/automl_tests/test_automl.py index 64f4ac60d9..e1ce9e7f26 100644 --- a/evalml/tests/automl_tests/test_automl.py +++ b/evalml/tests/automl_tests/test_automl.py @@ -1,5 +1,6 @@ import inspect import os +import random import warnings from collections import OrderedDict, defaultdict from itertools import product @@ -84,6 +85,7 @@ ProblemTypes, handle_problem_types, is_classification, + is_multiseries, is_time_series, ) from evalml.tests.automl_tests.parallel_tests.test_automl_dask import engine_strs @@ -2283,14 +2285,28 @@ def fit(self, *args, **kwargs): ) -def test_time_series_regression_with_parameters(ts_data): +@pytest.mark.parametrize( + "problem_type", + [ + ProblemTypes.TIME_SERIES_REGRESSION, + ProblemTypes.MULTISERIES_TIME_SERIES_REGRESSION, + ], +) +def test_time_series_regression_with_parameters( + problem_type, + ts_data, + multiseries_ts_data_stacked, +): X, _, y = ts_data() X.index.name = "date" + if is_multiseries(problem_type): + X, y = multiseries_ts_data_stacked problem_configuration = { "time_index": "date", "gap": 1, "max_delay": 0, "forecast_horizon": 2, + "series_id": "series_id" if is_multiseries(problem_type) else None, } automl = AutoMLSearch( X_train=X, @@ -2306,6 +2322,28 @@ def test_time_series_regression_with_parameters(ts_data): ) +def test_multiseries_time_series_parameters_missing_series_id( + multiseries_ts_data_stacked, +): + X, y = multiseries_ts_data_stacked + problem_configuration = { + "time_index": "date", + "gap": 1, + "max_delay": 0, + "forecast_horizon": 2, + } + with pytest.raises( + ValueError, + match="Must provide 'series_id' column in problem_configuration", + ): + AutoMLSearch( + X_train=X, + y_train=y, + problem_type="multiseries time series regression", + problem_configuration=problem_configuration, + ) + + @pytest.mark.parametrize("graph_type", ["dict", "cg"]) def test_automl_accepts_component_graphs(graph_type, X_y_binary): X, y = 
X_y_binary @@ -4007,7 +4045,7 @@ def test_automl_baseline_pipeline_predictions_and_scores(problem_type): [ problem_type for problem_type in ProblemTypes.all_problem_types - if is_time_series(problem_type) + if is_time_series(problem_type) and not is_multiseries(problem_type) ], ) def test_automl_baseline_pipeline_predictions_and_scores_time_series(problem_type): @@ -4048,7 +4086,6 @@ def test_automl_baseline_pipeline_predictions_and_scores_time_series(problem_typ baseline.fit(X_train, y_train) expected_predictions = y.shift(1)[4:] - expected_predictions = expected_predictions if problem_type != ProblemTypes.TIME_SERIES_REGRESSION: expected_predictions = pd.Series( expected_predictions, @@ -4069,6 +4106,28 @@ def test_automl_baseline_pipeline_predictions_and_scores_time_series(problem_typ np.testing.assert_allclose(baseline.feature_importance.iloc[:, 1], importance) +def test_automl_multiseries_baseline_generation(multiseries_ts_data_stacked): + X, y = multiseries_ts_data_stacked + + automl = AutoMLSearch( + X, + y, + problem_type="multiseries time series regression", + problem_configuration={ + "time_index": "date", + "gap": 0, + "max_delay": 1, + "forecast_horizon": 1, + "series_id": "series_id", + }, + ) + baseline = automl._get_baseline_pipeline() + assert baseline.component_graph.compute_order == [ + "Time Series Featurizer", + "Multiseries Time Series Baseline Regressor", + ] + + @pytest.mark.parametrize( "objective,errors", [ @@ -4194,6 +4253,7 @@ def test_automl_drop_unknown_columns(columns, AutoMLTestEnv, X_y_binary, caplog) ProblemTypes.TIME_SERIES_REGRESSION, ProblemTypes.TIME_SERIES_BINARY, ProblemTypes.TIME_SERIES_MULTICLASS, + ProblemTypes.MULTISERIES_TIME_SERIES_REGRESSION, ], ) def test_data_splitter_gives_pipelines_same_data( @@ -4202,6 +4262,7 @@ def test_data_splitter_gives_pipelines_same_data( X_y_binary, X_y_multi, X_y_regression, + multiseries_ts_data_stacked, ): problem_configuration = None if automl_type == ProblemTypes.BINARY: @@ -4215,10 +4276,24 @@ def test_data_splitter_gives_pipelines_same_data( "gap": 1, "max_delay": 1, "time_index": 0, - "forecast_horizon": 10, + "forecast_horizon": 2, } X, y = X_y_regression X.index = pd.DatetimeIndex(pd.date_range("01-01-2022", periods=len(X))) + elif automl_type == ProblemTypes.MULTISERIES_TIME_SERIES_REGRESSION: + problem_configuration = { + "gap": 1, + "max_delay": 1, + "time_index": "date", + "forecast_horizon": 2, + "series_id": "series_id", + } + X, _ = multiseries_ts_data_stacked + # Can't use range() to generate y data for VARMAX, as the y columns will be linearly dependent + y = pd.Series( + (random.randint(0, 100) for _ in range(len(X))), + name="target", + ) else: problem_configuration = { "gap": 1, @@ -4672,6 +4747,34 @@ def test_cv_ranking_scores_time_series( assert cv_vals[0] == validation_vals[0] +def test_cv_split_multiseries_order(multiseries_ts_data_stacked, AutoMLTestEnv): + X, _ = multiseries_ts_data_stacked + # Can't use range() to generate y data for VARMAX, as the y columns will be linearly dependent + y = pd.Series( + (random.randint(0, 100) for _ in range(len(X))), + name="target", + ) + # Dates ordered by series means if we do a time series split, we'll have separate series in train and test + X = X.sort_values(["series_id"]) + y = y[X.index].reset_index(drop=True) + X = X.reset_index(drop=True) + problem_configuration = { + "time_index": "date", + "gap": 0, + "max_delay": 0, + "forecast_horizon": 6, + "series_id": "series_id", + } + automl = AutoMLSearch( + X_train=X, + y_train=y, + 
problem_type="time series regression", + problem_configuration=problem_configuration, + n_jobs=1, + ) + automl.search() + + @pytest.mark.parametrize("algorithm,batches", [("iterative", 2), ("default", 3)]) @pytest.mark.parametrize( "parameter,expected", @@ -5111,15 +5214,17 @@ def test_exclude_featurizers( problem_type, input_type, get_test_data_from_configuration, + multiseries_ts_data_stacked, AutoMLTestEnv, ): parameters = {} if is_time_series(problem_type): parameters = { - "time_index": "dates", + "time_index": "date" if is_multiseries(problem_type) else "dates", "gap": 1, "max_delay": 1, "forecast_horizon": 1, + "series_id": "series_id" if is_multiseries(problem_type) else None, } X, y = get_test_data_from_configuration( @@ -5127,6 +5232,8 @@ def test_exclude_featurizers( problem_type, column_names=["dates", "text", "email", "url"], ) + if is_multiseries(problem_type): + X, y = multiseries_ts_data_stacked automl = AutoMLSearch( X_train=X, diff --git a/evalml/tests/automl_tests/test_automl_utils.py b/evalml/tests/automl_tests/test_automl_utils.py index 4fc64a4c8a..1f3b85ec7c 100644 --- a/evalml/tests/automl_tests/test_automl_utils.py +++ b/evalml/tests/automl_tests/test_automl_utils.py @@ -20,7 +20,7 @@ RegressionPipeline, ) from evalml.preprocessing.data_splitters import TimeSeriesSplit, TrainingValidationSplit -from evalml.problem_types import ProblemTypes +from evalml.problem_types import ProblemTypes, is_multiseries, is_time_series from evalml.utils.woodwork_utils import infer_feature_types @@ -75,19 +75,18 @@ def test_make_data_splitter_default(problem_type, large_data): if large_data: n = _LARGE_DATA_ROW_THRESHOLD + 1 X = pd.DataFrame({"col_0": list(range(n)), "target": list(range(n))}) + if is_multiseries(problem_type): + X["series_id"] = pd.Series(range(n)) % 2 y = X.pop("target") problem_configuration = None - if problem_type in [ - ProblemTypes.TIME_SERIES_REGRESSION, - ProblemTypes.TIME_SERIES_BINARY, - ProblemTypes.TIME_SERIES_MULTICLASS, - ]: + if is_time_series(problem_type): problem_configuration = { "gap": 1, "max_delay": 7, "time_index": "foo", "forecast_horizon": 4, + "series_id": "series_id" if is_multiseries(problem_type) else None, } data_splitter = make_data_splitter( @@ -127,6 +126,7 @@ def test_make_data_splitter_default(problem_type, large_data): ProblemTypes.TIME_SERIES_REGRESSION, ProblemTypes.TIME_SERIES_BINARY, ProblemTypes.TIME_SERIES_MULTICLASS, + ProblemTypes.MULTISERIES_TIME_SERIES_REGRESSION, ]: assert isinstance(data_splitter, TimeSeriesSplit) assert data_splitter.n_splits == 3 @@ -135,6 +135,10 @@ def test_make_data_splitter_default(problem_type, large_data): assert data_splitter.forecast_horizon == 4 assert data_splitter.time_index == "foo" assert data_splitter.is_cv + if is_multiseries(problem_type): + assert data_splitter._splitter.test_size == 8 + else: + assert data_splitter._splitter.test_size == 4 @pytest.mark.parametrize( diff --git a/evalml/tests/automl_tests/test_default_algorithm.py b/evalml/tests/automl_tests/test_default_algorithm.py index e4298c8fc2..b21cc452cb 100644 --- a/evalml/tests/automl_tests/test_default_algorithm.py +++ b/evalml/tests/automl_tests/test_default_algorithm.py @@ -23,7 +23,7 @@ TimeSeriesFeaturizer, URLFeaturizer, ) -from evalml.problem_types import ProblemTypes, is_time_series +from evalml.problem_types import ProblemTypes, is_multiseries, is_time_series def test_default_algorithm_init(X_y_binary): @@ -61,6 +61,15 @@ def test_default_algorithm_init(X_y_binary): ) assert algo.default_max_batches == 3 + algo = 
DefaultAlgorithm( + X, + y, + ProblemTypes.MULTISERIES_TIME_SERIES_REGRESSION, + sampler_name, + verbose=True, + ) + assert algo.default_max_batches == 1 + def test_default_algorithm_search_parameters_error(X_y_binary): X, y = X_y_binary @@ -634,6 +643,46 @@ def test_default_algorithm_time_series( assert len(long_estimators) == 3 +def test_default_algorithm_multiseries_time_series( + multiseries_ts_data_stacked, +): + X, y = multiseries_ts_data_stacked + problem_type = "multiseries time series regression" + sampler_name = None + + search_parameters = { + "pipeline": { + "time_index": "date", + "gap": 1, + "max_delay": 3, + "delay_features": False, + "forecast_horizon": 10, + "series_id": "series_id", + }, + } + + algo = DefaultAlgorithm( + X, + y, + problem_type, + sampler_name, + search_parameters=search_parameters, + ) + + first_batch = algo.next_batch() + assert len(first_batch) == 1 + pipeline = first_batch[0] + assert pipeline.model_family == ModelFamily.VARMAX + assert pipeline.parameters["pipeline"] == search_parameters["pipeline"] + + add_result(algo, first_batch) + + long_explore = algo.next_batch() + long_estimators = set([pipeline.estimator.name for pipeline in long_explore]) + assert len(long_explore) == 50 + assert len(long_estimators) == 1 + + @pytest.mark.parametrize( "problem_type", [ @@ -804,6 +853,7 @@ def test_default_algorithm_accept_features( "max_delay": 3, "delay_features": False, "forecast_horizon": 10, + "series_id": "series_id" if is_multiseries(problem_type) else None, } algo = DefaultAlgorithm( @@ -987,6 +1037,8 @@ def test_exclude_featurizers_default_algorithm( "max_delay": 1, "forecast_horizon": 3, } + if is_multiseries(problem_type): + parameters["series_id"] = "series_id" X, y = get_test_data_from_configuration( input_type, diff --git a/evalml/tests/automl_tests/test_iterative_algorithm.py b/evalml/tests/automl_tests/test_iterative_algorithm.py index 2859db1121..f5ed9b73ac 100644 --- a/evalml/tests/automl_tests/test_iterative_algorithm.py +++ b/evalml/tests/automl_tests/test_iterative_algorithm.py @@ -23,7 +23,7 @@ ) from evalml.pipelines.components.utils import get_estimators from evalml.pipelines.utils import make_pipeline -from evalml.problem_types import ProblemTypes, is_time_series +from evalml.problem_types import ProblemTypes, is_multiseries, is_time_series @pytest.fixture @@ -65,22 +65,46 @@ def __init__( return _method +@pytest.mark.parametrize( + "problem_type", + ["binary", "multiseries time series regression"], +) def test_iterative_algorithm_init( + problem_type, X_y_binary, + multiseries_ts_data_stacked, ): - X, y = X_y_binary - algo = IterativeAlgorithm(X=X, y=y, problem_type="binary") + X, y = X_y_binary if problem_type == "binary" else multiseries_ts_data_stacked + + search_parameters = { + "pipeline": { + "time_index": "date", + "gap": 1, + "max_delay": 3, + "delay_features": False, + "forecast_horizon": 10, + "series_id": "series_id", + }, + } + + algo = IterativeAlgorithm( + X=X, + y=y, + problem_type=problem_type, + search_parameters=search_parameters, + ) assert algo.pipeline_number == 0 assert algo.batch_number == 0 assert algo.default_max_batches == 1 - estimators = get_estimators("binary") + estimators = get_estimators(problem_type) assert len(algo.allowed_pipelines) == len( [ make_pipeline( X, y, estimator, - "binary", + problem_type, + parameters=search_parameters, ) for estimator in estimators ], @@ -1079,6 +1103,8 @@ def test_exclude_featurizers_iterative_algorithm( "max_delay": 1, "forecast_horizon": 3, } + if 
is_multiseries(problem_type): + parameters["series_id"] = "series_id" X, y = get_test_data_from_configuration( input_type, diff --git a/evalml/tests/component_tests/test_components.py b/evalml/tests/component_tests/test_components.py index 9eabfe869b..0d8650eb71 100644 --- a/evalml/tests/component_tests/test_components.py +++ b/evalml/tests/component_tests/test_components.py @@ -67,6 +67,7 @@ TimeSeriesRegularizer, Transformer, Undersampler, + VARMAXRegressor, XGBoostClassifier, XGBoostRegressor, ) @@ -1226,10 +1227,12 @@ def test_all_estimators_check_fit( ProblemTypes.TIME_SERIES_REGRESSION in component_class.supported_problem_types ): - if component_class.is_multiseries: - X, _, y = ts_multiseries_data() - else: - X, _, y = ts_data() + X, _, y = ts_data() + elif ( + ProblemTypes.MULTISERIES_TIME_SERIES_REGRESSION + in component_class.supported_problem_types + ): + X, _, y = ts_multiseries_data() else: X, y = X_y_binary @@ -1365,9 +1368,13 @@ def test_serialization( PolynomialDecomposer, STLDecomposer, ] + requires_multiseries_data = [ + MultiseriesTimeSeriesBaselineRegressor, + VARMAXRegressor, + ] component = helper_functions.safe_init_component_with_njobs_1(component_class) - if component.is_multiseries: + if component_class in requires_multiseries_data: component = component_class(time_index="date") X, _, y = ts_multiseries_data() elif component_class in requires_time_index: @@ -1739,16 +1746,16 @@ def test_estimator_fit_respects_custom_indices( if ProblemTypes.REGRESSION in supported_problem_types: X, y = X_y_regression elif ProblemTypes.TIME_SERIES_REGRESSION in supported_problem_types: - if estimator_class.is_multiseries: - X, _, y = ts_multiseries_data( - train_features_index_dt=False, - train_target_index_dt=False, - ) - else: - X, _, y = ts_data( - train_features_index_dt=False, - train_target_index_dt=False, - ) + X, _, y = ts_data( + train_features_index_dt=False, + train_target_index_dt=False, + ) + ts_problem = True + elif ProblemTypes.MULTISERIES_TIME_SERIES_REGRESSION in supported_problem_types: + X, _, y = ts_multiseries_data( + train_features_index_dt=False, + train_target_index_dt=False, + ) ts_problem = True else: X, y = X_y_binary @@ -1929,7 +1936,7 @@ def test_components_support_nullable_types( """Confirm that components without any nullable type incompatibilities can actually use all the nullable types in X and y in fit and predict/transform. 
If a new component is added that has nullable type incompatibilities, this should fail.""" - cannot_handle_boolean_target = [CatBoostRegressor] + cannot_handle_boolean_target = [CatBoostRegressor, VARMAXRegressor] if ( component_class == TimeSeriesBaselineEstimator @@ -1952,13 +1959,15 @@ def test_components_support_nullable_types( TimeSeriesRegularizer, PolynomialDecomposer, STLDecomposer, + VARMAXRegressor, ] requires_all_numeric = [PCA, LinearDiscriminantAnalysis] + requires_multiseries_data = [VARMAXRegressor] component = helper_functions.safe_init_component_with_njobs_1(component_class) - if component_class.is_multiseries or component_class in requires_time_index: + if component_class in requires_time_index: component = component_class(time_index="date") - if component_class.is_multiseries: + if component_class in requires_multiseries_data: X, _, y = ts_multiseries_data( train_features_index_dt=False, train_target_index_dt=False, @@ -1977,10 +1986,7 @@ def test_components_support_nullable_types( ) X.ww["bool col"] = bool_col if nullable_y_ltype == "BooleanNullable": - if component_class.is_multiseries: - y = pd.DataFrame({"target_a": bool_col, "target_b": ~bool_col}) - else: - y = bool_col + y = bool_col else: y = nullable_type_target(ltype=nullable_y_ltype, has_nans=False) X = nullable_type_test_data(has_nans=False) diff --git a/evalml/tests/component_tests/test_estimators.py b/evalml/tests/component_tests/test_estimators.py index d3278e857f..5bbf8dee1a 100644 --- a/evalml/tests/component_tests/test_estimators.py +++ b/evalml/tests/component_tests/test_estimators.py @@ -27,6 +27,7 @@ def test_estimators_feature_name_with_random_ascii( "ARIMARegressor", "ExponentialSmoothingRegressor", "ProphetRegressor", + "VARMAXRegressor", ]: continue supported_problem_types = [ @@ -182,7 +183,11 @@ def test_estimator_predict_output_type(X_y_binary, helper_functions): for component_class in _all_estimators_used_in_search(): for X, y, X_cols_expected, y_cols_expected, time_series in datatype_combos: - if component_class.name in ["ARIMA Regressor", "Prophet Regressor"]: + if component_class.name in [ + "ARIMA Regressor", + "Prophet Regressor", + "VARMAX Regressor", + ]: continue print( 'Checking output of predict for estimator "{}" on X type {} cols {}, y type {} name {}'.format( diff --git a/evalml/tests/component_tests/test_multiseries_baseline_regressor.py b/evalml/tests/component_tests/test_multiseries_baseline_regressor.py index 1e958fb507..1b5d1a0e9f 100644 --- a/evalml/tests/component_tests/test_multiseries_baseline_regressor.py +++ b/evalml/tests/component_tests/test_multiseries_baseline_regressor.py @@ -6,12 +6,15 @@ MultiseriesTimeSeriesBaselineRegressor, TimeSeriesFeaturizer, ) +from evalml.problem_types import ProblemTypes def test_multiseries_time_series_baseline_regressor_init(): baseline = MultiseriesTimeSeriesBaselineRegressor() assert baseline.model_family == ModelFamily.BASELINE - assert baseline.is_multiseries + assert baseline.supported_problem_types == [ + ProblemTypes.MULTISERIES_TIME_SERIES_REGRESSION, + ] assert baseline.start_delay == 2 baseline = MultiseriesTimeSeriesBaselineRegressor(gap=2, forecast_horizon=5) diff --git a/evalml/tests/component_tests/test_utils.py b/evalml/tests/component_tests/test_utils.py index 9434cbb9d1..552ba00d26 100644 --- a/evalml/tests/component_tests/test_utils.py +++ b/evalml/tests/component_tests/test_utils.py @@ -186,6 +186,7 @@ def test_scikit_learn_wrapper(X_y_binary, X_y_multi, X_y_regression): ProblemTypes.TIME_SERIES_REGRESSION, 
ProblemTypes.TIME_SERIES_MULTICLASS, ProblemTypes.TIME_SERIES_BINARY, + ProblemTypes.MULTISERIES_TIME_SERIES_REGRESSION, ]: continue diff --git a/evalml/tests/component_tests/test_varmax_regressor.py b/evalml/tests/component_tests/test_varmax_regressor.py index 75efa173ae..9f4acd067f 100644 --- a/evalml/tests/component_tests/test_varmax_regressor.py +++ b/evalml/tests/component_tests/test_varmax_regressor.py @@ -20,7 +20,7 @@ def test_model_family(): def test_problem_types(): assert set(VARMAXRegressor.supported_problem_types) == { - ProblemTypes.TIME_SERIES_REGRESSION, + ProblemTypes.MULTISERIES_TIME_SERIES_REGRESSION, } @@ -48,7 +48,7 @@ def test_remove_datetime_feature( datetime_feature=True, ) - clf = VARMAXRegressor() + clf = VARMAXRegressor(use_covariates=True) clf.fit(X_train, y_train) assert "date" not in mock_fit.call_args.kwargs["X"] @@ -95,6 +95,7 @@ def test_feature_importance(ts_multiseries_data): (False, True, True, False, False, True), ], ) +@pytest.mark.parametrize("use_covariates", [True, False]) def test_fit_predict( train_features_index_dt, train_target_index_dt, @@ -102,6 +103,7 @@ def test_fit_predict( no_features, datetime_feature, test_features_index_dt, + use_covariates, ts_multiseries_data, ): from sktime.forecasting.base import ForecastingHorizon @@ -119,10 +121,14 @@ def test_fit_predict( fh_ = ForecastingHorizon([i + 1 for i in range(len(X_test))], is_relative=True) a_clf = VARMAX(maxiter=10) - clf = a_clf.fit(X=X_train, y=y_train) - y_pred_sk = clf.predict(fh=fh_, X=X_test) - - m_clf = VARMAXRegressor(maxiter=10) + if use_covariates: + clf = a_clf.fit(X=X_train, y=y_train) + y_pred_sk = clf.predict(fh=fh_, X=X_test) + else: + clf = a_clf.fit(y=y_train) + y_pred_sk = clf.predict(fh=fh_) + + m_clf = VARMAXRegressor(maxiter=10, use_covariates=use_covariates) m_clf.fit(X=X_train, y=y_train) y_pred = m_clf.predict(X=X_test) np.testing.assert_almost_equal(y_pred_sk.values, y_pred.values) @@ -178,10 +184,12 @@ def test_fit_predict_sk_failure( @pytest.mark.parametrize("freq_num", ["1", "2"]) @pytest.mark.parametrize("freq_str", ["T", "M", "Y"]) +@pytest.mark.parametrize("use_covariates", [True, False]) def test_different_time_units_out_of_sample( freq_str, freq_num, ts_multiseries_data, + use_covariates, ): from sktime.forecasting.base import ForecastingHorizon from sktime.forecasting.varmax import VARMAX @@ -190,10 +198,14 @@ def test_different_time_units_out_of_sample( fh_ = ForecastingHorizon([i + 1 for i in range(len(y[15:]))], is_relative=True) a_clf = VARMAX(maxiter=10) - clf = a_clf.fit(X=X[:15], y=y[:15]) - y_pred_sk = clf.predict(fh=fh_, X=X[15:]) - - m_clf = VARMAXRegressor() + if use_covariates: + clf = a_clf.fit(X=X[:15], y=y[:15]) + y_pred_sk = clf.predict(fh=fh_, X=X[15:]) + else: + clf = a_clf.fit(y=y[:15]) + y_pred_sk = clf.predict(fh=fh_) + + m_clf = VARMAXRegressor(use_covariates=use_covariates) m_clf.fit(X=X[:15], y=y[:15]) y_pred = m_clf.predict(X=X[15:]) @@ -218,7 +230,7 @@ def test_varmax_supports_boolean_features(): X.ww.init() y = pd.DataFrame({"target_1": np.random.rand(10), "target_2": np.random.rand(10)}) - vx = VARMAXRegressor(time_index="dates") + vx = VARMAXRegressor(time_index="dates", use_covariates=True) with patch.object(VARMAX, "fit") as mock_fit: vx.fit(X, y) @@ -260,6 +272,18 @@ def test_varmax_regressor_respects_use_covariates( assert "X" not in mock_predict.call_args.kwargs +@patch("sktime.forecasting.varmax.VARMAX.fit") +def test_varmax_regressor_X_datetime_only(mock_fit, multiseries_ts_data_unstacked): + X, y = 
multiseries_ts_data_unstacked + X.ww.init() + X = X.ww.select(include=["Datetime"]) + + clf = VARMAXRegressor(use_covariates=True) + clf.fit(X, y) + + assert "X" not in mock_fit.call_args.kwargs + + def test_varmax_regressor_can_forecast_arbitrary_dates_no_covariates( ts_multiseries_data, ): diff --git a/evalml/tests/conftest.py b/evalml/tests/conftest.py index 6306844a94..3440e5ec91 100644 --- a/evalml/tests/conftest.py +++ b/evalml/tests/conftest.py @@ -1960,6 +1960,7 @@ def _pipeline_class(self): ProblemTypes.TIME_SERIES_REGRESSION: "evalml.pipelines.TimeSeriesRegressionPipeline", ProblemTypes.TIME_SERIES_MULTICLASS: "evalml.pipelines.TimeSeriesMulticlassClassificationPipeline", ProblemTypes.TIME_SERIES_BINARY: "evalml.pipelines.TimeSeriesBinaryClassificationPipeline", + ProblemTypes.MULTISERIES_TIME_SERIES_REGRESSION: "evalml.pipelines.MultiseriesRegressionPipeline", }[self.problem_type] def _patch_method(self, method, side_effect, return_value, pipeline_class_str=None): diff --git a/evalml/tests/integration_tests/test_nullable_types.py b/evalml/tests/integration_tests/test_nullable_types.py index 437311ad17..57b7e8c505 100644 --- a/evalml/tests/integration_tests/test_nullable_types.py +++ b/evalml/tests/integration_tests/test_nullable_types.py @@ -6,7 +6,7 @@ from evalml.pipelines import RegressionPipeline from evalml.pipelines.components import EmailFeaturizer, Imputer, URLFeaturizer from evalml.pipelines.components.transformers import ReplaceNullableTypes -from evalml.problem_types import ProblemTypes, is_time_series +from evalml.problem_types import ProblemTypes, is_multiseries, is_time_series @pytest.mark.parametrize("input_type", ["pd", "ww"]) @@ -41,6 +41,7 @@ def test_nullable_types_builds_pipelines( "gap": 1, "max_delay": 1, "forecast_horizon": 3, + "series_id": "series_id" if is_multiseries(problem_type) else None, } X, y = get_test_data_from_configuration( @@ -49,6 +50,8 @@ def test_nullable_types_builds_pipelines( column_names=column_names, nullable_target=True if "nullable target" in test_description else False, ) + if is_multiseries(problem_type): + X["series_id"] = pd.Series([0] * len(X)) automl = AutoMLSearch( X_train=X, @@ -60,8 +63,9 @@ def test_nullable_types_builds_pipelines( if automl_algorithm == "iterative": pipelines = [pl.name for pl in automl.allowed_pipelines] elif automl_algorithm == "default": + n_batches = 1 if is_multiseries(problem_type) else 2 # TODO: Upon resolution of GH Issue #3186, increase the num of batches. - for _ in range(2): + for _ in range(n_batches): pipelines = [pl.name for pl in automl.automl_algorithm.next_batch()] # A check to make sure we actually retrieve constructed pipelines from the algo. 
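# --- Illustrative sketch, not part of the patch above ---
# Only one estimator is registered for the new problem type, the VARMAX regressor:
from evalml.pipelines.components.utils import get_estimators
from evalml.problem_types import ProblemTypes

estimators = get_estimators(ProblemTypes.MULTISERIES_TIME_SERIES_REGRESSION)
assert [estimator.name for estimator in estimators] == ["VARMAX Regressor"]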
@@ -129,6 +133,7 @@ def test_automl_search_with_nullable_types( elif ( problem_type == ProblemTypes.REGRESSION or problem_type == ProblemTypes.TIME_SERIES_REGRESSION + or problem_type == ProblemTypes.MULTISERIES_TIME_SERIES_REGRESSION ): y = nullable_type_target(ltype="IntegerNullable", has_nans=False) @@ -139,7 +144,10 @@ def test_automl_search_with_nullable_types( "gap": 1, "max_delay": 1, "forecast_horizon": 3, + "series_id": "series_id" if is_multiseries(problem_type) else None, } + if is_multiseries(problem_type): + X["series_id"] = pd.Series([0] * len(X)) automl = AutoMLSearch( X_train=X, diff --git a/evalml/tests/model_understanding_tests/test_visualizations.py b/evalml/tests/model_understanding_tests/test_visualizations.py index 66ac019c57..d963fc4399 100644 --- a/evalml/tests/model_understanding_tests/test_visualizations.py +++ b/evalml/tests/model_understanding_tests/test_visualizations.py @@ -28,13 +28,49 @@ ElasticNetRegressor, LinearRegressor, MulticlassClassificationPipeline, + MultiseriesRegressionPipeline, RegressionPipeline, TimeSeriesRegressionPipeline, ) +from evalml.preprocessing import split_multiseries_data from evalml.problem_types import ProblemTypes from evalml.utils import get_random_state, infer_feature_types +@pytest.fixture(scope="module") +def component_graph_multiseries(): + return { + "Time Series Featurizer": ["Time Series Featurizer", "X", "y"], + "Baseline Multiseries": [ + "Multiseries Time Series Baseline Regressor", + "Time Series Featurizer.x", + "y", + ], + } + + +@pytest.fixture(scope="module") +def pipeline_parameters_multiseries(): + return { + "pipeline": { + "time_index": "date", + "max_delay": 10, + "forecast_horizon": 7, + "gap": 0, + "series_id": "series_id", + }, + "Time Series Featurizer": { + "time_index": "date", + "max_delay": 10, + "forecast_horizon": 7, + "gap": 0, + "delay_features": False, + "delay_target": True, + }, + "Baseline Multiseries": {"gap": 0, "forecast_horizon": 7}, + } + + @pytest.mark.parametrize("data_type", ["np", "pd", "ww"]) def test_cost_benefit_matrix_vs_threshold( data_type, @@ -346,6 +382,35 @@ def test_get_prediction_vs_actual_over_time_data(ts_data): assert list(results.columns) == ["dates", "target", "prediction"] +def test_get_prediction_vs_actual_over_time_data_multiseries( + multiseries_ts_data_stacked, + component_graph_multiseries, + pipeline_parameters_multiseries, +): + X, y = multiseries_ts_data_stacked + X_train, _, y_train, _ = split_multiseries_data( + X, + y, + "series_id", + "date", + ) + pipeline = MultiseriesRegressionPipeline( + component_graph_multiseries, + pipeline_parameters_multiseries, + ) + pipeline.fit(X_train, y_train) + results = get_prediction_vs_actual_over_time_data( + pipeline, + X, + y, + X_train, + y_train, + pd.Series(X["date"]), + ) + assert isinstance(results, pd.DataFrame) + assert list(results.columns) == ["dates", "target", "prediction", "series_id"] + + def test_graph_prediction_vs_actual_over_time(ts_data, go): X, _, y = ts_data() X_train, y_train = X.iloc[:30], y.iloc[:30] @@ -407,6 +472,63 @@ class NotTSPipeline: ) +@pytest.mark.parametrize("single_series", ["0", None]) +def test_graph_prediction_vs_actual_over_time_multiseries( + multiseries_ts_data_stacked, + go, + component_graph_multiseries, + pipeline_parameters_multiseries, + single_series, +): + X, y = multiseries_ts_data_stacked + X_train, _, y_train, _ = split_multiseries_data( + X, + y, + "series_id", + "date", + ) + pipeline = MultiseriesRegressionPipeline( + component_graph_multiseries, + 
pipeline_parameters_multiseries, + ) + pipeline.fit(X_train, y_train) + fig = graph_prediction_vs_actual_over_time( + pipeline, + X, + y, + X_train, + y_train, + X["date"], + single_series=single_series, + ) + assert isinstance(fig, go.Figure) + + fig_dict = fig.to_dict() + + if single_series is not None: + assert fig_dict["layout"]["title"]["text"] == "Graph for Series 0" + assert len(fig_dict["data"]) == 2 + else: + assert fig_dict["layout"]["title"]["text"] == "Graph for Multiseries" + # there's 5 series, and each series has two lines (one each for target/prediction) + assert len(fig_dict["data"]) == 10 + + assert fig_dict["layout"]["xaxis"]["title"]["text"] == "Time" + assert fig_dict["layout"]["yaxis"]["title"]["text"] == "target" + + curr_series = 0 + for i in range(len(fig_dict["data"])): + assert len(fig_dict["data"][i]["x"]) == len(X["date"].unique()) + assert len(fig_dict["data"][i]["y"]) == len(X["date"].unique()) + assert not np.isnan(fig_dict["data"][i]["y"]).all() + + if i % 2 == 0: + assert fig_dict["data"][i]["name"] == f"Series {curr_series}: Target" + else: + assert fig_dict["data"][i]["name"] == f"Series {curr_series}: Prediction" + curr_series += 1 + + def test_decision_tree_data_from_estimator_not_fitted(tree_estimators): est_class, _ = tree_estimators with pytest.raises( diff --git a/evalml/tests/pipeline_tests/test_pipeline_utils.py b/evalml/tests/pipeline_tests/test_pipeline_utils.py index 8af8426b70..92eb95cc0e 100644 --- a/evalml/tests/pipeline_tests/test_pipeline_utils.py +++ b/evalml/tests/pipeline_tests/test_pipeline_utils.py @@ -58,7 +58,7 @@ stack_X, unstack_multiseries, ) -from evalml.problem_types import ProblemTypes, is_time_series +from evalml.problem_types import ProblemTypes, is_multiseries, is_time_series @pytest.mark.parametrize("input_type", ["pd", "ww"]) @@ -94,6 +94,7 @@ def test_make_pipeline( test_description, column_names, get_test_data_from_configuration, + multiseries_ts_data_stacked, ): X, y = get_test_data_from_configuration( input_type, @@ -112,6 +113,9 @@ def test_make_pipeline( "gap": 1, "max_delay": 1, "forecast_horizon": 3, + "series_id": "series_id" + if is_multiseries(problem_type) + else None, }, } @@ -165,22 +169,25 @@ def test_make_pipeline( ) if is_time_series(problem_type): - expected_components = ( - dfs - + label_encoder - + email_featurizer - + url_featurizer - + drop_null - + natural_language_featurizer - + imputer - + delayed_features - + decomposer - + datetime - + ohe - + drop_nan_rows_transformer - + standard_scaler - + [estimator_class] - ) + if is_multiseries(problem_type): + expected_components = dfs + [estimator_class] + else: + expected_components = ( + dfs + + label_encoder + + email_featurizer + + url_featurizer + + drop_null + + natural_language_featurizer + + imputer + + delayed_features + + decomposer + + datetime + + ohe + + drop_nan_rows_transformer + + standard_scaler + + [estimator_class] + ) else: expected_components = ( dfs @@ -610,6 +617,14 @@ def test_get_estimators(): ) assert len(get_estimators(problem_type=ProblemTypes.MULTICLASS)) == 6 assert len(get_estimators(problem_type=ProblemTypes.REGRESSION)) == 5 + assert ( + len( + get_estimators( + problem_type=ProblemTypes.MULTISERIES_TIME_SERIES_REGRESSION, + ), + ) + == 1 + ) assert len(get_estimators(problem_type=ProblemTypes.BINARY, model_families=[])) == 0 assert ( @@ -1457,9 +1472,13 @@ def test_stack_data_noop(): pd.testing.assert_series_equal(stack_data(series_y), series_y) +@pytest.mark.parametrize("series_id_values_type", [set, list]) 
+@pytest.mark.parametrize("no_features", [True, False]) @pytest.mark.parametrize("starting_index", [None, 1, 132]) def test_stack_X( starting_index, + no_features, + series_id_values_type, multiseries_ts_data_stacked, multiseries_ts_data_unstacked, ): @@ -1469,7 +1488,28 @@ def test_stack_X( if starting_index is not None: X_expected.index = X_expected.index + starting_index - X_transformed = stack_X(X, "series_id", "date", starting_index=starting_index) + if no_features: + series_id_values = series_id_values_type(str(i) for i in range(0, 5)) + X = pd.DataFrame(X["date"]) + X_expected = X_expected[["date", "series_id"]] + + with pytest.raises( + ValueError, + match="Series ID values need to be passed in X column values or as a set with the `series_id_values` parameter.", + ): + stack_X(X, "series_id", "date", starting_index=starting_index) + + X_transformed = stack_X( + X, + "series_id", + "date", + starting_index=starting_index, + series_id_values=series_id_values, + ) + + else: + X_transformed = stack_X(X, "series_id", "date", starting_index=starting_index) + pd.testing.assert_frame_equal( X_expected.sort_index(axis=1), X_transformed.sort_index(axis=1), diff --git a/evalml/tests/pipeline_tests/test_pipelines.py b/evalml/tests/pipeline_tests/test_pipelines.py index 91f1e3c9f8..96a9b0f973 100644 --- a/evalml/tests/pipeline_tests/test_pipelines.py +++ b/evalml/tests/pipeline_tests/test_pipelines.py @@ -114,9 +114,9 @@ def test_all_estimators( is_using_conda, ): if is_using_conda: - n_estimators = 13 - else: n_estimators = 14 + else: + n_estimators = 15 assert len(_all_estimators_used_in_search()) == n_estimators @@ -2037,6 +2037,8 @@ def test_predict_has_input_target_name( time_series_binary_classification_pipeline_class, time_series_multiclass_classification_pipeline_class, ): + if problem_type == ProblemTypes.MULTISERIES_TIME_SERIES_REGRESSION: + pytest.skip("Multiseries time series regression case tested elsewhere") if problem_type == ProblemTypes.BINARY: X, y = X_y_binary clf = logistic_regression_binary_pipeline diff --git a/evalml/tests/preprocessing_tests/test_split_data.py b/evalml/tests/preprocessing_tests/test_split_data.py index 9403862ef4..cbb8c941ed 100644 --- a/evalml/tests/preprocessing_tests/test_split_data.py +++ b/evalml/tests/preprocessing_tests/test_split_data.py @@ -127,9 +127,13 @@ def test_split_data_ts(test, X_y_regression): assert len(y_test) == test_size -def test_split_multiseries_data(multiseries_ts_data_stacked): +@pytest.mark.parametrize("no_features", [True, False]) +def test_split_multiseries_data(no_features, multiseries_ts_data_stacked): X, y = multiseries_ts_data_stacked + if no_features: + X = X[["date", "series_id"]] + X_train_expected, X_holdout_expected = X[:-10], X[-10:] y_train_expected, y_holdout_expected = y[:-10], y[-10:] diff --git a/evalml/tests/problem_type_tests/test_problem_types.py b/evalml/tests/problem_type_tests/test_problem_types.py index 4f06cce49b..d848392a4f 100644 --- a/evalml/tests/problem_type_tests/test_problem_types.py +++ b/evalml/tests/problem_type_tests/test_problem_types.py @@ -24,6 +24,7 @@ def correct_problem_types(): ProblemTypes.TIME_SERIES_REGRESSION, ProblemTypes.TIME_SERIES_BINARY, ProblemTypes.TIME_SERIES_MULTICLASS, + ProblemTypes.MULTISERIES_TIME_SERIES_REGRESSION, ] yield correct_problem_types @@ -36,6 +37,7 @@ def test_handle_string(correct_problem_types): ProblemTypes.TIME_SERIES_REGRESSION, "time series binary", "time series multiclass", + ProblemTypes.MULTISERIES_TIME_SERIES_REGRESSION, ] for problem_type in 
zip(problem_types, correct_problem_types): assert handle_problem_types(problem_type[0]) == problem_type[1] @@ -154,6 +156,7 @@ def test_all_problem_types(): ProblemTypes.TIME_SERIES_REGRESSION, ProblemTypes.TIME_SERIES_BINARY, ProblemTypes.TIME_SERIES_MULTICLASS, + ProblemTypes.MULTISERIES_TIME_SERIES_REGRESSION, ] assert ProblemTypes.all_problem_types == expected @@ -161,7 +164,12 @@ def test_all_problem_types(): @pytest.mark.parametrize("problem_type", ProblemTypes.all_problem_types) def test_type_checks(problem_type): assert is_regression(problem_type) == ( - problem_type in [ProblemTypes.REGRESSION, ProblemTypes.TIME_SERIES_REGRESSION] + problem_type + in [ + ProblemTypes.REGRESSION, + ProblemTypes.TIME_SERIES_REGRESSION, + ProblemTypes.MULTISERIES_TIME_SERIES_REGRESSION, + ] ) assert is_binary(problem_type) == ( problem_type in [ProblemTypes.BINARY, ProblemTypes.TIME_SERIES_BINARY] @@ -184,5 +192,6 @@ def test_type_checks(problem_type): ProblemTypes.TIME_SERIES_BINARY, ProblemTypes.TIME_SERIES_MULTICLASS, ProblemTypes.TIME_SERIES_REGRESSION, + ProblemTypes.MULTISERIES_TIME_SERIES_REGRESSION, ] ) diff --git a/evalml/utils/gen_utils.py b/evalml/utils/gen_utils.py index 2bf8ef24d4..af253fa021 100644 --- a/evalml/utils/gen_utils.py +++ b/evalml/utils/gen_utils.py @@ -216,7 +216,6 @@ def _get_subclasses(base_class): "SVMClassifier", "SVMRegressor", "LinearRegressor", - "VARMAXRegressor", "VowpalWabbitBinaryClassifier", "VowpalWabbitMulticlassClassifier", "VowpalWabbitRegressor",