Skip to content

Commit

Permalink
Integrate multiseries into AutoMLSearch (#4270)
Browse files Browse the repository at this point in the history
* Integration into search, with tests

* Swap is_multiseries logic to problem type (#4278)
  • Loading branch information
eccabay authored Aug 21, 2023
1 parent 24ba211 commit 7781c77
Show file tree
Hide file tree
Showing 31 changed files with 467 additions and 103 deletions.
3 changes: 2 additions & 1 deletion docs/source/release_notes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@ Release Notes
-------------
**Future Releases**
* Enhancements
* Added support for prediction intervals for VARMAX regressor :pr:`4267`
* Integrated multiseries time series into AutoMLSearch :pr:`4270`
* Fixes
* Changes
* Updated ``ARIMARegressor`` to be compatible with sktime v0.22.0 and beyond :pr:`4283`
Expand All @@ -21,7 +23,6 @@ Release Notes
* Added stacking and unstacking utility functions to work with multiseries data :pr:`4250`
* Added multiseries regression pipeline class :pr:`4256`
* Added multiseries VARMAX regressor :pr:`4238`
* Added support for prediction intervals for VARMAX regressor :pr:`4267`
* Fixes
* Added support for pandas 2 :pr:`4216`
* Fixed bug where time series pipelines would fail due to MASE needing ``y_train`` when scoring :pr:`4258`
Expand Down
16 changes: 12 additions & 4 deletions evalml/automl/automl_algorithm/default_algorithm.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
_make_pipeline_from_multiple_graphs,
make_pipeline,
)
from evalml.problem_types import is_regression, is_time_series
from evalml.problem_types import is_multiseries, is_regression, is_time_series
from evalml.utils import infer_feature_types
from evalml.utils.logger import get_logger

Expand Down Expand Up @@ -170,6 +170,8 @@ def default_max_batches(self):
"""Returns the number of max batches AutoMLSearch should run by default."""
if self.ensembling:
return 3
elif is_multiseries(self.problem_type):
return 1
else:
return 2

Expand Down Expand Up @@ -472,11 +474,17 @@ def next_batch(self):
)
# this logic needs to be updated once time series also supports ensembling
elif is_time_series(self.problem_type):
if self._batch_number == 0:
# Skip the naive batch for multiseries time series
batch = (
self._batch_number
if not is_multiseries(self.problem_type)
else self._batch_number + 1
)
if batch == 0:
next_batch = self._create_naive_pipelines()
elif self._batch_number == 1:
elif batch == 1:
next_batch = self._create_fast_final()
elif self.batch_number == 2:
elif batch == 2:
next_batch = self._create_long_exploration(n=self.top_n)
else:
next_batch = self._create_n_pipelines(
Expand Down
19 changes: 19 additions & 0 deletions evalml/automl/automl_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@
handle_problem_types,
is_binary,
is_classification,
is_multiseries,
is_time_series,
)
from evalml.tuners import SKOptTuner
Expand Down Expand Up @@ -403,6 +404,7 @@ class AutoMLSearch:
problem_configuration (dict, None): Additional parameters needed to configure the search. For example,
in time series problems, values should be passed in for the time_index, gap, forecast_horizon, and max_delay variables.
For multiseries time series problems, the values passed in should also include the name of a series_id column.
train_best_pipeline (boolean): Whether or not to train the best pipeline before returning it. Defaults to True.
Expand Down Expand Up @@ -651,6 +653,14 @@ def __init__(
f"Dataset size is too small to create holdout set. Minimum dataset size is {self._HOLDOUT_SET_MIN_ROWS} rows, X_train has {len(X_train)} rows. Holdout set evaluation is disabled.",
)

# For multiseries problems, we need to make sure that the data is primarily ordered by the time_index rather than the series_id
if is_multiseries(self.problem_type):
time_index = self.problem_configuration.get("time_index")
series_id = self.problem_configuration.get("series_id")
X_train = X_train.sort_values([time_index, series_id])
y_train = y_train[X_train.index].reset_index(drop=True)
X_train = X_train.reset_index(drop=True)

# Set holdout data in AutoML search if provided as parameter
self.X_train = infer_feature_types(X_train)
self.y_train = infer_feature_types(y_train)
Expand Down Expand Up @@ -1053,6 +1063,13 @@ def _validate_problem_configuration(self, problem_configuration=None):
is_valid, msg = contains_all_ts_parameters(problem_configuration)
if not is_valid:
raise ValueError(msg)
if (
is_multiseries(self.problem_type)
and "series_id" not in problem_configuration
):
raise ValueError(
"Must provide 'series_id' column in problem_configuration for multiseries time series problems.",
)
return problem_configuration or {}

def _handle_keyboard_interrupt(self):
Expand Down Expand Up @@ -1355,6 +1372,7 @@ def _get_baseline_pipeline(self):
gap = self.problem_configuration["gap"]
forecast_horizon = self.problem_configuration["forecast_horizon"]
time_index = self.problem_configuration["time_index"]
series_id = self.problem_configuration.get("series_id", None)
exclude_timeseries_featurizer = (
"TimeSeriesFeaturizer" in self.exclude_featurizers
)
Expand All @@ -1364,6 +1382,7 @@ def _get_baseline_pipeline(self):
forecast_horizon,
time_index,
exclude_timeseries_featurizer,
series_id,
)
return baseline

Expand Down
3 changes: 3 additions & 0 deletions evalml/automl/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ def get_default_primary_search_objective(problem_type):
"time series regression": "MedianAE",
"time series binary": "Log Loss Binary",
"time series multiclass": "Log Loss Multiclass",
"multiseries time series regression": "MedianAE",
}[problem_type.value]
return get_objective(objective_name, return_instance=True)

Expand Down Expand Up @@ -87,12 +88,14 @@ def make_data_splitter(
raise ValueError(
"problem_configuration is required for time series problem types",
)
series_id = problem_configuration.get("series_id")
return TimeSeriesSplit(
n_splits=n_splits,
gap=problem_configuration.get("gap"),
max_delay=problem_configuration.get("max_delay"),
time_index=problem_configuration.get("time_index"),
forecast_horizon=problem_configuration.get("forecast_horizon"),
n_series=len(X[series_id].unique()) if series_id is not None else None,
)
if X.shape[0] > _LARGE_DATA_ROW_THRESHOLD:
return TrainingValidationSplit(
Expand Down
8 changes: 6 additions & 2 deletions evalml/objectives/regression_objective.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,5 +6,9 @@
class RegressionObjective(ObjectiveBase):
    """Base class for all regression objectives."""

    # Single definition of the problem types listed for regression objectives.
    # The bare string that follows mirrors the value as an attribute docstring,
    # which is the convention this file already uses.
    problem_types = [
        ProblemTypes.REGRESSION,
        ProblemTypes.TIME_SERIES_REGRESSION,
        ProblemTypes.MULTISERIES_TIME_SERIES_REGRESSION,
    ]
    """[ProblemTypes.REGRESSION, ProblemTypes.TIME_SERIES_REGRESSION, ProblemTypes.MULTISERIES_TIME_SERIES_REGRESSION]"""
1 change: 0 additions & 1 deletion evalml/pipelines/components/component_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,6 @@ class ComponentBase(ABC, metaclass=ComponentBaseMeta):
# Referring to the pandas nullable dtypes; not just woodwork logical types
_integer_nullable_incompatibilities = []
_boolean_nullable_incompatibilities = []
is_multiseries = False

def __init__(self, parameters=None, component_obj=None, random_seed=0, **kwargs):
"""Base class for all components.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,12 +25,11 @@ class MultiseriesTimeSeriesBaselineRegressor(Estimator):
"""{}"""
model_family = ModelFamily.BASELINE
"""ModelFamily.BASELINE"""
is_multiseries = True
supported_problem_types = [
ProblemTypes.TIME_SERIES_REGRESSION,
ProblemTypes.MULTISERIES_TIME_SERIES_REGRESSION,
]
"""[
ProblemTypes.TIME_SERIES_REGRESSION,
ProblemTypes.MULTISERIES_TIME_SERIES_REGRESSION,
]"""

def __init__(self, gap=1, forecast_horizon=1, random_seed=0, **kwargs):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -48,10 +48,9 @@ class VARMAXRegressor(Estimator):
"trend": Categorical(['n', 'c', 't', 'ct']),
}"""
model_family = ModelFamily.VARMAX
is_multiseries = True
"""ModelFamily.VARMAX"""
supported_problem_types = [ProblemTypes.TIME_SERIES_REGRESSION]
"""[ProblemTypes.TIME_SERIES_REGRESSION]"""
supported_problem_types = [ProblemTypes.MULTISERIES_TIME_SERIES_REGRESSION]
"""[ProblemTypes.MULTISERIES_TIME_SERIES_REGRESSION]"""

def __init__(
self,
Expand All @@ -61,7 +60,7 @@ def __init__(
trend: Optional[str] = "c",
random_seed: Union[int, float] = 0,
maxiter: int = 10,
use_covariates: bool = True,
use_covariates: bool = False,
**kwargs,
):
self.preds_95_upper = None
Expand All @@ -84,6 +83,7 @@ def __init__(

parameters["use_covariates"] = use_covariates
parameters["time_index"] = time_index
parameters.update({"p": p, "q": q})

self.use_covariates = use_covariates
self.time_index = time_index
Expand Down Expand Up @@ -133,13 +133,13 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.DataFrame] = None):

if y is None:
raise ValueError("VARMAX Regressor requires y as input.")
y = convert_bool_to_double(y, include_ints=True)

if X is not None and self.use_covariates:
self.last_X_index = X.index[-1]
X = X.ww.select(exclude=["Datetime"])

X = convert_bool_to_double(X)
y = convert_bool_to_double(y)
X, y = match_indices(X, y)

if not X.empty:
Expand Down
19 changes: 15 additions & 4 deletions evalml/pipelines/components/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,11 @@ def allowed_model_families(problem_type):
return list(set([e.model_family for e in estimators]))


def get_estimators(problem_type, model_families=None, excluded_model_families=None):
def get_estimators(
problem_type,
model_families=None,
excluded_model_families=None,
):
"""Returns the estimators allowed for a particular problem type.
Can also optionally filter by a list of model types.
Expand Down Expand Up @@ -515,20 +519,27 @@ def match_indices(
return X, y


def convert_bool_to_double(
    data: pd.DataFrame,
    include_ints: bool = False,
) -> pd.DataFrame:
    """Converts all boolean columns in dataframe to doubles. If include_ints, converts all integer columns to doubles as well.

    Args:
        data (pd.DataFrame): Input dataframe.
        include_ints (bool): If True, converts all integer columns to doubles as well. Defaults to False.

    Returns:
        pd.DataFrame: Copy of the input dataframe with all boolean-valued columns (and, when include_ints
            is True, all integer-valued columns) converted to doubles.
    """
    # Work on a copy so the caller's dataframe and its typing information are left untouched.
    data_ = data.ww.copy()

    # Logical types whose columns get re-typed to "Double".
    relevant_dtypes = ["Boolean"]
    if include_ints:
        relevant_dtypes.append("Integer")

    # Column names are read from the original frame's schema; the new types are set on the copy.
    data_.ww.set_types(
        {
            col: "Double"
            for col in data.ww.select(relevant_dtypes, return_schema=True).columns
        },
    )
    return data_
4 changes: 2 additions & 2 deletions evalml/pipelines/multiseries_regression_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,9 +21,9 @@ class MultiseriesRegressionPipeline(TimeSeriesRegressionPipeline):
"""

problem_type = ProblemTypes.TIME_SERIES_REGRESSION
problem_type = ProblemTypes.MULTISERIES_TIME_SERIES_REGRESSION

"""ProblemTypes.TIME_SERIES_REGRESSION"""
"""ProblemTypes.MULTISERIES_TIME_SERIES_REGRESSION"""

def __init__(
self,
Expand Down
26 changes: 23 additions & 3 deletions evalml/pipelines/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
from evalml.model_family import ModelFamily
from evalml.pipelines import (
ComponentGraph,
MultiseriesRegressionPipeline,
TimeSeriesBinaryClassificationPipeline,
TimeSeriesMulticlassClassificationPipeline,
TimeSeriesRegressionPipeline,
Expand Down Expand Up @@ -66,6 +67,7 @@
ProblemTypes,
handle_problem_types,
is_classification,
is_multiseries,
is_regression,
is_time_series,
)
Expand Down Expand Up @@ -289,6 +291,9 @@ def _get_preprocessing_components(
Returns:
list[Transformer]: A list of applicable preprocessing components to use with the estimator.
"""
if is_multiseries(problem_type):
return []

if is_time_series(problem_type):
components_functions = [
_get_label_encoder,
Expand Down Expand Up @@ -361,8 +366,10 @@ def _get_pipeline_base_class(problem_type):
return TimeSeriesRegressionPipeline
elif problem_type == ProblemTypes.TIME_SERIES_BINARY:
return TimeSeriesBinaryClassificationPipeline
else:
elif problem_type == ProblemTypes.TIME_SERIES_MULTICLASS:
return TimeSeriesMulticlassClassificationPipeline
else:
return MultiseriesRegressionPipeline


def _make_pipeline_time_series(
Expand Down Expand Up @@ -1204,6 +1211,7 @@ def make_timeseries_baseline_pipeline(
forecast_horizon,
time_index,
exclude_featurizer=False,
series_id=None,
):
"""Make a baseline pipeline for time series regression problems.
Expand All @@ -1214,6 +1222,7 @@ def make_timeseries_baseline_pipeline(
time_index (str): Column name of time_index parameter.
exclude_featurizer (bool): Whether or not to exclude the TimeSeriesFeaturizer from
the baseline graph. Defaults to False.
series_id (str): Column name of series_id parameter. Only used for multiseries time series. Defaults to None.
Returns:
TimeSeriesPipelineBase, a time series pipeline corresponding to the problem type.
Expand All @@ -1232,20 +1241,31 @@ def make_timeseries_baseline_pipeline(
TimeSeriesBinaryClassificationPipeline,
"Time Series Baseline Binary Pipeline",
),
ProblemTypes.MULTISERIES_TIME_SERIES_REGRESSION: (
MultiseriesRegressionPipeline,
"Multiseries Time Series Baseline Pipeline",
),
}[problem_type]
component_graph = ["Time Series Baseline Estimator"]
baseline_estimator_name = (
"Multiseries Time Series Baseline Regressor"
if is_multiseries(problem_type)
else "Time Series Baseline Estimator"
)
component_graph = [baseline_estimator_name]
parameters = {
"pipeline": {
"time_index": time_index,
"gap": gap,
"max_delay": 0,
"forecast_horizon": forecast_horizon,
},
"Time Series Baseline Estimator": {
baseline_estimator_name: {
"gap": gap,
"forecast_horizon": forecast_horizon,
},
}
if is_multiseries(problem_type):
parameters["pipeline"]["series_id"] = series_id
if not exclude_featurizer:
component_graph = ["Time Series Featurizer"] + component_graph
parameters["Time Series Featurizer"] = {
Expand Down
9 changes: 8 additions & 1 deletion evalml/preprocessing/data_splitters/time_series_split.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,16 +59,23 @@ def __init__(
gap=0,
forecast_horizon=None,
time_index=None,
n_series=None,
n_splits=3,
):
self.max_delay = max_delay
self.gap = gap
self.forecast_horizon = forecast_horizon if forecast_horizon else 1
self.time_index = time_index
self.n_splits = n_splits
self.n_series = n_series

test_size = forecast_horizon
if self.n_series is not None:
test_size = forecast_horizon * self.n_series

self._splitter = SkTimeSeriesSplit(
n_splits=n_splits,
test_size=forecast_horizon,
test_size=test_size,
)

def get_n_splits(self, X=None, y=None, groups=None):
Expand Down
1 change: 1 addition & 0 deletions evalml/problem_types/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
is_regression,
is_binary,
is_multiclass,
is_multiseries,
is_classification,
is_time_series,
)
Loading

0 comments on commit 7781c77

Please sign in to comment.