Skip to content

Commit

Permalink
Merge branch 'main' into 4244_extend_stldecomp_for_multiseries
Browse files Browse the repository at this point in the history
  • Loading branch information
remyogasawara authored Aug 22, 2023
2 parents 837fc79 + de64082 commit 8e379c1
Show file tree
Hide file tree
Showing 35 changed files with 793 additions and 166 deletions.
5 changes: 4 additions & 1 deletion docs/source/release_notes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,14 @@ Release Notes
-------------
**Future Releases**
* Enhancements
* Added support for prediction intervals for VARMAX regressor :pr:`4267`
* Integrated multiseries time series into AutoMLSearch :pr:`4270`
* Extended ``STLDecomposer`` to support multiseries :pr:`4253`
* Fixes
* Fixed error when stacking data with no exogenous variables :pr:`4275`
* Changes
* Updated ``ARIMARegressor`` to be compatible with sktime v0.22.0 and beyond :pr:`4283`
* Updated ``graph_prediction_vs_actual_over_time()`` to be compatible with multiseries time series :pr:`4284`
* Documentation Changes
* Removed erroneous warnings from Data Checks User Guide page and removed ``tqdm`` warning in all notebooks :pr:`4274`
* Testing Changes
Expand All @@ -22,7 +26,6 @@ Release Notes
* Added stacking and unstacking utility functions to work with multiseries data :pr:`4250`
* Added multiseries regression pipeline class :pr:`4256`
* Added multiseries VARMAX regressor :pr:`4238`
* Added support for prediction intervals for VARMAX regressor :pr:`4267`
* Fixes
* Added support for pandas 2 :pr:`4216`
* Fixed bug where time series pipelines would fail due to MASE needing ``y_train`` when scoring :pr:`4258`
Expand Down
16 changes: 12 additions & 4 deletions evalml/automl/automl_algorithm/default_algorithm.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
_make_pipeline_from_multiple_graphs,
make_pipeline,
)
from evalml.problem_types import is_regression, is_time_series
from evalml.problem_types import is_multiseries, is_regression, is_time_series
from evalml.utils import infer_feature_types
from evalml.utils.logger import get_logger

Expand Down Expand Up @@ -170,6 +170,8 @@ def default_max_batches(self):
"""Returns the number of max batches AutoMLSearch should run by default."""
if self.ensembling:
return 3
elif is_multiseries(self.problem_type):
return 1
else:
return 2

Expand Down Expand Up @@ -472,11 +474,17 @@ def next_batch(self):
)
# this logic needs to be updated once time series also supports ensembling
elif is_time_series(self.problem_type):
if self._batch_number == 0:
# Skip the naive batch for multiseries time series
batch = (
self._batch_number
if not is_multiseries(self.problem_type)
else self._batch_number + 1
)
if batch == 0:
next_batch = self._create_naive_pipelines()
elif self._batch_number == 1:
elif batch == 1:
next_batch = self._create_fast_final()
elif self.batch_number == 2:
elif batch == 2:
next_batch = self._create_long_exploration(n=self.top_n)
else:
next_batch = self._create_n_pipelines(
Expand Down
19 changes: 19 additions & 0 deletions evalml/automl/automl_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@
handle_problem_types,
is_binary,
is_classification,
is_multiseries,
is_time_series,
)
from evalml.tuners import SKOptTuner
Expand Down Expand Up @@ -403,6 +404,7 @@ class AutoMLSearch:
problem_configuration (dict, None): Additional parameters needed to configure the search. For example,
in time series problems, values should be passed in for the time_index, gap, forecast_horizon, and max_delay variables.
For multiseries time series problems, the values passed in should also include the name of a series_id column.
train_best_pipeline (boolean): Whether or not to train the best pipeline before returning it. Defaults to True.
Expand Down Expand Up @@ -651,6 +653,14 @@ def __init__(
f"Dataset size is too small to create holdout set. Minimum dataset size is {self._HOLDOUT_SET_MIN_ROWS} rows, X_train has {len(X_train)} rows. Holdout set evaluation is disabled.",
)

# For multiseries problems, we need to make sure that the data is primarily ordered by the time_index rather than the series_id
if is_multiseries(self.problem_type):
time_index = self.problem_configuration.get("time_index")
series_id = self.problem_configuration.get("series_id")
X_train = X_train.sort_values([time_index, series_id])
y_train = y_train[X_train.index].reset_index(drop=True)
X_train = X_train.reset_index(drop=True)

# Set holdout data in AutoML search if provided as parameter
self.X_train = infer_feature_types(X_train)
self.y_train = infer_feature_types(y_train)
Expand Down Expand Up @@ -1053,6 +1063,13 @@ def _validate_problem_configuration(self, problem_configuration=None):
is_valid, msg = contains_all_ts_parameters(problem_configuration)
if not is_valid:
raise ValueError(msg)
if (
is_multiseries(self.problem_type)
and "series_id" not in problem_configuration
):
raise ValueError(
"Must provide 'series_id' column in problem_configuration for multiseries time series problems.",
)
return problem_configuration or {}

def _handle_keyboard_interrupt(self):
Expand Down Expand Up @@ -1355,6 +1372,7 @@ def _get_baseline_pipeline(self):
gap = self.problem_configuration["gap"]
forecast_horizon = self.problem_configuration["forecast_horizon"]
time_index = self.problem_configuration["time_index"]
series_id = self.problem_configuration.get("series_id", None)
exclude_timeseries_featurizer = (
"TimeSeriesFeaturizer" in self.exclude_featurizers
)
Expand All @@ -1364,6 +1382,7 @@ def _get_baseline_pipeline(self):
forecast_horizon,
time_index,
exclude_timeseries_featurizer,
series_id,
)
return baseline

Expand Down
3 changes: 3 additions & 0 deletions evalml/automl/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ def get_default_primary_search_objective(problem_type):
"time series regression": "MedianAE",
"time series binary": "Log Loss Binary",
"time series multiclass": "Log Loss Multiclass",
"multiseries time series regression": "MedianAE",
}[problem_type.value]
return get_objective(objective_name, return_instance=True)

Expand Down Expand Up @@ -87,12 +88,14 @@ def make_data_splitter(
raise ValueError(
"problem_configuration is required for time series problem types",
)
series_id = problem_configuration.get("series_id")
return TimeSeriesSplit(
n_splits=n_splits,
gap=problem_configuration.get("gap"),
max_delay=problem_configuration.get("max_delay"),
time_index=problem_configuration.get("time_index"),
forecast_horizon=problem_configuration.get("forecast_horizon"),
n_series=len(X[series_id].unique()) if series_id is not None else None,
)
if X.shape[0] > _LARGE_DATA_ROW_THRESHOLD:
return TrainingValidationSplit(
Expand Down
141 changes: 107 additions & 34 deletions evalml/model_understanding/visualizations.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
from evalml.model_family import ModelFamily
from evalml.objectives.utils import get_objective
from evalml.problem_types import ProblemTypes
from evalml.problem_types.utils import is_multiseries
from evalml.utils import import_or_raise, infer_feature_types, jupyter_check


Expand Down Expand Up @@ -373,25 +374,44 @@ def get_prediction_vs_actual_over_time_data(pipeline, X, y, X_train, y_train, da
dates = infer_feature_types(dates)
prediction = pipeline.predict_in_sample(X, y, X_train=X_train, y_train=y_train)

return pd.DataFrame(
{
"dates": dates.reset_index(drop=True),
"target": y.reset_index(drop=True),
"prediction": prediction.reset_index(drop=True),
},
)
if is_multiseries(pipeline.problem_type):
return pd.DataFrame(
{
"dates": dates.reset_index(drop=True),
"target": y.reset_index(drop=True),
"prediction": prediction.reset_index(drop=True),
"series_id": X[pipeline.series_id].reset_index(drop=True),
},
)
else:
return pd.DataFrame(
{
"dates": dates.reset_index(drop=True),
"target": y.reset_index(drop=True),
"prediction": prediction.reset_index(drop=True),
},
)


def graph_prediction_vs_actual_over_time(pipeline, X, y, X_train, y_train, dates):
def graph_prediction_vs_actual_over_time(
pipeline,
X,
y,
X_train,
y_train,
dates,
single_series=None,
):
"""Plot the target values and predictions against time on the x-axis.
Args:
pipeline (TimeSeriesRegressionPipeline): Fitted time series regression pipeline.
X (pd.DataFrame): Features used to generate new predictions.
y (pd.Series): Target values to compare predictions against.
X (pd.DataFrame): Features used to generate new predictions. If problem is multiseries, X should be stacked.
y (pd.Series): Target values to compare predictions against. If problem is multiseries, y should be stacked.
X_train (pd.DataFrame): Data the pipeline was trained on.
y_train (pd.Series): Target values for training data.
dates (pd.Series): Dates corresponding to target values and predictions.
single_series (str): A single series id value to plot just one series in a multiseries dataset. Defaults to None.
Returns:
plotly.Figure: Showing the prediction vs actual over time.
Expand All @@ -403,8 +423,15 @@ def graph_prediction_vs_actual_over_time(pipeline, X, y, X_train, y_train, dates
"plotly.graph_objects",
error_msg="Cannot find dependency plotly.graph_objects",
)
subplots = import_or_raise(
"plotly.subplots",
error_msg="Cannot find dependency plotly.subplots",
)

if pipeline.problem_type != ProblemTypes.TIME_SERIES_REGRESSION:
if (
pipeline.problem_type != ProblemTypes.TIME_SERIES_REGRESSION
and pipeline.problem_type != ProblemTypes.MULTISERIES_TIME_SERIES_REGRESSION
):
raise ValueError(
"graph_prediction_vs_actual_over_time only supports time series regression pipelines! "
f"Received {str(pipeline.problem_type)}.",
Expand All @@ -419,30 +446,76 @@ def graph_prediction_vs_actual_over_time(pipeline, X, y, X_train, y_train, dates
dates,
)

data = [
_go.Scatter(
x=data["dates"],
y=data["target"],
mode="lines+markers",
name="Target",
line=dict(color="#1f77b4"),
),
_go.Scatter(
x=data["dates"],
y=data["prediction"],
mode="lines+markers",
name="Prediction",
line=dict(color="#d62728"),
),
]
# Let plotly pick the best date format.
layout = _go.Layout(
title={"text": "Prediction vs Target over time"},
xaxis={"title": "Time"},
yaxis={"title": "Target Values and Predictions"},
)
fig = None
if is_multiseries(pipeline.problem_type):
id_list = (
[single_series] if single_series is not None else data["series_id"].unique()
)
fig = subplots.make_subplots(
rows=len(id_list),
cols=1,
subplot_titles=[f"Series: {id}" for id in id_list],
)
for curr_count, id in enumerate(id_list):
curr_df = data[data["series_id"] == id]
fig.append_trace(
_go.Scatter(
x=curr_df["dates"],
y=curr_df["target"],
mode="lines+markers",
name=f"Series {id}: Target",
),
row=curr_count + 1,
col=1,
)
fig.append_trace(
_go.Scatter(
x=curr_df["dates"],
y=curr_df["prediction"],
mode="lines+markers",
name=f"Series {id}: Prediction",
),
row=curr_count + 1,
col=1,
)
fig.update_xaxes(title_text="Time")
fig.update_yaxes(title_text=y.name)
if single_series is not None:
fig.update_layout(
title_text=f"Graph for Series {single_series}",
)
else:
fig.update_layout(
height=600 + (len(id_list)) * 200,
width=1500,
title_text="Graph for Multiseries",
)
else:
data = [
_go.Scatter(
x=data["dates"],
y=data["target"],
mode="lines+markers",
name="Target",
line=dict(color="#1f77b4"),
),
_go.Scatter(
x=data["dates"],
y=data["prediction"],
mode="lines+markers",
name="Prediction",
line=dict(color="#d62728"),
),
]
# Let plotly pick the best date format.
layout = _go.Layout(
title={"text": "Prediction vs Target over time"},
xaxis={"title": "Time"},
yaxis={"title": "Target Values and Predictions"},
)

return _go.Figure(data=data, layout=layout)
fig = _go.Figure(data=data, layout=layout)
return fig


def get_linear_coefficients(estimator, features=None):
Expand Down
8 changes: 6 additions & 2 deletions evalml/objectives/regression_objective.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,5 +6,9 @@
class RegressionObjective(ObjectiveBase):
"""Base class for all regression objectives."""

problem_types = [ProblemTypes.REGRESSION, ProblemTypes.TIME_SERIES_REGRESSION]
"""[ProblemTypes.REGRESSION, ProblemTypes.TIME_SERIES_REGRESSION]"""
problem_types = [
ProblemTypes.REGRESSION,
ProblemTypes.TIME_SERIES_REGRESSION,
ProblemTypes.MULTISERIES_TIME_SERIES_REGRESSION,
]
"""[ProblemTypes.REGRESSION, ProblemTypes.TIME_SERIES_REGRESSION, ProblemTypes.MULTISERIES_TIME_SERIES_REGRESSION]"""
1 change: 0 additions & 1 deletion evalml/pipelines/components/component_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,6 @@ class ComponentBase(ABC, metaclass=ComponentBaseMeta):
# Referring to the pandas nullable dtypes; not just woodwork logical types
_integer_nullable_incompatibilities = []
_boolean_nullable_incompatibilities = []
is_multiseries = False

def __init__(self, parameters=None, component_obj=None, random_seed=0, **kwargs):
"""Base class for all components.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,12 +25,11 @@ class MultiseriesTimeSeriesBaselineRegressor(Estimator):
"""{}"""
model_family = ModelFamily.BASELINE
"""ModelFamily.BASELINE"""
is_multiseries = True
supported_problem_types = [
ProblemTypes.TIME_SERIES_REGRESSION,
ProblemTypes.MULTISERIES_TIME_SERIES_REGRESSION,
]
"""[
ProblemTypes.TIME_SERIES_REGRESSION,
ProblemTypes.MULTISERIES_TIME_SERIES_REGRESSION,
]"""

def __init__(self, gap=1, forecast_horizon=1, random_seed=0, **kwargs):
Expand Down
Loading

0 comments on commit 8e379c1

Please sign in to comment.