Skip to content

Commit

Permalink
Added support for prediction intervals for VARMAX regressor (#4267)
Browse files Browse the repository at this point in the history
* Initial commit

* Updated release notes

* Fixed random state handling

* Updated tests

* Moved number of simulations as constant.
  • Loading branch information
christopherbunn authored Aug 14, 2023
1 parent daa8568 commit 3572300
Show file tree
Hide file tree
Showing 4 changed files with 92 additions and 20 deletions.
1 change: 1 addition & 0 deletions docs/source/release_notes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ Release Notes
* Added stacking and unstacking utility functions to work with multiseries data :pr:`4250`
* Added multiseries regression pipeline class :pr:`4256`
* Added multiseries VARMAX regressor :pr:`4238`
* Added support for prediction intervals for VARMAX regressor :pr:`4267`
* Fixes
* Added support for pandas 2 :pr:`4216`
* Fixed bug where time series pipelines would fail due to MASE needing `y_train` when scoring :pr:`4258`
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@ class ExponentialSmoothingRegressor(Estimator):
random_seed (int): Seed for the random number generator. Defaults to 0.
"""

_N_REPETITIONS = 400

name = "Exponential Smoothing Regressor"
hyperparameter_ranges = {
"trend": [None, "additive"],
Expand Down Expand Up @@ -167,7 +169,7 @@ def get_prediction_intervals(
# anchor represents where the simulations should start from (forecasting is done from the "end")
y_pred = self._component_obj._fitted_forecaster.simulate(
nsimulations=X.shape[0],
repetitions=400,
repetitions=self._N_REPETITIONS,
anchor="end",
random_state=self.parameters["random_state"],
)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,8 @@ class VARMAXRegressor(Estimator):
solely be based off of the datetimes and target values. Defaults to True.
"""

_N_REPETITIONS = 400

name = "VARMAX Regressor"
hyperparameter_ranges = {
"p": Integer(0, 10),
Expand Down Expand Up @@ -215,11 +217,51 @@ def get_prediction_intervals(
predictions (pd.Series): Not used for VARMAX regressor.
Returns:
dict: Prediction intervals, keys are in the format {coverage}_lower or {coverage}_upper.
dict[dict]: A dict of prediction intervals, where the dict is in the format {series_id: {coverage}_lower or {coverage}_upper}.
"""
raise NotImplementedError(
"VARMAX does not have prediction intervals implemented yet.",
if coverage is None:
coverage = [0.95]

X, y = self._manage_woodwork(X, y)
use_exog = (
# If exogenous variables were used during training
self._component_obj._fitted_forecaster.model.exog is not None
and self.use_covariates
)
if use_exog:
X = X.ww.select(exclude=["Datetime"])
X = convert_bool_to_double(X)
# Accesses the fitted statsmodels model within sktime
# nsimulations represents how many steps should be simulated
# repetitions represents the number of simulations that should be run (confusing, I know)
# anchor represents where the simulations should start from (forecasting is done from the "end")
y_pred = self._component_obj._fitted_forecaster.simulate(
nsimulations=X.shape[0],
repetitions=self._N_REPETITIONS,
anchor="end",
random_state=self.random_seed,
exog=X if use_exog else None,
)
prediction_interval_result = {}
# Access the target column names (i.e. the series_id values) that the VARMAX component obj was fitted on
for series in self._component_obj._fitted_forecaster.model.endog_names:
series_result = {}
series_preds = y_pred[[col for col in y_pred.columns if series in col]]
for conf_int in coverage:
prediction_interval_lower = series_preds.quantile(
q=round((1 - conf_int) / 2, 3),
axis="columns",
)
prediction_interval_upper = series_preds.quantile(
q=round((1 + conf_int) / 2, 3),
axis="columns",
)
prediction_interval_lower.index = X.index
prediction_interval_upper.index = X.index
series_result[f"{conf_int}_lower"] = prediction_interval_lower
series_result[f"{conf_int}_upper"] = prediction_interval_upper
prediction_interval_result[series] = series_result
return prediction_interval_result

@property
def feature_importance(self) -> np.ndarray:
Expand Down
59 changes: 43 additions & 16 deletions evalml/tests/component_tests/test_varmax_regressor.py
Original file line number Diff line number Diff line change
Expand Up @@ -260,26 +260,10 @@ def test_varmax_regressor_respects_use_covariates(
assert "X" not in mock_predict.call_args.kwargs


def test_varmax_regressor_prediction_intervals_not_implemented_yet_error(
ts_multiseries_data,
):
X_train, X_test, y_train = ts_multiseries_data(n_series=2)

clf = VARMAXRegressor()

clf.fit(X_train, y_train)
with pytest.raises(
NotImplementedError,
match="VARMAX does not have prediction intervals implemented yet.",
):
clf.get_prediction_intervals(X_test)


def test_varmax_regressor_can_forecast_arbitrary_dates_no_covariates(
ts_multiseries_data,
):
X, _, y = ts_multiseries_data(n_series=2)

X_train, X_test, y_train, y_test = split_data(
X,
y,
Expand Down Expand Up @@ -329,3 +313,46 @@ def test_varmax_regressor_can_forecast_arbitrary_dates_past_holdout(
varmax.fit(X_train, y_train)

varmax.predict(X_test)


@pytest.mark.parametrize("use_X_train", [True, False])
@pytest.mark.parametrize("coverages", [None, [0.85], [0.95, 0.90, 0.85]])
@pytest.mark.parametrize("use_covariates", [True, False])
def test_varmax_regressor_prediction_intervals(
use_covariates,
coverages,
use_X_train,
ts_multiseries_data,
):
X_train, X_test, y_train = ts_multiseries_data(no_features=not use_covariates)

clf = VARMAXRegressor(use_covariates=use_covariates)
clf.fit(X=X_train if use_X_train else None, y=y_train)

# Check we are not using exogenous variables if use_covariates=False, even if X_test is passed.
if not use_covariates or not use_X_train:
with patch.object(
clf._component_obj._fitted_forecaster,
"simulate",
) as mock_simulate:
clf.get_prediction_intervals(X_test, None, coverages)
assert mock_simulate.call_args[1]["exog"] is None

results_coverage = clf.get_prediction_intervals(X_test, None, coverages)
predictions = clf.predict(X_test)

series_id_targets = list(results_coverage.keys())
for series in series_id_targets:
series_results_coverage = results_coverage[series]
conf_ints = list(series_results_coverage.keys())
data = list(series_results_coverage.values())

assert len(conf_ints) == (len(coverages) if coverages is not None else 1) * 2
assert len(data) == (len(coverages) if coverages is not None else 1) * 2

for interval in coverages if coverages is not None else [0.95]:
conf_int_lower = f"{interval}_lower"
conf_int_upper = f"{interval}_upper"

assert (series_results_coverage[conf_int_upper] > predictions[series]).all()
assert (predictions[series] > series_results_coverage[conf_int_lower]).all()

0 comments on commit 3572300

Please sign in to comment.