diff --git a/docs/source/release_notes.rst b/docs/source/release_notes.rst
index 0f9029a5df..a2c7964bbd 100644
--- a/docs/source/release_notes.rst
+++ b/docs/source/release_notes.rst
@@ -7,6 +7,7 @@ Release Notes
         * Added stacking and unstacking utility functions to work with multiseries data :pr:`4250`
     * Fixes
         * Added support for pandas 2 :pr:`4216`
+        * Fixed bug where time series pipelines would fail due to MASE needing `y_train` when scoring :pr:`4258`
         * Update s3 bucket for docs image :pr:`4260`
     * Changes
         * Unpinned sktime version :pr:`4214`
diff --git a/evalml/pipelines/binary_classification_pipeline.py b/evalml/pipelines/binary_classification_pipeline.py
index 5c8ddae075..86f8e11520 100644
--- a/evalml/pipelines/binary_classification_pipeline.py
+++ b/evalml/pipelines/binary_classification_pipeline.py
@@ -87,7 +87,7 @@ def predict_proba(self, X, X_train=None, y_train=None):
         return super().predict_proba(X)
 
     @staticmethod
-    def _score(X, y, predictions, objective):
+    def _score(X, y, predictions, objective, y_train=None):
         """Given data, model predictions or predicted probabilities computed on the data, and an objective, evaluate and return the objective score."""
         if predictions.ndim > 1:
             predictions = predictions.iloc[:, 1]
diff --git a/evalml/pipelines/pipeline_base.py b/evalml/pipelines/pipeline_base.py
index a5b71c7b46..b89638afae 100644
--- a/evalml/pipelines/pipeline_base.py
+++ b/evalml/pipelines/pipeline_base.py
@@ -351,10 +351,18 @@ def score(self, X, y, objectives, X_train=None, y_train=None):
         """
 
     @staticmethod
-    def _score(X, y, predictions, objective):
-        return objective.score(y, predictions, X=X)
+    def _score(X, y, predictions, objective, y_train=None):
+        return objective.score(y, predictions, X=X, y_train=y_train)
 
-    def _score_all_objectives(self, X, y, y_pred, y_pred_proba, objectives):
+    def _score_all_objectives(
+        self,
+        X,
+        y,
+        y_pred,
+        y_pred_proba,
+        objectives,
+        y_train=None,
+    ):
         """Given data, model predictions or predicted probabilities computed on the data, and an objective, evaluate and return the objective score.
 
         Will raise a PipelineScoreError if any objectives fail.
@@ -366,6 +374,7 @@ def _score_all_objectives(self, X, y, y_pred, y_pred_proba, objectives):
             y_pred_proba (pd.Dataframe, pd.Series, None): The predicted probabilities for classification problems.
                 Will be a DataFrame for multiclass problems and Series otherwise. Will be None for regression problems.
             objectives (list): List of objectives to score.
+            y_train (pd.Series or None): Training labels. Only used for time series, otherwise ignored.
 
         Returns:
             dict: Ordered dictionary with objectives and their scores.
@@ -390,6 +399,7 @@ def _score_all_objectives(self, X, y, y_pred, y_pred_proba, objectives):
                     y,
                     y_pred_proba if objective.score_needs_proba else y_pred,
                     objective,
+                    y_train,
                 )
                 scored_successfully.update({objective.name: score})
             except Exception as e:
diff --git a/evalml/pipelines/time_series_classification_pipelines.py b/evalml/pipelines/time_series_classification_pipelines.py
index df4d8b8597..b14dac3a15 100644
--- a/evalml/pipelines/time_series_classification_pipelines.py
+++ b/evalml/pipelines/time_series_classification_pipelines.py
@@ -282,11 +282,17 @@ def predict_in_sample(self, X, y, X_train, y_train, objective=None):
         return infer_feature_types(predictions)
 
     @staticmethod
-    def _score(X, y, predictions, objective):
+    def _score(X, y, predictions, objective, y_train=None):
         """Given data, model predictions or predicted probabilities computed on the data, and an objective, evaluate and return the objective score."""
         if predictions.ndim > 1:
             predictions = predictions.iloc[:, 1]
-        return TimeSeriesClassificationPipeline._score(X, y, predictions, objective)
+        return TimeSeriesClassificationPipeline._score(
+            X,
+            y,
+            predictions,
+            objective,
+            y_train,
+        )
 
 
 class TimeSeriesMulticlassClassificationPipeline(TimeSeriesClassificationPipeline):
diff --git a/evalml/pipelines/time_series_regression_pipeline.py b/evalml/pipelines/time_series_regression_pipeline.py
index 93c4ec2a5a..fbe4ef8fc5 100644
--- a/evalml/pipelines/time_series_regression_pipeline.py
+++ b/evalml/pipelines/time_series_regression_pipeline.py
@@ -102,6 +102,7 @@ def score(self, X, y, objectives, X_train=None, y_train=None):
             y_predicted,
             y_pred_proba=None,
             objectives=objectives,
+            y_train=y_train,
         )
 
     def get_forecast_period(self, X):
diff --git a/evalml/tests/pipeline_tests/test_pipelines.py b/evalml/tests/pipeline_tests/test_pipelines.py
index 6720f4732d..91f1e3c9f8 100644
--- a/evalml/tests/pipeline_tests/test_pipelines.py
+++ b/evalml/tests/pipeline_tests/test_pipelines.py
@@ -1,4 +1,5 @@
 import io
+import math
 import os
 import pickle
 import re
@@ -1048,6 +1049,47 @@ def test_score_with_objective_that_requires_predict_proba(
     mock_predict.assert_called()
 
 
+@patch("evalml.pipelines.components.Estimator.predict")
+@patch("evalml.pipelines.components.Estimator.fit")
+def test_score_with_objective_that_requires_y_train(
+    mock_fit,
+    mock_predict,
+    dummy_time_series_regression_pipeline_class,
+    generate_seasonal_data,
+):
+    X, y = generate_seasonal_data(real_or_synthetic="real")(period=10)
+    X = X.reset_index()
+
+    split = math.floor(0.9 * len(X))
+    X_train, X_holdout = X.iloc[:split], X.iloc[split:]
+    y_train, y_holdout = y.iloc[:split], y.iloc[split:]
+
+    parameters = {
+        "pipeline": {
+            "max_delay": 0,
+            "gap": 2,
+            "forecast_horizon": 2,
+            "time_index": "Date",
+        },
+    }
+
+    mock_regression_pipeline = dummy_time_series_regression_pipeline_class(
+        parameters=parameters,
+    )
+
+    mock_predict.return_value = pd.Series([1] * len(y_holdout))
+
+    mock_regression_pipeline.fit(X_train, y_train)
+    mock_regression_pipeline.score(
+        X_holdout,
+        y_holdout,
+        ["mean absolute scaled error"],
+        X_train=X_train,
+        y_train=y_train,
+    )
+    mock_predict.assert_called()
+
+
 def test_score_auc(X_y_binary, logistic_regression_binary_pipeline):
     X, y = X_y_binary
     lr_pipeline = logistic_regression_binary_pipeline
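Context for the change: Mean Absolute Scaled Error (MASE) scales the holdout forecast error by the in-sample error of a naive one-step forecast, so the training series is required at scoring time; holdout predictions alone are not enough. This is why `PipelineBase._score` now forwards `y_train` into `objective.score(y, predictions, X=X, y_train=y_train)` and why every `_score` override above grows a `y_train=None` parameter, which objectives that do not need training data simply ignore. The following is a minimal sketch of the metric itself; the `mase` helper is hypothetical and is not evalml's implementation, which lives in its objective classes.

    import numpy as np

    def mase(y_true, y_pred, y_train):
        # Hypothetical helper for illustration only; not evalml's code.
        # Numerator: mean absolute error of the forecast on the holdout data.
        forecast_mae = np.mean(np.abs(np.asarray(y_true) - np.asarray(y_pred)))
        # Denominator: mean absolute error of a naive one-step-ahead forecast
        # on the training series. This is the term that makes y_train mandatory.
        naive_mae = np.mean(np.abs(np.diff(np.asarray(y_train))))
        return forecast_mae / naive_mae

Because the denominator depends only on `y_train`, a pipeline that scored objectives without threading `y_train` through (the pre-fix behavior) could not evaluate MASE at all, which is the failure the new test exercises via `["mean absolute scaled error"]`.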