diff --git a/docs/source/release_notes.rst b/docs/source/release_notes.rst
index b691277d03..1cd2bb58f7 100644
--- a/docs/source/release_notes.rst
+++ b/docs/source/release_notes.rst
@@ -2,6 +2,7 @@ Release Notes
 -------------
 **Future Releases**
     * Enhancements
+        * Changed target name/series ID separator and added ability to return series ID column with predictions :pr:`4357`
     * Fixes
     * Changes
         * Pinned networkx version below 3.2 for Python version compatibility :pr:`4351`
diff --git a/evalml/pipelines/multiseries_regression_pipeline.py b/evalml/pipelines/multiseries_regression_pipeline.py
index d5467baa34..948ce040c5 100644
--- a/evalml/pipelines/multiseries_regression_pipeline.py
+++ b/evalml/pipelines/multiseries_regression_pipeline.py
@@ -6,6 +6,7 @@
     TimeSeriesRegressionPipeline,
 )
 from evalml.problem_types import ProblemTypes
+from evalml.utils import infer_feature_types


 class MultiseriesRegressionPipeline(TimeSeriesRegressionPipeline):
@@ -91,6 +92,7 @@ def predict_in_sample(
         y_train,
         objective=None,
         calculating_residuals=False,
+        include_series_id=False,
     ):
         """Predict on future data where the target is known, e.g. cross validation.

@@ -102,6 +104,7 @@
             objective (ObjectiveBase, str, None): Objective used to threshold predicted probabilities, optional.
             calculating_residuals (bool): Whether we're calling predict_in_sample to calculate the residuals. This means
                 the X and y arguments are not future data, but actually the train data.
+            include_series_id (bool): If True, include the series ID value in the prediction results.

         Returns:
             pd.Series: Estimated labels.
@@ -125,6 +128,33 @@
             self.time_index,
             self.input_target_name,
         )
+
+        # Order the series columns to match the expected input feature names
+        # and filter to only the features present in `X_unstacked`.
+        input_features = list(self.input_feature_names.values())[0]
+        X_unstacked = X_unstacked[
+            [feature for feature in input_features if feature in X_unstacked.columns]
+        ]
+        X_train_unstacked = X_train_unstacked[
+            [
+                feature
+                for feature in input_features
+                if feature in X_train_unstacked.columns
+            ]
+        ]
+        y_overlapping_features = [
+            feature
+            for feature in y_train_unstacked.columns
+            if feature in y_unstacked.columns
+        ]
+        y_unstacked = y_unstacked[y_overlapping_features]
+        y_train_unstacked = y_train_unstacked[y_overlapping_features]
+
+        X_train_unstacked = infer_feature_types(X_train_unstacked)
+        y_train_unstacked = infer_feature_types(y_train_unstacked)
+        X_unstacked = infer_feature_types(X_unstacked)
+        y_unstacked = infer_feature_types(y_unstacked)
+
         unstacked_predictions = super().predict_in_sample(
             X_unstacked,
             y_unstacked,
@@ -133,10 +163,15 @@
             objective,
             calculating_residuals,
         )
-        stacked_predictions = stack_data(unstacked_predictions)
+        stacked_predictions = stack_data(
+            unstacked_predictions,
+            include_series_id=include_series_id,
+            series_id_name=self.series_id,
+        )

         # Index will start at the unstacked index, so we need to reset it to the original index
         stacked_predictions.index = X.index
+        stacked_predictions = infer_feature_types(stacked_predictions)
         return stacked_predictions

     def get_forecast_period(self, X):
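For context: `predict_in_sample` now delegates the final stacking to `stack_data`, so its return type depends on `include_series_id`. A minimal sketch of that behavior in isolation, assuming the `<name>|<series id>` column layout produced by `unstack_multiseries` (the values and column names here are made up):

import pandas as pd

from evalml.pipelines.utils import stack_data

# Unstacked predictions: one column per series, named "<target>|<series id>".
unstacked = pd.DataFrame({"target|0": [1.0, 2.0], "target|1": [3.0, 4.0]})

# Default behavior: a single stacked pd.Series named "target".
stacked = stack_data(unstacked)

# With include_series_id=True: a pd.DataFrame carrying the series id column
# (named via series_id_name; the pipeline passes its own series_id) plus the
# stacked predictions.
stacked_with_id = stack_data(
    unstacked,
    include_series_id=True,
    series_id_name="series_id",
)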
diff --git a/evalml/pipelines/time_series_regression_pipeline.py b/evalml/pipelines/time_series_regression_pipeline.py
index 15a3b53fca..92d0fda813 100644
--- a/evalml/pipelines/time_series_regression_pipeline.py
+++ b/evalml/pipelines/time_series_regression_pipeline.py
@@ -225,7 +225,11 @@ def _get_series_intervals(intervals, residuals, trend_pred_intervals, y):
             return return_intervals

         if self.problem_type == ProblemTypes.MULTISERIES_TIME_SERIES_REGRESSION:
-            from evalml.pipelines.utils import stack_data, unstack_multiseries
+            from evalml.pipelines.utils import (
+                MULTISERIES_SEPARATOR_SYMBOL,
+                stack_data,
+                unstack_multiseries,
+            )

             X, y = unstack_multiseries(
                 X,
@@ -268,7 +272,9 @@ def _get_series_intervals(intervals, residuals, trend_pred_intervals, y):
             # `pred_intervals` are in {series_id: {coverage_label: bound_value}} form
             for series_id, series_intervals in pred_intervals.items():
                 series_id_target_name = (
-                    self.input_target_name + "_" + str(series_id)
+                    self.input_target_name
+                    + MULTISERIES_SEPARATOR_SYMBOL
+                    + str(series_id)
                 )
                 series_id_prediction_intervals = _get_series_intervals(
                     series_intervals,
diff --git a/evalml/pipelines/utils.py b/evalml/pipelines/utils.py
index e23998096d..26ffb9463b 100644
--- a/evalml/pipelines/utils.py
+++ b/evalml/pipelines/utils.py
@@ -76,6 +76,7 @@
 from evalml.utils.gen_utils import contains_all_ts_parameters

 DECOMPOSER_PERIOD_CAP = 1000
+MULTISERIES_SEPARATOR_SYMBOL = "|"


 def _get_label_encoder(X, y, problem_type, estimator_class, sampler_name=None):
@@ -1418,7 +1419,7 @@
         for column_name in full_dataset.columns.drop([time_index, series_id]):
             new_column = single_series[column_name]
             new_column.index = new_time_index
-            new_column.name = f"{column_name}_{s_id}"
+            new_column.name = f"{column_name}{MULTISERIES_SEPARATOR_SYMBOL}{s_id}"

             if column_name == target_name:
                 y_unstacked_cols.append(new_column)
@@ -1435,11 +1436,15 @@
     # Reset the axes now that they've been unstacked, keep time info in X
     X_unstacked = X_unstacked.reset_index()
     y_unstacked = y_unstacked.reset_index(drop=True)
-
     return X_unstacked, y_unstacked


-def stack_data(data, include_series_id=False, series_id_name=None, starting_index=None):
+def stack_data(
+    data,
+    include_series_id=False,
+    series_id_name=None,
+    starting_index=None,
+):
     """Stacks the given DataFrame back into a single Series, or a DataFrame if include_series_id is True.

     Should only be used for data that is expected to be a single series. To stack multiple unstacked columns,
@@ -1464,7 +1469,9 @@ def stack_data(data, include_series_id=False, series_id_name=None, starting_inde

     # Extract the original column name
     series_id_with_name = stacked_series.index.droplevel()
-    stacked_series.name = "_".join(series_id_with_name[0].split("_")[:-1])
+    stacked_series.name = MULTISERIES_SEPARATOR_SYMBOL.join(
+        series_id_with_name[0].split(MULTISERIES_SEPARATOR_SYMBOL)[:-1],
+    )

     # If the index is the time index, keep it
     if not data.index.is_numeric() and starting_index is None:
@@ -1481,11 +1488,14 @@ def stack_data(data, include_series_id=False, series_id_name=None, starting_inde
     # Pull out the series id information, if requested
     if include_series_id:
         series_id_col = pd.Series(
-            series_id_with_name.map(lambda col_name: col_name.split("_")[-1]),
+            series_id_with_name.map(
+                lambda col_name: col_name.split(MULTISERIES_SEPARATOR_SYMBOL)[-1],
+            ),
             name=series_id_name or "series_id",
             index=stacked_series.index,
         )
         stacked_series = pd.concat([series_id_col, stacked_series], axis=1)
+
     return stacked_series


@@ -1511,8 +1521,8 @@ def stack_X(X, series_id_name, time_index, starting_index=None, series_id_values
     for col in X.columns:
         if col == time_index:
             continue
-        separated_name = col.split("_")
-        original_columns.add("_".join(separated_name[:-1]))
+        separated_name = col.split(MULTISERIES_SEPARATOR_SYMBOL)
+        original_columns.add(MULTISERIES_SEPARATOR_SYMBOL.join(separated_name[:-1]))
         series_ids.add(separated_name[-1])

     if len(series_ids) == 0:
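The join/split logic above recovers the original column name by dropping everything after the last separator. The diff does not state the motivation for moving off "_", but a plausible illustration is a series id value that itself contains underscores, which makes that recovery ambiguous; "|" is far less likely to appear in a name. A plain-Python sketch:

# With "_" as the separator, a series id like "store_5" breaks the round trip:
name = "target" + "_" + "store_5"   # "target_store_5"
"_".join(name.split("_")[:-1])      # "target_store" -- wrong original name
name.split("_")[-1]                 # "5"            -- wrong series id

# With "|", both pieces survive the round trip:
name = "target" + "|" + "store_5"   # "target|store_5"
"|".join(name.split("|")[:-1])      # "target"
name.split("|")[-1]                 # "store_5"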
diff --git a/evalml/tests/component_tests/test_time_series_featurizer.py b/evalml/tests/component_tests/test_time_series_featurizer.py
index 0458d8cfd0..703132b4a9 100644
--- a/evalml/tests/component_tests/test_time_series_featurizer.py
+++ b/evalml/tests/component_tests/test_time_series_featurizer.py
@@ -15,6 +15,7 @@
 )

 from evalml.pipelines import TimeSeriesFeaturizer
+from evalml.pipelines.utils import MULTISERIES_SEPARATOR_SYMBOL

 ROLLING_TRANSFORM_METHOD_NAME = "_compute_rolling_transforms"
 DELAYED_FEATURES_METHOD_NAME = "_compute_delays"
@@ -991,7 +992,9 @@ def test_featurizer_y_dataframe(multiseries_ts_data_unstacked):

     assert featurizer.statistically_significant_lags == [6]

-    expected_y_cols = [f"target_{i}_delay_6" for i in range(y.shape[1])]
+    expected_y_cols = [
+        f"target{MULTISERIES_SEPARATOR_SYMBOL}{i}_delay_6" for i in range(y.shape[1])
+    ]
     X_t = featurizer.transform(X, y)
     for expected_y_col in expected_y_cols:
         assert expected_y_col in X_t.columns
diff --git a/evalml/tests/component_tests/test_time_series_imputer.py b/evalml/tests/component_tests/test_time_series_imputer.py
index 20ba00823b..137752f496 100644
--- a/evalml/tests/component_tests/test_time_series_imputer.py
+++ b/evalml/tests/component_tests/test_time_series_imputer.py
@@ -11,6 +11,7 @@
 )

 from evalml.pipelines.components import TimeSeriesImputer
+from evalml.pipelines.utils import MULTISERIES_SEPARATOR_SYMBOL


 def test_invalid_strategy_parameters():
@@ -745,7 +746,12 @@ def test_time_series_imputer_multiseries(
     _, y_imputed = imputer.transform(X, y)
     assert isinstance(y_imputed, pd.DataFrame)

-    y_expected = pd.DataFrame({f"target_{i}": range(i, 100, 5) for i in range(5)})
+    y_expected = pd.DataFrame(
+        {
+            f"target{MULTISERIES_SEPARATOR_SYMBOL}{i}": range(i, 100, 5)
+            for i in range(5)
+        },
+    )
     assert_frame_equal(y_imputed, y_expected, check_dtype=False)


@@ -777,7 +783,10 @@ def test_time_series_imputer_multiseries_some_columns_all_nan(
     _, y_imputed = imputer.transform(X, y)

     y_expected = pd.DataFrame(
-        {f"target_{i}": range(i, 100, 5) for i in range(num_nan_cols, 5)},
+        {
+            f"target{MULTISERIES_SEPARATOR_SYMBOL}{i}": range(i, 100, 5)
+            for i in range(num_nan_cols, 5)
+        },
     )
     assert_frame_equal(y_imputed, y_expected, check_dtype=False)
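Note how the two naming schemes compose in the featurizer test above: the separator divides the original column name from the series id, while delayed features still append a plain underscore suffix such as `_delay_6` to the unstacked column name. A tiny sketch of the names these tests expect:

from evalml.pipelines.utils import MULTISERIES_SEPARATOR_SYMBOL

# Unstacked target columns: "target|0" through "target|4".
target_cols = [f"target{MULTISERIES_SEPARATOR_SYMBOL}{i}" for i in range(5)]

# Delayed features keep the underscore suffix: "target|0_delay_6", etc.
delayed_cols = [f"{col}_delay_6" for col in target_cols]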
+ { + f"target{MULTISERIES_SEPARATOR_SYMBOL}{i}": range(i, 100, 5) + for i in range(5) + }, + ) assert_frame_equal(y_imputed, y_expected, check_dtype=False) @@ -777,7 +783,10 @@ def test_time_series_imputer_multiseries_some_columns_all_nan( _, y_imputed = imputer.transform(X, y) y_expected = pd.DataFrame( - {f"target_{i}": range(i, 100, 5) for i in range(num_nan_cols, 5)}, + { + f"target{MULTISERIES_SEPARATOR_SYMBOL}{i}": range(i, 100, 5) + for i in range(num_nan_cols, 5) + }, ) assert_frame_equal(y_imputed, y_expected, check_dtype=False) diff --git a/evalml/tests/conftest.py b/evalml/tests/conftest.py index 3440e5ec91..80036d0704 100644 --- a/evalml/tests/conftest.py +++ b/evalml/tests/conftest.py @@ -1094,12 +1094,27 @@ def multiseries_ts_data_stacked(): @pytest.fixture def multiseries_ts_data_unstacked(): - feature_a = pd.DataFrame({f"feature_a_{i}": range(i, 100, 5) for i in range(5)}) + from evalml.pipelines.utils import MULTISERIES_SEPARATOR_SYMBOL + + feature_a = pd.DataFrame( + { + f"feature_a{MULTISERIES_SEPARATOR_SYMBOL}{i}": range(i, 100, 5) + for i in range(5) + }, + ) feature_b = pd.DataFrame( - {f"feature_b_{i}": range(99 - i, -1, -5) for i in range(5)}, + { + f"feature_b{MULTISERIES_SEPARATOR_SYMBOL}{i}": range(99 - i, -1, -5) + for i in range(5) + }, ) X = pd.concat([feature_a, feature_b], axis=1) - y = pd.DataFrame({f"target_{i}": range(i, 100, 5) for i in range(5)}) + y = pd.DataFrame( + { + f"target{MULTISERIES_SEPARATOR_SYMBOL}{i}": range(i, 100, 5) + for i in range(5) + }, + ) X["date"] = pd.date_range(start="1/1/2018", periods=20) return X, y diff --git a/evalml/tests/pipeline_tests/regression_pipeline_tests/test_multiseries_regression_pipeline.py b/evalml/tests/pipeline_tests/regression_pipeline_tests/test_multiseries_regression_pipeline.py index b3fc28dbae..f092da76a7 100644 --- a/evalml/tests/pipeline_tests/regression_pipeline_tests/test_multiseries_regression_pipeline.py +++ b/evalml/tests/pipeline_tests/regression_pipeline_tests/test_multiseries_regression_pipeline.py @@ -8,6 +8,7 @@ from evalml.pipelines import MultiseriesRegressionPipeline from evalml.pipelines.utils import unstack_multiseries from evalml.preprocessing import split_multiseries_data +from evalml.utils import infer_feature_types @pytest.fixture(scope="module") @@ -90,7 +91,9 @@ def test_multiseries_pipeline_fit( assert pipeline.frequency is not None +@pytest.mark.parametrize("include_series_id", [True, False]) def test_multiseries_pipeline_predict_in_sample( + include_series_id, multiseries_ts_data_stacked, component_graph, pipeline_parameters, @@ -111,6 +114,7 @@ def test_multiseries_pipeline_predict_in_sample( y_holdout, X_train=X_train, y_train=y_train, + include_series_id=include_series_id, ) expected = pd.Series( range(55, 65), @@ -118,7 +122,61 @@ def test_multiseries_pipeline_predict_in_sample( name="target", dtype="float64", ) - pd.testing.assert_series_equal(y_pred, expected) + if include_series_id: + expected = pd.concat([X_holdout["series_id"], expected], axis=1) + expected = infer_feature_types(expected) + pd.testing.assert_frame_equal(y_pred, expected) + else: + pd.testing.assert_series_equal(y_pred, expected) + + +@pytest.mark.parametrize("include_series_id", [True, False]) +def test_multiseries_pipeline_predict_in_sample_series_out_of_order( + include_series_id, + multiseries_ts_data_stacked, + component_graph, + pipeline_parameters, +): + X, y = multiseries_ts_data_stacked + X_train, X_holdout, y_train, y_holdout = split_multiseries_data( + X, + y, + "series_id", + "date", + ) + + 
diff --git a/evalml/tests/pipeline_tests/regression_pipeline_tests/test_multiseries_regression_pipeline.py b/evalml/tests/pipeline_tests/regression_pipeline_tests/test_multiseries_regression_pipeline.py
index b3fc28dbae..f092da76a7 100644
--- a/evalml/tests/pipeline_tests/regression_pipeline_tests/test_multiseries_regression_pipeline.py
+++ b/evalml/tests/pipeline_tests/regression_pipeline_tests/test_multiseries_regression_pipeline.py
@@ -8,6 +8,7 @@
 from evalml.pipelines import MultiseriesRegressionPipeline
 from evalml.pipelines.utils import unstack_multiseries
 from evalml.preprocessing import split_multiseries_data
+from evalml.utils import infer_feature_types


 @pytest.fixture(scope="module")
@@ -90,7 +91,9 @@
     assert pipeline.frequency is not None


+@pytest.mark.parametrize("include_series_id", [True, False])
 def test_multiseries_pipeline_predict_in_sample(
+    include_series_id,
     multiseries_ts_data_stacked,
     component_graph,
     pipeline_parameters,
@@ -111,6 +114,7 @@
         y_holdout,
         X_train=X_train,
         y_train=y_train,
+        include_series_id=include_series_id,
     )
     expected = pd.Series(
         range(55, 65),
@@ -118,7 +122,61 @@
         index=range(90, 100),
         name="target",
         dtype="float64",
     )
-    pd.testing.assert_series_equal(y_pred, expected)
+    if include_series_id:
+        expected = pd.concat([X_holdout["series_id"], expected], axis=1)
+        expected = infer_feature_types(expected)
+        pd.testing.assert_frame_equal(y_pred, expected)
+    else:
+        pd.testing.assert_series_equal(y_pred, expected)
+
+
+@pytest.mark.parametrize("include_series_id", [True, False])
+def test_multiseries_pipeline_predict_in_sample_series_out_of_order(
+    include_series_id,
+    multiseries_ts_data_stacked,
+    component_graph,
+    pipeline_parameters,
+):
+    X, y = multiseries_ts_data_stacked
+    X_train, X_holdout, y_train, y_holdout = split_multiseries_data(
+        X,
+        y,
+        "series_id",
+        "date",
+    )
+
+    # Reorder the holdout rows (within each date) but keep them ordered by date,
+    # and store the ordered series ID values to compare against the output later.
+    X_holdout_series_id = X_holdout["series_id"]
+    X_index = X_holdout.index
+    X_holdout = X_holdout.sample(frac=1).sort_values(by="date")
+    y_holdout = y_holdout.reindex(X_holdout.index)
+
+    X_holdout.index = X_index
+    y_holdout.index = X_index
+
+    pipeline = MultiseriesRegressionPipeline(component_graph, pipeline_parameters)
+    pipeline.fit(X_train, y_train)
+
+    y_pred = pipeline.predict_in_sample(
+        X_holdout,
+        y_holdout,
+        X_train=X_train,
+        y_train=y_train,
+        include_series_id=include_series_id,
+    )
+    expected = pd.Series(
+        range(55, 65),
+        index=range(90, 100),
+        name="target",
+        dtype="float64",
+    )
+    if include_series_id:
+        expected = pd.concat([X_holdout_series_id, expected], axis=1)
+        expected = infer_feature_types(expected)
+        pd.testing.assert_frame_equal(y_pred, expected)
+    else:
+        pd.testing.assert_series_equal(y_pred, expected)


 @pytest.mark.parametrize("forecast_horizon", [1, 7])
diff --git a/evalml/tests/pipeline_tests/test_pipeline_utils.py b/evalml/tests/pipeline_tests/test_pipeline_utils.py
index db6de1a9d0..5a9d4b163e 100644
--- a/evalml/tests/pipeline_tests/test_pipeline_utils.py
+++ b/evalml/tests/pipeline_tests/test_pipeline_utils.py
@@ -43,6 +43,7 @@
     handle_component_class,
 )
 from evalml.pipelines.utils import (
+    MULTISERIES_SEPARATOR_SYMBOL,
     _get_pipeline_base_class,
     _get_preprocessing_components,
     _make_pipeline_from_multiple_graphs,
@@ -1404,7 +1405,8 @@ def test_unstack_multiseries(
     X_unstacked, y_unstacked = multiseries_ts_data_unstacked
     y.name = target_name
     y_unstacked.columns = [
-        f"{target_name}_{i}" for i in range(len(y_unstacked.columns))
+        f"{target_name}{MULTISERIES_SEPARATOR_SYMBOL}{i}"
+        for i in range(len(y_unstacked.columns))
     ]

     X_unstacked_transformed, y_unstacked_transformed = unstack_multiseries(
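For completeness, the feature-matrix counterpart touched above, `stack_X`, can be sketched the same way. Its trailing parameters are cut off in the hunk header, so this sketch relies only on the three leading arguments visible there, and the printed result is an expectation rather than verified output:

import pandas as pd

from evalml.pipelines.utils import stack_X

# Wide X shaped like the updated multiseries_ts_data_unstacked fixture.
X_unstacked = pd.DataFrame(
    {
        "feature_a|0": range(0, 100, 5),
        "feature_a|1": range(1, 100, 5),
        "date": pd.date_range(start="1/1/2018", periods=20),
    },
)

# Long X: "feature_a" is recovered by splitting column names on "|".
X_stacked = stack_X(X_unstacked, "series_id", "date")
print(sorted(X_stacked.columns))  # expected: ["date", "feature_a", "series_id"]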