diff --git a/evalml/pipelines/multiseries_regression_pipeline.py b/evalml/pipelines/multiseries_regression_pipeline.py index d5467baa34..1009ab8497 100644 --- a/evalml/pipelines/multiseries_regression_pipeline.py +++ b/evalml/pipelines/multiseries_regression_pipeline.py @@ -6,6 +6,7 @@ TimeSeriesRegressionPipeline, ) from evalml.problem_types import ProblemTypes +from evalml.utils import infer_feature_types class MultiseriesRegressionPipeline(TimeSeriesRegressionPipeline): @@ -91,6 +92,7 @@ def predict_in_sample( y_train, objective=None, calculating_residuals=False, + include_series_id=False, ): """Predict on future data where the target is known, e.g. cross validation. @@ -102,6 +104,7 @@ def predict_in_sample( objective (ObjectiveBase, str, None): Objective used to threshold predicted probabilities, optional. calculating_residuals (bool): Whether we're calling predict_in_sample to calculate the residuals. This means the X and y arguments are not future data, but actually the train data. + include_series_id (bool): If true, include the series ID value in the prediction results Returns: pd.Series: Estimated labels. @@ -125,6 +128,31 @@ def predict_in_sample( self.time_index, self.input_target_name, ) + # Order series columns to be same as expected input feature names + input_features = list(self.input_feature_names.values())[0] + X_unstacked = X_unstacked[ + [feature for feature in input_features if feature in X_unstacked.columns] + ] + X_train_unstacked = X_train_unstacked[ + [ + feature + for feature in input_features + if feature in X_train_unstacked.columns + ] + ] + y_overlapping_features = [ + feature + for feature in y_train_unstacked.columns + if feature in y_unstacked.columns + ] + y_unstacked = y_unstacked[y_overlapping_features] + y_train_unstacked = y_train_unstacked[y_overlapping_features] + + X_train_unstacked = infer_feature_types(X_train_unstacked) + y_train_unstacked = infer_feature_types(y_train_unstacked) + X_unstacked = infer_feature_types(X_unstacked) + y_unstacked = infer_feature_types(y_unstacked) + unstacked_predictions = super().predict_in_sample( X_unstacked, y_unstacked, @@ -133,7 +161,14 @@ def predict_in_sample( objective, calculating_residuals, ) - stacked_predictions = stack_data(unstacked_predictions) + if include_series_id: + stacked_predictions = stack_data( + unstacked_predictions, + include_series_id=True, + series_id_name=self.series_id, + ) + else: + stacked_predictions = stack_data(unstacked_predictions) # Index will start at the unstacked index, so we need to reset it to the original index stacked_predictions.index = X.index diff --git a/evalml/pipelines/time_series_regression_pipeline.py b/evalml/pipelines/time_series_regression_pipeline.py index 15a3b53fca..92d0fda813 100644 --- a/evalml/pipelines/time_series_regression_pipeline.py +++ b/evalml/pipelines/time_series_regression_pipeline.py @@ -225,7 +225,11 @@ def _get_series_intervals(intervals, residuals, trend_pred_intervals, y): return return_intervals if self.problem_type == ProblemTypes.MULTISERIES_TIME_SERIES_REGRESSION: - from evalml.pipelines.utils import stack_data, unstack_multiseries + from evalml.pipelines.utils import ( + MULTISERIES_SEPARATOR_SYMBOL, + stack_data, + unstack_multiseries, + ) X, y = unstack_multiseries( X, @@ -268,7 +272,9 @@ def _get_series_intervals(intervals, residuals, trend_pred_intervals, y): # `pred_intervals` are in {series_id: {coverage_label: bound_value}} form for series_id, series_intervals in pred_intervals.items(): series_id_target_name = ( - self.input_target_name + "_" + str(series_id) + self.input_target_name + + MULTISERIES_SEPARATOR_SYMBOL + + str(series_id) ) series_id_prediction_intervals = _get_series_intervals( series_intervals, diff --git a/evalml/pipelines/utils.py b/evalml/pipelines/utils.py index e23998096d..4051a2324f 100644 --- a/evalml/pipelines/utils.py +++ b/evalml/pipelines/utils.py @@ -76,6 +76,7 @@ from evalml.utils.gen_utils import contains_all_ts_parameters DECOMPOSER_PERIOD_CAP = 1000 +MULTISERIES_SEPARATOR_SYMBOL = "|" def _get_label_encoder(X, y, problem_type, estimator_class, sampler_name=None): @@ -1418,7 +1419,7 @@ def unstack_multiseries( for column_name in full_dataset.columns.drop([time_index, series_id]): new_column = single_series[column_name] new_column.index = new_time_index - new_column.name = f"{column_name}_{s_id}" + new_column.name = f"{column_name}{MULTISERIES_SEPARATOR_SYMBOL}{s_id}" if column_name == target_name: y_unstacked_cols.append(new_column) @@ -1435,11 +1436,15 @@ def unstack_multiseries( # Reset the axes now that they've been unstacked, keep time info in X X_unstacked = X_unstacked.reset_index() y_unstacked = y_unstacked.reset_index(drop=True) - return X_unstacked, y_unstacked -def stack_data(data, include_series_id=False, series_id_name=None, starting_index=None): +def stack_data( + data, + include_series_id=False, + series_id_name=None, + starting_index=None, +): """Stacks the given DataFrame back into a single Series, or a DataFrame if include_series_id is True. Should only be used for data that is expected to be a single series. To stack multiple unstacked columns, @@ -1464,7 +1469,9 @@ def stack_data(data, include_series_id=False, series_id_name=None, starting_inde # Extract the original column name series_id_with_name = stacked_series.index.droplevel() - stacked_series.name = "_".join(series_id_with_name[0].split("_")[:-1]) + stacked_series.name = "".join( + series_id_with_name[0].split(MULTISERIES_SEPARATOR_SYMBOL)[:-1], + ) # If the index is the time index, keep it if not data.index.is_numeric() and starting_index is None: @@ -1481,11 +1488,14 @@ def stack_data(data, include_series_id=False, series_id_name=None, starting_inde # Pull out the series id information, if requested if include_series_id: series_id_col = pd.Series( - series_id_with_name.map(lambda col_name: col_name.split("_")[-1]), + series_id_with_name.map( + lambda col_name: col_name.split(MULTISERIES_SEPARATOR_SYMBOL)[-1], + ), name=series_id_name or "series_id", index=stacked_series.index, ) stacked_series = pd.concat([series_id_col, stacked_series], axis=1) + return stacked_series @@ -1511,8 +1521,8 @@ def stack_X(X, series_id_name, time_index, starting_index=None, series_id_values for col in X.columns: if col == time_index: continue - separated_name = col.split("_") - original_columns.add("_".join(separated_name[:-1])) + separated_name = col.split(MULTISERIES_SEPARATOR_SYMBOL) + original_columns.add(MULTISERIES_SEPARATOR_SYMBOL.join(separated_name[:-1])) series_ids.add(separated_name[-1]) if len(series_ids) == 0: diff --git a/evalml/tests/component_tests/test_time_series_featurizer.py b/evalml/tests/component_tests/test_time_series_featurizer.py index 0458d8cfd0..703132b4a9 100644 --- a/evalml/tests/component_tests/test_time_series_featurizer.py +++ b/evalml/tests/component_tests/test_time_series_featurizer.py @@ -15,6 +15,7 @@ ) from evalml.pipelines import TimeSeriesFeaturizer +from evalml.pipelines.utils import MULTISERIES_SEPARATOR_SYMBOL ROLLING_TRANSFORM_METHOD_NAME = "_compute_rolling_transforms" DELAYED_FEATURES_METHOD_NAME = "_compute_delays" @@ -991,7 +992,9 @@ def test_featurizer_y_dataframe(multiseries_ts_data_unstacked): assert featurizer.statistically_significant_lags == [6] - expected_y_cols = [f"target_{i}_delay_6" for i in range(y.shape[1])] + expected_y_cols = [ + f"target{MULTISERIES_SEPARATOR_SYMBOL}{i}_delay_6" for i in range(y.shape[1]) + ] X_t = featurizer.transform(X, y) for expected_y_col in expected_y_cols: assert expected_y_col in X_t.columns diff --git a/evalml/tests/component_tests/test_time_series_imputer.py b/evalml/tests/component_tests/test_time_series_imputer.py index 20ba00823b..137752f496 100644 --- a/evalml/tests/component_tests/test_time_series_imputer.py +++ b/evalml/tests/component_tests/test_time_series_imputer.py @@ -11,6 +11,7 @@ ) from evalml.pipelines.components import TimeSeriesImputer +from evalml.pipelines.utils import MULTISERIES_SEPARATOR_SYMBOL def test_invalid_strategy_parameters(): @@ -745,7 +746,12 @@ def test_time_series_imputer_multiseries( _, y_imputed = imputer.transform(X, y) assert isinstance(y_imputed, pd.DataFrame) - y_expected = pd.DataFrame({f"target_{i}": range(i, 100, 5) for i in range(5)}) + y_expected = pd.DataFrame( + { + f"target{MULTISERIES_SEPARATOR_SYMBOL}{i}": range(i, 100, 5) + for i in range(5) + }, + ) assert_frame_equal(y_imputed, y_expected, check_dtype=False) @@ -777,7 +783,10 @@ def test_time_series_imputer_multiseries_some_columns_all_nan( _, y_imputed = imputer.transform(X, y) y_expected = pd.DataFrame( - {f"target_{i}": range(i, 100, 5) for i in range(num_nan_cols, 5)}, + { + f"target{MULTISERIES_SEPARATOR_SYMBOL}{i}": range(i, 100, 5) + for i in range(num_nan_cols, 5) + }, ) assert_frame_equal(y_imputed, y_expected, check_dtype=False) diff --git a/evalml/tests/conftest.py b/evalml/tests/conftest.py index 3440e5ec91..80036d0704 100644 --- a/evalml/tests/conftest.py +++ b/evalml/tests/conftest.py @@ -1094,12 +1094,27 @@ def multiseries_ts_data_stacked(): @pytest.fixture def multiseries_ts_data_unstacked(): - feature_a = pd.DataFrame({f"feature_a_{i}": range(i, 100, 5) for i in range(5)}) + from evalml.pipelines.utils import MULTISERIES_SEPARATOR_SYMBOL + + feature_a = pd.DataFrame( + { + f"feature_a{MULTISERIES_SEPARATOR_SYMBOL}{i}": range(i, 100, 5) + for i in range(5) + }, + ) feature_b = pd.DataFrame( - {f"feature_b_{i}": range(99 - i, -1, -5) for i in range(5)}, + { + f"feature_b{MULTISERIES_SEPARATOR_SYMBOL}{i}": range(99 - i, -1, -5) + for i in range(5) + }, ) X = pd.concat([feature_a, feature_b], axis=1) - y = pd.DataFrame({f"target_{i}": range(i, 100, 5) for i in range(5)}) + y = pd.DataFrame( + { + f"target{MULTISERIES_SEPARATOR_SYMBOL}{i}": range(i, 100, 5) + for i in range(5) + }, + ) X["date"] = pd.date_range(start="1/1/2018", periods=20) return X, y diff --git a/evalml/tests/pipeline_tests/regression_pipeline_tests/test_multiseries_regression_pipeline.py b/evalml/tests/pipeline_tests/regression_pipeline_tests/test_multiseries_regression_pipeline.py index b3fc28dbae..73e3163af0 100644 --- a/evalml/tests/pipeline_tests/regression_pipeline_tests/test_multiseries_regression_pipeline.py +++ b/evalml/tests/pipeline_tests/regression_pipeline_tests/test_multiseries_regression_pipeline.py @@ -90,7 +90,9 @@ def test_multiseries_pipeline_fit( assert pipeline.frequency is not None +@pytest.mark.parametrize("include_series_id", [True, False]) def test_multiseries_pipeline_predict_in_sample( + include_series_id, multiseries_ts_data_stacked, component_graph, pipeline_parameters, @@ -111,6 +113,7 @@ def test_multiseries_pipeline_predict_in_sample( y_holdout, X_train=X_train, y_train=y_train, + include_series_id=include_series_id, ) expected = pd.Series( range(55, 65), @@ -118,7 +121,11 @@ def test_multiseries_pipeline_predict_in_sample( name="target", dtype="float64", ) - pd.testing.assert_series_equal(y_pred, expected) + if include_series_id: + expected = pd.concat([X_holdout["series_id"], expected], axis=1) + pd.testing.assert_frame_equal(y_pred, expected) + else: + pd.testing.assert_series_equal(y_pred, expected) @pytest.mark.parametrize("forecast_horizon", [1, 7]) diff --git a/evalml/tests/pipeline_tests/test_pipeline_utils.py b/evalml/tests/pipeline_tests/test_pipeline_utils.py index db6de1a9d0..5a9d4b163e 100644 --- a/evalml/tests/pipeline_tests/test_pipeline_utils.py +++ b/evalml/tests/pipeline_tests/test_pipeline_utils.py @@ -43,6 +43,7 @@ handle_component_class, ) from evalml.pipelines.utils import ( + MULTISERIES_SEPARATOR_SYMBOL, _get_pipeline_base_class, _get_preprocessing_components, _make_pipeline_from_multiple_graphs, @@ -1404,7 +1405,8 @@ def test_unstack_multiseries( X_unstacked, y_unstacked = multiseries_ts_data_unstacked y.name = target_name y_unstacked.columns = [ - f"{target_name}_{i}" for i in range(len(y_unstacked.columns)) + f"{target_name}{MULTISERIES_SEPARATOR_SYMBOL}{i}" + for i in range(len(y_unstacked.columns)) ] X_unstacked_transformed, y_unstacked_transformed = unstack_multiseries(