diff --git a/evalml/pipelines/utils.py b/evalml/pipelines/utils.py index 7e61999f25..c2575ec781 100644 --- a/evalml/pipelines/utils.py +++ b/evalml/pipelines/utils.py @@ -1401,6 +1401,7 @@ def unstack_multiseries( # Perform the unstacking X_unstacked_cols = [] y_unstacked_cols = [] + new_time_index = None for s_id in series_id_unique: single_series = full_dataset[full_dataset[series_id] == s_id] @@ -1417,8 +1418,11 @@ def unstack_multiseries( X_unstacked_cols.append(new_column) # Concatenate all the single series to reform dataframes - X_unstacked = pd.concat(X_unstacked_cols, axis=1) y_unstacked = pd.concat(y_unstacked_cols, axis=1) + if len(X_unstacked_cols) == 0: + X_unstacked = pd.DataFrame(index=y_unstacked.index) + else: + X_unstacked = pd.concat(X_unstacked_cols, axis=1) # Reset the axes now that they've been unstacked, keep time info in X X_unstacked = X_unstacked.reset_index() @@ -1477,7 +1481,7 @@ def stack_data(data, include_series_id=False, series_id_name=None, starting_inde return stacked_series -def stack_X(X, series_id_name, time_index, starting_index=None): +def stack_X(X, series_id_name, time_index, starting_index=None, series_id_values=None): """Restacks the unstacked features into a single DataFrame. Args: @@ -1486,21 +1490,29 @@ def stack_X(X, series_id_name, time_index, starting_index=None): time_index (str): The name of the time index column. starting_index (int): The starting index to use for the stacked DataFrame. If None, the starting index will match that of the input data. Defaults to None. + series_id_values (set): The unique values of a series ID, used to generate the index. If None, values will + be generated from X column values. Defaults to None. Returns: pd.DataFrame: The restacked features. """ original_columns = set() - series_ids = set() + series_ids = series_id_values or set() for col in X.columns: if col == time_index: continue separated_name = col.split("_") original_columns.add("_".join(separated_name[:-1])) - series_ids.add(separated_name[-1]) + if series_id_values is None: + series_ids.add(separated_name[-1]) restacked_X = [] + if len(series_ids) == 0: + raise ValueError( + "Unable to stack X as X had no exogenous variables and `series_id_values` is None.", + ) + for i, original_col in enumerate(original_columns): # Only include the series id once (for the first column) include_series_id = i == 0 @@ -1513,10 +1525,25 @@ def stack_X(X, series_id_name, time_index, starting_index=None): starting_index=starting_index, ), ) - restacked_X = pd.concat(restacked_X, axis=1) - time_index_col = X[time_index].repeat(len(series_ids)).reset_index(drop=True) - time_index_col.index = restacked_X.index - restacked_X[time_index] = time_index_col + + if len(restacked_X) == 0: + start_index = starting_index or X.index[0] + stacked_index = pd.RangeIndex( + start=start_index, + stop=start_index + len(time_index_col), + ) + time_index_col.index = stacked_index + restacked_X = pd.DataFrame( + { + time_index: time_index_col, + series_id_name: sorted(list(series_ids)) * len(X), + }, + index=stacked_index, + ) + else: + restacked_X = pd.concat(restacked_X, axis=1) + time_index_col.index = restacked_X.index + restacked_X[time_index] = time_index_col return restacked_X diff --git a/evalml/preprocessing/utils.py b/evalml/preprocessing/utils.py index 6b3d656d4c..6e7c203611 100644 --- a/evalml/preprocessing/utils.py +++ b/evalml/preprocessing/utils.py @@ -72,12 +72,22 @@ def split_multiseries_data(X, y, series_id, time_index, **kwargs): X_unstacked, y_unstacked, problem_type="time series regression", **kwargs ) - X_train = stack_X(X_train_unstacked, series_id, time_index) + # Get unique series value from X if there is only the time_index column + # Otherwise, this information is generated in `stack_X` from the column values + series_id_values = set(X[series_id]) if len(X_unstacked.columns) == 1 else None + + X_train = stack_X( + X_train_unstacked, + series_id, + time_index, + series_id_values=series_id_values, + ) X_holdout = stack_X( X_holdout_unstacked, series_id, time_index, starting_index=X_train.index[-1] + 1, + series_id_values=series_id_values, ) y_train = stack_data(y_train_unstacked) y_holdout = stack_data(y_holdout_unstacked, starting_index=y_train.index[-1] + 1) diff --git a/evalml/tests/pipeline_tests/test_pipeline_utils.py b/evalml/tests/pipeline_tests/test_pipeline_utils.py index cbc963aa57..95bbf31473 100644 --- a/evalml/tests/pipeline_tests/test_pipeline_utils.py +++ b/evalml/tests/pipeline_tests/test_pipeline_utils.py @@ -1472,9 +1472,11 @@ def test_stack_data_noop(): pd.testing.assert_series_equal(stack_data(series_y), series_y) +@pytest.mark.parametrize("no_features", [True, False]) @pytest.mark.parametrize("starting_index", [None, 1, 132]) def test_stack_X( starting_index, + no_features, multiseries_ts_data_stacked, multiseries_ts_data_unstacked, ): @@ -1484,7 +1486,28 @@ def test_stack_X( if starting_index is not None: X_expected.index = X_expected.index + starting_index - X_transformed = stack_X(X, "series_id", "date", starting_index=starting_index) + if no_features: + series_id_values = set(str(i) for i in range(0, 5)) + X = pd.DataFrame(X["date"]) + X_expected = X_expected[["date", "series_id"]] + + with pytest.raises( + ValueError, + match="Unable to stack X as X had no exogenous variables and `series_id_values` is None.", + ): + stack_X(X, "series_id", "date", starting_index=starting_index) + + X_transformed = stack_X( + X, + "series_id", + "date", + starting_index=starting_index, + series_id_values=series_id_values, + ) + + else: + X_transformed = stack_X(X, "series_id", "date", starting_index=starting_index) + pd.testing.assert_frame_equal( X_expected.sort_index(axis=1), X_transformed.sort_index(axis=1), diff --git a/evalml/tests/preprocessing_tests/test_split_data.py b/evalml/tests/preprocessing_tests/test_split_data.py index 9403862ef4..cbb8c941ed 100644 --- a/evalml/tests/preprocessing_tests/test_split_data.py +++ b/evalml/tests/preprocessing_tests/test_split_data.py @@ -127,9 +127,13 @@ def test_split_data_ts(test, X_y_regression): assert len(y_test) == test_size -def test_split_multiseries_data(multiseries_ts_data_stacked): +@pytest.mark.parametrize("no_features", [True, False]) +def test_split_multiseries_data(no_features, multiseries_ts_data_stacked): X, y = multiseries_ts_data_stacked + if no_features: + X = X[["date", "series_id"]] + X_train_expected, X_holdout_expected = X[:-10], X[-10:] y_train_expected, y_holdout_expected = y[:-10], y[-10:]