Skip to content

Commit

Permalink
Fixed error when stacking data with no exogenous variables (#4275)
Browse files Browse the repository at this point in the history
* Initial commit

* Updated release notes

* Refactored code structure.

* Updated error message and docstring

* Final nits
  • Loading branch information
christopherbunn authored Aug 21, 2023
1 parent 7781c77 commit 53bd61b
Show file tree
Hide file tree
Showing 5 changed files with 96 additions and 29 deletions.
1 change: 1 addition & 0 deletions docs/source/release_notes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ Release Notes
* Added support for prediction intervals for VARMAX regressor :pr:`4267`
* Integrated multiseries time series into AutoMLSearch :pr:`4270`
* Fixes
* Fixed error when stacking data with no exogenous variables :pr:`4275`
* Changes
* Updated ``ARIMARegressor`` to be compatible with sktime v0.22.0 and beyond :pr:`4283`
* Documentation Changes
Expand Down
79 changes: 53 additions & 26 deletions evalml/pipelines/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -1417,8 +1417,11 @@ def unstack_multiseries(
X_unstacked_cols.append(new_column)

# Concatenate all the single series to reform dataframes
X_unstacked = pd.concat(X_unstacked_cols, axis=1)
y_unstacked = pd.concat(y_unstacked_cols, axis=1)
if len(X_unstacked_cols) == 0:
X_unstacked = pd.DataFrame(index=y_unstacked.index)
else:
X_unstacked = pd.concat(X_unstacked_cols, axis=1)

# Reset the axes now that they've been unstacked, keep time info in X
X_unstacked = X_unstacked.reset_index()
Expand Down Expand Up @@ -1477,7 +1480,7 @@ def stack_data(data, include_series_id=False, series_id_name=None, starting_inde
return stacked_series


def stack_X(X, series_id_name, time_index, starting_index=None):
def stack_X(X, series_id_name, time_index, starting_index=None, series_id_values=None):
"""Restacks the unstacked features into a single DataFrame.
Args:
Expand All @@ -1486,37 +1489,61 @@ def stack_X(X, series_id_name, time_index, starting_index=None):
time_index (str): The name of the time index column.
starting_index (int): The starting index to use for the stacked DataFrame. If None, the starting index
will match that of the input data. Defaults to None.
series_id_values (set, list): The unique series ID values, used to build the series ID column and to repeat the time index
once per series. If None, the values are inferred from the suffixes of the X column names. Required if X only
contains the time index column and no exogenous variables. Defaults to None.
Returns:
pd.DataFrame: The restacked features.
"""
original_columns = set()
series_ids = set()
for col in X.columns:
if col == time_index:
continue
separated_name = col.split("_")
original_columns.add("_".join(separated_name[:-1]))
series_ids.add(separated_name[-1])

restacked_X = []

for i, original_col in enumerate(original_columns):
# Only include the series id once (for the first column)
include_series_id = i == 0
subset_X = [col for col in X.columns if original_col in col]
restacked_X.append(
stack_data(
X[subset_X],
include_series_id=include_series_id,
series_id_name=series_id_name,
starting_index=starting_index,
),
series_ids = series_id_values or set()
if series_id_values is None:
for col in X.columns:
if col == time_index:
continue
separated_name = col.split("_")
original_columns.add("_".join(separated_name[:-1]))
series_ids.add(separated_name[-1])

if len(series_ids) == 0:
raise ValueError(
"Series ID values need to be passed in X column values or as a set with the `series_id_values` parameter.",
)
restacked_X = pd.concat(restacked_X, axis=1)

time_index_col = X[time_index].repeat(len(series_ids)).reset_index(drop=True)
time_index_col.index = restacked_X.index
restacked_X[time_index] = time_index_col

if len(original_columns) == 0:
start_index = starting_index or X.index[0]
stacked_index = pd.RangeIndex(
start=start_index,
stop=start_index + len(time_index_col),
)
time_index_col.index = stacked_index
restacked_X = pd.DataFrame(
{
time_index: time_index_col,
series_id_name: sorted(list(series_ids)) * len(X),
},
index=stacked_index,
)
else:
restacked_X = []
for i, original_col in enumerate(original_columns):
# Only include the series id once (for the first column)
include_series_id = i == 0
subset_X = [col for col in X.columns if original_col in col]
restacked_X.append(
stack_data(
X[subset_X],
include_series_id=include_series_id,
series_id_name=series_id_name,
starting_index=starting_index,
),
)

restacked_X = pd.concat(restacked_X, axis=1)
time_index_col.index = restacked_X.index
restacked_X[time_index] = time_index_col

return restacked_X
12 changes: 11 additions & 1 deletion evalml/preprocessing/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,12 +72,22 @@ def split_multiseries_data(X, y, series_id, time_index, **kwargs):
X_unstacked, y_unstacked, problem_type="time series regression", **kwargs
)

X_train = stack_X(X_train_unstacked, series_id, time_index)
# Get the unique series ID values from the stacked X if the unstacked X only has the time_index column.
# Otherwise, `stack_X` infers the series ID values from the unstacked column names.
series_id_values = set(X[series_id]) if len(X_unstacked.columns) == 1 else None

X_train = stack_X(
X_train_unstacked,
series_id,
time_index,
series_id_values=series_id_values,
)
X_holdout = stack_X(
X_holdout_unstacked,
series_id,
time_index,
starting_index=X_train.index[-1] + 1,
series_id_values=series_id_values,
)
y_train = stack_data(y_train_unstacked)
y_holdout = stack_data(y_holdout_unstacked, starting_index=y_train.index[-1] + 1)
Expand Down
27 changes: 26 additions & 1 deletion evalml/tests/pipeline_tests/test_pipeline_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -1472,9 +1472,13 @@ def test_stack_data_noop():
pd.testing.assert_series_equal(stack_data(series_y), series_y)


@pytest.mark.parametrize("series_id_values_type", [set, list])
@pytest.mark.parametrize("no_features", [True, False])
@pytest.mark.parametrize("starting_index", [None, 1, 132])
def test_stack_X(
starting_index,
no_features,
series_id_values_type,
multiseries_ts_data_stacked,
multiseries_ts_data_unstacked,
):
Expand All @@ -1484,7 +1488,28 @@ def test_stack_X(
if starting_index is not None:
X_expected.index = X_expected.index + starting_index

X_transformed = stack_X(X, "series_id", "date", starting_index=starting_index)
if no_features:
series_id_values = series_id_values_type(str(i) for i in range(0, 5))
X = pd.DataFrame(X["date"])
X_expected = X_expected[["date", "series_id"]]

with pytest.raises(
ValueError,
match="Series ID values need to be passed in X column values or as a set with the `series_id_values` parameter.",
):
stack_X(X, "series_id", "date", starting_index=starting_index)

X_transformed = stack_X(
X,
"series_id",
"date",
starting_index=starting_index,
series_id_values=series_id_values,
)

else:
X_transformed = stack_X(X, "series_id", "date", starting_index=starting_index)

pd.testing.assert_frame_equal(
X_expected.sort_index(axis=1),
X_transformed.sort_index(axis=1),
Expand Down
6 changes: 5 additions & 1 deletion evalml/tests/preprocessing_tests/test_split_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,9 +127,13 @@ def test_split_data_ts(test, X_y_regression):
assert len(y_test) == test_size


def test_split_multiseries_data(multiseries_ts_data_stacked):
@pytest.mark.parametrize("no_features", [True, False])
def test_split_multiseries_data(no_features, multiseries_ts_data_stacked):
X, y = multiseries_ts_data_stacked

if no_features:
X = X[["date", "series_id"]]

X_train_expected, X_holdout_expected = X[:-10], X[-10:]
y_train_expected, y_holdout_expected = y[:-10], y[-10:]

Expand Down

0 comments on commit 53bd61b

Please sign in to comment.