Skip to content

Commit

Permalink
Initial commit
Browse files (browse the repository at this point in the history)
  • Loading branch information
christopherbunn committed Aug 21, 2023
1 parent 7781c77 commit 5351b88
Show file tree
Hide file tree
Showing 4 changed files with 75 additions and 11 deletions.
43 changes: 35 additions & 8 deletions evalml/pipelines/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -1401,6 +1401,7 @@ def unstack_multiseries(
# Perform the unstacking
X_unstacked_cols = []
y_unstacked_cols = []
new_time_index = None
for s_id in series_id_unique:
single_series = full_dataset[full_dataset[series_id] == s_id]

Expand All @@ -1417,8 +1418,11 @@ def unstack_multiseries(
X_unstacked_cols.append(new_column)

# Concatenate all the single series to reform dataframes
X_unstacked = pd.concat(X_unstacked_cols, axis=1)
y_unstacked = pd.concat(y_unstacked_cols, axis=1)
if len(X_unstacked_cols) == 0:
X_unstacked = pd.DataFrame(index=y_unstacked.index)
else:
X_unstacked = pd.concat(X_unstacked_cols, axis=1)

# Reset the axes now that they've been unstacked, keep time info in X
X_unstacked = X_unstacked.reset_index()
Expand Down Expand Up @@ -1477,7 +1481,7 @@ def stack_data(data, include_series_id=False, series_id_name=None, starting_inde
return stacked_series


def stack_X(X, series_id_name, time_index, starting_index=None):
def stack_X(X, series_id_name, time_index, starting_index=None, series_id_values=None):
"""Restacks the unstacked features into a single DataFrame.
Args:
Expand All @@ -1486,21 +1490,29 @@ def stack_X(X, series_id_name, time_index, starting_index=None):
time_index (str): The name of the time index column.
starting_index (int): The starting index to use for the stacked DataFrame. If None, the starting index
will match that of the input data. Defaults to None.
series_id_values (set): The unique values of a series ID, used to generate the index. If None, values will
be generated from X column values. Defaults to None.
Returns:
pd.DataFrame: The restacked features.
"""
original_columns = set()
series_ids = set()
series_ids = series_id_values or set()
for col in X.columns:
if col == time_index:
continue
separated_name = col.split("_")
original_columns.add("_".join(separated_name[:-1]))
series_ids.add(separated_name[-1])
if series_id_values is None:
series_ids.add(separated_name[-1])

restacked_X = []

if len(series_ids) == 0:
raise ValueError(
"Unable to stack X as X had no exogenous variables and `series_id_values` is None.",
)

for i, original_col in enumerate(original_columns):
# Only include the series id once (for the first column)
include_series_id = i == 0
Expand All @@ -1513,10 +1525,25 @@ def stack_X(X, series_id_name, time_index, starting_index=None):
starting_index=starting_index,
),
)
restacked_X = pd.concat(restacked_X, axis=1)

time_index_col = X[time_index].repeat(len(series_ids)).reset_index(drop=True)
time_index_col.index = restacked_X.index
restacked_X[time_index] = time_index_col

if len(restacked_X) == 0:
start_index = starting_index or X.index[0]
stacked_index = pd.RangeIndex(
start=start_index,
stop=start_index + len(time_index_col),
)
time_index_col.index = stacked_index
restacked_X = pd.DataFrame(
{
time_index: time_index_col,
series_id_name: sorted(list(series_ids)) * len(X),
},
index=stacked_index,
)
else:
restacked_X = pd.concat(restacked_X, axis=1)
time_index_col.index = restacked_X.index
restacked_X[time_index] = time_index_col

return restacked_X
12 changes: 11 additions & 1 deletion evalml/preprocessing/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,12 +72,22 @@ def split_multiseries_data(X, y, series_id, time_index, **kwargs):
X_unstacked, y_unstacked, problem_type="time series regression", **kwargs
)

X_train = stack_X(X_train_unstacked, series_id, time_index)
# Collect the unique series ID values from X when the unstacked X holds only the time_index column (no features).
# Otherwise, `stack_X` derives the series IDs itself from the suffixes of the unstacked column names.
series_id_values = set(X[series_id]) if len(X_unstacked.columns) == 1 else None

X_train = stack_X(
X_train_unstacked,
series_id,
time_index,
series_id_values=series_id_values,
)
X_holdout = stack_X(
X_holdout_unstacked,
series_id,
time_index,
starting_index=X_train.index[-1] + 1,
series_id_values=series_id_values,
)
y_train = stack_data(y_train_unstacked)
y_holdout = stack_data(y_holdout_unstacked, starting_index=y_train.index[-1] + 1)
Expand Down
25 changes: 24 additions & 1 deletion evalml/tests/pipeline_tests/test_pipeline_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -1472,9 +1472,11 @@ def test_stack_data_noop():
pd.testing.assert_series_equal(stack_data(series_y), series_y)


@pytest.mark.parametrize("no_features", [True, False])
@pytest.mark.parametrize("starting_index", [None, 1, 132])
def test_stack_X(
starting_index,
no_features,
multiseries_ts_data_stacked,
multiseries_ts_data_unstacked,
):
Expand All @@ -1484,7 +1486,28 @@ def test_stack_X(
if starting_index is not None:
X_expected.index = X_expected.index + starting_index

X_transformed = stack_X(X, "series_id", "date", starting_index=starting_index)
if no_features:
series_id_values = set(str(i) for i in range(0, 5))
X = pd.DataFrame(X["date"])
X_expected = X_expected[["date", "series_id"]]

with pytest.raises(
ValueError,
match="Unable to stack X as X had no exogenous variables and `series_id_values` is None.",
):
stack_X(X, "series_id", "date", starting_index=starting_index)

X_transformed = stack_X(
X,
"series_id",
"date",
starting_index=starting_index,
series_id_values=series_id_values,
)

else:
X_transformed = stack_X(X, "series_id", "date", starting_index=starting_index)

pd.testing.assert_frame_equal(
X_expected.sort_index(axis=1),
X_transformed.sort_index(axis=1),
Expand Down
6 changes: 5 additions & 1 deletion evalml/tests/preprocessing_tests/test_split_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,9 +127,13 @@ def test_split_data_ts(test, X_y_regression):
assert len(y_test) == test_size


def test_split_multiseries_data(multiseries_ts_data_stacked):
@pytest.mark.parametrize("no_features", [True, False])
def test_split_multiseries_data(no_features, multiseries_ts_data_stacked):
X, y = multiseries_ts_data_stacked

if no_features:
X = X[["date", "series_id"]]

X_train_expected, X_holdout_expected = X[:-10], X[-10:]
y_train_expected, y_holdout_expected = y[:-10], y[-10:]

Expand Down

0 comments on commit 5351b88

Please sign in to comment.