Skip to content

Commit

Permalink
Try extra debug
Browse files: browse the repository at this point in the history
  • Loading branch information
machineFL authored and christopherbunn committed Oct 31, 2023
1 parent 78dc5b8 commit 3048fa5
Show file tree
Hide file tree
Showing 9 changed files with 106 additions and 19 deletions.
37 changes: 36 additions & 1 deletion evalml/pipelines/multiseries_regression_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
TimeSeriesRegressionPipeline,
)
from evalml.problem_types import ProblemTypes
from evalml.utils import infer_feature_types


class MultiseriesRegressionPipeline(TimeSeriesRegressionPipeline):
Expand Down Expand Up @@ -91,6 +92,7 @@ def predict_in_sample(
y_train,
objective=None,
calculating_residuals=False,
include_series_id=False,
):
"""Predict on future data where the target is known, e.g. cross validation.
Expand All @@ -102,6 +104,7 @@ def predict_in_sample(
objective (ObjectiveBase, str, None): Objective used to threshold predicted probabilities, optional.
calculating_residuals (bool): Whether we're calling predict_in_sample to calculate the residuals. This means
the X and y arguments are not future data, but actually the train data.
include_series_id (bool): If True, include the series ID value in the prediction results. Defaults to False.
Returns:
pd.Series: Estimated labels.
Expand All @@ -125,6 +128,31 @@ def predict_in_sample(
self.time_index,
self.input_target_name,
)
# Order series columns to be same as expected input feature names
input_features = list(self.input_feature_names.values())[0]
X_unstacked = X_unstacked[
[feature for feature in input_features if feature in X_unstacked.columns]
]
X_train_unstacked = X_train_unstacked[
[
feature
for feature in input_features
if feature in X_train_unstacked.columns
]
]
y_overlapping_features = [
feature
for feature in y_train_unstacked.columns
if feature in y_unstacked.columns
]
y_unstacked = y_unstacked[y_overlapping_features]
y_train_unstacked = y_train_unstacked[y_overlapping_features]

X_train_unstacked = infer_feature_types(X_train_unstacked)
y_train_unstacked = infer_feature_types(y_train_unstacked)
X_unstacked = infer_feature_types(X_unstacked)
y_unstacked = infer_feature_types(y_unstacked)

unstacked_predictions = super().predict_in_sample(
X_unstacked,
y_unstacked,
Expand All @@ -133,7 +161,14 @@ def predict_in_sample(
objective,
calculating_residuals,
)
stacked_predictions = stack_data(unstacked_predictions)
if include_series_id:
stacked_predictions = stack_data(
unstacked_predictions,
include_series_id=True,
series_id_name=self.series_id,
)
else:
stacked_predictions = stack_data(unstacked_predictions)

# Index will start at the unstacked index, so we need to reset it to the original index
stacked_predictions.index = X.index
Expand Down
10 changes: 8 additions & 2 deletions evalml/pipelines/time_series_regression_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -225,7 +225,11 @@ def _get_series_intervals(intervals, residuals, trend_pred_intervals, y):
return return_intervals

if self.problem_type == ProblemTypes.MULTISERIES_TIME_SERIES_REGRESSION:
from evalml.pipelines.utils import stack_data, unstack_multiseries
from evalml.pipelines.utils import (
MULTISERIES_SEPARATOR_SYMBOL,
stack_data,
unstack_multiseries,
)

X, y = unstack_multiseries(
X,
Expand Down Expand Up @@ -268,7 +272,9 @@ def _get_series_intervals(intervals, residuals, trend_pred_intervals, y):
# `pred_intervals` are in {series_id: {coverage_label: bound_value}} form
for series_id, series_intervals in pred_intervals.items():
series_id_target_name = (
self.input_target_name + "_" + str(series_id)
self.input_target_name
+ MULTISERIES_SEPARATOR_SYMBOL
+ str(series_id)
)
series_id_prediction_intervals = _get_series_intervals(
series_intervals,
Expand Down
24 changes: 17 additions & 7 deletions evalml/pipelines/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@
from evalml.utils.gen_utils import contains_all_ts_parameters

DECOMPOSER_PERIOD_CAP = 1000
MULTISERIES_SEPARATOR_SYMBOL = "|"


def _get_label_encoder(X, y, problem_type, estimator_class, sampler_name=None):
Expand Down Expand Up @@ -1418,7 +1419,7 @@ def unstack_multiseries(
for column_name in full_dataset.columns.drop([time_index, series_id]):
new_column = single_series[column_name]
new_column.index = new_time_index
new_column.name = f"{column_name}_{s_id}"
new_column.name = f"{column_name}{MULTISERIES_SEPARATOR_SYMBOL}{s_id}"

if column_name == target_name:
y_unstacked_cols.append(new_column)
Expand All @@ -1435,11 +1436,15 @@ def unstack_multiseries(
# Reset the axes now that they've been unstacked, keep time info in X
X_unstacked = X_unstacked.reset_index()
y_unstacked = y_unstacked.reset_index(drop=True)

return X_unstacked, y_unstacked


def stack_data(data, include_series_id=False, series_id_name=None, starting_index=None):
def stack_data(
data,
include_series_id=False,
series_id_name=None,
starting_index=None,
):
"""Stacks the given DataFrame back into a single Series, or a DataFrame if include_series_id is True.
Should only be used for data that is expected to be a single series. To stack multiple unstacked columns,
Expand All @@ -1464,7 +1469,9 @@ def stack_data(data, include_series_id=False, series_id_name=None, starting_inde

# Extract the original column name
series_id_with_name = stacked_series.index.droplevel()
stacked_series.name = "_".join(series_id_with_name[0].split("_")[:-1])
stacked_series.name = "".join(
series_id_with_name[0].split(MULTISERIES_SEPARATOR_SYMBOL)[:-1],
)

# If the index is the time index, keep it
if not data.index.is_numeric() and starting_index is None:
Expand All @@ -1481,11 +1488,14 @@ def stack_data(data, include_series_id=False, series_id_name=None, starting_inde
# Pull out the series id information, if requested
if include_series_id:
series_id_col = pd.Series(
series_id_with_name.map(lambda col_name: col_name.split("_")[-1]),
series_id_with_name.map(
lambda col_name: col_name.split(MULTISERIES_SEPARATOR_SYMBOL)[-1],
),
name=series_id_name or "series_id",
index=stacked_series.index,
)
stacked_series = pd.concat([series_id_col, stacked_series], axis=1)

return stacked_series


Expand All @@ -1511,8 +1521,8 @@ def stack_X(X, series_id_name, time_index, starting_index=None, series_id_values
for col in X.columns:
if col == time_index:
continue
separated_name = col.split("_")
original_columns.add("_".join(separated_name[:-1]))
separated_name = col.split(MULTISERIES_SEPARATOR_SYMBOL)
original_columns.add(MULTISERIES_SEPARATOR_SYMBOL.join(separated_name[:-1]))
series_ids.add(separated_name[-1])

if len(series_ids) == 0:
Expand Down
5 changes: 4 additions & 1 deletion evalml/tests/component_tests/test_time_series_featurizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
)

from evalml.pipelines import TimeSeriesFeaturizer
from evalml.pipelines.utils import MULTISERIES_SEPARATOR_SYMBOL

ROLLING_TRANSFORM_METHOD_NAME = "_compute_rolling_transforms"
DELAYED_FEATURES_METHOD_NAME = "_compute_delays"
Expand Down Expand Up @@ -991,7 +992,9 @@ def test_featurizer_y_dataframe(multiseries_ts_data_unstacked):

assert featurizer.statistically_significant_lags == [6]

expected_y_cols = [f"target_{i}_delay_6" for i in range(y.shape[1])]
expected_y_cols = [
f"target{MULTISERIES_SEPARATOR_SYMBOL}{i}_delay_6" for i in range(y.shape[1])
]
X_t = featurizer.transform(X, y)
for expected_y_col in expected_y_cols:
assert expected_y_col in X_t.columns
13 changes: 11 additions & 2 deletions evalml/tests/component_tests/test_time_series_imputer.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
)

from evalml.pipelines.components import TimeSeriesImputer
from evalml.pipelines.utils import MULTISERIES_SEPARATOR_SYMBOL


def test_invalid_strategy_parameters():
Expand Down Expand Up @@ -745,7 +746,12 @@ def test_time_series_imputer_multiseries(
_, y_imputed = imputer.transform(X, y)
assert isinstance(y_imputed, pd.DataFrame)

y_expected = pd.DataFrame({f"target_{i}": range(i, 100, 5) for i in range(5)})
y_expected = pd.DataFrame(
{
f"target{MULTISERIES_SEPARATOR_SYMBOL}{i}": range(i, 100, 5)
for i in range(5)
},
)
assert_frame_equal(y_imputed, y_expected, check_dtype=False)


Expand Down Expand Up @@ -777,7 +783,10 @@ def test_time_series_imputer_multiseries_some_columns_all_nan(
_, y_imputed = imputer.transform(X, y)

y_expected = pd.DataFrame(
{f"target_{i}": range(i, 100, 5) for i in range(num_nan_cols, 5)},
{
f"target{MULTISERIES_SEPARATOR_SYMBOL}{i}": range(i, 100, 5)
for i in range(num_nan_cols, 5)
},
)
assert_frame_equal(y_imputed, y_expected, check_dtype=False)

Expand Down
21 changes: 18 additions & 3 deletions evalml/tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -1094,12 +1094,27 @@ def multiseries_ts_data_stacked():

@pytest.fixture
def multiseries_ts_data_unstacked():
feature_a = pd.DataFrame({f"feature_a_{i}": range(i, 100, 5) for i in range(5)})
from evalml.pipelines.utils import MULTISERIES_SEPARATOR_SYMBOL

feature_a = pd.DataFrame(
{
f"feature_a{MULTISERIES_SEPARATOR_SYMBOL}{i}": range(i, 100, 5)
for i in range(5)
},
)
feature_b = pd.DataFrame(
{f"feature_b_{i}": range(99 - i, -1, -5) for i in range(5)},
{
f"feature_b{MULTISERIES_SEPARATOR_SYMBOL}{i}": range(99 - i, -1, -5)
for i in range(5)
},
)
X = pd.concat([feature_a, feature_b], axis=1)
y = pd.DataFrame({f"target_{i}": range(i, 100, 5) for i in range(5)})
y = pd.DataFrame(
{
f"target{MULTISERIES_SEPARATOR_SYMBOL}{i}": range(i, 100, 5)
for i in range(5)
},
)

X["date"] = pd.date_range(start="1/1/2018", periods=20)
return X, y
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ cloudpickle==3.0.0
colorama==0.4.6
dask==2023.5.0
distributed==2023.5.0
featuretools==1.27.0
featuretools==1.28.0
graphviz==0.20.1
holidays==0.20
imbalanced-learn==0.11.0
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,9 @@ def test_multiseries_pipeline_fit(
assert pipeline.frequency is not None


@pytest.mark.parametrize("include_series_id", [True, False])
def test_multiseries_pipeline_predict_in_sample(
include_series_id,
multiseries_ts_data_stacked,
component_graph,
pipeline_parameters,
Expand All @@ -111,14 +113,19 @@ def test_multiseries_pipeline_predict_in_sample(
y_holdout,
X_train=X_train,
y_train=y_train,
include_series_id=include_series_id,
)
expected = pd.Series(
range(55, 65),
index=range(90, 100),
name="target",
dtype="float64",
)
pd.testing.assert_series_equal(y_pred, expected)
if include_series_id:
expected = pd.concat([X_holdout["series_id"], expected], axis=1)
pd.testing.assert_frame_equal(y_pred, expected)
else:
pd.testing.assert_series_equal(y_pred, expected)


@pytest.mark.parametrize("forecast_horizon", [1, 7])
Expand Down
4 changes: 3 additions & 1 deletion evalml/tests/pipeline_tests/test_pipeline_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@
handle_component_class,
)
from evalml.pipelines.utils import (
MULTISERIES_SEPARATOR_SYMBOL,
_get_pipeline_base_class,
_get_preprocessing_components,
_make_pipeline_from_multiple_graphs,
Expand Down Expand Up @@ -1404,7 +1405,8 @@ def test_unstack_multiseries(
X_unstacked, y_unstacked = multiseries_ts_data_unstacked
y.name = target_name
y_unstacked.columns = [
f"{target_name}_{i}" for i in range(len(y_unstacked.columns))
f"{target_name}{MULTISERIES_SEPARATOR_SYMBOL}{i}"
for i in range(len(y_unstacked.columns))
]

X_unstacked_transformed, y_unstacked_transformed = unstack_multiseries(
Expand Down

0 comments on commit 3048fa5

Please sign in to comment.