Update forecasting script and MM nb (#3450)
* update forecasting script

* remove unwanted change

* fix code style issue
iamrk04 authored Dec 3, 2024
1 parent 7b454e6 commit 437cc2a
Showing 7 changed files with 112 additions and 35 deletions.
@@ -29,14 +29,19 @@ def run(mini_batch):
     print(f"run method start: {__file__}, run({mini_batch})")
     resultList = []
     for test in mini_batch:
-        if os.path.splitext(test)[-1] == ".parquet":
+        file_ext = os.path.splitext(test)[-1]
+        if file_ext == ".parquet":
             X_test = pd.read_parquet(test)
-        elif os.path.splitext(test)[-1] == ".csv":
+        elif file_ext == ".csv":
             X_test = pd.read_csv(test, parse_dates=[fitted_model.time_column_name])
         else:
+            print(f"Unsupported file type: `{file_ext}`. Skipping the file.")
             continue  # Skip if it's neither a Parquet nor CSV file

-        y_test = X_test.pop(target_column_name).values
+        if target_column_name in X_test.columns:
+            y_test = X_test.pop(target_column_name).values
+        else:
+            y_test = None

         # We have default quantiles values set as below(95th percentile)
         quantiles = [0.025, 0.5, 0.975]
@@ -49,15 +54,21 @@ def run(mini_batch):
         pred_quantiles[PI] = pred_quantiles[[min(quantiles), max(quantiles)]].apply(
             lambda x: "[{}, {}]".format(x[0], x[1]), axis=1
         )
-        X_test[target_column_name] = y_test
+        if y_test is not None:
+            X_test[target_column_name] = y_test
         X_test[PI] = pred_quantiles[PI].values
         X_test[predicted_column_name] = pred_quantiles[0.5].values
         # drop rows where prediction or actuals are nan
         # happens because of missing actuals
         # or at edges of time due to lags/rolling windows
-        clean = X_test[
-            X_test[[target_column_name, predicted_column_name]].notnull().all(axis=1)
-        ]
+        if target_column_name in X_test.columns:
+            clean = X_test[
+                X_test[[target_column_name, predicted_column_name]]
+                .notnull()
+                .all(axis=1)
+            ]
+        else:
+            clean = X_test[X_test[predicted_column_name].notnull()]
         print(
             f"The predictions have {clean.shape[0]} rows and {clean.shape[1]} columns."
         )
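The same pattern repeats across the remaining scoring scripts in this commit: read each mini-batch file by extension, forecast even when actuals are absent, and only filter on the target column when it exists. As a standalone illustration only (the column names, the `PI` label, and the omission of the actual `fitted_model.forecast_quantiles` call are simplified stand-ins, not the real script), the new flow is roughly:

import os

import pandas as pd

# Hypothetical stand-ins for values the real script takes from the trained
# AutoML model and its settings.
time_column_name = "date"
target_column_name = "demand"
predicted_column_name = "predicted"
PI = "prediction_interval"


def load_mini_batch_file(path):
    """Read one mini-batch file, accepting Parquet or CSV and skipping anything else."""
    file_ext = os.path.splitext(path)[-1]
    if file_ext == ".parquet":
        return pd.read_parquet(path)
    if file_ext == ".csv":
        return pd.read_csv(path, parse_dates=[time_column_name])
    print(f"Unsupported file type: `{file_ext}`. Skipping the file.")
    return None


def pop_target(X_test):
    """Return the actuals if the target column is present, otherwise None."""
    if target_column_name in X_test.columns:
        return X_test.pop(target_column_name).values
    return None


def attach_and_clean(X_test, y_test, pred_quantiles):
    """Attach actuals (when available), the interval string, and the median forecast,
    then drop rows that have no prediction (and no actuals, when actuals exist)."""
    if y_test is not None:
        X_test[target_column_name] = y_test
    X_test[PI] = pred_quantiles[PI].values
    X_test[predicted_column_name] = pred_quantiles[0.5].values
    if target_column_name in X_test.columns:
        return X_test[
            X_test[[target_column_name, predicted_column_name]].notnull().all(axis=1)
        ]
    return X_test[X_test[predicted_column_name].notnull()]

In practice this means inference-only mini-batches, which carry no actuals column, now score cleanly instead of failing on the pop() call.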
@@ -32,9 +32,19 @@ def run(mini_batch):
     print(f"run method start: {__file__}, run({mini_batch})")
     resultList = []
     for test in mini_batch:
-        if not test.endswith(".csv"):
+        file_ext = os.path.splitext(test)[-1]
+        if file_ext == ".parquet":
+            X_test = pd.read_parquet(test)
+        elif file_ext == ".csv":
+            X_test = pd.read_csv(test, parse_dates=[fitted_model.time_column_name])
+        else:
+            print(f"Unsupported file type: `{file_ext}`. Skipping the file.")
             continue
-        X_test = pd.read_csv(test, parse_dates=[fitted_model.time_column_name])
+
+        if target_column_name not in X_test.columns:
+            raise ValueError(
+                f"Target column `{target_column_name}` not found in the test data, required for rolling forecast."
+            )
         y_test = X_test.pop(target_column_name).values

         # Make a rolling forecast, advancing the forecast origin by 1 period on each iteration through the test set
@@ -29,11 +29,19 @@ def run(mini_batch):
     print(f"run method start: {__file__}, run({mini_batch})")
     resultList = []
     for test in mini_batch:
-        if os.path.splitext(test)[-1] != ".csv":
+        file_ext = os.path.splitext(test)[-1]
+        if file_ext == ".parquet":
+            X_test = pd.read_parquet(test)
+        elif file_ext == ".csv":
+            X_test = pd.read_csv(test, parse_dates=[fitted_model.time_column_name])
+        else:
+            print(f"Unsupported file type: `{file_ext}`. Skipping the file.")
             continue

-        X_test = pd.read_csv(test, parse_dates=[fitted_model.time_column_name])
-        y_test = X_test.pop(target_column_name).values
+        if target_column_name in X_test.columns:
+            y_test = X_test.pop(target_column_name).values
+        else:
+            y_test = None

         # We have default quantiles values set as below(95th percentile)
         quantiles = [0.025, 0.5, 0.975]
@@ -46,15 +54,21 @@ def run(mini_batch):
         pred_quantiles[PI] = pred_quantiles[[min(quantiles), max(quantiles)]].apply(
             lambda x: "[{}, {}]".format(x[0], x[1]), axis=1
         )
-        X_test[target_column_name] = y_test
+        if y_test is not None:
+            X_test[target_column_name] = y_test
         X_test[PI] = pred_quantiles[PI].values
         X_test[predicted_column_name] = pred_quantiles[0.5].values
         # drop rows where prediction or actuals are nan
         # happens because of missing actuals
         # or at edges of time due to lags/rolling windows
-        clean = X_test[
-            X_test[[target_column_name, predicted_column_name]].notnull().all(axis=1)
-        ]
+        if target_column_name in X_test.columns:
+            clean = X_test[
+                X_test[[target_column_name, predicted_column_name]]
+                .notnull()
+                .all(axis=1)
+            ]
+        else:
+            clean = X_test[X_test[predicted_column_name].notnull()]
         print(
             f"The predictions have {clean.shape[0]} rows and {clean.shape[1]} columns."
         )
@@ -29,8 +29,19 @@ def run(mini_batch):
     print(f"run method start: {__file__}, run({mini_batch})")
     resultList = []
     for test in mini_batch:
-        X_test = pd.read_csv(test, parse_dates=[fitted_model.time_column_name])
-        y_test = X_test.pop(target_column_name).values
+        file_ext = os.path.splitext(test)[-1]
+        if file_ext == ".parquet":
+            X_test = pd.read_parquet(test)
+        elif file_ext == ".csv":
+            X_test = pd.read_csv(test, parse_dates=[fitted_model.time_column_name])
+        else:
+            print(f"Unsupported file type: `{file_ext}`. Skipping the file.")
+            continue
+
+        if target_column_name in X_test.columns:
+            y_test = X_test.pop(target_column_name).values
+        else:
+            y_test = None

         # We have default quantiles values set as below(95th percentile)
         quantiles = [0.025, 0.5, 0.975]
@@ -43,15 +54,21 @@ def run(mini_batch):
         pred_quantiles[PI] = pred_quantiles[[min(quantiles), max(quantiles)]].apply(
             lambda x: "[{}, {}]".format(x[0], x[1]), axis=1
         )
-        X_test[target_column_name] = y_test
-        X_test[PI] = pred_quantiles[PI]
-        X_test[predicted_column_name] = pred_quantiles[0.5]
+        if y_test is not None:
+            X_test[target_column_name] = y_test
+        X_test[PI] = pred_quantiles[PI].values
+        X_test[predicted_column_name] = pred_quantiles[0.5].values
         # drop rows where prediction or actuals are nan
         # happens because of missing actuals
         # or at edges of time due to lags/rolling windows
-        clean = X_test[
-            X_test[[target_column_name, predicted_column_name]].notnull().all(axis=1)
-        ]
+        if target_column_name in X_test.columns:
+            clean = X_test[
+                X_test[[target_column_name, predicted_column_name]]
+                .notnull()
+                .all(axis=1)
+            ]
+        else:
+            clean = X_test[X_test[predicted_column_name].notnull()]
         print(
             f"The predictions have {clean.shape[0]} rows and {clean.shape[1]} columns."
         )
@@ -29,9 +29,19 @@ def run(mini_batch):
     print(f"run method start: {__file__}, run({mini_batch})")
     resultList = []
     for test in mini_batch:
-        if not test.endswith(".csv"):
+        file_ext = os.path.splitext(test)[-1]
+        if file_ext == ".parquet":
+            X_test = pd.read_parquet(test)
+        elif file_ext == ".csv":
+            X_test = pd.read_csv(test, parse_dates=[fitted_model.time_column_name])
+        else:
+            print(f"Unsupported file type: `{file_ext}`. Skipping the file.")
             continue
-        X_test = pd.read_csv(test, parse_dates=[fitted_model.time_column_name])
+
+        if target_column_name not in X_test.columns:
+            raise ValueError(
+                f"Target column `{target_column_name}` not found in the test data, required for rolling forecast."
+            )
         y_test = X_test.pop(target_column_name).values

         # Make a rolling forecast, advancing the forecast origin by 1 period on each iteration through the test set
@@ -29,10 +29,19 @@ def run(mini_batch):
     print(f"run method start: {__file__}, run({mini_batch})")
     resultList = []
     for test in mini_batch:
-        if not test.endswith(".csv"):
+        file_ext = os.path.splitext(test)[-1]
+        if file_ext == ".parquet":
+            X_test = pd.read_parquet(test)
+        elif file_ext == ".csv":
+            X_test = pd.read_csv(test, parse_dates=[fitted_model.time_column_name])
+        else:
+            print(f"Unsupported file type: `{file_ext}`. Skipping the file.")
             continue
-        X_test = pd.read_csv(test, parse_dates=[fitted_model.time_column_name])
-        y_test = X_test.pop(target_column_name).values
+
+        if target_column_name in X_test.columns:
+            y_test = X_test.pop(target_column_name).values
+        else:
+            y_test = None

         # We have default quantiles values set as below(95th percentile)
         quantiles = [0.025, 0.5, 0.975]
@@ -45,15 +54,21 @@ def run(mini_batch):
         pred_quantiles[PI] = pred_quantiles[[min(quantiles), max(quantiles)]].apply(
             lambda x: "[{}, {}]".format(x[0], x[1]), axis=1
         )
-        X_test[target_column_name] = y_test
+        if y_test is not None:
+            X_test[target_column_name] = y_test
         X_test[PI] = pred_quantiles[PI].values
         X_test[predicted_column_name] = pred_quantiles[0.5].values
         # drop rows where prediction or actuals are nan
         # happens because of missing actuals
         # or at edges of time due to lags/rolling windows
-        clean = X_test[
-            X_test[[target_column_name, predicted_column_name]].notnull().all(axis=1)
-        ]
+        if target_column_name in X_test.columns:
+            clean = X_test[
+                X_test[[target_column_name, predicted_column_name]]
+                .notnull()
+                .all(axis=1)
+            ]
+        else:
+            clean = X_test[X_test[predicted_column_name].notnull()]
         print(
             f"The predictions have {clean.shape[0]} rows and {clean.shape[1]} columns."
         )
@@ -479,8 +479,8 @@
 "| **enable_early_stopping** | Flag to enable early termination if the primary metric is no longer improving. |\n",
 "| **partition_column_names** | The names of columns used to group your models. For timeseries, the groups must not split up individual time-series. That is, each group must contain one or more whole time-series. |\n",
 "| **allow_multi_partitions** | A flag that allows users to train one model per partition when each partition contains more than one unique time series. The default value is `False`. |\n",
-"| **track_child_runs** | Flag to disable tracking of child runs. Only best run is tracked if the flag is set to False (this includes the model and metrics of the run). |\n",
-"| **n_best_runs** | Number of best runs to track per partition for a Many Models Run. Defaults to 1. Please set `track_child_runs` to `True` and then modify this parameter. |\n",
+"| **track_child_runs** | Flag to enable tracking of child runs. Only the best run is tracked if the flag is set to False (this includes the model and metrics of the run). Defaults to `False`. We do not encourage turning this on, since it can lead to throttling; instead, use `n_best_runs` if you really need to track more than one best run. |\n",
+"| **n_best_runs** | Number of best runs to track per partition for a Many Models Run. Defaults to 1. |\n",
 "| **enable_early_stopping** | Flag to enable early termination if the primary metric is no longer improving. |\n",
 "| **max_trials** | Represents the maximum number of trials an Automated ML job can try to run a training algorithm with different combinations of hyperparameters. Its default value is set to 1000. If `enable_early_stopping` is defined, then the number of trials used to run training algorithms can be smaller. |\n",
 "| **timeout_minutes** | Maximum amount of time in minutes that the whole AutoML job can take before the job terminates. This timeout includes setup, featurization and training runs but does not include the ensembling and model explainability runs at the end of the process, since those actions need to happen once all the trials (child jobs) are done. If not specified, the default job's total timeout is 6 days (8,640 minutes). To specify a timeout less than or equal to 1 hour (60 minutes), make sure your dataset's size is not greater than 10,000,000 (rows times columns) or an error results. |\n",