Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Forecast Scenario Notebook for Local and Remote Inferencing #3429

Merged
merged 8 commits into from
Oct 29, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
"""
This is the script that is executed on the compute instance. It relies
on the model.pkl file which is uploaded along with this script to the
compute instance.
"""

import os

import pandas as pd

from azureml.core import Dataset, Run
import joblib
from pandas.tseries.frequencies import to_offset


def init():
    """Initialize the worker: load the fitted model and target column name.

    Called once per compute node before any run() invocation. Populates the
    module-level globals that run() relies on:
      * target_column_name -- read from the TARGET_COLUMN_NAME env var.
      * fitted_model -- unpickled from model.pkl inside AZUREML_MODEL_DIR.
    """
    global target_column_name
    global fitted_model

    target_column_name = os.environ["TARGET_COLUMN_NAME"]
    # AZUREML_MODEL_DIR is an environment variable created during deployment.
    # It is the path to the model folder (./azureml-models); model.pkl is
    # uploaded alongside this script and sits directly inside it.
    fitted_model = joblib.load(
        os.path.join(os.environ["AZUREML_MODEL_DIR"], "model.pkl")
    )


def run(mini_batch):
    """Score a mini-batch of data files with the fitted forecasting model.

    :param mini_batch: Iterable of file paths. ``.parquet`` and ``.csv``
        files are scored; any other extension is silently skipped.
    :return: One DataFrame concatenating the cleaned predictions for every
        scored file: the original features plus the actuals
        (``target_column_name``), the median forecast (``"predicted"``) and
        a ``"prediction_interval"`` string of the form ``"[lower, upper]"``.
    """
    print(f"run method start: {__file__}, run({mini_batch})")

    # Loop-invariant setup: default quantiles give the median plus the
    # bounds of a 95% prediction interval.
    quantiles = [0.025, 0.5, 0.975]
    predicted_column_name = "predicted"
    PI = "prediction_interval"
    fitted_model.quantiles = quantiles

    resultList = []
    for test in mini_batch:
        extension = os.path.splitext(test)[-1]
        if extension == ".parquet":
            X_test = pd.read_parquet(test)
        elif extension == ".csv":
            X_test = pd.read_csv(test, parse_dates=[fitted_model.time_column_name])
        else:
            continue  # Skip if it's neither a Parquet nor CSV file

        y_test = X_test.pop(target_column_name).values

        pred_quantiles = fitted_model.forecast_quantiles(
            X_test, ignore_data_errors=True
        )
        # The selected columns are labelled by the float quantile values, so
        # each row Series has a float index; integer keys like x[0] would be
        # treated as labels and fail -- use positional .iloc access instead.
        pred_quantiles[PI] = pred_quantiles[[min(quantiles), max(quantiles)]].apply(
            lambda x: "[{}, {}]".format(x.iloc[0], x.iloc[1]), axis=1
        )
        X_test[target_column_name] = y_test
        X_test[PI] = pred_quantiles[PI].values
        X_test[predicted_column_name] = pred_quantiles[0.5].values
        # Drop rows where prediction or actuals are NaN; this happens because
        # of missing actuals, or at the edges of time due to lags/rolling
        # windows.
        clean = X_test[
            X_test[[target_column_name, predicted_column_name]].notnull().all(axis=1)
        ]
        print(
            f"The predictions have {clean.shape[0]} rows and {clean.shape[1]} columns."
        )

        resultList.append(clean)

    # Guard against a batch with no scoreable files: pd.concat([]) raises.
    if not resultList:
        return pd.DataFrame()
    return pd.concat(resultList, sort=False, ignore_index=True)
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"append_row": {"pandas.DataFrame.to_csv": {"sep": ","}}}
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
# Generate synthetic data

import pandas as pd
import numpy as np


def get_timeseries(
    train_len: int,
    test_len: int,
    time_column_name: str,
    target_column_name: str,
    time_series_id_column_name: str,
    time_series_number: int = 1,
    freq: str = "H",
):
    """
    Return synthetic time series data of the designed length.

    :param train_len: The length of training data (one series).
    :type train_len: int
    :param test_len: The length of testing data (one series).
    :type test_len: int
    :param time_column_name: The desired name of a time column.
    :type time_column_name: str
    :param target_column_name: The desired name of the target column.
    :type target_column_name: str
    :param time_series_id_column_name: The desired name of the series id column.
    :type time_series_id_column_name: str
    :param time_series_number: The number of time series in the data set.
    :type time_series_number: int
    :param freq: The frequency string representing pandas offset.
        see https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html
    :type freq: str
    :returns: the tuple (X_train, y_train, X_test, y_test).
    :rtype: tuple
    """
    total_len = train_len + test_len
    train_parts = []  # type: List[pd.DataFrame]
    test_parts = []  # type: List[pd.DataFrame]
    for series_idx in range(time_series_number):
        # Target is an increasing ramp plus uniform noise, offset per series.
        frame = pd.DataFrame(
            {
                time_column_name: pd.date_range(
                    start="2000-01-01", periods=total_len, freq=freq
                ),
                target_column_name: (
                    np.arange(total_len).astype(float)
                    + np.random.rand(total_len)
                    + series_idx * 5
                ),
                "ext_predictor": np.asarray(range(42, 42 + total_len)),
                time_series_id_column_name: np.repeat(
                    "ts{}".format(series_idx), total_len
                ),
            }
        )
        train_parts.append(frame[:train_len])
        test_parts.append(frame[train_len:])
    X_train = pd.concat(train_parts)
    y_train = X_train.pop(target_column_name).values
    X_test = pd.concat(test_parts)
    y_test = X_test.pop(target_column_name).values
    return X_train, y_train, X_test, y_test


def make_forecasting_query(
    fulldata, time_column_name, target_column_name, forecast_origin, horizon, lookback
):
    """
    Take the full dataset and create the query to predict all values of the
    time series from `forecast_origin` forward for the next `horizon`
    horizons. Context from the previous `lookback` periods is included.

    fulldata: pandas.DataFrame a time series dataset. Needs to contain X and y.
    time_column_name: string which column (must be in fulldata) is the time axis
    target_column_name: string which column (must be in fulldata) is to be forecast
    forecast_origin: datetime type the last time we (pretend to) have target values
    horizon: timedelta how far forward, in time units (not periods)
    lookback: timedelta how far back does the model look

    Returns the tuple (X_pred, y_pred) where y_pred holds the actuals for the
    lookback window followed by NaNs for the forecast window.

    Example:

    ```
    forecast_origin = pd.to_datetime("2012-09-01") + pd.DateOffset(days=5)  # forecast 5 days after end of training
    print(forecast_origin)

    X_query, y_query = make_forecasting_query(data,
                       time_column_name="date",
                       target_column_name="y",
                       forecast_origin=forecast_origin,
                       horizon=pd.DateOffset(days=7),   # 7 days into the future
                       lookback=pd.DateOffset(days=1),  # model has lag 1 period (day)
                       )
    ```
    """
    # Copy the filtered slices so popping the target below cannot raise
    # SettingWithCopyWarning or touch a view of the caller's frame.
    X_past = fulldata[
        (fulldata[time_column_name] > forecast_origin - lookback)
        & (fulldata[time_column_name] <= forecast_origin)
    ].copy()

    X_future = fulldata[
        (fulldata[time_column_name] > forecast_origin)
        & (fulldata[time_column_name] <= forecast_origin + horizon)
    ].copy()

    y_past = X_past.pop(target_column_name).values.astype(float)
    y_future = X_future.pop(target_column_name).values.astype(float)

    # Now take y_future and turn it into question marks: the model must not
    # see actual target values inside the forecast window.
    y_query = y_future.copy().astype(float)  # because sometimes life hands you an int
    y_query.fill(np.nan)

    print("X_past is " + str(X_past.shape) + " - shaped")
    print("X_future is " + str(X_future.shape) + " - shaped")
    print("y_past is " + str(y_past.shape) + " - shaped")
    print("y_query is " + str(y_query.shape) + " - shaped")

    X_pred = pd.concat([X_past, X_future])
    y_pred = np.concatenate([y_past, y_query])
    return X_pred, y_pred
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading