Update forecasting script and MM nb (#3450)
* update forecasting script

* remove unwanted change

* fix code style issue
iamrk04 authored Dec 3, 2024
1 parent 7b454e6 commit 437cc2a
Showing 7 changed files with 112 additions and 35 deletions.
@@ -29,14 +29,19 @@ def run(mini_batch):
     print(f"run method start: {__file__}, run({mini_batch})")
     resultList = []
     for test in mini_batch:
-        if os.path.splitext(test)[-1] == ".parquet":
+        file_ext = os.path.splitext(test)[-1]
+        if file_ext == ".parquet":
             X_test = pd.read_parquet(test)
-        elif os.path.splitext(test)[-1] == ".csv":
+        elif file_ext == ".csv":
             X_test = pd.read_csv(test, parse_dates=[fitted_model.time_column_name])
         else:
+            print(f"Unsupported file type: `{file_ext}`. Skipping the file.")
             continue  # Skip if it's neither a Parquet nor CSV file

-        y_test = X_test.pop(target_column_name).values
+        if target_column_name in X_test.columns:
+            y_test = X_test.pop(target_column_name).values
+        else:
+            y_test = None

         # We have default quantiles values set as below(95th percentile)
         quantiles = [0.025, 0.5, 0.975]
@@ -49,15 +54,21 @@ def run(mini_batch):
         pred_quantiles[PI] = pred_quantiles[[min(quantiles), max(quantiles)]].apply(
             lambda x: "[{}, {}]".format(x[0], x[1]), axis=1
         )
-        X_test[target_column_name] = y_test
+        if y_test is not None:
+            X_test[target_column_name] = y_test
         X_test[PI] = pred_quantiles[PI].values
         X_test[predicted_column_name] = pred_quantiles[0.5].values
         # drop rows where prediction or actuals are nan
         # happens because of missing actuals
         # or at edges of time due to lags/rolling windows
-        clean = X_test[
-            X_test[[target_column_name, predicted_column_name]].notnull().all(axis=1)
-        ]
+        if target_column_name in X_test.columns:
+            clean = X_test[
+                X_test[[target_column_name, predicted_column_name]]
+                .notnull()
+                .all(axis=1)
+            ]
+        else:
+            clean = X_test[X_test[predicted_column_name].notnull()]
         print(
             f"The predictions have {clean.shape[0]} rows and {clean.shape[1]} columns."
         )
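The same pattern repeats across the remaining scoring scripts in this commit: read each mini-batch file by extension, forecast even when actuals are absent, and only filter on the target column when it exists. As a standalone illustration only (the column names, the `PI` label, and the omission of the actual `fitted_model.forecast_quantiles` call are simplified stand-ins, not the real script), the new flow is roughly:

import os

import pandas as pd

# Hypothetical stand-ins for values the real script takes from the trained
# AutoML model and its settings.
time_column_name = "date"
target_column_name = "demand"
predicted_column_name = "predicted"
PI = "prediction_interval"


def load_mini_batch_file(path):
    """Read one mini-batch file, accepting Parquet or CSV and skipping anything else."""
    file_ext = os.path.splitext(path)[-1]
    if file_ext == ".parquet":
        return pd.read_parquet(path)
    if file_ext == ".csv":
        return pd.read_csv(path, parse_dates=[time_column_name])
    print(f"Unsupported file type: `{file_ext}`. Skipping the file.")
    return None


def pop_target(X_test):
    """Return the actuals if the target column is present, otherwise None."""
    if target_column_name in X_test.columns:
        return X_test.pop(target_column_name).values
    return None


def attach_and_clean(X_test, y_test, pred_quantiles):
    """Attach actuals (when available), the interval string, and the median forecast,
    then drop rows that have no prediction (and no actuals, when actuals exist)."""
    if y_test is not None:
        X_test[target_column_name] = y_test
    X_test[PI] = pred_quantiles[PI].values
    X_test[predicted_column_name] = pred_quantiles[0.5].values
    if target_column_name in X_test.columns:
        return X_test[
            X_test[[target_column_name, predicted_column_name]].notnull().all(axis=1)
        ]
    return X_test[X_test[predicted_column_name].notnull()]

In practice this means inference-only mini-batches, which carry no actuals column, now score cleanly instead of failing on the pop() call.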
@@ -32,9 +32,19 @@ def run(mini_batch):
     print(f"run method start: {__file__}, run({mini_batch})")
     resultList = []
     for test in mini_batch:
-        if not test.endswith(".csv"):
+        file_ext = os.path.splitext(test)[-1]
+        if file_ext == ".parquet":
+            X_test = pd.read_parquet(test)
+        elif file_ext == ".csv":
+            X_test = pd.read_csv(test, parse_dates=[fitted_model.time_column_name])
+        else:
+            print(f"Unsupported file type: `{file_ext}`. Skipping the file.")
             continue
-        X_test = pd.read_csv(test, parse_dates=[fitted_model.time_column_name])
+
+        if target_column_name not in X_test.columns:
+            raise ValueError(
+                f"Target column `{target_column_name}` not found in the test data, required for rolling forecast."
+            )
         y_test = X_test.pop(target_column_name).values

         # Make a rolling forecast, advancing the forecast origin by 1 period on each iteration through the test set
@@ -29,11 +29,19 @@ def run(mini_batch):
     print(f"run method start: {__file__}, run({mini_batch})")
     resultList = []
     for test in mini_batch:
-        if os.path.splitext(test)[-1] != ".csv":
+        file_ext = os.path.splitext(test)[-1]
+        if file_ext == ".parquet":
+            X_test = pd.read_parquet(test)
+        elif file_ext == ".csv":
+            X_test = pd.read_csv(test, parse_dates=[fitted_model.time_column_name])
+        else:
+            print(f"Unsupported file type: `{file_ext}`. Skipping the file.")
             continue

-        X_test = pd.read_csv(test, parse_dates=[fitted_model.time_column_name])
-        y_test = X_test.pop(target_column_name).values
+        if target_column_name in X_test.columns:
+            y_test = X_test.pop(target_column_name).values
+        else:
+            y_test = None

         # We have default quantiles values set as below(95th percentile)
         quantiles = [0.025, 0.5, 0.975]
@@ -46,15 +54,21 @@ def run(mini_batch):
         pred_quantiles[PI] = pred_quantiles[[min(quantiles), max(quantiles)]].apply(
             lambda x: "[{}, {}]".format(x[0], x[1]), axis=1
         )
-        X_test[target_column_name] = y_test
+        if y_test is not None:
+            X_test[target_column_name] = y_test
         X_test[PI] = pred_quantiles[PI].values
         X_test[predicted_column_name] = pred_quantiles[0.5].values
         # drop rows where prediction or actuals are nan
         # happens because of missing actuals
         # or at edges of time due to lags/rolling windows
-        clean = X_test[
-            X_test[[target_column_name, predicted_column_name]].notnull().all(axis=1)
-        ]
+        if target_column_name in X_test.columns:
+            clean = X_test[
+                X_test[[target_column_name, predicted_column_name]]
+                .notnull()
+                .all(axis=1)
+            ]
+        else:
+            clean = X_test[X_test[predicted_column_name].notnull()]
         print(
             f"The predictions have {clean.shape[0]} rows and {clean.shape[1]} columns."
         )
@@ -29,8 +29,19 @@ def run(mini_batch):
     print(f"run method start: {__file__}, run({mini_batch})")
     resultList = []
     for test in mini_batch:
-        X_test = pd.read_csv(test, parse_dates=[fitted_model.time_column_name])
-        y_test = X_test.pop(target_column_name).values
+        file_ext = os.path.splitext(test)[-1]
+        if file_ext == ".parquet":
+            X_test = pd.read_parquet(test)
+        elif file_ext == ".csv":
+            X_test = pd.read_csv(test, parse_dates=[fitted_model.time_column_name])
+        else:
+            print(f"Unsupported file type: `{file_ext}`. Skipping the file.")
+            continue
+
+        if target_column_name in X_test.columns:
+            y_test = X_test.pop(target_column_name).values
+        else:
+            y_test = None

         # We have default quantiles values set as below(95th percentile)
         quantiles = [0.025, 0.5, 0.975]
@@ -43,15 +54,21 @@ def run(mini_batch):
         pred_quantiles[PI] = pred_quantiles[[min(quantiles), max(quantiles)]].apply(
             lambda x: "[{}, {}]".format(x[0], x[1]), axis=1
         )
-        X_test[target_column_name] = y_test
-        X_test[PI] = pred_quantiles[PI]
-        X_test[predicted_column_name] = pred_quantiles[0.5]
+        if y_test is not None:
+            X_test[target_column_name] = y_test
+        X_test[PI] = pred_quantiles[PI].values
+        X_test[predicted_column_name] = pred_quantiles[0.5].values
         # drop rows where prediction or actuals are nan
         # happens because of missing actuals
         # or at edges of time due to lags/rolling windows
-        clean = X_test[
-            X_test[[target_column_name, predicted_column_name]].notnull().all(axis=1)
-        ]
+        if target_column_name in X_test.columns:
+            clean = X_test[
+                X_test[[target_column_name, predicted_column_name]]
+                .notnull()
+                .all(axis=1)
+            ]
+        else:
+            clean = X_test[X_test[predicted_column_name].notnull()]
         print(
             f"The predictions have {clean.shape[0]} rows and {clean.shape[1]} columns."
         )
@@ -29,9 +29,19 @@ def run(mini_batch):
     print(f"run method start: {__file__}, run({mini_batch})")
     resultList = []
     for test in mini_batch:
-        if not test.endswith(".csv"):
+        file_ext = os.path.splitext(test)[-1]
+        if file_ext == ".parquet":
+            X_test = pd.read_parquet(test)
+        elif file_ext == ".csv":
+            X_test = pd.read_csv(test, parse_dates=[fitted_model.time_column_name])
+        else:
+            print(f"Unsupported file type: `{file_ext}`. Skipping the file.")
             continue
-        X_test = pd.read_csv(test, parse_dates=[fitted_model.time_column_name])
+
+        if target_column_name not in X_test.columns:
+            raise ValueError(
+                f"Target column `{target_column_name}` not found in the test data, required for rolling forecast."
+            )
         y_test = X_test.pop(target_column_name).values

         # Make a rolling forecast, advancing the forecast origin by 1 period on each iteration through the test set
@@ -29,10 +29,19 @@ def run(mini_batch):
     print(f"run method start: {__file__}, run({mini_batch})")
     resultList = []
     for test in mini_batch:
-        if not test.endswith(".csv"):
+        file_ext = os.path.splitext(test)[-1]
+        if file_ext == ".parquet":
+            X_test = pd.read_parquet(test)
+        elif file_ext == ".csv":
+            X_test = pd.read_csv(test, parse_dates=[fitted_model.time_column_name])
+        else:
+            print(f"Unsupported file type: `{file_ext}`. Skipping the file.")
             continue
-        X_test = pd.read_csv(test, parse_dates=[fitted_model.time_column_name])
-        y_test = X_test.pop(target_column_name).values
+
+        if target_column_name in X_test.columns:
+            y_test = X_test.pop(target_column_name).values
+        else:
+            y_test = None

         # We have default quantiles values set as below(95th percentile)
         quantiles = [0.025, 0.5, 0.975]
@@ -45,15 +54,21 @@ def run(mini_batch):
         pred_quantiles[PI] = pred_quantiles[[min(quantiles), max(quantiles)]].apply(
             lambda x: "[{}, {}]".format(x[0], x[1]), axis=1
         )
-        X_test[target_column_name] = y_test
+        if y_test is not None:
+            X_test[target_column_name] = y_test
         X_test[PI] = pred_quantiles[PI].values
         X_test[predicted_column_name] = pred_quantiles[0.5].values
         # drop rows where prediction or actuals are nan
         # happens because of missing actuals
         # or at edges of time due to lags/rolling windows
-        clean = X_test[
-            X_test[[target_column_name, predicted_column_name]].notnull().all(axis=1)
-        ]
+        if target_column_name in X_test.columns:
+            clean = X_test[
+                X_test[[target_column_name, predicted_column_name]]
+                .notnull()
+                .all(axis=1)
+            ]
+        else:
+            clean = X_test[X_test[predicted_column_name].notnull()]
         print(
             f"The predictions have {clean.shape[0]} rows and {clean.shape[1]} columns."
         )
@@ -479,8 +479,8 @@
 "| **enable_early_stopping** | Flag to enable early termination if the primary metric is no longer improving. |\n",
 "| **partition_column_names** | The names of columns used to group your models. For timeseries, the groups must not split up individual time-series. That is, each group must contain one or more whole time-series. |\n",
 "| **allow_multi_partitions** | A flag that allows users to train one model per partition when each partition contains more than one unique time series. The default value is `False`. |\n",
-"| **track_child_runs** | Flag to disable tracking of child runs. Only best run is tracked if the flag is set to False (this includes the model and metrics of the run). |\n",
-"| **n_best_runs** | Number of best runs to track per partition for a Many Models Run. Defaults to 1. Please set `track_child_runs` to `True` and then modify this parameter. |\n",
+"| **track_child_runs** | Flag to enable tracking of child runs. Only the best run is tracked if the flag is set to False (this includes the model and metrics of the run). Defaults to `False`. We do not encourage turning this on, since it can lead to throttling; instead, use `n_best_runs` if you really need to track more than one best run. |\n",
+"| **n_best_runs** | Number of best runs to track per partition for a Many Models Run. Defaults to 1. |\n",
 "| **enable_early_stopping** | Flag to enable early termination if the primary metric is no longer improving. |\n",
 "| **max_trials** | Represents the maximum number of trials an Automated ML job can try to run a training algorithm with different combinations of hyperparameters. Its default value is set to 1000. If `enable_early_stopping` is defined, then the number of trials used to run training algorithms can be smaller. |\n",
 "| **timeout_minutes** | Maximum amount of time in minutes that the whole AutoML job can take before the job terminates. This timeout includes setup, featurization and training runs but does not include the ensembling and model explainability runs at the end of the process, since those actions need to happen once all the trials (child jobs) are done. If not specified, the default job's total timeout is 6 days (8,640 minutes). To specify a timeout less than or equal to 1 hour (60 minutes), make sure your dataset's size is not greater than 10,000,000 (rows times columns) or an error results. |\n",