Skip to content

Commit

Permalink
Fix backtest and residuals for multi series with different number of …
Browse files Browse the repository at this point in the history
…historical forecasts (#2604)

* fix backtest and residuals for multiple series with different number of forecasts

* update changelog
  • Loading branch information
dennisbader authored Nov 22, 2024
1 parent 31a7b36 commit d103a05
Show file tree
Hide file tree
Showing 3 changed files with 28 additions and 8 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ but cannot always guarantee backwards compatibility. Changes that may **break co

**Fixed**

- Fixed a bug which raised an error when computing residuals (or backtest with "per time step" metrics) on multiple series with corresponding historical forecasts of different lengths. [#2604](https://github.com/unit8co/darts/pull/2604) by [Dennis Bader](https://github.com/dennisbader).
- Fixed a bug when using `darts.utils.data.tabularization.create_lagged_component_names()` with target `lags=None`, that did not return any lagged target label component names. [#2576](https://github.com/unit8co/darts/pull/2576) by [Dennis Bader](https://github.com/dennisbader).
- Fixed a bug when using `num_samples > 1` with a deterministic regression model and the optimized `historical_forecasts()` method, an exception was not raised. [#2588](https://github.com/unit8co/darts/pull/2588) by [Antoine Madrona](https://github.com/madtoinou).

Expand Down
15 changes: 13 additions & 2 deletions darts/models/forecasting/forecasting_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -1504,13 +1504,24 @@ def __getitem__(self, index) -> TimeSeries:
kwargs["insample"] = series_gen

errors.append(metric_f(series_gen, forecasts_list, **kwargs))
errors = np.array(errors)
try:
# multiple series can result in different number of forecasts; try if we can run it efficiently
errors = np.array(errors)
is_arr = True
except ValueError:
# otherwise, compute array later
is_arr = False

# get errors for each input `series`
backtest_list = []
for i in range(len(cum_len) - 1):
# errors_series with shape `(n metrics, n series specific historical forecasts, *)`
errors_series = errors[:, cum_len[i] : cum_len[i + 1]]
if is_arr:
errors_series = errors[:, cum_len[i] : cum_len[i + 1]]
else:
errors_series = np.array([
errors_[cum_len[i] : cum_len[i + 1]] for errors_ in errors
])

if reduction is not None:
# shape `(n metrics, n forecasts, *)` -> `(n metrics, *)`
Expand Down
20 changes: 14 additions & 6 deletions darts/tests/models/forecasting/test_residuals.py
Original file line number Diff line number Diff line change
Expand Up @@ -150,7 +150,8 @@ def test_output_single_series_hfc_lpo_false(self, config):
@pytest.mark.parametrize(
"config",
itertools.product(
[True, False],
[True, False], # is univariate
[True, False], # same lengths
[
(metrics.err, ((0.0, 0.0), (-1.0, -2.0))),
(metrics.ape, ((0.0, 0.0), (100.0, 100.0))),
Expand All @@ -159,10 +160,12 @@ def test_output_single_series_hfc_lpo_false(self, config):
)
def test_output_multi_series_hfc_lpo_true(self, config):
"""Tests residuals based on historical forecasts generated on multiple `series` with last_points_only=True"""
is_univariate, (metric, score_exp) = config
is_univariate, same_lengths, (metric, score_exp) = config
n_ts = 10
y = ct(value=1.0, length=n_ts)
hfc = ct(value=2.0, length=n_ts)
if not same_lengths:
y = y.append_values([1.0])
if not is_univariate:
y = y.stack(y + 1.0)
hfc = hfc.stack(hfc + 2.0)
Expand All @@ -173,8 +176,9 @@ def test_output_multi_series_hfc_lpo_true(self, config):
# expected residuals values of shape (n time steps, n components, n samples=1) per forecast
scores_exp = []
for i in range(len(hfc)):
num_fcs = len(hfc[i])
scores_exp.append(
np.array([score_exp[i][:n_comps]] * 10).reshape(n_ts, -1, 1)
np.array([score_exp[i][:n_comps]] * num_fcs).reshape(num_fcs, -1, 1)
)

model = NaiveDrift()
Expand Down Expand Up @@ -208,7 +212,8 @@ def test_output_multi_series_hfc_lpo_true(self, config):
@pytest.mark.parametrize(
"config",
itertools.product(
[True, False],
[True, False], # is univariate
[True, False], # same lengths
[
(metrics.err, ((0.0, 0.0), (-1.0, -2.0))),
(metrics.ape, ((0.0, 0.0), (100.0, 100.0))),
Expand All @@ -219,10 +224,12 @@ def test_output_multi_series_hfc_lpo_false(self, config):
"""Tests residuals based on historical forecasts generated on multiple `series` with
last_points_only=False.
"""
is_univariate, (metric, score_exp) = config
is_univariate, same_lengths, (metric, score_exp) = config
n_ts = 10
y = ct(value=1.0, length=n_ts)
hfc = ct(value=2.0, length=n_ts)
if not same_lengths:
y = y.append_values([1.0])
if not is_univariate:
y = y.stack(y + 1.0)
hfc = hfc.stack(hfc + 2.0)
Expand All @@ -233,8 +240,9 @@ def test_output_multi_series_hfc_lpo_false(self, config):
# expected residuals values of shape (n time steps, n components, n samples=1) per forecast
scores_exp = []
for i in range(len(hfc)):
num_fcs = len(hfc[i][0])
scores_exp.append(
np.array([score_exp[i][:n_comps]] * 10).reshape(n_ts, -1, 1)
np.array([score_exp[i][:n_comps]] * num_fcs).reshape(num_fcs, -1, 1)
)

model = NaiveDrift()
Expand Down

0 comments on commit d103a05

Please sign in to comment.