Skip to content

Commit

Permalink
Merge pull request #39 from IBM/issue_38
Browse files Browse the repository at this point in the history
Issue 38
  • Loading branch information
wgifford authored Apr 24, 2024
2 parents 064cbb8 + 6017681 commit 92d3fa2
Show file tree
Hide file tree
Showing 5 changed files with 71 additions and 23 deletions.
17 changes: 8 additions & 9 deletions .vscode/settings.json
Original file line number Diff line number Diff line change
@@ -1,18 +1,17 @@
{
"[python]": {
"editor.defaultFormatter": "ms-python.black-formatter",
"editor.defaultFormatter": "charliermarsh.ruff",
"editor.formatOnSave": true,
"editor.codeActionsOnSave": {
"source.organizeImports": "explicit"
}
},
"isort.args": [
"--profile",
"black"
],
"python.testing.pytestArgs": [
"tests"
],
"python.testing.unittestEnabled": false,
"python.testing.pytestEnabled": true
"python.testing.pytestEnabled": true,
"ruff.format.args": [
"--config=./pyproject.toml"
],
"ruff.lint.args": [
"--config=./pyproject.toml"
]
}
17 changes: 15 additions & 2 deletions tests/toolkit/test_time_series_preprocessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -212,6 +212,18 @@ def test_create_timestamps():
2,
[103.5, 107.0],
),
(
pd.Timestamp(2021, 12, 31),
"QE",
None,
4,
[
pd.Timestamp(2022, 3, 31),
pd.Timestamp(2022, 6, 30),
pd.Timestamp(2022, 9, 30),
pd.Timestamp(2022, 12, 31),
],
),
]

for start, freq, sequence, periods, expected in test_cases:
Expand All @@ -220,8 +232,9 @@ def test_create_timestamps():
assert ts == expected

# test based on provided sequence
ts = create_timestamps(start, time_sequence=sequence, periods=periods)
assert ts == expected
if sequence is not None:
ts = create_timestamps(start, time_sequence=sequence, periods=periods)
assert ts == expected

# it is an error to provide neither freq or sequence
with pytest.raises(ValueError):
Expand Down
2 changes: 1 addition & 1 deletion tsfm_public/toolkit/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -232,7 +232,7 @@ def get_group_data(
):
return cls(
data_df=group,
group_id=group_id,
group_id=group_id if isinstance(group_id, tuple) else (group_id,),
id_columns=id_columns,
timestamp_column=timestamp_column,
context_length=context_length,
Expand Down
4 changes: 4 additions & 0 deletions tsfm_public/toolkit/time_series_forecasting_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -300,6 +300,10 @@ def postprocess(self, input, **kwargs):
"""Postprocess step
Takes the dictionary of outputs from the previous step and converts to a more user
readable pandas format.
If the explode forecasts option is True, then individual forecasts are expanded as multiple
rows in the dataframe. This should only be used when producing a single forecast (i.e., unexploded
result is one row per ID).
"""
out = {}

Expand Down
54 changes: 43 additions & 11 deletions tsfm_public/toolkit/time_series_preprocessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -344,11 +344,29 @@ def _standardize_dataframe(

return df

def _clean_up_dataframe(self, df: pd.DataFrame) -> None:
    """Removes, in place, columns added during internal processing of the provided dataframe.

    Currently, the following checks are done:
    - Remove INTERNAL_ID_COLUMN if present, but only when the preprocessor has no
      user-specified ``id_columns`` (when ``id_columns`` is set, that column is
      presumably meaningful to the caller and is left untouched -- confirm against
      _standardize_dataframe, which adds the internal column).

    Args:
        df (pd.DataFrame): Input pandas dataframe; modified in place via
            ``DataFrame.drop(..., inplace=True)``.

    Returns:
        None: The dataframe is mutated in place; no value is returned (the signature
        is ``-> None`` -- an earlier docstring incorrectly claimed a DataFrame return).
    """

    if not self.id_columns:
        if INTERNAL_ID_COLUMN in df.columns:
            df.drop(columns=INTERNAL_ID_COLUMN, inplace=True)

def _get_groups(
self,
dataset: pd.DataFrame,
) -> Generator[Tuple[Any, pd.DataFrame], None, None]:
"""Get groups of the time series dataset (multi-time series) based on the ID columns.
"""Get groups of the time series dataset (multi-time series) based on the ID columns for scaling.
Note that this is used for scaling purposes only.
Args:
dataset (pd.DataFrame): Input dataset
Expand Down Expand Up @@ -472,7 +490,7 @@ def _check_dataset(self, dataset: Union[Dataset, pd.DataFrame]):

def _set_targets(self, dataset: pd.DataFrame) -> None:
if self.target_columns == []:
skip_columns = copy.copy(self.id_columns)
skip_columns = copy.copy(self.id_columns) + [INTERNAL_ID_COLUMN]
if self.timestamp_column:
skip_columns.append(self.timestamp_column)

Expand Down Expand Up @@ -531,6 +549,7 @@ def train(
if self.encode_categorical:
self._train_categorical_encoder(df)

self._clean_up_dataframe(df)
return self

def inverse_scale_targets(
Expand Down Expand Up @@ -581,10 +600,12 @@ def inverse_scale_func(grp, id_columns):
else:
id_columns = INTERNAL_ID_COLUMN

return df.groupby(id_columns, group_keys=False).apply(
df_inv = df.groupby(id_columns, group_keys=False).apply(
inverse_scale_func,
id_columns=id_columns,
)
self._clean_up_dataframe(df_inv)
return df_inv

def preprocess(
self,
Expand Down Expand Up @@ -640,6 +661,7 @@ def scale_func(grp, id_columns):
raise RuntimeError("Attempt to encode categorical columns, but the encoder has not been trained yet.")
df[cols_to_encode] = self.categorical_encoder.transform(df[cols_to_encode])

self._clean_up_dataframe(df)
return df

def get_datasets(
Expand Down Expand Up @@ -759,14 +781,24 @@ def create_timestamps(

# more complex logic is required to support all edge cases
if isinstance(freq, (pd.Timedelta, datetime.timedelta, str)):
if isinstance(freq, str):
freq = pd._libs.tslibs.timedeltas.Timedelta(freq)

return pd.date_range(
last_timestamp,
freq=freq,
periods=periods + 1,
).tolist()[1:]
try:
# try date range directly
return pd.date_range(
last_timestamp,
freq=freq,
periods=periods + 1,
).tolist()[1:]
except ValueError as e:
# if it fails, we can try to compute a timedelta from the provided string
if isinstance(freq, str):
freq = pd._libs.tslibs.timedeltas.Timedelta(freq)
return pd.date_range(
last_timestamp,
freq=freq,
periods=periods + 1,
).tolist()[1:]
else:
raise e
else:
# numerical timestamp column
return [last_timestamp + i * freq for i in range(1, periods + 1)]
Expand Down

0 comments on commit 92d3fa2

Please sign in to comment.