Add tests, update get_dataset for target columns
Signed-off-by: Wesley M. Gifford <[email protected]>
wgifford committed Mar 29, 2024
1 parent 11509f0 commit 5d6178f
Showing 2 changed files with 30 additions and 14 deletions.
13 changes: 13 additions & 0 deletions tests/toolkit/test_time_series_preprocessor.py
@@ -284,3 +284,16 @@ def test_train_without_targets(ts_data):
    tsp.train(ts_data)

    assert tsp.target_columns == ["value2"]


def test_get_datasets_without_targets(ts_data):
    ts_data = ts_data.drop(columns=["id", "id2"])
    tsp = TimeSeriesPreprocessor(
        timestamp_column="timestamp",
        prediction_length=2,
        context_length=5,
    )

    train, _, _ = tsp.get_datasets(ts_data, split_config={"train": 0.7, "test": 0.2})

    assert train.datasets[0].target_columns == ["value1", "value2"]
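For reference, the new test leans on the shared ts_data fixture defined earlier in this test module. A minimal stand-in consistent with the columns the test expects might look like the following; the column names match the test, while the lengths, id values, and frequency are assumptions for illustration only:

import pandas as pd
import pytest

@pytest.fixture
def ts_data():
    # Stand-in fixture: two id columns, a timestamp column, and two value
    # columns that become the inferred targets once the ids are dropped.
    timestamps = pd.date_range("2021-01-01", periods=50, freq="h")
    return pd.DataFrame(
        {
            "id": ["A"] * 50,
            "id2": ["B"] * 50,
            "timestamp": timestamps,
            "value1": list(range(50)),
            "value2": [float(v) for v in range(50)],
        }
    )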
31 changes: 17 additions & 14 deletions tsfm_public/toolkit/time_series_preprocessor.py
@@ -510,6 +510,7 @@ def train(
        self._check_dataset(dataset)
        df = self._standardize_dataframe(dataset)
        self._set_targets(df)
        self._validate_columns()

        if self.freq is None:
            self._estimate_frequency(df)
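train now calls self._validate_columns() right after the targets are resolved. The helper's body is not part of this diff; as a rough sketch of what such a check typically does (assumed behavior, using only column attributes that appear elsewhere in the diff):

def _validate_columns(self):
    # Hypothetical sketch -- the real implementation is not shown in this commit.
    # Ensure no column is assigned to more than one role.
    seen = {}
    for column in (
        self.target_columns
        + self.observable_columns
        + self.control_columns
        + self.conditional_columns
    ):
        seen[column] = seen.get(column, 0) + 1
    duplicates = [c for c, n in seen.items() if n > 1]
    if duplicates:
        raise ValueError(f"Columns are assigned to multiple roles: {duplicates}")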
@@ -563,7 +564,6 @@ def preprocess(
        # 2) incremental / batch based processing of datasets to minimize memory impact

        self._check_dataset(dataset)

        df = self._standardize_dataframe(dataset)

        if self.scaling:
@@ -647,22 +647,14 @@ def get_datasets(

        data = self._standardize_dataframe(dataset)

        # get split_params
        # split_params = get_split_params(config, self.context_length, len(data))
        if not self.context_length:
            raise ValueError("TimeSeriesPreprocessor must be instantiated with non-null context_length")
        if not self.prediction_length:
            raise ValueError("TimeSeriesPreprocessor must be instantiated with non-null prediction_length")

        # get split_params
        split_params, split_function = get_split_params(split_config, context_length=self.context_length)

        # specify columns
        column_specifiers = {
            "id_columns": self.id_columns,
            "timestamp_column": self.timestamp_column,
            "target_columns": self.target_columns,
            "observable_columns": self.observable_columns,
            "control_columns": self.control_columns,
            "conditional_columns": self.conditional_columns,
            "static_categorical_columns": self.static_categorical_columns,
        }

        # split data
        if isinstance(split_function, dict):
            train_data = split_function["train"](data, id_columns=self.id_columns, **split_params["train"])
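For orientation, get_split_params converts the fractional split_config (e.g. {"train": 0.7, "test": 0.2} from the new test) into per-split parameters and split functions, which are then applied to the data with id_columns passed through, as shown above. A purely illustrative fraction-based splitter, not the toolkit's implementation:

import pandas as pd

def split_by_fraction(df: pd.DataFrame, train: float = 0.7, test: float = 0.2):
    # Illustrative only: slice a single series into train/valid/test by row fraction.
    n = len(df)
    train_end = int(n * train)
    valid_end = int(n * (1.0 - test))
    return df.iloc[:train_end], df.iloc[train_end:valid_end], df.iloc[valid_end:]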
@@ -674,6 +666,17 @@
        # data preprocessing
        self.train(train_data)

        # specify columns
        column_specifiers = {
            "id_columns": self.id_columns,
            "timestamp_column": self.timestamp_column,
            "target_columns": self.target_columns,
            "observable_columns": self.observable_columns,
            "control_columns": self.control_columns,
            "conditional_columns": self.conditional_columns,
            "static_categorical_columns": self.static_categorical_columns,
        }

        # handle fewshot operation
        if fewshot_fraction is not None:
            if not ((fewshot_fraction <= 1.0) and (fewshot_fraction > 0.0)):
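The substantive change in get_datasets is the ordering: column_specifiers is now built after self.train(train_data), so target columns inferred during training, rather than an empty list from the constructor, are what reach dataset construction. End-to-end, the behavior exercised by the new test looks roughly like this (DataFrame contents and frequency are illustrative; the split_config, column names, and constructor arguments come from the test above):

import pandas as pd
from tsfm_public.toolkit.time_series_preprocessor import TimeSeriesPreprocessor

df = pd.DataFrame(
    {
        "timestamp": pd.date_range("2021-01-01", periods=50, freq="h"),
        "value1": list(range(50)),
        "value2": list(range(50)),
    }
)

tsp = TimeSeriesPreprocessor(timestamp_column="timestamp", context_length=5, prediction_length=2)

# No target_columns given: train() infers them, and get_datasets now builds
# its column specifiers only after that inference has happened.
train, valid, test = tsp.get_datasets(df, split_config={"train": 0.7, "test": 0.2})
assert train.datasets[0].target_columns == ["value1", "value2"]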
