
Create python-package.yml #18

Closed
wants to merge 4 commits
39 changes: 39 additions & 0 deletions .github/workflows/python-package.yml
@@ -0,0 +1,39 @@
# This workflow will install Python dependencies, run tests and lint with a variety of Python versions
# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python

name: Python package

on:
  push:
    branches: [ "main" ]
  pull_request:
    branches: [ "main" ]

jobs:
  build:

    runs-on: ubuntu-latest
    strategy:
      fail-fast: false
      matrix:
        python-version: ["3.9", "3.10"]

    steps:
    - uses: actions/checkout@v3
    - name: Set up Python ${{ matrix.python-version }}
      uses: actions/setup-python@v3
      with:
        python-version: ${{ matrix.python-version }}
    - name: Install dependencies
      run: |
        python -m pip install --upgrade pip
        # python -m pip install flake8 pytest
        # if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
        pip install ".[dev]"
    - name: Ruff quality checks
      run: |
        # use hf-style check
        make quality
    - name: Test with pytest
      run: |
        pytest
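Note that the `pip install ".[dev]"` step only succeeds if the project's packaging metadata declares a `dev` extra that pulls in the tools the later steps call (`ruff` via `make quality`, and `pytest`). A minimal sketch of what that metadata might look like; the distribution name and dependency list below are assumptions for illustration, not the repository's actual setup.py:

# setup.py (hypothetical sketch): declares the `dev` extra that the
# workflow's `pip install ".[dev]"` step relies on.
from setuptools import find_packages, setup

setup(
    name="tsfm_public",  # assumed distribution name
    version="0.0.1",
    packages=find_packages(),
    extras_require={
        # assumed contents; the real extra may pin versions or add more tools
        "dev": ["pytest", "ruff"],
    },
)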
10 changes: 5 additions & 5 deletions Makefile
@@ -1,18 +1,18 @@
 # Adapted from HF Transformers: https://github.com/huggingface/transformers/tree/main
 .PHONY: quality style
 
-check_dirs := tests src tsfm notebooks
+check_dirs := tests tsfm_public tsfmhfdemos notebooks
 
 
 # this target runs checks on all files
 
 quality:
-	ruff check $(check_dirs) setup.py
-	ruff format --check $(check_dirs) setup.py
+	ruff check $(check_dirs)
+	ruff format --check $(check_dirs)
 
 # this target runs checks on all files and potentially modifies some of them
 
 style:
-	ruff check $(check_dirs) setup.py --fix
-	ruff format $(check_dirs) setup.py
+	ruff check $(check_dirs) --fix
+	ruff format $(check_dirs)

10 changes: 4 additions & 6 deletions tests/toolkit/test_dataset.py
@@ -52,9 +52,7 @@ def test_ts_padding(ts_data):
 
     # test date handled
     # integer
-    assert df_padded.iloc[0]["time_int"] == df.iloc[0]["time_int"] - (
-        context_length - df.shape[0]
-    )
+    assert df_padded.iloc[0]["time_int"] == df.iloc[0]["time_int"] - (context_length - df.shape[0])
 
     # date
     df_padded = ts_padding(
@@ -64,9 +62,9 @@ def test_ts_padding(ts_data):
         context_length=context_length,
     )
 
-    assert df_padded.iloc[0]["time_date"] == df.iloc[0]["time_date"] - (
-        context_length - df.shape[0]
-    ) * timedelta(days=1)
+    assert df_padded.iloc[0]["time_date"] == df.iloc[0]["time_date"] - (context_length - df.shape[0]) * timedelta(
+        days=1
+    )
 
 
 def test_pretrain_df_dataset(ts_data):
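The arithmetic these assertions check is easy to verify by hand: `ts_padding` prepends `context_length - len(df)` rows, so the first padded timestamp sits that many periods before the original first value. A tiny worked example with made-up numbers (not the test fixtures):

# If df has 3 rows with time_int = [5, 6, 7] and context_length = 5,
# two rows are prepended, so the padded series should start at 5 - 2 = 3.
df_rows = [5, 6, 7]
context_length = 5
fill_length = context_length - len(df_rows)  # 2 rows of padding
first_padded = df_rows[0] - fill_length      # 3
assert first_padded == 3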
40 changes: 10 additions & 30 deletions tsfm_public/toolkit/dataset.py
@@ -47,22 +47,14 @@ def __init__(
             y_cols = [y_cols]
 
         if len(x_cols) > 0:
-            assert is_cols_in_df(
-                data_df, x_cols
-            ), f"one or more {x_cols} is not in the list of data_df columns"
+            assert is_cols_in_df(data_df, x_cols), f"one or more {x_cols} is not in the list of data_df columns"
 
         if len(y_cols) > 0:
-            assert is_cols_in_df(
-                data_df, y_cols
-            ), f"one or more {y_cols} is not in the list of data_df columns"
+            assert is_cols_in_df(data_df, y_cols), f"one or more {y_cols} is not in the list of data_df columns"
 
         if datetime_col:
-            assert datetime_col in list(
-                data_df.columns
-            ), f"{datetime_col} is not in the list of data_df columns"
-            assert (
-                datetime_col not in x_cols
-            ), f"{datetime_col} should not be in the list of x_cols"
+            assert datetime_col in list(data_df.columns), f"{datetime_col} is not in the list of data_df columns"
+            assert datetime_col not in x_cols, f"{datetime_col} should not be in the list of x_cols"
 
         self.data_df = data_df
         self.datetime_col = datetime_col
@@ -160,9 +152,7 @@ def __init__(
         cls=BaseDFDataset,
     ):
         if len(id_columns) > 0:
-            assert is_cols_in_df(
-                data_df, id_columns
-            ), f"{id_columns} is not in the data_df columns"
+            assert is_cols_in_df(data_df, id_columns), f"{id_columns} is not in the data_df columns"
 
         self.datetime_col = datetime_col
         self.id_columns = id_columns
@@ -398,9 +388,7 @@ def __getitem__(self, time_id):
         # seq_x: batch_size x seq_len x num_x_cols
         seq_x = self.X[time_id : time_id + self.seq_len].values
         # seq_y: batch_size x pred_len x num_x_cols
-        seq_y = self.y[
-            time_id + self.seq_len : time_id + self.seq_len + self.pred_len
-        ].values
+        seq_y = self.y[time_id + self.seq_len : time_id + self.seq_len + self.pred_len].values
 
         ret = {
             "past_values": np_to_torch(seq_x),
@@ -490,9 +478,7 @@ def __init__(
     def __getitem__(self, time_id):
         # seq_x: batch_size x seq_len x num_x_cols
         seq_x = self.X[time_id : time_id + self.seq_len].values
-        seq_y = self.y[
-            time_id + self.seq_len - 1 : time_id + self.seq_len
-        ].values.ravel()
+        seq_y = self.y[time_id + self.seq_len - 1 : time_id + self.seq_len].values.ravel()
         # return _torch(seq_x, seq_y)
 
         ret = {
@@ -582,16 +568,12 @@ def ts_padding(
     if df[timestamp_column].dtype in ["<M8[ns]", "datetime64", "int"]:
         last_timestamp = df.iloc[0][timestamp_column]
         period = df.iloc[1][timestamp_column] - df.iloc[0][timestamp_column]
-        prepended_timestamps = [
-            last_timestamp + offset * period for offset in range(-fill_length, 0)
-        ]
+        prepended_timestamps = [last_timestamp + offset * period for offset in range(-fill_length, 0)]
         pad_df[timestamp_column] = prepended_timestamps
     else:
         pad_df[timestamp_column] = None
     # Ensure same type
-    pad_df[timestamp_column] = pad_df[timestamp_column].astype(
-        df[timestamp_column].dtype
-    )
+    pad_df[timestamp_column] = pad_df[timestamp_column].astype(df[timestamp_column].dtype)
 
     if id_columns:
         id_values = df.iloc[0][id_columns].to_list()
@@ -632,6 +614,4 @@ def is_cols_in_df(df: pd.DataFrame, cols: List[str]) -> bool:
     d6 = PretrainDFDataset(data_df=df, x_cols=["A", "B"], group_ids=["g1"], seq_len=2)
     print(f"d6: {d6}")
 
-    d7 = ForecastDFDataset(
-        data_df=df, x_cols=["A", "B"], group_ids=["g1"], seq_len=2, pred_len=2
-    )
+    d7 = ForecastDFDataset(data_df=df, x_cols=["A", "B"], group_ids=["g1"], seq_len=2, pred_len=2)
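The two `__getitem__` hunks above implement standard sliding-window slicing: the forecasting dataset takes `seq_len` past rows as input and the following `pred_len` rows as target, while the other dataset takes only the last row of the window as target. A self-contained sketch of the forecasting case with made-up data (the variable names are illustrative, not the module's API):

import pandas as pd

# toy series with two columns
X = pd.DataFrame({"A": range(10), "B": range(10, 20)})
seq_len, pred_len, time_id = 3, 2, 0

seq_x = X[time_id : time_id + seq_len].values                       # past window, shape (3, 2)
seq_y = X[time_id + seq_len : time_id + seq_len + pred_len].values  # future window, shape (2, 2)
assert seq_x.shape == (3, 2) and seq_y.shape == (2, 2)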
14 changes: 3 additions & 11 deletions tsfm_public/toolkit/time_series_forecasting_pipeline.py
@@ -63,7 +63,7 @@ def _sanitize_parameters(self, **kwargs):
 
         return preprocess_kwargs, {}, postprocess_kwargs
 
-    def __call__(self, time_series: Union["pandas.DataFrame", str], **kwargs):
+    def __call__(self, time_series: Union["pd.DataFrame", str], **kwargs):
         """Main method of the forecasting pipeline. Takes the input time series data (in tabular format) and
         produces predictions.
 
@@ -146,11 +146,7 @@ def _forward(self, model_inputs, **kwargs):
 
         # copy the other inputs
         copy_inputs = True
-        for k in [
-            akey
-            for akey in model_inputs.keys()
-            if (akey not in model_input_keys) or copy_inputs
-        ]:
+        for k in [akey for akey in model_inputs.keys() if (akey not in model_input_keys) or copy_inputs]:
             model_outputs[k] = model_inputs[k]
 
         return model_outputs
@@ -162,11 +158,7 @@ def postprocess(self, input, **kwargs):
         """
         out = {}
 
-        model_output_key = (
-            "prediction_outputs"
-            if "prediction_outputs" in input.keys()
-            else "prediction_logits"
-        )
+        model_output_key = "prediction_outputs" if "prediction_outputs" in input.keys() else "prediction_logits"
 
         for i, c in enumerate(kwargs["output_columns"]):
             out[f"{c}_prediction"] = input[model_output_key][:, :, i].numpy().tolist()
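The postprocess hunk reads naturally as a small standalone function: pick `prediction_outputs` when the model returns it, fall back to `prediction_logits`, and emit one `<column>_prediction` list per output column. A sketch under assumed tensor shapes (batch, pred_len, n_cols), using torch only to fake a model output:

import torch

def postprocess_sketch(model_output: dict, output_columns: list) -> dict:
    # mirrors the key-selection logic in the hunk above
    key = "prediction_outputs" if "prediction_outputs" in model_output else "prediction_logits"
    return {
        f"{c}_prediction": model_output[key][:, :, i].numpy().tolist()
        for i, c in enumerate(output_columns)
    }

out = postprocess_sketch({"prediction_outputs": torch.zeros(1, 4, 2)}, ["A", "B"])
# out["A_prediction"] is a 1 x 4 nested list of floats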
47 changes: 14 additions & 33 deletions tsfm_public/toolkit/time_series_preprocessor.py
@@ -11,7 +11,10 @@
 import pandas as pd
 from datasets import Dataset
 from sklearn.preprocessing import StandardScaler
-from transformers.feature_extraction_utils import FeatureExtractionMixin
+from transformers.feature_extraction_utils import (
+    FeatureExtractionMixin,
+    PreTrainedFeatureExtractor,
+)
 
 
 # Local
@@ -37,9 +40,7 @@ def to_dict(self) -> Dict[str, Any]:
         return output
 
     @classmethod
-    def from_dict(
-        cls, feature_extractor_dict: Dict[str, Any], **kwargs
-    ) -> "TimeSeriesScaler":
+    def from_dict(cls, feature_extractor_dict: Dict[str, Any], **kwargs) -> "TimeSeriesScaler":
         """
         Instantiates a TimeSeriesScaler from a Python dictionary of parameters.
 
@@ -59,18 +60,12 @@ def from_dict(
         init_param_names = ["copy", "with_mean", "with_std"]
 
         init_params = {}
-        for k, v in [
-            (k, v) for k, v in feature_extractor_dict.items() if k in init_param_names
-        ]:
+        for k, v in [(k, v) for k, v in feature_extractor_dict.items() if k in init_param_names]:
             init_params[k] = v
 
         t = TimeSeriesScaler(**init_params)
 
-        for k, v in [
-            (k, v)
-            for k, v in feature_extractor_dict.items()
-            if k not in init_param_names
-        ]:
+        for k, v in [(k, v) for k, v in feature_extractor_dict.items() if k not in init_param_names]:
             setattr(t, k, v)
 
         return t
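The deserialization pattern in this hunk is worth noting: constructor parameters go through `__init__`, and everything else (e.g. fitted state such as `mean_` and `scale_` from sklearn's StandardScaler) is restored afterwards with `setattr`. A condensed standalone illustration of that split, using a plain StandardScaler as a hypothetical stand-in for TimeSeriesScaler:

from sklearn.preprocessing import StandardScaler

def scaler_from_dict(d: dict) -> StandardScaler:
    # split serialized fields into constructor args and fitted attributes
    init_param_names = ["copy", "with_mean", "with_std"]
    t = StandardScaler(**{k: v for k, v in d.items() if k in init_param_names})
    for k, v in d.items():
        if k not in init_param_names:
            setattr(t, k, v)  # restore fitted state, e.g. mean_, scale_
    return t

t = scaler_from_dict({"with_mean": True, "mean_": [1.0], "scale_": [2.0]})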
@@ -104,9 +99,7 @@ def __init__(
         # note base class __init__ methods sets all arguments as attributes
 
         if not isinstance(id_columns, list):
-            raise ValueError(
-                f"Invalid argument provided for `id_columns`: {id_columns}"
-            )
+            raise ValueError(f"Invalid argument provided for `id_columns`: {id_columns}")
 
         self.timestamp_column = timestamp_column
         self.input_columns = input_columns
@@ -117,7 +110,7 @@ def __init__(
         self.scaling = scaling
         self.time_series_task = time_series_task
         self.scale_outputs = scale_outputs
-        self.scaler_dict = dict()
+        self.scaler_dict = {}
 
         kwargs["processor_class"] = self.__class__.__name__
 
@@ -138,9 +131,7 @@ def to_dict(self) -> Dict[str, Any]:
         return output
 
     @classmethod
-    def from_dict(
-        cls, feature_extractor_dict: Dict[str, Any], **kwargs
-    ) -> "PreTrainedFeatureExtractor":
+    def from_dict(cls, feature_extractor_dict: Dict[str, Any], **kwargs) -> PreTrainedFeatureExtractor:
         """
         Instantiates a type of [`~feature_extraction_utils.FeatureExtractionMixin`] from a Python dictionary of
         parameters.
@@ -178,11 +169,7 @@ def _prepare_single_time_series(self, name, d):
         seq_x = d[self.input_columns].iloc[s_begin:s_end].values
 
         if self.time_series_task == TimeSeriesTask.FORECASTING:
-            seq_y = (
-                d[self.output_columns]
-                .iloc[s_end : s_end + self.prediction_length]
-                .values
-            )
+            seq_y = d[self.output_columns].iloc[s_end : s_end + self.prediction_length].values
         else:
             seq_y = None
             # to do: add handling of other types
@@ -223,9 +210,7 @@ def _get_groups(
         dataset: pd.DataFrame,
     ):
         if self.id_columns:
-            group_by_columns = (
-                self.id_columns if len(self.id_columns) > 1 else self.id_columns[0]
-            )
+            group_by_columns = self.id_columns if len(self.id_columns) > 1 else self.id_columns[0]
         else:
             group_by_columns = INTERNAL_ID_COLUMN
 
@@ -245,9 +230,7 @@ def _get_columns_to_scale(
         """
         cols_to_scale = copy.copy(self.input_columns)
         if self.scale_outputs:
-            cols_to_scale.extend(
-                [c for c in self.output_columns if c not in self.input_columns]
-            )
+            cols_to_scale.extend([c for c in self.output_columns if c not in self.input_columns])
         return cols_to_scale
 
     def train(
def train(
Expand Down Expand Up @@ -312,9 +295,7 @@ def scale_func(grp, id_columns):

df = self._standardize_dataframe(dataset)
if self.id_columns:
id_columns = (
self.id_columns if len(self.id_columns) > 1 else self.id_columns[0]
)
id_columns = self.id_columns if len(self.id_columns) > 1 else self.id_columns[0]
else:
id_columns = INTERNAL_ID_COLUMN

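The `scale_func` hunk implies per-series normalization: one scaler per id-column group, keyed by group name and stored in `self.scaler_dict`. A minimal sketch of that behavior with toy data (assumed, since the full `train` body is collapsed in this diff):

import pandas as pd
from sklearn.preprocessing import StandardScaler

df = pd.DataFrame({"id": ["a", "a", "b", "b"], "val": [1.0, 2.0, 10.0, 20.0]})
scaler_dict = {}
for name, grp in df.groupby("id"):
    # each series gets its own fitted scaler, as in scaler_dict[name]
    scaler_dict[name] = StandardScaler().fit(grp[["val"]])
scaled_a = scaler_dict["a"].transform(df.loc[df["id"] == "a", ["val"]])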