From 8dd30a1cf413556e906a5305acfd36e549aa8c82 Mon Sep 17 00:00:00 2001 From: "Wesley M. Gifford" Date: Wed, 10 Jul 2024 19:36:43 -0400 Subject: [PATCH 1/7] better handling for frequency token --- .../toolkit/time_series_preprocessor.py | 58 +++++++------------ 1 file changed, 20 insertions(+), 38 deletions(-) diff --git a/tsfm_public/toolkit/time_series_preprocessor.py b/tsfm_public/toolkit/time_series_preprocessor.py index d55a08af..a80af299 100644 --- a/tsfm_public/toolkit/time_series_preprocessor.py +++ b/tsfm_public/toolkit/time_series_preprocessor.py @@ -197,6 +197,9 @@ def __init__( self.target_scaler_dict = {} self.categorical_encoder = None self.frequency_mapping = frequency_mapping + + self._timedelta_map = self._get_timedelta_map() + self.freq = freq kwargs["processor_class"] = self.__class__.__name__ @@ -228,6 +231,17 @@ def _validate_columns(self): "A column name should appear only once in `target_columns`, `observable_colums`, `control_columnts`, `conditional_columns`, `categorical_columns`, and `static_columns`." ) + def _get_timedelta_map( + self, + ): + td_map = {} + for k, v in self.frequency_mapping.items(): + if k == "oov": + continue + td_str = str(pd._libs.tslibs.timedeltas.Timedelta(k if k[0].isdigit() else f"1{k}")) + td_map[td_str] = k + return td_map + def to_dict(self) -> Dict[str, Any]: """ Serializes this instance to a Python dictionary. @@ -317,44 +331,6 @@ def from_dict(cls, feature_extractor_dict: Dict[str, Any], **kwargs) -> "PreTrai return super().from_dict(feature_extractor_dict, **kwargs) - # def _prepare_single_time_series(self, name, d): - # """ - # Segment and prepare the time series based on the configuration arguments. - - # name: name for the time series, for example as a result of a grouping operation - # d: the data for a single time series - # """ - # for s_begin in range(d.shape[0] - self.context_length + 1): - # s_end = s_begin + self.context_length - # seq_x = d[self.input_columns].iloc[s_begin:s_end].values - - # if self.time_series_task == TimeSeriesTask.FORECASTING: - # seq_y = ( - # d[self.output_columns] - # .iloc[s_end : s_end + self.prediction_length] - # .values - # ) - # else: - # seq_y = None - # # to do: add handling of other types - - # if self.timestamp_column: - # ts = d[self.timestamp_column].iloc[s_end - 1] - # else: - # ts = None - - # if self.id_columns: - # ids = d[self.id_columns].iloc[s_end - 1].values - # else: - # ids = None - - # yield { - # "timestamp_column": ts, - # "id_columns": ids, - # "past_values": seq_x, - # "future_values": seq_y, - # } - @classmethod def _get_scaler_class(cls, scaler_type): if scaler_type == ScalerType.MINMAX.value: @@ -484,6 +460,12 @@ def _train_categorical_encoder(self, df: pd.DataFrame): def get_frequency_token(self, token_name: str): token = self.frequency_mapping.get(token_name, None) + # try lookup using timedelta directly + if token is None: + token_name_mapped = self._timedelta_map.get(token_name, None) + if token_name_mapped is not None: + token = self.frequency_mapping.get(token_name_mapped, None) + if token is None: warn(f"Frequency token {token_name} was not found in the frequncy token mapping.") token = self.frequency_mapping["oov"] From 5b77790bca7b794f2513a301f633f4d16c3eada9 Mon Sep 17 00:00:00 2001 From: "Wesley M. Gifford" Date: Wed, 10 Jul 2024 19:37:04 -0400 Subject: [PATCH 2/7] update ruff version --- pyproject.toml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 586921a8..89ae0b9b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -29,7 +29,7 @@ packages = ["tsfm_public", "tsfm_public.toolkit", "tsfm_public.models", "tsfm_pu notebooks = ["jupyter", "matplotlib", "datasets", "ipywidgets", "plotly", "kaleido", "tensorboard"] testing = ["pytest", "tsfm_public[notebooks]", "parameterized"] -dev = ["pre-commit", "tsfm_public[testing]", "ruff==0.1.5"] +dev = ["pre-commit", "tsfm_public[testing]", "ruff==0.4.4"] evaluation = [ "tsevaluate @ git+ssh://git@github.ibm.com/srom/tsevaluate.git", ] @@ -54,15 +54,15 @@ version_file = "tsfm_public/_version.py" [tool.ruff] # Never enforce `E501` (line length violations). -ignore = ["C901", "E501", "E741", "F402", "F823" ] -select = ["C", "E", "F", "I", "W"] +lint.ignore = ["C901", "E501", "E741", "F402", "F823" ] +lint.select = ["C", "E", "F", "I", "W"] line-length = 119 # Ignore import violations in all `__init__.py` files. -[tool.ruff.per-file-ignores] +[tool.ruff.lint.per-file-ignores] "__init__.py" = ["E402", "F401", "F403", "F811"] -[tool.ruff.isort] +[tool.ruff.lint.isort] lines-after-imports = 2 known-first-party = ["tsfm_public"] From 51beee77846c83d361203498ec1dc255669d4120 Mon Sep 17 00:00:00 2001 From: "Wesley M. Gifford" Date: Wed, 10 Jul 2024 19:37:55 -0400 Subject: [PATCH 3/7] format --- tests/models/tinytimemixer/test_modeling_tinytimemixer.py | 2 +- tests/toolkit/test_dataset.py | 1 - tests/toolkit/test_time_series_forecasting_pipeline.py | 1 + tsfm_public/models/tinytimemixer/configuration_tinytimemixer.py | 2 +- tsfm_public/models/tinytimemixer/modeling_tinytimemixer.py | 2 +- tsfm_public/models/tinytimemixer/utils/ttm_utils.py | 1 + 6 files changed, 5 insertions(+), 4 deletions(-) diff --git a/tests/models/tinytimemixer/test_modeling_tinytimemixer.py b/tests/models/tinytimemixer/test_modeling_tinytimemixer.py index 9897f4d1..3ce871c4 100644 --- a/tests/models/tinytimemixer/test_modeling_tinytimemixer.py +++ b/tests/models/tinytimemixer/test_modeling_tinytimemixer.py @@ -2,7 +2,7 @@ # # This code is based on the test code for PatchTSMixer in the HuggingFace Transformers Library: # https://github.com/huggingface/transformers/blob/main/tests/models/patchtsmixer/test_modeling_patchtsmixer.py -""" Testing suite for the PyTorch TinyTimeMixer model. """ +"""Testing suite for the PyTorch TinyTimeMixer model.""" # Standard import itertools diff --git a/tests/toolkit/test_dataset.py b/tests/toolkit/test_dataset.py index 18f27b68..6e23adad 100644 --- a/tests/toolkit/test_dataset.py +++ b/tests/toolkit/test_dataset.py @@ -3,7 +3,6 @@ """Tests basic dataset functions""" - from datetime import datetime, timedelta import numpy as np diff --git a/tests/toolkit/test_time_series_forecasting_pipeline.py b/tests/toolkit/test_time_series_forecasting_pipeline.py index defdd123..503e5c46 100644 --- a/tests/toolkit/test_time_series_forecasting_pipeline.py +++ b/tests/toolkit/test_time_series_forecasting_pipeline.py @@ -2,6 +2,7 @@ # """Tests the time series preprocessor and functions""" + import pandas as pd from transformers import PatchTSTForPrediction diff --git a/tsfm_public/models/tinytimemixer/configuration_tinytimemixer.py b/tsfm_public/models/tinytimemixer/configuration_tinytimemixer.py index b4aafa7f..0e7df2d0 100644 --- a/tsfm_public/models/tinytimemixer/configuration_tinytimemixer.py +++ b/tsfm_public/models/tinytimemixer/configuration_tinytimemixer.py @@ -1,6 +1,6 @@ # Copyright contributors to the TSFM project # -""" TinyTimeMixer model configuration""" +"""TinyTimeMixer model configuration""" from typing import Optional, Union diff --git a/tsfm_public/models/tinytimemixer/modeling_tinytimemixer.py b/tsfm_public/models/tinytimemixer/modeling_tinytimemixer.py index 504dfdd8..eb64518d 100644 --- a/tsfm_public/models/tinytimemixer/modeling_tinytimemixer.py +++ b/tsfm_public/models/tinytimemixer/modeling_tinytimemixer.py @@ -2,7 +2,7 @@ # # This code is based on layers and components from the PatchTSMixer model in the HuggingFace Transformers # Library: https://github.com/huggingface/transformers/blob/main/src/transformers/models/patchtsmixer/modeling_patchtsmixer.py -""" PyTorch TinyTimeMixer model.""" +"""PyTorch TinyTimeMixer model.""" # Standard import copy diff --git a/tsfm_public/models/tinytimemixer/utils/ttm_utils.py b/tsfm_public/models/tinytimemixer/utils/ttm_utils.py index 03bd457e..dbbb5d68 100644 --- a/tsfm_public/models/tinytimemixer/utils/ttm_utils.py +++ b/tsfm_public/models/tinytimemixer/utils/ttm_utils.py @@ -1,4 +1,5 @@ """Utilities for TTM notebooks""" + # Standard import argparse import os From 5e0ff9409e9b37e9a2856ee0157269be34cb8de3 Mon Sep 17 00:00:00 2001 From: "Wesley M. Gifford" Date: Wed, 10 Jul 2024 19:40:40 -0400 Subject: [PATCH 4/7] add docstring --- tsfm_public/toolkit/time_series_preprocessor.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/tsfm_public/toolkit/time_series_preprocessor.py b/tsfm_public/toolkit/time_series_preprocessor.py index a80af299..5324aee1 100644 --- a/tsfm_public/toolkit/time_series_preprocessor.py +++ b/tsfm_public/toolkit/time_series_preprocessor.py @@ -233,7 +233,12 @@ def _validate_columns(self): def _get_timedelta_map( self, - ): + ) -> Dict[str, str]: + """Get a mapping that relates timedeltas to frequencies in the frequency map. + + Returns: + Dict[str, str]: Dictionary of mappings from timedelta strings to frequency token names. + """ td_map = {} for k, v in self.frequency_mapping.items(): if k == "oov": From c67ffcda14c07392810d5021740797d566664550 Mon Sep 17 00:00:00 2001 From: "Wesley M. Gifford" Date: Wed, 10 Jul 2024 20:15:03 -0400 Subject: [PATCH 5/7] simpler solution, add test --- .../toolkit/test_time_series_preprocessor.py | 8 ++++ .../toolkit/time_series_preprocessor.py | 45 +++++++------------ 2 files changed, 25 insertions(+), 28 deletions(-) diff --git a/tests/toolkit/test_time_series_preprocessor.py b/tests/toolkit/test_time_series_preprocessor.py index bdb9c93e..8a52c7c9 100644 --- a/tests/toolkit/test_time_series_preprocessor.py +++ b/tests/toolkit/test_time_series_preprocessor.py @@ -429,6 +429,14 @@ def test_get_datasets_with_frequency_token(ts_data): assert train[0]["freq_token"] == DEFAULT_FREQUENCY_MAPPING["d"] +def test_get_frequency_token(): + tsp = TimeSeriesPreprocessor(timestamp_column="date") + + assert tsp.get_frequency_token("1h") == DEFAULT_FREQUENCY_MAPPING["h"] + assert tsp.get_frequency_token("h") == DEFAULT_FREQUENCY_MAPPING["h"] + assert tsp.get_frequency_token("0 days 01:00:00") == DEFAULT_FREQUENCY_MAPPING["h"] + + def test_id_columns_and_scaling_id_columns(ts_data_runs): df = ts_data_runs diff --git a/tsfm_public/toolkit/time_series_preprocessor.py b/tsfm_public/toolkit/time_series_preprocessor.py index 5324aee1..f65f3234 100644 --- a/tsfm_public/toolkit/time_series_preprocessor.py +++ b/tsfm_public/toolkit/time_series_preprocessor.py @@ -14,6 +14,7 @@ import pandas as pd from datasets import Dataset from deprecated import deprecated +from pandas.tseries.frequencies import to_offset from sklearn.preprocessing import MinMaxScaler as MinMaxScaler_ from sklearn.preprocessing import OrdinalEncoder as OrdinalEncoder_ from sklearn.preprocessing import StandardScaler as StandardScaler_ @@ -197,9 +198,6 @@ def __init__( self.target_scaler_dict = {} self.categorical_encoder = None self.frequency_mapping = frequency_mapping - - self._timedelta_map = self._get_timedelta_map() - self.freq = freq kwargs["processor_class"] = self.__class__.__name__ @@ -231,22 +229,6 @@ def _validate_columns(self): "A column name should appear only once in `target_columns`, `observable_colums`, `control_columnts`, `conditional_columns`, `categorical_columns`, and `static_columns`." ) - def _get_timedelta_map( - self, - ) -> Dict[str, str]: - """Get a mapping that relates timedeltas to frequencies in the frequency map. - - Returns: - Dict[str, str]: Dictionary of mappings from timedelta strings to frequency token names. - """ - td_map = {} - for k, v in self.frequency_mapping.items(): - if k == "oov": - continue - td_str = str(pd._libs.tslibs.timedeltas.Timedelta(k if k[0].isdigit() else f"1{k}")) - td_map[td_str] = k - return td_map - def to_dict(self) -> Dict[str, Any]: """ Serializes this instance to a Python dictionary. @@ -464,16 +446,23 @@ def _train_categorical_encoder(self, df: pd.DataFrame): def get_frequency_token(self, token_name: str): token = self.frequency_mapping.get(token_name, None) + if token is not None: + return token - # try lookup using timedelta directly - if token is None: - token_name_mapped = self._timedelta_map.get(token_name, None) - if token_name_mapped is not None: - token = self.frequency_mapping.get(token_name_mapped, None) - - if token is None: - warn(f"Frequency token {token_name} was not found in the frequncy token mapping.") - token = self.frequency_mapping["oov"] + # try to map as a frequency string + try: + token_name_offs = to_offset(token_name).freqstr + token = self.frequency_mapping.get(token_name_offs, None) + return token + except ValueError: + # lastly try to map the timedelta to a frequency string + token_name_td = pd._libs.tslibs.timedeltas.Timedelta(token_name) + token_name_offs = to_offset(token_name_td).freqstr + token = self.frequency_mapping.get(token_name_offs, None) + return token + + warn(f"Frequency token {token_name} was not found in the frequncy token mapping.") + token = self.frequency_mapping["oov"] return token From 08423873d6c7d6407647f7db5d3cd7c3bfa0a8f5 Mon Sep 17 00:00:00 2001 From: "Wesley M. Gifford" Date: Wed, 10 Jul 2024 20:50:00 -0400 Subject: [PATCH 6/7] exclude autogenerated _version.py --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index 89ae0b9b..697ccbec 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -57,6 +57,7 @@ version_file = "tsfm_public/_version.py" lint.ignore = ["C901", "E501", "E741", "F402", "F823" ] lint.select = ["C", "E", "F", "I", "W"] line-length = 119 +extend-exclude = ["tsfm_public/_version.py"] # Ignore import violations in all `__init__.py` files. [tool.ruff.lint.per-file-ignores] From ea4dd6ef4651f1cd529010beb7aa081cd253ecfc Mon Sep 17 00:00:00 2001 From: "Wesley M. Gifford" Date: Wed, 10 Jul 2024 21:03:58 -0400 Subject: [PATCH 7/7] check for None --- tsfm_public/toolkit/time_series_preprocessor.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tsfm_public/toolkit/time_series_preprocessor.py b/tsfm_public/toolkit/time_series_preprocessor.py index f65f3234..f32258cc 100644 --- a/tsfm_public/toolkit/time_series_preprocessor.py +++ b/tsfm_public/toolkit/time_series_preprocessor.py @@ -453,13 +453,15 @@ def get_frequency_token(self, token_name: str): try: token_name_offs = to_offset(token_name).freqstr token = self.frequency_mapping.get(token_name_offs, None) - return token + if token is not None: + return token except ValueError: # lastly try to map the timedelta to a frequency string token_name_td = pd._libs.tslibs.timedeltas.Timedelta(token_name) token_name_offs = to_offset(token_name_td).freqstr token = self.frequency_mapping.get(token_name_offs, None) - return token + if token is not None: + return token warn(f"Frequency token {token_name} was not found in the frequncy token mapping.") token = self.frequency_mapping["oov"]