Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve frequency handling #84

Merged
merged 7 commits into from
Jul 11, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 6 additions & 5 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ packages = ["tsfm_public", "tsfm_public.toolkit", "tsfm_public.models", "tsfm_pu

notebooks = ["jupyter", "matplotlib", "datasets", "ipywidgets", "plotly", "kaleido", "tensorboard"]
testing = ["pytest", "tsfm_public[notebooks]", "parameterized"]
dev = ["pre-commit", "tsfm_public[testing]", "ruff==0.1.5"]
dev = ["pre-commit", "tsfm_public[testing]", "ruff==0.4.4"]
evaluation = [
"tsevaluate @ git+ssh://[email protected]/srom/tsevaluate.git",
]
Expand All @@ -54,15 +54,16 @@ version_file = "tsfm_public/_version.py"

[tool.ruff]
# Never enforce `E501` (line length violations).
ignore = ["C901", "E501", "E741", "F402", "F823" ]
select = ["C", "E", "F", "I", "W"]
lint.ignore = ["C901", "E501", "E741", "F402", "F823" ]
lint.select = ["C", "E", "F", "I", "W"]
line-length = 119
extend-exclude = ["tsfm_public/_version.py"]

# Ignore import violations in all `__init__.py` files.
[tool.ruff.per-file-ignores]
[tool.ruff.lint.per-file-ignores]
"__init__.py" = ["E402", "F401", "F403", "F811"]

[tool.ruff.isort]
[tool.ruff.lint.isort]
lines-after-imports = 2
known-first-party = ["tsfm_public"]

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
#
# This code is based on the test code for PatchTSMixer in the HuggingFace Transformers Library:
# https://github.com/huggingface/transformers/blob/main/tests/models/patchtsmixer/test_modeling_patchtsmixer.py
""" Testing suite for the PyTorch TinyTimeMixer model. """
"""Testing suite for the PyTorch TinyTimeMixer model."""

# Standard
import itertools
Expand Down
1 change: 0 additions & 1 deletion tests/toolkit/test_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@

"""Tests basic dataset functions"""


from datetime import datetime, timedelta

import numpy as np
Expand Down
1 change: 1 addition & 0 deletions tests/toolkit/test_time_series_forecasting_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
#

"""Tests the time series preprocessor and functions"""

import pandas as pd
from transformers import PatchTSTForPrediction

Expand Down
8 changes: 8 additions & 0 deletions tests/toolkit/test_time_series_preprocessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -429,6 +429,14 @@ def test_get_datasets_with_frequency_token(ts_data):
assert train[0]["freq_token"] == DEFAULT_FREQUENCY_MAPPING["d"]


def test_get_frequency_token():
    """Check that different spellings of an hourly frequency yield the hourly token."""
    preprocessor = TimeSeriesPreprocessor(timestamp_column="date")
    hourly_token = DEFAULT_FREQUENCY_MAPPING["h"]

    # In order: a frequency string with a multiplier, the bare alias, and a
    # timedelta-style string.
    for spelling in ("1h", "h", "0 days 01:00:00"):
        assert preprocessor.get_frequency_token(spelling) == hourly_token


def test_id_columns_and_scaling_id_columns(ts_data_runs):
df = ts_data_runs

Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# Copyright contributors to the TSFM project
#
""" TinyTimeMixer model configuration"""
"""TinyTimeMixer model configuration"""

from typing import Optional, Union

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
#
# This code is based on layers and components from the PatchTSMixer model in the HuggingFace Transformers
# Library: https://github.com/huggingface/transformers/blob/main/src/transformers/models/patchtsmixer/modeling_patchtsmixer.py
""" PyTorch TinyTimeMixer model."""
"""PyTorch TinyTimeMixer model."""

# Standard
import copy
Expand Down
1 change: 1 addition & 0 deletions tsfm_public/models/tinytimemixer/utils/ttm_utils.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""Utilities for TTM notebooks"""

# Standard
import argparse
import os
Expand Down
60 changes: 19 additions & 41 deletions tsfm_public/toolkit/time_series_preprocessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
import pandas as pd
from datasets import Dataset
from deprecated import deprecated
from pandas.tseries.frequencies import to_offset
from sklearn.preprocessing import MinMaxScaler as MinMaxScaler_
from sklearn.preprocessing import OrdinalEncoder as OrdinalEncoder_
from sklearn.preprocessing import StandardScaler as StandardScaler_
Expand Down Expand Up @@ -317,44 +318,6 @@ def from_dict(cls, feature_extractor_dict: Dict[str, Any], **kwargs) -> "PreTrai

return super().from_dict(feature_extractor_dict, **kwargs)

# def _prepare_single_time_series(self, name, d):
# """
# Segment and prepare the time series based on the configuration arguments.

# name: name for the time series, for example as a result of a grouping operation
# d: the data for a single time series
# """
# for s_begin in range(d.shape[0] - self.context_length + 1):
# s_end = s_begin + self.context_length
# seq_x = d[self.input_columns].iloc[s_begin:s_end].values

# if self.time_series_task == TimeSeriesTask.FORECASTING:
# seq_y = (
# d[self.output_columns]
# .iloc[s_end : s_end + self.prediction_length]
# .values
# )
# else:
# seq_y = None
# # to do: add handling of other types

# if self.timestamp_column:
# ts = d[self.timestamp_column].iloc[s_end - 1]
# else:
# ts = None

# if self.id_columns:
# ids = d[self.id_columns].iloc[s_end - 1].values
# else:
# ids = None

# yield {
# "timestamp_column": ts,
# "id_columns": ids,
# "past_values": seq_x,
# "future_values": seq_y,
# }

@classmethod
def _get_scaler_class(cls, scaler_type):
if scaler_type == ScalerType.MINMAX.value:
Expand Down Expand Up @@ -483,10 +446,25 @@ def _train_categorical_encoder(self, df: pd.DataFrame):

def get_frequency_token(self, token_name: str):
    """Look up the token associated with a frequency identifier.

    The lookup is attempted in three stages:

    1. ``token_name`` is used directly as a key of ``self.frequency_mapping``.
    2. ``token_name`` is parsed as a pandas frequency string (e.g. ``"1h"``)
       and the resulting offset's ``freqstr`` is used as the key.
    3. If it is not a valid frequency string, ``token_name`` is parsed as a
       timedelta (e.g. ``"0 days 01:00:00"``), converted to an offset, and
       that offset's ``freqstr`` is used as the key.

    Args:
        token_name: Frequency identifier to resolve.

    Returns:
        The matching entry of ``self.frequency_mapping``. When no stage
        produces a match, a warning is issued and the ``"oov"`` entry of the
        mapping is returned.
    """
    token = self.frequency_mapping.get(token_name, None)
    if token is not None:
        return token

    # Normalise the name to a pandas offset: first as a frequency string and,
    # failing that, as a timedelta. Inputs that cannot be parsed either way
    # must end up at the "oov" fallback below rather than raising.
    try:
        offset = to_offset(token_name)
    except ValueError:
        try:
            offset = to_offset(pd.Timedelta(token_name))
        except ValueError:
            offset = None

    # to_offset(None) returns None, so guard before reading .freqstr.
    if offset is not None:
        token = self.frequency_mapping.get(offset.freqstr, None)
        if token is not None:
            return token

    warn(f"Frequency token {token_name} was not found in the frequncy token mapping.")
    return self.frequency_mapping["oov"]

Expand Down
Loading