
Create python-package.yml #18

Closed
wants to merge 4 commits
39 changes: 39 additions & 0 deletions .github/workflows/python-package.yml
@@ -0,0 +1,39 @@
# This workflow will install Python dependencies, run tests and lint with a variety of Python versions
# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python

name: Python package

on:
  push:
    branches: [ "main" ]
  pull_request:
    branches: [ "main" ]

jobs:
  build:

    runs-on: ubuntu-latest
    strategy:
      fail-fast: false
      matrix:
        python-version: ["3.9", "3.10"]

    steps:
    - uses: actions/checkout@v3
    - name: Set up Python ${{ matrix.python-version }}
      uses: actions/setup-python@v3
      with:
        python-version: ${{ matrix.python-version }}
    - name: Install dependencies
      run: |
        python -m pip install --upgrade pip
        # python -m pip install flake8 pytest
        # if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
        pip install ".[dev]"
    - name: Ruff quality checks
      run: |
        # use hf-style check
        make quality
    - name: Test with pytest
      run: |
        pytest
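Note that the `pip install ".[dev]"` step only succeeds if the project's packaging metadata declares a `dev` extra that pulls in the tools the later steps call (`ruff` via `make quality`, and `pytest`). A minimal sketch of what that metadata might look like; the distribution name and dependency list below are assumptions for illustration, not the repository's actual setup.py:

# setup.py (hypothetical sketch): declares the `dev` extra that the
# workflow's `pip install ".[dev]"` step relies on.
from setuptools import find_packages, setup

setup(
    name="tsfm_public",  # assumed distribution name
    version="0.0.1",
    packages=find_packages(),
    extras_require={
        # assumed contents; the real extra may pin versions or add more tools
        "dev": ["pytest", "ruff"],
    },
)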
10 changes: 5 additions & 5 deletions Makefile
@@ -1,18 +1,18 @@
 # Adapted from HF Transformers: https://github.com/huggingface/transformers/tree/main
 .PHONY: quality style
 
-check_dirs := tests src tsfm notebooks
+check_dirs := tests tsfm_public tsfmhfdemos notebooks
 
 
 # this target runs checks on all files
 
 quality:
-	ruff check $(check_dirs) setup.py
-	ruff format --check $(check_dirs) setup.py
+	ruff check $(check_dirs)
+	ruff format --check $(check_dirs)
 
 # this target runs checks on all files and potentially modifies some of them
 
 style:
-	ruff check $(check_dirs) setup.py --fix
-	ruff format $(check_dirs) setup.py
+	ruff check $(check_dirs) --fix
+	ruff format $(check_dirs)

10 changes: 4 additions & 6 deletions tests/toolkit/test_dataset.py
@@ -52,9 +52,7 @@ def test_ts_padding(ts_data):
 
     # test date handled
     # integer
-    assert df_padded.iloc[0]["time_int"] == df.iloc[0]["time_int"] - (
-        context_length - df.shape[0]
-    )
+    assert df_padded.iloc[0]["time_int"] == df.iloc[0]["time_int"] - (context_length - df.shape[0])
 
     # date
     df_padded = ts_padding(
@@ -64,9 +62,9 @@ def test_ts_padding(ts_data):
         context_length=context_length,
     )
 
-    assert df_padded.iloc[0]["time_date"] == df.iloc[0]["time_date"] - (
-        context_length - df.shape[0]
-    ) * timedelta(days=1)
+    assert df_padded.iloc[0]["time_date"] == df.iloc[0]["time_date"] - (context_length - df.shape[0]) * timedelta(
+        days=1
+    )
 
 
 def test_pretrain_df_dataset(ts_data):
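The arithmetic these assertions check is easy to verify by hand: `ts_padding` prepends `context_length - len(df)` rows, so the first padded timestamp sits that many periods before the original first value. A tiny worked example with made-up numbers (not the test fixtures):

# If df has 3 rows with time_int = [5, 6, 7] and context_length = 5,
# two rows are prepended, so the padded series should start at 5 - 2 = 3.
df_rows = [5, 6, 7]
context_length = 5
fill_length = context_length - len(df_rows)  # 2 rows of padding
first_padded = df_rows[0] - fill_length      # 3
assert first_padded == 3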
40 changes: 10 additions & 30 deletions tsfm_public/toolkit/dataset.py
@@ -47,22 +47,14 @@ def __init__(
             y_cols = [y_cols]
 
         if len(x_cols) > 0:
-            assert is_cols_in_df(
-                data_df, x_cols
-            ), f"one or more {x_cols} is not in the list of data_df columns"
+            assert is_cols_in_df(data_df, x_cols), f"one or more {x_cols} is not in the list of data_df columns"
 
         if len(y_cols) > 0:
-            assert is_cols_in_df(
-                data_df, y_cols
-            ), f"one or more {y_cols} is not in the list of data_df columns"
+            assert is_cols_in_df(data_df, y_cols), f"one or more {y_cols} is not in the list of data_df columns"
 
         if datetime_col:
-            assert datetime_col in list(
-                data_df.columns
-            ), f"{datetime_col} is not in the list of data_df columns"
-            assert (
-                datetime_col not in x_cols
-            ), f"{datetime_col} should not be in the list of x_cols"
+            assert datetime_col in list(data_df.columns), f"{datetime_col} is not in the list of data_df columns"
+            assert datetime_col not in x_cols, f"{datetime_col} should not be in the list of x_cols"
 
         self.data_df = data_df
         self.datetime_col = datetime_col
@@ -160,9 +152,7 @@ def __init__(
         cls=BaseDFDataset,
     ):
         if len(id_columns) > 0:
-            assert is_cols_in_df(
-                data_df, id_columns
-            ), f"{id_columns} is not in the data_df columns"
+            assert is_cols_in_df(data_df, id_columns), f"{id_columns} is not in the data_df columns"
 
         self.datetime_col = datetime_col
         self.id_columns = id_columns
@@ -398,9 +388,7 @@ def __getitem__(self, time_id):
         # seq_x: batch_size x seq_len x num_x_cols
         seq_x = self.X[time_id : time_id + self.seq_len].values
         # seq_y: batch_size x pred_len x num_x_cols
-        seq_y = self.y[
-            time_id + self.seq_len : time_id + self.seq_len + self.pred_len
-        ].values
+        seq_y = self.y[time_id + self.seq_len : time_id + self.seq_len + self.pred_len].values
 
         ret = {
             "past_values": np_to_torch(seq_x),
@@ -490,9 +478,7 @@ def __init__(
     def __getitem__(self, time_id):
         # seq_x: batch_size x seq_len x num_x_cols
         seq_x = self.X[time_id : time_id + self.seq_len].values
-        seq_y = self.y[
-            time_id + self.seq_len - 1 : time_id + self.seq_len
-        ].values.ravel()
+        seq_y = self.y[time_id + self.seq_len - 1 : time_id + self.seq_len].values.ravel()
         # return _torch(seq_x, seq_y)
 
         ret = {
@@ -582,16 +568,12 @@ def ts_padding(
     if df[timestamp_column].dtype in ["<M8[ns]", "datetime64", "int"]:
         last_timestamp = df.iloc[0][timestamp_column]
         period = df.iloc[1][timestamp_column] - df.iloc[0][timestamp_column]
-        prepended_timestamps = [
-            last_timestamp + offset * period for offset in range(-fill_length, 0)
-        ]
+        prepended_timestamps = [last_timestamp + offset * period for offset in range(-fill_length, 0)]
         pad_df[timestamp_column] = prepended_timestamps
     else:
         pad_df[timestamp_column] = None
     # Ensure same type
-    pad_df[timestamp_column] = pad_df[timestamp_column].astype(
-        df[timestamp_column].dtype
-    )
+    pad_df[timestamp_column] = pad_df[timestamp_column].astype(df[timestamp_column].dtype)
 
     if id_columns:
         id_values = df.iloc[0][id_columns].to_list()
@@ -632,6 +614,4 @@ def is_cols_in_df(df: pd.DataFrame, cols: List[str]) -> bool:
     d6 = PretrainDFDataset(data_df=df, x_cols=["A", "B"], group_ids=["g1"], seq_len=2)
     print(f"d6: {d6}")
 
-    d7 = ForecastDFDataset(
-        data_df=df, x_cols=["A", "B"], group_ids=["g1"], seq_len=2, pred_len=2
-    )
+    d7 = ForecastDFDataset(data_df=df, x_cols=["A", "B"], group_ids=["g1"], seq_len=2, pred_len=2)
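The two `__getitem__` hunks above implement standard sliding-window slicing: the forecasting dataset takes `seq_len` past rows as input and the following `pred_len` rows as target, while the other dataset takes only the last row of the window as target. A self-contained sketch of the forecasting case with made-up data (the variable names are illustrative, not the module's API):

import pandas as pd

# toy series with two columns
X = pd.DataFrame({"A": range(10), "B": range(10, 20)})
seq_len, pred_len, time_id = 3, 2, 0

seq_x = X[time_id : time_id + seq_len].values                       # past window, shape (3, 2)
seq_y = X[time_id + seq_len : time_id + seq_len + pred_len].values  # future window, shape (2, 2)
assert seq_x.shape == (3, 2) and seq_y.shape == (2, 2)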
14 changes: 3 additions & 11 deletions tsfm_public/toolkit/time_series_forecasting_pipeline.py
@@ -63,7 +63,7 @@ def _sanitize_parameters(self, **kwargs):
 
         return preprocess_kwargs, {}, postprocess_kwargs
 
-    def __call__(self, time_series: Union["pandas.DataFrame", str], **kwargs):
+    def __call__(self, time_series: Union["pd.DataFrame", str], **kwargs):
         """Main method of the forecasting pipeline. Takes the input time series data (in tabular format) and
         produces predictions.
 
@@ -146,11 +146,7 @@ def _forward(self, model_inputs, **kwargs):
 
         # copy the other inputs
         copy_inputs = True
-        for k in [
-            akey
-            for akey in model_inputs.keys()
-            if (akey not in model_input_keys) or copy_inputs
-        ]:
+        for k in [akey for akey in model_inputs.keys() if (akey not in model_input_keys) or copy_inputs]:
             model_outputs[k] = model_inputs[k]
 
         return model_outputs
@@ -162,11 +158,7 @@ def postprocess(self, input, **kwargs):
         """
         out = {}
 
-        model_output_key = (
-            "prediction_outputs"
-            if "prediction_outputs" in input.keys()
-            else "prediction_logits"
-        )
+        model_output_key = "prediction_outputs" if "prediction_outputs" in input.keys() else "prediction_logits"
 
         for i, c in enumerate(kwargs["output_columns"]):
             out[f"{c}_prediction"] = input[model_output_key][:, :, i].numpy().tolist()
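The postprocess hunk reads naturally as a small standalone function: pick `prediction_outputs` when the model returns it, fall back to `prediction_logits`, and emit one `<column>_prediction` list per output column. A sketch under assumed tensor shapes (batch, pred_len, n_cols), using torch only to fake a model output:

import torch

def postprocess_sketch(model_output: dict, output_columns: list) -> dict:
    # mirrors the key-selection logic in the hunk above
    key = "prediction_outputs" if "prediction_outputs" in model_output else "prediction_logits"
    return {
        f"{c}_prediction": model_output[key][:, :, i].numpy().tolist()
        for i, c in enumerate(output_columns)
    }

out = postprocess_sketch({"prediction_outputs": torch.zeros(1, 4, 2)}, ["A", "B"])
# out["A_prediction"] is a 1 x 4 nested list of floats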
47 changes: 14 additions & 33 deletions tsfm_public/toolkit/time_series_preprocessor.py
@@ -11,7 +11,10 @@
 import pandas as pd
 from datasets import Dataset
 from sklearn.preprocessing import StandardScaler
-from transformers.feature_extraction_utils import FeatureExtractionMixin
+from transformers.feature_extraction_utils import (
+    FeatureExtractionMixin,
+    PreTrainedFeatureExtractor,
+)
 
 
 # Local
@@ -37,9 +40,7 @@ def to_dict(self) -> Dict[str, Any]:
         return output
 
     @classmethod
-    def from_dict(
-        cls, feature_extractor_dict: Dict[str, Any], **kwargs
-    ) -> "TimeSeriesScaler":
+    def from_dict(cls, feature_extractor_dict: Dict[str, Any], **kwargs) -> "TimeSeriesScaler":
         """
         Instantiates a TimeSeriesScaler from a Python dictionary of parameters.
 
@@ -59,18 +60,12 @@ def from_dict(
         init_param_names = ["copy", "with_mean", "with_std"]
 
         init_params = {}
-        for k, v in [
-            (k, v) for k, v in feature_extractor_dict.items() if k in init_param_names
-        ]:
+        for k, v in [(k, v) for k, v in feature_extractor_dict.items() if k in init_param_names]:
             init_params[k] = v
 
         t = TimeSeriesScaler(**init_params)
 
-        for k, v in [
-            (k, v)
-            for k, v in feature_extractor_dict.items()
-            if k not in init_param_names
-        ]:
+        for k, v in [(k, v) for k, v in feature_extractor_dict.items() if k not in init_param_names]:
             setattr(t, k, v)
 
         return t
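The deserialization pattern in this hunk is worth noting: constructor parameters go through `__init__`, and everything else (e.g. fitted state such as `mean_` and `scale_` from sklearn's StandardScaler) is restored afterwards with `setattr`. A condensed standalone illustration of that split, using a plain StandardScaler as a hypothetical stand-in for TimeSeriesScaler:

from sklearn.preprocessing import StandardScaler

def scaler_from_dict(d: dict) -> StandardScaler:
    # split serialized fields into constructor args and fitted attributes
    init_param_names = ["copy", "with_mean", "with_std"]
    t = StandardScaler(**{k: v for k, v in d.items() if k in init_param_names})
    for k, v in d.items():
        if k not in init_param_names:
            setattr(t, k, v)  # restore fitted state, e.g. mean_, scale_
    return t

t = scaler_from_dict({"with_mean": True, "mean_": [1.0], "scale_": [2.0]})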
@@ -104,9 +99,7 @@ def __init__(
         # note base class __init__ methods sets all arguments as attributes
 
         if not isinstance(id_columns, list):
-            raise ValueError(
-                f"Invalid argument provided for `id_columns`: {id_columns}"
-            )
+            raise ValueError(f"Invalid argument provided for `id_columns`: {id_columns}")
 
         self.timestamp_column = timestamp_column
         self.input_columns = input_columns
@@ -117,7 +110,7 @@ def __init__(
         self.scaling = scaling
         self.time_series_task = time_series_task
         self.scale_outputs = scale_outputs
-        self.scaler_dict = dict()
+        self.scaler_dict = {}
 
         kwargs["processor_class"] = self.__class__.__name__
 
@@ -138,9 +131,7 @@ def to_dict(self) -> Dict[str, Any]:
         return output
 
     @classmethod
-    def from_dict(
-        cls, feature_extractor_dict: Dict[str, Any], **kwargs
-    ) -> "PreTrainedFeatureExtractor":
+    def from_dict(cls, feature_extractor_dict: Dict[str, Any], **kwargs) -> PreTrainedFeatureExtractor:
         """
         Instantiates a type of [`~feature_extraction_utils.FeatureExtractionMixin`] from a Python dictionary of
         parameters.
@@ -178,11 +169,7 @@ def _prepare_single_time_series(self, name, d):
         seq_x = d[self.input_columns].iloc[s_begin:s_end].values
 
         if self.time_series_task == TimeSeriesTask.FORECASTING:
-            seq_y = (
-                d[self.output_columns]
-                .iloc[s_end : s_end + self.prediction_length]
-                .values
-            )
+            seq_y = d[self.output_columns].iloc[s_end : s_end + self.prediction_length].values
         else:
             seq_y = None
             # to do: add handling of other types
@@ -223,9 +210,7 @@ def _get_groups(
         dataset: pd.DataFrame,
     ):
         if self.id_columns:
-            group_by_columns = (
-                self.id_columns if len(self.id_columns) > 1 else self.id_columns[0]
-            )
+            group_by_columns = self.id_columns if len(self.id_columns) > 1 else self.id_columns[0]
         else:
             group_by_columns = INTERNAL_ID_COLUMN
 
@@ -245,9 +230,7 @@ def _get_columns_to_scale(
         """
         cols_to_scale = copy.copy(self.input_columns)
         if self.scale_outputs:
-            cols_to_scale.extend(
-                [c for c in self.output_columns if c not in self.input_columns]
-            )
+            cols_to_scale.extend([c for c in self.output_columns if c not in self.input_columns])
         return cols_to_scale
 
     def train(
def train(
Expand Down Expand Up @@ -312,9 +295,7 @@ def scale_func(grp, id_columns):

df = self._standardize_dataframe(dataset)
if self.id_columns:
id_columns = (
self.id_columns if len(self.id_columns) > 1 else self.id_columns[0]
)
id_columns = self.id_columns if len(self.id_columns) > 1 else self.id_columns[0]
else:
id_columns = INTERNAL_ID_COLUMN

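The `scale_func` hunk implies per-series normalization: one scaler per id-column group, keyed by group name and stored in `self.scaler_dict`. A minimal sketch of that behavior with toy data (assumed, since the full `train` body is collapsed in this diff):

import pandas as pd
from sklearn.preprocessing import StandardScaler

df = pd.DataFrame({"id": ["a", "a", "b", "b"], "val": [1.0, 2.0, 10.0, 20.0]})
scaler_dict = {}
for name, grp in df.groupby("id"):
    # each series gets its own fitted scaler, as in scaler_dict[name]
    scaler_dict[name] = StandardScaler().fit(grp[["val"]])
scaled_a = scaler_dict["a"].transform(df.loc[df["id"] == "a", ["val"]])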