Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Get dataset fixes #24

Merged
merged 23 commits into from
Apr 4, 2024
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion tests/toolkit/test_time_series_preprocessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -197,7 +197,9 @@ def test_get_datasets(ts_data):
)

# new train length should be 20% of 100, minus the usual for context length and prediction length
fewshot_train_size = int(100 * 0.2) - (tsp.context_length + tsp.prediction_length) + 1
fewshot_train_size = (
int((100 - tsp.context_length) * 0.2) + tsp.context_length - (tsp.context_length + tsp.prediction_length) + 1
)
assert len(train) == fewshot_train_size

assert len(valid) == len(test)
Expand Down
1 change: 1 addition & 0 deletions tsfm_public/toolkit/time_series_preprocessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -659,6 +659,7 @@ def get_datasets(
id_columns=self.id_columns,
fraction=fewshot_fraction,
location=fewshot_location,
minimum_size=self.context_length,
)

params = column_specifiers
Expand Down
15 changes: 10 additions & 5 deletions tsfm_public/toolkit/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,7 @@ def select_by_fixed_fraction(
id_columns: Optional[List[str]] = None,
fraction: float = 1.0,
location: str = FractionLocation.FIRST.value,
minimum_size: Optional[int] = 0,
) -> pd.DataFrame:
"""Select a portion of a dataset based on a fraction of the data.
Fraction can either be located at the start (location = FractionLocation.FIRST) or at the end (location = FractionLocation.LAST)
Expand All @@ -167,9 +168,10 @@ def select_by_fixed_fraction(
id_columns (List[str], optional): Columns which specify the IDs in the dataset. Defaults to None.
fraction (float): The fraction to select.
location (str): Location of where to select the fraction. Defaults to FractionLocation.FIRST.value.
minimum_size (int, optional): Minimum size of the split. Defaults to 0.

Raises:
ValueError: Raised when the
ValueError: Raised when the fraction is not within the range [0,1].

Returns:
pd.DataFrame: Subset of the dataframe.
Expand All @@ -180,9 +182,7 @@ def select_by_fixed_fraction(

if not id_columns:
return _split_group_by_fixed_fraction(
df,
fraction=fraction,
location=location,
df, fraction=fraction, location=location, minimum_size=minimum_size
).copy()

groups = df.groupby(_get_groupby_columns(id_columns))
Expand All @@ -194,6 +194,7 @@ def select_by_fixed_fraction(
name=name,
fraction=fraction,
location=location,
minimum_size=minimum_size,
)
)

Expand All @@ -216,6 +217,7 @@ def _split_group_by_index(
start_index: Optional[int] = None,
end_index: Optional[int] = None,
) -> pd.DataFrame:
"""Helper function for splitting by index."""
if start_index and (start_index >= len(group_df)):
msg = "Selection would result in an empty time series, please check start_index and time series length"
msg = msg + f" (id = {name})" if name else msg
Expand All @@ -239,6 +241,7 @@ def _split_group_by_fraction(
start_offset: Optional[int] = 0,
end_fraction: Optional[float] = None,
) -> pd.DataFrame:
"""Helper function for splitting by relative fraction."""
length = len(group_df)

if start_fraction is not None:
Expand All @@ -265,9 +268,11 @@ def _split_group_by_fixed_fraction(
name: Optional[str] = None,
fraction: float = 1.0,
location: Optional[str] = None,
minimum_size: Optional[int] = 0,
):
"""Helper function for splitting by fixed fraction."""
l = len(group_df)
fraction_size = int(fraction * l)
fraction_size = int(fraction * (l - minimum_size)) + minimum_size

if location == FractionLocation.FIRST.value:
start_index = 0
Expand Down
Loading