From fc09e1ad571709d8becea1ee5bc2cc77f94a9b3f Mon Sep 17 00:00:00 2001 From: "Wesley M. Gifford" Date: Tue, 23 Apr 2024 13:51:02 -0400 Subject: [PATCH] clean up output Signed-off-by: Wesley M. Gifford --- .../toolkit/time_series_preprocessor.py | 28 +++++++++++++++++-- 1 file changed, 25 insertions(+), 3 deletions(-) diff --git a/tsfm_public/toolkit/time_series_preprocessor.py b/tsfm_public/toolkit/time_series_preprocessor.py index 68bfa5db..4de30c24 100644 --- a/tsfm_public/toolkit/time_series_preprocessor.py +++ b/tsfm_public/toolkit/time_series_preprocessor.py @@ -344,11 +344,29 @@ def _standardize_dataframe( return df + def _clean_up_dataframe(self, df: pd.DataFrame) -> None: + """Removes columns added during internal processing of the provided dataframe. + + Currently, the following cleanup steps are done: + - Remove INTERNAL_ID_COLUMN if present and no ID columns are configured + + Args: + df (pd.DataFrame): Input pandas dataframe + + Returns: + None: The dataframe is modified in place + """ + + if not self.id_columns: + if INTERNAL_ID_COLUMN in df.columns: + df.drop(columns=INTERNAL_ID_COLUMN, inplace=True) + def _get_groups( self, dataset: pd.DataFrame, ) -> Generator[Tuple[Any, pd.DataFrame], None, None]: - """Get groups of the time series dataset (multi-time series) based on the ID columns. + """Get groups of the time series dataset (multi-time series) based on the ID columns for scaling. + Note that this is used for scaling purposes only. 
Args: dataset (pd.DataFrame): Input dataset @@ -472,7 +490,7 @@ def _check_dataset(self, dataset: Union[Dataset, pd.DataFrame]): def _set_targets(self, dataset: pd.DataFrame) -> None: if self.target_columns == []: - skip_columns = copy.copy(self.id_columns) + skip_columns = copy.copy(self.id_columns) + [INTERNAL_ID_COLUMN] if self.timestamp_column: skip_columns.append(self.timestamp_column) @@ -531,6 +549,7 @@ def train( if self.encode_categorical: self._train_categorical_encoder(df) + self._clean_up_dataframe(df) return self def inverse_scale_targets( @@ -581,10 +600,12 @@ def inverse_scale_func(grp, id_columns): else: id_columns = INTERNAL_ID_COLUMN - return df.groupby(id_columns, group_keys=False).apply( + df_inv = df.groupby(id_columns, group_keys=False).apply( inverse_scale_func, id_columns=id_columns, ) + self._clean_up_dataframe(df_inv) + return df_inv def preprocess( self, @@ -640,6 +661,7 @@ def scale_func(grp, id_columns): raise RuntimeError("Attempt to encode categorical columns, but the encoder has not been trained yet.") df[cols_to_encode] = self.categorical_encoder.transform(df[cols_to_encode]) + self._clean_up_dataframe(df) return df def get_datasets(