From 102cad584da9c4d11d75257856061e2e4a298436 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ru=20Ke=C3=AFn?= <3181182+alphasentaurii@users.noreply.github.com> Date: Wed, 3 Apr 2024 20:22:55 -0400 Subject: [PATCH] Encoder handle unknown (#53) * only create one encoding per unknown variable * set default loglevel to INFO --- CHANGES.rst | 5 +++++ spacekit/logger/log.py | 2 +- spacekit/preprocessor/encode.py | 22 ++++++++++++---------- 3 files changed, 18 insertions(+), 11 deletions(-) diff --git a/CHANGES.rst b/CHANGES.rst index 4137e99..bfec3c4 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -10,6 +10,11 @@ new features - `extractor.scrape.FitsScraper.scrape_dataframe` method added for scraping Fits data from dataframe [#52] +bug fixes +--------- + +- `preprocessor.encode.PairEncoder.handle_unknowns` create single new encoding value per unidentified variable [#53] + 1.0.1 (2024-04-03) ================== diff --git a/spacekit/logger/log.py b/spacekit/logger/log.py index faa7cba..ccbcada 100644 --- a/spacekit/logger/log.py +++ b/spacekit/logger/log.py @@ -68,7 +68,7 @@ def __init__( self.short_name = script_name self.console = console self.logfile = logfile - self.log_level = logging.DEBUG + self.log_level = logging.INFO self.console_log_output = console_log_output.lower() self.console_log_level = console_log_level.upper() self.console_log_color = console_log_color diff --git a/spacekit/preprocessor/encode.py b/spacekit/preprocessor/encode.py index 83c9da8..9765a1d 100644 --- a/spacekit/preprocessor/encode.py +++ b/spacekit/preprocessor/encode.py @@ -108,12 +108,13 @@ def inverse_pairs(self): return self.invpairs def handle_unknowns(self, unknowns): - self.log.warning(f"Found unknown values:\n {self.arr[unknowns]}") - add_encoding = max(list(self.keypairs.values())) + 1 + uvals = np.unique(self.arr[unknowns]) + self.log.warning(f"Found unknown values:\n {uvals}") try: - self.keypairs[self.arr[unknowns][0]] = add_encoding - self.classes_ = list(self.keypairs.keys()) - self.log.info("Successfully added encoding for unknown values.") + for u in uvals: + add_encoding = max(list(self.keypairs.values())) + 1 + self.keypairs[u] = add_encoding + self.classes_ = list(self.keypairs.keys()) except Exception as e: self.log.error("Unable to add encoding for unknown value(s)", e) @@ -236,16 +237,17 @@ def _encode_features(self): "encoding_pairs attr must be instantiated with key-value pairs" ) return - self.log.info("Encoding categorical features...") + self.log.debug("Encoding categorical features...") for col, name in self.encodings.items(): keypairs = self.encoding_pairs[col] enc = PairEncoder() enc.fit_transform(self.df, keypairs, axiscol=col) self.df[name] = enc.transformed - self.log.debug(f"*** {col} --> {name} ***") - self.log.debug( - f"\n\nORIGINAL:\n{self.df[col].value_counts()}\n\nENCODED:\n{self.df[name].value_counts()}\n" - ) + if self.verbose: + self.log.debug(f"*** {col} --> {name} ***") + self.log.debug( + f"\n\nORIGINAL:\n{self.df[col].value_counts()}\n\nENCODED:\n{self.df[name].value_counts()}\n" + ) self.rejoin_original() return self.df