Skip to content

Commit

Permalink
Encoder handle unknown (#53)
Browse files: browse the repository at this point in the history
* only create one encoding per unknown variable

* set default loglevel to INFO
  • Loading branch information
alphasentaurii authored Apr 4, 2024
1 parent 10c8f12 commit 102cad5
Show file tree
Hide file tree
Showing 3 changed files with 18 additions and 11 deletions.
5 changes: 5 additions & 0 deletions CHANGES.rst
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,11 @@ new features

- `extractor.scrape.FitsScraper.scrape_dataframe` method added for scraping FITS data from a dataframe [#52]

bug fixes
---------

- `preprocessor.encode.PairEncoder.handle_unknowns` now creates a single new encoding value for each unique unknown value [#53]


1.0.1 (2024-04-03)
==================
Expand Down
2 changes: 1 addition & 1 deletion spacekit/logger/log.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ def __init__(
self.short_name = script_name
self.console = console
self.logfile = logfile
self.log_level = logging.DEBUG
self.log_level = logging.INFO
self.console_log_output = console_log_output.lower()
self.console_log_level = console_log_level.upper()
self.console_log_color = console_log_color
Expand Down
22 changes: 12 additions & 10 deletions spacekit/preprocessor/encode.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,12 +108,13 @@ def inverse_pairs(self):
return self.invpairs

def handle_unknowns(self, unknowns):
self.log.warning(f"Found unknown values:\n {self.arr[unknowns]}")
add_encoding = max(list(self.keypairs.values())) + 1
uvals = np.unique(self.arr[unknowns])
self.log.warning(f"Found unknown values:\n {uvals}")
try:
self.keypairs[self.arr[unknowns][0]] = add_encoding
self.classes_ = list(self.keypairs.keys())
self.log.info("Successfully added encoding for unknown values.")
for u in uvals:
add_encoding = max(list(self.keypairs.values())) + 1
self.keypairs[u] = add_encoding
self.classes_ = list(self.keypairs.keys())
except Exception as e:
self.log.error("Unable to add encoding for unknown value(s)", e)

Expand Down Expand Up @@ -236,16 +237,17 @@ def _encode_features(self):
"encoding_pairs attr must be instantiated with key-value pairs"
)
return
self.log.info("Encoding categorical features...")
self.log.debug("Encoding categorical features...")
for col, name in self.encodings.items():
keypairs = self.encoding_pairs[col]
enc = PairEncoder()
enc.fit_transform(self.df, keypairs, axiscol=col)
self.df[name] = enc.transformed
self.log.debug(f"*** {col} --> {name} ***")
self.log.debug(
f"\n\nORIGINAL:\n{self.df[col].value_counts()}\n\nENCODED:\n{self.df[name].value_counts()}\n"
)
if self.verbose:
self.log.debug(f"*** {col} --> {name} ***")
self.log.debug(
f"\n\nORIGINAL:\n{self.df[col].value_counts()}\n\nENCODED:\n{self.df[name].value_counts()}\n"
)
self.rejoin_original()
return self.df

Expand Down

0 comments on commit 102cad5

Please sign in to comment.