From 9e8dc03f13f12708958a9b6f720bb6e4b8682668 Mon Sep 17 00:00:00 2001 From: piotrlaczkowski Date: Fri, 22 Mar 2024 18:29:21 +0100 Subject: [PATCH] feat(KDP): adding text processing option to the preprocessor v0 --- kdp/processor.py | 6 +++--- kdp/stats.py | 1 + 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/kdp/processor.py b/kdp/processor.py index 0b1ee60..e1bfbe8 100644 --- a/kdp/processor.py +++ b/kdp/processor.py @@ -53,13 +53,13 @@ def __init__( self.features_stats = features_stats or {} self.numeric_features = numeric_features or [] self.categorical_features = categorical_features or [] - self.text_features = text_features or [] + self.text_features = text_features or [k for k, v in features_specs.items() if v == FeatureType.TEXT] or [] self.text_features_config = text_features_config or { "max_tokens": 10_000, "output_mode": TextVectorizerOutputOptions.INT, "output_sequence_length": 50, } - self.features_specs = features_specs or {} + self.features_specs = {k: v for k, v in features_specs.items() if v != FeatureType.TEXT} or {} self.category_encoding_option = category_encoding_option self.features_stats_path = features_stats_path or "features_stats.json" self.feature_crosses = feature_crosses or [] @@ -301,7 +301,7 @@ def _add_pipeline_text(self, feature_name: str, input_layer) -> None: # checking if we have custom setting per feature _feature_config = self.text_features_config.get(feature_name) or self.text_features_config # getting stop words for text preprocessing - _stop_words = _feature_config.pop("stop_words") + _stop_words = _feature_config.get("stop_words") if _stop_words: preprocessor.add_processing_step( diff --git a/kdp/stats.py b/kdp/stats.py index c50e312..73aedea 100644 --- a/kdp/stats.py +++ b/kdp/stats.py @@ -14,6 +14,7 @@ class FeatureType(Enum): FLOAT = "float" INTEGER_CATEGORICAL = "integer_categorical" STRING_CATEGORICAL = "string_categorical" + TEXT = "text" class WelfordAccumulator: