diff --git a/kdp/processor.py b/kdp/processor.py index 0b1ee60..e1bfbe8 100644 --- a/kdp/processor.py +++ b/kdp/processor.py @@ -53,13 +53,13 @@ def __init__( self.features_stats = features_stats or {} self.numeric_features = numeric_features or [] self.categorical_features = categorical_features or [] - self.text_features = text_features or [] + self.text_features = text_features or [k for k, v in features_specs.items() if v == FeatureType.TEXT] or [] self.text_features_config = text_features_config or { "max_tokens": 10_000, "output_mode": TextVectorizerOutputOptions.INT, "output_sequence_length": 50, } - self.features_specs = features_specs or {} + self.features_specs = {k: v for k, v in features_specs.items() if v != FeatureType.TEXT} or {} self.category_encoding_option = category_encoding_option self.features_stats_path = features_stats_path or "features_stats.json" self.feature_crosses = feature_crosses or [] @@ -301,7 +301,7 @@ def _add_pipeline_text(self, feature_name: str, input_layer) -> None: # checking if we have custom setting per feature _feature_config = self.text_features_config.get(feature_name) or self.text_features_config # getting stop words for text preprocessing - _stop_words = _feature_config.pop("stop_words") + _stop_words = _feature_config.get("stop_words") if _stop_words: preprocessor.add_processing_step( diff --git a/kdp/stats.py b/kdp/stats.py index c50e312..73aedea 100644 --- a/kdp/stats.py +++ b/kdp/stats.py @@ -14,6 +14,7 @@ class FeatureType(Enum): FLOAT = "float" INTEGER_CATEGORICAL = "integer_categorical" STRING_CATEGORICAL = "string_categorical" + TEXT = "text" class WelfordAccumulator: