From 9e8dc03f13f12708958a9b6f720bb6e4b8682668 Mon Sep 17 00:00:00 2001
From: piotrlaczkowski <piotr.laczkowski@gmail.com>
Date: Fri, 22 Mar 2024 18:29:21 +0100
Subject: [PATCH] feat(KDP): adding text processing option to the preprocessor
 v0

---
 kdp/processor.py | 6 +++---
 kdp/stats.py     | 1 +
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/kdp/processor.py b/kdp/processor.py
index 0b1ee60..e1bfbe8 100644
--- a/kdp/processor.py
+++ b/kdp/processor.py
@@ -53,13 +53,13 @@ def __init__(
         self.features_stats = features_stats or {}
         self.numeric_features = numeric_features or []
         self.categorical_features = categorical_features or []
-        self.text_features = text_features or []
+        self.text_features = text_features or [k for k, v in features_specs.items() if v == FeatureType.TEXT] or []
         self.text_features_config = text_features_config or {
             "max_tokens": 10_000,
             "output_mode": TextVectorizerOutputOptions.INT,
             "output_sequence_length": 50,
         }
-        self.features_specs = features_specs or {}
+        self.features_specs = {k: v for k, v in features_specs.items() if v != FeatureType.TEXT} or {}
         self.category_encoding_option = category_encoding_option
         self.features_stats_path = features_stats_path or "features_stats.json"
         self.feature_crosses = feature_crosses or []
@@ -301,7 +301,7 @@ def _add_pipeline_text(self, feature_name: str, input_layer) -> None:
         # checking if we have custom setting per feature
         _feature_config = self.text_features_config.get(feature_name) or self.text_features_config
         # getting stop words for text preprocessing
-        _stop_words = _feature_config.pop("stop_words")
+        _stop_words = _feature_config.get("stop_words")
 
         if _stop_words:
             preprocessor.add_processing_step(
diff --git a/kdp/stats.py b/kdp/stats.py
index c50e312..73aedea 100644
--- a/kdp/stats.py
+++ b/kdp/stats.py
@@ -14,6 +14,7 @@ class FeatureType(Enum):
     FLOAT = "float"
     INTEGER_CATEGORICAL = "integer_categorical"
     STRING_CATEGORICAL = "string_categorical"
+    TEXT = "text"
 
 
 class WelfordAccumulator: