Skip to content

Commit

Permalink
feat(KDP): adding text processing option to the preprocessor v0
Browse files (browse the repository at this point in the history)
  • Loading branch information
piotrlaczkowski committed Mar 22, 2024
1 parent c0724c4 commit 9e8dc03
Show file tree
Hide file tree
Showing 2 changed files with 4 additions and 3 deletions.
6 changes: 3 additions & 3 deletions kdp/processor.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -53,13 +53,13 @@ def __init__(
self.features_stats = features_stats or {}
self.numeric_features = numeric_features or []
self.categorical_features = categorical_features or []
self.text_features = text_features or []
self.text_features = text_features or [k for k, v in features_specs.items() if v == FeatureType.TEXT] or []
self.text_features_config = text_features_config or {
"max_tokens": 10_000,
"output_mode": TextVectorizerOutputOptions.INT,
"output_sequence_length": 50,
}
self.features_specs = features_specs or {}
self.features_specs = {k: v for k, v in features_specs.items() if v != FeatureType.TEXT} or {}
self.category_encoding_option = category_encoding_option
self.features_stats_path = features_stats_path or "features_stats.json"
self.feature_crosses = feature_crosses or []
Expand Down Expand Up @@ -301,7 +301,7 @@ def _add_pipeline_text(self, feature_name: str, input_layer) -> None:
# checking if we have custom setting per feature
_feature_config = self.text_features_config.get(feature_name) or self.text_features_config
# getting stop words for text preprocessing
_stop_words = _feature_config.pop("stop_words")
_stop_words = _feature_config.get("stop_words")

if _stop_words:
preprocessor.add_processing_step(
Expand Down
1 change: 1 addition & 0 deletions kdp/stats.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -14,6 +14,7 @@ class FeatureType(Enum):
FLOAT = "float"
INTEGER_CATEGORICAL = "integer_categorical"
STRING_CATEGORICAL = "string_categorical"
TEXT = "text"


class WelfordAccumulator:
Expand Down

0 comments on commit 9e8dc03

Please sign in to comment.