Skip to content

Commit

Permalink
test(KDP): adding full processing tests for the pipeline (base featur…
Browse files Browse the repository at this point in the history
…es and crosses)
  • Loading branch information
piotrlaczkowski committed Apr 14, 2024
1 parent 05cfe2a commit b3fb9dd
Show file tree
Hide file tree
Showing 2 changed files with 40 additions and 5 deletions.
22 changes: 18 additions & 4 deletions kdp/processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,14 +83,28 @@ def __init__(
features_stats: dict[str, Any] = None,
path_data: str = None,
batch_size: int = 50_000,
feature_crosses: list[tuple[str, str]] = None,
feature_crosses: list[tuple[str, str, int]] = None,
features_stats_path: str = None,
output_mode: str = OutputModeOptions.CONCAT,
overwrite_stats: bool = False,
log_to_file: bool = False,
features_specs: dict[str, FeatureType | str] = None,
) -> None:
"""Initialize a preprocessing model."""
"""Initialize a preprocessing model.
Args:
features_stats (dict[str, Any]): A dictionary containing the statistics of the features.
path_data (str): The path to the data from which to estimate the statistics.
batch_size (int): The batch size for the data iteration for stats estimation.
feature_crosses (list[tuple[str, str, int]]):
A list of tuples containing the names of the features to be crossed,
and nr_bins to be used for hashing.
features_stats_path (str): The path where to save/load features statistics.
output_mode (str): The output mode of the model (concat | dict).
overwrite_stats (bool): A boolean indicating whether to overwrite the statistics.
log_to_file (bool): A boolean indicating whether to log to a file.
features_specs (dict[str, FeatureType | str]): A dictionary containing the features and their types.
"""
self.path_data = path_data
self.batch_size = batch_size or 50_000
self.features_stats = features_stats or {}
Expand Down Expand Up @@ -438,7 +452,7 @@ def _add_pipeline_cross(self) -> None:
feature_name = f"{feature_a}_x_{feature_b}"
preprocessor.add_processing_step(
layer_creator=PreprocessorLayerFactory.crossing_layer,
depth=nr_bins,
num_bins=nr_bins,
name=f"cross_{feature_name}",
)
# for concatenation we need the same format
Expand All @@ -448,7 +462,7 @@ def _add_pipeline_cross(self) -> None:
name=f"cast_to_float_{feature_name}",
)
crossed_input = [self.inputs[feature_a], self.inputs[feature_b]]
self.outputs[feature_name] = preprocessor.chain(input_data=crossed_input)
self.outputs[feature_name] = preprocessor.chain(input_layer=crossed_input)

def _prepare_outputs(self) -> None:
"""Preparing the outputs of the model.
Expand Down
23 changes: 22 additions & 1 deletion test/test_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -269,7 +269,7 @@ def tearDownClass(cls):
# Remove the temporary file after the test is done
cls.temp_dir.cleanup()

def test_build_preprocessor(self):
def test_build_preprocessor_base_features(self):
"""Test building the preprocessor model."""
ppr = PreprocessingModel(
path_data=self._path_data,
Expand All @@ -287,6 +287,27 @@ def test_build_preprocessor(self):
# checking if we have model as output
self.assertIsInstance(result["model"], tf.keras.Model)

def test_build_preprocessor_with_crosses(self):
    """Build the preprocessor with one feature cross and sanity-check the result.

    Crosses ``feat6`` with ``feat7`` using 5 hashing bins (statistics are
    overwritten for the run), then verifies that the built model reports a
    defined second output dimension and that the build result carries both
    the output dimensions and a Keras model.
    """
    # each cross is (feature_a, feature_b, nr_bins)
    crosses = [("feat6", "feat7", 5)]

    preprocessing_model = PreprocessingModel(
        path_data=self._path_data,
        features_specs=self.features_specs,
        features_stats_path=self.features_stats_path,
        feature_crosses=crosses,
        overwrite_stats=True,
    )
    build_result = preprocessing_model.build_preprocessor()
    output_width = preprocessing_model.model.output_shape[1]

    # the model must expose a defined output shape
    self.assertIsNotNone(output_width)
    self.assertIsNotNone(build_result["output_dims"])

    # the build must hand back an actual Keras model
    self.assertIsInstance(build_result["model"], tf.keras.Model)


# Run the unittest test runner when this file is executed directly as a script.
if __name__ == "__main__":
unittest.main()

0 comments on commit b3fb9dd

Please sign in to comment.