From 78121b98b8446b7127d1bdd69bb5bf603bbb8ed1 Mon Sep 17 00:00:00 2001 From: Kevin Zecchini Date: Thu, 12 Dec 2019 13:32:22 -0500 Subject: [PATCH 1/2] ability to set training_fraction to 1, allowing non pandas df into TrainTestSplit --- .../sklearn_preprocessing_pipeline.py | 11 +--- primrose/pipelines/train_test_split.py | 64 +++++++++++-------- .../sklearn_preprocessing_transformer.py | 10 ++- 3 files changed, 49 insertions(+), 36 deletions(-) diff --git a/primrose/pipelines/sklearn_preprocessing_pipeline.py b/primrose/pipelines/sklearn_preprocessing_pipeline.py index a739086..3f0cd96 100644 --- a/primrose/pipelines/sklearn_preprocessing_pipeline.py +++ b/primrose/pipelines/sklearn_preprocessing_pipeline.py @@ -33,14 +33,9 @@ def init_pipeline(self): for operation in self.node_config["operations"]: - args=None - if 'args' in operation: - args = operation['args'] - - columns=None - if 'columns' in operation: - columns = operation["columns"] - + args = operation.get('args', None) + columns = operation.get('columns', None) + p = SklearnPreprocessingPipeline._instantiate_preprocessor(operation['class'], args, columns) ts.add(p) diff --git a/primrose/pipelines/train_test_split.py b/primrose/pipelines/train_test_split.py index 55e1ef9..0d84c95 100644 --- a/primrose/pipelines/train_test_split.py +++ b/primrose/pipelines/train_test_split.py @@ -44,7 +44,7 @@ def necessary_config(node_config): """ return set(['training_fraction', 'seed']) - def features(self, data): + def feature_subset(self, data): """Use user-specified features if available, otherwise use all non-target columns Args: @@ -54,14 +54,19 @@ def features(self, data): lsit of feature names """ - if 'features' in self.node_config: - return self.node_config['features'] + if isinstance(data, pd.DataFrame): + if 'features' in self.node_config: + cols = self.node_config['features'] - else: - if 'target_variable' in self.node_config: - return [f for f in data.columns if f != self.node_config['target_variable']] else: - return data.columns + if 'target_variable' in self.node_config: + cols = [f for f in data.columns if f != self.node_config['target_variable']] + else: + cols = data.columns + return data[cols] + + else: + return data def _train_test_split(self, data): """Split data into test/train sets @@ -74,25 +79,32 @@ def _train_test_split(self, data): """ logging.info("Splitting data into testing and training sets.") - if 'target_variable' in self.node_config: - data_train, data_test, target_train, target_test = train_test_split( - data[self.features(data)], - data[self.node_config['target_variable']], - test_size=(1.0 - float(self.node_config['training_fraction'])), - random_state=self.node_config['seed']) - - # re-merge training and target data into a single dataframe for transforming - train_data_to_transform = pd.concat([data_train, target_train], axis=1) - test_data_to_transform = pd.concat([data_test, target_test], axis=1) + test_size = (1.0 - float(self.node_config['training_fraction'])) + if test_size == 0: + train_data_to_transform = data + test_data_to_transform = pd.DataFrame() + else: - data_train, data_test = train_test_split( - data[self.features(data)], - test_size=(1.0 - float(self.node_config['training_fraction'])), - random_state=self.node_config['seed']) + if 'target_variable' in self.node_config: + data_train, data_test, target_train, target_test = train_test_split( + self.feature_subset(data), + data[self.node_config['target_variable']], + test_size=(1.0 - float(self.node_config['training_fraction'])), + random_state=self.node_config['seed']) + + # re-merge training and target data into a single dataframe for transforming + train_data_to_transform = pd.concat([data_train, target_train], axis=1) + test_data_to_transform = pd.concat([data_test, target_test], axis=1) + + else: + data_train, data_test = train_test_split( + self.feature_subset(data), + test_size=(1.0 - float(self.node_config['training_fraction'])), + random_state=self.node_config['seed']) - train_data_to_transform = data_train - test_data_to_transform = data_test + train_data_to_transform = data_train + test_data_to_transform = data_test logging.info('Training data rows: {}, Testing data rows: {}'.format(len(train_data_to_transform), len(test_data_to_transform))) @@ -169,7 +181,7 @@ def fit_transform(self, data_object): if not train_data.empty: train_data = self.execute_pipeline(train_data, PipelineModeType.FIT_TRANSFORM) - data_object.add(self, train_data[self.features(train_data)], key='data_train', overwrite=False) + data_object.add(self, self.feature_subset(train_data), key='data_train', overwrite=False) if 'target_variable' in self.node_config: data_object.add(self, train_data[self.node_config['target_variable']], key='target_train', @@ -180,7 +192,7 @@ def fit_transform(self, data_object): # run the pipeline in Transform mode since we've already fit the pipeline with training data test_data = self.execute_pipeline(test_data, PipelineModeType.TRANSFORM) self.data = test_data # assign the data to the testing data if available - data_object.add(self, test_data[self.features(train_data)], key='data_test', overwrite=False) + data_object.add(self, self.feature_subset(test_data), key='data_test', overwrite=False) if 'target_variable' in self.node_config: data_object.add(self, test_data[self.node_config['target_variable']], key='target_test', @@ -213,7 +225,7 @@ def transform(self, data_object): self.data = data # keep the data for use in the final_data_object_additions method - data_object.add(self, data[self.features(data)], key='data_test', overwrite=False) + data_object.add(self, self.feature_subset(data), key='data_test', overwrite=False) if 'target_variable' in self.node_config: data_object.add(self, data[self.node_config['target_variable']], key='target_test', overwrite=False) diff --git a/primrose/transformers/sklearn_preprocessing_transformer.py b/primrose/transformers/sklearn_preprocessing_transformer.py index 9fa3ace..717d3b1 100644 --- a/primrose/transformers/sklearn_preprocessing_transformer.py +++ b/primrose/transformers/sklearn_preprocessing_transformer.py @@ -4,9 +4,11 @@ Carl Anderson (carl.anderson@weightwatchers.com) """ -from primrose.base.transformer import AbstractTransformer +import logging import pandas as pd +from primrose.base.transformer import AbstractTransformer + class SklearnPreprocessingTransformer(AbstractTransformer): def __init__(self, preprocessor, columns): @@ -60,7 +62,11 @@ def transform(self, data): else: scaled_features = self.preprocessor.transform(data.values) - scaled_features_df = pd.DataFrame(scaled_features, index=data.index, columns=data.columns) + try: + scaled_features_df = pd.DataFrame(scaled_features, index=data.index, columns=data.columns) + except ValueError: + logging.info(f'{self.preprocessor.__class__.__name__} instance changed the number of columns. Returning raw values') + return pd.DataFrame(scaled_features) return scaled_features_df return self.preprocessor.transform(data) From 934c480f818ab1b8ecd5886921795dd88bd4879f Mon Sep 17 00:00:00 2001 From: Kevin Zecchini Date: Thu, 12 Dec 2019 13:41:12 -0500 Subject: [PATCH 2/2] revert change on feature selection in TrainTestSplit --- primrose/pipelines/train_test_split.py | 29 +++++++++++--------------- 1 file changed, 12 insertions(+), 17 deletions(-) diff --git a/primrose/pipelines/train_test_split.py b/primrose/pipelines/train_test_split.py index 0d84c95..57ba3a1 100644 --- a/primrose/pipelines/train_test_split.py +++ b/primrose/pipelines/train_test_split.py @@ -44,7 +44,7 @@ def necessary_config(node_config): """ return set(['training_fraction', 'seed']) - def feature_subset(self, data): + def features(self, data): """Use user-specified features if available, otherwise use all non-target columns Args: @@ -54,19 +54,14 @@ def feature_subset(self, data): lsit of feature names """ - if isinstance(data, pd.DataFrame): - if 'features' in self.node_config: - cols = self.node_config['features'] - - else: - if 'target_variable' in self.node_config: - cols = [f for f in data.columns if f != self.node_config['target_variable']] - else: - cols = data.columns - return data[cols] + if 'features' in self.node_config: + return self.node_config['features'] else: - return data + if 'target_variable' in self.node_config: + return [f for f in data.columns if f != self.node_config['target_variable']] + else: + return data.columns def _train_test_split(self, data): """Split data into test/train sets @@ -88,7 +83,7 @@ def _train_test_split(self, data): else: if 'target_variable' in self.node_config: data_train, data_test, target_train, target_test = train_test_split( - self.feature_subset(data), + data[self.features(data)], data[self.node_config['target_variable']], test_size=(1.0 - float(self.node_config['training_fraction'])), random_state=self.node_config['seed']) @@ -99,7 +94,7 @@ def _train_test_split(self, data): else: data_train, data_test = train_test_split( - self.feature_subset(data), + data[self.features(data)], test_size=(1.0 - float(self.node_config['training_fraction'])), random_state=self.node_config['seed']) @@ -181,7 +176,7 @@ def fit_transform(self, data_object): if not train_data.empty: train_data = self.execute_pipeline(train_data, PipelineModeType.FIT_TRANSFORM) - data_object.add(self, self.feature_subset(train_data), key='data_train', overwrite=False) + data_object.add(self, train_data[self.features(train_data)], key='data_train', overwrite=False) if 'target_variable' in self.node_config: data_object.add(self, train_data[self.node_config['target_variable']], key='target_train', @@ -192,7 +187,7 @@ def fit_transform(self, data_object): # run the pipeline in Transform mode since we've already fit the pipeline with training data test_data = self.execute_pipeline(test_data, PipelineModeType.TRANSFORM) self.data = test_data # assign the data to the testing data if available - data_object.add(self, self.feature_subset(test_data), key='data_test', overwrite=False) + data_object.add(self, test_data[self.features(train_data)], key='data_test', overwrite=False) if 'target_variable' in self.node_config: data_object.add(self, test_data[self.node_config['target_variable']], key='target_test', @@ -225,7 +220,7 @@ def transform(self, data_object): self.data = data # keep the data for use in the final_data_object_additions method - data_object.add(self, self.feature_subset(data), key='data_test', overwrite=False) + data_object.add(self, data[self.features(data)], key='data_test', overwrite=False) if 'target_variable' in self.node_config: data_object.add(self, data[self.node_config['target_variable']], key='target_test', overwrite=False)