diff --git a/example-requirements.txt b/example-requirements.txt
index 0216719..0b1a424 100644
--- a/example-requirements.txt
+++ b/example-requirements.txt
@@ -1,2 +1,5 @@
 metaflow
+plotly-express
+kaleido
 pandas
+numpy
diff --git a/examples/model-tournament/config.py b/examples/model-tournament/config.py
index 576dbf5..5e2d8d4 100644
--- a/examples/model-tournament/config.py
+++ b/examples/model-tournament/config.py
@@ -1,6 +1,6 @@
 n_numeric_features = 10
 n_informative_numeric_features = 5
-n_categorical_features = 2
+n_categorical_features = 1
 make_regression_init_kwargs = {
     f'type_{i}': {
         'n_samples': 10_000,
@@ -37,71 +37,7 @@
         '__build_model': ['metaflow_helper.model_handlers.build_keras_regression_model'],
         # These go to the model initializer
         'metric': ['mse'],
-        'dense_layer_widths': [()],
-        'dropout_probabilities': [()],
-        # This goes to the pipeline elements' fitters by pipeline step stepname, where f'{stepname}__parameter' gets
-        # renamed to parameter and then passed to the fitter for step stepname. The model stepname = 'model'
-        # and the preprocessing stepname = 'preprocessor'. See utilities.build_pipeline.
-        '__fit_kwargs': [{
-            'model__batch_size': None,
-            'model__epochs': 10_000,
-            'model__validation_split': 0.2,
-            'model__eval_metric': 'val_mse',  # monitor. Examples: 'mse' or 'val_mse'
-            'model__verbose': 0,
-            'model__patience': 10,
-            'model__min_delta': 0.1,
-        }],
-    },
-    {
-        # Anything with an underscore is a specially handled parameter
-        '__model': ['metaflow_helper.model_handlers.KerasRegressorHandler'],
-        '__build_model': ['metaflow_helper.model_handlers.build_keras_regression_model'],
-        # These go to the model initializer
-        'metric': ['mse'],
-        'dense_layer_widths': [(15,)],
-        'dropout_probabilities': [(0,)],
-        # This goes to the pipeline elements' fitters by pipeline step stepname, where f'{stepname}__parameter' gets
-        # renamed to parameter and then passed to the fitter for step stepname. The model stepname = 'model'
-        # and the preprocessing stepname = 'preprocessor'. See utilities.build_pipeline.
-        '__fit_kwargs': [{
-            'model__batch_size': None,
-            'model__epochs': 10_000,
-            'model__validation_split': 0.2,
-            'model__eval_metric': 'val_mse',  # monitor. Examples: 'mse' or 'val_mse'
-            'model__verbose': 0,
-            'model__patience': 10,
-            'model__min_delta': 0.1,
-        }],
-    },
-    {
-        # Anything with an underscore is a specially handled parameter
-        '__model': ['metaflow_helper.model_handlers.KerasRegressorHandler'],
-        '__build_model': ['metaflow_helper.model_handlers.build_keras_regression_model'],
-        # These go to the model initializer
-        'metric': ['mse'],
-        'dense_layer_widths': [(15, 15,)],
-        'dropout_probabilities': [(0, 0,)],
-        # This goes to the pipeline elements' fitters by pipeline step stepname, where f'{stepname}__parameter' gets
-        # renamed to parameter and then passed to the fitter for step stepname. The model stepname = 'model'
-        # and the preprocessing stepname = 'preprocessor'. See utilities.build_pipeline.
-        '__fit_kwargs': [{
-            'model__batch_size': None,
-            'model__epochs': 10_000,
-            'model__validation_split': 0.2,
-            'model__eval_metric': 'val_mse',  # monitor. Examples: 'mse' or 'val_mse'
-            'model__verbose': 0,
-            'model__patience': 10,
-            'model__min_delta': 0.1,
-        }],
-    },
-    {
-        # Anything with an underscore is a specially handled parameter
-        '__model': ['metaflow_helper.model_handlers.KerasRegressorHandler'],
-        '__build_model': ['metaflow_helper.model_handlers.build_keras_regression_model'],
-        # These go to the model initializer
-        'metric': ['mse'],
-        'dense_layer_widths': [(15*15,)],
-        'dropout_probabilities': [(0,)],
+        'dense_layer_widths': [(), (15,), (15, 15,), (15*15,)],
         # This goes to the pipeline elements' fitters by pipeline step stepname, where f'{stepname}__parameter' gets
         # renamed to parameter and then passed to the fitter for step stepname. The model stepname = 'model'
         # and the preprocessing stepname = 'preprocessor'. See utilities.build_pipeline.
diff --git a/examples/model-tournament/train.py b/examples/model-tournament/train.py
index af14e89..eef188f 100644
--- a/examples/model-tournament/train.py
+++ b/examples/model-tournament/train.py
@@ -151,7 +151,13 @@ def train_test(self):
             model__validation_data=(X_test_transformed, y_test),
             **fit_kwargs,
         )
-        self.score = r2_score(y_test, model_pipeline.predict(X_test_transformed))
+        y_test_pred = model_pipeline.predict(X_test_transformed)
+        model_pipeline.named_steps['model'].plot(
+            dir=f"results/{current.run_id}",
+            y_true=y_test,
+            y_pred=y_test_pred,
+        )
+        self.score = r2_score(y_test, y_test_pred)
         print(f'score {self.score}, contender {contender}')
         self.next(self.train)
 
@@ -188,8 +194,9 @@ def train(self):
     @step
     def end(self):
         indent = 4
-        Path("results").mkdir(parents=True, exist_ok=True)
-        with open(f'results/results-{current.run_id}.txt', 'w') as f:
+        results_dir = f"results/{current.run_id}"
+        Path(results_dir).mkdir(parents=True, exist_ok=True)
+        with open(f'{results_dir}/summary.txt', 'w') as f:
             print(f'data set:\n{json.dumps(self.make_regression_init_kwargs, indent=indent)}', file=f)
             print('\n', file=f)
             for i, k in enumerate(sorted(self.contender_results.keys(), key=lambda k: -1 * self.contender_results[k]['mean_score'])):
diff --git a/metaflow_helper/model_handlers/base.py b/metaflow_helper/model_handlers/base.py
new file mode 100644
index 0000000..2f02976
--- /dev/null
+++ b/metaflow_helper/model_handlers/base.py
@@ -0,0 +1,64 @@
+import random
+from pathlib import Path
+import numpy as np
+import pandas as pd
+import plotly.graph_objects as go
+import plotly
+
+
+class BaseModelHandler:
+    def __init__(self):
+        pass
+
+    def _validate_init_kwargs(self):
+        try:
+            self.mode
+        except AttributeError as e:
+            print('You must make mode an init kwarg')
+            raise e
+        try:
+            self.iterations
+        except AttributeError as e:
+            print('You must make iterations an init kwarg')
+            raise e
+        try:
+            self.input_dim
+        except AttributeError as e:
+            print('You must make input_dim an init kwarg')
+            raise e
+
+    def _validate_fit_kwargs(self):
+        pass
+
+    def plot(self, y_true, y_pred, dir='.', auto_open=True):
+        Path(dir).mkdir(parents=True, exist_ok=True)
+        if len(y_true) > 1_000:
+            idx = random.sample(range(len(y_true)), 1_000)
+        else:
+            idx = list(range(len(y_true)))
+        x = y_pred.iloc[idx] if isinstance(y_pred, pd.Series) else y_pred[idx]
+        y = y_true.iloc[idx] if isinstance(y_true, pd.Series) else y_true[idx]
+        plot_range = [np.min((x, y)), np.max((x, y))]
+        fig = go.Figure()
+        fig.add_trace(
+            go.Scatter(
+                x=x,
+                y=y,
+                mode='markers',
+            ),
+        )
+        fig.add_shape(
+            type="line",
+            x0=plot_range[0], y0=plot_range[0], x1=plot_range[1], y1=plot_range[1],
+            line=dict(
+                color="Black",
+                width=2,
+            )
+        )
+        fig.update_layout(
+            xaxis_title='Predicted',
+            yaxis_title='True',
+            template='none',
+        )
+        fig.write_image(f"{dir}/predicted-vs-true.png")
+        plotly.offline.plot(fig, filename=f"{dir}/predicted-vs-true.html", auto_open=auto_open)
diff --git a/metaflow_helper/model_handlers/keras.py b/metaflow_helper/model_handlers/keras.py
index e741bd0..f817567 100644
--- a/metaflow_helper/model_handlers/keras.py
+++ b/metaflow_helper/model_handlers/keras.py
@@ -5,9 +5,10 @@
 from tensorflow.python.keras import regularizers
 
 from ..constants import RunMode
+from .base import BaseModelHandler
 
 
-class KerasRegressorHandler(BaseEstimator, RegressorMixin):
+class KerasRegressorHandler(BaseModelHandler, BaseEstimator, RegressorMixin):
     def __init__(self, build_model=None, input_dim=None, mode=RunMode, iterations=None, eval_metric=None, **kwargs):
         self.build_model = build_model
@@ -20,6 +21,7 @@ def __init__(self, build_model=None, input_dim=None, mode=RunMode, iterations=No
 
         self.history = []
         self.iterations = iterations
+        self._validate_init_kwargs()
         self.model = self.build_model(input_dim=self.input_dim, **kwargs)
 
     def fit(self, X, y, validation_data=None, patience=None, min_delta=0, eval_metric=None, **kwargs):
@@ -47,6 +49,7 @@ def fit(self, X, y, validation_data=None, patience=None, min_delta=0, eval_metri
                 kwargs.pop(k)
             except KeyError:
                 pass
+        self._validate_fit_kwargs()
         if kwargs is not None and 'validation_split' in kwargs:
             result = self.model.fit(X, y, callbacks=self.callbacks, **kwargs)
         else:
@@ -67,6 +70,12 @@ def build_keras_regression_model(input_dim=None, dense_layer_widths=(10,), dropo
                                  l1_lambda_final=0, l2_lambda_final=0):
     if input_dim is None:
         raise ValueError(input_dim)
+    if len(dense_layer_widths) > len(dropout_probabilities):
+        dropout_probabilities = tuple([dropout_probabilities[0]]*len(dense_layer_widths))
+    if len(dense_layer_widths) > len(l1_lambdas):
+        l1_lambdas = tuple([l1_lambdas[0]]*len(dense_layer_widths))
+    if len(dense_layer_widths) > len(l2_lambdas):
+        l2_lambdas = tuple([l2_lambdas[0]]*len(dense_layer_widths))
     model = Sequential()
     model.add(Input(shape=(input_dim, )))
     for i, params in enumerate(zip(dense_layer_widths, dropout_probabilities, l1_lambdas, l2_lambdas)):
diff --git a/metaflow_helper/model_handlers/lightgbm.py b/metaflow_helper/model_handlers/lightgbm.py
index 2ed83c9..8392536 100644
--- a/metaflow_helper/model_handlers/lightgbm.py
+++ b/metaflow_helper/model_handlers/lightgbm.py
@@ -2,17 +2,20 @@
 from sklearn.base import BaseEstimator, RegressorMixin
 
 from ..constants import RunMode
+from .base import BaseModelHandler
 
 
-class LightGBMRegressorHandler(BaseEstimator, RegressorMixin):
+class LightGBMRegressorHandler(BaseModelHandler, BaseEstimator, RegressorMixin):
     def __init__(self, mode: RunMode, iterations=None, input_dim=None, **kwargs):
         self.mode = mode
         self.iterations = iterations
         self.input_dim = input_dim
+
         if self.iterations is not None:
             kwargs['n_estimators'] = self.iterations
+        self._validate_init_kwargs()
         self.model = lgb.LGBMRegressor(**kwargs)
 
     def fit(self, X, y, validation_data=None, **kwargs):
@@ -23,6 +26,7 @@ def fit(self, X, y, validation_data=None, **kwargs):
                 kwargs.pop(k)
             except KeyError:
                 pass
+        self._validate_fit_kwargs()
         if validation_data is not None:
             self.model.fit(X, y, eval_set=validation_data, **kwargs)
         else:
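
Note: the snippet below is a usage sketch, not part of the diff. It shows how the new BaseModelHandler.plot introduced above could be exercised on its own. The import path is inferred from the new file's location, and the random arrays, the results/demo output directory, and the seed are illustrative placeholders.

# Usage sketch (assumed import path; synthetic data for illustration only).
import numpy as np

from metaflow_helper.model_handlers.base import BaseModelHandler

rng = np.random.default_rng(0)
y_true = rng.normal(size=2_000)
y_pred = y_true + rng.normal(scale=0.3, size=2_000)  # stand-in model predictions

handler = BaseModelHandler()
# plot() subsamples to at most 1,000 points, draws a predicted-vs-true scatter with a
# y = x reference line, and writes predicted-vs-true.png and predicted-vs-true.html
# into the given directory; auto_open=False avoids launching a browser on a headless worker.
handler.plot(y_true=y_true, y_pred=y_pred, dir='results/demo', auto_open=False)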