diff --git a/.gitignore b/.gitignore index cc07538..574b478 100644 --- a/.gitignore +++ b/.gitignore @@ -133,3 +133,9 @@ dmypy.json # pycharm .idea/ + +# Venvs +*.venv + +# Generated messages +/messages diff --git a/pydra_ml/__init__.py b/pydra_ml/__init__.py index 9224336..f6a0da4 100644 --- a/pydra_ml/__init__.py +++ b/pydra_ml/__init__.py @@ -35,3 +35,7 @@ def set_logger_level(lgr, level): set_logger_level(lgr, os.environ.get("PYDRAML_LOG_LEVEL", logging.INFO)) FORMAT = "%(asctime)-15s [%(levelname)8s] %(message)s" logging.basicConfig(format=FORMAT) + +from . import _version + +__version__ = _version.get_versions()["version"] diff --git a/pydra_ml/classifier.py b/pydra_ml/classifier.py index 0d554bf..b2a50c0 100644 --- a/pydra_ml/classifier.py +++ b/pydra_ml/classifier.py @@ -71,7 +71,7 @@ def gen_workflow(inputs, cache_dir=None, cache_locations=None): messengers=FileMessenger(), messenger_args={"message_dir": os.path.join(os.getcwd(), "messages")}, ) - wf.split(["clf_info", "permute"]) + wf.split(clf_info=inputs["clf_info"], permute=inputs["permute"]) wf.add( read_file_pdt( name="readcsv", @@ -102,7 +102,7 @@ def gen_workflow(inputs, cache_dir=None, cache_locations=None): permute=wf.lzin.permute, ) ) - wf.fit_clf.split("split_index") + wf.fit_clf.split(split_index=wf.gensplit.lzout.split_indices) wf.add( calc_metric_pdt( name="metric", output=wf.fit_clf.lzout.output, metrics=wf.lzin.metrics diff --git a/pydra_ml/report.py b/pydra_ml/report.py index cf19823..2d74c8d 100644 --- a/pydra_ml/report.py +++ b/pydra_ml/report.py @@ -3,12 +3,15 @@ import pickle import warnings +import matplotlib import matplotlib.pyplot as plt import numpy as np import pandas as pd import seaborn as sns from sklearn.metrics import accuracy_score, explained_variance_score +matplotlib.use("Agg") + def save_obj(obj, path): with open(path, "wb") as f: @@ -97,9 +100,9 @@ def plot_summary(summary, output_dir=None, filename="shap_plot", plot_top_n_shap # plot without all bootstrapping values summary = summary[["mean", "std", "min", "max"]] num_features = len(list(summary.index)) - if (plot_top_n_shap != 1 and type(plot_top_n_shap) == float) or type( + if (plot_top_n_shap != 1 and type(plot_top_n_shap) is float) or type( plot_top_n_shap - ) == int: + ) is int: # if plot_top_n_shap != 1.0 but includes 1 (int) if plot_top_n_shap <= 0: raise ValueError( @@ -223,7 +226,7 @@ def gen_report_shap_class(results, output_dir="./", plot_top_n_shap=16): f"""There were no {quadrant.upper()}s, this will output NaNs in the csv and figure for this split column""" ) - shaps_i_quadrant = shaps_i[ + shaps_i_quadrant = np.array(shaps_i)[ indexes.get(quadrant) ] # shape (P, F) P prediction x F feature_names abs_weighted_shap_values = np.abs(shaps_i_quadrant) * split_performance @@ -325,7 +328,7 @@ def gen_report_shap_regres(results, output_dir="./", plot_top_n_shap=16): f"""There were no {quadrant.upper()}s, this will output NaNs in the csv and figure for this split column""" ) - shaps_i_quadrant = shaps_i[ + shaps_i_quadrant = np.array(shaps_i)[ indexes.get(quadrant) ] # shape (P, F) P prediction x F feature_names abs_weighted_shap_values = np.abs(shaps_i_quadrant) * split_performance diff --git a/pydra_ml/tasks.py b/pydra_ml/tasks.py index 54689a6..27b17f7 100644 --- a/pydra_ml/tasks.py +++ b/pydra_ml/tasks.py @@ -1,5 +1,15 @@ #!/usr/bin/env python +import typing as ty + +from pydra.utils.hash import Cache, register_serializer +from sklearn.pipeline import Pipeline + + +@register_serializer +def bytes_repr_Pipeline(obj: Pipeline, cache: Cache): + yield str(obj).encode() + def read_file(filename, x_indices=None, target_vars=None, group=None): """Read a CSV data file @@ -92,7 +102,7 @@ def to_instance(clf_info): train_index, test_index = train_test_split[split_index] y = y.ravel() - if type(X[0][0]) == str: + if type(X[0][0]) is str: # it's loaded as bytes, so we need to decode as utf-8 X = np.array([str.encode(n[0]).decode("utf-8") for n in X]) if permute: @@ -126,7 +136,27 @@ def calc_metric(output, metrics): return score, output -def get_feature_importance(permute, model, gen_feature_importance=True): +def get_feature_importance( + *, + permute: bool, + model: ty.Tuple[Pipeline, list, list], + gen_feature_importance: bool = True, +): + """Compute feature importance for the model + + Parameters + ---------- + permute : bool + Whether or not to run the model in permuted mode + model : tuple(sklearn.pipeline.Pipeline, list, list) + The model to compute feature importance for + gen_feature_importance : bool + Whether or not to generate the feature importance + Returns + ------- + list + List of feature importance + """ if permute or not gen_feature_importance: return [] pipeline, train_index, test_index = model @@ -172,7 +202,7 @@ def get_feature_importance(permute, model, gen_feature_importance=True): pipeline_steps.coefs_ pipeline_steps.coef_ - Please add correct method in tasks.py or if inexistent, + Please add correct method in tasks.py or if non-existent, set gen_feature_importance to false in the spec file. This is the error that was returned by sklearn:\n\t{e}\n @@ -224,7 +254,9 @@ def get_shap(X, permute, model, gen_shap=False, nsamples="auto", l1_reg="aic"): import shap explainer = shap.KernelExplainer(pipe.predict, shap.kmeans(X[train_index], 5)) - shaps = explainer.shap_values(X[test_index], nsamples=nsamples, l1_reg=l1_reg) + shaps = explainer.shap_values( + X[test_index], nsamples=nsamples, l1_reg=l1_reg, silent=True + ) return shaps diff --git a/setup.cfg b/setup.cfg index 8feaf4d..2aaf11d 100644 --- a/setup.cfg +++ b/setup.cfg @@ -26,7 +26,7 @@ classifiers = [options] python_requires = >= 3.8 install_requires = - pydra == 0.22.0 + pydra >= 0.23.0-alpha psutil scikit-learn seaborn @@ -35,11 +35,9 @@ install_requires = test_requires = pytest >= 4.4.0 - pytest-cov pytest-env pytest-xdist pytest-rerunfailures - codecov packages = find: include_package_data = True @@ -58,11 +56,9 @@ docs = %(doc)s test = pytest >= 4.4.0 - pytest-cov pytest-env pytest-xdist pytest-rerunfailures - codecov tests = %(test)s dev =