Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

initial attempt to work with pydra 0.23+ #59

Merged
merged 17 commits into from
Feb 24, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -133,3 +133,9 @@ dmypy.json

# pycharm
.idea/

# Venvs
*.venv

# Generated messages
/messages
4 changes: 4 additions & 0 deletions pydra_ml/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,3 +35,7 @@ def set_logger_level(lgr, level):
set_logger_level(lgr, os.environ.get("PYDRAML_LOG_LEVEL", logging.INFO))
FORMAT = "%(asctime)-15s [%(levelname)8s] %(message)s"
logging.basicConfig(format=FORMAT)

from . import _version

__version__ = _version.get_versions()["version"]
4 changes: 2 additions & 2 deletions pydra_ml/classifier.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ def gen_workflow(inputs, cache_dir=None, cache_locations=None):
messengers=FileMessenger(),
messenger_args={"message_dir": os.path.join(os.getcwd(), "messages")},
)
wf.split(["clf_info", "permute"])
wf.split(clf_info=inputs["clf_info"], permute=inputs["permute"])
wf.add(
read_file_pdt(
name="readcsv",
Expand Down Expand Up @@ -102,7 +102,7 @@ def gen_workflow(inputs, cache_dir=None, cache_locations=None):
permute=wf.lzin.permute,
)
)
wf.fit_clf.split("split_index")
wf.fit_clf.split(split_index=wf.gensplit.lzout.split_indices)
wf.add(
calc_metric_pdt(
name="metric", output=wf.fit_clf.lzout.output, metrics=wf.lzin.metrics
Expand Down
11 changes: 7 additions & 4 deletions pydra_ml/report.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,15 @@
import pickle
import warnings

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import accuracy_score, explained_variance_score

matplotlib.use("Agg")


def save_obj(obj, path):
with open(path, "wb") as f:
Expand Down Expand Up @@ -97,9 +100,9 @@ def plot_summary(summary, output_dir=None, filename="shap_plot", plot_top_n_shap
# plot without all bootstrapping values
summary = summary[["mean", "std", "min", "max"]]
num_features = len(list(summary.index))
if (plot_top_n_shap != 1 and type(plot_top_n_shap) == float) or type(
if (plot_top_n_shap != 1 and type(plot_top_n_shap) is float) or type(
plot_top_n_shap
) == int:
) is int:
# if plot_top_n_shap != 1.0 but includes 1 (int)
if plot_top_n_shap <= 0:
raise ValueError(
Expand Down Expand Up @@ -223,7 +226,7 @@ def gen_report_shap_class(results, output_dir="./", plot_top_n_shap=16):
f"""There were no {quadrant.upper()}s, this will output NaNs
in the csv and figure for this split column"""
)
shaps_i_quadrant = shaps_i[
shaps_i_quadrant = np.array(shaps_i)[
indexes.get(quadrant)
] # shape (P, F) P prediction x F feature_names
abs_weighted_shap_values = np.abs(shaps_i_quadrant) * split_performance
Expand Down Expand Up @@ -325,7 +328,7 @@ def gen_report_shap_regres(results, output_dir="./", plot_top_n_shap=16):
f"""There were no {quadrant.upper()}s, this will
output NaNs in the csv and figure for this split column"""
)
shaps_i_quadrant = shaps_i[
shaps_i_quadrant = np.array(shaps_i)[
indexes.get(quadrant)
] # shape (P, F) P prediction x F feature_names
abs_weighted_shap_values = np.abs(shaps_i_quadrant) * split_performance
Expand Down
40 changes: 36 additions & 4 deletions pydra_ml/tasks.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,15 @@
#!/usr/bin/env python

import typing as ty

from pydra.utils.hash import Cache, register_serializer
from sklearn.pipeline import Pipeline


@register_serializer
def bytes_repr_Pipeline(obj: Pipeline, cache: Cache):
    """Yield the bytes that represent a scikit-learn ``Pipeline`` for hashing.

    Registered via ``register_serializer`` (imported from
    ``pydra.utils.hash``) for objects annotated as ``Pipeline``.

    Parameters
    ----------
    obj : sklearn.pipeline.Pipeline
        The pipeline to serialize.
    cache : pydra.utils.hash.Cache
        Accepted to match the serializer signature; not used here.

    Yields
    ------
    bytes
        The encoded ``str(obj)`` text of the pipeline.
    """
    # NOTE(review): the result is only as discriminating as ``str(obj)``;
    # confirm that scikit-learn's repr does not abbreviate long pipelines,
    # otherwise distinct pipelines could produce identical bytes.
    text = str(obj)
    yield text.encode("utf-8")


def read_file(filename, x_indices=None, target_vars=None, group=None):
"""Read a CSV data file
Expand Down Expand Up @@ -92,7 +102,7 @@ def to_instance(clf_info):

train_index, test_index = train_test_split[split_index]
y = y.ravel()
if type(X[0][0]) == str:
if type(X[0][0]) is str:
# it's loaded as bytes, so we need to decode as utf-8
X = np.array([str.encode(n[0]).decode("utf-8") for n in X])
if permute:
Expand Down Expand Up @@ -126,7 +136,27 @@ def calc_metric(output, metrics):
return score, output


def get_feature_importance(permute, model, gen_feature_importance=True):
def get_feature_importance(
*,
permute: bool,
model: ty.Tuple[Pipeline, list, list],
gen_feature_importance: bool = True,
):
"""Compute feature importance for the model

Parameters
----------
permute : bool
Whether or not to run the model in permuted mode
model : tuple(sklearn.pipeline.Pipeline, list, list)
The model to compute feature importance for
gen_feature_importance : bool
Whether or not to generate the feature importance
Returns
-------
list
List of feature importance
"""
if permute or not gen_feature_importance:
return []
pipeline, train_index, test_index = model
Expand Down Expand Up @@ -172,7 +202,7 @@ def get_feature_importance(permute, model, gen_feature_importance=True):
pipeline_steps.coefs_
pipeline_steps.coef_

Please add correct method in tasks.py or if inexistent,
Please add correct method in tasks.py or if non-existent,
set gen_feature_importance to false in the spec file.

This is the error that was returned by sklearn:\n\t{e}\n
Expand Down Expand Up @@ -224,7 +254,9 @@ def get_shap(X, permute, model, gen_shap=False, nsamples="auto", l1_reg="aic"):
import shap

explainer = shap.KernelExplainer(pipe.predict, shap.kmeans(X[train_index], 5))
shaps = explainer.shap_values(X[test_index], nsamples=nsamples, l1_reg=l1_reg)
shaps = explainer.shap_values(
X[test_index], nsamples=nsamples, l1_reg=l1_reg, silent=True
)
return shaps


Expand Down
6 changes: 1 addition & 5 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ classifiers =
[options]
python_requires = >= 3.8
install_requires =
pydra == 0.22.0
pydra >= 0.23.0-alpha
psutil
scikit-learn
seaborn
Expand All @@ -35,11 +35,9 @@ install_requires =

test_requires =
pytest >= 4.4.0
pytest-cov
pytest-env
pytest-xdist
pytest-rerunfailures
codecov
packages = find:
include_package_data = True

Expand All @@ -58,11 +56,9 @@ docs =
%(doc)s
test =
pytest >= 4.4.0
pytest-cov
pytest-env
pytest-xdist
pytest-rerunfailures
codecov
tests =
%(test)s
dev =
Expand Down