Skip to content

Commit

Permalink
Merge branch 'master' into fix/auto
Browse files (browse the repository at this point in the history)
  • Loading branch information
satra authored Feb 24, 2024
2 parents 92b0547 + 64e351e commit 3ab7bc3
Show file tree
Hide file tree
Showing 8 changed files with 69 additions and 28 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -22,11 +22,11 @@ jobs:
- "3.11"

steps:
- uses: actions/checkout@v4
- uses: actions/checkout@v4.1.1
with: # no need for the history
fetch-depth: 1
- name: Set up Python ${{ matrix.python }}
uses: actions/setup-python@v4
uses: actions/setup-python@v5.0.0
with:
python-version: ${{ matrix.python }}

Expand Down
6 changes: 6 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -133,3 +133,9 @@ dmypy.json

# pycharm
.idea/

# Venvs
*.venv

# Generated messages
/messages
10 changes: 5 additions & 5 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,29 +5,29 @@ ci:

repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v4.4.0
rev: v4.5.0
hooks:
- id: check-added-large-files
- id: check-yaml
- id: end-of-file-fixer
- id: trailing-whitespace
- repo: https://github.com/psf/black
rev: 23.1.0
rev: 24.2.0
hooks:
- id: black
- repo: https://github.com/PyCQA/flake8
rev: 6.0.0
rev: 7.0.0
hooks:
- id: flake8
exclude: ^(pydra_ml/_version\.py|versioneer\.py)$
- repo: https://github.com/PyCQA/isort
rev: 5.12.0
rev: 5.13.2
hooks:
- id: isort
args: ["--profile", "black"]
exclude: ^(pydra_ml/_version\.py|versioneer\.py)$
- repo: https://github.com/codespell-project/codespell
rev: v2.2.2
rev: v2.2.6
hooks:
- id: codespell
exclude: ^(pydra_ml/_version\.py|versioneer\.py)$
4 changes: 4 additions & 0 deletions pydra_ml/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,3 +35,7 @@ def set_logger_level(lgr, level):
set_logger_level(lgr, os.environ.get("PYDRAML_LOG_LEVEL", logging.INFO))
FORMAT = "%(asctime)-15s [%(levelname)8s] %(message)s"
logging.basicConfig(format=FORMAT)

from . import _version

__version__ = _version.get_versions()["version"]
4 changes: 2 additions & 2 deletions pydra_ml/classifier.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ def gen_workflow(inputs, cache_dir=None, cache_locations=None):
messengers=FileMessenger(),
messenger_args={"message_dir": os.path.join(os.getcwd(), "messages")},
)
wf.split(["clf_info", "permute"])
wf.split(clf_info=inputs["clf_info"], permute=inputs["permute"])
wf.add(
read_file_pdt(
name="readcsv",
Expand Down Expand Up @@ -102,7 +102,7 @@ def gen_workflow(inputs, cache_dir=None, cache_locations=None):
permute=wf.lzin.permute,
)
)
wf.fit_clf.split("split_index")
wf.fit_clf.split(split_index=wf.gensplit.lzout.split_indices)
wf.add(
calc_metric_pdt(
name="metric", output=wf.fit_clf.lzout.output, metrics=wf.lzin.metrics
Expand Down
23 changes: 13 additions & 10 deletions pydra_ml/report.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,15 @@
import pickle
import warnings

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import accuracy_score, explained_variance_score

matplotlib.use("Agg")


def save_obj(obj, path):
with open(path, "wb") as f:
Expand Down Expand Up @@ -37,9 +40,9 @@ def performance_table(df, output_dir, round_decimals=2):
df_clf = df_clf.reset_index(drop=True)
df_metric_data_clean[clf] = df_clf
df_metric_data_clean_median = df_metric_data_clean.median().T
df_metric_data_clean.loc[
len(df_metric_data_clean)
] = df_metric_data_clean_median
df_metric_data_clean.loc[len(df_metric_data_clean)] = (
df_metric_data_clean_median
)
df_metric_data_clean.index = list(df_metric_data_clean.index[:-1]) + ["median"]
df_metric_data_clean.to_csv(
os.path.join(
Expand Down Expand Up @@ -77,9 +80,9 @@ def performance_table(df, output_dir, round_decimals=2):
)
if "null" in df_metric.type.unique():
null_median = round(df_metric_null_clean_median[clf], 2)
df_summary.loc[
0, clf
] = f"{data_median} [{ci_lower}{ci_upper}; {null_median}]"
df_summary.loc[0, clf] = (
f"{data_median} [{ci_lower}{ci_upper}; {null_median}]"
)
else:
df_summary.loc[0, clf] = f"{data_median} [{ci_lower}{ci_upper}]"

Expand All @@ -97,9 +100,9 @@ def plot_summary(summary, output_dir=None, filename="shap_plot", plot_top_n_shap
# plot without all bootstrapping values
summary = summary[["mean", "std", "min", "max"]]
num_features = len(list(summary.index))
if (plot_top_n_shap != 1 and type(plot_top_n_shap) == float) or type(
if (plot_top_n_shap != 1 and type(plot_top_n_shap) is float) or type(
plot_top_n_shap
) == int:
) is int:
# if plot_top_n_shap != 1.0 but includes 1 (int)
if plot_top_n_shap <= 0:
raise ValueError(
Expand Down Expand Up @@ -223,7 +226,7 @@ def gen_report_shap_class(results, output_dir="./", plot_top_n_shap=16):
f"""There were no {quadrant.upper()}s, this will output NaNs
in the csv and figure for this split column"""
)
shaps_i_quadrant = shaps_i[
shaps_i_quadrant = np.array(shaps_i)[
indexes.get(quadrant)
] # shape (P, F) P prediction x F feature_names
abs_weighted_shap_values = np.abs(shaps_i_quadrant) * split_performance
Expand Down Expand Up @@ -325,7 +328,7 @@ def gen_report_shap_regres(results, output_dir="./", plot_top_n_shap=16):
f"""There were no {quadrant.upper()}s, this will
output NaNs in the csv and figure for this split column"""
)
shaps_i_quadrant = shaps_i[
shaps_i_quadrant = np.array(shaps_i)[
indexes.get(quadrant)
] # shape (P, F) P prediction x F feature_names
abs_weighted_shap_values = np.abs(shaps_i_quadrant) * split_performance
Expand Down
40 changes: 36 additions & 4 deletions pydra_ml/tasks.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,15 @@
#!/usr/bin/env python

import typing as ty

from pydra.utils.hash import Cache, register_serializer
from sklearn.pipeline import Pipeline


@register_serializer
def bytes_repr_Pipeline(obj: Pipeline, cache: Cache):
yield str(obj).encode()


def read_file(filename, x_indices=None, target_vars=None, group=None):
"""Read a CSV data file
Expand Down Expand Up @@ -92,7 +102,7 @@ def to_instance(clf_info):

train_index, test_index = train_test_split[split_index]
y = y.ravel()
if type(X[0][0]) == str:
if type(X[0][0]) is str:
# it's loaded as bytes, so we need to decode as utf-8
X = np.array([str.encode(n[0]).decode("utf-8") for n in X])
if permute:
Expand Down Expand Up @@ -126,7 +136,27 @@ def calc_metric(output, metrics):
return score, output


def get_feature_importance(permute, model, gen_feature_importance=True):
def get_feature_importance(
*,
permute: bool,
model: ty.Tuple[Pipeline, list, list],
gen_feature_importance: bool = True,
):
"""Compute feature importance for the model
Parameters
----------
permute : bool
Whether or not to run the model in permuted mode
model : tuple(sklearn.pipeline.Pipeline, list, list)
The model to compute feature importance for
gen_feature_importance : bool
Whether or not to generate the feature importance
Returns
-------
list
List of feature importance
"""
if permute or not gen_feature_importance:
return []
pipeline, train_index, test_index = model
Expand Down Expand Up @@ -172,7 +202,7 @@ def get_feature_importance(permute, model, gen_feature_importance=True):
pipeline_steps.coefs_
pipeline_steps.coef_
Please add correct method in tasks.py or if inexistent,
Please add correct method in tasks.py or if non-existent,
set gen_feature_importance to false in the spec file.
This is the error that was returned by sklearn:\n\t{e}\n
Expand Down Expand Up @@ -224,7 +254,9 @@ def get_shap(X, permute, model, gen_shap=False, nsamples="auto", l1_reg="aic"):
import shap

explainer = shap.KernelExplainer(pipe.predict, shap.kmeans(X[train_index], 5))
shaps = explainer.shap_values(X[test_index], nsamples=nsamples, l1_reg=l1_reg)
shaps = explainer.shap_values(
X[test_index], nsamples=nsamples, l1_reg=l1_reg, silent=True
)
return shaps


Expand Down
6 changes: 1 addition & 5 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ classifiers =
[options]
python_requires = >= 3.8
install_requires =
pydra == 0.22.0
pydra >= 0.23.0-alpha
psutil
scikit-learn
seaborn
Expand All @@ -35,11 +35,9 @@ install_requires =

test_requires =
pytest >= 4.4.0
pytest-cov
pytest-env
pytest-xdist
pytest-rerunfailures
codecov
packages = find:
include_package_data = True

Expand All @@ -58,11 +56,9 @@ docs =
%(doc)s
test =
pytest >= 4.4.0
pytest-cov
pytest-env
pytest-xdist
pytest-rerunfailures
codecov
tests =
%(test)s
dev =
Expand Down

0 comments on commit 3ab7bc3

Please sign in to comment.