Merge branch 'master' into fix/auto

nipype · Feb 24, 2024 · 3ab7bc3
2 parents 92b0547 + 64e351e
commit 3ab7bc3
Show file tree

Hide file tree

Showing 8 changed files with 69 additions and 28 deletions.
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -22,11 +22,11 @@ jobs:
           - "3.11"
 
     steps:
-      - uses: actions/checkout@v4
+      - uses: actions/checkout@v4.1.1
         with:  # no need for the history
           fetch-depth: 1
       - name: Set up Python ${{ matrix.python }}
-        uses: actions/setup-python@v4
+        uses: actions/setup-python@v5.0.0
         with:
           python-version: ${{ matrix.python }}
 

diff --git a/.gitignore b/.gitignore
@@ -133,3 +133,9 @@ dmypy.json
 
 # pycharm
 .idea/
+
+# Venvs
+*.venv
+
+# Generated messages
+/messages
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -5,29 +5,29 @@ ci:
 
 repos:
 -   repo: https://github.com/pre-commit/pre-commit-hooks
-    rev: v4.4.0
+    rev: v4.5.0
     hooks:
     -   id: check-added-large-files
     -   id: check-yaml
     -   id: end-of-file-fixer
     -   id: trailing-whitespace
 -   repo: https://github.com/psf/black
-    rev: 23.1.0
+    rev: 24.2.0
     hooks:
     -   id: black
 -   repo: https://github.com/PyCQA/flake8
-    rev: 6.0.0
+    rev: 7.0.0
     hooks:
     - id: flake8
       exclude: ^(pydra_ml/_version\.py|versioneer\.py)$
 -   repo: https://github.com/PyCQA/isort
-    rev: 5.12.0
+    rev: 5.13.2
     hooks:
     -   id: isort
         args: ["--profile", "black"]
         exclude: ^(pydra_ml/_version\.py|versioneer\.py)$
 -   repo: https://github.com/codespell-project/codespell
-    rev: v2.2.2
+    rev: v2.2.6
     hooks:
     -   id: codespell
         exclude: ^(pydra_ml/_version\.py|versioneer\.py)$
diff --git a/pydra_ml/__init__.py b/pydra_ml/__init__.py
@@ -35,3 +35,7 @@ def set_logger_level(lgr, level):
 set_logger_level(lgr, os.environ.get("PYDRAML_LOG_LEVEL", logging.INFO))
 FORMAT = "%(asctime)-15s [%(levelname)8s] %(message)s"
 logging.basicConfig(format=FORMAT)
+
+from . import _version
+
+__version__ = _version.get_versions()["version"]
diff --git a/pydra_ml/classifier.py b/pydra_ml/classifier.py
@@ -71,7 +71,7 @@ def gen_workflow(inputs, cache_dir=None, cache_locations=None):
         messengers=FileMessenger(),
         messenger_args={"message_dir": os.path.join(os.getcwd(), "messages")},
     )
-    wf.split(["clf_info", "permute"])
+    wf.split(clf_info=inputs["clf_info"], permute=inputs["permute"])
     wf.add(
         read_file_pdt(
             name="readcsv",
@@ -102,7 +102,7 @@ def gen_workflow(inputs, cache_dir=None, cache_locations=None):
             permute=wf.lzin.permute,
         )
     )
-    wf.fit_clf.split("split_index")
+    wf.fit_clf.split(split_index=wf.gensplit.lzout.split_indices)
     wf.add(
         calc_metric_pdt(
             name="metric", output=wf.fit_clf.lzout.output, metrics=wf.lzin.metrics

diff --git a/pydra_ml/report.py b/pydra_ml/report.py
@@ -3,12 +3,15 @@
 import pickle
 import warnings
 
+import matplotlib
 import matplotlib.pyplot as plt
 import numpy as np
 import pandas as pd
 import seaborn as sns
 from sklearn.metrics import accuracy_score, explained_variance_score
 
+matplotlib.use("Agg")
+
 
 def save_obj(obj, path):
     with open(path, "wb") as f:
@@ -37,9 +40,9 @@ def performance_table(df, output_dir, round_decimals=2):
             df_clf = df_clf.reset_index(drop=True)
             df_metric_data_clean[clf] = df_clf
         df_metric_data_clean_median = df_metric_data_clean.median().T
-        df_metric_data_clean.loc[
-            len(df_metric_data_clean)
-        ] = df_metric_data_clean_median
+        df_metric_data_clean.loc[len(df_metric_data_clean)] = (
+            df_metric_data_clean_median
+        )
         df_metric_data_clean.index = list(df_metric_data_clean.index[:-1]) + ["median"]
         df_metric_data_clean.to_csv(
             os.path.join(
@@ -77,9 +80,9 @@ def performance_table(df, output_dir, round_decimals=2):
             )
             if "null" in df_metric.type.unique():
                 null_median = round(df_metric_null_clean_median[clf], 2)
-                df_summary.loc[
-                    0, clf
-                ] = f"{data_median} [{ci_lower}–{ci_upper}; {null_median}]"
+                df_summary.loc[0, clf] = (
+                    f"{data_median} [{ci_lower}–{ci_upper}; {null_median}]"
+                )
             else:
                 df_summary.loc[0, clf] = f"{data_median} [{ci_lower}–{ci_upper}]"
 
@@ -97,9 +100,9 @@ def plot_summary(summary, output_dir=None, filename="shap_plot", plot_top_n_shap
     # plot without all bootstrapping values
     summary = summary[["mean", "std", "min", "max"]]
     num_features = len(list(summary.index))
-    if (plot_top_n_shap != 1 and type(plot_top_n_shap) == float) or type(
+    if (plot_top_n_shap != 1 and type(plot_top_n_shap) is float) or type(
         plot_top_n_shap
-    ) == int:
+    ) is int:
         # if plot_top_n_shap != 1.0 but includes 1 (int)
         if plot_top_n_shap <= 0:
             raise ValueError(
@@ -223,7 +226,7 @@ def gen_report_shap_class(results, output_dir="./", plot_top_n_shap=16):
                         f"""There were no {quadrant.upper()}s, this will output NaNs
                         in the csv and figure for this split column"""
                     )
-                shaps_i_quadrant = shaps_i[
+                shaps_i_quadrant = np.array(shaps_i)[
                     indexes.get(quadrant)
                 ]  # shape (P, F) P prediction x F feature_names
                 abs_weighted_shap_values = np.abs(shaps_i_quadrant) * split_performance
@@ -325,7 +328,7 @@ def gen_report_shap_regres(results, output_dir="./", plot_top_n_shap=16):
                         f"""There were no {quadrant.upper()}s, this will
                         output NaNs in the csv and figure for this split column"""
                     )
-                shaps_i_quadrant = shaps_i[
+                shaps_i_quadrant = np.array(shaps_i)[
                     indexes.get(quadrant)
                 ]  # shape (P, F) P prediction x F feature_names
                 abs_weighted_shap_values = np.abs(shaps_i_quadrant) * split_performance

diff --git a/pydra_ml/tasks.py b/pydra_ml/tasks.py
@@ -1,5 +1,15 @@
 #!/usr/bin/env python
 
+import typing as ty
+
+from pydra.utils.hash import Cache, register_serializer
+from sklearn.pipeline import Pipeline
+
+
+@register_serializer
+def bytes_repr_Pipeline(obj: Pipeline, cache: Cache):
+    yield str(obj).encode()
+
 
 def read_file(filename, x_indices=None, target_vars=None, group=None):
     """Read a CSV data file
@@ -92,7 +102,7 @@ def to_instance(clf_info):
 
     train_index, test_index = train_test_split[split_index]
     y = y.ravel()
-    if type(X[0][0]) == str:
+    if type(X[0][0]) is str:
         # it's loaded as bytes, so we need to decode as utf-8
         X = np.array([str.encode(n[0]).decode("utf-8") for n in X])
     if permute:
@@ -126,7 +136,27 @@ def calc_metric(output, metrics):
     return score, output
 
 
-def get_feature_importance(permute, model, gen_feature_importance=True):
+def get_feature_importance(
+    *,
+    permute: bool,
+    model: ty.Tuple[Pipeline, list, list],
+    gen_feature_importance: bool = True,
+):
+    """Compute feature importance for the model
+
+    Parameters
+    ----------
+    permute : bool
+        Whether or not to run the model in permuted mode
+    model : tuple(sklearn.pipeline.Pipeline, list, list)
+        The model to compute feature importance for
+    gen_feature_importance : bool
+        Whether or not to generate the feature importance
+    Returns
+    -------
+    list
+        List of feature importance
+    """
     if permute or not gen_feature_importance:
         return []
     pipeline, train_index, test_index = model
@@ -172,7 +202,7 @@ def get_feature_importance(permute, model, gen_feature_importance=True):
                 pipeline_steps.coefs_
                 pipeline_steps.coef_
 
-                Please add correct method in tasks.py or if inexistent,
+                Please add correct method in tasks.py or if non-existent,
                 set gen_feature_importance to false in the spec file.
 
                 This is the error that was returned by sklearn:\n\t{e}\n
@@ -224,7 +254,9 @@ def get_shap(X, permute, model, gen_shap=False, nsamples="auto", l1_reg="aic"):
     import shap
 
     explainer = shap.KernelExplainer(pipe.predict, shap.kmeans(X[train_index], 5))
-    shaps = explainer.shap_values(X[test_index], nsamples=nsamples, l1_reg=l1_reg)
+    shaps = explainer.shap_values(
+        X[test_index], nsamples=nsamples, l1_reg=l1_reg, silent=True
+    )
     return shaps
 
 

diff --git a/setup.cfg b/setup.cfg
@@ -26,7 +26,7 @@ classifiers =
 [options]
 python_requires = >= 3.8
 install_requires =
-    pydra == 0.22.0
+    pydra >= 0.23.0-alpha
     psutil
     scikit-learn
     seaborn
@@ -35,11 +35,9 @@ install_requires =
 
 test_requires =
     pytest >= 4.4.0
-    pytest-cov
     pytest-env
     pytest-xdist
     pytest-rerunfailures
-    codecov
 packages = find:
 include_package_data = True
 
@@ -58,11 +56,9 @@ docs =
     %(doc)s
 test =
     pytest >= 4.4.0
-    pytest-cov
     pytest-env
     pytest-xdist
     pytest-rerunfailures
-    codecov
 tests =
     %(test)s
 dev =