From fdf2d5efe09bbb4fcc203a1e8232e9ac59c75896 Mon Sep 17 00:00:00 2001 From: Satrajit Ghosh Date: Sun, 22 Oct 2023 10:29:37 -0400 Subject: [PATCH 01/17] initial attempt to work with pydra 0.23+ --- pydra_ml/classifier.py | 4 ++-- pydra_ml/tasks.py | 33 +++++++++++++++++++++++++++++++-- setup.cfg | 2 +- 3 files changed, 34 insertions(+), 5 deletions(-) diff --git a/pydra_ml/classifier.py b/pydra_ml/classifier.py index 0d554bf..b2a50c0 100644 --- a/pydra_ml/classifier.py +++ b/pydra_ml/classifier.py @@ -71,7 +71,7 @@ def gen_workflow(inputs, cache_dir=None, cache_locations=None): messengers=FileMessenger(), messenger_args={"message_dir": os.path.join(os.getcwd(), "messages")}, ) - wf.split(["clf_info", "permute"]) + wf.split(clf_info=inputs["clf_info"], permute=inputs["permute"]) wf.add( read_file_pdt( name="readcsv", @@ -102,7 +102,7 @@ def gen_workflow(inputs, cache_dir=None, cache_locations=None): permute=wf.lzin.permute, ) ) - wf.fit_clf.split("split_index") + wf.fit_clf.split(split_index=wf.gensplit.lzout.split_indices) wf.add( calc_metric_pdt( name="metric", output=wf.fit_clf.lzout.output, metrics=wf.lzin.metrics diff --git a/pydra_ml/tasks.py b/pydra_ml/tasks.py index 54689a6..1584802 100644 --- a/pydra_ml/tasks.py +++ b/pydra_ml/tasks.py @@ -1,5 +1,14 @@ #!/usr/bin/env python +import cloudpickle as cp +from pydra.utils.hash import Cache, register_serializer +from sklearn.pipeline import Pipeline + + +@register_serializer +def bytes_repr_Pipeline(obj: Pipeline, cache: Cache): + yield cp.dump(obj) + def read_file(filename, x_indices=None, target_vars=None, group=None): """Read a CSV data file @@ -126,7 +135,27 @@ def calc_metric(output, metrics): return score, output -def get_feature_importance(permute, model, gen_feature_importance=True): +def get_feature_importance( + *, + permute: bool, + model: tuple[Pipeline, list, list], + gen_feature_importance: bool = True, +): + """Compute feature importance for the model + + Parameters + ---------- + permute : bool + Whether or not to run the model in permuted mode + model : tuple(sklearn.pipeline.Pipeline, list, list) + The model to compute feature importance for + gen_feature_importance : bool + Whether or not to generate the feature importance + Returns + ------- + list + List of feature importance + """ if permute or not gen_feature_importance: return [] pipeline, train_index, test_index = model @@ -172,7 +201,7 @@ def get_feature_importance(permute, model, gen_feature_importance=True): pipeline_steps.coefs_ pipeline_steps.coef_ - Please add correct method in tasks.py or if inexistent, + Please add correct method in tasks.py or if non-existent, set gen_feature_importance to false in the spec file. This is the error that was returned by sklearn:\n\t{e}\n diff --git a/setup.cfg b/setup.cfg index 8feaf4d..8a3a5f5 100644 --- a/setup.cfg +++ b/setup.cfg @@ -26,7 +26,7 @@ classifiers = [options] python_requires = >= 3.8 install_requires = - pydra == 0.22.0 + pydra >= 0.23.0-alpha psutil scikit-learn seaborn From e102a9a40ecdff38a9d9f1d3f7206a0c755f9a40 Mon Sep 17 00:00:00 2001 From: Satrajit Ghosh Date: Sun, 22 Oct 2023 10:37:32 -0400 Subject: [PATCH 02/17] switch back to dumps --- pydra_ml/tasks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pydra_ml/tasks.py b/pydra_ml/tasks.py index 1584802..64678c7 100644 --- a/pydra_ml/tasks.py +++ b/pydra_ml/tasks.py @@ -7,7 +7,7 @@ @register_serializer def bytes_repr_Pipeline(obj: Pipeline, cache: Cache): - yield cp.dump(obj) + yield cp.dumps(obj) def read_file(filename, x_indices=None, target_vars=None, group=None): From d6bbd3972da3d55c1e606e64847697018c128b79 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 19 Feb 2024 16:56:09 +0000 Subject: [PATCH 03/17] [pre-commit.ci] pre-commit autoupdate MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updates: - [github.com/pre-commit/pre-commit-hooks: v4.4.0 → v4.5.0](https://github.com/pre-commit/pre-commit-hooks/compare/v4.4.0...v4.5.0) - [github.com/psf/black: 23.1.0 → 24.2.0](https://github.com/psf/black/compare/23.1.0...24.2.0) - [github.com/PyCQA/flake8: 6.0.0 → 7.0.0](https://github.com/PyCQA/flake8/compare/6.0.0...7.0.0) - [github.com/PyCQA/isort: 5.12.0 → 5.13.2](https://github.com/PyCQA/isort/compare/5.12.0...5.13.2) - [github.com/codespell-project/codespell: v2.2.2 → v2.2.6](https://github.com/codespell-project/codespell/compare/v2.2.2...v2.2.6) --- .pre-commit-config.yaml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 11f542b..ab94d51 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -5,29 +5,29 @@ ci: repos: - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v4.4.0 + rev: v4.5.0 hooks: - id: check-added-large-files - id: check-yaml - id: end-of-file-fixer - id: trailing-whitespace - repo: https://github.com/psf/black - rev: 23.1.0 + rev: 24.2.0 hooks: - id: black - repo: https://github.com/PyCQA/flake8 - rev: 6.0.0 + rev: 7.0.0 hooks: - id: flake8 exclude: ^(pydra_ml/_version\.py|versioneer\.py)$ - repo: https://github.com/PyCQA/isort - rev: 5.12.0 + rev: 5.13.2 hooks: - id: isort args: ["--profile", "black"] exclude: ^(pydra_ml/_version\.py|versioneer\.py)$ - repo: https://github.com/codespell-project/codespell - rev: v2.2.2 + rev: v2.2.6 hooks: - id: codespell exclude: ^(pydra_ml/_version\.py|versioneer\.py)$ From 39f9e93af3fc539af16dce7ad984d58c775eddd2 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 19 Feb 2024 16:56:34 +0000 Subject: [PATCH 04/17] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- pydra_ml/report.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/pydra_ml/report.py b/pydra_ml/report.py index 603b130..cf19823 100644 --- a/pydra_ml/report.py +++ b/pydra_ml/report.py @@ -37,9 +37,9 @@ def performance_table(df, output_dir, round_decimals=2): df_clf = df_clf.reset_index(drop=True) df_metric_data_clean[clf] = df_clf df_metric_data_clean_median = df_metric_data_clean.median().T - df_metric_data_clean.loc[ - len(df_metric_data_clean) - ] = df_metric_data_clean_median + df_metric_data_clean.loc[len(df_metric_data_clean)] = ( + df_metric_data_clean_median + ) df_metric_data_clean.index = list(df_metric_data_clean.index[:-1]) + ["median"] df_metric_data_clean.to_csv( os.path.join( @@ -77,9 +77,9 @@ def performance_table(df, output_dir, round_decimals=2): ) if "null" in df_metric.type.unique(): null_median = round(df_metric_null_clean_median[clf], 2) - df_summary.loc[ - 0, clf - ] = f"{data_median} [{ci_lower}–{ci_upper}; {null_median}]" + df_summary.loc[0, clf] = ( + f"{data_median} [{ci_lower}–{ci_upper}; {null_median}]" + ) else: df_summary.loc[0, clf] = f"{data_median} [{ci_lower}–{ci_upper}]" From ee7ed6fc254441197ab33f98829063631c61d240 Mon Sep 17 00:00:00 2001 From: Tom Close Date: Thu, 22 Feb 2024 16:52:32 +1100 Subject: [PATCH 05/17] added *.venv and /messages to gitignore --- .gitignore | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/.gitignore b/.gitignore index cc07538..574b478 100644 --- a/.gitignore +++ b/.gitignore @@ -133,3 +133,9 @@ dmypy.json # pycharm .idea/ + +# Venvs +*.venv + +# Generated messages +/messages From 1321a64591d15dfd9b325683a3ca0a5254553e6f Mon Sep 17 00:00:00 2001 From: Tom Close Date: Sat, 24 Feb 2024 20:33:14 +1100 Subject: [PATCH 06/17] __str__ based bytes_repr for Pipeline objects --- pydra_ml/report.py | 4 ++-- pydra_ml/tasks.py | 3 +-- pydra_ml/tests/test_classifier.py | 3 +-- 3 files changed, 4 insertions(+), 6 deletions(-) diff --git a/pydra_ml/report.py b/pydra_ml/report.py index 603b130..d292854 100644 --- a/pydra_ml/report.py +++ b/pydra_ml/report.py @@ -223,7 +223,7 @@ def gen_report_shap_class(results, output_dir="./", plot_top_n_shap=16): f"""There were no {quadrant.upper()}s, this will output NaNs in the csv and figure for this split column""" ) - shaps_i_quadrant = shaps_i[ + shaps_i_quadrant = np.array(shaps_i)[ indexes.get(quadrant) ] # shape (P, F) P prediction x F feature_names abs_weighted_shap_values = np.abs(shaps_i_quadrant) * split_performance @@ -325,7 +325,7 @@ def gen_report_shap_regres(results, output_dir="./", plot_top_n_shap=16): f"""There were no {quadrant.upper()}s, this will output NaNs in the csv and figure for this split column""" ) - shaps_i_quadrant = shaps_i[ + shaps_i_quadrant = np.array(shaps_i)[ indexes.get(quadrant) ] # shape (P, F) P prediction x F feature_names abs_weighted_shap_values = np.abs(shaps_i_quadrant) * split_performance diff --git a/pydra_ml/tasks.py b/pydra_ml/tasks.py index 64678c7..b0ce693 100644 --- a/pydra_ml/tasks.py +++ b/pydra_ml/tasks.py @@ -1,13 +1,12 @@ #!/usr/bin/env python -import cloudpickle as cp from pydra.utils.hash import Cache, register_serializer from sklearn.pipeline import Pipeline @register_serializer def bytes_repr_Pipeline(obj: Pipeline, cache: Cache): - yield cp.dumps(obj) + yield str(obj).encode() def read_file(filename, x_indices=None, target_vars=None, group=None): diff --git a/pydra_ml/tests/test_classifier.py b/pydra_ml/tests/test_classifier.py index 432809f..714fa7e 100644 --- a/pydra_ml/tests/test_classifier.py +++ b/pydra_ml/tests/test_classifier.py @@ -1,5 +1,4 @@ import os - import numpy as np from ..classifier import gen_workflow, run_workflow @@ -35,7 +34,7 @@ def test_classifier(tmpdir): "metrics": ["roc_auc_score", "accuracy_score"], } wf = gen_workflow(inputs, cache_dir=tmpdir) - results = run_workflow(wf, "cf", {"n_procs": 1}) + results = run_workflow(wf, "serial", {"n_procs": 1}) assert results[0][0]["ml_wf.clf_info"][1] == "MLPClassifier" assert results[0][0]["ml_wf.permute"] assert results[0][1].output.score[0][0] < results[1][1].output.score[0][0] From 58f1cea21a8c4ddf78642c5437e656e56742dba8 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sat, 24 Feb 2024 09:38:32 +0000 Subject: [PATCH 07/17] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- pydra_ml/tests/test_classifier.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pydra_ml/tests/test_classifier.py b/pydra_ml/tests/test_classifier.py index 714fa7e..1bc4182 100644 --- a/pydra_ml/tests/test_classifier.py +++ b/pydra_ml/tests/test_classifier.py @@ -1,4 +1,5 @@ import os + import numpy as np from ..classifier import gen_workflow, run_workflow From 7611491ea2cfcc737d4b953823dbceceb1f109bb Mon Sep 17 00:00:00 2001 From: Satrajit Ghosh Date: Sat, 24 Feb 2024 12:20:42 -0500 Subject: [PATCH 08/17] fix: use typing module for Tuple --- pydra_ml/tasks.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pydra_ml/tasks.py b/pydra_ml/tasks.py index b0ce693..2491d4e 100644 --- a/pydra_ml/tasks.py +++ b/pydra_ml/tasks.py @@ -1,5 +1,7 @@ #!/usr/bin/env python +import typing as ty + from pydra.utils.hash import Cache, register_serializer from sklearn.pipeline import Pipeline @@ -100,7 +102,7 @@ def to_instance(clf_info): train_index, test_index = train_test_split[split_index] y = y.ravel() - if type(X[0][0]) == str: + if type(X[0][0]) is str: # it's loaded as bytes, so we need to decode as utf-8 X = np.array([str.encode(n[0]).decode("utf-8") for n in X]) if permute: @@ -137,7 +139,7 @@ def calc_metric(output, metrics): def get_feature_importance( *, permute: bool, - model: tuple[Pipeline, list, list], + model: ty.Tuple[Pipeline, list, list], gen_feature_importance: bool = True, ): """Compute feature importance for the model From cf8d57230926a9887a671ba97b06fd4d484d392a Mon Sep 17 00:00:00 2001 From: Satrajit Ghosh Date: Sat, 24 Feb 2024 12:23:09 -0500 Subject: [PATCH 09/17] fix: type comparisons --- pydra_ml/report.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pydra_ml/report.py b/pydra_ml/report.py index e6a4b5b..ec58ae9 100644 --- a/pydra_ml/report.py +++ b/pydra_ml/report.py @@ -97,9 +97,9 @@ def plot_summary(summary, output_dir=None, filename="shap_plot", plot_top_n_shap # plot without all bootstrapping values summary = summary[["mean", "std", "min", "max"]] num_features = len(list(summary.index)) - if (plot_top_n_shap != 1 and type(plot_top_n_shap) == float) or type( + if (plot_top_n_shap != 1 and type(plot_top_n_shap) is float) or type( plot_top_n_shap - ) == int: + ) is int: # if plot_top_n_shap != 1.0 but includes 1 (int) if plot_top_n_shap <= 0: raise ValueError( From a3722fc193dd982ff0c5c4a21b6fa58cfae1c318 Mon Sep 17 00:00:00 2001 From: Satrajit Ghosh Date: Sat, 24 Feb 2024 12:48:00 -0500 Subject: [PATCH 10/17] Update ci.yml update action versions --- .github/workflows/ci.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index f82311d..5b68aa6 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -22,11 +22,11 @@ jobs: - "3.11" steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v4.1.1 with: # no need for the history fetch-depth: 1 - name: Set up Python ${{ matrix.python }} - uses: actions/setup-python@v4 + uses: actions/setup-python@v5.0.0 with: python-version: ${{ matrix.python }} From 40606de721ceb0fb23eb653fa14f371626babc80 Mon Sep 17 00:00:00 2001 From: Satrajit Ghosh Date: Sat, 24 Feb 2024 15:46:56 -0500 Subject: [PATCH 11/17] enable output --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 5b68aa6..1130be7 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -36,4 +36,4 @@ jobs: pip install --upgrade --force-reinstall --no-cache-dir --editable=".[test]" - name: Test with pytest run: | - pytest pydra_ml/tests/test_classifier.py + pytest -sv pydra_ml/tests/test_classifier.py From f17eca351d186399d5df756c92bb7925edb3d19d Mon Sep 17 00:00:00 2001 From: Satrajit Ghosh Date: Sat, 24 Feb 2024 15:59:35 -0500 Subject: [PATCH 12/17] switch to non-interactive outputs --- .github/workflows/ci.yml | 2 +- pydra_ml/report.py | 3 +++ pydra_ml/tasks.py | 4 +++- 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 1130be7..5b68aa6 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -36,4 +36,4 @@ jobs: pip install --upgrade --force-reinstall --no-cache-dir --editable=".[test]" - name: Test with pytest run: | - pytest -sv pydra_ml/tests/test_classifier.py + pytest pydra_ml/tests/test_classifier.py diff --git a/pydra_ml/report.py b/pydra_ml/report.py index ec58ae9..2d74c8d 100644 --- a/pydra_ml/report.py +++ b/pydra_ml/report.py @@ -3,12 +3,15 @@ import pickle import warnings +import matplotlib import matplotlib.pyplot as plt import numpy as np import pandas as pd import seaborn as sns from sklearn.metrics import accuracy_score, explained_variance_score +matplotlib.use("Agg") + def save_obj(obj, path): with open(path, "wb") as f: diff --git a/pydra_ml/tasks.py b/pydra_ml/tasks.py index 2491d4e..27b17f7 100644 --- a/pydra_ml/tasks.py +++ b/pydra_ml/tasks.py @@ -254,7 +254,9 @@ def get_shap(X, permute, model, gen_shap=False, nsamples="auto", l1_reg="aic"): import shap explainer = shap.KernelExplainer(pipe.predict, shap.kmeans(X[train_index], 5)) - shaps = explainer.shap_values(X[test_index], nsamples=nsamples, l1_reg=l1_reg) + shaps = explainer.shap_values( + X[test_index], nsamples=nsamples, l1_reg=l1_reg, silent=True + ) return shaps From ed454535883e3f2190955bd000cfc09cdf72ad97 Mon Sep 17 00:00:00 2001 From: Satrajit Ghosh Date: Sat, 24 Feb 2024 16:02:06 -0500 Subject: [PATCH 13/17] enable pytest flags again --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 5b68aa6..1130be7 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -36,4 +36,4 @@ jobs: pip install --upgrade --force-reinstall --no-cache-dir --editable=".[test]" - name: Test with pytest run: | - pytest pydra_ml/tests/test_classifier.py + pytest -sv pydra_ml/tests/test_classifier.py From b026d7ac5670953c084be7e8f2eda711613ad037 Mon Sep 17 00:00:00 2001 From: Satrajit Ghosh Date: Sat, 24 Feb 2024 16:04:41 -0500 Subject: [PATCH 14/17] switch to concurrent future --- pydra_ml/tests/test_classifier.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pydra_ml/tests/test_classifier.py b/pydra_ml/tests/test_classifier.py index 1bc4182..432809f 100644 --- a/pydra_ml/tests/test_classifier.py +++ b/pydra_ml/tests/test_classifier.py @@ -35,7 +35,7 @@ def test_classifier(tmpdir): "metrics": ["roc_auc_score", "accuracy_score"], } wf = gen_workflow(inputs, cache_dir=tmpdir) - results = run_workflow(wf, "serial", {"n_procs": 1}) + results = run_workflow(wf, "cf", {"n_procs": 1}) assert results[0][0]["ml_wf.clf_info"][1] == "MLPClassifier" assert results[0][0]["ml_wf.permute"] assert results[0][1].output.score[0][0] < results[1][1].output.score[0][0] From 143fb693435f0b69d3aea96a9e4b23bbc7296fc8 Mon Sep 17 00:00:00 2001 From: Satrajit Ghosh Date: Sat, 24 Feb 2024 16:09:17 -0500 Subject: [PATCH 15/17] versioneer updates --- .github/workflows/ci.yml | 2 +- pydra_ml/__init__.py | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 1130be7..5b68aa6 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -36,4 +36,4 @@ jobs: pip install --upgrade --force-reinstall --no-cache-dir --editable=".[test]" - name: Test with pytest run: | - pytest -sv pydra_ml/tests/test_classifier.py + pytest pydra_ml/tests/test_classifier.py diff --git a/pydra_ml/__init__.py b/pydra_ml/__init__.py index 9224336..f6a0da4 100644 --- a/pydra_ml/__init__.py +++ b/pydra_ml/__init__.py @@ -35,3 +35,7 @@ def set_logger_level(lgr, level): set_logger_level(lgr, os.environ.get("PYDRAML_LOG_LEVEL", logging.INFO)) FORMAT = "%(asctime)-15s [%(levelname)8s] %(message)s" logging.basicConfig(format=FORMAT) + +from . import _version + +__version__ = _version.get_versions()["version"] From 1e75e4eea4cbc599e93b2cff1a991c591f5adbc2 Mon Sep 17 00:00:00 2001 From: Satrajit Ghosh Date: Sat, 24 Feb 2024 16:09:56 -0500 Subject: [PATCH 16/17] remove coverage --- setup.cfg | 2 -- 1 file changed, 2 deletions(-) diff --git a/setup.cfg b/setup.cfg index 8a3a5f5..d15b5cc 100644 --- a/setup.cfg +++ b/setup.cfg @@ -35,11 +35,9 @@ install_requires = test_requires = pytest >= 4.4.0 - pytest-cov pytest-env pytest-xdist pytest-rerunfailures - codecov packages = find: include_package_data = True From 0f3df04fd1e36d4e8f139fd62cbe47853e3669a2 Mon Sep 17 00:00:00 2001 From: Satrajit Ghosh Date: Sat, 24 Feb 2024 16:11:01 -0500 Subject: [PATCH 17/17] remove coverage --- setup.cfg | 2 -- 1 file changed, 2 deletions(-) diff --git a/setup.cfg b/setup.cfg index d15b5cc..2aaf11d 100644 --- a/setup.cfg +++ b/setup.cfg @@ -56,11 +56,9 @@ docs = %(doc)s test = pytest >= 4.4.0 - pytest-cov pytest-env pytest-xdist pytest-rerunfailures - codecov tests = %(test)s dev =