Merge branch 'main' of github.com:big-o/skdag into main

scikit-learn-contrib · Jul 31, 2022 · cf94b90 · cf94b90
2 parents 937352b + 80af726
commit cf94b90
Show file tree

Hide file tree

Showing 12 changed files with 109 additions and 110 deletions.
diff --git a/.readthedocs.yml b/.readthedocs.yml
@@ -4,5 +4,5 @@ requirements_file: requirements.txt
 python:
   pip_install: true
   extra_requirements:
-    - tests
-    - docs
+    - test
+    - doc
diff --git a/.travis.yml b/.travis.yml
diff --git a/README.rst b/README.rst
@@ -1,18 +1,12 @@
 .. -*- mode: rst -*-
 
-|Travis|_ |AppVeyor|_ |Codecov|_ |CircleCI|_ |ReadTheDocs|_
+|AppVeyor|_ |Codecov|_ |ReadTheDocs|_
 
-.. |Travis| image:: https://travis-ci.org/scikit-learn-contrib/project-template.svg?branch=master
-.. _Travis: https://travis-ci.org/scikit-learn-contrib/project-template
+.. |AppVeyor| image:: https://ci.appveyor.com/api/projects/status/github/big-o/skdag?branch=main&svg=true
+.. _AppVeyor: https://ci.appveyor.com/project/big-o/skdag
 
-.. |AppVeyor| image:: https://ci.appveyor.com/api/projects/status/coy2qqaqr1rnnt5y/branch/master?svg=true
-.. _AppVeyor: https://ci.appveyor.com/project/glemaitre/project-template
-
-.. |Codecov| image:: https://codecov.io/gh/scikit-learn-contrib/project-template/branch/master/graph/badge.svg
-.. _Codecov: https://codecov.io/gh/scikit-learn-contrib/project-template
-
-.. |CircleCI| image:: https://circleci.com/gh/scikit-learn-contrib/project-template.svg?style=shield&circle-token=:circle-token
-.. _CircleCI: https://circleci.com/gh/scikit-learn-contrib/project-template/tree/master
+.. |Codecov| image:: https://codecov.io/gh/big-o/skdag/branch/main/graph/badge.svg
+.. _Codecov: https://codecov.io/gh/big-o/skdag
 
 .. |ReadTheDocs| image:: https://readthedocs.org/projects/skdag/badge/?version=latest
 .. _ReadTheDocs: https://skdag.readthedocs.io/en/latest/?badge=latest

diff --git a/appveyor.yml b/appveyor.yml
@@ -2,36 +2,33 @@ build: false
 
 environment:
   matrix:
-    - PYTHON: "C:\\Miniconda3-x64"
-      PYTHON_VERSION: "3.8.x"
-      PYTHON_ARCH: "64"
-      NUMPY_VERSION: "*"
-      SCIPY_VERSION: "*"
-      SKLEARN_VERSION: "*"
+    - APPVEYOR_BUILD_WORKER_IMAGE: Ubuntu
+      APPVEYOR_YML_DISABLE_PS_LINUX: true
 
-    - PYTHON: "C:\\Miniconda3-x64"
-      PYTHON_VERSION: "3.9.x"
-      PYTHON_ARCH: "64"
-      NUMPY_VERSION: "*"
-      SCIPY_VERSION: "*"
-      SKLEARN_VERSION: "*"
+stack: python 3.8
 
-install:
-  # Prepend miniconda installed Python to the PATH of this build
-  # Add Library/bin directory to fix issue
-  # https://github.com/conda/conda/issues/1753
-  - "SET PATH=%PYTHON%;%PYTHON%\\Scripts;%PYTHON%\\Library\\bin;%PATH%"
-  # install the dependencies
-  - "conda install --yes -c conda-forge pip numpy==%NUMPY_VERSION% scipy==%SCIPY_VERSION% scikit-learn==%SKLEARN_VERSION%"
-  - pip install codecov nose pytest pytest-cov
-  - pip install .
+install: |
+  if [[ "${APPVEYOR_BUILD_WORKER_IMAGE}" == "Ubuntu" ]]; then
+    sudo apt update
+    sudo apt install -y graphviz libgraphviz-dev
+  elif [[ "${APPVEYOR_BUILD_WORKER_IMAGE}" == "macOS" ]]; then
+    brew update
+    brew install graphviz
+  fi
+  pip install --upgrade pip
+  pip install -r requirements.txt
+  pip install -r requirements_test.txt
+  pip install -r requirements_doc.txt
+  pip install .
 
 test_script:
   - mkdir for_test
   - cd for_test
   - pytest -v --cov=skdag --pyargs skdag
 
 after_test:
-  - cp .coverage %APPVEYOR_BUILD_FOLDER%
-  - cd %APPVEYOR_BUILD_FOLDER%
-  - codecov
+  - cp .coverage ${APPVEYOR_BUILD_FOLDER}
+  - cd ${APPVEYOR_BUILD_FOLDER}
+  - curl -Os https://uploader.codecov.io/latest/linux/codecov
+  - chmod +x codecov
+  - ./codecov
diff --git a/doc/quick_start.rst b/doc/quick_start.rst
@@ -5,6 +5,22 @@ Quick Start with skdag
 The following tutorial shows you how to write some simple directed acyclic graphs (DAGs)
 with ``skdag``.
 
+Installation
+============
+
+Installing skdag is simple:
+
+.. code:: bash
+
+    pip install skdag
+
+Note that to visualise graphs you need to install the graphviz libraries too. Here's how
+to do this on Ubuntu:
+
+.. code:: bash
+
+    sudo apt install graphviz graphviz-dev
+
 Creating your own scikit-learn contribution package
 ===================================================
 

diff --git a/doc/user_guide.rst b/doc/user_guide.rst
@@ -163,6 +163,10 @@ the next step(s).
 Note that the passthrough is not strictly necessary but it is convenient as it ensures
 the stack has a single entry point, which makes it simpler to use.
 
+The DAG infers that :meth:`predict` should be called for the two intermediate
+estimators. Our meta-estimator then simply takes the predictions of each classifier
+as its input features.
+
 As we can now see, the stacking ensemble method gives us a boost in performance:
 
 .. code-block:: python
@@ -174,6 +178,25 @@ As we can now see, the stacking ensemble method gives us a boost in performance:
     >>> svr.score(X_test, y_test)
     0.128...
 
+Note that for binary classifiers you probably need to specify that only the positive
+class probability is used as input by the meta-estimator. The DAG will automatically
+infer that :meth:`predict_proba` should be called, but you will need to manually tell
+the DAG which column to take. To do this, you can simply specify your step dependencies
+as a dictionary of step name to column indices instead:
+
+.. code:: python
+
+    >>> from sklearn.ensemble import RandomForestClassifier
+    >>> from sklearn.svm import SVC
+    >>> clf_stack = (
+    ...     DAGBuilder()
+    ...     .add_step("pass", "passthrough")
+    ...     .add_step("rf", RandomForestClassifier(), deps=["pass"])
+    ...     .add_step("svc", SVC(probability=True), deps=["pass"])
+    ...     .add_step("meta", LinearRegression(), deps={"rf": 1, "svc": 1})
+    ...     .make_dag()
+    ... )
+
 Stacking works best when a diverse range of algorithms are used to provide predictions,
 which are then fed into a very simple meta-estimator. To minimize overfitting,
 cross-validation should be considered when using stacking.
diff --git a/requirements.txt b/requirements.txt
@@ -2,6 +2,7 @@ black
 joblib
 networkx>=2.6
 numpy
+pygraphviz
 scipy
 scikit-learn
 stackeddag
diff --git a/requirements_doc.txt b/requirements_doc.txt
@@ -0,0 +1,5 @@
+matplotlib
+numpydoc
+sphinx
+sphinx-gallery
+sphinx_rtd_theme
diff --git a/requirements_test.txt b/requirements_test.txt
@@ -0,0 +1,2 @@
+pytest
+pytest-cov
diff --git a/setup.py b/setup.py
@@ -54,8 +54,8 @@ def parse_requirements(filename):
     "Programming Language :: Python :: 3.9",
 ]
 EXTRAS_REQUIRE = {
-    "tests": ["pytest", "pytest-cov"],
-    "docs": ["sphinx", "sphinx-gallery", "sphinx_rtd_theme", "numpydoc", "matplotlib"],
+    tgt: parse_requirements(f"requirements_{tgt}.txt")
+    for tgt in ["test", "doc"]
 }
 
 setup(

diff --git a/skdag/dag/_dag.py b/skdag/dag/_dag.py
@@ -28,6 +28,19 @@
 __all__ = ["DAG", "DAGStep"]
 
 
+def _stack_inputs(X, node):
+    cols = [_safe_indexing(X[dep], node.deps[dep], axis=1) for dep in node.deps]
+    X_stacked = _stack(
+        [
+            col.reshape(-1, 1) if col is not None and col.ndim < 2 else col
+            for col in cols
+        ],
+        axis=node.axis,
+    )
+
+    return X_stacked
+
+
 def _transform_one(transformer, X, weight, allow_predictor=True, **fit_params):
     if _is_passthrough(transformer):
         res = X
@@ -74,10 +87,12 @@ def _fit_transform_one(
         elif hasattr(transformer, "transform"):
             res = transformer.fit(X, y, **fit_params).transform(X)
         elif allow_predictor:
-            if hasattr(transformer, "fit_predict"):
-                res = transformer.fit_predict(X, y, **fit_params)
-            elif hasattr(transformer, "predict"):
-                res = transformer.fit(X, y, **fit_params).predict(X)
+            for fn in ["predict_proba", "decision_function", "predict"]:
+                if hasattr(transformer, fn):
+                    res = getattr(transformer.fit(X, y, **fit_params), fn)(X)
+                    if res.ndim < 2:
+                        res = res.reshape(-1, 1)
+                    break
             else:
                 failed = True
                 res = None
@@ -128,10 +143,7 @@ def _parallel_fit(dag, step, Xin, Xs, y, fit_transform_fn, memory, **fit_params)
     transformer = step.estimator
 
     if step.deps:
-        X = _stack(
-            [_safe_indexing(Xs[dep], step.deps[dep], axis=1) for dep in step.deps],
-            axis=step.axis,
-        )
+        X = _stack_inputs(Xs, step)
     else:
         # For root nodes, the destination rather than the source is
         # specified.
@@ -166,10 +178,7 @@ def _parallel_fit(dag, step, Xin, Xs, y, fit_transform_fn, memory, **fit_params)
 def _parallel_transform(dag, step, Xin, Xs, transform_fn, **fn_params):
     transformer = step.estimator
     if step.deps:
-        X = _stack(
-            [_safe_indexing(Xs[dep], step.deps[dep], axis=1) for dep in step.deps],
-            axis=step.axis,
-        )
+        X = _stack_inputs(Xs, step)
     else:
         # For root nodes, the destination rather than the source is
         # specified.
@@ -198,7 +207,7 @@ def _parallel_fit_leaf(dag, leaf, Xts, y, **fit_params):
         if leaf.estimator == "passthrough":
             fitted_estimator = leaf.estimator
         else:
-            Xt = _stack([Xts[dep] for dep in leaf.deps], axis=leaf.axis)
+            Xt = _stack_inputs(Xts, leaf)
             fitted_estimator = leaf.estimator.fit(Xt, y, **fit_params)
 
     return fitted_estimator
@@ -208,10 +217,7 @@ def _parallel_execute(
     dag, leaf, fn, Xts, y=None, fit_first=False, fit_params=None, fn_params=None
 ):
     with _print_elapsed_time("DAG", dag._log_message(leaf)):
-        Xt = _stack(
-            [_safe_indexing(Xts[dep], leaf.deps[dep], axis=1) for dep in leaf.deps],
-            axis=leaf.axis,
-        )
+        Xt = _stack_inputs(Xts, leaf)
         fit_params = fit_params or {}
         fn_params = fn_params or {}
         if leaf.estimator == "passthrough":
@@ -1266,7 +1272,7 @@ def draw(
                 from IPython import get_ipython
 
                 rich = type(get_ipython()).__name__ == "ZMQInteractiveShell"
-            except NameError:
+            except (ModuleNotFoundError, NameError):
                 rich = False
 
             format = "svg" if rich else "txt"

diff --git a/skdag/dag/tests/test_dag.py b/skdag/dag/tests/test_dag.py
@@ -17,10 +17,11 @@
 from sklearn.linear_model import LinearRegression
 from sklearn.feature_selection import SelectKBest, f_classif
 from sklearn.decomposition import PCA
-from sklearn.datasets import load_iris
+from sklearn import datasets
 from sklearn.ensemble import RandomForestClassifier
 
-iris = load_iris()
+iris = datasets.load_iris()
+cancer = datasets.load_breast_cancer()
 
 JUNK_FOOD_DOCS = (
     "the pizza pizza beer copyright",
@@ -221,8 +222,8 @@ def test_dag_raise_set_params_error():
 
 def test_dag_stacking_pca_svm_rf():
     # Test the various methods of the pipeline (pca + svm).
-    X = iris.data
-    y = iris.target
+    X = cancer.data
+    y = cancer.target
     # Build a simple model stack with some preprocessing.
     pca = PCA(svd_solver="full", n_components="mle", whiten=True)
     svc = SVC(probability=True, random_state=0)
@@ -234,13 +235,13 @@ def test_dag_stacking_pca_svm_rf():
         .add_step("pca", pca)
         .add_step("svc", svc, deps=["pca"])
         .add_step("rf", rf, deps=["pca"])
-        .add_step("log", log, deps=["svc", "rf"])
+        .add_step("log", log, deps={"svc": 1, "rf": 1})
         .make_dag()
     )
     dag.fit(X, y)
 
-    prob_shape = len(iris.target), len(iris.target_names)
-    tgt_shape = iris.target.shape
+    prob_shape = len(cancer.target), len(cancer.target_names)
+    tgt_shape = cancer.target.shape
 
     assert dag.predict_proba(X).shape == prob_shape
     assert dag.predict(X).shape == tgt_shape
-Original file line number
+Diff line change
@@ Expand Up / @@ -4,5 +4,5 @@ requirements_file: requirements.txt @@
     python:
       pip_install: true
       extra_requirements:
-        - tests
-        - docs
+        - test
+        - doc