TST fix all tests

scikit-learn-contrib · May 1, 2024 · eba14ba · eba14ba
1 parent 360c560
commit eba14ba
Show file tree

Hide file tree

Showing 6 changed files with 140 additions and 50 deletions.
diff --git a/pixi.lock b/pixi.lock
diff --git a/pixi.toml b/pixi.toml
@@ -11,7 +11,7 @@ readme = "README.md"
 
 [dependencies]
 python = "*"
-scikit-learn = "*"
+scikit-learn = ">=1.4.2"
 
 [pypi-dependencies]
 skltemplate = { path=".", editable=true }

diff --git a/pyproject.toml b/pyproject.toml
@@ -8,7 +8,7 @@ authors = [
 description = "A template for scikit-learn compatible packages."
 readme = "README.md"
 dependencies = [
-  "scikit-learn>=1.0.2",
+  "scikit-learn>=1.4.2",
 ]
 classifiers = [
     "Programming Language :: Python :: 3",
@@ -69,4 +69,8 @@ ignore=[
 # folder.
 "examples/*"=["E402"]
 "doc/conf.py"=["E402"]
-"doc/_templates/numpydoc_docstring.py"=["F821", "W292"]
+"doc/_templates/numpydoc_docstring.py"=["F821", "W292"]
+
+[tool.pytest.ini_options]
+addopts = "--doctest-modules --color=yes"
+doctest_optionflags = "NORMALIZE_WHITESPACE"
diff --git a/skltemplate/_template.py b/skltemplate/_template.py
@@ -2,10 +2,10 @@
 This is a module to be used as a reference for building other modules
 """
 import numpy as np
-from sklearn.base import BaseEstimator, ClassifierMixin, TransformerMixin
+from sklearn.base import BaseEstimator, ClassifierMixin, TransformerMixin, _fit_context
 from sklearn.metrics import euclidean_distances
-from sklearn.utils.multiclass import unique_labels
-from sklearn.utils.validation import check_array, check_is_fitted, check_X_y
+from sklearn.utils.multiclass import check_classification_targets
+from sklearn.utils.validation import  check_is_fitted
 
 
 class TemplateEstimator(BaseEstimator):
@@ -17,7 +17,19 @@ class TemplateEstimator(BaseEstimator):
     Parameters
     ----------
     demo_param : str, default='demo_param'
-        A parameter used for demonstation of how to pass and store paramters.
+        A parameter used for demonstration of how to pass and store parameters.
+
+    Attributes
+    ----------
+    is_fitted_ : bool
+        A boolean indicating whether the estimator has been fitted.
+
+    n_features_in_ : int
+        Number of features seen during :term:`fit`.
+
+    feature_names_in_ : ndarray of shape (`n_features_in_`,)
+        Names of features seen during :term:`fit`. Defined only when `X`
+        has feature names that are all strings.
 
     Examples
     --------
@@ -30,16 +42,24 @@ class TemplateEstimator(BaseEstimator):
     TemplateEstimator()
     """
 
+    # This dictionary defines the accepted type(s) of each constructor parameter.
+    # It is used to validate the parameters within the `_fit_context` decorator.
+    _parameter_constraints = {
+        "demo_param": [str],
+    }
+
     def __init__(self, demo_param="demo_param"):
         self.demo_param = demo_param
 
+    @_fit_context(prefer_skip_nested_validation=True)
     def fit(self, X, y):
         """A reference implementation of a fitting function.
 
         Parameters
         ----------
         X : {array-like, sparse matrix}, shape (n_samples, n_features)
             The training input samples.
+
         y : array-like, shape (n_samples,) or (n_samples, n_outputs)
             The target values (class labels in classification, real numbers in
             regression).
@@ -49,7 +69,12 @@ def fit(self, X, y):
         self : object
             Returns self.
         """
-        X, y = check_X_y(X, y, accept_sparse=True)
+        # `_validate_data` is defined in the `BaseEstimator` class.
+        # It allows us to:
+        # - run different checks on the input data;
+        # - define some attributes associated with the input data: `n_features_in_`
+        #   and `feature_names_in_`.
+        X, y = self._validate_data(X, y, accept_sparse=True)
         self.is_fitted_ = True
         # `fit` should always return `self`
         return self
@@ -67,11 +92,16 @@ def predict(self, X):
         y : ndarray, shape (n_samples,)
             Returns an array of ones.
         """
-        X = check_array(X, accept_sparse=True)
-        check_is_fitted(self, "is_fitted_")
+        # Check if fit has been called
+        check_is_fitted(self)
+        # We need to set reset=False because we don't want to overwrite `n_features_in_`
+        # and `feature_names_in_` but only check that the shape is consistent.
+        X = self._validate_data(X, accept_sparse=True, reset=False)
         return np.ones(X.shape[0], dtype=np.int64)
 
 
+# Note that the mixin class should always be on the left of `BaseEstimator` to ensure
+# the MRO works as expected.
 class TemplateClassifier(ClassifierMixin, BaseEstimator):
     """An example classifier which implements a 1-NN algorithm.
 
@@ -87,22 +117,54 @@ class TemplateClassifier(ClassifierMixin, BaseEstimator):
     ----------
     X_ : ndarray, shape (n_samples, n_features)
         The input passed during :meth:`fit`.
+
     y_ : ndarray, shape (n_samples,)
         The labels passed during :meth:`fit`.
+
     classes_ : ndarray, shape (n_classes,)
         The classes seen at :meth:`fit`.
+
+    n_features_in_ : int
+        Number of features seen during :term:`fit`.
+
+    feature_names_in_ : ndarray of shape (`n_features_in_`,)
+        Names of features seen during :term:`fit`. Defined only when `X`
+        has feature names that are all strings.
+
+    Examples
+    --------
+    >>> from sklearn.datasets import load_iris
+    >>> from skltemplate import TemplateClassifier
+    >>> X, y = load_iris(return_X_y=True)
+    >>> clf = TemplateClassifier().fit(X, y)
+    >>> clf.predict(X)
+    array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+           0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+           0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+           1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+           1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+           2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+           2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])
     """
 
+    # This dictionary defines the accepted type(s) of each constructor parameter.
+    # It is used to validate the parameters within the `_fit_context` decorator.
+    _parameter_constraints = {
+        "demo_param": [str],
+    }
+
     def __init__(self, demo_param="demo"):
         self.demo_param = demo_param
 
+    @_fit_context(prefer_skip_nested_validation=True)
     def fit(self, X, y):
         """A reference implementation of a fitting function for a classifier.
 
         Parameters
         ----------
         X : array-like, shape (n_samples, n_features)
             The training input samples.
+
         y : array-like, shape (n_samples,)
             The target values. An array of int.
 
@@ -111,13 +173,22 @@ def fit(self, X, y):
         self : object
             Returns self.
         """
-        # Check that X and y have correct shape
-        X, y = check_X_y(X, y)
-        # Store the classes seen during fit
-        self.classes_ = unique_labels(y)
-
+        # `_validate_data` is defined in the `BaseEstimator` class.
+        # It allows us to:
+        # - run different checks on the input data;
+        # - define some attributes associated with the input data: `n_features_in_`
+        #   and `feature_names_in_`.
+        X, y = self._validate_data(X, y)
+        # We need to make sure that we have a classification task
+        check_classification_targets(y)
+
+        # classifier should always store the classes seen during `fit`
+        self.classes_ = np.unique(y)
+
+        # Store the training data to predict later
         self.X_ = X
         self.y_ = y
+
         # Return the classifier
         return self
 
@@ -136,15 +207,19 @@ def predict(self, X):
             seen during fit.
         """
         # Check is fit had been called
-        check_is_fitted(self, ["X_", "y_"])
+        check_is_fitted(self)
 
         # Input validation
-        X = check_array(X)
+        # We need to set reset=False because we don't want to overwrite `n_features_in_`
+        # and `feature_names_in_` but only check that the shape is consistent.
+        X = self._validate_data(X, reset=False)
 
         closest = np.argmin(euclidean_distances(X, self.X_), axis=1)
         return self.y_[closest]
 
 
+# Note that the mixin class should always be on the left of `BaseEstimator` to ensure
+# the MRO works as expected.
 class TemplateTransformer(TransformerMixin, BaseEstimator):
     """An example transformer that returns the element-wise square root.
 
@@ -158,20 +233,32 @@ class TemplateTransformer(TransformerMixin, BaseEstimator):
 
     Attributes
     ----------
-    n_features_ : int
-        The number of features of the data passed to :meth:`fit`.
+    n_features_in_ : int
+        Number of features seen during :term:`fit`.
+
+    feature_names_in_ : ndarray of shape (`n_features_in_`,)
+        Names of features seen during :term:`fit`. Defined only when `X`
+        has feature names that are all strings.
     """
 
+    # This dictionary defines the accepted type(s) of each constructor parameter.
+    # It is used to validate the parameters within the `_fit_context` decorator.
+    _parameter_constraints = {
+        "demo_param": [str],
+    }
+
     def __init__(self, demo_param="demo"):
         self.demo_param = demo_param
 
+    @_fit_context(prefer_skip_nested_validation=True)
     def fit(self, X, y=None):
         """A reference implementation of a fitting function for a transformer.
 
         Parameters
         ----------
         X : {array-like, sparse matrix}, shape (n_samples, n_features)
             The training input samples.
+
         y : None
             There is no need of a target in a transformer, yet the pipeline API
             requires this parameter.
@@ -181,9 +268,7 @@ def fit(self, X, y=None):
         self : object
             Returns self.
         """
-        X = check_array(X, accept_sparse=True)
-
-        self.n_features_ = X.shape[1]
+        X = self._validate_data(X, accept_sparse=True)
 
         # Return the transformer
         return self
@@ -202,14 +287,18 @@ def transform(self, X):
             The array containing the element-wise square roots of the values
             in ``X``.
         """
-        # Check is fit had been called
-        check_is_fitted(self, "n_features_")
+        # Since this is a stateless transformer, we should not call `check_is_fitted`.
+        # The common tests check for this specifically.
 
         # Input validation
-        X = check_array(X, accept_sparse=True)
-
-        # Check that the input is of the same shape as the one passed
-        # during fit.
-        if X.shape[1] != self.n_features_:
-            raise ValueError("Shape of input is different from what was seenin `fit`")
+        # We need to set reset=False because we don't want to overwrite `n_features_in_`
+        # and `feature_names_in_` but only check that the shape is consistent.
+        X = self._validate_data(X, accept_sparse=True, reset=False)
         return np.sqrt(X)
+
+    def _more_tags(self):
+        # This is a quick example to show the tags API:
+        # https://scikit-learn.org/dev/developers/develop.html#estimator-tags
+        # Here, our transformer does not do any operation in `fit` and only validates
+        # the parameters. Thus, it is stateless.
+        return {'stateless': True}
diff --git a/skltemplate/tests/test_common.py b/skltemplate/tests/test_common.py
@@ -1,11 +1,13 @@
-import pytest
-from sklearn.utils.estimator_checks import check_estimator
+"""This file shows how to write tests based on the scikit-learn common tests."""
+
+from sklearn.utils.estimator_checks import parametrize_with_checks
 
 from skltemplate import TemplateClassifier, TemplateEstimator, TemplateTransformer
 
 
-@pytest.mark.parametrize(
-    "estimator", [TemplateEstimator(), TemplateTransformer(), TemplateClassifier()]
-)
-def test_all_estimators(estimator):
-    return check_estimator(estimator)
+# parametrize_with_checks generates the individual checks, which is more fine-grained
+# than check_estimator
+@parametrize_with_checks([TemplateEstimator(), TemplateTransformer(), TemplateClassifier()])
+def test_estimators(estimator, check, request):
+    """Check the compatibility with scikit-learn API"""
+    check(estimator)
diff --git a/skltemplate/tests/test_template.py b/skltemplate/tests/test_template.py
@@ -1,7 +1,8 @@
+"""This file will just show how to write tests for the template classes."""
 import numpy as np
 import pytest
-from numpy.testing import assert_allclose, assert_array_equal
 from sklearn.datasets import load_iris
+from sklearn.utils._testing import assert_allclose, assert_array_equal
 
 from skltemplate import TemplateClassifier, TemplateEstimator, TemplateTransformer
 
@@ -12,6 +13,7 @@ def data():
 
 
 def test_template_estimator(data):
+    """Check the internals and behaviour of `TemplateEstimator`."""
     est = TemplateEstimator()
     assert est.demo_param == "demo_param"
 
@@ -23,22 +25,14 @@ def test_template_estimator(data):
     assert_array_equal(y_pred, np.ones(X.shape[0], dtype=np.int64))
 
 
-def test_template_transformer_error(data):
-    X, y = data
-    trans = TemplateTransformer()
-    trans.fit(X)
-    with pytest.raises(ValueError, match="Shape of input is different"):
-        X_diff_size = np.ones((10, X.shape[1] + 1))
-        trans.transform(X_diff_size)
-
-
 def test_template_transformer(data):
+    """Check the internals and behaviour of `TemplateTransformer`."""
     X, y = data
     trans = TemplateTransformer()
     assert trans.demo_param == "demo"
 
     trans.fit(X)
-    assert trans.n_features_ == X.shape[1]
+    assert trans.n_features_in_ == X.shape[1]
 
     X_trans = trans.transform(X)
     assert_allclose(X_trans, np.sqrt(X))
@@ -48,6 +42,7 @@ def test_template_transformer(data):
 
 
 def test_template_classifier(data):
+    """Check the internals and behaviour of `TemplateClassifier`."""
     X, y = data
     clf = TemplateClassifier()
     assert clf.demo_param == "demo"