Merge pull request #2 from pelucid/BN-1175-sample-weights-feature-selection-updated

Sample weights feature selection
pelucid · Mar 24, 2020 · 62baea6 · 62baea6
2 parents 91ac0cd + 7883e55
commit 62baea6
Show file tree

Hide file tree

Showing 8 changed files with 107 additions and 176 deletions.
diff --git a/.appveyor.yml b/.appveyor.yml
diff --git a/.travis.yml b/.travis.yml
@@ -1,51 +1,19 @@
-language: generic
-
-cache:
-  apt: true
-  directories:
-  - $HOME/.cache/pip
-  - $HOME/.ccache
-
-dist: trusty
-
-env:
-  global:
-    # Directory where tests are run from
-    - TEST_DIR=/tmp/sklearn
-    - OMP_NUM_THREADS=4
-    - OPENBLAS_NUM_THREADS=4
-
-matrix:
-    include:
-        - os: linux
-          sudo: required
-          python: 3.8
-          env: LATEST="false" IMAGE="true" COVERAGE="false" NUMPY_VERSION="1.18.1" SCIPY_VERSION="1.4.1" SKLEARN_VERSION="0.22.0" JOBLIB_VERSION=0.13.2 PANDAS_VERSION="1.0.1" IMAGEIO_VERSION="2.5.0" SKIMAGE_VERSION="0.15.0" DLIB_VERSION="19.17.0" MINICONDA_PYTHON_VERSION=3.7
-        - os: linux
-          python: 3.8
-          env: LATEST="true" IMAGE="true" COVERAGE="true" NOTEBOOKS="true" MINICONDA_PYTHON_VERSION=3.7
-        - os: linux
-          sudo: required
-          python: 3.8
-          env: LATEST="true" IMAGE="false" COVERAGE="false" NOTEBOOKS="false" MINICONDA_PYTHON_VERSION=3.7
-
-
-before_install:
-    - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew update; fi
-
-install:
-    - if [[ "$TRAVIS_OS_NAME" != "osx" ]]; then sudo apt-get update; fi
-    - source ci/.travis_install.sh
+language: python
+python:
+  - "3.6"
 
 script:
-    - bash ci/.travis_test.sh
+  - make deps
 
-after_success:
-    - if [[ "$COVERAGE" == "true" ]]; then coveralls || echo "failed"; fi
+dist: trusty
 
 notifications:
-    email:
-      recipients:
-          - [email protected]
-    on_success: always
-    on_failure: always
+  email:
+    recipients:
+      - [email protected]
+    on_success: never
+    on_failure: change
+
+  # slack must be a sibling of email under notifications, not a child of email
+  slack:
+    rooms:
+      secure: "O/kZr/L3H3R7ndC9aR8ScpL0lN893g8TAsiPdTxVQK7fJ9t88gAonR+yQIq/ZwD4Xwb5NDc09xMBpikhCrquBCqpSxf/EC9tDzePbjlqYCjsZ8wuOmo4byFrH32bvcxr3SIWw8zSY88Z4Ac5msTgUBW2aovOgn9wIM2Prs9kP9y/ftPnlkAs9IJHLJ5DmwEw9KEUTq1eKbWh7+nlguDkBIwpqeuU4gOUAZGFuCy4Cqs53K87PQRX0VjTxnkqODKoF8cuIN9TU0D0u75kqny902rXfzgaSyy8mFFxu+HkPhDbdFICV3H8P82QSpbmUTzcgvfHOBDxZHFJ2cxAScKoLtSzaaHvHA6H3WrHP6r5jei0UgG4v5dNRtyA5uO+DtVu0thr/cpBI97Hm9Ob9sTOCzIRtTuSPyk+cAYsksDDzL67diJhkaaCcIQBL6Z5qSEZzJS+Ggdzo5SwiTDrK/qOW/xk/53qQrB2lL9oQT4vuiTpwSUJnaWDeVlfYk09JnfYWfSHzNWXy+F8aQQylMzuoKGDTtVDya7dpObwCFatJ1yvr1sGu067Y5+04a5fk6hRmPaQDFNEfkN14BF8xw4EQJA2M8Gna6U6bQmkwHRPFIcJpC6vmA6UV5Wvg8Rpgme7GN3R9i07/1+520CeWdYCXFlYWlFy6kpAhECDMO3uJwU="
diff --git a/MANIFEST.in b/MANIFEST.in
diff --git a/Makefile b/Makefile
@@ -0,0 +1,16 @@
+# None of these targets produce a file of the same name; declare them phony
+# so an existing `venv/`, `clean` or `deps` path never short-circuits them.
+.PHONY: venv clean deps
+
+# Guard against running Make commands outside a virtualenv or conda env.
+# The $(error ...) below is expanded while the Makefile is being read, so
+# the check fires on every invocation, before any recipe runs.
+venv:
+ifndef VIRTUAL_ENV
+ifndef CONDA_PREFIX
+$(error VIRTUAL / CONDA ENV is not set - please activate environment)
+endif
+endif
+
+# Delete compiled Python bytecode below the current directory.
+clean: venv
+	@echo "Removing build artifacts / temp files"
+	find . -name "*.pyc" -delete
+
+# Pin pip to 18.1 first: --process-dependency-links was removed in pip 19.0.
+deps: venv
+	pip install -U pip==18.1
+	pip install -Ue . --process-dependency-links
+
diff --git a/mlxtend/feature_selection/sequential_feature_selector.py b/mlxtend/feature_selection/sequential_feature_selector.py
@@ -19,20 +19,64 @@
 from sklearn.base import MetaEstimatorMixin
 from ..externals.name_estimators import _name_estimators
 from ..utils.base_compostion import _BaseXComposition
-from sklearn.model_selection import cross_val_score
+from sklearn.model_selection import StratifiedKFold, cross_val_score
 from joblib import Parallel, delayed
 
 
+def fit_and_score(model, X, y, sample_weights,
+                  train, test,
+                  scoring):
+    """Fit a clone of `model` on one CV fold and score it on the test rows.
+
+    Parameters
+    ----------
+    model : estimator
+        Estimator to clone; the clone's `fit` must accept `sample_weight`.
+    X, y : array-like
+        Features and targets, indexed with the `train` / `test` arrays.
+    sample_weights : array-like
+        One weight per row of `X`. Coerced to an ndarray so that lists and
+        pandas Series are indexed by row position.
+    train, test : array of int
+        Row positions of the training fold and the held-out fold.
+    scoring : callable
+        Called as `scoring(estimator, X, y, sample_weight=...)`.
+
+    Returns
+    -------
+    score
+        Whatever `scoring` returns for the held-out fold.
+    """
+    # sample_weights reaches here unvalidated: a plain list would raise on
+    # `[train]`, and a pandas Series may be looked up by label rather than
+    # by position. An ndarray is always indexed positionally.
+    sample_weights = np.asarray(sample_weights)
+
+    # Fit on the training rows with their weights
+    model_clone = clone(model)
+    model_clone.fit(X[train], y[train], sample_weight=sample_weights[train])
+
+    # Score on the held-out rows with their weights
+    score = scoring(model_clone, X[test], y[test], sample_weight=sample_weights[test])
+    return score
+
+
+def cross_val_scores_weighted(model, X, y, scoring, sample_weights,
+                              cv=5,
+                              n_jobs=None,
+                              verbose=0,
+                              pre_dispatch='2*n_jobs'):
+    """Cross-validate `model`, using sample weights for fitting and scoring.
+
+    Parameters
+    ----------
+    model : estimator
+        Estimator that is cloned and fitted once per fold.
+    X, y : array-like
+        Features and targets.
+    scoring : callable
+        Called as `scoring(estimator, X, y, sample_weight=...)`.
+    sample_weights : array-like
+        One weight per row of `X`.
+    cv : int or splitter object (default: 5)
+        An integer builds a shuffled `StratifiedKFold` with
+        `random_state=42`; an object exposing `split(X, y)` is used as is.
+    n_jobs, verbose, pre_dispatch
+        Forwarded to `joblib.Parallel`.
+
+    Returns
+    -------
+    scores : ndarray
+        One score per fold, the same container type `cross_val_score`
+        yields on the unweighted path.
+    """
+    # Initialise CV: reuse a caller-supplied splitter rather than passing it
+    # to StratifiedKFold as `n_splits`, which would raise.
+    if hasattr(cv, 'split'):
+        splitter = cv
+    else:
+        splitter = StratifiedKFold(n_splits=cv, shuffle=True, random_state=42)
+
+    # Set up parallel processing
+    parallel = Parallel(n_jobs=n_jobs,
+                        verbose=verbose,
+                        pre_dispatch=pre_dispatch)
+
+    # Call fit_and_score once per fold
+    scores = parallel(
+        delayed(fit_and_score)(model, X, y, sample_weights, train, test, scoring)
+        for train, test in splitter.split(X, y))
+
+    return np.array(scores)
+
+
 def _calc_score(selector, X, y, indices, groups=None, **fit_params):
     if selector.cv:
-        scores = cross_val_score(selector.est_,
-                                 X[:, indices], y,
-                                 groups=groups,
-                                 cv=selector.cv,
-                                 scoring=selector.scorer,
-                                 n_jobs=1,
-                                 pre_dispatch=selector.pre_dispatch,
-                                 fit_params=fit_params)
+        if selector.sample_weights is not None:
+            scores = cross_val_scores_weighted(selector.est_,
+                                               X[:, indices], y,
+                                               scoring=selector.scorer,
+                                               sample_weights=selector.sample_weights,
+                                               cv=selector.cv,
+                                               n_jobs=1,
+                                               pre_dispatch=selector.pre_dispatch,
+                                               )
+        else:
+            scores = cross_val_score(selector.est_,
+                                     X[:, indices], y,
+                                     groups=groups,
+                                     cv=selector.cv,
+                                     scoring=selector.scorer,
+                                     n_jobs=1,
+                                     pre_dispatch=selector.pre_dispatch,
+                                     fit_params=fit_params)
+
     else:
         selector.est_.fit(X[:, indices], y, **fit_params)
         scores = np.array([selector.scorer(selector.est_, X[:, indices], y)])
@@ -66,7 +110,6 @@ def _get_featurenames(subsets_dict, feature_idx, custom_feature_names, X):
 
 
 class SequentialFeatureSelector(_BaseXComposition, MetaEstimatorMixin):
-
     """Sequential Feature Selection for Classification and Regression.
 
     Parameters
@@ -174,19 +217,22 @@ class SequentialFeatureSelector(_BaseXComposition, MetaEstimatorMixin):
     http://rasbt.github.io/mlxtend/user_guide/feature_selection/SequentialFeatureSelector/
 
     """
+
     def __init__(self, estimator, k_features=1,
                  forward=True, floating=False,
                  verbose=0, scoring=None,
                  cv=5, n_jobs=1,
                  pre_dispatch='2*n_jobs',
                  clone_estimator=True,
-                 fixed_features=None):
+                 fixed_features=None,
+                 sample_weights=None):
 
         self.estimator = estimator
         self.k_features = k_features
         self.forward = forward
         self.floating = floating
         self.pre_dispatch = pre_dispatch
+        self.sample_weights = sample_weights
         # Want to raise meaningful error message if a
         # cross-validation generator is inputted
         if isinstance(cv, types.GeneratorType):
@@ -335,8 +381,8 @@ def fit(self, X, y, custom_feature_names=None, groups=None, **fit_params):
                              'the number of elements in custom_feature_names '
                              'must equal the number of columns in X.')
 
-        if not isinstance(self.k_features, int) and\
-                not isinstance(self.k_features, tuple)\
+        if not isinstance(self.k_features, int) and \
+                not isinstance(self.k_features, tuple) \
                 and not isinstance(self.k_features, str):
             raise AttributeError('k_features must be a positive integer'
                                  ', tuple, or string')
@@ -345,7 +391,7 @@ def fit(self, X, y, custom_feature_names=None, groups=None, **fit_params):
                 self.k_features < 1 or self.k_features > X_.shape[1])):
             raise AttributeError('k_features must be a positive integer'
                                  ' between 1 and X.shape[1], got %s'
-                                 % (self.k_features, ))
+                                 % (self.k_features,))
 
         if isinstance(self.k_features, tuple):
             if len(self.k_features) != 2:
@@ -364,7 +410,7 @@ def fit(self, X, y, custom_feature_names=None, groups=None, **fit_params):
                 raise AttributeError('The min k_features value must be smaller'
                                      ' than the max k_features value.')
 
-        if isinstance(self.k_features, tuple) or\
+        if isinstance(self.k_features, tuple) or \
                 isinstance(self.k_features, str):
 
             select_in_range = True
@@ -475,8 +521,8 @@ def fit(self, X, y, custom_feature_names=None, groups=None, **fit_params):
                                     self._exclusion(
                                         feature_set=k_idx,
                                         fixed_feature=(
-                                            {new_feature} |
-                                            self.fixed_features_set_),
+                                                {new_feature} |
+                                                self.fixed_features_set_),
                                         X=X_,
                                         y=y,
                                         groups=groups,
@@ -519,7 +565,6 @@ def fit(self, X, y, custom_feature_names=None, groups=None, **fit_params):
                 # floating can lead to multiple same-sized subsets
                 if k not in self.subsets_ or (k_score >
                                               self.subsets_[k]['avg_score']):
-
                     k_idx = tuple(sorted(k_idx))
                     self.subsets_[k] = {
                         'feature_idx': k_idx,
@@ -637,7 +682,6 @@ def _exclusion(self, feature_set, X, y, fixed_feature=None,
                             fixed_feature.issubset(set(p)))
 
             for p, cv_scores in work:
-
                 all_avg_scores.append(np.nanmean(cv_scores))
                 all_cv_scores.append(cv_scores)
                 all_subsets.append(p)

diff --git a/requirements.txt b/requirements.txt
diff --git a/setup.cfg b/setup.cfg
@@ -1,2 +1,2 @@
-[bdist_wheel]
-universal = 1
+[aliases]
+test=pytest
diff --git a/setup.py b/setup.py
@@ -1,70 +1,18 @@
-# Sebastian Raschka 2014-2016
-# mlxtend Machine Learning Library Extensions
-# Author: Sebastian Raschka <sebastianraschka.com>
-#
-# License: BSD 3 clause
-
-from os.path import realpath, dirname, join
 from setuptools import setup, find_packages
-import mlxtend
-
-VERSION = mlxtend.__version__
-PROJECT_ROOT = dirname(realpath(__file__))
-
-REQUIREMENTS_FILE = join(PROJECT_ROOT, 'requirements.txt')
-
-with open(REQUIREMENTS_FILE) as f:
-    install_reqs = f.read().splitlines()
-
-install_reqs.append('setuptools')
-
 
 setup(name='mlxtend',
-      version=VERSION,
+      version='0.0.1',
       description='Machine Learning Library Extensions',
-      author='Sebastian Raschka',
-      author_email='[email protected]',
-      url='https://github.com/rasbt/mlxtend',
+      url='https://github.com/pelucid/mlxtend',
       packages=find_packages(),
-      package_data={'': ['LICENSE-BSD3.txt',
-                         'LICENSE-CC-BY.txt',
-                         'README.md',
-                         'requirements.txt']
-                    },
-      include_package_data=True,
-      install_requires=install_reqs,
-      extras_require={'testing': ['pytest'],
-                      'docs': ['mkdocs']},
-      license='BSD 3-Clause',
-      platforms='any',
-      classifiers=[
-             'License :: OSI Approved :: BSD License',
-             'Development Status :: 5 - Production/Stable',
-             'Operating System :: Microsoft :: Windows',
-             'Operating System :: POSIX',
-             'Operating System :: Unix',
-             'Operating System :: MacOS',
-             'Programming Language :: Python :: 3.7',
-             'Topic :: Scientific/Engineering',
-             'Topic :: Scientific/Engineering :: Artificial Intelligence',
-             'Topic :: Scientific/Engineering :: Information Analysis',
-             'Topic :: Scientific/Engineering :: Image Recognition',
+      install_requires=[
+          "scipy>=1.2.1",
+          "numpy>=1.16.2",
+          "pandas>=0.24.2",
+          "scikit-learn>=0.20.3",
+          "matplotlib>=3.0.0",
+          "joblib>=0.13.2"
       ],
-      long_description="""
-
-A library of Python tools and extensions for data science.
-
-
-Contact
-=============
-
-If you have any questions or comments about mlxtend,
-please feel free to contact me via
-eMail: [email protected]
-or Twitter: https://twitter.com/rasbt
-
-This project is hosted at https://github.com/rasbt/mlxtend
-
-The documentation can be found at http://rasbt.github.io/mlxtend/
-
-""")
+      setup_requires=["pytest-runner"],
+      tests_require=["pytest"]
+      )