From 5210f810f78d7aa102cb56e6b586880a771980b4 Mon Sep 17 00:00:00 2001
From: Sarah Brown
Date: Mon, 6 Mar 2017 22:39:45 -0800
Subject: [PATCH] [MRG+2] addresses #8509 improvements to f_regression
 documentation (#8548)

* clarify role of the function and streamline introduction

* added feature selection methods to see also

* completed see also

* fixed pep related formatting for flake8 checks.

* fixed extra whitespace flake8 problems, remaining failure is a copied see
  also line from another function, the line is over by a period, does not
  make sense to newline that.

* one more whitespace

* FIX small pep8 error.
---
 .../feature_selection/univariate_selection.py | 17 +++++++++++++----
 1 file changed, 13 insertions(+), 4 deletions(-)

diff --git a/sklearn/feature_selection/univariate_selection.py b/sklearn/feature_selection/univariate_selection.py
index f1d6047f0b55e..bdeda48a556a9 100644
--- a/sklearn/feature_selection/univariate_selection.py
+++ b/sklearn/feature_selection/univariate_selection.py
@@ -230,17 +230,18 @@ def chi2(X, y):
 def f_regression(X, y, center=True):
     """Univariate linear regression tests.
 
-    Quick linear model for testing the effect of a single regressor,
-    sequentially for many regressors.
+    Linear model for testing the individual effect of each of many regressors.
+    This is a scoring function to be used in a feature selection procedure, not
+    a free-standing feature selection procedure.
 
     This is done in 2 steps:
 
-    1. The cross correlation between each regressor and the target is computed,
+    1. The correlation between each regressor and the target is computed,
        that is, ((X[:, i] - mean(X[:, i])) * (y - mean_y)) / (std(X[:, i]) *
        std(y)).
     2. It is converted to an F score then to a p-value.
 
-    Read more in the :ref:`User Guide <univariate_feature_selection>`.
+    For more on usage see the :ref:`User Guide <univariate_feature_selection>`.
 
     Parameters
     ----------
@@ -261,10 +262,18 @@ def f_regression(X, y, center=True):
     pval : array, shape=(n_features,)
         p-values of F-scores.
 
+
     See also
     --------
+    mutual_info_regression: Mutual information for a continuous target.
     f_classif: ANOVA F-value between label/feature for classification tasks.
     chi2: Chi-squared stats of non-negative features for classification tasks.
+    SelectKBest: Select features based on the k highest scores.
+    SelectFpr: Select features based on a false positive rate test.
+    SelectFdr: Select features based on an estimated false discovery rate.
+    SelectFwe: Select features based on family-wise error rate.
+    SelectPercentile: Select features based on percentile of the highest
+        scores.
     """
     X, y = check_X_y(X, y, ['csr', 'csc', 'coo'], dtype=np.float64)
     n_samples = X.shape[0]
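To illustrate the distinction the revised docstring draws, here is a minimal usage sketch (not part of the patch itself). It assumes a scikit-learn release where `f_regression`, `SelectKBest`, and `make_regression` are importable as shown, and uses a small synthetic dataset: `f_regression` is passed as the scoring function to a selector rather than used as a free-standing selection procedure, and it is also called directly to inspect the F-scores and p-values it returns.

    # Sketch: f_regression as a scoring function inside a feature selection
    # procedure, plus a direct call to inspect its scores.
    from sklearn.datasets import make_regression
    from sklearn.feature_selection import SelectKBest, f_regression

    # Synthetic regression data: 100 samples, 10 features, 3 informative.
    X, y = make_regression(n_samples=100, n_features=10, n_informative=3,
                           random_state=0)

    # SelectKBest calls f_regression to obtain (F-scores, p-values)
    # and keeps the k highest-scoring features.
    selector = SelectKBest(score_func=f_regression, k=3)
    X_new = selector.fit_transform(X, y)
    print(X_new.shape)           # (100, 3)

    # The scoring function can also be called on its own.
    F, pval = f_regression(X, y)
    print(F.shape, pval.shape)   # (10,) (10,)

Keeping the scorer separate from the selector is what lets the same selection strategies listed in the new "See also" entries (SelectKBest, SelectFpr, SelectFdr, SelectFwe, SelectPercentile) be combined with any univariate scoring function, not just f_regression.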