-
Notifications
You must be signed in to change notification settings - Fork 93
/
feature_selection_exhaustive.py
59 lines (44 loc) · 1.62 KB
/
feature_selection_exhaustive.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
"""Exhaustive Feature Selection using an sklearn estimator."""
"""
Settings for this recipe:
TARGET_COLUMN: Column name of target variable
ESTIMATOR: Base sklearn estimator
MIN_FEATURES: Minimum number of final features to select
MAX_FEATURES: Maximum number of final features to select
SCORING: Scoring metric
CV: Number of cross-validation folds
More details available here: http://rasbt.github.io/mlxtend/user_guide/feature_selection/SequentialFeatureSelector
P.S. Categorical inputs need to be converted to numeric before running feature selection.
"""
import datatable as dt
import numpy as np
import pandas as pd
from h2oaicore.data import CustomData
import typing
from sklearn.linear_model import LogisticRegression
# Please edit these before usage (default values are for credit card dataset)
TARGET_COLUMN = 'default payment next month'  # column dropped from X and used as y
ESTIMATOR = LogisticRegression()  # base sklearn estimator passed to the selector
MIN_FEATURES = 10  # lower bound on the size of the selected feature subset
MAX_FEATURES = 15  # upper bound on the size of the selected feature subset
SCORING = 'accuracy'  # sklearn scoring string used to rank feature subsets
CV = 5  # number of cross-validation folds per candidate subset
class ExhaustiveFeatureSelection(CustomData):
    """Custom data recipe that keeps only the best-scoring feature subset
    found by mlxtend's ExhaustiveFeatureSelector."""

    _modules_needed_by_name = ["mlxtend"]

    @staticmethod
    def create_data(X: dt.Frame = None) -> pd.DataFrame:
        """Run exhaustive feature selection on *X* and return the reduced frame.

        The target column (``TARGET_COLUMN``) is split off as the label and
        dropped from the feature matrix before fitting the selector.
        """
        # Nothing to select from — preserve the recipe's empty-result contract.
        if X is None:
            return []

        from mlxtend.feature_selection import ExhaustiveFeatureSelector as EFS

        frame = X.to_pandas()
        target = frame[TARGET_COLUMN].values
        frame.drop(TARGET_COLUMN, axis=1, inplace=True)

        selector = EFS(
            ESTIMATOR,
            min_features=MIN_FEATURES,
            max_features=MAX_FEATURES,
            scoring=SCORING,
            cv=CV,
            n_jobs=-1,  # use every available core for the subset search
        )
        selector.fit(frame, target)

        # Keep only the columns belonging to the best-scoring subset.
        return frame.iloc[:, list(selector.best_idx_)]