-
Notifications
You must be signed in to change notification settings - Fork 0
/
feature_selection.py
74 lines (58 loc) · 1.88 KB
/
feature_selection.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
"""
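Greedy Feature Selection around a caller-supplied model, scoring each
candidate feature set by mean absolute percentage error (MAPE).
(The earlier description of this module was: Logistic Regression as base
model to optimize Area Under the ROC Curve.)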
__author__ : Abhishek
Credits : Miroslaw @ Kaggle
"""
import numpy
import sklearn.linear_model as lm
from sklearn import metrics, preprocessing
class GreedyFeatureSelection(object):
    """Greedy forward feature selection driven by a caller-supplied model.

    Starting from an empty set, each round fits the model once per remaining
    column (together with the columns already selected), scores the fit with
    mean absolute percentage error (MAPE), and keeps the column that gives
    the lowest error. Selection stops when a round fails to lower the error
    or when no columns remain.

    Args:
        model: object exposing ``fit(X, y)`` and ``predict(X)``.
        data: 2-D array-like with one column per candidate feature.
        labels: target values, one per row of ``data``.
        scale: when equal to 1, ``data`` is passed through
            ``preprocessing.scale`` before being stored; any other value
            stores the data unscaled.
        verbose: truthy to print progress while selecting.
    """

    def __init__(self, model, data, labels, scale=1, verbose=0):
        self._model = model
        if scale == 1:
            self._data = preprocessing.scale(numpy.array(data))
        else:
            self._data = numpy.array(data)
        self._labels = labels
        self._verbose = verbose

    def evaluateScore(self, X=None):
        """Fit the model on ``X`` and return its MAPE (percent) on the same rows.

        Args:
            X: feature matrix to fit and score. Defaults to the full stored
                data so existing no-argument calls keep working.

        Returns:
            ``mean(abs((y - predictions) / y)) * 100``. Lower is better.
            Labels equal to zero divide by zero, so the result is then
            inf or nan.
        """
        if X is None:
            X = self._data
        model = self._model
        model.fit(X, self._labels)
        predictions = model.predict(X)
        # Float labels keep the ratio a true division even when labels and
        # predictions are both integers (Python 2 would floor it otherwise).
        y = numpy.asarray(self._labels, dtype=float)
        return numpy.mean(numpy.abs((y - predictions) / y)) * 100

    def selectionLoop(self):
        """Run the greedy search and return the selected column indices.

        Returns:
            Sorted list of column indices into the stored data.
        """
        X = self._data
        num_features = X.shape[1]
        score_history = []  # (score, feature) of the winner of each round
        good_features = set()

        # MAPE is an error, so "better" means strictly lower than last round.
        while (len(score_history) < 2
               or score_history[-1][0] < score_history[-2][0]):
            candidates = [f for f in range(num_features)
                          if f not in good_features]
            if not candidates:
                # Every column is already selected; nothing left to try.
                break

            scores = []
            for feature in candidates:
                selected_features = sorted(good_features) + [feature]
                # Score only the candidate subset, not the full matrix.
                score = self.evaluateScore(X[:, selected_features])
                scores.append((score, feature))
                if self._verbose:
                    print("Current MAPE :  %s" % (score,))

            # Ties on score fall back to the lowest feature index.
            best = min(scores)
            good_features.add(best[1])
            score_history.append(best)
            if self._verbose:
                print("Current Features :  %s" % (sorted(good_features),))

        # Drop the last added feature only when it failed to lower the
        # error; if the loop ended because columns ran out, it stays.
        if (len(score_history) >= 2
                and not score_history[-1][0] < score_history[-2][0]):
            good_features.remove(score_history[-1][1])

        good_features = sorted(good_features)
        if self._verbose:
            print("Selected Features :  %s" % (good_features,))
        return good_features

    def transform(self):
        """Return the selected column indices (not a transformed matrix)."""
        return self.selectionLoop()