# gradient_boosted_trees.py
from copy import deepcopy
from itertools import product

import matplotlib.pyplot as plt
import numpy as np

from decision_trees import ClassificationTree, RegressionTree


class GradientBoostedTree:
    '''
    Gradient boosting regressor/classifier built on decision trees.

    :method fit: fit the boosted ensemble to training data
    :method predict: predict values for new inputs
    '''

    def __init__(self, n_estimators=10, pseudo_residual_func='l2', lr=0.1,
                 criterion='gini', estimator='mode', min_samples_split=5,
                 min_samples_leaf=1, max_features='auto', max_depth=3):
        '''
        Initialize the gradient boosting class.

        :param n_estimators: number of estimators (i.e. number of rounds of gradient boosting)
        :param pseudo_residual_func: loss whose negative gradient is used as the
            pseudo-residual: 'l2' (squared loss) or 'logistic' (logistic loss)
        :param lr: learning rate (shrinkage) applied to each tree's contribution
        :param criterion: split criterion of the base trees
        :param estimator: leaf value estimator; 'mean'/'median' build regression
            trees, 'mode' builds classification trees
        :param min_samples_split: minimum number of samples required to split a node
        :param min_samples_leaf: minimum number of samples required at a leaf
        :param max_features: number of features considered per split
            ('auto', 'sqrt', None, or an int)
        :param max_depth: maximum depth of each base tree
        '''
        # Pseudo-residual (negative gradient) function for each supported loss
        pseudo_residual_func_dict = {
            'l2': self._pseudo_residual_L2,
            'logistic': self._pseudo_residual_logistic
        }
        # Valid split criteria for each leaf value estimator
        estimator_criterion_dict = {
            'mean': ['mse', 'mae'],
            'median': ['mse', 'mae'],
            'mode': ['gini', 'entropy']
        }
        assert estimator in estimator_criterion_dict.keys(), \
            'Param "estimator" must be one of {}'.format(estimator_criterion_dict.keys())
        assert criterion in estimator_criterion_dict[estimator], \
            'Param "criterion" must be one of {} for leaf value estimator "{}"' \
            .format(estimator_criterion_dict[estimator], estimator)
        valid_max_features = ['auto', 'sqrt', None]
        assert isinstance(max_features, int) or max_features in valid_max_features, \
            'Param "max_features" must either be an integer or one of {}'.format(valid_max_features)
        self.n_estimators = n_estimators
        self.pseudo_residual_func = pseudo_residual_func_dict[pseudo_residual_func]
        self.lr = lr
        self.criterion = criterion
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.max_features = max_features
        self.max_depth = max_depth
        if estimator in ['mean', 'median']:
            base_estimator = RegressionTree(criterion=criterion, estimator=estimator, max_depth=max_depth,
                                            min_samples_split=min_samples_split, max_features=max_features,
                                            min_samples_leaf=min_samples_leaf)
        else:
            base_estimator = ClassificationTree(criterion=criterion, max_depth=max_depth,
                                                min_samples_split=min_samples_split, max_features=max_features,
                                                min_samples_leaf=min_samples_leaf)
        self.estimator = estimator
        # One independent copy of the base tree per boosting round
        self.estimators = [deepcopy(base_estimator) for _ in range(n_estimators)]

    def get_params(self, deep=True):
        # All base estimators share one configuration; delegate to the first
        return self.estimators[0].get_params(deep)

    def fit(self, X, y):
        '''
        Fit the gradient boosting model: each round fits a fresh tree to the
        pseudo-residuals of the ensemble's current predictions.
        '''
        predictions = np.zeros(y.shape[0])
        for i in range(self.n_estimators):
            # Negative gradient of the loss at the current predictions
            pseudo_residuals = self.pseudo_residual_func(y, predictions)
            self.estimators[i].fit(X, pseudo_residuals)
            # Shrink each tree's contribution by the learning rate
            predictions += self.lr * self.estimators[i].predict(X)
        return self
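
    # fit() above performs functional gradient descent: starting from
    # F_0(x) = 0, round i fits a tree h_i to the pseudo-residuals
    # r_i = -dL(y, F_{i-1}(x)) / dF_{i-1}(x) and updates
    #     F_i(x) = F_{i-1}(x) + lr * h_i(x).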

    def predict(self, X):
        '''
        Predict by summing the learning-rate-scaled outputs of all trees.
        '''
        predictions = np.zeros(X.shape[0])
        for i in range(self.n_estimators):
            predictions += self.lr * self.estimators[i].predict(X)
        return predictions
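
    # Illustrative sketch, not part of the original class: yield the ensemble's
    # cumulative prediction after each boosting round, which makes it easy to
    # pick n_estimators by monitoring validation error. Follows the same
    # conventions as predict() above.
    def staged_predict(self, X):
        predictions = np.zeros(X.shape[0])
        for i in range(self.n_estimators):
            predictions += self.lr * self.estimators[i].predict(X)
            # Copy so callers can keep each intermediate prediction
            yield predictions.copy()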

    def _pseudo_residual_L2(self, y, yhat):
        '''
        Pseudo-residual of the squared loss at the current predictions:
        for L(y, f) = (y - f)^2 / 2, the negative gradient is y - f.
        '''
        return y - yhat

    def _pseudo_residual_logistic(self, y, yhat):
        '''
        Pseudo-residual of the logistic loss at the current predictions:
        for L(y, f) = log(1 + exp(-y * f)) with y in {-1, +1}, the negative
        gradient is y / (1 + exp(y * f)).
        '''
        return y / (1 + np.exp(np.multiply(y, yhat)))
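

# A hedged sketch, not part of the original file (the helper name is our own):
# under the logistic-loss model above, a raw boosted score f corresponds to
# P(y = +1 | x) = sigmoid(f), so scores can be turned into probabilities
# rather than only thresholded at zero as in the demo below.
def score_to_probability(score):
    # P(y = +1 | x) = 1 / (1 + exp(-f(x))) under the logistic-loss model
    return 1.0 / (1.0 + np.exp(-score))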


def main():
    np.random.seed(0)

    ############### Classifiers ###############
    data_train = np.loadtxt('data/svm-train.txt')
    x_train, y_train = data_train[:, 0:2], data_train[:, 2].reshape(-1, 1)
    y_train_label = (y_train > 0).astype(int).reshape(-1, 1)

    # Grid over the feature space for plotting decision regions
    x_min, x_max = x_train[:, 0].min() - 1, x_train[:, 0].max() + 1
    y_min, y_max = x_train[:, 1].min() - 1, x_train[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.1),
                         np.arange(y_min, y_max, 0.1))

    f, axarr = plt.subplots(2, 3, sharex='col', sharey='row', figsize=(10, 8))
    for idx, n_est, tt in zip(product([0, 1], [0, 1, 2]),
                              [1, 5, 10, 20, 50, 100],
                              ['n_estimators = {}'.format(n) for n in [1, 5, 10, 20, 50, 100]]):
        # Gradient Boosted Decision Tree Classifier
        gbt = GradientBoostedTree(n_estimators=n_est, pseudo_residual_func='logistic',
                                  criterion='entropy', estimator='mode', max_depth=5)
        gbt.fit(x_train, y_train.ravel())

        # Threshold the raw scores at zero to obtain class labels
        Z = (gbt.predict(np.c_[xx.ravel(), yy.ravel()]) > 0).astype(int)
        Z = Z.reshape(xx.shape)

        f.suptitle('GBDT Classifier')
        axarr[idx[0], idx[1]].contourf(xx, yy, Z, alpha=0.4)
        axarr[idx[0], idx[1]].scatter(x_train[:, 0], x_train[:, 1], c=y_train_label.ravel(), alpha=0.8)
        axarr[idx[0], idx[1]].set_title(tt)
    plt.show()
    #######################################################

    ############### Regressors ###############
    data_krr_train = np.loadtxt('data/krr-train.txt')
    x_krr_train, y_krr_train = data_krr_train[:, 0].reshape(-1, 1), data_krr_train[:, 1].reshape(-1, 1)

    # Dense grid over [0, 1) for plotting the fitted regression curve
    plot_size = 0.001
    x_range = np.arange(0., 1., plot_size).reshape(-1, 1)

    f, axarr = plt.subplots(2, 3, sharex='col', sharey='row', figsize=(10, 8))
    for idx, n_est, tt in zip(product([0, 1], [0, 1, 2]),
                              [1, 5, 10, 20, 50, 100],
                              ['n_estimators = {}'.format(n) for n in [1, 5, 10, 20, 50, 100]]):
        # Gradient Boosted Decision Tree Regressor
        gbt = GradientBoostedTree(n_estimators=n_est, pseudo_residual_func='l2',
                                  criterion='mse', estimator='mean', max_depth=2)
        gbt.fit(x_krr_train, y_krr_train.ravel())
        y_predict = gbt.predict(x_range)

        f.suptitle('GBDT Regressor')
        axarr[idx[0], idx[1]].plot(x_range, y_predict, color='r')
        axarr[idx[0], idx[1]].scatter(x_krr_train, y_krr_train.ravel(), alpha=0.8)
        axarr[idx[0], idx[1]].set_title(tt)
        axarr[idx[0], idx[1]].set_xlim(0, 1)
    plt.show()
    #######################################################


if __name__ == '__main__':
    main()
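
# Example usage beyond the demo above (a minimal sketch; X_train, y_train and
# X_test are placeholders, not arrays defined in this file):
#   gbt = GradientBoostedTree(n_estimators=50, pseudo_residual_func='l2',
#                             criterion='mse', estimator='mean')
#   gbt.fit(X_train, y_train)
#   y_pred = gbt.predict(X_test)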