''' Based on Tinrtgu's FTRL code: http://www.kaggle.com/c/avazu-ctr-prediction/forums/t/10927/beat-the-benchmark-with-less-than-1mb-of-memory
'''
from csv import DictReader
from math import exp, copysign, log, sqrt
from datetime import datetime
import random
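
# Model summary: for a list x of hashed (binary) feature indices, the raw score is
#   y_raw = w_0 + sum_{i in x} w_i + sum_{i<j in x} sum_k v_ik * v_jk
# where both the first order weights w and the factor vectors v (w_fm below) are
# fit with per-coordinate FTRL-proximal updates, and the predicted probability is
# sigmoid(y_raw).
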
class FM_FTRL_machine(object):
    def __init__(self, fm_dim, fm_initDev, L1, L2, L1_fm, L2_fm, D, alpha, beta, alpha_fm=.1, beta_fm=1.0, dropoutRate=1.0):
        ''' Initialize the factorization machine. '''
        self.alpha = alpha              # learning rate parameter alpha
        self.beta = beta                # learning rate parameter beta
        self.L1 = L1                    # L1 regularizer for first order terms
        self.L2 = L2                    # L2 regularizer for first order terms
        self.alpha_fm = alpha_fm        # learning rate parameter alpha for the factorization machine
        self.beta_fm = beta_fm          # learning rate parameter beta for the factorization machine
        self.L1_fm = L1_fm              # L1 regularizer for FM weights; only use L1 after one epoch of training, because the small random initializations are needed for the gradient
        self.L2_fm = L2_fm              # L2 regularizer for FM weights
        self.fm_dim = fm_dim            # dimension of the factorization
        self.fm_initDev = fm_initDev    # standard deviation for random initialization of the factorization weights
        self.dropoutRate = dropoutRate  # dropout rate (actually the keep probability): dropoutRate = .8 means each feature is dropped with probability .2
        self.D = D

        # model
        # n: squared sum of past gradients
        # z: weights
        # w: lazy weights
        # let index 0 be the bias term to avoid collisions.
        self.n = [0.] * (D + 1)
        self.z = [0.] * (D + 1)
        self.w = [0.] * (D + 1)

        self.n_fm = {}
        self.z_fm = {}
        self.w_fm = {}
    def init_fm(self, i):
        ''' Initialize the factorization weight vector for variable i. '''
        if i not in self.n_fm:
            self.n_fm[i] = [0.] * self.fm_dim
            self.w_fm[i] = [0.] * self.fm_dim
            self.z_fm[i] = [0.] * self.fm_dim

            for k in range(self.fm_dim):
                self.z_fm[i][k] = random.gauss(0., self.fm_initDev)
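    # NOTE: z_fm is seeded with small Gaussian noise rather than zeros: with an
    # all-zero start, every pairwise FM gradient (g times the sum of the other
    # factors) would be identically zero and the interaction weights could never
    # move. This is also why L1_fm should only be switched on after the first
    # epoch, as noted in __init__.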
    def predict_raw(self, x):
        ''' Predict the raw score prior to the logit transformation. '''
        alpha = self.alpha
        beta = self.beta
        L1 = self.L1
        L2 = self.L2
        alpha_fm = self.alpha_fm
        beta_fm = self.beta_fm
        L1_fm = self.L1_fm
        L2_fm = self.L2_fm

        # first order weights model
        n = self.n
        z = self.z
        w = self.w

        # FM interaction model
        n_fm = self.n_fm
        z_fm = self.z_fm
        w_fm = self.w_fm

        raw_y = 0.

        # calculate the bias contribution (index 0); no regularization for the bias.
        w[0] = (- z[0]) / ((beta + sqrt(n[0])) / alpha)
        raw_y += w[0]

        # calculate the first order contribution.
        for i in x:
            sign = -1. if z[i] < 0. else 1.  # get the sign of z[i]
            if sign * z[i] <= L1:
                w[i] = 0.
            else:
                w[i] = (sign * L1 - z[i]) / ((beta + sqrt(n[i])) / alpha + L2)
            raw_y += w[i]

        len_x = len(x)

        # calculate the factorization machine contribution.
        for i in x:
            self.init_fm(i)
            for k in range(self.fm_dim):
                sign = -1. if z_fm[i][k] < 0. else 1.  # get the sign of z_fm[i][k]
                if sign * z_fm[i][k] <= L1_fm:
                    w_fm[i][k] = 0.
                else:
                    w_fm[i][k] = (sign * L1_fm - z_fm[i][k]) / ((beta_fm + sqrt(n_fm[i][k])) / alpha_fm + L2_fm)

        for i in range(len_x):
            for j in range(i + 1, len_x):
                for k in range(self.fm_dim):
                    raw_y += w_fm[x[i]][k] * w_fm[x[j]][k]

        return raw_y
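    # A minimal sketch (not part of the original class and unused by predict_raw):
    # the pairwise loop above costs O(len(x)^2 * fm_dim). Rendle's standard FM
    # identity, sum_{i<j} v_i.v_j = 0.5 * sum_k [(sum_i v_ik)^2 - sum_i v_ik^2],
    # yields the same interaction sum in O(len(x) * fm_dim) for binary features.
    # It assumes w_fm[i] has already been materialized for every i in x, as
    # predict_raw guarantees via init_fm; the method name is hypothetical.
    def fm_interaction_fast(self, x):
        total = 0.
        for k in range(self.fm_dim):
            s = 0.   # sum of the k-th factor over the active features
            sq = 0.  # sum of the squared k-th factor
            for i in x:
                v = self.w_fm[i][k]
                s += v
                sq += v * v
            total += 0.5 * (s * s - sq)
        return total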
    def predict(self, x):
        ''' Predict the probability by passing the raw score through the sigmoid
        (inverse logit), clamping the score to [-35, 35] for numerical stability. '''
        return 1. / (1. + exp(- max(min(self.predict_raw(x), 35.), -35.)))
    def dropout(self, x):
        ''' Drop out variables in list x, in place; each variable is kept with
        probability dropoutRate. '''
        x[:] = [var for var in x if random.random() <= self.dropoutRate]
    def dropoutThenPredict(self, x):
        ''' First drop out some variables, then predict the probability using the
        dropped-out data. '''
        self.dropout(x)
        return self.predict(x)

    def predictWithDroppedOutModel(self, x):
        ''' Predict using all of the data, with a model that was trained with
        dropout; the raw score is scaled by dropoutRate to compensate. '''
        return 1. / (1. + exp(- max(min(self.predict_raw(x) * self.dropoutRate, 35.), -35.)))
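    # NOTE: scaling the raw score by dropoutRate at prediction time mirrors the
    # usual dropout convention: training saw only a dropoutRate fraction of the
    # features on average, so the full-feature score is shrunk by the same
    # factor to keep its expected magnitude comparable.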
    def update(self, x, p, y):
        ''' Update the parameters using FTRL (Follow the Regularized Leader). '''
        alpha = self.alpha
        alpha_fm = self.alpha_fm

        # model
        n = self.n
        z = self.z
        w = self.w

        # FM model
        n_fm = self.n_fm
        z_fm = self.z_fm
        w_fm = self.w_fm

        # cost gradient with respect to the raw prediction.
        g = p - y

        fm_sum = {}  # sums for calculating gradients for the FM weights.
        len_x = len(x)

        for i in x + [0]:
            # update the first order weights.
            sigma = (sqrt(n[i] + g * g) - sqrt(n[i])) / alpha
            z[i] += g - sigma * w[i]
            n[i] += g * g

            # initialize the sum of the FM interaction weights.
            fm_sum[i] = [0.] * self.fm_dim

        # sum the gradients for the FM interaction weights.
        for i in range(len_x):
            for j in range(len_x):
                if i != j:
                    for k in range(self.fm_dim):
                        fm_sum[x[i]][k] += w_fm[x[j]][k]

        for i in x:
            for k in range(self.fm_dim):
                g_fm = g * fm_sum[i][k]
                sigma = (sqrt(n_fm[i][k] + g_fm * g_fm) - sqrt(n_fm[i][k])) / alpha_fm
                z_fm[i][k] += g_fm - sigma * w_fm[i][k]
                n_fm[i][k] += g_fm * g_fm
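    # The per-coordinate FTRL-proximal recursion used above, for a gradient g:
    #   sigma = (sqrt(n + g^2) - sqrt(n)) / alpha
    #   z    += g - sigma * w
    #   n    += g^2
    # The lazy weights w are then recovered from z and n inside predict_raw.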
    def write_w(self, filePath):
        ''' Write the first order weights w out to a file. '''
        with open(filePath, "w") as f_out:
            for i, w in enumerate(self.w):
                f_out.write("%i,%f\n" % (i, w))

    def write_w_fm(self, filePath):
        ''' Write the factorization machine weights out to a file. '''
        with open(filePath, "w") as f_out:
            for k, w_fm in self.w_fm.items():
                f_out.write("%i,%s\n" % (k, ",".join([str(w) for w in w_fm])))

def logLoss(p, y):
    ''' Calculate the log loss cost.
    p: prediction, in [0, 1]
    y: actual value, in {0, 1}
    '''
    p = max(min(p, 1. - 1e-15), 1e-15)
    return -log(p) if y == 1. else -log(1. - p)
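# For example, a confident correct prediction is cheap while a confident wrong
# one is expensive: logLoss(.9, 1.) ~= 0.105, but logLoss(.9, 0.) ~= 2.303.
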
def data(filePath, hashSize, hashSalt):
    ''' Generator for the data, using the hash trick.
    INPUT:
        filePath: path to the training or test CSV
        hashSize: size of the hash space (D)
        hashSalt: string with which to salt the hash function
    '''
    with open(filePath) as f:
        for t, row in enumerate(DictReader(f)):
            ID = row['id']
            del row['id']

            y = 0.
            if 'click' in row:
                if row['click'] == '1':
                    y = 1.
                del row['click']

            date = int(row['hour'][4:6])   # day of month from the YYMMDDHH timestamp
            row['hour'] = row['hour'][6:]  # keep only the hour of day

            x = []
            for key in row:
                value = row[key]
                # 1 is added to the hash index because 0 indicates the bias term.
                index = abs(hash(hashSalt + key + '_' + value)) % hashSize + 1
                x.append(index)

            yield t, date, ID, x, y
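
# A minimal training-loop sketch (not part of the original file). The path
# 'train.csv', the salt, and all hyperparameters below are placeholder
# assumptions; the CSV is assumed to be in the Avazu format expected by data(),
# i.e. with 'id', 'click', and 'hour' columns.
if __name__ == '__main__':
    D = 2 ** 20  # size of the hash space (assumed)
    learner = FM_FTRL_machine(fm_dim=4, fm_initDev=.01, L1=1., L2=1.,
                              L1_fm=0., L2_fm=.1, D=D, alpha=.1, beta=1.)
    loss = 0.
    count = 0
    start = datetime.now()
    for t, date, ID, x, y in data('train.csv', D, 'salt'):
        p = learner.predict(x)
        loss += logLoss(p, y)
        count += 1
        learner.update(x, p, y)
        if count % 100000 == 0:
            print('%s\tseen: %d\tlogloss: %f' % (datetime.now() - start, count, loss / count))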