surprise_models.py
import numpy as np
import pandas as pd
from surprise import Dataset, Reader, SVD, accuracy, dump
from surprise import AlgoBase, NMF, SVDpp, CoClustering
from keras.models import Sequential  # keras is only needed by the legacy blending code below
from keras.layers import Activation, Dense
from random import randint
from utils import *  # provides save_submission
import os
import gc
# WARNING: This file is mostly deprecated. Just use for Surprise Model creation
# WARNING: This file takes a long time to run
ordering = 'mu' # rows correspond to movie_ids; cols correspond to user_ids
submit = True  # set to True to save a submission on qual
save_model = False # set to True to save model parameters for future predictions
print('Loading data...')
#df = pd.read_csv(os.path.join('data', 'mu_train.csv'))
df = pd.read_csv(os.path.join('data', 'mu_probe.csv'))
# modify dataframe to reduce memory
del df['Unnamed: 0']
del df['Date Number']
df = df.astype('int32')
df_val = pd.read_csv(os.path.join('data', 'mu_val.csv'))
reader = Reader(rating_scale=(1, 5))
train_raw = Dataset.load_from_df(df[['User Number', 'Movie Number', 'Rating']], reader)
subset_length = 1000000
# To train on a subset, slice the dataframe before the load_from_df call above,
# e.g. df = df[:subset_length] (the Dataset object itself is not sliceable)
train = train_raw.build_full_trainset()
model_name = 'vanilla_SVD'
print('Solving SVD...')
model = SVD(n_epochs=20, verbose=True)
model.fit(train)
gc.collect()
# Model for NMF
# model_name = "NMF"
# print("Solving NMF Matrix Factorization")
# model = NMF(n_factors=5, n_epochs=20, verbose=True)
# model.fit(train)
# gc.collect()
# Model for CoCluster
# model_name = "cocluster"
# print("Solving CoCluster")
# model = CoClustering(n_cltr_u=7, n_cltr_i=5)
# model.fit(train)
# gc.collect()
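# Model for SVD++ (a hypothetical alternative sketch, not part of the original runs;
# SVDpp is already imported above and the hyperparameters below are illustrative)
# model_name = "SVDpp"
# print("Solving SVD++")
# model = SVDpp(n_factors=20, n_epochs=20, verbose=True)
# model.fit(train)
# gc.collect()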
def RMSE(true, pred):
    # computes root-mean-squared error given true ratings and predictions
    return np.sqrt(np.sum((true - pred)**2) / len(true))
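# Quick hand-checked sanity example for RMSE:
# sqrt(((5 - 4.5)**2 + (3 - 3.2)**2) / 2) = sqrt(0.145) ~ 0.3808
assert abs(RMSE(np.array([5.0, 3.0]), np.array([4.5, 3.2])) - 0.3808) < 1e-3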
#'''
train_pred = model.test(train.build_testset())
val_raw = Dataset.load_from_df(df_val[['User Number', 'Movie Number', 'Rating']], reader)
val = val_raw.build_full_trainset()
valset = val.build_testset()
pred = model.test(valset)
print('Train RMSE:', accuracy.rmse(train_pred, verbose=False))
print(model_name, 'Val RMSE:', accuracy.rmse(pred, verbose=False))
# WARNING: Moved blending entirely to blend.py. This is legacy code for safekeeping
#
# nmf_pred = nmf_model.test(valset)
# print('NMF Val RMSE:', accuracy.rmse(nmf_pred))
#
# cluster_pred = cluster_model.test(valset)
# print('Cluster Val RMSE:', accuracy.rmse(cluster_pred))
#
# # blend
#
# #real_svd = np.zeros(len(svd_pred))
# num_models = 3
# blend_pred = np.zeros((len(svd_pred), num_models))
#
# for j, pred in enumerate(svd_pred):
# blend_pred[j][0] = pred.est
#
# #real_nmf = np.zeros(len(nmf_pred))
# for j, pred in enumerate(nmf_pred):
# blend_pred[j][1] = pred.est
#
# #real_cluster = np.zeros(len(cluster_pred))
# for j, pred in enumerate(cluster_pred):
# blend_pred[j][2] = pred.est
#
# #blend_pred = blend_pred / 3
#
# # Use a keras neural net to learn the blend weights
#
# true_ratings = np.array([p.r_ui for p in svd_pred])
# nnmodel = Sequential()
# nnmodel.add(Dense(1, input_dim=num_models))
# nnmodel.compile(loss='mse', optimizer='adam')
# nnmodel.fit(blend_pred, true_ratings)
#
# real_blend_pred = nnmodel.predict(blend_pred).flatten()
#
# blend_rmse = RMSE(true_ratings, real_blend_pred)
#
# save_submission(model_name, real_blend_pred, ordering)
#'''
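# A minimal sketch of the linear blend now handled in blend.py (every name below is a
# hypothetical placeholder for that script's variables): stack each model's probe
# predictions column-wise and solve least squares against the true probe ratings.
# X = np.column_stack([svd_probe_pred, nmf_probe_pred, cocluster_probe_pred])
# weights, *_ = np.linalg.lstsq(X, true_probe_ratings, rcond=None)
# blended = X @ weights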
if save_model:
    print('Saving model...')
    # dump.dump takes predictions as its second positional argument, so pass the
    # fitted algorithm by keyword
    dump.dump(os.path.join('models', model_name), algo=model)
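    # A minimal reload sketch: surprise's dump.load returns a (predictions, algo)
    # tuple, so the fitted model saved above can be recovered for future predictions.
    _, reloaded_model = dump.load(os.path.join('models', model_name))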
if submit:
    # Save probe and qual predictions for blending
    print('Saving submission...')
    df_qual = pd.read_csv(os.path.join('data', 'mu_qual.csv'))
    pred = []
    for _, row in df_qual.iterrows():
        r_est = model.predict(row['User Number'], row['Movie Number']).est
        pred.append(r_est)
    save_submission(model_name + "_qual", pred, ordering)
    df_probe = pd.read_csv(os.path.join('data', 'mu_probe.csv'))
    pred = []
    for _, row in df_probe.iterrows():
        r_est = model.predict(row['User Number'], row['Movie Number']).est
        pred.append(r_est)
    save_submission(model_name + "_probe", pred, ordering)