---
jupyter:
  jupytext:
    formats: ipynb,md
    text_representation:
      extension: .md
      format_name: markdown
      format_version: '1.2'
      jupytext_version: 1.4.2
  kernelspec:
    display_name: Python [conda env:python3]
    language: python
    name: conda-env-python3-py
---

# Surprise Algos on the KlasCement Data

```python
!pip install --upgrade scikit-learn
!pip install --upgrade scipy
!pip install --upgrade pandas
!pip install --upgrade seaborn
!pip install --upgrade uncertainties
!pip install --upgrade surprise
!pip install --upgrade s3fs
!pip install --upgrade jupytext
```
```python
import numpy as np
import seaborn as sns
import pandas as pd

import scipy
from scipy import sparse

import sklearn
from sklearn.metrics import ndcg_score

import surprise
from surprise import (Dataset, Reader, accuracy, NormalPredictor, BaselineOnly,
                      KNNBasic, KNNWithMeans, KNNWithZScore, SlopeOne,
                      CoClustering, SVD, NMF, SVDpp)
from surprise.model_selection import cross_validate, KFold, RepeatedKFold, train_test_split

import copy
import time
import sys
from itertools import starmap, product
import multiprocessing

import statistics
import uncertainties
from uncertainties import ufloat

print('The scikit-learn version is {}.'.format(sklearn.__version__))
print('The scipy version is {}.'.format(scipy.__version__))
print('The seaborn version is {}.'.format(sns.__version__))
print('The uncertainties version is {}.'.format(uncertainties.__version__))
print('The surprise version is {}.'.format(surprise.__version__))
print('The pandas version is {}.'.format(pd.__version__))
```

## Load data

```python
bucket = '045879944372-sagemaker-ml-dev'
df_ratings = pd.read_csv(
    filepath_or_buffer='s3://{}/{}'.format(bucket, 'klascement_ratings_int.csv'),
    dtype={
        'leermiddel_id': 'int32',
        'gebruiker_id': 'int32',
        'eng_score': 'int8',
    },
)
df_ratings
```

```python
df_ratings[['eng_score']].hist()
```

```python
nratings = df_ratings.shape[0]
n_users = np.int64(np.max(df_ratings[['gebruiker_id']].values))
n_items = np.int64(np.max(df_ratings[['leermiddel_id']].values))

# Round to 3 significant figures for readable output.
print("n_users = {:,}".format(int(float('%.3g' % n_users))))
print("n_items = {:,}".format(int(float('%.3g' % n_items))))
print("dense matrix size = {:,}".format(int(float('%.3g' % (n_users * n_items)))))
print("sparse matrix size = {:,}".format(int(float('%.3g' % nratings))))
print("sparsity = {:.2g} = 1 in {:,}".format(
    nratings / (n_users * n_items),
    int(float('%.3g' % (n_users * n_items / nratings)))))
```
```python
df_ratings_small = df_ratings[df_ratings['leermiddel_id'] <= 1000]
df_ratings_small = df_ratings_small[df_ratings_small['gebruiker_id'] <= 10000]

df_ratings_medium = df_ratings[df_ratings['leermiddel_id'] <= 2500]
df_ratings_medium = df_ratings_medium[df_ratings_medium['gebruiker_id'] <= 15000]

df_ratings_large = df_ratings[df_ratings['leermiddel_id'] <= 10000]
df_ratings_large = df_ratings_large[df_ratings_large['gebruiker_id'] <= 30000]
```

## Make Surprise datasets

```python
# A reader is still needed, but only the rating_scale param is required.
reader = Reader(rating_scale=(0, 10))

# The columns must correspond to user id, item id and ratings (in that order).
sds_small  = Dataset.load_from_df(df_ratings_small[['gebruiker_id', 'leermiddel_id', 'eng_score']], reader)
sds_medium = Dataset.load_from_df(df_ratings_medium[['gebruiker_id', 'leermiddel_id', 'eng_score']], reader)
sds_large  = Dataset.load_from_df(df_ratings_large[['gebruiker_id', 'leermiddel_id', 'eng_score']], reader)
sds_huge   = Dataset.load_from_df(df_ratings[['gebruiker_id', 'leermiddel_id', 'eng_score']], reader)
```
```python
def dataset_info(dataset):
    trainset = dataset.build_full_trainset()
    print("n_ratings = {:,}".format(int(float('%.3g' % trainset.n_ratings))))
    print("n_users = {:,}".format(int(float('%.3g' % trainset.n_users))))
    print("average number of ratings per user = {:,}".format(
        int(float('%.3g' % (trainset.n_ratings / trainset.n_users)))))

dataset_info(sds_small)
dataset_info(sds_medium)
dataset_info(sds_large)
```

## Train with Surprise

```python
algo = SVD()

# Run 2-fold cross-validation and print results.
cross_validate(algo, sds_small, measures=['RMSE'], cv=2, verbose=False)
# cross_validate(algo, sds_medium, measures=['RMSE'], cv=2, verbose=False)
# cross_validate(algo, sds_large, measures=['RMSE'], cv=2, verbose=False)
# cross_validate(algo, sds_huge, measures=['RMSE'], cv=2, verbose=False)
```

```python
cross_validate(algo, sds_huge, measures=['RMSE'], cv=2, verbose=False)
```

```
{'test_rmse': array([0.9106886 , 0.91098935]),
 'fit_time': (926.4300336837769, 933.39604139328),
 'test_time': (223.18039846420288, 209.75310754776)}
```

Surprise takes roughly 930 seconds per fold to fit SVD on the full dataset. Ouch.
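
Before re-running a full cross-validation, a cheaper way to sanity-check that cost is to time a single fit on the full trainset. A minimal sketch (this timing cell is an addition, not part of the original run):

```python
# Time one SVD fit on the full trainset instead of a full cross_validate pass.
trainset = sds_huge.build_full_trainset()

t0 = time.time()
SVD().fit(trainset)
print("fit took {:.0f} s".format(time.time() - t0))
```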

```python
cross_validate(NormalPredictor(), sds_large, measures=['RMSE'], cv=2, verbose=True)
# cross_validate(NormalPredictor(), sds_huge, measures=['RMSE'], cv=2, verbose=True)
```
```python
cross_validate(BaselineOnly(), sds_huge, measures=['RMSE'], cv=2, verbose=True)
cross_validate(BaselineOnly(), sds_large, measures=['RMSE'], cv=2, verbose=True)
```
```python
# MemoryError on the huge dataset, so run KNN on the large subset only.
cross_validate(KNNBasic(), sds_large, measures=['RMSE'], cv=2, verbose=True)
```
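
The MemoryError is unsurprising: Surprise's KNN algorithms materialize a dense n × n similarity matrix over all users (user-based is the default) or items. A back-of-the-envelope estimate (the helper below is illustrative, not from the original notebook):

```python
# Rough size of the dense float64 similarity matrix Surprise's KNN builds.
def knn_sim_matrix_gib(n):
    return n * n * 8 / 2**30  # n x n float64 entries, in GiB

print(knn_sim_matrix_gib(30_000))  # large subset (~30k users): ~6.7 GiB
```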
```python
# MemoryError on the huge dataset here as well.
cross_validate(SlopeOne(), sds_large, measures=['RMSE'], cv=2, verbose=True)
```
```python
# NMF raises ZeroDivisionError on this data.
# cross_validate(NMF(), sds_large, measures=['RMSE'], cv=2, verbose=True)
```
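
A plausible culprit (an assumption, not verified here): Surprise's NMF uses multiplicative updates that assume strictly positive ratings, and our scale starts at 0. Shifting the scores up by one would avoid zero ratings; a sketch:

```python
# Hypothetical workaround: shift ratings onto a strictly positive scale for NMF.
reader_pos = Reader(rating_scale=(1, 11))
df_shifted = df_ratings_large.assign(eng_score=df_ratings_large['eng_score'] + 1)
sds_large_pos = Dataset.load_from_df(
    df_shifted[['gebruiker_id', 'leermiddel_id', 'eng_score']], reader_pos)
cross_validate(NMF(), sds_large_pos, measures=['RMSE'], cv=2, verbose=True)
```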
```python
cross_validate(KNNWithZScore(), sds_large, measures=['RMSE'], cv=2, verbose=True)
```

## Train with Funk

```python
from sklearn.metrics import mean_absolute_error, mean_squared_error

# !pip install git+https://github.com/gbolmier/funk-svd
from funk_svd import SVD  # funk-svd's SVD shadows surprise's SVD from here on
```

```python
df_ratings.columns = ['u_id', 'i_id', 'rating']
train = df_ratings.sample(frac=0.9, random_state=7)
val = df_ratings.drop(train.index.tolist()).sample(n=300000, random_state=8)
test = df_ratings.drop(train.index.tolist()).drop(val.index.tolist())
test
```
```python
svd = SVD(learning_rate=0.001, regularization=0.005, n_epochs=100, n_factors=15,
          min_rating=0, max_rating=10)

svd.fit(X=train, X_val=val, early_stopping=True, shuffle=False)

pred = svd.predict(test)
# mae = mean_absolute_error(test['rating'], pred)
rmse = np.sqrt(mean_squared_error(test['rating'], pred))  # sqrt: mean_squared_error returns the MSE
rmse
```

```python
svd
```
```python
svd = SVD(learning_rate=0.001, regularization=0.005, n_epochs=100, n_factors=30,
          min_rating=0, max_rating=10)

svd.fit(X=train, X_val=val, early_stopping=True, shuffle=False)

pred = svd.predict(test)
rmse = np.sqrt(mean_squared_error(test['rating'], pred))
rmse
```
```python
svd = SVD(learning_rate=0.001, regularization=0.001, n_epochs=100, n_factors=30,
          min_rating=0, max_rating=10)

svd.fit(X=train, X_val=val, early_stopping=True, shuffle=False)

pred = svd.predict(test)
rmse = np.sqrt(mean_squared_error(test['rating'], pred))
rmse
```
```python
svd = SVD(learning_rate=0.001, regularization=0.001, n_epochs=100, n_factors=100,
          min_rating=0, max_rating=10)

svd.fit(X=train, X_val=val, early_stopping=True, shuffle=False)

pred = svd.predict(test)
rmse = np.sqrt(mean_squared_error(test['rating'], pred))
rmse
```
```python
svd = SVD(learning_rate=0.001, regularization=0.001, n_epochs=100, n_factors=300,
          min_rating=0, max_rating=10)

svd.fit(X=train, X_val=val, early_stopping=True, shuffle=False)

pred = svd.predict(test)
rmse = np.sqrt(mean_squared_error(test['rating'], pred))
rmse
```
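
The five near-identical cells above differ only in `regularization` and `n_factors`; a compact sweep (a sketch of the same experiment, not one of the original cells) avoids the copy-paste:

```python
# Sweep the hyper-parameters that the cells above vary by hand.
results = {}
for reg, k in [(0.005, 15), (0.005, 30), (0.001, 30), (0.001, 100), (0.001, 300)]:
    svd = SVD(learning_rate=0.001, regularization=reg, n_epochs=100,
              n_factors=k, min_rating=0, max_rating=10)
    svd.fit(X=train, X_val=val, early_stopping=True, shuffle=False)
    results[(reg, k)] = np.sqrt(mean_squared_error(test['rating'], svd.predict(test)))

results
```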
```python
# Reference: truncated SVD of a sparse matrix with scipy.
# from scipy.sparse import csc_matrix
# from scipy.sparse.linalg import svds
# A = csc_matrix([[1, 0, 0], [5, 0, 2], [0, -1, 0], [0, 0, 3]], dtype=float)
# u, s, vt = svds(A, k=2)  # k is the number of factors
# s  # array([2.75193379, 5.6059665])

# Algorithms tried above:
# NormalPredictor, BaselineOnly, KNNBasic, KNNWithMeans, KNNWithZScore,
# SlopeOne, CoClustering, SVD, NMF, SVDpp
```
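
Following that hint, a minimal sketch (untested here) of applying scipy's truncated SVD directly to the sparse ratings matrix, reusing the renamed `u_id`/`i_id`/`rating` columns from above:

```python
# Build the sparse user-item matrix from the renamed ratings frame and factor it.
from scipy.sparse import csr_matrix
from scipy.sparse.linalg import svds

R = csr_matrix((df_ratings['rating'].astype(float),
                (df_ratings['u_id'], df_ratings['i_id'])))
u, s, vt = svds(R, k=30)  # 30 latent factors

# Predicted score for one (user, item) pair; avoids materializing u @ diag(s) @ vt,
# which would be the full dense matrix.
def predict_one(uid, iid):
    return (u[uid] * s) @ vt[:, iid]
```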