Basic analysis of KlasCement Data
!pip install --upgrade scikit-learn
!pip install --upgrade "dask[dataframe]"
!pip install --upgrade dask
!pip install --upgrade s3fs
!pip install --upgrade pandas
!pip install --upgrade swifter
!pip install --upgrade seaborn
import numpy as np
import pandas as pd
import dask
import dask.dataframe as dd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
import seaborn as sns
from dask.diagnostics import ProgressBar
from IPython.display import display, HTML
import math
import s3fs
import swifter
from scipy import sparse
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_colwidth', 150)
print('The pandas version is {}.'.format(pd.__version__))
print('The dask version is {}.'.format(dask.__version__))
print('The s3fs version is {}.'.format(s3fs.__version__))
bucket = '045879944372-sagemaker-ml-dev'
df_nodate = pd.read_csv(
    filepath_or_buffer='s3://{}/{}'.format(bucket, 'klascement_no_dates_sorted.csv'),
    dtype={
        'res_cid': 'int32',
        'user_cid': 'int32',
        'event|favourited': 'int8',
        'score': 'int8',
        'event|clicked_through': 'int8',
        'event|previewed': 'int8',
        'event|downloaded': 'int8',
        'event|used': 'int8',
        'event|visited_detail_pg': 'int8'
    }
)
display(df_nodate.head(n=3))
print(df_nodate.shape)
def show_interaction_count(colname, color):
    # Count interactions per entity (resource or user) and plot the log10 distribution.
    interact_counts_df = pd.DataFrame(
        {'interaction_count': df_nodate.groupby(colname)[colname].count()}
    ).reset_index(drop=False).sort_values(by='interaction_count')
    interact_counts_df['log_interaction_count'] = interact_counts_df['interaction_count'].swifter.apply(math.log10)
    display(interact_counts_df.head(n=1))
    interact_counts_df.hist(column=['log_interaction_count'], color=color, bins=100)
    return interact_counts_df
res_interact_counts_df = show_interaction_count('res_cid', 'darkred')
user_interact_counts_df = show_interaction_count('user_cid', 'blue')
minimum_interactions_threshold = 10
users_gr_thr = user_interact_counts_df[user_interact_counts_df['interaction_count'] >= minimum_interactions_threshold].reset_index(drop=True)[['user_cid']]
users_less_thr = user_interact_counts_df[user_interact_counts_df['interaction_count'] < minimum_interactions_threshold].reset_index(drop=True)[['user_cid']]
display(users_less_thr.head(n=2))
print(users_less_thr.shape[0])
res_gr_thr = res_interact_counts_df[res_interact_counts_df['interaction_count'] >= minimum_interactions_threshold].reset_index(drop=True)[['res_cid']]
res_less_thr = res_interact_counts_df[res_interact_counts_df['interaction_count'] < minimum_interactions_threshold].reset_index(drop=True)[['res_cid']]
display(res_less_thr.head(n=2))
print(res_less_thr.shape[0])
Exclude all users and resources with fewer than 10 interactions
print(df_nodate.shape)
df_nodate = df_nodate.merge(res_gr_thr, on='res_cid', how='inner')
print(df_nodate.shape)
df_nodate = df_nodate.merge(users_gr_thr, on='user_cid', how='inner')
print(df_nodate.shape)
display(df_nodate.head(n=3))
_ = show_interaction_count('user_cid', 'blue')
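Note that the two inner merges interact: dropping sparse resources removes interaction rows, which can push some users back below the threshold, as the re-plotted user histogram shows. A minimal sketch of an iterative variant that filters until the row count stabilises; the df_stable loop is an illustration, not part of the original notebook:
# Illustration only: repeat both filters until no more rows are dropped.
df_stable = df_nodate.copy()
prev_rows = None
while prev_rows != df_stable.shape[0]:
    prev_rows = df_stable.shape[0]
    res_counts = df_stable.groupby('res_cid')['res_cid'].transform('count')
    df_stable = df_stable[res_counts >= minimum_interactions_threshold]
    user_counts = df_stable.groupby('user_cid')['user_cid'].transform('count')
    df_stable = df_stable[user_counts >= minimum_interactions_threshold]
print(df_stable.shape)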
df_nodate['event|goodscore'] = df_nodate['score'].swifter.apply(lambda x: 1 if x >= 4 else 0).astype('int8')
df_nodate['event|badscore'] = df_nodate['score'].swifter.apply(lambda x: 1 if x in (1, 2) else 0).astype('int8')
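The same flags can also be computed as plain vectorised comparisons, which avoids the per-row lambda entirely; an equivalent sketch:
# Vectorised equivalents of the two swifter.apply calls above.
df_nodate['event|goodscore'] = (df_nodate['score'] >= 4).astype('int8')
df_nodate['event|badscore'] = df_nodate['score'].isin([1, 2]).astype('int8')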
# del df_nodate['event|used']
corr_fr = df_nodate[['event|favourited', 'score', 'event|clicked_through', 'event|previewed', 'event|downloaded', 'event|used', 'event|visited_detail_pg']].corr()
def corrplot_of_corrdfr(corrdfr, cmap=sns.diverging_palette(20, 220, n=256)):
    plt.figure(figsize=(12, 12))
    ax = sns.heatmap(corrdfr, center=0, cmap=cmap, square=True, annot=True)
    ax.set_xticklabels(ax.get_xticklabels(), rotation=45, horizontalalignment='right')

corrplot_of_corrdfr(corr_fr)
sums = df_nodate.loc[:, df_nodate.columns.str.startswith("event|")].astype(bool).sum(axis=0).astype("int64")
infos = sums / (np.max(df_nodate['res_cid']).astype("int64") * np.max(df_nodate['user_cid']).astype("int64"))
# TODO: there are some magic numbers in here
# -log2(density) is the event's information content in bits; the -7 offset
# (a magic number, see TODO) shifts common, low-information events toward zero weight.
infos = infos.apply(lambda x: np.round(-7 - math.log2(x), 1))
infos['event|badscore'] = -1 * infos['event|badscore']
info_weight_dict = infos.to_dict()
info_weight_dict.items()
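To make the weighting concrete: an event that fills 1 in 2^20 cells of the user-resource matrix carries log2(2^20) = 20 bits of information, so its weight is 20 - 7 = 13, while one filling 1 in 2^10 cells gets 10 - 7 = 3. A small sketch with made-up densities:
# Hypothetical densities, purely to illustrate the weight formula above.
for density in (2 ** -10, 2 ** -15, 2 ** -20):
    weight = round(-7 - math.log2(density), 1)
    print('density = 2^{:.0f} -> weight = {}'.format(math.log2(density), weight))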
df_nodate['eng_score'] = df_nodate.swifter.apply(
    lambda x: sum([w * x[c] for c, w in info_weight_dict.items()]), axis=1
).astype('float16')
df_nodate['eng_score'] = df_nodate['eng_score'].swifter.apply(lambda x: max(x, 0)).astype('float16')
scaler = StandardScaler(with_mean=False)
df_nodate['eng_score'] = scaler.fit_transform(df_nodate[['eng_score']]).astype('float16')
df_nodate['eng_score'] = df_nodate['eng_score'].swifter.apply(lambda x: round(x, 0)).astype('int8')
# Here we limit the score to 5 in order to avoid a long tail of scores between 5 and 10
df_nodate['eng_score'] = df_nodate['eng_score'].swifter.apply(lambda x: min(x, 5)).astype('int8')
df_nodate.hist(column=['eng_score'], bins=20)
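Beyond the histogram, an exact tally per score value is a useful sanity check; this snippet is an addition to the original analysis:
# Tabulate the clipped engagement scores; everything should land in 0..5.
print(df_nodate['eng_score'].value_counts().sort_index())
assert df_nodate['eng_score'].between(0, 5).all()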
n_resources = np.max(df_nodate['res_cid']).astype('int64')
n_users = np.max(df_nodate['user_cid']).astype('int64')
print('n_resources ≈ {:,}'.format(int(float('%.3g' % n_resources))))
print('n_users ≈ {:,}'.format(int(float('%.3g' % n_users))))
poss_interactions = n_resources * n_users
print('poss_interactions ≈ {:,}'.format(int(float('%.3g' % poss_interactions))))
actual_interactions = df_nodate.shape[0]
print('actual_interactions ≈ {:,}'.format(int(float('%.3g' % actual_interactions))))
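The ratio of actual to possible interactions is the density of the interaction matrix; making it explicit (an addition) shows how sparse the problem is:
# Fraction of the user-resource matrix that is actually observed.
density = actual_interactions / poss_interactions
print('matrix density ≈ {:.4%}, sparsity ≈ {:.4%}'.format(density, 1 - density))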
df_rating = df_nodate[['res_cid', 'user_cid', 'eng_score']]
save_rating_data = False  # not defined earlier in this section; set to True to persist the ratings to S3
if save_rating_data:
    filename = 'klascement_ratings_05_filtered10'
    data_location = 's3://{}/{}'.format(bucket, filename)
    df_rating.to_csv(data_location + '.csv', encoding='utf-8', index=False, float_format='%.1f')
else:
    print("nothing saved, since save_rating_data = False")
# df_nodate['event|usednd'] = df_nodate.swifter.apply(lambda x: 1 if x['event|used'] == 1 and x['event|downloaded'] == 0 else 0, axis=1).astype('int8')
# def memuse_per_row(frame):
#     if 'dask' in str(type(frame)):
#         with ProgressBar():
#             df = frame.compute()
#     else:
#         df = frame
#     nrows = df.shape[0]
#     memuse = sum(df.memory_usage())
#     bytes_per_row = memuse / nrows
#     return bytes_per_row