From c972f79e0cb9b238c97c1dc550f48e2be780911f Mon Sep 17 00:00:00 2001
From: Nikolaos Triantafyllis
Date: Fri, 29 Apr 2022 16:46:20 +0300
Subject: [PATCH 01/89] Adding tooltip to variables (see REC-25)
---
report.html.prototype | 40 +++++++++++++++++++++++++++++++++++++---
1 file changed, 37 insertions(+), 3 deletions(-)
diff --git a/report.html.prototype b/report.html.prototype
index 4e29b9b..2c4406a 100644
--- a/report.html.prototype
+++ b/report.html.prototype
@@ -11,7 +11,7 @@
.card {
animation-name: fadein;
- animation-duration: 2s;
+ animation-duration: 1s;
}
@keyframes fadein {
@@ -33,7 +33,38 @@
color: #FAC0E7;
}
-
+span {
+ position: relative;
+}
+
+span:hover:after {
+ background: #333;
+ background: rgba(0, 0, 0, .8);
+ border-radius: 5px;
+ bottom: -34px;
+ color: #fff;
+ content: attr(gloss);
+ left: 20%;
+ padding: 5px 15px;
+ position: absolute;
+ z-index: 98;
+ width: 350px;
+ font-size: 14px;
+}
+
+span:hover:before {
+ border: solid;
+ border-color: #333 transparent;
+ border-width: 0 6px 6px 6px;
+ bottom: -4px;
+ content: "";
+ left: 50%;
+ position: absolute;
+ z-index: 99;
+ font-size: 14px;
+}
+
+
@@ -136,7 +167,10 @@ function fill(data){
for (item of fill_list) {
key = 'val_' + item
value = data[item]
- el = document.getElementById(key).textContent = value
+ value_doc=data[item+'_doc']
+ document.getElementById(key).textContent = value
+ document.getElementById(key).setAttribute('gloss', value_doc);
+ //document.getElementById(key).gloss=value_doc
}
}
From 73d2ed4e47827e0bda2870d31331235a36b9d4ea Mon Sep 17 00:00:00 2001
From: Nikolaos Triantafyllis
Date: Fri, 29 Apr 2022 16:47:34 +0300
Subject: [PATCH 02/89] Update README.md
---
README.md | 12 ++++++------
1 file changed, 6 insertions(+), 6 deletions(-)
diff --git a/README.md b/README.md
index b765902..28edea0 100644
--- a/README.md
+++ b/README.md
@@ -3,15 +3,15 @@ A framework for counting the recommender metrics
# Preprocessor v.0.2
-
-
+
+
# RS metrics v.0.2
-
-
+
+
@@ -62,8 +62,8 @@ optional arguments:
8. Configure `./preprocessor.py` by editting the `config.yaml` or providing another with `-c`:
-
-
+
+
From e7003b8e811dfa22c264f8587b73db95127aeb49 Mon Sep 17 00:00:00 2001
From: Nikolaos Triantafyllis
Date: Fri, 29 Apr 2022 18:09:20 +0300
Subject: [PATCH 03/89] Adding timestamp in UI (see REC-18)
---
report.html.prototype | 13 ++++++++++---
1 file changed, 10 insertions(+), 3 deletions(-)
diff --git a/report.html.prototype b/report.html.prototype
index 2c4406a..bc9e230 100644
--- a/report.html.prototype
+++ b/report.html.prototype
@@ -79,6 +79,7 @@ span:hover:before {
This report contains information about the detailed results of the evaluation process of the recommendation system as well as statistics
related to the ingested dataset of user actions and recommendations
+
@@ -151,6 +152,14 @@ span:hover:before {
+
+
+
+
+
+ Generated on:
+
+
@@ -162,7 +171,7 @@ span:hover:before {
function fill(data){
// what to fill
- let fill_list = ['users', 'recommendations', 'services', 'user_actions','user_actions_order', 'user_actions_registered', 'user_actions_anonymous', 'user_actions_order_registered', 'user_actions_order_anonymous', 'user_actions_registered_perc', 'user_actions_anonymous_perc', 'user_actions_order_registered_perc', 'user_actions_order_anonymous_perc', 'catalog_coverage_perc', 'user_coverage_perc'];
+ let fill_list = ['users', 'recommendations', 'services', 'user_actions','user_actions_order', 'user_actions_registered', 'user_actions_anonymous', 'user_actions_order_registered', 'user_actions_order_anonymous', 'user_actions_registered_perc', 'user_actions_anonymous_perc', 'user_actions_order_registered_perc', 'user_actions_order_anonymous_perc', 'catalog_coverage_perc', 'user_coverage_perc', 'timestamp'];
for (item of fill_list) {
key = 'val_' + item
@@ -170,8 +179,6 @@ function fill(data){
value_doc=data[item+'_doc']
document.getElementById(key).textContent = value
document.getElementById(key).setAttribute('gloss', value_doc);
- //document.getElementById(key).gloss=value_doc
-
}
}
From 9eca706037e6dba85f43addda1209c724b7a1782 Mon Sep 17 00:00:00 2001
From: Nikolaos Triantafyllis
Date: Fri, 6 May 2022 18:27:11 +0300
Subject: [PATCH 04/89] config: export services from page_map, set published to true
---
config.yaml | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/config.yaml b/config.yaml
index 6513966..62ac9d0 100644
--- a/config.yaml
+++ b/config.yaml
@@ -20,10 +20,10 @@ Service:
export: true
#from: 'user_actions'
#from: 'recommendations'
- from: 'source'
- #from: 'page_map'
+ #from: 'source'
+ from: 'page_map'
- published: false # applies only on source option
+ published: true # applies only on source option
User-actions:
merge: false # not implemented yet
From 9cd885d71f993a275226823683e63d76f8e75eec Mon Sep 17 00:00:00 2001
From: nikosT
Date: Sat, 7 May 2022 18:20:32 +0300
Subject: [PATCH 05/89] User actions are enriched according to REC-34.
 Additionally, the reward function has been altered so that the transition csv
 file is read only once, improving overall execution performance
---
preprocessor.py | 71 +++++++++++++++++++++++++++--------------------
reward_mapping.py | 4 +--
2 files changed, 43 insertions(+), 32 deletions(-)
diff --git a/preprocessor.py b/preprocessor.py
index 3be128e..27f381e 100755
--- a/preprocessor.py
+++ b/preprocessor.py
@@ -7,7 +7,7 @@
import os
from natsort import natsorted
import natsort as ns
-
+import pandas as pd
import retrieval
import reward_mapping as rm
@@ -128,9 +128,6 @@ def __init__(self, source_page_id, target_page_id, order):
recdb = myclient[config["Source"]["MongoDB"]["db"]]
-
-
-
# automatically associate page ids to service ids
if config['Service']['download']:
service_list_path = os.path.join(args.output,config['Service']['path'])
@@ -155,9 +152,20 @@ def __init__(self, source_page_id, target_page_id, order):
dmap=dict(zip(keys, values)) #=> {'a': 1, 'b': 2}
-uas={}
+# reward_mapping.py is modified so that the function
+# reads the Transition rewards csv file once
+# consequently, one argument has been added to the
+# called function
+ROOT_DIR='./'
+
+TRANSITION_REWARDS_CSV_PATH = os.path.join(
+ ROOT_DIR, "resources", "transition_rewards.csv"
+)
+transition_rewards_df = pd.read_csv(TRANSITION_REWARDS_CSV_PATH, index_col="source")
-for ua in recdb["user_action"].find(query):
+luas=[]
+
+for ua in recdb["user_action"].find(query).sort("user"):
# set -1 to anonymous users
try:
@@ -166,44 +174,48 @@ def __init__(self, source_page_id, target_page_id, order):
user=-1
# process data that map from page id to service id exist
+ # for both source and target page ids
+ # if not set service id to -1
try:
- _pageid="/"+"/".join(ua['target']['page_id'].split('/')[1:3])
- service_id=dmap[_pageid]
+ _pageid="/"+"/".join(ua['source']['page_id'].split('/')[1:3])
+ source_service_id=dmap[_pageid]
+ except:
+ source_service_id=-1
+ try:
+ _pageid="/"+"/".join(ua['target']['page_id'].split('/')[1:3])
+ target_service_id=dmap[_pageid]
except:
- continue
+ target_service_id=-1
- symbolic_reward=rm.ua_to_reward_id(User_Action(ua['source']['page_id'],
+ # function has been modified where one more argument is given
+ # in order to avoid time-consuming processing of reading csv file
+ # for every func call
+ symbolic_reward=rm.ua_to_reward_id(transition_rewards_df,
+ User_Action(ua['source']['page_id'],
ua['target']['page_id'],
ua['action']['order']))
reward=reward_mapping[symbolic_reward]
- uas.setdefault(user,{})
-
- # then we need to merge rewards
- # keep the max value for each record
- try:
- if uas[user][service_id][0] < reward:
- uas[user][service_id]=[reward, ua['source']['root']['type'], ua['timestamp']]
- except:
- uas[user].setdefault(service_id,[reward, ua['source']['root']['type'], ua['timestamp']])
-
-luas=[]
-
-for user,_ in natsorted(uas.items(),alg=ns.ns.SIGNED):
- for service,act in natsorted(uas[user].items(),alg=ns.ns.SIGNED):
-
- if service:
- luas.append('{},{},{},{},{}\n'.format(user, service, *act))
+ luas.append('{},{},{},{},{},{},{},{}\n'.format(user,
+ source_service_id,
+ target_service_id,
+ reward,
+ ua['source']['root']['type'],
+ ua['timestamp'],
+ ua['source']['page_id'],
+ ua['target']['page_id']))
+#luas=natsorted(luas,alg=ns.ns.SIGNED)
with open(os.path.join(args.output,'user_actions.csv'), 'w') as o:
o.writelines(luas)
+
recs=[]
-for rec in recdb["recommendation"].find(query):
+for rec in recdb["recommendation"].find(query).sort("user"):
try:
user=rec['user']
@@ -213,7 +225,7 @@ def __init__(self, source_page_id, target_page_id, order):
for service in rec['services']:
recs.append('{},{},{},{}\n'.format(user, service, '1', rec['timestamp']))
-recs=natsorted(recs,alg=ns.ns.SIGNED)
+#recs=natsorted(recs,alg=ns.ns.SIGNED)
with open(os.path.join(args.output,'recommendations.csv'), 'w') as o:
o.writelines(recs)
@@ -234,7 +246,6 @@ def __init__(self, source_page_id, target_page_id, order):
with open(os.path.join(args.output,'users.csv'), 'w') as o:
o.writelines(us)
-
# export service catalog
if config['Service']['export']:
diff --git a/reward_mapping.py b/reward_mapping.py
index 25891a4..1b8d0f4 100755
--- a/reward_mapping.py
+++ b/reward_mapping.py
@@ -47,7 +47,7 @@ def _to_abstract_page_id(page_id: str, valid_page_ids: List[str]) -> str:
return "unknown_page_id"
-def ua_to_reward_id(user_action) -> str:
+def ua_to_reward_id(transition_rewards_df, user_action) -> str:
"""
This function maps user_action to the symbolic reward.
Mapping is using ONLY following fields of user_action:
@@ -61,7 +61,7 @@ def ua_to_reward_id(user_action) -> str:
For now it just return generic reward id.
"""
- transition_rewards_df = pd.read_csv(TRANSITION_REWARDS_CSV_PATH, index_col="source")
+
valid_page_ids = transition_rewards_df.index.values.tolist()
source = _to_abstract_page_id(user_action.source.page_id, valid_page_ids)
From cfe4e88159aa21e079391e3c5bd00580a33103f2 Mon Sep 17 00:00:00 2001
From: Nikolaos Triantafyllis
Date: Mon, 9 May 2022 15:53:44 +0300
Subject: [PATCH 06/89] Removing merge (REC-32)
---
config.yaml | 3 ---
1 file changed, 3 deletions(-)
diff --git a/config.yaml b/config.yaml
index 62ac9d0..fd7e0e4 100644
--- a/config.yaml
+++ b/config.yaml
@@ -25,9 +25,6 @@ Service:
published: true # applies only on source option
-User-actions:
- merge: false # not implemented yet
-
# Calculate source's metrics
Metrics: true
From b0d913761ebefb308bae82083b3cbe50ff6b5a5a Mon Sep 17 00:00:00 2001
From: Nikolaos Triantafyllis
Date: Mon, 9 May 2022 16:14:54 +0300
Subject: [PATCH 07/89] Adding start and end time according to REC-29. Min and
max values are selected respectively between user_action and recommendation
collections' timestamps
---
preprocessor.py | 8 ++++++++
1 file changed, 8 insertions(+)
diff --git a/preprocessor.py b/preprocessor.py
index 27f381e..3ea3fba 100755
--- a/preprocessor.py
+++ b/preprocessor.py
@@ -274,6 +274,14 @@ def __init__(self, source_page_id, target_page_id, order):
m.timestamp=str(datetime.utcnow())
+ ua_start=recdb["user_action"].find_one(sort=[("timestamp", 1)])["timestamp"]
+ ua_end=recdb["user_action"].find_one(sort=[("timestamp", -1)])["timestamp"]
+ rec_start=recdb["recommendation"].find_one(sort=[("timestamp", 1)])["timestamp"]
+ rec_end=recdb["recommendation"].find_one(sort=[("timestamp", -1)])["timestamp"]
+
+ m.start=str(min(ua_start, rec_start))
+ m.end=str(max(ua_end, rec_end))
+
m.users=recdb["user"].count_documents({})
m.recommendations=recdb["recommendation"].count_documents(query)
m.services=recdb["service"].count_documents({})
From 9208aabce63140c3d67096bf85633f353d012ac2 Mon Sep 17 00:00:00 2001
From: Nikolaos Triantafyllis
Date: Mon, 9 May 2022 16:22:27 +0300
Subject: [PATCH 08/89] fixing bug in start and end metrics where start and end
arguments were not taken into account
---
preprocessor.py | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/preprocessor.py b/preprocessor.py
index 3ea3fba..2d0630c 100755
--- a/preprocessor.py
+++ b/preprocessor.py
@@ -274,10 +274,10 @@ def __init__(self, source_page_id, target_page_id, order):
m.timestamp=str(datetime.utcnow())
- ua_start=recdb["user_action"].find_one(sort=[("timestamp", 1)])["timestamp"]
- ua_end=recdb["user_action"].find_one(sort=[("timestamp", -1)])["timestamp"]
- rec_start=recdb["recommendation"].find_one(sort=[("timestamp", 1)])["timestamp"]
- rec_end=recdb["recommendation"].find_one(sort=[("timestamp", -1)])["timestamp"]
+ ua_start=recdb["user_action"].find_one(query,sort=[("timestamp", 1)])["timestamp"]
+ ua_end=recdb["user_action"].find_one(query,sort=[("timestamp", -1)])["timestamp"]
+ rec_start=recdb["recommendation"].find_one(query,sort=[("timestamp", 1)])["timestamp"]
+ rec_end=recdb["recommendation"].find_one(query,sort=[("timestamp", -1)])["timestamp"]
m.start=str(min(ua_start, rec_start))
m.end=str(max(ua_end, rec_end))
From ada065dc0c646de5a59ca867e32bde63cb8f0c88 Mon Sep 17 00:00:00 2001
From: Nikolaos Triantafyllis
Date: Mon, 9 May 2022 17:00:44 +0300
Subject: [PATCH 09/89] Needed commit so that rsmetrics can read the new
user_actions format
 (user,source_id,target_id,reward,action,timestamp,source_page,target_page)
---
metrics.py | 4 ++--
rsmetrics.py | 2 +-
2 files changed, 3 insertions(+), 3 deletions(-)
diff --git a/metrics.py b/metrics.py
index f901135..3681df9 100644
--- a/metrics.py
+++ b/metrics.py
@@ -38,12 +38,12 @@ def services(object):
"""
Calculate the total number of unique services
found in Pandas DataFrame object services (if provided)
- or user_actions otherwise
+ or user_actions otherwise (from both Source and Target Service)
"""
if isinstance(object.services, pd.DataFrame):
return int(object.services.nunique()['Service'])
else:
- return int(object.user_actions.nunique()['Service'])
+ return len(np.unique(np.concatenate([object.user_actions['Source_Service'].unique(),object.user_actions['Target_Service'].unique()])))
@doc('The total number of recommendations found in recommendations.csv')
diff --git a/rsmetrics.py b/rsmetrics.py
index 129bb42..3325e9e 100755
--- a/rsmetrics.py
+++ b/rsmetrics.py
@@ -73,7 +73,7 @@ def inner():
sys.exit(0)
# read data
-run.user_actions=pd.read_csv(os.path.join(args.input,'user_actions.csv'),names=["User", "Service", "Reward", "Action", "Timestamp"])
+run.user_actions=pd.read_csv(os.path.join(args.input,'user_actions.csv'),names=["User", "Source_Service", "Target_Service", "Reward", "Action", "Timestamp", "Source_Page_ID", "Target_Page_ID"])
run.recommendations=pd.read_csv(os.path.join(args.input,'recommendations.csv'),names=["User", "Service", "Rating", "Timestamp"])
# convert timestamp column to datetime object
From 1c65e667044bf37cdb136e6d427e84d2f61d68d9 Mon Sep 17 00:00:00 2001
From: Nikolaos Triantafyllis
Date: Mon, 9 May 2022 17:10:15 +0300
Subject: [PATCH 10/89] Set start and end metrics. Get the min/max from both
user_actions and recommendations csv files
---
metrics.py | 19 +++++++++++++++++++
1 file changed, 19 insertions(+)
diff --git a/metrics.py b/metrics.py
index 3681df9..bbee950 100644
--- a/metrics.py
+++ b/metrics.py
@@ -20,6 +20,25 @@ def wrapper(f):
# Metrics
+
+@doc('The initial date where metrics are calculated on')
+def start(object):
+ """
+ Calculate the start date where metrics are calculated on
+ found in min value between Pandas DataFrame object user_action
+ and recommendation
+ """
+ return str(min(min(object.user_actions['Timestamp']),min(object.recommendations['Timestamp'])))
+
+@doc('The final date where metrics are calculated on')
+def end(object):
+ """
+ Calculate the end date where metrics are calculated on
+ found in max value between Pandas DataFrame object user_action
+ and recommendation
+ """
+ return str(max(max(object.user_actions['Timestamp']),max(object.recommendations['Timestamp'])))
+
@doc('The total number of unique users found in users.csv (if provided), otherwise in user_actions.csv')
def users(object):
"""
From e2eb2a55a2834b8af37a6bfcf2b4d76e17dfd50d Mon Sep 17 00:00:00 2001
From: Nikolaos Triantafyllis
Date: Mon, 9 May 2022 17:42:35 +0300
Subject: [PATCH 11/89] Users and Services consistency according to REC-19.
Also, published config option affects calculations with service collection
---
config.yaml | 15 ++++++---------
preprocessor.py | 26 +++++++++-----------------
2 files changed, 15 insertions(+), 26 deletions(-)
diff --git a/config.yaml b/config.yaml
index fd7e0e4..5b4eb8e 100644
--- a/config.yaml
+++ b/config.yaml
@@ -7,23 +7,20 @@ Source:
User:
export: true
- #from: 'user_actions'
- #from: 'recommendations'
- from: 'source'
Service:
+ # if true it keeps only published, otherwise all
+ # this has an effect in exporting when from is set to 'source'
+ # and also in metrics calculations where service is considered
+ published: true
+
# Use the EOSC-Marketplace webpage
# to associate page_id and service_id
download: true
path: ./page_map
export: true
- #from: 'user_actions'
- #from: 'recommendations'
- #from: 'source'
- from: 'page_map'
-
- published: true # applies only on source option
+ from: 'page_map' # or 'source'
# Calculate source's metrics
Metrics: true
diff --git a/preprocessor.py b/preprocessor.py
index 2d0630c..761359c 100755
--- a/preprocessor.py
+++ b/preprocessor.py
@@ -234,14 +234,7 @@ def __init__(self, source_page_id, target_page_id, order):
# export user catalog
if config['User']['export']:
- if config['User']['from']=='user_actions':
- us=natsorted(list(set(list(map(lambda x: x.split(',')[0]+'\n',luas)))),alg=ns.ns.SIGNED)
-
- elif config['User']['from']=='recommendations':
- us=natsorted(list(set(list(map(lambda x: x.split(',')[0]+'\n',recs)))),alg=ns.ns.SIGNED)
-
- else: # 'source'
- us=natsorted(list(set(list(map(lambda x: str(x['_id'])+'\n',recdb["user"].find({}))))),alg=ns.ns.SIGNED)
+ us=natsorted(list(set(list(map(lambda x: str(x['_id'])+'\n',recdb["user"].find({}))))),alg=ns.ns.SIGNED)
with open(os.path.join(args.output,'users.csv'), 'w') as o:
o.writelines(us)
@@ -249,13 +242,7 @@ def __init__(self, source_page_id, target_page_id, order):
# export service catalog
if config['Service']['export']:
- if config['Service']['from']=='user_actions':
- ss=natsorted(list(set(list(map(lambda x: x.split(',')[1]+'\n',luas)))),alg=ns.ns.SIGNED)
-
- elif config['Service']['from']=='recommendations':
- ss=natsorted(list(set(list(map(lambda x: x.split(',')[1]+'\n',recs)))),alg=ns.ns.SIGNED)
-
- elif config['Service']['from']=='page_map':
+ if config['Service']['from']=='page_map':
ss=natsorted(list(set(list(map(lambda x: x+'\n',values)))),alg=ns.ns.SIGNED)
else: # 'source'
@@ -270,7 +257,6 @@ def __init__(self, source_page_id, target_page_id, order):
# calculate pre metrics
if config['Metrics']:
- time_range=recdb["user_action"].distinct("timestamp", query)
m.timestamp=str(datetime.utcnow())
@@ -283,8 +269,14 @@ def __init__(self, source_page_id, target_page_id, order):
m.end=str(max(ua_end, rec_end))
m.users=recdb["user"].count_documents({})
+
m.recommendations=recdb["recommendation"].count_documents(query)
- m.services=recdb["service"].count_documents({})
+
+ if config['Service']['published']:
+ m.services=recdb["service"].count_documents({"status":"published"})
+ else:
+ m.services=recdb["service"].count_documents({})
+
m.user_actions=recdb["user_action"].count_documents(query)
m.user_actions_registered=recdb["user_action"].count_documents({**query,**{"user":{"$exists":True}}})
From 605c842fc80f73e3229c83dfecd1d7bb48c04669 Mon Sep 17 00:00:00 2001
From: Nikolaos Triantafyllis
Date: Mon, 9 May 2022 17:45:37 +0300
Subject: [PATCH 12/89] Setting v.0.2.2 in both Preprocessor and RSmetrics
---
preprocessor.py | 2 +-
rsmetrics.py | 2 +-
2 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/preprocessor.py b/preprocessor.py
index 761359c..0648e3d 100755
--- a/preprocessor.py
+++ b/preprocessor.py
@@ -16,7 +16,7 @@
__copyright__ = "© "+str(datetime.utcnow().year)+", National Infrastructures for Research and Technology (GRNET)"
__status__ = "Production"
-__version__ = "0.2"
+__version__ = "0.2.2"
os.environ['COLUMNS'] = "90"
diff --git a/rsmetrics.py b/rsmetrics.py
index 3325e9e..d5a3fbb 100755
--- a/rsmetrics.py
+++ b/rsmetrics.py
@@ -14,7 +14,7 @@
__copyright__ = "© "+str(datetime.utcnow().year)+", National Infrastructures for Research and Technology (GRNET)"
__status__ = "Production"
-__version__ = "0.2"
+__version__ = "0.2.2"
os.environ['COLUMNS'] = "90"
From 1a9958db7dd876291e4cd2ceb32fddb9d1e64c8f Mon Sep 17 00:00:00 2001
From: Nikolaos Triantafyllis
Date: Thu, 12 May 2022 20:27:21 +0300
Subject: [PATCH 13/89] Pre-metrics have been encapsulated in functions and
appended with description (see REC-24)
---
preprocessor.py | 98 ++++++++++++++++++++++++++++++++++---------------
1 file changed, 69 insertions(+), 29 deletions(-)
diff --git a/preprocessor.py b/preprocessor.py
index 0648e3d..99d8cce 100755
--- a/preprocessor.py
+++ b/preprocessor.py
@@ -8,8 +8,11 @@
from natsort import natsorted
import natsort as ns
import pandas as pd
+from inspect import getmembers, isfunction
import retrieval
+# local lib
+import pre_metrics as pm
import reward_mapping as rm
from get_service_catalog import get_eosc_marketplace_url, get_service_catalog_items, get_service_catalog_page_content, save_service_items_to_csv
@@ -258,49 +261,86 @@ def __init__(self, source_page_id, target_page_id, order):
# calculate pre metrics
if config['Metrics']:
- m.timestamp=str(datetime.utcnow())
+ run=pm.Runtime()
+ run.recdb=recdb
+ run.query=query
+ run.config=config
- ua_start=recdb["user_action"].find_one(query,sort=[("timestamp", 1)])["timestamp"]
- ua_end=recdb["user_action"].find_one(query,sort=[("timestamp", -1)])["timestamp"]
- rec_start=recdb["recommendation"].find_one(query,sort=[("timestamp", 1)])["timestamp"]
- rec_end=recdb["recommendation"].find_one(query,sort=[("timestamp", -1)])["timestamp"]
+ md={'timestamp':str(datetime.utcnow())}
- m.start=str(min(ua_start, rec_start))
- m.end=str(max(ua_end, rec_end))
+ # get all functions found in pre_metrics module
+ # apart from 'doc' func
+ # run and save the result in dictionary
+ # where key is the name of the function
+ # and value what it returns
+ # whereas, for each found functions
+ # an extra key_doc element in dictionary is set
+ # to save the text of the function
+ funcs = list(map(lambda x: x[0], getmembers(pm, isfunction)))
+ funcs = list(filter(lambda x: not x=='doc',funcs))
+ for func in funcs:
+ md[func+'_doc']=getattr(pm, func).text
+ md[func]=getattr(pm, func)(run)
- m.users=recdb["user"].count_documents({})
- m.recommendations=recdb["recommendation"].count_documents(query)
+ jsonstr = json.dumps(md)
- if config['Service']['published']:
- m.services=recdb["service"].count_documents({"status":"published"})
- else:
- m.services=recdb["service"].count_documents({})
+ print(jsonstr)
+
+ # Using a JSON string
+ with open(os.path.join(args.output,'pre_metrics.json'), 'w') as outfile:
+ outfile.write(jsonstr)
+
+
+
+
+
+
+
+
+
+
+
+
+ import sys
+ sys.exit(0)
+
+
+
+ #m.timestamp=str(datetime.utcnow())
+
+ #m.users=recdb["user"].count_documents({})
+
+ #m.recommendations=recdb["recommendation"].count_documents(query)
- m.user_actions=recdb["user_action"].count_documents(query)
+ #if config['Service']['published']:
+ # m.services=recdb["service"].count_documents({"status":"published"})
+ #else:
+ # m.services=recdb["service"].count_documents({})
- m.user_actions_registered=recdb["user_action"].count_documents({**query,**{"user":{"$exists":True}}})
- m.user_actions_anonymous=m.user_actions-m.user_actions_registered
- m.user_actions_registered_perc=round(m.user_actions_registered*100.0/m.user_actions,2)
- m.user_actions_anonymous_perc=100-m.user_actions_registered_perc
+ #m.user_actions=recdb["user_action"].count_documents(query)
- m.user_actions_order=recdb["user_action"].count_documents({**query, **{"action.order":True}})
- m.user_actions_order_registered=recdb["user_action"].count_documents({**query, **{"action.order":True,"user":{"$exists":True}}})
- m.user_actions_order_anonymous=m.user_actions_order-m.user_actions_order_registered
- m.user_actions_order_registered_perc=round(m.user_actions_order_registered*100.0/m.user_actions_order,2)
- m.user_actions_order_anonymous_perc=100-m.user_actions_order_registered_perc
+ #m.user_actions_registered=recdb["user_action"].count_documents({**query,**{"user":{"$exists":True}}})
+ #m.user_actions_anonymous=m.user_actions-m.user_actions_registered
+ #m.user_actions_registered_perc=round(m.user_actions_registered*100.0/m.user_actions,2)
+ #m.user_actions_anonymous_perc=100-m.user_actions_registered_perc
- m.user_actions_panel=recdb["user_action"].count_documents({**query, **{"source.root.type":"recommendation_panel"}})
- m.user_actions_panel_perc=round(m.user_actions_panel*100.0/m.user_actions,2)
+ #m.user_actions_order=recdb["user_action"].count_documents({**query, **{"action.order":True}})
+ #m.user_actions_order_registered=recdb["user_action"].count_documents({**query, **{"action.order":True,"user":{"$exists":True}}})
+ #m.user_actions_order_anonymous=m.user_actions_order-m.user_actions_order_registered
+ #m.user_actions_order_registered_perc=round(m.user_actions_order_registered*100.0/m.user_actions_order,2)
+ #m.user_actions_order_anonymous_perc=100-m.user_actions_order_registered_perc
- m.service_catalog=len(recdb["recommendation"].distinct("services", query))
+ #m.user_actions_panel=recdb["user_action"].count_documents({**query, **{"source.root.type":"recommendation_panel"}})
+ #m.user_actions_panel_perc=round(m.user_actions_panel*100.0/m.user_actions,2)
# catalog coverage
- m.service_catalog_perc=round(m.service_catalog*100.0/m.services,2)
+ #m.service_catalog=len(recdb["recommendation"].distinct("services", query))
+ #m.service_catalog_perc=round(m.service_catalog*100.0/m.services,2)
# user coverage
- m.user_catalog=len(recdb["user_action"].distinct("user", query))
- m.user_catalog_perc=round(m.user_catalog*100.0/m.users,2)
+ #m.user_catalog=len(recdb["user_action"].distinct("user", query))
+ #m.user_catalog_perc=round(m.user_catalog*100.0/m.users,2)
jsonstr = json.dumps(m.__dict__)
print(jsonstr)
From 2b05222be08ce9c936c21af8e0f5480515c201dd Mon Sep 17 00:00:00 2001
From: Nikolaos Triantafyllis
Date: Fri, 13 May 2022 12:45:00 +0300
Subject: [PATCH 14/89] Pre-metrics have been encapsulated in functions and
appended with description (see REC-24) [Missing pre_metrics.py file added]
---
pre_metrics.py | 227 +++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 227 insertions(+)
create mode 100644 pre_metrics.py
diff --git a/pre_metrics.py b/pre_metrics.py
new file mode 100644
index 0000000..668b805
--- /dev/null
+++ b/pre_metrics.py
@@ -0,0 +1,227 @@
+#!/usr/bin/env python3
+import pandas as pd
+import numpy as np
+
+
+class Runtime:
+ def __init__(self):
+ self.query={}
+ self.recdb=None
+ self.config=None
+
+# decorator to add the text attribute to function
+def doc(r):
+ def wrapper(f):
+ f.text = r
+ return f
+ return wrapper
+
+
+# Pre Metrics
+
+
+@doc('The initial date where metrics are calculated on')
+def start(object):
+ """
+ Calculate the start date where metrics are calculated on
+ found in min value between source object user_action
+ and recommendation
+ """
+ ua_start=object.recdb["user_action"].find_one(object.query,sort=[("timestamp", 1)])["timestamp"]
+ rec_start=object.recdb["recommendation"].find_one(object.query,sort=[("timestamp", 1)])["timestamp"]
+
+ return str(min(ua_start, rec_start))
+
+
+@doc('The final date where metrics are calculated on')
+def end(object):
+ """
+ Calculate the end date where metrics are calculated on
+ found in max value between source object user_action
+ and recommendation
+ """
+ ua_end=object.recdb["user_action"].find_one(object.query,sort=[("timestamp", -1)])["timestamp"]
+ rec_end=object.recdb["recommendation"].find_one(object.query,sort=[("timestamp", -1)])["timestamp"]
+ return str(max(ua_end, rec_end))
+
+
+@doc('The total number of unique users found in source')
+def users(object):
+ """
+ Calculate the total number of unique users
+ found in source object
+ """
+ return object.recdb["user"].count_documents({})
+
+
+@doc('The total number of unique services found in source (default to published only)')
+def services(object):
+ """
+ Calculate the total number of unique services
+ found in source object (default to published only)
+ """
+ if object.config['Service']['published']:
+ return object.recdb["service"].count_documents({"status":"published"})
+ else:
+ return object.recdb["service"].count_documents({})
+
+
+@doc('The total number of recommendations found in source')
+def recommendations(object):
+ """
+ Calculate the total number of recommendations
+ found in source object
+ """
+ return object.recdb["recommendation"].count_documents(object.query)
+
+
+@doc('The total number of user actions found in source')
+def user_actions(object):
+ """
+ Calculate the total number of user_actions
+ found in source object
+ """
+ return object.recdb["user_action"].count_documents(object.query)
+
+
+@doc('The total number of user actions occurred by registered users found in source')
+def user_actions_registered(object):
+ """
+ Calculate the total number of user_actions occurred by registered users
+ found in source object
+ """
+ return object.recdb["user_action"].count_documents({**object.query,**{"user":{"$exists":True}}})
+
+
+@doc('The total number of user actions occurred by anonymous users found in source')
+def user_actions_anonymous(object):
+ """
+ Calculate the total number of user_actions occurred by anonymous users
+ found in source object
+ """
+ return user_actions(object)-user_actions_registered(object)
+
+
+@doc('The percentage (%) of user actions occurred by registered users to the total user actions')
+def user_actions_registered_perc(object):
+ """
+ Calculate the percentage (%) of user actions occurred
+ by registered users to the total user actions
+ found in source object user_actions (in two decimals)
+ """
+ return round(user_actions_registered(object)*100.0/user_actions(object),2)
+
+
+@doc('The percentage (%) of user actions occurred by anonymous users to the total user actions')
+def user_actions_anonymous_perc(object):
+ """
+ Calculate the percentage (%) of user actions occurred
+ by anonymous users to the total user actions
+ found in source object user_actions (in two decimals)
+ """
+ return round(100.0-user_actions_registered_perc(object),2)
+
+
+@doc('The total number of user actions led to order found in source')
+def user_actions_order(object):
+ """
+ Calculate the total number of user_actions led to order
+ found in source object user_actions
+ """
+ return object.recdb["user_action"].count_documents({**object.query, **{"action.order":True}})
+
+
+@doc('The total number of user actions led to order by registered users found in source')
+def user_actions_order_registered(object):
+ """
+ Calculate the total number of user_actions led to order by registered users
+ found in source object user_actions
+ """
+ return object.recdb["user_action"].count_documents({**object.query, **{"action.order":True,"user":{"$exists":True}}})
+
+
+@doc('The total number of user actions led to order by anonymous users found in source')
+def user_actions_order_anonymous(object):
+ """
+ Calculate the total number of user_actions led to order by anonymous users
+ found in source object user_actions
+ """
+ return user_actions_order(object)-user_actions_order_registered(object)
+
+
+@doc('The percentage (%) of user actions occurred by registered users and led to order to the total user actions that led to order')
+def user_actions_order_registered_perc(object):
+ """
+ Calculate the percentage (%) of user actions occurred
+ by registered users and led to order to the total user actions that led to order
+ found in source object user_actions (in two decimals)
+ """
+ return round(user_actions_order_registered(object)*100.0/user_actions_order(object),2)
+
+
+@doc('The percentage (%) of user actions occurred by anonymous users and led to order to the total user actions that led to order')
+def user_actions_order_anonymous_perc(object):
+ """
+ Calculate the percentage (%) of user actions occurred
+ by anonymous users and led to order to the total user actions that led to order
+ found in source object user_actions (in two decimals)
+ """
+ return round(100.0-user_actions_order_registered_perc(object),2)
+
+
+@doc('The total number of user actions assosicated with the recommendation panel found in source')
+def user_actions_panel(object):
+ """
+ Calculate the total number of user_actions assosicated with the recommendation panel
+ found in source object user_actions
+ """
+ return object.recdb["user_action"].count_documents({**object.query, **{"source.root.type":"recommendation_panel"}})
+
+
+@doc('The percentage (%) of user actions assosicated with the recommendation panel to the total user actions')
+def user_actions_panel_perc(object):
+ """
+ Calculate the percentage (%) of user actions assosicated with
+ the recommendation panel to the total user actions
+ found in source object user_actions (in two decimals)
+ """
+ return round(user_actions_panel(object)*100.0/user_actions(object),2)
+
+
+@doc('The total number of unique services found in source')
+def catalog_coverage(object):
+ """
+ Calculate the total number of unique services
+ found in recommendations.csv
+ """
+ return len(object.recdb["recommendation"].distinct("services", object.query))
+
+
+@doc('The percentage (%) of unique services found in recommedations.csv to the total number of services (provided or found otherwise in source)')
+def catalog_coverage_perc(object):
+ """
+ Calculate the percentage (%) of unique services
+ found in source
+ """
+ return round(catalog_coverage(object)*100.0/services(object),2)
+
+
+@doc('The total number of unique users found in recommendations.csv')
+def user_coverage(object):
+ """
+ Calculate the total number of unique users
+ found in source
+ """
+ return len(object.recdb["user_action"].distinct("user", object.query))
+
+
+@doc('The percentage (%) of unique users found in recommedations.csv to the total number of users (provided or found otherwise in source)')
+def user_coverage_perc(object):
+ """
+ Calculate the percentage (%) of unique users
+ found in source to the total number
+ of users
+ """
+ return round(user_coverage(object)*100.0/users(object),2)
+
+
From ba71c5afc8998965d6e6ea22c92647672100cc07 Mon Sep 17 00:00:00 2001
From: Nikolaos Triantafyllis
Date: Fri, 13 May 2022 14:47:58 +0300
Subject: [PATCH 15/89] Bug-Fix in user_coverage calculation
---
config.yaml | 2 +-
pre_metrics.py | 30 +++++++++++++++---------------
preprocessor.py | 22 ----------------------
3 files changed, 16 insertions(+), 38 deletions(-)
diff --git a/config.yaml b/config.yaml
index 5b4eb8e..77590d5 100644
--- a/config.yaml
+++ b/config.yaml
@@ -12,7 +12,7 @@ Service:
# if true it keeps only published, otherwise all
# this has an effect in exporting when from is set to 'source'
# and also in metrics calculations where service is considered
- published: true
+ published: false
# Use the EOSC-Marketplace webpage
# to associate page_id and service_id
diff --git a/pre_metrics.py b/pre_metrics.py
index 668b805..943761b 100644
--- a/pre_metrics.py
+++ b/pre_metrics.py
@@ -45,7 +45,7 @@ def end(object):
return str(max(ua_end, rec_end))
-@doc('The total number of unique users found in source')
+@doc('The total number of unique users found in users of the source')
def users(object):
"""
Calculate the total number of unique users
@@ -54,7 +54,7 @@ def users(object):
return object.recdb["user"].count_documents({})
-@doc('The total number of unique services found in source (default to published only)')
+@doc('The total number of unique services found in services of the source')
def services(object):
"""
Calculate the total number of unique services
@@ -66,7 +66,7 @@ def services(object):
return object.recdb["service"].count_documents({})
-@doc('The total number of recommendations found in source')
+@doc('The total number of recommendations found in recommendations of the source')
def recommendations(object):
"""
Calculate the total number of recommendations
@@ -75,7 +75,7 @@ def recommendations(object):
return object.recdb["recommendation"].count_documents(object.query)
-@doc('The total number of user actions found in source')
+@doc('The total number of user actions found in user actions of the source')
def user_actions(object):
"""
Calculate the total number of user_actions
@@ -84,7 +84,7 @@ def user_actions(object):
return object.recdb["user_action"].count_documents(object.query)
-@doc('The total number of user actions occurred by registered users found in source')
+@doc('The total number of user actions occurred by registered users found in user actions of the source')
def user_actions_registered(object):
"""
Calculate the total number of user_actions occurred by registered users
@@ -93,7 +93,7 @@ def user_actions_registered(object):
return object.recdb["user_action"].count_documents({**object.query,**{"user":{"$exists":True}}})
-@doc('The total number of user actions occurred by anonymous users found in source')
+@doc('The total number of user actions occurred by anonymous users found in user actions of the source')
def user_actions_anonymous(object):
"""
Calculate the total number of user_actions occurred by anonymous users
@@ -122,7 +122,7 @@ def user_actions_anonymous_perc(object):
return round(100.0-user_actions_registered_perc(object),2)
-@doc('The total number of user actions led to order found in source')
+@doc('The total number of user actions led to order found in user actions of the source')
def user_actions_order(object):
"""
Calculate the total number of user_actions led to order
@@ -131,7 +131,7 @@ def user_actions_order(object):
return object.recdb["user_action"].count_documents({**object.query, **{"action.order":True}})
-@doc('The total number of user actions led to order by registered users found in source')
+@doc('The total number of user actions led to order by registered users found in user actions of the source')
def user_actions_order_registered(object):
"""
Calculate the total number of user_actions led to order by registered users
@@ -140,7 +140,7 @@ def user_actions_order_registered(object):
return object.recdb["user_action"].count_documents({**object.query, **{"action.order":True,"user":{"$exists":True}}})
-@doc('The total number of user actions led to order by anonymous users found in source')
+@doc('The total number of user actions led to order by anonymous users found in user actions of the source')
def user_actions_order_anonymous(object):
"""
Calculate the total number of user_actions led to order by anonymous users
@@ -169,7 +169,7 @@ def user_actions_order_anonymous_perc(object):
return round(100.0-user_actions_order_registered_perc(object),2)
-@doc('The total number of user actions assosicated with the recommendation panel found in source')
+@doc('The total number of user actions assosicated with the recommendation panel found in user actions of the source')
def user_actions_panel(object):
"""
Calculate the total number of user_actions assosicated with the recommendation panel
@@ -188,7 +188,7 @@ def user_actions_panel_perc(object):
return round(user_actions_panel(object)*100.0/user_actions(object),2)
-@doc('The total number of unique services found in source')
+@doc('The total number of unique services found in recommendations of the source')
def catalog_coverage(object):
"""
Calculate the total number of unique services
@@ -197,7 +197,7 @@ def catalog_coverage(object):
return len(object.recdb["recommendation"].distinct("services", object.query))
-@doc('The percentage (%) of unique services found in recommedations.csv to the total number of services (provided or found otherwise in source)')
+@doc('The percentage (%) of unique services found in recommedations of the source to the total number of services (provided or found otherwise in source)')
def catalog_coverage_perc(object):
"""
Calculate the percentage (%) of unique services
@@ -206,16 +206,16 @@ def catalog_coverage_perc(object):
return round(catalog_coverage(object)*100.0/services(object),2)
-@doc('The total number of unique users found in recommendations.csv')
+@doc('The total number of unique users found in recommendations of the source')
def user_coverage(object):
"""
Calculate the total number of unique users
found in source
"""
- return len(object.recdb["user_action"].distinct("user", object.query))
+ return len(object.recdb["recommendation"].distinct("user", object.query))
-@doc('The percentage (%) of unique users found in recommedations.csv to the total number of users (provided or found otherwise in source)')
+@doc('The percentage (%) of unique users found in recommedations of the source to the total number of users (provided or found otherwise in source)')
def user_coverage_perc(object):
"""
Calculate the percentage (%) of unique users
diff --git a/preprocessor.py b/preprocessor.py
index 99d8cce..a37c785 100755
--- a/preprocessor.py
+++ b/preprocessor.py
@@ -291,22 +291,6 @@ def __init__(self, source_page_id, target_page_id, order):
with open(os.path.join(args.output,'pre_metrics.json'), 'w') as outfile:
outfile.write(jsonstr)
-
-
-
-
-
-
-
-
-
-
-
- import sys
- sys.exit(0)
-
-
-
#m.timestamp=str(datetime.utcnow())
#m.users=recdb["user"].count_documents({})
@@ -342,9 +326,3 @@ def __init__(self, source_page_id, target_page_id, order):
#m.user_catalog=len(recdb["user_action"].distinct("user", query))
#m.user_catalog_perc=round(m.user_catalog*100.0/m.users,2)
- jsonstr = json.dumps(m.__dict__)
- print(jsonstr)
-
- # Using a JSON string
- with open(os.path.join(args.output,'pre_metrics.json'), 'w') as outfile:
- outfile.write(jsonstr)
From d94af93da24ff4bd23013e21a4e824bc21a8345f Mon Sep 17 00:00:00 2001
From: Nikolaos Triantafyllis
Date: Fri, 13 May 2022 15:51:47 +0300
Subject: [PATCH 16/89] default to publish
---
config.yaml | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/config.yaml b/config.yaml
index 77590d5..5b4eb8e 100644
--- a/config.yaml
+++ b/config.yaml
@@ -12,7 +12,7 @@ Service:
# if true it keeps only published, otherwise all
# this has an effect in exporting when from is set to 'source'
# and also in metrics calculations where service is considered
- published: false
+ published: true
# Use the EOSC-Marketplace webpage
# to associate page_id and service_id
From ed3933f79ed3e63a8be16c484673709ff1d90c6d Mon Sep 17 00:00:00 2001
From: Nikolaos Triantafyllis
Date: Fri, 13 May 2022 17:31:47 +0300
Subject: [PATCH 17/89] Resolving ticket REC-36 for Consistency between
users/services in user_actions/recommendations
---
rsmetrics.py | 13 ++++++++++++-
1 file changed, 12 insertions(+), 1 deletion(-)
diff --git a/rsmetrics.py b/rsmetrics.py
index d5a3fbb..1c3b658 100755
--- a/rsmetrics.py
+++ b/rsmetrics.py
@@ -80,7 +80,7 @@ def inner():
run.user_actions['Timestamp']= pd.to_datetime(run.user_actions['Timestamp'])
run.recommendations['Timestamp']= pd.to_datetime(run.recommendations['Timestamp'])
-# restrict data to datetime range
+# restrict user actions and recommendations data to datetime range
if args.starttime:
run.user_actions=run.user_actions[(run.user_actions['Timestamp'] > args.starttime) & (run.user_actions['Timestamp'] < args.endtime)]
run.recommendations=run.recommendations[(run.recommendations['Timestamp'] > args.starttime) & (run.recommendations['Timestamp'] < args.endtime)]
@@ -98,6 +98,17 @@ def inner():
if args.services:
run.services=pd.read_csv(os.path.join(args.input,'services.csv'),names=["Service"])
+# remove user actions when user or service does not exist in users' or services' catalogs
+# adding -1 to all catalogs to represent anonymous users or unknown services
+run.user_actions = run.user_actions[run.user_actions['User'].isin(run.users['User'].tolist()+[-1])]
+run.user_actions = run.user_actions[run.user_actions['Source_Service'].isin(run.services['Service'].tolist()+[-1])]
+run.user_actions = run.user_actions[run.user_actions['Target_Service'].isin(run.services['Service'].tolist()+[-1])]
+
+# remove recommendations when user or service does not exist in users' or services' catalogs
+# adding -1 to all catalogs to represent anonymous users or unknown services
+run.recommendations = run.recommendations[run.recommendations['User'].isin(run.users['User'].tolist()+[-1])]
+run.recommendations = run.recommendations[run.recommendations['Service'].isin(run.services['Service'].tolist()+[-1])]
+
md={'timestamp':str(datetime.utcnow())}
From d69efc4a044ed4c75d398016df692a728d7789f6 Mon Sep 17 00:00:00 2001
From: Nikolaos Triantafyllis
Date: Fri, 13 May 2022 17:36:45 +0300
Subject: [PATCH 18/89] RSmetrics typical usage added
---
README.md | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/README.md b/README.md
index 28edea0..9e92bb1 100644
--- a/README.md
+++ b/README.md
@@ -68,7 +68,7 @@ optional arguments:
-9. Run from terminal: `./rsmetrics.py` to run RSmetrics
+9. Run from terminal: `./rsmetrics.py --users --services` to run RSmetrics and include the users.csv and services.csv files genrated by the Preprocessor
```bash
_____ _____ _ _
| __ \ / ____| | | (_)
From 1c9e4f148fe78628d025467c791b864832d6b7f9 Mon Sep 17 00:00:00 2001
From: Nikolaos Triantafyllis
Date: Fri, 13 May 2022 17:37:35 +0300
Subject: [PATCH 19/89] RSmetrics typical usage added (fix)
---
README.md | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/README.md b/README.md
index 9e92bb1..e04a175 100644
--- a/README.md
+++ b/README.md
@@ -68,7 +68,7 @@ optional arguments:
-9. Run from terminal: `./rsmetrics.py --users --services` to run RSmetrics and include the users.csv and services.csv files genrated by the Preprocessor
+9. Run from terminal: `./rsmetrics.py --users --services` to run RSmetrics and include the `users.csv` and `services.csv` files generated by the Preprocessor
```bash
_____ _____ _ _
| __ \ / ____| | | (_)
From 62160b0970d4b5991e8ec8e4fcf38efc00f40c08 Mon Sep 17 00:00:00 2001
From: Nikolaos Triantafyllis
Date: Fri, 13 May 2022 17:39:50 +0300
Subject: [PATCH 20/89] RSmetrics typical usage added (fix config)
---
README.md | 34 +++++++++++++++++++++++++++++-----
1 file changed, 29 insertions(+), 5 deletions(-)
diff --git a/README.md b/README.md
index e04a175..bbdde53 100644
--- a/README.md
+++ b/README.md
@@ -61,12 +61,36 @@ optional arguments:
```
8. Configure `./preprocessor.py` by editting the `config.yaml` or providing another with `-c`:
-
-
-
-
-
+```yaml
+
+# Set the desired connector (e.g. MongoDB)
+Source:
+ MongoDB:
+ host: localhost
+ port: 27017
+ db: recommender_dev
+
+User:
+ export: true
+Service:
+ # if true it keeps only published, otherwise all
+ # this has an effect in exporting when from is set to 'source'
+ # and also in metrics calculations where service is considered
+ published: true
+
+ # Use the EOSC-Marketplace webpage
+ # to associate page_id and service_id
+ download: true
+ path: ./page_map
+
+ export: true
+ from: 'page_map' # or 'source'
+
+# Calculate source's metrics
+Metrics: true
+
+```
9. Run from terminal: `./rsmetrics.py --users --services` to run RSmetrics and include the `users.csv` and `services.csv` files generated by the Preprocessor
```bash
From b8db5db0c8e538073331bec3c96bbcaa212bb45d Mon Sep 17 00:00:00 2001
From: Nikolaos Triantafyllis
Date: Fri, 13 May 2022 18:32:00 +0300
Subject: [PATCH 21/89] Resolving ticket REC-36 for Consistency between
users/services in user_actions/recommendations. Counting no appearance of
user in recommendations as an anonymous user; thus +1 in user_coverage
---
pre_metrics.py | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/pre_metrics.py b/pre_metrics.py
index 943761b..c24acf8 100644
--- a/pre_metrics.py
+++ b/pre_metrics.py
@@ -212,6 +212,10 @@ def user_coverage(object):
Calculate the total number of unique users
found in source
"""
+ # count also anonymous user as a user
+ anonymous=object.recdb["recommendation"].find_one({'user':None})
+ if anonymous:
+ return len(object.recdb["recommendation"].distinct("user", object.query))+1
return len(object.recdb["recommendation"].distinct("user", object.query))
From f919b0a0ad057781482e56bdd73c4f6132950d62 Mon Sep 17 00:00:00 2001
From: Nikolaos Triantafyllis
Date: Tue, 24 May 2022 00:03:29 +0300
Subject: [PATCH 22/89] Adding new metrics: recommendations_registered,
recommendations_anonymous with their percentages too
---
metrics.py | 37 +++++++++++++++++++++++++++++++++++++
1 file changed, 37 insertions(+)
diff --git a/metrics.py b/metrics.py
index bbee950..73a482f 100644
--- a/metrics.py
+++ b/metrics.py
@@ -73,6 +73,43 @@ def recommendations(object):
"""
return len(object.recommendations.index)
+@doc('The total number of recommendations for registered users found in recommendations.csv')
+def recommendations_registered(object):
+ """
+ Calculate the total number of recommendations for registered users
+ found in Pandas DataFrame object recommendations
+ """
+ return len(object.recommendations[object.recommendations['User'] != -1].index)
+
+
+@doc('The total number of recommendations for anonymous users found in recommendations.csv')
+def recommendations_anonymous(object):
+ """
+ Calculate the total number of recommendations for anonymous users
+ found in Pandas DataFrame object recommendations
+ """
+ return recommendations(object)-recommendations_registered(object)
+
+
+@doc('The percentage (%) of recommendations for registered users to the total recommendations')
+def recommendations_registered_perc(object):
+ """
+ Calculate the percentage (%) of recommendations occurred
+ by registered users to the total recommendations
+ found in Pandas DataFrame object recommendations (in two decimals)
+ """
+ return round(recommendations_registered(object)*100.0/recommendations(object),2)
+
+
+@doc('The percentage (%) of recommendations for anonymous users to the total recommendations')
+def recommendations_anonymous_perc(object):
+ """
+ Calculate the percentage (%) of recommendations occurred
+ by anonymous users to the total recommendations
+ found in Pandas DataFrame object recommendations (in two decimals)
+ """
+ return round(100.0-recommendations_registered_perc(object),2)
+
@doc('The total number of user actions found in user_actions.csv')
def user_actions(object):
From 1e3af3dd65a1930c2b936a89832cb62e33ea1231 Mon Sep 17 00:00:00 2001
From: Nikolaos Triantafyllis
Date: Tue, 24 May 2022 00:13:06 +0300
Subject: [PATCH 23/89] Adding new metrics in Preprocessor:
recommendations_registered, recommendations_anonymous with their percentages
too
---
pre_metrics.py | 37 +++++++++++++++++++++++++++++++++++++
1 file changed, 37 insertions(+)
diff --git a/pre_metrics.py b/pre_metrics.py
index c24acf8..264185f 100644
--- a/pre_metrics.py
+++ b/pre_metrics.py
@@ -74,6 +74,43 @@ def recommendations(object):
"""
return object.recdb["recommendation"].count_documents(object.query)
+@doc('The total number of recommendations for registered users found in recommendations.csv')
+def recommendations_registered(object):
+ """
+ Calculate the total number of recommendations for registered users
+ found in source object recommendations
+ """
+ return object.recdb["recommendation"].count_documents({**object.query,**{"user":{"$exists":True}}})
+
+
+@doc('The total number of recommendations for anonymous users found in recommendations.csv')
+def recommendations_anonymous(object):
+ """
+ Calculate the total number of recommendations for anonymous users
+ found in source object recommendations
+ """
+ return recommendations(object)-recommendations_registered(object)
+
+
+@doc('The percentage (%) of recommendations for registered users to the total recommendations')
+def recommendations_registered_perc(object):
+ """
+ Calculate the percentage (%) of recommendations occurred
+ by registered users to the total recommendations
+ found in source object recommendations (in two decimals)
+ """
+ return round(recommendations_registered(object)*100.0/recommendations(object),2)
+
+
+@doc('The percentage (%) of recommendations for anonymous users to the total recommendations')
+def recommendations_anonymous_perc(object):
+ """
+ Calculate the percentage (%) of recommendations occurred
+ by anonymous users to the total recommendations
+ found in source object recommendations (in two decimals)
+ """
+ return round(100.0-recommendations_registered_perc(object),2)
+
@doc('The total number of user actions found in user actions of the source')
def user_actions(object):
From 36431b8002a9754fd9137adc02b2463e66771458 Mon Sep 17 00:00:00 2001
From: Nikolaos Triantafyllis
Date: Tue, 24 May 2022 00:25:03 +0300
Subject: [PATCH 24/89] Removing user and service catalog metrics from
Preprocessor (REC-45)
---
pre_metrics.py | 78 +++++++++++++++++++++++++-------------------------
1 file changed, 39 insertions(+), 39 deletions(-)
diff --git a/pre_metrics.py b/pre_metrics.py
index 264185f..f75ef35 100644
--- a/pre_metrics.py
+++ b/pre_metrics.py
@@ -225,44 +225,44 @@ def user_actions_panel_perc(object):
return round(user_actions_panel(object)*100.0/user_actions(object),2)
-@doc('The total number of unique services found in recommendations of the source')
-def catalog_coverage(object):
- """
- Calculate the total number of unique services
- found in recommendations.csv
- """
- return len(object.recdb["recommendation"].distinct("services", object.query))
-
-
-@doc('The percentage (%) of unique services found in recommedations of the source to the total number of services (provided or found otherwise in source)')
-def catalog_coverage_perc(object):
- """
- Calculate the percentage (%) of unique services
- found in source
- """
- return round(catalog_coverage(object)*100.0/services(object),2)
-
-
-@doc('The total number of unique users found in recommendations of the source')
-def user_coverage(object):
- """
- Calculate the total number of unique users
- found in source
- """
- # count also anonymous user as a user
- anonymous=object.recdb["recommendation"].find_one({'user':None})
- if anonymous:
- return len(object.recdb["recommendation"].distinct("user", object.query))+1
- return len(object.recdb["recommendation"].distinct("user", object.query))
-
-
-@doc('The percentage (%) of unique users found in recommedations of the source to the total number of users (provided or found otherwise in source)')
-def user_coverage_perc(object):
- """
- Calculate the percentage (%) of unique users
- found in source to the total number
- of users
- """
- return round(user_coverage(object)*100.0/users(object),2)
+#@doc('The total number of unique services found in recommendations of the source')
+#def catalog_coverage(object):
+# """
+# Calculate the total number of unique services
+# found in recommendations.csv
+# """
+# return len(object.recdb["recommendation"].distinct("services", object.query))
+
+
+#@doc('The percentage (%) of unique services found in recommedations of the source to the total number of services (provided or found otherwise in source)')
+#def catalog_coverage_perc(object):
+# """
+# Calculate the percentage (%) of unique services
+# found in source
+# """
+# return round(catalog_coverage(object)*100.0/services(object),2)
+
+
+#@doc('The total number of unique users found in recommendations of the source')
+#def user_coverage(object):
+# """
+# Calculate the total number of unique users
+# found in source
+# """
+# # count also anonymous user as a user
+# anonymous=object.recdb["recommendation"].find_one({'user':None})
+# if anonymous:
+# return len(object.recdb["recommendation"].distinct("user", object.query))+1
+# return len(object.recdb["recommendation"].distinct("user", object.query))
+
+
+#@doc('The percentage (%) of unique users found in recommedations of the source to the total number of users (provided or found otherwise in source)')
+#def user_coverage_perc(object):
+# """
+# Calculate the percentage (%) of unique users
+# found in source to the total number
+# of users
+# """
+# return round(user_coverage(object)*100.0/users(object),2)
From 1ede7f2a05f879488567b33644e33b49c6b55e42 Mon Sep 17 00:00:00 2001
From: Nikolaos Triantafyllis
Date: Thu, 12 May 2022 20:27:21 +0300
Subject: [PATCH 25/89] Pre-metrics have been encapsulated in functions and
appended with description (see REC-24)
Pre-metrics have been encapsulated in functions and appended with description (see REC-24) [Missing pre_metrics.py file added]
Bug-Fix in user_coverage calculation
services' selection in configuration is set to publish by default
---
pre_metrics.py | 227 ++++++++++++++++++++++++++++++++++++++++++++++++
preprocessor.py | 88 +++++++++++--------
2 files changed, 280 insertions(+), 35 deletions(-)
create mode 100644 pre_metrics.py
diff --git a/pre_metrics.py b/pre_metrics.py
new file mode 100644
index 0000000..943761b
--- /dev/null
+++ b/pre_metrics.py
@@ -0,0 +1,227 @@
+#!/usr/bin/env python3
+import pandas as pd
+import numpy as np
+
+
+class Runtime:
+ def __init__(self):
+ self.query={}
+ self.recdb=None
+ self.config=None
+
+# decorator to add the text attribute to function
+def doc(r):
+ def wrapper(f):
+ f.text = r
+ return f
+ return wrapper
+
+
+# Pre Metrics
+
+
+@doc('The initial date where metrics are calculated on')
+def start(object):
+ """
+ Calculate the start date where metrics are calculated on
+ found in min value between source object user_action
+ and recommendation
+ """
+ ua_start=object.recdb["user_action"].find_one(object.query,sort=[("timestamp", 1)])["timestamp"]
+ rec_start=object.recdb["recommendation"].find_one(object.query,sort=[("timestamp", 1)])["timestamp"]
+
+ return str(min(ua_start, rec_start))
+
+
+@doc('The final date where metrics are calculated on')
+def end(object):
+ """
+ Calculate the end date where metrics are calculated on
+ found in max value between source object user_action
+ and recommendation
+ """
+ ua_end=object.recdb["user_action"].find_one(object.query,sort=[("timestamp", -1)])["timestamp"]
+ rec_end=object.recdb["recommendation"].find_one(object.query,sort=[("timestamp", -1)])["timestamp"]
+ return str(max(ua_end, rec_end))
+
+
+@doc('The total number of unique users found in users of the source')
+def users(object):
+ """
+ Calculate the total number of unique users
+ found in source object
+ """
+ return object.recdb["user"].count_documents({})
+
+
+@doc('The total number of unique services found in services of the source')
+def services(object):
+ """
+ Calculate the total number of unique services
+ found in source object (default to published only)
+ """
+ if object.config['Service']['published']:
+ return object.recdb["service"].count_documents({"status":"published"})
+ else:
+ return object.recdb["service"].count_documents({})
+
+
+@doc('The total number of recommendations found in recommendations of the source')
+def recommendations(object):
+ """
+ Calculate the total number of recommendations
+ found in source object
+ """
+ return object.recdb["recommendation"].count_documents(object.query)
+
+
+@doc('The total number of user actions found in user actions of the source')
+def user_actions(object):
+ """
+ Calculate the total number of user_actions
+ found in source object
+ """
+ return object.recdb["user_action"].count_documents(object.query)
+
+
+@doc('The total number of user actions occurred by registered users found in user actions of the source')
+def user_actions_registered(object):
+ """
+ Calculate the total number of user_actions occurred by registered users
+ found in source object
+ """
+ return object.recdb["user_action"].count_documents({**object.query,**{"user":{"$exists":True}}})
+
+
+@doc('The total number of user actions occurred by anonymous users found in user actions of the source')
+def user_actions_anonymous(object):
+ """
+ Calculate the total number of user_actions occurred by anonymous users
+ found in source object
+ """
+ return user_actions(object)-user_actions_registered(object)
+
+
+@doc('The percentage (%) of user actions occurred by registered users to the total user actions')
+def user_actions_registered_perc(object):
+ """
+ Calculate the percentage (%) of user actions occurred
+ by registered users to the total user actions
+ found in source object user_actions (in two decimals)
+ """
+ return round(user_actions_registered(object)*100.0/user_actions(object),2)
+
+
+@doc('The percentage (%) of user actions occurred by anonymous users to the total user actions')
+def user_actions_anonymous_perc(object):
+ """
+ Calculate the percentage (%) of user actions occurred
+ by anonymous users to the total user actions
+ found in source object user_actions (in two decimals)
+ """
+ return round(100.0-user_actions_registered_perc(object),2)
+
+
+@doc('The total number of user actions led to order found in user actions of the source')
+def user_actions_order(object):
+ """
+ Calculate the total number of user_actions led to order
+ found in source object user_actions
+ """
+ return object.recdb["user_action"].count_documents({**object.query, **{"action.order":True}})
+
+
+@doc('The total number of user actions led to order by registered users found in user actions of the source')
+def user_actions_order_registered(object):
+ """
+ Calculate the total number of user_actions led to order by registered users
+ found in source object user_actions
+ """
+ return object.recdb["user_action"].count_documents({**object.query, **{"action.order":True,"user":{"$exists":True}}})
+
+
+@doc('The total number of user actions led to order by anonymous users found in user actions of the source')
+def user_actions_order_anonymous(object):
+ """
+ Calculate the total number of user_actions led to order by anonymous users
+ found in source object user_actions
+ """
+ return user_actions_order(object)-user_actions_order_registered(object)
+
+
+@doc('The percentage (%) of user actions occurred by registered users and led to order to the total user actions that led to order')
+def user_actions_order_registered_perc(object):
+ """
+ Calculate the percentage (%) of user actions occurred
+ by registered users and led to order to the total user actions that led to order
+ found in source object user_actions (in two decimals)
+ """
+ return round(user_actions_order_registered(object)*100.0/user_actions_order(object),2)
+
+
+@doc('The percentage (%) of user actions occurred by anonymous users and led to order to the total user actions that led to order')
+def user_actions_order_anonymous_perc(object):
+ """
+ Calculate the percentage (%) of user actions occurred
+ by anonymous users and led to order to the total user actions that led to order
+ found in source object user_actions (in two decimals)
+ """
+ return round(100.0-user_actions_order_registered_perc(object),2)
+
+
+@doc('The total number of user actions assosicated with the recommendation panel found in user actions of the source')
+def user_actions_panel(object):
+ """
+ Calculate the total number of user_actions assosicated with the recommendation panel
+ found in source object user_actions
+ """
+ return object.recdb["user_action"].count_documents({**object.query, **{"source.root.type":"recommendation_panel"}})
+
+
+@doc('The percentage (%) of user actions assosicated with the recommendation panel to the total user actions')
+def user_actions_panel_perc(object):
+ """
+ Calculate the percentage (%) of user actions assosicated with
+ the recommendation panel to the total user actions
+ found in source object user_actions (in two decimals)
+ """
+ return round(user_actions_panel(object)*100.0/user_actions(object),2)
+
+
+@doc('The total number of unique services found in recommendations of the source')
+def catalog_coverage(object):
+ """
+ Calculate the total number of unique services
+ found in recommendations.csv
+ """
+ return len(object.recdb["recommendation"].distinct("services", object.query))
+
+
+@doc('The percentage (%) of unique services found in recommedations of the source to the total number of services (provided or found otherwise in source)')
+def catalog_coverage_perc(object):
+ """
+ Calculate the percentage (%) of unique services
+ found in source
+ """
+ return round(catalog_coverage(object)*100.0/services(object),2)
+
+
+@doc('The total number of unique users found in recommendations of the source')
+def user_coverage(object):
+ """
+ Calculate the total number of unique users
+ found in source
+ """
+ return len(object.recdb["recommendation"].distinct("user", object.query))
+
+
+@doc('The percentage (%) of unique users found in recommedations of the source to the total number of users (provided or found otherwise in source)')
+def user_coverage_perc(object):
+ """
+ Calculate the percentage (%) of unique users
+ found in source to the total number
+ of users
+ """
+ return round(user_coverage(object)*100.0/users(object),2)
+
+
diff --git a/preprocessor.py b/preprocessor.py
index 0648e3d..a37c785 100755
--- a/preprocessor.py
+++ b/preprocessor.py
@@ -8,8 +8,11 @@
from natsort import natsorted
import natsort as ns
import pandas as pd
+from inspect import getmembers, isfunction
import retrieval
+# local lib
+import pre_metrics as pm
import reward_mapping as rm
from get_service_catalog import get_eosc_marketplace_url, get_service_catalog_items, get_service_catalog_page_content, save_service_items_to_csv
@@ -258,53 +261,68 @@ def __init__(self, source_page_id, target_page_id, order):
# calculate pre metrics
if config['Metrics']:
- m.timestamp=str(datetime.utcnow())
+ run=pm.Runtime()
+ run.recdb=recdb
+ run.query=query
+ run.config=config
- ua_start=recdb["user_action"].find_one(query,sort=[("timestamp", 1)])["timestamp"]
- ua_end=recdb["user_action"].find_one(query,sort=[("timestamp", -1)])["timestamp"]
- rec_start=recdb["recommendation"].find_one(query,sort=[("timestamp", 1)])["timestamp"]
- rec_end=recdb["recommendation"].find_one(query,sort=[("timestamp", -1)])["timestamp"]
+ md={'timestamp':str(datetime.utcnow())}
- m.start=str(min(ua_start, rec_start))
- m.end=str(max(ua_end, rec_end))
+ # get all functions found in pre_metrics module
+ # apart from 'doc' func
+ # run and save the result in dictionary
+ # where key is the name of the function
+ # and value what it returns
+ # whereas, for each found functions
+ # an extra key_doc element in dictionary is set
+ # to save the text of the function
+ funcs = list(map(lambda x: x[0], getmembers(pm, isfunction)))
+ funcs = list(filter(lambda x: not x=='doc',funcs))
+ for func in funcs:
+ md[func+'_doc']=getattr(pm, func).text
+ md[func]=getattr(pm, func)(run)
- m.users=recdb["user"].count_documents({})
- m.recommendations=recdb["recommendation"].count_documents(query)
+ jsonstr = json.dumps(md)
- if config['Service']['published']:
- m.services=recdb["service"].count_documents({"status":"published"})
- else:
- m.services=recdb["service"].count_documents({})
+ print(jsonstr)
+
+ # Using a JSON string
+ with open(os.path.join(args.output,'pre_metrics.json'), 'w') as outfile:
+ outfile.write(jsonstr)
+
+ #m.timestamp=str(datetime.utcnow())
+
+ #m.users=recdb["user"].count_documents({})
- m.user_actions=recdb["user_action"].count_documents(query)
+ #m.recommendations=recdb["recommendation"].count_documents(query)
- m.user_actions_registered=recdb["user_action"].count_documents({**query,**{"user":{"$exists":True}}})
- m.user_actions_anonymous=m.user_actions-m.user_actions_registered
- m.user_actions_registered_perc=round(m.user_actions_registered*100.0/m.user_actions,2)
- m.user_actions_anonymous_perc=100-m.user_actions_registered_perc
+ #if config['Service']['published']:
+ # m.services=recdb["service"].count_documents({"status":"published"})
+ #else:
+ # m.services=recdb["service"].count_documents({})
- m.user_actions_order=recdb["user_action"].count_documents({**query, **{"action.order":True}})
- m.user_actions_order_registered=recdb["user_action"].count_documents({**query, **{"action.order":True,"user":{"$exists":True}}})
- m.user_actions_order_anonymous=m.user_actions_order-m.user_actions_order_registered
- m.user_actions_order_registered_perc=round(m.user_actions_order_registered*100.0/m.user_actions_order,2)
- m.user_actions_order_anonymous_perc=100-m.user_actions_order_registered_perc
+ #m.user_actions=recdb["user_action"].count_documents(query)
- m.user_actions_panel=recdb["user_action"].count_documents({**query, **{"source.root.type":"recommendation_panel"}})
- m.user_actions_panel_perc=round(m.user_actions_panel*100.0/m.user_actions,2)
+ #m.user_actions_registered=recdb["user_action"].count_documents({**query,**{"user":{"$exists":True}}})
+ #m.user_actions_anonymous=m.user_actions-m.user_actions_registered
+ #m.user_actions_registered_perc=round(m.user_actions_registered*100.0/m.user_actions,2)
+ #m.user_actions_anonymous_perc=100-m.user_actions_registered_perc
- m.service_catalog=len(recdb["recommendation"].distinct("services", query))
+ #m.user_actions_order=recdb["user_action"].count_documents({**query, **{"action.order":True}})
+ #m.user_actions_order_registered=recdb["user_action"].count_documents({**query, **{"action.order":True,"user":{"$exists":True}}})
+ #m.user_actions_order_anonymous=m.user_actions_order-m.user_actions_order_registered
+ #m.user_actions_order_registered_perc=round(m.user_actions_order_registered*100.0/m.user_actions_order,2)
+ #m.user_actions_order_anonymous_perc=100-m.user_actions_order_registered_perc
+
+ #m.user_actions_panel=recdb["user_action"].count_documents({**query, **{"source.root.type":"recommendation_panel"}})
+ #m.user_actions_panel_perc=round(m.user_actions_panel*100.0/m.user_actions,2)
# catalog coverage
- m.service_catalog_perc=round(m.service_catalog*100.0/m.services,2)
+ #m.service_catalog=len(recdb["recommendation"].distinct("services", query))
+ #m.service_catalog_perc=round(m.service_catalog*100.0/m.services,2)
# user coverage
- m.user_catalog=len(recdb["user_action"].distinct("user", query))
- m.user_catalog_perc=round(m.user_catalog*100.0/m.users,2)
-
- jsonstr = json.dumps(m.__dict__)
- print(jsonstr)
+ #m.user_catalog=len(recdb["user_action"].distinct("user", query))
+ #m.user_catalog_perc=round(m.user_catalog*100.0/m.users,2)
- # Using a JSON string
- with open(os.path.join(args.output,'pre_metrics.json'), 'w') as outfile:
- outfile.write(jsonstr)
From f18b94b8f68baecfcdab60c53dd446cd17dbd2f9 Mon Sep 17 00:00:00 2001
From: Nikolaos Triantafyllis
Date: Fri, 13 May 2022 17:31:47 +0300
Subject: [PATCH 26/89] Resolving ticket REC-36 for Consistency between
users/services in user_actions/recommendations
RSmetrics typical usage added
RSmetrics typical usage added (fix)
RSmetrics typical usage added (fix config)
Resolving ticket REC-36 for consistency between users/services in user_actions/recommendations. If any recommendation has no user, it is counted as belonging to a single anonymous user; thus +1 is added to user_coverage in that case.
---
README.md | 36 ++++++++++++++++++++++++++++++------
pre_metrics.py | 4 ++++
rsmetrics.py | 13 ++++++++++++-
3 files changed, 46 insertions(+), 7 deletions(-)
diff --git a/README.md b/README.md
index 28edea0..bbdde53 100644
--- a/README.md
+++ b/README.md
@@ -61,14 +61,38 @@ optional arguments:
```
8. Configure `./preprocessor.py` by editting the `config.yaml` or providing another with `-c`:
-
-
-
-
-
+```yaml
+
+# Set the desired connector (e.g. MongoDB)
+Source:
+ MongoDB:
+ host: localhost
+ port: 27017
+ db: recommender_dev
+
+User:
+ export: true
+Service:
+ # if true it keeps only published, otherwise all
+ # this has an effect in exporting when from is set to 'source'
+ # and also in metrics calculations where service is considered
+ published: true
+
+ # Use the EOSC-Marketplace webpage
+ # to associate page_id and service_id
+ download: true
+ path: ./page_map
+
+ export: true
+ from: 'page_map' # or 'source'
+
+# Calculate source's metrics
+Metrics: true
+
+```
-9. Run from terminal: `./rsmetrics.py` to run RSmetrics
+9. Run from terminal: `./rsmetrics.py --users --services` to run RSmetrics and include the `users.csv` and `services.csv` files generated by the Preprocessor
```bash
_____ _____ _ _
| __ \ / ____| | | (_)
diff --git a/pre_metrics.py b/pre_metrics.py
index 943761b..c24acf8 100644
--- a/pre_metrics.py
+++ b/pre_metrics.py
@@ -212,6 +212,10 @@ def user_coverage(object):
Calculate the total number of unique users
found in source
"""
+ # count also anonymous user as a user
+ anonymous=object.recdb["recommendation"].find_one({'user':None})
+ if anonymous:
+ return len(object.recdb["recommendation"].distinct("user", object.query))+1
return len(object.recdb["recommendation"].distinct("user", object.query))
diff --git a/rsmetrics.py b/rsmetrics.py
index d5a3fbb..1c3b658 100755
--- a/rsmetrics.py
+++ b/rsmetrics.py
@@ -80,7 +80,7 @@ def inner():
run.user_actions['Timestamp']= pd.to_datetime(run.user_actions['Timestamp'])
run.recommendations['Timestamp']= pd.to_datetime(run.recommendations['Timestamp'])
-# restrict data to datetime range
+# restrict user actions and recommendations data to datetime range
if args.starttime:
run.user_actions=run.user_actions[(run.user_actions['Timestamp'] > args.starttime) & (run.user_actions['Timestamp'] < args.endtime)]
run.recommendations=run.recommendations[(run.recommendations['Timestamp'] > args.starttime) & (run.recommendations['Timestamp'] < args.endtime)]
@@ -98,6 +98,17 @@ def inner():
if args.services:
run.services=pd.read_csv(os.path.join(args.input,'services.csv'),names=["Service"])
+# remove user actions when user or service does not exist in users' or services' catalogs
+# adding -1 in all catalogs indicating the anonynoums users or not-known services
+run.user_actions = run.user_actions[run.user_actions['User'].isin(run.users['User'].tolist()+[-1])]
+run.user_actions = run.user_actions[run.user_actions['Source_Service'].isin(run.services['Service'].tolist()+[-1])]
+run.user_actions = run.user_actions[run.user_actions['Target_Service'].isin(run.services['Service'].tolist()+[-1])]
+
+# remove recommendations when user or service does not exist in users' or services' catalogs
+# adding -1 in all catalogs indicating the anonynoums users or not-known services
+run.recommendations = run.recommendations[run.recommendations['User'].isin(run.users['User'].tolist()+[-1])]
+run.recommendations = run.recommendations[run.recommendations['Service'].isin(run.services['Service'].tolist()+[-1])]
+
md={'timestamp':str(datetime.utcnow())}
From 0781bd557eac05ea2a07b95f5fd9c884ee99dbe3 Mon Sep 17 00:00:00 2001
From: Nikolaos Triantafyllis
Date: Tue, 24 May 2022 00:03:29 +0300
Subject: [PATCH 27/89] Adding new metrics in RSmetrics:
recommendations_registered, recommendations_anonymous with their percentages
too
Adding new metrics in Preprocessor: recommendations_registered, recommendations_anonymous with their percentages too
---
metrics.py | 37 +++++++++++++++++++++++++++++++++++++
pre_metrics.py | 37 +++++++++++++++++++++++++++++++++++++
2 files changed, 74 insertions(+)
diff --git a/metrics.py b/metrics.py
index bbee950..73a482f 100644
--- a/metrics.py
+++ b/metrics.py
@@ -73,6 +73,43 @@ def recommendations(object):
"""
return len(object.recommendations.index)
+@doc('The total number of recommendations for registered users found in recommendations.csv')
+def recommendations_registered(object):
+ """
+ Calculate the total number of recommendations for registered users
+ found in Pandas DataFrame object recommendations
+ """
+ return len(object.recommendations[object.recommendations['User'] != -1].index)
+
+
+@doc('The total number of recommendations for anonymous users found in recommendations.csv')
+def recommendations_anonymous(object):
+ """
+ Calculate the total number of recommendations for anonymous users
+ found in Pandas DataFrame object recommendations
+ """
+ return recommendations(object)-recommendations_registered(object)
+
+
+@doc('The percentage (%) of recommendations for registered users to the total recommendations')
+def recommendations_registered_perc(object):
+ """
+ Calculate the percentage (%) of recommendations occurred
+ by registered users to the total recommendations
+ found in Pandas DataFrame object recommendations (in two decimals)
+ """
+ return round(recommendations_registered(object)*100.0/recommendations(object),2)
+
+
+@doc('The percentage (%) of recommendations for anonymous users to the total recommendations')
+def recommendations_anonymous_perc(object):
+ """
+ Calculate the percentage (%) of recommendations occurred
+ by anonymous users to the total recommendations
+ found in Pandas DataFrame object recommendations (in two decimals)
+ """
+ return round(100.0-recommendations_registered_perc(object),2)
+
@doc('The total number of user actions found in user_actions.csv')
def user_actions(object):
diff --git a/pre_metrics.py b/pre_metrics.py
index c24acf8..264185f 100644
--- a/pre_metrics.py
+++ b/pre_metrics.py
@@ -74,6 +74,43 @@ def recommendations(object):
"""
return object.recdb["recommendation"].count_documents(object.query)
+@doc('The total number of recommendations for registered users found in recommendations.csv')
+def recommendations_registered(object):
+ """
+ Calculate the total number of recommendations for registered users
+ found in Pandas DataFrame object recommendations
+ """
+ return object.recdb["recommendation"].count_documents({**object.query,**{"user":{"$exists":True}}})
+
+
+@doc('The total number of recommendations for anonymous users found in recommendations.csv')
+def recommendations_anonymous(object):
+ """
+ Calculate the total number of recommendations for anonymous users
+ found in Pandas DataFrame object recommendations
+ """
+ return recommendations(object)-recommendations_registered(object)
+
+
+@doc('The percentage (%) of recommendations for registered users to the total recommendations')
+def recommendations_registered_perc(object):
+ """
+ Calculate the percentage (%) of recommendations occurred
+ by registered users to the total recommendations
+ found in Pandas DataFrame object recommendations (in two decimals)
+ """
+ return round(recommendations_registered(object)*100.0/recommendations(object),2)
+
+
+@doc('The percentage (%) of recommendations for anonymous users to the total recommendations')
+def recommendations_anonymous_perc(object):
+ """
+ Calculate the percentage (%) of recommendations occurred
+ by anonymous users to the total recommendations
+ found in Pandas DataFrame object recommendations (in two decimals)
+ """
+ return round(100.0-recommendations_registered_perc(object),2)
+
@doc('The total number of user actions found in user actions of the source')
def user_actions(object):
From 1281de866b9fd7002c81e85fffbb94b69bcb8a4b Mon Sep 17 00:00:00 2001
From: Nikolaos Triantafyllis
Date: Wed, 25 May 2022 13:49:38 +0300
Subject: [PATCH 28/89] Remove comments
---
pre_metrics.py | 42 ------------------------------------------
1 file changed, 42 deletions(-)
diff --git a/pre_metrics.py b/pre_metrics.py
index f75ef35..5727282 100644
--- a/pre_metrics.py
+++ b/pre_metrics.py
@@ -224,45 +224,3 @@ def user_actions_panel_perc(object):
"""
return round(user_actions_panel(object)*100.0/user_actions(object),2)
-
-#@doc('The total number of unique services found in recommendations of the source')
-#def catalog_coverage(object):
-# """
-# Calculate the total number of unique services
-# found in recommendations.csv
-# """
-# return len(object.recdb["recommendation"].distinct("services", object.query))
-
-
-#@doc('The percentage (%) of unique services found in recommedations of the source to the total number of services (provided or found otherwise in source)')
-#def catalog_coverage_perc(object):
-# """
-# Calculate the percentage (%) of unique services
-# found in source
-# """
-# return round(catalog_coverage(object)*100.0/services(object),2)
-
-
-#@doc('The total number of unique users found in recommendations of the source')
-#def user_coverage(object):
-# """
-# Calculate the total number of unique users
-# found in source
-# """
-# # count also anonymous user as a user
-# anonymous=object.recdb["recommendation"].find_one({'user':None})
-# if anonymous:
-# return len(object.recdb["recommendation"].distinct("user", object.query))+1
-# return len(object.recdb["recommendation"].distinct("user", object.query))
-
-
-#@doc('The percentage (%) of unique users found in recommedations of the source to the total number of users (provided or found otherwise in source)')
-#def user_coverage_perc(object):
-# """
-# Calculate the percentage (%) of unique users
-# found in source to the total number
-# of users
-# """
-# return round(user_coverage(object)*100.0/users(object),2)
-
-
From b605bb243ea280bd368f4d17a0667e50724c77cc Mon Sep 17 00:00:00 2001
From: Nikolaos Triantafyllis
Date: Wed, 25 May 2022 13:50:49 +0300
Subject: [PATCH 29/89] Remove comments
---
preprocessor.py | 36 ------------------------------------
1 file changed, 36 deletions(-)
diff --git a/preprocessor.py b/preprocessor.py
index a37c785..47543b5 100755
--- a/preprocessor.py
+++ b/preprocessor.py
@@ -290,39 +290,3 @@ def __init__(self, source_page_id, target_page_id, order):
# Using a JSON string
with open(os.path.join(args.output,'pre_metrics.json'), 'w') as outfile:
outfile.write(jsonstr)
-
- #m.timestamp=str(datetime.utcnow())
-
- #m.users=recdb["user"].count_documents({})
-
- #m.recommendations=recdb["recommendation"].count_documents(query)
-
- #if config['Service']['published']:
- # m.services=recdb["service"].count_documents({"status":"published"})
- #else:
- # m.services=recdb["service"].count_documents({})
-
- #m.user_actions=recdb["user_action"].count_documents(query)
-
- #m.user_actions_registered=recdb["user_action"].count_documents({**query,**{"user":{"$exists":True}}})
- #m.user_actions_anonymous=m.user_actions-m.user_actions_registered
- #m.user_actions_registered_perc=round(m.user_actions_registered*100.0/m.user_actions,2)
- #m.user_actions_anonymous_perc=100-m.user_actions_registered_perc
-
- #m.user_actions_order=recdb["user_action"].count_documents({**query, **{"action.order":True}})
- #m.user_actions_order_registered=recdb["user_action"].count_documents({**query, **{"action.order":True,"user":{"$exists":True}}})
- #m.user_actions_order_anonymous=m.user_actions_order-m.user_actions_order_registered
- #m.user_actions_order_registered_perc=round(m.user_actions_order_registered*100.0/m.user_actions_order,2)
- #m.user_actions_order_anonymous_perc=100-m.user_actions_order_registered_perc
-
- #m.user_actions_panel=recdb["user_action"].count_documents({**query, **{"source.root.type":"recommendation_panel"}})
- #m.user_actions_panel_perc=round(m.user_actions_panel*100.0/m.user_actions,2)
-
- # catalog coverage
- #m.service_catalog=len(recdb["recommendation"].distinct("services", query))
- #m.service_catalog_perc=round(m.service_catalog*100.0/m.services,2)
-
- # user coverage
- #m.user_catalog=len(recdb["user_action"].distinct("user", query))
- #m.user_catalog_perc=round(m.user_catalog*100.0/m.users,2)
-
From 40b7948604947ec83a5da2554777271decdac5ad Mon Sep 17 00:00:00 2001
From: Konstantinos Kagkelidis
Date: Wed, 18 May 2022 04:27:29 +0300
Subject: [PATCH 30/89] REC-46 Calculate hit-rate
---
metrics.py | 44 +++++++++++++++++++++++++++++++++++++--
preprocessor.py | 13 +++++++++---
report.html.prototype | 30 ++++++++++++++++++++++++---
rsmetrics.py | 48 +++++++------------------------------------
4 files changed, 86 insertions(+), 49 deletions(-)
diff --git a/metrics.py b/metrics.py
index 73a482f..c43c5c9 100644
--- a/metrics.py
+++ b/metrics.py
@@ -47,7 +47,7 @@ def users(object):
or user_actions otherwise
"""
if isinstance(object.users, pd.DataFrame):
- return int(object.users.nunique()['User'])
+ return int(object.users['User'].nunique())
else:
return int(object.user_actions.nunique()['User'])
@@ -261,4 +261,44 @@ def user_coverage_perc(object):
"""
return round(user_coverage(object)*100.0/users(object),2)
-
+@doc('The ratio of user hits divided by the total number of users (user hit: a user that has accessed at least one service that is also a personal recommendation)')
+def hit_rate(object):
+ """
+ For each user get the recommended services and the services the user accessed
+ Check if the user has at least one accessed service in recommendations. If yes increase number of hits by one
+ Divide by the total number of users
+ """
+ users = object.users.values.tolist()
+ recs = object.recommendations.values.tolist()
+ # Fill lookup dictionary with all services recommender per user id
+ user_recs = dict()
+ for item in recs:
+ # skip anonymous users
+ if item == -1:
+ continue
+ user_id = item[0]
+ service_id = item[1]
+ if user_id in user_recs.keys():
+ user_recs[user_id].append(service_id)
+ else:
+ user_recs[user_id] = [service_id]
+
+ hits = 0
+ # For each user in users check if his accessed services are in his recommendations
+
+ for user in users:
+ user_id = user[0]
+ # create a set of unique accessed services by user
+ services = set(user[1])
+ if user_id in user_recs.keys():
+ # create a set of unique recommended services to the user
+ recommendations = set(user_recs.get(user_id))
+ # intersection should include services that have been both accessed by and recommended to the user
+ intersection = services.intersection(recommendations)
+ # If the user has at least one service (both recommended and accessed), this user is considered a hit
+ if len(intersection) > 0:
+ hits = hits + 1
+
+
+
+ return round(hits/len(users),5)
diff --git a/preprocessor.py b/preprocessor.py
index 47543b5..58400e1 100755
--- a/preprocessor.py
+++ b/preprocessor.py
@@ -10,6 +10,7 @@
import pandas as pd
from inspect import getmembers, isfunction
import retrieval
+import csv
# local lib
import pre_metrics as pm
@@ -237,10 +238,16 @@ def __init__(self, source_page_id, target_page_id, order):
# export user catalog
if config['User']['export']:
- us=natsorted(list(set(list(map(lambda x: str(x['_id'])+'\n',recdb["user"].find({}))))),alg=ns.ns.SIGNED)
+ # produce users csv with each user id along with the user's accessed services
+ # query users from database for fields _id and accessed_services then create a list of rows
+ # each rows contains two elements, first: user_id in string format and second: a space separated sorted list of accessed services
+ users = recdb['user'].find({},{'accessed_services':1})
+ users = list(map(lambda x: [str(x['_id']), " ".join([str(service_id) for service_id in sorted(set(x['accessed_services']))])], users))
- with open(os.path.join(args.output,'users.csv'), 'w') as o:
- o.writelines(us)
+ # save the users list of rows to a csv file
+ with open(os.path.join(args.output,'users.csv'), 'w') as f:
+ writer = csv.writer(f)
+ writer.writerows(users)
# export service catalog
if config['Service']['export']:
diff --git a/report.html.prototype b/report.html.prototype
index bc9e230..a001951 100644
--- a/report.html.prototype
+++ b/report.html.prototype
@@ -33,6 +33,17 @@
color: #FAC0E7;
}
+.card-hit-rate {
+ background-color: #6cae80;
+ color: #d9fac0;
+}
+
+.card-footer{
+ background-color: rgb(247,247,247,0.8);
+ color: black;
+ font-style: italic;
+}
+
span {
position: relative;
}
@@ -139,7 +150,7 @@ span:hover:before {
User Coverage: %
-
+
@@ -147,7 +158,15 @@ span:hover:before {
Catalog Coverage: %
-
+
+
+
+
@@ -171,7 +190,7 @@ span:hover:before {
function fill(data){
// what to fill
- let fill_list = ['users', 'recommendations', 'services', 'user_actions','user_actions_order', 'user_actions_registered', 'user_actions_anonymous', 'user_actions_order_registered', 'user_actions_order_anonymous', 'user_actions_registered_perc', 'user_actions_anonymous_perc', 'user_actions_order_registered_perc', 'user_actions_order_anonymous_perc', 'catalog_coverage_perc', 'user_coverage_perc', 'timestamp'];
+ let fill_list = ['users', 'recommendations', 'services', 'user_actions','user_actions_order', 'user_actions_registered', 'user_actions_anonymous', 'user_actions_order_registered', 'user_actions_order_anonymous', 'user_actions_registered_perc', 'user_actions_anonymous_perc', 'user_actions_order_registered_perc', 'user_actions_order_anonymous_perc', 'catalog_coverage_perc', 'user_coverage_perc', 'timestamp', 'hit_rate'];
for (item of fill_list) {
key = 'val_' + item
@@ -179,6 +198,11 @@ function fill(data){
value_doc=data[item+'_doc']
document.getElementById(key).textContent = value
document.getElementById(key).setAttribute('gloss', value_doc);
+ // Add ui footers with documentation string for some prominent metrics
+ let doc_el = document.getElementById('doc_'+item);
+ if (doc_el) {
+ doc_el.textContent=value_doc;
+ }
}
}
diff --git a/rsmetrics.py b/rsmetrics.py
index 1c3b658..56041f2 100755
--- a/rsmetrics.py
+++ b/rsmetrics.py
@@ -46,8 +46,8 @@ def inner():
optional.add_argument('-s', '--starttime', metavar=('DATETIME'), help='calculate metrics starting from given datetime in ISO format (UTC) e.g. YYYY-MM-DD', nargs='?', default=None)
optional.add_argument('-e', '--endtime', metavar=('DATETIME'), help='calculate metrics ending to given datetime in ISO format (UTC) e.g. YYYY-MM-DD', nargs='?', default=None)
-optional.add_argument('--users', help='enable reading total users from users.csv, otherwise it will be calculated according to the user actions', action='store_true', default=False)
-optional.add_argument('--services', help='enable reading total services from services.csv, otherwise it will be calculated according to the user actions', action='store_true', default=False)
+
+optional.add_argument('--services', help='enable reading total services from services.csv, otherwise it will be calculated according to the user actions', action='store_true', default=True)
optional.add_argument('-h', '--help', action='help', help='show this help message and exit')
optional.add_argument('-v', '--version', action='version', version='%(prog)s v'+__version__)
@@ -89,12 +89,11 @@ def inner():
run.user_actions=run.user_actions[run.user_actions['Timestamp'] < args.endtime]
run.recommendations=run.recommendations[run.recommendations['Timestamp'] < args.endtime]
-# populate users and services
-# if no users or services provided use
-# respective columns found in user_actions instead
-if args.users:
- run.users=pd.read_csv(os.path.join(args.input,'users.csv'),names=["User"])
+# users are populated with two columns: one includes user id and the other includes a list of accessed services
+run.users=pd.read_csv(os.path.join(args.input,'users.csv'),names=["User","Services"],converters={'Services': lambda x: map(int,x.split())})
+
+# populate services
if args.services:
run.services=pd.read_csv(os.path.join(args.input,'services.csv'),names=["Service"])
@@ -126,42 +125,9 @@ def inner():
md[func+'_doc']=getattr(m, func).text
md[func]=getattr(m, func)(run)
-# get uniq values per column of user actions
-#uniq_uas=uas.nunique()
-#uniq_recs=recs.nunique()
-
-#m.users=int(uniq_uas['User']) if not args.users else int(us['User'].nunique())
-#m.services=int(uniq_uas['Service']) if not args.services else int(ss['Service'])
-
-#m.recommendations=len(recs.index)
-#m.user_actions=len(uas.index)
-
-#m.user_actions_registered=len(uas[uas['User'] != -1].index)
-#m.user_actions_anonymous=m.user_actions-m.user_actions_registered
-
-#m.user_actions_registered_perc=round(m.user_actions_registered*100.0/m.user_actions,2)
-#m.user_actions_anonymous_perc=100-m.user_actions_registered_perc
-
-#m.user_actions_order=len(uas[uas['Reward'] == 1.0].index)
-
-#m.user_actions_order_registered=len(uas[(uas['Reward'] == 1.0) & (uas['User'] != -1)].index)
-#m.user_actions_order_anonymous=m.user_actions_order-m.user_actions_order_registered
-#m.user_actions_order_registered_perc=round(m.user_actions_order_registered*100.0/m.user_actions_order,2)
-#m.user_actions_order_anonymous_perc=100-m.user_actions_order_registered_perc
-
-#m.user_actions_panel=len(uas[uas['Action'] == 'recommendation_panel'].index)
-#m.user_actions_panel_perc=round(m.user_actions_panel*100.0/m.user_actions,2)
-
-#m.services_suggested=int(uniq_recs['Service'])
-
-# catalog coverage
-#m.services_suggested_perc=round(m.services_suggested*100.0/m.services,2)
-# user coverage
-#m.users_suggested=int(uniq_recs['User'])
-#m.users_suggested_perc=round(m.users_suggested*100.0/m.users,2)
-jsonstr = json.dumps(md)
+jsonstr = json.dumps(md,indent=4)
#jsonstr = json.dumps(m.__dict__)
print(jsonstr)
From 34bce83c78dddda7043adba07d41ada9458c6537 Mon Sep 17 00:00:00 2001
From: Konstantinos Kagkelidis
Date: Fri, 27 May 2022 19:49:45 +0300
Subject: [PATCH 31/89] REC-50 Add Click-Through Rate Metric
---
metrics.py | 29 +++++++++++++++++++++++++++++
report.html.prototype | 19 ++++++++++++++++++-
rsmetrics.py | 13 ++++++++-----
3 files changed, 55 insertions(+), 6 deletions(-)
diff --git a/metrics.py b/metrics.py
index c43c5c9..b839fb1 100644
--- a/metrics.py
+++ b/metrics.py
@@ -8,6 +8,7 @@ def __init__(self):
self.users=None
self.services=None
self.user_actions=None
+ self.user_actions_all=None
self.recommendations=None
# decorator to add the text attribute to function
@@ -302,3 +303,31 @@ def hit_rate(object):
return round(hits/len(users),5)
+
+
+@doc('The number of user clicks through recommendations panels divided by the total times recommendation panels were presented to users. Takes into account all historical data of user actions')
+def click_through_rate(object):
+ """
+ Get only the user actions that present a recommendation panel to the user in the source page
+ Those are actions with the following source paths:
+ - /services
+ - /services/
+ - /services/c/{any category name}
+ Count the items in above list as they represent the times recommendations panels were presented to the users of the portal
+ Narrow the above list into a new subset by selecting only user actions that originate from a recommendation panel
+ Those are actions that have the 'recommendation' string in the Action column
+ Count the items in the subset as they represent the times users clicked through recommendations
+ Divide the items of the subset with the items of the first list to get the click-through rate
+ """
+
+ # get user actions
+ user_actions_all = object.user_actions_all.values.tolist()
+
+ # filter only user actions with the needed source paths (/services, /services/, /services/c/...).
+ # source paths are on the [6] index of each list item
+ user_actions_recpanel_views = list(filter(lambda x: x[6] in ['/services', '/services/'] or x[6].startswith('/services/c/'),user_actions_all))
+
+ # further filter with those actions that they have 'recommender'
+ user_actions_recpanel_clicks = list(filter(lambda x: x[4]=='recommendation_panel',user_actions_recpanel_views))
+
+ return round(len(user_actions_recpanel_clicks)/len(user_actions_recpanel_views),2)
\ No newline at end of file
diff --git a/report.html.prototype b/report.html.prototype
index a001951..90c8b12 100644
--- a/report.html.prototype
+++ b/report.html.prototype
@@ -12,6 +12,7 @@
.card {
animation-name: fadein;
animation-duration: 1s;
+ margin-bottom: 25px;
}
@keyframes fadein {
@@ -38,6 +39,11 @@
color: #d9fac0;
}
+.card-ctr {
+ background-color: #a55f80;
+ color: #f9bdd6;
+}
+
.card-footer{
background-color: rgb(247,247,247,0.8);
color: black;
@@ -169,6 +175,14 @@ span:hover:before {
+
+
+
+
Click-through Rate (CTR): %
+
+
+
+
@@ -190,7 +204,10 @@ span:hover:before {
function fill(data){
// what to fill
- let fill_list = ['users', 'recommendations', 'services', 'user_actions','user_actions_order', 'user_actions_registered', 'user_actions_anonymous', 'user_actions_order_registered', 'user_actions_order_anonymous', 'user_actions_registered_perc', 'user_actions_anonymous_perc', 'user_actions_order_registered_perc', 'user_actions_order_anonymous_perc', 'catalog_coverage_perc', 'user_coverage_perc', 'timestamp', 'hit_rate'];
+ let fill_list = ['users', 'recommendations', 'services', 'user_actions','user_actions_order', 'user_actions_registered',
+ 'user_actions_anonymous', 'user_actions_order_registered', 'user_actions_order_anonymous', 'user_actions_registered_perc',
+ 'user_actions_anonymous_perc', 'user_actions_order_registered_perc', 'user_actions_order_anonymous_perc',
+ 'catalog_coverage_perc', 'user_coverage_perc', 'timestamp', 'hit_rate', 'click_through_rate'];
for (item of fill_list) {
key = 'val_' + item
diff --git a/rsmetrics.py b/rsmetrics.py
index 56041f2..73a00a3 100755
--- a/rsmetrics.py
+++ b/rsmetrics.py
@@ -73,20 +73,22 @@ def inner():
sys.exit(0)
# read data
-run.user_actions=pd.read_csv(os.path.join(args.input,'user_actions.csv'),names=["User", "Source_Service", "Target_Service", "Reward", "Action", "Timestamp", "Source_Page_ID", "Target_Page_ID"])
+run.user_actions_all=pd.read_csv(os.path.join(args.input,'user_actions.csv'),names=["User", "Source_Service", "Target_Service", "Reward", "Action", "Timestamp", "Source_Page_ID", "Target_Page_ID"])
run.recommendations=pd.read_csv(os.path.join(args.input,'recommendations.csv'),names=["User", "Service", "Rating", "Timestamp"])
# convert timestamp column to datetime object
-run.user_actions['Timestamp']= pd.to_datetime(run.user_actions['Timestamp'])
+run.user_actions_all['Timestamp']= pd.to_datetime(run.user_actions_all['Timestamp'])
+
+
run.recommendations['Timestamp']= pd.to_datetime(run.recommendations['Timestamp'])
# restrict user actions and recommendations data to datetime range
if args.starttime:
- run.user_actions=run.user_actions[(run.user_actions['Timestamp'] > args.starttime) & (run.user_actions['Timestamp'] < args.endtime)]
+ run.user_actions_all=run.user_actions_all[(run.user_actions_all['Timestamp'] > args.starttime) & (run.user_actions_all['Timestamp'] < args.endtime)]
run.recommendations=run.recommendations[(run.recommendations['Timestamp'] > args.starttime) & (run.recommendations['Timestamp'] < args.endtime)]
else:
- run.user_actions=run.user_actions[run.user_actions['Timestamp'] < args.endtime]
+ run.user_actions_all=run.user_actions_all[run.user_actions_all['Timestamp'] < args.endtime]
run.recommendations=run.recommendations[run.recommendations['Timestamp'] < args.endtime]
@@ -97,9 +99,10 @@ def inner():
if args.services:
run.services=pd.read_csv(os.path.join(args.input,'services.csv'),names=["Service"])
+
# remove user actions when user or service does not exist in users' or services' catalogs
# adding -1 in all catalogs indicating the anonynoums users or not-known services
-run.user_actions = run.user_actions[run.user_actions['User'].isin(run.users['User'].tolist()+[-1])]
+run.user_actions = run.user_actions_all[run.user_actions_all['User'].isin(run.users['User'].tolist()+[-1])]
run.user_actions = run.user_actions[run.user_actions['Source_Service'].isin(run.services['Service'].tolist()+[-1])]
run.user_actions = run.user_actions[run.user_actions['Target_Service'].isin(run.services['Service'].tolist()+[-1])]
From d09aff5d65f018c4ee575a3566ec51f9c8655f4c Mon Sep 17 00:00:00 2001
From: Nikolaos Triantafyllis
Date: Tue, 31 May 2022 16:37:40 +0300
Subject: [PATCH 32/89] First version of Shannon Entropy metric computation.
More info at REC-49
---
metrics.py | 71 +++++++++++++++++++++++++++++++++++++++++++++++++++++-
1 file changed, 70 insertions(+), 1 deletion(-)
diff --git a/metrics.py b/metrics.py
index b839fb1..92a794b 100644
--- a/metrics.py
+++ b/metrics.py
@@ -330,4 +330,73 @@ def click_through_rate(object):
# further filter with those actions that they have 'recommender'
user_actions_recpanel_clicks = list(filter(lambda x: x[4]=='recommendation_panel',user_actions_recpanel_views))
- return round(len(user_actions_recpanel_clicks)/len(user_actions_recpanel_views),2)
\ No newline at end of file
+ return round(len(user_actions_recpanel_clicks)/len(user_actions_recpanel_views),2)
+
+@doc('The diversity of the recommendations according to Shannon Entropy. The entropy is 0 when a single item is always chosen or recommended, and log n when n items are chosen or recommended equally often. (see book https://link.springer.com/10.1007/978-1-4939-7131-2_110158)')
+def diversity(object, anonymous=False):
+ """
+ Calculate Shannon Entropy based on https://elliot.readthedocs.io/en/latest/guide/metrics/diversity.html?highlight=entropy#module-elliot.evaluation.metrics.diversity.shannon_entropy.shannon_entropy. The entropy is 0 when a single item is always chosen or recommended, and log n when n items are chosen or recommended equally often. See more in https://link.springer.com/content/pdf/10.1007/978-1-4899-7637-6.pdf, page 293.
+ """
+ # keep recommendations with or without anonymous suggestions
+ # based on anonymous flag (default=False, i.e. ignore anonymous)
+ if anonymous:
+ recs=object.recommendations
+ else:
+ recs=object.recommendations[(object.recommendations['User'] != -1)]
+
+ # this variable keeps the sum of user_norm (where user_norm is
+ # the count of how many times a User has been suggested)
+ # however since no cutoff at per user recommendations is applied and
+ # also since each recommendation entry is one-to-one
+ # then the total number of recommendations is equal to this sum
+ free_norm=len(recs.index)
+
+ # (remember that recommendations have been previously
+ # filtered based on the existence of users in users.csv and
+ # services in services.csv)
+
+ # user_norm
+ # group recommendations entries by user id and
+ # then count how many times each user has been suggested
+ gr_user=recs.groupby(['User']).count()
+
+ # create a dictionary of user_norm in order to
+ # map the user id to the respective user_norm
+ # key=user_id and value=user_norm
+ d_user=gr_user['Service'].to_dict()
+
+ # item_count
+ # group recommendations entries by service id and
+ # then count how many times each service has been suggested
+ gr_service=recs.groupby(['Service']).count()
+
+ # create a dictionary of item_count in order to
+ # map the service id to the respective item_count
+ # key=service_id and value=item_count
+ d_service=gr_service['User'].to_dict()
+
+ # it loops here for each service id, where
+ # key=service_id
+ for key in d_service:
+ # the impact of the service is calculated here
+ # the below line creates a list of the user ids
+ # where this particular service was suggested to
+ # the user list can contain duplicate users,
+ # because the same service might be suggested to
+ # the same user more than once
+ # for each element of the user list the associated
+ # user_norm value is found from the d_user dictionary
+ # when all user_norm are found, their reciprocals (1/user_norm) are summed up
+ # to determine the weight of the service
+ weight=sum(list(map(lambda x: 1./d_user[x],recs[(recs['Service']==key)]['User'].tolist())))
+
+ # this line calculates the Shannon Entropy of each particular
+ # service id and stores it to the d_service dictionary accordingly
+ # initially, the d_service[key] contains the item_count of each service
+ d_service[key]=-weight*math.log2(d_service[key]/free_norm)
+
+ # an overall value of the Shannon Entropy is returned by
+ # summing all individual ones and dividing them by the number
+ # of unique users
+ return round(sum(d_service.values())/len(d_user),4)
+
From fef98f2f724bab03b3013ce45ca890d11e67f06d Mon Sep 17 00:00:00 2001
From: Konstantinos Kagkelidis
Date: Tue, 31 May 2022 18:03:07 +0300
Subject: [PATCH 33/89] REC-52 Add diversity metric to report
---
metrics.py | 2 +-
report.html.prototype | 17 ++++++++++++++++-
2 files changed, 17 insertions(+), 2 deletions(-)
diff --git a/metrics.py b/metrics.py
index 92a794b..c5bb78e 100644
--- a/metrics.py
+++ b/metrics.py
@@ -1,7 +1,7 @@
#!/usr/bin/env python3
import pandas as pd
import numpy as np
-
+import math
class Runtime:
def __init__(self):
diff --git a/report.html.prototype b/report.html.prototype
index 90c8b12..436daca 100644
--- a/report.html.prototype
+++ b/report.html.prototype
@@ -44,12 +44,19 @@
color: #f9bdd6;
}
+.card-diversity {
+ background-color: #FA8B1C;
+ color:#F4E588;
+}
+
.card-footer{
background-color: rgb(247,247,247,0.8);
color: black;
font-style: italic;
}
+
+
span {
position: relative;
}
@@ -183,6 +190,14 @@ span:hover:before {
+
+
+
+
Diversity (Shannon Entropy):
+
+
+
+
@@ -207,7 +222,7 @@ function fill(data){
let fill_list = ['users', 'recommendations', 'services', 'user_actions','user_actions_order', 'user_actions_registered',
'user_actions_anonymous', 'user_actions_order_registered', 'user_actions_order_anonymous', 'user_actions_registered_perc',
'user_actions_anonymous_perc', 'user_actions_order_registered_perc', 'user_actions_order_anonymous_perc',
- 'catalog_coverage_perc', 'user_coverage_perc', 'timestamp', 'hit_rate', 'click_through_rate'];
+ 'catalog_coverage_perc', 'user_coverage_perc', 'timestamp', 'hit_rate', 'click_through_rate', 'diversity'];
for (item of fill_list) {
key = 'val_' + item
From 1bf4c887ecb9e83e2d8bf4c5cac616dd0c087508 Mon Sep 17 00:00:00 2001
From: Konstantinos Kagkelidis
Date: Tue, 31 May 2022 20:58:10 +0300
Subject: [PATCH 34/89] REC-51 Display report as a webservice
---
.gitignore | 8 +++++++
README.md | 27 +++++++++++++++++++++
environment.yml | 12 +++++++++-
report.html.prototype | 2 +-
report.py | 14 +++++++++--
requirements.txt | 9 +++++++
webservice/.env | 1 +
webservice/app.py | 28 ++++++++++++++++++++++
webservice/templates/report.html.prototype | 1 +
9 files changed, 98 insertions(+), 4 deletions(-)
create mode 100644 webservice/.env
create mode 100644 webservice/app.py
create mode 120000 webservice/templates/report.html.prototype
diff --git a/.gitignore b/.gitignore
index b7b65a7..f1e6fdd 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,11 @@
+# The following are handy to ignore in this specific project
+# please ignore generated folders with results such as /data and /report
+/data
+/report
+
+# please ignore changes in the configuration file. If default configuration file structure is changed please override this rule with git add -f
+/config.yaml
+
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
diff --git a/README.md b/README.md
index bbdde53..dac0385 100644
--- a/README.md
+++ b/README.md
@@ -175,4 +175,31 @@ chmod u+x ./get_service_catalog.py
./get_service_catalog.py
```
+#### Serve Evaluation Reports as a Service
+
+The `webservice` folder hosts a simple webservice implemented in the Flask framework which can be used to host the report results.
+
+__Note__: Please make sure you work in a virtual environment and you have already downloaded the required dependencies by issuing
+`pip install -r requirements.txt`
+
+The webservice application serves two endpoints
+ - `/` : This is the frontend webpage that displays the Report Results in a UI
+ - `/api` : This api call returns the evaluation metrics in json format
+
+To run the webservice issue:
+```
+cd ./webservice
+flask run
+```
+
+By default the webservice runs on localhost:5000; you can override this by issuing, for example:
+```
+flask run -h 127.0.0.1 -p 8080
+```
+
+There is an env variable `RS_EVALUATION_METRICS` which directs the webservice to the generated `metrics.json` file produced after the evaluation process.
+By default it honors this repo's folder structure and points to the root `/data/metrics.json` path.
+
+You can override this by editing the `.env` file inside the `/webservice` folder, or by specifying the `RS_EVALUATION_METRICS` variable accordingly before executing the `flask run` command
+
_Tested with python 3.9_
diff --git a/environment.yml b/environment.yml
index db48d7e..f61f3d1 100644
--- a/environment.yml
+++ b/environment.yml
@@ -25,16 +25,24 @@ dependencies:
- zlib=1.2.11=h7f8727e_4
- pip:
- beautifulsoup4==4.10.0
+ - certifi==2021.10.8
- charset-normalizer==2.0.12
+ - click==8.1.3
+ - Flask==2.1.2
- idna==3.3
+ - importlib-metadata==4.11.4
+ - itsdangerous==2.1.2
+ - Jinja2==3.1.2
- joblib==1.1.0
+ - MarkupSafe==2.1.1
- natsort==8.1.0
- numpy==1.22.3
- pandas==1.4.2
- pymongo==4.1.0
- python-dateutil==2.8.2
+ - python-dotenv==0.20.0
- pytz==2022.1
- - pyyaml==6.0
+ - PyYAML==6.0
- requests==2.27.1
- scikit-surprise==1.1.1
- scipy==1.8.0
@@ -42,3 +50,5 @@ dependencies:
- soupsieve==2.3.2
- surprise==0.1
- urllib3==1.26.9
+ - Werkzeug==2.1.2
+ - zipp==3.8.0
diff --git a/report.html.prototype b/report.html.prototype
index 436daca..d0fe318 100644
--- a/report.html.prototype
+++ b/report.html.prototype
@@ -240,7 +240,7 @@ function fill(data){
// Autostart
(function() {
- fetch('metrics.json')
+ fetch("{{metric_source}}")
.then(response => response.json())
.then(data => fill(data));
}());
diff --git a/report.py b/report.py
index 1c6bcf9..a5f192f 100755
--- a/report.py
+++ b/report.py
@@ -8,6 +8,7 @@
import argparse
from pathlib import Path
import shutil
+from jinja2 import Template
@@ -22,10 +23,19 @@ def main(args=None):
# create output folder if doesn't exist
Path(args.output).mkdir(parents=True, exist_ok=True)
- # copy needed files
- shutil.copy("./report.html.prototype", args.output+"/index.html")
+ # prepare needed files
+
+ # copy metrics.json to the appropriate folder
shutil.copy(args.input+"/metrics.json", args.output)
+ # modify report.html.prototype template to generate appropriate html file
+ with open('./report.html.prototype') as f:
+ template = Template(f.read())
+ # fill template with the source of the metric data which will be the metrics.json file
+ # save the template as index.html to the appropriate reports folder
+ template.stream(metric_source="metrics.json").dump(args.output+"/index.html")
+
+
threading.Thread(target=start_server, args=(args,)).start()
webbrowser.open_new("http://"+args.address+":"+args.port)
diff --git a/requirements.txt b/requirements.txt
index 688721f..cbf4e24 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,13 +1,20 @@
beautifulsoup4==4.10.0
certifi==2021.10.8
charset-normalizer==2.0.12
+click==8.1.3
+Flask==2.1.2
idna==3.3
+importlib-metadata==4.11.4
+itsdangerous==2.1.2
+Jinja2==3.1.2
joblib==1.1.0
+MarkupSafe==2.1.1
natsort==8.1.0
numpy==1.22.3
pandas==1.4.2
pymongo==4.1.0
python-dateutil==2.8.2
+python-dotenv==0.20.0
pytz==2022.1
PyYAML==6.0
requests==2.27.1
@@ -17,3 +24,5 @@ six==1.16.0
soupsieve==2.3.2
surprise==0.1
urllib3==1.26.9
+Werkzeug==2.1.2
+zipp==3.8.0
diff --git a/webservice/.env b/webservice/.env
new file mode 100644
index 0000000..700691d
--- /dev/null
+++ b/webservice/.env
@@ -0,0 +1 @@
+RS_EVALUATION_METRICS=../data/metrics.json
\ No newline at end of file
diff --git a/webservice/app.py b/webservice/app.py
new file mode 100644
index 0000000..f7273f6
--- /dev/null
+++ b/webservice/app.py
@@ -0,0 +1,28 @@
+from flask import Flask, render_template, jsonify
+import json, os
+from dotenv import load_dotenv
+
+
+app = Flask('RS_EVALUATION')
+dotenv_path = os.path.join(app.instance_path, '.env')
+load_dotenv(dotenv_path)
+
+app.config['RS_EVALUATION_METRICS'] = os.environ.get('RS_EVALUATION_METRICS')
+
+
+@app.route("/")
+def main_page():
+ '''Serve the main page that constructs the report view'''
+ # Render the report template and specify the metric source to be '/api' since the report is hosted in the webservice
+ return render_template('./report.html.prototype',metric_source='/api')
+
+
+@app.route("/api")
+def api_metrics():
+ '''Serve the metrics data in json format as a default api response'''
+ result = {}
+
+ with open(app.config['RS_EVALUATION_METRICS'], 'r') as f:
+ result = json.load(f)
+ f.close()
+ return jsonify(result)
\ No newline at end of file
diff --git a/webservice/templates/report.html.prototype b/webservice/templates/report.html.prototype
new file mode 120000
index 0000000..3159f23
--- /dev/null
+++ b/webservice/templates/report.html.prototype
@@ -0,0 +1 @@
+../../report.html.prototype
\ No newline at end of file
From 30e5cf98459a54de2f2d2827c135d832b89bc156 Mon Sep 17 00:00:00 2001
From: Nikolaos Triantafyllis
Date: Mon, 6 Jun 2022 19:13:39 +0300
Subject: [PATCH 35/89] REC-53 Calculating Novelty
---
metrics.py | 70 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 70 insertions(+)
diff --git a/metrics.py b/metrics.py
index c5bb78e..65e52d6 100644
--- a/metrics.py
+++ b/metrics.py
@@ -400,3 +400,73 @@ def diversity(object, anonymous=False):
# of unique users
return round(sum(d_service.values())/len(d_user),4)
+@doc('Calculate novelty (Expected Free Discovery -EFD-) as the expected Inverse Collection Frequency -ICF- of (relevant and seen) recommended items')
+def novelty(object, anonymous=False):
+ """
+ Calculate novelty (Expected Free Discovery -EFD-) as
+ the expected Inverse Collection Frequency -ICF- of
+ (relevant and seen) recommended items
+ """
+ # inner function to run on each pandas df row
+ def nanmap(row):
+ if np.isnan(row.values[0]):
+ try:
+ return gr_service_target['User'][row.name]
+ except:
+ return gr_service_source['User'][row.name]
+ else:
+ return row
+ # no ranking (rank=1) - recommendation items are equally weighted
+ # no relevance (p(rel)=1) - an item is liked, picked, enjoyed (not such info)
+ # no discount - (disc(k)=1) - user views all recommendation items (not paging)
+
+ # keep recommendations with or without anonymous suggestions
+ # based on anonymous flag (default=False, i.e. ignore anonymous)
+ if anonymous:
+ recs=object.recommendations
+ uas=object.user_actions
+ else:
+ recs=object.recommendations[(object.recommendations['User'] != -1)]
+ uas=object.user_actions[(object.user_actions['User'] != -1)]
+
+ # item_count
+ # group user actions entries by service id and
+ # then count how many times each service appears as source or as target
+ gr_service_source=uas.groupby(['Source_Service']).count()
+ gr_service_target=uas.groupby(['Target_Service']).count()
+ # merge above results
+ gr_service=gr_service_source+gr_service_target
+ # when a value is NaN keep the other value (search on both dfs)
+ gr_service=gr_service.apply(nanmap, axis=1)
+
+ # create a dictionary of item_count in order to
+ # map the service id to the respective item_count
+ # key=service_id and value=item_count
+ d_service=gr_service['User'].to_dict()
+
+ # this variable keeps the sum of all item_count values, i.e.
+ # the total number of times any service appears in the
+ # selected user actions as either source or target
+ # it is used to turn each item_count into a relative
+ # frequency before the logarithm is taken
+ norm=sum(d_service.values())
+
+ # get the max novelty by getting the service with the lowest item_count
+ max_nov=-math.log2(min(d_service.values())/norm)
+
+ # calculate novelty for all services
+ d_service = {service: -math.log2(item_count/norm) for service, item_count in d_service.items()} # fix user_actions not recommendations to gather services
+
+ # get all unique users found in recommendations
+ users = recs['User'].unique()
+
+ # use max_nov -> min count if x service not found (removed functionality)
+ d_user={}
+ for u in users:
+ u_norm=len(recs[(recs['User']==u)].index)
+ d_user[u]=sum(list(map(lambda x: d_service.get(x, max_nov),recs[(recs['User']==u)]['Service'].tolist())))/u_norm # fix norm=number of recommended items per user
+
+ # average value (not in elliot)
+ return round(sum(d_user.values())/len(users),4)
+
+
From d9dd6bbdad5b9456cdd08a89cc23fd77e6f12099 Mon Sep 17 00:00:00 2001
From: Konstantinos Kagkelidis
Date: Tue, 14 Jun 2022 09:59:17 +0300
Subject: [PATCH 36/89] REC-57 Add novelty to the UI
---
report.html.prototype | 14 +++++++++++++-
1 file changed, 13 insertions(+), 1 deletion(-)
diff --git a/report.html.prototype b/report.html.prototype
index d0fe318..6511667 100644
--- a/report.html.prototype
+++ b/report.html.prototype
@@ -49,6 +49,10 @@
color:#F4E588;
}
+.card-novelty {
+ background-color: #ffaf26;
+ color:#f6f5a2;
+}
.card-footer{
background-color: rgb(247,247,247,0.8);
color: black;
@@ -198,6 +202,14 @@ span:hover:before {
+
@@ -222,7 +234,7 @@ function fill(data){
let fill_list = ['users', 'recommendations', 'services', 'user_actions','user_actions_order', 'user_actions_registered',
'user_actions_anonymous', 'user_actions_order_registered', 'user_actions_order_anonymous', 'user_actions_registered_perc',
'user_actions_anonymous_perc', 'user_actions_order_registered_perc', 'user_actions_order_anonymous_perc',
- 'catalog_coverage_perc', 'user_coverage_perc', 'timestamp', 'hit_rate', 'click_through_rate', 'diversity'];
+ 'catalog_coverage_perc', 'user_coverage_perc', 'timestamp', 'hit_rate', 'click_through_rate', 'diversity', 'novelty'];
for (item of fill_list) {
key = 'val_' + item
From 638d01f46d43b476c2ccc738957bda782fd86f0f Mon Sep 17 00:00:00 2001
From: Nikolaos Triantafyllis
Date: Mon, 20 Jun 2022 21:54:04 +0300
Subject: [PATCH 37/89] REC 58 Adding the GiniIndex Diversity
---
metrics.py | 54 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 54 insertions(+)
diff --git a/metrics.py b/metrics.py
index 65e52d6..51985e2 100644
--- a/metrics.py
+++ b/metrics.py
@@ -469,4 +469,58 @@ def nanmap(row):
# average value (not in elliot)
return round(sum(d_user.values())/len(users),4)
+@doc('The diversity of the recommendations according to GiniIndex. The index is 0 when all items are chosen equally often, and 1 when a single item is always chosen.(see book https://link.springer.com/10.1007/978-1-4939-7131-2_110158)')
+def diversity_gini(object, anonymous=False):
+ """
+ Calculate GiniIndex based on https://elliot.readthedocs.io/en/latest/_modules/elliot/evaluation/metrics/diversity/gini_index/gini_index.html#GiniIndex.
+ """
+ # keep recommendations with or without anonymous suggestions
+ # based on anonymous flag (default=False, i.e. ignore anonymous)
+ if anonymous:
+ recs=object.recommendations
+ else:
+ recs=object.recommendations[(object.recommendations['User'] != -1)]
+
+ # this variable keeps the sum of user_norm (where user_norm is
+ # the count of how many times a User has been suggested)
+ # however since no cutoff at per user recommendations is applied and
+ # also since each recommendation entry is one-to-one
+ # then the total number of recommendations is equal to this sum
+ free_norm=len(recs.index)
+
+ # (remember that recommendations have been previously
+ # filtered based on the existence of users in users.csv and
+ # services in services.csv)
+
+ # user_norm
+ # group recommendations entries by user id and
+ # then count how many times each user has been suggested
+ #gr_user=recs.groupby(['User']).count()
+
+ # create a dictionary of user_norm in order to
+ # map the user id to the respective user_norm
+ # key=user_id and value=user_norm
+ #d_user=gr_user['Service'].to_dict()
+
+ # item_count
+ # group recommendations entries by service id and
+ # then count how many times each service has been suggested
+ gr_service=recs.groupby(['Service']).count()
+
+ # create a dictionary of item_count in order to
+ # map the service id to the respective item_count
+ # key=service_id and value=item_count
+ d_service=gr_service['User'].to_dict()
+
+
+ n_recommended_items = len(d_service)
+ num_items = services(object)
+
+ gini = sum([(2 * (j + 1 + num_items-n_recommended_items) -num_items -1) * (cs / free_norm) for j, cs in enumerate(sorted(d_service.values()))])
+
+ gini /= (num_items - 1)
+ gini = 1 - gini
+
+ return round(gini,4)
+
From 02de989217478ed4a54b463bcb8a9bc127f976d9 Mon Sep 17 00:00:00 2001
From: Konstantinos Kagkelidis
Date: Mon, 27 Jun 2022 09:27:05 +0300
Subject: [PATCH 38/89] REC-55 Reorganize metrics output
---
metrics.py | 109 ++++++++++++++++++++++--------------------
report.html.prototype | 32 ++++++++++---
rsmetrics.py | 53 ++++++++++++--------
3 files changed, 115 insertions(+), 79 deletions(-)
diff --git a/metrics.py b/metrics.py
index 51985e2..2285e91 100644
--- a/metrics.py
+++ b/metrics.py
@@ -11,18 +11,28 @@ def __init__(self):
self.user_actions_all=None
self.recommendations=None
-# decorator to add the text attribute to function
-def doc(r):
+
+
+# decorator to add the text attribute to function as major metric
+def metric(txt):
def wrapper(f):
- f.text = r
+ f.kind = "metric"
+ f.doc = txt
return f
return wrapper
+# decorator to add the text attribute to function
+def statistic(txt):
+ def wrapper(f):
+ f.kind = "statistic"
+ f.doc = txt
+ return f
+ return wrapper
# Metrics
-@doc('The initial date where metrics are calculated on')
+@statistic('The initial date where metrics are calculated on')
def start(object):
"""
Calculate the start date where metrics are calculated on
@@ -31,7 +41,8 @@ def start(object):
"""
return str(min(min(object.user_actions['Timestamp']),min(object.recommendations['Timestamp'])))
-@doc('The final date where metrics are calculated on')
+
+@statistic('The final date where metrics are calculated on')
def end(object):
"""
Calculate the end date where metrics are calculated on
@@ -40,7 +51,8 @@ def end(object):
"""
return str(max(max(object.user_actions['Timestamp']),max(object.recommendations['Timestamp'])))
-@doc('The total number of unique users found in users.csv (if provided), otherwise in user_actions.csv')
+
+@statistic('The total number of unique users found in users.csv (if provided), otherwise in user_actions.csv')
def users(object):
"""
Calculate the total number of unique users
@@ -53,7 +65,7 @@ def users(object):
return int(object.user_actions.nunique()['User'])
-@doc('The total number of unique services found in services.csv (if provided), otherwise in user_actions.csv')
+@statistic('The total number of unique services found in services.csv (if provided), otherwise in user_actions.csv')
def services(object):
"""
Calculate the total number of unique services
@@ -66,7 +78,7 @@ def services(object):
return len(np.unique(np.concatenate([object.user_actions['Source_Service'].unique(),object.user_actions['Target_Service'].unique()])))
-@doc('The total number of recommendations found in recommendations.csv')
+@statistic('The total number of recommendations found in recommendations.csv')
def recommendations(object):
"""
Calculate the total number of recommendations
@@ -74,7 +86,8 @@ def recommendations(object):
"""
return len(object.recommendations.index)
-@doc('The total number of recommendations for registered users found in recommendations.csv')
+
+@statistic('The total number of recommendations for registered users found in recommendations.csv')
def recommendations_registered(object):
"""
Calculate the total number of recommendations for registered users
@@ -83,7 +96,7 @@ def recommendations_registered(object):
return len(object.recommendations[object.recommendations['User'] != -1].index)
-@doc('The total number of recommendations for anonymous users found in recommendations.csv')
+@statistic('The total number of recommendations for anonymous users found in recommendations.csv')
def recommendations_anonymous(object):
"""
Calculate the total number of recommendations for anonymous users
@@ -92,7 +105,8 @@ def recommendations_anonymous(object):
return recommendations(object)-recommendations_registered(object)
-@doc('The percentage (%) of recommendations for registered users to the total recommendations')
+
+@statistic('The percentage (%) of recommendations for registered users to the total recommendations')
def recommendations_registered_perc(object):
"""
Calculate the percentage (%) of recommendations occurred
@@ -102,7 +116,7 @@ def recommendations_registered_perc(object):
return round(recommendations_registered(object)*100.0/recommendations(object),2)
-@doc('The percentage (%) of recommendations for anonymous users to the total recommendations')
+@statistic('The percentage (%) of recommendations for anonymous users to the total recommendations')
def recommendations_anonymous_perc(object):
"""
Calculate the percentage (%) of recommendations occurred
@@ -112,7 +126,7 @@ def recommendations_anonymous_perc(object):
return round(100.0-recommendations_registered_perc(object),2)
-@doc('The total number of user actions found in user_actions.csv')
+@statistic('The total number of user actions found in user_actions.csv')
def user_actions(object):
"""
Calculate the total number of user_actions
@@ -121,7 +135,7 @@ def user_actions(object):
return len(object.user_actions.index)
-@doc('The total number of user actions occurred by registered users found in user_actions.csv')
+@statistic('The total number of user actions occurred by registered users found in user_actions.csv')
def user_actions_registered(object):
"""
Calculate the total number of user_actions occurred by registered users
@@ -130,7 +144,7 @@ def user_actions_registered(object):
return len(object.user_actions[object.user_actions['User'] != -1].index)
-@doc('The total number of user actions occurred by anonymous users found in user_actions.csv')
+@statistic('The total number of user actions occurred by anonymous users found in user_actions.csv')
def user_actions_anonymous(object):
"""
Calculate the total number of user_actions occurred by anonymous users
@@ -139,7 +153,7 @@ def user_actions_anonymous(object):
return user_actions(object)-user_actions_registered(object)
-@doc('The percentage (%) of user actions occurred by registered users to the total user actions')
+@statistic('The percentage (%) of user actions occurred by registered users to the total user actions')
def user_actions_registered_perc(object):
"""
Calculate the percentage (%) of user actions occurred
@@ -149,7 +163,7 @@ def user_actions_registered_perc(object):
return round(user_actions_registered(object)*100.0/user_actions(object),2)
-@doc('The percentage (%) of user actions occurred by anonymous users to the total user actions')
+@statistic('The percentage (%) of user actions occurred by anonymous users to the total user actions')
def user_actions_anonymous_perc(object):
"""
Calculate the percentage (%) of user actions occurred
@@ -159,7 +173,7 @@ def user_actions_anonymous_perc(object):
return round(100.0-user_actions_registered_perc(object),2)
-@doc('The total number of user actions led to order found in user_actions.csv')
+@statistic('The total number of user actions led to order found in user_actions.csv')
def user_actions_order(object):
"""
Calculate the total number of user_actions led to order
@@ -168,7 +182,7 @@ def user_actions_order(object):
return len(object.user_actions[object.user_actions['Reward'] == 1.0].index)
-@doc('The total number of user actions led to order by registered users found in user_actions.csv')
+@statistic('The total number of user actions led to order by registered users found in user_actions.csv')
def user_actions_order_registered(object):
"""
Calculate the total number of user_actions led to order by registered users
@@ -177,7 +191,7 @@ def user_actions_order_registered(object):
return len(object.user_actions[(object.user_actions['Reward'] == 1.0) & (object.user_actions['User'] != -1)].index)
-@doc('The total number of user actions led to order by anonymous users found in user_actions.csv')
+@statistic('The total number of user actions led to order by anonymous users found in user_actions.csv')
def user_actions_order_anonymous(object):
"""
Calculate the total number of user_actions led to order by anonymous users
@@ -186,7 +200,7 @@ def user_actions_order_anonymous(object):
return user_actions_order(object)-user_actions_order_registered(object)
-@doc('The percentage (%) of user actions occurred by registered users and led to order to the total user actions that led to order')
+@statistic('The percentage (%) of user actions occurred by registered users and led to order to the total user actions that led to order')
def user_actions_order_registered_perc(object):
"""
Calculate the percentage (%) of user actions occurred
@@ -196,7 +210,7 @@ def user_actions_order_registered_perc(object):
return round(user_actions_order_registered(object)*100.0/user_actions_order(object),2)
-@doc('The percentage (%) of user actions occurred by anonymous users and led to order to the total user actions that led to order')
+@statistic('The percentage (%) of user actions occurred by anonymous users and led to order to the total user actions that led to order')
def user_actions_order_anonymous_perc(object):
"""
Calculate the percentage (%) of user actions occurred
@@ -206,7 +220,7 @@ def user_actions_order_anonymous_perc(object):
return round(100.0-user_actions_order_registered_perc(object),2)
-@doc('The total number of user actions assosicated with the recommendation panel found in user_actions.csv')
+@statistic('The total number of user actions assosicated with the recommendation panel found in user_actions.csv')
def user_actions_panel(object):
"""
Calculate the total number of user_actions assosicated with the recommendation panel
@@ -215,7 +229,7 @@ def user_actions_panel(object):
return len(object.user_actions[object.user_actions['Action'] == 'recommendation_panel'].index)
-@doc('The percentage (%) of user actions assosicated with the recommendation panel to the total user actions')
+@statistic('The percentage (%) of user actions assosicated with the recommendation panel to the total user actions')
def user_actions_panel_perc(object):
"""
Calculate the percentage (%) of user actions assosicated with
@@ -225,8 +239,8 @@ def user_actions_panel_perc(object):
return round(user_actions_panel(object)*100.0/user_actions(object),2)
-@doc('The total number of unique services found in recommendations.csv')
-def catalog_coverage(object):
+@statistic('The total number of unique services found in recommendations.csv')
+def total_unique_services_recommended(object):
"""
Calculate the total number of unique services
found in recommendations.csv
@@ -234,18 +248,18 @@ def catalog_coverage(object):
return int(object.recommendations.nunique()['Service'])
-@doc('The percentage (%) of unique services found in recommedations.csv to the total number of services (provided or found otherwise in user_actions.csv)')
-def catalog_coverage_perc(object):
+@metric('The percentage (%) of unique services found in recommedations.csv to the total number of services (provided or found otherwise in user_actions.csv)')
+def catalog_coverage(object):
"""
Calculate the percentage (%) of unique services
found in recommedations.csv to the total number
of services (provided or found otherwise in user_actions.csv)
"""
- return round(catalog_coverage(object)*100.0/services(object),2)
+ return round(total_unique_services_recommended(object)*100.0/services(object),2)
-@doc('The total number of unique users found in recommendations.csv')
-def user_coverage(object):
+@statistic('The total number of unique users found in recommendations.csv')
+def total_unique_users_recommended(object):
"""
Calculate the total number of unique users
found in recommendations.csv
@@ -253,16 +267,17 @@ def user_coverage(object):
return int(object.recommendations.nunique()['User'])
-@doc('The percentage (%) of unique users found in recommedations.csv to the total number of users (provided or found otherwise in user_actions.csv)')
-def user_coverage_perc(object):
+@metric('The percentage (%) of unique users found in recommedations.csv to the total number of users (provided or found otherwise in user_actions.csv)')
+def user_coverage(object):
"""
Calculate the percentage (%) of unique users
found in recommedations.csv to the total number
of users (provided or found otherwise in user_actions.csv)
"""
- return round(user_coverage(object)*100.0/users(object),2)
+ return round(total_unique_users_recommended(object)*100.0/users(object),2)
-@doc('The ratio of user hits divided by the total number of users (user hit: a user that has accessed at least one service that is also a personal recommendation)')
+
+@metric('The ratio of user hits divided by the total number of users (user hit: a user that has accessed at least one service that is also a personal recommendation)')
def hit_rate(object):
"""
For each user get the recommended services and the services the user accessed
@@ -305,7 +320,7 @@ def hit_rate(object):
return round(hits/len(users),5)
-@doc('The number of user clicks through recommendations panels divided by the total times recommendation panels were presented to users. Takes into account all historical data of user actions')
+@metric('The number of user clicks through recommendations panels divided by the total times recommendation panels were presented to users. Takes into account all historical data of user actions')
def click_through_rate(object):
"""
Get only the user actions that present a recommendation panel to the user in the source page
@@ -332,7 +347,8 @@ def click_through_rate(object):
return round(len(user_actions_recpanel_clicks)/len(user_actions_recpanel_views),2)
-@doc('The diversity of the recommendations according to Shannon Entropy. The entropy is 0 when a single item is always chosen or recommended, and log n when n items are chosen or recommended equally often. (see book https://link.springer.com/10.1007/978-1-4939-7131-2_110158)')
+
+@metric('The diversity of the recommendations according to Shannon Entropy. The entropy is 0 when a single item is always chosen or recommended, and log n when n items are chosen or recommended equally often. (see book https://link.springer.com/10.1007/978-1-4939-7131-2_110158)')
def diversity(object, anonymous=False):
"""
Calculate Shannon Entropy based on https://elliot.readthedocs.io/en/latest/guide/metrics/diversity.html?highlight=entropy#module-elliot.evaluation.metrics.diversity.shannon_entropy.shannon_entropy. The entropy is 0 when a single item is always chosen or recommended, and log n when n items are chosen or recommended equally often. See more in https://link.springer.com/content/pdf/10.1007/978-1-4899-7637-6.pdf, page 293.
@@ -400,7 +416,8 @@ def diversity(object, anonymous=False):
# of unique users
return round(sum(d_service.values())/len(d_user),4)
-@doc('Calculate novelty (Expected Free Discovery -EFD-) as the expected Inverse Collection Frequency -ICF- of (relevant and seen) recommended items')
+
+@metric('Calculate novelty (Expected Free Discovery -EFD-) as the expected Inverse Collection Frequency -ICF- of (relevant and seen) recommended items')
def novelty(object, anonymous=False):
"""
Calculate novelty (Expected Free Discovery -EFD-) as
@@ -469,7 +486,7 @@ def nanmap(row):
# average value (not in elliot)
return round(sum(d_user.values())/len(users),4)
-@doc('The diversity of the recommendations according to GiniIndex. The index is 0 when all items are chosen equally often, and 1 when a single item is always chosen.(see book https://link.springer.com/10.1007/978-1-4939-7131-2_110158)')
+@metric('The diversity of the recommendations according to GiniIndex. The index is 0 when all items are chosen equally often, and 1 when a single item is always chosen.(see book https://link.springer.com/10.1007/978-1-4939-7131-2_110158)')
def diversity_gini(object, anonymous=False):
"""
Calculate GiniIndex based on https://elliot.readthedocs.io/en/latest/_modules/elliot/evaluation/metrics/diversity/gini_index/gini_index.html#GiniIndex.
@@ -488,20 +505,6 @@ def diversity_gini(object, anonymous=False):
# then the total number of recommendations is equal to this sum
free_norm=len(recs.index)
- # (remember that recommendations have been previously
- # filtered based on the existance of users in user.csv and
- # services in services.csv)
-
- # user_norm
- # group recommendations entries by user id and
- # then count how many times each user has been suggested
- #gr_user=recs.groupby(['User']).count()
-
- # create a dictionary of user_norm in order to
- # map the user id to the respective user_norm
- # key= and value=
- #d_user=gr_user['Service'].to_dict()
-
# item_count
# group recommendations entries by service id and
# then count how many times each service has been suggested
diff --git a/report.html.prototype b/report.html.prototype
index 6511667..304aa81 100644
--- a/report.html.prototype
+++ b/report.html.prototype
@@ -165,17 +165,17 @@ span:hover:before {
-
User Coverage: %
+ User Coverage: %
-
+
-
Catalog Coverage: %
+ Catalog Coverage: %
-
+
@@ -230,16 +230,34 @@ span:hover:before {
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
{{data.name}}
+
{{data.summary}}
+
+
+
+
+
+
metric
+
+
+
+
+
Description
+
{{data.description}}
+
+
+
+
+
+
+
+
+
+
+
+ Type {{data.output.type}}
+
+
+
Range Values
+
Min={{data.output.min}} to Max={{data.output.max}}
+
+
+
{{data.output.comment}}
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Process
+
+
+ {%for item in data.process %}
+ -
+ {{item.step}}
+
+
{{item.details}}
+
+ {%endfor%}
+
+
+
+
+
+
+
+
+
+
+
+
+
+