Top 5 recommended services
++ + {% for item in data.top5_services_recommended.value %} +
+ {%endfor%} + + + +diff --git a/.gitignore b/.gitignore index b7b65a7..f1e6fdd 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,11 @@ +# The following are handy to ignore in this specific project +# please ignore generated folders with results such as /data and /report +/data +/report + +# please ignore changes in the configuration file. If default configuration file structure is changed please override this rule with git add -f +/config.yaml + # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] diff --git a/README.md b/README.md index b765902..dac0385 100644 --- a/README.md +++ b/README.md @@ -3,15 +3,15 @@ A framework for counting the recommender metrics # Preprocessor v.0.2
# RS metrics v.0.2 @@ -61,14 +61,38 @@ optional arguments: ``` 8. Configure `./preprocessor.py` by editting the `config.yaml` or providing another with `-c`: -- - - -
+```yaml + +# Set the desired connector (e.g. MongoDB) +Source: + MongoDB: + host: localhost + port: 27017 + db: recommender_dev + +User: + export: true + +Service: + # if true it keeps only published, otherwise all + # this has an effect in exporting when from is set to 'source' + # and also in metrics calculations where service is considered + published: true + # Use the EOSC-Marketplace webpage + # to associate page_id and service_id + download: true + path: ./page_map -9. Run from terminal: `./rsmetrics.py` to run RSmetrics + export: true + from: 'page_map' # or 'source' + +# Calculate source's metrics +Metrics: true + +``` + +9. Run from terminal: `./rsmetrics.py --users --services` to run RSmetrics and include the `users.csv` and `services.csv` files generated by the Preprocessor ```bash _____ _____ _ _ | __ \ / ____| | | (_) @@ -151,4 +175,31 @@ chmod u+x ./get_service_catalog.py ./get_service_catalog.py ``` +#### Serve Evaluation Reports as a Service + +The `webservice` folder hosts a simple webservice implemented in Flask framework which can be used to host the report results. + +__Note__: Please make sure you work in a virtual environment and you have already downloaded the required dependencies by issuing +`pip install -r requirements.txt` + +The webservice application serves two endpoints + - `/` : This is the frontend webpage that displays the Report Results in a UI + - `/api` : This api call returns the evaluation metrics in json format + +To run the webservice issue: +``` +cd ./webservice +flask run +``` + +The webservice by default runs in localhost:5000 you can override this by issuing for example: +``` +flask run -h 127.0.0.1 -p 8080 +``` + +There is an env variable `RS_EVAL_METRIC_SOURCE` which directs the webservice to the generated `metrics.json` file produced after the evaluation process. 
+This by default honors this repo's folder structure and directs to the root `/data/metrics.json` path + +You can override this by editing the `.env` file inside the `/webservice` folder, or specify the `RS_EVAL_METRIC_SOURCE` variable accordingly before executing the `flask run` command + _Tested with python 3.9_ diff --git a/config.yaml b/config.yaml index 6513966..26cf0ab 100644 --- a/config.yaml +++ b/config.yaml @@ -5,30 +5,34 @@ Source: port: 27017 db: recommender_dev -User: - export: true - #from: 'user_actions' - #from: 'recommendations' - from: 'source' +# The database where the Preprocessor's +# and RSmetrics data are stored +Datastore: + MongoDB: + host: localhost + port: 27017 + db: rsmetrics Service: # Use the EOSC-Marketplace webpage - # to associate page_id and service_id - download: true - path: ./page_map - - export: true - #from: 'user_actions' - #from: 'recommendations' - from: 'source' - #from: 'page_map' - - published: false # applies only on source option - -User-actions: - merge: false # not implemented yet - -# Calculate source's metrics + # to retrieve resources and + # associate the page_id and the service_id + Portal: + download: true + path: ./page_map + + # if true it keeps only published, otherwise all + # this has an effect in exporting when from is set to 'source' + # and also in metrics calculations where service is considered + published: true + + # which origin to use to retrieve Resources + # two options available: + # - 'source': use the Connector + # - 'page_map': use the EOSC Marketplace + from: 'page_map' # or 'source' + +# Calculate source's metrics (pre-metrics) Metrics: true diff --git a/environment.yml b/environment.yml index db48d7e..290b03b 100644 --- a/environment.yml +++ b/environment.yml @@ -25,16 +25,24 @@ dependencies: - zlib=1.2.11=h7f8727e_4 - pip: - beautifulsoup4==4.10.0 + - certifi==2021.10.8 - charset-normalizer==2.0.12 + - click==8.1.3 + - Flask==2.1.2 - idna==3.3 - - joblib==1.1.0 + - 
importlib-metadata==4.11.4 + - itsdangerous==2.1.2 + - Jinja2==3.1.2 + - joblib==1.2.0 + - MarkupSafe==2.1.1 - natsort==8.1.0 - numpy==1.22.3 - pandas==1.4.2 - pymongo==4.1.0 - python-dateutil==2.8.2 + - python-dotenv==0.20.0 - pytz==2022.1 - - pyyaml==6.0 + - PyYAML==6.0 - requests==2.27.1 - scikit-surprise==1.1.1 - scipy==1.8.0 @@ -42,3 +50,6 @@ dependencies: - soupsieve==2.3.2 - surprise==0.1 - urllib3==1.26.9 + - Werkzeug==2.1.2 + - zipp==3.8.0 + - flask-pymongo==2.3.0 diff --git a/get_service_catalog.py b/get_service_catalog.py index d3423ac..7cc80a5 100755 --- a/get_service_catalog.py +++ b/get_service_catalog.py @@ -50,7 +50,7 @@ def get_service_catalog_items(content): for item in results: a = item.findChildren("a", recursive=False)[0] row = [int(item.attrs["data-service-id"]), - item.text.strip(), a['href']] + a.text.strip(), a['href']] rows.append(row) # sort rows by id rows = sorted(rows, key=lambda x: x[0]) diff --git a/metric_descriptions/README.md b/metric_descriptions/README.md new file mode 100644 index 0000000..f5fdeee --- /dev/null +++ b/metric_descriptions/README.md @@ -0,0 +1,12 @@ +# Metric Descriptions folder + +This folder is meant to contain detailed yaml files defining in structure the implementation details of each metric +To add a new detailed description in this folder please consult the first file added here: diversity.yml and structure +the information accordingly + +### Important Note on filenames +The filename should correspond to the name of the metric used in `metrics.json` output and the extension `.yml` +So for the metric Shannon Diversity the short name used in `metrics.json` is `diversity` thus the filename is `diversity.yml` + +### Multiline values +In yaml fields that you need to support multiline string content please use the `>` operator diff --git a/metric_descriptions/catalog-coverage.yml b/metric_descriptions/catalog-coverage.yml new file mode 100644 index 0000000..d993588 --- /dev/null +++ 
b/metric_descriptions/catalog-coverage.yml @@ -0,0 +1,36 @@ +name: Catalog Coverage + +summary: > + The percentage (%) of the division of the unique services found in recommendations to the total number of published services + +description: > + The Catalog Coverage is described by the formula $$\frac{unique\_rec\_services}{services}$$ + +output: + type: float + min: 0 + max: 100 + comment: Catalog Coverage is 0 when none of the services is being recommended, and 100 when all of them are being recommended. + +prerequisites: + - all available recommendations + - all available services + +process: + - step: Retrieve recommendations + details: > + Retrieve all available recommendations found in source + - step: Gather all unique services + details: > + Gather all unique services found in all available recommendations + - step: Retrieve services + details: > + Retrieve all available published services found in source + - step: Calculate ratio + details: > + Calculate the percentage (%) of the division of the unique services found in recommendations to the total number of published services + +# This is optional for visual stylization of the metric when displayed on the report +style: + icon: pe-7s-box2 + color: bg-malibu-beach \ No newline at end of file diff --git a/metric_descriptions/click-through-rate.yml b/metric_descriptions/click-through-rate.yml new file mode 100644 index 0000000..edf28f4 --- /dev/null +++ b/metric_descriptions/click-through-rate.yml @@ -0,0 +1,37 @@ +name: Click-Through Rate + +summary: > + The number of user clicks through recommendations panels divided by the total times recommendation panels were presented to users. + +description: > + The number of user clicks through recommendations panels divided by the total times recommendation panels were presented to users. Takes into account all historical data of user actions. 
The metric is expressed by the formula: $$Click-Through Rate=\frac{clicks}{views}$$ +output: + type: float + min: 0 + max: +inf + comment: A value of 0 indicates that no clicks through recommendations panels occurred + +prerequisites: + - all available user actions + +process: + - step: Retrieve user actions with recommendation panel + details: > + Get only the user actions that present a recommendation panel to the user in the source page. Those are actions with the following source paths: (i) /services, (ii) /services/, (iii) /services/c/{any category name} + - step: Count user actions with recommendation panel + details: > + Count the items in the above list as they represent the times recommendations panels were presented to the users of the portal + - step: Filter list + details: > + Narrow the above list into a new subset by selecting only user actions that originate from a recommendation panel. Those are actions that have the 'recommendation' string in the Action column + - step: Count user actions with clicks through recommendation panel + details: > + Count the items in the subset as they represent the times users clicked through recommendations + - step: Calculate ratio + details: > + Divide the items of the subset with the items of the first list to get the click-through rate + +# This is optional for visual stylization of the metric when displayed on the report +style: + icon: pe-7s-mouse + color: bg-grow-early diff --git a/metric_descriptions/diversity-gini.yml b/metric_descriptions/diversity-gini.yml new file mode 100644 index 0000000..25b746e --- /dev/null +++ b/metric_descriptions/diversity-gini.yml @@ -0,0 +1,46 @@ +name: Diversity Gini Index + +summary: > + Measures Recommendations' diversity. The index is 0 when all items are chosen equally often, and 1 when a single item is always chosen. + +description: > + The diversity (\(G\)) of the recommendations according to Gini Index. 
The index is 0 when all items are chosen equally often, + and 1 when a single item is always chosen + (see book \(\href{https://link.springer.com/10.1007/978-1-4939-7131-2_110158}{https://link.springer.com/10.1007/978-1-4939-7131-2_110158}\)). Generally, the Gini Index mathematical expression is defined as: + $$G=\frac{1}{n-1}\sum_{j=1}^{n}(2j-n-1)p(i_j)$$where \(i_1,\ldots,i_n\) is the list of items ordered according to increasing \(p(i)\) and each item \(i\) accounts for a proportion \(p(i)\) of user recommendations. In RS Metrics the computation is determined by the following formula: + $$Diversity=\frac{1}{n-1}\sum_{j=1}^{n}(2j-n-1)\left(\frac{count(j)}{recommendations}\right)$$ + +output: + type: float + min: 0 + max: 1 + comment: The index is 0 when all items are chosen equally often, and 1 when a single item is always chosen. + +prerequisites: + - recommendations without anonymous users + - all available services + +process: + - step: Clean up + details: > + Recommendations clean up; entries removal where users or services are not found in "users" or "services" files accordingly + - step: Services Impact + details: > + Calculation of the impact of the services, by counting how many times each service i was suggested to all possible users: count(j) + - step: Sort Services Impact from low to high + details: > + Sort the number of how many times each service (i.e. i) was suggested from the lower to the higher value, in order to apply the respective weight (j). 
The computation includes services with 0 recommendation occurrence + - step: Recommended Probability of the Services + details: > + For each service calculate its recommended probability by dividing the number of service's occurrence found in the recommendations to the total number of recommendations + - step: Service-based product computation + details: > + Calculation of the product of the recommended probability from previous step and services' respective index j, for each service individually + - step: Gini Index computation + details: > + Computation of the overall value by summing all values from previous step + +# This is optional for visual stylization of the metric when displayed on the report +style: + icon: pe-7s-shuffle + color: bg-plum-plate diff --git a/metric_descriptions/diversity.yml b/metric_descriptions/diversity.yml new file mode 100644 index 0000000..3ef86b4 --- /dev/null +++ b/metric_descriptions/diversity.yml @@ -0,0 +1,44 @@ +name: Diversity Shannon Entropy + +summary: > + Measures Recommendations' diversity. The entropy is 0 when a single item is always chosen or recommended, + and log n when n items are chosen or recommended equally often. + +description: > + The diversity (\(H\)) of the recommendations according to Shannon Entropy. The entropy is 0 when a single item + is always chosen or recommended, and log(n) when n items are chosen or recommended equally often + (see book \(\href{https://link.springer.com/10.1007/978-1-4939-7131-2_110158}{https://link.springer.com/10.1007/978-1-4939-7131-2_110158}\)). 
Generally, the Shannon Entropy mathematical expression is defined as: + $$H=-\sum_{i=1}^{n}p(i)\log_2 p(i) $$In RS Metrics the computation is determined by the following formula: + $$Diversity=-\sum_{i=1}^{services}\left(\frac{count(i)}{recommendations}\right)\log_2 \left(\frac{count(i)}{recommendations}\right)$$ + +output: + type: float + min: 0 + max: +\(\infty\) + comment: The entropy is 0 when a single item is always chosen or recommended, and log n when n items are chosen or recommended equally often. + +prerequisites: + - recommendations without anonymous users + - all available services + +process: + - step: Clean up + details: > + Recommendations clean up; entries removal where users or services are not found in "users" or "services" files accordingly + - step: Services Impact + details: > + Calculation of the impact of the services, by counting how many times each service i was suggested to all possible users: count(i) + - step: Recommended Probability of the Services + details: > + For each service calculate its recommended probability by dividing the number of service's occurrences found in the recommendations to the total number of recommendations + - step: Service-based product computation + details: > + Calculation of the product of the recommended probability from previous step and the logarithmic value of it, for each service individually + - step: Shannon Entropy computation + details: > + Computation of the overall value by summing all values from previous step + +# This is optional for visual stylization of the metric when displayed on the report +style: + icon: pe-7s-way + color: bg-sunny-morning diff --git a/metric_descriptions/hit-rate.yml b/metric_descriptions/hit-rate.yml new file mode 100644 index 0000000..64dd955 --- /dev/null +++ b/metric_descriptions/hit-rate.yml @@ -0,0 +1,33 @@ +name: Hit Rate + +summary: > + The ratio of user hits divided by the total number of users + +description: > + The ratio of user hits divided by the total 
number of users (user hit: a user that has accessed at least one service that is also a personal recommendation). The metric is expressed by the formula: $$Hit Rate=\frac{hits}{users}$$ + +output: + type: float + min: 0 + max: +inf + comment: A value of 0 indicates that no user hits occurred + +prerequisites: + - all available recommendations by registered users + - all available users + +process: + - step: Retrieve user-service association + details: > + For each user get the recommended services and the services the user accessed + - step: Calculate hits + details: > + Check if the user has at least one accessed service in recommendations. If yes increase number of hits by one + - step: Calculate ratio + details: > + Divide user hits by the total number of users + +# This is optional for visual stylization of the metric when displayed on the report +style: + icon: pe-7s-look + color: bg-malibu-beach diff --git a/metric_descriptions/novelty.yml b/metric_descriptions/novelty.yml new file mode 100644 index 0000000..cc58f21 --- /dev/null +++ b/metric_descriptions/novelty.yml @@ -0,0 +1,49 @@ +name: Novelty + +summary: > + The novelty metric expresses the ability of the system to recommend items not generally seen before + by the population of users. + +description: > + Calculating novelty of the recommender system based on the user actions using the following formula: + + $$Novelty=\frac{\sum\nolimits_{i \in R}-\log(p(i))}{|R|}$$ + + For each service item \(i\) belonging to the set of recommended services \(R\) calculate the portion \(p(i)\) of the times + the service has been viewed to the total views of the services produced by the user actions data. + + + +output: + type: float + min: 0 + max: +\(\infty\) + comment: Novelty expresses the ability of the system to recommend items that are novel (not seen before) by the population of users. 
A larger number expresses that more services are being recommended that the users have not seen before + +prerequisites: + - all available recommendations associated with registered users + - a subset of the available user actions associated with registered users that expresses transitions to service pages + +process: + - step: Clean up + details: > + Recommendations and user actions clean up; entries removal where users or services are not found in "users" or "services" files accordingly + - step: User actions that target services + details: > + Identify and keep user actions that express transition to target pages that are views of services. + Additionally, user actions where the source and the target page belong to the same service's space are removed from the process. + - step: Calculate views for each service + details: > + Group and count user actions that express views for each recommended service id + - step: Calculate view probability p(i) of each service + details: > + Calculation of the view probability of each service which is the fraction of the service's views to the total service views + - step: Overall Novelty computation + details: > + Computation of the overall value by summing the negative log of all recommended service views from previous step and dividing them by the total + number of recommended services + +# This is optional for visual stylization of the metric when displayed on the report +style: + icon: pe-7s-magic-wand + color: bg-ripe-malin diff --git a/metric_descriptions/user-coverage.yml b/metric_descriptions/user-coverage.yml new file mode 100644 index 0000000..be1eb13 --- /dev/null +++ b/metric_descriptions/user-coverage.yml @@ -0,0 +1,36 @@ +name: User Coverage + +summary: > + The percentage (%) of the division of the unique users found in recommendations to the total number of users + +description: > + The User Coverage is described by the formula $$\frac{unique\_rec\_users}{users}$$ + +output: + type: float + min: 0 + max: 100 + 
comment: User Coverage is 0 when recommendations are being suggested to none users, and 100 when recommendations are being suggested to all of the users + +prerequisites: + - all available recommendations + - all available users + +process: + - step: Retrieve recommendations + details: > + Retrieve all available recommendations found in source + - step: Gather all unique users + details: > + Gather all unique users found in all available recommendations + - step: Retrieve users + details: > + Retrieve all available users found in source + - step: Calculate ratio + details: > + Calculate the percentage (%) of the division of the unique users found in recommendations to the total number of users + +# This is optional for visual stylization of the metric when displayed on the report +style: + icon: pe-7s-user + color: bg-grow-early \ No newline at end of file diff --git a/metrics.py b/metrics.py index f901135..a4141f1 100644 --- a/metrics.py +++ b/metrics.py @@ -1,52 +1,93 @@ #!/usr/bin/env python3 import pandas as pd import numpy as np - +import math class Runtime: def __init__(self): self.users=None self.services=None self.user_actions=None + self.user_actions_all=None self.recommendations=None -# decorator to add the text attribute to function -def doc(r): + + +# decorator to add the text attribute to function as major metric +def metric(txt): def wrapper(f): - f.text = r + f.kind = "metric" + f.doc = txt return f return wrapper +# decorator to add the text attribute to function +def statistic(txt): + def wrapper(f): + f.kind = "statistic" + f.doc = txt + return f + return wrapper # Metrics -@doc('The total number of unique users found in users.csv (if provided), otherwise in user_actions.csv') +@statistic('The type of the resource') +def type(object): + """ + The type of the resource, e.g. service + """ + # currently + return "service" + +@statistic('The provider of the resource') +def provider(object): + """ + The provider of the resource, e.g. 
cyfronet + """ + # currently + return "cyfronet" + +@statistic('The initial date where metrics are calculated on') +def start(object): + """ + Calculate the start date where metrics are calculated on + found in min value between Pandas DataFrame object user_action + and recommendation + """ + return str(min(min(object.user_actions['Timestamp']),min(object.recommendations['Timestamp']))) + + +@statistic('The final date where metrics are calculated on') +def end(object): + """ + Calculate the end date where metrics are calculated on + found in max value between Pandas DataFrame object user_action + and recommendation + """ + return str(max(max(object.user_actions['Timestamp']),max(object.recommendations['Timestamp']))) + + +@statistic('The total number of unique registered users in the system') def users(object): """ Calculate the total number of unique users found in Pandas DataFrame object users (if provided) or user_actions otherwise """ - if isinstance(object.users, pd.DataFrame): - return int(object.users.nunique()['User']) - else: - return int(object.user_actions.nunique()['User']) + return int(object.users['User'].nunique()) -@doc('The total number of unique services found in services.csv (if provided), otherwise in user_actions.csv') +@statistic('The total number of unique published services in the system') def services(object): """ Calculate the total number of unique services found in Pandas DataFrame object services (if provided) - or user_actions otherwise + or user_actions otherwise (from both Source and Target Service) """ - if isinstance(object.services, pd.DataFrame): - return int(object.services.nunique()['Service']) - else: - return int(object.user_actions.nunique()['Service']) + return int(object.services['Service'].nunique()) -@doc('The total number of recommendations found in recommendations.csv') +@statistic('The total number of recommendations generated by the system') def recommendations(object): """ Calculate the total number of 
recommendations @@ -55,7 +96,46 @@ def recommendations(object): return len(object.recommendations.index) -@doc('The total number of user actions found in user_actions.csv') +@statistic('The total number of recommendations for registered users found in recommendations.csv') +def recommendations_registered(object): + """ + Calculate the total number of recommendations for registered users + found in Pandas DataFrame object recommendations + """ + return len(object.recommendations[object.recommendations['User'] != -1].index) + + +@statistic('The total number of recommendations for anonymous users found in recommendations.csv') +def recommendations_anonymous(object): + """ + Calculate the total number of recommendations for anonymous users + found in Pandas DataFrame object recommendations + """ + return recommendations(object)-recommendations_registered(object) + + + +@statistic('The percentage (%) of recommendations for registered users to the total recommendations') +def recommendations_registered_perc(object): + """ + Calculate the percentage (%) of recommendations occurred + by registered users to the total recommendations + found in Pandas DataFrame object recommendations (in two decimals) + """ + return round(recommendations_registered(object)*100.0/recommendations(object),2) + + +@statistic('The percentage (%) of recommendations for anonymous users to the total recommendations') +def recommendations_anonymous_perc(object): + """ + Calculate the percentage (%) of recommendations occurred + by anonymous users to the total recommendations + found in Pandas DataFrame object recommendations (in two decimals) + """ + return round(100.0-recommendations_registered_perc(object),2) + + +@statistic('The total number of user actions found in user_actions.csv') def user_actions(object): """ Calculate the total number of user_actions @@ -64,7 +144,7 @@ def user_actions(object): return len(object.user_actions.index) -@doc('The total number of user actions occurred by 
registered users found in user_actions.csv') +@statistic('The total number of user actions occurred by registered users found in user_actions.csv') def user_actions_registered(object): """ Calculate the total number of user_actions occurred by registered users @@ -73,7 +153,7 @@ def user_actions_registered(object): return len(object.user_actions[object.user_actions['User'] != -1].index) -@doc('The total number of user actions occurred by anonymous users found in user_actions.csv') +@statistic('The total number of user actions occurred by anonymous users found in user_actions.csv') def user_actions_anonymous(object): """ Calculate the total number of user_actions occurred by anonymous users @@ -82,7 +162,7 @@ def user_actions_anonymous(object): return user_actions(object)-user_actions_registered(object) -@doc('The percentage (%) of user actions occurred by registered users to the total user actions') +@statistic('The percentage (%) of user actions occurred by registered users to the total user actions') def user_actions_registered_perc(object): """ Calculate the percentage (%) of user actions occurred @@ -92,7 +172,7 @@ def user_actions_registered_perc(object): return round(user_actions_registered(object)*100.0/user_actions(object),2) -@doc('The percentage (%) of user actions occurred by anonymous users to the total user actions') +@statistic('The percentage (%) of user actions occurred by anonymous users to the total user actions') def user_actions_anonymous_perc(object): """ Calculate the percentage (%) of user actions occurred @@ -102,7 +182,7 @@ def user_actions_anonymous_perc(object): return round(100.0-user_actions_registered_perc(object),2) -@doc('The total number of user actions led to order found in user_actions.csv') +@statistic('The total number of user actions led to order found in user_actions.csv') def user_actions_order(object): """ Calculate the total number of user_actions led to order @@ -111,7 +191,7 @@ def user_actions_order(object): return 
len(object.user_actions[object.user_actions['Reward'] == 1.0].index) -@doc('The total number of user actions led to order by registered users found in user_actions.csv') +@statistic('The total number of user actions led to order by registered users found in user_actions.csv') def user_actions_order_registered(object): """ Calculate the total number of user_actions led to order by registered users @@ -120,7 +200,7 @@ def user_actions_order_registered(object): return len(object.user_actions[(object.user_actions['Reward'] == 1.0) & (object.user_actions['User'] != -1)].index) -@doc('The total number of user actions led to order by anonymous users found in user_actions.csv') +@statistic('The total number of user actions led to order by anonymous users found in user_actions.csv') def user_actions_order_anonymous(object): """ Calculate the total number of user_actions led to order by anonymous users @@ -129,7 +209,7 @@ def user_actions_order_anonymous(object): return user_actions_order(object)-user_actions_order_registered(object) -@doc('The percentage (%) of user actions occurred by registered users and led to order to the total user actions that led to order') +@statistic('The percentage (%) of user actions occurred by registered users and led to order to the total user actions that led to order') def user_actions_order_registered_perc(object): """ Calculate the percentage (%) of user actions occurred @@ -139,7 +219,7 @@ def user_actions_order_registered_perc(object): return round(user_actions_order_registered(object)*100.0/user_actions_order(object),2) -@doc('The percentage (%) of user actions occurred by anonymous users and led to order to the total user actions that led to order') +@statistic('The percentage (%) of user actions occurred by anonymous users and led to order to the total user actions that led to order') def user_actions_order_anonymous_perc(object): """ Calculate the percentage (%) of user actions occurred @@ -149,7 +229,7 @@ def 
user_actions_order_anonymous_perc(object): return round(100.0-user_actions_order_registered_perc(object),2) -@doc('The total number of user actions assosicated with the recommendation panel found in user_actions.csv') +@statistic('The total number of user actions assosicated with the recommendation panel found in user_actions.csv') def user_actions_panel(object): """ Calculate the total number of user_actions assosicated with the recommendation panel @@ -158,7 +238,7 @@ def user_actions_panel(object): return len(object.user_actions[object.user_actions['Action'] == 'recommendation_panel'].index) -@doc('The percentage (%) of user actions assosicated with the recommendation panel to the total user actions') +@statistic('The percentage (%) of user actions assosicated with the recommendation panel to the total user actions') def user_actions_panel_perc(object): """ Calculate the percentage (%) of user actions assosicated with @@ -168,8 +248,8 @@ def user_actions_panel_perc(object): return round(user_actions_panel(object)*100.0/user_actions(object),2) -@doc('The total number of unique services found in recommendations.csv') -def catalog_coverage(object): +@statistic('The total number of unique services found in recommendations.csv') +def total_unique_services_recommended(object): """ Calculate the total number of unique services found in recommendations.csv @@ -177,18 +257,18 @@ def catalog_coverage(object): return int(object.recommendations.nunique()['Service']) -@doc('The percentage (%) of unique services found in recommedations.csv to the total number of services (provided or found otherwise in user_actions.csv)') -def catalog_coverage_perc(object): +@metric('The percentage (%) of unique services found in recommedations.csv to the total number of services (provided or found otherwise in user_actions.csv)') +def catalog_coverage(object): """ Calculate the percentage (%) of unique services found in recommedations.csv to the total number of services (provided or found 
otherwise in user_actions.csv) """ - return round(catalog_coverage(object)*100.0/services(object),2) + return round(total_unique_services_recommended(object)*100.0/services(object),2) -@doc('The total number of unique users found in recommendations.csv') -def user_coverage(object): +@statistic('The total number of unique users found in recommendations.csv') +def total_unique_users_recommended(object): """ Calculate the total number of unique users found in recommendations.csv @@ -196,13 +276,357 @@ def user_coverage(object): return int(object.recommendations.nunique()['User']) -@doc('The percentage (%) of unique users found in recommedations.csv to the total number of users (provided or found otherwise in user_actions.csv)') -def user_coverage_perc(object): +@metric('The percentage (%) of unique users found in recommedations.csv to the total number of users (provided or found otherwise in user_actions.csv)') +def user_coverage(object): """ Calculate the percentage (%) of unique users found in recommedations.csv to the total number of users (provided or found otherwise in user_actions.csv) """ - return round(user_coverage(object)*100.0/users(object),2) - - + return round(total_unique_users_recommended(object)*100.0/users(object),2) + + +@metric('The ratio of user hits divided by the total number of users (user hit: a user that has accessed at least one service that is also a personal recommendation)') +def hit_rate(object): + """ + For each user get the recommended services and the services the user accessed + Check if the user has at least one accessed service in recommendations. 
If yes increase number of hits by one + Divide by the total number of users + """ + users = object.users.values.tolist() + recs = object.recommendations.values.tolist() + # Fill lookup dictionary with all services recommender per user id + user_recs = dict() + for item in recs: + # skip anonymous users + if item == -1: + continue + user_id = item[0] + service_id = item[1] + if user_id in user_recs.keys(): + user_recs[user_id].append(service_id) + else: + user_recs[user_id] = [service_id] + + hits = 0 + # For each user in users check if his accessed services are in his recommendations + + for user in users: + user_id = user[0] + # create a set of unique accessed services by user + services = set(user[1]) + if user_id in user_recs.keys(): + # create a set of unique recommended services to the user + recommendations = set(user_recs.get(user_id)) + # intersection should include services that have been both accessed by and recommended to the user + intersection = services.intersection(recommendations) + # If the user has at least one service (both recommended and accessed), this user is considered a hit + if len(intersection) > 0: + hits = hits + 1 + + + + return round(hits/len(users),5) + + +@metric('The number of user clicks through recommendations panels divided by the total times recommendation panels were presented to users. 
Takes into account all historical data of user actions') +def click_through_rate(object): + """ + Get only the user actions that present a recommendation panel to the user in the source page + Those are actions with the following source paths: + - /services + - /services/ + - /services/c/{any category name} + Count the items in above list as they represent the times recommendations panels were presented to the users of the portal + Narrow the above list into a new subset by selecting only user actions that originate from a recommendation panel + Those are actions that have the 'recommendation' string in the Action column + Count the items in the subset as they represent the times users clicked through recommendations + Divide the items of the subset with the items of the first list to get the click-through rate + """ + + # get user actions + user_actions_all = object.user_actions_all.values.tolist() + + # filter only user actions with the needed source paths (/services, /services/, /services/c/...). + # source paths are on the [6] index of each list item + user_actions_recpanel_views = list(filter(lambda x: x[6] in ['/services', '/services/'] or x[6].startswith('/services/c/'),user_actions_all)) + + # further filter with those actions that they have 'recommender' + user_actions_recpanel_clicks = list(filter(lambda x: x[4]=='recommendation_panel',user_actions_recpanel_views)) + + return round(len(user_actions_recpanel_clicks)/len(user_actions_recpanel_views),2) + + +@metric('The diversity of the recommendations according to Shannon Entropy. The entropy is 0 when a single item is always chosen or recommended, and log n when n items are chosen or recommended equally often.') +def diversity(object, anonymous=False): + """ + Calculate Shannon Entropy. The entropy is 0 when a single item is always chosen or recommended, and log n when n items are chosen or recommended equally often. 
+ """ + # keep recommendations with or without anonymous suggestions + # based on anonymous flag (default=False, i.e. ignore anonymous) + if anonymous: + recs=object.recommendations + else: + recs=object.recommendations[(object.recommendations['User'] != -1)] + + # this variable keeps the sum of user_norm (where user_norm is + # the count of how many times a User has been suggested) + # however since no cutoff at per user recommendations is applied and + # also since each recommendation entry is one-to-one
+
+ Generated on:
+