diff --git a/.gitignore b/.gitignore index b7b65a7..f1e6fdd 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,11 @@ +# The following are handy to ignore in this specific project +# please ignore generated folders with results such as /data and /report +/data +/report + +# please ignore changes in the configuration file. If default configuration file structure is changed please override this rule with git add -f +/config.yaml + # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] diff --git a/README.md b/README.md index b765902..dac0385 100644 --- a/README.md +++ b/README.md @@ -3,15 +3,15 @@ A framework for counting the recommender metrics # Preprocessor v.0.2

- - + +

# RS metrics v.0.2

- - + +

@@ -61,14 +61,38 @@ optional arguments: ``` 8. Configure `./preprocessor.py` by editting the `config.yaml` or providing another with `-c`: -

- - - -

+```yaml + +# Set the desired connector (e.g. MongoDB) +Source: + MongoDB: + host: localhost + port: 27017 + db: recommender_dev + +User: + export: true + +Service: + # if true it keeps only published, otherwise all + # this has an effect in exporting when from is set to 'source' + # and also in metrics calculations where service is considered + published: true + # Use the EOSC-Marketplace webpage + # to associate page_id and service_id + download: true + path: ./page_map -9. Run from terminal: `./rsmetrics.py` to run RSmetrics + export: true + from: 'page_map' # or 'source' + +# Calculate source's metrics +Metrics: true + +``` + +9. Run from terminal: `./rsmetrics.py --users --services` to run RSmetrics and include the `users.csv` and `services.csv` files generated by the Preprocessor ```bash _____ _____ _ _ | __ \ / ____| | | (_) @@ -151,4 +175,31 @@ chmod u+x ./get_service_catalog.py ./get_service_catalog.py ``` +#### Serve Evaluation Reports as a Service + +The `webservice` folder hosts a simple webservice implemented in Flask framework which can be used to host the report results. + +__Note__: Please make sure you work in a virtual environment and you have already downloaded the required dependencies by issuing +`pip install -r requirements.txt` + +The webservice application serves two endpoints + - `/` : This is the frontend webpage that displays the Report Results in a UI + - `/api` : This api call returns the evaluation metrics in json format + +To run the webservice issue: +``` +cd ./webservice +flask run +``` + +The webservice by default runs in localhost:5000 you can override this by issuing for example: +``` +flask run -h 127.0.0.1 -p 8080 +``` + +There is an env variable `RS_EVAL_METRIC_SOURCE` which directs the webservice to the generated `metrics.json` file produced after the evaluation process. 
+This by default honors this repo's folder structure and directs to the root `/data/metrics.json` path + +You can override this by editing the `.env` file inside the `/webservice` folder, or specify the `RS_EVAL_METRIC_SOURCE` variable accordingly before executing the `flask run` command + _Tested with python 3.9_ diff --git a/config.yaml b/config.yaml index 6513966..26cf0ab 100644 --- a/config.yaml +++ b/config.yaml @@ -5,30 +5,34 @@ Source: port: 27017 db: recommender_dev -User: - export: true - #from: 'user_actions' - #from: 'recommendations' - from: 'source' +# The database where the Preprocessor's +# and RSmetrics data are stored +Datastore: + MongoDB: + host: localhost + port: 27017 + db: rsmetrics Service: # Use the EOSC-Marketplace webpage - # to associate page_id and service_id - download: true - path: ./page_map - - export: true - #from: 'user_actions' - #from: 'recommendations' - from: 'source' - #from: 'page_map' - - published: false # applies only on source option - -User-actions: - merge: false # not implemented yet - -# Calculate source's metrics + # to retrieve resources and + # associate the page_id and the service_id + Portal: + download: true + path: ./page_map + + # if true it keeps only published, otherwise all + # this has an effect in exporting when from is set to 'source' + # and also in metrics calculations where service is considered + published: true + + # which origin to use to retrieve Resources + # two options available: + # - 'source': use the Connector + # - 'page_map': use the EOSC Marketplace + from: 'page_map' # or 'source' + +# Calculate source's metrics (pre-metrics) Metrics: true diff --git a/environment.yml b/environment.yml index db48d7e..290b03b 100644 --- a/environment.yml +++ b/environment.yml @@ -25,16 +25,24 @@ dependencies: - zlib=1.2.11=h7f8727e_4 - pip: - beautifulsoup4==4.10.0 + - certifi==2021.10.8 - charset-normalizer==2.0.12 + - click==8.1.3 + - Flask==2.1.2 - idna==3.3 - - joblib==1.1.0 + - 
importlib-metadata==4.11.4 + - itsdangerous==2.1.2 + - Jinja2==3.1.2 + - joblib==1.2.0 + - MarkupSafe==2.1.1 - natsort==8.1.0 - numpy==1.22.3 - pandas==1.4.2 - pymongo==4.1.0 - python-dateutil==2.8.2 + - python-dotenv==0.20.0 - pytz==2022.1 - - pyyaml==6.0 + - PyYAML==6.0 - requests==2.27.1 - scikit-surprise==1.1.1 - scipy==1.8.0 @@ -42,3 +50,6 @@ dependencies: - soupsieve==2.3.2 - surprise==0.1 - urllib3==1.26.9 + - Werkzeug==2.1.2 + - zipp==3.8.0 + - flask-pymongo==2.3.0 diff --git a/get_service_catalog.py b/get_service_catalog.py index d3423ac..7cc80a5 100755 --- a/get_service_catalog.py +++ b/get_service_catalog.py @@ -50,7 +50,7 @@ def get_service_catalog_items(content): for item in results: a = item.findChildren("a", recursive=False)[0] row = [int(item.attrs["data-service-id"]), - item.text.strip(), a['href']] + a.text.strip(), a['href']] rows.append(row) # sort rows by id rows = sorted(rows, key=lambda x: x[0]) diff --git a/metric_descriptions/README.md b/metric_descriptions/README.md new file mode 100644 index 0000000..f5fdeee --- /dev/null +++ b/metric_descriptions/README.md @@ -0,0 +1,12 @@ +# Metric Descriptions folder + +This folder is meant to contained detailed yaml files defining in structure the implementation details of each metric +To add a new detailed description in this folder please consult the first file added here: diversity.yml and structure +the information accordingly + +### Important Note on filenames +The filename should correspond to the name of the metric used in `metrics.json` output and the extension `.yml` +So for the metric Shannon Diversity the short name used in `metrics.json` is `diversity` thus the filename is `diversity.yml` + +### Multiline values +In yaml fields that you need to support multiline string content please use the `>` operator diff --git a/metric_descriptions/catalog-coverage.yml b/metric_descriptions/catalog-coverage.yml new file mode 100644 index 0000000..d993588 --- /dev/null +++ 
b/metric_descriptions/catalog-coverage.yml @@ -0,0 +1,36 @@ +name: Catalog Coverage + +summary: > + The percentage (%) of the division of the unique services found in recommendations to the total number of published services + +description: > + The Catalog Coverage is described by the formula $$\frac{unique\_rec\_services}{services}$$ + +output: + type: float + min: 0 + max: 100 + comment: Catalog Coverage is 0 when none of the services is being recommended, and 100 when all of them are being recommended. + +prerequisites: + - all available recommendations + - all available services + +process: + - step: Retrieve recommendations + details: > + Retrieve all available recommendations found in source + - step: Gather all unique services + details: > + Gather all unique services found in all available recommendations + - step: Retrieve services + details: > + Retrieve all available published services found in source + - step: Calculate ratio + details: > + Calculate the percentage (%) of the division of the unique services found in recommendations to the total number of published services + +# This is optional for visual stylization of the metric when displayed on the report +style: + icon: pe-7s-box2 + color: bg-malibu-beach \ No newline at end of file diff --git a/metric_descriptions/click-through-rate.yml b/metric_descriptions/click-through-rate.yml new file mode 100644 index 0000000..edf28f4 --- /dev/null +++ b/metric_descriptions/click-through-rate.yml @@ -0,0 +1,37 @@ +name: Click-Through Rate + +summary: > + The number of user clicks through recommendations panels divided by the total times recommendation panels were presented to users. + +description: > + The number of user clicks through recommendations panels divided by the total times recommendation panels were presented to users. Takes into account all historical data of user actions. 
The metric is expressed by the formula: $$Click-Through Rate=\frac{clicks}{views}$$ +output: + type: float + min: 0 + max: +inf + comment: A value of 0 indicates that no clicks through recommendations panels occurred + +prerequisites: + - all available user actions + +process: + - step: Retrieve user actions with recommendation panel + details: > + Get only the user actions that present a recommendation panel to the user in the source page. Those are actions with the following source paths: (i) /services, (ii) /services/, (iii) /services/c/{any category name} + - step: Count user actions with recommendation panel + details: > + Count the items in the above list as they represent the times recommendations panels were presented to the users of the portal + - step: Filter list + details: > + Narrow the above list into a new subset by selecting only user actions that originate from a recommendation panel. Those are actions that have the 'recommendation' string in the Action column + - step: Count user actions with clicks through recommendation panel + details: > + Count the items in the subset as they represent the times users clicked through recommendations + - step: Calculate ratio + details: > + Divide the items of the subset with the items of the first list to get the click-through rate + +# This is optional for visual stylization of the metric when displayed on the report +style: + icon: pe-7s-mouse + color: bg-grow-early diff --git a/metric_descriptions/diversity-gini.yml b/metric_descriptions/diversity-gini.yml new file mode 100644 index 0000000..25b746e --- /dev/null +++ b/metric_descriptions/diversity-gini.yml @@ -0,0 +1,46 @@ +name: Diversity Gini Index + +summary: > + Measures Recommendations' diversity. The index is 0 when all items are chosen equally often, and 1 when a single item is always chosen. + +description: > + The diversity (\(G\)) of the recommendations according to Gini Index. 
The index is 0 when all items are chosen equally often, + and 1 when a single item is always chosen + (see book \(\href{https://link.springer.com/10.1007/978-1-4939-7131-2_110158}{https://link.springer.com/10.1007/978-1-4939-7131-2_110158}\)). Generally, the Gini Index mathematical expression is defined as: + $$G=\frac{1}{n-1}\sum_{j=1}^{n}(2j-n-1)p(i_j)$$where \(i_1,\ldots,i_n\) is the list of items ordered according to increasing \(p(i)\) and each item \(i\) accounts for a proportion \(p(i)\) of user recommendations. In RS Metrics the computation is determined by the following formula: + $$Diversity=\frac{1}{n-1}\sum_{j=1}^{n}(2j-n-1)\left(\frac{count(j)}{recommendations}\right)$$ + +output: + type: float + min: 0 + max: 1 + comment: The index is 0 when all items are chosen equally often, and 1 when a single item is always chosen. + +prerequisites: + - recommendations without anonymous users + - all available services + +process: + - step: Clean up + details: > + Recommendations clean up; entries removal where users or services are not found in "users" or "services" files accordingly + - step: Services Impact + details: > + Calculation of the impact of the services, by counting how many times each service i was suggested to all possible users: count(j) + - step: Sort Services Impact from low to high + details: > + Sort the number of how many times each service (i.e. i) was suggested from the lower to the higher value, in order to apply the respective weight (j). 
The computation includes services with 0 recommendation occurrence + - step: Recommended Probability of the Services + details: > + For each service calculate its recommended probability by dividing the number of service's occurrence found in the recommendations to the total number of recommendations + - step: Service-based product computation + details: > + Calculation of the product of the recommended probability from previous step and services' respective index j, for each service individually + - step: Gini Index computation + details: > + Computation of the overall value by summing all values from previous step + +# This is optional for visual stylization of the metric when displayed on the report +style: + icon: pe-7s-shuffle + color: bg-plum-plate diff --git a/metric_descriptions/diversity.yml b/metric_descriptions/diversity.yml new file mode 100644 index 0000000..3ef86b4 --- /dev/null +++ b/metric_descriptions/diversity.yml @@ -0,0 +1,44 @@ +name: Diversity Shannon Entropy + +summary: > + Measures Recommendations' diversity. The entropy is 0 when a single item is always chosen or recommended, + and log n when n items are chosen or recommended equally often. + +description: > + The diversity (\(H\)) of the recommendations according to Shannon Entropy. The entropy is 0 when a single item + is always chosen or recommended, and log(n) when n items are chosen or recommended equally often + (see book \(\href{https://link.springer.com/10.1007/978-1-4939-7131-2_110158}{https://link.springer.com/10.1007/978-1-4939-7131-2_110158}\)). 
Generally, the Shannon Entropy mathematical expression is defined as: + $$H=-\sum_{i=1}^{n}p(i)\log_2 p(i) $$In RS Metrics the computation is determined by the following formula: + $$Diversity=-\sum_{i=1}^{services}\left(\frac{count(i)}{recommendations}\right)\log_2 \left(\frac{count(i)}{recommendations}\right)$$ + +output: + type: float + min: 0 + max: +\(\infty\) + comment: The entropy is 0 when a single item is always chosen or recommended, and log n when n items are chosen or recommended equally often. + +prerequisites: + - recommendations without anonymous users + - all available services + +process: + - step: Clean up + details: > + Recommendations clean up; entries removal where users or services are not found in "users" or "services" files accordingly + - step: Services Impact + details: > + Calculation of the impact of the services, by counting how many times each service i was suggested to all possible users: count(i) + - step: Recommended Probability of the Services + details: > + For each service calculate its recommended probability by dividing the number of service's occurrences found in the recommendations to the total number of recommendations + - step: Service-based product computation + details: > + Calculation of the product of the recommended probability from previous step and the logarithmic value of it, for each service individually + - step: Shannon Entropy computation + details: > + Computation of the overall value by summing all values from previous step + +# This is optional for visual stylization of the metric when displayed on the report +style: + icon: pe-7s-way + color: bg-sunny-morning diff --git a/metric_descriptions/hit-rate.yml b/metric_descriptions/hit-rate.yml new file mode 100644 index 0000000..64dd955 --- /dev/null +++ b/metric_descriptions/hit-rate.yml @@ -0,0 +1,33 @@ +name: Hit Rate + +summary: > + The ratio of user hits divided by the total number of users + +description: > + The ratio of user hits divided by the total 
number of users (user hit: a user that has accessed at least one service that is also a personal recommendation). The metric is expressed by the formula: $$Hit Rate=\frac{hits}{users}$$ + +output: + type: float + min: 0 + max: +inf + comment: A value of 0 indicates that no user hits occurred + +prerequisites: + - all available recommendations by registered users + - all available users + +process: + - step: Retrieve user-service association + details: > + For each user get the recommended services and the services the user accessed + - step: Calculate hits + details: > + Check if the user has at least one accessed service in recommendations. If yes increase number of hits by one + - step: Calculate ratio + details: > + Divide user hits by the total number of users + +# This is optional for visual stylization of the metric when displayed on the report +style: + icon: pe-7s-look + color: bg-malibu-beach diff --git a/metric_descriptions/novelty.yml b/metric_descriptions/novelty.yml new file mode 100644 index 0000000..cc58f21 --- /dev/null +++ b/metric_descriptions/novelty.yml @@ -0,0 +1,49 @@ +name: Novelty + +summary: > + The novelty metric expresses the ability of the system to recommend items not generally seen before + by the population of users. + +description: > + Calculating novelty of the recommender system based on the user actions using the following formula: + + $$Novelty=\frac{\sum\nolimits_{i \in R}-\log(p(i))}{|R|}$$ + + For each service item \(i\) belonging to the set of recommended services \(R\) calculate the portion \(p(i)\) of the times + the service has been viewed to the total views of the services produced by the user actions data. + + + +output: + type: float + min: 0 + max: +\(\infty\) + comment: Novelty expresses the ability of the system to recommend items that are novel (not seen before) by the population of users. 
A smaller number expresses that more services are being recommended that the users have not seen before + +prerequisites: + - all available recommendations associated with registered users + - a subset of the available user actions associated with registered users that expresses transitions to service pages + +process: + - step: Clean up + details: > + Recommendations and user actions clean up; entries removal where users or services are not found in "users" or "services" files accordingly + - step: User actions that target services + details: > + Identify and keep user actions that express transition to target pages that are views of services. + Additionally, user actions where the source and the target page belong to the same service's space are removed from the process. + - step: Calculate views for each service + details: > + Group and count user actions that express views for each recommended service id + - step: Calculate view probability p(i) of each service + details: > + Calculation of the view probability of each service which is the fraction of the service's views to the total service views + - step: Overall Novelty computation + details: > + Computation of the overall value by summing the negative log of all recommended service views from previous step and dividing them by the total + number of recommended services + +# This is optional for visual stylization of the metric when displayed on the report +style: + icon: pe-7s-magic-wand + color: bg-ripe-malin diff --git a/metric_descriptions/user-coverage.yml b/metric_descriptions/user-coverage.yml new file mode 100644 index 0000000..be1eb13 --- /dev/null +++ b/metric_descriptions/user-coverage.yml @@ -0,0 +1,36 @@ +name: User Coverage + +summary: > + The percentage (%) of the division of the unique users found in recommendations to the total number of users + +description: > + The User Coverage is described by the formula $$\frac{unique\_rec\_users}{users}$$ + +output: + type: float + min: 0 + max: 100 + 
comment: User Coverage is 0 when recommendations are being suggested to none users, and 100 when recommendations are being suggested to all of the users + +prerequisites: + - all available recommendations + - all available users + +process: + - step: Retrieve recommendations + details: > + Retrieve all available recommendations found in source + - step: Gather all unique users + details: > + Gather all unique users found in all available recommendations + - step: Retrieve users + details: > + Retrieve all available users found in source + - step: Calculate ratio + details: > + Calculate the percentage (%) of the division of the unique users found in recommendations to the total number of users + +# This is optional for visual stylization of the metric when displayed on the report +style: + icon: pe-7s-user + color: bg-grow-early \ No newline at end of file diff --git a/metrics.py b/metrics.py index f901135..a4141f1 100644 --- a/metrics.py +++ b/metrics.py @@ -1,52 +1,93 @@ #!/usr/bin/env python3 import pandas as pd import numpy as np - +import math class Runtime: def __init__(self): self.users=None self.services=None self.user_actions=None + self.user_actions_all=None self.recommendations=None -# decorator to add the text attribute to function -def doc(r): + + +# decorator to add the text attribute to function as major metric +def metric(txt): def wrapper(f): - f.text = r + f.kind = "metric" + f.doc = txt return f return wrapper +# decorator to add the text attribute to function +def statistic(txt): + def wrapper(f): + f.kind = "statistic" + f.doc = txt + return f + return wrapper # Metrics -@doc('The total number of unique users found in users.csv (if provided), otherwise in user_actions.csv') +@statistic('The type of the resource') +def type(object): + """ + The type of the resource, e.g. service + """ + # currently + return "service" + +@statistic('The provider of the resource') +def provider(object): + """ + The provider of the resource, e.g. 
cyfronet + """ + # currently + return "cyfronet" + +@statistic('The initial date where metrics are calculated on') +def start(object): + """ + Calculate the start date where metrics are calculated on + found in min value between Pandas DataFrame object user_action + and recommendation + """ + return str(min(min(object.user_actions['Timestamp']),min(object.recommendations['Timestamp']))) + + +@statistic('The final date where metrics are calculated on') +def end(object): + """ + Calculate the end date where metrics are calculated on + found in max value between Pandas DataFrame object user_action + and recommendation + """ + return str(max(max(object.user_actions['Timestamp']),max(object.recommendations['Timestamp']))) + + +@statistic('The total number of unique registered users in the system') def users(object): """ Calculate the total number of unique users found in Pandas DataFrame object users (if provided) or user_actions otherwise """ - if isinstance(object.users, pd.DataFrame): - return int(object.users.nunique()['User']) - else: - return int(object.user_actions.nunique()['User']) + return int(object.users['User'].nunique()) -@doc('The total number of unique services found in services.csv (if provided), otherwise in user_actions.csv') +@statistic('The total number of unique published services in the system') def services(object): """ Calculate the total number of unique services found in Pandas DataFrame object services (if provided) - or user_actions otherwise + or user_actions otherwise (from both Source and Target Service) """ - if isinstance(object.services, pd.DataFrame): - return int(object.services.nunique()['Service']) - else: - return int(object.user_actions.nunique()['Service']) + return int(object.services['Service'].nunique()) -@doc('The total number of recommendations found in recommendations.csv') +@statistic('The total number of recommendations generated by the system') def recommendations(object): """ Calculate the total number of 
recommendations @@ -55,7 +96,46 @@ def recommendations(object): return len(object.recommendations.index) -@doc('The total number of user actions found in user_actions.csv') +@statistic('The total number of recommendations for registered users found in recommendations.csv') +def recommendations_registered(object): + """ + Calculate the total number of recommendations for registered users + found in Pandas DataFrame object recommendations + """ + return len(object.recommendations[object.recommendations['User'] != -1].index) + + +@statistic('The total number of recommendations for anonymous users found in recommendations.csv') +def recommendations_anonymous(object): + """ + Calculate the total number of recommendations for anonymous users + found in Pandas DataFrame object recommendations + """ + return recommendations(object)-recommendations_registered(object) + + + +@statistic('The percentage (%) of recommendations for registered users to the total recommendations') +def recommendations_registered_perc(object): + """ + Calculate the percentage (%) of recommendations occurred + by registered users to the total recommendations + found in Pandas DataFrame object recommendations (in two decimals) + """ + return round(recommendations_registered(object)*100.0/recommendations(object),2) + + +@statistic('The percentage (%) of recommendations for anonymous users to the total recommendations') +def recommendations_anonymous_perc(object): + """ + Calculate the percentage (%) of recommendations occurred + by anonymous users to the total recommendations + found in Pandas DataFrame object recommendations (in two decimals) + """ + return round(100.0-recommendations_registered_perc(object),2) + + +@statistic('The total number of user actions found in user_actions.csv') def user_actions(object): """ Calculate the total number of user_actions @@ -64,7 +144,7 @@ def user_actions(object): return len(object.user_actions.index) -@doc('The total number of user actions occurred by 
registered users found in user_actions.csv') +@statistic('The total number of user actions occurred by registered users found in user_actions.csv') def user_actions_registered(object): """ Calculate the total number of user_actions occurred by registered users @@ -73,7 +153,7 @@ def user_actions_registered(object): return len(object.user_actions[object.user_actions['User'] != -1].index) -@doc('The total number of user actions occurred by anonymous users found in user_actions.csv') +@statistic('The total number of user actions occurred by anonymous users found in user_actions.csv') def user_actions_anonymous(object): """ Calculate the total number of user_actions occurred by anonymous users @@ -82,7 +162,7 @@ def user_actions_anonymous(object): return user_actions(object)-user_actions_registered(object) -@doc('The percentage (%) of user actions occurred by registered users to the total user actions') +@statistic('The percentage (%) of user actions occurred by registered users to the total user actions') def user_actions_registered_perc(object): """ Calculate the percentage (%) of user actions occurred @@ -92,7 +172,7 @@ def user_actions_registered_perc(object): return round(user_actions_registered(object)*100.0/user_actions(object),2) -@doc('The percentage (%) of user actions occurred by anonymous users to the total user actions') +@statistic('The percentage (%) of user actions occurred by anonymous users to the total user actions') def user_actions_anonymous_perc(object): """ Calculate the percentage (%) of user actions occurred @@ -102,7 +182,7 @@ def user_actions_anonymous_perc(object): return round(100.0-user_actions_registered_perc(object),2) -@doc('The total number of user actions led to order found in user_actions.csv') +@statistic('The total number of user actions led to order found in user_actions.csv') def user_actions_order(object): """ Calculate the total number of user_actions led to order @@ -111,7 +191,7 @@ def user_actions_order(object): return 
len(object.user_actions[object.user_actions['Reward'] == 1.0].index) -@doc('The total number of user actions led to order by registered users found in user_actions.csv') +@statistic('The total number of user actions led to order by registered users found in user_actions.csv') def user_actions_order_registered(object): """ Calculate the total number of user_actions led to order by registered users @@ -120,7 +200,7 @@ def user_actions_order_registered(object): return len(object.user_actions[(object.user_actions['Reward'] == 1.0) & (object.user_actions['User'] != -1)].index) -@doc('The total number of user actions led to order by anonymous users found in user_actions.csv') +@statistic('The total number of user actions led to order by anonymous users found in user_actions.csv') def user_actions_order_anonymous(object): """ Calculate the total number of user_actions led to order by anonymous users @@ -129,7 +209,7 @@ def user_actions_order_anonymous(object): return user_actions_order(object)-user_actions_order_registered(object) -@doc('The percentage (%) of user actions occurred by registered users and led to order to the total user actions that led to order') +@statistic('The percentage (%) of user actions occurred by registered users and led to order to the total user actions that led to order') def user_actions_order_registered_perc(object): """ Calculate the percentage (%) of user actions occurred @@ -139,7 +219,7 @@ def user_actions_order_registered_perc(object): return round(user_actions_order_registered(object)*100.0/user_actions_order(object),2) -@doc('The percentage (%) of user actions occurred by anonymous users and led to order to the total user actions that led to order') +@statistic('The percentage (%) of user actions occurred by anonymous users and led to order to the total user actions that led to order') def user_actions_order_anonymous_perc(object): """ Calculate the percentage (%) of user actions occurred @@ -149,7 +229,7 @@ def 
user_actions_order_anonymous_perc(object): return round(100.0-user_actions_order_registered_perc(object),2) -@doc('The total number of user actions assosicated with the recommendation panel found in user_actions.csv') +@statistic('The total number of user actions assosicated with the recommendation panel found in user_actions.csv') def user_actions_panel(object): """ Calculate the total number of user_actions assosicated with the recommendation panel @@ -158,7 +238,7 @@ def user_actions_panel(object): return len(object.user_actions[object.user_actions['Action'] == 'recommendation_panel'].index) -@doc('The percentage (%) of user actions assosicated with the recommendation panel to the total user actions') +@statistic('The percentage (%) of user actions assosicated with the recommendation panel to the total user actions') def user_actions_panel_perc(object): """ Calculate the percentage (%) of user actions assosicated with @@ -168,8 +248,8 @@ def user_actions_panel_perc(object): return round(user_actions_panel(object)*100.0/user_actions(object),2) -@doc('The total number of unique services found in recommendations.csv') -def catalog_coverage(object): +@statistic('The total number of unique services found in recommendations.csv') +def total_unique_services_recommended(object): """ Calculate the total number of unique services found in recommendations.csv @@ -177,18 +257,18 @@ def catalog_coverage(object): return int(object.recommendations.nunique()['Service']) -@doc('The percentage (%) of unique services found in recommedations.csv to the total number of services (provided or found otherwise in user_actions.csv)') -def catalog_coverage_perc(object): +@metric('The percentage (%) of unique services found in recommedations.csv to the total number of services (provided or found otherwise in user_actions.csv)') +def catalog_coverage(object): """ Calculate the percentage (%) of unique services found in recommedations.csv to the total number of services (provided or found 
otherwise in user_actions.csv) """ - return round(catalog_coverage(object)*100.0/services(object),2) + return round(total_unique_services_recommended(object)*100.0/services(object),2) -@doc('The total number of unique users found in recommendations.csv') -def user_coverage(object): +@statistic('The total number of unique users found in recommendations.csv') +def total_unique_users_recommended(object): """ Calculate the total number of unique users found in recommendations.csv @@ -196,13 +276,357 @@ def user_coverage(object): return int(object.recommendations.nunique()['User']) -@doc('The percentage (%) of unique users found in recommedations.csv to the total number of users (provided or found otherwise in user_actions.csv)') -def user_coverage_perc(object): +@metric('The percentage (%) of unique users found in recommedations.csv to the total number of users (provided or found otherwise in user_actions.csv)') +def user_coverage(object): """ Calculate the percentage (%) of unique users found in recommedations.csv to the total number of users (provided or found otherwise in user_actions.csv) """ - return round(user_coverage(object)*100.0/users(object),2) - - + return round(total_unique_users_recommended(object)*100.0/users(object),2) + + +@metric('The ratio of user hits divided by the total number of users (user hit: a user that has accessed at least one service that is also a personal recommendation)') +def hit_rate(object): + """ + For each user get the recommended services and the services the user accessed + Check if the user has at least one accessed service in recommendations. 
If yes increase number of hits by one + Divide by the total number of users + """ + users = object.users.values.tolist() + recs = object.recommendations.values.tolist() + # Fill lookup dictionary with all services recommended per user id + user_recs = dict() + for item in recs: + # skip anonymous users (each row is a list; the user id is its first element) + if item[0] == -1: + continue + user_id = item[0] + service_id = item[1] + if user_id in user_recs.keys(): + user_recs[user_id].append(service_id) + else: + user_recs[user_id] = [service_id] + + hits = 0 + # For each user in users check if his accessed services are in his recommendations + + for user in users: + user_id = user[0] + # create a set of unique accessed services by user + services = set(user[1]) + if user_id in user_recs.keys(): + # create a set of unique recommended services to the user + recommendations = set(user_recs.get(user_id)) + # intersection should include services that have been both accessed by and recommended to the user + intersection = services.intersection(recommendations) + # If the user has at least one service (both recommended and accessed), this user is considered a hit + if len(intersection) > 0: + hits = hits + 1 + + + # with no users there can be no hits: report 0.0 instead of dividing by zero + return round(hits/len(users),5) if users else 0.0 + + +@metric('The number of user clicks through recommendations panels divided by the total times recommendation panels were presented to users. 
Takes into account all historical data of user actions') +def click_through_rate(object): + """ + Get only the user actions that present a recommendation panel to the user in the source page + Those are actions with the following source paths: + - /services + - /services/ + - /services/c/{any category name} + Count the items in above list as they represent the times recommendations panels were presented to the users of the portal + Narrow the above list into a new subset by selecting only user actions that originate from a recommendation panel + Those are actions that have the 'recommendation' string in the Action column + Count the items in the subset as they represent the times users clicked through recommendations + Divide the items of the subset with the items of the first list to get the click-through rate + """ + + # get user actions + user_actions_all = object.user_actions_all.values.tolist() + + # filter only user actions with the needed source paths (/services, /services/, /services/c/...). + # source paths are on the [6] index of each list item + user_actions_recpanel_views = list(filter(lambda x: x[6] in ['/services', '/services/'] or x[6].startswith('/services/c/'),user_actions_all)) + + # further filter with those actions that they have 'recommender' + user_actions_recpanel_clicks = list(filter(lambda x: x[4]=='recommendation_panel',user_actions_recpanel_views)) + + return round(len(user_actions_recpanel_clicks)/len(user_actions_recpanel_views),2) + + +@metric('The diversity of the recommendations according to Shannon Entropy. The entropy is 0 when a single item is always chosen or recommended, and log n when n items are chosen or recommended equally often.') +def diversity(object, anonymous=False): + """ + Calculate Shannon Entropy. The entropy is 0 when a single item is always chosen or recommended, and log n when n items are chosen or recommended equally often. 
+ """ + # keep recommendations with or without anonymous suggestions + # based on anonymous flag (default=False, i.e. ignore anonymous) + if anonymous: + recs=object.recommendations + else: + recs=object.recommendations[(object.recommendations['User'] != -1)] + + # this variable keeps the sum of user_norm (where user_norm is + # the count of how many times a User has been suggested) + # however since no cutoff at per user recommendations is applied and + # also since each recommendation entry is one-to-one + # then the total number of recommendations is equal to this sum + free_norm=len(recs.index) + + # (remember that recommendations have been previously + # filtered based on the existance of users in user.csv and + # services in services.csv) + + # user_norm + # group recommendations entries by user id and + # then count how many times each user has been suggested + gr_user=recs.groupby(['User']).count() + + # create a dictionary of user_norm in order to + # map the user id to the respective user_norm + # key= and value= + d_user=gr_user['Service'].to_dict() + + # item_count + # group recommendations entries by service id and + # then count how many times each service has been suggested + gr_service=recs.groupby(['Service']).count() + + # create a dictionary of item_count in order to + # map the service id to the respective item_count + # key= and value= + d_service=gr_service['User'].to_dict() + + # each element represent the service's recommendations occurance + # e.g. 
[1,6,7] + # a service was recommended 1 time, another 6 times and another 7 times + services_recommendation_count = np.array(list(d_service.values())) + + # the total number of recommendations + n_recommendations = services_recommendation_count.sum() + + # element-wise computations (division for each service's recommendations occurance) + recommended_probability = services_recommendation_count/n_recommendations + + # H=-Sum(p*logp) [element-wise] + shannon_entropy = -np.sum(recommended_probability * np.log2(recommended_probability)) + + return round(shannon_entropy,4) + + +@metric('The novelty expresses how often new and unseen items are recommended to users') +def novelty(object): + """Calculate novelty of recommendations using the n=SUM(-log(p(i)))/|R| formula + """ + # published services + services_pub = object.services['Service'] + # recommended services to authenticated users + services_rec = object.recommendations[object.recommendations['User']!=-1]['Service'] + # services that are published and recommended + services_recpub = services_rec[services_rec.isin(services_pub)].drop_duplicates() + + # user actions + ua = object.user_actions + # user actions filtered if src and target the same. 
Also filter out if target equals -1 and filter out anonymous users + ua_serv_view = ua[(ua['Source_Service'] != ua['Target_Service']) & (ua['Target_Service'] != -1) & (ua['User']!=-1)] + + # count service views by service id (sorted by service id) + services_viewed = ua_serv_view['Target_Service'].value_counts().sort_index() + + # create a table for each recommended service with columns for number of views, p(i) and -log(pi) + r_services = pd.DataFrame(index=services_recpub).sort_index() + # add views column to assign views to each recommended service + r_services['views'] = services_viewed + + # count the total service views in order to compute the portions p(i) + total_views = r_services['views'].sum() + + # count the total recommended services |R| + total_services = len(r_services) + # compute the p(i) of each recommeneded service + r_services['pi']=r_services['views'] / total_views + # calculate the negative log of the p(i). + r_services['-logpi']=-1* np.log2(r_services['pi']) + + # calculate novelty based on formula n=SUM(-log(p(i)))/|R| + novelty = r_services['-logpi'].sum()/total_services + + return round(novelty,4) + + +@metric('The diversity of the recommendations according to GiniIndex. The index is 0 when all items are chosen equally often, and 1 when a single item is always chosen.') +def diversity_gini(object, anonymous=False): + """ + Calculate GiniIndex based on https://elliot.readthedocs.io/en/latest/_modules/elliot/evaluation/metrics/diversity/gini_index/gini_index.html#GiniIndex. (see book https://link.springer.com/10.1007/978-1-4939-7131-2_110158) + """ + # keep recommendations with or without anonymous suggestions + # based on anonymous flag (default=False, i.e. 
ignore anonymous) + if anonymous: + recs=object.recommendations + else: + recs=object.recommendations[(object.recommendations['User'] != -1)] + + # this variable keeps the sum of user_norm (where user_norm is + # the count of how many times a User has been suggested) + # however since no cutoff at per user recommendations is applied and + # also since each recommendation entry is one-to-one + # then the total number of recommendations is equal to this sum + free_norm=len(recs.index) + + # item_count + # group recommendations entries by service id and + # then count how many times each service has been suggested + gr_service=recs.groupby(['Service']).count() + + # create a dictionary of item_count in order to + # map the service id to the respective item_count + # key= and value= + d_service=gr_service['User'].to_dict() + + # total number of recommended services + n_recommended_items = len(d_service) + + # total number of services + num_items = services(object) + + # create a zero list + # to calculate gini index including elements with 0 occurance + zeros=[0]*(num_items-n_recommended_items) + + gini = sum([(2*(j + 1) -num_items -1) * (cs / free_norm) for j, cs in enumerate(zeros+sorted(d_service.values()))]) + + gini /= (num_items - 1) + + return round(gini,4) + +@metric('The Top 5 recommended services according to recommendations entries') +def top5_services_recommended(object, k=5, base='https://marketplace.eosc-portal.eu', anonymous=False): + """ + Calculate the Top 5 recommended service according to the recommendations entries. + Return a list of list with the elements: + # (i) service id + # (ii) service name + # (iii) service page appended with base (to create the URL) + # (iv) total number of recommendations of the service + # (v) percentage of the (iv) to the total number of recommendations + # expressed in %, with or without anonymous, based on the function's flag + Service's info is being retrieved from the servives.csv file + (i.e. 
each line forms: service_id, service_name, page_id) + """ + # keep recommendations with or without anonymous suggestions + # based on anonymous flag (default=False, i.e. ignore anonymous) + if anonymous: + recs=object.recommendations + else: + recs=object.recommendations[(object.recommendations['User'] != -1)] + + # item_count + # group recommendations entries by service id and + # then count how many times each service has been suggested + gr_service=recs.groupby(['Service']).count() + + # create a dictionary of item_count in order to + # map the service id to the respective item_count + # key= and value= + d_service=gr_service['User'].to_dict() + + # convert dictionary to double list (list of lists) + # where the sublist is + # and sort them from max to min + l_service=list(map(lambda x: [x,d_service[x]],d_service)) + l_service.sort(key = lambda x: x[1], reverse=True) + + # get only the first k elements + l_service=l_service[:k] + + topk_services=[] + + for service in l_service: + # get service's info from dataframe + _df_service=object.services[object.services['Service'].isin([service[0]])] + # append a list with the elements: + # (i) service id + # (ii) service name + # (iii) service page appended with base (to create the URL) + # (iv) total number of recommendations of the service + # (v) percentage of the (iv) to the total number of recommendations + # expressed in %, with or without anonymous, based on the function's flag + topk_services.append({"service_id": service[0], + "service_name": str(_df_service['Name'].item()), + "service_url": base+str(_df_service['Page'].item()), + "recommendations": { + "value":service[1], + "percentage": round(100*service[1]/len(recs.index),2), + "of_total": len(recs.index) + } + }) + + return topk_services + +@metric('The Top 5 ordered services according to user actions entries') +def top5_services_ordered(object, k=5, base='https://marketplace.eosc-portal.eu', anonymous=False): + """ + Calculate the Top 5 ordered services 
according to user actions entries. + User actions with Target Pages that lead to unknown services (=-1) are being ignored. + Return a list of list with the elements: + # (i) service id + # (ii) service name + # (iii) service page appended with base (to create the URL) + # (iv) total number of orders of the service + # (v) percentage of the (iv) to the total number of orders + # expressed in %, with or without anonymous, based on the function's flag + Service's info is being retrieved from the services.csv file + (i.e. each line forms: service_id, service_name, page_id) + """ + # keep ordering user actions (Reward == 1.0) with or without anonymous users + # based on anonymous flag (default=False, i.e. ignore anonymous, User == -1) + # user_actions with Target Pages that lead to unknown services (=-1) are being ignored + if anonymous: + uas=object.user_actions[(object.user_actions['Reward'] == 1.0) & (object.user_actions['Target_Service'] != -1)] + else: + uas=object.user_actions[(object.user_actions['Reward'] == 1.0) & (object.user_actions['Target_Service'] != -1) & (object.user_actions['User'] != -1)] + + # item_count + # group user_actions entries by service id and + # then count how many times each service has been ordered + gr_service=uas.groupby(['Target_Service']).count() + + # create a dictionary of item_count in order to + # map the service id to the respective item_count + # key= and value= + d_service=gr_service['User'].to_dict() + + # convert dictionary to double list (list of lists) + # where the sublist is + # and sort them from max to min + l_service=list(map(lambda x: [x,d_service[x]],d_service)) + l_service.sort(key = lambda x: x[1], reverse=True) + + # get only the first k elements + l_service=l_service[:k] + + topk_services=[] + + for service in l_service: + # get service's info from dataframe + _df_service=object.services[object.services['Service'].isin([service[0]])] + # append a list with the elements: + # (i) service id + # (ii) service name + # (iii) service page appended 
with base (to create the URL) + # (iv) total number of orders of the service + # (v) percentage of the (iv) to the total number of orders + # expressed in %, with or without anonymous, based on the function's flag + topk_services.append({"service_id":service[0], + "service_name": str(_df_service['Name'].item()), + "service_url": base+str(_df_service['Page'].item()), + "orders": { + "value": service[1], + "percentage": round(100*service[1]/len(uas.index),2), + "of_total": len(uas.index) + }}) + + return topk_services diff --git a/pre_metrics.py b/pre_metrics.py new file mode 100644 index 0000000..e6577ae --- /dev/null +++ b/pre_metrics.py @@ -0,0 +1,242 @@ +#!/usr/bin/env python3 +import pandas as pd +import numpy as np + + +class Runtime: + def __init__(self): + self.query={} + self.recdb=None + self.config=None + +# decorator to add the text attribute to function +def doc(r): + def wrapper(f): + f.text = r + return f + return wrapper + + +# Pre Metrics +@doc('The type of the resource') +def type(object): + """ + The type of the resource, e.g. service + """ + # currently + return "service" + +# Pre Metrics +@doc('The provider of the resource') +def provider(object): + """ + The provider of the resource, e.g. 
cyfronet + """ + # currently + return "cyfronet" + + +@doc('The initial date where metrics are calculated on') +def start(object): + """ + Calculate the start date where metrics are calculated on + found in min value between source object user_action + and recommendation + """ + ua_start=object.recdb["user_action"].find_one(object.query,sort=[("timestamp", 1)])["timestamp"] + rec_start=object.recdb["recommendation"].find_one(object.query,sort=[("timestamp", 1)])["timestamp"] + + return str(min(ua_start, rec_start)) + + +@doc('The final date where metrics are calculated on') +def end(object): + """ + Calculate the end date where metrics are calculated on + found in max value between source object user_action + and recommendation + """ + ua_end=object.recdb["user_action"].find_one(object.query,sort=[("timestamp", -1)])["timestamp"] + rec_end=object.recdb["recommendation"].find_one(object.query,sort=[("timestamp", -1)])["timestamp"] + return str(max(ua_end, rec_end)) + + +@doc('The total number of unique users found in users of the source') +def users(object): + """ + Calculate the total number of unique users + found in source object + """ + return object.recdb["user"].count_documents({}) + + +@doc('The total number of unique services found in services of the source') +def services(object): + """ + Calculate the total number of unique services + found in source object (default to published only) + """ + if object.config['Service']['published']: + return object.recdb["service"].count_documents({"status":"published"}) + else: + return object.recdb["service"].count_documents({}) + + +@doc('The total number of recommendation_actions found in recommendation_actions of the source') +def recommendation_actions(object): + """ + Calculate the total number of recommendation_actions + found in source object + """ + return object.recdb["recommendation"].count_documents(object.query) + +@doc('The total number of recommendation_actions for registered users found in 
recommendation_actions.csv') +def recommendation_actions_registered(object): + """ + Calculate the total number of recommendation_actions for registered users + found in Pandas DataFrame object recommendation_actions + """ + return object.recdb["recommendation"].count_documents({**object.query,**{"user":{"$exists":True}}}) + + +@doc('The total number of recommendation_actions for anonymous users found in recommendation_actions.csv') +def recommendation_actions_anonymous(object): + """ + Calculate the total number of recommendation_actions for anonymous users + found in Pandas DataFrame object recommendation_actions + """ + return recommendation_actions(object)-recommendation_actions_registered(object) + + +@doc('The percentage (%) of recommendation_actions for registered users to the total recommendation_actions') +def recommendation_actions_registered_perc(object): + """ + Calculate the percentage (%) of recommendation_actions occurred + by registered users to the total recommendation_actions + found in Pandas DataFrame object recommendation_actions (in two decimals) + """ + return round(recommendation_actions_registered(object)*100.0/recommendation_actions(object),2) + + +@doc('The percentage (%) of recommendation_actions for anonymous users to the total recommendation_actions') +def recommendation_actions_anonymous_perc(object): + """ + Calculate the percentage (%) of recommendation_actions occurred + by anonymous users to the total recommendation_actions + found in Pandas DataFrame object recommendation_actions (in two decimals) + """ + return round(100.0-recommendation_actions_registered_perc(object),2) + + +@doc('The total number of user actions found in user actions of the source') +def user_actions(object): + """ + Calculate the total number of user_actions + found in source object + """ + return object.recdb["user_action"].count_documents(object.query) + + +@doc('The total number of user actions occurred by registered users found in user actions of the 
source') +def user_actions_registered(object): + """ + Calculate the total number of user_actions occurred by registered users + found in source object + """ + return object.recdb["user_action"].count_documents({**object.query,**{"user":{"$exists":True}}}) + + +@doc('The total number of user actions occurred by anonymous users found in user actions of the source') +def user_actions_anonymous(object): + """ + Calculate the total number of user_actions occurred by anonymous users + found in source object + """ + return user_actions(object)-user_actions_registered(object) + + +@doc('The percentage (%) of user actions occurred by registered users to the total user actions') +def user_actions_registered_perc(object): + """ + Calculate the percentage (%) of user actions occurred + by registered users to the total user actions + found in source object user_actions (in two decimals) + """ + return round(user_actions_registered(object)*100.0/user_actions(object),2) + + +@doc('The percentage (%) of user actions occurred by anonymous users to the total user actions') +def user_actions_anonymous_perc(object): + """ + Calculate the percentage (%) of user actions occurred + by anonymous users to the total user actions + found in source object user_actions (in two decimals) + """ + return round(100.0-user_actions_registered_perc(object),2) + + +@doc('The total number of user actions led to order found in user actions of the source') +def user_actions_order(object): + """ + Calculate the total number of user_actions led to order + found in source object user_actions + """ + return object.recdb["user_action"].count_documents({**object.query, **{"action.order":True}}) + + +@doc('The total number of user actions led to order by registered users found in user actions of the source') +def user_actions_order_registered(object): + """ + Calculate the total number of user_actions led to order by registered users + found in source object user_actions + """ + return 
object.recdb["user_action"].count_documents({**object.query, **{"action.order":True,"user":{"$exists":True}}}) + + +@doc('The total number of user actions led to order by anonymous users found in user actions of the source') +def user_actions_order_anonymous(object): + """ + Calculate the total number of user_actions led to order by anonymous users + found in source object user_actions + """ + return user_actions_order(object)-user_actions_order_registered(object) + + +@doc('The percentage (%) of user actions occurred by registered users and led to order to the total user actions that led to order') +def user_actions_order_registered_perc(object): + """ + Calculate the percentage (%) of user actions occurred + by registered users and led to order to the total user actions that led to order + found in source object user_actions (in two decimals) + """ + return round(user_actions_order_registered(object)*100.0/user_actions_order(object),2) + + +@doc('The percentage (%) of user actions occurred by anonymous users and led to order to the total user actions that led to order') +def user_actions_order_anonymous_perc(object): + """ + Calculate the percentage (%) of user actions occurred + by anonymous users and led to order to the total user actions that led to order + found in source object user_actions (in two decimals) + """ + return round(100.0-user_actions_order_registered_perc(object),2) + + +@doc('The total number of user actions assosicated with the recommendation panel found in user actions of the source') +def user_actions_panel(object): + """ + Calculate the total number of user_actions assosicated with the recommendation panel + found in source object user_actions + """ + return object.recdb["user_action"].count_documents({**object.query, **{"source.root.type":"recommendation_panel"}}) + + +@doc('The percentage (%) of user actions assosicated with the recommendation panel to the total user actions') +def user_actions_panel_perc(object): + """ + Calculate the 
percentage (%) of user actions assosicated with + the recommendation panel to the total user actions + found in source object user_actions (in two decimals) + """ + return round(user_actions_panel(object)*100.0/user_actions(object),2) + diff --git a/preprocessor.py b/preprocessor.py index 3be128e..9fa7dea 100755 --- a/preprocessor.py +++ b/preprocessor.py @@ -7,16 +7,20 @@ import os from natsort import natsorted import natsort as ns - +import pandas as pd +from inspect import getmembers, isfunction import retrieval +import csv +# local lib +import pre_metrics as pm import reward_mapping as rm from get_service_catalog import get_eosc_marketplace_url, get_service_catalog_items, get_service_catalog_page_content, save_service_items_to_csv __copyright__ = "© "+str(datetime.utcnow().year)+", National Infrastructures for Research and Technology (GRNET)" __status__ = "Production" -__version__ = "0.2" +__version__ = "0.2.2" os.environ['COLUMNS'] = "90" @@ -128,12 +132,9 @@ def __init__(self, source_page_id, target_page_id, order): recdb = myclient[config["Source"]["MongoDB"]["db"]] - - - # automatically associate page ids to service ids -if config['Service']['download']: - service_list_path = os.path.join(args.output,config['Service']['path']) +if config['Service']['Portal']['download']: + service_list_path = os.path.join(args.output,config['Service']['Portal']['path']) eosc_url = get_eosc_marketplace_url() print( "Retrieving page: marketplace list of services... 
\nGrabbing url: {0}".format(eosc_url)) @@ -146,19 +147,30 @@ def __init__(self, source_page_id, target_page_id, order): # read map file and save in dict -with open(os.path.join(args.output,config['Service']['path']), 'r') as f: +with open(os.path.join(args.output,config['Service']['Portal']['path']), 'r') as f: lines=f.readlines() -keys=list(map(lambda x: remove_service_prefix(x.split(',')[2]).strip(), lines)) -values=list(map(lambda x: x.split(',')[0].strip(), lines)) +keys=list(map(lambda x: remove_service_prefix(x.split(',')[-1]).strip(), lines)) +ids=list(map(lambda x: x.split(',')[0].strip(), lines)) +names=list(map(lambda x: x.split(',')[1].strip(), lines)) -dmap=dict(zip(keys, values)) #=> {'a': 1, 'b': 2} +dmap=dict(zip(keys, zip(ids,names))) #=> {'a': 1, 'b': 2} +rdmap=dict(zip(ids,zip(keys,names))) +# reward_mapping.py is modified so that the function +# reads the Transition rewards csv file once +# consequently, one argument has been added to the +# called function +ROOT_DIR='./' -uas={} +TRANSITION_REWARDS_CSV_PATH = os.path.join( + ROOT_DIR, "resources", "transition_rewards.csv" +) +transition_rewards_df = pd.read_csv(TRANSITION_REWARDS_CSV_PATH, index_col="source") -for ua in recdb["user_action"].find(query): +luas=[] +for ua in recdb["user_action"].find(query).sort("user"): # set -1 to anonymous users try: user=ua['user'] @@ -166,134 +178,166 @@ def __init__(self, source_page_id, target_page_id, order): user=-1 # process data that map from page id to service id exist + # for both source and target page ids + # if not set service id to -1 try: - _pageid="/"+"/".join(ua['target']['page_id'].split('/')[1:3]) - service_id=dmap[_pageid] + _pageid="/"+"/".join(ua['source']['page_id'].split('/')[1:3]) + source_service_id=dmap[_pageid][0] + except: + source_service_id=-1 + try: + _pageid="/"+"/".join(ua['target']['page_id'].split('/')[1:3]) + target_service_id=dmap[_pageid][0] except: - continue + target_service_id=-1 - 
symbolic_reward=rm.ua_to_reward_id(User_Action(ua['source']['page_id'], + # function has been modified where one more argument is given + # in order to avoid time-consuming processing of reading csv file + # for every func call + symbolic_reward=rm.ua_to_reward_id(transition_rewards_df, + User_Action(ua['source']['page_id'], ua['target']['page_id'], ua['action']['order'])) reward=reward_mapping[symbolic_reward] - uas.setdefault(user,{}) - - # then we need to merge rewards - # keep the max value for each record - try: - if uas[user][service_id][0] < reward: - uas[user][service_id]=[reward, ua['source']['root']['type'], ua['timestamp']] - except: - uas[user].setdefault(service_id,[reward, ua['source']['root']['type'], ua['timestamp']]) - -luas=[] - -for user,_ in natsorted(uas.items(),alg=ns.ns.SIGNED): - for service,act in natsorted(uas[user].items(),alg=ns.ns.SIGNED): - - if service: - luas.append('{},{},{},{},{}\n'.format(user, service, *act)) - - -with open(os.path.join(args.output,'user_actions.csv'), 'w') as o: - o.writelines(luas) - + luas.append({'user_id':int(user), + 'source_resource_id':int(source_service_id), + 'target_resource_id':int(target_service_id), + 'reward':float(reward), + 'panel':ua['source']['root']['type'], + 'timestamp':ua['timestamp'], + 'source_path':ua['source']['page_id'], + 'target_path':ua['target']['page_id'], + 'type': 'service', # currently, static + 'provider': 'cyfronet', # currently, static + 'ingestion': 'batch', # currently, static + }) + +#luas=natsorted(luas,alg=ns.ns.SIGNED) recs=[] -for rec in recdb["recommendation"].find(query): +for rec in recdb["recommendation"].find(query).sort("user"): try: user=rec['user'] except: user=-1 - for service in rec['services']: - recs.append('{},{},{},{}\n'.format(user, service, '1', rec['timestamp'])) - -recs=natsorted(recs,alg=ns.ns.SIGNED) - -with open(os.path.join(args.output,'recommendations.csv'), 'w') as o: - o.writelines(recs) - - -# export user catalog -if config['User']['export']: 
- - if config['User']['from']=='user_actions': - us=natsorted(list(set(list(map(lambda x: x.split(',')[0]+'\n',luas)))),alg=ns.ns.SIGNED) + recs.append({'user_id':int(user), + 'resource_ids': rec['services'], + 'timestamp':rec['timestamp'], + 'type': 'service', # currently, static + 'provider': 'cyfronet', # currently, static + 'ingestion': 'batch', # currently, static + }) + +#recs=natsorted(recs,alg=ns.ns.SIGNED) + +# produce users csv with each user id along with the user's accessed services +# query users from database for fields _id and accessed_services then create a list of rows +# each rows contains two elements, first: user_id in string format and second: a space separated sorted list of accessed services +users = recdb['user'].find({},{'accessed_services':1}) +users = list(map(lambda x: {'id':int(str(x['_id'])), + 'accessed_resources': sorted(set(x['accessed_services'])), + 'created_on': None, + 'deleted_on': None, + 'provider': 'cyfronet', # currently, static + 'ingestion': 'batch', # currently, static + }, users)) + +if config['Service']['from']=='page_map': + + _ss=natsorted(list(set(list(map(lambda x: x+'\n',ids)))),alg=ns.ns.SIGNED) + resources=[] + for s in _ss: + try: + #ss.append(s.strip()+',"'+rdmap[s.strip()][1]+'",'+rdmap[s.strip()][0]+'\n') + resources.append({'id':int(s.strip()), + 'name':rdmap[s.strip()][1], + 'path':rdmap[s.strip()][0], + 'created_on': None, + 'deleted_on': None, + 'type': 'service', # currently, static + 'provider': 'cyfronet', # currently, static + 'ingestion': 'batch', # currently, static + }) + except: + continue + +else: # 'source' + _query={} # empty mapping = no filter; find() expects a mapping, not a string + if config['Service']['published']: + _query={"status":"published"} - elif config['User']['from']=='recommendations': - us=natsorted(list(set(list(map(lambda x: x.split(',')[0]+'\n',recs)))),alg=ns.ns.SIGNED) - - else: # 'source' - us=natsorted(list(set(list(map(lambda x: str(x['_id'])+'\n',recdb["user"].find({}))))),alg=ns.ns.SIGNED) - - with 
open(os.path.join(args.output,'users.csv'), 'w') as o: - o.writelines(us) + _ss=natsorted(list(set(list(map(lambda x: str(x['_id'])+',"'+str(x['name'])+'"\n',recdb["service"].find(_query))))),alg=ns.ns.SIGNED) + resources=[] + for s in _ss: + try: + #ss.append(s.strip()+','+rdmap[s.split(',')[0]]+'\n') + resources.append({'id':int(s.split(',')[0]), + 'name':rdmap[s.split(',')[0]][1], + 'path':rdmap[s.split(',')[0]][0], + 'created_on': None, + 'deleted_on': None, + 'type': 'service', # currently, static + 'provider': 'cyfronet', # currently, static + 'ingestion': 'batch', # currently, static + }) + + except: + continue + +# store data to Mongo DB +# connect to db server +datastore = pymongo.MongoClient("mongodb://"+config["Datastore"]["MongoDB"]["host"]+":"+str(config["Datastore"]["MongoDB"]["port"])+"/", uuidRepresentation='pythonLegacy') -# export service catalog -if config['Service']['export']: +# use db +rsmetrics_db = datastore[config["Datastore"]["MongoDB"]["db"]] - if config['Service']['from']=='user_actions': - ss=natsorted(list(set(list(map(lambda x: x.split(',')[1]+'\n',luas)))),alg=ns.ns.SIGNED) - - elif config['Service']['from']=='recommendations': - ss=natsorted(list(set(list(map(lambda x: x.split(',')[1]+'\n',recs)))),alg=ns.ns.SIGNED) +rsmetrics_db["user_actions"].delete_many({"provider":'cyfronet', "ingestion":'batch'}) +rsmetrics_db["user_actions"].insert_many(luas) - elif config['Service']['from']=='page_map': - ss=natsorted(list(set(list(map(lambda x: x+'\n',values)))),alg=ns.ns.SIGNED) +rsmetrics_db["recommendations"].delete_many({"provider":'cyfronet', "ingestion":'batch'}) +rsmetrics_db["recommendations"].insert_many(recs) - else: # 'source' - if config['Service']['published']: - ss=natsorted(list(set(list(map(lambda x: str(x['_id'])+'\n',recdb["service"].find({"status":"published"}))))),alg=ns.ns.SIGNED) - else: - ss=natsorted(list(set(list(map(lambda x: str(x['_id'])+'\n',recdb["service"].find({}))))),alg=ns.ns.SIGNED) 
+rsmetrics_db["users"].delete_many({"provider":'cyfronet', "ingestion":'batch'}) +rsmetrics_db["users"].insert_many(users) - with open(os.path.join(args.output,'services.csv'), 'w') as o: - o.writelines(ss) +rsmetrics_db["resources"].delete_many({"provider":'cyfronet', "ingestion":'batch'}) +rsmetrics_db["resources"].insert_many(resources) # calculate pre metrics if config['Metrics']: - time_range=recdb["user_action"].distinct("timestamp", query) - - m.timestamp=str(datetime.utcnow()) - - m.users=recdb["user"].count_documents({}) - m.recommendations=recdb["recommendation"].count_documents(query) - m.services=recdb["service"].count_documents({}) - m.user_actions=recdb["user_action"].count_documents(query) - - m.user_actions_registered=recdb["user_action"].count_documents({**query,**{"user":{"$exists":True}}}) - m.user_actions_anonymous=m.user_actions-m.user_actions_registered - m.user_actions_registered_perc=round(m.user_actions_registered*100.0/m.user_actions,2) - m.user_actions_anonymous_perc=100-m.user_actions_registered_perc - m.user_actions_order=recdb["user_action"].count_documents({**query, **{"action.order":True}}) - m.user_actions_order_registered=recdb["user_action"].count_documents({**query, **{"action.order":True,"user":{"$exists":True}}}) - m.user_actions_order_anonymous=m.user_actions_order-m.user_actions_order_registered - m.user_actions_order_registered_perc=round(m.user_actions_order_registered*100.0/m.user_actions_order,2) - m.user_actions_order_anonymous_perc=100-m.user_actions_order_registered_perc + run=pm.Runtime() + run.recdb=recdb + run.query=query + run.config=config - m.user_actions_panel=recdb["user_action"].count_documents({**query, **{"source.root.type":"recommendation_panel"}}) - m.user_actions_panel_perc=round(m.user_actions_panel*100.0/m.user_actions,2) + md={'timestamp':str(datetime.utcnow())} - m.service_catalog=len(recdb["recommendation"].distinct("services", query)) + # get all functions found in pre_metrics module + # apart from 
'doc' func + # run and save the result in dictionary + # where key is the name of the function + # and value what it returns + # whereas, for each found functions + # an extra key_doc element in dictionary is set + # to save the text of the function + funcs = list(map(lambda x: x[0], getmembers(pm, isfunction))) + funcs = list(filter(lambda x: not x=='doc',funcs)) + for func in funcs: + md[func+'_doc']=getattr(pm, func).text + md[func]=getattr(pm, func)(run) - # catalog coverage - m.service_catalog_perc=round(m.service_catalog*100.0/m.services,2) + jsonstr = json.dumps(md) - # user coverage - m.user_catalog=len(recdb["user_action"].distinct("user", query)) - m.user_catalog_perc=round(m.user_catalog*100.0/m.users,2) + rsmetrics_db.drop_collection("pre_metrics") + rsmetrics_db["pre_metrics"].insert_one(md) - jsonstr = json.dumps(m.__dict__) print(jsonstr) - # Using a JSON string - with open(os.path.join(args.output,'pre_metrics.json'), 'w') as outfile: - outfile.write(jsonstr) diff --git a/report.html.prototype b/report.html.prototype index 4e29b9b..9e2e13b 100644 --- a/report.html.prototype +++ b/report.html.prototype @@ -11,7 +11,8 @@ .card { animation-name: fadein; - animation-duration: 2s; + animation-duration: 1s; + margin-bottom: 25px; } @keyframes fadein { @@ -33,7 +34,65 @@ color: #FAC0E7; } - +.card-hit-rate { + background-color: #6cae80; + color: #d9fac0; +} + +.card-ctr { + background-color: #a55f80; + color: #f9bdd6; +} + +.card-diversity { + background-color: #FA8B1C; + color:#F4E588; +} + +.card-novelty { + background-color: #ffaf26; + color:#f6f5a2; +} +.card-footer{ + background-color: rgb(247,247,247,0.8); + color: black; + font-style: italic; +} + + + +span { + position: relative; +} + +span:hover:after { + background: #333; + background: rgba(0, 0, 0, .8); + border-radius: 5px; + bottom: -34px; + color: #fff; + content: attr(gloss); + left: 20%; + padding: 5px 15px; + position: absolute; + z-index: 98; + width: 350px; + font-size: 14px; +} + 
+span:hover:before { + border: solid; + border-color: #333 transparent; + border-width: 0 6px 6px 6px; + bottom: -4px; + content: ""; + left: 50%; + position: absolute; + z-index: 99; + font-size: 14px; +} + +
@@ -48,6 +107,7 @@ This report contains information about the detailed results of the evaluation process of the recommendation system as well as statistics related to the ingested dataset of user actions and recommendations

+
@@ -105,21 +165,69 @@
-

User Coverage: %

+

User Coverage: %

- +
-

Catalog Coverage: %

+

Catalog Coverage: %

- + +
+
+
+
+
+

Hit Rate: %

+
+ +
+
+
+
+
+

Click-through Rate (CTR): %

+
+ +
+
+
+
+
+

Diversity (Shannon Entropy):

+
+ +
+
+
+
+
+

Diversity (Gini Index):

+
+ +
+
+
+
+
+

Novelty:

+
+
+
+ +

+

+ + Generated on: +

+
@@ -130,20 +238,47 @@ + + + + +
+
+ +
+
+ +
+
+
+ + + +
+
+
+
+
+
+
+ + + + + +
+
+
+
+
+
+ + +
+
KPIs +
A Key Performance Indicator (KPI) is a measurable + value that demonstrates how effectively a company is achieving key business + objectives. +
+
+
+
+ {% set start_data = data.start.value.split('.')[0].split(' ') %} + {% set end_data = data.end.value.split('.')[0].split(' ') %} + Start Date: +
{{start_data[0]}}
+ End Date: +
{{end_data[0]}}
+
+
+
+
+
+
+
+
+ +
+
+ + + +
+

Click-Through Rate + + + +

+
+
The number of user clicks through recommendation + panels divided by the total number of times recommendation panels were presented to + users. Takes into account all historical data of user actions.
+
+
+
+ {{data.click_through_rate.value}}
+
+
+
+
+ +
+
+
+
+ +
+
+ + + +
+

Hit Rate + + + +

+
+
The ratio of user hits divided by the total + number of users (user hit: a user that has accessed at least one service + that is also a personal recommendation)
+
+
+
{{data.hit_rate.value}} +
+
+
+
+
+ + + +
+ +
+
+
+
+
Top 5 recommended services
+

+ + {% for item in data.top5_services_recommended.value %} +

+
+
+

#{{loop.index}} {{item.service_name}}

+
+ +
+
+ +
+
+
+
+
+ Recommended {{item.recommendations.value}}  + times out of {{item.recommendations.of_total}} +
+
+
+
+
+ {{item.recommendations.percentage}}%
+
+
+
+
+ +
+
+

{{item.recommendations.value}}

+
/{{item.recommendations.of_total}}
+
+
+
+
+ {%endfor%} + +

+ +
+ +
+ +
+
+
+
+
Top 5 ordered services
+

+ + {% for item in data.top5_services_ordered.value %} +

+
+
+

#{{loop.index}} {{item.service_name}}

+
+ +
+
+ +
+
+
+
+ Ordered {{item.orders.value}} times + out of {{item.orders.of_total}} +
+
+
+
+
+ {{item.orders.percentage}}%
+
+
+
+
+ +
+
+

{{item.orders.value}}

+
/{{item.orders.of_total}}
+
+
+
+
+ {%endfor%} +

+ +
+ +
+ +
+ +
+ +
+ +
+
+
+ + + diff --git a/webservice/templates/metric_desc.html b/webservice/templates/metric_desc.html new file mode 100644 index 0000000..ff338e5 --- /dev/null +++ b/webservice/templates/metric_desc.html @@ -0,0 +1,314 @@ + + + + + + + + + {{data.name}} + + + + + + + + + +
+
+ +
+
+ +
+
+
+ + + +
+
+
+
+
+
+
+ + + + + + + +
+
+
+
+
+
+ + +
+
{{data.name}} +
{{data.summary}} +
+
+
+
+
+
metric
+
+
+
+
+
Description +
+

{{data.description}} +

+
+
+
+
+
+ + +
+ +
+ + + + + + + +
+
+
+

+

+ +
+ Type  {{data.output.type}} +
+
+ + + +
Range Values
+
Min={{data.output.min}} to Max={{data.output.max}}
+
+
+
+ +
+
{{data.output.comment}}
+ + +

+
+
+ {%for item in data.prerequisites %} +

+ {{item}} +

+ {%endfor%} +
+
+
+
+
+
+
+
+
Process +
+

+

    + {%for item in data.process %} +
  • + {{item.step}} +

    {{item.details}}

    +
  • + {%endfor%} +
+

+
+
+
+
+
+ +
+
+
+ + + diff --git a/webservice/templates/rsmetrics.html b/webservice/templates/rsmetrics.html new file mode 100644 index 0000000..ef2d9d5 --- /dev/null +++ b/webservice/templates/rsmetrics.html @@ -0,0 +1,648 @@ + + + + + + + + + RS Metrics - Data statistics and metrics computed for the Recommender System (RS) + + + + + + + + + + +
+
+ +
+
+ +
+
+
+ + + +
+
+
+
+
+
+
+ + + + + +
+
+
+
+
+
+ + +
+
RS Metrics +
Data statistics and metrics computed for the + Recommender System (RS) +
+ +
+
+
+ {% set start_data = data.start.value.split('.')[0].split(' ') %} + {% set end_data = data.end.value.split('.')[0].split(' ') %} + + + + Start Date:
{{start_data[0]}}
+ End Date:
{{end_data[0]}}
+ + +
+
+
+ + + +
+
Statistics
+
+
+
    +
  • +
    +
    +
    +
    +
    Users
    +
    {{data.users.doc}}
    +
    +
    +
    {{data.users.value}} +
    +
    +
    +
    +
    +
  • +
  • +
    +
    +
    +
    +
    Services
    +
    {{data.services.doc}}
    +
    +
    +
    {{data.services.value}} +
    +
    +
    +
    +
    +
  • +
  • +
    +
    +
    +
    +
    Recommendations
    +
    {{data.recommendations.doc}} +
    +
    +
    +
    + {{data.recommendations.value}}
    +
    +
    +
    +
    +
  • +
+
+
+
+
+
    +
  • +
    +
    +
    +
    +
    + +
    +
    +
    +
    User Actions {{data.user_actions.value}} +
    +
    +
    +
    +
    +
  • +
  • +
    +
    +
    +
    +
    by Registered Users
    +
    +
    +
    + {{data.user_actions_registered.value}}
    +
    +
    +
    + ({{data.user_actions_registered_perc.value}}%)
    + +
    +
    +
  • +
  • +
    +
    +
    +
    +
    by Anonymous Users
    +
    +
    +
    + {{data.user_actions_anonymous.value}}
    +
    +
    +
    + ({{data.user_actions_anonymous_perc.value}}%)
    + +
    +
    +
  • +
+
+
+
+
+
    +
  • +
    +
    +
    +
    +
    + +
    +
    +
    +
    Total Orders + {{data.user_actions_order.value}} +
    +
    +
    +
    +
    +
  • +
  • +
    +
    +
    +
    +
    by Registered Users
    +
    +
    +
    + {{data.user_actions_order_registered.value}}
    +
    +
    +
    + ({{data.user_actions_order_registered_perc.value}}%)
    +
    +
    +
  • +
  • +
    +
    +
    +
    +
    by Anonymous Users
    +
    +
    +
    + {{data.user_actions_order_anonymous.value}}
    +
    +
    +
    + ({{data.user_actions_order_anonymous_perc.value}}%)
    + +
    +
    +
  • +
+
+
+ +
+
 
+ + + + + + + +
+
Metrics
+
+
+
+
+
+
+ +
+
+
+
+

User Coverage + + + + + +

+
+
+
+ +
+
+
+ {{data.user_coverage.doc}} +
+
+
+
+ {{data.user_coverage.value}}%
+
+
+ +
+
+ + +
+ +
+
+
+
+
+
+ +
+
+
+
+

Catalog Coverage + + + + + +

+
+
+
+ +
+
+
+ {{data.catalog_coverage.doc}} +
+
+
+
+ {{data.catalog_coverage.value}}%
+
+
+ +
+
+ + +
+ + + +
+
+
+
+
+
+ + +
+
+
+
+

Diversity (Gini Index) + + + + + +

+
+
+
+ +
+
+
+ {{data.diversity_gini.doc}} +
+
+
+
+ {{data.diversity_gini.value}} + +
+
+
+ +
+
+ + +
+ + + +
+
 
+ + + + + + + +
+
+
+
+
+
+
+ + +
+
+
+
+

Diversity (Sh. Entropy) + + + + + +

+
+
+
+ +
+
+
+ {{data.diversity.doc}} +
+
+
+
{{data.diversity.value}}
+
+
+ +
+
+ + +
+ +
+
+
+
+
+
+ + +
+
+
+
+

Novelty + + + + + +

+
+
+
+ +
+
+
+ {{data.novelty.doc}} +
+
+
+
{{data.novelty.value}} +
+
+
+ +
+
+ + +
+ + + +
+
 
+ + +
+ +
+
+
+ + +