Merge pull request #81 from ARGOeu/devel

Version 1.0.2
ARGOeu · Nov 1, 2022 · 905af42 · 905af42
2 parents ff59fc6 + efd8c5d
commit 905af42
Show file tree

Hide file tree

Showing 39 changed files with 50,651 additions and 275 deletions.
diff --git a/.gitignore b/.gitignore
@@ -1,3 +1,11 @@
+# The following are handy to ignore in this specific project
+# please ignore generated folders with results such as /data and /report
+/data
+/report
+
+# please ignore changes in the configuration file. If default configuration file structure is changed please override this rule with git add -f 
+/config.yaml
+
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]

diff --git a/README.md b/README.md
@@ -3,15 +3,15 @@ A framework for counting the recommender metrics
 
 # Preprocessor v.0.2
 <p align="center">
-<a href="https://github.com/nikosT/eosc-recommender-metrics/blob/devel/docs/Preprocessor.png">
-<img src="https://github.com/nikosT/eosc-recommender-metrics/blob/devel/docs/Preprocessor.png" width="70%"/>
+<a href="https://github.com/nikosT/eosc-recommender-metrics/blob/master/docs/Preprocessor.png">
+<img src="https://github.com/nikosT/eosc-recommender-metrics/blob/master/docs/Preprocessor.png" width="70%"/>
 </a>
 </p>
 
 # RS metrics v.0.2
 <p align="center">
-<a href="https://github.com/nikosT/eosc-recommender-metrics/blob/devel/docs/RSmetrics.png">
-<img src="https://github.com/nikosT/eosc-recommender-metrics/blob/devel/docs/RSmetrics.png" width="70%"/>
+<a href="https://github.com/nikosT/eosc-recommender-metrics/blob/master/docs/RSmetrics.png">
+<img src="https://github.com/nikosT/eosc-recommender-metrics/blob/master/docs/RSmetrics.png" width="70%"/>
 </a>
 </p>
 
@@ -61,14 +61,38 @@ optional arguments:
 ```
 
 8. Configure `./preprocessor.py` by editing the `config.yaml` or providing another with `-c`:
-<p align="center">
-<a href="https://github.com/nikosT/eosc-recommender-metrics/blob/devel/docs/preprocessor-config.png">
-<img src="https://github.com/nikosT/eosc-recommender-metrics/blob/devel/docs/preprocessor-config.png" width="70%"/>
-</a>
-</p>
+```yaml
+
+# Set the desired connector (e.g. MongoDB)
+Source:
+    MongoDB:
+        host: localhost
+        port: 27017
+        db: recommender_dev
+
+User:
+    export: true
+
+Service:
+    # if true it keeps only published, otherwise all
+    # this has an effect in exporting when from is set to 'source' 
+    # and also in metrics calculations where service is considered
+    published: true
 
+    # Use the EOSC-Marketplace webpage
+    # to associate page_id and service_id
+    download: true
+    path: ./page_map
 
-9. Run from terminal: `./rsmetrics.py` to run RSmetrics
+    export: true
+    from: 'page_map' # or 'source'
+
+# Calculate source's metrics
+Metrics: true
+
+```
+
+9. Run from terminal: `./rsmetrics.py --users --services` to run RSmetrics and include the `users.csv` and `services.csv` files generated by the Preprocessor
 ```bash
   _____   _____                _        _          
  |  __ \ / ____|              | |      (_)         
@@ -151,4 +175,31 @@ chmod u+x ./get_service_catalog.py
 ./get_service_catalog.py
 ```
 
+#### Serve Evaluation Reports as a Service
+
+The `webservice` folder hosts a simple webservice implemented in Flask framework which can be used to host the report results.
+
+__Note__: Please make sure you work in a virtual environment and you have already downloaded the required dependencies by issuing
+`pip install -r requirements.txt` 
+
+The webservice application serves two endpoints
+ - `/` : This is the frontend webpage that displays the Report Results in a UI
+ - `/api` : This api call returns the evaluation metrics in json format
+
+To run the webservice issue:
+```
+cd ./webservice
+flask run
+```
+
+The webservice by default runs on localhost:5000; you can override this by issuing, for example:
+```
+flask run -h 127.0.0.1 -p 8080
+```
+
+There is an env variable `RS_EVAL_METRIC_SOURCE` which directs the webservice to the generated `metrics.json` file produced after the evaluation process.
+This by default honors this repo's folder structure and directs to the root `/data/metrics.json` path
+
+You can override this by editing the `.env` file inside the `/webservice` folder, or specify the `RS_EVAL_METRIC_SOURCE` variable accordingly before executing the `flask run` command
+
 _Tested with python 3.9_
diff --git a/config.yaml b/config.yaml
@@ -5,30 +5,34 @@ Source:
         port: 27017
         db: recommender_dev
 
-User:
-    export: true
-    #from: 'user_actions'
-    #from: 'recommendations'
-    from: 'source'
+# The database where the Preprocessor's
+# and RSmetrics data are stored
+Datastore:
+    MongoDB:
+        host: localhost
+        port: 27017
+        db: rsmetrics
 
 Service:
     # Use the EOSC-Marketplace webpage
-    # to associate page_id and service_id
-    download: true
-    path: ./page_map
-
-    export: true
-    #from: 'user_actions'
-    #from: 'recommendations'
-    from: 'source'
-    #from: 'page_map'
-
-    published: false # applies only on source option
-
-User-actions:
-    merge: false # not implemented yet
-
-# Calculate source's metrics
+    # to retrieve resources and 
+    # associate the page_id and the service_id
+    Portal:
+        download: true
+        path: ./page_map
+
+    # if true it keeps only published, otherwise all
+    # this has an effect in exporting when from is set to 'source' 
+    # and also in metrics calculations where service is considered
+    published: true
+
+    # which origin to use to retrieve Resources
+    # two options available:
+    # - 'source': use the Connector
+    # - 'page_map': use the EOSC Marketplace
+    from: 'page_map' # or 'source'
+
+# Calculate source's metrics (pre-metrics)
 Metrics: true
 
 

diff --git a/environment.yml b/environment.yml
@@ -25,20 +25,31 @@ dependencies:
   - zlib=1.2.11=h7f8727e_4
   - pip:
     - beautifulsoup4==4.10.0
+    - certifi==2021.10.8
     - charset-normalizer==2.0.12
+    - click==8.1.3
+    - Flask==2.1.2
     - idna==3.3
-    - joblib==1.1.0
+    - importlib-metadata==4.11.4
+    - itsdangerous==2.1.2
+    - Jinja2==3.1.2
+    - joblib==1.2.0
+    - MarkupSafe==2.1.1
     - natsort==8.1.0
     - numpy==1.22.3
     - pandas==1.4.2
     - pymongo==4.1.0
     - python-dateutil==2.8.2
+    - python-dotenv==0.20.0
     - pytz==2022.1
-    - pyyaml==6.0
+    - PyYAML==6.0
     - requests==2.27.1
     - scikit-surprise==1.1.1
     - scipy==1.8.0
     - six==1.16.0
     - soupsieve==2.3.2
     - surprise==0.1
     - urllib3==1.26.9
+    - Werkzeug==2.1.2
+    - zipp==3.8.0
+    - flask-pymongo==2.3.0
diff --git a/get_service_catalog.py b/get_service_catalog.py
@@ -50,7 +50,7 @@ def get_service_catalog_items(content):
     for item in results:
         a = item.findChildren("a", recursive=False)[0]
         row = [int(item.attrs["data-service-id"]),
-               item.text.strip(), a['href']]
+               a.text.strip(), a['href']]
         rows.append(row)
     # sort rows by id
     rows = sorted(rows, key=lambda x: x[0])

diff --git a/metric_descriptions/README.md b/metric_descriptions/README.md
@@ -0,0 +1,12 @@
+# Metric Descriptions folder
+
+This folder is meant to contain detailed yaml files defining, in a structured way, the implementation details of each metric.
+To add a new detailed description in this folder please consult the first file added here: diversity.yml and structure
+the information accordingly
+
+### Important Note on filenames
+The filename should correspond to the name of the metric used in `metrics.json` output and the extension `.yml` 
+So for the metric Shannon Diversity the short name used in `metrics.json` is `diversity` thus the filename is `diversity.yml`
+
+### Multiline values
+In yaml fields that you need to support multiline string content please use the `>` operator  
diff --git a/metric_descriptions/catalog-coverage.yml b/metric_descriptions/catalog-coverage.yml
@@ -0,0 +1,36 @@
+name: Catalog Coverage
+
+summary: > 
+    The percentage (%) of the division of the unique services found in recommendations to the total number of published services
+
+description: > 
+   The Catalog Coverage is described by the formula $$\frac{unique\_rec\_services}{services}$$
+
+output:
+    type: float
+    min: 0
+    max: 100
+    comment: Catalog Coverage is 0 when none of the services is being recommended, and 100 when all of them are being recommended.
+
+prerequisites:
+    - all available recommendations
+    - all available services
+
+process:
+    - step: Retrieve recommendations
+      details: >
+        Retrieve all available recommendations found in source
+    - step: Gather all unique services
+      details: >
+        Gather all unique services found in all available recommendations
+    - step: Retrieve services
+      details: >
+        Retrieve all available published services found in source
+    - step: Calculate ratio
+      details: >
+         Calculate the percentage (%) of the division of the unique services found in recommendations to the total number of published services
+
+# This is optional for visual stylization of the metric when displayed on the report
+style:
+    icon: pe-7s-box2
+    color: bg-malibu-beach
diff --git a/metric_descriptions/click-through-rate.yml b/metric_descriptions/click-through-rate.yml
@@ -0,0 +1,37 @@
+name: Click-Through Rate
+
+summary: > 
+    The number of user clicks through recommendations panels divided by the total times recommendation panels were presented to users.
+
+description: > 
+    The number of user clicks through recommendations panels divided by the total times recommendation panels were presented to users. Takes into account all historical data of user actions. The metric is expressed by the formula: $$\text{Click-Through Rate}=\frac{clicks}{views}$$
+output:
+    type: float
+    min: 0
+    max: +inf
+    comment: A value of 0 indicates that no clicks through recommendations panels occurred
+
+prerequisites:
+    - all available user actions
+
+process:
+    - step: Retrieve user actions with recommendation panel
+      details: >
+        Get only the user actions that present a recommendation panel to the user in the source page. Those are actions with the following source paths: (i) /services, (ii) /services/, (iii) /services/c/{any category name}
+    - step: Count user actions with recommendation panel
+      details: >
+        Count the items in the above list as they represent the times recommendations panels were presented to the users of the portal
+    - step: Filter list
+      details: >
+        Narrow the above list into a new subset by selecting only user actions that originate from a recommendation panel. Those are actions that have the 'recommendation' string in the Action column
+    - step: Count user actions with clicks through recommendation panel
+      details: >
+        Count the items in the subset as they represent the times users clicked through recommendations
+    - step: Calculate ratio
+      details: >
+        Divide the items of the subset with the items of the first list to get the click-through rate
+
+# This is optional for visual stylization of the metric when displayed on the report
+style:
+    icon: pe-7s-mouse
+    color: bg-grow-early
diff --git a/metric_descriptions/diversity-gini.yml b/metric_descriptions/diversity-gini.yml
@@ -0,0 +1,46 @@
+name: Diversity Gini Index
+
+summary: > 
+    Measures Recommendations' diversity. The index is 0 when all items are chosen equally often, and 1 when a single item is always chosen.
+
+description: > 
+    The diversity (\(G\)) of the recommendations according to Gini Index.  The index is 0 when all items are chosen equally often, 
+    and 1 when a single item is always chosen 
+    (see book \(\href{https://link.springer.com/10.1007/978-1-4939-7131-2_110158}{https://link.springer.com/10.1007/978-1-4939-7131-2_110158}\)). Generally, the Gini Index mathematical expression is defined as: 
+    $$G=\frac{1}{n-1}\sum_{j=1}^{n}(2j-n-1)p(i_j)$$where \(i_1,\ldots,i_n\) is the list of items ordered according to increasing \(p(i)\) and each item \(i\) accounts for a proportion \(p(i)\) of user recommendations. In RS Metrics the computation is determined by the following formula:
+    $$Diversity=\frac{1}{n-1}\sum_{j=1}^{n}(2j-n-1)\left(\frac{count(j)}{recommendations}\right)$$
+
+output:
+    type: float
+    min: 0
+    max: 1
+    comment: The index is 0 when all items are chosen equally often, and 1 when a single item is always chosen.
+
+prerequisites:
+    - recommendations without anonymous users
+    - all available services
+
+process:
+    - step: Clean up
+      details: >
+        Recommendations clean up; entries removal where users or services are not found in "users" or "services" files accordingly
+    - step: Services Impact
+      details: >
+        Calculation of the impact of the services, by counting how many times each service i was suggested to all possible users: count(j)
+    - step: Sort Services Impact from low to high
+      details: >
+        Sort the number of how many times each service (i.e. i) was suggested from the lower to the higher value, in order to apply the respective weight (j). The computation includes services with 0 recommendation occurrence
+    - step: Recommended Probability of the Services
+      details: >
+        For each service calculate its recommended probability by dividing the number of service's occurrences found in the recommendations to the total number of recommendations
+    - step: Service-based product computation
+      details: >
+        Calculation of the product of the recommended probability from previous step and services' respective index j, for each service individually
+    - step: Gini Index computation
+      details: >
+        Computation of the overall value by summing all values from previous step
+
+# This is optional for visual stylization of the metric when displayed on the report
+style:
+    icon: pe-7s-shuffle
+    color: bg-plum-plate
diff --git a/metric_descriptions/diversity.yml b/metric_descriptions/diversity.yml
@@ -0,0 +1,44 @@
+name: Diversity Shannon Entropy
+
+summary: > 
+    Measures Recommendations' diversity. The entropy is 0 when a single item is always chosen or recommended, 
+    and log n when n items are chosen or recommended equally often.
+
+description: > 
+    The diversity (\(H\)) of the recommendations according to Shannon Entropy. The entropy is 0 when a single item 
+    is always chosen or recommended, and log(n) when n items are chosen or recommended equally often 
+    (see book \(\href{https://link.springer.com/10.1007/978-1-4939-7131-2_110158}{https://link.springer.com/10.1007/978-1-4939-7131-2_110158}\)). Generally, the Shannon Entropy mathematical expression is defined as: 
+    $$H=-\sum_{i=1}^{n}p(i)\log_2 p(i) $$In RS Metrics the computation is determined by the following formula: 
+    $$Diversity=-\sum_{i=1}^{services}\left(\frac{count(i)}{recommendations}\right)\log_2 \left(\frac{count(i)}{recommendations}\right)$$
+
+output:
+    type: float
+    min: 0
+    max: +\(\infty\)
+    comment: The entropy is 0 when a single item is always chosen or recommended, and log n when n items are chosen or recommended equally often.
+
+prerequisites:
+    - recommendations without anonymous users
+    - all available services
+
+process:
+    - step: Clean up
+      details: >
+        Recommendations clean up; entries removal where users or services are not found in "users" or "services" files accordingly
+    - step: Services Impact
+      details: >
+        Calculation of the impact of the services, by counting how many times each service i was suggested to all possible users: count(i)
+    - step: Recommended Probability of the Services
+      details: >
+        For each service calculate its recommended probability by dividing the number of service's occurrences found in the recommendations to the total number of recommendations
+    - step: Service-based product computation
+      details: >
+        Calculation of the product of the recommended probability from previous step and the logarithmic value of it, for each service individually
+    - step: Shannon Entropy computation
+      details: >
+        Computation of the overall value by summing all values from previous step
+
+# This is optional for visual stylization of the metric when displayed on the report
+style:
+    icon: pe-7s-way
+    color: bg-sunny-morning