Add more details about the lemmatizer at the new corpus phase. (#317)

* Add more details about the lemmatizer at the new corpus phase.
* Add test for lemmatization services
* Add more verbosity because tests hang
* Remove various screenshots
hipster-philology · Mar 20, 2024 · f41d822 · f41d822
1 parent 63661c3
commit f41d822
Show file tree

Hide file tree

Showing 12 changed files with 832 additions and 256 deletions.
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
@@ -36,7 +36,7 @@ jobs:
       run: |
         python -m pip install --upgrade pip
         pip install -r requirements.txt
-        pip install pytest pytest-cov coveralls flake8
+        pip install pytest pytest-cov coveralls flake8 pytest-sugar
     - name: Set up TEST_DATABASE_URL
       run: |
         echo "TEST_DATABASE_URL=postgresql://pyrrha:pyrrha@localhost:5432/data-test" >> $GITHUB_ENV

diff --git a/app/__init__.py b/app/__init__.py
@@ -1,7 +1,6 @@
 import os
 from flask import Flask, g
 
-from config import config
 from flask_compress import Compress
 from flask_login import LoginManager
 from flask_mail import Mail
@@ -29,6 +28,7 @@
 
 def create_app(config_name="dev"):
     """ Create the application """
+    from config import config
 
     app = Flask(
         __name__,

diff --git a/app/lemmatizers.py b/app/lemmatizers.py
@@ -0,0 +1,12 @@
+from typing import Optional
+from dataclasses import dataclass
+
+
+@dataclass
+class LemmatizerService:  # Description of one lemmatization service listed on the new corpus page
+    title: str   # Name shown in the service drop-down, e.g. "Old French"
+    uri: str  # Current address of the service, e.g. "http://localhost:4567/lemma"
+    provider: str  # Who provides the service, e.g. Deucalion at Ecole nationale des Chartes
+    bibtex: str  # BibTeX entry users are asked to cite
+    apa: str  # APA equivalent of the BibTeX entry
+    ui: Optional[str] = None  # Optional link to the provider's own user interface
diff --git a/app/templates/main/corpus_new.html b/app/templates/main/corpus_new.html
@@ -72,10 +72,28 @@ <h1>{{ _('Create a new corpus') }}</h1>
             <div class="col-md-9">
                 <div class="input-group">
                   <select class="form-control" id="language-model">
-                      {%- for lang, address in lemmatizers %}
-                    <option value="{{ address }}">{{ lang }}</option>
+                      <option selected value="disabled" class="text-muted disabled">{{_('Select a service')}}</option>
+                      {%- for service in lemmatizers %}
+                    <option value="{{ service.uri }}">{{ service.title }}</option>
                       {% endfor -%}
                   </select>
+                  {%- for service in lemmatizers %}
+                    <div class="alert alert-info lemmatizer-details" data-uri="{{service.uri}}" style="display: none;">
+                        {% trans name=service.title, provider=service.provider %}
+                            <em>{{ name }}</em> is a lemmatization service provided by <i>{{ provider }}</i>.<br />
+                        {% endtrans %}
+                        {% if service.ui %}
+                            <em>{{_("If you only want to lemmatize (and not correct data), you can use their user interface here:")}}</em>
+                            <a href="{{service.ui}}">{{service.ui}}</a><br/>
+                        {% endif %}
+                        {{ _("Please cite tools accordingly. You can use the following APA or Bibtex: ")}}
+                        <ul>
+                            <li><em>APA:</em> <code>{{service.apa }}</code></li>
+                            <li><em>BIB:</em> <code style="padding-left: 1em; display: block;">{{service.bibtex}}</code></li>
+                        </ul>
+                    </div>
+                  {% endfor %}
+
                   <div class="input-group-append">
                     <button id="submit-model" class="btn btn-outline-secondary" type="button">{{ _('Lemmatize') }}</button>
                   </div>
@@ -200,7 +218,14 @@ <h1>{{ _('Create a new corpus') }}</h1>
 <script type="text/javascript">
 
 $(document).ready(function() {
-
+    $("#language-model").on("change", function (event) {
+        let uri = $("#language-model").val();
+        document.querySelectorAll(".lemmatizer-details").forEach(el => el.style.display = "none");
+        if (uri === "disabled") {
+            return;
+        }
+        document.querySelectorAll(`.lemmatizer-details[data-uri='${uri}']`).forEach(el => el.style.display = "block");
+    });
     $("#submit-model").on("click", function(event) {
         event.preventDefault();
         // Get the parameters
@@ -213,11 +238,12 @@ <h1>{{ _('Create a new corpus') }}</h1>
             tokens_fail = document.getElementById("tokens-fail"),
             approximate_count = document.getElementById("tokens-approximate");
 
+        if (uri === "disabled") { return; }
+
         // Create the form
         var formData = new FormData();
         formData.append("data", text_data.value);
 
-
         // Set up our HTTP request
         var xhr = new XMLHttpRequest();
         var update = function(xhr) {
@@ -228,14 +254,16 @@ <h1>{{ _('Create a new corpus') }}</h1>
         };
         // Set up our listener to process completed requests
         xhr.onreadystatechange = function () {
-
             // Only run if the request is complete
-            if (xhr.readyState < 3 || xhr.readyState > 4) return null;
+            if (typeof xhr.status !== "number") return null;
+
+            if (xhr.readyState < 3) return null;
 
             if (xhr.readyState === 3) {
                 update(xhr);
                 return null;
             }
+            if (xhr.status === 0) return null;
 
             // Process our return data
             if (xhr.status >= 200 && xhr.status < 300) {

diff --git a/config.py b/config.py
@@ -1,5 +1,6 @@
 import os
-
+from app.lemmatizers import LemmatizerService
+from typing import List
 
 basedir = os.path.abspath(os.path.dirname(__file__))
 
@@ -38,7 +39,7 @@ class Config:
     PAGINATION_DEFAULT_TOKENS = 100
 
     # Lemmatizer (until Deucalion client)
-    LEMMATIZERS = []
+    LEMMATIZERS: List[LemmatizerService] = []
 
     # Change automatically the Postgresql instance language if not english
     FORCE_PSQL_EN_LOCALE = True
@@ -101,6 +102,23 @@ class BaseTestConfig(Config):
     EMAIL_SUBJECT_PREFIX = '[{}]'.format(Config.APP_NAME)
     EMAIL_SENDER = '{app_name} Admin <{email}>'.format(app_name=Config.APP_NAME, email=MAIL_USERNAME)
 
+    # Single dummy service for the tests; its address uses the port (4567) on which
+    # dummy_lemmatizer_service.py listens.
+    LEMMATIZERS = [
+        LemmatizerService(
+            "Dummy lemmatizer",
+            "http://localhost:4567/lemma",
+            provider="ProviderInstitution",
+            ui="someui.com",
+            apa="Clérice et al. 2019",
+            # Raw string: a normal literal would read \' as an escaped quote and drop the
+            # backslash, turning the BibTeX accent macro {\'e} into {'e}.
+            bibtex=r"""@article{camps2021corpus,
+	title        = {Corpus and Models for Lemmatisation and POS-tagging of Old French},
+	author       = {Camps, Jean-Baptiste and Cl{\'e}rice, Thibault and Duval, Fr{\'e}d{\'e}ric and Kanaoka, Naomi and Pinche, Ariane and others},
+	year         = 2021,
+	journal      = {arXiv preprint arXiv:2109.11442},
+	keywords     = {Old French}
+}"""
+        )
+    ]
+
 
 class SQLiteTestConfig(BaseTestConfig):
     SQLALCHEMY_DATABASE_URI = os.environ.get('TEST_DATABASE_URL') or \

diff --git a/dummy_lemmatizer_service.py b/dummy_lemmatizer_service.py
@@ -0,0 +1,22 @@
+from flask import Flask, request, Response, stream_with_context
+app = Flask("fixture")  # Stand-alone Flask application exposing the dummy /lemma route below
+
+
+@app.route("/lemma", methods=["POST"])
+def lemmatizing():
+    """ Dummy lemmatization endpoint answering with a TSV of the submitted tokens.
+
+    The POSTed form field ``data`` is split on whitespace. The response starts with a
+    ``token<TAB>lemma`` header line, followed by one line per token whose "lemma" is
+    simply the position of the token in the text.
+
+    :return: Plain-text UTF-8 response with status 200, readable from any origin
+    """
+    r = Response(
+        "\n".join(
+            ["token\tlemma"] +
+            # A missing ``data`` field counts as empty text instead of raising AttributeError on None
+            ["\t".join([tok, f"{idx}"]) for idx, tok in enumerate(request.form.get("data", "").split())]
+        ),
+        200,
+        headers={
+            'Content-Type': 'text/plain; charset=utf-8',
+            'Access-Control-Allow-Origin': "*"
+        }
+    )
+    return r
+
+
+if __name__ == "__main__":  # Only start the server when this file is run as a script
+    app.run(port="4567")  # Same port as the dummy lemmatizer address of the test configuration
diff --git a/tests/test_selenium/base.py b/tests/test_selenium/base.py
@@ -731,7 +731,6 @@ def get_field(row, f):
             self.driver_find_element_by_class_name("pagination"),
             "a"
         )
-        self.driver.save_screenshot("first.results.png")
         for page_index in range(0, len(pagination)):
             # self.driver_find_element_by_class_name("pagination").find_elements_by_tag_name("a")[page_index].click()
             self.element_find_elements_by_tag_name(

diff --git a/tests/test_selenium/test_bookmark.py b/tests/test_selenium/test_bookmark.py
@@ -9,7 +9,6 @@ class TestBookmark(TestBase):
 
     def set_bookmark(self, tok_id, page=None):
         self.driver.get(self.url_for_with_port("main.tokens_correct", corpus_id="1", page=page))
-        self.driver.save_screenshot("token.png")
         self.driver_find_element_by_id("dd_t"+str(tok_id)).click()
         self.driver.implicitly_wait(2)
         dd = self.driver_find_element_by_css_selector("*[aria-labelledby='dd_t{}']".format(tok_id))

diff --git a/tests/test_selenium/test_control_lists.py b/tests/test_selenium/test_control_lists.py
@@ -147,10 +147,8 @@ def test_action_as_owner(self):
         # Check that we can edit information about markdown or whatever
         self.driver.get(self.url_for_with_port("control_lists_bp.information_edit", control_list_id=1))
         self.driver_find_element_by_id("cl_notes").send_keys("# This is some notes")
-        self.driver.save_screenshot("HELLOOOO1.png")
         self.driver_find_element_by_id("submit").click()
         self.driver.get(self.url_for_with_port("control_lists_bp.information_read", control_list_id=1))
-        self.driver.save_screenshot("HELLOOOO2.png")
         self.assertEqual(
             self.driver_find_element_by_tag_name("h1").text, "This is some notes",
             "Check that edition works"

diff --git a/tests/test_selenium/test_corpus_init.py b/tests/test_selenium/test_corpus_init.py
@@ -3,9 +3,14 @@
 from app import db
 from tests.test_selenium.base import TestBase
 from tests.fixtures import PLAINTEXT_CORPORA
+from selenium.webdriver.support.select import Select
 import csv
 import os
 
+from flask import Flask, request, Response
+from multiprocessing import Process
+from selenium.webdriver.support.wait import WebDriverWait
+
 
 class TestCorpusRegistration(TestBase):
     """ Test creation of Corpus """
@@ -461,14 +466,19 @@ def test_registration_with_wrongly_formated_input(self):
 	dieu	NOMpro	NOMB.=s|GENRE=m|CAS=n
 vos	vos1	PROper	PERS.=2|NOMB.=p|GENRE=m|CAS=r
 soit	estre1	VERcjg	MODE=sub|TEMPS=pst|PERS.=3|NOMB.=s""")
+
         self.driver_find_element_by_id("label_checkbox_reuse").click()
         self.driver_find_element_by_id("control_list_select").click()
         self.driver_find_element_by_id("cl_opt_" + str(target_cl.id)).click()
 
         self.driver_find_element_by_id("submit").click()
 
         self.assertEqual(
-            sorted([e.text.strip() for e in self.driver_find_elements_by_css_selector(".alert.alert-danger")]),
+            sorted([
+                e.text.strip()
+                for e in self.driver_find_elements_by_css_selector(".alert.alert-danger")
+                if e.text.strip()
+            ]),
             sorted([
                 'At least one line of your corpus is missing a token/form. Check line 1'
             ]),
@@ -493,9 +503,12 @@ def test_registration_with_no_tsv_input(self):
         self.driver_find_element_by_id("cl_opt_" + str(target_cl.id)).click()
 
         self.driver_find_element_by_id("submit").click()
-
         self.assertEqual(
-            sorted([e.text.strip() for e in self.driver_find_elements_by_css_selector(".alert.alert-danger")]),
+            sorted([
+                e.text.strip()
+                for e in self.driver_find_elements_by_css_selector(".alert.alert-danger")
+                if e.text.strip()
+            ]),
             sorted([
                 'You did not input any text.'
             ]),
@@ -622,3 +635,87 @@ def test_corpus_name_unique_user(self):
         self.driver_find_element_by_id("submit").click()
         self.driver.implicitly_wait(5)
         self.assertFalse(self.driver_find_elements_by_css_selector(".alert.alert-danger"))
+
+    def test_lemmatization_service(self):
+        """
+        Test that the details of a lemmatization service are only displayed while it is selected
+        """
+        # Open the new corpus page
+        self.driver_find_element_by_id("new_corpus_link").click()
+        self.driver.implicitly_wait(15)
+
+        # Fill in the corpus form
+        self.driver_find_element_by_id("corpusName").send_keys(PLAINTEXT_CORPORA["Wauchier"]["name"])
+        self.writeMultiline(self.driver_find_element_by_id("tokens"), PLAINTEXT_CORPORA["Wauchier"]["data"])
+
+        # Default
+        details = self.driver_find_elements_by_css_selector(".lemmatizer-details")
+        self.assertEqual(details[0].is_displayed(), False, "Nothing should be displayed by default")
+
+        # Select a lemmatizer
+        s = Select(self.driver_find_elements_by_css_selector("#language-model")[0])
+        s.select_by_visible_text("Dummy lemmatizer")
+        self.assertEqual(details[0].is_displayed(), True, "Something should be displayed")
+        self.assertIn("Dummy lemmatizer is a lemmatization service provided by ProviderInstitution.", details[0].text)
+
+        # Unselect
+        s.select_by_visible_text("Select a service")
+        self.assertEqual(details[0].is_displayed(), False, "Nothing should be now")
+
+    def test_lemmatization_service_runs(self):
+        """
+        Test that running a lemmatization service replaces the tokens field with the service response
+        """
+        # Create a second fixture app
+        app = Flask("fixture")
+
+        @app.route("/lemma", methods=["POST"])
+        def lemmatizing():
+            return Response(
+                "\n".join(
+                    ["token\tlemma"] +
+                    ["\t".join([tok, f"{idx}"]) for idx, tok in enumerate(request.form.get("data").split())]
+                ),
+                200,
+                headers={
+                    'Content-Type': 'text/plain; charset=utf-8',
+                    'Access-Control-Allow-Origin': "*"
+                }
+            )
+        # Start it
+        second_app = Process(target=app.run, daemon=True, kwargs=dict(host="localhost", port="4567"))
+        second_app.start()
+
+        try:
+            self.driver_find_element_by_id("new_corpus_link").click()
+            self.driver.implicitly_wait(15)
+
+            # Fill in registration form
+            self.driver_find_element_by_id("corpusName").send_keys("Test")
+            self.writeMultiline(self.driver_find_element_by_id("tokens"), "Je suis")
+
+            # Select the lemmatizer
+            s = Select(self.driver_find_elements_by_css_selector("#language-model")[0])
+            s.select_by_index(1)
+            self.driver.implicitly_wait(5)
+
+            # Lemmatize
+            self.driver_find_element_by_id("submit-model").click()
+
+            # Wait
+            wait = WebDriverWait(self.driver, timeout=5)
+            wait.until(lambda x: self.driver_find_element_by_id("tokens-success").is_displayed())
+
+            # Check results
+            self.assertEqual(
+                self.driver_find_element_by_id("tokens").get_property("value").split("\n"),
+                ["token\tlemma", "Je\t0", "suis\t1"]
+            )
+
+            self.assertEqual(
+                self.driver_find_element_by_id("tokens-success").text,
+                "Operation finished with success ! 2 tokens analyzed in total.",
+                "Lemmatization happened"
+            )
+        finally:
+            # Kill second app, even when a step above failed: otherwise the fixture
+            # server would keep running after a failing test
+            second_app.terminate()
+            second_app.join(2)
+            if second_app.is_alive():
+                # close() raises ValueError on a process that is still running,
+                # so force the stop when terminate() was not enough
+                second_app.kill()
+                second_app.join()
+            second_app.close()
diff --git a/tests/test_selenium/test_export.py b/tests/test_selenium/test_export.py
@@ -102,10 +102,8 @@ def test_tei_geste_export(self):
         self.driver.refresh()
 
         self.driver.get(self.url_for_with_port("main.tokens_export", corpus_id=1))
-        self.driver.get_screenshot_as_file("./beforedownload.png")
         self.driver_find_element_by_id("geste-tei").click()
 
-        self.driver.get_screenshot_as_file("./afterdownload.png")
         time.sleep(5)
 
         with open(os.path.join(self.download_path, "wauchier.xml")) as f: