Skip to content

Commit

Permalink
Add more details about the lemmatizer at the new corpus phase. (#317)
Browse files Browse the repository at this point in the history
* Add more details about the lemmatizer at the new corpus phase.
* Add test for lemmatization services
* Add more verbosity because tests hang
* Remove various screenshots
  • Loading branch information
PonteIneptique authored Mar 20, 2024
1 parent 63661c3 commit f41d822
Show file tree
Hide file tree
Showing 12 changed files with 832 additions and 256 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ jobs:
run: |
python -m pip install --upgrade pip
pip install -r requirements.txt
pip install pytest pytest-cov coveralls flake8
pip install pytest pytest-cov coveralls flake8 pytest-sugar
- name: Set up TEST_DATABASE_URL
run: |
echo "TEST_DATABASE_URL=postgresql://pyrrha:pyrrha@localhost:5432/data-test" >> $GITHUB_ENV
Expand Down
2 changes: 1 addition & 1 deletion app/__init__.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
import os
from flask import Flask, g

from config import config
from flask_compress import Compress
from flask_login import LoginManager
from flask_mail import Mail
Expand Down Expand Up @@ -29,6 +28,7 @@

def create_app(config_name="dev"):
""" Create the application """
from config import config

app = Flask(
__name__,
Expand Down
12 changes: 12 additions & 0 deletions app/lemmatizers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
from typing import Optional
from dataclasses import dataclass


@dataclass
class LemmatizerService:
    """Describe one lemmatization service that can be listed in the configuration.

    Only ``ui`` has a default, so the five other fields are required when a
    service is declared.
    """
    title: str  # Human-readable name of the service, e.g. "Old French"
    uri: str  # Current address of the service
    provider: str  # Who provides the service, e.g. Deucalion at Ecole nationale des Chartes
    bibtex: str  # Citation for the service, as a BibTeX entry
    apa: str  # The same citation in APA format
    ui: Optional[str] = None  # Optional address of the provider's own user interface
40 changes: 34 additions & 6 deletions app/templates/main/corpus_new.html
Original file line number Diff line number Diff line change
Expand Up @@ -72,10 +72,28 @@ <h1>{{ _('Create a new corpus') }}</h1>
<div class="col-md-9">
<div class="input-group">
<select class="form-control" id="language-model">
{%- for lang, address in lemmatizers %}
<option value="{{ address }}">{{ lang }}</option>
<option selected value="disabled" class="text-muted disabled">{{_('Select a service')}}</option>
{%- for service in lemmatizers %}
<option value="{{ service.uri }}">{{ service.title }}</option>
{% endfor -%}
</select>
{%- for service in lemmatizers %}
<div class="alert alert-info lemmatizer-details" data-uri="{{service.uri}}" style="display: none;">
{% trans name=service.title, provider=service.provider %}
<em>{{ name }}</em> is a lemmatization service provided by <i>{{ provider }}</i>.<br />
{% endtrans %}
{% if service.ui %}
<em>{{_("If you only want to lemmatize (and not correct data), you can use their user interface here:")}}</em>
<a href="{{service.ui}}">{{service.ui}}</a><br/>
{% endif %}
{{ _("Please cite tools accordingly. You can use the following APA or Bibtex: ")}}
<ul>
<li><em>APA:</em> <code>{{service.apa }}</code></li>
<li><em>BIB:</em> <code style="padding-left: 1em; display: block;">{{service.bibtex}}</code></li>
</ul>
</div>
{% endfor %}

<div class="input-group-append">
<button id="submit-model" class="btn btn-outline-secondary" type="button">{{ _('Lemmatize') }}</button>
</div>
Expand Down Expand Up @@ -200,7 +218,14 @@ <h1>{{ _('Create a new corpus') }}</h1>
<script type="text/javascript">

$(document).ready(function() {

$("#language-model").on("change", function (event) {
let uri = $("#language-model").val();
document.querySelectorAll(".lemmatizer-details").forEach(el => el.style.display = "none");
if (uri === "disabled") {
return;
}
document.querySelectorAll(`.lemmatizer-details[data-uri='${uri}']`).forEach(el => el.style.display = "block");
});
$("#submit-model").on("click", function(event) {
event.preventDefault();
// Get the parameters
Expand All @@ -213,11 +238,12 @@ <h1>{{ _('Create a new corpus') }}</h1>
tokens_fail = document.getElementById("tokens-fail"),
approximate_count = document.getElementById("tokens-approximate");

if (uri === "disabled") { return; }

// Create the form
var formData = new FormData();
formData.append("data", text_data.value);


// Set up our HTTP request
var xhr = new XMLHttpRequest();
var update = function(xhr) {
Expand All @@ -228,14 +254,16 @@ <h1>{{ _('Create a new corpus') }}</h1>
};
// Set up our listener to process completed requests
xhr.onreadystatechange = function () {

// Only run if the request is complete
if (xhr.readyState < 3 || xhr.readyState > 4) return null;
if (typeof xhr.status !== "number") return null;

if (xhr.readyState < 3) return null;

if (xhr.readyState === 3) {
update(xhr);
return null;
}
if (xhr.status === 0) return null;

// Process our return data
if (xhr.status >= 200 && xhr.status < 300) {
Expand Down
22 changes: 20 additions & 2 deletions config.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import os

from app.lemmatizers import LemmatizerService
from typing import List

basedir = os.path.abspath(os.path.dirname(__file__))

Expand Down Expand Up @@ -38,7 +39,7 @@ class Config:
PAGINATION_DEFAULT_TOKENS = 100

# Lemmatizer (until Deucalion client)
LEMMATIZERS = []
LEMMATIZERS: List[LemmatizerService] = []

# Change automatically the Postgresql instance language if not english
FORCE_PSQL_EN_LOCALE = True
Expand Down Expand Up @@ -101,6 +102,23 @@ class BaseTestConfig(Config):
EMAIL_SUBJECT_PREFIX = '[{}]'.format(Config.APP_NAME)
EMAIL_SENDER = '{app_name} Admin <{email}>'.format(app_name=Config.APP_NAME, email=MAIL_USERNAME)

LEMMATIZERS = [
LemmatizerService(
"Dummy lemmatizer",
"http://localhost:4567/lemma",
provider="ProviderInstitution",
ui="someui.com",
apa="Clérice et al. 2019",
bibtex="""@article{camps2021corpus,
title = {Corpus and Models for Lemmatisation and POS-tagging of Old French},
author = {Camps, Jean-Baptiste and Cl{\'e}rice, Thibault and Duval, Fr{\'e}d{\'e}ric and Kanaoka, Naomi and Pinche, Ariane and others},
year = 2021,
journal = {arXiv preprint arXiv:2109.11442},
keywords = {Old French}
}"""
)
]


class SQLiteTestConfig(BaseTestConfig):
SQLALCHEMY_DATABASE_URI = os.environ.get('TEST_DATABASE_URL') or \
Expand Down
22 changes: 22 additions & 0 deletions dummy_lemmatizer_service.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
from flask import Flask, request, Response, stream_with_context
app = Flask("fixture")


@app.route("/lemma", methods=["POST"])
def lemmatizing():
    """Fake lemmatization endpoint used as a test fixture.

    Reads the ``data`` form field, splits it on whitespace and answers with a
    tab-separated document: a ``token<TAB>lemma`` header followed by one line
    per token, where the "lemma" is simply the position of the token.

    :return: A plain-text UTF-8 response that any origin is allowed to read.
    """
    # A request without a "data" field yields only the header line instead of
    # failing on ``None.split()`` and answering with a 500.
    tokens = request.form.get("data", "").split()

    rows = ["token\tlemma"]
    rows.extend(f"{tok}\t{idx}" for idx, tok in enumerate(tokens))

    return Response(
        "\n".join(rows),
        200,
        headers={
            'Content-Type': 'text/plain; charset=utf-8',
            # The service is called from the browser on another origin (port)
            'Access-Control-Allow-Origin': "*"
        }
    )


if __name__ == "__main__":
    # Flask documents ``port`` as an integer, so pass one rather than a string.
    app.run(port=4567)
1 change: 0 additions & 1 deletion tests/test_selenium/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -731,7 +731,6 @@ def get_field(row, f):
self.driver_find_element_by_class_name("pagination"),
"a"
)
self.driver.save_screenshot("first.results.png")
for page_index in range(0, len(pagination)):
# self.driver_find_element_by_class_name("pagination").find_elements_by_tag_name("a")[page_index].click()
self.element_find_elements_by_tag_name(
Expand Down
1 change: 0 additions & 1 deletion tests/test_selenium/test_bookmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@ class TestBookmark(TestBase):

def set_bookmark(self, tok_id, page=None):
self.driver.get(self.url_for_with_port("main.tokens_correct", corpus_id="1", page=page))
self.driver.save_screenshot("token.png")
self.driver_find_element_by_id("dd_t"+str(tok_id)).click()
self.driver.implicitly_wait(2)
dd = self.driver_find_element_by_css_selector("*[aria-labelledby='dd_t{}']".format(tok_id))
Expand Down
2 changes: 0 additions & 2 deletions tests/test_selenium/test_control_lists.py
Original file line number Diff line number Diff line change
Expand Up @@ -147,10 +147,8 @@ def test_action_as_owner(self):
# Check that we can edit information about markdown or whatever
self.driver.get(self.url_for_with_port("control_lists_bp.information_edit", control_list_id=1))
self.driver_find_element_by_id("cl_notes").send_keys("# This is some notes")
self.driver.save_screenshot("HELLOOOO1.png")
self.driver_find_element_by_id("submit").click()
self.driver.get(self.url_for_with_port("control_lists_bp.information_read", control_list_id=1))
self.driver.save_screenshot("HELLOOOO2.png")
self.assertEqual(
self.driver_find_element_by_tag_name("h1").text, "This is some notes",
"Check that edition works"
Expand Down
103 changes: 100 additions & 3 deletions tests/test_selenium/test_corpus_init.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,14 @@
from app import db
from tests.test_selenium.base import TestBase
from tests.fixtures import PLAINTEXT_CORPORA
from selenium.webdriver.support.select import Select
import csv
import os

from flask import Flask, request, Response
from multiprocessing import Process
from selenium.webdriver.support.wait import WebDriverWait


class TestCorpusRegistration(TestBase):
""" Test creation of Corpus """
Expand Down Expand Up @@ -461,14 +466,19 @@ def test_registration_with_wrongly_formated_input(self):
dieu NOMpro NOMB.=s|GENRE=m|CAS=n
vos vos1 PROper PERS.=2|NOMB.=p|GENRE=m|CAS=r
soit estre1 VERcjg MODE=sub|TEMPS=pst|PERS.=3|NOMB.=s""")

self.driver_find_element_by_id("label_checkbox_reuse").click()
self.driver_find_element_by_id("control_list_select").click()
self.driver_find_element_by_id("cl_opt_" + str(target_cl.id)).click()

self.driver_find_element_by_id("submit").click()

self.assertEqual(
sorted([e.text.strip() for e in self.driver_find_elements_by_css_selector(".alert.alert-danger")]),
sorted([
e.text.strip()
for e in self.driver_find_elements_by_css_selector(".alert.alert-danger")
if e.text.strip()
]),
sorted([
'At least one line of your corpus is missing a token/form. Check line 1'
]),
Expand All @@ -493,9 +503,12 @@ def test_registration_with_no_tsv_input(self):
self.driver_find_element_by_id("cl_opt_" + str(target_cl.id)).click()

self.driver_find_element_by_id("submit").click()

self.assertEqual(
sorted([e.text.strip() for e in self.driver_find_elements_by_css_selector(".alert.alert-danger")]),
sorted([
e.text.strip()
for e in self.driver_find_elements_by_css_selector(".alert.alert-danger")
if e.text.strip()
]),
sorted([
'You did not input any text.'
]),
Expand Down Expand Up @@ -622,3 +635,87 @@ def test_corpus_name_unique_user(self):
self.driver_find_element_by_id("submit").click()
self.driver.implicitly_wait(5)
self.assertFalse(self.driver_find_elements_by_css_selector(".alert.alert-danger"))

def test_lemmatization_service(self):
    """
    Test that the details of a lemmatization service are hidden by default, shown when the
    service is selected, and hidden again when the placeholder option is selected back
    """
    # Open the new corpus page
    self.driver_find_element_by_id("new_corpus_link").click()
    self.driver.implicitly_wait(15)

    # Fill in the corpus form
    self.driver_find_element_by_id("corpusName").send_keys(PLAINTEXT_CORPORA["Wauchier"]["name"])
    self.writeMultiline(self.driver_find_element_by_id("tokens"), PLAINTEXT_CORPORA["Wauchier"]["data"])

    # Default: the placeholder option is selected, no details are visible
    details = self.driver_find_elements_by_css_selector(".lemmatizer-details")
    self.assertFalse(details[0].is_displayed(), "Nothing should be displayed by default")

    # Select a lemmatizer: its details block becomes visible
    s = Select(self.driver_find_elements_by_css_selector("#language-model")[0])
    s.select_by_visible_text("Dummy lemmatizer")
    self.assertTrue(details[0].is_displayed(), "Something should be displayed")
    self.assertIn("Dummy lemmatizer is a lemmatization service provided by ProviderInstitution.", details[0].text)

    # Unselect: going back to the placeholder hides the details again
    s.select_by_visible_text("Select a service")
    self.assertFalse(details[0].is_displayed(), "Nothing should be now")

def test_lemmatization_service_runs(self):
    """
    Test that a text can be lemmatized from the new corpus page through a lemmatization service

    A throw-away Flask application is started in a separate process on localhost:4567 and is
    always stopped at the end of the test, whether the checks pass or not.
    """
    # Create a second fixture app
    app = Flask("fixture")

    @app.route("/lemma", methods=["POST"])
    def lemmatizing():
        # Answer with a header line and one "token<TAB>index" line per token
        tokens = request.form.get("data", "").split()
        return Response(
            "\n".join(
                ["token\tlemma"] +
                ["\t".join([tok, f"{idx}"]) for idx, tok in enumerate(tokens)]
            ),
            200,
            headers={
                'Content-Type': 'text/plain; charset=utf-8',
                'Access-Control-Allow-Origin': "*"
            }
        )

    # Start it
    second_app = Process(target=app.run, daemon=True, kwargs=dict(host="localhost", port=4567))
    second_app.start()

    # Everything below runs under try/finally so that a timeout or a failed assertion
    # does not leave the fixture process alive and bound to its port.
    try:
        self.driver_find_element_by_id("new_corpus_link").click()
        self.driver.implicitly_wait(15)

        # Fill in registration form
        self.driver_find_element_by_id("corpusName").send_keys("Test")
        self.writeMultiline(self.driver_find_element_by_id("tokens"), "Je suis")

        # Select the lemmatizer (index 0 is the "Select a service" placeholder)
        s = Select(self.driver_find_elements_by_css_selector("#language-model")[0])
        s.select_by_index(1)
        self.driver.implicitly_wait(5)

        # Lemmatize
        self.driver_find_element_by_id("submit-model").click()

        # Wait
        wait = WebDriverWait(self.driver, timeout=5)
        wait.until(lambda x: self.driver_find_element_by_id("tokens-success").is_displayed())

        # Check results
        self.assertEqual(
            self.driver_find_element_by_id("tokens").get_property("value").split("\n"),
            ["token\tlemma", "Je\t0", "suis\t1"]
        )

        self.assertEqual(
            self.driver_find_element_by_id("tokens-success").text,
            "Operation finished with success ! 2 tokens analyzed in total.",
            "Lemmatization happened"
        )
    finally:
        # Kill second app
        second_app.terminate()
        second_app.join(2)
        if second_app.is_alive():
            # close() raises ValueError on a running process: force the stop first
            second_app.kill()
            second_app.join()
        second_app.close()
2 changes: 0 additions & 2 deletions tests/test_selenium/test_export.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,10 +102,8 @@ def test_tei_geste_export(self):
self.driver.refresh()

self.driver.get(self.url_for_with_port("main.tokens_export", corpus_id=1))
self.driver.get_screenshot_as_file("./beforedownload.png")
self.driver_find_element_by_id("geste-tei").click()

self.driver.get_screenshot_as_file("./afterdownload.png")
time.sleep(5)

with open(os.path.join(self.download_path, "wauchier.xml")) as f:
Expand Down
Loading

0 comments on commit f41d822

Please sign in to comment.