add Latin lemmatization evaluation results
konstantinschulz committed Aug 21, 2024
1 parent 88ff455 commit 60d1321
Showing 5 changed files with 50 additions and 13 deletions.
README.md: 4 additions & 2 deletions
@@ -2,6 +2,8 @@
 **S**ystematic **E**valuation **F**ramework for NLP models and datasets in **L**atin and **A**ncient **G**reek
 
 ## Evaluation Results
-### Lemmatization
-#### greCy on UD test data
+### Lemmatization on UD test data
+#### Ancient Greek: greCy
 {'accuracy': 0.8942049121548943}
+#### Latin: LatinCy
+{'accuracy': 0.8843245653143111}
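
For context, these figures are plain token-level accuracies: the share of test tokens whose predicted lemma matches the gold lemma. A minimal sketch of such a computation with the `evaluate` library pinned in requirements.txt (the repo's own metrics.accuracy helper may wrap it differently):

import evaluate

# Accuracy over integer-encoded lemma labels: 2 of 3 tokens match here.
metric = evaluate.load("accuracy")
print(metric.compute(predictions=[0, 1, 2], references=[0, 1, 3]))
# {'accuracy': 0.6666666666666666}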
config.py: 1 addition & 0 deletions
@@ -3,3 +3,4 @@

 class Config:
     data_dir: str = os.path.abspath("data")
+    lemmatization_dir: str = os.path.join(data_dir, "lemmatization_test")
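
A hypothetical usage sketch for the new setting (the greek/latin subdirectory layout is taken from the commented-out calls at the bottom of lemma.py):

import os

from config import Config

# Language-specific UD test sets live under data/lemmatization_test/<language>/.
greek_dir: str = os.path.join(Config.lemmatization_dir, "greek")
latin_dir: str = os.path.join(Config.lemmatization_dir, "latin")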
lemma.py: 27 additions & 10 deletions
@@ -5,13 +5,14 @@
 import spacy
 from conllu import SentenceList
 from dotenv import load_dotenv
-from spacy import Language
 from spacy.tokens import Doc
 from tqdm import tqdm
 import xml.etree.ElementTree as ET
 from config import Config
 from metrics import accuracy
 
+from models import Models
+
 # need this for greCy to work properly
 Doc.set_extension("trf_data", default=None)
 beta_to_uni: dict[str, str] = dict()
@@ -27,6 +28,26 @@ def convert_labels(lemmata_predicted: list[str], lemmata_true: list[str]) -> tuple[list[int], list[int]]:
     return predictions_int, references_int
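
The body of convert_labels is collapsed in this diff. Conceptually it maps each distinct lemma string to an integer ID shared by predictions and references, which is the input format the accuracy metric expects. A hypothetical sketch of such a conversion (names are illustrative, not taken from the repo):

def convert_labels_sketch(predicted: list[str], true: list[str]) -> tuple[list[int], list[int]]:
    # Assign every distinct lemma a stable integer ID on first sight.
    vocab: dict[str, int] = {}

    def to_int(lemma: str) -> int:
        return vocab.setdefault(lemma, len(vocab))

    return [to_int(x) for x in predicted], [to_int(x) for x in true]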


+def lemmatize_greek(tokens: list[str]) -> list[str]:
+    """ Lemmatizes Ancient Greek tokens using spaCy. """
+    if not Models.lemmatizer_greek:
+        Models.lemmatizer_greek = spacy.load(
+            "grc_proiel_trf",  # grc_proiel_trf grc_odycy_joint_trf
+            exclude=["morphologizer", "parser", "tagger", "transformer"],
+        )
+    doc: Doc = Models.lemmatizer_greek(Doc(vocab=Models.lemmatizer_greek.vocab, words=tokens))
+    return [x.lemma_ for x in doc]
+
+
+def lemmatize_latin(tokens: list[str]) -> list[str]:
+    """Lemmatizes Latin tokens using spaCy."""
+    if not Models.lemmatizer_latin:
+        Models.lemmatizer_latin = spacy.load(
+            'la_core_web_lg', exclude=["morphologizer", "parser", "tagger", "ner"])
+    doc: Doc = Models.lemmatizer_latin(Doc(vocab=Models.lemmatizer_latin.vocab, words=tokens))
+    return [x.lemma_ for x in doc]
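
Worth noting for both helpers: building Doc(vocab=..., words=tokens) from the gold UD tokens and passing it to the pipeline skips spaCy's tokenizer, so predictions stay aligned one-to-one with the reference lemmata. A usage sketch, assuming both pinned models are installed (the example tokens are illustrative):

print(lemmatize_latin(["arma", "virumque", "cano"]))
print(lemmatize_greek(["μῆνιν", "ἄειδε", "θεά"]))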


 def morpheus(text: str) -> list[str]:
     """ Runs Morpheus and uses it to lemmatize a given word form. """
     if text not in uni_to_beta:
@@ -50,28 +71,24 @@ def morpheus(text: str) -> list[str]:
     return [beta_to_uni[x] for x in lemmata]
 
 
-def run_evaluation():
-    data_dir: str = os.path.join(Config.data_dir, 'lemmatization_test')
+def run_evaluation(lemmatization_fn: callable, data_dir: str):
     """ Performs evaluation of a lemmatization model for the given dataset. """
     sl: SentenceList = SentenceList()
     for file in [x for x in os.listdir(data_dir) if x.endswith(".conllu")]:
         file_path: str = os.path.join(data_dir, file)
         with open(file_path, 'r') as f:
             new_sl: SentenceList = conllu.parse(f.read())
             sl += new_sl
-    nlp: Language = spacy.load(
-        "grc_proiel_trf",  # grc_proiel_trf grc_odycy_joint_trf
-        exclude=["morphologizer", "parser", "tagger", "transformer"],
-    )
     lemmata_predicted: list[str] = []
     lemmata_true: list[str] = []
     for sent in tqdm(sl):
         words: list[str] = [tok["form"] for tok in sent]
         new_lemmata_true: list[str] = [tok["lemma"] for tok in sent]
         lemmata_true += new_lemmata_true
-        doc: Doc = nlp(Doc(vocab=nlp.vocab, words=words))
-        lemmata_predicted += [x.lemma_ for x in doc]
+        lemmata_predicted += lemmatization_fn(words)
     predictions_int, references_int = convert_labels(lemmata_predicted, lemmata_true)
     accuracy(predictions_int, references_int)
 
 
-# run_evaluation()
+# run_evaluation(lemmatize_greek, os.path.join(Config.lemmatization_dir, "greek"))
+# run_evaluation(lemmatize_latin, os.path.join(Config.lemmatization_dir, "latin"))
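
Putting the pieces together, a driver sketch that mirrors the two commented-out calls above (left commented in the commit, presumably so that importing lemma.py does not trigger both model loads):

import os

from config import Config
from lemma import lemmatize_greek, lemmatize_latin, run_evaluation

# Each call streams the .conllu test files, lemmatizes them sentence by
# sentence, and reports token-level accuracy.
run_evaluation(lemmatize_greek, os.path.join(Config.lemmatization_dir, "greek"))
run_evaluation(lemmatize_latin, os.path.join(Config.lemmatization_dir, "latin"))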
models.py: 6 additions & 0 deletions
@@ -0,0 +1,6 @@
+from spacy import Language
+
+
+class Models:
+    lemmatizer_greek: Language = None
+    lemmatizer_latin: Language = None
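
Models acts as a process-wide cache: each spaCy pipeline is loaded at most once and then reused by lemmatize_greek and lemmatize_latin. One optional refinement, purely a sketch: annotating the unset slots as Optional keeps static type checkers happy, since None is not a Language:

from typing import Optional

from spacy import Language


class Models:
    # Populated lazily by the lemmatize_* helpers on first use.
    lemmatizer_greek: Optional[Language] = None
    lemmatizer_latin: Optional[Language] = None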
requirements.txt: 12 additions & 1 deletion
@@ -14,6 +14,7 @@ beautifulsoup4==4.12.3
 betacode==1.0
 bleach==6.1.0
 blis==0.7.11
+boltons==21.0.0
 boto3==1.34.69
 botocore==1.34.69
 bpemb==0.3.5
@@ -23,6 +24,7 @@ cffi==1.16.0
 charset-normalizer==3.3.2
 click==8.1.7
 cloudpathlib==0.16.0
+cltk==1.3.0
 comm==0.2.2
 confection==0.1.4
 conllu==4.5.3
@@ -35,6 +37,7 @@ decorator==5.1.1
 defusedxml==0.7.1
 Deprecated==1.2.14
 dill==0.3.8
+emoji==2.12.1
 evaluate==0.4.1
 executing==2.0.1
 fastjsonschema==2.19.1
@@ -47,8 +50,11 @@ fsspec==2024.2.0
 ftfy==6.2.0
 gdown==5.1.0
 gensim==4.3.2
+gitdb==4.0.11
+GitPython==3.1.43
 grc_proiel_trf==3.7.5
 grecy==1.0
+greek-accentuation==1.2.0
 h11==0.14.0
 httpcore==1.0.4
 httpx==0.27.0
@@ -152,6 +158,7 @@ PyYAML==6.0.1
 pyzmq==25.1.2
 qtconsole==5.5.1
 QtPy==2.4.1
+rapidfuzz==3.9.6
 referencing==0.34.0
 regex==2023.12.25
 requests==2.31.0
@@ -171,23 +178,27 @@ seqeval==1.2.2
 setuptools==69.5.1
 six==1.16.0
 smart-open==6.4.0
+smmap==5.0.1
 sniffio==1.3.1
 soupsieve==2.5
-spacy==3.7.5
+spacy==3.7.6
 spacy-alignments==0.9.1
 spacy-legacy==3.0.12
 spacy-loggers==1.0.5
 spacy-transformers==1.3.5
 sqlitedict==2.1.0
 srsly==2.4.8
 stack-data==0.6.3
+stanza==1.8.2
+stringcase==1.2.0
 sympy==1.12
 tabulate==0.9.0
 terminado==0.18.1
 thinc==8.2.3
 threadpoolctl==3.4.0
 tinycss2==1.2.1
 tokenizers==0.15.2
+toml==0.10.2
 torch==2.2.1
 tornado==6.4
 tqdm==4.66.2
