add Latin lemmatization evaluation results
konstantinschulz committed Aug 21, 2024
1 parent 88ff455 commit 60d1321
Showing 5 changed files with 50 additions and 13 deletions.
README.md: 4 additions & 2 deletions
@@ -2,6 +2,8 @@
 **S**ystematic **E**valuation **F**ramework for NLP models and datasets in **L**atin and **A**ncient **G**reek
 
 ## Evaluation Results
-### Lemmatization
-#### greCy on UD test data
+### Lemmatization on UD test data
+#### Ancient Greek: greCy
 {'accuracy': 0.8942049121548943}
+#### Latin: LatinCy
+{'accuracy': 0.8843245653143111}
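
For context, these figures are plain token-level accuracies: the share of test tokens whose predicted lemma matches the gold lemma. A minimal sketch of such a computation with the `evaluate` library pinned in requirements.txt (the repo's own metrics.accuracy helper may wrap it differently):

import evaluate

# Accuracy over integer-encoded lemma labels: 2 of 3 tokens match here.
metric = evaluate.load("accuracy")
print(metric.compute(predictions=[0, 1, 2], references=[0, 1, 3]))
# {'accuracy': 0.6666666666666666}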
config.py: 1 addition & 0 deletions
@@ -3,3 +3,4 @@

 class Config:
     data_dir: str = os.path.abspath("data")
+    lemmatization_dir: str = os.path.join(data_dir, "lemmatization_test")
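
A hypothetical usage sketch for the new setting (the greek/latin subdirectory layout is taken from the commented-out calls at the bottom of lemma.py):

import os

from config import Config

# Language-specific UD test sets live under data/lemmatization_test/<language>/.
greek_dir: str = os.path.join(Config.lemmatization_dir, "greek")
latin_dir: str = os.path.join(Config.lemmatization_dir, "latin")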
lemma.py: 27 additions & 10 deletions
@@ -5,13 +5,14 @@
 import spacy
 from conllu import SentenceList
 from dotenv import load_dotenv
-from spacy import Language
 from spacy.tokens import Doc
 from tqdm import tqdm
 import xml.etree.ElementTree as ET
 from config import Config
 from metrics import accuracy
 
+from models import Models
+
 # need this for greCy to work properly
 Doc.set_extension("trf_data", default=None)
 beta_to_uni: dict[str, str] = dict()
@@ -27,6 +28,26 @@ def convert_labels(lemmata_predicted: list[str], lemmata_true: list[str]) -> tuple[list[int], list[int]]:
     return predictions_int, references_int
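
The body of convert_labels is collapsed in this diff. Conceptually it maps each distinct lemma string to an integer ID shared by predictions and references, which is the input format the accuracy metric expects. A hypothetical sketch of such a conversion (names are illustrative, not taken from the repo):

def convert_labels_sketch(predicted: list[str], true: list[str]) -> tuple[list[int], list[int]]:
    # Assign every distinct lemma a stable integer ID on first sight.
    vocab: dict[str, int] = {}

    def to_int(lemma: str) -> int:
        return vocab.setdefault(lemma, len(vocab))

    return [to_int(x) for x in predicted], [to_int(x) for x in true]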


+def lemmatize_greek(tokens: list[str]) -> list[str]:
+    """ Lemmatizes Ancient Greek tokens using spaCy. """
+    if not Models.lemmatizer_greek:
+        Models.lemmatizer_greek = spacy.load(
+            "grc_proiel_trf",  # grc_proiel_trf grc_odycy_joint_trf
+            exclude=["morphologizer", "parser", "tagger", "transformer"],
+        )
+    doc: Doc = Models.lemmatizer_greek(Doc(vocab=Models.lemmatizer_greek.vocab, words=tokens))
+    return [x.lemma_ for x in doc]
+
+
+def lemmatize_latin(tokens: list[str]) -> list[str]:
+    """Lemmatizes Latin tokens using spaCy."""
+    if not Models.lemmatizer_latin:
+        Models.lemmatizer_latin = spacy.load(
+            'la_core_web_lg', exclude=["morphologizer", "parser", "tagger", "ner"])
+    doc: Doc = Models.lemmatizer_latin(Doc(vocab=Models.lemmatizer_latin.vocab, words=tokens))
+    return [x.lemma_ for x in doc]
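
Worth noting for both helpers: building Doc(vocab=..., words=tokens) from the gold UD tokens and passing it to the pipeline skips spaCy's tokenizer, so predictions stay aligned one-to-one with the reference lemmata. A usage sketch, assuming both pinned models are installed (the example tokens are illustrative):

print(lemmatize_latin(["arma", "virumque", "cano"]))
print(lemmatize_greek(["μῆνιν", "ἄειδε", "θεά"]))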


 def morpheus(text: str) -> list[str]:
     """ Runs Morpheus and uses it to lemmatize a given word form. """
     if text not in uni_to_beta:
@@ -50,28 +71,24 @@ def morpheus(text: str) -> list[str]:
     return [beta_to_uni[x] for x in lemmata]
 
 
-def run_evaluation():
-    data_dir: str = os.path.join(Config.data_dir, 'lemmatization_test')
+def run_evaluation(lemmatization_fn: callable, data_dir: str):
     """ Performs evaluation of a lemmatization model for the given dataset. """
     sl: SentenceList = SentenceList()
     for file in [x for x in os.listdir(data_dir) if x.endswith(".conllu")]:
         file_path: str = os.path.join(data_dir, file)
         with open(file_path, 'r') as f:
             new_sl: SentenceList = conllu.parse(f.read())
             sl += new_sl
-    nlp: Language = spacy.load(
-        "grc_proiel_trf",  # grc_proiel_trf grc_odycy_joint_trf
-        exclude=["morphologizer", "parser", "tagger", "transformer"],
-    )
     lemmata_predicted: list[str] = []
     lemmata_true: list[str] = []
     for sent in tqdm(sl):
         words: list[str] = [tok["form"] for tok in sent]
         new_lemmata_true: list[str] = [tok["lemma"] for tok in sent]
         lemmata_true += new_lemmata_true
-        doc: Doc = nlp(Doc(vocab=nlp.vocab, words=words))
-        lemmata_predicted += [x.lemma_ for x in doc]
+        lemmata_predicted += lemmatization_fn(words)
     predictions_int, references_int = convert_labels(lemmata_predicted, lemmata_true)
     accuracy(predictions_int, references_int)
 
 
-# run_evaluation()
+# run_evaluation(lemmatize_greek, os.path.join(Config.lemmatization_dir, "greek"))
+# run_evaluation(lemmatize_latin, os.path.join(Config.lemmatization_dir, "latin"))
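
Putting the pieces together, a driver sketch that mirrors the two commented-out calls above (left commented in the commit, presumably so that importing lemma.py does not trigger both model loads):

import os

from config import Config
from lemma import lemmatize_greek, lemmatize_latin, run_evaluation

# Each call streams the .conllu test files, lemmatizes them sentence by
# sentence, and reports token-level accuracy.
run_evaluation(lemmatize_greek, os.path.join(Config.lemmatization_dir, "greek"))
run_evaluation(lemmatize_latin, os.path.join(Config.lemmatization_dir, "latin"))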
models.py: 6 additions & 0 deletions
@@ -0,0 +1,6 @@
+from spacy import Language
+
+
+class Models:
+    lemmatizer_greek: Language = None
+    lemmatizer_latin: Language = None
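
Models acts as a process-wide cache: each spaCy pipeline is loaded at most once and then reused by lemmatize_greek and lemmatize_latin. One optional refinement, purely a sketch: annotating the unset slots as Optional keeps static type checkers happy, since None is not a Language:

from typing import Optional

from spacy import Language


class Models:
    # Populated lazily by the lemmatize_* helpers on first use.
    lemmatizer_greek: Optional[Language] = None
    lemmatizer_latin: Optional[Language] = None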
requirements.txt: 12 additions & 1 deletion
@@ -14,6 +14,7 @@ beautifulsoup4==4.12.3
 betacode==1.0
 bleach==6.1.0
 blis==0.7.11
+boltons==21.0.0
 boto3==1.34.69
 botocore==1.34.69
 bpemb==0.3.5
@@ -23,6 +24,7 @@ cffi==1.16.0
 charset-normalizer==3.3.2
 click==8.1.7
 cloudpathlib==0.16.0
+cltk==1.3.0
 comm==0.2.2
 confection==0.1.4
 conllu==4.5.3
@@ -35,6 +37,7 @@ decorator==5.1.1
 defusedxml==0.7.1
 Deprecated==1.2.14
 dill==0.3.8
+emoji==2.12.1
 evaluate==0.4.1
 executing==2.0.1
 fastjsonschema==2.19.1
@@ -47,8 +50,11 @@ fsspec==2024.2.0
 ftfy==6.2.0
 gdown==5.1.0
 gensim==4.3.2
+gitdb==4.0.11
+GitPython==3.1.43
 grc_proiel_trf==3.7.5
 grecy==1.0
+greek-accentuation==1.2.0
 h11==0.14.0
 httpcore==1.0.4
 httpx==0.27.0
@@ -152,6 +158,7 @@ PyYAML==6.0.1
 pyzmq==25.1.2
 qtconsole==5.5.1
 QtPy==2.4.1
+rapidfuzz==3.9.6
 referencing==0.34.0
 regex==2023.12.25
 requests==2.31.0
@@ -171,23 +178,27 @@ seqeval==1.2.2
 setuptools==69.5.1
 six==1.16.0
 smart-open==6.4.0
+smmap==5.0.1
 sniffio==1.3.1
 soupsieve==2.5
-spacy==3.7.5
+spacy==3.7.6
 spacy-alignments==0.9.1
 spacy-legacy==3.0.12
 spacy-loggers==1.0.5
 spacy-transformers==1.3.5
 sqlitedict==2.1.0
 srsly==2.4.8
 stack-data==0.6.3
+stanza==1.8.2
+stringcase==1.2.0
 sympy==1.12
 tabulate==0.9.0
 terminado==0.18.1
 thinc==8.2.3
 threadpoolctl==3.4.0
 tinycss2==1.2.1
 tokenizers==0.15.2
+toml==0.10.2
 torch==2.2.1
 tornado==6.4
 tqdm==4.66.2
