diff --git a/README.md b/README.md
index 55895ae..7812ed8 100644
--- a/README.md
+++ b/README.md
@@ -2,6 +2,8 @@
 **S**ystematic **E**valuation **F**ramework for NLP models and datasets in **L**atin and **A**ncient **G**reek
 
 ## Evaluation Results
-### Lemmatization
-#### greCy on UD test data
+### Lemmatization on UD test data
+#### Ancient Greek: greCy
 {'accuracy': 0.8942049121548943}
+#### Latin: LatinCy
+{'accuracy': 0.8843245653143111}
diff --git a/config.py b/config.py
index 384eec5..9538584 100644
--- a/config.py
+++ b/config.py
@@ -3,3 +3,4 @@
 
 class Config:
     data_dir: str = os.path.abspath("data")
+    lemmatization_dir: str = os.path.join(data_dir, "lemmatization_test")
diff --git a/lemma.py b/lemma.py
index 8636e8e..d5e5806 100644
--- a/lemma.py
+++ b/lemma.py
@@ -5,13 +5,14 @@
 import spacy
 from conllu import SentenceList
 from dotenv import load_dotenv
-from spacy import Language
 from spacy.tokens import Doc
 from tqdm import tqdm
 import xml.etree.ElementTree as ET
 
 from config import Config
 from metrics import accuracy
+from models import Models
+
 # need this for greCy to work properly
 Doc.set_extension("trf_data", default=None)
 beta_to_uni: dict[str, str] = dict()
@@ -27,6 +28,26 @@ def convert_labels(lemmata_predicted: list[str], lemmata_true: list[str]) -> tup
     return predictions_int, references_int
 
 
+def lemmatize_greek(tokens: list[str]) -> list[str]:
+    """ Lemmatizes Ancient Greek tokens using spaCy. """
+    if not Models.lemmatizer_greek:
+        Models.lemmatizer_greek = spacy.load(
+            "grc_proiel_trf",  # grc_proiel_trf grc_odycy_joint_trf
+            exclude=["morphologizer", "parser", "tagger", "transformer"],  #
+        )
+    doc: Doc = Models.lemmatizer_greek(Doc(vocab=Models.lemmatizer_greek.vocab, words=tokens))
+    return [x.lemma_ for x in doc]
+
+
+def lemmatize_latin(tokens: list[str]) -> list[str]:
+    """Lemmatizes Latin tokens using spaCy."""
+    if not Models.lemmatizer_latin:
+        Models.lemmatizer_latin = spacy.load(
+            'la_core_web_lg', exclude=["morphologizer", "parser", "tagger", "ner"])
+    doc: Doc = Models.lemmatizer_latin(Doc(vocab=Models.lemmatizer_latin.vocab, words=tokens))
+    return [x.lemma_ for x in doc]
+
+
 def morpheus(text: str) -> list[str]:
     """ Runs Morpheus and uses it to lemmatize a given word form. """
     if text not in uni_to_beta:
@@ -50,28 +71,24 @@
     return [beta_to_uni[x] for x in lemmata]
 
 
-def run_evaluation():
-    data_dir: str = os.path.join(Config.data_dir, 'lemmatization_test')
+def run_evaluation(lemmatization_fn: callable, data_dir: str):
+    """ Performs evaluation of a lemmatization model for the given dataset. """
     sl: SentenceList = SentenceList()
     for file in [x for x in os.listdir(data_dir) if x.endswith(".conllu")]:
         file_path: str = os.path.join(data_dir, file)
         with open(file_path, 'r') as f:
             new_sl: SentenceList = conllu.parse(f.read())
             sl += new_sl
-    nlp: Language = spacy.load(
-        "grc_proiel_trf",  # grc_proiel_trf grc_odycy_joint_trf
-        exclude=["morphologizer", "parser", "tagger", "transformer"],  #
-    )
     lemmata_predicted: list[str] = []
     lemmata_true: list[str] = []
     for sent in tqdm(sl):
         words: list[str] = [tok["form"] for tok in sent]
         new_lemmata_true: list[str] = [tok["lemma"] for tok in sent]
         lemmata_true += new_lemmata_true
-        doc: Doc = nlp(Doc(vocab=nlp.vocab, words=words))
-        lemmata_predicted += [x.lemma_ for x in doc]
+        lemmata_predicted += lemmatization_fn(words)
     predictions_int, references_int = convert_labels(lemmata_predicted, lemmata_true)
     accuracy(predictions_int, references_int)
 
 
-# run_evaluation()
+# run_evaluation(lemmatize_greek, os.path.join(Config.lemmatization_dir, "greek"))
+# run_evaluation(lemmatize_latin, os.path.join(Config.lemmatization_dir, "latin"))
diff --git a/models.py b/models.py
new file mode 100644
index 0000000..5684b14
--- /dev/null
+++ b/models.py
@@ -0,0 +1,6 @@
+from spacy import Language
+
+
+class Models:
+    lemmatizer_greek: Language = None
+    lemmatizer_latin: Language = None
diff --git a/requirements.txt b/requirements.txt
index a04850e..543c56a 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -14,6 +14,7 @@ beautifulsoup4==4.12.3
 betacode==1.0
 bleach==6.1.0
 blis==0.7.11
+boltons==21.0.0
 boto3==1.34.69
 botocore==1.34.69
 bpemb==0.3.5
@@ -23,6 +24,7 @@ cffi==1.16.0
 charset-normalizer==3.3.2
 click==8.1.7
 cloudpathlib==0.16.0
+cltk==1.3.0
 comm==0.2.2
 confection==0.1.4
 conllu==4.5.3
@@ -35,6 +37,7 @@ decorator==5.1.1
 defusedxml==0.7.1
 Deprecated==1.2.14
 dill==0.3.8
+emoji==2.12.1
 evaluate==0.4.1
 executing==2.0.1
 fastjsonschema==2.19.1
@@ -47,8 +50,11 @@ fsspec==2024.2.0
 ftfy==6.2.0
 gdown==5.1.0
 gensim==4.3.2
+gitdb==4.0.11
+GitPython==3.1.43
 grc_proiel_trf==3.7.5
 grecy==1.0
+greek-accentuation==1.2.0
 h11==0.14.0
 httpcore==1.0.4
 httpx==0.27.0
@@ -152,6 +158,7 @@ PyYAML==6.0.1
 pyzmq==25.1.2
 qtconsole==5.5.1
 QtPy==2.4.1
+rapidfuzz==3.9.6
 referencing==0.34.0
 regex==2023.12.25
 requests==2.31.0
@@ -171,9 +178,10 @@ seqeval==1.2.2
 setuptools==69.5.1
 six==1.16.0
 smart-open==6.4.0
+smmap==5.0.1
 sniffio==1.3.1
 soupsieve==2.5
-spacy==3.7.5
+spacy==3.7.6
 spacy-alignments==0.9.1
 spacy-legacy==3.0.12
 spacy-loggers==1.0.5
@@ -181,6 +189,8 @@ spacy-transformers==1.3.5
 sqlitedict==2.1.0
 srsly==2.4.8
 stack-data==0.6.3
+stanza==1.8.2
+stringcase==1.2.0
 sympy==1.12
 tabulate==0.9.0
 terminado==0.18.1
@@ -188,6 +198,7 @@ thinc==8.2.3
 threadpoolctl==3.4.0
 tinycss2==1.2.1
 tokenizers==0.15.2
+toml==0.10.2
 torch==2.2.1
 tornado==6.4
 tqdm==4.66.2