-
Notifications
You must be signed in to change notification settings - Fork 0
/
lemma.py
96 lines (82 loc) · 4.17 KB
/
lemma.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
import os.path
import subprocess
import xml.etree.ElementTree as ET
from typing import Callable

import betacode.conv
import conllu
import spacy
from conllu import SentenceList
from dotenv import load_dotenv
from spacy.tokens import Doc
from tqdm import tqdm

from config import Config
from metrics import accuracy
from models import Models
# need this for greCy to work properly: greCy transformer pipelines expect the
# custom "trf_data" extension to be registered on Doc before they are loaded
Doc.set_extension("trf_data", default=None)
# module-level caches for betacode <-> unicode conversions, shared by morpheus()
beta_to_uni: dict[str, str] = dict()
uni_to_beta: dict[str, str] = dict()
def convert_labels(lemmata_predicted: list[str], lemmata_true: list[str]) -> tuple[list[int], list[int]]:
    """ Converts lemmatization results from strings to integers, which is useful for calculating metrics.

    Both lists are encoded against one shared vocabulary, so identical lemma
    strings map to identical integer IDs in the two outputs.
    """
    vocabulary: set[str] = set(lemmata_predicted + lemmata_true)
    index_of: dict[str, int] = {}
    for position, lemma in enumerate(vocabulary):
        index_of[lemma] = position
    encoded_predictions: list[int] = [index_of[lemma] for lemma in lemmata_predicted]
    encoded_references: list[int] = [index_of[lemma] for lemma in lemmata_true]
    return encoded_predictions, encoded_references
def lemmatize_greek(tokens: list[str]) -> list[str]:
    """ Lemmatizes Ancient Greek tokens using spaCy.

    The grc_proiel_lg pipeline is loaded lazily on first use and cached on
    Models; only the lemmatizer component is kept, the rest is excluded.
    """
    if not Models.lemmatizer_greek:
        Models.lemmatizer_greek = spacy.load(
            "grc_proiel_lg",  # alternatives: grc_proiel_trf, grc_odycy_joint_trf
            exclude=["morphologizer", "parser", "tagger", "transformer"],
        )
    nlp = Models.lemmatizer_greek
    # feed a pre-tokenized Doc so spaCy's own tokenizer is bypassed
    pre_tokenized = Doc(vocab=nlp.vocab, words=tokens)
    return [token.lemma_ for token in nlp(pre_tokenized)]
def lemmatize_latin(tokens: list[str]) -> list[str]:
    """Lemmatizes Latin tokens using spaCy.

    la_core_web_lg is loaded lazily on first use and cached on Models, with
    every component except the lemmatizer excluded.
    """
    if not Models.lemmatizer_latin:
        Models.lemmatizer_latin = spacy.load(
            'la_core_web_lg', exclude=["morphologizer", "parser", "tagger", "ner"])
    nlp = Models.lemmatizer_latin
    # wrap the caller's tokens in a Doc to skip spaCy's tokenization step
    pre_tokenized = Doc(vocab=nlp.vocab, words=tokens)
    return [token.lemma_ for token in nlp(pre_tokenized)]
def morpheus(text: str) -> list[str]:
    """ Runs Morpheus and uses it to lemmatize a given word form.

    :param text: a single Ancient Greek word form in Unicode.
    :return: all candidate lemmata reported by Morpheus, converted back to
        Unicode; empty if Morpheus produced no headwords.

    Requires MORPHEUS_PATH in the environment (loaded from .env), pointing at
    a Morpheus installation; see
    https://github.com/perseids-tools/morpheus-perseids for setup.
    """
    if text not in uni_to_beta:
        # Morpheus only accepts beta code; need to replace special sigma notation with ordinary one
        uni_to_beta[text] = betacode.conv.uni_to_beta(text).replace("s1", "s")
    # make sure that MORPHEUS_PATH is present in your .env file and points to the correct folder
    load_dotenv()
    env: dict = os.environ.copy()
    env["MORPHLIB"] = "stemlib"
    cp: subprocess.CompletedProcess = subprocess.run(
        ["bin/morpheus", uni_to_beta[text]], capture_output=True, env=env, cwd=env["MORPHEUS_PATH"])
    xml: str = cp.stdout.decode("utf-8")
    root: ET.Element = ET.fromstring(xml)
    headwords: list[ET.Element] = root.findall(".//hdwd")
    # an empty <hdwd/> element has text None; skip those instead of passing
    # None into the betacode converter (which would raise)
    lemmata: list[str] = [x.text for x in headwords if x.text]
    for lemma in lemmata:
        if lemma not in beta_to_uni:
            beta_to_uni[lemma] = betacode.conv.beta_to_uni(lemma)
    return [beta_to_uni[x] for x in lemmata]
def run_evaluation(lemmatization_fn: Callable[[list[str]], list[str]], data_dir: str) -> None:
    """ Performs evaluation of a lemmatization model for the given dataset.

    :param lemmatization_fn: maps a sentence's token forms to predicted lemmata.
    :param data_dir: directory containing gold-annotated .conllu files.

    Prints all (predicted, gold) mismatches and reports accuracy via metrics.accuracy.
    """
    sl: SentenceList = SentenceList()
    # sort the listing: os.listdir order is platform-dependent, and a stable
    # file order makes evaluation runs reproducible
    for file in sorted(x for x in os.listdir(data_dir) if x.endswith(".conllu")):
        file_path: str = os.path.join(data_dir, file)
        # CoNLL-U files are UTF-8 by specification; don't rely on the locale default
        with open(file_path, 'r', encoding="utf-8") as f:
            new_sl: SentenceList = conllu.parse(f.read())
        sl += new_sl
    lemmata_predicted: list[str] = []
    lemmata_true: list[str] = []
    for sent in tqdm(sl):
        words: list[str] = [tok["form"] for tok in sent]
        new_lemmata_true: list[str] = [tok["lemma"] for tok in sent]
        lemmata_true += new_lemmata_true
        lemmata_predicted += lemmatization_fn(words)
    print("Errors: ", [(lemmata_predicted[i], lemmata_true[i]) for i in range(len(lemmata_true)) if
                       lemmata_predicted[i] != lemmata_true[i]])
    predictions_int, references_int = convert_labels(lemmata_predicted, lemmata_true)
    accuracy(predictions_int, references_int)
# run_evaluation(lemmatize_greek, os.path.join(Config.lemmatization_dir, "greek"))
# run_evaluation(lemmatize_latin, os.path.join(Config.lemmatization_dir, "latin"))