Skip to content

Commit

Permalink
port commits from PR #233, use terms localizations
Browse files Browse the repository at this point in the history
  • Loading branch information
mshannon-sil committed Aug 30, 2024
1 parent bfb44d8 commit ff1c067
Show file tree
Hide file tree
Showing 18 changed files with 97,127 additions and 97 deletions.
34,811 changes: 34,811 additions & 0 deletions machine/corpora/BiblicalTermsEn.xml

Large diffs are not rendered by default.

9,434 changes: 9,434 additions & 0 deletions machine/corpora/BiblicalTermsEs.xml

Large diffs are not rendered by default.

10,757 changes: 10,757 additions & 0 deletions machine/corpora/BiblicalTermsFr.xml

Large diffs are not rendered by default.

16,835 changes: 16,835 additions & 0 deletions machine/corpora/BiblicalTermsId.xml

Large diffs are not rendered by default.

24,842 changes: 24,842 additions & 0 deletions machine/corpora/BiblicalTermsPt.xml

Large diffs are not rendered by default.

6 changes: 6 additions & 0 deletions machine/corpora/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from .file_paratext_project_text_updater import FileParatextProjectTextUpdater
from .flatten import flatten
from .memory_alignment_collection import MemoryAlignmentCollection
from .memory_paratext_project_terms_parser import MemoryParatextProjectTermsParser
from .memory_stream_container import MemoryStreamContainer
from .memory_text import MemoryText
from .multi_key_ref import MultiKeyRef
Expand All @@ -20,6 +21,7 @@
from .paratext_backup_text_corpus import ParatextBackupTextCorpus
from .paratext_project_settings import ParatextProjectSettings
from .paratext_project_settings_parser_base import ParatextProjectSettingsParserBase
from .paratext_project_terms_parser_base import ParatextProjectTermsParserBase
from .paratext_project_text_updater_base import ParatextProjectTextUpdaterBase
from .paratext_text_corpus import ParatextTextCorpus
from .scripture_element import ScriptureElement
Expand Down Expand Up @@ -67,6 +69,7 @@
from .usx_zip_text import UsxZipText
from .zip_paratext_project_settings_parser import ZipParatextProjectSettingsParser
from .zip_paratext_project_settings_parser_base import ZipParatextProjectSettingsParserBase
from .zip_paratext_project_terms_parser import ZipParatextProjectTermsParser
from .zip_paratext_project_text_updater import ZipParatextProjectTextUpdater

__all__ = [
Expand All @@ -89,6 +92,7 @@
"is_scripture",
"lowercase",
"MemoryAlignmentCollection",
"MemoryParatextProjectTermsParser",
"MemoryText",
"MemoryStreamContainer",
"MultiKeyRef",
Expand All @@ -103,6 +107,7 @@
"ParatextBackupTextCorpus",
"ParatextProjectSettings",
"ParatextProjectSettingsParserBase",
"ParatextProjectTermsParserBase",
"ParatextProjectTextUpdaterBase",
"ParatextTextCorpus",
"parse_usfm",
Expand Down Expand Up @@ -149,5 +154,6 @@
"UsxZipText",
"ZipParatextProjectSettingsParser",
"ZipParatextProjectSettingsParserBase",
"ZipParatextProjectTermsParser",
"ZipParatextProjectTextUpdater",
]
18 changes: 18 additions & 0 deletions machine/corpora/memory_paratext_project_terms_parser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
from io import BytesIO
from typing import BinaryIO, Dict

from ..corpora.paratext_project_settings import ParatextProjectSettings
from .paratext_project_terms_parser_base import ParatextProjectTermsParserBase


class MemoryParatextProjectTermsParser(ParatextProjectTermsParserBase):
    """Terms parser whose project files live in an in-memory ``name -> text`` mapping."""

    def __init__(self, settings: ParatextProjectSettings, files: Dict[str, str]) -> None:
        """Hand ``settings`` to the base class and remember the file table.

        Args:
            settings: Project settings forwarded unchanged to ``ParatextProjectTermsParserBase``.
            files: Mapping from file name to that file's textual content.
        """
        super().__init__(settings)

        self.files = files

    def exists(self, file_name: str) -> bool:
        """Return True when ``file_name`` is a key of the in-memory file table."""
        return file_name in self.files

    def open(self, file_name: str) -> BinaryIO:
        """Return a fresh binary stream over the UTF-8 encoding of the stored text.

        Raises:
            KeyError: If ``file_name`` is not present in the file table.
        """
        contents = self.files[file_name]
        encoded = contents.encode("utf-8")
        return BytesIO(encoded)
106 changes: 12 additions & 94 deletions machine/corpora/paratext_backup_terms_corpus.py
Original file line number Diff line number Diff line change
@@ -1,110 +1,28 @@
import re
from typing import Dict, List, Optional
from xml.etree import ElementTree
from typing import List, Tuple
from zipfile import ZipFile

from .corpora_utils import get_entry
from ..utils.typeshed import StrPath
from .dictionary_text_corpus import DictionaryTextCorpus
from .memory_text import MemoryText
from .text_row import TextRow
from .zip_paratext_project_settings_parser import ZipParatextProjectSettingsParser

_PREDEFINED_TERMS_LIST_TYPES = ["Major", "All", "SilNt", "Pt6"]
from .zip_paratext_project_terms_parser import ZipParatextProjectTermsParser


class ParatextBackupTermsCorpus(DictionaryTextCorpus):
def __init__(self, filename: str, term_categories: List[str]) -> None:
with ZipFile(filename, "r") as archive:
terms_file_entry = get_entry(archive, "TermRenderings.xml")
if terms_file_entry is None:
return
settings_parser = ZipParatextProjectSettingsParser(archive)
settings = settings_parser.parse()

with archive.open(terms_file_entry) as key_terms_file:
term_renderings_tree = ElementTree.parse(key_terms_file)

biblical_terms_file_entry = get_entry(archive, settings.biblical_terms_file_name)
if settings.biblical_terms_list_type == "Project":
if biblical_terms_file_entry is not None:
with archive.open(biblical_terms_file_entry) as key_terms_file:
biblical_terms_tree = ElementTree.parse(key_terms_file)
term_id_to_category_dict = _get_category_per_id(biblical_terms_tree)
else:
with open("BiblicalTerms.xml", "rb") as key_terms_file:
biblical_terms_tree = ElementTree.parse(key_terms_file)
term_id_to_category_dict = _get_category_per_id(biblical_terms_tree)
elif settings.biblical_terms_list_type in _PREDEFINED_TERMS_LIST_TYPES:
with open(settings.biblical_terms_file_name, "rb") as key_terms_file:
biblical_terms_tree = ElementTree.parse(key_terms_file)
term_id_to_category_dict = _get_category_per_id(biblical_terms_tree)
else:
term_id_to_category_dict = {}
def __init__(self, filename: StrPath, term_categories: List[str], use_term_glosses: bool = True) -> None:
super().__init__()

terms_elements = term_renderings_tree.iter(".//TermRendering")
with ZipFile(filename, "r") as archive:
settings = ZipParatextProjectSettingsParser(archive).parse()
glosses: List[Tuple[str, List[str]]] = ZipParatextProjectTermsParser(archive, settings).parse(
term_categories, use_term_glosses
)
text_id = (
f"{settings.biblical_terms_list_type}:"
f"{settings.biblical_terms_project_name}:"
f"{settings.biblical_terms_file_name}"
)
rows: List[TextRow] = []
for e in terms_elements:
term_id = e.attrib["Id"]
category = term_id_to_category_dict.get(term_id, "")
if term_categories and (category == "" or category not in term_categories):
continue
term_id = term_id.replace("\n", "&#xA")
rendering = e.findtext("Renderings", "")
renderings = _get_renderings(rendering)
rows.append(TextRow(text_id, term_id, segment=renderings))
text = MemoryText(text_id, rows)
self._add_text(text)


def _get_renderings(rendering: str) -> List[str]:
    """Turn a raw term-rendering string into a list of cleaned, unique glosses.

    The cleaning steps are applied in this order: unwrap a rendering that is
    entirely enclosed in square brackets, drop ``?`` and ``*``, turn ``/`` into
    a space, remove parenthesised and bracketed spans, remove a trailing
    number such as `` 3`` or `` 1.2``, and finally split on ``||``.

    Args:
        rendering: The text of a ``Renderings`` element.

    Returns:
        The non-empty glosses with surrounding whitespace removed. Duplicates
        are dropped and the first-seen order is preserved, so the result is
        deterministic from run to run.
    """
    # If the entire term rendering is surrounded in square brackets, remove them.
    match = re.match(r"^\[(.+?)\]$", rendering)
    if match:
        rendering = match.group(1)
    rendering = rendering.replace("?", "")
    rendering = rendering.replace("*", "")
    rendering = rendering.replace("/", " ")
    rendering = rendering.strip()
    rendering = _strip_parens(rendering)
    rendering = _strip_parens(rendering, left="[", right="]")
    # finditer() + group(0) yields the complete matched suffix. findall() on a
    # pattern with a capture group returns only that group ("" or ".2"), which
    # left the number in place.
    rx = re.compile(r"\s+\d+(\.\d+)*$")
    for number_match in rx.finditer(rendering):
        rendering = rendering.replace(number_match.group(0), "")
    stripped = (g.strip() for g in re.split(r"\|\|", rendering))
    # dict.fromkeys() removes duplicates while keeping insertion order, unlike
    # set(), whose iteration order for strings varies between interpreter runs.
    return list(dict.fromkeys(g for g in stripped if g != ""))


def _strip_parens(term_string: str, left: str = "(", right: str = ")") -> str:
    """Remove every balanced ``left`` ... ``right`` span, delimiters included.

    The string is scanned from the end towards the start. Nested spans are
    removed together with their outermost pair. Unmatched delimiters are left
    untouched.

    Args:
        term_string: Text to clean.
        left: Opening delimiter character.
        right: Closing delimiter character.

    Returns:
        ``term_string`` without the balanced delimited spans.
    """
    parens = 0
    end = -1
    for i in range(len(term_string) - 1, -1, -1):
        c = term_string[i]
        if c == right:
            if parens == 0:
                # Remember where the outermost span ends (exclusive index).
                end = i + 1
            parens += 1
        elif c == left:
            if parens > 0:
                parens -= 1
                if parens == 0:
                    # The outermost pair just closed: cut it out. Indices below
                    # ``i`` are unaffected, so the scan can simply continue.
                    term_string = term_string[:i] + term_string[end:]
    return term_string


def _get_category_per_id(biblical_terms_tree: ElementTree.ElementTree) -> Dict[str, Optional[str]]:
    """Map each ``Term`` element's ``Id`` attribute to the text of its ``Category`` child.

    Args:
        biblical_terms_tree: Parsed biblical-terms document.

    Returns:
        A dict keyed by term id. When an id occurs more than once the first
        occurrence wins. A missing or empty ``Category`` is stored as ``""``.

    Raises:
        KeyError: If a ``Term`` element has no ``Id`` attribute.
    """
    term_id_to_category_dict: Dict[str, Optional[str]] = {}
    # iter() expects a plain tag name, so iter(".//Term") compared every tag
    # with the literal string ".//Term" and never matched. findall() is the
    # method that evaluates path expressions.
    for e in biblical_terms_tree.findall(".//Term"):
        term_id = e.attrib["Id"]
        if term_id not in term_id_to_category_dict:
            # findtext() gives "" both when Category is absent (the default)
            # and when it is present but has no text.
            term_id_to_category_dict[term_id] = e.findtext("Category", "")
    return term_id_to_category_dict
text = MemoryText(text_id, [TextRow(text_id, kvp[0], kvp[1]) for kvp in glosses])
self._add_text(text)
1 change: 1 addition & 0 deletions machine/corpora/paratext_project_settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ class ParatextProjectSettings:
biblical_terms_list_type: str
biblical_terms_project_name: str
biblical_terms_file_name: str
language_code: Optional[str]

def get_book_id(self, file_name: str) -> Optional[str]:
"""Returns None when the file name doesn't match the pattern of a book file name for the project."""
Expand Down
21 changes: 19 additions & 2 deletions machine/corpora/paratext_project_settings_parser_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,9 +81,26 @@ def parse(self) -> ParatextProjectSettings:
if len(parts) != 3:
raise ValueError(
f"The BiblicalTermsListSetting element in Settings.xml in project {full_name}"
f" is not in the expected format (i.e., Major::BiblicalTerms.xml) but is {biblical_terms_list_setting}."
f" is not in the expected format (e.g., Major::BiblicalTerms.xml) but is {biblical_terms_list_setting}."
)
language_code = None
language_iso_code_setting = settings_tree.getroot().findtext("LanguageIsoCode", "")
if language_iso_code_setting:
language_iso_code_setting_parts = settings_tree.getroot().findtext("LanguageIsoCode", "").split(":")
if language_iso_code_setting_parts:
language_code = language_iso_code_setting_parts[0]

return ParatextProjectSettings(
name, full_name, encoding, versification, stylesheet, prefix, form, suffix, parts[0], parts[1], parts[2]
name,
full_name,
encoding,
versification,
stylesheet,
prefix,
form,
suffix,
parts[0],
parts[1],
parts[2],
language_code,
)
150 changes: 150 additions & 0 deletions machine/corpora/paratext_project_terms_parser_base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,150 @@
import re
from abc import ABC, abstractmethod
from collections import defaultdict
from importlib.resources import open_binary
from typing import BinaryIO, Dict, List, Optional, Tuple, Union
from xml.etree import ElementTree

from machine.corpora.paratext_project_settings import ParatextProjectSettings
from machine.corpora.paratext_project_settings_parser_base import ParatextProjectSettingsParserBase

_PREDEFINED_TERMS_LIST_TYPES = ["Major", "All", "SilNt", "Pt6"]
_SUPPORTED_LANGUAGE_TERMS_LOCALIZATION_XMLS_PACKAGE = "machine.corpora"
_SUPPORTED_LANGUAGE_TERMS_LOCALIZATION_XMLS = {
"en": "BiblicalTermsEn.xml",
"es": "BiblicalTermsEs.xml",
"fr": "BiblicalTermsFr.xml",
"id": "BiblicalTermsId.xml",
"pt": "BiblicalTermsPt.xml",
}
_CONTENT_IN_BRACKETS_REGEX = re.compile(r"^\[(.+?)\]$")
_NUMERICAL_INFORMATION_REGEX = re.compile(r"\s+\d+(\.\d+)*$")


class ParatextProjectTermsParserBase(ABC):
    """Base class that extracts key-term glosses for a Paratext project.

    Subclasses provide file access through ``exists`` and ``open``. ``parse``
    combines the project's ``TermRenderings.xml`` with the localized term
    lists that are read as package resources.
    """

    def __init__(self, settings: Union[ParatextProjectSettings, ParatextProjectSettingsParserBase]) -> None:
        """Store the project settings, parsing them first when a settings parser is given.

        Args:
            settings: Either ready-made project settings or a settings parser
                whose ``parse()`` result is used.
        """
        self._settings: ParatextProjectSettings
        if isinstance(settings, ParatextProjectSettingsParserBase):
            self._settings = settings.parse()
        else:
            self._settings = settings

    def parse(self, term_categories: List[str], use_term_glosses: bool = True) -> List[Tuple[str, List[str]]]:
        """Collect the glosses of every term that belongs to the requested categories.

        Args:
            term_categories: Categories to keep. An empty list keeps every term.
            use_term_glosses: When True, terms without an entry in
                ``TermRenderings.xml`` fall back to the localized gloss list
                (only consulted for the "Major" list type and a supported
                language code).

        Returns:
            A list of ``(term_id, glosses)`` tuples. Entries coming from
            ``TermRenderings.xml`` are listed first and take precedence over
            localized glosses with the same id. Newlines inside a term id are
            replaced by the literal text ``&#xA``.
        """
        settings = self._settings
        list_type = settings.biblical_terms_list_type

        # Start from an empty table so that a "Project" list whose terms file
        # is missing is filtered against no categories instead of failing
        # later with an unbound local variable.
        term_id_to_category_dict: Dict[str, str] = {}
        if list_type == "Project":
            if self.exists(settings.biblical_terms_file_name):
                with self.open(settings.biblical_terms_file_name) as stream:
                    term_id_to_category_dict = _get_category_per_id(ElementTree.parse(stream))
        elif list_type in _PREDEFINED_TERMS_LIST_TYPES:
            with open_binary(
                _SUPPORTED_LANGUAGE_TERMS_LOCALIZATION_XMLS_PACKAGE, settings.biblical_terms_file_name
            ) as stream:
                term_id_to_category_dict = _get_category_per_id(ElementTree.parse(stream))

        # The localization file is only needed when glosses were requested, so
        # skip parsing it otherwise.
        terms_glosses_doc: Optional[ElementTree.ElementTree] = None
        resource_name = None
        if settings.language_code is not None:
            resource_name = _SUPPORTED_LANGUAGE_TERMS_LOCALIZATION_XMLS.get(settings.language_code)
        if use_term_glosses and list_type == "Major" and resource_name:
            with open_binary(_SUPPORTED_LANGUAGE_TERMS_LOCALIZATION_XMLS_PACKAGE, resource_name) as stream:
                terms_glosses_doc = ElementTree.parse(stream)

        term_renderings_doc: Optional[ElementTree.ElementTree] = None
        if self.exists("TermRenderings.xml"):
            with self.open("TermRenderings.xml") as stream:
                term_renderings_doc = ElementTree.parse(stream)

        terms_renderings: Dict[str, List[str]] = defaultdict(list)
        if term_renderings_doc is not None:
            for term in term_renderings_doc.findall(".//TermRendering"):
                term_id = term.attrib["Id"]
                if not _is_in_category(term_id, term_categories, term_id_to_category_dict):
                    continue
                renderings = term.find("Renderings")
                gloss = renderings.text if renderings is not None and renderings.text is not None else ""
                # Indexing the defaultdict registers the id even when no gloss
                # survives cleaning, which keeps it ahead of localized glosses.
                terms_renderings[term_id.replace("\n", "&#xA")].extend(_get_glosses(gloss))

        terms_glosses: Dict[str, List[str]] = defaultdict(list)
        if terms_glosses_doc is not None:
            for elem in terms_glosses_doc.findall(".//Localization"):
                term_id = elem.attrib["Id"]
                if not _is_in_category(term_id, term_categories, term_id_to_category_dict):
                    continue
                terms_glosses[term_id.replace("\n", "&#xA")].extend(_get_glosses(elem.attrib["Gloss"]))

        # Project renderings win; localized glosses only fill in ids that have
        # no rendering entry. The result is empty when neither source has terms.
        combined: Dict[str, List[str]] = dict(terms_renderings)
        for key, value in terms_glosses.items():
            combined.setdefault(key, value)
        return [(key, list(value)) for key, value in combined.items()]

    @abstractmethod
    def exists(self, file_name: str) -> bool:
        """Report whether ``file_name`` can be opened; ``parse`` checks this before calling ``open``."""
        ...

    @abstractmethod
    def open(self, file_name: str) -> BinaryIO:
        """Return a binary stream for ``file_name``; ``parse`` uses it as a context manager."""
        ...


def _is_in_category(id: str, term_categories: List[str], term_id_to_category_dict: Dict[str, str]) -> bool:
    """Decide whether the term ``id`` passes the category filter.

    An empty ``term_categories`` accepts every term. Otherwise the term must
    have an entry in ``term_id_to_category_dict`` and that category must be one
    of ``term_categories``.
    """
    known_category = term_id_to_category_dict.get(id)
    if not term_categories:
        # No filter requested.
        return True
    if known_category is None:
        # A term with no recorded category cannot match a filter.
        return False
    return known_category in term_categories


def _get_glosses(gloss: str) -> List[str]:
    """Split a raw gloss or rendering string into a list of cleaned glosses.

    The cleaning steps are applied in this order: unwrap a gloss that is
    entirely enclosed in square brackets, drop ``?`` and ``*``, turn ``/`` into
    a space, remove parenthesised and bracketed spans, remove the text matched
    by the trailing-number pattern, and finally split on ``||``, ``,`` and ``;``.

    Args:
        gloss: Raw text taken from a rendering or localization entry.

    Returns:
        The non-empty pieces, each stripped of surrounding whitespace, in
        their original order. Duplicates are kept.
    """
    bracketed = _CONTENT_IN_BRACKETS_REGEX.match(gloss)
    if bracketed:
        # group(1) is the text between the brackets. group(0) would be the
        # whole match, i.e. the unchanged input, and the bracket-stripping
        # step below would then delete the gloss altogether.
        gloss = bracketed.group(1)
    gloss = gloss.replace("?", "")
    gloss = gloss.replace("*", "")
    gloss = gloss.replace("/", " ")
    gloss = gloss.strip()
    gloss = _strip_parens(gloss)
    gloss = _strip_parens(gloss, left="[", right="]")
    gloss = gloss.strip()
    # NOTE(review): str.replace removes the matched number text everywhere in
    # the gloss, not only at the end; kept as-is, confirm this is intended.
    for number_match in _NUMERICAL_INFORMATION_REGEX.finditer(gloss):
        gloss = gloss.replace(number_match.group(0), "")
    pieces = (item.strip() for chunk in gloss.split("||") for item in re.split(r"[,;]", chunk))
    return [piece for piece in pieces if piece]


def _strip_parens(term_string: str, left: str = "(", right: str = ")") -> str:
    """Delete every balanced ``left`` ... ``right`` span from ``term_string``.

    The text is walked from its last character to its first. Each outermost
    balanced span is removed together with its delimiters and anything nested
    inside it. A closing delimiter without a matching opener is left in place.

    Args:
        term_string: Text to clean.
        left: Opening delimiter character.
        right: Closing delimiter character.

    Returns:
        The text with the balanced spans removed.
    """
    depth: int = 0
    cut_to: int = -1
    idx = len(term_string)
    while idx > 0:
        idx -= 1
        ch = term_string[idx]
        if ch == right:
            if depth == 0:
                # Exclusive end of the outermost span currently open.
                cut_to = idx + 1
            depth += 1
            continue
        if ch != left or depth == 0:
            continue
        depth -= 1
        if depth == 0:
            # Only characters at or after idx change, so the positions still
            # to be visited keep their meaning.
            term_string = term_string[:idx] + term_string[cut_to:]
    return term_string


def _get_category_per_id(biblical_terms_doc: ElementTree.ElementTree) -> Dict[str, str]:
    """Build a lookup from term id to category for a biblical-terms document.

    Every ``Term`` element found by ``.//Term`` contributes its ``Id``
    attribute as key and the text of its first ``Category`` child as value
    (``""`` when the child or its text is missing). When an id is repeated,
    the first element's category is kept.

    Raises:
        KeyError: If a ``Term`` element has no ``Id`` attribute.
    """
    categories: Dict[str, str] = {}

    for term_elem in biblical_terms_doc.findall(".//Term"):
        # setdefault leaves an already-recorded id untouched (first one wins).
        categories.setdefault(term_elem.attrib["Id"], term_elem.findtext("Category", ""))

    return categories
Loading

0 comments on commit ff1c067

Please sign in to comment.