port commits from PR #233, use terms localizations

sillsdev · Aug 30, 2024 · ff1c067 · ff1c067
1 parent bfb44d8
commit ff1c067
Show file tree

Hide file tree

Showing 18 changed files with 97,127 additions and 97 deletions.
diff --git a/machine/corpora/BiblicalTermsEn.xml b/machine/corpora/BiblicalTermsEn.xml
diff --git a/machine/corpora/BiblicalTermsEs.xml b/machine/corpora/BiblicalTermsEs.xml
diff --git a/machine/corpora/BiblicalTermsFr.xml b/machine/corpora/BiblicalTermsFr.xml
diff --git a/machine/corpora/BiblicalTermsId.xml b/machine/corpora/BiblicalTermsId.xml
diff --git a/machine/corpora/BiblicalTermsPt.xml b/machine/corpora/BiblicalTermsPt.xml
diff --git a/machine/corpora/__init__.py b/machine/corpora/__init__.py
@@ -11,6 +11,7 @@
 from .file_paratext_project_text_updater import FileParatextProjectTextUpdater
 from .flatten import flatten
 from .memory_alignment_collection import MemoryAlignmentCollection
+from .memory_paratext_project_terms_parser import MemoryParatextProjectTermsParser
 from .memory_stream_container import MemoryStreamContainer
 from .memory_text import MemoryText
 from .multi_key_ref import MultiKeyRef
@@ -20,6 +21,7 @@
 from .paratext_backup_text_corpus import ParatextBackupTextCorpus
 from .paratext_project_settings import ParatextProjectSettings
 from .paratext_project_settings_parser_base import ParatextProjectSettingsParserBase
+from .paratext_project_terms_parser_base import ParatextProjectTermsParserBase
 from .paratext_project_text_updater_base import ParatextProjectTextUpdaterBase
 from .paratext_text_corpus import ParatextTextCorpus
 from .scripture_element import ScriptureElement
@@ -67,6 +69,7 @@
 from .usx_zip_text import UsxZipText
 from .zip_paratext_project_settings_parser import ZipParatextProjectSettingsParser
 from .zip_paratext_project_settings_parser_base import ZipParatextProjectSettingsParserBase
+from .zip_paratext_project_terms_parser import ZipParatextProjectTermsParser
 from .zip_paratext_project_text_updater import ZipParatextProjectTextUpdater
 
 __all__ = [
@@ -89,6 +92,7 @@
     "is_scripture",
     "lowercase",
     "MemoryAlignmentCollection",
+    "MemoryParatextProjectTermsParser",
     "MemoryText",
     "MemoryStreamContainer",
     "MultiKeyRef",
@@ -103,6 +107,7 @@
     "ParatextBackupTextCorpus",
     "ParatextProjectSettings",
     "ParatextProjectSettingsParserBase",
+    "ParatextProjectTermsParserBase",
     "ParatextProjectTextUpdaterBase",
     "ParatextTextCorpus",
     "parse_usfm",
@@ -149,5 +154,6 @@
     "UsxZipText",
     "ZipParatextProjectSettingsParser",
     "ZipParatextProjectSettingsParserBase",
+    "ZipParatextProjectTermsParser",
     "ZipParatextProjectTextUpdater",
 ]
diff --git a/machine/corpora/memory_paratext_project_terms_parser.py b/machine/corpora/memory_paratext_project_terms_parser.py
@@ -0,0 +1,18 @@
+from io import BytesIO
+from typing import BinaryIO, Dict
+
+from ..corpora.paratext_project_settings import ParatextProjectSettings
+from .paratext_project_terms_parser_base import ParatextProjectTermsParserBase
+
+
+class MemoryParatextProjectTermsParser(ParatextProjectTermsParserBase):
+    """Terms parser whose project files are held in memory as a name -> text mapping."""
+
+    def __init__(self, settings: ParatextProjectSettings, files: Dict[str, str]) -> None:
+        """Keep the in-memory files and hand the settings to the base parser."""
+        super().__init__(settings)
+
+        self.files = files
+
+    def exists(self, file_name: str) -> bool:
+        """Return True when ``file_name`` is a key of the in-memory file mapping."""
+        return file_name in self.files
+
+    def open(self, file_name: str) -> BinaryIO:
+        """Return a binary stream over the UTF-8 encoded text stored under ``file_name``."""
+        content = self.files[file_name]
+        encoded = content.encode("utf-8")
+        return BytesIO(encoded)
diff --git a/machine/corpora/paratext_backup_terms_corpus.py b/machine/corpora/paratext_backup_terms_corpus.py
@@ -1,110 +1,28 @@
-import re
-from typing import Dict, List, Optional
-from xml.etree import ElementTree
+from typing import List, Tuple
 from zipfile import ZipFile
 
-from .corpora_utils import get_entry
+from ..utils.typeshed import StrPath
 from .dictionary_text_corpus import DictionaryTextCorpus
 from .memory_text import MemoryText
 from .text_row import TextRow
 from .zip_paratext_project_settings_parser import ZipParatextProjectSettingsParser
-
-_PREDEFINED_TERMS_LIST_TYPES = ["Major", "All", "SilNt", "Pt6"]
+from .zip_paratext_project_terms_parser import ZipParatextProjectTermsParser
 
 
 class ParatextBackupTermsCorpus(DictionaryTextCorpus):
-    def __init__(self, filename: str, term_categories: List[str]) -> None:
-        with ZipFile(filename, "r") as archive:
-            terms_file_entry = get_entry(archive, "TermRenderings.xml")
-            if terms_file_entry is None:
-                return
-            settings_parser = ZipParatextProjectSettingsParser(archive)
-            settings = settings_parser.parse()
-
-            with archive.open(terms_file_entry) as key_terms_file:
-                term_renderings_tree = ElementTree.parse(key_terms_file)
-
-            biblical_terms_file_entry = get_entry(archive, settings.biblical_terms_file_name)
-            if settings.biblical_terms_list_type == "Project":
-                if biblical_terms_file_entry is not None:
-                    with archive.open(biblical_terms_file_entry) as key_terms_file:
-                        biblical_terms_tree = ElementTree.parse(key_terms_file)
-                        term_id_to_category_dict = _get_category_per_id(biblical_terms_tree)
-                else:
-                    with open("BiblicalTerms.xml", "rb") as key_terms_file:
-                        biblical_terms_tree = ElementTree.parse(key_terms_file)
-                        term_id_to_category_dict = _get_category_per_id(biblical_terms_tree)
-            elif settings.biblical_terms_list_type in _PREDEFINED_TERMS_LIST_TYPES:
-                with open(settings.biblical_terms_file_name, "rb") as key_terms_file:
-                    biblical_terms_tree = ElementTree.parse(key_terms_file)
-                    term_id_to_category_dict = _get_category_per_id(biblical_terms_tree)
-            else:
-                term_id_to_category_dict = {}
+    def __init__(self, filename: StrPath, term_categories: List[str], use_term_glosses: bool = True) -> None:
+        super().__init__()
 
-            terms_elements = term_renderings_tree.iter(".//TermRendering")
+        with ZipFile(filename, "r") as archive:
+            settings = ZipParatextProjectSettingsParser(archive).parse()
+            glosses: List[Tuple[str, List[str]]] = ZipParatextProjectTermsParser(archive, settings).parse(
+                term_categories, use_term_glosses
+            )
             text_id = (
                 f"{settings.biblical_terms_list_type}:"
                 f"{settings.biblical_terms_project_name}:"
                 f"{settings.biblical_terms_file_name}"
             )
-            rows: List[TextRow] = []
-            for e in terms_elements:
-                term_id = e.attrib["Id"]
-                category = term_id_to_category_dict.get(term_id, "")
-                if term_categories and (category == "" or category not in term_categories):
-                    continue
-                term_id = term_id.replace("\n", "&#xA")
-                rendering = e.findtext("Renderings", "")
-                renderings = _get_renderings(rendering)
-                rows.append(TextRow(text_id, term_id, segment=renderings))
-            text = MemoryText(text_id, rows)
-            self._add_text(text)
 
-
-def _get_renderings(rendering: str) -> List[str]:
-    # If entire term rendering is surrounded in square brackets, remove them
-    match = re.match(r"^\[(.+?)\]$", rendering)
-    if match:
-        rendering = match.group(1)
-    rendering = rendering.replace("?", "")
-    rendering = rendering.replace("*", "")
-    rendering = rendering.replace("/", " ")
-    rendering = rendering.strip()
-    rendering = _strip_parens(rendering)
-    rendering = _strip_parens(rendering, left="[", right="]")
-    rx = re.compile(r"\s+\d+(\.\d+)*$")
-    for match in rx.findall(rendering):
-        rendering = rendering.replace(match, "")
-    glosses = re.split(r"\|\|", rendering)
-    glosses = list(set(g.strip() for g in glosses if g.strip() != ""))
-    return glosses
-
-
-def _strip_parens(term_string: str, left: str = "(", right: str = ")") -> str:
-    parens = 0
-    end = -1
-    for i in range(len(term_string) - 1, -1, -1):
-        c = term_string[i]
-        if c == right:
-            if parens == 0:
-                end = i + 1
-            parens += 1
-        elif c == left:
-            if parens > 0:
-                parens -= 1
-                if parens == 0:
-                    term_string = term_string[:i] + term_string[end:]
-    return term_string
-
-
-def _get_category_per_id(biblical_terms_tree: ElementTree.ElementTree) -> Dict[str, Optional[str]]:
-    term_id_to_category_dict = {}
-    for e in biblical_terms_tree.iter(".//Term"):
-        term_id = e.attrib["Id"]
-        if term_id not in term_id_to_category_dict:
-            category_element = e.find("Category")
-            category = (
-                category_element.text if category_element is not None and category_element.text is not None else ""
-            )
-            term_id_to_category_dict[term_id] = category
-    return term_id_to_category_dict
+            text = MemoryText(text_id, [TextRow(text_id, kvp[0], kvp[1]) for kvp in glosses])
+            self._add_text(text)
diff --git a/machine/corpora/paratext_project_settings.py b/machine/corpora/paratext_project_settings.py
@@ -19,6 +19,7 @@ class ParatextProjectSettings:
     biblical_terms_list_type: str
     biblical_terms_project_name: str
     biblical_terms_file_name: str
+    language_code: Optional[str]
 
     def get_book_id(self, file_name: str) -> Optional[str]:
         """Returns None when the file name doesn't match the pattern of a book file name for the project."""

diff --git a/machine/corpora/paratext_project_settings_parser_base.py b/machine/corpora/paratext_project_settings_parser_base.py
@@ -81,9 +81,26 @@ def parse(self) -> ParatextProjectSettings:
         if len(parts) != 3:
             raise ValueError(
                 f"The BiblicalTermsListSetting element in Settings.xml in project {full_name}"
-                f" is not in the expected format (i.e., Major::BiblicalTerms.xml) but is {biblical_terms_list_setting}."
+                f" is not in the expected format (e.g., Major::BiblicalTerms.xml) but is {biblical_terms_list_setting}."
             )
+        language_code = None
+        language_iso_code_setting = settings_tree.getroot().findtext("LanguageIsoCode", "")
+        if language_iso_code_setting:
+            language_iso_code_setting_parts = settings_tree.getroot().findtext("LanguageIsoCode", "").split(":")
+            if language_iso_code_setting_parts:
+                language_code = language_iso_code_setting_parts[0]
 
         return ParatextProjectSettings(
-            name, full_name, encoding, versification, stylesheet, prefix, form, suffix, parts[0], parts[1], parts[2]
+            name,
+            full_name,
+            encoding,
+            versification,
+            stylesheet,
+            prefix,
+            form,
+            suffix,
+            parts[0],
+            parts[1],
+            parts[2],
+            language_code,
         )
diff --git a/machine/corpora/paratext_project_terms_parser_base.py b/machine/corpora/paratext_project_terms_parser_base.py
@@ -0,0 +1,150 @@
+import re
+from abc import ABC, abstractmethod
+from collections import defaultdict
+from importlib.resources import open_binary
+from typing import BinaryIO, Dict, List, Optional, Tuple, Union
+from xml.etree import ElementTree
+
+from machine.corpora.paratext_project_settings import ParatextProjectSettings
+from machine.corpora.paratext_project_settings_parser_base import ParatextProjectSettingsParserBase
+
+_PREDEFINED_TERMS_LIST_TYPES = ["Major", "All", "SilNt", "Pt6"]  # list types read from the packaged terms file
+_SUPPORTED_LANGUAGE_TERMS_LOCALIZATION_XMLS_PACKAGE = "machine.corpora"  # package passed to open_binary()
+_SUPPORTED_LANGUAGE_TERMS_LOCALIZATION_XMLS = {  # project language code -> localized terms resource name
+    "en": "BiblicalTermsEn.xml",
+    "es": "BiblicalTermsEs.xml",
+    "fr": "BiblicalTermsFr.xml",
+    "id": "BiblicalTermsId.xml",
+    "pt": "BiblicalTermsPt.xml",
+}
+_CONTENT_IN_BRACKETS_REGEX = re.compile(r"^\[(.+?)\]$")  # whole string wrapped in [...]; group 1 is the inside
+_NUMERICAL_INFORMATION_REGEX = re.compile(r"\s+\d+(\.\d+)*$")  # trailing whitespace + dotted number, e.g. " 1.2"
+
+
+class ParatextProjectTermsParserBase(ABC):
+    """Extract biblical term renderings and glosses from a Paratext project.
+
+    Subclasses provide access to the project's files by implementing
+    ``exists`` and ``open``.
+    """
+
+    def __init__(self, settings: Union[ParatextProjectSettings, ParatextProjectSettingsParserBase]) -> None:
+        """Store the project settings, parsing them first when a settings parser is given."""
+        self._settings: ParatextProjectSettings
+        if isinstance(settings, ParatextProjectSettingsParserBase):
+            self._settings = settings.parse()
+        else:
+            self._settings = settings
+
+    def parse(self, term_categories: List[str], use_term_glosses: bool = True) -> List[Tuple[str, List[str]]]:
+        """Return ``(term id, glosses)`` pairs for the project's biblical terms.
+
+        Args:
+            term_categories: Categories to keep. An empty list keeps every term; otherwise a term is
+                kept only when its category is known and listed here.
+            use_term_glosses: When True, terms that have no entry in TermRenderings.xml fall back to
+                the packaged localized glosses. Those are only available for the "Major" list type
+                and a project language code that has a packaged localization file.
+
+        Returns:
+            One tuple per term id, renderings first and then glosses-only terms. Newlines inside a
+            term id are replaced with ``&#xA``.
+        """
+        settings = self._settings
+
+        # Start from an empty mapping so the category lookups below are always defined, even when a
+        # "Project" list points at a terms file that is missing from the project.
+        term_id_to_category_dict: Dict[str, str] = {}
+        if settings.biblical_terms_list_type == "Project":
+            if self.exists(settings.biblical_terms_file_name):
+                with self.open(settings.biblical_terms_file_name) as stream:
+                    term_id_to_category_dict = _get_category_per_id(ElementTree.parse(stream))
+        elif settings.biblical_terms_list_type in _PREDEFINED_TERMS_LIST_TYPES:
+            with open_binary(
+                _SUPPORTED_LANGUAGE_TERMS_LOCALIZATION_XMLS_PACKAGE, settings.biblical_terms_file_name
+            ) as stream:
+                term_id_to_category_dict = _get_category_per_id(ElementTree.parse(stream))
+
+        # Renderings entered by the project's translators.
+        terms_renderings: Dict[str, List[str]] = defaultdict(list)
+        if self.exists("TermRenderings.xml"):
+            with self.open("TermRenderings.xml") as stream:
+                term_renderings_doc = ElementTree.parse(stream)
+            for term in term_renderings_doc.findall(".//TermRendering"):
+                term_id = term.attrib["Id"]
+                if _is_in_category(term_id, term_categories, term_id_to_category_dict):
+                    # findtext yields "" both when the element is absent and when it has no text.
+                    renderings = term.findtext("Renderings", "")
+                    terms_renderings[term_id.replace("\n", "&#xA")].extend(_get_glosses(renderings))
+
+        # Packaged localized glosses; the resource is only opened when it will actually be used.
+        terms_glosses: Dict[str, List[str]] = defaultdict(list)
+        resource_name: Optional[str] = None
+        if (
+            use_term_glosses
+            and settings.language_code is not None
+            and settings.biblical_terms_list_type == "Major"
+        ):
+            resource_name = _SUPPORTED_LANGUAGE_TERMS_LOCALIZATION_XMLS.get(settings.language_code)
+        if resource_name:
+            with open_binary(_SUPPORTED_LANGUAGE_TERMS_LOCALIZATION_XMLS_PACKAGE, resource_name) as stream:
+                terms_glosses_doc = ElementTree.parse(stream)
+            for elem in terms_glosses_doc.findall(".//Localization"):
+                term_id = elem.attrib["Id"]
+                if _is_in_category(term_id, term_categories, term_id_to_category_dict):
+                    terms_glosses[term_id.replace("\n", "&#xA")].extend(_get_glosses(elem.attrib["Gloss"]))
+
+        # Project renderings take precedence; glosses only fill in ids that have no rendering entry.
+        combined: Dict[str, List[str]] = dict(terms_renderings)
+        for term_id, glosses in terms_glosses.items():
+            combined.setdefault(term_id, glosses)
+        return [(term_id, list(glosses)) for term_id, glosses in combined.items()]
+
+    @abstractmethod
+    def exists(self, file_name: str) -> bool:
+        """Return whether the project contains a file named ``file_name``."""
+        ...
+
+    @abstractmethod
+    def open(self, file_name: str) -> BinaryIO:
+        """Open the project file named ``file_name`` as a binary stream."""
+        ...
+
+
+def _is_in_category(id: str, term_categories: List[str], term_id_to_category_dict: Dict[str, str]) -> bool:
+    """Tell whether the term ``id`` passes the category filter.
+
+    An empty ``term_categories`` accepts every term. Otherwise the term must
+    have a known category that is one of ``term_categories``.
+    """
+    if not term_categories:
+        return True
+    known_category = term_id_to_category_dict.get(id)
+    if known_category is None:
+        return False
+    return known_category in term_categories
+
+
+def _get_glosses(gloss: str) -> List[str]:
+    """Split a raw rendering/gloss string into a list of cleaned glosses.
+
+    The string is unwrapped when it is entirely enclosed in square brackets,
+    ``?`` and ``*`` are removed, ``/`` becomes a space, parenthesized and
+    bracketed sections are stripped, trailing numerical information is
+    dropped, and the remainder is split on ``||``, ``,`` and ``;``. Empty
+    pieces are discarded.
+    """
+    # If the entire gloss is surrounded by square brackets, keep only the content (capture group 1).
+    # Using the whole match here would leave the brackets in place, and the bracket stripping below
+    # would then erase the gloss completely.
+    match = _CONTENT_IN_BRACKETS_REGEX.match(gloss)
+    if match:
+        gloss = match.group(1)
+    gloss = gloss.replace("?", "")
+    gloss = gloss.replace("*", "")
+    gloss = gloss.replace("/", " ")
+    gloss = gloss.strip()
+    gloss = _strip_parens(gloss)
+    gloss = _strip_parens(gloss, left="[", right="]")
+    gloss = gloss.strip()
+    for match in _NUMERICAL_INFORMATION_REGEX.finditer(gloss):
+        gloss = gloss.replace(match.group(0), "")
+    glosses = re.split(r"\|\|", gloss)
+    glosses = [re.split(r"[,;]", g) for g in glosses]
+    glosses = [item.strip() for sublist in glosses for item in sublist if item.strip()]
+    return glosses
+
+
+def _strip_parens(term_string: str, left: str = "(", right: str = ")") -> str:
+    """Remove every balanced, outermost ``left``...``right`` section from ``term_string``.
+
+    The string is scanned from its end towards its start; unbalanced
+    delimiters are left untouched.
+    """
+    depth: int = 0
+    cut_end: int = -1
+    for index in reversed(range(len(term_string))):
+        char = term_string[index]
+        if char == right:
+            # Remember where the outermost closing delimiter ends.
+            if depth == 0:
+                cut_end = index + 1
+            depth += 1
+            continue
+        if char != left or depth <= 0:
+            continue
+        depth -= 1
+        if depth == 0:
+            # Only positions at or after ``index`` change, so the remaining (smaller) indices stay valid.
+            term_string = term_string[:index] + term_string[cut_end:]
+    return term_string
+
+
+def _get_category_per_id(biblical_terms_doc: ElementTree.ElementTree) -> Dict[str, str]:
+    """Map each ``Term`` id in the document to the text of its ``Category`` child.
+
+    The first occurrence of an id wins; a missing or empty category is stored
+    as an empty string.
+    """
+    categories_by_id: Dict[str, str] = {}
+    for term_elem in biblical_terms_doc.findall(".//Term"):
+        key = term_elem.attrib["Id"]
+        if key in categories_by_id:
+            continue
+        # findtext returns the default when the child is absent and "" when it has no text.
+        categories_by_id[key] = term_elem.findtext("Category", "")
+    return categories_by_id