-
-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
port commits from PR #233, use terms localizations
- Loading branch information
1 parent
bfb44d8
commit ff1c067
Showing
18 changed files
with
97,127 additions
and
97 deletions.
There are no files selected for viewing
Large diffs are not rendered by default.
Oops, something went wrong.
Large diffs are not rendered by default.
Oops, something went wrong.
Large diffs are not rendered by default.
Oops, something went wrong.
Large diffs are not rendered by default.
Oops, something went wrong.
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
from io import BytesIO | ||
from typing import BinaryIO, Dict | ||
|
||
from ..corpora.paratext_project_settings import ParatextProjectSettings | ||
from .paratext_project_terms_parser_base import ParatextProjectTermsParserBase | ||
|
||
|
||
class MemoryParatextProjectTermsParser(ParatextProjectTermsParserBase):
    """Terms parser whose project files are held in an in-memory mapping.

    ``files`` maps a file name to that file's text content.
    """

    def __init__(self, settings: ParatextProjectSettings, files: Dict[str, str]) -> None:
        super().__init__(settings)

        self.files = files

    def exists(self, file_name: str) -> bool:
        """Return whether ``file_name`` is a key of the in-memory mapping."""
        return file_name in self.files

    def open(self, file_name: str) -> BinaryIO:
        """Return a binary stream over the UTF-8 encoding of the stored text.

        Raises ``KeyError`` if ``file_name`` is not in the mapping.
        """
        content = self.files[file_name]
        encoded = content.encode("utf-8")
        return BytesIO(encoded)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,110 +1,28 @@ | ||
import re | ||
from typing import Dict, List, Optional | ||
from xml.etree import ElementTree | ||
from typing import List, Tuple | ||
from zipfile import ZipFile | ||
|
||
from .corpora_utils import get_entry | ||
from ..utils.typeshed import StrPath | ||
from .dictionary_text_corpus import DictionaryTextCorpus | ||
from .memory_text import MemoryText | ||
from .text_row import TextRow | ||
from .zip_paratext_project_settings_parser import ZipParatextProjectSettingsParser | ||
|
||
_PREDEFINED_TERMS_LIST_TYPES = ["Major", "All", "SilNt", "Pt6"] | ||
from .zip_paratext_project_terms_parser import ZipParatextProjectTermsParser | ||
|
||
|
||
class ParatextBackupTermsCorpus(DictionaryTextCorpus): | ||
def __init__(self, filename: str, term_categories: List[str]) -> None: | ||
with ZipFile(filename, "r") as archive: | ||
terms_file_entry = get_entry(archive, "TermRenderings.xml") | ||
if terms_file_entry is None: | ||
return | ||
settings_parser = ZipParatextProjectSettingsParser(archive) | ||
settings = settings_parser.parse() | ||
|
||
with archive.open(terms_file_entry) as key_terms_file: | ||
term_renderings_tree = ElementTree.parse(key_terms_file) | ||
|
||
biblical_terms_file_entry = get_entry(archive, settings.biblical_terms_file_name) | ||
if settings.biblical_terms_list_type == "Project": | ||
if biblical_terms_file_entry is not None: | ||
with archive.open(biblical_terms_file_entry) as key_terms_file: | ||
biblical_terms_tree = ElementTree.parse(key_terms_file) | ||
term_id_to_category_dict = _get_category_per_id(biblical_terms_tree) | ||
else: | ||
with open("BiblicalTerms.xml", "rb") as key_terms_file: | ||
biblical_terms_tree = ElementTree.parse(key_terms_file) | ||
term_id_to_category_dict = _get_category_per_id(biblical_terms_tree) | ||
elif settings.biblical_terms_list_type in _PREDEFINED_TERMS_LIST_TYPES: | ||
with open(settings.biblical_terms_file_name, "rb") as key_terms_file: | ||
biblical_terms_tree = ElementTree.parse(key_terms_file) | ||
term_id_to_category_dict = _get_category_per_id(biblical_terms_tree) | ||
else: | ||
term_id_to_category_dict = {} | ||
def __init__(self, filename: StrPath, term_categories: List[str], use_term_glosses: bool = True) -> None: | ||
super().__init__() | ||
|
||
terms_elements = term_renderings_tree.iter(".//TermRendering") | ||
with ZipFile(filename, "r") as archive: | ||
settings = ZipParatextProjectSettingsParser(archive).parse() | ||
glosses: List[Tuple[str, List[str]]] = ZipParatextProjectTermsParser(archive, settings).parse( | ||
term_categories, use_term_glosses | ||
) | ||
text_id = ( | ||
f"{settings.biblical_terms_list_type}:" | ||
f"{settings.biblical_terms_project_name}:" | ||
f"{settings.biblical_terms_file_name}" | ||
) | ||
rows: List[TextRow] = [] | ||
for e in terms_elements: | ||
term_id = e.attrib["Id"] | ||
category = term_id_to_category_dict.get(term_id, "") | ||
if term_categories and (category == "" or category not in term_categories): | ||
continue | ||
term_id = term_id.replace("\n", "
") | ||
rendering = e.findtext("Renderings", "") | ||
renderings = _get_renderings(rendering) | ||
rows.append(TextRow(text_id, term_id, segment=renderings)) | ||
text = MemoryText(text_id, rows) | ||
self._add_text(text) | ||
|
||
|
||
def _get_renderings(rendering: str) -> List[str]: | ||
# If entire term rendering is surrounded in square brackets, remove them | ||
match = re.match(r"^\[(.+?)\]$", rendering) | ||
if match: | ||
rendering = match.group(1) | ||
rendering = rendering.replace("?", "") | ||
rendering = rendering.replace("*", "") | ||
rendering = rendering.replace("/", " ") | ||
rendering = rendering.strip() | ||
rendering = _strip_parens(rendering) | ||
rendering = _strip_parens(rendering, left="[", right="]") | ||
rx = re.compile(r"\s+\d+(\.\d+)*$") | ||
for match in rx.findall(rendering): | ||
rendering = rendering.replace(match, "") | ||
glosses = re.split(r"\|\|", rendering) | ||
glosses = list(set(g.strip() for g in glosses if g.strip() != "")) | ||
return glosses | ||
|
||
|
||
def _strip_parens(term_string: str, left: str = "(", right: str = ")") -> str: | ||
parens = 0 | ||
end = -1 | ||
for i in range(len(term_string) - 1, -1, -1): | ||
c = term_string[i] | ||
if c == right: | ||
if parens == 0: | ||
end = i + 1 | ||
parens += 1 | ||
elif c == left: | ||
if parens > 0: | ||
parens -= 1 | ||
if parens == 0: | ||
term_string = term_string[:i] + term_string[end:] | ||
return term_string | ||
|
||
|
||
def _get_category_per_id(biblical_terms_tree: ElementTree.ElementTree) -> Dict[str, Optional[str]]: | ||
term_id_to_category_dict = {} | ||
for e in biblical_terms_tree.iter(".//Term"): | ||
term_id = e.attrib["Id"] | ||
if term_id not in term_id_to_category_dict: | ||
category_element = e.find("Category") | ||
category = ( | ||
category_element.text if category_element is not None and category_element.text is not None else "" | ||
) | ||
term_id_to_category_dict[term_id] = category | ||
return term_id_to_category_dict | ||
text = MemoryText(text_id, [TextRow(text_id, kvp[0], kvp[1]) for kvp in glosses]) | ||
self._add_text(text) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,150 @@ | ||
import re | ||
from abc import ABC, abstractmethod | ||
from collections import defaultdict | ||
from importlib.resources import open_binary | ||
from typing import BinaryIO, Dict, List, Optional, Tuple, Union | ||
from xml.etree import ElementTree | ||
|
||
from machine.corpora.paratext_project_settings import ParatextProjectSettings | ||
from machine.corpora.paratext_project_settings_parser_base import ParatextProjectSettingsParserBase | ||
|
||
_PREDEFINED_TERMS_LIST_TYPES = ["Major", "All", "SilNt", "Pt6"] | ||
_SUPPORTED_LANGUAGE_TERMS_LOCALIZATION_XMLS_PACKAGE = "machine.corpora" | ||
_SUPPORTED_LANGUAGE_TERMS_LOCALIZATION_XMLS = { | ||
"en": "BiblicalTermsEn.xml", | ||
"es": "BiblicalTermsEs.xml", | ||
"fr": "BiblicalTermsFr.xml", | ||
"id": "BiblicalTermsId.xml", | ||
"pt": "BiblicalTermsPt.xml", | ||
} | ||
_CONTENT_IN_BRACKETS_REGEX = re.compile(r"^\[(.+?)\]$") | ||
_NUMERICAL_INFORMATION_REGEX = re.compile(r"\s+\d+(\.\d+)*$") | ||
|
||
|
||
class ParatextProjectTermsParserBase(ABC):
    """Base class for parsers that extract key-term glosses from a Paratext project.

    Subclasses provide file access through :meth:`exists` and :meth:`open`;
    :meth:`parse` reads the project's ``TermRenderings.xml``, optionally
    supplements it with a bundled localized terms file, and returns the
    glosses grouped by term id.
    """

    def __init__(self, settings: Union[ParatextProjectSettings, ParatextProjectSettingsParserBase]) -> None:
        """Store the project settings, parsing them first if a settings parser is given."""
        self._settings: ParatextProjectSettings
        if isinstance(settings, ParatextProjectSettingsParserBase):
            self._settings = settings.parse()
        else:
            self._settings = settings

    def parse(self, term_categories: List[str], use_term_glosses: bool = True) -> List[Tuple[str, List[str]]]:
        """Collect the glosses of the project's biblical terms.

        Args:
            term_categories: Categories to keep. An empty list keeps every term.
            use_term_glosses: When true, glosses from the bundled localized
                terms file are used for terms that have no entry in
                ``TermRenderings.xml``.

        Returns:
            A list of ``(term id, glosses)`` tuples. Terms found in
            ``TermRenderings.xml`` come first, followed by terms that only
            have a localized gloss.
        """
        settings = self._settings

        # Start from an empty mapping so the name is bound on every path,
        # including a "Project" list whose terms file does not exist
        # (that path previously raised UnboundLocalError further down).
        term_id_to_category_dict: Dict[str, str] = {}
        if settings.biblical_terms_list_type == "Project":
            if self.exists(settings.biblical_terms_file_name):
                with self.open(settings.biblical_terms_file_name) as stream:
                    term_id_to_category_dict = _get_category_per_id(ElementTree.parse(stream))
        elif settings.biblical_terms_list_type in _PREDEFINED_TERMS_LIST_TYPES:
            # Predefined lists are read from the package resources, not from the project.
            with open_binary(
                _SUPPORTED_LANGUAGE_TERMS_LOCALIZATION_XMLS_PACKAGE, settings.biblical_terms_file_name
            ) as stream:
                term_id_to_category_dict = _get_category_per_id(ElementTree.parse(stream))

        # Localized glosses are only loaded for the "Major" list and only for
        # languages that have a bundled localization file.
        terms_glosses_doc: Optional[ElementTree.ElementTree] = None
        resource_name = None
        if settings.language_code is not None:
            resource_name = _SUPPORTED_LANGUAGE_TERMS_LOCALIZATION_XMLS.get(settings.language_code)
        if resource_name and settings.biblical_terms_list_type == "Major":
            with open_binary(_SUPPORTED_LANGUAGE_TERMS_LOCALIZATION_XMLS_PACKAGE, resource_name) as stream:
                terms_glosses_doc = ElementTree.parse(stream)

        term_renderings_doc: Optional[ElementTree.ElementTree] = None
        if self.exists("TermRenderings.xml"):
            with self.open("TermRenderings.xml") as stream:
                term_renderings_doc = ElementTree.parse(stream)

        # NOTE(review): the replacement literal below was mangled in the text
        # this block was recovered from (it appeared as a raw line break inside
        # the quotes). It is restored here as the character reference "&#xA";
        # confirm against the repository.
        terms_renderings: Dict[str, List[str]] = defaultdict(list)
        if term_renderings_doc is not None:
            for term in term_renderings_doc.findall(".//TermRendering"):
                term_id = term.attrib["Id"]
                if not _is_in_category(term_id, term_categories, term_id_to_category_dict):
                    continue
                renderings = term.find("Renderings")
                text = renderings.text if renderings is not None and renderings.text is not None else ""
                terms_renderings[term_id.replace("\n", "&#xA")].extend(_get_glosses(text))

        terms_glosses: Dict[str, List[str]] = defaultdict(list)
        if terms_glosses_doc is not None and use_term_glosses:
            for localization in terms_glosses_doc.findall(".//Localization"):
                term_id = localization.attrib["Id"]
                if not _is_in_category(term_id, term_categories, term_id_to_category_dict):
                    continue
                terms_glosses[term_id.replace("\n", "&#xA")].extend(_get_glosses(localization.attrib["Gloss"]))

        # Project renderings take precedence; a localized gloss is only used
        # for a term id that has no rendering entry at all.
        combined: Dict[str, List[str]] = dict(terms_renderings)
        for term_id, glosses in terms_glosses.items():
            combined.setdefault(term_id, glosses)
        return [(term_id, list(glosses)) for term_id, glosses in combined.items()]

    @abstractmethod
    def exists(self, file_name: str) -> bool:
        """Return whether the project contains a file named ``file_name``."""
        ...

    @abstractmethod
    def open(self, file_name: str) -> BinaryIO:
        """Open the project file named ``file_name`` as a binary stream."""
        ...
|
||
|
||
def _is_in_category(id: str, term_categories: List[str], term_id_to_category_dict: Dict[str, str]) -> bool:
    """Return whether the term ``id`` passes the category filter.

    An empty ``term_categories`` accepts every term. Otherwise the term is
    accepted only if it has a known category and that category is listed.
    """
    if not term_categories:
        return True
    # A term without a known category yields None, which is never listed.
    return term_id_to_category_dict.get(id) in term_categories
|
||
|
||
def _get_glosses(gloss: str) -> List[str]:
    """Split a raw rendering/gloss string into a list of cleaned glosses.

    The string is unwrapped if it is entirely enclosed in square brackets,
    the characters ``?`` and ``*`` are dropped, ``/`` becomes a space,
    parenthesised and bracketed segments are removed, trailing numerical
    information is removed, and the remainder is split on ``||``, ``,`` and
    ``;``. Empty pieces are discarded.
    """
    bracketed = _CONTENT_IN_BRACKETS_REGEX.match(gloss)
    if bracketed:
        # Keep only the captured content. group(0) would keep the brackets,
        # and the bracket stripping below would then delete the whole gloss.
        gloss = bracketed.group(1)
    gloss = gloss.replace("?", "")
    gloss = gloss.replace("*", "")
    gloss = gloss.replace("/", " ")
    gloss = gloss.strip()
    gloss = _strip_parens(gloss)
    gloss = _strip_parens(gloss, left="[", right="]")
    gloss = gloss.strip()
    for numerical in _NUMERICAL_INFORMATION_REGEX.finditer(gloss):
        gloss = gloss.replace(numerical.group(0), "")
    pieces = [piece for part in re.split(r"\|\|", gloss) for piece in re.split(r"[,;]", part)]
    return [piece.strip() for piece in pieces if piece.strip()]
|
||
|
||
def _strip_parens(term_string: str, left: str = "(", right: str = ")") -> str:
    """Remove every balanced ``left``...``right`` group from ``term_string``.

    Nested groups are removed together with their outermost group. Text
    outside the groups, including surrounding whitespace, is left untouched.
    """
    depth: int = 0
    end: int = -1
    # Scan right to left: deleting a group only affects positions at or after
    # the current index, so the remaining indices stay valid.
    for i in range(len(term_string) - 1, -1, -1):
        c = term_string[i]
        if c == right:
            if depth == 0:
                # Remember where the outermost group ends.
                end = i + 1
            depth += 1
        elif c == left and depth > 0:
            depth -= 1
            if depth == 0:
                term_string = term_string[:i] + term_string[end:]
    return term_string
|
||
|
||
def _get_category_per_id(biblical_terms_doc: ElementTree.ElementTree) -> Dict[str, str]:
    """Map each term id in a biblical terms document to its category.

    Every ``Term`` element is visited. The category is the text of its
    ``Category`` child, or ``""`` when the child is missing or empty. If an
    id occurs more than once, the first occurrence wins.
    """
    term_id_to_category_dict: Dict[str, str] = {}

    for term in biblical_terms_doc.findall(".//Term"):
        category = term.find("Category")
        category_text = category.text if category is not None and category.text is not None else ""
        # setdefault keeps the category of the first occurrence of an id.
        term_id_to_category_dict.setdefault(term.attrib["Id"], category_text)

    return term_id_to_category_dict
Oops, something went wrong.