From 3e97c10356294cc1cbf56c80f0aff3d313414231 Mon Sep 17 00:00:00 2001 From: mshannon-sil Date: Thu, 28 Mar 2024 17:43:16 -0400 Subject: [PATCH] port remaining changes from Machine PR #163, clean up code --- .../file_paratext_project_settings_parser.py | 7 +- .../corpora/paratext_backup_terms_corpus.py | 92 +++++++-------- machine/corpora/paratext_project_settings.py | 107 +++++++----------- .../paratext_project_settings_parser_base.py | 8 +- .../zip_paratext_project_settings_parser.py | 11 +- ...p_paratext_project_settings_parser_base.py | 5 +- .../corpora/test_paratext_project_settings.py | 59 ++++++++++ tests/corpora/test_usfm_verse_text_updater.py | 34 +++--- 8 files changed, 169 insertions(+), 154 deletions(-) create mode 100644 tests/corpora/test_paratext_project_settings.py diff --git a/machine/corpora/file_paratext_project_settings_parser.py b/machine/corpora/file_paratext_project_settings_parser.py index 2c76603..62d5872 100644 --- a/machine/corpora/file_paratext_project_settings_parser.py +++ b/machine/corpora/file_paratext_project_settings_parser.py @@ -1,5 +1,5 @@ from pathlib import Path -from typing import Any, BinaryIO, Optional +from typing import BinaryIO, Optional from ..utils.typeshed import StrPath from .paratext_project_settings_parser_base import ParatextProjectSettingsParserBase @@ -10,11 +10,6 @@ class FileParatextProjectSettingsParser(ParatextProjectSettingsParserBase): def __init__(self, project_dir: StrPath) -> None: self._project_dir = Path(project_dir) - def __enter__(self) -> "FileParatextProjectSettingsParser": - return self - - def __exit__(self, type: Any, value: Any, traceback: Any) -> None: ... - def create_stylesheet(self, file_name: StrPath) -> UsfmStylesheet: custom_stylesheet_filename = self._project_dir / file_name return UsfmStylesheet( diff --git a/machine/corpora/paratext_backup_terms_corpus.py b/machine/corpora/paratext_backup_terms_corpus.py index 35ac467..79bd141 100644 --- a/machine/corpora/paratext_backup_terms_corpus.py +++ b/machine/corpora/paratext_backup_terms_corpus.py @@ -9,10 +9,11 @@ from .text_row import TextRow from .zip_paratext_project_settings_parser import ZipParatextProjectSettingsParser +_PREDEFINED_TERMS_LIST_TYPES = ["Major", "All", "SilNt", "Pt6"] + class ParatextBackupTermsCorpus(DictionaryTextCorpus): def __init__(self, filename: str, term_categories: List[str]) -> None: - self._predefined_terms_list_types = ["Major", "All", "SilNt", "Pt6"] rows: List[TextRow] = [] with ZipFile(filename, "r") as archive: terms_file_entry = get_entry(archive, "TermRenderings.xml") @@ -25,10 +26,10 @@ def __init__(self, filename: str, term_categories: List[str]) -> None: term_renderings_tree = ET.parse(key_terms_file) biblical_terms_file_entry = get_entry(archive, settings.biblical_terms_file_name) - if settings.biblical_terms_list_type in self._predefined_terms_list_types: + if settings.biblical_terms_list_type in _PREDEFINED_TERMS_LIST_TYPES: with open(settings.biblical_terms_file_name, "rb") as key_terms_file: biblical_terms_tree = ET.parse(key_terms_file) - term_id_to_category_dict = self._get_category_per_id(biblical_terms_tree) + term_id_to_category_dict = _get_category_per_id(biblical_terms_tree) elif ( settings.biblical_terms_list_type == "Project" and settings.biblical_terms_project_name == settings.name @@ -36,7 +37,7 @@ def __init__(self, filename: str, term_categories: List[str]) -> None: ): with archive.open(biblical_terms_file_entry) as key_terms_file: biblical_terms_tree = ET.parse(key_terms_file) - term_id_to_category_dict = self._get_category_per_id(biblical_terms_tree) + term_id_to_category_dict = _get_category_per_id(biblical_terms_tree) else: term_id_to_category_dict = {} @@ -53,51 +54,52 @@ def __init__(self, filename: str, term_categories: List[str]) -> None: continue term_id = term_id.replace("\n", " ") rendering = e.findtext("Renderings", "") - renderings = self._get_renderings(rendering) + renderings = _get_renderings(rendering) rows.append(TextRow(text_id, term_id, segment=renderings)) text = MemoryText(text_id, rows) self._add_text(text) - def _get_renderings(self, rendering: str) -> List[str]: - # If entire term rendering is surrounded in square brackets, remove them - match = re.match(r"^\[(.+?)\]$", rendering) - if match: - rendering = match.group(1) - rendering = rendering.replace("?", "") - rendering = rendering.replace("*", "") - rendering = rendering.replace("/", " ") - rendering = rendering.strip() - rendering = self._strip_parens(rendering) - rendering = self._strip_parens(rendering, left="[", right="]") - rx = re.compile(r"\s+\d+(\.\d+)*$") - for match in rx.findall(rendering): - rendering = rendering.replace(match, "") - glosses = re.split(r"\|\|", rendering) - glosses = list(set(g.strip() for g in glosses if g.strip() != "")) - return glosses - def _strip_parens(self, term_string: str, left: str = "(", right: str = ")") -> str: - parens = 0 - end = -1 - for i in range(len(term_string) - 1, -1, -1): - c = term_string[i] - if c == right: +def _get_renderings(rendering: str) -> List[str]: + # If entire term rendering is surrounded in square brackets, remove them + match = re.match(r"^\[(.+?)\]$", rendering) + if match: + rendering = match.group(1) + rendering = rendering.replace("?", "") + rendering = rendering.replace("*", "") + rendering = rendering.replace("/", " ") + rendering = rendering.strip() + rendering = _strip_parens(rendering) + rendering = _strip_parens(rendering, left="[", right="]") + rx = re.compile(r"\s+\d+(\.\d+)*$") + for match in rx.findall(rendering): + rendering = rendering.replace(match, "") + glosses = re.split(r"\|\|", rendering) + glosses = list(set(g.strip() for g in glosses if g.strip() != "")) + return glosses + + +def _strip_parens(term_string: str, left: str = "(", right: str = ")") -> str: + parens = 0 + end = -1 + for i in range(len(term_string) - 1, -1, -1): + c = term_string[i] + if c == right: + if parens == 0: + end = i + 1 + parens += 1 + elif c == left: + if parens > 0: + parens -= 1 if parens == 0: - end = i + 1 - parens += 1 - elif c == left: - if parens > 0: - parens -= 1 - if parens == 0: - term_string = term_string[:i] + term_string[end:] - return term_string + term_string = term_string[:i] + term_string[end:] + return term_string - def _get_category_per_id(self, biblical_terms_tree: ET.ElementTree) -> Dict[str, Optional[str]]: - term_id_to_category_dict = {} - for e in biblical_terms_tree.iter(".//Term"): - category_element = e.find("Category") - category = ( - category_element.text if category_element is not None and category_element.text is not None else "" - ) - term_id_to_category_dict[e.attrib["Id"]] = category - return term_id_to_category_dict + +def _get_category_per_id(biblical_terms_tree: ET.ElementTree) -> Dict[str, Optional[str]]: + term_id_to_category_dict = {} + for e in biblical_terms_tree.iter(".//Term"): + category_element = e.find("Category") + category = category_element.text if category_element is not None and category_element.text is not None else "" + term_id_to_category_dict[e.attrib["Id"]] = category + return term_id_to_category_dict diff --git a/machine/corpora/paratext_project_settings.py b/machine/corpora/paratext_project_settings.py index 3915f5e..7759273 100644 --- a/machine/corpora/paratext_project_settings.py +++ b/machine/corpora/paratext_project_settings.py @@ -1,76 +1,45 @@ from abc import ABC +from dataclasses import dataclass +from ..scripture.canon import book_id_to_number from ..scripture.verse_ref import Versification from .usfm_stylesheet import UsfmStylesheet +@dataclass class ParatextProjectSettings(ABC): - def __init__( - self, - name: str, - full_name: str, - encoding: str, - versification: Versification, - stylesheet: UsfmStylesheet, - file_name_prefix: str, - file_name_form: str, - file_name_suffix: str, - biblical_terms_list_type: str, - biblical_terms_project_name: str, - biblical_terms_file_name: str, - ) -> None: - self._name = name - self._full_name = full_name - self._encoding = encoding - self._versification = versification - self._stylesheet = stylesheet - self._file_name_prefix = file_name_prefix - self._file_name_form = file_name_form - self._file_name_suffix = file_name_suffix - self._biblical_terms_list_type = biblical_terms_list_type - self._biblical_terms_project_name = biblical_terms_project_name - self._biblical_terms_file_name = biblical_terms_file_name - - @property - def name(self) -> str: - return self._name - - @property - def full_name(self) -> str: - return self._full_name - - @property - def encoding(self) -> str: - return self._encoding - - @property - def versification(self) -> Versification: - return self._versification - - @property - def stylesheet(self) -> UsfmStylesheet: - return self._stylesheet - - @property - def file_name_prefix(self) -> str: - return self._file_name_prefix - - @property - def file_name_form(self) -> str: - return self._file_name_form - - @property - def file_name_suffix(self) -> str: - return self._file_name_suffix - - @property - def biblical_terms_list_type(self) -> str: - return self._biblical_terms_list_type - - @property - def biblical_terms_project_name(self) -> str: - return self._biblical_terms_project_name - - @property - def biblical_terms_file_name(self) -> str: - return self._biblical_terms_file_name + name: str + full_name: str + encoding: str + versification: Versification + stylesheet: UsfmStylesheet + file_name_prefix: str + file_name_form: str + file_name_suffix: str + biblical_terms_list_type: str + biblical_terms_project_name: str + biblical_terms_file_name: str + + def get_book_file_name(self, book_id: str) -> str: + if self.file_name_form == "MAT": + book_part = book_id + elif self.file_name_form in ("40", "41"): + book_part = _get_book_file_name_digits(book_id) + else: + book_part = _get_book_file_name_digits(book_id) + book_id + return self.file_name_prefix + book_part + self.file_name_suffix + + +def _get_book_file_name_digits(book_id: str) -> str: + book_num = book_id_to_number(book_id) + if book_num < 10: + return f"0{book_num}" + if book_num < 40: + return str(book_num) + if book_num < 100: + return str(book_num + 1) + if book_num < 110: + return f"A{book_num - 100}" + if book_num < 120: + return f"B{book_num - 110}" + return f"C{book_num - 120}" diff --git a/machine/corpora/paratext_project_settings_parser_base.py b/machine/corpora/paratext_project_settings_parser_base.py index 5d95979..7388182 100644 --- a/machine/corpora/paratext_project_settings_parser_base.py +++ b/machine/corpora/paratext_project_settings_parser_base.py @@ -1,6 +1,6 @@ import xml.etree.ElementTree as ET from abc import ABC, abstractmethod -from typing import Any, BinaryIO +from typing import BinaryIO from ..scripture.verse_ref import Versification from ..utils.string_utils import parse_integer @@ -11,12 +11,6 @@ class ParatextProjectSettingsParserBase(ABC): - @abstractmethod - def __enter__(self) -> "ParatextProjectSettingsParserBase": ... - - @abstractmethod - def __exit__(self, type: Any, value: Any, traceback: Any) -> None: ... - @abstractmethod def exists(self, file_name: str) -> bool: ... diff --git a/machine/corpora/zip_paratext_project_settings_parser.py b/machine/corpora/zip_paratext_project_settings_parser.py index b568b64..54b5808 100644 --- a/machine/corpora/zip_paratext_project_settings_parser.py +++ b/machine/corpora/zip_paratext_project_settings_parser.py @@ -1,5 +1,5 @@ from io import BytesIO -from typing import Any, BinaryIO, Union +from typing import BinaryIO, Optional from zipfile import ZipFile from .zip_paratext_project_settings_parser_base import ZipParatextProjectSettingsParserBase @@ -9,21 +9,16 @@ class ZipParatextProjectSettingsParser(ZipParatextProjectSettingsParserBase): def __init__(self, archive: ZipFile) -> None: self._archive = archive - def __enter__(self) -> "ZipParatextProjectSettingsParser": - return self - - def __exit__(self, type: Any, value: Any, traceback: Any) -> None: ... - def exists(self, file_name: str) -> bool: return file_name in self._archive.namelist() - def find(self, extension: str) -> Union[str, None]: + def find(self, extension: str) -> Optional[str]: for entry in self._archive.namelist(): if entry.endswith(extension): return entry return None - def open(self, file_name: str) -> Union[BinaryIO, None]: + def open(self, file_name: str) -> Optional[BinaryIO]: if file_name in self._archive.namelist(): return BytesIO(self._archive.read(file_name)) return None diff --git a/machine/corpora/zip_paratext_project_settings_parser_base.py b/machine/corpora/zip_paratext_project_settings_parser_base.py index 462248c..42c656f 100644 --- a/machine/corpora/zip_paratext_project_settings_parser_base.py +++ b/machine/corpora/zip_paratext_project_settings_parser_base.py @@ -1,4 +1,5 @@ from tempfile import TemporaryFile +from typing import Optional from .paratext_project_settings_parser_base import ParatextProjectSettingsParserBase from .usfm_stylesheet import UsfmStylesheet @@ -7,12 +8,12 @@ class ZipParatextProjectSettingsParserBase(ParatextProjectSettingsParserBase): def create_stylesheet(self, file_name: str) -> UsfmStylesheet: with TemporaryFile() as stylesheet_temp_file, TemporaryFile() as custom_stylesheet_temp_file: - stylesheet_path = file_name + stylesheet_path: str = file_name if self.exists(file_name): with self.open(file_name) as source: stylesheet_temp_file.write(source.read()) stylesheet_path = stylesheet_temp_file.name - custom_stylesheet_path = None + custom_stylesheet_path: Optional[str] = None if self.exists("custom.sty"): with self.open("custom.sty") as source: custom_stylesheet_temp_file.write(source.read()) diff --git a/tests/corpora/test_paratext_project_settings.py b/tests/corpora/test_paratext_project_settings.py new file mode 100644 index 0000000..f30f836 --- /dev/null +++ b/tests/corpora/test_paratext_project_settings.py @@ -0,0 +1,59 @@ +from machine.corpora import UsfmStylesheet +from machine.corpora.paratext_project_settings import ParatextProjectSettings +from machine.scripture import ENGLISH_VERSIFICATION + + +def test_get_book_file_name_book_num() -> None: + settings = _create_settings("41") + assert settings.get_book_file_name("MRK") == "PROJ42.SFM" + + +def test_get_book_file_name_book_num_book_id() -> None: + settings = _create_settings("41MAT") + assert settings.get_book_file_name("MRK") == "PROJ42MRK.SFM" + + +def test_get_book_file_name_book_id() -> None: + settings = _create_settings("MAT") + assert settings.get_book_file_name("MRK") == "PROJMRK.SFM" + + +def test_get_book_file_name_book_num_double_digit() -> None: + settings = _create_settings("41") + assert settings.get_book_file_name("GEN") == "PROJ01.SFM" + + +def test_get_book_file_name_book_num_xxg() -> None: + settings = _create_settings("41") + assert settings.get_book_file_name("XXG") == "PROJ100.SFM" + + +def test_get_book_file_name_book_num_prefix_a() -> None: + settings = _create_settings("41") + assert settings.get_book_file_name("FRT") == "PROJA0.SFM" + + +def test_get_book_file_name_book_num_prefix_b() -> None: + settings = _create_settings("41") + assert settings.get_book_file_name("TDX") == "PROJB0.SFM" + + +def test_get_book_file_name_book_num_prefix_c() -> None: + settings = _create_settings("41") + assert settings.get_book_file_name("3MQ") == "PROJC0.SFM" + + +def _create_settings(file_name_form: str) -> ParatextProjectSettings: + return ParatextProjectSettings( + "Name", + "Name", + "utf-8", + ENGLISH_VERSIFICATION, + UsfmStylesheet("usfm.sty"), + "PROJ", + file_name_form, + ".SFM", + "Major", + "", + "BiblicalTerms.xml", + ) diff --git a/tests/corpora/test_usfm_verse_text_updater.py b/tests/corpora/test_usfm_verse_text_updater.py index 58c760c..829af78 100644 --- a/tests/corpora/test_usfm_verse_text_updater.py +++ b/tests/corpora/test_usfm_verse_text_updater.py @@ -2,15 +2,15 @@ from testutils.corpora_test_helpers import USFM_TEST_PROJECT_PATH -from machine.corpora.usfm_parser import parse_usfm +from machine.corpora import parse_usfm from machine.corpora.usfm_verse_text_updater import UsfmVerseTextUpdater -from machine.scripture.verse_ref import VerseRef, Versification, VersificationType +from machine.scripture import ENGLISH_VERSIFICATION, VerseRef def test_get_usfm_char_style() -> None: rows = [ ( - [VerseRef.from_string("MAT 1:1", Versification.get_builtin(VersificationType.ENGLISH))], + [VerseRef.from_string("MAT 1:1", ENGLISH_VERSIFICATION)], str("First verse of the first chapter."), ) ] @@ -34,7 +34,7 @@ def test_get_usfm_strip_all_text() -> None: def test_get_usfm_notes() -> None: rows = [ ( - [VerseRef.from_string("MAT 2:1", Versification.get_builtin(VersificationType.ENGLISH))], + [VerseRef.from_string("MAT 2:1", ENGLISH_VERSIFICATION)], str("First verse of the second chapter."), ) ] @@ -45,7 +45,7 @@ def test_get_usfm_notes() -> None: def test_get_usfm_row_verse_segment() -> None: rows = [ ( - [VerseRef.from_string("MAT 2:1a", Versification.get_builtin(VersificationType.ENGLISH))], + [VerseRef.from_string("MAT 2:1a", ENGLISH_VERSIFICATION)], str("First verse of the second chapter."), ) ] @@ -56,7 +56,7 @@ def test_get_usfm_row_verse_segment() -> None: def test_get_usfm_verse_segment() -> None: rows = [ ( - [VerseRef.from_string("MAT 2:7", Versification.get_builtin(VersificationType.ENGLISH))], + [VerseRef.from_string("MAT 2:7", ENGLISH_VERSIFICATION)], str("Seventh verse of the second chapter."), ) ] @@ -67,7 +67,7 @@ def test_get_usfm_verse_segment() -> None: def test_get_usfm_multiple_paras() -> None: rows = [ ( - [VerseRef.from_string("MAT 1:2", Versification.get_builtin(VersificationType.ENGLISH))], + [VerseRef.from_string("MAT 1:2", ENGLISH_VERSIFICATION)], str("Second verse of the first chapter."), ) ] @@ -78,7 +78,7 @@ def test_get_usfm_multiple_paras() -> None: def test_get_usfm_table() -> None: rows = [ ( - [VerseRef.from_string("MAT 2:9", Versification.get_builtin(VersificationType.ENGLISH))], + [VerseRef.from_string("MAT 2:9", ENGLISH_VERSIFICATION)], str("Ninth verse of the second chapter."), ) ] @@ -90,8 +90,8 @@ def test_get_usfm_range_single_row_multiple_verses() -> None: rows = [ ( [ - VerseRef.from_string("MAT 2:11", Versification.get_builtin(VersificationType.ENGLISH)), - VerseRef.from_string("MAT 2:12", Versification.get_builtin(VersificationType.ENGLISH)), + VerseRef.from_string("MAT 2:11", ENGLISH_VERSIFICATION), + VerseRef.from_string("MAT 2:12", ENGLISH_VERSIFICATION), ], str("Eleventh verse of the second chapter. Twelfth verse of the second chapter."), ) @@ -103,7 +103,7 @@ def test_get_usfm_range_single_row_multiple_verses() -> None: def test_get_usfm_range_single_row_single_verse() -> None: rows = [ ( - [VerseRef.from_string("MAT 2:11", Versification.get_builtin(VersificationType.ENGLISH))], + [VerseRef.from_string("MAT 2:11", ENGLISH_VERSIFICATION)], str("Eleventh verse of the second chapter."), ) ] @@ -114,11 +114,11 @@ def test_get_usfm_range_single_row_single_verse() -> None: def test_get_usfm_range_multiple_rows_single_verse() -> None: rows = [ ( - [VerseRef.from_string("MAT 2:11", Versification.get_builtin(VersificationType.ENGLISH))], + [VerseRef.from_string("MAT 2:11", ENGLISH_VERSIFICATION)], str("Eleventh verse of the second chapter."), ), ( - [VerseRef.from_string("MAT 2:12", Versification.get_builtin(VersificationType.ENGLISH))], + [VerseRef.from_string("MAT 2:12", ENGLISH_VERSIFICATION)], str("Twelfth verse of the second chapter."), ), ] @@ -129,11 +129,11 @@ def test_get_usfm_range_multiple_rows_single_verse() -> None: def test_get_usfm_opt_break() -> None: rows = [ ( - [VerseRef.from_string("MAT 2:2", Versification.get_builtin(VersificationType.ENGLISH))], + [VerseRef.from_string("MAT 2:2", ENGLISH_VERSIFICATION)], str("Second verse of the second chapter."), ), ( - [VerseRef.from_string("MAT 2:3", Versification.get_builtin(VersificationType.ENGLISH))], + [VerseRef.from_string("MAT 2:3", ENGLISH_VERSIFICATION)], str("Third verse of the second chapter."), ), ] @@ -144,7 +144,7 @@ def test_get_usfm_opt_break() -> None: def test_get_usfm_milestone() -> None: rows = [ ( - [VerseRef.from_string("MAT 2:10", Versification.get_builtin(VersificationType.ENGLISH))], + [VerseRef.from_string("MAT 2:10", ENGLISH_VERSIFICATION)], str("Tenth verse of the second chapter."), ) ] @@ -155,7 +155,7 @@ def test_get_usfm_milestone() -> None: def test_get_usfm_unmatched() -> None: rows = [ ( - [VerseRef.from_string("MAT 1:3", Versification.get_builtin(VersificationType.ENGLISH))], + [VerseRef.from_string("MAT 1:3", ENGLISH_VERSIFICATION)], str("Third verse of the first chapter."), ) ]