Skip to content

Commit

Permalink
port remaining changes from Machine PR #163, clean up code
Browse files Browse the repository at this point in the history
  • Loading branch information
mshannon-sil committed Mar 28, 2024
1 parent 46f5e4d commit 3e97c10
Show file tree
Hide file tree
Showing 8 changed files with 169 additions and 154 deletions.
7 changes: 1 addition & 6 deletions machine/corpora/file_paratext_project_settings_parser.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from pathlib import Path
from typing import Any, BinaryIO, Optional
from typing import BinaryIO, Optional

from ..utils.typeshed import StrPath
from .paratext_project_settings_parser_base import ParatextProjectSettingsParserBase
Expand All @@ -10,11 +10,6 @@ class FileParatextProjectSettingsParser(ParatextProjectSettingsParserBase):
def __init__(self, project_dir: StrPath) -> None:
    """Remember the project directory that this parser reads from.

    Args:
        project_dir: Location of the project directory; stored as a ``Path``.
    """
    self._project_dir = Path(project_dir)

def __enter__(self) -> "FileParatextProjectSettingsParser":
return self

def __exit__(self, type: Any, value: Any, traceback: Any) -> None: ...

def create_stylesheet(self, file_name: StrPath) -> UsfmStylesheet:
custom_stylesheet_filename = self._project_dir / file_name
return UsfmStylesheet(
Expand Down
92 changes: 47 additions & 45 deletions machine/corpora/paratext_backup_terms_corpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,11 @@
from .text_row import TextRow
from .zip_paratext_project_settings_parser import ZipParatextProjectSettingsParser

_PREDEFINED_TERMS_LIST_TYPES = ["Major", "All", "SilNt", "Pt6"]


class ParatextBackupTermsCorpus(DictionaryTextCorpus):
def __init__(self, filename: str, term_categories: List[str]) -> None:
self._predefined_terms_list_types = ["Major", "All", "SilNt", "Pt6"]
rows: List[TextRow] = []
with ZipFile(filename, "r") as archive:
terms_file_entry = get_entry(archive, "TermRenderings.xml")
Expand All @@ -25,18 +26,18 @@ def __init__(self, filename: str, term_categories: List[str]) -> None:
term_renderings_tree = ET.parse(key_terms_file)

biblical_terms_file_entry = get_entry(archive, settings.biblical_terms_file_name)
if settings.biblical_terms_list_type in self._predefined_terms_list_types:
if settings.biblical_terms_list_type in _PREDEFINED_TERMS_LIST_TYPES:
with open(settings.biblical_terms_file_name, "rb") as key_terms_file:
biblical_terms_tree = ET.parse(key_terms_file)
term_id_to_category_dict = self._get_category_per_id(biblical_terms_tree)
term_id_to_category_dict = _get_category_per_id(biblical_terms_tree)
elif (
settings.biblical_terms_list_type == "Project"
and settings.biblical_terms_project_name == settings.name
and biblical_terms_file_entry is not None
):
with archive.open(biblical_terms_file_entry) as key_terms_file:
biblical_terms_tree = ET.parse(key_terms_file)
term_id_to_category_dict = self._get_category_per_id(biblical_terms_tree)
term_id_to_category_dict = _get_category_per_id(biblical_terms_tree)
else:
term_id_to_category_dict = {}

Expand All @@ -53,51 +54,52 @@ def __init__(self, filename: str, term_categories: List[str]) -> None:
continue
term_id = term_id.replace("\n", "&#xA")
rendering = e.findtext("Renderings", "")
renderings = self._get_renderings(rendering)
renderings = _get_renderings(rendering)
rows.append(TextRow(text_id, term_id, segment=renderings))
text = MemoryText(text_id, rows)
self._add_text(text)

def _get_renderings(self, rendering: str) -> List[str]:
# If entire term rendering is surrounded in square brackets, remove them
match = re.match(r"^\[(.+?)\]$", rendering)
if match:
rendering = match.group(1)
rendering = rendering.replace("?", "")
rendering = rendering.replace("*", "")
rendering = rendering.replace("/", " ")
rendering = rendering.strip()
rendering = self._strip_parens(rendering)
rendering = self._strip_parens(rendering, left="[", right="]")
rx = re.compile(r"\s+\d+(\.\d+)*$")
for match in rx.findall(rendering):
rendering = rendering.replace(match, "")
glosses = re.split(r"\|\|", rendering)
glosses = list(set(g.strip() for g in glosses if g.strip() != ""))
return glosses

def _strip_parens(self, term_string: str, left: str = "(", right: str = ")") -> str:
parens = 0
end = -1
for i in range(len(term_string) - 1, -1, -1):
c = term_string[i]
if c == right:
def _get_renderings(rendering: str) -> List[str]:
    """Split a raw term-rendering string into its cleaned, distinct glosses.

    Clean-up steps, in order: a ``[``...``]`` pair that opens and closes the
    whole string is removed; ``?`` and ``*`` are deleted and ``/`` becomes a
    space; surrounding whitespace is trimmed; parenthesized and bracketed
    groups are dropped; a trailing numeric suffix (e.g. " 1" or " 2.3") is
    removed. What remains is split on ``||``.

    Args:
        rendering: Raw rendering text.

    Returns:
        The non-empty, whitespace-trimmed glosses without duplicates, in
        order of first appearance.
    """
    # If entire term rendering is surrounded in square brackets, remove them
    match = re.match(r"^\[(.+?)\]$", rendering)
    if match:
        rendering = match.group(1)
    rendering = rendering.replace("?", "")
    rendering = rendering.replace("*", "")
    rendering = rendering.replace("/", " ")
    rendering = rendering.strip()
    rendering = _strip_parens(rendering)
    rendering = _strip_parens(rendering, left="[", right="]")
    # Use re.sub rather than findall(): with a capture group in the pattern,
    # findall() yields only the group's text, not the whole numeric suffix.
    rendering = re.sub(r"\s+\d+(\.\d+)*$", "", rendering)
    trimmed = (gloss.strip() for gloss in rendering.split("||"))
    # dict.fromkeys de-duplicates while keeping a deterministic order.
    return list(dict.fromkeys(gloss for gloss in trimmed if gloss))


def _strip_parens(term_string: str, left: str = "(", right: str = ")") -> str:
    """Remove delimiter-enclosed groups, including the delimiters themselves.

    The string is scanned from its end towards its start, so deleting a group
    never shifts the positions that are still to be visited. Nested groups are
    removed together with their outermost group. A delimiter that has no
    partner is left in place.

    Args:
        term_string: Text to clean.
        left: Opening delimiter character.
        right: Closing delimiter character.

    Returns:
        The text with the enclosed groups removed.
    """
    parens = 0  # closing delimiters seen that are still waiting for an opener
    end = -1  # index just past the outermost pending closing delimiter
    for i in range(len(term_string) - 1, -1, -1):
        c = term_string[i]
        if c == right:
            if parens == 0:
                end = i + 1
            parens += 1
        elif c == left:
            if parens > 0:
                parens -= 1
                if parens == 0:
                    term_string = term_string[:i] + term_string[end:]
    return term_string

def _get_category_per_id(self, biblical_terms_tree: ET.ElementTree) -> Dict[str, Optional[str]]:
term_id_to_category_dict = {}
for e in biblical_terms_tree.iter(".//Term"):
category_element = e.find("Category")
category = (
category_element.text if category_element is not None and category_element.text is not None else ""
)
term_id_to_category_dict[e.attrib["Id"]] = category
return term_id_to_category_dict

def _get_category_per_id(biblical_terms_tree: ET.ElementTree) -> Dict[str, Optional[str]]:
    """Map each term's ``Id`` attribute to the text of its ``Category`` child.

    Args:
        biblical_terms_tree: Parsed biblical-terms XML document.

    Returns:
        A dictionary keyed by term id. A term with no ``Category`` element,
        or with an empty one, maps to ``""``.

    Raises:
        KeyError: If a ``Term`` element has no ``Id`` attribute.
    """
    # iter() filters on the literal tag name; an XPath such as ".//Term" never
    # equals any tag and would therefore yield no elements at all.
    return {
        term.attrib["Id"]: term.findtext("Category") or ""
        for term in biblical_terms_tree.iter("Term")
    }
107 changes: 38 additions & 69 deletions machine/corpora/paratext_project_settings.py
Original file line number Diff line number Diff line change
@@ -1,76 +1,45 @@
from abc import ABC
from dataclasses import dataclass

from ..scripture.canon import book_id_to_number
from ..scripture.verse_ref import Versification
from .usfm_stylesheet import UsfmStylesheet


@dataclass
class ParatextProjectSettings(ABC):
def __init__(
self,
name: str,
full_name: str,
encoding: str,
versification: Versification,
stylesheet: UsfmStylesheet,
file_name_prefix: str,
file_name_form: str,
file_name_suffix: str,
biblical_terms_list_type: str,
biblical_terms_project_name: str,
biblical_terms_file_name: str,
) -> None:
self._name = name
self._full_name = full_name
self._encoding = encoding
self._versification = versification
self._stylesheet = stylesheet
self._file_name_prefix = file_name_prefix
self._file_name_form = file_name_form
self._file_name_suffix = file_name_suffix
self._biblical_terms_list_type = biblical_terms_list_type
self._biblical_terms_project_name = biblical_terms_project_name
self._biblical_terms_file_name = biblical_terms_file_name

@property
def name(self) -> str:
return self._name

@property
def full_name(self) -> str:
return self._full_name

@property
def encoding(self) -> str:
return self._encoding

@property
def versification(self) -> Versification:
return self._versification

@property
def stylesheet(self) -> UsfmStylesheet:
return self._stylesheet

@property
def file_name_prefix(self) -> str:
return self._file_name_prefix

@property
def file_name_form(self) -> str:
return self._file_name_form

@property
def file_name_suffix(self) -> str:
return self._file_name_suffix

@property
def biblical_terms_list_type(self) -> str:
return self._biblical_terms_list_type

@property
def biblical_terms_project_name(self) -> str:
return self._biblical_terms_project_name

@property
def biblical_terms_file_name(self) -> str:
return self._biblical_terms_file_name
name: str
full_name: str
encoding: str
versification: Versification
stylesheet: UsfmStylesheet
file_name_prefix: str
file_name_form: str
file_name_suffix: str
biblical_terms_list_type: str
biblical_terms_project_name: str
biblical_terms_file_name: str

def get_book_file_name(self, book_id: str) -> str:
    """Build the file name used for a book under this project's naming scheme.

    The middle part of the name depends on ``file_name_form``: ``"MAT"`` uses
    the book id alone, ``"40"`` and ``"41"`` use the book digits alone, and
    any other form uses the digits followed by the book id. The result is the
    prefix, the middle part and the suffix joined together.

    Args:
        book_id: Book identifier, e.g. ``"MRK"``.

    Returns:
        The composed file name.
    """
    form = self.file_name_form
    if form == "MAT":
        middle = book_id
    else:
        middle = _get_book_file_name_digits(book_id)
        if form not in ("40", "41"):
            middle = middle + book_id
    return "".join((self.file_name_prefix, middle, self.file_name_suffix))


def _get_book_file_name_digits(book_id: str) -> str:
    """Return the numeric part of a book's file name.

    The book number comes from ``book_id_to_number``. Numbers below 10 are
    prefixed with ``0``; 10-39 are used unchanged; 40-99 are increased by one;
    100-109, 110-119 and 120 upwards become ``A``, ``B`` and ``C`` followed by
    the offset from 100, 110 and 120 respectively.

    Args:
        book_id: Book identifier, e.g. ``"MRK"``.

    Returns:
        The digit string, possibly letter-prefixed, for the book.
    """
    number = book_id_to_number(book_id)
    # Walk the thresholds from the top down; each branch handles one band.
    if number >= 120:
        return "C" + str(number - 120)
    if number >= 110:
        return "B" + str(number - 110)
    if number >= 100:
        return "A" + str(number - 100)
    if number >= 40:
        return str(number + 1)
    if number >= 10:
        return str(number)
    return "0" + str(number)
8 changes: 1 addition & 7 deletions machine/corpora/paratext_project_settings_parser_base.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import xml.etree.ElementTree as ET
from abc import ABC, abstractmethod
from typing import Any, BinaryIO
from typing import BinaryIO

from ..scripture.verse_ref import Versification
from ..utils.string_utils import parse_integer
Expand All @@ -11,12 +11,6 @@

class ParatextProjectSettingsParserBase(ABC):

@abstractmethod
def __enter__(self) -> "ParatextProjectSettingsParserBase": ...

@abstractmethod
def __exit__(self, type: Any, value: Any, traceback: Any) -> None: ...

@abstractmethod
def exists(self, file_name: str) -> bool: ...

Expand Down
11 changes: 3 additions & 8 deletions machine/corpora/zip_paratext_project_settings_parser.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from io import BytesIO
from typing import Any, BinaryIO, Union
from typing import BinaryIO, Optional
from zipfile import ZipFile

from .zip_paratext_project_settings_parser_base import ZipParatextProjectSettingsParserBase
Expand All @@ -9,21 +9,16 @@ class ZipParatextProjectSettingsParser(ZipParatextProjectSettingsParserBase):
def __init__(self, archive: ZipFile) -> None:
    """Store the zip archive that this parser reads from.

    Args:
        archive: The ``ZipFile`` whose entries are queried by ``exists``,
            ``find`` and ``open``.
    """
    self._archive = archive

def __enter__(self) -> "ZipParatextProjectSettingsParser":
return self

def __exit__(self, type: Any, value: Any, traceback: Any) -> None: ...

def exists(self, file_name: str) -> bool:
    """Report whether the archive has an entry named exactly ``file_name``.

    Args:
        file_name: Entry name to look for (case-sensitive).

    Returns:
        ``True`` if the entry is present, otherwise ``False``.
    """
    try:
        # getinfo() raises KeyError for a name that is not in the archive.
        self._archive.getinfo(file_name)
    except KeyError:
        return False
    return True

def find(self, extension: str) -> Optional[str]:
    """Return the first archive entry whose name ends with ``extension``.

    Args:
        extension: Suffix to match against each entry name.

    Returns:
        The first matching entry name in archive order, or ``None`` when no
        entry matches.
    """
    candidates = (name for name in self._archive.namelist() if name.endswith(extension))
    return next(candidates, None)

def open(self, file_name: str) -> Optional[BinaryIO]:
    """Return an in-memory binary stream holding an archive entry's content.

    Args:
        file_name: Exact name of the entry to read.

    Returns:
        A ``BytesIO`` over the entry's bytes, or ``None`` when the archive
        has no entry with that name.
    """
    if file_name not in self._archive.namelist():
        return None
    content = self._archive.read(file_name)
    return BytesIO(content)
5 changes: 3 additions & 2 deletions machine/corpora/zip_paratext_project_settings_parser_base.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from tempfile import TemporaryFile
from typing import Optional

from .paratext_project_settings_parser_base import ParatextProjectSettingsParserBase
from .usfm_stylesheet import UsfmStylesheet
Expand All @@ -7,12 +8,12 @@
class ZipParatextProjectSettingsParserBase(ParatextProjectSettingsParserBase):
def create_stylesheet(self, file_name: str) -> UsfmStylesheet:
with TemporaryFile() as stylesheet_temp_file, TemporaryFile() as custom_stylesheet_temp_file:
stylesheet_path = file_name
stylesheet_path: str = file_name
if self.exists(file_name):
with self.open(file_name) as source:
stylesheet_temp_file.write(source.read())
stylesheet_path = stylesheet_temp_file.name
custom_stylesheet_path = None
custom_stylesheet_path: Optional[str] = None
if self.exists("custom.sty"):
with self.open("custom.sty") as source:
custom_stylesheet_temp_file.write(source.read())
Expand Down
59 changes: 59 additions & 0 deletions tests/corpora/test_paratext_project_settings.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
from machine.corpora import UsfmStylesheet
from machine.corpora.paratext_project_settings import ParatextProjectSettings
from machine.scripture import ENGLISH_VERSIFICATION


def test_get_book_file_name_book_num() -> None:
    """With form "41" the file name for MRK is expected to be "PROJ42.SFM"."""
    file_name = _create_settings("41").get_book_file_name("MRK")
    assert file_name == "PROJ42.SFM"


def test_get_book_file_name_book_num_book_id() -> None:
    """With form "41MAT" the file name for MRK is expected to be "PROJ42MRK.SFM"."""
    file_name = _create_settings("41MAT").get_book_file_name("MRK")
    assert file_name == "PROJ42MRK.SFM"


def test_get_book_file_name_book_id() -> None:
    """With form "MAT" the file name for MRK is expected to be "PROJMRK.SFM"."""
    file_name = _create_settings("MAT").get_book_file_name("MRK")
    assert file_name == "PROJMRK.SFM"


def test_get_book_file_name_book_num_double_digit() -> None:
    """With form "41" the file name for GEN is expected to use the two-digit "01"."""
    file_name = _create_settings("41").get_book_file_name("GEN")
    assert file_name == "PROJ01.SFM"


def test_get_book_file_name_book_num_xxg() -> None:
    """With form "41" the file name for XXG is expected to be "PROJ100.SFM"."""
    file_name = _create_settings("41").get_book_file_name("XXG")
    assert file_name == "PROJ100.SFM"


def test_get_book_file_name_book_num_prefix_a() -> None:
    """With form "41" the file name for FRT is expected to use the "A0" code."""
    file_name = _create_settings("41").get_book_file_name("FRT")
    assert file_name == "PROJA0.SFM"


def test_get_book_file_name_book_num_prefix_b() -> None:
    """With form "41" the file name for TDX is expected to use the "B0" code."""
    file_name = _create_settings("41").get_book_file_name("TDX")
    assert file_name == "PROJB0.SFM"


def test_get_book_file_name_book_num_prefix_c() -> None:
    """With form "41" the file name for 3MQ is expected to use the "C0" code."""
    file_name = _create_settings("41").get_book_file_name("3MQ")
    assert file_name == "PROJC0.SFM"


def _create_settings(file_name_form: str) -> ParatextProjectSettings:
    """Build settings with prefix "PROJ" and suffix ".SFM" for the given file-name form.

    Args:
        file_name_form: The file-name form under test, e.g. ``"41"``,
            ``"41MAT"`` or ``"MAT"``.

    Returns:
        A ``ParatextProjectSettings`` whose other fields are fixed test values.
    """
    return ParatextProjectSettings(
        name="Name",
        full_name="Name",
        encoding="utf-8",
        versification=ENGLISH_VERSIFICATION,
        stylesheet=UsfmStylesheet("usfm.sty"),
        file_name_prefix="PROJ",
        file_name_form=file_name_form,
        file_name_suffix=".SFM",
        biblical_terms_list_type="Major",
        biblical_terms_project_name="",
        biblical_terms_file_name="BiblicalTerms.xml",
    )
Loading

0 comments on commit 3e97c10

Please sign in to comment.