Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update machine.py to reflect usfm changes in machine #103

Merged
merged 3 commits into from
Apr 4, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 32 additions & 0 deletions machine/corpora/file_paratext_project_settings_parser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
from pathlib import Path
from typing import Any, BinaryIO, Optional

from ..utils.typeshed import StrPath
from .paratext_project_settings_parser_base import ParatextProjectSettingsParserBase
from .usfm_stylesheet import UsfmStylesheet


class FileParatextProjectSettingsParser(ParatextProjectSettingsParserBase):
    """Settings-parser backend that resolves project files in a directory on disk.

    Every file name handed to the methods below is joined onto the project
    directory given at construction time.
    """

    def __init__(self, project_dir: StrPath) -> None:
        """Store the project directory.

        Args:
            project_dir: Directory against which project file names are resolved.
        """
        self._project_dir = Path(project_dir)

    def __enter__(self) -> "FileParatextProjectSettingsParser":
        """Return the parser itself so it can be used in a ``with`` statement."""
        return self

    def __exit__(self, type: Any, value: Any, traceback: Any) -> None:
        """Do nothing on exit; returns ``None`` so exceptions propagate."""

    def create_stylesheet(self, file_name: StrPath) -> UsfmStylesheet:
        """Build a stylesheet from ``file_name`` plus an optional project override.

        Args:
            file_name: The base stylesheet, passed to ``UsfmStylesheet`` unchanged.

        Returns:
            A ``UsfmStylesheet`` whose second argument is the project's
            ``custom.sty`` when that file exists, otherwise ``None``.
        """
        # The override lives under the fixed name "custom.sty" in the project
        # directory. Deriving it from ``file_name`` (as before) would hand the
        # base stylesheet to UsfmStylesheet a second time as its own override.
        custom_stylesheet_filename = self._project_dir / "custom.sty"
        return UsfmStylesheet(
            file_name,
            custom_stylesheet_filename if custom_stylesheet_filename.is_file() else None,
        )

    def exists(self, file_name: StrPath) -> bool:
        """Return ``True`` when ``file_name`` is a regular file in the project directory."""
        return (self._project_dir / file_name).is_file()

    def find(self, extension: str) -> Optional[Path]:
        """Return the first project-directory entry whose name ends with ``extension``.

        Only the top level of the directory is searched (non-recursive glob).
        Returns ``None`` when nothing matches.
        """
        return next(self._project_dir.glob(f"*{extension}"), None)

    def open(self, file_name: StrPath) -> BinaryIO:
        """Open ``file_name`` from the project directory for binary reading.

        The caller is responsible for closing the returned stream.
        """
        # Inside a method body the bare name ``open`` still resolves to the
        # builtin, not to this method, so this is not recursive.
        return open(self._project_dir / file_name, "rb")
103 changes: 103 additions & 0 deletions machine/corpora/paratext_backup_terms_corpus.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
import re
import xml.etree.ElementTree as ET
from typing import Dict, List, Optional
from zipfile import ZipFile

from .corpora_utils import get_entry
from .dictionary_text_corpus import DictionaryTextCorpus
from .memory_text import MemoryText
from .text_row import TextRow
from .zip_paratext_project_settings_parser import ZipParatextProjectSettingsParser


class ParatextBackupTermsCorpus(DictionaryTextCorpus):
    """Dictionary corpus of key-term renderings read from a Paratext backup zip.

    One ``TextRow`` is produced per ``TermRendering`` element found in the
    archive's ``TermRenderings.xml``; all rows are collected into a single
    ``MemoryText`` that is added to the corpus.
    """

    def __init__(self, filename: str, term_categories: List[str]) -> None:
        """Load the term renderings from the backup archive.

        Args:
            filename: Path of the project backup (zip) file.
            term_categories: Categories to keep. When non-empty, terms with no
                known category or with a category outside this list are
                skipped. When empty, every term is kept.
        """
        # Initialise the base corpus first so the instance is usable even when
        # we return early below, and so ``_add_text`` has its state.
        # NOTE(review): assumes DictionaryTextCorpus.__init__ accepts no
        # arguments -- confirm against the base class.
        super().__init__()
        self._predefined_terms_list_types = ["Major", "All", "SilNt", "Pt6"]
        rows: List[TextRow] = []
        with ZipFile(filename, "r") as archive:
            terms_file_entry = get_entry(archive, "TermRenderings.xml")
            if terms_file_entry is None:
                # No renderings in this backup: leave the corpus empty.
                return
            settings_parser = ZipParatextProjectSettingsParser(archive)
            settings = settings_parser.parse()

            with archive.open(terms_file_entry) as key_terms_file:
                term_renderings_tree = ET.parse(key_terms_file)

            biblical_terms_file_entry = get_entry(archive, settings.biblical_terms_file_name)
            term_id_to_category_dict: Dict[str, Optional[str]]
            if settings.biblical_terms_list_type in self._predefined_terms_list_types:
                # NOTE(review): this opens the terms file relative to the
                # current working directory, not from the archive or a
                # package resource directory -- confirm where the predefined
                # lists are expected to live.
                with open(settings.biblical_terms_file_name, "rb") as key_terms_file:
                    biblical_terms_tree = ET.parse(key_terms_file)
                    term_id_to_category_dict = self._get_category_per_id(biblical_terms_tree)
            elif (
                settings.biblical_terms_list_type == "Project"
                and settings.biblical_terms_project_name == settings.name
                and biblical_terms_file_entry is not None
            ):
                # Project-specific list stored inside this very backup.
                with archive.open(biblical_terms_file_entry) as key_terms_file:
                    biblical_terms_tree = ET.parse(key_terms_file)
                    term_id_to_category_dict = self._get_category_per_id(biblical_terms_tree)
            else:
                term_id_to_category_dict = {}

            # ``iter`` matches on the literal tag name; an XPath-style string
            # such as ".//TermRendering" would never match any element.
            terms_elements = term_renderings_tree.iter("TermRendering")
            text_id = (
                f"{settings.biblical_terms_list_type}:"
                f"{settings.biblical_terms_project_name}:"
                f"{settings.biblical_terms_file_name}"
            )
            for e in terms_elements:
                term_id = e.attrib["Id"]
                category = term_id_to_category_dict.get(term_id, "")
                if term_categories and (category == "" or category not in term_categories):
                    continue
                term_id = term_id.replace("\n", "&#xA")
                rendering = e.findtext("Renderings", "")
                renderings = self._get_renderings(rendering)
                rows.append(TextRow(text_id, term_id, segment=renderings))
            text = MemoryText(text_id, rows)
            self._add_text(text)

    def _get_renderings(self, rendering: str) -> List[str]:
        """Split a raw ``Renderings`` string into cleaned, de-duplicated glosses.

        Steps, in order: unwrap a string entirely enclosed in square brackets;
        drop ``?`` and ``*``; turn ``/`` into a space; remove parenthesised and
        bracketed groups; remove a trailing dotted number (e.g. `` 1.2``);
        split on ``||``; strip each piece and drop empties and duplicates.

        Returns:
            The distinct glosses in order of first appearance.
        """
        # If entire term rendering is surrounded in square brackets, remove them
        match = re.match(r"^\[(.+?)\]$", rendering)
        if match:
            rendering = match.group(1)
        rendering = rendering.replace("?", "")
        rendering = rendering.replace("*", "")
        rendering = rendering.replace("/", " ")
        rendering = rendering.strip()
        rendering = self._strip_parens(rendering)
        rendering = self._strip_parens(rendering, left="[", right="]")
        # Use ``sub`` rather than ``findall``: with a capture group in the
        # pattern, ``findall`` yields only the group text, not the whole match.
        rendering = re.sub(r"\s+\d+(\.\d+)*$", "", rendering)
        glosses = (g.strip() for g in rendering.split("||"))
        # dict.fromkeys de-duplicates while keeping a deterministic order.
        return list(dict.fromkeys(g for g in glosses if g != ""))

    def _strip_parens(self, term_string: str, left: str = "(", right: str = ")") -> str:
        """Remove every balanced ``left``...``right`` group from ``term_string``.

        The string is scanned right to left. An outermost group (including
        anything nested in it) is cut out as soon as its opening delimiter is
        reached. Unmatched opening delimiters are left in place.
        """
        parens = 0  # current nesting depth, counted from the right
        end = -1  # index just past the closing delimiter of the outermost group
        for i in range(len(term_string) - 1, -1, -1):
            c = term_string[i]
            if c == right:
                if parens == 0:
                    end = i + 1
                parens += 1
            elif c == left:
                if parens > 0:
                    parens -= 1
                    if parens == 0:
                        # Cutting at or right of i keeps the indices still to
                        # be visited (all < i) valid.
                        term_string = term_string[:i] + term_string[end:]
        return term_string

    def _get_category_per_id(self, biblical_terms_tree: ET.ElementTree) -> Dict[str, Optional[str]]:
        """Map each ``Term`` element's ``Id`` attribute to its ``Category`` text.

        Terms without a ``Category`` child, or with an empty one, map to ``""``.
        """
        term_id_to_category_dict: Dict[str, Optional[str]] = {}
        # ``iter`` takes a plain tag name, not an XPath expression.
        for e in biblical_terms_tree.iter("Term"):
            category_element = e.find("Category")
            category = (
                category_element.text if category_element is not None and category_element.text is not None else ""
            )
            term_id_to_category_dict[e.attrib["Id"]] = category
        return term_id_to_category_dict
117 changes: 15 additions & 102 deletions machine/corpora/paratext_backup_text_corpus.py
Original file line number Diff line number Diff line change
@@ -1,121 +1,34 @@
import xml.etree.ElementTree as etree
from io import TextIOWrapper
from tempfile import TemporaryFile
from typing import List, Optional
from zipfile import ZipFile, ZipInfo
from typing import List
from zipfile import ZipFile

import regex as re

from ..scripture.verse_ref import Versification
from ..utils.file_utils import detect_encoding_from_stream
from ..utils.string_utils import parse_integer
from ..utils.typeshed import StrPath
from .corpora_utils import find_entry, get_encoding, get_entry
from .scripture_text_corpus import ScriptureTextCorpus
from .usfm_stylesheet import UsfmStylesheet
from .usfm_zip_text import UsfmZipText
from .zip_paratext_project_settings_parser import ZipParatextProjectSettingsParser


class ParatextBackupTextCorpus(ScriptureTextCorpus):
    """Scripture corpus built from the USFM entries of a Paratext backup zip."""

    def __init__(self, filename: StrPath, include_markers: bool = False) -> None:
        """Collect one ``UsfmZipText`` per matching archive entry.

        Encoding, versification, stylesheet and the file-name prefix/suffix
        all come from the settings object returned by
        ``ZipParatextProjectSettingsParser.parse()``; nothing is re-read from
        the settings XML here.

        Args:
            filename: Path of the project backup (zip) file.
            include_markers: Forwarded unchanged to every ``UsfmZipText``.
        """
        with ZipFile(filename, "r") as archive:
            parser = ZipParatextProjectSettingsParser(archive)
            settings = parser.parse()

            versification = settings.versification
            # An entry is a book file when its name starts with the configured
            # prefix and ends with the configured suffix.
            regex = re.compile(f"^{re.escape(settings.file_name_prefix)}.*{re.escape(settings.file_name_suffix)}$")

            texts: List[UsfmZipText] = [
                UsfmZipText(
                    settings.stylesheet,
                    settings.encoding,
                    filename,
                    sfm_entry.filename,
                    versification,
                    include_markers,
                )
                for sfm_entry in archive.filelist
                if regex.match(sfm_entry.filename)
            ]

        super().__init__(versification, texts)


def _load_versification_from_entry(
    archive: ZipFile,
    entry: ZipInfo,
    filename: StrPath,
    base_versification: Versification,
    fallback_name: str,
    encoding: str,
) -> Versification:
    """Parse a versification from a zip entry, layered on a base versification.

    Args:
        archive: Open zip archive containing ``entry``.
        entry: The archive member to read.
        filename: Name passed to both ``Versification`` and ``Versification.parse``.
        base_versification: Versification the new one is constructed from.
        fallback_name: Name given to the new versification and passed to the parser.
        encoding: Text encoding used to decode the entry.

    Returns:
        Whatever ``Versification.parse`` returns for the decoded entry.
    """
    with archive.open(entry, "r") as raw:
        text_stream = TextIOWrapper(raw, encoding=encoding)
        derived = Versification(fallback_name, filename, base_versification)
        return Versification.parse(text_stream, filename, derived, fallback_name)
76 changes: 76 additions & 0 deletions machine/corpora/paratext_project_settings.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
from abc import ABC

from ..scripture.verse_ref import Versification
from .usfm_stylesheet import UsfmStylesheet


class ParatextProjectSettings(ABC):
    """Read-only bundle of the settings values supplied at construction.

    Each constructor argument is stored on a private attribute of the same
    name (prefixed with ``_``) and exposed through a property with no setter.
    """

    def __init__(
        self,
        name: str,
        full_name: str,
        encoding: str,
        versification: Versification,
        stylesheet: UsfmStylesheet,
        file_name_prefix: str,
        file_name_form: str,
        file_name_suffix: str,
        biblical_terms_list_type: str,
        biblical_terms_project_name: str,
        biblical_terms_file_name: str,
    ) -> None:
        """Store every argument as-is; no validation or conversion is done."""
        # Naming
        self._name, self._full_name = name, full_name
        # Text interpretation
        self._encoding = encoding
        self._versification = versification
        self._stylesheet = stylesheet
        # File-name pattern pieces
        self._file_name_prefix, self._file_name_form, self._file_name_suffix = (
            file_name_prefix,
            file_name_form,
            file_name_suffix,
        )
        # Biblical-terms list location
        (
            self._biblical_terms_list_type,
            self._biblical_terms_project_name,
            self._biblical_terms_file_name,
        ) = (
            biblical_terms_list_type,
            biblical_terms_project_name,
            biblical_terms_file_name,
        )

    @property
    def name(self) -> str:
        """The ``name`` value given to the constructor."""
        return self._name

    @property
    def full_name(self) -> str:
        """The ``full_name`` value given to the constructor."""
        return self._full_name

    @property
    def encoding(self) -> str:
        """The ``encoding`` value given to the constructor."""
        return self._encoding

    @property
    def versification(self) -> Versification:
        """The ``versification`` object given to the constructor."""
        return self._versification

    @property
    def stylesheet(self) -> UsfmStylesheet:
        """The ``stylesheet`` object given to the constructor."""
        return self._stylesheet

    @property
    def file_name_prefix(self) -> str:
        """The ``file_name_prefix`` value given to the constructor."""
        return self._file_name_prefix

    @property
    def file_name_form(self) -> str:
        """The ``file_name_form`` value given to the constructor."""
        return self._file_name_form

    @property
    def file_name_suffix(self) -> str:
        """The ``file_name_suffix`` value given to the constructor."""
        return self._file_name_suffix

    @property
    def biblical_terms_list_type(self) -> str:
        """The ``biblical_terms_list_type`` value given to the constructor."""
        return self._biblical_terms_list_type

    @property
    def biblical_terms_project_name(self) -> str:
        """The ``biblical_terms_project_name`` value given to the constructor."""
        return self._biblical_terms_project_name

    @property
    def biblical_terms_file_name(self) -> str:
        """The ``biblical_terms_file_name`` value given to the constructor."""
        return self._biblical_terms_file_name
Loading
Loading