Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Port USFM code from Machine up to commit bf2b46d #115

Merged
merged 12 commits into from
Aug 21, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,10 @@ coverage.xml
*.py,cover
.hypothesis/
.pytest_cache/
tests/testutils/data/usfm/source/*
tests/testutils/data/usfm/target/*
tests/testutils/data/project/*
tests/testutils/data/pretranslations.json

# Translations
*.mo
Expand Down
9 changes: 8 additions & 1 deletion .vscode/settings.json
Original file line number Diff line number Diff line change
Expand Up @@ -9,5 +9,12 @@
"editor.defaultFormatter": "ms-python.black-formatter",
"editor.formatOnSave": true
},
"black-formatter.path": ["poetry", "run", "black"]
"black-formatter.path": [
"poetry",
"run",
"black"
],
"python.analysis.extraPaths": [
"./tests"
]
}
12 changes: 12 additions & 0 deletions machine/corpora/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,17 @@
from .dbl_bundle_text_corpus import DblBundleTextCorpus
from .dictionary_alignment_corpus import DictionaryAlignmentCorpus
from .dictionary_text_corpus import DictionaryTextCorpus
from .file_paratext_project_settings_parser import FileParatextProjectSettingsParser
from .flatten import flatten
from .memory_alignment_collection import MemoryAlignmentCollection
from .memory_text import MemoryText
from .multi_key_ref import MultiKeyRef
from .parallel_text_corpus import ParallelTextCorpus
from .parallel_text_row import ParallelTextRow
from .paratext_backup_terms_corpus import ParatextBackupTermsCorpus
from .paratext_backup_text_corpus import ParatextBackupTextCorpus
from .paratext_project_settings import ParatextProjectSettings
from .paratext_project_settings_parser_base import ParatextProjectSettingsParserBase
from .paratext_text_corpus import ParatextTextCorpus
from .scripture_element import ScriptureElement
from .scripture_ref import EMPTY_SCRIPTURE_REF, ScriptureRef
Expand Down Expand Up @@ -57,6 +61,8 @@
from .usx_file_text import UsxFileText
from .usx_file_text_corpus import UsxFileTextCorpus
from .usx_zip_text import UsxZipText
from .zip_paratext_project_settings_parser import ZipParatextProjectSettingsParser
from .zip_paratext_project_settings_parser_base import ZipParatextProjectSettingsParserBase

__all__ = [
"AlignedWordPair",
Expand All @@ -72,6 +78,7 @@
"EMPTY_SCRIPTURE_REF",
"escape_spaces",
"extract_scripture_corpus",
"FileParatextProjectSettingsParser",
"flatten",
"is_scripture",
"lowercase",
Expand All @@ -85,7 +92,10 @@
"normalize",
"ParallelTextCorpus",
"ParallelTextRow",
"ParatextBackupTermsCorpus",
"ParatextBackupTextCorpus",
"ParatextProjectSettings",
"ParatextProjectSettingsParserBase",
"ParatextTextCorpus",
"parse_usfm",
"RtlReferenceOrder",
Expand Down Expand Up @@ -128,4 +138,6 @@
"UsxFileText",
"UsxFileTextCorpus",
"UsxZipText",
"ZipParatextProjectSettingsParser",
"ZipParatextProjectSettingsParserBase",
]
29 changes: 15 additions & 14 deletions machine/corpora/paratext_backup_text_corpus.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
from typing import List
from zipfile import ZipFile

import regex as re

from ..utils.typeshed import StrPath
from .scripture_text_corpus import ScriptureTextCorpus
from .usfm_zip_text import UsfmZipText
Expand All @@ -16,20 +14,23 @@ def __init__(self, filename: StrPath, include_markers: bool = False, include_all
settings = parser.parse()

versification = settings.versification
regex = re.compile(f"^{re.escape(settings.file_name_prefix)}.*{re.escape(settings.file_name_suffix)}$")

texts: List[UsfmZipText] = []
for sfm_entry in (zi for zi in archive.filelist if regex.match(zi.filename)):
texts.append(
UsfmZipText(
settings.stylesheet,
settings.encoding,
filename,
sfm_entry.filename,
versification,
include_markers,
include_all_text,
for sfm_entry in archive.filelist:
book_id = settings.get_book_id(sfm_entry.filename)
if book_id:
texts.append(
UsfmZipText(
settings.stylesheet,
settings.encoding,
book_id,
filename,
sfm_entry.filename,
versification,
include_markers,
include_all_text,
settings.name,
)
)
)

super().__init__(versification, texts)
40 changes: 39 additions & 1 deletion machine/corpora/paratext_project_settings.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from dataclasses import dataclass
from typing import Optional

from ..scripture.canon import book_id_to_number
from ..scripture.canon import book_id_to_number, book_number_to_id
from ..scripture.verse_ref import Versification
from .usfm_stylesheet import UsfmStylesheet

Expand All @@ -19,6 +20,29 @@ class ParatextProjectSettings:
biblical_terms_project_name: str
biblical_terms_file_name: str

def get_book_id(self, file_name: str) -> Optional[str]:
    """Extract the book ID encoded in a project book file name.

    The part of ``file_name`` between ``self.file_name_prefix`` and
    ``self.file_name_suffix`` is interpreted according to ``self.file_name_form``:

    * ``"MAT"``: the book part is the three-character book ID itself.
    * ``"40"`` / ``"41"``: the book part is a two-character book number code (or ``"100"``),
      which is converted to a book ID.
    * any other form: the book part is a two-character code (or ``"100"``) followed by the
      three-character book ID.

    Args:
        file_name: The bare file name (no directory) to inspect.

    Returns:
        The book ID, or None when the file name doesn't match the pattern of a book file name
        for the project.
    """
    prefix = self.file_name_prefix
    suffix = self.file_name_suffix
    if not file_name.startswith(prefix) or not file_name.endswith(suffix):
        return None

    # Use an explicit end index: slicing with -len(suffix) collapses to [start:0] when the suffix
    # is empty, which would discard the whole book part.
    book_part: str = file_name[len(prefix) : len(file_name) - len(suffix)]

    if self.file_name_form == "MAT":
        if len(book_part) != 3:
            return None
        return book_part

    if self.file_name_form in ("40", "41"):
        if book_part != "100" and len(book_part) != 2:
            return None
        try:
            book_num = _get_book_number(book_part)
        except ValueError:
            # The book part is not a parseable book number code, so this is not a book file.
            return None
        return book_number_to_id(book_num)

    # Remaining forms: number code followed by the book ID; "100" is the only 3-character code.
    expected_length = 6 if book_part.startswith("100") else 5
    if len(book_part) != expected_length:
        return None
    # The book ID is always the trailing three characters (after a 2- or 3-character code).
    return book_part[-3:]

def get_book_file_name(self, book_id: str) -> str:
if self.file_name_form == "MAT":
book_part = book_id
Expand All @@ -42,3 +66,17 @@ def _get_book_file_name_digits(book_id: str) -> str:
if book_num < 120:
return f"B{book_num - 110}"
return f"C{book_num - 120}"


def _get_book_number(book_file_name_digits: str) -> int:
if book_file_name_digits.startswith("A"):
return 100 + int(book_file_name_digits[1:])
if book_file_name_digits.startswith("B"):
return 110 + int(book_file_name_digits[1:])
if book_file_name_digits.startswith("C"):
return 120 + int(book_file_name_digits[1:])

book_num: int = int(book_file_name_digits)
if book_num >= 40:
return book_num - 1
return book_num
22 changes: 13 additions & 9 deletions machine/corpora/paratext_text_corpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,15 +16,19 @@ def __init__(self, project_dir: StrPath, include_markers: bool = False, include_

texts: List[UsfmFileText] = []
for sfm_filename in Path(project_dir).glob(f"{settings.file_name_prefix}*{settings.file_name_suffix}"):
texts.append(
UsfmFileText(
settings.stylesheet,
settings.encoding,
sfm_filename,
versification,
include_markers,
include_all_text,
book_id = settings.get_book_id(sfm_filename.name)
if book_id:
texts.append(
UsfmFileText(
settings.stylesheet,
settings.encoding,
book_id,
sfm_filename,
versification,
include_markers,
include_all_text,
settings.name,
)
)
)

super().__init__(versification, texts)
23 changes: 16 additions & 7 deletions machine/corpora/scripture_element.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
from __future__ import annotations

from functools import total_ordering
from typing import Optional

from ..utils.comparable import Comparable

Expand All @@ -20,17 +19,27 @@ def position(self) -> int:
def name(self) -> str:
return self._name

def compare_to(self, other: object, strict: Optional[bool] = True) -> int:
def to_relaxed(self) -> ScriptureElement:
    """Return a new element with this element's name and 0 as the first constructor argument.

    NOTE(review): the first constructor argument is presumably the position - confirm against
    the constructor.
    """
    relaxed = ScriptureElement(0, self.name)
    return relaxed

def compare_to(self, other: object) -> int:
if not isinstance(other, ScriptureElement):
raise (TypeError("other is not a ScriptureElement object."))
if self is other:
return 0

if strict:
res = self.position - other.position
if res != 0:
return res

if self.position == 0 or other.position == 0:
if self.name == other.name:
return 0
# position 0 is always greater than any other position
if self.position == 0 and other.position != 0:
return 1
if other.position == 0 and self.position != 0:
return -1
return (self.name > other.name) - (self.name < other.name)
res = self.position - other.position
if res != 0:
return res
return (self.name > other.name) - (self.name < other.name)

def __eq__(self, other: ScriptureElement) -> bool:
Expand Down
21 changes: 11 additions & 10 deletions machine/corpora/scripture_ref.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from typing import List, Optional

from ..scripture.constants import ENGLISH_VERSIFICATION
from ..scripture.verse_ref import VerseRef, Versification, are_overlapping_verse_ranges
from ..scripture.verse_ref import VerseRef, Versification
from ..utils.comparable import Comparable
from .scripture_element import ScriptureElement

Expand Down Expand Up @@ -81,17 +81,15 @@ def is_empty(self) -> bool:
def is_verse(self) -> bool:
    """Return True when this reference has a non-zero verse number and an empty path."""
    # Read the verse number from this reference's own verse_ref; looking it up on the VerseRef
    # class never reflects the instance's value.
    return self.verse_ref.verse_num != 0 and len(self.path) == 0

def to_relaxed(self) -> ScriptureRef:
    """Return a new reference with the same verse ref and each path element relaxed."""
    verse_ref = self.verse_ref
    relaxed_path = [element.to_relaxed() for element in self.path]
    return ScriptureRef(verse_ref, relaxed_path)

def change_versification(self, versification: Versification) -> ScriptureRef:
    """Return a new reference with the same path whose verse ref uses ``versification``.

    The verse ref is copied before being changed, so this reference's own verse ref object is
    not the one that ``change_versification`` is called on.
    """
    converted: VerseRef = self.verse_ref.copy()
    converted.change_versification(versification)
    result = ScriptureRef(converted, self.path)
    return result

def overlaps(self, other: ScriptureRef) -> bool:
if not are_overlapping_verse_ranges(self.verse_ref, other.verse_ref):
return False
return self.path == other.path

def compare_to(self, other: object, compare_segments: bool = True, strict: bool = True):
def compare_to(self, other: object, compare_segments: bool = True) -> int:
if not isinstance(other, ScriptureRef):
raise TypeError("other is not a ScriptureRef object.")
if self is other:
Expand All @@ -102,11 +100,14 @@ def compare_to(self, other: object, compare_segments: bool = True, strict: bool
return res

for se1, se2 in zip(self.path, other.path):
res = se1.compare_to(se2, strict=strict)
res = se1.compare_to(se2)
if res != 0:
return res

return len(self.path) - len(other.path)
if len(self.path) < len(other.path):
return -1
elif len(self.path) > len(other.path):
return 1
return 0

def __eq__(self, other: object) -> bool:
if not isinstance(other, ScriptureRef):
Expand Down
Loading
Loading