Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Port USFM code from Machine up to commit bf2b46d #115

Merged
merged 12 commits into from
Aug 21, 2024
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,10 @@ coverage.xml
*.py,cover
.hypothesis/
.pytest_cache/
tests/testutils/data/usfm/source/*
tests/testutils/data/usfm/target/*
tests/testutils/data/project/*
tests/testutils/data/pretranslations.json

# Translations
*.mo
Expand Down
9 changes: 8 additions & 1 deletion .vscode/settings.json
Original file line number Diff line number Diff line change
Expand Up @@ -9,5 +9,12 @@
"editor.defaultFormatter": "ms-python.black-formatter",
"editor.formatOnSave": true
},
"black-formatter.path": ["poetry", "run", "black"]
"black-formatter.path": [
"poetry",
"run",
"black"
],
"python.analysis.extraPaths": [
"./tests"
]
}
12 changes: 12 additions & 0 deletions machine/corpora/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,17 @@
from .dbl_bundle_text_corpus import DblBundleTextCorpus
from .dictionary_alignment_corpus import DictionaryAlignmentCorpus
from .dictionary_text_corpus import DictionaryTextCorpus
from .file_paratext_project_settings_parser import FileParatextProjectSettingsParser
from .flatten import flatten
from .memory_alignment_collection import MemoryAlignmentCollection
from .memory_text import MemoryText
from .multi_key_ref import MultiKeyRef
from .parallel_text_corpus import ParallelTextCorpus
from .parallel_text_row import ParallelTextRow
from .paratext_backup_terms_corpus import ParatextBackupTermsCorpus
from .paratext_backup_text_corpus import ParatextBackupTextCorpus
from .paratext_project_settings import ParatextProjectSettings
from .paratext_project_settings_parser_base import ParatextProjectSettingsParserBase
from .paratext_text_corpus import ParatextTextCorpus
from .scripture_element import ScriptureElement
from .scripture_ref import EMPTY_SCRIPTURE_REF, ScriptureRef
Expand Down Expand Up @@ -57,6 +61,8 @@
from .usx_file_text import UsxFileText
from .usx_file_text_corpus import UsxFileTextCorpus
from .usx_zip_text import UsxZipText
from .zip_paratext_project_settings_parser import ZipParatextProjectSettingsParser
from .zip_paratext_project_settings_parser_base import ZipParatextProjectSettingsParserBase

__all__ = [
"AlignedWordPair",
Expand All @@ -72,6 +78,7 @@
"EMPTY_SCRIPTURE_REF",
"escape_spaces",
"extract_scripture_corpus",
"FileParatextProjectSettingsParser",
"flatten",
"is_scripture",
"lowercase",
Expand All @@ -85,7 +92,10 @@
"normalize",
"ParallelTextCorpus",
"ParallelTextRow",
"ParatextBackupTermsCorpus",
"ParatextBackupTextCorpus",
"ParatextProjectSettings",
"ParatextProjectSettingsParserBase",
"ParatextTextCorpus",
"parse_usfm",
"RtlReferenceOrder",
Expand Down Expand Up @@ -128,4 +138,6 @@
"UsxFileText",
"UsxFileTextCorpus",
"UsxZipText",
"ZipParatextProjectSettingsParser",
"ZipParatextProjectSettingsParserBase",
]
29 changes: 15 additions & 14 deletions machine/corpora/paratext_backup_text_corpus.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
from typing import List
from zipfile import ZipFile

import regex as re

from ..utils.typeshed import StrPath
from .scripture_text_corpus import ScriptureTextCorpus
from .usfm_zip_text import UsfmZipText
Expand All @@ -16,20 +14,23 @@ def __init__(self, filename: StrPath, include_markers: bool = False, include_all
settings = parser.parse()

versification = settings.versification
regex = re.compile(f"^{re.escape(settings.file_name_prefix)}.*{re.escape(settings.file_name_suffix)}$")

texts: List[UsfmZipText] = []
for sfm_entry in (zi for zi in archive.filelist if regex.match(zi.filename)):
texts.append(
UsfmZipText(
settings.stylesheet,
settings.encoding,
filename,
sfm_entry.filename,
versification,
include_markers,
include_all_text,
for sfm_entry in archive.filelist:
book_id = settings.get_book_id(sfm_entry.filename)
if book_id:
texts.append(
UsfmZipText(
settings.stylesheet,
settings.encoding,
book_id,
filename,
sfm_entry.filename,
versification,
include_markers,
include_all_text,
settings.name,
)
)
)

super().__init__(versification, texts)
40 changes: 39 additions & 1 deletion machine/corpora/paratext_project_settings.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from dataclasses import dataclass
from typing import Optional

from ..scripture.canon import book_id_to_number
from ..scripture.canon import book_id_to_number, book_number_to_id
from ..scripture.verse_ref import Versification
from .usfm_stylesheet import UsfmStylesheet

Expand All @@ -19,6 +20,29 @@ class ParatextProjectSettings:
biblical_terms_project_name: str
biblical_terms_file_name: str

def get_book_id(self, file_name: str) -> Optional[str]:
    """Return the book ID encoded in a book file name of this project.

    The file name must start with ``file_name_prefix`` and end with
    ``file_name_suffix``; the text in between (the "book part") is
    interpreted according to ``file_name_form``:

    * ``"MAT"``: the book part is the 3-character book ID itself.
    * ``"40"`` / ``"41"``: the book part is a 2-character book code (or
      ``"100"``) that is converted to a book number and then to an ID.
    * any other form: the book part is a 2-character code (or ``"100"``)
      followed by the 3-character book ID.

    Returns None when the file name doesn't match the pattern of a book
    file name for the project.
    """
    prefix = self.file_name_prefix
    suffix = self.file_name_suffix
    if not file_name.startswith(prefix) or not file_name.endswith(suffix):
        return None

    # Use an explicit end index: "-len(suffix)" would be "-0" for an empty
    # suffix, which slices to an empty string instead of "up to the end".
    book_part = file_name[len(prefix) : len(file_name) - len(suffix)]

    if self.file_name_form == "MAT":
        return book_part if len(book_part) == 3 else None

    if self.file_name_form in ("40", "41"):
        if book_part != "100" and len(book_part) != 2:
            return None
        try:
            book_num = _get_book_number(book_part)
        except ValueError:
            # The book part has the right length but is not a valid book
            # code, so this is not a book file of the project.
            return None
        return book_number_to_id(book_num)

    # Remaining forms: the book code (3 characters when it is "100",
    # otherwise 2) is followed by the 3-character book ID.
    expected_length = 6 if book_part.startswith("100") else 5
    if len(book_part) != expected_length:
        return None
    return book_part[expected_length - 3 :]

def get_book_file_name(self, book_id: str) -> str:
if self.file_name_form == "MAT":
book_part = book_id
Expand All @@ -42,3 +66,17 @@ def _get_book_file_name_digits(book_id: str) -> str:
if book_num < 120:
return f"B{book_num - 110}"
return f"C{book_num - 120}"


def _get_book_number(book_file_name_digits: str) -> int:
if book_file_name_digits.startswith("A"):
return 100 + int(book_file_name_digits[1:])
if book_file_name_digits.startswith("B"):
return 110 + int(book_file_name_digits[1:])
if book_file_name_digits.startswith("C"):
return 120 + int(book_file_name_digits[1:])

book_num: int = int(book_file_name_digits)
if book_num >= 40:
return book_num - 1
return book_num
22 changes: 13 additions & 9 deletions machine/corpora/paratext_text_corpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,15 +16,19 @@ def __init__(self, project_dir: StrPath, include_markers: bool = False, include_

texts: List[UsfmFileText] = []
for sfm_filename in Path(project_dir).glob(f"{settings.file_name_prefix}*{settings.file_name_suffix}"):
texts.append(
UsfmFileText(
settings.stylesheet,
settings.encoding,
sfm_filename,
versification,
include_markers,
include_all_text,
book_id = settings.get_book_id(sfm_filename.name)
if book_id:
texts.append(
UsfmFileText(
settings.stylesheet,
settings.encoding,
book_id,
sfm_filename,
versification,
include_markers,
include_all_text,
settings.name,
)
)
)

super().__init__(versification, texts)
7 changes: 1 addition & 6 deletions machine/corpora/scripture_ref.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from typing import List, Optional

from ..scripture.constants import ENGLISH_VERSIFICATION
from ..scripture.verse_ref import VerseRef, Versification, are_overlapping_verse_ranges
from ..scripture.verse_ref import VerseRef, Versification
from ..utils.comparable import Comparable
from .scripture_element import ScriptureElement

Expand Down Expand Up @@ -86,11 +86,6 @@ def change_versification(self, versification: Versification) -> ScriptureRef:
vr.change_versification(versification)
return ScriptureRef(vr, self.path)

def overlaps(self, other: ScriptureRef) -> bool:
if not are_overlapping_verse_ranges(self.verse_ref, other.verse_ref):
return False
return self.path == other.path

def compare_to(self, other: object, compare_segments: bool = True, strict: bool = True):
if not isinstance(other, ScriptureRef):
raise TypeError("other is not a ScriptureRef object.")
Expand Down
7 changes: 4 additions & 3 deletions machine/corpora/scripture_ref_usfm_parser_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ def verse(
self, state: UsfmParserState, number: str, marker: str, alt_number: Optional[str], pub_number: Optional[str]
) -> None:
if state.verse_ref == self._cur_verse_ref:
self._end_verse_text_wrapper(state)
self._end_verse_text(state, self._create_verse_refs())
# ignore duplicate verses
self._duplicate_verse = True
elif are_overlapping_verse_ranges(number, self._cur_verse_ref.verse):
Expand All @@ -61,7 +61,7 @@ def start_para(
) -> None:
if self._cur_verse_ref.is_default:
self._update_verse_ref(state.verse_ref, marker)
if not state.is_verse_text:
if not state.is_verse_text or marker == "d":
self._start_parent_element(marker)
self._start_non_verse_text_wrapper(state)

Expand Down Expand Up @@ -121,8 +121,9 @@ def _start_verse_text_wrapper(self, state: UsfmParserState) -> None:
self._start_verse_text(state, self._create_verse_refs())

def _end_verse_text_wrapper(self, state: UsfmParserState) -> None:
if not self._duplicate_verse and self._cur_verse_ref.verse_num != 0:
if not self._duplicate_verse and self._cur_verse_ref.verse_num > 0:
self._end_verse_text(state, self._create_verse_refs())
if self._cur_verse_ref.verse_num > 0:
self._cur_text_type_stack.pop()

def _start_non_verse_text_wrapper(self, state: UsfmParserState) -> None:
Expand Down
19 changes: 3 additions & 16 deletions machine/corpora/usfm_file_text.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,29 +14,16 @@ def __init__(
self,
stylesheet: UsfmStylesheet,
encoding: str,
id: str,
filename: StrPath,
versification: Optional[Versification] = None,
include_markers: bool = False,
include_all_text: bool = False,
project: Optional[str] = None,
) -> None:
super().__init__(
_get_id(filename, encoding), stylesheet, encoding, versification, include_markers, include_all_text
)
super().__init__(id, stylesheet, encoding, versification, include_markers, include_all_text, project)

self._filename = Path(filename)

def _create_stream_container(self) -> StreamContainer:
return FileStreamContainer(self._filename)


def _get_id(filename: StrPath, encoding: str) -> str:
with open(filename, "r", encoding=encoding) as file:
for line in file:
line = line.strip()
if line.startswith("\\id "):
id = line[4:]
index = id.find(" ")
if index != -1:
id = id[:index]
return id.strip().upper()
raise RuntimeError(f"The USFM file '{filename}' does not contain an 'id' marker.")
23 changes: 20 additions & 3 deletions machine/corpora/usfm_file_text_corpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,24 @@ def __init__(
stylesheet = UsfmStylesheet(stylesheet_filename)
texts: List[UsfmFileText] = []
for sfm_filename in Path(project_dir).glob(file_pattern):
texts.append(
UsfmFileText(stylesheet, encoding, sfm_filename, versification, include_markers, include_all_text)
)
id = _get_id(sfm_filename, encoding)
if id:
texts.append(
UsfmFileText(
stylesheet, encoding, id, sfm_filename, versification, include_markers, include_all_text
)
)
super().__init__(versification, texts)


def _get_id(filename: StrPath, encoding: str) -> Optional[str]:
with open(filename, "r", encoding=encoding) as file:
for line in file:
line = line.strip()
if line.startswith("\\id "):
id = line[4:]
index = id.find(" ")
if index != -1:
id = id[:index]
return id.strip().upper()
return None
21 changes: 13 additions & 8 deletions machine/corpora/usfm_text_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from .scripture_text import ScriptureText
from .stream_container import StreamContainer
from .text_row import TextRow
from .usfm_parser import parse_usfm
from .usfm_parser import UsfmParser
from .usfm_parser_state import UsfmParserState
from .usfm_stylesheet import UsfmStylesheet
from .usfm_token import UsfmAttribute, UsfmToken, UsfmTokenType
Expand All @@ -26,27 +26,32 @@ def __init__(
versification: Optional[Versification],
include_markers: bool,
include_all_text: bool,
project: Optional[str] = None,
) -> None:
super().__init__(id, versification)

self._stylesheet = stylesheet
self._encoding = encoding
self._include_markers = include_markers
self._include_all_text = include_all_text
self.project = project

@abstractmethod
def _create_stream_container(self) -> StreamContainer: ...

def _get_rows(self) -> Generator[TextRow, None, None]:
usfm = self._read_usfm()
row_collector = _TextRowCollector(self)
parse_usfm(
usfm,
row_collector,
self._stylesheet,
self.versification,
preserve_whitespace=self._include_markers,
)
parser = UsfmParser(usfm, row_collector, self._stylesheet, self._versification, self._include_markers)
try:
parser.process_tokens()
except Exception as e:
error_message = (
f"An error occurred while parsing the text '{self.id}'"
f"{f' in project {self.project}' if self.project else ''}"
f". Verse: {parser.state.verse_ref}, offset: {parser.state.verse_offset}, error: '{e}'"
)
raise RuntimeError(error_message) from e
return gen(row_collector.rows)

def _read_usfm(self) -> str:
Expand Down
Loading
Loading