Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Port USFM code from Machine up to commit bf2b46d #115

Merged
merged 12 commits into from
Aug 21, 2024
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,8 @@ coverage.xml
*.py,cover
.hypothesis/
.pytest_cache/
tests/testutils/data/usfm/source/*
tests/testutils/data/usfm/target/*

# Translations
*.mo
Expand Down
9 changes: 8 additions & 1 deletion .vscode/settings.json
Original file line number Diff line number Diff line change
Expand Up @@ -9,5 +9,12 @@
"editor.defaultFormatter": "ms-python.black-formatter",
"editor.formatOnSave": true
},
"black-formatter.path": ["poetry", "run", "black"]
"black-formatter.path": [
"poetry",
"run",
"black"
],
"python.analysis.extraPaths": [
"./tests"
]
}
28 changes: 14 additions & 14 deletions machine/corpora/paratext_backup_text_corpus.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
from typing import List
from zipfile import ZipFile

import regex as re

from ..utils.typeshed import StrPath
from .scripture_text_corpus import ScriptureTextCorpus
from .usfm_zip_text import UsfmZipText
Expand All @@ -16,20 +14,22 @@ def __init__(self, filename: StrPath, include_markers: bool = False, include_all
settings = parser.parse()

versification = settings.versification
regex = re.compile(f"^{re.escape(settings.file_name_prefix)}.*{re.escape(settings.file_name_suffix)}$")

texts: List[UsfmZipText] = []
for sfm_entry in (zi for zi in archive.filelist if regex.match(zi.filename)):
texts.append(
UsfmZipText(
settings.stylesheet,
settings.encoding,
filename,
sfm_entry.filename,
versification,
include_markers,
include_all_text,
for sfm_entry in archive.filelist:
book_id = settings.get_book_id(sfm_entry.filename)
if book_id:
texts.append(
UsfmZipText(
settings.stylesheet,
settings.encoding,
book_id,
filename,
sfm_entry.filename,
versification,
include_markers,
include_all_text,
)
)
)

super().__init__(versification, texts)
39 changes: 38 additions & 1 deletion machine/corpora/paratext_project_settings.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from dataclasses import dataclass
from typing import Optional

from ..scripture.canon import book_id_to_number
from ..scripture.canon import book_id_to_number, book_number_to_id
from ..scripture.verse_ref import Versification
from .usfm_stylesheet import UsfmStylesheet

Expand All @@ -19,6 +20,28 @@ class ParatextProjectSettings:
biblical_terms_project_name: str
biblical_terms_file_name: str

def get_book_id(self, file_name: str) -> Optional[str]:
    """Extract the book id encoded in a project file name.

    The name must start with ``file_name_prefix`` and end with
    ``file_name_suffix``; the text between them is interpreted according to
    ``file_name_form``:

    * ``"MAT"``: a three-character book id.
    * ``"40"`` / ``"41"``: two book-number characters (or the literal
      ``"100"``), converted with ``_get_book_number`` and
      ``book_number_to_id``.
    * any other form: the book-number characters followed by a
      three-character book id (five characters in total, or six when the
      number part is ``"100"``).

    Args:
        file_name: The bare file name (no directory part) to inspect.

    Returns:
        The book id, or ``None`` when the name does not fit the pattern.
    """
    prefix = self.file_name_prefix
    suffix = self.file_name_suffix
    if not file_name.startswith(prefix) or not file_name.endswith(suffix):
        return None

    # Compute the end index explicitly: a slice end of ``-len(suffix)`` becomes
    # 0 for an empty suffix, which would throw away the whole name.
    book_part = file_name[len(prefix) : len(file_name) - len(suffix)]

    if self.file_name_form == "MAT":
        return book_part if len(book_part) == 3 else None

    if self.file_name_form in ("40", "41"):
        if book_part != "100" and len(book_part) != 2:
            return None
        try:
            book_num = _get_book_number(book_part)
        except ValueError:
            # The part is not a valid number encoding, so this is not a book file.
            return None
        return book_number_to_id(book_num)

    # Remaining forms: number characters followed by the 3-character book id.
    expected_len = 6 if book_part.startswith("100") else 5
    if len(book_part) != expected_len:
        return None
    return book_part[expected_len - 3 :]

def get_book_file_name(self, book_id: str) -> str:
if self.file_name_form == "MAT":
book_part = book_id
Expand All @@ -42,3 +65,17 @@ def _get_book_file_name_digits(book_id: str) -> str:
if book_num < 120:
return f"B{book_num - 110}"
return f"C{book_num - 120}"


def _get_book_number(book_file_name_digits: str) -> int:
    """Convert the number part of a book file name into a book number.

    A leading ``A``, ``B`` or ``C`` adds 100, 110 or 120 respectively to the
    integer that follows it. Otherwise the whole string is parsed as an
    integer, and values of 40 or more are reduced by one.

    Raises:
        ValueError: If the numeric portion cannot be parsed by ``int``.
    """
    letter_offsets = {"A": 100, "B": 110, "C": 120}
    for letter, offset in letter_offsets.items():
        if book_file_name_digits.startswith(letter):
            return offset + int(book_file_name_digits[1:])

    number = int(book_file_name_digits)
    return number - 1 if number >= 40 else number
21 changes: 12 additions & 9 deletions machine/corpora/paratext_text_corpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,15 +16,18 @@ def __init__(self, project_dir: StrPath, include_markers: bool = False, include_

texts: List[UsfmFileText] = []
for sfm_filename in Path(project_dir).glob(f"{settings.file_name_prefix}*{settings.file_name_suffix}"):
texts.append(
UsfmFileText(
settings.stylesheet,
settings.encoding,
sfm_filename,
versification,
include_markers,
include_all_text,
book_id = settings.get_book_id(sfm_filename.name)
if book_id:
texts.append(
UsfmFileText(
settings.stylesheet,
settings.encoding,
book_id,
sfm_filename,
versification,
include_markers,
include_all_text,
)
)
)

super().__init__(versification, texts)
7 changes: 1 addition & 6 deletions machine/corpora/scripture_ref.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from typing import List, Optional

from ..scripture.constants import ENGLISH_VERSIFICATION
from ..scripture.verse_ref import VerseRef, Versification, are_overlapping_verse_ranges
from ..scripture.verse_ref import VerseRef, Versification
from ..utils.comparable import Comparable
from .scripture_element import ScriptureElement

Expand Down Expand Up @@ -86,11 +86,6 @@ def change_versification(self, versification: Versification) -> ScriptureRef:
vr.change_versification(versification)
return ScriptureRef(vr, self.path)

def overlaps(self, other: ScriptureRef) -> bool:
    """Report whether this reference and ``other`` overlap.

    The two overlap when ``are_overlapping_verse_ranges`` accepts their verse
    references and their paths compare equal.
    """
    if are_overlapping_verse_ranges(self.verse_ref, other.verse_ref):
        return self.path == other.path
    return False

def compare_to(self, other: object, compare_segments: bool = True, strict: bool = True):
if not isinstance(other, ScriptureRef):
raise TypeError("other is not a ScriptureRef object.")
Expand Down
4 changes: 2 additions & 2 deletions machine/corpora/scripture_ref_usfm_parser_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ def start_para(
) -> None:
if self._cur_verse_ref.is_default:
self._update_verse_ref(state.verse_ref, marker)
if not state.is_verse_text:
if not state.is_verse_text or self._current_text_type == ScriptureTextType.NONVERSE:
self._start_parent_element(marker)
self._start_non_verse_text_wrapper(state)

Expand Down Expand Up @@ -121,7 +121,7 @@ def _start_verse_text_wrapper(self, state: UsfmParserState) -> None:
self._start_verse_text(state, self._create_verse_refs())

def _end_verse_text_wrapper(self, state: UsfmParserState) -> None:
if not self._duplicate_verse and self._cur_verse_ref.verse_num != 0:
if not self._duplicate_verse and self._cur_verse_ref.verse_num > 0:
self._end_verse_text(state, self._create_verse_refs())
self._cur_text_type_stack.pop()

Expand Down
18 changes: 2 additions & 16 deletions machine/corpora/usfm_file_text.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,29 +14,15 @@ def __init__(
self,
stylesheet: UsfmStylesheet,
encoding: str,
id: str,
filename: StrPath,
versification: Optional[Versification] = None,
include_markers: bool = False,
include_all_text: bool = False,
) -> None:
super().__init__(
_get_id(filename, encoding), stylesheet, encoding, versification, include_markers, include_all_text
)
super().__init__(id, stylesheet, encoding, versification, include_markers, include_all_text)

self._filename = Path(filename)

def _create_stream_container(self) -> StreamContainer:
    """Build a ``FileStreamContainer`` over this text's file."""
    container = FileStreamContainer(self._filename)
    return container


def _get_id(filename: StrPath, encoding: str) -> str:
    """Read the book id from the first ``id`` marker line of a USFM file.

    Args:
        filename: Path of the USFM file to scan.
        encoding: Text encoding used to open the file.

    Returns:
        The first whitespace-delimited token after the marker, upper-cased.

    Raises:
        RuntimeError: If no line of the file starts with the ``id`` marker.
    """
    with open(filename, "r", encoding=encoding) as file:
        for raw_line in file:
            line = raw_line.strip()
            if line.startswith("\\id "):
                # split() skips runs of whitespace, so extra spaces or a tab
                # after the marker no longer produce an empty or polluted id.
                # The stripped line always has a non-blank character after
                # the marker, so the token list is never empty.
                return line[4:].split(None, 1)[0].upper()
    raise RuntimeError(f"The USFM file '{filename}' does not contain an 'id' marker.")
23 changes: 20 additions & 3 deletions machine/corpora/usfm_file_text_corpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,24 @@ def __init__(
stylesheet = UsfmStylesheet(stylesheet_filename)
texts: List[UsfmFileText] = []
for sfm_filename in Path(project_dir).glob(file_pattern):
texts.append(
UsfmFileText(stylesheet, encoding, sfm_filename, versification, include_markers, include_all_text)
)
id = _get_id(sfm_filename, encoding)
if id:
texts.append(
UsfmFileText(
stylesheet, encoding, id, sfm_filename, versification, include_markers, include_all_text
)
)
super().__init__(versification, texts)


def _get_id(filename: StrPath, encoding: str) -> Optional[str]:
    """Read the book id from the first ``id`` marker line of a USFM file.

    Args:
        filename: Path of the USFM file to scan.
        encoding: Text encoding used to open the file.

    Returns:
        The first whitespace-delimited token after the marker, upper-cased,
        or ``None`` when no line of the file starts with the ``id`` marker.
    """
    with open(filename, "r", encoding=encoding) as file:
        for raw_line in file:
            line = raw_line.strip()
            if line.startswith("\\id "):
                # split() skips runs of whitespace, so extra spaces or a tab
                # after the marker no longer produce an empty or polluted id.
                # The stripped line always has a non-blank character after
                # the marker, so the token list is never empty.
                return line[4:].split(None, 1)[0].upper()
    return None
Loading
Loading