diff --git a/.gitignore b/.gitignore index a44bbfe..10b04fd 100644 --- a/.gitignore +++ b/.gitignore @@ -50,6 +50,10 @@ coverage.xml *.py,cover .hypothesis/ .pytest_cache/ +tests/testutils/data/usfm/source/* +tests/testutils/data/usfm/target/* +tests/testutils/data/project/* +tests/testutils/data/pretranslations.json # Translations *.mo diff --git a/.vscode/settings.json b/.vscode/settings.json index 6e4c2a3..5bbc1b5 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -9,5 +9,12 @@ "editor.defaultFormatter": "ms-python.black-formatter", "editor.formatOnSave": true }, - "black-formatter.path": ["poetry", "run", "black"] + "black-formatter.path": [ + "poetry", + "run", + "black" + ], + "python.analysis.extraPaths": [ + "./tests" + ] } diff --git a/machine/corpora/__init__.py b/machine/corpora/__init__.py index 553ed97..f653d17 100644 --- a/machine/corpora/__init__.py +++ b/machine/corpora/__init__.py @@ -7,13 +7,17 @@ from .dbl_bundle_text_corpus import DblBundleTextCorpus from .dictionary_alignment_corpus import DictionaryAlignmentCorpus from .dictionary_text_corpus import DictionaryTextCorpus +from .file_paratext_project_settings_parser import FileParatextProjectSettingsParser from .flatten import flatten from .memory_alignment_collection import MemoryAlignmentCollection from .memory_text import MemoryText from .multi_key_ref import MultiKeyRef from .parallel_text_corpus import ParallelTextCorpus from .parallel_text_row import ParallelTextRow +from .paratext_backup_terms_corpus import ParatextBackupTermsCorpus from .paratext_backup_text_corpus import ParatextBackupTextCorpus +from .paratext_project_settings import ParatextProjectSettings +from .paratext_project_settings_parser_base import ParatextProjectSettingsParserBase from .paratext_text_corpus import ParatextTextCorpus from .scripture_element import ScriptureElement from .scripture_ref import EMPTY_SCRIPTURE_REF, ScriptureRef @@ -57,6 +61,8 @@ from .usx_file_text import UsxFileText from .usx_file_text_corpus import UsxFileTextCorpus from .usx_zip_text import UsxZipText +from .zip_paratext_project_settings_parser import ZipParatextProjectSettingsParser +from .zip_paratext_project_settings_parser_base import ZipParatextProjectSettingsParserBase __all__ = [ "AlignedWordPair", @@ -72,6 +78,7 @@ "EMPTY_SCRIPTURE_REF", "escape_spaces", "extract_scripture_corpus", + "FileParatextProjectSettingsParser", "flatten", "is_scripture", "lowercase", @@ -85,7 +92,10 @@ "normalize", "ParallelTextCorpus", "ParallelTextRow", + "ParatextBackupTermsCorpus", "ParatextBackupTextCorpus", + "ParatextProjectSettings", + "ParatextProjectSettingsParserBase", "ParatextTextCorpus", "parse_usfm", "RtlReferenceOrder", @@ -128,4 +138,6 @@ "UsxFileText", "UsxFileTextCorpus", "UsxZipText", + "ZipParatextProjectSettingsParser", + "ZipParatextProjectSettingsParserBase", ] diff --git a/machine/corpora/paratext_backup_text_corpus.py b/machine/corpora/paratext_backup_text_corpus.py index 347ba08..77d7065 100644 --- a/machine/corpora/paratext_backup_text_corpus.py +++ b/machine/corpora/paratext_backup_text_corpus.py @@ -1,8 +1,6 @@ from typing import List from zipfile import ZipFile -import regex as re - from ..utils.typeshed import StrPath from .scripture_text_corpus import ScriptureTextCorpus from .usfm_zip_text import UsfmZipText @@ -16,20 +14,23 @@ def __init__(self, filename: StrPath, include_markers: bool = False, include_all settings = parser.parse() versification = settings.versification - regex = re.compile(f"^{re.escape(settings.file_name_prefix)}.*{re.escape(settings.file_name_suffix)}$") texts: List[UsfmZipText] = [] - for sfm_entry in (zi for zi in archive.filelist if regex.match(zi.filename)): - texts.append( - UsfmZipText( - settings.stylesheet, - settings.encoding, - filename, - sfm_entry.filename, - versification, - include_markers, - include_all_text, + for sfm_entry in archive.filelist: + book_id = settings.get_book_id(sfm_entry.filename) + if book_id: + texts.append( + UsfmZipText( + settings.stylesheet, + settings.encoding, + book_id, + filename, + sfm_entry.filename, + versification, + include_markers, + include_all_text, + settings.name, + ) ) - ) super().__init__(versification, texts) diff --git a/machine/corpora/paratext_project_settings.py b/machine/corpora/paratext_project_settings.py index 41796c6..548ae6c 100644 --- a/machine/corpora/paratext_project_settings.py +++ b/machine/corpora/paratext_project_settings.py @@ -1,6 +1,7 @@ from dataclasses import dataclass +from typing import Optional -from ..scripture.canon import book_id_to_number +from ..scripture.canon import book_id_to_number, book_number_to_id from ..scripture.verse_ref import Versification from .usfm_stylesheet import UsfmStylesheet @@ -19,6 +20,29 @@ class ParatextProjectSettings: biblical_terms_project_name: str biblical_terms_file_name: str + def get_book_id(self, file_name: str) -> Optional[str]: + """Returns None when the file name doesn't match the pattern of a book file name for the project.""" + if not file_name.startswith(self.file_name_prefix) or not file_name.endswith(self.file_name_suffix): + return None + + book_part: str = file_name[len(self.file_name_prefix) : -len(self.file_name_suffix)] + if self.file_name_form == "MAT": + if len(book_part) != 3: + return None + book_id = book_part + elif self.file_name_form in ("40", "41"): + if book_part != "100" and len(book_part) != 2: + return None + book_id = book_number_to_id(_get_book_number(book_part)) + else: + if book_part.startswith("100"): + if len(book_part) != 6: + return None + elif len(book_part) != 5: + return None + book_id = book_part[2:] if len(book_part) == 5 else book_part[3:] + return book_id + def get_book_file_name(self, book_id: str) -> str: if self.file_name_form == "MAT": book_part = book_id @@ -42,3 +66,17 @@ def _get_book_file_name_digits(book_id: str) -> str: if book_num < 120: return f"B{book_num - 110}" return f"C{book_num - 120}" + + +def _get_book_number(book_file_name_digits: str) -> int: + if book_file_name_digits.startswith("A"): + return 100 + int(book_file_name_digits[1:]) + if book_file_name_digits.startswith("B"): + return 110 + int(book_file_name_digits[1:]) + if book_file_name_digits.startswith("C"): + return 120 + int(book_file_name_digits[1:]) + + book_num: int = int(book_file_name_digits) + if book_num >= 40: + return book_num - 1 + return book_num diff --git a/machine/corpora/paratext_text_corpus.py b/machine/corpora/paratext_text_corpus.py index 53c883c..24c24dd 100644 --- a/machine/corpora/paratext_text_corpus.py +++ b/machine/corpora/paratext_text_corpus.py @@ -16,15 +16,19 @@ def __init__(self, project_dir: StrPath, include_markers: bool = False, include_ texts: List[UsfmFileText] = [] for sfm_filename in Path(project_dir).glob(f"{settings.file_name_prefix}*{settings.file_name_suffix}"): - texts.append( - UsfmFileText( - settings.stylesheet, - settings.encoding, - sfm_filename, - versification, - include_markers, - include_all_text, + book_id = settings.get_book_id(sfm_filename.name) + if book_id: + texts.append( + UsfmFileText( + settings.stylesheet, + settings.encoding, + book_id, + sfm_filename, + versification, + include_markers, + include_all_text, + settings.name, + ) ) - ) super().__init__(versification, texts) diff --git a/machine/corpora/scripture_element.py b/machine/corpora/scripture_element.py index 503630b..db98bff 100644 --- a/machine/corpora/scripture_element.py +++ b/machine/corpora/scripture_element.py @@ -1,7 +1,6 @@ from __future__ import annotations from functools import total_ordering -from typing import Optional from ..utils.comparable import Comparable @@ -20,17 +19,27 @@ def position(self) -> int: def name(self) -> str: return self._name - def compare_to(self, other: object, strict: Optional[bool] = True) -> int: + def to_relaxed(self) -> ScriptureElement: + return ScriptureElement(0, self.name) + + def compare_to(self, other: object) -> int: if not isinstance(other, ScriptureElement): raise (TypeError("other is not a ScriptureElement object.")) if self is other: return 0 - if strict: - res = self.position - other.position - if res != 0: - return res - + if self.position == 0 or other.position == 0: + if self.name == other.name: + return 0 + # position 0 is always greater than any other position + if self.position == 0 and other.position != 0: + return 1 + if other.position == 0 and self.position != 0: + return -1 + return (self.name > other.name) - (self.name < other.name) + res = self.position - other.position + if res != 0: + return res return (self.name > other.name) - (self.name < other.name) def __eq__(self, other: ScriptureElement) -> bool: diff --git a/machine/corpora/scripture_ref.py b/machine/corpora/scripture_ref.py index b5fd75c..4bbefc2 100644 --- a/machine/corpora/scripture_ref.py +++ b/machine/corpora/scripture_ref.py @@ -4,7 +4,7 @@ from typing import List, Optional from ..scripture.constants import ENGLISH_VERSIFICATION -from ..scripture.verse_ref import VerseRef, Versification, are_overlapping_verse_ranges +from ..scripture.verse_ref import VerseRef, Versification from ..utils.comparable import Comparable from .scripture_element import ScriptureElement @@ -81,17 +81,15 @@ def is_empty(self) -> bool: def is_verse(self) -> bool: return VerseRef.verse_num != 0 and len(self.path) == 0 + def to_relaxed(self) -> ScriptureRef: + return ScriptureRef(self.verse_ref, [pe.to_relaxed() for pe in self.path]) + def change_versification(self, versification: Versification) -> ScriptureRef: vr: VerseRef = self.verse_ref.copy() vr.change_versification(versification) return ScriptureRef(vr, self.path) - def overlaps(self, other: ScriptureRef) -> bool: - if not are_overlapping_verse_ranges(self.verse_ref, other.verse_ref): - return False - return self.path == other.path - - def compare_to(self, other: object, compare_segments: bool = True, strict: bool = True): + def compare_to(self, other: object, compare_segments: bool = True) -> int: if not isinstance(other, ScriptureRef): raise TypeError("other is not a ScriptureRef object.") if self is other: @@ -102,11 +100,14 @@ def compare_to(self, other: object, compare_segments: bool = True, strict: bool return res for se1, se2 in zip(self.path, other.path): - res = se1.compare_to(se2, strict=strict) + res = se1.compare_to(se2) if res != 0: return res - - return len(self.path) - len(other.path) + if len(self.path) < len(other.path): + return -1 + elif len(self.path) > len(other.path): + return 1 + return 0 def __eq__(self, other: object) -> bool: if not isinstance(other, ScriptureRef): diff --git a/machine/corpora/scripture_ref_usfm_parser_handler.py b/machine/corpora/scripture_ref_usfm_parser_handler.py index 65c26cf..a5e0766 100644 --- a/machine/corpora/scripture_ref_usfm_parser_handler.py +++ b/machine/corpora/scripture_ref_usfm_parser_handler.py @@ -12,6 +12,7 @@ class ScriptureTextType(Enum): + NONE = auto() NONVERSE = auto() VERSE = auto() NOTE = auto() @@ -26,7 +27,7 @@ def __init__(self) -> None: @property def _current_text_type(self) -> ScriptureTextType: - return ScriptureTextType.NONVERSE if len(self._cur_text_type_stack) == 0 else self._cur_text_type_stack[-1] + return ScriptureTextType.NONE if len(self._cur_text_type_stack) == 0 else self._cur_text_type_stack[-1] def end_usfm(self, state: UsfmParserState) -> None: self._end_verse_text_wrapper(state) @@ -39,7 +40,7 @@ def verse( self, state: UsfmParserState, number: str, marker: str, alt_number: Optional[str], pub_number: Optional[str] ) -> None: if state.verse_ref == self._cur_verse_ref: - self._end_verse_text_wrapper(state) + self._end_verse_text(state, self._create_verse_refs()) # ignore duplicate verses self._duplicate_verse = True elif are_overlapping_verse_ranges(number, self._cur_verse_ref.verse): @@ -61,7 +62,7 @@ def start_para( ) -> None: if self._cur_verse_ref.is_default: self._update_verse_ref(state.verse_ref, marker) - if not state.is_verse_text: + if not state.is_verse_text or marker == "d": self._start_parent_element(marker) self._start_non_verse_text_wrapper(state) @@ -69,17 +70,23 @@ def end_para(self, state: UsfmParserState, marker: str) -> None: if self._current_text_type == ScriptureTextType.NONVERSE: self._end_parent_element() self._end_non_verse_text_wrapper(state) + elif self._current_text_type == ScriptureTextType.NONE: + # empty verse paragraph + self._start_parent_element(marker) + self._start_non_verse_text_wrapper(state) + self._end_parent_element() + self._end_non_verse_text_wrapper(state) def start_row(self, state: UsfmParserState, marker: str) -> None: - if self._current_text_type == ScriptureTextType.NONVERSE: + if self._current_text_type == ScriptureTextType.NONVERSE or self._current_text_type == ScriptureTextType.NONE: self._start_parent_element(marker) def end_row(self, state: UsfmParserState, marker: str) -> None: - if self._current_text_type == ScriptureTextType.NONVERSE: + if self._current_text_type == ScriptureTextType.NONVERSE or self._current_text_type == ScriptureTextType.NONE: self._end_parent_element() def start_cell(self, state: UsfmParserState, marker: str, align: str, colspan: int) -> None: - if self._current_text_type == ScriptureTextType.NONVERSE: + if self._current_text_type == ScriptureTextType.NONVERSE or self._current_text_type == ScriptureTextType.NONE: self._start_parent_element(marker) self._start_non_verse_text_wrapper(state) @@ -95,13 +102,27 @@ def end_sidebar(self, state: UsfmParserState, marker: str, closed: bool) -> None self._end_parent_element() def start_note(self, state: UsfmParserState, marker: str, caller: str, category: Optional[str]) -> None: - self._next_element(marker) - self._start_note_text_wrapper(state) + if self._current_text_type != ScriptureTextType.NONE: + self._next_element(marker) + self._start_note_text_wrapper(state) def end_note(self, state: UsfmParserState, marker: str, closed: bool) -> None: - self._end_note_text_wrapper(state) - - def ref(self, state: UsfmParserState, marker: str, display: str, target: str) -> None: ... + if self._current_text_type == ScriptureTextType.NOTE: + self._end_note_text_wrapper(state) + + def text(self, state: UsfmParserState, text: str) -> None: + # if we hit text in a verse paragraph and we aren't in a verse, then start a non-verse segment + para_tag = state.para_tag + if ( + self._current_text_type == ScriptureTextType.NONE + and para_tag is not None + and para_tag.marker != "tr" + and state.is_verse_text + and self._cur_verse_ref.verse_num == 0 + and len(text.strip()) > 0 + ): + self._start_parent_element(para_tag.marker) + self._start_non_verse_text_wrapper(state) def _start_verse_text(self, state: UsfmParserState, scripture_refs: Optional[List[ScriptureRef]]) -> None: ... @@ -121,8 +142,9 @@ def _start_verse_text_wrapper(self, state: UsfmParserState) -> None: self._start_verse_text(state, self._create_verse_refs()) def _end_verse_text_wrapper(self, state: UsfmParserState) -> None: - if not self._duplicate_verse and self._cur_verse_ref.verse_num != 0: + if not self._duplicate_verse and self._cur_verse_ref.verse_num > 0: self._end_verse_text(state, self._create_verse_refs()) + if self._cur_verse_ref.verse_num > 0: self._cur_text_type_stack.pop() def _start_non_verse_text_wrapper(self, state: UsfmParserState) -> None: diff --git a/machine/corpora/usfm_file_text.py b/machine/corpora/usfm_file_text.py index d4c2d68..ede53c4 100644 --- a/machine/corpora/usfm_file_text.py +++ b/machine/corpora/usfm_file_text.py @@ -14,29 +14,16 @@ def __init__( self, stylesheet: UsfmStylesheet, encoding: str, + id: str, filename: StrPath, versification: Optional[Versification] = None, include_markers: bool = False, include_all_text: bool = False, + project: Optional[str] = None, ) -> None: - super().__init__( - _get_id(filename, encoding), stylesheet, encoding, versification, include_markers, include_all_text - ) + super().__init__(id, stylesheet, encoding, versification, include_markers, include_all_text, project) self._filename = Path(filename) def _create_stream_container(self) -> StreamContainer: return FileStreamContainer(self._filename) - - -def _get_id(filename: StrPath, encoding: str) -> str: - with open(filename, "r", encoding=encoding) as file: - for line in file: - line = line.strip() - if line.startswith("\\id "): - id = line[4:] - index = id.find(" ") - if index != -1: - id = id[:index] - return id.strip().upper() - raise RuntimeError(f"The USFM file '{filename}' does not contain an 'id' marker.") diff --git a/machine/corpora/usfm_file_text_corpus.py b/machine/corpora/usfm_file_text_corpus.py index 4033d02..dbc5f67 100644 --- a/machine/corpora/usfm_file_text_corpus.py +++ b/machine/corpora/usfm_file_text_corpus.py @@ -25,7 +25,24 @@ def __init__( stylesheet = UsfmStylesheet(stylesheet_filename) texts: List[UsfmFileText] = [] for sfm_filename in Path(project_dir).glob(file_pattern): - texts.append( - UsfmFileText(stylesheet, encoding, sfm_filename, versification, include_markers, include_all_text) - ) + id = _get_id(sfm_filename, encoding) + if id: + texts.append( + UsfmFileText( + stylesheet, encoding, id, sfm_filename, versification, include_markers, include_all_text + ) + ) super().__init__(versification, texts) + + +def _get_id(filename: StrPath, encoding: str) -> Optional[str]: + with open(filename, "r", encoding=encoding) as file: + for line in file: + line = line.strip() + if line.startswith("\\id "): + id = line[4:] + index = id.find(" ") + if index != -1: + id = id[:index] + return id.strip().upper() + return None diff --git a/machine/corpora/usfm_text_base.py b/machine/corpora/usfm_text_base.py index 481f047..bae08fe 100644 --- a/machine/corpora/usfm_text_base.py +++ b/machine/corpora/usfm_text_base.py @@ -11,7 +11,7 @@ from .scripture_text import ScriptureText from .stream_container import StreamContainer from .text_row import TextRow -from .usfm_parser import parse_usfm +from .usfm_parser import UsfmParser from .usfm_parser_state import UsfmParserState from .usfm_stylesheet import UsfmStylesheet from .usfm_token import UsfmAttribute, UsfmToken, UsfmTokenType @@ -26,6 +26,7 @@ def __init__( versification: Optional[Versification], include_markers: bool, include_all_text: bool, + project: Optional[str] = None, ) -> None: super().__init__(id, versification) @@ -33,6 +34,7 @@ def __init__( self._encoding = encoding self._include_markers = include_markers self._include_all_text = include_all_text + self.project = project @abstractmethod def _create_stream_container(self) -> StreamContainer: ... @@ -40,13 +42,16 @@ def _create_stream_container(self) -> StreamContainer: ... def _get_rows(self) -> Generator[TextRow, None, None]: usfm = self._read_usfm() row_collector = _TextRowCollector(self) - parse_usfm( - usfm, - row_collector, - self._stylesheet, - self.versification, - preserve_whitespace=self._include_markers, - ) + parser = UsfmParser(usfm, row_collector, self._stylesheet, self._versification, self._include_markers) + try: + parser.process_tokens() + except Exception as e: + error_message = ( + f"An error occurred while parsing the text '{self.id}'" + f"{f' in project {self.project}' if self.project else ''}" + f". Verse: {parser.state.verse_ref}, offset: {parser.state.verse_offset}, error: '{e}'" + ) + raise RuntimeError(error_message) from e return gen(row_collector.rows) def _read_usfm(self) -> str: diff --git a/machine/corpora/usfm_text_updater.py b/machine/corpora/usfm_text_updater.py index ba62eea..f178dfe 100644 --- a/machine/corpora/usfm_text_updater.py +++ b/machine/corpora/usfm_text_updater.py @@ -14,37 +14,34 @@ def __init__( rows: Optional[List[Tuple[List[ScriptureRef], str]]] = None, id_text: Optional[str] = None, strip_all_text: bool = False, - strict_comparison: bool = True, + prefer_existing_text: bool = False, ) -> None: super().__init__() self._rows = rows or [] self._tokens: List[UsfmToken] = [] + self._new_tokens: List[UsfmToken] = [] self._id_text = id_text self._strip_all_text = strip_all_text - self._strict_comparison = strict_comparison + self._prefer_existing_text = prefer_existing_text self._replace_stack: List[bool] = [] self._row_index: int = 0 self._token_index: int = 0 - self._replace_text: bool = False @property def tokens(self) -> List[UsfmToken]: return self._tokens - @property - def replace_text(self) -> bool: - return self._strip_all_text or (len(self._replace_stack) > 0 and self._replace_stack[-1]) - def start_book(self, state: UsfmParserState, marker: str, code: str) -> None: self._collect_tokens(state) + start_book_tokens: List[UsfmToken] = [] if self._id_text is not None: - self._tokens.append(UsfmToken(UsfmTokenType.TEXT, text=self._id_text + " ")) - self._replace_stack.append(self._id_text is not None) + start_book_tokens.append(UsfmToken(UsfmTokenType.TEXT, text=self._id_text + " ")) + self._push_new_tokens(start_book_tokens) super().start_book(state, marker, code) def end_book(self, state: UsfmParserState, marker: str) -> None: - self._replace_stack.pop() + self._pop_new_tokens() super().end_book(state, marker) @@ -127,7 +124,7 @@ def start_char( unknown: bool, attributes: List[UsfmAttribute], ) -> None: - if self.replace_text: + if self._replace_with_new_tokens(state): self._skip_tokens(state) else: self._collect_tokens(state) @@ -141,7 +138,7 @@ def end_char( attributes: List[UsfmAttribute], closed: bool, ) -> None: - if closed and self.replace_text: + if closed and self._replace_with_new_tokens(state): self._skip_tokens(state) super().end_char(state, marker, attributes, closed) @@ -153,7 +150,7 @@ def start_note( caller: str, category: str, ) -> None: - if self.replace_text: + if self._replace_with_new_tokens(state): self._skip_tokens(state) else: self._collect_tokens(state) @@ -161,13 +158,13 @@ def start_note( super().start_note(state, marker, caller, category) def end_note(self, state: UsfmParserState, marker: str, closed: bool) -> None: - if closed and self.replace_text: + if closed and self._replace_with_new_tokens(state): self._skip_tokens(state) super().end_note(state, marker, closed) def ref(self, state: UsfmParserState, marker: str, display: str, target: str) -> None: - if self.replace_text: + if self._replace_with_new_tokens(state): self._skip_tokens(state) else: self._collect_tokens(state) @@ -175,7 +172,7 @@ def ref(self, state: UsfmParserState, marker: str, display: str, target: str) -> super().ref(state, marker, display, target) def text(self, state: UsfmParserState, text: str) -> None: - if self.replace_text: + if self._replace_with_new_tokens(state): self._skip_tokens(state) else: self._collect_tokens(state) @@ -183,7 +180,7 @@ def text(self, state: UsfmParserState, text: str) -> None: super().text(state, text) def opt_break(self, state: UsfmParserState) -> None: - if self.replace_text: + if self._replace_with_new_tokens(state): self._skip_tokens(state) else: self._collect_tokens(state) @@ -191,7 +188,7 @@ def opt_break(self, state: UsfmParserState) -> None: super().opt_break(state) def unmatched(self, state: UsfmParserState, marker: str) -> None: - if self.replace_text: + if self._replace_with_new_tokens(state): self._skip_tokens(state) else: self._collect_tokens(state) @@ -200,38 +197,37 @@ def unmatched(self, state: UsfmParserState, marker: str) -> None: def _start_verse_text(self, state: UsfmParserState, scripture_refs: List[ScriptureRef]) -> None: row_texts: List[str] = self._advance_rows(scripture_refs) - self._tokens.extend(UsfmToken(UsfmTokenType.TEXT, text=t + " ") for t in row_texts) - self._replace_stack.append(len(row_texts) > 0) + self._push_new_tokens([UsfmToken(UsfmTokenType.TEXT, text=t + " ") for t in row_texts]) def _end_verse_text(self, state: UsfmParserState, scripture_refs: List[ScriptureRef]) -> None: - self._replace_stack.pop() + self._pop_new_tokens() def _start_non_verse_text(self, state: UsfmParserState, scripture_ref: ScriptureRef) -> None: row_texts = self._advance_rows([scripture_ref]) - self._tokens.extend(UsfmToken(UsfmTokenType.TEXT, text=t + " ") for t in row_texts) - self._replace_stack.append(len(row_texts) > 0) + self._push_new_tokens([UsfmToken(UsfmTokenType.TEXT, text=t + " ") for t in row_texts]) def _end_non_verse_text(self, state: UsfmParserState, scripture_ref: ScriptureRef) -> None: - self._replace_stack.pop() + self._pop_new_tokens() def _start_note_text(self, state: UsfmParserState, scripture_ref: ScriptureRef) -> None: row_texts = self._advance_rows([scripture_ref]) + new_tokens: List[UsfmToken] = [] if len(row_texts) > 0: if state.token is None: raise ValueError("Invalid parser state.") - self._tokens.append(state.token) - self._tokens.append(UsfmToken(UsfmTokenType.CHARACTER, "ft", None, "ft*")) + new_tokens.append(state.token) + new_tokens.append(UsfmToken(UsfmTokenType.CHARACTER, "ft", None, "ft*")) for i, text in enumerate(row_texts): if i < len(row_texts) - 1: text += " " - self._tokens.append(UsfmToken(UsfmTokenType.TEXT, text=text)) - self._tokens.append(UsfmToken(UsfmTokenType.END, state.token.end_marker, None, None)) - self._replace_stack.append(True) + new_tokens.append(UsfmToken(UsfmTokenType.TEXT, text=text)) + new_tokens.append(UsfmToken(UsfmTokenType.END, state.token.end_marker, None, None)) + self._push_new_tokens(new_tokens) else: - self._replace_stack.append(self._replace_stack[-1]) + self._push_token_as_previous() def _end_note_text(self, state: UsfmParserState, scripture_ref: ScriptureRef) -> None: - self._replace_stack.pop() + self._pop_new_tokens() def get_usfm(self, stylesheet: Union[str, UsfmStylesheet] = "usfm.sty") -> str: if isinstance(stylesheet, str): @@ -241,36 +237,61 @@ def get_usfm(self, stylesheet: Union[str, UsfmStylesheet] = "usfm.sty") -> str: def _advance_rows(self, seg_scr_refs: List[ScriptureRef]) -> List[str]: row_texts: List[str] = [] - i = 0 - while self._row_index < len(self._rows) and i < len(seg_scr_refs): + source_index: int = 0 + while self._row_index < len(self._rows) and source_index < len(seg_scr_refs): + compare: int = 0 row_scr_refs, text = self._rows[self._row_index] - stop = False for row_scr_ref in row_scr_refs: - found = False - for seg_scr_ref in seg_scr_refs[i:]: - compare = row_scr_ref.compare_to( - seg_scr_refs[i], compare_segments=False, strict=self._strict_comparison - ) - if compare == 0: - row_texts.append(text) - i += 1 - found = True + while source_index < len(seg_scr_refs): + compare = row_scr_ref.compare_to(seg_scr_refs[source_index], compare_segments=False) + if compare > 0: + # source is ahead of row, increment source + source_index += 1 + else: break - elif compare > 0: - stop = True - break - if stop or found: + if compare == 0: + # source and row match + # grab the text and increment both + row_texts.append(text) + source_index += 1 break - if stop: - break - else: + if compare <= 0: + # row is ahead of source, increment row self._row_index += 1 return row_texts def _collect_tokens(self, state: UsfmParserState) -> None: + self._tokens.extend(self._new_tokens) + self._new_tokens.clear() while self._token_index <= state.index + state.special_token_count: self._tokens.append(state.tokens[self._token_index]) self._token_index += 1 def _skip_tokens(self, state: UsfmParserState) -> None: self._token_index = state.index + 1 + state.special_token_count + + def _replace_with_new_tokens(self, state: UsfmParserState) -> bool: + new_text: bool = len(self._replace_stack) > 0 and self._replace_stack[-1] + token_end: int = state.index + state.special_token_count + 1 + existing_text: bool = False + for index in range(self._token_index, token_end + 1): + if state.tokens[index].type == UsfmTokenType.TEXT and state.tokens[index].text: + existing_text = True + break + use_new_tokens: bool = ( + self._strip_all_text or (new_text and not existing_text) or (new_text and not self._prefer_existing_text) + ) + if use_new_tokens: + self._tokens.extend(self._new_tokens) + self._new_tokens.clear() + return use_new_tokens + + def _push_new_tokens(self, tokens: List[UsfmToken]) -> None: + self._replace_stack.append(any(tokens)) + self._new_tokens.extend(tokens) + + def _push_token_as_previous(self) -> None: + self._replace_stack.append(self._replace_stack[-1]) + + def _pop_new_tokens(self) -> None: + self._replace_stack.pop() diff --git a/machine/corpora/usfm_tokenizer.py b/machine/corpora/usfm_tokenizer.py index 54a5e69..0a25b43 100644 --- a/machine/corpora/usfm_tokenizer.py +++ b/machine/corpora/usfm_tokenizer.py @@ -212,6 +212,7 @@ def tokenize(self, usfm: str, preserve_whitespace: bool = False) -> Sequence[Usf def detokenize(self, tokens: Iterable[UsfmToken], tokens_have_whitespace: bool = False) -> str: prev_token: Optional[UsfmToken] = None usfm = "" + in_book = False for token in tokens: token_usfm = "" if token.type in {UsfmTokenType.BOOK, UsfmTokenType.CHAPTER, UsfmTokenType.PARAGRAPH}: @@ -224,6 +225,7 @@ def detokenize(self, tokens: Iterable[UsfmToken], tokens_have_whitespace: bool = if not tokens_have_whitespace: usfm += "\r\n" token_usfm = token.to_usfm() + in_book = token.type == UsfmTokenType.BOOK elif token.type is UsfmTokenType.VERSE: # Add newline if after anything other than [ or ( if len(usfm) > 0 and usfm[-1] != "[" and usfm[-1] != "(": @@ -242,7 +244,7 @@ def detokenize(self, tokens: Iterable[UsfmToken], tokens_have_whitespace: bool = "\u200e" if self.rtl_reference_order is RtlReferenceOrder.BOOK_VERSE_CHAPTER else "\u200f" ) token_usfm = _RTL_VERSE_REGEX.sub(token_usfm, f"$1{direction_marker}$2") - + in_book = False elif token.type is UsfmTokenType.TEXT: # Ensure spaces are preserved token_usfm = token.to_usfm() @@ -257,7 +259,15 @@ def detokenize(self, tokens: Iterable[UsfmToken], tokens_have_whitespace: bool = else: token_usfm = token_usfm.lstrip(" ") else: + if in_book: + if usfm[-1] == " " and ( + (prev_token is not None and prev_token.to_usfm().strip() != "") or not tokens_have_whitespace + ): + usfm = usfm[:-1] + if not tokens_have_whitespace: + usfm += "\r\n" token_usfm = token.to_usfm() + in_book = False usfm += token_usfm prev_token = token diff --git a/machine/corpora/usfm_zip_text.py b/machine/corpora/usfm_zip_text.py index 8e85570..0b4f44b 100644 --- a/machine/corpora/usfm_zip_text.py +++ b/machine/corpora/usfm_zip_text.py @@ -1,6 +1,4 @@ -from io import TextIOWrapper from typing import Optional -from zipfile import ZipFile from ..scripture.verse_ref import Versification from ..utils.typeshed import StrPath @@ -15,38 +13,17 @@ def __init__( self, stylesheet: UsfmStylesheet, encoding: str, + id: str, archive_filename: StrPath, path: str, versification: Optional[Versification] = None, include_markers: bool = False, include_all_text: bool = False, + project: Optional[str] = None, ) -> None: - super().__init__( - _get_id(archive_filename, path, encoding), - stylesheet, - encoding, - versification, - include_markers, - include_all_text, - ) + super().__init__(id, stylesheet, encoding, versification, include_markers, include_all_text, project) self._archive_filename = archive_filename self._path = path def _create_stream_container(self) -> StreamContainer: return ZipEntryStreamContainer(self._archive_filename, self._path) - - -def _get_id(archive_filename: StrPath, path: str, encoding: str) -> str: - with ZipFile(archive_filename, "r") as archive: - entry = next((zi for zi in archive.filelist if zi.filename == path)) - with archive.open(entry, "r") as file: - stream = TextIOWrapper(file, encoding=encoding) - for line in stream: - line = line.strip() - if line.startswith("\\id "): - id = line[4:] - index = id.find(" ") - if index != -1: - id = id[:index] - return id.strip().upper() - raise RuntimeError("The USFM does not contain an 'id' marker.") diff --git a/machine/scripture/verse_ref.py b/machine/scripture/verse_ref.py index 2bfa081..cf077b2 100644 --- a/machine/scripture/verse_ref.py +++ b/machine/scripture/verse_ref.py @@ -407,7 +407,11 @@ def _compare_verses(self, other: VerseRef, compare_segments: bool) -> int: result = verse.compare_to(other_verse, compare_all_verses=False, compare_segments=compare_segments) if result != 0: return result - return len(verse_list) - len(other_verse_list) + if len(verse_list) < len(other_verse_list): + return -1 + elif len(verse_list) > len(other_verse_list): + return 1 + return 0 def _validate_single_verse(self) -> ValidStatus: # Unknown versification is always invalid diff --git a/tests/corpora/test_paratext_backup_text_corpus.py b/tests/corpora/test_paratext_backup_text_corpus.py index 61d067a..57907c3 100644 --- a/tests/corpora/test_paratext_backup_text_corpus.py +++ b/tests/corpora/test_paratext_backup_text_corpus.py @@ -11,7 +11,7 @@ def test_texts() -> None: with _TestEnvironment() as env: - assert [t.id for t in env.corpus.texts] == ["LEV", "1CH", "MAT", "MRK"] + assert [t.id for t in env.corpus.texts] == ["LEV", "1CH", "MAT", "MRK", "JHN"] def test_get_text() -> None: @@ -23,6 +23,10 @@ def test_get_text() -> None: luk = env.corpus.get_text("LUK") assert luk is None + jhn = env.corpus.get_text("JHN") + assert jhn is not None + assert not any(jhn.get_rows()) + class _TestEnvironment(ContextManager["_TestEnvironment"]): def __init__(self) -> None: diff --git a/tests/corpora/test_paratext_project_settings.py b/tests/corpora/test_paratext_project_settings.py index f30f836..039dde8 100644 --- a/tests/corpora/test_paratext_project_settings.py +++ b/tests/corpora/test_paratext_project_settings.py @@ -43,6 +43,77 @@ def test_get_book_file_name_book_num_prefix_c() -> None: assert settings.get_book_file_name("3MQ") == "PROJC0.SFM" +def test_get_book_id_book_num() -> None: + settings = _create_settings("41") + assert settings.get_book_id("PROJ42.SFM") == "MRK" + + +def test_get_book_id_book_num_book_id() -> None: + settings = _create_settings("41MAT") + assert settings.get_book_id("PROJ42MRK.SFM") == "MRK" + + +def test_get_book_id_book_id() -> None: + settings = _create_settings("MAT") + assert settings.get_book_id("PROJMRK.SFM") == "MRK" + + +def test_get_book_id_book_num_double_digit() -> None: + settings = _create_settings("41") + assert settings.get_book_id("PROJ01.SFM") == "GEN" + + +def test_get_book_id_book_num_xxg_book_num() -> None: + settings = _create_settings("41") + assert settings.get_book_id("PROJ100.SFM") == "XXG" + + +def test_get_book_id_book_num_xxg_book_num_book_id() -> None: + settings = _create_settings("41MAT") + assert settings.get_book_id("PROJ100XXG.SFM") == "XXG" + + +def test_get_book_id_book_num_prefix_a() -> None: + settings = _create_settings("41") + assert settings.get_book_id("PROJA0.SFM") == "FRT" + + +def test_get_book_id_book_num_prefix_b() -> None: + settings = _create_settings("41") + assert settings.get_book_id("PROJB0.SFM") == "TDX" + + +def test_get_book_id_book_num_prefix_c() -> None: + settings = _create_settings("41") + assert settings.get_book_id("PROJC0.SFM") == "3MQ" + + +def test_get_book_id_wrong_prefix() -> None: + settings = _create_settings("41") + assert settings.get_book_id("WRONG42.SFM") is None + + +def test_get_book_id_wrong_suffix() -> None: + settings = _create_settings("41") + assert settings.get_book_id("PROJ42.WRONG") is None + + +def test_get_book_id_wrong_book_part_book_num() -> None: + settings = _create_settings("41") + assert settings.get_book_id("PROJ42MRK.SFM") is None + + +def test_get_book_id_wrong_book_part_book_id() -> None: + settings = _create_settings("MAT") + assert settings.get_book_id("PROJ42.SFM") is None + + +def test_get_book_id_wrong_book_part_book_num_book_id() -> None: + settings = _create_settings("41MAT") + assert settings.get_book_id("PROJMRK.SFM") is None + assert settings.get_book_id("PROJ100.SFM") is None + + def _create_settings(file_name_form: str) -> ParatextProjectSettings: return ParatextProjectSettings( "Name", diff --git a/tests/corpora/test_scripture_ref.py b/tests/corpora/test_scripture_ref.py index 3247d39..8674082 100644 --- a/tests/corpora/test_scripture_ref.py +++ b/tests/corpora/test_scripture_ref.py @@ -1,45 +1,48 @@ +from pytest import raises + from machine.corpora import ScriptureRef -def test_compare_to_strict(): - assert compare_to_strict("MAT 1:1", "MAT 1:2") == -1, "VerseLessThan" - assert compare_to_strict("MAT 1:1", "MAT 1:1") == 0, "VerseEqualTo" - assert compare_to_strict("MAT 1:2", "MAT 1:1") == 1, "VerseGreaterThan" - assert compare_to_strict("MAT 1:0/1:p", "MAT 1:0/2:p") == -1, "NonVerseLessThan" - assert compare_to_strict("MAT 1:0/1:p", "MAT 1:0/1:p") == 0, "NonVerseEqualTo" - assert compare_to_strict("MAT 1:0/2:p", "MAT 1:0/1:p") == 1, "NonVerseGreaterThan" - assert compare_to_strict("MAT 1:0/1:esb", "MAT 1:0/1:esb/1:p") == -1, "NonVerseParentChild" +def test_compare_to(): + assert compare_to("MAT 1:1", "MAT 1:2") == -1, "VerseLessThan" + assert compare_to("MAT 1:1", "MAT 1:1") == 0, "VerseEqualTo" + assert compare_to("MAT 1:2", "MAT 1:1") == 1, "VerseGreaterThan" + assert compare_to("MAT 1:1-3", "MAT 1:1") == 1, "MultiVerseExtensionGreaterThan" + assert compare_to("MAT 1:1", "MAT 1:1-3") == -1, "MultiVerseExtensionLessThan" + assert compare_to("MAT 1:1-3", "MAT 1:2") == -1, "MultiVerseStartLessThan" + assert compare_to("MAT 1:2", "MAT 1:1-3") == 1, "MultiVerseEndGreaterThan" + assert compare_to("MAT 1:0/1:p", "MAT 1:0/2:p") == -1, "NonVerseLessThan" + assert compare_to("MAT 1:0/1:p", "MAT 1:0/1:p") == 0, "NonVerseEqualTo" + assert compare_to("MAT 1:0/2:p", "MAT 1:0/1:p") == 1, "NonVerseGreaterThan" + assert compare_to("MAT 1:0/1:esb", "MAT 1:0/1:esb/1:p") == -1, "NonVerseParentChild" + assert compare_to("MAT 1:0/2:esb", "MAT 1:0/1:esb/1:p") == 1, "NonVerseParentOtherChild" + assert compare_to("MAT 1:0/p", "MAT 1:0/2:p") == 0, "RelaxedSameMarker" + assert compare_to("MAT 1:0/p", "MAT 1:0/2:esb") == 1, "RelaxedSameLevel" + assert compare_to("MAT 1:0/esb", "MAT 1:0/1:esb/1:p") == -1, "RelaxedParentChild" + assert compare_to("MAT 1:0/2:esb", "MAT 1:0/esb/p") == -1, "ParentRelaxedChild" -def test_compare_to_relaxed(): - assert compare_to_relaxed("MAT 1:1", "MAT 1:2") == -1, "VerseLessThan" - assert compare_to_relaxed("MAT 1:1", "MAT 1:1") == 0, "VerseEqualTo" - assert compare_to_relaxed("MAT 1:2", "MAT 1:1") == 1, "VerseGreaterThan" - assert compare_to_relaxed("MAT 1:0/1:p", "MAT 1:0/2:p") == 0, "NonVerseSameMarkerDifferentPosition" - assert compare_to_relaxed("MAT 1:0/2:esb", "MAT 1:0/1:esb/1:p") == -1, "NonVerseParentChild" +def test_is_equal_to(): + ref1 = ScriptureRef.parse("MAT 1:1/1:p") + ref1dup = ScriptureRef.parse("MAT 1:1/1:p") + ref2 = ScriptureRef.parse("MAT 1:2/1:p") + obj1 = "A different type" + assert ref1 == ref1dup + assert ref1 != ref2 + assert ref1 != obj1 -def compare_to_strict(ref1_str, ref2_str): - ref1 = ScriptureRef.parse(ref1_str) - ref2 = ScriptureRef.parse(ref2_str) - result = ref1.compare_to(ref2) +def test_is_equal_to_throws_argument_exception(): + ref1 = ScriptureRef.parse("MAT 1:1/1:p") + obj1 = "A different type" - if result < 0: - result = -1 - elif result > 0: - result = 1 - return result + with raises(TypeError): + ref1.compare_to(obj1) -def compare_to_relaxed(ref1_str, ref2_str): +def compare_to(ref1_str, ref2_str): ref1 = ScriptureRef.parse(ref1_str) ref2 = ScriptureRef.parse(ref2_str) - result = ref1.compare_to(ref2, strict=False) - - if result < 0: - result = -1 - elif result > 0: - result = 1 - return result + return ref1.compare_to(ref2) diff --git a/tests/corpora/test_usfm_file_text.py b/tests/corpora/test_usfm_file_text.py index 33679ae..9bf3afc 100644 --- a/tests/corpora/test_usfm_file_text.py +++ b/tests/corpora/test_usfm_file_text.py @@ -10,7 +10,7 @@ def test_get_rows_nonempty_text() -> None: assert text is not None rows = list(text) - assert len(rows) == 19 + assert len(rows) == 23 assert scripture_ref(rows[0]) == ScriptureRef.parse("MAT 1:1", corpus.versification) assert rows[0].text == "Chapter one, verse one." @@ -21,41 +21,41 @@ def test_get_rows_nonempty_text() -> None: assert scripture_ref(rows[4]) == ScriptureRef.parse("MAT 1:5", corpus.versification) assert rows[4].text == "Chapter one, verse five." - assert scripture_ref(rows[5]) == ScriptureRef.parse("MAT 2:1", corpus.versification) - assert rows[5].text == "Chapter two, verse one." + assert scripture_ref(rows[8]) == ScriptureRef.parse("MAT 2:1", corpus.versification) + assert rows[8].text == "Chapter two, verse one." - assert scripture_ref(rows[6]) == ScriptureRef.parse("MAT 2:2", corpus.versification) - assert rows[6].text == "Chapter two, verse two. Chapter two, verse three." - assert rows[6].is_in_range - assert rows[6].is_range_start + assert scripture_ref(rows[9]) == ScriptureRef.parse("MAT 2:2", corpus.versification) + assert rows[9].text == "Chapter two, verse two. Chapter two, verse three." + assert rows[9].is_in_range + assert rows[9].is_range_start - assert scripture_ref(rows[7]) == ScriptureRef.parse("MAT 2:3", corpus.versification) - assert len(rows[7].segment) == 0 - assert rows[7].is_in_range - assert not rows[7].is_range_start + assert scripture_ref(rows[10]) == ScriptureRef.parse("MAT 2:3", corpus.versification) + assert len(rows[10].segment) == 0 + assert rows[10].is_in_range + assert not rows[10].is_range_start - assert scripture_ref(rows[8]) == ScriptureRef.parse("MAT 2:4a", corpus.versification) - assert len(rows[8].segment) == 0 - assert rows[8].is_in_range - assert not rows[8].is_range_start + assert scripture_ref(rows[11]) == ScriptureRef.parse("MAT 2:4a", corpus.versification) + assert len(rows[11].segment) == 0 + assert rows[11].is_in_range + assert not rows[11].is_range_start - assert scripture_ref(rows[9]) == ScriptureRef.parse("MAT 2:4b", corpus.versification) - assert rows[9].text == "Chapter two, verse four." + assert scripture_ref(rows[12]) == ScriptureRef.parse("MAT 2:4b", corpus.versification) + assert rows[12].text == "Chapter two, verse four." - assert scripture_ref(rows[10]) == ScriptureRef.parse("MAT 2:5", corpus.versification) - assert rows[10].text == "Chapter two, verse five." + assert scripture_ref(rows[13]) == ScriptureRef.parse("MAT 2:5", corpus.versification) + assert rows[13].text == "Chapter two, verse five." - assert scripture_ref(rows[11]) == ScriptureRef.parse("MAT 2:6", corpus.versification) - assert rows[11].text == "Chapter two, verse six." + assert scripture_ref(rows[14]) == ScriptureRef.parse("MAT 2:6", corpus.versification) + assert rows[14].text == "Chapter two, verse six." - assert scripture_ref(rows[15]) == ScriptureRef.parse("MAT 2:9", corpus.versification) - assert rows[15].text == "Chapter 2 verse 9" + assert scripture_ref(rows[18]) == ScriptureRef.parse("MAT 2:9", corpus.versification) + assert rows[18].text == "Chapter 2 verse 9" - assert scripture_ref(rows[16]) == ScriptureRef.parse("MAT 2:10", corpus.versification) - assert rows[16].text == "Chapter 2 verse 10" + assert scripture_ref(rows[19]) == ScriptureRef.parse("MAT 2:10", corpus.versification) + assert rows[19].text == "Chapter 2 verse 10" - assert scripture_ref(rows[17]) == ScriptureRef.parse("MAT 2:11", corpus.versification) - assert not rows[17].text + assert scripture_ref(rows[20]) == ScriptureRef.parse("MAT 2:11", corpus.versification) + assert not rows[20].text def test_get_rows_nonempty_text_all_text() -> None: @@ -65,7 +65,7 @@ def test_get_rows_nonempty_text_all_text() -> None: assert text is not None rows = list(text) - assert len(rows) == 36 + assert len(rows) == 49 assert scripture_ref(rows[0]) == ScriptureRef.parse("MAT 1:0/1:h", corpus.versification) assert rows[0].text == "Matthew" @@ -79,44 +79,53 @@ def test_get_rows_nonempty_text_all_text() -> None: assert scripture_ref(rows[3]) == ScriptureRef.parse("MAT 1:0/3:ip/1:fe", corpus.versification) assert rows[3].text == "This is an endnote." - assert scripture_ref(rows[4]) == ScriptureRef.parse("Mat 1:0/4:s", corpus.versification) - assert rows[4].text == "Chapter One" + assert scripture_ref(rows[4]) == ScriptureRef.parse("Mat 1:0/4:p", corpus.versification) + assert rows[4].text == "Here is another paragraph." - assert scripture_ref(rows[6]) == ScriptureRef.parse("MAT 1:1/1:f", corpus.versification) - assert rows[6].text == "1:1: This is a footnote." + assert scripture_ref(rows[7]) == ScriptureRef.parse("MAT 1:0/7:weirdtaglookingthing", corpus.versification) + assert rows[7].text == "that is not an actual tag." - assert scripture_ref(rows[8]) == ScriptureRef.parse("MAT 1:2/1:f", corpus.versification) - assert rows[8].text == "1:2: This is a footnote." + assert scripture_ref(rows[8]) == ScriptureRef.parse("MAT 1:0/8:s", corpus.versification) + assert rows[8].text == "Chapter One" - assert scripture_ref(rows[12]) == ScriptureRef.parse("MAT 2:0/1:tr/1:tc1", corpus.versification) - assert rows[12].text == "Row one, column one." + assert scripture_ref(rows[10]) == ScriptureRef.parse("MAT 1:1/1:f", corpus.versification) + assert rows[10].text == "1:1: This is a footnote." - assert scripture_ref(rows[13]) == ScriptureRef.parse("MAT 2:0/1:tr/2:tc2", corpus.versification) - assert rows[13].text == "Row one, column two." + assert scripture_ref(rows[12]) == ScriptureRef.parse("MAT 1:2/1:f", corpus.versification) + assert rows[12].text == "1:2: This is a footnote." - assert scripture_ref(rows[14]) == ScriptureRef.parse("MAT 2:0/2:tr/1:tc1", corpus.versification) - assert rows[14].text == "Row two, column one." + assert scripture_ref(rows[19]) == ScriptureRef.parse("MAT 2:0/1:tr/1:tc1", corpus.versification) + assert rows[19].text == "Row one, column one." - assert scripture_ref(rows[15]) == ScriptureRef.parse("MAT 2:0/2:tr/2:tc2", corpus.versification) - assert rows[15].text == "Row two, column two." + assert scripture_ref(rows[20]) == ScriptureRef.parse("MAT 2:0/1:tr/2:tc2", corpus.versification) + assert rows[20].text == "Row one, column two." - assert scripture_ref(rows[16]) == ScriptureRef.parse("MAT 2:0/3:s1", corpus.versification) - assert rows[16].text == "Chapter Two" + assert scripture_ref(rows[21]) == ScriptureRef.parse("MAT 2:0/2:tr/1:tc1", corpus.versification) + assert rows[21].text == "Row two, column one." - assert scripture_ref(rows[18]) == ScriptureRef.parse("MAT 2:1/1:f", corpus.versification) - assert rows[18].text == "2:1: This is a footnote." + assert scripture_ref(rows[22]) == ScriptureRef.parse("MAT 2:0/2:tr/2:tc2", corpus.versification) + assert rows[22].text == "Row two, column two." - assert scripture_ref(rows[21]) == ScriptureRef.parse("MAT 2:3/1:esb/1:ms", corpus.versification) - assert rows[21].text == "This is a sidebar" + assert scripture_ref(rows[23]) == ScriptureRef.parse("MAT 2:0/3:s1", corpus.versification) + assert rows[23].text == "Chapter Two" - assert scripture_ref(rows[22]) == ScriptureRef.parse("MAT 2:3/1:esb/2:p", corpus.versification) - assert rows[22].text == "Here is some sidebar content." + assert scripture_ref(rows[24]) == ScriptureRef.parse("MAT 2:0/4:p", corpus.versification) + assert not rows[24].text - assert scripture_ref(rows[28]) == ScriptureRef.parse("MAT 2:7a/1:s", corpus.versification) - assert rows[28].text == "Section header" + assert scripture_ref(rows[26]) == ScriptureRef.parse("MAT 2:1/1:f", corpus.versification) + assert rows[26].text == "2:1: This is a footnote." - assert scripture_ref(rows[35]) == ScriptureRef.parse("MAT 2:12/1:restore", corpus.versification) - assert rows[35].text == "restore information" + assert scripture_ref(rows[29]) == ScriptureRef.parse("MAT 2:3/1:esb/1:ms", corpus.versification) + assert rows[29].text == "This is a sidebar" + + assert scripture_ref(rows[30]) == ScriptureRef.parse("MAT 2:3/1:esb/2:p", corpus.versification) + assert rows[30].text == "Here is some sidebar content." + + assert scripture_ref(rows[36]) == ScriptureRef.parse("MAT 2:7a/1:s", corpus.versification) + assert rows[36].text == "Section header" + + assert scripture_ref(rows[43]) == ScriptureRef.parse("MAT 2:12/1:restore", corpus.versification) + assert rows[43].text == "restore information" def test_get_rows_sentence_start() -> None: @@ -126,7 +135,7 @@ def test_get_rows_sentence_start() -> None: assert text is not None rows = list(text) - assert len(rows) == 19 + assert len(rows) == 23 assert scripture_ref(rows[3]) == ScriptureRef.parse("MAT 1:4", corpus.versification) assert rows[3].text == "Chapter one, verse four," @@ -154,7 +163,7 @@ def test_get_rows_include_markers() -> None: assert text is not None rows = list(text) - assert len(rows) == 19 + assert len(rows) == 23 assert scripture_ref(rows[0]) == ScriptureRef.parse("MAT 1:1", corpus.versification) assert ( @@ -167,38 +176,38 @@ def test_get_rows_include_markers() -> None: assert scripture_ref(rows[4]) == ScriptureRef.parse("MAT 1:5", corpus.versification) assert rows[4].text == 'Chapter one, \\li2 verse \\fig Figure 1|src="image1.png" size="col" ref="1:5"\\fig* five.' - assert scripture_ref(rows[5]) == ScriptureRef.parse("MAT 2:1", corpus.versification) - assert rows[5].text == "Chapter \\add two\\add*, verse \\f + \\fr 2:1: \\ft This is a footnote.\\f*one." + assert scripture_ref(rows[8]) == ScriptureRef.parse("MAT 2:1", corpus.versification) + assert rows[8].text == "Chapter \\add two\\add*, verse \\f + \\fr 2:1: \\ft This is a footnote.\\f*one." - assert scripture_ref(rows[6]) == ScriptureRef.parse("MAT 2:2", corpus.versification) - assert rows[6].text == "Chapter two, // verse \\fm ∆\\fm*two. Chapter two, verse \\w three|lemma\\w*." - assert rows[6].is_in_range - assert rows[6].is_range_start + assert scripture_ref(rows[9]) == ScriptureRef.parse("MAT 2:2", corpus.versification) + assert rows[9].text == "Chapter two, // verse \\fm ∆\\fm*two. Chapter two, verse \\w three|lemma\\w*." + assert rows[9].is_in_range + assert rows[9].is_range_start - assert scripture_ref(rows[7]) == ScriptureRef.parse("MAT 2:3", corpus.versification) - assert len(rows[7].segment) == 0 - assert rows[7].is_in_range - assert not rows[7].is_range_start + assert scripture_ref(rows[10]) == ScriptureRef.parse("MAT 2:3", corpus.versification) + assert len(rows[10].segment) == 0 + assert rows[10].is_in_range + assert not rows[10].is_range_start - assert scripture_ref(rows[8]) == ScriptureRef.parse("MAT 2:4a", corpus.versification) - assert len(rows[8].segment) == 0 - assert rows[8].is_in_range - assert not rows[8].is_range_start + assert scripture_ref(rows[11]) == ScriptureRef.parse("MAT 2:4a", corpus.versification) + assert len(rows[11].segment) == 0 + assert rows[11].is_in_range + assert not rows[11].is_range_start - assert scripture_ref(rows[9]) == ScriptureRef.parse("MAT 2:4b", corpus.versification) - assert rows[9].text == "Chapter two, verse four." + assert scripture_ref(rows[12]) == ScriptureRef.parse("MAT 2:4b", corpus.versification) + assert rows[12].text == "Chapter two, verse four." - assert scripture_ref(rows[10]) == ScriptureRef.parse("MAT 2:5", corpus.versification) - assert rows[10].text == "Chapter two, verse five \\rq (MAT 3:1)\\rq*." + assert scripture_ref(rows[13]) == ScriptureRef.parse("MAT 2:5", corpus.versification) + assert rows[13].text == "Chapter two, verse five \\rq (MAT 3:1)\\rq*." - assert scripture_ref(rows[11]) == ScriptureRef.parse("MAT 2:6", corpus.versification) - assert rows[11].text == 'Chapter two, verse \\w six|strong="12345" \\w*.' + assert scripture_ref(rows[14]) == ScriptureRef.parse("MAT 2:6", corpus.versification) + assert rows[14].text == 'Chapter two, verse \\w six|strong="12345" \\w*.' - assert scripture_ref(rows[15]) == ScriptureRef.parse("MAT 2:9", corpus.versification) - assert rows[15].text == "Chapter\\tcr2 2\\tc3 verse\\tcr4 9" + assert scripture_ref(rows[18]) == ScriptureRef.parse("MAT 2:9", corpus.versification) + assert rows[18].text == "Chapter\\tcr2 2\\tc3 verse\\tcr4 9" - assert scripture_ref(rows[16]) == ScriptureRef.parse("MAT 2:10", corpus.versification) - assert rows[16].text == "\\tc3-4 Chapter 2 verse 10" + assert scripture_ref(rows[19]) == ScriptureRef.parse("MAT 2:10", corpus.versification) + assert rows[19].text == "\\tc3-4 Chapter 2 verse 10" def test_get_rows_include_markers_all_text() -> None: @@ -209,30 +218,30 @@ def test_get_rows_include_markers_all_text() -> None: assert text is not None rows = list(text) - assert len(rows) == 32 + assert len(rows) == 45 assert scripture_ref(rows[2]) == ScriptureRef.parse("MAT 1:0/3:ip", corpus.versification) assert rows[2].text == "An introduction to Matthew\\fe + \\ft This is an endnote.\\fe*" - assert scripture_ref(rows[4]) == ScriptureRef.parse("MAT 1:1", corpus.versification) + assert scripture_ref(rows[8]) == ScriptureRef.parse("MAT 1:1", corpus.versification) assert ( - rows[4].text == "Chapter \\pn one\\+pro WON\\+pro*\\pn*, verse one.\\f + \\fr 1:1: \\ft This is a footnote.\\f*" + rows[8].text == "Chapter \\pn one\\+pro WON\\+pro*\\pn*, verse one.\\f + \\fr 1:1: \\ft This is a footnote.\\f*" ) - assert scripture_ref(rows[5]) == ScriptureRef.parse("MAT 1:2", corpus.versification) - assert rows[5].text == "\\bd C\\bd*hapter one, \\li2 verse\\f + \\fr 1:2: \\ft This is a footnote.\\f* two." + assert scripture_ref(rows[9]) == ScriptureRef.parse("MAT 1:2", corpus.versification) + assert rows[9].text == "\\bd C\\bd*hapter one, \\li2 verse\\f + \\fr 1:2: \\ft This is a footnote.\\f* two." - assert scripture_ref(rows[8]) == ScriptureRef.parse("MAT 1:5", corpus.versification) - assert rows[8].text == 'Chapter one, \\li2 verse \\fig Figure 1|src="image1.png" size="col" ref="1:5"\\fig* five.' + assert scripture_ref(rows[12]) == ScriptureRef.parse("MAT 1:5", corpus.versification) + assert rows[12].text == 'Chapter one, \\li2 verse \\fig Figure 1|src="image1.png" size="col" ref="1:5"\\fig* five.' - assert scripture_ref(rows[13]) == ScriptureRef.parse("MAT 2:0/3:s1", corpus.versification) - assert rows[13].text == "Chapter \\it Two \\it*" + assert scripture_ref(rows[20]) == ScriptureRef.parse("MAT 2:0/3:s1", corpus.versification) + assert rows[20].text == "Chapter \\it Two \\it*" - assert scripture_ref(rows[14]) == ScriptureRef.parse("MAT 2:1", corpus.versification) - assert rows[14].text == "Chapter \\add two\\add*, verse \\f + \\fr 2:1: \\ft This is a footnote.\\f*one." + assert scripture_ref(rows[22]) == ScriptureRef.parse("MAT 2:1", corpus.versification) + assert rows[22].text == "Chapter \\add two\\add*, verse \\f + \\fr 2:1: \\ft This is a footnote.\\f*one." - assert scripture_ref(rows[18]) == ScriptureRef.parse("MAT 2:3/1:esb/2:p", corpus.versification) - assert rows[18].text == "Here is some sidebar // content." + assert scripture_ref(rows[26]) == ScriptureRef.parse("MAT 2:3/1:esb/2:p", corpus.versification) + assert rows[26].text == "Here is some sidebar // content." def test_usfm_file_text_corpus_lowercase_usfm_id() -> None: diff --git a/tests/corpora/test_usfm_manual.py b/tests/corpora/test_usfm_manual.py new file mode 100644 index 0000000..90aa4f9 --- /dev/null +++ b/tests/corpora/test_usfm_manual.py @@ -0,0 +1,77 @@ +import json +from dataclasses import dataclass +from pathlib import Path +from typing import List, Tuple + +import pytest +from testutils.corpora_test_helpers import TEST_DATA_PATH, USFM_SOURCE_PROJECT_PATH, USFM_TARGET_PROJECT_PATH + +from machine.corpora import ( + FileParatextProjectSettingsParser, + ParatextProjectSettings, + ParatextTextCorpus, + ScriptureRef, + StandardParallelTextCorpus, + UsfmTextUpdater, + parse_usfm, +) + + +@pytest.mark.skip(reason="This is for manual testing only. Remove this decorator to run the test.") +def test_parse_parallel_corpus(): + t_corpus = ParatextTextCorpus(USFM_TARGET_PROJECT_PATH, include_all_text=True, include_markers=True) + s_corpus = ParatextTextCorpus(USFM_SOURCE_PROJECT_PATH, include_all_text=True, include_markers=True) + p_corpus = StandardParallelTextCorpus(s_corpus, t_corpus, all_source_rows=True, all_target_rows=False) + + rows = list(p_corpus.get_rows()) + assert rows + + +@dataclass +class PretranslationDto: + text_id: str + refs: List[str] + translation: str + + def __post_init__(self): + if self.text_id is None: + raise ValueError("text_id is a required field") + if self.refs is None: + raise ValueError("refs is a required field") + if self.translation is None: + raise ValueError("translation is a required field") + + +PRETRANSLATION_PATH = TEST_DATA_PATH / "pretranslations.json" +PARATEXT_PROJECT_PATH = TEST_DATA_PATH / "project" + + +@pytest.mark.skip(reason="This is for manual testing only. Remove this decorator to run the test.") +def test_create_usfm_file(): + parser = FileParatextProjectSettingsParser(PARATEXT_PROJECT_PATH) + settings: ParatextProjectSettings = parser.parse() + + # Read text from pretranslations file + with open(PRETRANSLATION_PATH, mode="r") as pretranslation_stream: + pretranslations_dto: List[PretranslationDto] = [ + PretranslationDto(text_id=item["textId"], refs=item["refs"], translation=item["translation"]) + for item in json.loads(pretranslation_stream.read()) + ] + + pretranslations: List[Tuple[List[ScriptureRef], str]] = [ + ( + [ScriptureRef.parse(ref, settings.versification).to_relaxed() for ref in p.refs] or [], + p.translation or "", + ) + for p in pretranslations_dto + ] + + for sfm_file_name in Path(PARATEXT_PROJECT_PATH).rglob(f"{settings.file_name_prefix}*{settings.file_name_suffix}"): + updater = UsfmTextUpdater(pretranslations, strip_all_text=True, prefer_existing_text=True) + + with open(sfm_file_name, mode="r") as sfm_file: + usfm: str = sfm_file.read() + + parse_usfm(usfm, updater, settings.stylesheet, settings.versification) + new_usfm: str = updater.get_usfm(settings.stylesheet) + assert new_usfm is not None diff --git a/tests/corpora/test_usfm_text_updater.py b/tests/corpora/test_usfm_text_updater.py index 80693ab..5545a69 100644 --- a/tests/corpora/test_usfm_text_updater.py +++ b/tests/corpora/test_usfm_text_updater.py @@ -30,6 +30,40 @@ def test_get_usfm_strip_all_text() -> None: assert "\\s\r\n" in target +def test_get_usfm_prefer_existing(): + rows = [ + ( + scr_ref("MAT 1:6"), + str("Text 6"), + ), + ( + scr_ref("MAT 1:7"), + str("Text 7"), + ), + ] + target = update_usfm(rows, prefer_existing_text=True) + assert "\\id MAT - Test\r\n" in target + assert "\\v 6 Verse 6 content.\r\n" in target + assert "\\v 7 Text 7\r\n" in target + + +def test_get_usfm_prefer_rows(): + rows = [ + ( + scr_ref("MAT 1:6"), + str("Text 6"), + ), + ( + scr_ref("MAT 1:7"), + str("Text 7"), + ), + ] + target = update_usfm(rows, prefer_existing_text=False) + assert "\\id MAT - Test\r\n" in target + assert "\\v 6 Text 6\r\n" in target + assert "\\v 7 Text 7\r\n" in target + + def test_get_usfm_verse_skip_note() -> None: rows = [ ( @@ -185,7 +219,7 @@ def test_get_usfm_nonverse_char_style() -> None: def test_get_usfm_nonverse_paragraph() -> None: rows = [ ( - scr_ref("MAT 1:0/4:s"), + scr_ref("MAT 1:0/8:s"), str("The first chapter."), ) ] @@ -216,7 +250,7 @@ def test_get_usfm_nonverse_relaxed() -> None: str("The third cell of the table."), ), ] - target = update_usfm(rows, strict_comparison=False) + target = update_usfm(rows) assert "\\s The first chapter.\r\n" in target assert "\\v 1 First verse of the first chapter.\r\n" in target assert "\\tr \\tc1 The first cell of the table. \\tc2 The second cell of the table.\r\n" in target @@ -297,6 +331,50 @@ def test_get_usfm_nonverse_replace_note() -> None: assert "\\ip The introductory paragraph. \\fe + \\ft This is a new endnote.\\fe*\r\n" in target +def test_get_usfm_verse_double_va_vp() -> None: + rows = [ + ( + scr_ref("MAT 3:1"), + str("Updating later in the book to start."), + ) + ] + target = update_usfm(rows) + assert "\\id MAT - Test\r\n" in target + assert "\\v 1 \\va 2\\va*\\vp 1 (2)\\vp*Updating later in the book to start.\r\n" in target + + +def test_get_usfm_verse_pretranslations_before_text() -> None: + rows = [ + ( + scr_ref("GEN 1:1"), + str("Pretranslations before the start"), + ), + ( + scr_ref("GEN 1:2"), + str("Pretranslations before the start"), + ), + ( + scr_ref("GEN 1:3"), + str("Pretranslations before the start"), + ), + ( + scr_ref("GEN 1:4"), + str("Pretranslations before the start"), + ), + ( + scr_ref("GEN 1:5"), + str("Pretranslations before the start"), + ), + ( + scr_ref("MAT 1:0/3:ip"), + str("The introductory paragraph."), + ), + ] + + target = update_usfm(rows) + assert "\\ip The introductory paragraph.\r\n" in target + + def scr_ref(*refs: str) -> List[ScriptureRef]: return [ScriptureRef.parse(ref) for ref in refs] @@ -305,10 +383,10 @@ def update_usfm( rows: Optional[List[Tuple[List[ScriptureRef], str]]] = None, id_text: Optional[str] = None, strip_all_text: bool = False, - strict_comparison: bool = True, + prefer_existing_text: bool = False, ) -> str: source = read_usfm() - updater = UsfmTextUpdater(rows, id_text, strip_all_text, strict_comparison) + updater = UsfmTextUpdater(rows, id_text, strip_all_text, prefer_existing_text) parse_usfm(source, updater) return updater.get_usfm() diff --git a/tests/corpora/test_usfm_tokenizer.py b/tests/corpora/test_usfm_tokenizer.py index 74c30ac..d9b07be 100644 --- a/tests/corpora/test_usfm_tokenizer.py +++ b/tests/corpora/test_usfm_tokenizer.py @@ -7,22 +7,22 @@ def test_tokenize() -> None: usfm = _read_usfm() usfm_tokenizer = UsfmTokenizer() tokens = usfm_tokenizer.tokenize(usfm) - assert len(tokens) == 170 + assert len(tokens) == 224 assert tokens[0].type is UsfmTokenType.BOOK assert tokens[0].marker == "id" assert tokens[0].data == "MAT" - assert tokens[15].type is UsfmTokenType.TEXT - assert tokens[15].text == "Chapter One " + assert tokens[34].type is UsfmTokenType.TEXT + assert tokens[34].text == "Chapter One " - assert tokens[16].type is UsfmTokenType.VERSE - assert tokens[16].marker == "v" - assert tokens[16].data == "1" + assert tokens[35].type is UsfmTokenType.VERSE + assert tokens[35].marker == "v" + assert tokens[35].data == "1" - assert tokens[25].type is UsfmTokenType.NOTE - assert tokens[25].marker == "f" - assert tokens[25].data == "+" + assert tokens[44].type is UsfmTokenType.NOTE + assert tokens[44].marker == "f" + assert tokens[44].data == "+" def test_detokenize() -> None: diff --git a/tests/testutils/corpora_test_helpers.py b/tests/testutils/corpora_test_helpers.py index 38c0429..11edfdb 100644 --- a/tests/testutils/corpora_test_helpers.py +++ b/tests/testutils/corpora_test_helpers.py @@ -7,6 +7,8 @@ from . import TEST_DATA_PATH USFM_TEST_PROJECT_PATH = TEST_DATA_PATH / "usfm" / "Tes" +USFM_TARGET_PROJECT_PATH = TEST_DATA_PATH / "usfm" / "target" +USFM_SOURCE_PROJECT_PATH = TEST_DATA_PATH / "usfm" / "source" USX_TEST_PROJECT_PATH = TEST_DATA_PATH / "usx" / "Tes" TEXT_TEST_PROJECT_PATH = TEST_DATA_PATH / "txt" CUSTOM_VERS_PATH = TEST_DATA_PATH / "custom.vrs" diff --git a/tests/testutils/data/usfm/Tes/04LEVTes.SFM b/tests/testutils/data/usfm/Tes/03LEVTes.SFM similarity index 100% rename from tests/testutils/data/usfm/Tes/04LEVTes.SFM rename to tests/testutils/data/usfm/Tes/03LEVTes.SFM diff --git a/tests/testutils/data/usfm/Tes/41MATTes.SFM b/tests/testutils/data/usfm/Tes/41MATTes.SFM index af634ba..43b2665 100644 --- a/tests/testutils/data/usfm/Tes/41MATTes.SFM +++ b/tests/testutils/data/usfm/Tes/41MATTes.SFM @@ -1,7 +1,11 @@ \id MAT - Test +\f + \fr 1.0 \ft \f* \h Matthew \mt Matthew \ip An introduction to Matthew\fe + \ft This is an endnote.\fe* +\p Here is another paragraph. +\p and with a \w keyword|a special concept\w* in it. +\p and a \weirdtaglookingthing that is not an actual tag. \c 1 \s Chapter One \v 1 Chapter \pn one\+pro WON\+pro*\pn*, verse one.\f + \fr 1:1: \ft This is a footnote.\f* @@ -14,11 +18,15 @@ \li2 verse four, \v 5 Chapter one, \li2 verse \fig Figure 1|src="image1.png" size="col" ref="1:5"\fig* five. +\v 6 Verse 6 content. +\v 7 +\v 8 \c 2 \tr \tc1 Row one, column one. \tc2 Row one, column two. \tr \tc1 Row two, column one. \tc2 Row two, column two. \s1 Chapter \it Two \it* \p +\p \v 1 Chapter \add two\add*, verse \f + \fr 2:1: \ft This is a footnote.\f*one. \v 2-3 Chapter two, // verse \fm ∆\fm*two. \esb @@ -29,7 +37,9 @@ \v 4b Chapter two, verse four. \p \v 6 Chapter two, verse \w six|strong="12345" \w*. +\p \v 6 Bad verse. +\p \v 5 Chapter two, verse five \rq (MAT 3:1)\rq*. \v 7a Chapter two, verse seven A, \s Section header \ts-s\* @@ -44,3 +54,14 @@ \v 10 \tc3-4 \qt-s |Jesus\*Chapter 2 verse 10\qt-e\* \v 11-12 \restore restore information +\c 3 +\cl PSALM 3 +\s1 Section 1 +\mt1 Major Title 1 +\d \va (1)\va* Description +\q1 +\v 1 \va 2\va*\vp 1 (2)\vp* Chapter 3 verse 1. +\q1 3.1 part 2 +\b +\q1 3.1 part 3 +\q1 3.1 part 4 diff --git a/tests/testutils/data/usfm/Tes/44JHNTes.SFM b/tests/testutils/data/usfm/Tes/44JHNTes.SFM new file mode 100644 index 0000000..e69de29