diff --git a/machine/corpora/__init__.py b/machine/corpora/__init__.py index 587bd34..553ed97 100644 --- a/machine/corpora/__init__.py +++ b/machine/corpora/__init__.py @@ -15,7 +15,15 @@ from .parallel_text_row import ParallelTextRow from .paratext_backup_text_corpus import ParatextBackupTextCorpus from .paratext_text_corpus import ParatextTextCorpus -from .scripture_text_corpus import ScriptureTextCorpus, create_versification_ref_corpus, extract_scripture_corpus +from .scripture_element import ScriptureElement +from .scripture_ref import EMPTY_SCRIPTURE_REF, ScriptureRef +from .scripture_ref_usfm_parser_handler import ScriptureRefUsfmParserHandler, ScriptureTextType +from .scripture_text_corpus import ( + ScriptureTextCorpus, + create_versification_ref_corpus, + extract_scripture_corpus, + is_scripture, +) from .standard_parallel_text_corpus import StandardParallelTextCorpus from .text import Text from .text_corpus import TextCorpus @@ -41,6 +49,7 @@ from .usfm_parser_state import UsfmElementType, UsfmParserElement, UsfmParserState from .usfm_stylesheet import UsfmStylesheet from .usfm_tag import UsfmJustification, UsfmStyleAttribute, UsfmStyleType, UsfmTag, UsfmTextProperties, UsfmTextType +from .usfm_text_updater import UsfmTextUpdater from .usfm_token import UsfmAttribute, UsfmToken, UsfmTokenType from .usfm_tokenizer import RtlReferenceOrder, UsfmTokenizer from .usx_file_alignment_collection import UsxFileAlignmentCollection @@ -60,9 +69,11 @@ "DblBundleTextCorpus", "DictionaryAlignmentCorpus", "DictionaryTextCorpus", + "EMPTY_SCRIPTURE_REF", "escape_spaces", "extract_scripture_corpus", "flatten", + "is_scripture", "lowercase", "MemoryAlignmentCollection", "MemoryText", @@ -78,7 +89,11 @@ "ParatextTextCorpus", "parse_usfm", "RtlReferenceOrder", + "ScriptureElement", + "ScriptureRef", + "ScriptureRefUsfmParserHandler", "ScriptureTextCorpus", + "ScriptureTextType", "StandardParallelTextCorpus", "Text", "TextCorpus", @@ -104,6 +119,7 @@ "UsfmTag", 
"UsfmTextProperties", "UsfmTextType", + "UsfmTextUpdater", "UsfmToken", "UsfmTokenizer", "UsfmTokenType", diff --git a/machine/corpora/dictionary_text_corpus.py b/machine/corpora/dictionary_text_corpus.py index df05908..9fe0a46 100644 --- a/machine/corpora/dictionary_text_corpus.py +++ b/machine/corpora/dictionary_text_corpus.py @@ -1,5 +1,6 @@ from typing import Iterable, Optional, overload +from ..scripture.verse_ref import Versification from .text import Text from .text_corpus import TextCorpus @@ -21,6 +22,7 @@ def __init__(self, *args, **kwargs) -> None: texts = args[0] self._texts = {t.id: t for t in texts} self._is_tokenized = False + self._versification = None @property def texts(self) -> Iterable[Text]: @@ -34,6 +36,14 @@ def is_tokenized(self) -> bool: def is_tokenized(self, value: bool) -> None: self._is_tokenized = value + @property + def versification(self) -> Optional[Versification]: + return self._versification + + @versification.setter + def versification(self, value: Versification) -> None: + self._versification = value + def __getitem__(self, id: str) -> Optional[Text]: return self._texts.get(id) diff --git a/machine/corpora/flatten.py b/machine/corpora/flatten.py index 8a3398e..2515a2b 100644 --- a/machine/corpora/flatten.py +++ b/machine/corpora/flatten.py @@ -1,6 +1,7 @@ from itertools import chain from typing import Generator, Iterable, List, Optional, cast, overload +from ..scripture.verse_ref import Versification from .alignment_collection import AlignmentCollection from .alignment_corpus import AlignmentCorpus from .alignment_row import AlignmentRow @@ -54,6 +55,10 @@ def texts(self) -> Iterable[Text]: def is_tokenized(self) -> bool: return all(c.is_tokenized for c in self._corpora) + @property + def versification(self) -> Optional[Versification]: + return self._corpora[0].versification if len(self._corpora) > 0 else None + def count(self, include_empty: bool = True) -> int: return sum(c.count(include_empty) for c in self._corpora) diff 
--git a/machine/corpora/parallel_text_row.py b/machine/corpora/parallel_text_row.py index 60666b7..2608184 100644 --- a/machine/corpora/parallel_text_row.py +++ b/machine/corpora/parallel_text_row.py @@ -18,6 +18,8 @@ def __init__( source_flags: TextRowFlags = TextRowFlags.SENTENCE_START, target_flags: TextRowFlags = TextRowFlags.SENTENCE_START, ) -> None: + if not text_id: + raise ValueError("A text_id must be set.") if len(source_refs) == 0 and len(target_refs) == 0: raise ValueError("Either a source or target ref must be set.") self._text_id = text_id diff --git a/machine/corpora/paratext_backup_text_corpus.py b/machine/corpora/paratext_backup_text_corpus.py index 7bd93e6..347ba08 100644 --- a/machine/corpora/paratext_backup_text_corpus.py +++ b/machine/corpora/paratext_backup_text_corpus.py @@ -10,7 +10,7 @@ class ParatextBackupTextCorpus(ScriptureTextCorpus): - def __init__(self, filename: StrPath, include_markers: bool = False) -> None: + def __init__(self, filename: StrPath, include_markers: bool = False, include_all_text: bool = False) -> None: with ZipFile(filename, "r") as archive: parser = ZipParatextProjectSettingsParser(archive) settings = parser.parse() @@ -28,6 +28,7 @@ def __init__(self, filename: StrPath, include_markers: bool = False) -> None: sfm_entry.filename, versification, include_markers, + include_all_text, ) ) diff --git a/machine/corpora/paratext_project_settings_parser_base.py b/machine/corpora/paratext_project_settings_parser_base.py index 2de7a48..a7a5883 100644 --- a/machine/corpora/paratext_project_settings_parser_base.py +++ b/machine/corpora/paratext_project_settings_parser_base.py @@ -73,8 +73,16 @@ def parse(self) -> ParatextProjectSettings: post_part = naming_elem.get("PostPart") if post_part: suffix = post_part - biblical_terms = settings_tree.getroot().findtext("BiblicalTermsListSetting", "") - parts = biblical_terms.split(":", 2) + biblical_terms_list_setting = settings_tree.getroot().findtext("BiblicalTermsListSetting", "") 
+ if biblical_terms_list_setting is None: + # Default to Major::BiblicalTerms.xml to mirror Paratext behavior + biblical_terms_list_setting = "Major::BiblicalTerms.xml" + parts = biblical_terms_list_setting.split(":", 2) + if len(parts) != 3: + raise ValueError( + f"The BiblicalTermsListSetting element in Settings.xml in project {full_name}" + f" is not in the expected format (i.e., Major::BiblicalTerms.xml) but is {biblical_terms_list_setting}." + ) return ParatextProjectSettings( name, full_name, encoding, versification, stylesheet, prefix, form, suffix, parts[0], parts[1], parts[2] diff --git a/machine/corpora/paratext_text_corpus.py b/machine/corpora/paratext_text_corpus.py index c99e361..53c883c 100644 --- a/machine/corpora/paratext_text_corpus.py +++ b/machine/corpora/paratext_text_corpus.py @@ -8,7 +8,7 @@ class ParatextTextCorpus(ScriptureTextCorpus): - def __init__(self, project_dir: StrPath, include_markers: bool = False) -> None: + def __init__(self, project_dir: StrPath, include_markers: bool = False, include_all_text: bool = False) -> None: parser = FileParatextProjectSettingsParser(project_dir) settings = parser.parse() @@ -17,7 +17,14 @@ def __init__(self, project_dir: StrPath, include_markers: bool = False) -> None: texts: List[UsfmFileText] = [] for sfm_filename in Path(project_dir).glob(f"{settings.file_name_prefix}*{settings.file_name_suffix}"): texts.append( - UsfmFileText(settings.stylesheet, settings.encoding, sfm_filename, versification, include_markers) + UsfmFileText( + settings.stylesheet, + settings.encoding, + sfm_filename, + versification, + include_markers, + include_all_text, + ) ) super().__init__(versification, texts) diff --git a/machine/corpora/scripture_element.py b/machine/corpora/scripture_element.py new file mode 100644 index 0000000..503630b --- /dev/null +++ b/machine/corpora/scripture_element.py @@ -0,0 +1,54 @@ +from __future__ import annotations + +from functools import total_ordering +from typing import Optional + +from 
@total_ordering
class ScriptureElement(Comparable):
    """A named element of a scripture reference path, tagged with a position.

    ``position`` and ``name`` are exposed through read-only properties.
    Equality and hashing use both fields; ordering goes through
    :meth:`compare_to`.
    """

    def __init__(self, position: int, name: str) -> None:
        self._name = name
        self._position = position

    @property
    def position(self) -> int:
        """The position given at construction time."""
        return self._position

    @property
    def name(self) -> str:
        """The element name given at construction time."""
        return self._name

    def compare_to(self, other: object, strict: Optional[bool] = True) -> int:
        """Three-way comparison against another element.

        Args:
            other: The element to compare with.
            strict: When truthy, positions are compared first and names only
                break ties. When falsy, only the names are compared.

        Returns:
            A negative number, zero, or a positive number when this element
            sorts before, equal to, or after ``other``.

        Raises:
            TypeError: If ``other`` is not a ``ScriptureElement``.
        """
        if not isinstance(other, ScriptureElement):
            raise TypeError("other is not a ScriptureElement object.")
        if other is self:
            return 0

        # Position only takes part in the comparison in strict mode.
        position_delta = self.position - other.position if strict else 0
        if position_delta != 0:
            return position_delta

        if self.name < other.name:
            return -1
        return 1 if self.name > other.name else 0

    def __eq__(self, other: ScriptureElement) -> bool:
        if isinstance(other, ScriptureElement):
            return (self.position, self.name) == (other.position, other.name)
        return NotImplemented

    def __lt__(self, other: ScriptureElement) -> bool:
        if isinstance(other, ScriptureElement):
            return self.compare_to(other) < 0
        return NotImplemented

    def __hash__(self) -> int:
        # Hash the same pair that __eq__ compares.
        return hash((self.position, self.name))

    def __repr__(self) -> str:
        # A zero position is rendered as the bare name, otherwise "position:name".
        return self.name if self.position == 0 else f"{self.position}:{self.name}"
@total_ordering
class ScriptureRef(Comparable):
    """A verse reference combined with an optional path of scripture elements.

    The verse part is a ``VerseRef``; the path is a list of
    ``ScriptureElement`` objects. A reference with an empty path and a
    non-zero verse number is a plain verse reference (see :attr:`is_verse`).
    """

    def __init__(self, ref: Optional[VerseRef] = None, path: Optional[List[ScriptureElement]] = None) -> None:
        """Create a reference.

        Args:
            ref: The verse reference; a default ``VerseRef()`` is used when omitted.
            path: The element path; an empty list is used when omitted.
        """
        self._verse_ref: VerseRef = ref if ref is not None else VerseRef()
        self._path: List[ScriptureElement] = path if path is not None else []

    # Not referenced anywhere in this class; kept so existing attribute access keeps working.
    _empty: Optional[ScriptureRef] = None

    @classmethod
    def parse(cls, selection: str, versification: Optional[Versification] = None) -> ScriptureRef:
        """Parse a string of the form ``<verse ref>[/<element>...]``.

        Each element is either ``name`` (position 0) or ``position:name``.

        Args:
            selection: The string to parse.
            versification: Versification for the verse part; defaults to
                ``ENGLISH_VERSIFICATION`` when omitted.

        Returns:
            The parsed reference.

        Raises:
            ValueError: If an element's position is not an integer.
        """
        if versification is None:
            versification = ENGLISH_VERSIFICATION

        vref, *path_parts = selection.split("/")
        path: List[ScriptureElement] = []
        for part in path_parts:
            elem: List[str] = part.split(":")
            if len(elem) == 1:
                path.append(ScriptureElement(0, elem[0]))
            else:
                path.append(ScriptureElement(int(elem[0]), elem[1]))

        return cls(VerseRef.from_string(vref, versification), path)

    @property
    def verse_ref(self) -> VerseRef:
        """The underlying verse reference."""
        return self._verse_ref

    @property
    def path(self) -> List[ScriptureElement]:
        """The element path (the stored list itself, not a copy)."""
        return self._path

    @property
    def book_num(self) -> int:
        return self.verse_ref.book_num

    @property
    def chapter_num(self) -> int:
        return self.verse_ref.chapter_num

    @property
    def verse_num(self) -> int:
        return self.verse_ref.verse_num

    @property
    def book(self) -> str:
        return self.verse_ref.book

    @property
    def chapter(self) -> str:
        return self.verse_ref.chapter

    @property
    def verse(self) -> str:
        return self.verse_ref.verse

    @property
    def versification(self) -> Versification:
        return self.verse_ref.versification

    @property
    def is_empty(self) -> bool:
        """True when the underlying verse reference is the default one."""
        return self.verse_ref.is_default

    @property
    def is_verse(self) -> bool:
        """True when this reference has a non-zero verse number and no element path."""
        # Must read the instance's verse number. ``VerseRef.verse_num`` on the
        # class is a property object, which never compares equal to 0.
        return self.verse_ref.verse_num != 0 and len(self.path) == 0

    def change_versification(self, versification: Versification) -> ScriptureRef:
        """Return a new reference with the verse part converted to ``versification``.

        The verse reference is copied before conversion, so this instance is
        not modified. The path list is shared with the new reference.
        """
        vr: VerseRef = self.verse_ref.copy()
        vr.change_versification(versification)
        return ScriptureRef(vr, self.path)

    def overlaps(self, other: ScriptureRef) -> bool:
        """True when the verse ranges overlap and both paths are equal."""
        if not are_overlapping_verse_ranges(self.verse_ref, other.verse_ref):
            return False
        return self.path == other.path

    def compare_to(self, other: object, compare_segments: bool = True, strict: bool = True) -> int:
        """Three-way comparison against another reference.

        The verse references are compared first. On a tie, the paths are
        compared element by element, and finally by length.

        Args:
            other: The reference to compare with.
            compare_segments: Forwarded to ``VerseRef.compare_to``.
            strict: Forwarded to ``ScriptureElement.compare_to``.

        Returns:
            A negative number, zero, or a positive number.

        Raises:
            TypeError: If ``other`` is not a ``ScriptureRef``.
        """
        if not isinstance(other, ScriptureRef):
            raise TypeError("other is not a ScriptureRef object.")
        if self is other:
            return 0

        res = self.verse_ref.compare_to(other.verse_ref, compare_segments=compare_segments)
        if res != 0:
            return res

        # zip stops at the shorter path; the length difference breaks remaining ties.
        for se1, se2 in zip(self.path, other.path):
            res = se1.compare_to(se2, strict=strict)
            if res != 0:
                return res

        return len(self.path) - len(other.path)

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, ScriptureRef):
            return NotImplemented
        return self.verse_ref == other.verse_ref and self.path == other.path

    def __lt__(self, other: object) -> bool:
        if not isinstance(other, ScriptureRef):
            return NotImplemented
        return self.compare_to(other) < 0

    def __hash__(self) -> int:
        # The path list is converted to a tuple because lists are unhashable.
        return hash((self.verse_ref, tuple(self.path)))

    def __repr__(self) -> str:
        # No trailing "/" for an empty path: parse() would read it back as an
        # extra element with an empty name.
        if len(self.path) == 0:
            return str(self.verse_ref)
        return f"{self.verse_ref}/{'/'.join(str(se) for se in self.path)}"


EMPTY_SCRIPTURE_REF = ScriptureRef()
class ScriptureRefUsfmParserHandler(UsfmParserHandler, ABC):
    """USFM parser handler that tracks the current scripture reference.

    The handler keeps the current verse reference, a stack of
    ``ScriptureElement`` objects and a stack of ``ScriptureTextType`` values.
    As parser events arrive it calls the ``_start_*_text`` / ``_end_*_text``
    hooks with ``ScriptureRef`` objects built from that state. The hooks are
    no-ops here and are meant to be overridden by subclasses.
    """

    def __init__(self) -> None:
        self._cur_verse_ref: VerseRef = VerseRef()
        self._cur_elements_stack: List[ScriptureElement] = []
        self._cur_text_type_stack: List[ScriptureTextType] = []
        # Set when a verse marker repeats the current verse; cleared when verse text starts.
        self._duplicate_verse: bool = False

    @property
    def _current_text_type(self) -> ScriptureTextType:
        """Top of the text-type stack, or ``NONVERSE`` when the stack is empty."""
        return ScriptureTextType.NONVERSE if len(self._cur_text_type_stack) == 0 else self._cur_text_type_stack[-1]

    def end_usfm(self, state: UsfmParserState) -> None:
        """Close any verse text that is still open when parsing finishes."""
        self._end_verse_text_wrapper(state)

    def chapter(self, state: UsfmParserState, number: str, marker: str, alt_number: str, pub_number: str) -> None:
        """Close the open verse text and move the current reference to the parser's."""
        self._end_verse_text_wrapper(state)
        self._update_verse_ref(state.verse_ref, marker)

    def verse(
        self, state: UsfmParserState, number: str, marker: str, alt_number: Optional[str], pub_number: Optional[str]
    ) -> None:
        """Handle a verse marker: ignore duplicates, merge overlapping ranges, or start a new verse."""
        if state.verse_ref == self._cur_verse_ref:
            self._end_verse_text_wrapper(state)
            # ignore duplicate verses
            self._duplicate_verse = True
        elif are_overlapping_verse_ranges(number, self._cur_verse_ref.verse):
            # merge overlapping verse ranges in to one range
            verse_ref: VerseRef = self._cur_verse_ref.copy()
            verse_ref.verse = merge_verse_ranges(number, self._cur_verse_ref.verse)
            self._update_verse_ref(verse_ref, marker)
        else:
            self._end_verse_text_wrapper(state)
            self._update_verse_ref(state.verse_ref, marker)
            self._start_verse_text_wrapper(state)

    def start_para(
        self,
        state: UsfmParserState,
        marker: str,
        unknown: Optional[bool],
        attributes: Optional[Sequence[UsfmAttribute]],
    ) -> None:
        """Start a paragraph; paragraphs that are not verse text open a non-verse segment."""
        if self._cur_verse_ref.is_default:
            # No reference has been set yet, so take the parser's.
            self._update_verse_ref(state.verse_ref, marker)
        if not state.is_verse_text:
            self._start_parent_element(marker)
            self._start_non_verse_text_wrapper(state)

    def end_para(self, state: UsfmParserState, marker: str) -> None:
        """End a paragraph, closing the non-verse segment if one is current."""
        # NOTE(review): _current_text_type also reports NONVERSE when the text-type
        # stack is empty, in which case the pop in _end_non_verse_text_wrapper would
        # raise IndexError. Confirm the parser never ends a paragraph in that state.
        if self._current_text_type == ScriptureTextType.NONVERSE:
            self._end_parent_element()
            self._end_non_verse_text_wrapper(state)

    def start_row(self, state: UsfmParserState, marker: str) -> None:
        """Open a parent element for a table row while in non-verse text."""
        if self._current_text_type == ScriptureTextType.NONVERSE:
            self._start_parent_element(marker)

    def end_row(self, state: UsfmParserState, marker: str) -> None:
        """Close the row's parent element while in non-verse text."""
        if self._current_text_type == ScriptureTextType.NONVERSE:
            self._end_parent_element()

    def start_cell(self, state: UsfmParserState, marker: str, align: str, colspan: int) -> None:
        """Open a parent element and a non-verse segment for a table cell while in non-verse text."""
        if self._current_text_type == ScriptureTextType.NONVERSE:
            self._start_parent_element(marker)
            self._start_non_verse_text_wrapper(state)

    def end_cell(self, state: UsfmParserState, marker: str) -> None:
        """Close the cell's parent element and non-verse segment while in non-verse text."""
        if self._current_text_type == ScriptureTextType.NONVERSE:
            self._end_parent_element()
            self._end_non_verse_text_wrapper(state)

    def start_sidebar(self, state: UsfmParserState, marker: str, category: str) -> None:
        """Open a parent element for a sidebar."""
        self._start_parent_element(marker)

    def end_sidebar(self, state: UsfmParserState, marker: str, closed: bool) -> None:
        """Close the sidebar's parent element."""
        self._end_parent_element()

    def start_note(self, state: UsfmParserState, marker: str, caller: str, category: Optional[str]) -> None:
        """Advance to the next element for the note and open a note segment."""
        self._next_element(marker)
        self._start_note_text_wrapper(state)

    def end_note(self, state: UsfmParserState, marker: str, closed: bool) -> None:
        """Close the note segment."""
        self._end_note_text_wrapper(state)

    def ref(self, state: UsfmParserState, marker: str, display: str, target: str) -> None: ...

    # Hooks called by the *_wrapper methods below; they do nothing in this base class.

    def _start_verse_text(self, state: UsfmParserState, scripture_refs: Optional[List[ScriptureRef]]) -> None: ...

    def _end_verse_text(self, state: UsfmParserState, scripture_refs: List[ScriptureRef]) -> None: ...

    def _start_non_verse_text(self, state: UsfmParserState, scripture_ref: ScriptureRef) -> None: ...

    def _end_non_verse_text(self, state: UsfmParserState, scripture_ref: ScriptureRef) -> None: ...

    def _start_note_text(self, state: UsfmParserState, scripture_ref: ScriptureRef) -> None: ...

    def _end_note_text(self, state: UsfmParserState, scripture_ref: ScriptureRef) -> None: ...

    def _start_verse_text_wrapper(self, state: UsfmParserState) -> None:
        """Clear the duplicate flag, push ``VERSE`` and call ``_start_verse_text``."""
        self._duplicate_verse = False
        self._cur_text_type_stack.append(ScriptureTextType.VERSE)
        self._start_verse_text(state, self._create_verse_refs())

    def _end_verse_text_wrapper(self, state: UsfmParserState) -> None:
        """Call ``_end_verse_text`` and pop the text type, unless there is no verse to close."""
        # Skipped for duplicate verses and while the current verse number is 0.
        if not self._duplicate_verse and self._cur_verse_ref.verse_num != 0:
            self._end_verse_text(state, self._create_verse_refs())
            self._cur_text_type_stack.pop()

    def _start_non_verse_text_wrapper(self, state: UsfmParserState) -> None:
        """Push ``NONVERSE`` and call ``_start_non_verse_text`` with the current non-verse ref."""
        self._cur_text_type_stack.append(ScriptureTextType.NONVERSE)
        self._start_non_verse_text(state, self._create_non_verse_ref())

    def _end_non_verse_text_wrapper(self, state: UsfmParserState) -> None:
        """Call ``_end_non_verse_text`` and pop the text-type stack."""
        self._end_non_verse_text(state, self._create_non_verse_ref())
        self._cur_text_type_stack.pop()

    def _start_note_text_wrapper(self, state: UsfmParserState) -> None:
        """Push ``NOTE`` and call ``_start_note_text`` with the current non-verse ref."""
        self._cur_text_type_stack.append(ScriptureTextType.NOTE)
        self._start_note_text(state, self._create_non_verse_ref())

    def _end_note_text_wrapper(self, state: UsfmParserState) -> None:
        """Call ``_end_note_text`` and pop the text-type stack."""
        self._end_note_text(state, self._create_non_verse_ref())
        self._cur_text_type_stack.pop()

    def _update_verse_ref(self, verse_ref: VerseRef, marker: str) -> None:
        """Set the current verse reference to a copy of ``verse_ref``.

        When the new reference does not overlap the current one, the element
        stack is reset to a single position-0 element for ``marker``.
        """
        if not are_overlapping_verse_ranges(verse_ref, self._cur_verse_ref):
            self._cur_elements_stack.clear()
            self._cur_elements_stack.append(ScriptureElement(0, marker))
        self._cur_verse_ref = verse_ref.copy()

    def _next_element(self, marker: str) -> None:
        """Replace the top element with one for ``marker`` at the next position."""
        prev_elem: ScriptureElement = self._cur_elements_stack.pop()
        self._cur_elements_stack.append(ScriptureElement(prev_elem.position + 1, marker))

    def _start_parent_element(self, marker: str) -> None:
        """Advance to the next element, then push a position-0 child placeholder for ``marker``."""
        self._next_element(marker)
        self._cur_elements_stack.append(ScriptureElement(0, marker))

    def _end_parent_element(self) -> None:
        """Pop the top of the element stack."""
        self._cur_elements_stack.pop()

    def _create_verse_refs(self) -> List[ScriptureRef]:
        """Return one path-less ``ScriptureRef`` per verse in the current reference."""
        return (
            [ScriptureRef(v) for v in self._cur_verse_ref.all_verses()]
            if self._cur_verse_ref.has_multiple
            else [ScriptureRef(self._cur_verse_ref)]
        )

    def _create_non_verse_ref(self) -> ScriptureRef:
        """Build a ``ScriptureRef`` for non-verse or note text at the current location.

        For a multi-verse reference the last verse returned by ``all_verses()``
        is used. The path holds the stacked elements whose position is greater
        than 0.
        """
        verse_ref = (
            list(self._cur_verse_ref.all_verses())[-1] if self._cur_verse_ref.has_multiple else self._cur_verse_ref
        )
        # No need to reverse unlike in Machine, elements are already added in correct order
        path = [e for e in self._cur_elements_stack if e.position > 0]
        return ScriptureRef(verse_ref, path)
None]: + if isinstance(ref, VerseRef): + yield from self._create_scripture_rows_verse_ref(ref, text, is_sentence_start) + else: + yield from self._create_scripture_rows_scripture_ref(ref, text, is_sentence_start) + + def _create_scripture_rows_scripture_ref( + self, scripture_refs: List[ScriptureRef], text: str = "", is_sentence_start: bool = True + ) -> Generator[TextRow, None, None]: + if len(scripture_refs) > 1: + first_verse = True + for sref in scripture_refs: + if first_verse: + flags: TextRowFlags = TextRowFlags.IN_RANGE | TextRowFlags.RANGE_START + if is_sentence_start: + flags |= TextRowFlags.SENTENCE_START + yield self._create_row(text, sref, flags) + first_verse = False + else: + yield self._create_empty_row(sref, TextRowFlags.IN_RANGE) + else: + yield self._create_row( + text, scripture_refs[0], TextRowFlags.SENTENCE_START if is_sentence_start else TextRowFlags.NONE + ) + + def _create_scripture_rows_verse_ref( self, verse_ref: VerseRef, text: str = "", is_sentence_start: bool = True ) -> Generator[TextRow, None, None]: if verse_ref.has_multiple: @@ -42,13 +70,33 @@ def _create_rows( flags = TextRowFlags.IN_RANGE | TextRowFlags.RANGE_START if is_sentence_start: flags |= TextRowFlags.SENTENCE_START - yield self._create_row(text, vref, flags) + yield self._create_row(text, ScriptureRef(vref), flags) first_verse = False else: - yield self._create_empty_row(vref, TextRowFlags.IN_RANGE) + yield self._create_empty_row(ScriptureRef(vref), TextRowFlags.IN_RANGE) else: yield self._create_row( - text, verse_ref, TextRowFlags.SENTENCE_START if is_sentence_start else TextRowFlags.NONE + text, ScriptureRef(verse_ref), TextRowFlags.SENTENCE_START if is_sentence_start else TextRowFlags.NONE + ) + + def _create_scripture_row( + self, + ref: Union[ScriptureRef, VerseRef], + text: str, + is_sentence_start: bool, + elements: Optional[List[ScriptureElement]] = None, + ) -> TextRow: + if isinstance(ref, VerseRef): + return self._create_row( + text, + ScriptureRef(ref, 
elements), + TextRowFlags.SENTENCE_START if is_sentence_start else TextRowFlags.NONE, + ) + else: + return self._create_row( + text, + ref, + TextRowFlags.SENTENCE_START if is_sentence_start else TextRowFlags.NONE, ) def _create_verse_ref(self, chapter: str, verse: str) -> VerseRef: diff --git a/machine/corpora/scripture_text_corpus.py b/machine/corpora/scripture_text_corpus.py index 4e30d81..402fe89 100644 --- a/machine/corpora/scripture_text_corpus.py +++ b/machine/corpora/scripture_text_corpus.py @@ -1,17 +1,20 @@ from typing import Generator, Iterable, Optional, Tuple, cast -from ..scripture import ORIGINAL_VERSIFICATION +from ..scripture import ENGLISH_VERSIFICATION, ORIGINAL_VERSIFICATION from ..scripture.canon import book_id_to_number, book_number_to_id, is_canonical from ..scripture.verse_ref import VerseRef, Versification from ..utils.context_managed_generator import ContextManagedGenerator from .dictionary_text_corpus import DictionaryTextCorpus +from .scripture_ref import ScriptureRef from .scripture_text import ScriptureText from .text_corpus import TextCorpus from .text_row import TextRow class ScriptureTextCorpus(DictionaryTextCorpus): - def __init__(self, versification: Versification, texts: Iterable[ScriptureText] = []) -> None: + def __init__( + self, versification: Versification = ENGLISH_VERSIFICATION, texts: Iterable[ScriptureText] = [] + ) -> None: super().__init__(texts) self._versification = versification @@ -30,7 +33,7 @@ def _get_rows(self) -> Generator[TextRow, None, None]: for v in range(1, self.versification.get_last_verse(b, c) + 1): vref = self._create_verse_ref(str(c), str(v)) if not self._versification.is_excluded(vref.bbbcccvvv): - yield from self._create_rows(vref) + yield from self._create_scripture_rows(vref) def create_versification_ref_corpus( @@ -64,21 +67,25 @@ def extract() -> Generator[Tuple[str, VerseRef, Optional[VerseRef]], None, None] cur_trg_line = "" cur_trg_line_range = True for row in rows: - ref: VerseRef = row.ref 
- if cur_ref is not None and ref.compare_to(cur_ref, compare_segments=False) != 0: + scripture_ref: ScriptureRef = cast(ScriptureRef, row.ref) + if not scripture_ref.is_verse: + continue + + vref: VerseRef = scripture_ref.verse_ref + if cur_ref is not None and vref.compare_to(cur_ref, compare_segments=False) != 0: yield "" if cur_trg_line_range else cur_trg_line, cur_ref, cur_trg_ref cur_trg_line_range = cur_trg_line_range or len(cur_trg_line) > 0 cur_trg_line = "" cur_trg_ref = None - cur_ref = ref + cur_ref = vref if cur_trg_ref is None and len(row.target_refs) > 0: - cur_trg_ref = cast(VerseRef, row.target_refs[0]) + cur_trg_ref = cast(ScriptureRef, row.target_refs[0]).verse_ref elif cur_trg_ref is not None and len(row.target_refs) > 0 and cur_trg_ref != row.target_refs[0]: cur_trg_ref = cur_trg_ref.copy() cur_trg_ref.simplify() - trg_ref = cast(VerseRef, row.target_refs[0]) - if cur_trg_ref < row.target_refs[0]: + trg_ref = cast(ScriptureRef, row.target_refs[0]).verse_ref + if cur_trg_ref < trg_ref: start_ref = cur_trg_ref end_ref = trg_ref else: @@ -99,3 +106,7 @@ def extract() -> Generator[Tuple[str, VerseRef, Optional[VerseRef]], None, None] yield "" if cur_trg_line_range else cur_trg_line, cur_ref, cur_trg_ref return ContextManagedGenerator(extract()) + + +def is_scripture(text_corpus: TextCorpus) -> bool: + return text_corpus.versification is not None diff --git a/machine/corpora/standard_parallel_text_corpus.py b/machine/corpora/standard_parallel_text_corpus.py index 303d401..ebe22f7 100644 --- a/machine/corpora/standard_parallel_text_corpus.py +++ b/machine/corpora/standard_parallel_text_corpus.py @@ -5,7 +5,7 @@ from queue import SimpleQueue from typing import Any, Collection, ContextManager, Generator, Iterable, List, Optional, Set, Tuple -from ..scripture.verse_ref import VerseRef, Versification +from ..scripture.verse_ref import Versification from ..utils.comparable import compare from ..utils.context_managed_generator import ContextManagedGenerator 
from .aligned_word_pair import AlignedWordPair @@ -14,6 +14,8 @@ from .dictionary_alignment_corpus import DictionaryAlignmentCorpus from .parallel_text_corpus import ParallelTextCorpus from .parallel_text_row import ParallelTextRow +from .scripture_ref import EMPTY_SCRIPTURE_REF, ScriptureRef +from .scripture_text_corpus import is_scripture from .text_corpus import TextCorpus from .text_row import TextRow, TextRowFlags @@ -77,16 +79,20 @@ def _get_rows(self) -> Generator[ParallelTextRow, None, None]: with ExitStack() as stack: src_iterator = stack.enter_context(self._source_corpus.get_rows(text_ids)) - trg_iterator = stack.enter_context(_TargetCorpusGenerator(self._target_corpus.get_rows(text_ids))) + trg_iterator = stack.enter_context( + _TargetCorpusGenerator( + self._target_corpus.get_rows(text_ids), + self._source_corpus.versification, + self._target_corpus.versification, + ) + ) alignment_iterator = stack.enter_context(self._alignment_corpus.get_rows(text_ids)) - range_info = _RangeInfo() + range_info = _RangeInfo(target_versification=self._target_corpus.versification) source_same_ref_rows: List[TextRow] = [] target_same_ref_rows: List[TextRow] = [] src_row = next(src_iterator, None) - if src_row is not None and isinstance(src_row.ref, VerseRef): - trg_iterator.source_versification = src_row.ref.versification trg_row = next(trg_iterator, None) alignment: Optional[AlignmentRow] = None while src_row is not None and trg_row is not None: @@ -95,6 +101,7 @@ def _get_rows(self) -> Generator[ParallelTextRow, None, None]: if not self._all_target_rows and src_row.is_in_range: if range_info.is_in_range and trg_row.is_in_range and len(trg_row.segment) > 0: yield range_info.create_row() + range_info.text_id = src_row.text_id range_info.source_refs.append(src_row.ref) target_same_ref_rows.clear() range_info.source_segment.extend(src_row.segment) @@ -117,6 +124,7 @@ def _get_rows(self) -> Generator[ParallelTextRow, None, None]: if not self._all_source_rows and 
trg_row.is_in_range: if range_info.is_in_range and src_row.is_in_range and len(src_row.segment) > 0: yield range_info.create_row() + range_info.text_id = trg_row.text_id range_info.target_refs.append(trg_row.ref) source_same_ref_rows.clear() range_info.target_segment.extend(trg_row.segment) @@ -241,6 +249,14 @@ def _create_rows( else: raise ValueError("Either a source or target must be specified.") + src_refs = [] if src_row is None else [src_row.ref] + trg_refs = [] if trg_row is None else [trg_row.ref] + + if len(trg_refs) == 0 and is_scripture(self._target_corpus): + for r in src_refs: + r: ScriptureRef + trg_refs.append(r.change_versification(self._target_corpus.versification)) + if src_row is None: source_flags = TextRowFlags.IN_RANGE if force_source_in_range else TextRowFlags.NONE else: @@ -253,8 +269,8 @@ def _create_rows( yield ParallelTextRow( text_id, - [] if src_row is None else [src_row.ref], - [] if trg_row is None else [trg_row.ref], + src_refs, + trg_refs, [] if src_row is None else src_row.segment, [] if trg_row is None else trg_row.segment, aligned_word_pairs, @@ -300,12 +316,17 @@ class _RangeInfo: is_target_sentence_start: bool = field(default=False, init=False) is_source_empty: bool = field(default=True, init=False) is_target_empty: bool = field(default=True, init=False) + target_versification: Optional[Versification] = field(default=None) @property def is_in_range(self) -> bool: - return len(self.source_refs) > 0 and len(self.target_refs) > 0 + return len(self.source_refs) > 0 or len(self.target_refs) > 0 def create_row(self) -> ParallelTextRow: + if len(self.target_refs) == 0 and self.target_versification is not None: + for r in self.source_refs: + r: ScriptureRef + self.target_refs.append(r.change_versification(self.target_versification)) row = ParallelTextRow( self.text_id, self.source_refs.copy(), @@ -329,31 +350,29 @@ def create_row(self) -> ParallelTextRow: class _TargetCorpusGenerator(ContextManager["_TargetCorpusGenerator"], 
Generator[TextRow, None, None]): - def __init__(self, generator: ContextManagedGenerator[TextRow, None, None]) -> None: + def __init__( + self, + generator: ContextManagedGenerator[TextRow, None, None], + source_versification: Versification, + target_versification: Versification, + ) -> None: self._generator = generator - self._is_scripture = False + self._source_versification = source_versification + self._is_scripture = ( + source_versification is not None + and target_versification is not None + and source_versification != target_versification + ) self._is_enumerating = False self._verse_rows: SimpleQueue[TextRow] = SimpleQueue() - self.source_versification: Optional[Versification] = None self._row: Optional[TextRow] = None def send(self, value: None) -> TextRow: - if not self._is_enumerating: - self._is_enumerating = True - self._row = next(self._generator, None) - if ( - self._row is not None - and isinstance(self._row.ref, VerseRef) - and self.source_versification != self._row.ref.versification - ): - self._is_scripture = True - elif self._row is not None: - return self._row - else: - raise StopIteration - if self._is_scripture: - if self._verse_rows.empty(): + if not self._is_enumerating: + self._row = next(self._generator, None) + self._is_enumerating = True + if self._verse_rows.empty() and self._row is not None: self._collect_verses() if not self._verse_rows.empty(): return self._verse_rows.get() @@ -378,21 +397,20 @@ def __exit__(self, type: Any, value: Any, traceback: Any) -> None: self.close() def _collect_verses(self) -> None: - assert self.source_versification is not None - seg_list: List[Tuple[VerseRef, TextRow]] = [] + assert self._source_versification is not None + seg_list: List[Tuple[ScriptureRef, TextRow]] = [] out_of_order = False - prev_verse_ref = VerseRef() + prev_scr_ref = EMPTY_SCRIPTURE_REF range_start_offset = -1 while self._row is not None: row = self._row - verse_ref: VerseRef = row.ref - if not prev_verse_ref.is_default and 
verse_ref.book_num != prev_verse_ref.book_num: + scr_ref: ScriptureRef = row.ref + if not prev_scr_ref.is_empty and scr_ref.book_num != prev_scr_ref.book_num: break - verse_ref = verse_ref.copy() - verse_ref.change_versification(self.source_versification) + scr_ref = scr_ref.change_versification(self._source_versification) # convert one-to-many mapping to a verse range - if verse_ref == prev_verse_ref: + if scr_ref == prev_scr_ref: range_start_verse_ref, range_start_row = seg_list[range_start_offset] flags = TextRowFlags.IN_RANGE if range_start_row.is_sentence_start: @@ -412,10 +430,10 @@ def _collect_verses(self) -> None: range_start_offset -= 1 else: range_start_offset = -1 - seg_list.append((verse_ref, row)) - if not out_of_order and verse_ref < prev_verse_ref: + seg_list.append((scr_ref, row)) + if not out_of_order and scr_ref < prev_scr_ref: out_of_order = True - prev_verse_ref = verse_ref + prev_scr_ref = scr_ref self._row = next(self._generator, None) if out_of_order: @@ -433,6 +451,6 @@ def _check_same_ref_rows(same_ref_rows: List[TextRow], other_row: TextRow) -> bo def _compare_refs(source_ref: Any, target_ref: Any) -> int: - if isinstance(source_ref, VerseRef) and isinstance(target_ref, VerseRef): + if isinstance(source_ref, ScriptureRef) and isinstance(target_ref, ScriptureRef): return source_ref.compare_to(target_ref, compare_segments=False) return compare(source_ref, target_ref) diff --git a/machine/corpora/text_corpus.py b/machine/corpora/text_corpus.py index e2995e4..35001c5 100644 --- a/machine/corpora/text_corpus.py +++ b/machine/corpora/text_corpus.py @@ -4,6 +4,7 @@ from itertools import islice from typing import Any, Callable, Generator, Iterable, Literal, Optional, Tuple +from ..scripture.verse_ref import Versification from ..tokenization.detokenizer import Detokenizer from ..tokenization.tokenizer import Tokenizer from ..utils.context_managed_generator import ContextManagedGenerator @@ -25,6 +26,10 @@ def texts(self) -> Iterable[Text]: ... 
@abstractmethod def is_tokenized(self) -> bool: ... + @property + @abstractmethod + def versification(self) -> Versification: ... + def get_rows(self, text_ids: Optional[Iterable[str]] = None) -> ContextManagedGenerator[TextRow, None, None]: return ContextManagedGenerator(self._get_rows(text_ids)) @@ -161,6 +166,10 @@ def texts(self) -> Iterable[Text]: def is_tokenized(self) -> bool: return self._is_tokenized + @property + def versification(self) -> Versification: + return self._corpus.versification + def count(self, include_empty: bool = True) -> int: return self._corpus.count(include_empty) @@ -182,6 +191,10 @@ def texts(self) -> Iterable[Text]: def is_tokenized(self) -> bool: return self._corpus.is_tokenized + @property + def versification(self) -> Versification: + return self._corpus.versification + def _get_rows(self, text_ids: Optional[Iterable[str]] = None) -> Generator[TextRow, None, None]: with self._corpus.get_rows((t.id for t in self.texts) if text_ids is None else text_ids) as rows: yield from rows @@ -200,6 +213,10 @@ def texts(self) -> Iterable[Text]: def is_tokenized(self) -> bool: return self._corpus.is_tokenized + @property + def versification(self) -> Versification: + return self._corpus.versification + def _get_rows(self, text_ids: Optional[Iterable[str]] = None) -> Generator[TextRow, None, None]: with self._corpus.get_rows(text_ids) as rows: yield from (row for i, row in enumerate(rows) if self._predicate(row, i)) @@ -218,6 +235,10 @@ def texts(self) -> Iterable[Text]: def is_tokenized(self) -> bool: return self._corpus.is_tokenized + @property + def versification(self) -> Versification: + return self._corpus.versification + def _get_rows(self, text_ids: Optional[Iterable[str]] = None) -> Generator[TextRow, None, None]: with self._corpus.get_rows(text_ids) as rows: yield from islice(rows, self._count) diff --git a/machine/corpora/usfm_file_text.py b/machine/corpora/usfm_file_text.py index 258090d..d4c2d68 100644 --- 
a/machine/corpora/usfm_file_text.py +++ b/machine/corpora/usfm_file_text.py @@ -17,8 +17,11 @@ def __init__( filename: StrPath, versification: Optional[Versification] = None, include_markers: bool = False, + include_all_text: bool = False, ) -> None: - super().__init__(_get_id(filename, encoding), stylesheet, encoding, versification, include_markers) + super().__init__( + _get_id(filename, encoding), stylesheet, encoding, versification, include_markers, include_all_text + ) self._filename = Path(filename) diff --git a/machine/corpora/usfm_file_text_corpus.py b/machine/corpora/usfm_file_text_corpus.py index 35f6ba3..4033d02 100644 --- a/machine/corpora/usfm_file_text_corpus.py +++ b/machine/corpora/usfm_file_text_corpus.py @@ -18,11 +18,14 @@ def __init__( versification: Optional[Versification] = None, include_markers: bool = False, file_pattern: str = "*.SFM", + include_all_text: bool = False, ) -> None: if versification is None: versification = ENGLISH_VERSIFICATION stylesheet = UsfmStylesheet(stylesheet_filename) texts: List[UsfmFileText] = [] for sfm_filename in Path(project_dir).glob(file_pattern): - texts.append(UsfmFileText(stylesheet, encoding, sfm_filename, versification, include_markers)) + texts.append( + UsfmFileText(stylesheet, encoding, sfm_filename, versification, include_markers, include_all_text) + ) super().__init__(versification, texts) diff --git a/machine/corpora/usfm_parser.py b/machine/corpora/usfm_parser.py index 455235a..810a330 100644 --- a/machine/corpora/usfm_parser.py +++ b/machine/corpora/usfm_parser.py @@ -58,6 +58,7 @@ def process_tokens(self) -> None: def process_token(self) -> bool: # If past end if self.state.index >= len(self.state.tokens) - 1: + self._close_all() if self.handler is not None: self.handler.end_usfm(self.state) return False diff --git a/machine/corpora/usfm_stylesheet.py b/machine/corpora/usfm_stylesheet.py index 29185aa..5d81bba 100644 --- a/machine/corpora/usfm_stylesheet.py +++ 
b/machine/corpora/usfm_stylesheet.py @@ -19,7 +19,7 @@ def is_cell_range(marker: str) -> Tuple[bool, str, int]: if col_span >= 2: return True, base_tag, col_span - return False, "", 0 + return False, marker, 0 class UsfmStylesheet: diff --git a/machine/corpora/usfm_text_base.py b/machine/corpora/usfm_text_base.py index 2637b22..481f047 100644 --- a/machine/corpora/usfm_text_base.py +++ b/machine/corpora/usfm_text_base.py @@ -2,15 +2,17 @@ from io import TextIOWrapper from typing import Generator, Iterable, List, Optional, Sequence -from ..scripture.verse_ref import VerseRef, Versification, are_overlapping_verse_ranges +from machine.corpora.scripture_ref import ScriptureRef + +from ..scripture.verse_ref import Versification from ..utils.string_utils import has_sentence_ending -from .corpora_utils import gen, merge_verse_ranges +from .corpora_utils import gen +from .scripture_ref_usfm_parser_handler import ScriptureRefUsfmParserHandler, ScriptureTextType from .scripture_text import ScriptureText from .stream_container import StreamContainer from .text_row import TextRow from .usfm_parser import parse_usfm -from .usfm_parser_handler import UsfmParserHandler -from .usfm_parser_state import UsfmElementType, UsfmParserState +from .usfm_parser_state import UsfmParserState from .usfm_stylesheet import UsfmStylesheet from .usfm_token import UsfmAttribute, UsfmToken, UsfmTokenType @@ -23,12 +25,14 @@ def __init__( encoding: str, versification: Optional[Versification], include_markers: bool, + include_all_text: bool, ) -> None: super().__init__(id, versification) self._stylesheet = stylesheet self._encoding = encoding self._include_markers = include_markers + self._include_all_text = include_all_text @abstractmethod def _create_stream_container(self) -> StreamContainer: ... 
@@ -52,13 +56,14 @@ def _read_usfm(self) -> str: return reader.read() -class _TextRowCollector(UsfmParserHandler): +class _TextRowCollector(ScriptureRefUsfmParserHandler): def __init__(self, text: UsfmTextBase) -> None: + super().__init__() + self._text = text self._rows: List[TextRow] = [] - self._verse_text = "" self._next_para_tokens: List[UsfmToken] = [] - self._verse_ref: Optional[VerseRef] = None + self._row_texts_stack: List[str] = [] self._sentence_start: bool = False self._next_para_text_started = False @@ -66,17 +71,6 @@ def __init__(self, text: UsfmTextBase) -> None: def rows(self) -> Iterable[TextRow]: return self._rows - def chapter( - self, - state: UsfmParserState, - number: str, - marker: str, - alt_number: Optional[str], - pub_number: Optional[str], - ) -> None: - self._verse_completed(next_sentence_start=True) - self._verse_ref = None - def verse( self, state: UsfmParserState, @@ -85,19 +79,7 @@ def verse( alt_number: Optional[str], pub_number: Optional[str], ) -> None: - if self._verse_ref is None: - self._verse_ref = state.verse_ref.copy() - elif state.verse_ref.exact_equals(self._verse_ref): - self._verse_completed() - - # ignore duplicate verse - self._verse_ref = None - elif are_overlapping_verse_ranges(number, self._verse_ref.verse): - # merge overlapping verse ranges in to one range - self._verse_ref.verse = merge_verse_ranges(number, self._verse_ref.verse) - else: - self._verse_completed() - self._verse_ref = state.verse_ref.copy() + super().verse(state, number, marker, alt_number, pub_number) self._next_para_text_started = True self._next_para_tokens.clear() @@ -108,22 +90,25 @@ def start_para( unknown: bool, attributes: Optional[Sequence[UsfmAttribute]], ) -> None: + super().start_para(state, marker, unknown, attributes) self._handle_para(state) def start_row(self, state: UsfmParserState, marker: str) -> None: + super().start_row(state, marker) self._handle_para(state) def start_cell(self, state: UsfmParserState, marker: str, align: str, 
colspan: int) -> None: - if self._verse_ref is None: - return + super().start_cell(state, marker, align, colspan) if self._text._include_markers: self._output_marker(state) - else: - if len(self._verse_text) > 0 and not self._verse_text[-1].isspace(): - self._verse_text += " " + elif self._current_text_type == ScriptureTextType.VERSE: + verse_text: str = self._row_texts_stack[-1] + if len(verse_text) > 0 and not verse_text[-1].isspace(): + self._row_texts_stack[-1] += " " def ref(self, state: UsfmParserState, marker: str, display: str, target: str) -> None: + super().ref(state, marker, display, target) self._output_marker(state) def start_char( @@ -133,87 +118,116 @@ def start_char( unknown: bool, attributes: Optional[Sequence[UsfmAttribute]], ) -> None: + super().start_char(state, marker_without_plus, unknown, attributes) self._output_marker(state) def end_char( self, state: UsfmParserState, marker: str, attributes: Optional[Sequence[UsfmAttribute]], closed: bool ) -> None: assert state.prev_token is not None + super().end_char(state, marker, attributes, closed) if self._text._include_markers and attributes is not None and state.prev_token.type == UsfmTokenType.ATTRIBUTE: - self._verse_text += str(state.prev_token) + self._row_texts_stack[-1] += str(state.prev_token) if closed: self._output_marker(state) if not self._text._include_markers and marker == "rq": - self._verse_text = self._verse_text.rstrip() + self._row_texts_stack[-1] = self._row_texts_stack[-1].rstrip() def start_note(self, state: UsfmParserState, marker: str, caller: str, category: Optional[str]) -> None: + super().start_note(state, marker, caller, category) self._output_marker(state) def end_note(self, state: UsfmParserState, marker: str, closed: bool) -> None: + super().end_note(state, marker, closed) if closed: self._output_marker(state) def opt_break(self, state: UsfmParserState) -> None: - if not self._text._include_markers: - self._verse_text = self._verse_text.rstrip() + 
super().opt_break(state) + if self._text._include_markers: + self._row_texts_stack[-1] += "//" + elif self._current_text_type != ScriptureTextType.VERSE or state.is_verse_text: + self._row_texts_stack[-1] = self._row_texts_stack[-1].rstrip() def text(self, state: UsfmParserState, text: str) -> None: - if self._verse_ref is None or not state.is_verse_para: + super().text(state, text) + + if len(self._row_texts_stack) == 0: return + row_text = self._row_texts_stack[-1] if self._text._include_markers: text = text.rstrip("\r\n") - if len(text) > 0 and not any(e.type == UsfmElementType.SIDEBAR for e in state.stack): + if len(text) > 0: if not text.isspace(): for token in self._next_para_tokens: - self._verse_text += str(token) + row_text += str(token) self._next_para_tokens.clear() self._next_para_text_started = True - if len(self._verse_text) == 0 or self._verse_text[-1].isspace(): + if len(row_text) == 0 or row_text[-1].isspace(): text = text.lstrip() - self._verse_text += text - elif state.is_verse_text and len(text) > 0: + row_text += text + elif len(text) > 0 and (self._current_text_type != ScriptureTextType.VERSE or state.is_verse_text): if ( state.prev_token is not None and state.prev_token.type == UsfmTokenType.END - and (self._verse_text == "" or self._verse_text[-1].isspace()) + and (len(row_text) == 0 or row_text[-1].isspace()) ): text = text.lstrip() - self._verse_text += text + row_text += text + self._row_texts_stack[-1] = row_text + + def _start_verse_text(self, state: UsfmParserState, scripture_refs: List[ScriptureRef]) -> None: + self._row_texts_stack.append("") + + def _end_verse_text(self, state: UsfmParserState, scripture_refs: List[ScriptureRef]) -> None: + text = self._row_texts_stack.pop() + self._rows.extend(self._text._create_scripture_rows(scripture_refs, text, self._sentence_start)) + self._sentence_start = (state.token and state.token.marker == "c") or has_sentence_ending(text) + + def _start_non_verse_text(self, state: UsfmParserState, 
scripture_ref: ScriptureRef) -> None: + self._row_texts_stack.append("") + + def _end_non_verse_text(self, state: UsfmParserState, scripture_ref: ScriptureRef) -> None: + text = self._row_texts_stack.pop() + if self._text._include_all_text: + self._rows.append(self._text._create_scripture_row(scripture_ref, text, self._sentence_start)) + + def _start_note_text(self, state: UsfmParserState, scripture_ref: ScriptureRef) -> None: + if self._text._include_markers: + return + self._row_texts_stack.append("") - def end_usfm(self, state: UsfmParserState) -> None: - self._verse_completed() + def _end_note_text(self, state: UsfmParserState, scripture_ref: ScriptureRef) -> None: + if self._text._include_markers: + return + text = self._row_texts_stack.pop() + if self._text._include_all_text: + self._rows.append(self._text._create_scripture_row(scripture_ref, text, self._sentence_start)) def _output_marker(self, state: UsfmParserState) -> None: - if self._verse_ref is None or not self._text._include_markers: + if not self._text._include_markers or len(self._row_texts_stack) == 0: return assert state.token is not None if self._next_para_text_started: - self._verse_text += str(state.token) + self._row_texts_stack[-1] += str(state.token) else: self._next_para_tokens.append(state.token) - def _verse_completed(self, next_sentence_start: Optional[bool] = None) -> None: - if self._verse_ref is None: - return - - self._rows.extend(self._text._create_rows(self._verse_ref, self._verse_text, self._sentence_start)) - self._sentence_start = ( - has_sentence_ending(self._verse_text) if next_sentence_start is None else next_sentence_start - ) - self._verse_text = "" - def _handle_para(self, state: UsfmParserState) -> None: - if self._verse_ref is None: + if len(self._row_texts_stack) == 0: return assert state.token is not None - if state.is_verse_para: - if len(self._verse_text) > 0 and not self._verse_text[-1].isspace(): - self._verse_text += " " + for i, row_text in 
enumerate(self._row_texts_stack): + if len(row_text) > 0 and not row_text[-1].isspace(): + self._row_texts_stack[i] += " " + if self._current_text_type == ScriptureTextType.VERSE: self._next_para_tokens.append(state.token) self._next_para_text_started = False + if not state.is_verse_para: + self._sentence_start = True diff --git a/machine/corpora/usfm_verse_text_updater.py b/machine/corpora/usfm_text_updater.py similarity index 51% rename from machine/corpora/usfm_verse_text_updater.py rename to machine/corpora/usfm_text_updater.py index e483b6e..ba62eea 100644 --- a/machine/corpora/usfm_verse_text_updater.py +++ b/machine/corpora/usfm_text_updater.py @@ -1,24 +1,28 @@ from typing import List, Optional, Tuple, Union -from ..scripture.verse_ref import VerseRef -from .usfm_parser_handler import UsfmParserHandler +from .scripture_ref import ScriptureRef +from .scripture_ref_usfm_parser_handler import ScriptureRefUsfmParserHandler from .usfm_parser_state import UsfmParserState from .usfm_stylesheet import UsfmStylesheet from .usfm_token import UsfmAttribute, UsfmToken, UsfmTokenType from .usfm_tokenizer import UsfmTokenizer -class UsfmVerseTextUpdater(UsfmParserHandler): +class UsfmTextUpdater(ScriptureRefUsfmParserHandler): def __init__( self, - rows: Optional[List[Tuple[List[VerseRef], str]]] = None, + rows: Optional[List[Tuple[List[ScriptureRef], str]]] = None, id_text: Optional[str] = None, - strip_all_text: Optional[bool] = False, + strip_all_text: bool = False, + strict_comparison: bool = True, ) -> None: + super().__init__() self._rows = rows or [] self._tokens: List[UsfmToken] = [] self._id_text = id_text self._strip_all_text = strip_all_text + self._strict_comparison = strict_comparison + self._replace_stack: List[bool] = [] self._row_index: int = 0 self._token_index: int = 0 self._replace_text: bool = False @@ -27,14 +31,22 @@ def __init__( def tokens(self) -> List[UsfmToken]: return self._tokens + @property + def replace_text(self) -> bool: + return 
self._strip_all_text or (len(self._replace_stack) > 0 and self._replace_stack[-1]) + def start_book(self, state: UsfmParserState, marker: str, code: str) -> None: self._collect_tokens(state) if self._id_text is not None: self._tokens.append(UsfmToken(UsfmTokenType.TEXT, text=self._id_text + " ")) - self._replace_text = True + self._replace_stack.append(self._id_text is not None) + + super().start_book(state, marker, code) def end_book(self, state: UsfmParserState, marker: str) -> None: - self._replace_text = False + self._replace_stack.pop() + + super().end_book(state, marker) def start_para( self, @@ -43,28 +55,36 @@ def start_para( unknown: bool, attributes: Optional[List[UsfmAttribute]], ) -> None: - if not state.is_verse_para: - self._replace_text = False self._collect_tokens(state) + super().start_para(state, marker, unknown, attributes) + def start_row(self, state: UsfmParserState, marker: str) -> None: self._collect_tokens(state) + super().start_row(state, marker) + def start_cell(self, state: UsfmParserState, marker: str, align: str, colspan: int) -> None: self._collect_tokens(state) + super().start_cell(state, marker, align, colspan) + def end_cell(self, state: UsfmParserState, marker: str) -> None: self._collect_tokens(state) + super().end_cell(state, marker) + def start_sidebar(self, state: UsfmParserState, marker: str, category: str) -> None: - self._replace_text = False self._collect_tokens(state) + super().start_sidebar(state, marker, category) + def end_sidebar(self, state: UsfmParserState, marker: str, closed: bool) -> None: - self._replace_text = False if closed: self._collect_tokens(state) + super().end_sidebar(state, marker, closed) + def chapter( self, state: UsfmParserState, @@ -73,9 +93,10 @@ def chapter( alt_number: str, pub_number: str, ) -> None: - self._replace_text = False self._collect_tokens(state) + super().chapter(state, number, marker, alt_number, pub_number) + def milestone( self, state: UsfmParserState, @@ -85,6 +106,8 @@ def 
milestone( ) -> None: self._collect_tokens(state) + super().milestone(state, marker, start_milestone, attributes) + def verse( self, state: UsfmParserState, @@ -93,30 +116,9 @@ def verse( alt_number: str, pub_number: str, ) -> None: - self._replace_text = False self._collect_tokens(state) - while self._row_index < len(self._rows): - verse_refs, text = self._rows[self._row_index] - stop = False - for verse_ref in verse_refs: - compare = verse_ref.compare_to(state.verse_ref, compare_segments=False) - if compare == 0: - self._tokens.append(UsfmToken(UsfmTokenType.TEXT, text=text + " ")) - self._replace_text = True - break - else: - if any(v == verse_ref for v in state.verse_ref.all_verses()): - self._tokens.append(UsfmToken(UsfmTokenType.TEXT, text=text + " ")) - self._replace_text = True - break - if compare > 0: - stop = True - break - if stop: - break - else: - self._row_index += 1 + super().verse(state, number, marker, alt_number, pub_number) def start_char( self, @@ -125,11 +127,13 @@ def start_char( unknown: bool, attributes: List[UsfmAttribute], ) -> None: - if self._strip_all_text or (self._replace_text and state.is_verse_para): + if self.replace_text: self._skip_tokens(state) else: self._collect_tokens(state) + super().start_char(state, marker_without_plus, unknown, attributes) + def end_char( self, state: UsfmParserState, @@ -137,9 +141,11 @@ def end_char( attributes: List[UsfmAttribute], closed: bool, ) -> None: - if closed and (self._strip_all_text or (self._replace_text and state.is_verse_para)): + if closed and self.replace_text: self._skip_tokens(state) + super().end_char(state, marker, attributes, closed) + def start_note( self, state: UsfmParserState, @@ -147,47 +153,120 @@ def start_note( caller: str, category: str, ) -> None: - if self._strip_all_text or (self._replace_text and state.is_verse_para): + if self.replace_text: self._skip_tokens(state) else: self._collect_tokens(state) + super().start_note(state, marker, caller, category) + def 
end_note(self, state: UsfmParserState, marker: str, closed: bool) -> None: - if closed and (self._strip_all_text or (self._replace_text and state.is_verse_para)): + if closed and self.replace_text: self._skip_tokens(state) + super().end_note(state, marker, closed) + def ref(self, state: UsfmParserState, marker: str, display: str, target: str) -> None: - if self._strip_all_text or (self._replace_text and state.is_verse_para): + if self.replace_text: self._skip_tokens(state) else: self._collect_tokens(state) + super().ref(state, marker, display, target) + def text(self, state: UsfmParserState, text: str) -> None: - if self._strip_all_text or ( - self._replace_text and (state.is_verse_para or (state.para_tag and state.para_tag.marker == "id")) - ): + if self.replace_text: self._skip_tokens(state) else: self._collect_tokens(state) + super().text(state, text) + def opt_break(self, state: UsfmParserState) -> None: - if self._strip_all_text or (self._replace_text and state.is_verse_para): + if self.replace_text: self._skip_tokens(state) else: self._collect_tokens(state) + super().opt_break(state) + def unmatched(self, state: UsfmParserState, marker: str) -> None: - if self._strip_all_text or (self._replace_text and state.is_verse_para): + if self.replace_text: self._skip_tokens(state) else: self._collect_tokens(state) + super().unmatched(state, marker) + + def _start_verse_text(self, state: UsfmParserState, scripture_refs: List[ScriptureRef]) -> None: + row_texts: List[str] = self._advance_rows(scripture_refs) + self._tokens.extend(UsfmToken(UsfmTokenType.TEXT, text=t + " ") for t in row_texts) + self._replace_stack.append(len(row_texts) > 0) + + def _end_verse_text(self, state: UsfmParserState, scripture_refs: List[ScriptureRef]) -> None: + self._replace_stack.pop() + + def _start_non_verse_text(self, state: UsfmParserState, scripture_ref: ScriptureRef) -> None: + row_texts = self._advance_rows([scripture_ref]) + self._tokens.extend(UsfmToken(UsfmTokenType.TEXT, text=t + 
" ") for t in row_texts) + self._replace_stack.append(len(row_texts) > 0) + + def _end_non_verse_text(self, state: UsfmParserState, scripture_ref: ScriptureRef) -> None: + self._replace_stack.pop() + + def _start_note_text(self, state: UsfmParserState, scripture_ref: ScriptureRef) -> None: + row_texts = self._advance_rows([scripture_ref]) + if len(row_texts) > 0: + if state.token is None: + raise ValueError("Invalid parser state.") + self._tokens.append(state.token) + self._tokens.append(UsfmToken(UsfmTokenType.CHARACTER, "ft", None, "ft*")) + for i, text in enumerate(row_texts): + if i < len(row_texts) - 1: + text += " " + self._tokens.append(UsfmToken(UsfmTokenType.TEXT, text=text)) + self._tokens.append(UsfmToken(UsfmTokenType.END, state.token.end_marker, None, None)) + self._replace_stack.append(True) + else: + self._replace_stack.append(self._replace_stack[-1]) + + def _end_note_text(self, state: UsfmParserState, scripture_ref: ScriptureRef) -> None: + self._replace_stack.pop() + def get_usfm(self, stylesheet: Union[str, UsfmStylesheet] = "usfm.sty") -> str: if isinstance(stylesheet, str): stylesheet = UsfmStylesheet(stylesheet) tokenizer = UsfmTokenizer(stylesheet) return tokenizer.detokenize(self._tokens) + def _advance_rows(self, seg_scr_refs: List[ScriptureRef]) -> List[str]: + row_texts: List[str] = [] + i = 0 + while self._row_index < len(self._rows) and i < len(seg_scr_refs): + row_scr_refs, text = self._rows[self._row_index] + stop = False + for row_scr_ref in row_scr_refs: + found = False + for seg_scr_ref in seg_scr_refs[i:]: + compare = row_scr_ref.compare_to( + seg_scr_refs[i], compare_segments=False, strict=self._strict_comparison + ) + if compare == 0: + row_texts.append(text) + i += 1 + found = True + break + elif compare > 0: + stop = True + break + if stop or found: + break + if stop: + break + else: + self._row_index += 1 + return row_texts + def _collect_tokens(self, state: UsfmParserState) -> None: while self._token_index <= state.index + 
state.special_token_count: self._tokens.append(state.tokens[self._token_index]) diff --git a/machine/corpora/usfm_zip_text.py b/machine/corpora/usfm_zip_text.py index 42b42b9..8e85570 100644 --- a/machine/corpora/usfm_zip_text.py +++ b/machine/corpora/usfm_zip_text.py @@ -19,9 +19,15 @@ def __init__( path: str, versification: Optional[Versification] = None, include_markers: bool = False, + include_all_text: bool = False, ) -> None: super().__init__( - _get_id(archive_filename, path, encoding), stylesheet, encoding, versification, include_markers + _get_id(archive_filename, path, encoding), + stylesheet, + encoding, + versification, + include_markers, + include_all_text, ) self._archive_filename = archive_filename self._path = path diff --git a/machine/corpora/usx_text_base.py b/machine/corpora/usx_text_base.py index 3ab10cc..66f8648 100644 --- a/machine/corpora/usx_text_base.py +++ b/machine/corpora/usx_text_base.py @@ -20,4 +20,4 @@ def _get_rows(self) -> Generator[TextRow, None, None]: with self._create_stream_container() as stream_container, stream_container.open_stream() as stream: for verse in self._parser.parse(stream): verse_ref = self._create_verse_ref(verse.chapter, verse.verse) - yield from self._create_rows(verse_ref, verse.text, verse.is_sentence_start) + yield from self._create_scripture_rows(verse_ref, verse.text, verse.is_sentence_start) diff --git a/machine/scripture/verse_ref.py b/machine/scripture/verse_ref.py index aec67e7..2bfa081 100644 --- a/machine/scripture/verse_ref.py +++ b/machine/scripture/verse_ref.py @@ -377,7 +377,12 @@ def compare_to(self, other: object, compare_all_verses: bool = True, compare_seg return 1 return 0 - def exact_equals(self, other: VerseRef) -> bool: + def exact_equals(self, other: object) -> bool: + if not isinstance(other, VerseRef): + return False + if self is other: + return True + return ( self.book_num == other.book_num and self.chapter_num == other.chapter_num @@ -437,7 +442,16 @@ def get_bbbcccvvv(book_num: 
int, chapter_num: int, verse_num: int) -> int: ) -def are_overlapping_verse_ranges(verse1: str, verse2: str) -> bool: +def are_overlapping_verse_ranges(verse1: Union[str, VerseRef], verse2: Union[str, VerseRef]) -> bool: + if isinstance(verse1, str) and isinstance(verse2, str): + return are_overlapping_verse_ranges_str(verse1, verse2) + elif isinstance(verse1, VerseRef) and isinstance(verse2, VerseRef): + return are_overlapping_verse_ranges_vref(verse1, verse2) + else: + raise TypeError("verse1 and verse2 are not both str or both VerseRef objects.") + + +def are_overlapping_verse_ranges_str(verse1: str, verse2: str) -> bool: verse1_parts = verse1.split(VERSE_SEQUENCE_INDICATOR) verse2_parts = verse2.split(VERSE_SEQUENCE_INDICATOR) @@ -470,6 +484,22 @@ def are_overlapping_verse_ranges(verse1: str, verse2: str) -> bool: return False +def are_overlapping_verse_ranges_vref(verse_ref1: VerseRef, verse_ref2: VerseRef) -> bool: + if verse_ref1.is_default or verse_ref2.is_default: + return False + + if verse_ref1.versification != verse_ref2.versification: + raise ValueError("Versification of verse references does not match.") + + if verse_ref1.book_num != verse_ref2.book_num or verse_ref1.chapter_num != verse_ref2.chapter_num: + return False + + if not verse_ref1.verse and not verse_ref2.verse: + return verse_ref1.verse_num == verse_ref2.verse_num + + return are_overlapping_verse_ranges_str(verse_ref1.verse, verse_ref2.verse) + + def _in_verse_range( verse1: int, verse1_seg: str, verse2: int, verse2_seg: str, verse2_end: int, verse2_end_seg: str ) -> bool: diff --git a/poetry.lock b/poetry.lock index 1e9713a..31f0d6c 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.7.1 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.6.1 and should not be changed by hand. 
[[package]] name = "accelerate" @@ -2263,17 +2263,18 @@ test = ["appdirs (==1.4.4)", "covdefaults (>=2.3)", "pytest (>=7.3.1)", "pytest- [[package]] name = "pluggy" -version = "0.13.1" +version = "1.5.0" description = "plugin and hook calling mechanisms for python" optional = false -python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" +python-versions = ">=3.8" files = [ - {file = "pluggy-0.13.1-py2.py3-none-any.whl", hash = "sha256:966c145cd83c96502c3c3868f50408687b38434af77734af1e9ca461a4081d2d"}, - {file = "pluggy-0.13.1.tar.gz", hash = "sha256:15b2acde666561e1298d71b523007ed7364de07029219b604cf808bfa1c765b0"}, + {file = "pluggy-1.5.0-py3-none-any.whl", hash = "sha256:44e1ad92c8ca002de6377e165f3e0f1be63266ab4d554740532335b9d75ea669"}, + {file = "pluggy-1.5.0.tar.gz", hash = "sha256:2cffa88e94fdc978c4c574f15f9e59b7f4201d439195c3715ca9e2486f1d0cf1"}, ] [package.extras] dev = ["pre-commit", "tox"] +testing = ["pytest", "pytest-benchmark"] [[package]] name = "prometheus-client" @@ -2505,13 +2506,13 @@ files = [ [[package]] name = "pytest" -version = "7.4.2" +version = "8.3.2" description = "pytest: simple powerful testing with Python" optional = false -python-versions = ">=3.7" +python-versions = ">=3.8" files = [ - {file = "pytest-7.4.2-py3-none-any.whl", hash = "sha256:1d881c6124e08ff0a1bb75ba3ec0bfd8b5354a01c194ddd5a0a870a48d99b002"}, - {file = "pytest-7.4.2.tar.gz", hash = "sha256:a766259cfab564a2ad52cb1aae1b881a75c3eb7e34ca3779697c23ed47c47069"}, + {file = "pytest-8.3.2-py3-none-any.whl", hash = "sha256:4ba08f9ae7dcf84ded419494d229b48d0903ea6407b030eaec46df5e6a73bba5"}, + {file = "pytest-8.3.2.tar.gz", hash = "sha256:c132345d12ce551242c87269de812483f5bcc87cdbb4722e48487ba194f9fdce"}, ] [package.dependencies] @@ -2519,11 +2520,11 @@ colorama = {version = "*", markers = "sys_platform == \"win32\""} exceptiongroup = {version = ">=1.0.0rc8", markers = "python_version < \"3.11\""} iniconfig = "*" packaging = "*" -pluggy = ">=0.12,<2.0" -tomli = {version = 
">=1.0.0", markers = "python_version < \"3.11\""} +pluggy = ">=1.5,<2" +tomli = {version = ">=1", markers = "python_version < \"3.11\""} [package.extras] -testing = ["argcomplete", "attrs (>=19.2.0)", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"] +dev = ["argcomplete", "attrs (>=19.2)", "hypothesis (>=3.56)", "mock", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"] [[package]] name = "pytest-cov" @@ -2629,6 +2630,7 @@ files = [ {file = "PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"}, {file = "PyYAML-6.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28"}, {file = "PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9"}, + {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a08c6f0fe150303c1c6b71ebcd7213c2858041a7e01975da3a99aed1e7a378ef"}, {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0"}, {file = "PyYAML-6.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4"}, {file = "PyYAML-6.0.1-cp312-cp312-win32.whl", hash = "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54"}, @@ -4006,4 +4008,4 @@ thot = ["sil-thot"] [metadata] lock-version = "2.0" python-versions = ">=3.8.1,<3.12" -content-hash = "d9d952505180b75b498e0251938497b14fabf0947e8c4357aa8b6ef4672320cd" +content-hash = "e25fd409a86457951a9ba91a820377a7d2cf6c424f2e922bc7aa2a92011b20c6" diff --git a/pyproject.toml b/pyproject.toml index a89df9d..6e5e02d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -71,7 +71,7 @@ dynaconf = "^3.2.5" json-stream = "^1.3.0" 
[tool.poetry.group.dev.dependencies] -pytest = "^7.4.2" +pytest = "^8.3.2" black = "^24.1.1" # match the vscode extension flake8 = "^7.0.0" isort = "^5.9.3" diff --git a/tests/corpora/test_parallel_text_corpus.py b/tests/corpora/test_parallel_text_corpus.py index cf089c0..9fd14b0 100644 --- a/tests/corpora/test_parallel_text_corpus.py +++ b/tests/corpora/test_parallel_text_corpus.py @@ -12,11 +12,12 @@ MemoryAlignmentCollection, MemoryText, ParallelTextCorpus, + ScriptureRef, StandardParallelTextCorpus, TextRow, TextRowFlags, ) -from machine.scripture import ENGLISH_VERSIFICATION, ORIGINAL_VERSIFICATION, VerseRef, Versification +from machine.scripture import ENGLISH_VERSIFICATION, ORIGINAL_VERSIFICATION, Versification def test_get_rows_no_rows() -> None: @@ -869,46 +870,49 @@ def test_get_segments_same_verse_ref_one_to_many() -> None: [ text_row( "MAT", - VerseRef.from_string("MAT 1:1", ORIGINAL_VERSIFICATION), + ScriptureRef.parse("MAT 1:1", ORIGINAL_VERSIFICATION), "source chapter one, verse one .", ), text_row( "MAT", - VerseRef.from_string("MAT 1:2", ORIGINAL_VERSIFICATION), + ScriptureRef.parse("MAT 1:2", ORIGINAL_VERSIFICATION), "source chapter one, verse two .", ), text_row( "MAT", - VerseRef.from_string("MAT 1:3", ORIGINAL_VERSIFICATION), + ScriptureRef.parse("MAT 1:3", ORIGINAL_VERSIFICATION), "source chapter one, verse three .", ), ], ) ) + source_corpus.versification = ORIGINAL_VERSIFICATION + target_corpus = DictionaryTextCorpus( MemoryText( "MAT", [ - text_row("MAT", VerseRef.from_string("MAT 1:1", versification), "target chapter one, verse one ."), + text_row("MAT", ScriptureRef.parse("MAT 1:1", versification), "target chapter one, verse one ."), text_row( "MAT", - VerseRef.from_string("MAT 1:2", versification), + ScriptureRef.parse("MAT 1:2", versification), "target chapter one, verse two . 
target chapter one, verse three .", TextRowFlags.SENTENCE_START | TextRowFlags.IN_RANGE | TextRowFlags.RANGE_START, ), - text_row("MAT", VerseRef.from_string("MAT 1:3", versification), flags=TextRowFlags.IN_RANGE), - text_row("MAT", VerseRef.from_string("MAT 1:4", versification), "target chapter one, verse four ."), + text_row("MAT", ScriptureRef.parse("MAT 1:3", versification), flags=TextRowFlags.IN_RANGE), + text_row("MAT", ScriptureRef.parse("MAT 1:4", versification), "target chapter one, verse four ."), ], ) ) + target_corpus.versification = versification parallel_corpus = StandardParallelTextCorpus(source_corpus, target_corpus) rows = list(parallel_corpus) assert len(rows) == 3 - assert rows[1].source_refs == [VerseRef.from_string("MAT 1:2", ORIGINAL_VERSIFICATION)] + assert rows[1].source_refs == [ScriptureRef.parse("MAT 1:2", ORIGINAL_VERSIFICATION)] assert rows[1].target_refs == [ - VerseRef.from_string("MAT 1:2", versification), - VerseRef.from_string("MAT 1:3", versification), + ScriptureRef.parse("MAT 1:2", versification), + ScriptureRef.parse("MAT 1:3", versification), ] assert rows[1].source_segment == "source chapter one, verse two .".split() assert rows[1].target_segment == "target chapter one, verse two . 
target chapter one, verse three .".split() @@ -925,58 +929,61 @@ def test_get_rows_verse_ref_out_of_order() -> None: [ text_row( "MAT", - VerseRef.from_string("MAT 1:1", ORIGINAL_VERSIFICATION), + ScriptureRef.parse("MAT 1:1", ORIGINAL_VERSIFICATION), "source chapter one, verse one .", ), text_row( "MAT", - VerseRef.from_string("MAT 1:2", ORIGINAL_VERSIFICATION), + ScriptureRef.parse("MAT 1:2", ORIGINAL_VERSIFICATION), "source chapter one, verse two .", ), text_row( "MAT", - VerseRef.from_string("MAT 1:3", ORIGINAL_VERSIFICATION), + ScriptureRef.parse("MAT 1:3", ORIGINAL_VERSIFICATION), "source chapter one, verse three .", ), text_row( "MAT", - VerseRef.from_string("MAT 1:4", ORIGINAL_VERSIFICATION), + ScriptureRef.parse("MAT 1:4", ORIGINAL_VERSIFICATION), "source chapter one, verse four .", ), ], ) ) + source_corpus.versification = ORIGINAL_VERSIFICATION + target_corpus = DictionaryTextCorpus( MemoryText( "MAT", [ - text_row("MAT", VerseRef.from_string("MAT 1:1", versification), "target chapter one, verse one ."), - text_row("MAT", VerseRef.from_string("MAT 1:2", versification), "target chapter one, verse two ."), - text_row("MAT", VerseRef.from_string("MAT 1:3", versification), "target chapter one, verse three ."), - text_row("MAT", VerseRef.from_string("MAT 1:4", versification), "target chapter one, verse four ."), - text_row("MAT", VerseRef.from_string("MAT 1:5", versification), "target chapter one, verse five ."), + text_row("MAT", ScriptureRef.parse("MAT 1:1", versification), "target chapter one, verse one ."), + text_row("MAT", ScriptureRef.parse("MAT 1:2", versification), "target chapter one, verse two ."), + text_row("MAT", ScriptureRef.parse("MAT 1:3", versification), "target chapter one, verse three ."), + text_row("MAT", ScriptureRef.parse("MAT 1:4", versification), "target chapter one, verse four ."), + text_row("MAT", ScriptureRef.parse("MAT 1:5", versification), "target chapter one, verse five ."), ], ) ) + target_corpus.versification = versification 
parallel_corpus = StandardParallelTextCorpus(source_corpus, target_corpus) rows = list(parallel_corpus) assert len(rows) == 4 - assert rows[1].source_refs == [VerseRef.from_string("MAT 1:2", ORIGINAL_VERSIFICATION)] - assert rows[1].target_refs == [VerseRef.from_string("MAT 1:3", versification)] + assert rows[1].source_refs == [ScriptureRef.parse("MAT 1:2", ORIGINAL_VERSIFICATION)] + assert rows[1].target_refs == [ScriptureRef.parse("MAT 1:3", versification)] assert rows[1].source_segment == "source chapter one, verse two .".split() assert rows[1].target_segment == "target chapter one, verse three .".split() - assert rows[2].source_refs == [VerseRef.from_string("MAT 1:3", ORIGINAL_VERSIFICATION)] - assert rows[2].target_refs == [VerseRef.from_string("MAT 1:2", versification)] + assert rows[2].source_refs == [ScriptureRef.parse("MAT 1:3", ORIGINAL_VERSIFICATION)] + assert rows[2].target_refs == [ScriptureRef.parse("MAT 1:2", versification)] assert rows[2].source_segment == "source chapter one, verse three .".split() assert rows[2].target_segment == "target chapter one, verse two .".split() - assert rows[3].source_refs == [VerseRef.from_string("MAT 1:4", ORIGINAL_VERSIFICATION)] + assert rows[3].source_refs == [ScriptureRef.parse("MAT 1:4", ORIGINAL_VERSIFICATION)] assert rows[3].target_refs == [ - VerseRef.from_string("MAT 1:4", versification), - VerseRef.from_string("MAT 1:5", versification), + ScriptureRef.parse("MAT 1:4", versification), + ScriptureRef.parse("MAT 1:5", versification), ] assert rows[3].source_segment == "source chapter one, verse four .".split() assert rows[3].target_segment == "target chapter one, verse four . 
target chapter one, verse five .".split() diff --git a/tests/corpora/test_scripture_ref.py b/tests/corpora/test_scripture_ref.py new file mode 100644 index 0000000..3247d39 --- /dev/null +++ b/tests/corpora/test_scripture_ref.py @@ -0,0 +1,45 @@ +from machine.corpora import ScriptureRef + + +def test_compare_to_strict(): + assert compare_to_strict("MAT 1:1", "MAT 1:2") == -1, "VerseLessThan" + assert compare_to_strict("MAT 1:1", "MAT 1:1") == 0, "VerseEqualTo" + assert compare_to_strict("MAT 1:2", "MAT 1:1") == 1, "VerseGreaterThan" + assert compare_to_strict("MAT 1:0/1:p", "MAT 1:0/2:p") == -1, "NonVerseLessThan" + assert compare_to_strict("MAT 1:0/1:p", "MAT 1:0/1:p") == 0, "NonVerseEqualTo" + assert compare_to_strict("MAT 1:0/2:p", "MAT 1:0/1:p") == 1, "NonVerseGreaterThan" + assert compare_to_strict("MAT 1:0/1:esb", "MAT 1:0/1:esb/1:p") == -1, "NonVerseParentChild" + + +def test_compare_to_relaxed(): + assert compare_to_relaxed("MAT 1:1", "MAT 1:2") == -1, "VerseLessThan" + assert compare_to_relaxed("MAT 1:1", "MAT 1:1") == 0, "VerseEqualTo" + assert compare_to_relaxed("MAT 1:2", "MAT 1:1") == 1, "VerseGreaterThan" + assert compare_to_relaxed("MAT 1:0/1:p", "MAT 1:0/2:p") == 0, "NonVerseSameMarkerDifferentPosition" + assert compare_to_relaxed("MAT 1:0/2:esb", "MAT 1:0/1:esb/1:p") == -1, "NonVerseParentChild" + + +def compare_to_strict(ref1_str, ref2_str): + ref1 = ScriptureRef.parse(ref1_str) + ref2 = ScriptureRef.parse(ref2_str) + + result = ref1.compare_to(ref2) + + if result < 0: + result = -1 + elif result > 0: + result = 1 + return result + + +def compare_to_relaxed(ref1_str, ref2_str): + ref1 = ScriptureRef.parse(ref1_str) + ref2 = ScriptureRef.parse(ref2_str) + + result = ref1.compare_to(ref2, strict=False) + + if result < 0: + result = -1 + elif result > 0: + result = 1 + return result diff --git a/tests/corpora/test_scripture_text_corpus.py b/tests/corpora/test_scripture_text_corpus.py index 486967c..925c9ca 100644 --- 
a/tests/corpora/test_scripture_text_corpus.py +++ b/tests/corpora/test_scripture_text_corpus.py @@ -5,7 +5,7 @@ def test_extract_scripture_corpus() -> None: - corpus = ParatextTextCorpus(USFM_TEST_PROJECT_PATH) + corpus = ParatextTextCorpus(USFM_TEST_PROJECT_PATH, include_all_text=True) lines = list(extract_scripture_corpus(corpus)) assert len(lines) == 41899 @@ -13,7 +13,7 @@ def test_extract_scripture_corpus() -> None: text, orig_vref, corpus_vref = lines[0] assert text == "" assert orig_vref.exact_equals(VerseRef.from_string("GEN 1:1", ORIGINAL_VERSIFICATION)) - assert corpus_vref is None + assert corpus_vref is not None and corpus_vref.exact_equals(VerseRef.from_string("GEN 1:1", corpus.versification)) text, orig_vref, corpus_vref = lines[3167] assert text == "Chapter fourteen, verse fifty-five. Segment b." @@ -28,7 +28,7 @@ def test_extract_scripture_corpus() -> None: text, orig_vref, corpus_vref = lines[10727] assert text == "" assert orig_vref.exact_equals(VerseRef.from_string("1CH 12:4", ORIGINAL_VERSIFICATION)) - assert corpus_vref is None + assert corpus_vref is not None and corpus_vref.exact_equals(VerseRef.from_string("1CH 12:4", corpus.versification)) text, orig_vref, corpus_vref = lines[10731] assert text == "" diff --git a/tests/corpora/test_usfm_file_text.py b/tests/corpora/test_usfm_file_text.py index ff4ebf6..33679ae 100644 --- a/tests/corpora/test_usfm_file_text.py +++ b/tests/corpora/test_usfm_file_text.py @@ -1,7 +1,6 @@ -from testutils.corpora_test_helpers import USFM_TEST_PROJECT_PATH, verse_ref +from testutils.corpora_test_helpers import USFM_TEST_PROJECT_PATH, scripture_ref -from machine.corpora import UsfmFileTextCorpus -from machine.scripture import VerseRef +from machine.corpora import ScriptureRef, UsfmFileTextCorpus def test_get_rows_nonempty_text() -> None: @@ -13,48 +12,112 @@ def test_get_rows_nonempty_text() -> None: assert len(rows) == 19 - assert verse_ref(rows[0]).exact_equals(VerseRef.from_string("MAT 1:1", 
corpus.versification)) + assert scripture_ref(rows[0]) == ScriptureRef.parse("MAT 1:1", corpus.versification) assert rows[0].text == "Chapter one, verse one." - assert verse_ref(rows[1]).exact_equals(VerseRef.from_string("MAT 1:2", corpus.versification)) + assert scripture_ref(rows[1]) == ScriptureRef.parse("MAT 1:2", corpus.versification) assert rows[1].text == "Chapter one, verse two." - assert verse_ref(rows[4]).exact_equals(VerseRef.from_string("MAT 1:5", corpus.versification)) + assert scripture_ref(rows[4]) == ScriptureRef.parse("MAT 1:5", corpus.versification) assert rows[4].text == "Chapter one, verse five." - assert verse_ref(rows[5]).exact_equals(VerseRef.from_string("MAT 2:1", corpus.versification)) + assert scripture_ref(rows[5]) == ScriptureRef.parse("MAT 2:1", corpus.versification) assert rows[5].text == "Chapter two, verse one." - assert verse_ref(rows[6]).exact_equals(VerseRef.from_string("MAT 2:2", corpus.versification)) + assert scripture_ref(rows[6]) == ScriptureRef.parse("MAT 2:2", corpus.versification) assert rows[6].text == "Chapter two, verse two. Chapter two, verse three." assert rows[6].is_in_range assert rows[6].is_range_start - assert verse_ref(rows[7]).exact_equals(VerseRef.from_string("MAT 2:3", corpus.versification)) + assert scripture_ref(rows[7]) == ScriptureRef.parse("MAT 2:3", corpus.versification) assert len(rows[7].segment) == 0 assert rows[7].is_in_range assert not rows[7].is_range_start - assert verse_ref(rows[8]).exact_equals(VerseRef.from_string("MAT 2:4a", corpus.versification)) + assert scripture_ref(rows[8]) == ScriptureRef.parse("MAT 2:4a", corpus.versification) assert len(rows[8].segment) == 0 assert rows[8].is_in_range assert not rows[8].is_range_start - assert verse_ref(rows[9]).exact_equals(VerseRef.from_string("MAT 2:4b", corpus.versification)) + assert scripture_ref(rows[9]) == ScriptureRef.parse("MAT 2:4b", corpus.versification) assert rows[9].text == "Chapter two, verse four." 
- assert verse_ref(rows[10]).exact_equals(VerseRef.from_string("MAT 2:5", corpus.versification)) + assert scripture_ref(rows[10]) == ScriptureRef.parse("MAT 2:5", corpus.versification) assert rows[10].text == "Chapter two, verse five." - assert verse_ref(rows[11]).exact_equals(VerseRef.from_string("MAT 2:6", corpus.versification)) + assert scripture_ref(rows[11]) == ScriptureRef.parse("MAT 2:6", corpus.versification) assert rows[11].text == "Chapter two, verse six." - assert verse_ref(rows[15]).exact_equals(VerseRef.from_string("MAT 2:9", corpus.versification)) + assert scripture_ref(rows[15]) == ScriptureRef.parse("MAT 2:9", corpus.versification) assert rows[15].text == "Chapter 2 verse 9" - assert verse_ref(rows[16]).exact_equals(VerseRef.from_string("MAT 2:10", corpus.versification)) + assert scripture_ref(rows[16]) == ScriptureRef.parse("MAT 2:10", corpus.versification) assert rows[16].text == "Chapter 2 verse 10" + assert scripture_ref(rows[17]) == ScriptureRef.parse("MAT 2:11", corpus.versification) + assert not rows[17].text + + +def test_get_rows_nonempty_text_all_text() -> None: + corpus = UsfmFileTextCorpus(USFM_TEST_PROJECT_PATH, include_all_text=True) + + text = corpus.get_text("MAT") + assert text is not None + rows = list(text) + + assert len(rows) == 36 + + assert scripture_ref(rows[0]) == ScriptureRef.parse("MAT 1:0/1:h", corpus.versification) + assert rows[0].text == "Matthew" + + assert scripture_ref(rows[1]) == ScriptureRef.parse("MAT 1:0/2:mt", corpus.versification) + assert rows[1].text == "Matthew" + + assert scripture_ref(rows[2]) == ScriptureRef.parse("MAT 1:0/3:ip", corpus.versification) + assert rows[2].text == "An introduction to Matthew" + + assert scripture_ref(rows[3]) == ScriptureRef.parse("MAT 1:0/3:ip/1:fe", corpus.versification) + assert rows[3].text == "This is an endnote." 
+ + assert scripture_ref(rows[4]) == ScriptureRef.parse("Mat 1:0/4:s", corpus.versification) + assert rows[4].text == "Chapter One" + + assert scripture_ref(rows[6]) == ScriptureRef.parse("MAT 1:1/1:f", corpus.versification) + assert rows[6].text == "1:1: This is a footnote." + + assert scripture_ref(rows[8]) == ScriptureRef.parse("MAT 1:2/1:f", corpus.versification) + assert rows[8].text == "1:2: This is a footnote." + + assert scripture_ref(rows[12]) == ScriptureRef.parse("MAT 2:0/1:tr/1:tc1", corpus.versification) + assert rows[12].text == "Row one, column one." + + assert scripture_ref(rows[13]) == ScriptureRef.parse("MAT 2:0/1:tr/2:tc2", corpus.versification) + assert rows[13].text == "Row one, column two." + + assert scripture_ref(rows[14]) == ScriptureRef.parse("MAT 2:0/2:tr/1:tc1", corpus.versification) + assert rows[14].text == "Row two, column one." + + assert scripture_ref(rows[15]) == ScriptureRef.parse("MAT 2:0/2:tr/2:tc2", corpus.versification) + assert rows[15].text == "Row two, column two." + + assert scripture_ref(rows[16]) == ScriptureRef.parse("MAT 2:0/3:s1", corpus.versification) + assert rows[16].text == "Chapter Two" + + assert scripture_ref(rows[18]) == ScriptureRef.parse("MAT 2:1/1:f", corpus.versification) + assert rows[18].text == "2:1: This is a footnote." + + assert scripture_ref(rows[21]) == ScriptureRef.parse("MAT 2:3/1:esb/1:ms", corpus.versification) + assert rows[21].text == "This is a sidebar" + + assert scripture_ref(rows[22]) == ScriptureRef.parse("MAT 2:3/1:esb/2:p", corpus.versification) + assert rows[22].text == "Here is some sidebar content." 
+ + assert scripture_ref(rows[28]) == ScriptureRef.parse("MAT 2:7a/1:s", corpus.versification) + assert rows[28].text == "Section header" + + assert scripture_ref(rows[35]) == ScriptureRef.parse("MAT 2:12/1:restore", corpus.versification) + assert rows[35].text == "restore information" + def test_get_rows_sentence_start() -> None: corpus = UsfmFileTextCorpus(USFM_TEST_PROJECT_PATH) @@ -65,11 +128,11 @@ def test_get_rows_sentence_start() -> None: assert len(rows) == 19 - assert verse_ref(rows[3]).exact_equals(VerseRef.from_string("MAT 1:4", corpus.versification)) + assert scripture_ref(rows[3]) == ScriptureRef.parse("MAT 1:4", corpus.versification) assert rows[3].text == "Chapter one, verse four," assert rows[3].is_sentence_start - assert verse_ref(rows[4]).exact_equals(VerseRef.from_string("MAT 1:5", corpus.versification)) + assert scripture_ref(rows[4]) == ScriptureRef.parse("MAT 1:5", corpus.versification) assert rows[4].text == "Chapter one, verse five." assert not rows[4].is_sentence_start @@ -93,51 +156,85 @@ def test_get_rows_include_markers() -> None: assert len(rows) == 19 - assert verse_ref(rows[0]).exact_equals(VerseRef.from_string("MAT 1:1", corpus.versification)) + assert scripture_ref(rows[0]) == ScriptureRef.parse("MAT 1:1", corpus.versification) assert ( rows[0].text == "Chapter \\pn one\\+pro WON\\+pro*\\pn*, verse one.\\f + \\fr 1:1: \\ft This is a footnote.\\f*" ) - assert verse_ref(rows[1]).exact_equals(VerseRef.from_string("MAT 1:2", corpus.versification)) + assert scripture_ref(rows[1]) == ScriptureRef.parse("MAT 1:2", corpus.versification) assert rows[1].text == "\\bd C\\bd*hapter one, \\li2 verse\\f + \\fr 1:2: \\ft This is a footnote.\\f* two." - assert verse_ref(rows[4]).exact_equals(VerseRef.from_string("MAT 1:5", corpus.versification)) + assert scripture_ref(rows[4]) == ScriptureRef.parse("MAT 1:5", corpus.versification) assert rows[4].text == 'Chapter one, \\li2 verse \\fig Figure 1|src="image1.png" size="col" ref="1:5"\\fig* five.' 
- assert verse_ref(rows[5]).exact_equals(VerseRef.from_string("MAT 2:1", corpus.versification)) + assert scripture_ref(rows[5]) == ScriptureRef.parse("MAT 2:1", corpus.versification) assert rows[5].text == "Chapter \\add two\\add*, verse \\f + \\fr 2:1: \\ft This is a footnote.\\f*one." - assert verse_ref(rows[6]).exact_equals(VerseRef.from_string("MAT 2:2", corpus.versification)) - assert rows[6].text == "Chapter two, verse \\fm ∆\\fm*two. Chapter two, verse \\w three|lemma\\w*." + assert scripture_ref(rows[6]) == ScriptureRef.parse("MAT 2:2", corpus.versification) + assert rows[6].text == "Chapter two, // verse \\fm ∆\\fm*two. Chapter two, verse \\w three|lemma\\w*." assert rows[6].is_in_range assert rows[6].is_range_start - assert verse_ref(rows[7]).exact_equals(VerseRef.from_string("MAT 2:3", corpus.versification)) + assert scripture_ref(rows[7]) == ScriptureRef.parse("MAT 2:3", corpus.versification) assert len(rows[7].segment) == 0 assert rows[7].is_in_range assert not rows[7].is_range_start - assert verse_ref(rows[8]).exact_equals(VerseRef.from_string("MAT 2:4a", corpus.versification)) + assert scripture_ref(rows[8]) == ScriptureRef.parse("MAT 2:4a", corpus.versification) assert len(rows[8].segment) == 0 assert rows[8].is_in_range assert not rows[8].is_range_start - assert verse_ref(rows[9]).exact_equals(VerseRef.from_string("MAT 2:4b", corpus.versification)) + assert scripture_ref(rows[9]) == ScriptureRef.parse("MAT 2:4b", corpus.versification) assert rows[9].text == "Chapter two, verse four." - assert verse_ref(rows[10]).exact_equals(VerseRef.from_string("MAT 2:5", corpus.versification)) + assert scripture_ref(rows[10]) == ScriptureRef.parse("MAT 2:5", corpus.versification) assert rows[10].text == "Chapter two, verse five \\rq (MAT 3:1)\\rq*." 
- assert verse_ref(rows[11]).exact_equals(VerseRef.from_string("MAT 2:6", corpus.versification)) + assert scripture_ref(rows[11]) == ScriptureRef.parse("MAT 2:6", corpus.versification) assert rows[11].text == 'Chapter two, verse \\w six|strong="12345" \\w*.' - assert verse_ref(rows[15]).exact_equals(VerseRef.from_string("MAT 2:9", corpus.versification)) + assert scripture_ref(rows[15]) == ScriptureRef.parse("MAT 2:9", corpus.versification) assert rows[15].text == "Chapter\\tcr2 2\\tc3 verse\\tcr4 9" - assert verse_ref(rows[16]).exact_equals(VerseRef.from_string("MAT 2:10", corpus.versification)) + assert scripture_ref(rows[16]) == ScriptureRef.parse("MAT 2:10", corpus.versification) assert rows[16].text == "\\tc3-4 Chapter 2 verse 10" +def test_get_rows_include_markers_all_text() -> None: + + corpus = UsfmFileTextCorpus(USFM_TEST_PROJECT_PATH, include_markers=True, include_all_text=True) + + text = corpus.get_text("MAT") + assert text is not None + rows = list(text) + + assert len(rows) == 32 + + assert scripture_ref(rows[2]) == ScriptureRef.parse("MAT 1:0/3:ip", corpus.versification) + assert rows[2].text == "An introduction to Matthew\\fe + \\ft This is an endnote.\\fe*" + + assert scripture_ref(rows[4]) == ScriptureRef.parse("MAT 1:1", corpus.versification) + assert ( + rows[4].text == "Chapter \\pn one\\+pro WON\\+pro*\\pn*, verse one.\\f + \\fr 1:1: \\ft This is a footnote.\\f*" + ) + + assert scripture_ref(rows[5]) == ScriptureRef.parse("MAT 1:2", corpus.versification) + assert rows[5].text == "\\bd C\\bd*hapter one, \\li2 verse\\f + \\fr 1:2: \\ft This is a footnote.\\f* two." + + assert scripture_ref(rows[8]) == ScriptureRef.parse("MAT 1:5", corpus.versification) + assert rows[8].text == 'Chapter one, \\li2 verse \\fig Figure 1|src="image1.png" size="col" ref="1:5"\\fig* five.' 
+ + assert scripture_ref(rows[13]) == ScriptureRef.parse("MAT 2:0/3:s1", corpus.versification) + assert rows[13].text == "Chapter \\it Two \\it*" + + assert scripture_ref(rows[14]) == ScriptureRef.parse("MAT 2:1", corpus.versification) + assert rows[14].text == "Chapter \\add two\\add*, verse \\f + \\fr 2:1: \\ft This is a footnote.\\f*one." + + assert scripture_ref(rows[18]) == ScriptureRef.parse("MAT 2:3/1:esb/2:p", corpus.versification) + assert rows[18].text == "Here is some sidebar // content." + + def test_usfm_file_text_corpus_lowercase_usfm_id() -> None: corpus = UsfmFileTextCorpus(USFM_TEST_PROJECT_PATH) @@ -147,8 +244,8 @@ def test_usfm_file_text_corpus_lowercase_usfm_id() -> None: assert len(rows) == 2 - assert verse_ref(rows[0]).exact_equals(VerseRef.from_string("LEV 14:55", corpus.versification)) + assert scripture_ref(rows[0]) == ScriptureRef.parse("LEV 14:55", corpus.versification) assert rows[0].text == "Chapter fourteen, verse fifty-five. Segment b." - assert verse_ref(rows[1]).exact_equals(VerseRef.from_string("LEV 14:56", corpus.versification)) + assert scripture_ref(rows[1]) == ScriptureRef.parse("LEV 14:56", corpus.versification) assert rows[1].text == "Chapter fourteen, verse fifty-six." 
diff --git a/tests/corpora/test_usfm_text_updater.py b/tests/corpora/test_usfm_text_updater.py new file mode 100644 index 0000000..80693ab --- /dev/null +++ b/tests/corpora/test_usfm_text_updater.py @@ -0,0 +1,318 @@ +from typing import List, Optional, Tuple + +from testutils.corpora_test_helpers import USFM_TEST_PROJECT_PATH + +from machine.corpora import ScriptureRef, parse_usfm +from machine.corpora.usfm_text_updater import UsfmTextUpdater + + +def test_get_usfm_verse_char_style() -> None: + rows = [ + ( + scr_ref("MAT 1:1"), + str("First verse of the first chapter."), + ) + ] + target = update_usfm(rows) + assert "\\id MAT - Test\r\n" in target + assert "\\v 1 First verse of the first chapter.\r\n" in target + + +def test_get_usfm_id_text() -> None: + target = update_usfm(id_text="- Updated") + assert "\\id MAT - Updated\r\n" in target + + +def test_get_usfm_strip_all_text() -> None: + target = update_usfm(strip_all_text=True) + assert "\\id MAT\r\n" in target + assert "\\v 1\r\n" in target + assert "\\s\r\n" in target + + +def test_get_usfm_verse_skip_note() -> None: + rows = [ + ( + scr_ref("MAT 2:1"), + str("First verse of the second chapter."), + ) + ] + target = update_usfm(rows) + assert "\\v 1 First verse of the second chapter.\r\n" in target + + +def test_get_usfm_verse_replace_note() -> None: + rows = [ + ( + scr_ref("MAT 2:1a"), + str("First verse of the second chapter."), + ), + (scr_ref("MAT 2:1/1:f"), str("This is a new footnote.")), + ] + target = update_usfm(rows) + assert "\\v 1 First verse of the second chapter. 
\\f + \\ft This is a new footnote.\\f*\r\n" in target + + +def test_get_usfm_row_verse_segment() -> None: + rows = [ + ( + scr_ref("MAT 2:1a"), + str("First verse of the second chapter."), + ) + ] + target = update_usfm(rows) + assert "\\v 1 First verse of the second chapter.\r\n" in target + + +def test_get_usfm_verse_segment() -> None: + rows = [ + ( + scr_ref("MAT 2:7"), + str("Seventh verse of the second chapter."), + ) + ] + target = update_usfm(rows) + assert "\\v 7a Seventh verse of the second chapter.\r\n" in target + + +def test_get_usfm_verse_multiple_paras() -> None: + rows = [ + ( + scr_ref("MAT 1:2"), + str("Second verse of the first chapter."), + ) + ] + target = update_usfm(rows) + assert "\\v 2 Second verse of the first chapter.\r\n\\li2\r\n" in target + + +def test_get_usfm_verse_table() -> None: + rows = [ + ( + scr_ref("MAT 2:9"), + str("Ninth verse of the second chapter."), + ) + ] + target = update_usfm(rows) + assert "\\v 9 Ninth verse of the second chapter. \\tcr2 \\tc3 \\tcr4\r\n" in target + + +def test_get_usfm_verse_range_single_row_multiple_verses() -> None: + rows = [ + ( + scr_ref("MAT 2:11", "MAT 2:12"), + str("Eleventh verse of the second chapter. Twelfth verse of the second chapter."), + ) + ] + target = update_usfm(rows) + assert "\\v 11-12 Eleventh verse of the second chapter. Twelfth verse of the second chapter.\r\n" in target + + +def test_get_usfm_verse_range_single_row_single_verse() -> None: + rows = [ + ( + scr_ref("MAT 2:11"), + str("Eleventh verse of the second chapter."), + ) + ] + target = update_usfm(rows) + assert "\\v 11-12 Eleventh verse of the second chapter.\r\n" in target + + +def test_get_usfm_verse_range_multiple_rows_single_verse() -> None: + rows = [ + ( + scr_ref("MAT 2:11"), + str("Eleventh verse of the second chapter."), + ), + ( + scr_ref("MAT 2:12"), + str("Twelfth verse of the second chapter."), + ), + ] + target = update_usfm(rows) + assert "\\v 11-12 Eleventh verse of the second chapter. 
Twelfth verse of the second chapter.\r\n" in target + + +def test_get_usfm_verse_opt_break() -> None: + rows = [ + ( + scr_ref("MAT 2:2"), + str("Second verse of the second chapter."), + ), + ( + scr_ref("MAT 2:3"), + str("Third verse of the second chapter."), + ), + ] + target = update_usfm(rows) + assert "\\v 2-3 Second verse of the second chapter. Third verse of the second chapter.\r\n" in target + + +def test_get_usfm_verse_milestone() -> None: + rows = [ + ( + scr_ref("MAT 2:10"), + str("Tenth verse of the second chapter."), + ) + ] + target = update_usfm(rows) + assert "\\v 10 Tenth verse of the second chapter. \\tc3-4 \\qt-s |Jesus\\*\\qt-e\\*\r\n" in target + + +def test_get_usfm_verse_unmatched() -> None: + rows = [ + ( + scr_ref("MAT 1:3"), + str("Third verse of the first chapter."), + ) + ] + target = update_usfm(rows) + assert "\\v 3 Third verse of the first chapter.\r\n" in target + + +def test_get_usfm_nonverse_char_style() -> None: + rows = [ + ( + scr_ref("MAT 2:0/3:s1"), + str("The second chapter."), + ) + ] + target = update_usfm(rows) + assert "\\s1 The second chapter.\r\n" in target + + +def test_get_usfm_nonverse_paragraph() -> None: + rows = [ + ( + scr_ref("MAT 1:0/4:s"), + str("The first chapter."), + ) + ] + target = update_usfm(rows) + assert "\\s The first chapter.\r\n" in target + + +def test_get_usfm_nonverse_relaxed() -> None: + rows = [ + ( + scr_ref("MAT 1:0/s"), + str("The first chapter."), + ), + ( + scr_ref("MAT 1:1"), + str("First verse of the first chapter."), + ), + ( + scr_ref("MAT 2:0/tr/tc1"), + str("The first cell of the table."), + ), + ( + scr_ref("MAT 2:0/tr/tc2"), + str("The second cell of the table."), + ), + ( + scr_ref("MAT 2:0/tr/tc1"), + str("The third cell of the table."), + ), + ] + target = update_usfm(rows, strict_comparison=False) + assert "\\s The first chapter.\r\n" in target + assert "\\v 1 First verse of the first chapter.\r\n" in target + assert "\\tr \\tc1 The first cell of the table. 
\\tc2 The second cell of the table.\r\n" in target + assert "\\tr \\tc1 The third cell of the table. \\tc2 Row two, column two.\r\n" in target + + +def test_get_usfm_nonverse_sidebar() -> None: + rows = [ + ( + scr_ref("MAT 2:3/1:esb/1:ms"), + str("The first paragraph of the sidebar."), + ) + ] + target = update_usfm(rows) + assert "\\ms The first paragraph of the sidebar.\r\n" in target + + +def test_get_usfm_nonverse_table() -> None: + rows = [ + ( + scr_ref("MAT 2:0/1:tr/1:tc1"), + str("The first cell of the table."), + ), + ( + scr_ref("MAT 2:0/2:tr/1:tc1"), + str("The third cell of the table."), + ), + ] + target = update_usfm(rows) + assert "\\tr \\tc1 The first cell of the table. \\tc2 Row one, column two.\r\n" in target + + +def test_get_usfm_nonverse_optbreak() -> None: + rows = [ + ( + scr_ref("MAT 2:3/1:esb/2:p"), + str("The second paragraph of the sidebar."), + ) + ] + target = update_usfm(rows) + assert "\\p The second paragraph of the sidebar.\r\n" in target + + +def test_get_usfm_nonverse_milestone() -> None: + rows = [ + ( + scr_ref("MAT 2:7a/1:s"), + str("A new section header."), + ) + ] + target = update_usfm(rows) + assert "\\s A new section header. \\ts-s\\*\r\n" in target + + +def test_get_usfm_nonverse_skip_note() -> None: + rows = [ + ( + scr_ref("MAT 1:0/3:ip"), + str("The introductory paragraph."), + ) + ] + target = update_usfm(rows) + assert "\\ip The introductory paragraph.\r\n" in target + + +def test_get_usfm_nonverse_replace_note() -> None: + rows = [ + ( + scr_ref("MAT 1:0/3:ip"), + str("The introductory paragraph."), + ), + ( + scr_ref("MAT 1:0/3:ip/1:fe"), + str("This is a new endnote."), + ), + ] + target = update_usfm(rows) + assert "\\ip The introductory paragraph. 
\\fe + \\ft This is a new endnote.\\fe*\r\n" in target + + +def scr_ref(*refs: str) -> List[ScriptureRef]: + return [ScriptureRef.parse(ref) for ref in refs] + + +def update_usfm( + rows: Optional[List[Tuple[List[ScriptureRef], str]]] = None, + id_text: Optional[str] = None, + strip_all_text: bool = False, + strict_comparison: bool = True, +) -> str: + source = read_usfm() + updater = UsfmTextUpdater(rows, id_text, strip_all_text, strict_comparison) + parse_usfm(source, updater) + return updater.get_usfm() + + +def read_usfm() -> str: + with (USFM_TEST_PROJECT_PATH / "41MATTes.SFM").open("r", encoding="utf-8-sig", newline="\r\n") as file: + return file.read() diff --git a/tests/corpora/test_usfm_tokenizer.py b/tests/corpora/test_usfm_tokenizer.py index fed2cc5..74c30ac 100644 --- a/tests/corpora/test_usfm_tokenizer.py +++ b/tests/corpora/test_usfm_tokenizer.py @@ -7,22 +7,22 @@ def test_tokenize() -> None: usfm = _read_usfm() usfm_tokenizer = UsfmTokenizer() tokens = usfm_tokenizer.tokenize(usfm) - assert len(tokens) == 151 + assert len(tokens) == 170 assert tokens[0].type is UsfmTokenType.BOOK assert tokens[0].marker == "id" assert tokens[0].data == "MAT" - assert tokens[10].type is UsfmTokenType.TEXT - assert tokens[10].text == "Chapter One " + assert tokens[15].type is UsfmTokenType.TEXT + assert tokens[15].text == "Chapter One " - assert tokens[11].type is UsfmTokenType.VERSE - assert tokens[11].marker == "v" - assert tokens[11].data == "1" + assert tokens[16].type is UsfmTokenType.VERSE + assert tokens[16].marker == "v" + assert tokens[16].data == "1" - assert tokens[20].type is UsfmTokenType.NOTE - assert tokens[20].marker == "f" - assert tokens[20].data == "+" + assert tokens[25].type is UsfmTokenType.NOTE + assert tokens[25].marker == "f" + assert tokens[25].data == "+" def test_detokenize() -> None: diff --git a/tests/corpora/test_usfm_verse_text_updater.py b/tests/corpora/test_usfm_verse_text_updater.py deleted file mode 100644 index 829af78..0000000 --- 
a/tests/corpora/test_usfm_verse_text_updater.py +++ /dev/null @@ -1,179 +0,0 @@ -from typing import List, Optional, Tuple - -from testutils.corpora_test_helpers import USFM_TEST_PROJECT_PATH - -from machine.corpora import parse_usfm -from machine.corpora.usfm_verse_text_updater import UsfmVerseTextUpdater -from machine.scripture import ENGLISH_VERSIFICATION, VerseRef - - -def test_get_usfm_char_style() -> None: - rows = [ - ( - [VerseRef.from_string("MAT 1:1", ENGLISH_VERSIFICATION)], - str("First verse of the first chapter."), - ) - ] - target = update_usfm(rows) - assert "\\id MAT - Test\r\n" in target - assert "\\v 1 First verse of the first chapter.\r\n" in target - - -def test_get_usfm_id_text() -> None: - target = update_usfm(id_text="- Updated") - assert "\\id MAT - Updated\r\n" in target - - -def test_get_usfm_strip_all_text() -> None: - target = update_usfm(strip_all_text=True) - assert "\\id MAT\r\n" in target - assert "\\v 1\r\n" in target - assert "\\s\r\n" in target - - -def test_get_usfm_notes() -> None: - rows = [ - ( - [VerseRef.from_string("MAT 2:1", ENGLISH_VERSIFICATION)], - str("First verse of the second chapter."), - ) - ] - target = update_usfm(rows) - assert "\\v 1 First verse of the second chapter.\r\n" in target - - -def test_get_usfm_row_verse_segment() -> None: - rows = [ - ( - [VerseRef.from_string("MAT 2:1a", ENGLISH_VERSIFICATION)], - str("First verse of the second chapter."), - ) - ] - target = update_usfm(rows) - assert "\\v 1 First verse of the second chapter.\r\n" in target - - -def test_get_usfm_verse_segment() -> None: - rows = [ - ( - [VerseRef.from_string("MAT 2:7", ENGLISH_VERSIFICATION)], - str("Seventh verse of the second chapter."), - ) - ] - target = update_usfm(rows) - assert "\\v 7a Seventh verse of the second chapter.\r\n" in target - - -def test_get_usfm_multiple_paras() -> None: - rows = [ - ( - [VerseRef.from_string("MAT 1:2", ENGLISH_VERSIFICATION)], - str("Second verse of the first chapter."), - ) - ] - target = 
update_usfm(rows) - assert "\\v 2 Second verse of the first chapter.\r\n\\li2\r\n" in target - - -def test_get_usfm_table() -> None: - rows = [ - ( - [VerseRef.from_string("MAT 2:9", ENGLISH_VERSIFICATION)], - str("Ninth verse of the second chapter."), - ) - ] - target = update_usfm(rows) - assert "\\v 9 Ninth verse of the second chapter. \\tcr2 \\tc3 \\tcr4\r\n" in target - - -def test_get_usfm_range_single_row_multiple_verses() -> None: - rows = [ - ( - [ - VerseRef.from_string("MAT 2:11", ENGLISH_VERSIFICATION), - VerseRef.from_string("MAT 2:12", ENGLISH_VERSIFICATION), - ], - str("Eleventh verse of the second chapter. Twelfth verse of the second chapter."), - ) - ] - target = update_usfm(rows) - assert "\\v 11-12 Eleventh verse of the second chapter. Twelfth verse of the second chapter.\r\n" in target - - -def test_get_usfm_range_single_row_single_verse() -> None: - rows = [ - ( - [VerseRef.from_string("MAT 2:11", ENGLISH_VERSIFICATION)], - str("Eleventh verse of the second chapter."), - ) - ] - target = update_usfm(rows) - assert "\\v 11-12 Eleventh verse of the second chapter.\r\n" in target - - -def test_get_usfm_range_multiple_rows_single_verse() -> None: - rows = [ - ( - [VerseRef.from_string("MAT 2:11", ENGLISH_VERSIFICATION)], - str("Eleventh verse of the second chapter."), - ), - ( - [VerseRef.from_string("MAT 2:12", ENGLISH_VERSIFICATION)], - str("Twelfth verse of the second chapter."), - ), - ] - target = update_usfm(rows) - assert "\\v 11-12 Eleventh verse of the second chapter. Twelfth verse of the second chapter.\r\n" in target - - -def test_get_usfm_opt_break() -> None: - rows = [ - ( - [VerseRef.from_string("MAT 2:2", ENGLISH_VERSIFICATION)], - str("Second verse of the second chapter."), - ), - ( - [VerseRef.from_string("MAT 2:3", ENGLISH_VERSIFICATION)], - str("Third verse of the second chapter."), - ), - ] - target = update_usfm(rows) - assert "\\v 2-3 Second verse of the second chapter. 
Third verse of the second chapter.\r\n" in target - - -def test_get_usfm_milestone() -> None: - rows = [ - ( - [VerseRef.from_string("MAT 2:10", ENGLISH_VERSIFICATION)], - str("Tenth verse of the second chapter."), - ) - ] - target = update_usfm(rows) - assert "\\v 10 Tenth verse of the second chapter. \\tc3-4 \\qt-s |Jesus\\*\\qt-e\\*\r\n" in target - - -def test_get_usfm_unmatched() -> None: - rows = [ - ( - [VerseRef.from_string("MAT 1:3", ENGLISH_VERSIFICATION)], - str("Third verse of the first chapter."), - ) - ] - target = update_usfm(rows) - assert "\\v 3 Third verse of the first chapter.\r\n" in target - - -def update_usfm( - rows: Optional[List[Tuple[List[VerseRef], str]]] = None, - id_text: Optional[str] = None, - strip_all_text: bool = False, -) -> str: - source = read_usfm() - updater = UsfmVerseTextUpdater(rows, id_text, strip_all_text) - parse_usfm(source, updater) - return updater.get_usfm() - - -def read_usfm() -> str: - with (USFM_TEST_PROJECT_PATH / "41MATTes.SFM").open("r", encoding="utf-8-sig", newline="\r\n") as file: - return file.read() diff --git a/tests/corpora/test_usx_zip_text.py b/tests/corpora/test_usx_zip_text.py index fa45a20..4abe468 100644 --- a/tests/corpora/test_usx_zip_text.py +++ b/tests/corpora/test_usx_zip_text.py @@ -1,7 +1,7 @@ -from testutils.corpora_test_helpers import verse_ref +from testutils.corpora_test_helpers import scripture_ref from testutils.dbl_bundle_test_environment import DblBundleTestEnvironment -from machine.scripture import VerseRef +from machine.corpora import ScriptureRef def test_get_rows_nonempty_text() -> None: @@ -12,37 +12,37 @@ def test_get_rows_nonempty_text() -> None: assert len(rows) == 14 - assert verse_ref(rows[0]).exact_equals(VerseRef.from_string("MAT 1:1", env.corpus.versification)) + assert scripture_ref(rows[0]) == ScriptureRef.parse("MAT 1:1", env.corpus.versification) assert rows[0].text == "Chapter one, verse one." 
- assert verse_ref(rows[1]).exact_equals(VerseRef.from_string("MAT 1:2", env.corpus.versification)) + assert scripture_ref(rows[1]) == ScriptureRef.parse("MAT 1:2", env.corpus.versification) assert rows[1].text == "Chapter one, verse two." - assert verse_ref(rows[4]).exact_equals(VerseRef.from_string("MAT 1:5", env.corpus.versification)) + assert scripture_ref(rows[4]) == ScriptureRef.parse("MAT 1:5", env.corpus.versification) assert rows[4].text == "Chapter one, verse five." - assert verse_ref(rows[5]).exact_equals(VerseRef.from_string("MAT 2:1", env.corpus.versification)) + assert scripture_ref(rows[5]) == ScriptureRef.parse("MAT 2:1", env.corpus.versification) assert rows[5].text == "Chapter two, verse one." - assert verse_ref(rows[6]).exact_equals(VerseRef.from_string("MAT 2:2", env.corpus.versification)) + assert scripture_ref(rows[6]) == ScriptureRef.parse("MAT 2:2", env.corpus.versification) assert rows[6].text == "Chapter two, verse two. Chapter two, verse three." assert rows[6].is_in_range - assert verse_ref(rows[7]).exact_equals(VerseRef.from_string("MAT 2:3", env.corpus.versification)) + assert scripture_ref(rows[7]) == ScriptureRef.parse("MAT 2:3", env.corpus.versification) assert len(rows[7].text) == 0 assert rows[7].is_in_range - assert verse_ref(rows[8]).exact_equals(VerseRef.from_string("MAT 2:4a", env.corpus.versification)) + assert scripture_ref(rows[8]) == ScriptureRef.parse("MAT 2:4a", env.corpus.versification) assert len(rows[8].text) == 0 assert rows[8].is_in_range - assert verse_ref(rows[9]).exact_equals(VerseRef.from_string("MAT 2:4b", env.corpus.versification)) + assert scripture_ref(rows[9]) == ScriptureRef.parse("MAT 2:4b", env.corpus.versification) assert rows[9].text == "Chapter two, verse four." - assert verse_ref(rows[10]).exact_equals(VerseRef.from_string("MAT 2:5", env.corpus.versification)) + assert scripture_ref(rows[10]) == ScriptureRef.parse("MAT 2:5", env.corpus.versification) assert rows[10].text == "Chapter two, verse five." 
- assert verse_ref(rows[11]).exact_equals(VerseRef.from_string("MAT 2:6", env.corpus.versification)) + assert scripture_ref(rows[11]) == ScriptureRef.parse("MAT 2:6", env.corpus.versification) assert rows[11].text == "Chapter two, verse six." @@ -54,11 +54,11 @@ def test_get_rows_sentence_start() -> None: assert len(rows) == 14 - assert verse_ref(rows[3]).exact_equals(VerseRef.from_string("MAT 1:4", env.corpus.versification)) + assert scripture_ref(rows[3]) == ScriptureRef.parse("MAT 1:4", env.corpus.versification) assert rows[3].text == "Chapter one, verse four," assert rows[3].is_sentence_start - assert verse_ref(rows[4]).exact_equals(VerseRef.from_string("MAT 1:5", env.corpus.versification)) + assert scripture_ref(rows[4]) == ScriptureRef.parse("MAT 1:5", env.corpus.versification) assert rows[4].text == "Chapter one, verse five." assert not rows[4].is_sentence_start diff --git a/tests/testutils/corpora_test_helpers.py b/tests/testutils/corpora_test_helpers.py index bdd2b3b..38c0429 100644 --- a/tests/testutils/corpora_test_helpers.py +++ b/tests/testutils/corpora_test_helpers.py @@ -1,7 +1,7 @@ import shutil from pathlib import Path -from machine.corpora import TextRow +from machine.corpora import ScriptureRef, TextRow from machine.scripture import VerseRef from . 
import TEST_DATA_PATH @@ -25,3 +25,8 @@ def create_test_paratext_backup(temp_dir: Path) -> Path: def verse_ref(segment: TextRow) -> VerseRef: assert isinstance(segment.ref, VerseRef) return segment.ref + + +def scripture_ref(segment: TextRow) -> ScriptureRef: + assert isinstance(segment.ref, ScriptureRef) + return segment.ref diff --git a/tests/testutils/data/usfm/Tes/41MATTes.SFM b/tests/testutils/data/usfm/Tes/41MATTes.SFM index 2c77542..af634ba 100644 --- a/tests/testutils/data/usfm/Tes/41MATTes.SFM +++ b/tests/testutils/data/usfm/Tes/41MATTes.SFM @@ -1,7 +1,7 @@ \id MAT - Test \h Matthew \mt Matthew -\ip An introduction to Matthew +\ip An introduction to Matthew\fe + \ft This is an endnote.\fe* \c 1 \s Chapter One \v 1 Chapter \pn one\+pro WON\+pro*\pn*, verse one.\f + \fr 1:1: \ft This is a footnote.\f* @@ -15,13 +15,15 @@ \v 5 Chapter one, \li2 verse \fig Figure 1|src="image1.png" size="col" ref="1:5"\fig* five. \c 2 -\s1 Chapter Two +\tr \tc1 Row one, column one. \tc2 Row one, column two. +\tr \tc1 Row two, column one. \tc2 Row two, column two. +\s1 Chapter \it Two \it* \p \v 1 Chapter \add two\add*, verse \f + \fr 2:1: \ft This is a footnote.\f*one. \v 2-3 Chapter two, // verse \fm ∆\fm*two. \esb \ms This is a sidebar -\p Here is some sidebar content. +\p Here is some sidebar // content. \esbe \v 3-4a Chapter two, verse \w three|lemma\w*. \v 4b Chapter two, verse four.