Skip to content

Commit

Permalink
port commit a9058ce, support for non-verse text segments in Scripture corpora
Browse files Browse the repository at this point in the history
  • Loading branch information
mshannon-sil committed Aug 13, 2024
1 parent b90d640 commit 98e645f
Show file tree
Hide file tree
Showing 33 changed files with 1,380 additions and 487 deletions.
10 changes: 10 additions & 0 deletions machine/corpora/dictionary_text_corpus.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from typing import Iterable, Optional, overload

from ..scripture.verse_ref import Versification
from .text import Text
from .text_corpus import TextCorpus

Expand All @@ -21,6 +22,7 @@ def __init__(self, *args, **kwargs) -> None:
texts = args[0]
self._texts = {t.id: t for t in texts}
self._is_tokenized = False
self._versification = None

@property
def texts(self) -> Iterable[Text]:
Expand All @@ -34,6 +36,14 @@ def is_tokenized(self) -> bool:
def is_tokenized(self, value: bool) -> None:
self._is_tokenized = value

@property
def versification(self) -> Optional[Versification]:
    """Return the versification stored on this corpus.

    The value is whatever was last assigned through the setter; it is
    ``None`` until something has been assigned.
    """
    return self._versification

@versification.setter
def versification(self, value: Versification) -> None:
    """Store ``value`` as this corpus's versification (no validation is performed)."""
    self._versification = value

def __getitem__(self, id: str) -> Optional[Text]:
    """Look up a text by its id.

    Returns ``None`` (rather than raising ``KeyError``) when no text with
    that id is stored, because the lookup goes through ``dict.get``.
    """
    return self._texts.get(id)

Expand Down
5 changes: 5 additions & 0 deletions machine/corpora/flatten.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from itertools import chain
from typing import Generator, Iterable, List, Optional, cast, overload

from ..scripture.verse_ref import Versification
from .alignment_collection import AlignmentCollection
from .alignment_corpus import AlignmentCorpus
from .alignment_row import AlignmentRow
Expand Down Expand Up @@ -54,6 +55,10 @@ def texts(self) -> Iterable[Text]:
def is_tokenized(self) -> bool:
return all(c.is_tokenized for c in self._corpora)

@property
def versification(self) -> Optional[Versification]:
    """Return the versification of the first wrapped corpus.

    Only the first corpus is consulted; the remaining corpora are not
    checked for agreement. Returns ``None`` when there are no corpora.
    """
    if len(self._corpora) == 0:
        return None
    return self._corpora[0].versification

def count(self, include_empty: bool = True) -> int:
    """Return the combined row count of all wrapped corpora.

    ``include_empty`` is forwarded unchanged to each corpus's own
    ``count`` method; the per-corpus results are added together.
    """
    total = 0
    for corpus in self._corpora:
        total += corpus.count(include_empty)
    return total

Expand Down
2 changes: 2 additions & 0 deletions machine/corpora/parallel_text_row.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@ def __init__(
source_flags: TextRowFlags = TextRowFlags.SENTENCE_START,
target_flags: TextRowFlags = TextRowFlags.SENTENCE_START,
) -> None:
if not text_id:
raise ValueError("A text_id must be set.")
if len(source_refs) == 0 and len(target_refs) == 0:
raise ValueError("Either a source or target ref must be set.")
self._text_id = text_id
Expand Down
3 changes: 2 additions & 1 deletion machine/corpora/paratext_backup_text_corpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@


class ParatextBackupTextCorpus(ScriptureTextCorpus):
def __init__(self, filename: StrPath, include_markers: bool = False) -> None:
def __init__(self, filename: StrPath, include_markers: bool = False, include_all_text: bool = False) -> None:
with ZipFile(filename, "r") as archive:
parser = ZipParatextProjectSettingsParser(archive)
settings = parser.parse()
Expand All @@ -28,6 +28,7 @@ def __init__(self, filename: StrPath, include_markers: bool = False) -> None:
sfm_entry.filename,
versification,
include_markers,
include_all_text,
)
)

Expand Down
11 changes: 9 additions & 2 deletions machine/corpora/paratext_text_corpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@


class ParatextTextCorpus(ScriptureTextCorpus):
def __init__(self, project_dir: StrPath, include_markers: bool = False) -> None:
def __init__(self, project_dir: StrPath, include_markers: bool = False, include_all_text: bool = False) -> None:
parser = FileParatextProjectSettingsParser(project_dir)
settings = parser.parse()

Expand All @@ -17,7 +17,14 @@ def __init__(self, project_dir: StrPath, include_markers: bool = False) -> None:
texts: List[UsfmFileText] = []
for sfm_filename in Path(project_dir).glob(f"{settings.file_name_prefix}*{settings.file_name_suffix}"):
texts.append(
UsfmFileText(settings.stylesheet, settings.encoding, sfm_filename, versification, include_markers)
UsfmFileText(
settings.stylesheet,
settings.encoding,
sfm_filename,
versification,
include_markers,
include_all_text,
)
)

super().__init__(versification, texts)
174 changes: 174 additions & 0 deletions machine/corpora/scripture_ref_usfm_parser_handler.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,174 @@
from abc import ABC
from enum import Enum, auto
from typing import List, Optional, Sequence

from ..scripture.scripture_element import ScriptureElement
from ..scripture.scripture_ref import ScriptureRef
from ..scripture.verse_ref import VerseRef, are_overlapping_verse_ranges
from .corpora_utils import merge_verse_ranges
from .usfm_parser_handler import UsfmParserHandler
from .usfm_parser_state import UsfmParserState
from .usfm_token import UsfmAttribute


class ScriptureTextType(Enum):
    """Kind of text a USFM handler is currently inside."""

    # Text outside verse text; also the value reported when no text type is active.
    NONVERSE = auto()
    # Text belonging to a verse.
    VERSE = auto()
    # Text inside a note.
    NOTE = auto()


class ScriptureRefUsfmParserHandler(UsfmParserHandler, ABC):
    """USFM parser handler that tracks the current verse reference and element path.

    While receiving parser events it maintains three pieces of state: the
    current ``VerseRef``, a stack of ``ScriptureElement`` objects describing
    the position inside the current verse, and a stack of
    ``ScriptureTextType`` values. From that state it builds ``ScriptureRef``
    objects and passes them to the ``_start_*_text`` / ``_end_*_text`` hooks,
    which are no-ops in this class (presumably meant to be overridden by
    subclasses -- confirm against the concrete handlers).
    """

    def __init__(self) -> None:
        # Verse reference last recorded by _update_verse_ref; starts as an empty VerseRef().
        self._cur_verse_ref: VerseRef = VerseRef()
        # Element path within the current verse; reset when a non-overlapping verse is recorded.
        self._cur_elements_stack: List[ScriptureElement] = []
        # Text types currently open, innermost last.
        self._cur_text_type_stack: List[ScriptureTextType] = []
        # True after a verse marker repeats the current verse ref; cleared when verse text is started.
        self._duplicate_verse: bool = False

    @property
    def _current_text_type(self) -> ScriptureTextType:
        """Return the innermost open text type, or ``NONVERSE`` when none is open."""
        return ScriptureTextType.NONVERSE if len(self._cur_text_type_stack) == 0 else self._cur_text_type_stack[-1]

    def end_usfm(self, state: UsfmParserState) -> None:
        """Close any verse text that is still open at the end of the document."""
        self._end_verse_text_wrapper(state)

    def chapter(self, state: UsfmParserState, number: str, marker: str, alt_number: str, pub_number: str) -> None:
        """Close the open verse text (if any) and record the parser's new verse reference."""
        self._end_verse_text_wrapper(state)
        self._update_verse_ref(state.verse_ref, marker)

    def verse(
        self, state: UsfmParserState, number: str, marker: str, alt_number: Optional[str], pub_number: Optional[str]
    ) -> None:
        """Handle a verse marker.

        Three cases are distinguished:

        * the parser's verse ref equals the current one: the verse is treated
          as a duplicate -- open verse text is closed and the duplicate flag set;
        * ``number`` overlaps the current verse range: the two ranges are
          merged and recorded, without closing or reopening verse text;
        * otherwise: open verse text is closed, the new ref is recorded and
          new verse text is started.
        """
        if state.verse_ref == self._cur_verse_ref:
            self._end_verse_text_wrapper(state)
            # ignore duplicate verses
            self._duplicate_verse = True
        elif are_overlapping_verse_ranges(number, self._cur_verse_ref.verse):
            # merge overlapping verse ranges in to one range
            verse_ref: VerseRef = self._cur_verse_ref.copy()
            verse_ref.verse = merge_verse_ranges(number, self._cur_verse_ref.verse)
            self._update_verse_ref(verse_ref, marker)
        else:
            self._end_verse_text_wrapper(state)
            self._update_verse_ref(state.verse_ref, marker)
            self._start_verse_text_wrapper(state)

    def start_para(
        self,
        state: UsfmParserState,
        marker: str,
        unknown: Optional[bool],
        attributes: Optional[Sequence[UsfmAttribute]],
    ) -> None:
        """Handle the start of a paragraph.

        Records the parser's verse ref if none has been recorded yet. When the
        parser reports that the paragraph is not verse text, a parent element
        is opened for it and non-verse text is started.
        """
        if self._cur_verse_ref.is_default:
            self._update_verse_ref(state.verse_ref, marker)
        if not state.is_verse_text:
            self._start_parent_element(marker)
            self._start_non_verse_text_wrapper(state)

    def end_para(self, state: UsfmParserState, marker: str) -> None:
        """Close the paragraph's parent element and non-verse text, if non-verse text is current."""
        if self._current_text_type == ScriptureTextType.NONVERSE:
            self._end_parent_element()
            self._end_non_verse_text_wrapper(state)

    def start_row(self, state: UsfmParserState, marker: str) -> None:
        """Open a parent element for a table row when the current text type is non-verse."""
        if self._current_text_type == ScriptureTextType.NONVERSE:
            self._start_parent_element(marker)

    def end_row(self, state: UsfmParserState, marker: str) -> None:
        """Close the table row's parent element when the current text type is non-verse."""
        if self._current_text_type == ScriptureTextType.NONVERSE:
            self._end_parent_element()

    def start_cell(self, state: UsfmParserState, marker: str, align: str, colspan: int) -> None:
        """Open a parent element and start non-verse text for a table cell in non-verse context."""
        if self._current_text_type == ScriptureTextType.NONVERSE:
            self._start_parent_element(marker)
            self._start_non_verse_text_wrapper(state)

    def end_cell(self, state: UsfmParserState, marker: str) -> None:
        """Close the table cell's parent element and non-verse text in non-verse context."""
        if self._current_text_type == ScriptureTextType.NONVERSE:
            self._end_parent_element()
            self._end_non_verse_text_wrapper(state)

    def start_sidebar(self, state: UsfmParserState, marker: str, category: str) -> None:
        """Open a parent element for the sidebar (done regardless of the current text type)."""
        self._start_parent_element(marker)

    def end_sidebar(self, state: UsfmParserState, marker: str, closed: bool) -> None:
        """Close the sidebar's parent element."""
        self._end_parent_element()

    def start_note(self, state: UsfmParserState, marker: str, caller: str, category: Optional[str]) -> None:
        """Advance to a new element for the note and start note text."""
        self._next_element(marker)
        self._start_note_text_wrapper(state)

    def end_note(self, state: UsfmParserState, marker: str, closed: bool) -> None:
        """End the note text started by ``start_note``."""
        self._end_note_text_wrapper(state)

    # Reference markers are deliberately a no-op in this handler.
    def ref(self, state: UsfmParserState, marker: str, display: str, target: str) -> None: ...

    # --- Hooks -----------------------------------------------------------------
    # The six methods below do nothing here. They are called by the matching
    # ``*_wrapper`` methods with the ScriptureRef(s) computed from the current state.

    def _start_verse_text(self, state: UsfmParserState, scripture_refs: Optional[List[ScriptureRef]]) -> None: ...

    def _end_verse_text(self, state: UsfmParserState, scripture_refs: List[ScriptureRef]) -> None: ...

    def _start_non_verse_text(self, state: UsfmParserState, scripture_ref: ScriptureRef) -> None: ...

    def _end_non_verse_text(self, state: UsfmParserState, scripture_ref: ScriptureRef) -> None: ...

    def _start_note_text(self, state: UsfmParserState, scripture_ref: ScriptureRef) -> None: ...

    def _end_note_text(self, state: UsfmParserState, scripture_ref: ScriptureRef) -> None: ...

    def _start_verse_text_wrapper(self, state: UsfmParserState) -> None:
        """Clear the duplicate flag, push ``VERSE`` and call the ``_start_verse_text`` hook."""
        self._duplicate_verse = False
        self._cur_text_type_stack.append(ScriptureTextType.VERSE)
        self._start_verse_text(state, self._create_verse_refs())

    def _end_verse_text_wrapper(self, state: UsfmParserState) -> None:
        """Call the ``_end_verse_text`` hook and pop the text-type stack.

        Nothing happens (including the pop) while a duplicate verse is being
        ignored or while the current verse number is 0.
        """
        if not self._duplicate_verse and self._cur_verse_ref.verse_num != 0:
            self._end_verse_text(state, self._create_verse_refs())
            self._cur_text_type_stack.pop()

    def _start_non_verse_text_wrapper(self, state: UsfmParserState) -> None:
        """Push ``NONVERSE`` and call the ``_start_non_verse_text`` hook."""
        self._cur_text_type_stack.append(ScriptureTextType.NONVERSE)
        self._start_non_verse_text(state, self._create_non_verse_ref())

    def _end_non_verse_text_wrapper(self, state: UsfmParserState) -> None:
        """Call the ``_end_non_verse_text`` hook, then pop the text-type stack."""
        self._end_non_verse_text(state, self._create_non_verse_ref())
        self._cur_text_type_stack.pop()

    def _start_note_text_wrapper(self, state: UsfmParserState) -> None:
        """Push ``NOTE`` and call the ``_start_note_text`` hook."""
        self._cur_text_type_stack.append(ScriptureTextType.NOTE)
        self._start_note_text(state, self._create_non_verse_ref())

    def _end_note_text_wrapper(self, state: UsfmParserState) -> None:
        """Call the ``_end_note_text`` hook, then pop the text-type stack."""
        self._end_note_text(state, self._create_non_verse_ref())
        self._cur_text_type_stack.pop()

    def _update_verse_ref(self, verse_ref: VerseRef, marker: str) -> None:
        """Record ``verse_ref`` (as a copy) as the current verse reference.

        When the new ref does not overlap the current one, the element stack
        is reset to a single position-0 element for ``marker``; when it does
        overlap, the existing element path is kept.
        """
        if not are_overlapping_verse_ranges(verse_ref, self._cur_verse_ref):
            self._cur_elements_stack.clear()
            self._cur_elements_stack.append(ScriptureElement(0, marker))
        self._cur_verse_ref = verse_ref.copy()

    def _next_element(self, marker: str) -> None:
        """Replace the top element with one for ``marker`` at the next position."""
        # Requires a non-empty element stack: pop() raises IndexError otherwise.
        prev_elem: ScriptureElement = self._cur_elements_stack.pop()
        self._cur_elements_stack.append(ScriptureElement(prev_elem.position + 1, marker))

    def _start_parent_element(self, marker: str) -> None:
        """Advance to a new element for ``marker`` and push a position-0 child slot under it."""
        self._next_element(marker)
        self._cur_elements_stack.append(ScriptureElement(0, marker))

    def _end_parent_element(self) -> None:
        """Pop the top of the element stack (the child slot pushed by ``_start_parent_element``)."""
        self._cur_elements_stack.pop()

    def _create_verse_refs(self) -> List[ScriptureRef]:
        """Return one ``ScriptureRef`` per verse covered by the current verse reference."""
        return (
            [ScriptureRef(v) for v in self._cur_verse_ref.all_verses()]
            if self._cur_verse_ref.has_multiple
            else [ScriptureRef(self._cur_verse_ref)]
        )

    def _create_non_verse_ref(self) -> ScriptureRef:
        """Return a ``ScriptureRef`` for the current position inside the current verse.

        For a multi-verse reference the last verse is used. The path consists
        of the stacked elements whose position is greater than 0, so the
        position-0 placeholder elements are left out.
        """
        verse_ref = (
            list(self._cur_verse_ref.all_verses())[-1] if self._cur_verse_ref.has_multiple else self._cur_verse_ref
        )
        # No need to reverse unlike in Machine, elements are already added in correct order
        path = [e for e in self._cur_elements_stack if e.position > 0]
        return ScriptureRef(verse_ref, path)
66 changes: 57 additions & 9 deletions machine/corpora/scripture_text.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from typing import Generator, List, Optional
from typing import Generator, List, Optional, Union

from ..scripture import ENGLISH_VERSIFICATION
from ..scripture.scripture_ref import ScriptureElement, ScriptureRef
from ..scripture.verse_ref import VerseRef, Versification
from ..utils.context_managed_generator import ContextManagedGenerator
from .corpora_utils import gen, get_scripture_text_sort_key
Expand All @@ -20,19 +21,46 @@ def versification(self) -> Versification:
def get_rows(self) -> ContextManagedGenerator[TextRow, None, None]:
seg_list: List[TextRow] = []
out_of_order = False
prev_verse_ref = VerseRef()
prev_scr_ref = ScriptureRef()
with super().get_rows() as rows:
for row in rows:
verse_ref: VerseRef = row.ref
scr_ref: ScriptureRef = row.ref
seg_list.append(row)
if not out_of_order and verse_ref < prev_verse_ref:
if not out_of_order and scr_ref < prev_scr_ref:
out_of_order = True
prev_verse_ref = verse_ref
prev_scr_ref = scr_ref
if out_of_order:
seg_list.sort(key=lambda r: r.ref)
return ContextManagedGenerator(gen(seg_list))

def _create_rows(
    self, ref: Union[List[ScriptureRef], VerseRef], text: str = "", is_sentence_start: bool = True
) -> Generator[TextRow, None, None]:
    """Yield the rows for ``ref``, dispatching on its type.

    A ``VerseRef`` is handled by ``_create_rows_verse_ref``; anything else is
    treated as a list of ``ScriptureRef`` and handled by
    ``_create_rows_scripture_ref``. ``text`` and ``is_sentence_start`` are
    forwarded unchanged.
    """
    make_rows = self._create_rows_verse_ref if isinstance(ref, VerseRef) else self._create_rows_scripture_ref
    yield from make_rows(ref, text, is_sentence_start)

def _create_rows_scripture_ref(
    self, scripture_refs: List[ScriptureRef], text: str = "", is_sentence_start: bool = True
) -> Generator[TextRow, None, None]:
    """Yield one row per reference in ``scripture_refs``.

    With a single reference, one row carrying ``text`` is produced, flagged
    ``SENTENCE_START`` or ``NONE`` according to ``is_sentence_start``.

    With several references, the first row carries ``text`` and is flagged
    ``IN_RANGE | RANGE_START`` (plus ``SENTENCE_START`` when requested); every
    following reference yields an empty row flagged ``IN_RANGE``.

    NOTE(review): an empty ``scripture_refs`` raises ``IndexError``; callers
    are assumed to pass at least one reference -- confirm.
    """
    if len(scripture_refs) <= 1:
        single_flags = TextRowFlags.SENTENCE_START if is_sentence_start else TextRowFlags.NONE
        yield super()._create_row(text, scripture_refs[0], single_flags)
        return

    range_start_flags: TextRowFlags = TextRowFlags.IN_RANGE | TextRowFlags.RANGE_START
    if is_sentence_start:
        range_start_flags |= TextRowFlags.SENTENCE_START

    for index, sref in enumerate(scripture_refs):
        if index == 0:
            # Only the first reference of the range carries the text.
            yield super()._create_row(text, sref, range_start_flags)
        else:
            yield self._create_empty_row(sref, TextRowFlags.IN_RANGE)

def _create_rows_verse_ref(
self, verse_ref: VerseRef, text: str = "", is_sentence_start: bool = True
) -> Generator[TextRow, None, None]:
if verse_ref.has_multiple:
Expand All @@ -42,13 +70,33 @@ def _create_rows(
flags = TextRowFlags.IN_RANGE | TextRowFlags.RANGE_START
if is_sentence_start:
flags |= TextRowFlags.SENTENCE_START
yield self._create_row(text, vref, flags)
yield super()._create_row(text, ScriptureRef(vref), flags)
first_verse = False
else:
yield self._create_empty_row(vref, TextRowFlags.IN_RANGE)
yield self._create_empty_row(ScriptureRef(vref), TextRowFlags.IN_RANGE)
else:
yield super()._create_row(
text, ScriptureRef(verse_ref), TextRowFlags.SENTENCE_START if is_sentence_start else TextRowFlags.NONE
)

def _create_row(
self,
ref: Union[ScriptureRef, VerseRef],
text: str,
is_sentence_start: bool,
elements: Optional[List[ScriptureElement]] = None,
) -> TextRow:
if isinstance(ref, VerseRef):
return super()._create_row(
text,
ScriptureRef(ref, elements),
TextRowFlags.SENTENCE_START if is_sentence_start else TextRowFlags.NONE,
)
else:
yield self._create_row(
text, verse_ref, TextRowFlags.SENTENCE_START if is_sentence_start else TextRowFlags.NONE
return super()._create_row(
text,
ref,
TextRowFlags.SENTENCE_START if is_sentence_start else TextRowFlags.NONE,
)

def _create_verse_ref(self, chapter: str, verse: str) -> VerseRef:
Expand Down
Loading

0 comments on commit 98e645f

Please sign in to comment.