Skip to content

Commit

Permalink
port commit 436a67d that moved logic to parallel text corpus
Browse files Browse the repository at this point in the history
  • Loading branch information
mshannon-sil committed Jul 13, 2024
1 parent 9a0b3fc commit d3af847
Show file tree
Hide file tree
Showing 2 changed files with 26 additions and 4 deletions.
28 changes: 26 additions & 2 deletions machine/corpora/standard_parallel_text_corpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
from .dictionary_alignment_corpus import DictionaryAlignmentCorpus
from .parallel_text_corpus import ParallelTextCorpus
from .parallel_text_row import ParallelTextRow
from .scripture_text_corpus import ScriptureTextCorpus
from .text_corpus import TextCorpus
from .text_row import TextRow, TextRowFlags

Expand Down Expand Up @@ -81,6 +82,12 @@ def _get_rows(self) -> Generator[ParallelTextRow, None, None]:
alignment_iterator = stack.enter_context(self._alignment_corpus.get_rows(text_ids))

range_info = _RangeInfo()
if isinstance(self._target_corpus, ScriptureTextCorpus) and isinstance(
self._source_corpus, ScriptureTextCorpus
):
range_info.versification = self._target_corpus.versification
else:
range_info.versification = None
source_same_ref_rows: List[TextRow] = []
target_same_ref_rows: List[TextRow] = []

Expand Down Expand Up @@ -241,6 +248,16 @@ def _create_rows(
else:
raise ValueError("Either a source or target must be specified.")

src_refs = [] if src_row is None else [src_row.ref]
trg_refs = [] if trg_row is None else [trg_row.ref]

if len(trg_refs) == 0 and isinstance(self._target_corpus, ScriptureTextCorpus):
for r in src_refs:
r: VerseRef
t = r.copy()
t.change_versification(self._target_corpus.versification)
trg_refs.append(t)

if src_row is None:
source_flags = TextRowFlags.IN_RANGE if force_source_in_range else TextRowFlags.NONE
else:
Expand All @@ -253,8 +270,8 @@ def _create_rows(

yield ParallelTextRow(
text_id,
[] if src_row is None else [src_row.ref],
[] if trg_row is None else [trg_row.ref],
src_refs,
trg_refs,
[] if src_row is None else src_row.segment,
[] if trg_row is None else trg_row.segment,
aligned_word_pairs,
Expand Down Expand Up @@ -300,12 +317,19 @@ class _RangeInfo:
is_target_sentence_start: bool = field(default=False, init=False)
is_source_empty: bool = field(default=True, init=False)
is_target_empty: bool = field(default=True, init=False)
versification: Optional[Versification] = field(default=None, init=False)

@property
def is_in_range(self) -> bool:
    """Return True only when both ``source_refs`` and ``target_refs`` are non-empty."""
    # Guard clause: with no source references there is nothing to pair up.
    if len(self.source_refs) == 0:
        return False
    return len(self.target_refs) > 0

def create_row(self) -> ParallelTextRow:
if len(self.target_refs) == 0 and self.versification is not None:
for r in self.source_refs:
r: VerseRef
t = r.copy()
t.change_versification(self.versification)
self.target_refs.append(t)
row = ParallelTextRow(
self.text_id,
self.source_refs.copy(),
Expand Down
2 changes: 0 additions & 2 deletions tests/corpora/test_scripture_text_corpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@ def test_extract_scripture_corpus() -> None:
text, orig_vref, corpus_vref = lines[0]
assert text == ""
assert orig_vref.exact_equals(VerseRef.from_string("GEN 1:1", ORIGINAL_VERSIFICATION))
assert corpus_vref is None

text, orig_vref, corpus_vref = lines[3167]
assert text == "Chapter fourteen, verse fifty-five. Segment b."
Expand All @@ -28,7 +27,6 @@ def test_extract_scripture_corpus() -> None:
text, orig_vref, corpus_vref = lines[10727]
assert text == "<range>"
assert orig_vref.exact_equals(VerseRef.from_string("1CH 12:4", ORIGINAL_VERSIFICATION))
assert corpus_vref is None

text, orig_vref, corpus_vref = lines[10731]
assert text == "<range>"
Expand Down

0 comments on commit d3af847

Please sign in to comment.