From d3af847c6505a45472d1d9a4cd22095885f5a6be Mon Sep 17 00:00:00 2001 From: mshannon-sil <matthew_shannon@sil.org> Date: Sat, 13 Jul 2024 18:13:47 -0400 Subject: [PATCH] port commit 436a67d that moved logic to parallel text corpus --- .../corpora/standard_parallel_text_corpus.py | 28 +++++++++++++++++-- tests/corpora/test_scripture_text_corpus.py | 2 -- 2 files changed, 26 insertions(+), 4 deletions(-) diff --git a/machine/corpora/standard_parallel_text_corpus.py b/machine/corpora/standard_parallel_text_corpus.py index 303d4016..2a4ff7ab 100644 --- a/machine/corpora/standard_parallel_text_corpus.py +++ b/machine/corpora/standard_parallel_text_corpus.py @@ -14,6 +14,7 @@ from .dictionary_alignment_corpus import DictionaryAlignmentCorpus from .parallel_text_corpus import ParallelTextCorpus from .parallel_text_row import ParallelTextRow +from .scripture_text_corpus import ScriptureTextCorpus from .text_corpus import TextCorpus from .text_row import TextRow, TextRowFlags @@ -81,6 +82,12 @@ def _get_rows(self) -> Generator[ParallelTextRow, None, None]: alignment_iterator = stack.enter_context(self._alignment_corpus.get_rows(text_ids)) range_info = _RangeInfo() + if isinstance(self._target_corpus, ScriptureTextCorpus) and isinstance( + self._source_corpus, ScriptureTextCorpus + ): + range_info.versification = self._target_corpus.versification + else: + range_info.versification = None source_same_ref_rows: List[TextRow] = [] target_same_ref_rows: List[TextRow] = [] @@ -241,6 +248,16 @@ def _create_rows( else: raise ValueError("Either a source or target must be specified.") + src_refs = [] if src_row is None else [src_row.ref] + trg_refs = [] if trg_row is None else [trg_row.ref] + + if len(trg_refs) == 0 and isinstance(self._target_corpus, ScriptureTextCorpus): + for r in src_refs: + r: VerseRef + t = r.copy() + t.change_versification(self._target_corpus.versification) + trg_refs.append(t) + if src_row is None: source_flags = TextRowFlags.IN_RANGE if force_source_in_range else TextRowFlags.NONE else: @@ -253,8 +270,8 @@ def _create_rows( yield ParallelTextRow( text_id, - [] if src_row is None else [src_row.ref], - [] if trg_row is None else [trg_row.ref], + src_refs, + trg_refs, [] if src_row is None else src_row.segment, [] if trg_row is None else trg_row.segment, aligned_word_pairs, @@ -300,12 +317,19 @@ class _RangeInfo: is_target_sentence_start: bool = field(default=False, init=False) is_source_empty: bool = field(default=True, init=False) is_target_empty: bool = field(default=True, init=False) + versification: Optional[Versification] = field(default=None, init=False) @property def is_in_range(self) -> bool: return len(self.source_refs) > 0 and len(self.target_refs) > 0 def create_row(self) -> ParallelTextRow: + if len(self.target_refs) == 0 and self.versification is not None: + for r in self.source_refs: + r: VerseRef + t = r.copy() + t.change_versification(self.versification) + self.target_refs.append(t) row = ParallelTextRow( self.text_id, self.source_refs.copy(), diff --git a/tests/corpora/test_scripture_text_corpus.py b/tests/corpora/test_scripture_text_corpus.py index 486967c8..5afb7dd0 100644 --- a/tests/corpora/test_scripture_text_corpus.py +++ b/tests/corpora/test_scripture_text_corpus.py @@ -13,7 +13,6 @@ def test_extract_scripture_corpus() -> None: text, orig_vref, corpus_vref = lines[0] assert text == "" assert orig_vref.exact_equals(VerseRef.from_string("GEN 1:1", ORIGINAL_VERSIFICATION)) - assert corpus_vref is None text, orig_vref, corpus_vref = lines[3167] assert text == "Chapter fourteen, verse fifty-five. Segment b." @@ -28,7 +27,6 @@ def test_extract_scripture_corpus() -> None: text, orig_vref, corpus_vref = lines[10727] assert text == "<range>" assert orig_vref.exact_equals(VerseRef.from_string("1CH 12:4", ORIGINAL_VERSIFICATION)) - assert corpus_vref is None text, orig_vref, corpus_vref = lines[10731] assert text == "<range>"