From bfb44d8131ada187a93b0577322b598cd74ca1ad Mon Sep 17 00:00:00 2001 From: mshannon-sil Date: Wed, 28 Aug 2024 19:26:41 -0400 Subject: [PATCH] port commit a15d24e, fixes sillsdev/serval#449 --- .../corpora/paratext_project_text_updater_base.py | 1 - .../corpora/scripture_ref_usfm_parser_handler.py | 3 +++ tests/corpora/test_usfm_memory_text.py | 15 +++++++++++++++ 3 files changed, 18 insertions(+), 1 deletion(-) diff --git a/machine/corpora/paratext_project_text_updater_base.py b/machine/corpora/paratext_project_text_updater_base.py index f3c05bd..c637609 100644 --- a/machine/corpora/paratext_project_text_updater_base.py +++ b/machine/corpora/paratext_project_text_updater_base.py @@ -1,5 +1,4 @@ from abc import ABC, abstractmethod -from pathlib import Path from typing import BinaryIO, List, Optional, Tuple, Union from machine.corpora.paratext_project_settings import ParatextProjectSettings diff --git a/machine/corpora/scripture_ref_usfm_parser_handler.py b/machine/corpora/scripture_ref_usfm_parser_handler.py index 06dc32b..6c155c9 100644 --- a/machine/corpora/scripture_ref_usfm_parser_handler.py +++ b/machine/corpora/scripture_ref_usfm_parser_handler.py @@ -120,6 +120,9 @@ def text(self, state: UsfmParserState, text: str) -> None: if text.strip(): self._check_convert_verse_para_to_non_verse(state) + def opt_break(self, state: UsfmParserState) -> None: + self._check_convert_verse_para_to_non_verse(state) + def start_char( self, state: UsfmParserState, marker: str, unknown: bool, attributes: Optional[Sequence[UsfmAttribute]] ) -> None: diff --git a/tests/corpora/test_usfm_memory_text.py b/tests/corpora/test_usfm_memory_text.py index 4ba6ba3..1218252 100644 --- a/tests/corpora/test_usfm_memory_text.py +++ b/tests/corpora/test_usfm_memory_text.py @@ -74,6 +74,21 @@ def test_get_rows_triplicate_verse() -> None: assert rows[4].text == "Second verse" +def test_get_rows_opt_break_middle_include_markers() -> None: + rows: List[TextRow] = get_rows( + r"""\id MAT - Test +\c 1 +\v 1 First verse in line // More text +\c 2 +\v 1 +""", + include_all_text=True, + include_markers=True, + ) + assert len(rows) == 2, str.join(",", [tr.text for tr in rows]) + assert rows[0].text == "First verse in line // More text" + + def test_get_rows_verse_para_beginning_non_verse_segment() -> None: # a verse paragraph that begins with a non-verse segment followed by a verse segment rows: List[TextRow] = get_rows(