Skip to content

Commit

Permalink
port commit f4b27e5, fix crash when parsing duplicate verses separated by a paragraph mark
Browse files Browse the repository at this point in the history
  • Loading branch information
mshannon-sil committed Aug 19, 2024
1 parent 2edce37 commit 5ec94f7
Show file tree
Hide file tree
Showing 3 changed files with 6 additions and 3 deletions.
5 changes: 3 additions & 2 deletions machine/corpora/scripture_ref_usfm_parser_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ def verse(
self, state: UsfmParserState, number: str, marker: str, alt_number: Optional[str], pub_number: Optional[str]
) -> None:
if state.verse_ref == self._cur_verse_ref:
self._end_verse_text_wrapper(state)
self._end_verse_text(state, self._create_verse_refs())
# ignore duplicate verses
self._duplicate_verse = True
elif are_overlapping_verse_ranges(number, self._cur_verse_ref.verse):
Expand All @@ -61,7 +61,7 @@ def start_para(
) -> None:
if self._cur_verse_ref.is_default:
self._update_verse_ref(state.verse_ref, marker)
if not state.is_verse_text or self._current_text_type == ScriptureTextType.NONVERSE:
if not state.is_verse_text or marker == "d":
self._start_parent_element(marker)
self._start_non_verse_text_wrapper(state)

Expand Down Expand Up @@ -123,6 +123,7 @@ def _start_verse_text_wrapper(self, state: UsfmParserState) -> None:
def _end_verse_text_wrapper(self, state: UsfmParserState) -> None:
if not self._duplicate_verse and self._cur_verse_ref.verse_num > 0:
self._end_verse_text(state, self._create_verse_refs())
if self._cur_verse_ref.verse_num > 0:
self._cur_text_type_stack.pop()

def _start_non_verse_text_wrapper(self, state: UsfmParserState) -> None:
Expand Down
2 changes: 1 addition & 1 deletion tests/corpora/test_usfm_tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ def test_tokenize() -> None:
usfm = _read_usfm()
usfm_tokenizer = UsfmTokenizer()
tokens = usfm_tokenizer.tokenize(usfm)
assert len(tokens) == 202
assert len(tokens) == 204

assert tokens[0].type is UsfmTokenType.BOOK
assert tokens[0].marker == "id"
Expand Down
2 changes: 2 additions & 0 deletions tests/testutils/data/usfm/Tes/41MATTes.SFM
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,9 @@
\v 4b Chapter two, verse four.
\p
\v 6 Chapter two, verse \w six|strong="12345" \w*.
\p
\v 6 Bad verse.
\p
\v 5 Chapter two, verse five \rq (MAT 3:1)\rq*.
\v 7a Chapter two, verse seven A,
\s Section header \ts-s\*
Expand Down

0 comments on commit 5ec94f7

Please sign in to comment.