diff --git a/machine/corpora/usfm_text_base.py b/machine/corpora/usfm_text_base.py index 1fd96f3..c150779 100644 --- a/machine/corpora/usfm_text_base.py +++ b/machine/corpora/usfm_text_base.py @@ -2,6 +2,7 @@ from io import TextIOWrapper from typing import Generator, Iterable, List, Optional, Sequence +from ..scripture.canon import ALL_BOOK_IDS from ..scripture.verse_ref import Versification from ..utils.string_utils import has_sentence_ending from .corpora_utils import gen @@ -90,6 +91,13 @@ def __init__(self, text: UsfmTextBase) -> None: def rows(self) -> Iterable[TextRow]: return self._rows + def start_book(self, state: UsfmParserState, marker: str, code: str) -> None: + super().start_book(state, marker, code) + if code not in ALL_BOOK_IDS: + raise ValueError(f"The book {code} is not a valid book id.") + if code != self._text.id: + raise ValueError(f"The \\id marker {code} does not match the text id {self._text.id}.") + def verse( self, state: UsfmParserState, diff --git a/tests/corpora/test_scripture_text_corpus.py b/tests/corpora/test_scripture_text_corpus.py index 925c9ca..5bbbfca 100644 --- a/tests/corpora/test_scripture_text_corpus.py +++ b/tests/corpora/test_scripture_text_corpus.py @@ -1,4 +1,5 @@ -from testutils.corpora_test_helpers import USFM_TEST_PROJECT_PATH +from pytest import raises +from testutils.corpora_test_helpers import USFM_MISMATCH_ID_PROJECT_PATH, USFM_TEST_PROJECT_PATH from machine.corpora import ParatextTextCorpus, extract_scripture_corpus from machine.scripture import ORIGINAL_VERSIFICATION, VerseRef @@ -59,3 +60,14 @@ def test_extract_scripture_corpus() -> None: assert text == "" assert orig_vref.exact_equals(VerseRef.from_string("MAT 2:12", ORIGINAL_VERSIFICATION)) assert corpus_vref is not None and corpus_vref.exact_equals(VerseRef.from_string("MAT 2:12", corpus.versification)) + + +def test_extract_scripture_corpus_mismatch_id() -> None: + corpus = ParatextTextCorpus(USFM_MISMATCH_ID_PROJECT_PATH, include_all_text=True) + + with raises( + RuntimeError, + match=r"An error occurred while parsing the text 'JDG' in project mismatch_id. " + r"Verse: JUD 1:0, line: 1, character: 1, error: 'The \\id marker JUD does not match the text id JDG.'", + ): + list(extract_scripture_corpus(corpus)) diff --git a/tests/corpora/test_usfm_file_text.py b/tests/corpora/test_usfm_file_text.py index 383b95a..3f87fd3 100644 --- a/tests/corpora/test_usfm_file_text.py +++ b/tests/corpora/test_usfm_file_text.py @@ -1,4 +1,5 @@ -from testutils.corpora_test_helpers import USFM_TEST_PROJECT_PATH, scripture_ref +from pytest import raises +from testutils.corpora_test_helpers import USFM_INVALID_ID_PROJECT_PATH, USFM_TEST_PROJECT_PATH, scripture_ref from machine.corpora import ScriptureRef, UsfmFileTextCorpus @@ -244,6 +245,19 @@ def test_get_rows_include_markers_all_text() -> None: assert rows[26].text == "Here is some sidebar // content." +def test_get_rows_invalid_id() -> None: + corpus = UsfmFileTextCorpus(USFM_INVALID_ID_PROJECT_PATH) + + text = corpus.get_text("JGS") + assert text is not None + with raises( + RuntimeError, + match="An error occurred while parsing the text 'JGS'." + " Verse: 1:0, line: 1, character: 1, error: 'The book JGS is not a valid book id.", + ): + list(text) + + def test_usfm_file_text_corpus_lowercase_usfm_id() -> None: corpus = UsfmFileTextCorpus(USFM_TEST_PROJECT_PATH) diff --git a/tests/testutils/corpora_test_helpers.py b/tests/testutils/corpora_test_helpers.py index 4fd9341..e287560 100644 --- a/tests/testutils/corpora_test_helpers.py +++ b/tests/testutils/corpora_test_helpers.py @@ -9,6 +9,8 @@ USFM_TEST_PROJECT_PATH = TEST_DATA_PATH / "usfm" / "Tes" USFM_TARGET_PROJECT_PATH = TEST_DATA_PATH / "usfm" / "target" USFM_SOURCE_PROJECT_PATH = TEST_DATA_PATH / "usfm" / "source" +USFM_INVALID_ID_PROJECT_PATH = TEST_DATA_PATH / "usfm" / "invalid_id" +USFM_MISMATCH_ID_PROJECT_PATH = TEST_DATA_PATH / "usfm" / "mismatch_id" USX_TEST_PROJECT_PATH = TEST_DATA_PATH / "usx" / "Tes" TEXT_TEST_PROJECT_PATH = TEST_DATA_PATH / "txt" CUSTOM_VERS_PATH = TEST_DATA_PATH / "custom.vrs" diff --git a/tests/testutils/data/usfm/invalid_id/07JDG.SFM b/tests/testutils/data/usfm/invalid_id/07JDG.SFM new file mode 100644 index 0000000..6d75497 --- /dev/null +++ b/tests/testutils/data/usfm/invalid_id/07JDG.SFM @@ -0,0 +1,5 @@ +\id JGS - Test +\h Judges +\mt Judges +\c 1 +\v 1 Chapter one, verse one. \ No newline at end of file diff --git a/tests/testutils/data/usfm/invalid_id/Settings.xml b/tests/testutils/data/usfm/invalid_id/Settings.xml new file mode 100644 index 0000000..aa24e29 --- /dev/null +++ b/tests/testutils/data/usfm/invalid_id/Settings.xml @@ -0,0 +1,34 @@ + + usfm.sty + 4 + en::: + English + 8.0.100.76 + Test + 65001 + T + + NFC + invalid_id + a7e0b3ce0200736062f9f810a444dbfbe64aca35 + Charis SIL + 12 + + + + 41MAT + + .SFM + Major::BiblicalTerms.xml + F + F + F + Public + Standard:: + + 3 + 000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 + 000000000000000000000000000000000000001100000000000000000000000000000000000000000000000000000000000000000000000000000000000 + + + \ No newline at end of file diff --git a/tests/testutils/data/usfm/invalid_id/custom.vrs b/tests/testutils/data/usfm/invalid_id/custom.vrs new file mode 100644 index 0000000..fb315af --- /dev/null +++ b/tests/testutils/data/usfm/invalid_id/custom.vrs @@ -0,0 +1,31 @@ +# custom.vrs + +LEV 14:56 +ROM 14:26 +REV 12:17 +TOB 5:22 +TOB 10:12 +SIR 23:28 +ESG 1:22 +ESG 3:15 +ESG 5:14 +ESG 8:17 +ESG 10:14 +SIR 33:33 +SIR 41:24 +BAR 1:22 +4MA 7:25 +4MA 12:20 + +# deliberately missing verses +-ROM 16:26 +-ROM 16:27 +-3JN 1:15 +-S3Y 1:49 +-ESG 4:6 +-ESG 9:5 +-ESG 9:30 + +LEV 14:55 = LEV 14:55 +LEV 14:55 = LEV 14:56 +LEV 14:56 = LEV 14:57 \ No newline at end of file diff --git a/tests/testutils/data/usfm/mismatch_id/07JDG.SFM b/tests/testutils/data/usfm/mismatch_id/07JDG.SFM new file mode 100644 index 0000000..1959177 --- /dev/null +++ b/tests/testutils/data/usfm/mismatch_id/07JDG.SFM @@ -0,0 +1,5 @@ +\id JUD - Test +\h Judges +\mt Judges +\c 1 +\v 1 Chapter one, verse one. \ No newline at end of file diff --git a/tests/testutils/data/usfm/mismatch_id/Settings.xml b/tests/testutils/data/usfm/mismatch_id/Settings.xml new file mode 100644 index 0000000..5e09b68 --- /dev/null +++ b/tests/testutils/data/usfm/mismatch_id/Settings.xml @@ -0,0 +1,34 @@ + + usfm.sty + 4 + en::: + English + 8.0.100.76 + Test + 65001 + T + + NFC + mismatch_id + a7e0b3ce0200736062f9f810a444dbfbe64aca35 + Charis SIL + 12 + + + + 41MAT + + .SFM + Major::BiblicalTerms.xml + F + F + F + Public + Standard:: + + 3 + 000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 + 000000000000000000000000000000000000001100000000000000000000000000000000000000000000000000000000000000000000000000000000000 + + + \ No newline at end of file diff --git a/tests/testutils/data/usfm/mismatch_id/custom.vrs b/tests/testutils/data/usfm/mismatch_id/custom.vrs new file mode 100644 index 0000000..fb315af --- /dev/null +++ b/tests/testutils/data/usfm/mismatch_id/custom.vrs @@ -0,0 +1,31 @@ +# custom.vrs + +LEV 14:56 +ROM 14:26 +REV 12:17 +TOB 5:22 +TOB 10:12 +SIR 23:28 +ESG 1:22 +ESG 3:15 +ESG 5:14 +ESG 8:17 +ESG 10:14 +SIR 33:33 +SIR 41:24 +BAR 1:22 +4MA 7:25 +4MA 12:20 + +# deliberately missing verses +-ROM 16:26 +-ROM 16:27 +-3JN 1:15 +-S3Y 1:49 +-ESG 4:6 +-ESG 9:5 +-ESG 9:30 + +LEV 14:55 = LEV 14:55 +LEV 14:55 = LEV 14:56 +LEV 14:56 = LEV 14:57 \ No newline at end of file