From 84a6a3464406673ebc0087888167786f20940820 Mon Sep 17 00:00:00 2001 From: mshannon-sil Date: Mon, 11 Nov 2024 18:27:31 -0500 Subject: [PATCH] raise error when id tag doesn't match filename book id --- .../corpora/paratext_backup_text_corpus.py | 32 ++++++++++------- machine/corpora/paratext_text_corpus.py | 29 ++++++++++------ .../test_paratext_backup_text_corpus.py | 30 +++++++++++++--- tests/corpora/test_paratext_text_corpus.py | 14 ++++++++ tests/testutils/corpora_test_helpers.py | 12 +++++++ .../testutils/data/usfm/invalid_id/07JDG.SFM | 5 +++ .../data/usfm/invalid_id/Settings.xml | 34 +++++++++++++++++++ .../testutils/data/usfm/invalid_id/custom.vrs | 31 +++++++++++++++++ .../testutils/data/usfm/mismatch_id/07JDG.SFM | 5 +++ .../data/usfm/mismatch_id/Settings.xml | 34 +++++++++++++++++++ .../data/usfm/mismatch_id/custom.vrs | 31 +++++++++++++++++ 11 files changed, 230 insertions(+), 27 deletions(-) create mode 100644 tests/corpora/test_paratext_text_corpus.py create mode 100644 tests/testutils/data/usfm/invalid_id/07JDG.SFM create mode 100644 tests/testutils/data/usfm/invalid_id/Settings.xml create mode 100644 tests/testutils/data/usfm/invalid_id/custom.vrs create mode 100644 tests/testutils/data/usfm/mismatch_id/07JDG.SFM create mode 100644 tests/testutils/data/usfm/mismatch_id/Settings.xml create mode 100644 tests/testutils/data/usfm/mismatch_id/custom.vrs diff --git a/machine/corpora/paratext_backup_text_corpus.py b/machine/corpora/paratext_backup_text_corpus.py index 77d70654..34bf8f9f 100644 --- a/machine/corpora/paratext_backup_text_corpus.py +++ b/machine/corpora/paratext_backup_text_corpus.py @@ -19,18 +19,26 @@ def __init__(self, filename: StrPath, include_markers: bool = False, include_all for sfm_entry in archive.filelist: book_id = settings.get_book_id(sfm_entry.filename) if book_id: - texts.append( - UsfmZipText( - settings.stylesheet, - settings.encoding, - book_id, - filename, - sfm_entry.filename, - versification, - include_markers, - include_all_text, - settings.name, - ) + text = UsfmZipText( + settings.stylesheet, + settings.encoding, + book_id, + filename, + sfm_entry.filename, + versification, + include_markers, + include_all_text, + settings.name, ) + with text.get_rows() as rows: + row = next(rows, None) + if row and row.ref.book != book_id: + if row.ref.book == "": + raise ValueError(f"The \\id tag in {sfm_entry.filename} is invalid.") + raise ValueError( + f"The \\id tag {row.ref.book} in {sfm_entry.filename}" + f" does not match filename book id {book_id}." + ) + texts.append(text) super().__init__(versification, texts) diff --git a/machine/corpora/paratext_text_corpus.py b/machine/corpora/paratext_text_corpus.py index 24c24dd3..0831ae6a 100644 --- a/machine/corpora/paratext_text_corpus.py +++ b/machine/corpora/paratext_text_corpus.py @@ -18,17 +18,24 @@ def __init__(self, project_dir: StrPath, include_markers: bool = False, include_ for sfm_filename in Path(project_dir).glob(f"{settings.file_name_prefix}*{settings.file_name_suffix}"): book_id = settings.get_book_id(sfm_filename.name) if book_id: - texts.append( - UsfmFileText( - settings.stylesheet, - settings.encoding, - book_id, - sfm_filename, - versification, - include_markers, - include_all_text, - settings.name, - ) + text = UsfmFileText( + settings.stylesheet, + settings.encoding, + book_id, + sfm_filename, + versification, + include_markers, + include_all_text, + settings.name, ) + with text.get_rows() as rows: + row = next(rows, None) + if row and row.ref.book != book_id: + if row.ref.book == "": + raise ValueError(f"The \\id tag in {sfm_filename} is invalid.") + raise ValueError( + f"The \\id tag {row.ref.book} in {sfm_filename} does not match filename book id {book_id}." + ) + texts.append(text) super().__init__(versification, texts) diff --git a/tests/corpora/test_paratext_backup_text_corpus.py b/tests/corpora/test_paratext_backup_text_corpus.py index 57907c33..31d85bff 100644 --- a/tests/corpora/test_paratext_backup_text_corpus.py +++ b/tests/corpora/test_paratext_backup_text_corpus.py @@ -2,9 +2,14 @@ from pathlib import Path from tempfile import TemporaryDirectory -from typing import Any, ContextManager +from typing import Any, ContextManager, Optional -from testutils.corpora_test_helpers import create_test_paratext_backup +from pytest import raises +from testutils.corpora_test_helpers import ( + create_test_paratext_backup, + create_test_paratext_backup_invalid_id, + create_test_paratext_backup_mismatch_id, +) from machine.corpora import ParatextBackupTextCorpus @@ -28,10 +33,27 @@ def test_get_text() -> None: assert not any(jhn.get_rows()) +def test_invalid_id() -> None: + with raises(ValueError, match=r"The \\id tag in .* is invalid."): + with _TestEnvironment("invalid_id") as env: + env.corpus.get_text("JDG") + + +def test_mismatch_id() -> None: + with raises(ValueError, match=r"The \\id tag .* in .* does not match filename book id .*"): + with _TestEnvironment("mismatch_id") as env: + env.corpus.get_text("JDG") + + class _TestEnvironment(ContextManager["_TestEnvironment"]): - def __init__(self) -> None: + def __init__(self, project_folder_name: Optional[str] = None) -> None: self._temp_dir = TemporaryDirectory() - archive_filename = create_test_paratext_backup(Path(self._temp_dir.name)) + if project_folder_name == "invalid_id": + archive_filename = create_test_paratext_backup_invalid_id(Path(self._temp_dir.name)) + elif project_folder_name == "mismatch_id": + archive_filename = create_test_paratext_backup_mismatch_id(Path(self._temp_dir.name)) + else: + archive_filename = create_test_paratext_backup(Path(self._temp_dir.name)) self._corpus = ParatextBackupTextCorpus(archive_filename) @property diff --git a/tests/corpora/test_paratext_text_corpus.py b/tests/corpora/test_paratext_text_corpus.py new file mode 100644 index 00000000..ee3906cb --- /dev/null +++ b/tests/corpora/test_paratext_text_corpus.py @@ -0,0 +1,14 @@ +from pytest import raises +from testutils.corpora_test_helpers import USFM_INVALID_ID_PROJECT_PATH, USFM_MISMATCH_ID_PROJECT_PATH + +from machine.corpora import ParatextTextCorpus + + +def test_paratext_text_corpus_invalid_id() -> None: + with raises(ValueError, match=r"The \\id tag in .* is invalid."): + ParatextTextCorpus(USFM_INVALID_ID_PROJECT_PATH, include_all_text=True) + + +def test_paratext_text_corpus_mismatch_id() -> None: + with raises(ValueError, match=r"The \\id tag .* in .* does not match filename book id .*"): + ParatextTextCorpus(USFM_MISMATCH_ID_PROJECT_PATH, include_all_text=True) diff --git a/tests/testutils/corpora_test_helpers.py b/tests/testutils/corpora_test_helpers.py index 4fd93416..2a2fc502 100644 --- a/tests/testutils/corpora_test_helpers.py +++ b/tests/testutils/corpora_test_helpers.py @@ -9,6 +9,8 @@ USFM_TEST_PROJECT_PATH = TEST_DATA_PATH / "usfm" / "Tes" USFM_TARGET_PROJECT_PATH = TEST_DATA_PATH / "usfm" / "target" USFM_SOURCE_PROJECT_PATH = TEST_DATA_PATH / "usfm" / "source" +USFM_MISMATCH_ID_PROJECT_PATH = TEST_DATA_PATH / "usfm" / "mismatch_id" +USFM_INVALID_ID_PROJECT_PATH = TEST_DATA_PATH / "usfm" / "invalid_id" USX_TEST_PROJECT_PATH = TEST_DATA_PATH / "usx" / "Tes" TEXT_TEST_PROJECT_PATH = TEST_DATA_PATH / "txt" CUSTOM_VERS_PATH = TEST_DATA_PATH / "custom.vrs" @@ -24,6 +26,16 @@ def create_test_paratext_backup(temp_dir: Path) -> Path: return temp_dir / "Tes.zip" +def create_test_paratext_backup_invalid_id(temp_dir: Path) -> Path: + shutil.make_archive(str(temp_dir / "invalid_id"), "zip", USFM_INVALID_ID_PROJECT_PATH) + return temp_dir / "invalid_id.zip" + + +def create_test_paratext_backup_mismatch_id(temp_dir: Path) -> Path: + shutil.make_archive(str(temp_dir / "mismatch_id"), "zip", USFM_MISMATCH_ID_PROJECT_PATH) + return temp_dir / "mismatch_id.zip" + + def verse_ref(segment: TextRow) -> VerseRef: assert isinstance(segment.ref, VerseRef) return segment.ref diff --git a/tests/testutils/data/usfm/invalid_id/07JDG.SFM b/tests/testutils/data/usfm/invalid_id/07JDG.SFM new file mode 100644 index 00000000..40d866f3 --- /dev/null +++ b/tests/testutils/data/usfm/invalid_id/07JDG.SFM @@ -0,0 +1,5 @@ +\id JGS - Test +\h Judges +\mt Judges +\c 1 +\v 1 Chapter one, verse one. diff --git a/tests/testutils/data/usfm/invalid_id/Settings.xml b/tests/testutils/data/usfm/invalid_id/Settings.xml new file mode 100644 index 00000000..45cf3eab --- /dev/null +++ b/tests/testutils/data/usfm/invalid_id/Settings.xml @@ -0,0 +1,34 @@ + + usfm.sty + 4 + en::: + English + 8.0.100.76 + Test + 65001 + T + + NFC + invalid_id + a7e0b3ce0200736062f9f810a444dbfbe64aca35 + Charis SIL + 12 + + + + 41MAT + + .SFM + Major::BiblicalTerms.xml + F + F + F + Public + Standard:: + + 3 + 000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 + 000000000000000000000000000000000000001100000000000000000000000000000000000000000000000000000000000000000000000000000000000 + + + \ No newline at end of file diff --git a/tests/testutils/data/usfm/invalid_id/custom.vrs b/tests/testutils/data/usfm/invalid_id/custom.vrs new file mode 100644 index 00000000..9c1cd387 --- /dev/null +++ b/tests/testutils/data/usfm/invalid_id/custom.vrs @@ -0,0 +1,31 @@ +# custom.vrs + +LEV 14:56 +ROM 14:26 +REV 12:17 +TOB 5:22 +TOB 10:12 +SIR 23:28 +ESG 1:22 +ESG 3:15 +ESG 5:14 +ESG 8:17 +ESG 10:14 +SIR 33:33 +SIR 41:24 +BAR 1:22 +4MA 7:25 +4MA 12:20 + +# deliberately missing verses +-ROM 16:26 +-ROM 16:27 +-3JN 1:15 +-S3Y 1:49 +-ESG 4:6 +-ESG 9:5 +-ESG 9:30 + +LEV 14:55 = LEV 14:55 +LEV 14:55 = LEV 14:56 +LEV 14:56 = LEV 14:57 diff --git a/tests/testutils/data/usfm/mismatch_id/07JDG.SFM b/tests/testutils/data/usfm/mismatch_id/07JDG.SFM new file mode 100644 index 00000000..bc7c876f --- /dev/null +++ b/tests/testutils/data/usfm/mismatch_id/07JDG.SFM @@ -0,0 +1,5 @@ +\id JUD - Test +\h Judges +\mt Judges +\c 1 +\v 1 Chapter one, verse one. diff --git a/tests/testutils/data/usfm/mismatch_id/Settings.xml b/tests/testutils/data/usfm/mismatch_id/Settings.xml new file mode 100644 index 00000000..a068c35c --- /dev/null +++ b/tests/testutils/data/usfm/mismatch_id/Settings.xml @@ -0,0 +1,34 @@ + + usfm.sty + 4 + en::: + English + 8.0.100.76 + Test + 65001 + T + + NFC + mismatch_id + a7e0b3ce0200736062f9f810a444dbfbe64aca35 + Charis SIL + 12 + + + + 41MAT + + .SFM + Major::BiblicalTerms.xml + F + F + F + Public + Standard:: + + 3 + 000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 + 000000000000000000000000000000000000001100000000000000000000000000000000000000000000000000000000000000000000000000000000000 + + + \ No newline at end of file diff --git a/tests/testutils/data/usfm/mismatch_id/custom.vrs b/tests/testutils/data/usfm/mismatch_id/custom.vrs new file mode 100644 index 00000000..9c1cd387 --- /dev/null +++ b/tests/testutils/data/usfm/mismatch_id/custom.vrs @@ -0,0 +1,31 @@ +# custom.vrs + +LEV 14:56 +ROM 14:26 +REV 12:17 +TOB 5:22 +TOB 10:12 +SIR 23:28 +ESG 1:22 +ESG 3:15 +ESG 5:14 +ESG 8:17 +ESG 10:14 +SIR 33:33 +SIR 41:24 +BAR 1:22 +4MA 7:25 +4MA 12:20 + +# deliberately missing verses +-ROM 16:26 +-ROM 16:27 +-3JN 1:15 +-S3Y 1:49 +-ESG 4:6 +-ESG 9:5 +-ESG 9:30 + +LEV 14:55 = LEV 14:55 +LEV 14:55 = LEV 14:56 +LEV 14:56 = LEV 14:57