sillsdev · mshannon-sil · Aug 21, 2024 · Aug 16, 2024 · Aug 17, 2024 · Aug 17, 2024
diff --git a/.gitignore b/.gitignore
@@ -50,6 +50,8 @@ coverage.xml
 *.py,cover
 .hypothesis/
 .pytest_cache/
+tests/testutils/data/usfm/source/*
+tests/testutils/data/usfm/target/*
 
 # Translations
 *.mo

diff --git a/.vscode/settings.json b/.vscode/settings.json
@@ -9,5 +9,12 @@
     "editor.defaultFormatter": "ms-python.black-formatter",
     "editor.formatOnSave": true
   },
-  "black-formatter.path": ["poetry", "run", "black"]
+  "black-formatter.path": [
+    "poetry",
+    "run",
+    "black"
+  ],
+  "python.analysis.extraPaths": [
+    "./tests"
+  ]
 }
diff --git a/machine/corpora/paratext_backup_text_corpus.py b/machine/corpora/paratext_backup_text_corpus.py
@@ -1,8 +1,6 @@
 from typing import List
 from zipfile import ZipFile
 
-import regex as re
-
 from ..utils.typeshed import StrPath
 from .scripture_text_corpus import ScriptureTextCorpus
 from .usfm_zip_text import UsfmZipText
@@ -16,20 +14,22 @@ def __init__(self, filename: StrPath, include_markers: bool = False, include_all
             settings = parser.parse()
 
             versification = settings.versification
-            regex = re.compile(f"^{re.escape(settings.file_name_prefix)}.*{re.escape(settings.file_name_suffix)}$")
 
             texts: List[UsfmZipText] = []
-            for sfm_entry in (zi for zi in archive.filelist if regex.match(zi.filename)):
-                texts.append(
-                    UsfmZipText(
-                        settings.stylesheet,
-                        settings.encoding,
-                        filename,
-                        sfm_entry.filename,
-                        versification,
-                        include_markers,
-                        include_all_text,
+            for sfm_entry in archive.filelist:
+                book_id = settings.get_book_id(sfm_entry.filename)
+                if book_id:
+                    texts.append(
+                        UsfmZipText(
+                            settings.stylesheet,
+                            settings.encoding,
+                            book_id,
+                            filename,
+                            sfm_entry.filename,
+                            versification,
+                            include_markers,
+                            include_all_text,
+                        )
                     )
-                )
 
         super().__init__(versification, texts)
diff --git a/machine/corpora/paratext_project_settings.py b/machine/corpora/paratext_project_settings.py
@@ -1,6 +1,7 @@
 from dataclasses import dataclass
+from typing import Optional
 
-from ..scripture.canon import book_id_to_number
+from ..scripture.canon import book_id_to_number, book_number_to_id
 from ..scripture.verse_ref import Versification
 from .usfm_stylesheet import UsfmStylesheet
 
@@ -19,6 +20,28 @@ class ParatextProjectSettings:
     biblical_terms_project_name: str
     biblical_terms_file_name: str
 
+    def get_book_id(self, file_name: str) -> Optional[str]:
+        """Return the book id encoded in file_name, or None if the name does not fit this project's scheme."""
+        prefix, suffix = self.file_name_prefix, self.file_name_suffix
+        if not file_name.startswith(prefix) or not file_name.endswith(suffix):
+            return None
+
+        # Use an explicit end index: "-len(suffix)" is 0 for an empty suffix and would yield an empty slice.
+        book_part: str = file_name[len(prefix) : len(file_name) - len(suffix)]
+        if self.file_name_form == "MAT":
+            return book_part if len(book_part) == 3 else None
+        if self.file_name_form in ("40", "41"):
+            if book_part != "100" and len(book_part) != 2:
+                return None
+            try:
+                book_num = _get_book_number(book_part)
+            except ValueError:  # int() rejected a non-numeric book code, so this is not a book file
+                return None
+            return book_number_to_id(book_num)
+        # Any other form: a 2-character book code (or "100") followed by a 3-character book id.
+        code_len = 3 if book_part.startswith("100") else 2
+        return book_part[code_len:] if len(book_part) == code_len + 3 else None
+
     def get_book_file_name(self, book_id: str) -> str:
         if self.file_name_form == "MAT":
             book_part = book_id
@@ -42,3 +65,17 @@ def _get_book_file_name_digits(book_id: str) -> str:
     if book_num < 120:
         return f"B{book_num - 110}"
     return f"C{book_num - 120}"
+
+
+def _get_book_number(book_file_name_digits: str) -> int:
+    """Convert the book-number part of a book file name back into a book number."""
+    # Letter-prefixed codes: "A<n>" is 100 + n, "B<n>" is 110 + n and "C<n>" is 120 + n.
+    letter_offsets = {"A": 100, "B": 110, "C": 120}
+    for letter, offset in letter_offsets.items():
+        if book_file_name_digits.startswith(letter):
+            return offset + int(book_file_name_digits[1:])
+
+    # Purely numeric codes: values of 40 and above map to the book number
+    # one lower; smaller values are the book number itself.
+    number: int = int(book_file_name_digits)
+    return number - 1 if number >= 40 else number
diff --git a/machine/corpora/paratext_text_corpus.py b/machine/corpora/paratext_text_corpus.py
@@ -16,15 +16,18 @@ def __init__(self, project_dir: StrPath, include_markers: bool = False, include_
 
         texts: List[UsfmFileText] = []
         for sfm_filename in Path(project_dir).glob(f"{settings.file_name_prefix}*{settings.file_name_suffix}"):
-            texts.append(
-                UsfmFileText(
-                    settings.stylesheet,
-                    settings.encoding,
-                    sfm_filename,
-                    versification,
-                    include_markers,
-                    include_all_text,
+            book_id = settings.get_book_id(sfm_filename.name)
+            if book_id:
+                texts.append(
+                    UsfmFileText(
+                        settings.stylesheet,
+                        settings.encoding,
+                        book_id,
+                        sfm_filename,
+                        versification,
+                        include_markers,
+                        include_all_text,
+                    )
                 )
-            )
 
         super().__init__(versification, texts)
diff --git a/machine/corpora/scripture_ref.py b/machine/corpora/scripture_ref.py
@@ -4,7 +4,7 @@
 from typing import List, Optional
 
 from ..scripture.constants import ENGLISH_VERSIFICATION
-from ..scripture.verse_ref import VerseRef, Versification, are_overlapping_verse_ranges
+from ..scripture.verse_ref import VerseRef, Versification
 from ..utils.comparable import Comparable
 from .scripture_element import ScriptureElement
 
@@ -86,11 +86,6 @@ def change_versification(self, versification: Versification) -> ScriptureRef:
         vr.change_versification(versification)
         return ScriptureRef(vr, self.path)
 
-    def overlaps(self, other: ScriptureRef) -> bool:
-        if not are_overlapping_verse_ranges(self.verse_ref, other.verse_ref):
-            return False
-        return self.path == other.path
-
     def compare_to(self, other: object, compare_segments: bool = True, strict: bool = True):
         if not isinstance(other, ScriptureRef):
             raise TypeError("other is not a ScriptureRef object.")

diff --git a/machine/corpora/scripture_ref_usfm_parser_handler.py b/machine/corpora/scripture_ref_usfm_parser_handler.py
@@ -61,7 +61,7 @@ def start_para(
     ) -> None:
         if self._cur_verse_ref.is_default:
             self._update_verse_ref(state.verse_ref, marker)
-        if not state.is_verse_text:
+        if not state.is_verse_text or self._current_text_type == ScriptureTextType.NONVERSE:
             self._start_parent_element(marker)
             self._start_non_verse_text_wrapper(state)
 
@@ -121,7 +121,7 @@ def _start_verse_text_wrapper(self, state: UsfmParserState) -> None:
         self._start_verse_text(state, self._create_verse_refs())
 
     def _end_verse_text_wrapper(self, state: UsfmParserState) -> None:
-        if not self._duplicate_verse and self._cur_verse_ref.verse_num != 0:
+        if not self._duplicate_verse and self._cur_verse_ref.verse_num > 0:
             self._end_verse_text(state, self._create_verse_refs())
             self._cur_text_type_stack.pop()
 

diff --git a/machine/corpora/usfm_file_text.py b/machine/corpora/usfm_file_text.py
@@ -14,29 +14,15 @@ def __init__(
         self,
         stylesheet: UsfmStylesheet,
         encoding: str,
+        id: str,
         filename: StrPath,
         versification: Optional[Versification] = None,
         include_markers: bool = False,
         include_all_text: bool = False,
     ) -> None:
-        super().__init__(
-            _get_id(filename, encoding), stylesheet, encoding, versification, include_markers, include_all_text
-        )
+        super().__init__(id, stylesheet, encoding, versification, include_markers, include_all_text)
 
         self._filename = Path(filename)
 
     def _create_stream_container(self) -> StreamContainer:
         return FileStreamContainer(self._filename)
-
-
-def _get_id(filename: StrPath, encoding: str) -> str:
-    with open(filename, "r", encoding=encoding) as file:
-        for line in file:
-            line = line.strip()
-            if line.startswith("\\id "):
-                id = line[4:]
-                index = id.find(" ")
-                if index != -1:
-                    id = id[:index]
-                return id.strip().upper()
-    raise RuntimeError(f"The USFM file '{filename}' does not contain an 'id' marker.")
diff --git a/machine/corpora/usfm_file_text_corpus.py b/machine/corpora/usfm_file_text_corpus.py
@@ -25,7 +25,24 @@ def __init__(
         stylesheet = UsfmStylesheet(stylesheet_filename)
         texts: List[UsfmFileText] = []
         for sfm_filename in Path(project_dir).glob(file_pattern):
-            texts.append(
-                UsfmFileText(stylesheet, encoding, sfm_filename, versification, include_markers, include_all_text)
-            )
+            id = _get_id(sfm_filename, encoding)
+            if id:
+                texts.append(
+                    UsfmFileText(
+                        stylesheet, encoding, id, sfm_filename, versification, include_markers, include_all_text
+                    )
+                )
         super().__init__(versification, texts)
+
+
+def _get_id(filename: StrPath, encoding: str) -> Optional[str]:
+    """Return the upper-cased book id from the first "\\id " line of a USFM file, or None if there is none."""
+    id_marker = "\\id "
+    with open(filename, "r", encoding=encoding) as usfm_file:
+        for raw_line in usfm_file:
+            stripped = raw_line.strip()
+            if not stripped.startswith(id_marker):
+                continue
+            # The id is the text right after the marker, up to the next space.
+            return stripped[len(id_marker) :].partition(" ")[0].strip().upper()
+    return None