sillsdev · mshannon-sil · Aug 21, 2024 · Aug 16, 2024 · Aug 17, 2024 · Aug 17, 2024
diff --git a/.gitignore b/.gitignore
@@ -50,6 +50,10 @@ coverage.xml
 *.py,cover
 .hypothesis/
 .pytest_cache/
+tests/testutils/data/usfm/source/*
+tests/testutils/data/usfm/target/*
+tests/testutils/data/project/*
+tests/testutils/data/pretranslations.json
 
 # Translations
 *.mo

diff --git a/.vscode/settings.json b/.vscode/settings.json
@@ -9,5 +9,12 @@
     "editor.defaultFormatter": "ms-python.black-formatter",
     "editor.formatOnSave": true
   },
-  "black-formatter.path": ["poetry", "run", "black"]
+  "black-formatter.path": [
+    "poetry",
+    "run",
+    "black"
+  ],
+  "python.analysis.extraPaths": [
+    "./tests"
+  ]
 }
diff --git a/machine/corpora/__init__.py b/machine/corpora/__init__.py
@@ -7,13 +7,17 @@
 from .dbl_bundle_text_corpus import DblBundleTextCorpus
 from .dictionary_alignment_corpus import DictionaryAlignmentCorpus
 from .dictionary_text_corpus import DictionaryTextCorpus
+from .file_paratext_project_settings_parser import FileParatextProjectSettingsParser
 from .flatten import flatten
 from .memory_alignment_collection import MemoryAlignmentCollection
 from .memory_text import MemoryText
 from .multi_key_ref import MultiKeyRef
 from .parallel_text_corpus import ParallelTextCorpus
 from .parallel_text_row import ParallelTextRow
+from .paratext_backup_terms_corpus import ParatextBackupTermsCorpus
 from .paratext_backup_text_corpus import ParatextBackupTextCorpus
+from .paratext_project_settings import ParatextProjectSettings
+from .paratext_project_settings_parser_base import ParatextProjectSettingsParserBase
 from .paratext_text_corpus import ParatextTextCorpus
 from .scripture_element import ScriptureElement
 from .scripture_ref import EMPTY_SCRIPTURE_REF, ScriptureRef
@@ -57,6 +61,8 @@
 from .usx_file_text import UsxFileText
 from .usx_file_text_corpus import UsxFileTextCorpus
 from .usx_zip_text import UsxZipText
+from .zip_paratext_project_settings_parser import ZipParatextProjectSettingsParser
+from .zip_paratext_project_settings_parser_base import ZipParatextProjectSettingsParserBase
 
 __all__ = [
     "AlignedWordPair",
@@ -72,6 +78,7 @@
     "EMPTY_SCRIPTURE_REF",
     "escape_spaces",
     "extract_scripture_corpus",
+    "FileParatextProjectSettingsParser",
     "flatten",
     "is_scripture",
     "lowercase",
@@ -85,7 +92,10 @@
     "normalize",
     "ParallelTextCorpus",
     "ParallelTextRow",
+    "ParatextBackupTermsCorpus",
     "ParatextBackupTextCorpus",
+    "ParatextProjectSettings",
+    "ParatextProjectSettingsParserBase",
     "ParatextTextCorpus",
     "parse_usfm",
     "RtlReferenceOrder",
@@ -128,4 +138,6 @@
     "UsxFileText",
     "UsxFileTextCorpus",
     "UsxZipText",
+    "ZipParatextProjectSettingsParser",
+    "ZipParatextProjectSettingsParserBase",
 ]
diff --git a/machine/corpora/paratext_backup_text_corpus.py b/machine/corpora/paratext_backup_text_corpus.py
@@ -1,8 +1,6 @@
 from typing import List
 from zipfile import ZipFile
 
-import regex as re
-
 from ..utils.typeshed import StrPath
 from .scripture_text_corpus import ScriptureTextCorpus
 from .usfm_zip_text import UsfmZipText
@@ -16,20 +14,23 @@ def __init__(self, filename: StrPath, include_markers: bool = False, include_all
             settings = parser.parse()
 
             versification = settings.versification
-            regex = re.compile(f"^{re.escape(settings.file_name_prefix)}.*{re.escape(settings.file_name_suffix)}$")
 
             texts: List[UsfmZipText] = []
-            for sfm_entry in (zi for zi in archive.filelist if regex.match(zi.filename)):
-                texts.append(
-                    UsfmZipText(
-                        settings.stylesheet,
-                        settings.encoding,
-                        filename,
-                        sfm_entry.filename,
-                        versification,
-                        include_markers,
-                        include_all_text,
+            for sfm_entry in archive.filelist:
+                book_id = settings.get_book_id(sfm_entry.filename)
+                if book_id:
+                    texts.append(
+                        UsfmZipText(
+                            settings.stylesheet,
+                            settings.encoding,
+                            book_id,
+                            filename,
+                            sfm_entry.filename,
+                            versification,
+                            include_markers,
+                            include_all_text,
+                            settings.name,
+                        )
                     )
-                )
 
         super().__init__(versification, texts)
diff --git a/machine/corpora/paratext_project_settings.py b/machine/corpora/paratext_project_settings.py
@@ -1,6 +1,7 @@
 from dataclasses import dataclass
+from typing import Optional
 
-from ..scripture.canon import book_id_to_number
+from ..scripture.canon import book_id_to_number, book_number_to_id
 from ..scripture.verse_ref import Versification
 from .usfm_stylesheet import UsfmStylesheet
 
@@ -19,6 +20,29 @@ class ParatextProjectSettings:
     biblical_terms_project_name: str
     biblical_terms_file_name: str
 
+    def get_book_id(self, file_name: str) -> Optional[str]:
+        """Extract the book ID encoded in a project book file name.
+
+        The file name must start with ``file_name_prefix`` and end with ``file_name_suffix``. The text between
+        them (the "book part") is interpreted according to ``file_name_form``:
+
+        * ``"MAT"``: the book part is a 3-character book ID.
+        * ``"40"`` or ``"41"``: the book part is a 2-character number code, or ``"100"``.
+        * any other form: the book part is 5 characters whose last 3 are the book ID, or 6 characters
+          starting with ``"100"`` whose last 3 are the book ID.
+
+        Returns None when the file name doesn't match the pattern of a book file name for the project.
+        """
+        prefix = self.file_name_prefix
+        suffix = self.file_name_suffix
+        if not file_name.startswith(prefix) or not file_name.endswith(suffix):
+            return None
+
+        # Compute the end index explicitly: a slice bound of ``-len(suffix)`` degenerates to ``-0`` (i.e. 0)
+        # when the suffix is empty, which would always produce an empty book part.
+        book_part: str = file_name[len(prefix) : len(file_name) - len(suffix)]
+        if self.file_name_form == "MAT":
+            if len(book_part) != 3:
+                return None
+            book_id = book_part
+        elif self.file_name_form in ("40", "41"):
+            if book_part != "100" and len(book_part) != 2:
+                return None
+            try:
+                book_num = _get_book_number(book_part)
+            except ValueError:
+                # The book part is not a parsable number code, so this is not a book file of the project.
+                return None
+            book_id = book_number_to_id(book_num)
+        else:
+            if book_part.startswith("100"):
+                if len(book_part) != 6:
+                    return None
+            elif len(book_part) != 5:
+                return None
+            # Drop the leading number code (2 characters, or 3 for "100") to keep the book ID.
+            book_id = book_part[2:] if len(book_part) == 5 else book_part[3:]
+        return book_id
+
     def get_book_file_name(self, book_id: str) -> str:
         if self.file_name_form == "MAT":
             book_part = book_id
@@ -42,3 +66,17 @@ def _get_book_file_name_digits(book_id: str) -> str:
     if book_num < 120:
         return f"B{book_num - 110}"
     return f"C{book_num - 120}"
+
+
+def _get_book_number(book_file_name_digits: str) -> int:
+    """Convert the number code used in a book file name into a book number.
+
+    Codes starting with ``A``, ``B`` or ``C`` map to 100, 110 or 120 plus the integer that follows the letter.
+    Any other code is parsed as an integer; values of 40 and above are reduced by one.
+    NOTE(review): presumably this mirrors ``_get_book_file_name_digits`` - confirm against that function.
+
+    Raises ValueError when the numeric portion cannot be parsed by ``int``.
+    """
+    letter_bases = {"A": 100, "B": 110, "C": 120}
+    for letter, base in letter_bases.items():
+        if book_file_name_digits.startswith(letter):
+            return base + int(book_file_name_digits[1:])
+
+    number: int = int(book_file_name_digits)
+    return number - 1 if number >= 40 else number
diff --git a/machine/corpora/paratext_text_corpus.py b/machine/corpora/paratext_text_corpus.py
@@ -16,15 +16,19 @@ def __init__(self, project_dir: StrPath, include_markers: bool = False, include_
 
         texts: List[UsfmFileText] = []
         for sfm_filename in Path(project_dir).glob(f"{settings.file_name_prefix}*{settings.file_name_suffix}"):
-            texts.append(
-                UsfmFileText(
-                    settings.stylesheet,
-                    settings.encoding,
-                    sfm_filename,
-                    versification,
-                    include_markers,
-                    include_all_text,
+            book_id = settings.get_book_id(sfm_filename.name)
+            if book_id:
+                texts.append(
+                    UsfmFileText(
+                        settings.stylesheet,
+                        settings.encoding,
+                        book_id,
+                        sfm_filename,
+                        versification,
+                        include_markers,
+                        include_all_text,
+                        settings.name,
+                    )
                 )
-            )
 
         super().__init__(versification, texts)
diff --git a/machine/corpora/scripture_ref.py b/machine/corpora/scripture_ref.py
@@ -4,7 +4,7 @@
 from typing import List, Optional
 
 from ..scripture.constants import ENGLISH_VERSIFICATION
-from ..scripture.verse_ref import VerseRef, Versification, are_overlapping_verse_ranges
+from ..scripture.verse_ref import VerseRef, Versification
 from ..utils.comparable import Comparable
 from .scripture_element import ScriptureElement
 
@@ -86,11 +86,6 @@ def change_versification(self, versification: Versification) -> ScriptureRef:
         vr.change_versification(versification)
         return ScriptureRef(vr, self.path)
 
-    def overlaps(self, other: ScriptureRef) -> bool:
-        if not are_overlapping_verse_ranges(self.verse_ref, other.verse_ref):
-            return False
-        return self.path == other.path
-
     def compare_to(self, other: object, compare_segments: bool = True, strict: bool = True):
         if not isinstance(other, ScriptureRef):
             raise TypeError("other is not a ScriptureRef object.")

diff --git a/machine/corpora/scripture_ref_usfm_parser_handler.py b/machine/corpora/scripture_ref_usfm_parser_handler.py
@@ -39,7 +39,7 @@ def verse(
         self, state: UsfmParserState, number: str, marker: str, alt_number: Optional[str], pub_number: Optional[str]
     ) -> None:
         if state.verse_ref == self._cur_verse_ref:
-            self._end_verse_text_wrapper(state)
+            self._end_verse_text(state, self._create_verse_refs())
             # ignore duplicate verses
             self._duplicate_verse = True
         elif are_overlapping_verse_ranges(number, self._cur_verse_ref.verse):
@@ -61,7 +61,7 @@ def start_para(
     ) -> None:
         if self._cur_verse_ref.is_default:
             self._update_verse_ref(state.verse_ref, marker)
-        if not state.is_verse_text:
+        if not state.is_verse_text or marker == "d":
             self._start_parent_element(marker)
             self._start_non_verse_text_wrapper(state)
 
@@ -121,8 +121,9 @@ def _start_verse_text_wrapper(self, state: UsfmParserState) -> None:
         self._start_verse_text(state, self._create_verse_refs())
 
     def _end_verse_text_wrapper(self, state: UsfmParserState) -> None:
-        if not self._duplicate_verse and self._cur_verse_ref.verse_num != 0:
+        if not self._duplicate_verse and self._cur_verse_ref.verse_num > 0:
             self._end_verse_text(state, self._create_verse_refs())
+        if self._cur_verse_ref.verse_num > 0:
             self._cur_text_type_stack.pop()
 
     def _start_non_verse_text_wrapper(self, state: UsfmParserState) -> None:

diff --git a/machine/corpora/usfm_file_text.py b/machine/corpora/usfm_file_text.py
@@ -14,29 +14,16 @@ def __init__(
         self,
         stylesheet: UsfmStylesheet,
         encoding: str,
+        id: str,
         filename: StrPath,
         versification: Optional[Versification] = None,
         include_markers: bool = False,
         include_all_text: bool = False,
+        project: Optional[str] = None,
     ) -> None:
-        super().__init__(
-            _get_id(filename, encoding), stylesheet, encoding, versification, include_markers, include_all_text
-        )
+        super().__init__(id, stylesheet, encoding, versification, include_markers, include_all_text, project)
 
         self._filename = Path(filename)
 
     def _create_stream_container(self) -> StreamContainer:
         return FileStreamContainer(self._filename)
-
-
-def _get_id(filename: StrPath, encoding: str) -> str:
-    with open(filename, "r", encoding=encoding) as file:
-        for line in file:
-            line = line.strip()
-            if line.startswith("\\id "):
-                id = line[4:]
-                index = id.find(" ")
-                if index != -1:
-                    id = id[:index]
-                return id.strip().upper()
-    raise RuntimeError(f"The USFM file '{filename}' does not contain an 'id' marker.")
diff --git a/machine/corpora/usfm_file_text_corpus.py b/machine/corpora/usfm_file_text_corpus.py
@@ -25,7 +25,24 @@ def __init__(
         stylesheet = UsfmStylesheet(stylesheet_filename)
         texts: List[UsfmFileText] = []
         for sfm_filename in Path(project_dir).glob(file_pattern):
-            texts.append(
-                UsfmFileText(stylesheet, encoding, sfm_filename, versification, include_markers, include_all_text)
-            )
+            id = _get_id(sfm_filename, encoding)
+            if id:
+                texts.append(
+                    UsfmFileText(
+                        stylesheet, encoding, id, sfm_filename, versification, include_markers, include_all_text
+                    )
+                )
         super().__init__(versification, texts)
+
+
+def _get_id(filename: StrPath, encoding: str) -> Optional[str]:
+    """Read the book ID from the first ``\\id`` marker line of a USFM file.
+
+    Each line is stripped and checked for a leading ``\\id `` marker. The ID is the text after the marker up
+    to the next space, stripped and upper-cased.
+
+    Returns None when no line starts with the marker.
+    """
+    marker = "\\id "
+    with open(filename, "r", encoding=encoding) as usfm_file:
+        for raw_line in usfm_file:
+            stripped = raw_line.strip()
+            if not stripped.startswith(marker):
+                continue
+            # Keep only the text before the first space that follows the marker.
+            book_code = stripped[len(marker) :].partition(" ")[0]
+            return book_code.strip().upper()
+    return None
diff --git a/machine/corpora/usfm_text_base.py b/machine/corpora/usfm_text_base.py
@@ -11,7 +11,7 @@
 from .scripture_text import ScriptureText
 from .stream_container import StreamContainer
 from .text_row import TextRow
-from .usfm_parser import parse_usfm
+from .usfm_parser import UsfmParser
 from .usfm_parser_state import UsfmParserState
 from .usfm_stylesheet import UsfmStylesheet
 from .usfm_token import UsfmAttribute, UsfmToken, UsfmTokenType
@@ -26,27 +26,32 @@ def __init__(
         versification: Optional[Versification],
         include_markers: bool,
         include_all_text: bool,
+        project: Optional[str] = None,
     ) -> None:
         super().__init__(id, versification)
 
         self._stylesheet = stylesheet
         self._encoding = encoding
         self._include_markers = include_markers
         self._include_all_text = include_all_text
+        self.project = project
 
     @abstractmethod
     def _create_stream_container(self) -> StreamContainer: ...
 
     def _get_rows(self) -> Generator[TextRow, None, None]:
         usfm = self._read_usfm()
         row_collector = _TextRowCollector(self)
-        parse_usfm(
-            usfm,
-            row_collector,
-            self._stylesheet,
-            self.versification,
-            preserve_whitespace=self._include_markers,
-        )
+        parser = UsfmParser(usfm, row_collector, self._stylesheet, self._versification, self._include_markers)
+        try:
+            parser.process_tokens()
+        except Exception as e:
+            error_message = (
+                f"An error occurred while parsing the text '{self.id}'"
+                f"{f' in project {self.project}' if self.project else ''}"
+                f". Verse: {parser.state.verse_ref}, offset: {parser.state.verse_offset}, error: '{e}'"
+            )
+            raise RuntimeError(error_message) from e
         return gen(row_collector.rows)
 
     def _read_usfm(self) -> str: