sillsdev · mshannon-sil · Aug 21, 2024 · Aug 16, 2024 · Aug 17, 2024 · Aug 17, 2024
diff --git a/.gitignore b/.gitignore
@@ -50,6 +50,10 @@ coverage.xml
 *.py,cover
 .hypothesis/
 .pytest_cache/
+tests/testutils/data/usfm/source/*
+tests/testutils/data/usfm/target/*
+tests/testutils/data/project/*
+tests/testutils/data/pretranslations.json
 
 # Translations
 *.mo

diff --git a/.vscode/settings.json b/.vscode/settings.json
@@ -9,5 +9,12 @@
     "editor.defaultFormatter": "ms-python.black-formatter",
     "editor.formatOnSave": true
   },
-  "black-formatter.path": ["poetry", "run", "black"]
+  "black-formatter.path": [
+    "poetry",
+    "run",
+    "black"
+  ],
+  "python.analysis.extraPaths": [
+    "./tests"
+  ]
 }
diff --git a/machine/corpora/__init__.py b/machine/corpora/__init__.py
@@ -7,13 +7,17 @@
 from .dbl_bundle_text_corpus import DblBundleTextCorpus
 from .dictionary_alignment_corpus import DictionaryAlignmentCorpus
 from .dictionary_text_corpus import DictionaryTextCorpus
+from .file_paratext_project_settings_parser import FileParatextProjectSettingsParser
 from .flatten import flatten
 from .memory_alignment_collection import MemoryAlignmentCollection
 from .memory_text import MemoryText
 from .multi_key_ref import MultiKeyRef
 from .parallel_text_corpus import ParallelTextCorpus
 from .parallel_text_row import ParallelTextRow
+from .paratext_backup_terms_corpus import ParatextBackupTermsCorpus
 from .paratext_backup_text_corpus import ParatextBackupTextCorpus
+from .paratext_project_settings import ParatextProjectSettings
+from .paratext_project_settings_parser_base import ParatextProjectSettingsParserBase
 from .paratext_text_corpus import ParatextTextCorpus
 from .scripture_element import ScriptureElement
 from .scripture_ref import EMPTY_SCRIPTURE_REF, ScriptureRef
@@ -57,6 +61,8 @@
 from .usx_file_text import UsxFileText
 from .usx_file_text_corpus import UsxFileTextCorpus
 from .usx_zip_text import UsxZipText
+from .zip_paratext_project_settings_parser import ZipParatextProjectSettingsParser
+from .zip_paratext_project_settings_parser_base import ZipParatextProjectSettingsParserBase
 
 __all__ = [
     "AlignedWordPair",
@@ -72,6 +78,7 @@
     "EMPTY_SCRIPTURE_REF",
     "escape_spaces",
     "extract_scripture_corpus",
+    "FileParatextProjectSettingsParser",
     "flatten",
     "is_scripture",
     "lowercase",
@@ -85,7 +92,10 @@
     "normalize",
     "ParallelTextCorpus",
     "ParallelTextRow",
+    "ParatextBackupTermsCorpus",
     "ParatextBackupTextCorpus",
+    "ParatextProjectSettings",
+    "ParatextProjectSettingsParserBase",
     "ParatextTextCorpus",
     "parse_usfm",
     "RtlReferenceOrder",
@@ -128,4 +138,6 @@
     "UsxFileText",
     "UsxFileTextCorpus",
     "UsxZipText",
+    "ZipParatextProjectSettingsParser",
+    "ZipParatextProjectSettingsParserBase",
 ]
diff --git a/machine/corpora/paratext_backup_text_corpus.py b/machine/corpora/paratext_backup_text_corpus.py
@@ -1,8 +1,6 @@
 from typing import List
 from zipfile import ZipFile
 
-import regex as re
-
 from ..utils.typeshed import StrPath
 from .scripture_text_corpus import ScriptureTextCorpus
 from .usfm_zip_text import UsfmZipText
@@ -16,20 +14,23 @@ def __init__(self, filename: StrPath, include_markers: bool = False, include_all
             settings = parser.parse()
 
             versification = settings.versification
-            regex = re.compile(f"^{re.escape(settings.file_name_prefix)}.*{re.escape(settings.file_name_suffix)}$")
 
             texts: List[UsfmZipText] = []
-            for sfm_entry in (zi for zi in archive.filelist if regex.match(zi.filename)):
-                texts.append(
-                    UsfmZipText(
-                        settings.stylesheet,
-                        settings.encoding,
-                        filename,
-                        sfm_entry.filename,
-                        versification,
-                        include_markers,
-                        include_all_text,
+            for sfm_entry in archive.filelist:
+                book_id = settings.get_book_id(sfm_entry.filename)
+                if book_id:
+                    texts.append(
+                        UsfmZipText(
+                            settings.stylesheet,
+                            settings.encoding,
+                            book_id,
+                            filename,
+                            sfm_entry.filename,
+                            versification,
+                            include_markers,
+                            include_all_text,
+                            settings.name,
+                        )
                     )
-                )
 
         super().__init__(versification, texts)
diff --git a/machine/corpora/paratext_project_settings.py b/machine/corpora/paratext_project_settings.py
@@ -1,6 +1,7 @@
 from dataclasses import dataclass
+from typing import Optional
 
-from ..scripture.canon import book_id_to_number
+from ..scripture.canon import book_id_to_number, book_number_to_id
 from ..scripture.verse_ref import Versification
 from .usfm_stylesheet import UsfmStylesheet
 
@@ -19,6 +20,29 @@ class ParatextProjectSettings:
     biblical_terms_project_name: str
     biblical_terms_file_name: str
 
+    def get_book_id(self, file_name: str) -> Optional[str]:
+        """Returns the book id for a book file name, or None if the name doesn't match the project's pattern."""
+        if not file_name.startswith(self.file_name_prefix) or not file_name.endswith(self.file_name_suffix):
+            return None
+        # Use an explicit end index: with an empty suffix, "-len(suffix)" is "-0" and the slice would be empty.
+        book_part: str = file_name[len(self.file_name_prefix) : len(file_name) - len(self.file_name_suffix)]
+        if self.file_name_form == "MAT":
+            # The book part is the 3-character book id itself.
+            return book_part if len(book_part) == 3 else None
+        if self.file_name_form in ("40", "41"):
+            # The book part is a 2-character book code, or "100".
+            if book_part != "100" and len(book_part) != 2:
+                return None
+            try:
+                return book_number_to_id(_get_book_number(book_part))
+            except ValueError:
+                # The book part isn't a number (e.g. an unrelated file that shares the prefix and suffix).
+                return None
+        # Remaining forms: a 2-character code (or "100") followed by the 3-character book id.
+        if len(book_part) != (6 if book_part.startswith("100") else 5):
+            return None
+        return book_part[-3:]
+
     def get_book_file_name(self, book_id: str) -> str:
         if self.file_name_form == "MAT":
             book_part = book_id
@@ -42,3 +66,17 @@ def _get_book_file_name_digits(book_id: str) -> str:
     if book_num < 120:
         return f"B{book_num - 110}"
     return f"C{book_num - 120}"
+
+
+def _get_book_number(book_file_name_digits: str) -> int:
+    """Converts the digits part of a book file name (e.g. "01", "41", "A0", "C2") to a book number."""
+    # A leading "A", "B" or "C" adds 100, 110 or 120 to the number that follows it.
+    letter_offsets = {"A": 100, "B": 110, "C": 120}
+    offset = letter_offsets.get(book_file_name_digits[:1])
+    if offset is not None:
+        return offset + int(book_file_name_digits[1:])
+
+    # int() raises ValueError when the text is not a decimal number.
+    # Purely numeric codes from 40 upwards are one higher than the number they stand for.
+    number = int(book_file_name_digits)
+    return number - 1 if number >= 40 else number
diff --git a/machine/corpora/paratext_text_corpus.py b/machine/corpora/paratext_text_corpus.py
@@ -16,15 +16,19 @@ def __init__(self, project_dir: StrPath, include_markers: bool = False, include_
 
         texts: List[UsfmFileText] = []
         for sfm_filename in Path(project_dir).glob(f"{settings.file_name_prefix}*{settings.file_name_suffix}"):
-            texts.append(
-                UsfmFileText(
-                    settings.stylesheet,
-                    settings.encoding,
-                    sfm_filename,
-                    versification,
-                    include_markers,
-                    include_all_text,
+            book_id = settings.get_book_id(sfm_filename.name)
+            if book_id:
+                texts.append(
+                    UsfmFileText(
+                        settings.stylesheet,
+                        settings.encoding,
+                        book_id,
+                        sfm_filename,
+                        versification,
+                        include_markers,
+                        include_all_text,
+                        settings.name,
+                    )
                 )
-            )
 
         super().__init__(versification, texts)
diff --git a/machine/corpora/scripture_element.py b/machine/corpora/scripture_element.py
@@ -1,7 +1,6 @@
 from __future__ import annotations
 
 from functools import total_ordering
-from typing import Optional
 
 from ..utils.comparable import Comparable
 
@@ -20,17 +19,27 @@ def position(self) -> int:
     def name(self) -> str:
         return self._name
 
-    def compare_to(self, other: object, strict: Optional[bool] = True) -> int:
+    def to_relaxed(self) -> ScriptureElement:
+        return ScriptureElement(0, self.name)  # keeps the name; 0 is the position compare_to() special-cases
+
+    def compare_to(self, other: object) -> int:
         if not isinstance(other, ScriptureElement):
             raise (TypeError("other is not a ScriptureElement object."))
         if self is other:
             return 0
 
-        if strict:
-            res = self.position - other.position
-            if res != 0:
-                return res
-
+        if self.position == 0 or other.position == 0:  # at least one side is relaxed (see to_relaxed)
+            if self.name == other.name:
+                return 0
+            # names differ: an element at position 0 sorts after any element with a nonzero position
+            if self.position == 0 and other.position != 0:
+                return 1
+            if other.position == 0 and self.position != 0:
+                return -1
+            return (self.name > other.name) - (self.name < other.name)  # both at position 0: order by name
+        res = self.position - other.position
+        if res != 0:
+            return res
         return (self.name > other.name) - (self.name < other.name)
 
     def __eq__(self, other: ScriptureElement) -> bool:

diff --git a/machine/corpora/scripture_ref.py b/machine/corpora/scripture_ref.py
@@ -4,7 +4,7 @@
 from typing import List, Optional
 
 from ..scripture.constants import ENGLISH_VERSIFICATION
-from ..scripture.verse_ref import VerseRef, Versification, are_overlapping_verse_ranges
+from ..scripture.verse_ref import VerseRef, Versification
 from ..utils.comparable import Comparable
 from .scripture_element import ScriptureElement
 
@@ -81,17 +81,15 @@ def is_empty(self) -> bool:
     def is_verse(self) -> bool:
-        return VerseRef.verse_num != 0 and len(self.path) == 0
+        return self.verse_ref.verse_num != 0 and len(self.path) == 0  # read this ref's verse, not the class attribute
 
+    def to_relaxed(self) -> ScriptureRef:  # same verse_ref, with to_relaxed() applied to every path element
+        return ScriptureRef(self.verse_ref, [pe.to_relaxed() for pe in self.path])
+
     def change_versification(self, versification: Versification) -> ScriptureRef:
         vr: VerseRef = self.verse_ref.copy()
         vr.change_versification(versification)
         return ScriptureRef(vr, self.path)
 
-    def overlaps(self, other: ScriptureRef) -> bool:
-        if not are_overlapping_verse_ranges(self.verse_ref, other.verse_ref):
-            return False
-        return self.path == other.path
-
-    def compare_to(self, other: object, compare_segments: bool = True, strict: bool = True):
+    def compare_to(self, other: object, compare_segments: bool = True) -> int:
         if not isinstance(other, ScriptureRef):
             raise TypeError("other is not a ScriptureRef object.")
         if self is other:
@@ -102,11 +100,14 @@ def compare_to(self, other: object, compare_segments: bool = True, strict: bool
             return res
 
         for se1, se2 in zip(self.path, other.path):
-            res = se1.compare_to(se2, strict=strict)
+            res = se1.compare_to(se2)
             if res != 0:
                 return res
-
-        return len(self.path) - len(other.path)
+        if len(self.path) < len(other.path):
+            return -1
+        elif len(self.path) > len(other.path):
+            return 1
+        return 0
 
     def __eq__(self, other: object) -> bool:
         if not isinstance(other, ScriptureRef):