sillsdev · johnml1135 · Jun 11, 2024 · May 21, 2024 · Jun 7, 2024 · Jun 10, 2024
diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json
@@ -17,7 +17,8 @@
 		"AWS_ACCESS_KEY_ID": "${localEnv:AWS_ACCESS_KEY_ID}",
 		"AWS_SECRET_ACCESS_KEY": "${localEnv:AWS_SECRET_ACCESS_KEY}",
 		"CLEARML_API_ACCESS_KEY": "${localEnv:CLEARML_API_ACCESS_KEY}",
-		"CLEARML_API_SECRET_KEY": "${localEnv:CLEARML_API_SECRET_KEY}"
+		"CLEARML_API_SECRET_KEY": "${localEnv:CLEARML_API_SECRET_KEY}",
+		"ENV_FOR_DYNACONF": "development"
 	},
 	// Features to add to the dev container. More info: https://containers.dev/features.
 	// "features": {},

diff --git a/.devcontainer/dockerfile b/.devcontainer/dockerfile
@@ -19,7 +19,7 @@ RUN apt-get update && \
     apt-get install --no-install-recommends -y \
     python$PYTHON_VERSION \
     python$PYTHON_VERSION-distutils \
-    git curl gdb ca-certificates gnupg2 tar make gcc libssl-dev zlib1g-dev libncurses5-dev \
+    git vim curl gdb ca-certificates gnupg2 tar make gcc libssl-dev zlib1g-dev libncurses5-dev \
     libbz2-dev libreadline-dev libreadline6-dev libxml2-dev xz-utils libgdbm-dev libgdbm-compat-dev tk-dev dirmngr \
     libxmlsec1-dev libsqlite3-dev libffi-dev liblzma-dev lzma lzma-dev uuid-dev && \
     rm -rf /var/lib/apt/lists/*

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -50,12 +50,12 @@ jobs:
       - name: Lint with isort
         run: poetry run isort . --check-only
       - name: Setup Node for pyright
-        uses: actions/setup-node@v3
+        uses: actions/setup-node@v4
         with:
-          node-version: "12"
+          node-version: "14"
       - name: Lint with pyright
         run: |
-          npm install -g pyright@1.1.313
+          npm install -g pyright@1.1.362
           poetry run pyright
       - name: Test with pytest
         run: poetry run pytest --cov --cov-report=xml

diff --git a/.github/workflows/docker-build-push.yml b/.github/workflows/docker-build-push.yml
@@ -5,9 +5,21 @@ on:
     tags:
       - "docker_*"
 
+env:
+  REGISTRY: ghcr.io
+  IMAGE_NAME: ${{ github.repository }}
+
 jobs:
   docker:
     runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          - dockerfile: ./dockerfile
+            image: ghcr.io/sillsdev/machine.py
+          - dockerfile: ./dockerfile.cpu_only
+            image: ghcr.io/sillsdev/machine.py.cpu_only
     steps:
       - name: Free Disk Space (Ubuntu)
         uses: jlumbroso/free-disk-space@main
@@ -21,8 +33,7 @@ jobs:
         id: meta
         uses: docker/metadata-action@v4
         with:
-          images: |
-            ghcr.io/${{ github.repository }}
+          images: ${{ matrix.image }}
           tags: |
             type=match,pattern=docker_(.*),group=1
           flavor: |
@@ -39,6 +50,7 @@ jobs:
         uses: docker/build-push-action@v4
         with:
           context: .
+          file: ${{ matrix.dockerfile }}
           push: true
           tags: ${{ steps.meta.outputs.tags }}
           labels: ${{ steps.meta.outputs.labels }}
diff --git a/.vscode/launch.json b/.vscode/launch.json
@@ -6,15 +6,15 @@
   "configurations": [
     {
       "name": "Python: Current File",
-      "type": "python",
+      "type": "debugpy",
       "request": "launch",
       "program": "${file}",
       "console": "integratedTerminal",
       "justMyCode": true
     },
     {
       "name": "build_nmt_engine",
-      "type": "python",
+      "type": "debugpy",
       "request": "launch",
       "module": "machine.jobs.build_nmt_engine",
       "justMyCode": false,
@@ -51,14 +51,29 @@
         ]
       }
     },
+    {
+      "name": "build_smt_engine",
+      "type": "debugpy",
+      "request": "launch",
+      "module": "machine.jobs.build_smt_engine",
+      "justMyCode": false,
+      "args": [
+        "--model-type",
+        "thot",
+        "--build-id",
+        "build1"
+      ]
+    },
     {
       "name": "Python: Debug Tests",
-      "type": "python",
+      "type": "debugpy",
       "request": "launch",
       "program": "${file}",
-      "purpose": ["debug-test"],
+      "purpose": [
+        "debug-test"
+      ],
       "console": "integratedTerminal",
       "justMyCode": false
     }
   ]
-}
+}
diff --git a/dockerfile b/dockerfile
@@ -40,7 +40,9 @@ RUN apt-get update && \
     apt-get install --no-install-recommends -y \
     curl \
     python$PYTHON_VERSION \
-    python$PYTHON_VERSION-distutils && \
+    python$PYTHON_VERSION-distutils \
+# these are needed for ClearML
+    git libsm6 libxext6 libxrender-dev libglib2.0-0 && \
     rm -rf /var/lib/apt/lists/* && \
     apt-get clean
 
@@ -51,9 +53,10 @@ RUN ln -sfn /usr/bin/python${PYTHON_VERSION} /usr/bin/python3  & \
     ln -sfn /usr/bin/python${PYTHON_VERSION} /usr/bin/python
 
 COPY --from=builder /src/requirements.txt .
-RUN pip install --no-cache-dir -r requirements.txt && rm requirements.txt
+RUN --mount=type=cache,target=/root/.cache \
+    pip install --no-cache-dir -r requirements.txt && rm requirements.txt
 
 COPY . .
-RUN pip install --no-deps . && rm -r *
+RUN pip install --no-deps . && rm -r /root/*
 
 CMD ["bash"]
diff --git a/dockerfile.cpu_only b/dockerfile.cpu_only
@@ -0,0 +1,41 @@
+#compatibility with TensorFlow 2.6.0 as per https://www.tensorflow.org/install/source#gpu
+ARG PYTHON_VERSION=3.11
+ARG UBUNTU_VERSION=focal
+ARG POETRY_VERSION=1.6.1
+
+FROM python:$PYTHON_VERSION-slim as builder
+ARG POETRY_VERSION
+
+ENV POETRY_HOME=/opt/poetry
+ENV POETRY_VENV=/opt/poetry-venv
+ENV POETRY_CACHE_DIR=/opt/.cache
+
+# Install poetry separated from system interpreter
+RUN python3 -m venv $POETRY_VENV \
+    && $POETRY_VENV/bin/pip install -U pip setuptools \
+    && $POETRY_VENV/bin/pip install poetry==${POETRY_VERSION}
+
+# Add `poetry` to PATH
+ENV PATH="${PATH}:${POETRY_VENV}/bin"
+
+WORKDIR /src
+COPY poetry.lock pyproject.toml /src
+RUN poetry export --with=gpu --without-hashes -f requirements.txt > requirements.txt
+
+
+FROM python:$PYTHON_VERSION
+WORKDIR /root
+
+# these are needed for ClearML
+RUN apt-get update && \
+    apt-get install --no-install-recommends -y \
+    git libsm6 libxext6 libxrender-dev libglib2.0-0
+
+COPY --from=builder /src/requirements.txt .
+RUN --mount=type=cache,target=/root/.cache \
+    pip install --no-cache-dir -r requirements.txt && rm requirements.txt
+
+COPY . .
+RUN pip install --no-deps . && rm -r /root/*
+
+CMD ["bash"]
diff --git a/machine/corpora/dbl_bundle_text_corpus.py b/machine/corpora/dbl_bundle_text_corpus.py
@@ -1,7 +1,7 @@
 import os
-import xml.etree.ElementTree as etree
 from io import TextIOWrapper
 from typing import List
+from xml.etree import ElementTree
 from zipfile import ZipFile
 
 from ..scripture import ENGLISH_VERSIFICATION
@@ -17,7 +17,7 @@ class DblBundleTextCorpus(ScriptureTextCorpus):
     def __init__(self, filename: StrPath) -> None:
         with ZipFile(filename, "r") as archive:
             with archive.open("metadata.xml", "r") as stream:
-                doc = etree.parse(stream)
+                doc = ElementTree.parse(stream)
             version = doc.getroot().get("version", "2.0")
             parts = version.split(".", maxsplit=3)
             if f"{parts[0]}.{parts[1]}" not in DblBundleTextCorpus._SUPPORTED_VERSIONS:

diff --git a/machine/corpora/parallel_text_corpus.py b/machine/corpora/parallel_text_corpus.py
@@ -11,6 +11,7 @@
     Generator,
     Iterable,
     List,
+    Literal,
     Optional,
     Sequence,
     Tuple,
@@ -185,22 +186,22 @@ def _detokenize(row: ParallelTextRow) -> ParallelTextRow:
 
         return self.transform(_detokenize, is_target_tokenized=False)
 
-    def normalize(self, normalization_form: str) -> ParallelTextCorpus:
+    def normalize(self, normalization_form: Literal["NFC", "NFD", "NFKC", "NFKD"]) -> ParallelTextCorpus:
         def _normalize(row: ParallelTextRow) -> ParallelTextRow:
             row.source_segment = normalize(normalization_form, row.source_segment)
             row.target_segment = normalize(normalization_form, row.target_segment)
             return row
 
         return self.transform(_normalize)
 
-    def normalize_source(self, normalization_form: str) -> ParallelTextCorpus:
+    def normalize_source(self, normalization_form: Literal["NFC", "NFD", "NFKC", "NFKD"]) -> ParallelTextCorpus:
         def _normalize(row: ParallelTextRow) -> ParallelTextRow:
             row.source_segment = normalize(normalization_form, row.source_segment)
             return row
 
         return self.transform(_normalize)
 
-    def normalize_target(self, normalization_form: str) -> ParallelTextCorpus:
+    def normalize_target(self, normalization_form: Literal["NFC", "NFD", "NFKC", "NFKD"]) -> ParallelTextCorpus:
         def _normalize(row: ParallelTextRow) -> ParallelTextRow:
             row.target_segment = normalize(normalization_form, row.target_segment)
             return row

diff --git a/machine/corpora/paratext_backup_terms_corpus.py b/machine/corpora/paratext_backup_terms_corpus.py
@@ -1,6 +1,6 @@
 import re
-import xml.etree.ElementTree as ET
 from typing import Dict, List, Optional
+from xml.etree import ElementTree
 from zipfile import ZipFile
 
 from .corpora_utils import get_entry
@@ -23,20 +23,20 @@ def __init__(self, filename: str, term_categories: List[str]) -> None:
             settings = settings_parser.parse()
 
             with archive.open(terms_file_entry) as key_terms_file:
-                term_renderings_tree = ET.parse(key_terms_file)
+                term_renderings_tree = ElementTree.parse(key_terms_file)
 
             biblical_terms_file_entry = get_entry(archive, settings.biblical_terms_file_name)
             if settings.biblical_terms_list_type in _PREDEFINED_TERMS_LIST_TYPES:
                 with open(settings.biblical_terms_file_name, "rb") as key_terms_file:
-                    biblical_terms_tree = ET.parse(key_terms_file)
+                    biblical_terms_tree = ElementTree.parse(key_terms_file)
                     term_id_to_category_dict = _get_category_per_id(biblical_terms_tree)
             elif (
                 settings.biblical_terms_list_type == "Project"
                 and settings.biblical_terms_project_name == settings.name
                 and biblical_terms_file_entry is not None
             ):
                 with archive.open(biblical_terms_file_entry) as key_terms_file:
-                    biblical_terms_tree = ET.parse(key_terms_file)
+                    biblical_terms_tree = ElementTree.parse(key_terms_file)
                     term_id_to_category_dict = _get_category_per_id(biblical_terms_tree)
             else:
                 term_id_to_category_dict = {}
@@ -96,7 +96,7 @@ def _strip_parens(term_string: str, left: str = "(", right: str = ")") -> str:
     return term_string
 
 
-def _get_category_per_id(biblical_terms_tree: ET.ElementTree) -> Dict[str, Optional[str]]:
+def _get_category_per_id(biblical_terms_tree: ElementTree.ElementTree) -> Dict[str, Optional[str]]:
     term_id_to_category_dict = {}
     for e in biblical_terms_tree.iter(".//Term"):
         category_element = e.find("Category")

diff --git a/machine/corpora/paratext_project_settings_parser_base.py b/machine/corpora/paratext_project_settings_parser_base.py
@@ -1,6 +1,6 @@
-import xml.etree.ElementTree as ET
 from abc import ABC, abstractmethod
 from typing import BinaryIO
+from xml.etree import ElementTree
 
 from ..scripture.verse_ref import Versification
 from ..utils.string_utils import parse_integer
@@ -30,7 +30,7 @@ def parse(self) -> ParatextProjectSettings:
         if not settings_file_name:
             raise ValueError("The project does not contain a settings file.")
         with self.open(settings_file_name) as stream:
-            settings_tree = ET.parse(stream)
+            settings_tree = ElementTree.parse(stream)
 
         name = settings_tree.getroot().findtext("Name", "")
         full_name = settings_tree.getroot().findtext("FullName", "")

diff --git a/machine/corpora/text_corpus.py b/machine/corpora/text_corpus.py
@@ -2,7 +2,7 @@
 
 from abc import abstractmethod
 from itertools import islice
-from typing import Any, Callable, Generator, Iterable, Optional, Tuple
+from typing import Any, Callable, Generator, Iterable, Literal, Optional, Tuple
 
 from ..tokenization.detokenizer import Detokenizer
 from ..tokenization.tokenizer import Tokenizer
@@ -60,7 +60,7 @@ def _detokenize(row: TextRow) -> TextRow:
 
         return self.transform(_detokenize, is_tokenized=False)
 
-    def normalize(self, normalization_form: str) -> TextCorpus:
+    def normalize(self, normalization_form: Literal["NFC", "NFD", "NFKC", "NFKD"]) -> TextCorpus:
         def _normalize(row: TextRow) -> TextRow:
             row.segment = normalize(normalization_form, row.segment)
             return row

diff --git a/machine/corpora/token_processors.py b/machine/corpora/token_processors.py
@@ -1,5 +1,5 @@
 import unicodedata
-from typing import Sequence
+from typing import Literal, Sequence
 
 
 def lowercase(tokens: Sequence[str]) -> Sequence[str]:
@@ -14,7 +14,7 @@ def unescape_spaces(tokens: Sequence[str]) -> Sequence[str]:
     return [(" " if t == "<space>" else t) for t in tokens]
 
 
-def normalize(normalization_form: str, tokens: Sequence[str]) -> Sequence[str]:
+def normalize(normalization_form: Literal["NFC", "NFD", "NFKC", "NFKD"], tokens: Sequence[str]) -> Sequence[str]:
     return [unicodedata.normalize(normalization_form, t) for t in tokens]
 
 

diff --git a/machine/corpora/usx_file_alignment_collection.py b/machine/corpora/usx_file_alignment_collection.py
@@ -1,8 +1,8 @@
-import xml.etree.ElementTree as etree
 from collections import defaultdict
 from dataclasses import dataclass, field
 from pathlib import Path
 from typing import DefaultDict, Generator, List, Optional, Sequence, Set, Tuple
+from xml.etree import ElementTree
 
 from ..annotations.range import Range
 from ..scripture.verse_ref import VerseRef, Versification
@@ -127,7 +127,7 @@ class _RangeInfo:
 
 
 def _get_links(word_tokenizer: RangeTokenizer[str, int, str], tokens: Sequence[UsxToken]) -> DefaultDict[str, Set[int]]:
-    prev_para_elem: Optional[etree.Element] = None
+    prev_para_elem: Optional[ElementTree.Element] = None
     text = ""
     link_strs: List[Tuple[Range[int], str]] = []
     for token in tokens:

diff --git a/machine/corpora/usx_token.py b/machine/corpora/usx_token.py
@@ -1,13 +1,13 @@
-import xml.etree.ElementTree as etree
 from dataclasses import dataclass
 from typing import Optional
+from xml.etree import ElementTree
 
 
 @dataclass(frozen=True)
 class UsxToken:
-    para_element: etree.Element
+    para_element: ElementTree.Element
     text: str
-    element: Optional[etree.Element]
+    element: Optional[ElementTree.Element]
 
     def __repr__(self) -> str:
         return self.text