Skip to content

Commit

Permalink
Config passing serval #45 (#32)
Browse files Browse the repository at this point in the history
* add build_options argument, move max_steps and model_type inside build_options, only infer when no training data exists

* update poetry.lock, launch.json

* try adding setuptools to poetry

* run dep installation twice as a workaround

* add setuptools to pyproject.toml

* update poetry to 1.6.1

* remove setuptools from pyproject.toml

* remove setuptools from lock file

* revert to last passing commit

* move model_type back out of build_options

* Remove setuptools from pyproject.toml

* change default build_options to str type, custom error message for invalid json in build_options, fix training data check

* log message when skipping training

---------

Co-authored-by: Damien Daspit <[email protected]>
  • Loading branch information
mshannon-sil and ddaspit authored Oct 4, 2023
1 parent 133d6ee commit 70ec8ed
Show file tree
Hide file tree
Showing 12 changed files with 81 additions and 212 deletions.
12 changes: 11 additions & 1 deletion .devcontainer/devcontainer.json
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,16 @@
"vscode": {
// Set *default* container specific settings.json values on container create.
"settings": {
"python.defaultInterpreterPath": "/usr/bin/python"
"python.defaultInterpreterPath": "/usr/bin/python",
"[python]": {
"editor.defaultFormatter": "ms-python.black-formatter",
"editor.codeActionsOnSave": {
"source.organizeImports": true
}
},
"editor.formatOnSave": true,
"editor.formatOnType": true,
"isort.args":["--profile", "black"]
},
// Add the IDs of extensions you want installed when the container is created.
"extensions": [
Expand All @@ -40,6 +49,7 @@
"ms-python.flake8",
"ms-python.pylint",
"ms-python.black-formatter",
"ms-python.isort",
"eamodio.gitlens",
"donjayamanne.githistory",
"tamasfe.even-better-toml",
Expand Down
2 changes: 1 addition & 1 deletion .devcontainer/dockerfile
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#compatability with Tensorflow 2.6.0 as per https://www.tensorflow.org/install/source#gpu
ARG PYTHON_VERSION=3.8
ARG UBUNTU_VERSION=focal
ARG POETRY_VERSION=1.5
ARG POETRY_VERSION=1.6.1
ARG CUDA_VERSION=11.2.2-cudnn8-runtime-ubuntu20.04

FROM nvidia/cuda:$CUDA_VERSION
Expand Down
7 changes: 4 additions & 3 deletions .vscode/launch.json
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,9 @@
"spa_Latn",
"--trg-lang",
"eng_Latn",
"--max-steps",
"1000"
"--clearml",
"--build-options",
"{\"max_steps\": 10}"
]
},
{
Expand All @@ -40,4 +41,4 @@
"justMyCode": false
}
]
}
}
2 changes: 1 addition & 1 deletion dockerfile
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#compatability with Tensorflow 2.6.0 as per https://www.tensorflow.org/install/source#gpu
ARG PYTHON_VERSION=3.8
ARG UBUNTU_VERSION=focal
ARG POETRY_VERSION=1.5
ARG POETRY_VERSION=1.6.1
ARG CUDA_VERSION=11.2.2-cudnn8-runtime-ubuntu20.04

FROM python:$PYTHON_VERSION-slim as builder
Expand Down
9 changes: 8 additions & 1 deletion machine/jobs/build_nmt_engine.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import argparse
import json
import logging
import os
from typing import Callable, Optional, cast
Expand Down Expand Up @@ -36,6 +37,12 @@ def clearml_check_canceled() -> None:
logger.info("NMT Engine Build Job started")

SETTINGS.update(args)
try:
SETTINGS.build_options = json.loads(args["build_options"])
except ValueError as e:
raise ValueError("Build options could not be parsed: Invalid JSON") from e
if SETTINGS.build_options:
SETTINGS.update(SETTINGS.build_options)
SETTINGS.data_dir = os.path.expanduser(cast(str, SETTINGS.data_dir))

logger.info(f"Config: {SETTINGS.as_dict()}")
Expand Down Expand Up @@ -66,8 +73,8 @@ def main() -> None:
parser.add_argument("--build-id", required=True, type=str, help="Build id")
parser.add_argument("--src-lang", required=True, type=str, help="Source language tag")
parser.add_argument("--trg-lang", required=True, type=str, help="Target language tag")
parser.add_argument("--max-steps", type=int, help="Maximum number of steps")
parser.add_argument("--clearml", default=False, action="store_true", help="Initializes a ClearML task")
parser.add_argument("--build-options", default="{}", help="Build configurations")
args = parser.parse_args()

run({k: v for k, v in vars(args).items() if v is not None})
Expand Down
6 changes: 5 additions & 1 deletion machine/jobs/clearml_shared_file_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ def _download_file(self, path: str, cache: bool = False) -> Path:
local_folder: Optional[str] = None
if not cache:
local_folder = str(self._data_dir)
file_path = try_n_times(lambda: StorageManager.download_file(uri, local_folder))
file_path = try_n_times(lambda: StorageManager.download_file(uri, local_folder, skip_zero_size_check=True))
if file_path is None:
raise RuntimeError(f"Failed to download file: {uri}")
return Path(file_path)
Expand All @@ -31,6 +31,10 @@ def _download_folder(self, path: str, cache: bool = False) -> Path:
raise RuntimeError(f"Failed to download folder: {uri}")
return Path(folder_path) / path

def _exists_file(self, path: str) -> bool:
    """Return True if *path* exists in the shared file store.

    The check is delegated to ClearML's StorageManager and wrapped in
    try_n_times — presumably to retry transient storage failures (TODO
    confirm retry semantics against the try_n_times helper).
    """
    uri = f"{self._shared_file_uri}/{path}"
    # type: ignore — exists_file's return annotation doesn't match bool here.
    return try_n_times(lambda: StorageManager.exists_file(uri))  # type: ignore

def _upload_file(self, path: str, local_file_path: Path) -> None:
final_destination = try_n_times(
lambda: StorageManager.upload_file(str(local_file_path), f"{self._shared_file_uri}/{path}")
Expand Down
41 changes: 22 additions & 19 deletions machine/jobs/nmt_engine_build_job.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,30 +27,33 @@ def run(self, check_canceled: Optional[Callable[[], None]] = None) -> None:
target_corpus = self._shared_file_service.create_target_corpus()
parallel_corpus = source_corpus.align_rows(target_corpus)

if check_canceled is not None:
check_canceled()

if self._nmt_model_factory.train_tokenizer:
logger.info("Training source tokenizer")
with self._nmt_model_factory.create_source_tokenizer_trainer(source_corpus) as source_tokenizer_trainer:
source_tokenizer_trainer.train(check_canceled=check_canceled)
source_tokenizer_trainer.save()

if parallel_corpus.count(include_empty=False):
if check_canceled is not None:
check_canceled()

logger.info("Training target tokenizer")
with self._nmt_model_factory.create_target_tokenizer_trainer(target_corpus) as target_tokenizer_trainer:
target_tokenizer_trainer.train(check_canceled=check_canceled)
target_tokenizer_trainer.save()
if self._nmt_model_factory.train_tokenizer:
logger.info("Training source tokenizer")
with self._nmt_model_factory.create_source_tokenizer_trainer(source_corpus) as source_tokenizer_trainer:
source_tokenizer_trainer.train(check_canceled=check_canceled)
source_tokenizer_trainer.save()

if check_canceled is not None:
check_canceled()
if check_canceled is not None:
check_canceled()

logger.info("Training target tokenizer")
with self._nmt_model_factory.create_target_tokenizer_trainer(target_corpus) as target_tokenizer_trainer:
target_tokenizer_trainer.train(check_canceled=check_canceled)
target_tokenizer_trainer.save()

if check_canceled is not None:
check_canceled()

logger.info("Training NMT model")
with self._nmt_model_factory.create_model_trainer(parallel_corpus) as model_trainer:
model_trainer.train(check_canceled=check_canceled)
model_trainer.save()
logger.info("Training NMT model")
with self._nmt_model_factory.create_model_trainer(parallel_corpus) as model_trainer:
model_trainer.train(check_canceled=check_canceled)
model_trainer.save()
else:
logger.info("No matching entries in the source and target corpus - skipping training")

if check_canceled is not None:
check_canceled()
Expand Down
1 change: 1 addition & 0 deletions machine/jobs/settings.yaml
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
default:
model_type: huggingface
max_steps: 20000
data_dir: ~/machine
batch_size: 1024
Expand Down
10 changes: 10 additions & 0 deletions machine/jobs/shared_file_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,12 @@ def create_source_corpus(self) -> TextCorpus:
def create_target_corpus(self) -> TextCorpus:
return TextFileTextCorpus(self._download_file(f"builds/{self._build_id}/train.trg.txt"))

def exists_source_corpus(self) -> bool:
    """Return True if the source-side training file for this build exists in the store."""
    return self._exists_file(f"builds/{self._build_id}/train.src.txt")

def exists_target_corpus(self) -> bool:
    """Return True if the target-side training file for this build exists in the store."""
    return self._exists_file(f"builds/{self._build_id}/train.trg.txt")

def get_source_pretranslations(self) -> ContextManagedGenerator[PretranslationInfo, None, None]:
src_pretranslate_path = self._download_file(f"builds/{self._build_id}/pretranslate.src.json")

Expand Down Expand Up @@ -98,6 +104,10 @@ def _download_file(self, path: str, cache: bool = False) -> Path:
def _download_folder(self, path: str, cache: bool = False) -> Path:
...

@abstractmethod
def _exists_file(self, path: str) -> bool:
    """Return whether *path* exists in the backing file store (implemented per backend)."""
    ...

@abstractmethod
def _upload_file(self, path: str, local_file_path: Path) -> None:
    """Upload the local file at *local_file_path* to *path* in the backing store."""
    ...
Expand Down
Loading

0 comments on commit 70ec8ed

Please sign in to comment.