From 14be7af91e81309108e2a2f95fab0f8c53d3a026 Mon Sep 17 00:00:00 2001
From: mshannon-sil
Date: Fri, 12 Jan 2024 16:21:26 -0500
Subject: [PATCH] refactor to have separate uri and folder for shared_file,
 only normalize with sacremoses for NLLB

---
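Notes:

    As an illustration only (not part of the diff): with the new split, the
    remote path is composed as uri + folder + path. A minimal sketch, assuming
    the default values from settings.yaml below and a hypothetical build id:

        shared_file_uri = "s3://aqua-ml-data"  # trailing "/" stripped by _shared_file_uri
        shared_file_folder = "production"      # trailing "/" stripped by _shared_file_folder
        path = "builds/1234/pretranslate.trg.json"  # hypothetical artifact path
        uri = f"{shared_file_uri}/{shared_file_folder}/{path}"
        # -> "s3://aqua-ml-data/production/builds/1234/pretranslate.trg.json"

    Switching environments (development, staging) now overrides only
    shared_file_folder while the bucket uri stays the same.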
 machine/jobs/clearml_shared_file_service.py  | 28 ++++++++++++++-----
 machine/jobs/settings.yaml                   |  5 +++-
 machine/jobs/shared_file_service.py          | 15 ++++------
 .../hugging_face_nmt_model_trainer.py        |  3 +-
 4 files changed, 33 insertions(+), 18 deletions(-)

diff --git a/machine/jobs/clearml_shared_file_service.py b/machine/jobs/clearml_shared_file_service.py
index 82213ab1..9b1bdb2a 100644
--- a/machine/jobs/clearml_shared_file_service.py
+++ b/machine/jobs/clearml_shared_file_service.py
@@ -12,7 +12,7 @@
 
 class ClearMLSharedFileService(SharedFileService):
     def _download_file(self, path: str, cache: bool = False) -> Path:
-        uri = f"{self._shared_file_uri}/{path}"
+        uri = f"{self._shared_file_uri}/{self._shared_file_folder}/{path}"
         local_folder: Optional[str] = None
         if not cache:
             local_folder = str(self._data_dir)
@@ -22,7 +22,7 @@ def _download_file(self, path: str, cache: bool = False) -> Path:
         return Path(file_path)
 
     def _download_folder(self, path: str, cache: bool = False) -> Path:
-        uri = f"{self._shared_file_uri}/{path}"
+        uri = f"{self._shared_file_uri}/{self._shared_file_folder}/{path}"
         local_folder: Optional[str] = None
         if not cache:
             local_folder = str(self._data_dir)
@@ -32,22 +32,36 @@ def _download_folder(self, path: str, cache: bool = False) -> Path:
         return Path(folder_path) / path
 
     def _exists_file(self, path: str) -> bool:
-        uri = f"{self._shared_file_uri}/{path}"
+        uri = f"{self._shared_file_uri}/{self._shared_file_folder}/{path}"
         return try_n_times(lambda: StorageManager.exists_file(uri))  # type: ignore
 
     def _upload_file(self, path: str, local_file_path: Path) -> None:
         final_destination = try_n_times(
-            lambda: StorageManager.upload_file(str(local_file_path), f"{self._shared_file_uri}/{path}")
+            lambda: StorageManager.upload_file(
+                str(local_file_path), f"{self._shared_file_uri}/{self._shared_file_folder}/{path}"
+            )
         )
         if final_destination is None:
-            logger.error(f"Failed to upload file {str(local_file_path)} to {self._shared_file_uri}/{path}.")
+            logger.error(
+                (
+                    f"Failed to upload file {str(local_file_path)} "
+                    f"to {self._shared_file_uri}/{self._shared_file_folder}/{path}."
+                )
+            )
 
     def _upload_folder(self, path: str, local_folder_path: Path) -> None:
         final_destination = try_n_times(
-            lambda: StorageManager.upload_folder(str(local_folder_path), f"{self._shared_file_uri}/{path}")
+            lambda: StorageManager.upload_folder(
+                str(local_folder_path), f"{self._shared_file_uri}/{self._shared_file_folder}/{path}"
+            )
         )
         if final_destination is None:
-            logger.error(f"Failed to upload folder {str(local_folder_path)} to {self._shared_file_uri}/{path}.")
+            logger.error(
+                (
+                    f"Failed to upload folder {str(local_folder_path)} "
+                    f"to {self._shared_file_uri}/{self._shared_file_folder}/{path}."
+                )
+            )
 
 
 def try_n_times(func: Callable, n=10):
diff --git a/machine/jobs/settings.yaml b/machine/jobs/settings.yaml
index 79680269..90edbc0e 100644
--- a/machine/jobs/settings.yaml
+++ b/machine/jobs/settings.yaml
@@ -1,6 +1,8 @@
 default:
   model_type: huggingface
   data_dir: ~/machine
+  shared_file_uri: s3://aqua-ml-data/
+  shared_file_folder: production
   pretranslation_batch_size: 1024
   huggingface:
     parent_model_name: facebook/nllb-200-distilled-1.3B
@@ -25,12 +27,13 @@ default:
     add_unk_src_tokens: true
     add_unk_trg_tokens: true
 development:
-  shared_file_uri: s3://aqua-ml-data/dev/
+  shared_file_folder: dev
   huggingface:
     parent_model_name: facebook/nllb-200-distilled-600M
     generate_params:
       num_beams: 1
 staging:
+  shared_file_folder: ext-qa
   huggingface:
     parent_model_name: hf-internal-testing/tiny-random-nllb
     train_params:
diff --git a/machine/jobs/shared_file_service.py b/machine/jobs/shared_file_service.py
index d53d12af..0c0cb6b3 100644
--- a/machine/jobs/shared_file_service.py
+++ b/machine/jobs/shared_file_service.py
@@ -5,7 +5,6 @@
 from typing import Any, Generator, Iterator, List, TextIO, TypedDict
 
 import json_stream
-from clearml.storage.helper import StorageHelper
 
 from ..corpora.text_corpus import TextCorpus
 from ..corpora.text_file_text_corpus import TextFileTextCorpus
@@ -64,15 +63,8 @@ def generator() -> Generator[PretranslationInfo, None, None]:
 
     @contextmanager
    def open_target_pretranslation_writer(self) -> Iterator[PretranslationWriter]:
-        def remove_prefix(text: str, prefix: str) -> str:
-            if text.startswith(prefix):
-                return text[len(prefix) :]
-            return text
-
         build_id: str = self._config.build_id
-        bucket_dir = str(Path(self._shared_file_uri) / "builds" / build_id)
-        base_url = StorageHelper._resolve_base_url(bucket_dir)
-        build_dir = self._data_dir / Path(remove_prefix(bucket_dir[len(base_url) :], "/"))
+        build_dir = self._data_dir / self._shared_file_folder / "builds" / build_id
         build_dir.mkdir(parents=True, exist_ok=True)
         target_pretranslate_path = build_dir / "pretranslate.trg.json"
         with target_pretranslate_path.open("w", encoding="utf-8", newline="\n") as file:
@@ -104,6 +96,11 @@ def _shared_file_uri(self) -> str:
         shared_file_uri: str = self._config.shared_file_uri
         return shared_file_uri.rstrip("/")
 
+    @property
+    def _shared_file_folder(self) -> str:
+        shared_file_folder: str = self._config.shared_file_folder
+        return shared_file_folder.rstrip("/")
+
     @abstractmethod
     def _download_file(self, path: str, cache: bool = False) -> Path:
         ...
diff --git a/machine/translation/huggingface/hugging_face_nmt_model_trainer.py b/machine/translation/huggingface/hugging_face_nmt_model_trainer.py
index b6b93c9f..f77f1638 100644
--- a/machine/translation/huggingface/hugging_face_nmt_model_trainer.py
+++ b/machine/translation/huggingface/hugging_face_nmt_model_trainer.py
@@ -171,7 +171,8 @@ def find_missing_characters(tokenizer: Any, train_dataset: Dataset, lang_codes:
             for lang_code in lang_codes:
                 for ex in train_dataset["translation"]:
                     charset = charset | set(ex[lang_code])
-            charset = {self._mpn.normalize(char) for char in charset}
+            if isinstance(tokenizer, (NllbTokenizerFast)):
+                charset = {self._mpn.normalize(char) for char in charset}
             charset = {tokenizer.backend_tokenizer.normalizer.normalize_str(char) for char in charset}
             charset = set(filter(None, {char.strip() for char in charset}))
             missing_characters = sorted(list(charset - vocab))
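
Notes on the trainer change (illustration only, not part of the diff): a
minimal sketch of the new conditional, assuming a hypothetical tokenizer and
sample charset; sacremoses punctuation normalization now runs only for NLLB
tokenizers, ahead of the tokenizer's own backend normalizer:

    from sacremoses import MosesPunctNormalizer
    from transformers import NllbTokenizerFast

    mpn = MosesPunctNormalizer()
    tokenizer = NllbTokenizerFast.from_pretrained("facebook/nllb-200-distilled-600M")
    charset = {"“", "”", "a"}  # hypothetical sample characters
    if isinstance(tokenizer, NllbTokenizerFast):
        # normalize punctuation with sacremoses only for NLLB
        charset = {mpn.normalize(char) for char in charset}
    # the tokenizer's backend normalizer still applies for every model type
    charset = {tokenizer.backend_tokenizer.normalizer.normalize_str(char) for char in charset}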