From 14be7af91e81309108e2a2f95fab0f8c53d3a026 Mon Sep 17 00:00:00 2001
From: mshannon-sil
Date: Fri, 12 Jan 2024 16:21:26 -0500
Subject: [PATCH] refactor to have separate uri and folder for shared_file,
 only normalize with sacremoses for NLLB

---
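Notes:

    As an illustration only (not part of the diff): with the new split, the
    remote path is composed as uri + folder + path. A minimal sketch, assuming
    the default values from settings.yaml below and a hypothetical build id:

        shared_file_uri = "s3://aqua-ml-data"  # trailing "/" stripped by _shared_file_uri
        shared_file_folder = "production"      # trailing "/" stripped by _shared_file_folder
        path = "builds/1234/pretranslate.trg.json"  # hypothetical artifact path
        uri = f"{shared_file_uri}/{shared_file_folder}/{path}"
        # -> "s3://aqua-ml-data/production/builds/1234/pretranslate.trg.json"

    Switching environments (development, staging) now overrides only
    shared_file_folder while the bucket uri stays the same.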
 machine/jobs/clearml_shared_file_service.py  | 28 ++++++++++++++-----
 machine/jobs/settings.yaml                   |  5 +++-
 machine/jobs/shared_file_service.py          | 15 ++++------
 .../hugging_face_nmt_model_trainer.py        |  3 +-
 4 files changed, 33 insertions(+), 18 deletions(-)

diff --git a/machine/jobs/clearml_shared_file_service.py b/machine/jobs/clearml_shared_file_service.py
index 82213ab1..9b1bdb2a 100644
--- a/machine/jobs/clearml_shared_file_service.py
+++ b/machine/jobs/clearml_shared_file_service.py
@@ -12,7 +12,7 @@
 
 class ClearMLSharedFileService(SharedFileService):
     def _download_file(self, path: str, cache: bool = False) -> Path:
-        uri = f"{self._shared_file_uri}/{path}"
+        uri = f"{self._shared_file_uri}/{self._shared_file_folder}/{path}"
         local_folder: Optional[str] = None
         if not cache:
             local_folder = str(self._data_dir)
@@ -22,7 +22,7 @@ def _download_file(self, path: str, cache: bool = False) -> Path:
         return Path(file_path)
 
     def _download_folder(self, path: str, cache: bool = False) -> Path:
-        uri = f"{self._shared_file_uri}/{path}"
+        uri = f"{self._shared_file_uri}/{self._shared_file_folder}/{path}"
         local_folder: Optional[str] = None
         if not cache:
             local_folder = str(self._data_dir)
@@ -32,22 +32,36 @@ def _download_folder(self, path: str, cache: bool = False) -> Path:
         return Path(folder_path) / path
 
     def _exists_file(self, path: str) -> bool:
-        uri = f"{self._shared_file_uri}/{path}"
+        uri = f"{self._shared_file_uri}/{self._shared_file_folder}/{path}"
         return try_n_times(lambda: StorageManager.exists_file(uri))  # type: ignore
 
     def _upload_file(self, path: str, local_file_path: Path) -> None:
         final_destination = try_n_times(
-            lambda: StorageManager.upload_file(str(local_file_path), f"{self._shared_file_uri}/{path}")
+            lambda: StorageManager.upload_file(
+                str(local_file_path), f"{self._shared_file_uri}/{self._shared_file_folder}/{path}"
+            )
         )
         if final_destination is None:
-            logger.error(f"Failed to upload file {str(local_file_path)} to {self._shared_file_uri}/{path}.")
+            logger.error(
+                (
+                    f"Failed to upload file {str(local_file_path)} "
+                    f"to {self._shared_file_uri}/{self._shared_file_folder}/{path}."
+                )
+            )
 
     def _upload_folder(self, path: str, local_folder_path: Path) -> None:
         final_destination = try_n_times(
-            lambda: StorageManager.upload_folder(str(local_folder_path), f"{self._shared_file_uri}/{path}")
+            lambda: StorageManager.upload_folder(
+                str(local_folder_path), f"{self._shared_file_uri}/{self._shared_file_folder}/{path}"
+            )
         )
         if final_destination is None:
-            logger.error(f"Failed to upload folder {str(local_folder_path)} to {self._shared_file_uri}/{path}.")
+            logger.error(
+                (
+                    f"Failed to upload folder {str(local_folder_path)} "
+                    f"to {self._shared_file_uri}/{self._shared_file_folder}/{path}."
+                )
+            )
 
 
 def try_n_times(func: Callable, n=10):
diff --git a/machine/jobs/settings.yaml b/machine/jobs/settings.yaml
index 79680269..90edbc0e 100644
--- a/machine/jobs/settings.yaml
+++ b/machine/jobs/settings.yaml
@@ -1,6 +1,8 @@
 default:
   model_type: huggingface
   data_dir: ~/machine
+  shared_file_uri: s3://aqua-ml-data/
+  shared_file_folder: production
   pretranslation_batch_size: 1024
   huggingface:
     parent_model_name: facebook/nllb-200-distilled-1.3B
@@ -25,12 +27,13 @@ default:
     add_unk_src_tokens: true
     add_unk_trg_tokens: true
 development:
-  shared_file_uri: s3://aqua-ml-data/dev/
+  shared_file_folder: dev
   huggingface:
     parent_model_name: facebook/nllb-200-distilled-600M
     generate_params:
       num_beams: 1
 staging:
+  shared_file_folder: ext-qa
   huggingface:
     parent_model_name: hf-internal-testing/tiny-random-nllb
     train_params:
diff --git a/machine/jobs/shared_file_service.py b/machine/jobs/shared_file_service.py
index d53d12af..0c0cb6b3 100644
--- a/machine/jobs/shared_file_service.py
+++ b/machine/jobs/shared_file_service.py
@@ -5,7 +5,6 @@
 from typing import Any, Generator, Iterator, List, TextIO, TypedDict
 
 import json_stream
-from clearml.storage.helper import StorageHelper
 
 from ..corpora.text_corpus import TextCorpus
 from ..corpora.text_file_text_corpus import TextFileTextCorpus
@@ -64,15 +63,8 @@ def generator() -> Generator[PretranslationInfo, None, None]:
 
     @contextmanager
    def open_target_pretranslation_writer(self) -> Iterator[PretranslationWriter]:
-        def remove_prefix(text: str, prefix: str) -> str:
-            if text.startswith(prefix):
-                return text[len(prefix) :]
-            return text
-
         build_id: str = self._config.build_id
-        bucket_dir = str(Path(self._shared_file_uri) / "builds" / build_id)
-        base_url = StorageHelper._resolve_base_url(bucket_dir)
-        build_dir = self._data_dir / Path(remove_prefix(bucket_dir[len(base_url) :], "/"))
+        build_dir = self._data_dir / self._shared_file_folder / "builds" / build_id
         build_dir.mkdir(parents=True, exist_ok=True)
         target_pretranslate_path = build_dir / "pretranslate.trg.json"
         with target_pretranslate_path.open("w", encoding="utf-8", newline="\n") as file:
@@ -104,6 +96,11 @@ def _shared_file_uri(self) -> str:
         shared_file_uri: str = self._config.shared_file_uri
         return shared_file_uri.rstrip("/")
 
+    @property
+    def _shared_file_folder(self) -> str:
+        shared_file_folder: str = self._config.shared_file_folder
+        return shared_file_folder.rstrip("/")
+
     @abstractmethod
     def _download_file(self, path: str, cache: bool = False) -> Path:
         ...
diff --git a/machine/translation/huggingface/hugging_face_nmt_model_trainer.py b/machine/translation/huggingface/hugging_face_nmt_model_trainer.py
index b6b93c9f..f77f1638 100644
--- a/machine/translation/huggingface/hugging_face_nmt_model_trainer.py
+++ b/machine/translation/huggingface/hugging_face_nmt_model_trainer.py
@@ -171,7 +171,8 @@ def find_missing_characters(tokenizer: Any, train_dataset: Dataset, lang_codes:
             for lang_code in lang_codes:
                 for ex in train_dataset["translation"]:
                     charset = charset | set(ex[lang_code])
-            charset = {self._mpn.normalize(char) for char in charset}
+            if isinstance(tokenizer, (NllbTokenizerFast)):
+                charset = {self._mpn.normalize(char) for char in charset}
             charset = {tokenizer.backend_tokenizer.normalizer.normalize_str(char) for char in charset}
             charset = set(filter(None, {char.strip() for char in charset}))
             missing_characters = sorted(list(charset - vocab))
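
Notes on the trainer change (illustration only, not part of the diff): a
minimal sketch of the new conditional, assuming a hypothetical tokenizer and
sample charset; sacremoses punctuation normalization now runs only for NLLB
tokenizers, ahead of the tokenizer's own backend normalizer:

    from sacremoses import MosesPunctNormalizer
    from transformers import NllbTokenizerFast

    mpn = MosesPunctNormalizer()
    tokenizer = NllbTokenizerFast.from_pretrained("facebook/nllb-200-distilled-600M")
    charset = {"“", "”", "a"}  # hypothetical sample characters
    if isinstance(tokenizer, NllbTokenizerFast):
        # normalize punctuation with sacremoses only for NLLB
        charset = {mpn.normalize(char) for char in charset}
    # the tokenizer's backend normalizer still applies for every model type
    charset = {tokenizer.backend_tokenizer.normalizer.normalize_str(char) for char in charset}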