Skip to content

Commit

Permalink
refactor to have separate uri and folder for shared_file, only normalize with sacremoses for NLLB
Browse files Browse the repository at this point in the history
  • Loading branch information
mshannon-sil committed Jan 12, 2024
1 parent 8629f3f commit 14be7af
Show file tree
Hide file tree
Showing 4 changed files with 33 additions and 18 deletions.
28 changes: 21 additions & 7 deletions machine/jobs/clearml_shared_file_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@

class ClearMLSharedFileService(SharedFileService):
def _download_file(self, path: str, cache: bool = False) -> Path:
uri = f"{self._shared_file_uri}/{path}"
uri = f"{self._shared_file_uri}/{self._shared_file_folder}/{path}"
local_folder: Optional[str] = None
if not cache:
local_folder = str(self._data_dir)
Expand All @@ -22,7 +22,7 @@ def _download_file(self, path: str, cache: bool = False) -> Path:
return Path(file_path)

def _download_folder(self, path: str, cache: bool = False) -> Path:
uri = f"{self._shared_file_uri}/{path}"
uri = f"{self._shared_file_uri}/{self._shared_file_folder}/{path}"
local_folder: Optional[str] = None
if not cache:
local_folder = str(self._data_dir)
Expand All @@ -32,22 +32,36 @@ def _download_folder(self, path: str, cache: bool = False) -> Path:
return Path(folder_path) / path

def _exists_file(self, path: str) -> bool:
uri = f"{self._shared_file_uri}/{path}"
uri = f"{self._shared_file_uri}/{self._shared_file_folder}/{path}"
return try_n_times(lambda: StorageManager.exists_file(uri)) # type: ignore

def _upload_file(self, path: str, local_file_path: Path) -> None:
final_destination = try_n_times(
lambda: StorageManager.upload_file(str(local_file_path), f"{self._shared_file_uri}/{path}")
lambda: StorageManager.upload_file(
str(local_file_path), f"{self._shared_file_uri}/{self._shared_file_folder}/{path}"
)
)
if final_destination is None:
logger.error(f"Failed to upload file {str(local_file_path)} to {self._shared_file_uri}/{path}.")
logger.error(
(
f"Failed to upload file {str(local_file_path)} "
f"to {self._shared_file_uri}/{self._shared_file_folder}/{path}."
)
)

def _upload_folder(self, path: str, local_folder_path: Path) -> None:
final_destination = try_n_times(
lambda: StorageManager.upload_folder(str(local_folder_path), f"{self._shared_file_uri}/{path}")
lambda: StorageManager.upload_folder(
str(local_folder_path), f"{self._shared_file_uri}/{self._shared_file_folder}/{path}"
)
)
if final_destination is None:
logger.error(f"Failed to upload folder {str(local_folder_path)} to {self._shared_file_uri}/{path}.")
logger.error(
(
f"Failed to upload folder {str(local_folder_path)} "
f"to {self._shared_file_uri}/{self._shared_file_folder}/{path}."
)
)


def try_n_times(func: Callable, n=10):
Expand Down
5 changes: 4 additions & 1 deletion machine/jobs/settings.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
default:
model_type: huggingface
data_dir: ~/machine
shared_file_uri: s3://aqua-ml-data/
shared_file_folder: production
pretranslation_batch_size: 1024
huggingface:
parent_model_name: facebook/nllb-200-distilled-1.3B
Expand All @@ -25,12 +27,13 @@ default:
add_unk_src_tokens: true
add_unk_trg_tokens: true
development:
shared_file_uri: s3://aqua-ml-data/dev/
shared_file_folder: dev
huggingface:
parent_model_name: facebook/nllb-200-distilled-600M
generate_params:
num_beams: 1
staging:
shared_file_folder: ext-qa
huggingface:
parent_model_name: hf-internal-testing/tiny-random-nllb
train_params:
Expand Down
15 changes: 6 additions & 9 deletions machine/jobs/shared_file_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
from typing import Any, Generator, Iterator, List, TextIO, TypedDict

import json_stream
from clearml.storage.helper import StorageHelper

from ..corpora.text_corpus import TextCorpus
from ..corpora.text_file_text_corpus import TextFileTextCorpus
Expand Down Expand Up @@ -64,15 +63,8 @@ def generator() -> Generator[PretranslationInfo, None, None]:

@contextmanager
def open_target_pretranslation_writer(self) -> Iterator[PretranslationWriter]:
def remove_prefix(text: str, prefix: str) -> str:
if text.startswith(prefix):
return text[len(prefix) :]
return text

build_id: str = self._config.build_id
bucket_dir = str(Path(self._shared_file_uri) / "builds" / build_id)
base_url = StorageHelper._resolve_base_url(bucket_dir)
build_dir = self._data_dir / Path(remove_prefix(bucket_dir[len(base_url) :], "/"))
build_dir = self._data_dir / self._shared_file_folder / "builds" / build_id
build_dir.mkdir(parents=True, exist_ok=True)
target_pretranslate_path = build_dir / "pretranslate.trg.json"
with target_pretranslate_path.open("w", encoding="utf-8", newline="\n") as file:
Expand Down Expand Up @@ -104,6 +96,11 @@ def _shared_file_uri(self) -> str:
shared_file_uri: str = self._config.shared_file_uri
return shared_file_uri.rstrip("/")

@property
def _shared_file_folder(self) -> str:
shared_file_folder: str = self._config.shared_file_folder
return shared_file_folder.rstrip("/")

@abstractmethod
def _download_file(self, path: str, cache: bool = False) -> Path:
...
Expand Down
3 changes: 2 additions & 1 deletion (file path not captured in this extract)
Original file line number Diff line number Diff line change
Expand Up @@ -171,7 +171,8 @@ def find_missing_characters(tokenizer: Any, train_dataset: Dataset, lang_codes:
for lang_code in lang_codes:
for ex in train_dataset["translation"]:
charset = charset | set(ex[lang_code])
charset = {self._mpn.normalize(char) for char in charset}
if isinstance(tokenizer, (NllbTokenizerFast)):
charset = {self._mpn.normalize(char) for char in charset}
charset = {tokenizer.backend_tokenizer.normalizer.normalize_str(char) for char in charset}
charset = set(filter(None, {char.strip() for char in charset}))
missing_characters = sorted(list(charset - vocab))
Expand Down

0 comments on commit 14be7af

Please sign in to comment.