Skip to content

Commit

Permalink
Config passing serval #45 (#32)
Browse files Browse the repository at this point in the history
* add build_options argument, move max_steps and model_type inside build_options, only infer when no training data exists

* update poetry.lock, launch.json

* try adding setuptools to poetry

* run dep installation twice as a workaround

* add setuptools to pyproject.toml

* update poetry to 1.6.1

* remove setuptools from pyproject.toml

* remove setuptools from lock file

* revert to last passing commit

* move model_type back out of build_options

* Remove setuptools from pyproject.toml

* change default build_options to str type, custom error message for invalid json in build_options, fix training data check

* log message when skipping training

---------

Co-authored-by: Damien Daspit <[email protected]>
  • Loading branch information
mshannon-sil and ddaspit authored Oct 4, 2023
1 parent 133d6ee commit 70ec8ed
Show file tree
Hide file tree
Showing 12 changed files with 81 additions and 212 deletions.
12 changes: 11 additions & 1 deletion .devcontainer/devcontainer.json
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,16 @@
"vscode": {
// Set *default* container specific settings.json values on container create.
"settings": {
"python.defaultInterpreterPath": "/usr/bin/python"
"python.defaultInterpreterPath": "/usr/bin/python",
"[python]": {
"editor.defaultFormatter": "ms-python.black-formatter",
"editor.codeActionsOnSave": {
"source.organizeImports": true
}
},
"editor.formatOnSave": true,
"editor.formatOnType": true,
"isort.args":["--profile", "black"]
},
// Add the IDs of extensions you want installed when the container is created.
"extensions": [
Expand All @@ -40,6 +49,7 @@
"ms-python.flake8",
"ms-python.pylint",
"ms-python.black-formatter",
"ms-python.isort",
"eamodio.gitlens",
"donjayamanne.githistory",
"tamasfe.even-better-toml",
Expand Down
2 changes: 1 addition & 1 deletion .devcontainer/dockerfile
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#compatability with Tensorflow 2.6.0 as per https://www.tensorflow.org/install/source#gpu
ARG PYTHON_VERSION=3.8
ARG UBUNTU_VERSION=focal
ARG POETRY_VERSION=1.5
ARG POETRY_VERSION=1.6.1
ARG CUDA_VERSION=11.2.2-cudnn8-runtime-ubuntu20.04

FROM nvidia/cuda:$CUDA_VERSION
Expand Down
7 changes: 4 additions & 3 deletions .vscode/launch.json
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,9 @@
"spa_Latn",
"--trg-lang",
"eng_Latn",
"--max-steps",
"1000"
"--clearml",
"--build-options",
"{\"max_steps\": 10}"
]
},
{
Expand All @@ -40,4 +41,4 @@
"justMyCode": false
}
]
}
}
2 changes: 1 addition & 1 deletion dockerfile
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#compatability with Tensorflow 2.6.0 as per https://www.tensorflow.org/install/source#gpu
ARG PYTHON_VERSION=3.8
ARG UBUNTU_VERSION=focal
ARG POETRY_VERSION=1.5
ARG POETRY_VERSION=1.6.1
ARG CUDA_VERSION=11.2.2-cudnn8-runtime-ubuntu20.04

FROM python:$PYTHON_VERSION-slim as builder
Expand Down
9 changes: 8 additions & 1 deletion machine/jobs/build_nmt_engine.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import argparse
import json
import logging
import os
from typing import Callable, Optional, cast
Expand Down Expand Up @@ -36,6 +37,12 @@ def clearml_check_canceled() -> None:
logger.info("NMT Engine Build Job started")

SETTINGS.update(args)
try:
SETTINGS.build_options = json.loads(args["build_options"])
except ValueError as e:
raise ValueError("Build options could not be parsed: Invalid JSON") from e
if SETTINGS.build_options:
SETTINGS.update(SETTINGS.build_options)
SETTINGS.data_dir = os.path.expanduser(cast(str, SETTINGS.data_dir))

logger.info(f"Config: {SETTINGS.as_dict()}")
Expand Down Expand Up @@ -66,8 +73,8 @@ def main() -> None:
parser.add_argument("--build-id", required=True, type=str, help="Build id")
parser.add_argument("--src-lang", required=True, type=str, help="Source language tag")
parser.add_argument("--trg-lang", required=True, type=str, help="Target language tag")
parser.add_argument("--max-steps", type=int, help="Maximum number of steps")
parser.add_argument("--clearml", default=False, action="store_true", help="Initializes a ClearML task")
parser.add_argument("--build-options", default="{}", help="Build configurations")
args = parser.parse_args()

run({k: v for k, v in vars(args).items() if v is not None})
Expand Down
6 changes: 5 additions & 1 deletion machine/jobs/clearml_shared_file_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ def _download_file(self, path: str, cache: bool = False) -> Path:
local_folder: Optional[str] = None
if not cache:
local_folder = str(self._data_dir)
file_path = try_n_times(lambda: StorageManager.download_file(uri, local_folder))
file_path = try_n_times(lambda: StorageManager.download_file(uri, local_folder, skip_zero_size_check=True))
if file_path is None:
raise RuntimeError(f"Failed to download file: {uri}")
return Path(file_path)
Expand All @@ -31,6 +31,10 @@ def _download_folder(self, path: str, cache: bool = False) -> Path:
raise RuntimeError(f"Failed to download folder: {uri}")
return Path(folder_path) / path

def _exists_file(self, path: str) -> bool:
    """Return True if *path* exists in the shared file store.

    The check is delegated to ClearML's StorageManager and wrapped in
    try_n_times — presumably to retry transient storage failures (TODO
    confirm retry semantics against the try_n_times helper).
    """
    uri = f"{self._shared_file_uri}/{path}"
    # type: ignore — exists_file's return annotation doesn't match bool here.
    return try_n_times(lambda: StorageManager.exists_file(uri))  # type: ignore

def _upload_file(self, path: str, local_file_path: Path) -> None:
final_destination = try_n_times(
lambda: StorageManager.upload_file(str(local_file_path), f"{self._shared_file_uri}/{path}")
Expand Down
41 changes: 22 additions & 19 deletions machine/jobs/nmt_engine_build_job.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,30 +27,33 @@ def run(self, check_canceled: Optional[Callable[[], None]] = None) -> None:
target_corpus = self._shared_file_service.create_target_corpus()
parallel_corpus = source_corpus.align_rows(target_corpus)

if check_canceled is not None:
check_canceled()

if self._nmt_model_factory.train_tokenizer:
logger.info("Training source tokenizer")
with self._nmt_model_factory.create_source_tokenizer_trainer(source_corpus) as source_tokenizer_trainer:
source_tokenizer_trainer.train(check_canceled=check_canceled)
source_tokenizer_trainer.save()

if parallel_corpus.count(include_empty=False):
if check_canceled is not None:
check_canceled()

logger.info("Training target tokenizer")
with self._nmt_model_factory.create_target_tokenizer_trainer(target_corpus) as target_tokenizer_trainer:
target_tokenizer_trainer.train(check_canceled=check_canceled)
target_tokenizer_trainer.save()
if self._nmt_model_factory.train_tokenizer:
logger.info("Training source tokenizer")
with self._nmt_model_factory.create_source_tokenizer_trainer(source_corpus) as source_tokenizer_trainer:
source_tokenizer_trainer.train(check_canceled=check_canceled)
source_tokenizer_trainer.save()

if check_canceled is not None:
check_canceled()
if check_canceled is not None:
check_canceled()

logger.info("Training target tokenizer")
with self._nmt_model_factory.create_target_tokenizer_trainer(target_corpus) as target_tokenizer_trainer:
target_tokenizer_trainer.train(check_canceled=check_canceled)
target_tokenizer_trainer.save()

if check_canceled is not None:
check_canceled()

logger.info("Training NMT model")
with self._nmt_model_factory.create_model_trainer(parallel_corpus) as model_trainer:
model_trainer.train(check_canceled=check_canceled)
model_trainer.save()
logger.info("Training NMT model")
with self._nmt_model_factory.create_model_trainer(parallel_corpus) as model_trainer:
model_trainer.train(check_canceled=check_canceled)
model_trainer.save()
else:
logger.info("No matching entries in the source and target corpus - skipping training")

if check_canceled is not None:
check_canceled()
Expand Down
1 change: 1 addition & 0 deletions machine/jobs/settings.yaml
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
default:
model_type: huggingface
max_steps: 20000
data_dir: ~/machine
batch_size: 1024
Expand Down
10 changes: 10 additions & 0 deletions machine/jobs/shared_file_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,12 @@ def create_source_corpus(self) -> TextCorpus:
def create_target_corpus(self) -> TextCorpus:
return TextFileTextCorpus(self._download_file(f"builds/{self._build_id}/train.trg.txt"))

def exists_source_corpus(self) -> bool:
    """Return True if the source-side training file for this build exists in the store."""
    return self._exists_file(f"builds/{self._build_id}/train.src.txt")

def exists_target_corpus(self) -> bool:
    """Return True if the target-side training file for this build exists in the store."""
    return self._exists_file(f"builds/{self._build_id}/train.trg.txt")

def get_source_pretranslations(self) -> ContextManagedGenerator[PretranslationInfo, None, None]:
src_pretranslate_path = self._download_file(f"builds/{self._build_id}/pretranslate.src.json")

Expand Down Expand Up @@ -98,6 +104,10 @@ def _download_file(self, path: str, cache: bool = False) -> Path:
def _download_folder(self, path: str, cache: bool = False) -> Path:
...

@abstractmethod
def _exists_file(self, path: str) -> bool:
    """Return whether *path* exists in the backing file store (implemented per backend)."""
    ...

@abstractmethod
def _upload_file(self, path: str, local_file_path: Path) -> None:
    """Upload the local file at *local_file_path* to *path* in the backing store."""
    ...
Expand Down
Loading

0 comments on commit 70ec8ed

Please sign in to comment.