Publish num_steps and inference_step on ClearML
apply temp huggingface fix
Minor fixes
Faster local dockerfile build
local check - before running on ci
small formatting fixes
Fix imports
Fix inference
johnml1135 committed Oct 12, 2023
1 parent adf9bb9 commit cb0aaa0
Showing 12 changed files with 113 additions and 39 deletions.
8 changes: 6 additions & 2 deletions .devcontainer/devcontainer.json
@@ -40,7 +40,10 @@
},
"editor.formatOnSave": true,
"editor.formatOnType": true,
"isort.args":["--profile", "black"]
"isort.args": [
"--profile",
"black"
]
},
// Add the IDs of extensions you want installed when the container is created.
"extensions": [
@@ -54,7 +57,8 @@
"donjayamanne.githistory",
"tamasfe.even-better-toml",
"github.vscode-github-actions",
"mhutchie.git-graph"
"mhutchie.git-graph",
"GitHub.copilot"
]
}
}
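
Note on the isort setting above: the black profile makes isort arrange imports the way black expects, so when black and isort run back to back in local_check.sh (added below), they do not undo each other's changes.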
7 changes: 4 additions & 3 deletions dockerfile
@@ -20,9 +20,10 @@ RUN python3 -m venv $POETRY_VENV \
ENV PATH="${PATH}:${POETRY_VENV}/bin"

WORKDIR /src
COPY poetry.lock pyproject.toml /src
RUN poetry export --with=gpu --without-hashes -f requirements.txt > requirements.txt
COPY . /src
RUN poetry build
RUN poetry export --with=gpu --without-hashes -f requirements.txt > requirements.txt


FROM nvidia/cuda:$CUDA_VERSION
@@ -46,9 +47,9 @@ RUN ln -sfn /usr/bin/python${PYTHON_VERSION} /usr/bin/python3 & \
ln -sfn /usr/bin/python${PYTHON_VERSION} /usr/bin/python

COPY --from=builder /src/requirements.txt .
COPY --from=builder /src/dist/*.whl .

RUN pip install --no-cache-dir -r requirements.txt && rm requirements.txt

COPY --from=builder /src/dist/*.whl .
RUN pip install --no-deps *.whl && rm *.whl

CMD ["bash"]
13 changes: 13 additions & 0 deletions local_check.sh
@@ -0,0 +1,13 @@
#!/bin/bash
poetry install

echo "======================= black ======================"
poetry run black .
echo "======================= flake8 ======================"
poetry run flake8 .
echo "======================= isort ======================"
poetry run isort .
echo "======================= pyright ======================"
poetry run pyright
echo "======================= pytest ======================"
poetry run pytest
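
To run the same pre-CI pass locally: ./local_check.sh from the repository root. black and isort are invoked without --check here, so they rewrite files in place; flake8, pyright, and pytest only report.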
12 changes: 2 additions & 10 deletions machine/jobs/build_nmt_engine.py
@@ -2,11 +2,10 @@
import json
import logging
import os
from typing import Callable, Optional, cast
from typing import cast

from clearml import Task

from ..utils.canceled_error import CanceledError
from .clearml_shared_file_service import ClearMLSharedFileService
from .config import SETTINGS
from .nmt_engine_build_job import NmtEngineBuildJob
@@ -22,17 +21,10 @@


def run(args: dict) -> None:
check_canceled: Optional[Callable[[], None]] = None
task = None
if args["clearml"]:
task = Task.init()

def clearml_check_canceled() -> None:
if task.get_status() in {"stopped", "stopping"}:
raise CanceledError

check_canceled = clearml_check_canceled

try:
logger.info("NMT Engine Build Job started")

@@ -60,7 +52,7 @@ def clearml_check_canceled() -> None:
raise RuntimeError("The model type is invalid.")

job = NmtEngineBuildJob(SETTINGS, nmt_model_factory, shared_file_service)
job.run(check_canceled)
job.run(task)
logger.info("Finished")
except Exception as e:
logger.exception(e, stack_info=True)
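
Net effect: build_nmt_engine.py no longer builds a cancellation callback around the ClearML task. It passes the Task itself (or None when ClearML is not in use) to NmtEngineBuildJob.run(), and the job polls the task status on its own, as shown in nmt_engine_build_job.py below.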
12 changes: 12 additions & 0 deletions machine/jobs/huggingface/hugging_face_nmt_model_factory.py
@@ -2,6 +2,7 @@
from typing import Any, cast

from transformers import AutoConfig, AutoModelForSeq2SeqLM, HfArgumentParser, PreTrainedModel, Seq2SeqTrainingArguments
from transformers.integrations import ClearMLCallback

from ...corpora.parallel_text_corpus import ParallelTextCorpus
from ...corpora.text_corpus import TextCorpus
@@ -77,3 +78,14 @@ def save_model(self) -> None:
@property
def _model_dir(self) -> Path:
return Path(self._config.data_dir, "builds", self._config.build_id, "model")


# FIXME - remove this code when the fix is applied to Huggingface
# https://github.com/huggingface/transformers/pull/26763
def on_train_end(
self: ClearMLCallback, args, state, control, model=None, tokenizer=None, metrics=None, logs=None, **kwargs
):
pass


setattr(ClearMLCallback, "on_train_end", on_train_end)
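
Why the no-op matters here (an assumption inferred from the linked transformers PR, not verified against every transformers release): the stock ClearMLCallback tears down its ClearML task when training ends, which would silence everything the job reports after training, including the inference_step values this commit introduces. A minimal sketch of the behavior the patch preserves; the project and task names are illustrative:

from clearml import Task

task = Task.init(project_name="demo", task_name="nmt-build")  # illustrative names
# ... Seq2SeqTrainer.train() runs with the patched ClearMLCallback ...
# on_train_end is now a no-op, so the task is still open here:
task.get_logger().report_single_value(name="inference_step", value=0)
task.close()  # the job, not the HF callback, decides when the task ends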
54 changes: 36 additions & 18 deletions machine/jobs/nmt_engine_build_job.py
@@ -1,9 +1,12 @@
import logging
from contextlib import ExitStack
from typing import Any, Callable, Optional, Sequence
from typing import Any, Optional, Sequence

from clearml import Task

from ..corpora.corpora_utils import batch
from ..translation.translation_engine import TranslationEngine
from ..utils.canceled_error import CanceledError
from .nmt_model_factory import NmtModelFactory
from .shared_file_service import PretranslationInfo, PretranslationWriter, SharedFileService

@@ -15,10 +18,12 @@ def __init__(self, config: Any, nmt_model_factory: NmtModelFactory, shared_file_
self._config = config
self._nmt_model_factory = nmt_model_factory
self._shared_file_service = shared_file_service
self.clearml_task: Optional[Task] = None

def run(self, check_canceled: Optional[Callable[[], None]] = None) -> None:
if check_canceled is not None:
check_canceled()
def run(self, task: Optional[Task] = None) -> None:
self.clearml_task = task
self._send_clearml_config()
self._check_canceled()

self._nmt_model_factory.init()

@@ -28,45 +33,58 @@ def run(self, check_canceled: Optional[Callable[[], None]] = None) -> None:
parallel_corpus = source_corpus.align_rows(target_corpus)

if parallel_corpus.count(include_empty=False):
if check_canceled is not None:
check_canceled()
self._check_canceled()

if self._nmt_model_factory.train_tokenizer:
logger.info("Training source tokenizer")
with self._nmt_model_factory.create_source_tokenizer_trainer(source_corpus) as source_tokenizer_trainer:
source_tokenizer_trainer.train(check_canceled=check_canceled)
source_tokenizer_trainer.train(check_canceled=self._check_canceled)
source_tokenizer_trainer.save()

if check_canceled is not None:
check_canceled()
self._check_canceled()

logger.info("Training target tokenizer")
with self._nmt_model_factory.create_target_tokenizer_trainer(target_corpus) as target_tokenizer_trainer:
target_tokenizer_trainer.train(check_canceled=check_canceled)
target_tokenizer_trainer.train(check_canceled=self._check_canceled)
target_tokenizer_trainer.save()

if check_canceled is not None:
check_canceled()
self._check_canceled()

logger.info("Training NMT model")
with self._nmt_model_factory.create_model_trainer(parallel_corpus) as model_trainer:
model_trainer.train(check_canceled=check_canceled)
model_trainer.train(check_canceled=self._check_canceled)
model_trainer.save()
else:
logger.info("No matching entries in the source and target corpus - skipping training")

if check_canceled is not None:
check_canceled()
self._check_canceled()

logger.info("Pretranslating segments")
with ExitStack() as stack:
model = stack.enter_context(self._nmt_model_factory.create_engine())
src_pretranslations = stack.enter_context(self._shared_file_service.get_source_pretranslations())
writer = stack.enter_context(self._shared_file_service.open_target_pretranslation_writer())
for pi_batch in batch(src_pretranslations, self._config["batch_size"]):
if check_canceled is not None:
check_canceled()
current_inference_step = 0
self._update_inference_step(current_inference_step)
batch_size = self._config["batch_size"]
for pi_batch in batch(src_pretranslations, batch_size):
self._check_canceled()
_translate_batch(model, pi_batch, writer)
current_inference_step += batch_size
self._update_inference_step(current_inference_step)

def _send_clearml_config(self) -> None:
if self.clearml_task:
self.clearml_task.get_logger().report_single_value(name="total_steps", value=self._config["max_steps"])

def _check_canceled(self) -> None:
if self.clearml_task:
if self.clearml_task.get_status() in {"stopped", "stopping"}:
raise CanceledError

def _update_inference_step(self, step_num: int) -> None:
if self.clearml_task:
self.clearml_task.get_logger().report_single_value(name="inference_step", value=step_num)


def _translate_batch(
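
A usage sketch of the reworked entry point, mirroring the wiring in build_nmt_engine.py above (factory and file-service construction elided):

from clearml import Task

task = Task.init()  # or None when running outside ClearML
job = NmtEngineBuildJob(SETTINGS, nmt_model_factory, shared_file_service)
job.run(task)  # reports total_steps up front, polls the task for cancellation,
               # and publishes inference_step after each translated batch

Because _check_canceled is a bound method taking no arguments, it can still be passed directly as the trainers' check_canceled= callback, as the diff above shows.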
2 changes: 1 addition & 1 deletion machine/translation/thot/thot_smt_model_trainer.py
@@ -60,7 +60,7 @@ def _prune_lex_table(filename: Path, threshold: float) -> None:
chunk = file.read(struct.size)
while chunk != b"":
entry = struct.unpack(chunk)
entries.append(entry[:-1])
entries.append(entry[:-1]) # type: ignore
chunk = file.read(struct.size)

if len(entries) == 0:
34 changes: 33 additions & 1 deletion poetry.lock

Some generated files are not rendered by default.

1 change: 1 addition & 0 deletions pyproject.toml
@@ -80,6 +80,7 @@ pytest-mockito = "^0.0.4"
ipykernel = "^6.7.0"
jupyter = "^1.0.0"
pandas = "^1.3.0"
pyright = "^1.1.331"

[tool.poetry.group.gpu.dependencies]
# Torch is not included in the normal install to allow the user to choose the versions of these dependencies when
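
Adding pyright as a pinned dev dependency backs the poetry run pyright step in local_check.sh; the # type: ignore comments added elsewhere in this commit (on the thot trainer's struct.unpack result above and on the networkx stub overloads below) silence diagnostics that this check raises.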
3 changes: 2 additions & 1 deletion tests/jobs/test_nmt_engine_build_job.py
@@ -27,8 +27,9 @@ def test_run() -> None:
def test_cancel() -> None:
env = _TestEnvironment()
checker = _CancellationChecker(3)
setattr(env.job, "_check_canceled", checker.check_canceled)
with pytest.raises(CanceledError):
env.job.run(checker.check_canceled)
env.job.run()

assert env.target_pretranslations == ""

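
Since run() no longer accepts a callback, the cancellation test injects its checker by overwriting the job's private _check_canceled hook; using setattr rather than plain attribute assignment is likely a concession to pyright, which objects to assigning to a method.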
4 changes: 2 additions & 2 deletions typings/networkx/classes/digraph.pyi
@@ -14,11 +14,11 @@ class DiGraph(Graph[T]):
edge_attr_dict_factory: Any = ...
graph: Any = ...
@overload
def __init__(self, incoming_graph_data: DiGraph[T] = ..., **attr: Any) -> None: ...
def __init__(self, incoming_graph_data: DiGraph[T] = ..., **attr: Any) -> None: ... # type: ignore
# @overload
# def __init__(self, incoming_graph_data: Optional[Any] = ..., **attr: Any) -> None: ...
@overload
def __init__(self, incoming_graph_data: List[Tuple[T, T]] = ..., **attr: Any) -> None: ...
def __init__(self, incoming_graph_data: List[Tuple[T, T]] = ..., **attr: Any) -> None: ... # type: ignore
@property
def adj(self): ...
@property
2 changes: 1 addition & 1 deletion typings/networkx/classes/reportviews.pyi
@@ -21,7 +21,7 @@ class NodeDataView(Set[T], Iterable[T]):
def __contains__(self, n: T): ...
def __getitem__(self, n: Any): ...

class DiDegreeView(Mapping[T, int], Iterable[int]):
class DiDegreeView(Mapping[T, int], Iterable[int]): # type: ignore
def __init__(self, G: Any, nbunch: Optional[Any] = ..., weight: Optional[Any] = ...) -> None: ...
def __call__(self, nbunch: Optional[Any] = ..., weight: Optional[Any] = ...): ...
def __iter__(self) -> Iterator[Tuple[T, int]]: ...
