Updated machine.py to support Python 3.9-3.12 (#131)
- Updated dev Docker container to Python 3.9 and Ubuntu noble
- Updated non-dev Docker container to Python 3.12 and Ubuntu noble
- Updated CUDA version in both containers to be compatible with Ubuntu noble
- Updated CI to build and test Python 3.9-3.12
- Updated transformers support to versions ">=4.38.0,<4.46" (see the version-guard sketch below)
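
A quick aside on the version constraints above: the following minimal sketch is not part of the commit (it assumes the packaging library and an installed transformers) and shows a runtime guard equivalent to the new support range.

import sys

import transformers
from packaging.version import Version

# Python 3.9 through 3.12, matching the updated CI matrix.
assert (3, 9) <= sys.version_info[:2] <= (3, 12), "unsupported Python version"
# transformers pinned to >=4.38.0,<4.46, matching the updated dependency range.
assert Version("4.38.0") <= Version(transformers.__version__) < Version("4.46"), "unsupported transformers version"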
TaperChipmunk32 authored Oct 29, 2024
1 parent 1fbaef0 commit f93ddc4
Showing 17 changed files with 3,233 additions and 2,416 deletions.
10 changes: 6 additions & 4 deletions .devcontainer/dockerfile
@@ -1,13 +1,14 @@
#compatability with Tensorflow 2.6.0 as per https://www.tensorflow.org/install/source#gpu
ARG PYTHON_VERSION=3.8
ARG UBUNTU_VERSION=focal
ARG PYTHON_VERSION=3.9
ARG UBUNTU_VERSION=noble
ARG POETRY_VERSION=1.6.1
ARG CUDA_VERSION=11.2.2-cudnn8-runtime-ubuntu20.04
ARG CUDA_VERSION=12.6.1-base-ubuntu24.04

FROM nvidia/cuda:$CUDA_VERSION
ARG PYTHON_VERSION
ARG POETRY_VERSION

ENV POETRY_VENV=/opt/poetry-venv
ENV PIP_DISABLE_PIP_VERSION_CHECK=on
ENV TZ=America/New_York
RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone
@@ -33,7 +34,8 @@ RUN ln -sfn /usr/bin/python${PYTHON_VERSION} /usr/bin/python3 & \
# Install python packages
RUN pip install -U pip setuptools \
&& pip install poetry==${POETRY_VERSION} black pipenv virtualenv clearml \
&& poetry config virtualenvs.in-project true
&& poetry config virtualenvs.in-project true \
&& pip install cffi

COPY ./.devcontainer/clearml.conf /root/clearml.conf

22 changes: 11 additions & 11 deletions .github/workflows/ci.yml
@@ -14,16 +14,16 @@ jobs:
strategy:
matrix:
os: [ubuntu-latest, macos-12, windows-latest]
python-version: ["3.8", "3.9", "3.10", "3.11"]
python-version: ["3.9", "3.10", "3.11", "3.12"]
defaults:
run:
shell: bash

steps:
- uses: actions/checkout@v3
- uses: actions/checkout@v4
- name: Set up Python ${{ matrix.python-version }}
id: setup-python
uses: actions/setup-python@v4
uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}
- name: Install Poetry
@@ -34,7 +34,7 @@ jobs:
installer-parallel: true
- name: Load cached venv
id: cached-poetry-dependencies
uses: actions/cache@v3
uses: actions/cache@v4
with:
path: .venv
key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}
@@ -55,7 +55,7 @@ jobs:
node-version: "14"
- name: Lint with pyright
run: |
npm install -g pyright@1.1.362
npm install -g pyright@1.1.386
poetry run pyright
- name: Test with pytest
run: poetry run pytest --cov --cov-report=xml
@@ -70,12 +70,12 @@ jobs:
runs-on: ubuntu-latest

steps:
- uses: actions/checkout@v3
- name: Set up Python 3.8
- uses: actions/checkout@v4
- name: Set up Python 3.9
id: setup-python
uses: actions/setup-python@v4
uses: actions/setup-python@v5
with:
python-version: "3.8"
python-version: "3.9"
- name: Install Poetry
uses: snok/install-poetry@v1
with:
@@ -84,14 +84,14 @@ jobs:
installer-parallel: true
- name: Load cached venv
id: cached-poetry-dependencies
uses: actions/cache@v3
uses: actions/cache@v4
with:
path: .venv
key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}
- name: Build
run: poetry build
- name: Upload package
uses: actions/upload-artifact@v3
uses: actions/upload-artifact@v4
with:
name: wheel
path: dist/*.whl
10 changes: 4 additions & 6 deletions dockerfile
@@ -1,8 +1,8 @@
#compatability with Tensorflow 2.6.0 as per https://www.tensorflow.org/install/source#gpu
ARG PYTHON_VERSION=3.11
ARG UBUNTU_VERSION=focal
ARG PYTHON_VERSION=3.12
ARG UBUNTU_VERSION=noble
ARG POETRY_VERSION=1.6.1
ARG CUDA_VERSION=11.2.2-cudnn8-runtime-ubuntu20.04
ARG CUDA_VERSION=12.6.1-base-ubuntu24.04

FROM python:$PYTHON_VERSION-slim as builder
ARG POETRY_VERSION
@@ -35,13 +35,11 @@ WORKDIR /root

RUN apt-get update && \
apt-get install --no-install-recommends -y software-properties-common && \
add-apt-repository ppa:deadsnakes/ppa -y && \
apt-get update && \
apt-get install --no-install-recommends -y \
curl \
python$PYTHON_VERSION \
python$PYTHON_VERSION-distutils \
# these are needed for ClearML
# these are needed for ClearML
git libsm6 libxext6 libxrender-dev libglib2.0-0 && \
rm -rf /var/lib/apt/lists/* && \
apt-get clean
4 changes: 2 additions & 2 deletions dockerfile.cpu_only
@@ -1,6 +1,6 @@
#compatability with Tensorflow 2.6.0 as per https://www.tensorflow.org/install/source#gpu
ARG PYTHON_VERSION=3.11
ARG UBUNTU_VERSION=focal
ARG PYTHON_VERSION=3.12
ARG UBUNTU_VERSION=noble
ARG POETRY_VERSION=1.6.1

FROM python:$PYTHON_VERSION-slim AS builder
7 changes: 4 additions & 3 deletions machine/corpora/parallel_text_corpus.py
@@ -490,7 +490,7 @@ def iterable() -> Iterable[Tuple[Union[str, int], dict]]:
example[alignment_column] = {source_lang: src_indices, target_lang: trg_indices}
yield key, example

return IterableDataset(ExamplesIterable(iterable, {}), info, split)
return IterableDataset(ExamplesIterable(iterable, {}), info, split) # type: ignore


class _TransformParallelTextCorpus(ParallelTextCorpus):
@@ -617,8 +617,9 @@ def count(self, include_empty: bool = True, text_ids: Optional[Iterable[str]] =
if include_empty:
return len(self._df)
return len(self._df[(self._df[self._source_column] != "") & (self._df[self._target_column] != "")])
return len(self._df[self._df[self._source_column].isin(set(text_ids))]) & (
len(self._df[self._target_column].isin(set(text_ids)))
text_ids = list(text_ids)
return len(self._df[self._df[self._source_column].isin(text_ids)]) & (
len(self._df[self._target_column].isin(text_ids))
)

def _get_rows(self, text_ids: Optional[Iterable[str]] = None) -> Generator[ParallelTextRow, None, None]:
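The count() change above materializes text_ids into a list before it feeds two isin() filters. The reason is that an Iterable may be a one-shot generator: the first filter would consume it and the second would see nothing. A standalone sketch of the pitfall, with hypothetical ids:

text_ids = (t for t in ["MAT", "MRK"])  # a one-shot generator
first_pass = list(text_ids)             # ['MAT', 'MRK']
second_pass = list(text_ids)            # [] -- the generator is exhausted

text_ids = ["MAT", "MRK"]               # materialized once, as in the fix
assert list(text_ids) == list(text_ids) # safe to iterate repeatedly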
7 changes: 4 additions & 3 deletions machine/corpora/usfm_text_base.py
@@ -68,9 +68,10 @@ def _get_rows(self) -> Generator[TextRow, None, None]:
return gen(row_collector.rows)

def _read_usfm(self) -> str:
with self._create_stream_container() as stream_container, TextIOWrapper(
stream_container.open_stream(), encoding=self._encoding, errors="replace"
) as reader:
with (
self._create_stream_container() as stream_container,
TextIOWrapper(stream_container.open_stream(), encoding=self._encoding, errors="replace") as reader,
):
return reader.read()


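The change above is the first of several in this commit that rewrite multi-manager with statements into the parenthesized form, replacing implicit line continuations. As a hedged note: this grammar became official in Python 3.10, but CPython 3.9's PEG parser already accepts it, so it fits the new 3.9-3.12 support range. A minimal sketch with hypothetical file names:

# One with statement, several context managers, no backslash continuations.
with (
    open("source.txt", encoding="utf-8") as src,
    open("target.txt", "w", encoding="utf-8") as trg,
):
    trg.write(src.read())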
5 changes: 4 additions & 1 deletion machine/jobs/huggingface/hugging_face_nmt_model_factory.py
@@ -39,7 +39,10 @@ def __init__(self, config: Any) -> None:
and self._training_args.report_to is not None
and "clearml" in self._training_args.report_to
):
self._training_args.report_to.remove("clearml")
if isinstance(self._training_args.report_to, list):
self._training_args.report_to.remove("clearml")
elif isinstance(self._training_args.report_to, str) and self._training_args.report_to == "clearml":
self._training_args.report_to = None

# The default of training_args.log_level is passive, so we set log level at info here to have that default.
transformers_logging.set_verbosity_info()
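The change above hardens the ClearML opt-out: transformers allows TrainingArguments.report_to to be either a string or a list of strings, and the old code assumed a list. A sketch of the same normalization as a standalone helper (the function name and pure-function style are illustrative, not the commit's code):

from typing import List, Optional, Union

def drop_clearml_reporter(
    report_to: Optional[Union[str, List[str]]]
) -> Optional[Union[str, List[str]]]:
    if isinstance(report_to, list) and "clearml" in report_to:
        return [r for r in report_to if r != "clearml"]  # keep other reporters
    if isinstance(report_to, str) and report_to == "clearml":
        return None  # nothing else to report to
    return report_to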
7 changes: 4 additions & 3 deletions machine/jobs/nmt_engine_build_job.py
@@ -69,9 +69,10 @@ def _train_model(
check_canceled()

logger.info("Training NMT model")
with progress_reporter.start_next_phase() as phase_progress, self._nmt_model_factory.create_model_trainer(
parallel_corpus
) as model_trainer:
with (
progress_reporter.start_next_phase() as phase_progress,
self._nmt_model_factory.create_model_trainer(parallel_corpus) as model_trainer,
):
model_trainer.train(progress=phase_progress, check_canceled=check_canceled)
model_trainer.save()
train_corpus_size = model_trainer.stats.train_corpus_size
14 changes: 8 additions & 6 deletions machine/jobs/smt_engine_build_job.py
@@ -49,17 +49,19 @@ def _train_model(
check_canceled: Optional[Callable[[], None]],
) -> Tuple[int, float]:

with progress_reporter.start_next_phase() as phase_progress, self._smt_model_factory.create_model_trainer(
self._tokenizer, parallel_corpus
) as trainer:
with (
progress_reporter.start_next_phase() as phase_progress,
self._smt_model_factory.create_model_trainer(self._tokenizer, parallel_corpus) as trainer,
):
trainer.train(progress=phase_progress, check_canceled=check_canceled)
trainer.save()
train_corpus_size = trainer.stats.train_corpus_size
confidence = trainer.stats.metrics["bleu"] * 100

with progress_reporter.start_next_phase() as phase_progress, self._smt_model_factory.create_truecaser_trainer(
self._tokenizer, target_corpus
) as truecase_trainer:
with (
progress_reporter.start_next_phase() as phase_progress,
self._smt_model_factory.create_truecaser_trainer(self._tokenizer, target_corpus) as truecase_trainer,
):
truecase_trainer.train(progress=phase_progress, check_canceled=check_canceled)
truecase_trainer.save()

7 changes: 4 additions & 3 deletions machine/jobs/word_alignment_build_job.py
@@ -69,9 +69,10 @@ def _train_model(
check_canceled: Optional[Callable[[], None]],
) -> int:

with progress_reporter.start_next_phase() as phase_progress, self._word_alignment_model_factory.create_model_trainer(
self._tokenizer, parallel_corpus
) as trainer:
with (
progress_reporter.start_next_phase() as phase_progress,
self._word_alignment_model_factory.create_model_trainer(self._tokenizer, parallel_corpus) as trainer,
):
trainer.train(progress=phase_progress, check_canceled=check_canceled)
trainer.save()
train_corpus_size = trainer.stats.train_corpus_size
14 changes: 13 additions & 1 deletion machine/translation/huggingface/hugging_face_nmt_engine.py
@@ -44,15 +44,21 @@ def __init__(
self._pipeline_kwargs = pipeline_kwargs
if isinstance(self._model, PreTrainedModel):
self._model.eval()
self._is_model_owned = False
else:
model_config = AutoConfig.from_pretrained(str(self._model), label2id={}, id2label={}, num_labels=0)
self._model = cast(
PreTrainedModel, AutoModelForSeq2SeqLM.from_pretrained(str(self._model), config=model_config)
)
self._is_model_owned = True
self._tokenizer = AutoTokenizer.from_pretrained(self._model.name_or_path, use_fast=True)
if isinstance(self._tokenizer, (NllbTokenizer, NllbTokenizerFast)):
self._mpn = MosesPunctNormalizer()
self._mpn.substitutions = [(re.compile(r), sub) for r, sub in self._mpn.substitutions] # type: ignore
self._mpn.substitutions = [
(str(re.compile(r)), sub)
for r, sub in self._mpn.substitutions
if isinstance(r, str) and isinstance(sub, str)
]
else:
self._mpn = None

@@ -93,6 +99,10 @@ def __init__(
**self._pipeline_kwargs,
)

@property
def tokenizer(self) -> PreTrainedTokenizer | PreTrainedTokenizerFast:
return self._tokenizer

def translate(self, segment: Union[str, Sequence[str]]) -> TranslationResult:
return self.translate_batch([segment])[0]

@@ -161,6 +171,8 @@ def __enter__(self) -> HuggingFaceNmtEngine:

def close(self) -> None:
del self._pipeline
if self._is_model_owned:
del self._model
gc.collect()
with torch.no_grad():
torch.cuda.empty_cache()
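The _is_model_owned flag introduced above records whether the engine loaded the model itself (from a name or path) or borrowed a live PreTrainedModel from the caller; close() then only deletes references it owns before prompting Python and PyTorch to release memory. A condensed sketch of the ownership pattern (the class and loader are illustrative, not the engine's real API):

import gc

import torch

class OwnedModelExample:
    def __init__(self, model_or_path, loader):
        if isinstance(model_or_path, str):
            self._model = loader(model_or_path)  # we created it, we own it
            self._is_model_owned = True
        else:
            self._model = model_or_path          # borrowed from the caller
            self._is_model_owned = False

    def close(self) -> None:
        if self._is_model_owned:
            del self._model              # drop our reference to the model
        gc.collect()                     # collect reference cycles promptly
        with torch.no_grad():
            torch.cuda.empty_cache()     # return cached GPU blocks to the driver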
@@ -99,7 +99,11 @@ def __init__(
self._add_unk_src_tokens = add_unk_src_tokens
self._add_unk_tgt_tokens = add_unk_tgt_tokens
self._mpn = MosesPunctNormalizer()
self._mpn.substitutions = [(re.compile(r), sub) for r, sub in self._mpn.substitutions] # type: ignore
self._mpn.substitutions = [
(str(re.compile(r)), sub)
for r, sub in self._mpn.substitutions
if isinstance(r, str) and isinstance(sub, str)
]
self._stats = TrainStats()

@property
@@ -222,7 +226,8 @@ def add_lang_code_to_tokenizer(tokenizer: Any, lang_code: str):
)
lang_id = tokenizer.convert_tokens_to_ids(lang_code)
tokenizer.lang_code_to_id[lang_code] = lang_id
if isinstance(tokenizer, (NllbTokenizer, MBart50Tokenizer, MBartTokenizer)):

if isinstance(tokenizer, (MBart50Tokenizer, MBartTokenizer)):
tokenizer.id_to_lang_code[lang_id] = lang_code
tokenizer.fairseq_tokens_to_ids[lang_code] = lang_id
tokenizer.fairseq_ids_to_tokens[lang_id] = lang_code
@@ -271,7 +276,7 @@ def add_lang_code_to_tokenizer(tokenizer: Any, lang_code: str):

# For multilingual translation models like mBART-50 and M2M100 we need to force the target language token
# as the first generated token. We ask the user to explicitly provide this as --forced_bos_token argument.
forced_bos_token_id = tokenizer.lang_code_to_id[self._tgt_lang]
forced_bos_token_id = tokenizer.convert_tokens_to_ids(self._tgt_lang)
model.config.forced_bos_token_id = forced_bos_token_id
if model.generation_config is not None:
model.generation_config.forced_bos_token_id = forced_bos_token_id
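
The change above (together with dropping NllbTokenizer from the isinstance check earlier in this file) moves off the tokenizer-specific lang_code_to_id mapping, which newer transformers releases deprecate, in favor of the generic convert_tokens_to_ids lookup that all tokenizers support. A hedged sketch; the model and language code are examples only:

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("facebook/nllb-200-distilled-600M")
# Any token known to the vocabulary, including language codes, resolves here.
forced_bos_token_id = tokenizer.convert_tokens_to_ids("fra_Latn")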
@@ -372,7 +377,7 @@ def save(self) -> None:

def __exit__(self, type: Any, value: Any, traceback: Any) -> None:
if self._trainer is not None:
self._trainer = None
del self._trainer
gc.collect()
with torch.no_grad():
torch.cuda.empty_cache()
21 changes: 12 additions & 9 deletions machine/translation/thot/thot_smt_model_trainer.py
@@ -129,9 +129,10 @@ def _filter_phrase_table_using_corpus(filename: Path, source_corpus: Sequence[Se
j += 1

temp_filename = filename.parent / f"{filename.name}.temp"
with filename.open("r", encoding="utf-8-sig") as file, temp_filename.open(
"w", encoding="utf-8", newline="\n"
) as temp_file:
with (
filename.open("r", encoding="utf-8-sig") as file,
temp_filename.open("w", encoding="utf-8", newline="\n") as temp_file,
):
for line in file:
fields = line.strip().split("|||")
phrase = fields[1].strip()
@@ -295,9 +296,10 @@ def _write_ngram_counts_file(self, lm_prefix: Path, ngram_size: int, train_corpu

def _write_word_prediction_file(self, lm_prefix: Path, train_corpus: ParallelTextCorpus) -> None:
rand = Random(self.seed)
with (lm_prefix.parent / f"{lm_prefix.name}.wp").open(
"w", encoding="utf-8", newline="\n"
) as file, train_corpus.take(100000).get_rows() as rows:
with (
(lm_prefix.parent / f"{lm_prefix.name}.wp").open("w", encoding="utf-8", newline="\n") as file,
train_corpus.take(100000).get_rows() as rows,
):
for row in sorted(rows, key=lambda _: rand.randint(0, sys.maxsize)):
segment_str = " ".join(escape_token(t) for t in row.target_segment)
file.write(segment_str + "\n")
@@ -414,9 +416,10 @@ def _generate_best_alignments(
) -> None:
model = create_thot_word_alignment_model(self._word_alignment_model_type)
model.load(swm_prefix)
with filename.open("w", encoding="utf-8", newline="\n") as file, train_corpus.transform(
_escape_tokens_row
).get_rows() as rows:
with (
filename.open("w", encoding="utf-8", newline="\n") as file,
train_corpus.transform(_escape_tokens_row).get_rows() as rows,
):
i = 0
for row in rows:
file.write("# 1\n")