Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Smt build job #107

Merged
merged 4 commits into from
Jun 11, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .devcontainer/devcontainer.json
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,8 @@
"AWS_ACCESS_KEY_ID": "${localEnv:AWS_ACCESS_KEY_ID}",
"AWS_SECRET_ACCESS_KEY": "${localEnv:AWS_SECRET_ACCESS_KEY}",
"CLEARML_API_ACCESS_KEY": "${localEnv:CLEARML_API_ACCESS_KEY}",
"CLEARML_API_SECRET_KEY": "${localEnv:CLEARML_API_SECRET_KEY}"
"CLEARML_API_SECRET_KEY": "${localEnv:CLEARML_API_SECRET_KEY}",
"ENV_FOR_DYNACONF": "development"
},
// Features to add to the dev container. More info: https://containers.dev/features.
// "features": {},
Expand Down
2 changes: 1 addition & 1 deletion .devcontainer/dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ RUN apt-get update && \
apt-get install --no-install-recommends -y \
python$PYTHON_VERSION \
python$PYTHON_VERSION-distutils \
git curl gdb ca-certificates gnupg2 tar make gcc libssl-dev zlib1g-dev libncurses5-dev \
git vim curl gdb ca-certificates gnupg2 tar make gcc libssl-dev zlib1g-dev libncurses5-dev \
libbz2-dev libreadline-dev libreadline6-dev libxml2-dev xz-utils libgdbm-dev libgdbm-compat-dev tk-dev dirmngr \
libxmlsec1-dev libsqlite3-dev libffi-dev liblzma-dev lzma lzma-dev uuid-dev && \
rm -rf /var/lib/apt/lists/*
Expand Down
6 changes: 3 additions & 3 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -50,12 +50,12 @@ jobs:
- name: Lint with isort
run: poetry run isort . --check-only
- name: Setup Node for pyright
uses: actions/setup-node@v3
uses: actions/setup-node@v4
with:
node-version: "12"
node-version: "14"
- name: Lint with pyright
run: |
npm install -g pyright@1.1.313
npm install -g pyright@1.1.362
poetry run pyright
- name: Test with pytest
run: poetry run pytest --cov --cov-report=xml
Expand Down
16 changes: 14 additions & 2 deletions .github/workflows/docker-build-push.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,21 @@ on:
tags:
- "docker_*"

env:
REGISTRY: ghcr.io
IMAGE_NAME: ${{ github.repository }}

jobs:
docker:
runs-on: ubuntu-latest
strategy:
fail-fast: false
matrix:
include:
- dockerfile: ./dockerfile
image: ghcr.io/sillsdev/machine.py
- dockerfile: ./dockerfile.cpu_only
image: ghcr.io/sillsdev/machine.py.cpu_only
steps:
- name: Free Disk Space (Ubuntu)
uses: jlumbroso/free-disk-space@main
Expand All @@ -21,8 +33,7 @@ jobs:
id: meta
uses: docker/metadata-action@v4
with:
images: |
ghcr.io/${{ github.repository }}
images: ${{ matrix.image }}
tags: |
type=match,pattern=docker_(.*),group=1
flavor: |
Expand All @@ -39,6 +50,7 @@ jobs:
uses: docker/build-push-action@v4
with:
context: .
file: ${{ matrix.dockerfile }}
push: true
tags: ${{ steps.meta.outputs.tags }}
labels: ${{ steps.meta.outputs.labels }}
25 changes: 20 additions & 5 deletions .vscode/launch.json
Original file line number Diff line number Diff line change
Expand Up @@ -6,15 +6,15 @@
"configurations": [
{
"name": "Python: Current File",
"type": "python",
"type": "debugpy",
"request": "launch",
"program": "${file}",
"console": "integratedTerminal",
"justMyCode": true
},
{
"name": "build_nmt_engine",
"type": "python",
"type": "debugpy",
"request": "launch",
"module": "machine.jobs.build_nmt_engine",
"justMyCode": false,
Expand Down Expand Up @@ -51,14 +51,29 @@
]
}
},
{
"name": "build_smt_engine",
"type": "debugpy",
"request": "launch",
"module": "machine.jobs.build_smt_engine",
"justMyCode": false,
"args": [
"--model-type",
"thot",
"--build-id",
"build1"
]
},
{
"name": "Python: Debug Tests",
"type": "python",
"type": "debugpy",
"request": "launch",
"program": "${file}",
"purpose": ["debug-test"],
"purpose": [
"debug-test"
],
"console": "integratedTerminal",
"justMyCode": false
}
]
}
}
9 changes: 6 additions & 3 deletions dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,9 @@ RUN apt-get update && \
apt-get install --no-install-recommends -y \
curl \
python$PYTHON_VERSION \
python$PYTHON_VERSION-distutils && \
python$PYTHON_VERSION-distutils \
# these are needed for ClearML
git libsm6 libxext6 libxrender-dev libglib2.0-0 && \
rm -rf /var/lib/apt/lists/* && \
apt-get clean

Expand All @@ -51,9 +53,10 @@ RUN ln -sfn /usr/bin/python${PYTHON_VERSION} /usr/bin/python3 & \
ln -sfn /usr/bin/python${PYTHON_VERSION} /usr/bin/python

COPY --from=builder /src/requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt && rm requirements.txt
RUN --mount=type=cache,target=/root/.cache \
pip install --no-cache-dir -r requirements.txt && rm requirements.txt

COPY . .
RUN pip install --no-deps . && rm -r *
RUN pip install --no-deps . && rm -r /root/*

CMD ["bash"]
41 changes: 41 additions & 0 deletions dockerfile.cpu_only
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
# compatibility with Tensorflow 2.6.0 as per https://www.tensorflow.org/install/source#gpu
ARG PYTHON_VERSION=3.11
ARG UBUNTU_VERSION=focal
ARG POETRY_VERSION=1.6.1

FROM python:$PYTHON_VERSION-slim as builder
ARG POETRY_VERSION

ENV POETRY_HOME=/opt/poetry
ENV POETRY_VENV=/opt/poetry-venv
ENV POETRY_CACHE_DIR=/opt/.cache

# Install poetry separated from system interpreter
RUN python3 -m venv $POETRY_VENV \
&& $POETRY_VENV/bin/pip install -U pip setuptools \
&& $POETRY_VENV/bin/pip install poetry==${POETRY_VERSION}

# Add `poetry` to PATH
ENV PATH="${PATH}:${POETRY_VENV}/bin"

WORKDIR /src
COPY poetry.lock pyproject.toml /src
RUN poetry export --with=gpu --without-hashes -f requirements.txt > requirements.txt


FROM python:$PYTHON_VERSION
WORKDIR /root

# these are needed for ClearML
RUN apt-get update && \
apt-get install --no-install-recommends -y \
git libsm6 libxext6 libxrender-dev libglib2.0-0

COPY --from=builder /src/requirements.txt .
RUN --mount=type=cache,target=/root/.cache \
pip install --no-cache-dir -r requirements.txt && rm requirements.txt

COPY . .
RUN pip install --no-deps . && rm -r /root/*

CMD ["bash"]
4 changes: 2 additions & 2 deletions machine/corpora/dbl_bundle_text_corpus.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import os
import xml.etree.ElementTree as etree
from io import TextIOWrapper
from typing import List
from xml.etree import ElementTree
from zipfile import ZipFile

from ..scripture import ENGLISH_VERSIFICATION
Expand All @@ -17,7 +17,7 @@ class DblBundleTextCorpus(ScriptureTextCorpus):
def __init__(self, filename: StrPath) -> None:
with ZipFile(filename, "r") as archive:
with archive.open("metadata.xml", "r") as stream:
doc = etree.parse(stream)
doc = ElementTree.parse(stream)
version = doc.getroot().get("version", "2.0")
parts = version.split(".", maxsplit=3)
if f"{parts[0]}.{parts[1]}" not in DblBundleTextCorpus._SUPPORTED_VERSIONS:
Expand Down
7 changes: 4 additions & 3 deletions machine/corpora/parallel_text_corpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
Generator,
Iterable,
List,
Literal,
Optional,
Sequence,
Tuple,
Expand Down Expand Up @@ -185,22 +186,22 @@ def _detokenize(row: ParallelTextRow) -> ParallelTextRow:

return self.transform(_detokenize, is_target_tokenized=False)

def normalize(self, normalization_form: str) -> ParallelTextCorpus:
def normalize(self, normalization_form: Literal["NFC", "NFD", "NFKC", "NFKD"]) -> ParallelTextCorpus:
def _normalize(row: ParallelTextRow) -> ParallelTextRow:
row.source_segment = normalize(normalization_form, row.source_segment)
row.target_segment = normalize(normalization_form, row.target_segment)
return row

return self.transform(_normalize)

def normalize_source(self, normalization_form: str) -> ParallelTextCorpus:
def normalize_source(self, normalization_form: Literal["NFC", "NFD", "NFKC", "NFKD"]) -> ParallelTextCorpus:
def _normalize(row: ParallelTextRow) -> ParallelTextRow:
row.source_segment = normalize(normalization_form, row.source_segment)
return row

return self.transform(_normalize)

def normalize_target(self, normalization_form: str) -> ParallelTextCorpus:
def normalize_target(self, normalization_form: Literal["NFC", "NFD", "NFKC", "NFKD"]) -> ParallelTextCorpus:
def _normalize(row: ParallelTextRow) -> ParallelTextRow:
row.target_segment = normalize(normalization_form, row.target_segment)
return row
Expand Down
10 changes: 5 additions & 5 deletions machine/corpora/paratext_backup_terms_corpus.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import re
import xml.etree.ElementTree as ET
from typing import Dict, List, Optional
from xml.etree import ElementTree
from zipfile import ZipFile

from .corpora_utils import get_entry
Expand All @@ -23,20 +23,20 @@ def __init__(self, filename: str, term_categories: List[str]) -> None:
settings = settings_parser.parse()

with archive.open(terms_file_entry) as key_terms_file:
term_renderings_tree = ET.parse(key_terms_file)
term_renderings_tree = ElementTree.parse(key_terms_file)

biblical_terms_file_entry = get_entry(archive, settings.biblical_terms_file_name)
if settings.biblical_terms_list_type in _PREDEFINED_TERMS_LIST_TYPES:
with open(settings.biblical_terms_file_name, "rb") as key_terms_file:
biblical_terms_tree = ET.parse(key_terms_file)
biblical_terms_tree = ElementTree.parse(key_terms_file)
term_id_to_category_dict = _get_category_per_id(biblical_terms_tree)
elif (
settings.biblical_terms_list_type == "Project"
and settings.biblical_terms_project_name == settings.name
and biblical_terms_file_entry is not None
):
with archive.open(biblical_terms_file_entry) as key_terms_file:
biblical_terms_tree = ET.parse(key_terms_file)
biblical_terms_tree = ElementTree.parse(key_terms_file)
term_id_to_category_dict = _get_category_per_id(biblical_terms_tree)
else:
term_id_to_category_dict = {}
Expand Down Expand Up @@ -96,7 +96,7 @@ def _strip_parens(term_string: str, left: str = "(", right: str = ")") -> str:
return term_string


def _get_category_per_id(biblical_terms_tree: ET.ElementTree) -> Dict[str, Optional[str]]:
def _get_category_per_id(biblical_terms_tree: ElementTree.ElementTree) -> Dict[str, Optional[str]]:
term_id_to_category_dict = {}
for e in biblical_terms_tree.iter(".//Term"):
category_element = e.find("Category")
Expand Down
4 changes: 2 additions & 2 deletions machine/corpora/paratext_project_settings_parser_base.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import xml.etree.ElementTree as ET
from abc import ABC, abstractmethod
from typing import BinaryIO
from xml.etree import ElementTree

from ..scripture.verse_ref import Versification
from ..utils.string_utils import parse_integer
Expand Down Expand Up @@ -30,7 +30,7 @@ def parse(self) -> ParatextProjectSettings:
if not settings_file_name:
raise ValueError("The project does not contain a settings file.")
with self.open(settings_file_name) as stream:
settings_tree = ET.parse(stream)
settings_tree = ElementTree.parse(stream)

name = settings_tree.getroot().findtext("Name", "")
full_name = settings_tree.getroot().findtext("FullName", "")
Expand Down
4 changes: 2 additions & 2 deletions machine/corpora/text_corpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

from abc import abstractmethod
from itertools import islice
from typing import Any, Callable, Generator, Iterable, Optional, Tuple
from typing import Any, Callable, Generator, Iterable, Literal, Optional, Tuple

from ..tokenization.detokenizer import Detokenizer
from ..tokenization.tokenizer import Tokenizer
Expand Down Expand Up @@ -60,7 +60,7 @@ def _detokenize(row: TextRow) -> TextRow:

return self.transform(_detokenize, is_tokenized=False)

def normalize(self, normalization_form: str) -> TextCorpus:
def normalize(self, normalization_form: Literal["NFC", "NFD", "NFKC", "NFKD"]) -> TextCorpus:
def _normalize(row: TextRow) -> TextRow:
row.segment = normalize(normalization_form, row.segment)
return row
Expand Down
4 changes: 2 additions & 2 deletions machine/corpora/token_processors.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import unicodedata
from typing import Sequence
from typing import Literal, Sequence


def lowercase(tokens: Sequence[str]) -> Sequence[str]:
Expand All @@ -14,7 +14,7 @@ def unescape_spaces(tokens: Sequence[str]) -> Sequence[str]:
return [(" " if t == "<space>" else t) for t in tokens]


def normalize(normalization_form: str, tokens: Sequence[str]) -> Sequence[str]:
def normalize(normalization_form: Literal["NFC", "NFD", "NFKC", "NFKD"], tokens: Sequence[str]) -> Sequence[str]:
return [unicodedata.normalize(normalization_form, t) for t in tokens]


Expand Down
4 changes: 2 additions & 2 deletions machine/corpora/usx_file_alignment_collection.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
import xml.etree.ElementTree as etree
from collections import defaultdict
from dataclasses import dataclass, field
from pathlib import Path
from typing import DefaultDict, Generator, List, Optional, Sequence, Set, Tuple
from xml.etree import ElementTree

from ..annotations.range import Range
from ..scripture.verse_ref import VerseRef, Versification
Expand Down Expand Up @@ -127,7 +127,7 @@ class _RangeInfo:


def _get_links(word_tokenizer: RangeTokenizer[str, int, str], tokens: Sequence[UsxToken]) -> DefaultDict[str, Set[int]]:
prev_para_elem: Optional[etree.Element] = None
prev_para_elem: Optional[ElementTree.Element] = None
text = ""
link_strs: List[Tuple[Range[int], str]] = []
for token in tokens:
Expand Down
6 changes: 3 additions & 3 deletions machine/corpora/usx_token.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
import xml.etree.ElementTree as etree
from dataclasses import dataclass
from typing import Optional
from xml.etree import ElementTree


@dataclass(frozen=True)
class UsxToken:
para_element: etree.Element
para_element: ElementTree.Element
text: str
element: Optional[etree.Element]
element: Optional[ElementTree.Element]

def __repr__(self) -> str:
return self.text
Loading
Loading