Skip to content

Commit

Permalink
Add pyright to check python types (#952)
Browse files Browse the repository at this point in the history
* Add pyright as a dependency

* Add pyright to Taskfile and to CI

* Add the utils folder to be type checked

* Fix the dependencies for type checking
  • Loading branch information
gregtatum authored Dec 11, 2024
1 parent b908760 commit 0f2268f
Show file tree
Hide file tree
Showing 14 changed files with 172 additions and 156 deletions.
9 changes: 9 additions & 0 deletions Taskfile.yml
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,14 @@ tasks:
cmds:
- poetry run black . {{.CLI_ARGS}}

lint-pyright:
desc: Runs type checking on python files.
deps:
cmds:
# For type checking, all of the various packages need to be installed
- poetry install --no-root --only pyright --only utils
- poetry run pyright {{.CLI_ARGS}}

lint-ruff:
desc: Lints the Python code with the ruff linter.
deps: [poetry-install-lint]
Expand Down Expand Up @@ -154,6 +162,7 @@ tasks:
- task: lint-eslint
- task: lint-black
- task: lint-ruff
- task: lint-pyright

test:
desc: Run python pytests in the current host.
Expand Down
2 changes: 1 addition & 1 deletion pipeline/common/downloads.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
logger = get_logger(__file__)


def stream_download_to_file(url: str, destination: str) -> None:
def stream_download_to_file(url: str, destination: Union[str, Path]) -> None:
"""
Streams a download to a file, and retries several times if there are any failures. The
destination file must not already exist.
Expand Down
66 changes: 51 additions & 15 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

19 changes: 17 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,9 @@ zstandard = "^0.22.0"
# https://github.com/mozilla/translations/issues/689
numpy = "<2"

[tool.poetry.group.pyright.dependencies]
pyright = "^1.1.390"

[tool.poetry.group.black.dependencies]
black = "^23.7.0"

Expand All @@ -36,14 +39,16 @@ requests="^2.26.0"
humanize = "^4.9.0"
blessed = "^1.20.0"
huggingface-hub = "^0.20.3"
websocket_client ="*"
websocket_client ="^1.8.0"
PyGithub="2.4.0"
pyperclip="1.9.0"
ruamel-yaml = "^0.18.6"
taskcluster = "^56.0.3"
taskcluster-taskgraph = "^11.1.0"

# This install group is for running tests. Note that any dependencies in the
# pipeline are installed separately through the run_task test abstraction. This
# list is only for things imported directly in the tests.
ruamel-yaml = "^0.18.6"
[tool.poetry.group.tests.dependencies]
sacrebleu="2.4.2"
mtdata="0.4.1"
Expand Down Expand Up @@ -105,3 +110,13 @@ markers = [
# task test -- -m "not slow"
"slow: Tests that run slower"
]

[tool.pyright]
# When adding directories, add the entire subfolder to "include", and then
# exclude individual files that still need typing. This will make it so that all
# new files default to being typed.
include = [
"utils/**/*"
]
exclude = []
pythonVersion="3.10"
14 changes: 14 additions & 0 deletions taskcluster/kinds/tests/kind.yml
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,20 @@ tasks:
task lint-black
run-on-tasks-for: ["github-push", "github-pull-request"]

lint-pyright:
# Run python's pyright type checker.
worker-type: b-cpu
worker:
max-run-time: 3600
docker-image: {in-tree: test}
run:
command:
- bash
- -c
- >-
task lint-pyright
run-on-tasks-for: ["github-push", "github-pull-request"]

lint-ruff:
# Run ruff, a python linter.
worker-type: b-cpu
Expand Down
2 changes: 1 addition & 1 deletion utils/build-mono-nllb.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ def compute_hashes_in_parallel_data(parallel_path: Path, lang: str):
sentence_hashes: set[int] = set()
sentences_visited = 0

with zipfile.ZipFile(parallel_path.open(), "r") as zip_ref:
with zipfile.ZipFile(parallel_path.open(), "r") as zip_ref: # type: ignore
with zip_ref.open(f"NLLB.en-{lang}.{lang}") as mono_file:
for line_bytes in mono_file:
sentences_visited += 1
Expand Down
36 changes: 26 additions & 10 deletions utils/config_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,9 @@
import sys
from io import StringIO
from pathlib import Path
from typing import Literal, Union
from typing import Any, Literal, Union

import humanize
import ruamel.yaml

from pipeline.common.downloads import get_download_size, location_exists
Expand Down Expand Up @@ -103,7 +104,7 @@ def get_git_revision_hash(remote_branch: str) -> str:


def update_config(
prod_config: any, name: str, source: str, target: str, fast: bool
prod_config: Any, name: str, source: str, target: str, fast: bool
) -> dict[str, str]:
experiment = prod_config["experiment"]

Expand Down Expand Up @@ -166,7 +167,11 @@ def update_config(


def add_train_data(
source: str, target: str, datasets: list[str], comment_section: dict[str, str], fast: bool
source: str,
target: str,
datasets: dict[str, list[str]],
comment_section: dict[str, str],
fast: bool,
):
opus_datasets = fetch_opus(source, target)
total_sentences = 0
Expand Down Expand Up @@ -195,7 +200,7 @@ def add_train_data(
total_sentences += sentences
corpus_key = dataset.corpus_key()
datasets["train"].append(corpus_key)
datasets["train"].yaml_add_eol_comment(
datasets["train"].yaml_add_eol_comment( # type: ignore
f"{sentences:,} sentences".rjust(70 - len(corpus_key), " "),
len(datasets["train"]) - 1,
)
Expand Down Expand Up @@ -230,7 +235,18 @@ def add_train_data(
# Just add the dataset when in fast mode.
dataset.append(corpus_key)
else:
byte_size, display_size = get_remote_file_size(entry.url)
byte_size = None
display_size = None
if isinstance(entry.url, tuple):
size_a = get_remote_file_size(entry.url[0])[0]
size_b = get_remote_file_size(entry.url[1])[0]
if size_a and size_b:
byte_size = size_a + size_b
display_size = humanize.naturalsize(byte_size)

else:
byte_size, display_size = get_remote_file_size(entry.url)

if byte_size is None:
# There was a network error, skip the dataset.
skipped_datasets.append(f"{corpus_key} - Error fetching ({entry.url})")
Expand All @@ -240,13 +256,13 @@ def add_train_data(
sentences = estimate_sentence_size(byte_size)
dataset.append(corpus_key)
if byte_size:
dataset.yaml_add_eol_comment(
dataset.yaml_add_eol_comment( # type: ignore
f"~{sentences:,} sentences ".rjust(70 - len(corpus_key), " ")
+ f"({display_size})",
len(datasets["train"]) - 1,
)
else:
dataset.yaml_add_eol_comment(
dataset.yaml_add_eol_comment( # type: ignore
"No Content-Length reported ".rjust(70 - len(corpus_key), " ")
+ f"({entry.url})",
len(datasets["train"]) - 1,
Expand Down Expand Up @@ -356,15 +372,15 @@ def add_mono_data(
lang: str,
direction: Union[Literal["src"], Literal["trg"]],
datasets: dict[str, list[str]],
experiment: any,
experiment: Any,
comment_section: dict[str, str],
):
mono_datasets = datasets[f"mono-{direction}"]
max_per_dataset: int = experiment[f"mono-max-sentences-{direction}"]["per-dataset"]

def add_comment(dataset_name: str, comment: str):
"""Add a right justified comment to a dataset."""
mono_datasets.yaml_add_eol_comment(
mono_datasets.yaml_add_eol_comment( # type: ignore
comment.rjust(50 - len(dataset_name), " "),
len(mono_datasets) - 1,
)
Expand Down Expand Up @@ -422,7 +438,7 @@ def add_comment(dataset_name: str, comment: str):
comment_section[f" mono-{direction}:"] = comment


def strip_comments(yaml_text: str) -> list[str]:
def strip_comments(yaml_text: str) -> str:
"""
ruamel.yaml preserves key ordering and comments. This function strips out the comments
Expand Down
Loading

0 comments on commit 0f2268f

Please sign in to comment.