Skip to content

Commit

Permalink
Add pyright to check python types (#952)
Browse files Browse the repository at this point in the history
* Add pyright as a dependency

* Add pyright to Taskfile and to CI

* Add the utils folder to be type checked

* Fix the dependencies for type checking
  • Loading branch information
gregtatum authored Dec 11, 2024
1 parent b908760 commit 0f2268f
Show file tree
Hide file tree
Showing 14 changed files with 172 additions and 156 deletions.
9 changes: 9 additions & 0 deletions Taskfile.yml
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,14 @@ tasks:
cmds:
- poetry run black . {{.CLI_ARGS}}

lint-pyright:
desc: Runs type checking on python files.
deps:
cmds:
# For type checking, all of the various packages need to be installed
- poetry install --no-root --only pyright --only utils
- poetry run pyright {{.CLI_ARGS}}

lint-ruff:
desc: Lints the Python code with the ruff linter.
deps: [poetry-install-lint]
Expand Down Expand Up @@ -154,6 +162,7 @@ tasks:
- task: lint-eslint
- task: lint-black
- task: lint-ruff
- task: lint-pyright

test:
desc: Run python pytests in the current host.
Expand Down
2 changes: 1 addition & 1 deletion pipeline/common/downloads.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
logger = get_logger(__file__)


def stream_download_to_file(url: str, destination: str) -> None:
def stream_download_to_file(url: str, destination: Union[str, Path]) -> None:
"""
Streams a download to a file, and retries several times if there are any failures. The
destination file must not already exist.
Expand Down
66 changes: 51 additions & 15 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

19 changes: 17 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,9 @@ zstandard = "^0.22.0"
# https://github.com/mozilla/translations/issues/689
numpy = "<2"

[tool.poetry.group.pyright.dependencies]
pyright = "^1.1.390"

[tool.poetry.group.black.dependencies]
black = "^23.7.0"

Expand All @@ -36,14 +39,16 @@ requests="^2.26.0"
humanize = "^4.9.0"
blessed = "^1.20.0"
huggingface-hub = "^0.20.3"
websocket_client ="*"
websocket_client ="^1.8.0"
PyGithub="2.4.0"
pyperclip="1.9.0"
ruamel-yaml = "^0.18.6"
taskcluster = "^56.0.3"
taskcluster-taskgraph = "^11.1.0"

# This install group is for running tests. Note that any dependencies in the
# pipeline are installed separately through the run_task test abstraction. This
# list is only for things imported directly in the tests.
ruamel-yaml = "^0.18.6"
[tool.poetry.group.tests.dependencies]
sacrebleu="2.4.2"
mtdata="0.4.1"
Expand Down Expand Up @@ -105,3 +110,13 @@ markers = [
# task test -- -m "not slow"
"slow: Tests that run slower"
]

[tool.pyright]
# When adding directories, add the entire subfolder to "include", and then
# exclude individual files that still need typing. This will make it so that all
# new files default to being typed.
include = [
"utils/**/*"
]
exclude = []
pythonVersion="3.10"
14 changes: 14 additions & 0 deletions taskcluster/kinds/tests/kind.yml
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,20 @@ tasks:
task lint-black
run-on-tasks-for: ["github-push", "github-pull-request"]

lint-pyright:
# Run python's pyright type checker.
worker-type: b-cpu
worker:
max-run-time: 3600
docker-image: {in-tree: test}
run:
command:
- bash
- -c
- >-
task lint-pyright
run-on-tasks-for: ["github-push", "github-pull-request"]

lint-ruff:
# Run ruff, a python linter.
worker-type: b-cpu
Expand Down
2 changes: 1 addition & 1 deletion utils/build-mono-nllb.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ def compute_hashes_in_parallel_data(parallel_path: Path, lang: str):
sentence_hashes: set[int] = set()
sentences_visited = 0

with zipfile.ZipFile(parallel_path.open(), "r") as zip_ref:
with zipfile.ZipFile(parallel_path.open(), "r") as zip_ref: # type: ignore
with zip_ref.open(f"NLLB.en-{lang}.{lang}") as mono_file:
for line_bytes in mono_file:
sentences_visited += 1
Expand Down
36 changes: 26 additions & 10 deletions utils/config_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,9 @@
import sys
from io import StringIO
from pathlib import Path
from typing import Literal, Union
from typing import Any, Literal, Union

import humanize
import ruamel.yaml

from pipeline.common.downloads import get_download_size, location_exists
Expand Down Expand Up @@ -103,7 +104,7 @@ def get_git_revision_hash(remote_branch: str) -> str:


def update_config(
prod_config: any, name: str, source: str, target: str, fast: bool
prod_config: Any, name: str, source: str, target: str, fast: bool
) -> dict[str, str]:
experiment = prod_config["experiment"]

Expand Down Expand Up @@ -166,7 +167,11 @@ def update_config(


def add_train_data(
source: str, target: str, datasets: list[str], comment_section: dict[str, str], fast: bool
source: str,
target: str,
datasets: dict[str, list[str]],
comment_section: dict[str, str],
fast: bool,
):
opus_datasets = fetch_opus(source, target)
total_sentences = 0
Expand Down Expand Up @@ -195,7 +200,7 @@ def add_train_data(
total_sentences += sentences
corpus_key = dataset.corpus_key()
datasets["train"].append(corpus_key)
datasets["train"].yaml_add_eol_comment(
datasets["train"].yaml_add_eol_comment( # type: ignore
f"{sentences:,} sentences".rjust(70 - len(corpus_key), " "),
len(datasets["train"]) - 1,
)
Expand Down Expand Up @@ -230,7 +235,18 @@ def add_train_data(
# Just add the dataset when in fast mode.
dataset.append(corpus_key)
else:
byte_size, display_size = get_remote_file_size(entry.url)
byte_size = None
display_size = None
if isinstance(entry.url, tuple):
size_a = get_remote_file_size(entry.url[0])[0]
size_b = get_remote_file_size(entry.url[1])[0]
if size_a and size_b:
byte_size = size_a + size_b
display_size = humanize.naturalsize(byte_size)

else:
byte_size, display_size = get_remote_file_size(entry.url)

if byte_size is None:
# There was a network error, skip the dataset.
skipped_datasets.append(f"{corpus_key} - Error fetching ({entry.url})")
Expand All @@ -240,13 +256,13 @@ def add_train_data(
sentences = estimate_sentence_size(byte_size)
dataset.append(corpus_key)
if byte_size:
dataset.yaml_add_eol_comment(
dataset.yaml_add_eol_comment( # type: ignore
f"~{sentences:,} sentences ".rjust(70 - len(corpus_key), " ")
+ f"({display_size})",
len(datasets["train"]) - 1,
)
else:
dataset.yaml_add_eol_comment(
dataset.yaml_add_eol_comment( # type: ignore
"No Content-Length reported ".rjust(70 - len(corpus_key), " ")
+ f"({entry.url})",
len(datasets["train"]) - 1,
Expand Down Expand Up @@ -356,15 +372,15 @@ def add_mono_data(
lang: str,
direction: Union[Literal["src"], Literal["trg"]],
datasets: dict[str, list[str]],
experiment: any,
experiment: Any,
comment_section: dict[str, str],
):
mono_datasets = datasets[f"mono-{direction}"]
max_per_dataset: int = experiment[f"mono-max-sentences-{direction}"]["per-dataset"]

def add_comment(dataset_name: str, comment: str):
"""Add a right justified comment to a dataset."""
mono_datasets.yaml_add_eol_comment(
mono_datasets.yaml_add_eol_comment( # type: ignore
comment.rjust(50 - len(dataset_name), " "),
len(mono_datasets) - 1,
)
Expand Down Expand Up @@ -422,7 +438,7 @@ def add_comment(dataset_name: str, comment: str):
comment_section[f" mono-{direction}:"] = comment


def strip_comments(yaml_text: str) -> list[str]:
def strip_comments(yaml_text: str) -> str:
"""
ruamel.yaml preserves key ordering and comments. This function strips out the comments
Expand Down
Loading

0 comments on commit 0f2268f

Please sign in to comment.