From 59bd0cecfef224b73199150f5dfd684701cf54f6 Mon Sep 17 00:00:00 2001 From: varisd Date: Fri, 28 Jun 2024 17:27:52 +0200 Subject: [PATCH] branch rebase + pre-commit fixes --- opuspocus/pipeline_steps/clean.py | 15 ++---- opuspocus/pipeline_steps/corpus_step.py | 5 +- opuspocus/pipeline_steps/merge.py | 3 +- opuspocus/pipeline_steps/translate.py | 35 +++++-------- opuspocus/pipelines/opuspocus_pipeline.py | 4 +- opuspocus/runners/bash.py | 2 +- tests/conftest.py | 63 ++++++++++++----------- tests/test_opuspocus_pipeline.py | 32 +++++------- tests/test_pipeline_steps.py | 20 +++---- tests/test_runners.py | 21 ++++---- 10 files changed, 82 insertions(+), 118 deletions(-) diff --git a/opuspocus/pipeline_steps/clean.py b/opuspocus/pipeline_steps/clean.py index c63c4da..c6416ec 100644 --- a/opuspocus/pipeline_steps/clean.py +++ b/opuspocus/pipeline_steps/clean.py @@ -48,9 +48,7 @@ def register_categories(self) -> None: OpusCleaner server app creates a categories.json file listing locally available datasets and their user-specified categorization. """ - shutil.copy( - self.prev_corpus_step.categories_path, self.categories_path - ) + shutil.copy(self.prev_corpus_step.categories_path, self.categories_path) def get_command_targets(self) -> List[Path]: return [ @@ -64,9 +62,7 @@ def command(self, target_file: Path) -> None: dataset = ".".join(str(target_filename).split(".")[:-2]) input_file = Path(self.input_dir, "{}.filters.json".format(dataset)) - opuscleaner_bin_path = Path( - self.python_venv_dir, "bin", self.opuscleaner_cmd - ) + opuscleaner_bin_path = Path(self.python_venv_dir, "bin", self.opuscleaner_cmd) # Run OpusCleaner proc = subprocess.Popen( @@ -86,8 +82,7 @@ def command(self, target_file: Path) -> None: # Get the correct order of languages languages = [ - file.split(".")[-2] - for file in json.load(open(input_file, "r"))["files"] + file.split(".")[-2] for file in json.load(open(input_file, "r"))["files"] ] # Split OpusCleaner output into files @@ -100,6 +95,4 @@ def command(self, target_file: Path) -> None: # Check the return code rc = proc.poll() if rc: - raise Exception( - "Process {} exited with non-zero value.".format(proc.pid) - ) + raise Exception("Process {} exited with non-zero value.".format(proc.pid)) diff --git a/opuspocus/pipeline_steps/corpus_step.py b/opuspocus/pipeline_steps/corpus_step.py index cd582fb..a23f15e 100644 --- a/opuspocus/pipeline_steps/corpus_step.py +++ b/opuspocus/pipeline_steps/corpus_step.py @@ -134,10 +134,7 @@ def shard_index(self) -> Optional[Dict[str, List[Path]]]: def save_shard_dict(self, shard_dict: Dict[str, List[str]]) -> None: assert self.is_sharded - yaml.dump( - shard_dict, - open(Path(self.shard_dir, self.shard_index_file), "w") - ) + yaml.dump(shard_dict, open(Path(self.shard_dir, self.shard_index_file), "w")) def get_shard_list(self, dset_filename: str) -> List[Path]: assert self.shard_index diff --git a/opuspocus/pipeline_steps/merge.py b/opuspocus/pipeline_steps/merge.py index 3551fa9..20317f3 100644 --- a/opuspocus/pipeline_steps/merge.py +++ b/opuspocus/pipeline_steps/merge.py @@ -56,8 +56,7 @@ def other_corpus_step(self) -> CorpusStep: def register_categories(self) -> None: categories_dict = {} categories_dict["categories"] = [ - {"name" : cat} - for cat in self.prev_corpus_step.categories + {"name": cat} for cat in self.prev_corpus_step.categories ] # Merge the category lists diff --git a/opuspocus/pipeline_steps/translate.py b/opuspocus/pipeline_steps/translate.py index 530a74a..c33bfec 100644 --- a/opuspocus/pipeline_steps/translate.py +++ b/opuspocus/pipeline_steps/translate.py @@ -12,7 +12,7 @@ from opuspocus.pipeline_steps.corpus_step import CorpusStep from opuspocus.pipeline_steps.opuspocus_step import OpusPocusStep from opuspocus.pipeline_steps.train_model import TrainModelStep -from opuspocus.utils import RunnerResources, save_filestream, subprocess_wait +from opuspocus.utils import RunnerResources, save_filestream logger = logging.getLogger(__name__) @@ -59,8 +59,7 @@ def _inherits_sharded(self) -> bool: def model_config_path(self) -> Path: return Path( "{}.{}.npz.decoder.yml".format( - self.model_step.model_path, - self.model_suffix + self.model_step.model_path, self.model_suffix ) ) @@ -75,14 +74,12 @@ def infer_input(self, tgt_file: Path) -> Path: src_filename = ".".join( tgt_filename.split(".")[:-offset] + [self.src_lang] - + tgt_filename.split(".")[-(offset - 1):] + + tgt_filename.split(".")[-(offset - 1) :] ) if self.prev_corpus_step.is_sharded: src_file = Path(self.shard_dir, src_filename) if not src_file.exists(): - src_file.hardlink_to( - Path(self.input_shard_dir, src_filename) - ) + src_file.hardlink_to(Path(self.input_shard_dir, src_filename)) else: src_file = Path(self.output_dir, src_filename) if not src_file.exists(): @@ -96,10 +93,9 @@ def get_command_targets(self) -> List[Path]: targets = [] for dset in self.dataset_list: dset_filename = "{}.{}.gz".format(dset, self.tgt_lang) - targets.extend([ - shard_file - for shard_file in self.get_shard_list(dset_filename) - ]) + targets.extend( + [shard_file for shard_file in self.get_shard_list(dset_filename)] + ) return targets return [ Path(self.output_dir, "{}.{}.gz".format(dset, self.tgt_lang)) @@ -117,10 +113,9 @@ def command_preprocess(self) -> None: ) shard_dict[d_fname_target] = [ ".".join( - shard.split(".")[:-3] - + [self.tgt_lang] - + shard.split(".")[-2:] - ) for shard in s_fname_list + shard.split(".")[:-3] + [self.tgt_lang] + shard.split(".")[-2:] + ) + for shard in s_fname_list ] self.save_shard_dict(shard_dict) @@ -158,11 +153,7 @@ def command(self, target_file: Path) -> None: # Execute the command proc = subprocess.Popen( - cmd, - stdout=subprocess.PIPE, - stderr=sys.stderr, - env=env, - text=True + cmd, stdout=subprocess.PIPE, stderr=sys.stderr, env=env, text=True ) def terminate_signal(signalnum, handler): @@ -176,6 +167,4 @@ def terminate_signal(signalnum, handler): # Check the return code rc = proc.poll() if rc: - raise Exception( - "Process {} exited with non-zero value.".format(proc.pid) - ) + raise Exception("Process {} exited with non-zero value.".format(proc.pid)) diff --git a/opuspocus/pipelines/opuspocus_pipeline.py b/opuspocus/pipelines/opuspocus_pipeline.py index e4f3095..547b12d 100644 --- a/opuspocus/pipelines/opuspocus_pipeline.py +++ b/opuspocus/pipelines/opuspocus_pipeline.py @@ -82,9 +82,7 @@ def load_pipeline( "Pipeline directory ({}) does not exist.".format(pipeline_dir) ) if not pipeline_dir.is_dir(): - raise NotADirectoryError( - "{} is not a directory.".format(pipeline_dir) - ) + raise NotADirectoryError("{} is not a directory.".format(pipeline_dir)) pipeline_config_path = Path(pipeline_dir, cls.config_file) return cls(pipeline_config_path, pipeline_dir) diff --git a/opuspocus/runners/bash.py b/opuspocus/runners/bash.py index ac7ac5f..4a6c598 100644 --- a/opuspocus/runners/bash.py +++ b/opuspocus/runners/bash.py @@ -4,7 +4,7 @@ import subprocess import sys from pathlib import Path -from psutil import NoSuchProcess, Process, wait_procs +from psutil import Process, wait_procs from opuspocus.runners import OpusPocusRunner, TaskId, register_runner from opuspocus.utils import RunnerResources, subprocess_wait diff --git a/tests/conftest.py b/tests/conftest.py index 6dd4713..866fd69 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -20,68 +20,71 @@ def pipeline_dir(tmp_path_factory): @pytest.fixture(scope="session") def data_train_minimal(tmp_path_factory, languages): src_file = Path( - tmp_path_factory.mktemp("data"), - "-".join(languages), - "minimal", - "train.src" + tmp_path_factory.mktemp("data"), "-".join(languages), "minimal", "train.src" ) src_file.parent.mkdir(parents=True) with open(src_file, "w") as fh: - print("\n".join([ - "the colorless ideas slept furiously", - "pooh slept all night", - "working class hero is something to be", - "I am the working class walrus", - "walrus for president" - ]), file=fh) + print( + "\n".join( + [ + "the colorless ideas slept furiously", + "pooh slept all night", + "working class hero is something to be", + "I am the working class walrus", + "walrus for president", + ] + ), + file=fh, + ) tgt_file = Path(src_file.parent, "train.tgt") with open(tgt_file, "w") as fh: - print("\n".join([ - "les idées incolores dormaient furieusement", - "le caniche dormait toute la nuit", - "le héros de la classe ouvrière est quelque chose à être", - "Je suis le morse de la classe ouvrière", - "morse pour président" - ]), file=fh) + print( + "\n".join( + [ + "les idées incolores dormaient furieusement", + "le caniche dormait toute la nuit", + "le héros de la classe ouvrière est quelque chose à être", + "Je suis le morse de la classe ouvrière", + "morse pour président", + ] + ), + file=fh, + ) return (src_file, tgt_file) ### Change the following when dataclasses are properly implemented + @pytest.fixture(scope="session") -def config_minimal( - tmp_path_factory, - data_train_minimal, - pipeline_dir, - languages -): - config_file = Path( - tmp_path_factory.mktemp("test_configs"), "config_minimal.yml" - ) +def config_minimal(tmp_path_factory, data_train_minimal, pipeline_dir, languages): + config_file = Path(tmp_path_factory.mktemp("test_configs"), "config_minimal.yml") src_file, tgt_file = data_train_minimal step_label = "raw." + "-".join(languages) config = { "pipeline": { "pipeline_dir": str(pipeline_dir), - "steps" : [ + "steps": [ { "step": "raw", "step_label": step_label, "src_lang": languages[0], "tgt_lang": languages[1], - "raw_data_dir": str(data_train_minimal[0].parent) + "raw_data_dir": str(data_train_minimal[0].parent), } ], - "default_targets": [step_label] + "default_targets": [step_label], } } yaml.dump(config, open(config_file, "w")) return config_file + ### + @pytest.fixture(scope="module") def pipeline_minimal(config_minimal, pipeline_dir): return OpusPocusPipeline(config_minimal, pipeline_dir) diff --git a/tests/test_opuspocus_pipeline.py b/tests/test_opuspocus_pipeline.py index 18b9171..da1e20c 100644 --- a/tests/test_opuspocus_pipeline.py +++ b/tests/test_opuspocus_pipeline.py @@ -2,46 +2,39 @@ from argparse import Namespace from pathlib import Path -from opuspocus.pipelines import build_pipeline, load_pipeline, OpusPocusPipeline +from opuspocus.pipelines import build_pipeline, load_pipeline def test_build_pipeline_method(config_minimal, pipeline_minimal, pipeline_dir): - args = Namespace(**{ - "pipeline_config": config_minimal, - "pipline_dir": pipeline_dir - }) + args = Namespace(**{"pipeline_config": config_minimal, "pipline_dir": pipeline_dir}) pipeline = build_pipeline(args) assert pipeline == pipeline_minimal def test_load_pipeline_method(pipeline_minimal_initialized): - args = Namespace(**{ - "pipeline_dir": pipeline_dir - }) + args = Namespace(**{"pipeline_dir": pipeline_minimal_initialized.pipeline_dir}) pipeline = load_pipeline(args) - assert pipeline == saved_pipeline + assert pipeline == pipeline_minimal_initialized def test_load_pipeline_dir_not_exist(): - args = Namespace(**{ - "pipeline_dir": Path("nonexistent", "directory") - }) + args = Namespace(**{"pipeline_dir": Path("nonexistent", "directory")}) with pytest.raises(FileNotFoundError): load_pipeline(args) def test_load_pipeline_dir_not_directory(tmp_path): - args = Namespace(**{ - "pipeline_dir": tmp_path, - }) + args = Namespace( + **{ + "pipeline_dir": tmp_path, + } + ) with pytest.raises(NotADirectoryError): load_pipeline(args) + def test_pipeline_class_init_graph(config_minimal, pipeline_minimal): - config_labels = [ - s["step_label"] - for s in config_minimal["pipeline"]["steps"] - ] + config_labels = [s["step_label"] for s in config_minimal["pipeline"]["steps"]] assert len(config_labels) == len(pipeline_minimal.steps) for s in pipeline_minimal.steps: assert s.step_label in config_labels @@ -61,5 +54,4 @@ def test_pipeline_class_init_default_targets(config_minimal, pipeline_minimal): # - cycles - # TODO: test status/traceback? diff --git a/tests/test_pipeline_steps.py b/tests/test_pipeline_steps.py index ccf858f..0bda5ce 100644 --- a/tests/test_pipeline_steps.py +++ b/tests/test_pipeline_steps.py @@ -1,11 +1,9 @@ import pytest -from argparse import Namespace from opuspocus.pipeline_steps import ( - OpusPocusStep, build_step, - load_step, STEP_REGISTRY, ) +from opuspocus.pipeline_steps.raw import RawStep @pytest.fixture(scope="function", params=STEP_REGISTRY.keys()) @@ -13,27 +11,25 @@ def step_default(step, tmp_path_factory): return build_step( step.param, "{}.test".format(step.param), - tmp_path_factory.mktemp("empty_pipeline") + tmp_path_factory.mktemp("empty_pipeline"), ) -#def test_parameter_save_load(step_default): - - -#def test_list_parameters(): +# def test_parameter_save_load(step_default): +# def test_list_parameters(): @pytest.fixture(scope="session") -def mock_step_parameters(): +def mock_step_parameters(languages, data_train_minimal): src_lang, tgt_lang = languages return { "step": "raw", "step_label": "raw.en", "src_lang": src_lang, "tgt_lang": tgt_lang, - "raw_data_dir": str(data_train_minimal[0].parent) + "raw_data_dir": str(data_train_minimal[0].parent), } @@ -48,8 +44,8 @@ def mock_step_inited(step): return step -#def test_build_method(): +# def test_build_method(): # build_step(** -#def test_load_method(): +# def test_load_method(): diff --git a/tests/test_runners.py b/tests/test_runners.py index fe54c64..461fdb9 100644 --- a/tests/test_runners.py +++ b/tests/test_runners.py @@ -1,11 +1,5 @@ import pytest -from argparse import Namespace -from opuspocus.runners import ( - OpusPocusRunner, - build_runner, - load_runner, - RUNNER_REGISTRY -) +from opuspocus.runners import build_runner, RUNNER_REGISTRY @pytest.fixture(scope="function", params=RUNNER_REGISTRY.keys()) @@ -17,7 +11,7 @@ def runner_default(runner, pipeline_minimal_initialized): @pytest.fixture(scope="function") -def runner_bash(): +def runner_bash(pipeline_minimal_initialized): return build_runner( "bash", pipeline_minimal_initialized.pipeline_dir, @@ -26,14 +20,17 @@ def runner_bash(): def test_parameter_save_load(runner_default): runner_default.save_parameters() - params_load = runner_default.load_parameters() + # params_load = runner_default.load_parameters() + pass + + # for runner_default -#def test_list_parameters(): +# def test_list_parameters(): -#def test_load_runner_method(): +# def test_load_runner_method(): -#def test_pipeline_class_...(): +# def test_pipeline_class_...():