Skip to content

Commit

Permalink
branch rebase + pre-commit fixes
Browse files Browse the repository at this point in the history
  • Loading branch information
varisd committed Jun 28, 2024
1 parent 1af179f commit 59bd0ce
Show file tree
Hide file tree
Showing 10 changed files with 82 additions and 118 deletions.
15 changes: 4 additions & 11 deletions opuspocus/pipeline_steps/clean.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,9 +48,7 @@ def register_categories(self) -> None:
OpusCleaner server app creates a categories.json file listing locally
available datasets and their user-specified categorization.
"""
shutil.copy(
self.prev_corpus_step.categories_path, self.categories_path
)
shutil.copy(self.prev_corpus_step.categories_path, self.categories_path)

def get_command_targets(self) -> List[Path]:
return [
Expand All @@ -64,9 +62,7 @@ def command(self, target_file: Path) -> None:
dataset = ".".join(str(target_filename).split(".")[:-2])
input_file = Path(self.input_dir, "{}.filters.json".format(dataset))

opuscleaner_bin_path = Path(
self.python_venv_dir, "bin", self.opuscleaner_cmd
)
opuscleaner_bin_path = Path(self.python_venv_dir, "bin", self.opuscleaner_cmd)

# Run OpusCleaner
proc = subprocess.Popen(
Expand All @@ -86,8 +82,7 @@ def command(self, target_file: Path) -> None:

# Get the correct order of languages
languages = [
file.split(".")[-2]
for file in json.load(open(input_file, "r"))["files"]
file.split(".")[-2] for file in json.load(open(input_file, "r"))["files"]
]

# Split OpusCleaner output into files
Expand All @@ -100,6 +95,4 @@ def command(self, target_file: Path) -> None:
# Check the return code
rc = proc.poll()
if rc:
raise Exception(
"Process {} exited with non-zero value.".format(proc.pid)
)
raise Exception("Process {} exited with non-zero value.".format(proc.pid))
5 changes: 1 addition & 4 deletions opuspocus/pipeline_steps/corpus_step.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,10 +134,7 @@ def shard_index(self) -> Optional[Dict[str, List[Path]]]:

def save_shard_dict(self, shard_dict: Dict[str, List[str]]) -> None:
assert self.is_sharded
yaml.dump(
shard_dict,
open(Path(self.shard_dir, self.shard_index_file), "w")
)
yaml.dump(shard_dict, open(Path(self.shard_dir, self.shard_index_file), "w"))

def get_shard_list(self, dset_filename: str) -> List[Path]:
assert self.shard_index
Expand Down
3 changes: 1 addition & 2 deletions opuspocus/pipeline_steps/merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,8 +56,7 @@ def other_corpus_step(self) -> CorpusStep:
def register_categories(self) -> None:
categories_dict = {}
categories_dict["categories"] = [
{"name" : cat}
for cat in self.prev_corpus_step.categories
{"name": cat} for cat in self.prev_corpus_step.categories
]

# Merge the category lists
Expand Down
35 changes: 12 additions & 23 deletions opuspocus/pipeline_steps/translate.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
from opuspocus.pipeline_steps.corpus_step import CorpusStep
from opuspocus.pipeline_steps.opuspocus_step import OpusPocusStep
from opuspocus.pipeline_steps.train_model import TrainModelStep
from opuspocus.utils import RunnerResources, save_filestream, subprocess_wait
from opuspocus.utils import RunnerResources, save_filestream

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -59,8 +59,7 @@ def _inherits_sharded(self) -> bool:
def model_config_path(self) -> Path:
return Path(
"{}.{}.npz.decoder.yml".format(
self.model_step.model_path,
self.model_suffix
self.model_step.model_path, self.model_suffix
)
)

Expand All @@ -75,14 +74,12 @@ def infer_input(self, tgt_file: Path) -> Path:
src_filename = ".".join(
tgt_filename.split(".")[:-offset]
+ [self.src_lang]
+ tgt_filename.split(".")[-(offset - 1):]
+ tgt_filename.split(".")[-(offset - 1) :]
)
if self.prev_corpus_step.is_sharded:
src_file = Path(self.shard_dir, src_filename)
if not src_file.exists():
src_file.hardlink_to(
Path(self.input_shard_dir, src_filename)
)
src_file.hardlink_to(Path(self.input_shard_dir, src_filename))
else:
src_file = Path(self.output_dir, src_filename)
if not src_file.exists():
Expand All @@ -96,10 +93,9 @@ def get_command_targets(self) -> List[Path]:
targets = []
for dset in self.dataset_list:
dset_filename = "{}.{}.gz".format(dset, self.tgt_lang)
targets.extend([
shard_file
for shard_file in self.get_shard_list(dset_filename)
])
targets.extend(
[shard_file for shard_file in self.get_shard_list(dset_filename)]
)
return targets
return [
Path(self.output_dir, "{}.{}.gz".format(dset, self.tgt_lang))
Expand All @@ -117,10 +113,9 @@ def command_preprocess(self) -> None:
)
shard_dict[d_fname_target] = [
".".join(
shard.split(".")[:-3]
+ [self.tgt_lang]
+ shard.split(".")[-2:]
) for shard in s_fname_list
shard.split(".")[:-3] + [self.tgt_lang] + shard.split(".")[-2:]
)
for shard in s_fname_list
]
self.save_shard_dict(shard_dict)

Expand Down Expand Up @@ -158,11 +153,7 @@ def command(self, target_file: Path) -> None:

# Execute the command
proc = subprocess.Popen(
cmd,
stdout=subprocess.PIPE,
stderr=sys.stderr,
env=env,
text=True
cmd, stdout=subprocess.PIPE, stderr=sys.stderr, env=env, text=True
)

def terminate_signal(signalnum, handler):
Expand All @@ -176,6 +167,4 @@ def terminate_signal(signalnum, handler):
# Check the return code
rc = proc.poll()
if rc:
raise Exception(
"Process {} exited with non-zero value.".format(proc.pid)
)
raise Exception("Process {} exited with non-zero value.".format(proc.pid))
4 changes: 1 addition & 3 deletions opuspocus/pipelines/opuspocus_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,9 +82,7 @@ def load_pipeline(
"Pipeline directory ({}) does not exist.".format(pipeline_dir)
)
if not pipeline_dir.is_dir():
raise NotADirectoryError(
"{} is not a directory.".format(pipeline_dir)
)
raise NotADirectoryError("{} is not a directory.".format(pipeline_dir))

pipeline_config_path = Path(pipeline_dir, cls.config_file)
return cls(pipeline_config_path, pipeline_dir)
Expand Down
2 changes: 1 addition & 1 deletion opuspocus/runners/bash.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import subprocess
import sys
from pathlib import Path
from psutil import NoSuchProcess, Process, wait_procs
from psutil import Process, wait_procs

from opuspocus.runners import OpusPocusRunner, TaskId, register_runner
from opuspocus.utils import RunnerResources, subprocess_wait
Expand Down
63 changes: 33 additions & 30 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,68 +20,71 @@ def pipeline_dir(tmp_path_factory):
@pytest.fixture(scope="session")
def data_train_minimal(tmp_path_factory, languages):
src_file = Path(
tmp_path_factory.mktemp("data"),
"-".join(languages),
"minimal",
"train.src"
tmp_path_factory.mktemp("data"), "-".join(languages), "minimal", "train.src"
)
src_file.parent.mkdir(parents=True)
with open(src_file, "w") as fh:
print("\n".join([
"the colorless ideas slept furiously",
"pooh slept all night",
"working class hero is something to be",
"I am the working class walrus",
"walrus for president"
]), file=fh)
print(
"\n".join(
[
"the colorless ideas slept furiously",
"pooh slept all night",
"working class hero is something to be",
"I am the working class walrus",
"walrus for president",
]
),
file=fh,
)

tgt_file = Path(src_file.parent, "train.tgt")
with open(tgt_file, "w") as fh:
print("\n".join([
"les idées incolores dormaient furieusement",
"le caniche dormait toute la nuit",
"le héros de la classe ouvrière est quelque chose à être",
"Je suis le morse de la classe ouvrière",
"morse pour président"
]), file=fh)
print(
"\n".join(
[
"les idées incolores dormaient furieusement",
"le caniche dormait toute la nuit",
"le héros de la classe ouvrière est quelque chose à être",
"Je suis le morse de la classe ouvrière",
"morse pour président",
]
),
file=fh,
)
return (src_file, tgt_file)


### Change the following when dataclasses are properly implemented


@pytest.fixture(scope="session")
def config_minimal(
tmp_path_factory,
data_train_minimal,
pipeline_dir,
languages
):
config_file = Path(
tmp_path_factory.mktemp("test_configs"), "config_minimal.yml"
)
def config_minimal(tmp_path_factory, data_train_minimal, pipeline_dir, languages):
config_file = Path(tmp_path_factory.mktemp("test_configs"), "config_minimal.yml")
src_file, tgt_file = data_train_minimal

step_label = "raw." + "-".join(languages)
config = {
"pipeline": {
"pipeline_dir": str(pipeline_dir),
"steps" : [
"steps": [
{
"step": "raw",
"step_label": step_label,
"src_lang": languages[0],
"tgt_lang": languages[1],
"raw_data_dir": str(data_train_minimal[0].parent)
"raw_data_dir": str(data_train_minimal[0].parent),
}
],
"default_targets": [step_label]
"default_targets": [step_label],
}
}
yaml.dump(config, open(config_file, "w"))
return config_file


###


@pytest.fixture(scope="module")
def pipeline_minimal(config_minimal, pipeline_dir):
return OpusPocusPipeline(config_minimal, pipeline_dir)
Expand Down
32 changes: 12 additions & 20 deletions tests/test_opuspocus_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,46 +2,39 @@
from argparse import Namespace
from pathlib import Path

from opuspocus.pipelines import build_pipeline, load_pipeline, OpusPocusPipeline
from opuspocus.pipelines import build_pipeline, load_pipeline


def test_build_pipeline_method(config_minimal, pipeline_minimal, pipeline_dir):
args = Namespace(**{
"pipeline_config": config_minimal,
"pipline_dir": pipeline_dir
})
args = Namespace(**{"pipeline_config": config_minimal, "pipline_dir": pipeline_dir})
pipeline = build_pipeline(args)
assert pipeline == pipeline_minimal


def test_load_pipeline_method(pipeline_minimal_initialized):
args = Namespace(**{
"pipeline_dir": pipeline_dir
})
args = Namespace(**{"pipeline_dir": pipeline_minimal_initialized.pipeline_dir})
pipeline = load_pipeline(args)
assert pipeline == saved_pipeline
assert pipeline == pipeline_minimal_initialized


def test_load_pipeline_dir_not_exist():
args = Namespace(**{
"pipeline_dir": Path("nonexistent", "directory")
})
args = Namespace(**{"pipeline_dir": Path("nonexistent", "directory")})
with pytest.raises(FileNotFoundError):
load_pipeline(args)


def test_load_pipeline_dir_not_directory(tmp_path):
args = Namespace(**{
"pipeline_dir": tmp_path,
})
args = Namespace(
**{
"pipeline_dir": tmp_path,
}
)
with pytest.raises(NotADirectoryError):
load_pipeline(args)


def test_pipeline_class_init_graph(config_minimal, pipeline_minimal):
config_labels = [
s["step_label"]
for s in config_minimal["pipeline"]["steps"]
]
config_labels = [s["step_label"] for s in config_minimal["pipeline"]["steps"]]
assert len(config_labels) == len(pipeline_minimal.steps)
for s in pipeline_minimal.steps:
assert s.step_label in config_labels
Expand All @@ -61,5 +54,4 @@ def test_pipeline_class_init_default_targets(config_minimal, pipeline_minimal):
# - cycles



# TODO: test status/traceback?
20 changes: 8 additions & 12 deletions tests/test_pipeline_steps.py
Original file line number Diff line number Diff line change
@@ -1,39 +1,35 @@
import pytest
from argparse import Namespace
from opuspocus.pipeline_steps import (
OpusPocusStep,
build_step,
load_step,
STEP_REGISTRY,
)
from opuspocus.pipeline_steps.raw import RawStep


@pytest.fixture(scope="function", params=STEP_REGISTRY.keys())
def step_default(step, tmp_path_factory):
return build_step(
step.param,
"{}.test".format(step.param),
tmp_path_factory.mktemp("empty_pipeline")
tmp_path_factory.mktemp("empty_pipeline"),
)


#def test_parameter_save_load(step_default):


#def test_list_parameters():
# def test_parameter_save_load(step_default):


# def test_list_parameters():


@pytest.fixture(scope="session")
def mock_step_parameters():
def mock_step_parameters(languages, data_train_minimal):
src_lang, tgt_lang = languages
return {
"step": "raw",
"step_label": "raw.en",
"src_lang": src_lang,
"tgt_lang": tgt_lang,
"raw_data_dir": str(data_train_minimal[0].parent)
"raw_data_dir": str(data_train_minimal[0].parent),
}


Expand All @@ -48,8 +44,8 @@ def mock_step_inited(step):
return step


#def test_build_method():
# def test_build_method():
# build_step(**


#def test_load_method():
# def test_load_method():
Loading

0 comments on commit 59bd0ce

Please sign in to comment.