Skip to content

Commit

Permalink
Add check for validate_data consistency to validate-project CLI (#…
Browse files Browse the repository at this point in the history
  • Loading branch information
danielhuppmann authored Aug 16, 2024
1 parent 9867535 commit 6efc8d9
Show file tree
Hide file tree
Showing 12 changed files with 97 additions and 47 deletions.
29 changes: 20 additions & 9 deletions nomenclature/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,12 @@ def cli_valid_yaml(path: Path):
type=str,
default=None,
)
@click.option(
"--validate_data",
help="Optional name for validation folder",
type=str,
default=None,
)
@click.option(
"--dimension",
"dimensions",
Expand All @@ -51,6 +57,7 @@ def cli_valid_project(
definitions: str,
mappings: Optional[str],
required_data: Optional[str],
validate_data: Optional[str],
dimensions: Optional[List[str]],
):
"""Assert that `path` is a valid project nomenclature
Expand All @@ -60,11 +67,13 @@ def cli_valid_project(
path : Path
Project directory to be validated
definitions : str, optional
Name of the definitions folder, defaults to "definitions"
Name of 'definitions' folder, defaults to "definitions"
mappings : str, optional
Name of the mappings folder, defaults to "mappings" (if this folder exists)
Name of 'mappings' folder, defaults to "mappings"
required_data: str, optional
Name of the required data folder, default to "required_data" (if folder exists)
Name of folder for 'required data' criteria, defaults to "required_data"
validate_data: str, optional
Name of folder for data validation criteria, defaults to "validate_data"
dimensions : List[str], optional
Dimensions to be checked, defaults to all sub-folders of `definitions`
Expand All @@ -76,23 +85,25 @@ def cli_valid_project(
--dimension <folder2>
--dimension <folder3>
Note
----
This test includes four steps:
1. Test that all yaml files in `definitions/` and `mappings/` can be correctly read
1. Test that all yaml files in `definitions` and `mappings` can be correctly parsed
as yaml files. This is a formal check for yaml syntax only.
2. Test that all files in `definitions/` can be correctly parsed as a
2. Test that all files in `definitions` can be correctly parsed as a
:class:`DataStructureDefinition` object comprised of individual codelists.
3. Test that all model mappings in `mappings/` can be correctly parsed as a
3. Test that all model mappings in `mappings` can be correctly parsed as a
:class:`RegionProcessor` object. This includes a check that all regions mentioned
in a model mapping are defined in the region codelist.
4. Test that all required-data and data-validation files can be parsed correctly
and are consistent with the `definitions`.
"""
assert_valid_yaml(path)
assert_valid_structure(path, definitions, mappings, required_data, dimensions)
assert_valid_structure(
path, definitions, mappings, required_data, validate_data, dimensions
)


@cli.command("check-region-aggregation")
Expand Down
15 changes: 10 additions & 5 deletions nomenclature/error.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import textwrap
from collections import namedtuple
from typing import Optional

pydantic_custom_error_config = {
"RegionNameCollisionError": (
Expand Down Expand Up @@ -47,22 +48,26 @@

class ErrorCollector:
errors: list[Exception]
description: Optional[str] = None

def __init__(self) -> None:
def __init__(self, description: str = None) -> None:
self.errors = []
self.description = description

def append(self, error: Exception) -> None:
self.errors.append(error)

def __repr__(self) -> str:
error = "error" if len(self.errors) == 1 else "errors"
error_list_str = "\n".join(
f"{i+1}. {error}" for i, error in enumerate(self.errors)
f"{i + 1}. {error}" for i, error in enumerate(self.errors)
)

return f"Collected {len(self.errors)} {error}:\n" + textwrap.indent(
error_list_str, prefix=" "
)
message = f"Collected {len(self.errors)} {error}"
if self.description is not None:
message += f" when checking {self.description}"

return f"{message}:\n" + textwrap.indent(error_list_str, prefix=" ")

def __bool__(self) -> bool:
return bool(self.errors)
1 change: 1 addition & 0 deletions nomenclature/processor/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,4 @@
RegionProcessor,
)
from nomenclature.processor.required_data import RequiredDataValidator # noqa
from nomenclature.processor.data_validator import DataValidator # noqa
6 changes: 3 additions & 3 deletions nomenclature/processor/data_validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

import yaml

from nomenclature import DataStructureDefinition
from nomenclature.definition import DataStructureDefinition
from nomenclature.error import ErrorCollector
from nomenclature.processor.iamc import IamcDataFilter
from nomenclature.processor import Processor
Expand Down Expand Up @@ -33,11 +33,11 @@ def apply(self):
pass

def validate_with_definition(self, dsd: DataStructureDefinition) -> None:
errors = ErrorCollector()
errors = ErrorCollector(description=f"in file '{self.file}'")
for data in self.criteria_items:
try:
data.validate_with_definition(dsd)
except ValueError as value_error:
errors.append(value_error)
if errors:
raise ValueError(f"In file {get_relative_path(self.file)}:\n{errors}")
raise ValueError(errors)
64 changes: 43 additions & 21 deletions nomenclature/testing.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,12 @@
import yaml

from nomenclature.definition import DataStructureDefinition
from nomenclature.processor import RegionProcessor, RequiredDataValidator
from nomenclature.processor import (
DataValidator,
RegionProcessor,
RequiredDataValidator,
Processor,
)
from nomenclature.error import ErrorCollector

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -63,35 +68,38 @@ def _check_mappings(
raise FileNotFoundError(f"Mappings directory not found: {path / mappings}")


def _collect_RequiredData_errors(
required_data_dir: Path, dsd: DataStructureDefinition
def _collect_processor_errors(
path: Path,
processor: type[RequiredDataValidator] | type[DataValidator],
dsd: DataStructureDefinition,
) -> None:
errors = ErrorCollector()
for file in required_data_dir.iterdir():
errors = ErrorCollector(description=f"files for '{processor.__name__}' ({path})")
for file in path.iterdir():
try:
RequiredDataValidator.from_file(file).validate_with_definition(dsd)
processor.from_file(file).validate_with_definition(dsd)
except ValueError as error:
errors.append(error)
if errors:
raise ValueError(f"Found error(s) in required data files:\n{errors}")
raise ValueError(errors)


def _check_RequiredData(
def _check_processor_directory(
path: Path,
definitions: str = "definitions",
processor: Processor,
processor_arg: str,
folder: Optional[str] = None,
definitions: Optional[str] = "definitions",
dimensions: Optional[List[str]] = None,
required_data: Optional[str] = None,
) -> None:
dsd = DataStructureDefinition(path / definitions, dimensions)
if required_data is None:
if (path / "requiredData").is_dir():
_collect_RequiredData_errors(path / "required_data", dsd)

elif (path / required_data).is_dir():
_collect_RequiredData_errors(path / required_data, dsd)
if folder is None:
if (path / processor_arg).is_dir():
_collect_processor_errors(path / processor_arg, processor, dsd)
elif (path / folder).is_dir():
_collect_processor_errors(path / folder, processor, dsd)
else:
raise FileNotFoundError(
f"Directory for required data not found at: {path / required_data}"
f"Directory for '{processor_arg}' not found at: {path / folder}"
)


Expand All @@ -100,6 +108,7 @@ def assert_valid_structure(
definitions: str = "definitions",
mappings: Optional[str] = None,
required_data: Optional[str] = None,
validate_data: Optional[str] = None,
dimensions: Optional[List[str]] = None,
) -> None:
"""Assert that `path` can be initialized as a :class:`DataStructureDefinition`
Expand All @@ -113,16 +122,19 @@ def assert_valid_structure(
mappings : str, optional
Name of the mappings folder, defaults to "mappings" (if this folder exists)
required_data : str, optional
Name of the required data folder, defaults to "required_data" (if this folder
exists)
Name of the folder for required data, defaults to "required_data"
(if this folder exists)
validate_data : str, optional
Name of the folder for data validation criteria, defaults to "validate_data"
(if this folder exists)
dimensions : List[str], optional
Dimensions to be checked, defaults to all sub-folders of `definitions`
Notes
-----
Folder structure of `path`:
- A `definitions` folder is required and must be a valid
:class:`DataStructureDefinition`
:class:`DataStructureDefinition`
- The `definitions` folder must contain sub-folder(s) to validate
- If a `mappings` folder exists, it must be a valid :class:`RegionProcessor`
Expand All @@ -139,7 +151,17 @@ def assert_valid_structure(
f"`definitions` directory is empty: {path / definitions}"
)
_check_mappings(path, definitions, dimensions, mappings)
_check_RequiredData(path, definitions, dimensions, required_data)
_check_processor_directory(
path,
RequiredDataValidator,
"required_data",
required_data,
definitions,
dimensions,
)
_check_processor_directory(
path, DataValidator, "validate_data", validate_data, definitions, dimensions
)


# Todo: add function which runs `DataStructureDefinition(path).validate(scenario)`
3 changes: 0 additions & 3 deletions tests/data/validation/validate_data/fail_unknown_region.yaml

This file was deleted.

18 changes: 16 additions & 2 deletions tests/test_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@
from nomenclature import cli
from nomenclature.testing import assert_valid_structure, assert_valid_yaml
from nomenclature.codelist import VariableCodeList
from nomenclature.cli import cli_run_workflow

from conftest import TEST_DATA_DIR

Expand Down Expand Up @@ -256,6 +255,22 @@ def test_cli_missing_mappings_fails():
assert "Mappings directory not found" in str(cli_result.exception)


def test_cli_validate_data_fails():
"""Assert that validating invalid yaml fails"""
cli_result = runner.invoke(
cli,
[
"validate-project",
str(TEST_DATA_DIR / "validation"),
],
)

assert cli_result.exit_code == 1
assert "Collected 2 errors" in str(cli_result.exception)
assert "Asia" in str(cli_result.exception)
assert "Final Energy|Industry" in str(cli_result.exception)


def test_cli_empty_definitions_dir():
"""Assert that an error is raised when the `definitions` directory is empty"""

Expand Down Expand Up @@ -365,7 +380,6 @@ def test_cli_add_missing_variables(simple_definition, tmp_path):


def test_cli_run_workflow(tmp_path, simple_df):

simple_df.to_excel(tmp_path / "input.xlsx")

runner.invoke(
Expand Down
8 changes: 4 additions & 4 deletions tests/test_data_validation.py → tests/test_validate_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ def test_DataValidator_from_file():
obs = DataValidator.from_file(DATA_VALIDATION_TEST_DIR / "simple_validation.yaml")
assert obs == exp

dsd = DataStructureDefinition(TEST_DATA_DIR / "validation" / "definition")
dsd = DataStructureDefinition(TEST_DATA_DIR / "validation" / "definitions")
assert obs.validate_with_definition(dsd) is None


Expand All @@ -43,17 +43,17 @@ def test_DataValidator_validate_with_definition_raises(dimension, match):
# TODO Undefined unit

data_validator = DataValidator.from_file(
DATA_VALIDATION_TEST_DIR / f"validation_unknown_{dimension}.yaml"
DATA_VALIDATION_TEST_DIR / f"validate_unknown_{dimension}.yaml"
)

# validating against a DataStructure with all dimensions raises
dsd = DataStructureDefinition(TEST_DATA_DIR / "validation" / "definition")
dsd = DataStructureDefinition(TEST_DATA_DIR / "validation" / "definitions")
with pytest.raises(ValueError, match=match):
data_validator.validate_with_definition(dsd)

# validating against a DataStructure without the offending dimension passes
dsd = DataStructureDefinition(
TEST_DATA_DIR / "validation" / "definition",
TEST_DATA_DIR / "validation" / "definitions",
dimensions=[dim for dim in ["region", "variable"] if dim != dimension],
)
assert data_validator.validate_with_definition(dsd) is None

0 comments on commit 6efc8d9

Please sign in to comment.