diff --git a/nomenclature/processor/data_validator.py b/nomenclature/processor/data_validator.py
new file mode 100644
index 00000000..96b79d20
--- /dev/null
+++ b/nomenclature/processor/data_validator.py
@@ -0,0 +1,48 @@
+from pathlib import Path
+from typing import List, Union
+
+import yaml
+
+from nomenclature import DataStructureDefinition
+from nomenclature.error import ErrorCollector
+from nomenclature.processor.iamc import IamcDataFilter
+from nomenclature.processor import Processor
+from nomenclature.processor.utils import get_relative_path
+
+
+class DataValidationCriteria(IamcDataFilter):
+    """Data validation criteria: an IAMC data filter plus optional value bounds"""
+
+    # both bounds are optional; a criteria item may set either, both or neither
+    upper_bound: float | None = None
+    lower_bound: float | None = None
+
+
+class DataValidator(Processor):
+    """Processor for validating IAMC datapoints"""
+
+    criteria_items: List[DataValidationCriteria]
+    file: Path
+
+    @classmethod
+    def from_file(cls, file: Union[Path, str]) -> "DataValidator":
+        """Create a DataValidator from a yaml file holding a list of criteria items"""
+        with open(file, "r", encoding="utf-8") as f:
+            content = yaml.safe_load(f)
+        return cls(file=file, criteria_items=content)
+
+    def apply(self):
+        """Placeholder, applying the criteria to data is not implemented yet"""
+        pass
+
+    def validate_with_definition(self, dsd: DataStructureDefinition) -> None:
+        """Check every criteria item against `dsd`, raise one ValueError per file"""
+        errors = ErrorCollector()
+        for data in self.criteria_items:
+            try:
+                data.validate_with_definition(dsd)
+            except ValueError as value_error:
+                # collect instead of failing fast so all items get reported
+                errors.append(value_error)
+        if errors:
+            raise ValueError(f"In file {get_relative_path(self.file)}:\n{errors}")
diff --git a/nomenclature/processor/iamc.py b/nomenclature/processor/iamc.py
index 083fa26a..8dc3936a 100644
--- a/nomenclature/processor/iamc.py
+++ b/nomenclature/processor/iamc.py
@@ -14,7 +14,7 @@ class IamcDataFilter(BaseModel):
     unit: List[str] | None = None
     year: List[int] | None = None
 
-    @field_validator("*", mode="before")
+    @field_validator(*IAMC_IDX + ["year"], mode="before")
     @classmethod
     def single_input_to_list(cls, v):
         return v if isinstance(v, list) else [v]
@@ -24,12 +24,13 @@ def validate_with_definition(self, dsd: DataStructureDefinition) -> None:
 
         # check for filter-items that are not defined in the codelists
         for dimension in IAMC_IDX:
-            if codelist := getattr(dsd, dimension, None) is None:
+            codelist = getattr(dsd, dimension, None)
+            if codelist is None:
                 continue
             if invalid := codelist.validate_items(getattr(self, dimension) or []):
                 error_msg += (
-                    f"The following {dimension}s were not found in the "
-                    f"DataStructureDefinition:\n{invalid}\n"
+                    f"The following {dimension}s are not defined in the "
+                    f"DataStructureDefinition:\n {', '.join(invalid)}\n"
                 )
 
         if error_msg:
diff --git a/tests/data/validation/definition/region/region.yaml b/tests/data/validation/definition/region/region.yaml
new file mode 100644
index 00000000..b443c8fa
--- /dev/null
+++ b/tests/data/validation/definition/region/region.yaml
@@ -0,0 +1,2 @@
+- Common:
+  - World
diff --git a/tests/data/validation/definition/variable/variable.yaml b/tests/data/validation/definition/variable/variable.yaml
new file mode 100644
index 00000000..59c7edf8
--- /dev/null
+++ b/tests/data/validation/definition/variable/variable.yaml
@@ -0,0 +1,8 @@
+- Final Energy:
+    unit: EJ/yr
+- Primary Energy:
+    unit: EJ/yr
+- Emissions|CO2:
+    unit: Mt CO2/yr
+- Emissions|CH4:
+    unit: Mt CH4/yr
diff --git a/tests/data/validation/validate_data/fail_unknown_region.yaml b/tests/data/validation/validate_data/fail_unknown_region.yaml
new file mode 100644
index 00000000..8987e6ae
--- /dev/null
+++ b/tests/data/validation/validate_data/fail_unknown_region.yaml
@@ -0,0 +1,3 @@
+  - variable: Final Energy
+    region: Asia
+
diff --git a/tests/data/validation/validate_data/simple_validation.yaml b/tests/data/validation/validate_data/simple_validation.yaml
new file mode 100644
index 00000000..2694b20f
--- /dev/null
+++ b/tests/data/validation/validate_data/simple_validation.yaml
@@ -0,0 +1,6 @@
+  - region: World
+    variable: Final Energy
+    year: 2010
+    upper_bound: 2.5
+    lower_bound: 1
+
diff --git a/tests/data/validation/validate_data/validation_unknown_region.yaml b/tests/data/validation/validate_data/validation_unknown_region.yaml
new file mode 100644
index 00000000..04558339
--- /dev/null
+++ b/tests/data/validation/validate_data/validation_unknown_region.yaml
@@ -0,0 +1,5 @@
+  - region: Asia
+    variable: Final Energy
+    year: 2010
+    upper_bound: 2.5
+    lower_bound: 1
diff --git a/tests/data/validation/validate_data/validation_unknown_variable.yaml b/tests/data/validation/validate_data/validation_unknown_variable.yaml
new file mode 100644
index 00000000..65e6f9c6
--- /dev/null
+++ b/tests/data/validation/validate_data/validation_unknown_variable.yaml
@@ -0,0 +1,5 @@
+  - region: World
+    variable: Final Energy|Industry
+    year: 2010
+    upper_bound: 2.5
+    lower_bound: 1
diff --git a/tests/test_data_validation.py b/tests/test_data_validation.py
new file mode 100644
index 00000000..8a6f8614
--- /dev/null
+++ b/tests/test_data_validation.py
@@ -0,0 +1,59 @@
+import pytest
+from conftest import TEST_DATA_DIR
+
+from nomenclature import DataStructureDefinition
+from nomenclature.processor.data_validator import DataValidator
+
+DATA_VALIDATION_TEST_DIR = TEST_DATA_DIR / "validation" / "validate_data"
+
+
+def test_DataValidator_from_file():
+    exp = DataValidator(
+        **{
+            "criteria_items": [
+                {
+                    "region": ["World"],
+                    "variable": "Final Energy",
+                    "year": [2010],
+                    "upper_bound": 2.5,
+                    "lower_bound": 1.0,  # test that integer in yaml is cast to float
+                }
+            ],
+            "file": DATA_VALIDATION_TEST_DIR / "simple_validation.yaml",
+        }
+    )
+    obs = DataValidator.from_file(DATA_VALIDATION_TEST_DIR / "simple_validation.yaml")
+    assert obs == exp
+
+    dsd = DataStructureDefinition(TEST_DATA_DIR / "validation" / "definition")
+    assert obs.validate_with_definition(dsd) is None
+
+
+@pytest.mark.parametrize(
+    "dimension, match",
+    [
+        ("region", r"regions.*not defined.*\n.*Asia"),
+        ("variable", r"variables.*not defined.*\n.*Final Energy\|Industry"),
+    ],
+)
+def test_DataValidator_validate_with_definition_raises(dimension, match):
+    # Testing two different failure cases
+    # 1. Undefined region
+    # 2. Undefined variable
+    # TODO Undefined unit
+
+    data_validator = DataValidator.from_file(
+        DATA_VALIDATION_TEST_DIR / f"validation_unknown_{dimension}.yaml"
+    )
+
+    # validating against a DataStructure with all dimensions raises
+    dsd = DataStructureDefinition(TEST_DATA_DIR / "validation" / "definition")
+    with pytest.raises(ValueError, match=match):
+        data_validator.validate_with_definition(dsd)
+
+    # validating against a DataStructure without the offending dimension passes
+    dsd = DataStructureDefinition(
+        TEST_DATA_DIR / "validation" / "definition",
+        dimensions=[dim for dim in ["region", "variable"] if dim != dimension],
+    )
+    assert data_validator.validate_with_definition(dsd) is None