Skip to content

Commit

Permalink
Add parsing of data-validation yaml files (#366)
Browse files Browse the repository at this point in the history
Co-authored-by: Philip Hackstock <[email protected]>
  • Loading branch information
danielhuppmann and phackstock authored Aug 12, 2024
1 parent ff55b90 commit 9867535
Show file tree
Hide file tree
Showing 9 changed files with 138 additions and 6 deletions.
43 changes: 43 additions & 0 deletions nomenclature/processor/data_validator.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
from pathlib import Path
from typing import List, Union

import yaml

from nomenclature import DataStructureDefinition
from nomenclature.error import ErrorCollector
from nomenclature.processor.iamc import IamcDataFilter
from nomenclature.processor import Processor
from nomenclature.processor.utils import get_relative_path


class DataValidationCriteria(IamcDataFilter):
    """Data validation criteria

    Extends the IAMC data filter with optional numeric bounds. Both bounds
    default to ``None`` (not set); numeric input is coerced to ``float``.
    """

    # The annotations include ``None`` so that they match the default value and
    # so that an explicit ``null`` in a yaml file is accepted like an omitted key.
    upper_bound: float | None = None
    lower_bound: float | None = None


class DataValidator(Processor):
    """Processor for validating IAMC datapoints

    Attributes
    ----------
    criteria_items : list of DataValidationCriteria
        The validation criteria, one item per entry of the source file.
    file : Path
        The file from which the criteria were read; used in error messages.
    """

    criteria_items: List[DataValidationCriteria]
    file: Path

    @classmethod
    def from_file(cls, file: Union[Path, str]) -> "DataValidator":
        """Create a DataValidator from a yaml file

        Parameters
        ----------
        file : Path or str
            Path to a yaml file whose top-level content is passed as
            ``criteria_items``.

        Returns
        -------
        DataValidator
        """
        # Read with an explicit encoding instead of the platform default, so
        # that non-ASCII names are parsed identically on every operating system.
        with open(file, "r", encoding="utf-8") as f:
            content = yaml.safe_load(f)
        return cls(file=file, criteria_items=content)

    def apply(self):
        """Placeholder; currently does nothing."""
        pass

    def validate_with_definition(self, dsd: DataStructureDefinition) -> None:
        """Validate all criteria items against a DataStructureDefinition

        Every item is checked, and the errors of all failing items are
        collected before raising, so that one message reports all problems.

        Raises
        ------
        ValueError
            If at least one criteria item raised a ValueError during validation.
            The message is prefixed with the path of the source file.
        """
        errors = ErrorCollector()
        for criteria in self.criteria_items:
            try:
                criteria.validate_with_definition(dsd)
            except ValueError as value_error:
                errors.append(value_error)
        if errors:
            raise ValueError(f"In file {get_relative_path(self.file)}:\n{errors}")
13 changes: 7 additions & 6 deletions nomenclature/processor/iamc.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from typing import List
from typing import List, Tuple, Any
from pydantic import BaseModel, field_validator

from pyam import IAMC_IDX
Expand All @@ -14,7 +14,7 @@ class IamcDataFilter(BaseModel):
unit: List[str] | None = None
year: List[int] | None = None

@field_validator("*", mode="before")
@field_validator(*IAMC_IDX + ["year"], mode="before")
@classmethod
def single_input_to_list(cls, v):
return v if isinstance(v, list) else [v]
Expand All @@ -24,12 +24,13 @@ def validate_with_definition(self, dsd: DataStructureDefinition) -> None:

# check for filter-items that are not defined in the codelists
for dimension in IAMC_IDX:
if codelist := getattr(dsd, dimension, None) is None:
codelist = getattr(dsd, dimension, None)
if codelist is None:
continue
if invalid := codelist.validate_items(getattr(self, dimension) or []):
if invalid := codelist.validate_items(getattr(self, dimension, [])):
error_msg += (
f"The following {dimension}s were not found in the "
f"DataStructureDefinition:\n{invalid}\n"
f"The following {dimension}s are not defined in the "
f"DataStructureDefinition:\n {', '.join(invalid)}\n"
)

if error_msg:
Expand Down
2 changes: 2 additions & 0 deletions tests/data/validation/definition/region/region.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
- Common:
- World
8 changes: 8 additions & 0 deletions tests/data/validation/definition/variable/variable.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
- Final Energy:
unit: EJ/yr
- Primary Energy:
unit: EJ/yr
- Emissions|CO2:
unit: Mt CO2/yr
- Emissions|CH4:
unit: Mt CH4/yr
3 changes: 3 additions & 0 deletions tests/data/validation/validate_data/fail_unknown_region.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
- variable: Final Energy
region: Asia

6 changes: 6 additions & 0 deletions tests/data/validation/validate_data/simple_validation.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
- region: World
variable: Final Energy
year: 2010
upper_bound: 2.5
lower_bound: 1

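5 changes: 5 additions & 0 deletions tests/data/validation/validate_data/validation_unknown_region.yaml
Original file line number Diff line number Diff line change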
@@ -0,0 +1,5 @@
- region: Asia
variable: Final Energy
year: 2010
upper_bound: 2.5
lower_bound: 1
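5 changes: 5 additions & 0 deletions tests/data/validation/validate_data/validation_unknown_variable.yaml
Original file line number Diff line number Diff line change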
@@ -0,0 +1,5 @@
- region: World
variable: Final Energy|Industry
year: 2010
upper_bound: 2.5
lower_bound: 1
59 changes: 59 additions & 0 deletions tests/test_data_validation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
import pytest
from conftest import TEST_DATA_DIR

from nomenclature import DataStructureDefinition
from nomenclature.processor.data_validator import DataValidator

# Directory with the yaml files that the tests below load via DataValidator.from_file
DATA_VALIDATION_TEST_DIR = TEST_DATA_DIR / "validation" / "validate_data"


def test_DataValidator_from_file():
    """Parsing the simple yaml file yields the expected, valid DataValidator"""
    yaml_file = DATA_VALIDATION_TEST_DIR / "simple_validation.yaml"
    criteria = {
        "region": ["World"],
        "variable": "Final Energy",
        "year": [2010],
        "upper_bound": 2.5,
        "lower_bound": 1.0,  # test that integer in yaml is cast to float
    }
    exp = DataValidator(criteria_items=[criteria], file=yaml_file)

    obs = DataValidator.from_file(yaml_file)
    assert obs == exp

    # the parsed criteria only use items defined in the test definition
    dsd = DataStructureDefinition(TEST_DATA_DIR / "validation" / "definition")
    assert obs.validate_with_definition(dsd) is None


@pytest.mark.parametrize(
    "dimension, match",
    [
        ("region", r"regions.*not defined.*\n.*Asia"),
        ("variable", r"variables.*not defined.*\n.*Final Energy\|Industry"),
    ],
)
def test_DataValidator_validate_with_definition_raises(dimension, match):
    """Criteria using undefined items fail validation for that dimension only

    Two failure cases are covered:
    1. Undefined region
    2. Undefined variable
    """
    # TODO Undefined unit

    validator = DataValidator.from_file(
        DATA_VALIDATION_TEST_DIR / f"validation_unknown_{dimension}.yaml"
    )
    definition_dir = TEST_DATA_DIR / "validation" / "definition"

    # a DataStructureDefinition with all dimensions detects the unknown item
    full_dsd = DataStructureDefinition(definition_dir)
    with pytest.raises(ValueError, match=match):
        validator.validate_with_definition(full_dsd)

    # without the offending dimension there is nothing to validate against
    other_dimensions = [dim for dim in ["region", "variable"] if dim != dimension]
    reduced_dsd = DataStructureDefinition(definition_dir, dimensions=other_dimensions)
    assert validator.validate_with_definition(reduced_dsd) is None

0 comments on commit 9867535

Please sign in to comment.