Skip to content

Commit

Permalink
Add warning level to DataValidator (#431)
Browse files Browse the repository at this point in the history
* Add warning level to DataValidator

* Improve warning log message

* Commit suggestions and update testing

* Quickfix DataValidationCriteriaBounds
  • Loading branch information
dc-almeida authored Dec 13, 2024
1 parent 9cafe81 commit c16ee58
Show file tree
Hide file tree
Showing 3 changed files with 97 additions and 26 deletions.
62 changes: 45 additions & 17 deletions nomenclature/processor/data_validator.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import logging
import textwrap
from enum import Enum
from pathlib import Path

import yaml
Expand All @@ -16,7 +17,18 @@
logger = logging.getLogger(__name__)


class DataValidationCriteriaValue(IamcDataFilter):
class WarningEnum(str, Enum):
    """Severity levels that can be attached to a data-validation criterion.

    Members also subclass ``str``, so each one compares equal to its plain
    string value (for example ``WarningEnum.low == "low"``).
    """

    high = "high"
    medium = "medium"
    low = "low"
    error = "error"


class DataValidationCriteria(IamcDataFilter):
    """Base class for validation criteria: a data filter plus a severity level."""

    # Severity reported when this criterion fails; criteria that do not set
    # it explicitly fall back to ``WarningEnum.error``.
    warning_level: WarningEnum = WarningEnum.error


class DataValidationCriteriaValue(DataValidationCriteria):
value: float
rtol: float = 0.0
atol: float = 0.0
Expand All @@ -38,19 +50,19 @@ def validation_args(self):
return self.model_dump(
exclude_none=True,
exclude_unset=True,
exclude=["value", "rtol", "atol"],
exclude=["warning_level", "value", "rtol", "atol"],
)

@property
def criteria(self):
return self.model_dump(
exclude_none=True,
exclude_unset=True,
exclude=["lower_bound", "upper_bound"],
exclude=["warning_level", "lower_bound", "upper_bound"],
)


class DataValidationCriteriaBounds(IamcDataFilter):
class DataValidationCriteriaBounds(DataValidationCriteria):
upper_bound: float | None = None
lower_bound: float | None = None

Expand All @@ -64,6 +76,14 @@ def check_validation_criteria_exist(self):
def validation_args(self):
return self.criteria

@property
def criteria(self):
    """Return the criterion's fields as a dict, leaving out ``warning_level``.

    Unset fields and fields whose value is ``None`` are omitted as well.
    """
    # The severity level is not part of the criterion itself, so it is
    # kept out of the dumped mapping.
    hidden_fields = ["warning_level"]
    return self.model_dump(
        exclude=hidden_fields,
        exclude_unset=True,
        exclude_none=True,
    )


class DataValidator(Processor):
"""Processor for validating IAMC datapoints"""
Expand All @@ -89,31 +109,39 @@ def from_file(cls, file: Path | str) -> "DataValidator":
return cls(file=file, criteria_items=content)

def apply(self, df: IamDataFrame) -> IamDataFrame:
error_list = []
fail_list = []
error = False

with adjust_log_level():
for item in self.criteria_items:
failed_validation = df.validate(**item.validation_args)
if failed_validation is not None:
error_list.append(
" Criteria: "
+ ", ".join(
[f"{key}: {value}" for key, value in item.criteria.items()]
)
criteria_msg = " Criteria: " + ", ".join(
[f"{key}: {value}" for key, value in item.criteria.items()]
)
error_list.append(
failed_validation["warning_level"] = item.warning_level.value
if item.warning_level == WarningEnum.error:
error = True
fail_list.append(criteria_msg)
fail_list.append(
textwrap.indent(str(failed_validation), prefix=" ") + "\n"
)

if error_list:
logger.error(
"Failed data validation (file %s):\n%s",
get_relative_path(self.file),
"\n".join(error_list),
fail_msg = "(file %s):\n" % get_relative_path(self.file)
if error:
fail_msg = (
"Data validation with error(s)/warning(s) "
+ fail_msg
+ "\n".join(fail_list)
)
logger.error(fail_msg)
raise ValueError(
"Data validation failed. Please check the log for details."
)
if fail_list:
fail_msg = (
"Data validation with warning(s) " + fail_msg + "\n".join(fail_list)
)
logger.warning(fail_msg)
return df

def validate_with_definition(self, dsd: DataStructureDefinition) -> None:
Expand Down
13 changes: 13 additions & 0 deletions tests/data/validation/validate_data/validate_warning.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Criterion 1: carries an explicit warning_level ("low").
- variable: Primary Energy
year: 2010
upper_bound: 2.5
lower_bound: 1
warning_level: low
# Criterion 2: same variable and year with wider bounds; no warning_level is
# given, so the default level ("error") applies.
- variable: Primary Energy
year: 2010
upper_bound: 5
lower_bound: 1
# Criterion 3: no warning_level given either. NOTE(review): the accompanying
# test expects this criterion not to appear in the log, presumably because the
# test data satisfies it - confirm against the fixture.
- variable: Primary Energy|Coal
year: 2010
upper_bound: 5
lower_bound: 1
48 changes: 39 additions & 9 deletions tests/test_validate_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,23 +102,53 @@ def test_DataValidator_apply_fails(simple_df, file, item_1, item_2, item_3, capl
data_file = DATA_VALIDATION_TEST_DIR / f"validate_data_fails_{file}.yaml"
data_validator = DataValidator.from_file(data_file)

failed_validation_message = f"""Failed data validation (file {data_file.relative_to(Path.cwd())}):
failed_validation_message = (
"Data validation with error(s)/warning(s) "
f"""(file {data_file.relative_to(Path.cwd())}):
Criteria: variable: ['Primary Energy'], {item_1}
model scenario region variable unit year value
0 model_a scen_a World Primary Energy EJ/yr 2010 6.0
1 model_a scen_b World Primary Energy EJ/yr 2010 7.0
model scenario region variable unit year value warning_level
0 model_a scen_a World Primary Energy EJ/yr 2010 6.0 error
1 model_a scen_b World Primary Energy EJ/yr 2010 7.0 error
Criteria: variable: ['Primary Energy|Coal'], {item_2}
model scenario region variable unit year value
0 model_a scen_a World Primary Energy|Coal EJ/yr 2005 0.5
model scenario region ... year value warning_level
0 model_a scen_a World ... 2005 0.5 error
[1 rows x 8 columns]
Criteria: variable: ['Primary Energy'], year: [2005], {item_3}
model scenario region variable unit year value
0 model_a scen_a World Primary Energy EJ/yr 2005 1.0
1 model_a scen_b World Primary Energy EJ/yr 2005 2.0"""
model scenario region variable unit year value warning_level
0 model_a scen_a World Primary Energy EJ/yr 2005 1.0 error
1 model_a scen_b World Primary Energy EJ/yr 2005 2.0 error"""
)

with pytest.raises(ValueError, match="Data validation failed"):
data_validator.apply(simple_df)

# check if the log message contains the correct information
assert failed_validation_message in caplog.text


# Checks that a criteria file mixing a "low" warning-level criterion with
# default (error-level) criteria still makes apply() raise, and that the log
# lists every failing criterion together with its warning_level column.
def test_DataValidator_validate_with_warning(simple_df, caplog):
data_validator = DataValidator.from_file(
DATA_VALIDATION_TEST_DIR / "validate_warning.yaml"
)
# The second criterion in the file has no explicit warning_level and thus
# defaults to "error", which is what turns the run into a ValueError.
with pytest.raises(ValueError, match="Data validation failed"):
data_validator.apply(simple_df)

# Expected log text: the combined "error(s)/warning(s)" header followed by
# one block per failing criterion (first at level "low", second at "error").
failed_validation_message = (
"Data validation with error(s)/warning(s) "
f"""(file {(DATA_VALIDATION_TEST_DIR / "validate_warning.yaml").relative_to(Path.cwd())}):
Criteria: variable: ['Primary Energy'], year: [2010], upper_bound: 2.5, lower_bound: 1.0
model scenario region variable unit year value warning_level
0 model_a scen_a World Primary Energy EJ/yr 2010 6.0 low
1 model_a scen_b World Primary Energy EJ/yr 2010 7.0 low
Criteria: variable: ['Primary Energy'], year: [2010], upper_bound: 5.0, lower_bound: 1.0
model scenario region variable unit year value warning_level
0 model_a scen_a World Primary Energy EJ/yr 2010 6.0 error
1 model_a scen_b World Primary Energy EJ/yr 2010 7.0 error"""
)

# Only two of the three criteria in the file are expected in the log; the
# "Primary Energy|Coal" criterion is presumably satisfied by simple_df
# (TODO confirm against the fixture).
assert failed_validation_message in caplog.text

0 comments on commit c16ee58

Please sign in to comment.