From 9894e15d5e98ded56cf2b90d6d6e24e59b790824 Mon Sep 17 00:00:00 2001
From: haritha-ravi
Date: Tue, 5 Sep 2023 09:42:03 +0200
Subject: [PATCH] Add threshold for file validations

Add an optional `file_validation_failure_threshold` argument to `Vlad`.
When it is set, the data rows of the source are counted up front and
`validate()` returns `False` as soon as a single validator's failure
ratio (fail_count / total_lines) exceeds the threshold.

The threshold is compared against `None` rather than tested for
truthiness, so that `0.0` (stop at the first failure) is honoured
instead of silently disabling the check.

---
 README.rst          |  5 +++++
 tests/test_vlads.py | 19 +++++++++++++++++++
 vladiate/vlad.py    | 25 +++++++++++++++++++++++++
 3 files changed, 49 insertions(+)

diff --git a/README.rst b/README.rst
index ae7af26..55626d9 100644
--- a/README.rst
+++ b/README.rst
@@ -354,6 +354,11 @@ Running Vlads Programatically
     Whether to disable log output generated by validations.
     Optional, defaults to `False`.
 
+  :``file_validation_failure_threshold=None``:
+    Stops validating the file after this failure threshold is reached.
+    Input a value between `0.0` and `1.0`. `1.0` (100%) validates the entire file.
+    Optional, defaults to `None` (no threshold; the entire file is validated).
+
 For example:
 
 .. code:: python
diff --git a/tests/test_vlads.py b/tests/test_vlads.py
index 1c41056..0d0c881 100644
--- a/tests/test_vlads.py
+++ b/tests/test_vlads.py
@@ -155,3 +155,22 @@ class TestVlad(Vlad):
     assert vlad.validators["Column A"][0].bad
     assert vlad.validators["Column B"][0].fail_count == 0
     assert not vlad.validators["Column B"][0].bad
+
+
+def test_stop_file_validation_at_invalid_threshold():
+    source = LocalFile("vladiate/examples/real_vampires.csv")
+
+    class TestVlad(Vlad):
+        validators = {
+            "Column A": [EmptyValidator()],
+            "Column B": [EmptyValidator()],
+            "Column C": [UniqueValidator()],
+        }
+
+    vlad = TestVlad(source=source, file_validation_failure_threshold=0.1)
+
+    assert not vlad.validate()
+    assert vlad.validators["Column A"][0].fail_count == 1
+    assert vlad.validators["Column B"][0].fail_count == 0
+    assert vlad.validators["Column C"][0].fail_count == 0
+    assert vlad.invalid_lines == {1}
diff --git a/vladiate/vlad.py b/vladiate/vlad.py
index 367e40c..c6f8a06 100644
--- a/vladiate/vlad.py
+++ b/vladiate/vlad.py
@@ -14,6 +14,7 @@ def __init__(
         default_validator=EmptyValidator,
         delimiter=None,
         ignore_missing_validators=False,
+        file_validation_failure_threshold=None,
         quiet=False,
         row_validators=[],
     ):
@@ -30,6 +31,8 @@ def __init__(
         self.ignore_missing_validators = ignore_missing_validators
         self.logger.disabled = quiet
         self.invalid_lines = set()
+        self.file_validation_failure_threshold = file_validation_failure_threshold
+        self.total_lines = 0
 
         self.validators.update(
             {
@@ -120,6 +123,11 @@ def _log_missing(self, missing_items):
             )
         )
 
+    def _get_total_lines(self):
+        reader = csv.DictReader(self.source.open(), delimiter=self.delimiter)
+        self.total_lines = sum(1 for _ in reader)
+        return self.total_lines
+
     def validate(self):
         self.logger.info(
             "\nValidating {}(source={})".format(self.__class__.__name__, self.source)
@@ -146,6 +154,9 @@ def validate(self):
             self._log_missing_fields()
             return False
 
+        if self.file_validation_failure_threshold is not None:
+            self.total_lines = self._get_total_lines()
+
         for line, row in enumerate(reader):
             self.line_count += 1
 
@@ -166,6 +177,20 @@ def validate(self):
                             self.failures[field_name][line].append(e)
                             self.invalid_lines.add(self.line_count)
                             validator.fail_count += 1
+                            if (
+                                self.file_validation_failure_threshold is not None
+                                and self.total_lines > 0
+                                and validator.fail_count / self.total_lines
+                                > self.file_validation_failure_threshold
+                            ):
+                                self.logger.error(
+                                    " {} failed {} time(s) ({:.1%})".format(
+                                        validator.__class__.__name__,
+                                        validator.fail_count,
+                                        validator.fail_count / self.total_lines,
+                                    )
+                                )
+                                return False
 
         if self.failures or self.row_failures:
             self.logger.info("\033[0;31m" + "Failed :(" + "\033[0m")