Skip to content

Commit

Permalink
Merge pull request #1420 from Sage-Bionetworks/develop-FDS-2015-js-va…
Browse files Browse the repository at this point in the history
…lidation

BugFix: JSON Schema Validation Issues
  • Loading branch information
mialy-defelice authored May 8, 2024
2 parents c51af2d + a898893 commit 0864e49
Show file tree
Hide file tree
Showing 13 changed files with 507 additions and 62 deletions.
24 changes: 16 additions & 8 deletions schematic/models/validate_attribute.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
np_array_to_str_list,
iterable_to_str_list,
rule_in_rule_list,
get_list_robustness,
)

from synapseclient.core.exceptions import SynapseNoCredentialsError
Expand Down Expand Up @@ -58,6 +59,7 @@ def generate_schema_error(
error_col=attribute_name,
error_message=error_message,
error_val=invalid_entry,
message_level="error",
)

return error_list, warning_list
Expand Down Expand Up @@ -475,7 +477,12 @@ def _get_rule_attributes(

is_schema_error = rule_name == "schema"
col_is_recommended = rule_name == "recommended"
col_is_required = dmge.get_node_required(node_display_name=error_col_name)

if not is_schema_error:
col_is_required = dmge.get_node_required(node_display_name=error_col_name)
else:
col_is_required = False

return (
rule_parts,
rule_name,
Expand Down Expand Up @@ -823,16 +830,17 @@ def list_validation(
# white spaces removed.
errors = []
warnings = []
replace_null = True

csv_re = comma_separated_list_regex()

rule_parts = val_rule.lower().split(" ")
if len(rule_parts) > 1:
list_robustness = rule_parts[1]
else:
list_robustness = "strict"
# Check if lists -must- be a list, or can be a single value.
list_robustness = get_list_robustness(val_rule=val_rule)

if list_robustness == "like":
replace_null = False

if list_robustness == "strict":
elif list_robustness == "strict":
manifest_col = manifest_col.astype(str)

# This will capture any if an entry is not formatted properly. Only for strict lists
Expand Down Expand Up @@ -864,7 +872,7 @@ def list_validation(
warnings.append(vr_warnings)

# Convert string to list.
manifest_col = parse_str_series_to_list(manifest_col)
manifest_col = parse_str_series_to_list(manifest_col, replace_null=replace_null)

return errors, warnings, manifest_col

Expand Down
15 changes: 10 additions & 5 deletions schematic/models/validate_manifest.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,10 @@
from schematic.store.synapse import SynapseStorage
from schematic.models.GE_Helpers import GreatExpectationsHelpers
from schematic.utils.validate_rules_utils import validation_rule_info
from schematic.utils.validate_utils import rule_in_rule_list
from schematic.utils.validate_utils import (
rule_in_rule_list,
convert_nan_entries_to_empty_strings,
)
from schematic.utils.schema_utils import extract_component_validation_rules

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -103,9 +106,9 @@ def validate_manifest_rules(
manifest: pd.core.frame.DataFrame,
dmge: DataModelGraphExplorer,
restrict_rules: bool,
project_scope: List,
project_scope: list[str],
access_token: Optional[str] = None,
) -> (pd.core.frame.DataFrame, List[List[str]]):
) -> (pd.core.frame.DataFrame, list[list[str]]):
"""
Purpose:
Take validation rules set for a particular attribute
Expand Down Expand Up @@ -295,8 +298,7 @@ def validate_manifest_values(
warnings = []
col_attr = {} # save the mapping between column index and attribute name

# Replace nans with empty strings so jsonschema
manifest = manifest.replace({np.nan: ""})
manifest = convert_nan_entries_to_empty_strings(manifest=manifest)

# numerical values need to be type string for the jsonValidator
for col in manifest.select_dtypes(
Expand Down Expand Up @@ -347,15 +349,18 @@ def validate_all(
project_scope: List,
access_token: str,
):
# Run Validation Rules
vm = ValidateManifest(errors, manifest, manifestPath, dmge, jsonSchema)
manifest, vmr_errors, vmr_warnings = vm.validate_manifest_rules(
manifest, dmge, restrict_rules, project_scope, access_token
)

if vmr_errors:
errors.extend(vmr_errors)
if vmr_warnings:
warnings.extend(vmr_warnings)

# Run JSON Schema Validation
vmv_errors, vmv_warnings = vm.validate_manifest_values(manifest, jsonSchema, dmge)
if vmv_errors:
errors.extend(vmv_errors)
Expand Down
81 changes: 76 additions & 5 deletions schematic/utils/validate_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,33 @@ def comma_separated_list_regex() -> Pattern[str]:
return csv_list_regex


def convert_nan_entries_to_empty_strings(
manifest: pd.core.frame.DataFrame,
) -> pd.core.frame.DataFrame:
"""
Nans need to be converted to empty strings for JSON Schema Validation. This helper
converts an a list with a single '<NA>' string or a single np.nan to empty strings.
These types of expected NANs come from different stages of conversion during import
and validation.
Args:
manifest: pd.core.frame.DataFrame, manifest prior to removing nans and
replacing with empty strings.
Returns:
manifest: pd.core.frame.DataFrame, manifest post removing nans and
replacing with empty strings.
"""
# Replace nans with empty strings so jsonschema, address replace type infering depreciation.
with pd.option_context("future.no_silent_downcasting", True):
manifest = manifest.replace({np.nan: ""}).infer_objects(copy=False) # type: ignore

for col in manifest.columns:
for index, value in manifest[col].items():
if value == ["<NA>"]:
manifest.loc[index, col] = [""] # type: ignore
return manifest


def rule_in_rule_list(rule: str, rule_list: list[str]) -> Optional[re.Match[str]]:
"""
Function to standardize
Expand All @@ -70,18 +97,62 @@ def rule_in_rule_list(rule: str, rule_list: list[str]) -> Optional[re.Match[str]
return re.search(rule_type, rule_list_str, flags=re.IGNORECASE)


def parse_str_series_to_list(col: pd.Series) -> pd.Series:
def get_list_robustness(val_rule: str) -> str:
"""Helper function to extract list robustness from the validation rule.
List robustness defines if the input -must- be a list (several values
or a single value with a trailing comma),
or if a user is allowed to submit a single value.
List rules default to `strict` if not defined to be `like`
Args:
val_rule: str, validation rule string.
Returns:
list_robutness: str, list robustness extracted from validation rule.
"""
list_robustness_options = ["like", "strict"]
list_robustness = None
default_robustness = list_robustness_options[1]

# Get the parts of a single rule, list is assumed to be in the first position, based on
# requirements that can be found in documentation.
rule_parts = val_rule.lower().split(" ")

if len(rule_parts) > 1:
# Check if list_robustness is defined in the rule, if not give them the default.
list_robustness_list = [
part for part in rule_parts if part in list_robustness_options
]
if list_robustness_list:
list_robustness = list_robustness_list[0]

if not list_robustness:
# If no robustness has been defined by the user, set to the default.
list_robustness = default_robustness
return list_robustness


def parse_str_series_to_list(col: pd.Series, replace_null: bool = True) -> pd.Series:
"""
Parse a pandas series of comma delimited strings
into a series with values that are lists of strings
into a series with values that are lists of strings. If replace_null, fill null values
with nan. If the type of the value needs to be an array, fill with empty list.
ex.
Input: 'a,b,c'
Output: ['a','b','c']
"""
col = col.apply(
lambda x: [s.strip() for s in str(x).split(",")] if not pd.isnull(x) else pd.NA
)
if replace_null:
col = col.apply(
lambda x: [s.strip() for s in str(x).split(",")]
if not pd.isnull(x)
else pd.NA
)
else:
col = col.apply(
lambda x: [s.strip() for s in str(x).split(",")]
if (isinstance(x, np.ndarray) and not x.any()) or not pd.isnull(x)
else []
)

return col


Expand Down
13 changes: 10 additions & 3 deletions tests/data/example.model.csv
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,16 @@ CRAM,,,"Genome Build, Genome FASTA",,FALSE,ValidValue,,,
CSV/TSV,,,Genome Build,,FALSE,ValidValue,,,
Genome Build,,"GRCh37, GRCh38, GRCm38, GRCm39",,,TRUE,DataProperty,,,
Genome FASTA,,,,,TRUE,DataProperty,,,
MockComponent,,,"Component, Check List, Check Regex List, Check Regex Single, Check Regex Format, Check Regex Integer, Check Num, Check Float, Check Int, Check String, Check URL,Check Match at Least, Check Match at Least values, Check Match Exactly, Check Match Exactly values, Check Match None, Check Match None values, Check Recommended, Check Ages, Check Unique, Check Range, Check Date, Check NA",,FALSE,DataType,,,
Check List,,"ab, cd, ef, gh",,,TRUE,DataProperty,,,list strict
Check Regex List,,,,,TRUE,DataProperty,,,list strict::regex match [a-f]
MockComponent,,,"Component, Check List, Check List Enum, Check List Like, Check List Like Enum, Check List Strict, Check List Enum Strict, Check Regex List, Check Regex List Like, Check Regex List Strict, Check Regex Single, Check Regex Format, Check Regex Integer, Check Num, Check Float, Check Int, Check String, Check URL,Check Match at Least, Check Match at Least values, Check Match Exactly, Check Match Exactly values, Check Match None, Check Match None values, Check Recommended, Check Ages, Check Unique, Check Range, Check Date, Check NA",,FALSE,DataType,,,
Check List,,,,,TRUE,DataProperty,,,list
Check List Enum,,"ab, cd, ef, gh",,,TRUE,DataProperty,,,list
Check List Like,,,,,TRUE,DataProperty,,,list like
Check List Like Enum,,"ab, cd, ef, gh",,,TRUE,DataProperty,,,list like
Check List Strict,,,,,TRUE,DataProperty,,,list strict
Check List Enum Strict,,"ab, cd, ef, gh",,,TRUE,DataProperty,,,list strict
Check Regex List,,,,,TRUE,DataProperty,,,list::regex match [a-f]
Check Regex List Strict,,,,,TRUE,DataProperty,,,list strict::regex match [a-f]
Check Regex List Like,,,,,TRUE,DataProperty,,,list like::regex match [a-f]
Check Regex Single,,,,,TRUE,DataProperty,,,regex search [a-f]
Check Regex Format,,,,,TRUE,DataProperty,,,regex match [a-f]
Check Regex Integer,,,,,TRUE,DataProperty,,,regex search ^\d+$
Expand Down
Loading

0 comments on commit 0864e49

Please sign in to comment.