From 70a004552e2f0d4c1ab658f5fd132abc760d6dfc Mon Sep 17 00:00:00 2001 From: Mialy DeFelice <85905780+mialy-defelice@users.noreply.github.com> Date: Thu, 2 May 2024 14:32:43 -0700 Subject: [PATCH 01/21] handle blank list like, add to testing st cases --- schematic/models/validate_attribute.py | 16 ++--- schematic/models/validate_manifest.py | 6 +- schematic/utils/validate_utils.py | 22 +++++-- tests/data/example.model.csv | 13 ++++- tests/data/example_test_nones.model.csv | 13 ++++- .../mock_manifests/Invalid_Test_Manifest.csv | 9 ++- .../mock_manifests/Valid_Test_Manifest.csv | 10 ++-- .../Valid_Test_Manifest_with_nones.csv | 10 ++-- tests/test_validation.py | 58 ++++++++++++++----- 9 files changed, 109 insertions(+), 48 deletions(-) diff --git a/schematic/models/validate_attribute.py b/schematic/models/validate_attribute.py index b34eb58a8..92dbdde9a 100644 --- a/schematic/models/validate_attribute.py +++ b/schematic/models/validate_attribute.py @@ -23,6 +23,7 @@ np_array_to_str_list, iterable_to_str_list, rule_in_rule_list, + get_list_robustness, ) from synapseclient.core.exceptions import SynapseNoCredentialsError @@ -800,6 +801,7 @@ def get_target_manifests( ) return synStore, target_manifest_ids, target_dataset_ids + def list_validation( self, val_rule: str, @@ -823,16 +825,16 @@ def list_validation( # white spaces removed. errors = [] warnings = [] + replace_null = True csv_re = comma_separated_list_regex() - rule_parts = val_rule.lower().split(" ") - if len(rule_parts) > 1: - list_robustness = rule_parts[1] - else: - list_robustness = "strict" + list_robustness = get_list_robustness(val_rule=val_rule) + + if list_robustness== 'like': + replace_null = False - if list_robustness == "strict": + elif list_robustness == "strict": manifest_col = manifest_col.astype(str) # This will capture any if an entry is not formatted properly. Only for strict lists @@ -864,7 +866,7 @@ def list_validation( warnings.append(vr_warnings) # Convert string to list. - manifest_col = parse_str_series_to_list(manifest_col) + manifest_col = parse_str_series_to_list(manifest_col, replace_null=replace_null) return errors, warnings, manifest_col diff --git a/schematic/models/validate_manifest.py b/schematic/models/validate_manifest.py index 118d0d6cd..6d4c5c5a5 100644 --- a/schematic/models/validate_manifest.py +++ b/schematic/models/validate_manifest.py @@ -295,8 +295,10 @@ def validate_manifest_values( warnings = [] col_attr = {} # save the mapping between column index and attribute name - # Replace nans with empty strings so jsonschema - manifest = manifest.replace({np.nan: ""}) + # Replace nans with empty strings so jsonschema, address replace type infering depreciation. + with pd.option_context("future.no_silent_downcasting", True): + manifest = manifest.replace({np.nan: ""}).infer_objects(copy=False) + #manifest = manifest.replace({np.nan: ""}) # numerical values need to be type string for the jsonValidator for col in manifest.select_dtypes( diff --git a/schematic/utils/validate_utils.py b/schematic/utils/validate_utils.py index 76cb6f903..d3cd1e545 100644 --- a/schematic/utils/validate_utils.py +++ b/schematic/utils/validate_utils.py @@ -69,8 +69,17 @@ def rule_in_rule_list(rule: str, rule_list: list[str]) -> Optional[re.Match[str] rule_list_str = "|".join(rule_list) return re.search(rule_type, rule_list_str, flags=re.IGNORECASE) +def get_list_robustness(val_rule:str) -> str: + rule_parts = val_rule.lower().split(" ") + if len(rule_parts) > 1: + list_robustness = rule_parts[1] + else: + list_robustness = "strict" + return list_robustness -def parse_str_series_to_list(col: pd.Series) -> pd.Series: + + +def parse_str_series_to_list(col: pd.Series, replace_null:bool=True) -> pd.Series: """ Parse a pandas series of comma delimited strings into a series with values that are lists of strings @@ -79,9 +88,14 @@ def parse_str_series_to_list(col: pd.Series) -> pd.Series: Output: ['a','b','c'] """ - col = col.apply( - lambda x: [s.strip() for s in str(x).split(",")] if not pd.isnull(x) else pd.NA - ) + if replace_null: + col = col.apply( + lambda x: [s.strip() for s in str(x).split(",")] if not pd.isnull(x) else pd.NA + ) + else: + col = col.apply( + lambda x: [s.strip() for s in str(x).split(",")] if (isinstance(x, np.ndarray) and not x.any()) or not pd.isnull(x) else []) + return col diff --git a/tests/data/example.model.csv b/tests/data/example.model.csv index bdafffa00..a99a404f1 100644 --- a/tests/data/example.model.csv +++ b/tests/data/example.model.csv @@ -19,9 +19,16 @@ CRAM,,,"Genome Build, Genome FASTA",,FALSE,ValidValue,,, CSV/TSV,,,Genome Build,,FALSE,ValidValue,,, Genome Build,,"GRCh37, GRCh38, GRCm38, GRCm39",,,TRUE,DataProperty,,, Genome FASTA,,,,,TRUE,DataProperty,,, -MockComponent,,,"Component, Check List, Check Regex List, Check Regex Single, Check Regex Format, Check Regex Integer, Check Num, Check Float, Check Int, Check String, Check URL,Check Match at Least, Check Match at Least values, Check Match Exactly, Check Match Exactly values, Check Match None, Check Match None values, Check Recommended, Check Ages, Check Unique, Check Range, Check Date, Check NA",,FALSE,DataType,,, -Check List,,"ab, cd, ef, gh",,,TRUE,DataProperty,,,list strict -Check Regex List,,,,,TRUE,DataProperty,,,list strict::regex match [a-f] +MockComponent,,,"Component, Check List, Check List Enum, Check List Like, Check List Like Enum, Check List Strict, Check List Enum Strict, Check Regex List, Check Regex List Like, Check Regex List Strict, Check Regex Single, Check Regex Format, Check Regex Integer, Check Num, Check Float, Check Int, Check String, Check URL,Check Match at Least, Check Match at Least values, Check Match Exactly, Check Match Exactly values, Check Match None, Check Match None values, Check Recommended, Check Ages, Check Unique, Check Range, Check Date, Check NA",,FALSE,DataType,,, +Check List,,,,,TRUE,DataProperty,,,list +Check List Enum,,"ab, cd, ef, gh",,,TRUE,DataProperty,,,list +Check List Like,,,,,TRUE,DataProperty,,,list like +Check List Like Enum,,"ab, cd, ef, gh",,,TRUE,DataProperty,,,list like +Check List Strict,,,,,TRUE,DataProperty,,,list strict +Check List Enum Strict,,"ab, cd, ef, gh",,,TRUE,DataProperty,,,list strict +Check Regex List,,,,,TRUE,DataProperty,,,list::regex match [a-f] +Check Regex List Strict,,,,,TRUE,DataProperty,,,list strict::regex match [a-f] +Check Regex List Like,,,,,TRUE,DataProperty,,,list like::regex match [a-f] Check Regex Single,,,,,TRUE,DataProperty,,,regex search [a-f] Check Regex Format,,,,,TRUE,DataProperty,,,regex match [a-f] Check Regex Integer,,,,,TRUE,DataProperty,,,regex search ^\d+$ diff --git a/tests/data/example_test_nones.model.csv b/tests/data/example_test_nones.model.csv index 718cee624..e39ec8e5d 100644 --- a/tests/data/example_test_nones.model.csv +++ b/tests/data/example_test_nones.model.csv @@ -19,9 +19,16 @@ CRAM,,,,"Genome Build, Genome FASTA",,FALSE,ValidValue,, CSV/TSV,,,,Genome Build,,FALSE,ValidValue,, Genome Build,,,"GRCh37, GRCh38, GRCm38, GRCm39",,,TRUE,DataProperty,, Genome FASTA,,,,,,TRUE,DataProperty,, -MockComponent,,,,"Component, Check List, Check Regex List, Check Regex Single, Check Regex Format, Check Regex Integer, Check Num, Check Float, Check Int, Check String, Check URL,Check Match at Least, Check Match at Least values, Check Match Exactly, Check Match Exactly values, Check Recommended, Check Ages, Check Unique, Check Range, Check Date, Check NA",,FALSE,DataType,, -Check List,list strict ,,"ab, cd, ef, gh",,,FALSE,DataProperty,, -Check Regex List,list strict::regex match [a-f],,,,,FALSE,DataProperty,, +MockComponent,,,,"Component, Check List, Check List Enum, Check List Like, Check List Like Enum, Check List Strict, Check List Enum Strict, Check Regex List, Check Regex List Like, Check Regex List Strict, Check Regex Single, Check Regex Format, Check Regex Integer, Check Num, Check Float, Check Int, Check String, Check URL,Check Match at Least, Check Match at Least values, Check Match Exactly, Check Match Exactly values, Check Recommended, Check Ages, Check Unique, Check Range, Check Date, Check NA",,FALSE,DataType,, +Check List,list,,,,,FALSE,DataProperty,, +Check List Enum,list,,"ab, cd, ef, gh",,,FALSE,DataProperty,, +Check List Like,list like,,,,,FALSE,DataProperty,, +Check List Like Enum,list like,,"ab, cd, ef, gh",,,FALSE,DataProperty,, +Check List Strict,list strict ,,,,,FALSE,DataProperty,, +Check List Enum Strict,list strict ,,"ab, cd, ef, gh",,,FALSE,DataProperty,, +Check Regex List,list::regex match [a-f],,,,,FALSE,DataProperty,, +Check Regex List Strict,list strict::regex match [a-f],,,,,FALSE,DataProperty,, +Check Regex List Like,list like::regex match [a-f],,,,,FALSE,DataProperty,, Check Regex Single,regex search [a-f],,,,,FALSE,DataProperty,, Check Regex Format,regex match [a-f],,,,,FALSE,DataProperty,, Check Regex Integer,regex search ^\d+$,,,,,FALSE,DataProperty,, diff --git a/tests/data/mock_manifests/Invalid_Test_Manifest.csv b/tests/data/mock_manifests/Invalid_Test_Manifest.csv index fcdc4d97b..258740064 100644 --- a/tests/data/mock_manifests/Invalid_Test_Manifest.csv +++ b/tests/data/mock_manifests/Invalid_Test_Manifest.csv @@ -1,5 +1,4 @@ -Component,Check List,Check Regex List,Check Regex Single,Check Regex Format,Check Regex Integer,Check Num,Check Float,Check Int,Check String,Check URL,Check Match at Least,Check Match at Least values,Check Match Exactly,Check Match Exactly values,Check Match None,Check Match None values,Check Recommended,Check Ages,Check Unique,Check Range,Check Date,Check NA -MockComponent,"ab,cd","ab,cd,ef",a,a,5.4,6,99.65,7,valid,https://www.google.com/,1738,1738,8085,98085,1911,334,,6549,str1,70,32-984,7 -MockComponent,invalid list values,ab cd ef,q,m,0,c,99,5.63,94,http://googlef.com/,7163,51100,9965,71738,123,717,,32851,str1,30,notADate,9.5 -MockComponent,"ab,cd","ab,cd,ef",b,b,683902,6.5,62.3,2,valid,https://github.com/Sage-Bionetworks/schematic,8085,8085,1738,210065,1427,123,,6550,str1,90,84-43-094,Not Applicable -,,,,,,,,,,,,,,,,,,,,, \ No newline at end of file +Component,Check List,Check List Enum,Check List Like,Check List Like Enum,Check List Strict,Check List Enum Strict,Check Regex List,Check Regex List Like,Check Regex List Strict,Check List,Check Regex List,Check Regex Single,Check Regex Format,Check Regex Integer,Check Num,Check Float,Check Int,Check String,Check URL,Check Match at Least,Check Match at Least values,Check Match Exactly,Check Match Exactly values,Check Match None,Check Match None values,Check Recommended,Check Ages,Check Unique,Check Range,Check Date,Check NA +MockComponent,"ab,cd","ab,cd",ab,ab,"ab,cd","ab,cd","a,c,f","a,c,f","a,c,f","ab,cd","ab,cd,ef",a,a,5.4,6,99.65,7,valid,https://www.google.com/,1738,1738,8085,98085,1911,334,,6549,str1,70,32-984,7 +MockComponent,9,a,c,c,ab cd,ab,a c f,a c f,a c f,invalid list values,ab cd ef,q,m,0,c,99,5.63,94,http://googlef.com/,7163,51100,9965,71738,123,717,,32851,str1,30,notADate,9.5 +MockComponent,ab,ab,"cd, ab","cd, ab",ab,"a, b",a,a,a,"ab,cd","ab,cd,ef",b,b,683902,6.5,62.3,2,valid,https://github.com/Sage-Bionetworks/schematic,8085,8085,1738,210065,1427,123,,6550,str1,90,84-43-094,Not Applicable diff --git a/tests/data/mock_manifests/Valid_Test_Manifest.csv b/tests/data/mock_manifests/Valid_Test_Manifest.csv index 7a4e53671..f47b4364c 100644 --- a/tests/data/mock_manifests/Valid_Test_Manifest.csv +++ b/tests/data/mock_manifests/Valid_Test_Manifest.csv @@ -1,5 +1,5 @@ -Component,Check List,Check Regex List,Check Regex Single,Check Regex Format,Check Regex Integer,Check Num,Check Float,Check Int,Check String,Check URL,Check Match at Least,Check Match at Least values,Check Match Exactly,Check Match Exactly values,Check Match None,Check Match None values,Check Recommended,Check Ages,Check Unique,Check Range,Check Date,Check NA -MockComponent,"ab,cd","a,c,f",a,a,0,6,99.65,7,valid,https://www.google.com/,1738,1738,8085,8085,1911,334,,6571,str1,75,10/21/2022,Not Applicable -MockComponent,"ab,cd","a,c,f",e,b,1234,71,58.4,3,valid,https://www.google.com/,9965,9965,9965,9965,1655,717,,6571,str2,80,October 21 2022,8 -MockComponent,"ab,cd","b,d,f",b,c,683902,6.5,62.3,2,valid,https://www.google.com/,8085,8085,1738,1738,1427,206,present,32849,str3,95,10/21/2022,Not Applicable -MockComponent,"ab,cd","b,d,f",b,c,0,6.5,62.3,2,valid,https://www.google.com/,79,79,7,7,6635,608,,32849,str4,55,21/10/2022,695 \ No newline at end of file +Component,Check List,Check List Enum,Check List Like,Check List Like Enum,Check List Strict,Check List Enum Strict,Check Regex List,Check Regex List Like,Check Regex List Strict,Check Regex Single,Check Regex Format,Check Regex Integer,Check Num,Check Float,Check Int,Check String,Check URL,Check Match at Least,Check Match at Least values,Check Match Exactly,Check Match Exactly values,Check Match None,Check Match None values,Check Recommended,Check Ages,Check Unique,Check Range,Check Date,Check NA +MockComponent,"ab,cd","ab,cd",ab,ab,"ab,cd","ab,cd","a,c,f",a,"a,c,f",a,a,0,6,99.65,7,valid,https://www.google.com/,1738,1738,8085,8085,1911,334,,6571,str1,75,10/21/2022,Not Applicable +MockComponent,"ab,cd","ab,cd",cd,cd,"ab,cd","ab,cd","a,c,f","a,c,f","a,c,f",e,b,1234,71,58.4,3,valid,https://www.google.com/,9965,9965,9965,9965,1655,717,,6571,str2,80,October 21 2022,8 +MockComponent,"ab,cd","ab, cd","cd,ab",ab,"ab,cd","ab,cd","a,b",c,"a,b",b,c,683902,6.5,62.3,2,valid,https://www.google.com/,8085,8085,1738,1738,1427,206,present,32849,str3,95,10/21/2022,Not Applicable +MockComponent,"ab,cd","ab,cd","ab,cd","ab,cd","ab,cd","ab,cd","b,d,f","b,d,f","b,d,f",b,c,0,6.5,62.3,2,valid,https://www.google.com/,79,79,7,7,6635,608,,32849,str4,55,21/10/2022,695 \ No newline at end of file diff --git a/tests/data/mock_manifests/Valid_Test_Manifest_with_nones.csv b/tests/data/mock_manifests/Valid_Test_Manifest_with_nones.csv index 091ac8c0a..0bbc8c71f 100644 --- a/tests/data/mock_manifests/Valid_Test_Manifest_with_nones.csv +++ b/tests/data/mock_manifests/Valid_Test_Manifest_with_nones.csv @@ -1,5 +1,5 @@ -Component,Check List,Check Regex List,Check Regex Single,Check Regex Format,Check Regex Integer,Check Num,Check Float,Check Int,Check String,Check URL,Check Match at Least,Check Match at Least values,Check Match Exactly,Check Match Exactly values,Check Recommended,Check Ages,Check Unique,Check Range,Check Date,Check NA -MockComponent,"ab,cd","a,c,f",a,a,,6,99.65,7,valid,https://www.google.com/,1738,1738,8085,8085,,6571,str1,75,10/21/2022,Not Applicable -MockComponent,"ab,cd","a,c,f",,b,1234,71,58.4,3,valid,https://www.google.com/,9965,9965,9965,9965,,6571,str2,80,October 21 2022,8 -MockComponent,,,b,,683902,,,,,,,,,,present,,,,, -MockComponent,"ab,cd","b,d,f",b,c,0,6.5,62.3,2,valid,https://www.google.com/,79,79,7,7,,32849,str4,55,21/10/2022,695 \ No newline at end of file +Component,Check List,Check List Enum,Check List Like,Check List Like Enum,Check List Strict,Check List Enum Strict,Check Regex List,Check Regex List Like,Check Regex List Strict,Check Regex Single,Check Regex Format,Check Regex Integer,Check Num,Check Float,Check Int,Check String,Check URL,Check Match at Least,Check Match at Least values,Check Match Exactly,Check Match Exactly values,Check Recommended,Check Ages,Check Unique,Check Range,Check Date,Check NA +MockComponent,"ab,cd","ab,cd",ab,ab,"ab,cd","ab,cd","a,c,f",a,"a,c,f",a,a,,6,99.65,7,valid,https://www.google.com/,1738,1738,8085,8085,,6571,str1,75,10/21/2022,Not Applicable +MockComponent,"ab,cd","ab,cd",,,"ab,cd","ab,cd","a,c,f","a,c,f","a,c,f",,b,1234,71,58.4,3,valid,https://www.google.com/,9965,9965,9965,9965,,6571,str2,80,October 21 2022,8 +MockComponent,,,,,,,,,,b,,683902,,,,,,,,,,present,,,,, +MockComponent,"ab,cd","ab,cd","ab,cd","ab,cd","ab,cd","ab,cd","b,d,f","b,d,f","b,d,f",b,c,0,6.5,62.3,2,valid,https://www.google.com/,79,79,7,7,,32849,str4,55,21/10/2022,695 \ No newline at end of file diff --git a/tests/test_validation.py b/tests/test_validation.py index f8b9a5746..2b8167ed8 100644 --- a/tests/test_validation.py +++ b/tests/test_validation.py @@ -189,22 +189,52 @@ def test_invalid_manifest(self, helpers, dmge): )[0] in errors assert GenerateError.generate_list_error( - val_rule="list strict", - list_string="invalid list values", + val_rule="list", + list_string="9", row_num="3", attribute_name="Check List", list_error="not_comma_delimited", - invalid_entry="invalid list values", + invalid_entry="9", dmge=dmge, )[0] in errors assert GenerateError.generate_list_error( - val_rule="list strict", - list_string="ab cd ef", + val_rule="list", + list_string="ab", + row_num="4", + attribute_name="Check List", + list_error="not_comma_delimited", + invalid_entry="ab", + dmge=dmge, + )[0] in errors + + assert GenerateError.generate_list_error( + val_rule="list", + list_string="a c f", row_num="3", attribute_name="Check Regex List", list_error="not_comma_delimited", - invalid_entry="ab cd ef", + invalid_entry="a c f", + dmge=dmge, + )[0] in errors + + assert GenerateError.generate_list_error( + val_rule="list", + list_string="a", + row_num="4", + attribute_name="Check Regex List", + list_error="not_comma_delimited", + invalid_entry="a", + dmge=dmge, + )[0] in errors + + assert GenerateError.generate_list_error( + val_rule="list", + list_string="a", + row_num="4", + attribute_name="Check Regex List", + list_error="not_comma_delimited", + invalid_entry="a", dmge=dmge, )[0] in errors @@ -402,22 +432,22 @@ def test_in_house_validation(self, helpers, dmge): )[0] in errors assert GenerateError.generate_list_error( - val_rule="list strict", - list_string="invalid list values", + val_rule="list", + list_string="9", row_num="3", attribute_name="Check List", list_error="not_comma_delimited", - invalid_entry="invalid list values", + invalid_entry="9", dmge=dmge, )[0] in errors assert GenerateError.generate_list_error( - val_rule="list strict", - list_string="ab cd ef", - row_num="3", - attribute_name="Check Regex List", + val_rule="list", + list_string="ab", + row_num="4", + attribute_name="Check List", list_error="not_comma_delimited", - invalid_entry="ab cd ef", + invalid_entry="ab", dmge=dmge, )[0] in errors From 5aab710764924f2fa878071d706121f84c360f53 Mon Sep 17 00:00:00 2001 From: Mialy DeFelice <85905780+mialy-defelice@users.noreply.github.com> Date: Thu, 2 May 2024 14:37:49 -0700 Subject: [PATCH 02/21] update example.model.jsonld --- tests/data/example.model.jsonld | 218 +++++++++++++++++++++++++++++++- 1 file changed, 213 insertions(+), 5 deletions(-) diff --git a/tests/data/example.model.jsonld b/tests/data/example.model.jsonld index df8c50b9a..44cea61d5 100644 --- a/tests/data/example.model.jsonld +++ b/tests/data/example.model.jsonld @@ -880,9 +880,30 @@ { "@id": "bts:CheckList" }, + { + "@id": "bts:CheckListEnum" + }, + { + "@id": "bts:CheckListLike" + }, + { + "@id": "bts:CheckListLikeEnum" + }, + { + "@id": "bts:CheckListStrict" + }, + { + "@id": "bts:CheckListEnumStrict" + }, { "@id": "bts:CheckRegexList" }, + { + "@id": "bts:CheckRegexListLike" + }, + { + "@id": "bts:CheckRegexListStrict" + }, { "@id": "bts:CheckRegexSingle" }, @@ -959,6 +980,25 @@ "schema:isPartOf": { "@id": "http://schema.biothings.io" }, + "sms:displayName": "Check List", + "sms:required": "sms:true", + "sms:validationRules": [ + "list" + ] + }, + { + "@id": "bts:CheckListEnum", + "@type": "rdfs:Class", + "rdfs:comment": "TBD", + "rdfs:label": "CheckListEnum", + "rdfs:subClassOf": [ + { + "@id": "bts:DataProperty" + } + ], + "schema:isPartOf": { + "@id": "http://schema.biothings.io" + }, "schema:rangeIncludes": [ { "@id": "bts:Ab" @@ -973,7 +1013,111 @@ "@id": "bts:Gh" } ], - "sms:displayName": "Check List", + "sms:displayName": "Check List Enum", + "sms:required": "sms:true", + "sms:validationRules": [ + "list" + ] + }, + { + "@id": "bts:CheckListLike", + "@type": "rdfs:Class", + "rdfs:comment": "TBD", + "rdfs:label": "CheckListLike", + "rdfs:subClassOf": [ + { + "@id": "bts:DataProperty" + } + ], + "schema:isPartOf": { + "@id": "http://schema.biothings.io" + }, + "sms:displayName": "Check List Like", + "sms:required": "sms:true", + "sms:validationRules": [ + "list like" + ] + }, + { + "@id": "bts:CheckListLikeEnum", + "@type": "rdfs:Class", + "rdfs:comment": "TBD", + "rdfs:label": "CheckListLikeEnum", + "rdfs:subClassOf": [ + { + "@id": "bts:DataProperty" + } + ], + "schema:isPartOf": { + "@id": "http://schema.biothings.io" + }, + "schema:rangeIncludes": [ + { + "@id": "bts:Ab" + }, + { + "@id": "bts:Cd" + }, + { + "@id": "bts:Ef" + }, + { + "@id": "bts:Gh" + } + ], + "sms:displayName": "Check List Like Enum", + "sms:required": "sms:true", + "sms:validationRules": [ + "list like" + ] + }, + { + "@id": "bts:CheckListStrict", + "@type": "rdfs:Class", + "rdfs:comment": "TBD", + "rdfs:label": "CheckListStrict", + "rdfs:subClassOf": [ + { + "@id": "bts:DataProperty" + } + ], + "schema:isPartOf": { + "@id": "http://schema.biothings.io" + }, + "sms:displayName": "Check List Strict", + "sms:required": "sms:true", + "sms:validationRules": [ + "list strict" + ] + }, + { + "@id": "bts:CheckListEnumStrict", + "@type": "rdfs:Class", + "rdfs:comment": "TBD", + "rdfs:label": "CheckListEnumStrict", + "rdfs:subClassOf": [ + { + "@id": "bts:DataProperty" + } + ], + "schema:isPartOf": { + "@id": "http://schema.biothings.io" + }, + "schema:rangeIncludes": [ + { + "@id": "bts:Ab" + }, + { + "@id": "bts:Cd" + }, + { + "@id": "bts:Ef" + }, + { + "@id": "bts:Gh" + } + ], + "sms:displayName": "Check List Enum Strict", "sms:required": "sms:true", "sms:validationRules": [ "list strict" @@ -994,6 +1138,46 @@ }, "sms:displayName": "Check Regex List", "sms:required": "sms:true", + "sms:validationRules": [ + "list", + "regex match [a-f]" + ] + }, + { + "@id": "bts:CheckRegexListLike", + "@type": "rdfs:Class", + "rdfs:comment": "TBD", + "rdfs:label": "CheckRegexListLike", + "rdfs:subClassOf": [ + { + "@id": "bts:DataProperty" + } + ], + "schema:isPartOf": { + "@id": "http://schema.biothings.io" + }, + "sms:displayName": "Check Regex List Like", + "sms:required": "sms:true", + "sms:validationRules": [ + "list like", + "regex match [a-f]" + ] + }, + { + "@id": "bts:CheckRegexListStrict", + "@type": "rdfs:Class", + "rdfs:comment": "TBD", + "rdfs:label": "CheckRegexListStrict", + "rdfs:subClassOf": [ + { + "@id": "bts:DataProperty" + } + ], + "schema:isPartOf": { + "@id": "http://schema.biothings.io" + }, + "sms:displayName": "Check Regex List Strict", + "sms:required": "sms:true", "sms:validationRules": [ "list strict", "regex match [a-f]" @@ -1387,7 +1571,13 @@ "rdfs:label": "Ab", "rdfs:subClassOf": [ { - "@id": "bts:CheckList" + "@id": "bts:CheckListEnum" + }, + { + "@id": "bts:CheckListLikeEnum" + }, + { + "@id": "bts:CheckListEnumStrict" } ], "schema:isPartOf": { @@ -1404,7 +1594,13 @@ "rdfs:label": "Cd", "rdfs:subClassOf": [ { - "@id": "bts:CheckList" + "@id": "bts:CheckListEnum" + }, + { + "@id": "bts:CheckListLikeEnum" + }, + { + "@id": "bts:CheckListEnumStrict" } ], "schema:isPartOf": { @@ -1421,7 +1617,13 @@ "rdfs:label": "Ef", "rdfs:subClassOf": [ { - "@id": "bts:CheckList" + "@id": "bts:CheckListEnum" + }, + { + "@id": "bts:CheckListLikeEnum" + }, + { + "@id": "bts:CheckListEnumStrict" } ], "schema:isPartOf": { @@ -1438,7 +1640,13 @@ "rdfs:label": "Gh", "rdfs:subClassOf": [ { - "@id": "bts:CheckList" + "@id": "bts:CheckListEnum" + }, + { + "@id": "bts:CheckListLikeEnum" + }, + { + "@id": "bts:CheckListEnumStrict" } ], "schema:isPartOf": { From d387d0495100804e431184bcc54d4d1f41c7540d Mon Sep 17 00:00:00 2001 From: Mialy DeFelice <85905780+mialy-defelice@users.noreply.github.com> Date: Thu, 2 May 2024 15:23:47 -0700 Subject: [PATCH 03/21] fix key error for wrong schema --- schematic/models/validate_attribute.py | 8 +++++++- schematic/models/validate_manifest.py | 2 +- .../Invalid_Biospecimen_Missing_Column_Manifest.csv | 2 ++ 3 files changed, 10 insertions(+), 2 deletions(-) create mode 100644 tests/data/mock_manifests/Invalid_Biospecimen_Missing_Column_Manifest.csv diff --git a/schematic/models/validate_attribute.py b/schematic/models/validate_attribute.py index 92dbdde9a..80309afb2 100644 --- a/schematic/models/validate_attribute.py +++ b/schematic/models/validate_attribute.py @@ -59,6 +59,7 @@ def generate_schema_error( error_col=attribute_name, error_message=error_message, error_val=invalid_entry, + message_level="error", ) return error_list, warning_list @@ -476,7 +477,12 @@ def _get_rule_attributes( is_schema_error = rule_name == "schema" col_is_recommended = rule_name == "recommended" - col_is_required = dmge.get_node_required(node_display_name=error_col_name) + + if not is_schema_error: + col_is_required = dmge.get_node_required(node_display_name=error_col_name) + else: + col_is_required = False + return ( rule_parts, rule_name, diff --git a/schematic/models/validate_manifest.py b/schematic/models/validate_manifest.py index 6d4c5c5a5..35184d3c0 100644 --- a/schematic/models/validate_manifest.py +++ b/schematic/models/validate_manifest.py @@ -298,7 +298,6 @@ def validate_manifest_values( # Replace nans with empty strings so jsonschema, address replace type infering depreciation. with pd.option_context("future.no_silent_downcasting", True): manifest = manifest.replace({np.nan: ""}).infer_objects(copy=False) - #manifest = manifest.replace({np.nan: ""}) # numerical values need to be type string for the jsonValidator for col in manifest.select_dtypes( @@ -353,6 +352,7 @@ def validate_all( manifest, vmr_errors, vmr_warnings = vm.validate_manifest_rules( manifest, dmge, restrict_rules, project_scope, access_token ) + if vmr_errors: errors.extend(vmr_errors) if vmr_warnings: diff --git a/tests/data/mock_manifests/Invalid_Biospecimen_Missing_Column_Manifest.csv b/tests/data/mock_manifests/Invalid_Biospecimen_Missing_Column_Manifest.csv new file mode 100644 index 000000000..f84922de2 --- /dev/null +++ b/tests/data/mock_manifests/Invalid_Biospecimen_Missing_Column_Manifest.csv @@ -0,0 +1,2 @@ +Sample ID,Patient ID,Component +123,123,Biospecimen \ No newline at end of file From c5c2d627894f03fde985db8717bd2b360dadd1a0 Mon Sep 17 00:00:00 2001 From: Mialy DeFelice <85905780+mialy-defelice@users.noreply.github.com> Date: Thu, 2 May 2024 15:32:14 -0700 Subject: [PATCH 04/21] add test_missing_column test --- tests/test_validation.py | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/tests/test_validation.py b/tests/test_validation.py index 2b8167ed8..b2b85851d 100644 --- a/tests/test_validation.py +++ b/tests/test_validation.py @@ -545,6 +545,30 @@ def test_in_house_validation(self, helpers, dmge): dmge=dmge, )[1] in warnings + + def test_missing_column(self, helpers, dmge:DataModelGraph): + """ Test that a manifest missing a column returns the proper error. + """ + model_name="example.model.csv" + manifest_name="mock_manifests/Invalid_Biospecimen_Missing_Column_Manifest.csv" + root_node="Biospecimen" + manifest_path = helpers.get_data_path(manifest_name) + + metadataModel = get_metadataModel(helpers, model_name) + errors, warnings = metadataModel.validateModelManifest( + manifestPath=manifest_path, + rootNode=root_node, + ) + + assert GenerateError.generate_schema_error( + row_num='2', + attribute_name="Wrong schema", + error_message="'Tissue Status' is a required property", + invalid_entry="Wrong schema", + dmge=dmge, + )[0] in errors + + @pytest.mark.parametrize( "model_name", [ From 616e2d186a7ae9717d52dae5af8390bc729ff1df Mon Sep 17 00:00:00 2001 From: Mialy DeFelice <85905780+mialy-defelice@users.noreply.github.com> Date: Thu, 2 May 2024 15:36:29 -0700 Subject: [PATCH 05/21] add docstring to get_list_robustness helper funct --- schematic/utils/validate_utils.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/schematic/utils/validate_utils.py b/schematic/utils/validate_utils.py index d3cd1e545..13d293381 100644 --- a/schematic/utils/validate_utils.py +++ b/schematic/utils/validate_utils.py @@ -70,6 +70,12 @@ def rule_in_rule_list(rule: str, rule_list: list[str]) -> Optional[re.Match[str] return re.search(rule_type, rule_list_str, flags=re.IGNORECASE) def get_list_robustness(val_rule:str) -> str: + """ Helper function to extract list robustness from the validation rule. + Args: + val_rule: str, validation rule string. + Returns: + list_robutness: str, list robustness extracted from validation rule. + """ rule_parts = val_rule.lower().split(" ") if len(rule_parts) > 1: list_robustness = rule_parts[1] @@ -77,8 +83,6 @@ def get_list_robustness(val_rule:str) -> str: list_robustness = "strict" return list_robustness - - def parse_str_series_to_list(col: pd.Series, replace_null:bool=True) -> pd.Series: """ Parse a pandas series of comma delimited strings From b92ea07f87568e51aa3a94aaac5934a2ad332193 Mon Sep 17 00:00:00 2001 From: Mialy DeFelice <85905780+mialy-defelice@users.noreply.github.com> Date: Thu, 2 May 2024 15:39:36 -0700 Subject: [PATCH 06/21] run poetry --- schematic/models/validate_attribute.py | 3 +-- schematic/utils/validate_utils.py | 17 ++++++++++++----- 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/schematic/models/validate_attribute.py b/schematic/models/validate_attribute.py index 80309afb2..de5977fe8 100644 --- a/schematic/models/validate_attribute.py +++ b/schematic/models/validate_attribute.py @@ -807,7 +807,6 @@ def get_target_manifests( ) return synStore, target_manifest_ids, target_dataset_ids - def list_validation( self, val_rule: str, @@ -837,7 +836,7 @@ def list_validation( list_robustness = get_list_robustness(val_rule=val_rule) - if list_robustness== 'like': + if list_robustness == "like": replace_null = False elif list_robustness == "strict": diff --git a/schematic/utils/validate_utils.py b/schematic/utils/validate_utils.py index 13d293381..227ced5f5 100644 --- a/schematic/utils/validate_utils.py +++ b/schematic/utils/validate_utils.py @@ -69,8 +69,9 @@ def rule_in_rule_list(rule: str, rule_list: list[str]) -> Optional[re.Match[str] rule_list_str = "|".join(rule_list) return re.search(rule_type, rule_list_str, flags=re.IGNORECASE) -def get_list_robustness(val_rule:str) -> str: - """ Helper function to extract list robustness from the validation rule. + +def get_list_robustness(val_rule: str) -> str: + """Helper function to extract list robustness from the validation rule. Args: val_rule: str, validation rule string. Returns: @@ -83,7 +84,8 @@ def get_list_robustness(val_rule:str) -> str: list_robustness = "strict" return list_robustness -def parse_str_series_to_list(col: pd.Series, replace_null:bool=True) -> pd.Series: + +def parse_str_series_to_list(col: pd.Series, replace_null: bool = True) -> pd.Series: """ Parse a pandas series of comma delimited strings into a series with values that are lists of strings @@ -94,11 +96,16 @@ def parse_str_series_to_list(col: pd.Series, replace_null:bool=True) -> pd.Serie """ if replace_null: col = col.apply( - lambda x: [s.strip() for s in str(x).split(",")] if not pd.isnull(x) else pd.NA + lambda x: [s.strip() for s in str(x).split(",")] + if not pd.isnull(x) + else pd.NA ) else: col = col.apply( - lambda x: [s.strip() for s in str(x).split(",")] if (isinstance(x, np.ndarray) and not x.any()) or not pd.isnull(x) else []) + lambda x: [s.strip() for s in str(x).split(",")] + if (isinstance(x, np.ndarray) and not x.any()) or not pd.isnull(x) + else [] + ) return col From 20b966cb2ab42a18c305c2eeb1ff67e5d4149a46 Mon Sep 17 00:00:00 2001 From: Mialy DeFelice <85905780+mialy-defelice@users.noreply.github.com> Date: Thu, 2 May 2024 21:50:34 -0700 Subject: [PATCH 07/21] add a helper function to convert nans --- schematic/models/validate_manifest.py | 6 ++---- schematic/utils/validate_utils.py | 19 +++++++++++++++++++ 2 files changed, 21 insertions(+), 4 deletions(-) diff --git a/schematic/models/validate_manifest.py b/schematic/models/validate_manifest.py index 35184d3c0..e2f89279b 100644 --- a/schematic/models/validate_manifest.py +++ b/schematic/models/validate_manifest.py @@ -25,7 +25,7 @@ from schematic.store.synapse import SynapseStorage from schematic.models.GE_Helpers import GreatExpectationsHelpers from schematic.utils.validate_rules_utils import validation_rule_info -from schematic.utils.validate_utils import rule_in_rule_list +from schematic.utils.validate_utils import rule_in_rule_list, convert_nan_entries_to_empty_strings from schematic.utils.schema_utils import extract_component_validation_rules logger = logging.getLogger(__name__) @@ -295,9 +295,7 @@ def validate_manifest_values( warnings = [] col_attr = {} # save the mapping between column index and attribute name - # Replace nans with empty strings so jsonschema, address replace type infering depreciation. - with pd.option_context("future.no_silent_downcasting", True): - manifest = manifest.replace({np.nan: ""}).infer_objects(copy=False) + manifest = convert_nan_entries_to_empty_strings(manifest=manifest) # numerical values need to be type string for the jsonValidator for col in manifest.select_dtypes( diff --git a/schematic/utils/validate_utils.py b/schematic/utils/validate_utils.py index 227ced5f5..1ffa125dd 100644 --- a/schematic/utils/validate_utils.py +++ b/schematic/utils/validate_utils.py @@ -53,6 +53,25 @@ def comma_separated_list_regex() -> Pattern[str]: return csv_list_regex +def convert_nan_entries_to_empty_strings(manifest:pd.Series)->pd.Series: + """ + Nans need to be converted to empty strings for JSON Schema Validation. This helper + converts an a list with a single '' string or a single np.nan to empty strings. + These types of expected NANs come from different stages of conversion during import + and validation. + + """ + # Replace nans with empty strings so jsonschema, address replace type infering depreciation. + with pd.option_context("future.no_silent_downcasting", True): + manifest = manifest.replace({np.nan:""}).infer_objects(copy=False) + + for col in manifest.columns: + for index, value in manifest[col].items(): + if value == ['']: + #manifest[col][index] = [""] + manifest.loc[index, col] = [""] + return manifest + def rule_in_rule_list(rule: str, rule_list: list[str]) -> Optional[re.Match[str]]: """ From 7de939d73076aaebdaaced3b2edaf0163eaacb6e Mon Sep 17 00:00:00 2001 From: Mialy DeFelice <85905780+mialy-defelice@users.noreply.github.com> Date: Thu, 2 May 2024 21:52:53 -0700 Subject: [PATCH 08/21] run black --- schematic/models/validate_manifest.py | 5 ++++- schematic/utils/validate_utils.py | 9 +++++---- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/schematic/models/validate_manifest.py b/schematic/models/validate_manifest.py index e2f89279b..991dcfde3 100644 --- a/schematic/models/validate_manifest.py +++ b/schematic/models/validate_manifest.py @@ -25,7 +25,10 @@ from schematic.store.synapse import SynapseStorage from schematic.models.GE_Helpers import GreatExpectationsHelpers from schematic.utils.validate_rules_utils import validation_rule_info -from schematic.utils.validate_utils import rule_in_rule_list, convert_nan_entries_to_empty_strings +from schematic.utils.validate_utils import ( + rule_in_rule_list, + convert_nan_entries_to_empty_strings, +) from schematic.utils.schema_utils import extract_component_validation_rules logger = logging.getLogger(__name__) diff --git a/schematic/utils/validate_utils.py b/schematic/utils/validate_utils.py index 1ffa125dd..76ddd97ba 100644 --- a/schematic/utils/validate_utils.py +++ b/schematic/utils/validate_utils.py @@ -53,7 +53,8 @@ def comma_separated_list_regex() -> Pattern[str]: return csv_list_regex -def convert_nan_entries_to_empty_strings(manifest:pd.Series)->pd.Series: + +def convert_nan_entries_to_empty_strings(manifest: pd.Series) -> pd.Series: """ Nans need to be converted to empty strings for JSON Schema Validation. This helper converts an a list with a single '' string or a single np.nan to empty strings. @@ -63,12 +64,12 @@ def convert_nan_entries_to_empty_strings(manifest:pd.Series)->pd.Series: """ # Replace nans with empty strings so jsonschema, address replace type infering depreciation. with pd.option_context("future.no_silent_downcasting", True): - manifest = manifest.replace({np.nan:""}).infer_objects(copy=False) + manifest = manifest.replace({np.nan: ""}).infer_objects(copy=False) for col in manifest.columns: for index, value in manifest[col].items(): - if value == ['']: - #manifest[col][index] = [""] + if value == [""]: + # manifest[col][index] = [""] manifest.loc[index, col] = [""] return manifest From 9435ab6fd87f6e705f4f509e14c184ec4b43d11f Mon Sep 17 00:00:00 2001 From: Mialy DeFelice <85905780+mialy-defelice@users.noreply.github.com> Date: Thu, 2 May 2024 22:20:36 -0700 Subject: [PATCH 09/21] update typing, update docstrings --- schematic/utils/validate_utils.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/schematic/utils/validate_utils.py b/schematic/utils/validate_utils.py index 76ddd97ba..1e1ec3577 100644 --- a/schematic/utils/validate_utils.py +++ b/schematic/utils/validate_utils.py @@ -54,13 +54,17 @@ def comma_separated_list_regex() -> Pattern[str]: return csv_list_regex -def convert_nan_entries_to_empty_strings(manifest: pd.Series) -> pd.Series: +def convert_nan_entries_to_empty_strings(manifest:pd.core.frame.DataFrame) -> pd.core.frame.DataFrame: """ Nans need to be converted to empty strings for JSON Schema Validation. This helper converts an a list with a single '' string or a single np.nan to empty strings. These types of expected NANs come from different stages of conversion during import and validation. + Args: + manifest: pd.core.frame.DataFrame, manifest prior to removing nans and replacing with empty strings. + Returns: + manifest: pd.core.frame.DataFrame, manifest post removing nans and replacing with empty strings. """ # Replace nans with empty strings so jsonschema, address replace type infering depreciation. with pd.option_context("future.no_silent_downcasting", True): From 77a496325207030acccbe1529a9606893857d854 Mon Sep 17 00:00:00 2001 From: Mialy DeFelice <85905780+mialy-defelice@users.noreply.github.com> Date: Thu, 2 May 2024 22:22:19 -0700 Subject: [PATCH 10/21] lint validate_utils --- schematic/utils/validate_utils.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/schematic/utils/validate_utils.py b/schematic/utils/validate_utils.py index 1e1ec3577..e0226dd3a 100644 --- a/schematic/utils/validate_utils.py +++ b/schematic/utils/validate_utils.py @@ -54,7 +54,9 @@ def comma_separated_list_regex() -> Pattern[str]: return csv_list_regex -def convert_nan_entries_to_empty_strings(manifest:pd.core.frame.DataFrame) -> pd.core.frame.DataFrame: +def convert_nan_entries_to_empty_strings( + manifest: pd.core.frame.DataFrame, +) -> pd.core.frame.DataFrame: """ Nans need to be converted to empty strings for JSON Schema Validation. This helper converts an a list with a single '' string or a single np.nan to empty strings. @@ -62,9 +64,11 @@ def convert_nan_entries_to_empty_strings(manifest:pd.core.frame.DataFrame) -> pd and validation. Args: - manifest: pd.core.frame.DataFrame, manifest prior to removing nans and replacing with empty strings. + manifest: pd.core.frame.DataFrame, manifest prior to removing nans and + replacing with empty strings. Returns: - manifest: pd.core.frame.DataFrame, manifest post removing nans and replacing with empty strings. + manifest: pd.core.frame.DataFrame, manifest post removing nans and + replacing with empty strings. """ # Replace nans with empty strings so jsonschema, address replace type infering depreciation. with pd.option_context("future.no_silent_downcasting", True): From e541e62fb85e716d58b8b06288044e490b66e2b7 Mon Sep 17 00:00:00 2001 From: Mialy DeFelice <85905780+mialy-defelice@users.noreply.github.com> Date: Fri, 3 May 2024 09:13:48 -0700 Subject: [PATCH 11/21] add type ignore for mypy error --- schematic/utils/validate_utils.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/schematic/utils/validate_utils.py b/schematic/utils/validate_utils.py index e0226dd3a..53633070e 100644 --- a/schematic/utils/validate_utils.py +++ b/schematic/utils/validate_utils.py @@ -72,12 +72,11 @@ def convert_nan_entries_to_empty_strings( """ # Replace nans with empty strings so jsonschema, address replace type infering depreciation. with pd.option_context("future.no_silent_downcasting", True): - manifest = manifest.replace({np.nan: ""}).infer_objects(copy=False) + manifest = manifest.replace({np.nan: ""}).infer_objects(copy=False) # type: ignore for col in manifest.columns: for index, value in manifest[col].items(): if value == [""]: - # manifest[col][index] = [""] manifest.loc[index, col] = [""] return manifest From f189fa74a8ef12bc548dc936304676d56a549621 Mon Sep 17 00:00:00 2001 From: Mialy DeFelice <85905780+mialy-defelice@users.noreply.github.com> Date: Fri, 3 May 2024 09:17:37 -0700 Subject: [PATCH 12/21] run black --- schematic/utils/validate_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/schematic/utils/validate_utils.py b/schematic/utils/validate_utils.py index 53633070e..0f7f3f678 100644 --- a/schematic/utils/validate_utils.py +++ b/schematic/utils/validate_utils.py @@ -72,7 +72,7 @@ def convert_nan_entries_to_empty_strings( """ # Replace nans with empty strings so jsonschema, address replace type infering depreciation. with pd.option_context("future.no_silent_downcasting", True): - manifest = manifest.replace({np.nan: ""}).infer_objects(copy=False) # type: ignore + manifest = manifest.replace({np.nan: ""}).infer_objects(copy=False) # type: ignore for col in manifest.columns: for index, value in manifest[col].items(): From 661d5f4c52ca825861d54838681d345ce49f86cf Mon Sep 17 00:00:00 2001 From: Mialy DeFelice <85905780+mialy-defelice@users.noreply.github.com> Date: Fri, 3 May 2024 09:34:08 -0700 Subject: [PATCH 13/21] add type ignore to validate_utils --- schematic/utils/validate_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/schematic/utils/validate_utils.py b/schematic/utils/validate_utils.py index 0f7f3f678..cedee6e6d 100644 --- a/schematic/utils/validate_utils.py +++ b/schematic/utils/validate_utils.py @@ -77,7 +77,7 @@ def convert_nan_entries_to_empty_strings( for col in manifest.columns: for index, value in manifest[col].items(): if value == [""]: - manifest.loc[index, col] = [""] + manifest.loc[index, col] = [""] # type: ignore return manifest From d062df4630d9873c1c7fa60d21a22083ca5083be Mon Sep 17 00:00:00 2001 From: Mialy DeFelice <85905780+mialy-defelice@users.noreply.github.com> Date: Fri, 3 May 2024 11:00:51 -0700 Subject: [PATCH 14/21] Fix test to work with newly updated data model --- tests/test_schemas.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/tests/test_schemas.py b/tests/test_schemas.py index 89fe54af8..85aa4cabe 100644 --- a/tests/test_schemas.py +++ b/tests/test_schemas.py @@ -438,7 +438,7 @@ def test_generate_data_model_graph(self, helpers, data_model, data_model_labels) ) assert expected_valid_values == [ k - for k, v in graph["CheckList"].items() + for k, v in graph["CheckListEnum"].items() for vk, vv in v.items() if vk == "rangeValue" ] @@ -447,9 +447,9 @@ def test_generate_data_model_graph(self, helpers, data_model, data_model_labels) # Check that all relationships recorded between 'CheckList' and 'Ab' are present assert ( - "rangeValue" and "parentOf" in graph["CheckList"][expected_valid_values[0]] + "rangeValue" and "parentOf" in graph["CheckListEnum"][expected_valid_values[0]] ) - assert "requiresDependency" not in graph["CheckList"][expected_valid_values[0]] + assert "requiresDependency" not in graph["CheckListEnum"][expected_valid_values[0]] # Check nodes: assert "Patient" in graph.nodes @@ -457,9 +457,10 @@ def test_generate_data_model_graph(self, helpers, data_model, data_model_labels) # Check weights assert graph["Sex"]["Female"]["rangeValue"]["weight"] == 0 + assert ( graph["MockComponent"]["CheckRegexFormat"]["requiresDependency"]["weight"] - == 4 + == 11 ) # Check Edge directions From ef385459fc771e935e50cb0b175e5b548279e0ea Mon Sep 17 00:00:00 2001 From: Mialy DeFelice <85905780+mialy-defelice@users.noreply.github.com> Date: Fri, 3 May 2024 15:16:09 -0700 Subject: [PATCH 15/21] add extra line to invalid manifest csv --- tests/data/mock_manifests/Invalid_Test_Manifest.csv | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/data/mock_manifests/Invalid_Test_Manifest.csv b/tests/data/mock_manifests/Invalid_Test_Manifest.csv index 258740064..a685acca7 100644 --- a/tests/data/mock_manifests/Invalid_Test_Manifest.csv +++ b/tests/data/mock_manifests/Invalid_Test_Manifest.csv @@ -2,3 +2,4 @@ Component,Check List,Check List Enum,Check List Like,Check List Like Enum,Check MockComponent,"ab,cd","ab,cd",ab,ab,"ab,cd","ab,cd","a,c,f","a,c,f","a,c,f","ab,cd","ab,cd,ef",a,a,5.4,6,99.65,7,valid,https://www.google.com/,1738,1738,8085,98085,1911,334,,6549,str1,70,32-984,7 MockComponent,9,a,c,c,ab cd,ab,a c f,a c f,a c f,invalid list values,ab cd ef,q,m,0,c,99,5.63,94,http://googlef.com/,7163,51100,9965,71738,123,717,,32851,str1,30,notADate,9.5 MockComponent,ab,ab,"cd, ab","cd, ab",ab,"a, b",a,a,a,"ab,cd","ab,cd,ef",b,b,683902,6.5,62.3,2,valid,https://github.com/Sage-Bionetworks/schematic,8085,8085,1738,210065,1427,123,,6550,str1,90,84-43-094,Not Applicable +,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,, \ No newline at end of file From 41782de7b60f880163e8bb725b5c773c641aaf49 Mon Sep 17 00:00:00 2001 From: Mialy DeFelice <85905780+mialy-defelice@users.noreply.github.com> Date: Mon, 6 May 2024 16:10:38 -0700 Subject: [PATCH 16/21] add docstring about list robustness' --- schematic/models/validate_attribute.py | 1 + 1 file changed, 1 insertion(+) diff --git a/schematic/models/validate_attribute.py b/schematic/models/validate_attribute.py index de5977fe8..6ed613bae 100644 --- a/schematic/models/validate_attribute.py +++ b/schematic/models/validate_attribute.py @@ -834,6 +834,7 @@ def list_validation( csv_re = comma_separated_list_regex() + # Check if lists -must- be a list, or can be a single value. list_robustness = get_list_robustness(val_rule=val_rule) if list_robustness == "like": From fdc68a49de4a2480342f55b812db2a181f65d86f Mon Sep 17 00:00:00 2001 From: Mialy DeFelice <85905780+mialy-defelice@users.noreply.github.com> Date: Mon, 6 May 2024 16:11:26 -0700 Subject: [PATCH 17/21] add docstrings and update typing --- schematic/models/validate_manifest.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/schematic/models/validate_manifest.py b/schematic/models/validate_manifest.py index 991dcfde3..686842d20 100644 --- a/schematic/models/validate_manifest.py +++ b/schematic/models/validate_manifest.py @@ -106,9 +106,9 @@ def validate_manifest_rules( manifest: pd.core.frame.DataFrame, dmge: DataModelGraphExplorer, restrict_rules: bool, - project_scope: List, + project_scope: list[str], access_token: Optional[str] = None, - ) -> (pd.core.frame.DataFrame, List[List[str]]): + ) -> (pd.core.frame.DataFrame, list[list[str]]): """ Purpose: Take validation rules set for a particular attribute @@ -349,6 +349,7 @@ def validate_all( project_scope: List, access_token: str, ): + # Run Validation Rules vm = ValidateManifest(errors, manifest, manifestPath, dmge, jsonSchema) manifest, vmr_errors, vmr_warnings = vm.validate_manifest_rules( manifest, dmge, restrict_rules, project_scope, access_token @@ -359,6 +360,7 @@ def validate_all( if vmr_warnings: warnings.extend(vmr_warnings) + # Run JSON Schema Validation vmv_errors, vmv_warnings = vm.validate_manifest_values(manifest, jsonSchema, dmge) if vmv_errors: errors.extend(vmv_errors) From 5056de1010e4891129a1e5cfa30de15a025babc4 Mon Sep 17 00:00:00 2001 From: Mialy DeFelice <85905780+mialy-defelice@users.noreply.github.com> Date: Mon, 6 May 2024 16:14:44 -0700 Subject: [PATCH 18/21] rewrite get_list_robutness to be clearer and add doc strings to parse_str_series_to_list --- schematic/utils/validate_utils.py | 27 +++++++++++++++++++++++---- 1 file changed, 23 insertions(+), 4 deletions(-) diff --git a/schematic/utils/validate_utils.py b/schematic/utils/validate_utils.py index cedee6e6d..5f50dfb02 100644 --- a/schematic/utils/validate_utils.py +++ b/schematic/utils/validate_utils.py @@ -99,23 +99,42 @@ def rule_in_rule_list(rule: str, rule_list: list[str]) -> Optional[re.Match[str] def get_list_robustness(val_rule: str) -> str: """Helper function to extract list robustness from the validation rule. + List robustness defines if the input -must- be a list (several values + or a single value with a trailing comma), + or if a user is allowed to submit a single value. + List rules default to `strict` if not defined to be `like` Args: val_rule: str, validation rule string. Returns: list_robutness: str, list robustness extracted from validation rule. """ + list_robustness_options = ["like", "strict"] + list_robustness = None + default_robustness = list_robustness_options[1] + + # Get the parts of a single rule, list is assumed to be in the first position, based on + # requirements that can be found in documentation. rule_parts = val_rule.lower().split(" ") + if len(rule_parts) > 1: - list_robustness = rule_parts[1] - else: - list_robustness = "strict" + # Check if list_robustness is defined in the rule, if not give them the default. + list_robustness_list = [ + part for part in rule_parts if part in list_robustness_options + ] + if list_robustness_list: + list_robustness = list_robustness_list[0] + + if not list_robustness: + # If no robustness has been defined by the user, set to the default. + list_robustness = default_robustness return list_robustness def parse_str_series_to_list(col: pd.Series, replace_null: bool = True) -> pd.Series: """ Parse a pandas series of comma delimited strings - into a series with values that are lists of strings + into a series with values that are lists of strings. If replace_null, fill null values + with nan. If the type of the value needs to be an array, fill with empty list. ex. Input: 'a,b,c' Output: ['a','b','c'] From a89889324090bbe8747ebf00772af86aa4c70f65 Mon Sep 17 00:00:00 2001 From: Mialy DeFelice <85905780+mialy-defelice@users.noreply.github.com> Date: Mon, 6 May 2024 16:23:21 -0700 Subject: [PATCH 19/21] add test_convert_nan_entries_to_empty_strings and stubs for additional tests --- tests/test_utils.py | 82 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 82 insertions(+) diff --git a/tests/test_utils.py b/tests/test_utils.py index 525778185..8f6ef20e5 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -19,6 +19,9 @@ from pandas.testing import assert_frame_equal from synapseclient.core.exceptions import SynapseHTTPError +from schematic.models.validate_manifest import ValidateManifest +from schematic.models.metadata import MetadataModel + from schematic.schemas.data_model_parser import DataModelParser from schematic.schemas.data_model_graph import DataModelGraph, DataModelGraphExplorer from schematic.schemas.data_model_jsonld import ( @@ -43,6 +46,7 @@ ) from schematic.utils import cli_utils, df_utils, general, io_utils, validate_utils +from schematic.utils.df_utils import load_df from schematic.utils.general import ( calculate_datetime, check_synapse_cache_size, @@ -189,6 +193,14 @@ (1073741825, 1073741824, 1181116006.4), ] +def get_metadataModel(helpers, model_name:str): + metadataModel = MetadataModel( + inputMModelLocation=helpers.get_data_path(model_name), + inputMModelLocationType="local", + data_model_labels="class_label", + ) + return metadataModel + # create temporary files with various size based on request @pytest.fixture() @@ -1060,6 +1072,76 @@ def test_validate_property_schema(self, helpers): assert error is None + @pytest.mark.parametrize( + ("manifest", "model", "root_node"), + [("mock_manifests/Patient_test_no_entry_for_cond_required_column.manifest.csv", + "example.model.csv", "Patient"), + ("mock_manifests/Valid_Test_Manifest_with_nones.csv", + "example_test_nones.model.csv", "MockComponent")] + ) + def test_convert_nan_entries_to_empty_strings( + self, helpers, manifest, model, root_node): + # Get manifest and data model path + manifest_path = helpers.get_data_path(manifest) + model_path = helpers.get_data_path(model) + + ## Gather parmeters needed to run validate_manifest_rules + errors = [] + load_args = { + "dtype": "string", + } + + dmge = helpers.get_data_model_graph_explorer(path=model) + + self.data_model_js = DataModelJSONSchema( + jsonld_path=model_path, graph=dmge.graph + ) + json_schema = self.data_model_js.get_json_validation_schema( + root_node, root_node + "_validation" + ) + + manifest = load_df( + manifest_path, + preserve_raw_input=False, + allow_na_values=True, + **load_args,) + + metadataModel = get_metadataModel(helpers, model) + + # Instantiate Validate manifest, and run manifest validation + # In this step the manifest is modified while running rule + # validation so need to do this step to get the updated manfest. + vm = ValidateManifest( + errors, manifest, manifest_path, dmge, json_schema) + manifest, vmr_errors, vmr_warnings = vm.validate_manifest_rules( + manifest, dmge, restrict_rules=False, project_scope=["syn54126707"], + ) + + # Run convert nan function + output = validate_utils.convert_nan_entries_to_empty_strings( + manifest=manifest + ) + + # Compare post rule validation manifest with output manifest looking + # for expected nan to empty string conversion + if root_node == 'Patient': + assert manifest['Family History'][0] == [''] + assert output['Family History'][0] == [''] + elif root_node == 'MockComponent': + assert manifest['Check List'][2] == [''] + assert manifest['Check List Like Enum'][2] == [] + assert type(manifest['Check NA'][2]) == type(pd.NA) + + assert output['Check List'][2] == [''] + assert output['Check List Like Enum'][2] == [] + + + def test_get_list_robustness(self, helpers): + return + + def parse_str_series_to_list(self, helpers): + return + @pytest.mark.parametrize( "rule", [ From 7534d46038f1fbb3dc02da2affda42b9e191d6ef Mon Sep 17 00:00:00 2001 From: linglp Date: Thu, 9 May 2024 10:13:13 -0400 Subject: [PATCH 20/21] remove release md --- RELEASE.md | 51 --------------------------------------------------- 1 file changed, 51 deletions(-) delete mode 100644 RELEASE.md diff --git a/RELEASE.md b/RELEASE.md deleted file mode 100644 index 2fb4fb50a..000000000 --- a/RELEASE.md +++ /dev/null @@ -1,51 +0,0 @@ -# Release Process - -Once the code has been merged into the `develop` branch on this repo, there are two processes that need to be completed to ensure a _release_ is complete. - -- You should create a GitHub [tag](https://git-scm.com/book/en/v2/Git-Basics-Tagging), with the appropriate version number. Typically, from `v21.06` onwards all tags are created following the Linux Ubuntu versioning convention which is the `YY.MM` format where `Y` is the year and `M` is the month of that year when that release was created. -- You should push the package to [PyPI](https://pypi.org/). Schematic is on PyPI as [schematicpy](https://pypi.org/project/schematicpy/). You can go through the following two sections for that. - -## Steps -- Step 1: Open a pull request to merge the `main` branch to the `develop` branch: - - Click on the "Pull Request" tab on Github - - Click on the green button "New pull request" - - Select `main` as "base" and `develop` as "compare" - - Resolve conflicts - - Link all PRs and/or issues that are included in the release (example [here](https://github.com/Sage-Bionetworks/data_curator/pull/357)) - -- Step 2: Create a tag -`git tag -m ''` - -- Step 3: Push the tag to main branch (this step assumes that you have checked out the main branch locally) -`git push origin ` - -This should trigger the PYPI release workflow and release a new version of schematic to PYPI. You could check by cliking on the GitHub action log and login to your PYPI account (and select project `schematicpy`. Please note that you have to obtain access to `schematicpy` to be able to see it.) - ->Note: if you make some mistakes and would like to delete a tag, try the following commands: `git push --delete origin ` for deleting a tag remotely and `git tag -d ` for deleting a tag locally. - - -## Release to Test PyPI _(optional)_ - -The purpose of this section is to verify that the package looks and works as intended, by viewing it on [Test PyPI](https://test.pypi.org/) and installing the test version in a separate virtual environment. - -``` -poetry build # build the package -poetry config repositories.testpypi https://test.pypi.org/legacy/ # add Test PyPI as an alternate package repository -poetry publish -r testpypi # publish the package to Test PyPI -``` - -Installing: - -``` -pip install --index-url https://test.pypi.org/simple/ -``` - -## Release to PyPI _(mandatory)_ - -If the package looks great on Test PyPI and works well, the next step is to publish the package to PyPI: - -``` -poetry publish # publish the package to PyPI -``` - -> You'll need to [register](https://pypi.org/account/register/) for a PyPI account before uploading packages to the package index. Similarly for [Test PyPI](https://test.pypi.org/account/register/) as well. \ No newline at end of file From 9df079458e9728c3729c3102c29c7af53b64dfe1 Mon Sep 17 00:00:00 2001 From: Mialy DeFelice <85905780+mialy-defelice@users.noreply.github.com> Date: Thu, 9 May 2024 11:03:14 -0700 Subject: [PATCH 21/21] fix failing test to conform to data model updates --- tests/test_api.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_api.py b/tests/test_api.py index f8ff09d4a..a070297ab 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -383,7 +383,7 @@ def test_get_node_validation_rules(test, client, data_model_jsonld): ) response_dta = json.loads(response.data) assert response.status_code == 200 - assert "list strict" in response_dta + assert "list" in response_dta assert "regex match [a-f]" in response_dta def test_get_nodes_display_names(test, client, data_model_jsonld):