diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 02cd20808..63db741c0 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -6,6 +6,19 @@ snovault Change Log ---------- +11.22.0 +======= + +* Changes to elasticsearch/create_mapping.py related to supporting nested quality-metrics/qc_values + for smaht-portal; apply fall-through logic for create_mapping.schema_mapping if the type does + not match any other itemized type there; e.g. for ['boolean', 'integer', 'number', 'string'] + for the quality-metrics/qc_values/value type. +* Change to schema_utils.py/get_identifying_and_required_properties to handle (possibly) + required properties within an allOf within an anyOf, specifically for the Analyte + properties rna_integrity_number and rna_integrity_number_instrument, which are + required only if molecule is RNA. + + 11.21.1 ======= diff --git a/pyproject.toml b/pyproject.toml index d6f0051b8..354b21e19 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "dcicsnovault" -version = "11.21.1" +version = "11.21.1.1b2" # TODO: To become 11.22.0 description = "Storage support for 4DN Data Portals." authors = ["4DN-DCIC Team "] license = "MIT" diff --git a/snovault/elasticsearch/create_mapping.py b/snovault/elasticsearch/create_mapping.py index c968b3a62..2c199183c 100644 --- a/snovault/elasticsearch/create_mapping.py +++ b/snovault/elasticsearch/create_mapping.py @@ -76,7 +76,7 @@ def determine_if_is_date_field(field, schema): return is_date_field -def schema_mapping(field, schema, top_level=False, from_array=False): +def schema_mapping(field, schema, top_level=False, from_array=False, paths_for_logging=[]): """ Create the mapping for a given schema. Can handle using all fields for objects (*), but can handle specific fields using the field parameter. 
@@ -112,12 +112,13 @@ def schema_mapping(field, schema, top_level=False, from_array=False): # Elasticsearch handles multiple values for a field if type_ == 'array' and schema['items']: - return schema_mapping(field, schema['items'], from_array=True) + return schema_mapping(field, schema['items'], from_array=True, paths_for_logging=[*paths_for_logging, "[]"]) if type_ == 'object': properties = {} + paths_for_logging = [*paths_for_logging, schema.get("title", "").replace(" ", "")] for k, v in schema.get('properties', {}).items(): - mapping = schema_mapping(k, v) + mapping = schema_mapping(k, v, paths_for_logging=[*paths_for_logging, k]) if mapping is not None: if field == '*' or k == field: properties[k] = mapping @@ -150,25 +151,26 @@ def schema_mapping(field, schema, top_level=False, from_array=False): } } - if type_ == ["number", "string"]: - return { - 'type': 'text', - 'fields': { - 'value': { - 'type': 'float', - 'ignore_malformed': True, - }, - 'raw': { - 'type': 'keyword', - 'ignore_above': KW_IGNORE_ABOVE - }, - 'lower_case_sort': { - 'type': 'keyword', - 'normalizer': 'case_insensitive', - 'ignore_above': KW_IGNORE_ABOVE - } - } - } +# Move to bottom as the default ... +# if type_ == ["number", "string"]: +# return { +# 'type': 'text', +# 'fields': { +# 'value': { +# 'type': 'float', +# 'ignore_malformed': True, +# }, +# 'raw': { +# 'type': 'keyword', +# 'ignore_above': KW_IGNORE_ABOVE +# }, +# 'lower_case_sort': { +# 'type': 'keyword', +# 'normalizer': 'case_insensitive', +# 'ignore_above': KW_IGNORE_ABOVE +# } +# } +# } if type_ == 'boolean': return { @@ -241,6 +243,39 @@ def schema_mapping(field, schema, top_level=False, from_array=False): } } + # Fall thru case. + default_mapping = False + + # Warnings for unmapped items; guard against duplicate warning by squirreling away + # the paths_for_logging in a hidden attribute (__unmapped_warnings) of this function. 
+ if len(paths_for_logging) > 1: + if not hasattr(schema_mapping, "__unmapped_warnings"): + setattr(schema_mapping, "__unmapped_warnings", []) + paths_for_logging = ".".join([path for path in paths_for_logging if path]) + if paths_for_logging not in schema_mapping.__unmapped_warnings: + schema_mapping.__unmapped_warnings.append(paths_for_logging) + log.warning(f"Using default mapping for field: {paths_for_logging} | type: {type_}") + + if default_mapping: + return { + 'type': 'text', + 'fields': { + 'value': { + 'type': 'float', + 'ignore_malformed': True, + }, + 'raw': { + 'type': 'keyword', + 'ignore_above': KW_IGNORE_ABOVE + }, + 'lower_case_sort': { + 'type': 'keyword', + 'normalizer': 'case_insensitive', + 'ignore_above': KW_IGNORE_ABOVE + } + } + } + def _inject_custom_settings(*, template: dict, custom_settings: IndexSettings) -> dict: """ Adds our custom settings to the base template @@ -678,7 +713,8 @@ def type_mapping(types, item_type, embed=True): type_info = types[item_type] schema = type_info.schema # TODO: use top_level parameter here for schema_mapping - mapping = schema_mapping('*', schema, from_array=False) + paths_for_logging = [schema.get("title", "").replace(" ", "")] if schema else [] + mapping = schema_mapping('*', schema, from_array=False, paths_for_logging=paths_for_logging) if not embed: return mapping diff --git a/snovault/schema_utils.py b/snovault/schema_utils.py index ed46ce0de..9e06421d6 100644 --- a/snovault/schema_utils.py +++ b/snovault/schema_utils.py @@ -655,6 +655,16 @@ def get_all_required_properties_from_any_of(schema: dict) -> list: is a property name or a LIST of property names; if the "anyOf" construct looks like it is anything OTHER than this limited usaage, then an EXCEPTION will be raised. 
""" + def get_possibly_required_properties_from_all_of(all_of_list: list): + required_properties = set() + if isinstance(all_of_list, list): + for condition in all_of_list: + if "required" in condition: + required_properties.update(condition["required"]) + elif "not" in condition and "required" in condition["not"]: + required_properties.difference_update(condition["not"]["required"]) + return list(required_properties) + def raise_unsupported_usage_exception(): raise Exception("Unsupported use of anyOf in schema.") required_properties = set() @@ -677,6 +687,8 @@ def raise_unsupported_usage_exception(): required_properties.add(any_of_value) else: raise_unsupported_usage_exception() + elif "allOf" in any_of: + required_properties.update(get_possibly_required_properties_from_all_of(any_of["allOf"])) return list(required_properties) required_properties = set() diff --git a/snovault/tests/test_schema_utils.py b/snovault/tests/test_schema_utils.py index 6a0e5a2e7..c90e50f6c 100644 --- a/snovault/tests/test_schema_utils.py +++ b/snovault/tests/test_schema_utils.py @@ -496,3 +496,179 @@ def test_get_identifying_and_required_properties(): identifying_properties, required_properties = get_identifying_and_required_properties(schema) assert set(identifying_properties) == {} assert set(required_properties) == {"some_required_property_a", "some_required_property_b", "either_require_this_property_a", "or_require_this_property_a"} + + +def test_get_identifying_and_required_properties_20240828(): + + from snovault.schema_utils import get_identifying_and_required_properties + + # Handle (possibly) required properties within an allOf within an anyOf, specifically + # for the Analyte properties rna_integrity_number and rna_integrity_number_instrument, + # which are required only if molecule is RNA. This example based on Analyte. 
+ schema = { + "required": [ + "molecule", + "molecule_detail", + "samples", + "submission_centers", + "submitted_id" + ], + "identifyingProperties": [ + "accession", + "submitted_id", + "uuid" + ], + "properties": { + "uuid": { + "title": "UUID", + "type": "string", + "format": "uuid", + "exclude_from": [ + "FFedit-create" + ], + "serverDefault": "uuid4", + "permission": "restricted_fields", + "requestMethod": "POST" + } + }, + "anyOf": [ + { + "properties": { + "molecule": { + "not": { + "contains": { + "const": "DNA" + } + } + } + }, + "allOf": [ + { + "not": { + "required": [ + "genomic_quality_number" + ] + } + }, + { + "not": { + "required": [ + "genomic_quality_number_instrument" + ] + } + }, + { + "not": { + "required": [ + "genomic_quality_size_threshold" + ] + } + }, + { + "not": { + "required": [ + "dna_integrity_number" + ] + } + }, + { + "not": { + "required": [ + "dna_integrity_number_instrument" + ] + } + }, + { + "not": { + "required": [ + "dna_quality_number" + ] + } + }, + { + "not": { + "required": [ + "dna_quality_number_instrument" + ] + } + }, + { + "not": { + "required": [ + "dna_quality_size_threshold" + ] + } + }, + { + "required": [ + "rna_integrity_number" + ] + }, + { + "required": [ + "rna_integrity_number_instrument" + ] + } + ] + }, + { + "properties": { + "molecule": { + "not": { + "contains": { + "const": "RNA" + } + } + } + }, + "allOf": [ + { + "not": { + "required": [ + "rna_integrity_number" + ] + } + }, + { + "not": { + "required": [ + "rna_integrity_number_instrument" + ] + } + }, + { + "not": { + "required": [ + "ribosomal_rna_ratio" + ] + } + } + ] + }, + { + "properties": { + "molecule": { + "contains": { + "const": "RNA" + } + } + }, + "allOf": [ + { + "required": [ + "rna_integrity_number" + ] + }, + { + "required": [ + "rna_integrity_number_instrument" + ] + } + ] + } + ] + } + identifying_properties, required_properties = get_identifying_and_required_properties(schema) + assert identifying_properties == ["accession", 
"submitted_id", "uuid"] + assert sorted(required_properties) == sorted(["molecule_detail", "molecule", "samples", "rna_integrity_number", + "rna_integrity_number_instrument", "submitted_id", "submission_centers"])