4dn-dcic · dmichaels-harvard · Aug 17, 2024 · Aug 28, 2024 · Aug 28, 2024
diff --git a/CHANGELOG.rst b/CHANGELOG.rst
@@ -6,6 +6,19 @@ snovault
 Change Log
 ----------
 
+11.22.0
+=======
+
+* Changes to elasticsearch/create_mapping.py to support nested quality-metrics/qc_values
+  for smaht-portal; apply fall-through logic in create_mapping.schema_mapping if the type does
+  not match any other itemized type there; e.g. for ['boolean', 'integer', 'number', 'string']
+  for the quality-metrics/qc_values/value type.
+* Change to schema_utils.py/get_identifying_and_required_properties to handle (possibly)
+  required properties within an allOf within an anyOf, specifically for the Analyte
+  properties rna_integrity_number and rna_integrity_number_instrument, which are
+  required only if molecule is RNA.
+
+
 11.21.1
 =======
 

diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "dcicsnovault"
-version = "11.21.1"
+version = "11.21.1.1b2"  # TODO: To become 11.22.0
 description = "Storage support for 4DN Data Portals."
 authors = ["4DN-DCIC Team <[email protected]>"]
 license = "MIT"

diff --git a/snovault/elasticsearch/create_mapping.py b/snovault/elasticsearch/create_mapping.py
@@ -76,7 +76,7 @@ def determine_if_is_date_field(field, schema):
     return is_date_field
 
 
-def schema_mapping(field, schema, top_level=False, from_array=False):
+def schema_mapping(field, schema, top_level=False, from_array=False, paths_for_logging=[]):
     """
     Create the mapping for a given schema. Can handle using all fields for
     objects (*), but can handle specific fields using the field parameter.
@@ -112,12 +112,13 @@ def schema_mapping(field, schema, top_level=False, from_array=False):
 
     # Elasticsearch handles multiple values for a field
     if type_ == 'array' and schema['items']:
-        return schema_mapping(field, schema['items'], from_array=True)
+        return schema_mapping(field, schema['items'], from_array=True, paths_for_logging=[*paths_for_logging, "[]"])
 
     if type_ == 'object':
         properties = {}
+        paths_for_logging = [*paths_for_logging, schema.get("title", "").replace(" ", "")]
         for k, v in schema.get('properties', {}).items():
-            mapping = schema_mapping(k, v)
+            mapping = schema_mapping(k, v, paths_for_logging=[*paths_for_logging, k])
             if mapping is not None:
                 if field == '*' or k == field:
                     properties[k] = mapping
@@ -150,25 +151,26 @@ def schema_mapping(field, schema, top_level=False, from_array=False):
             }
         }
 
-    if type_ == ["number", "string"]:
-        return {
-            'type': 'text',
-            'fields': {
-                'value': {
-                    'type': 'float',
-                    'ignore_malformed': True,
-                },
-                'raw': {
-                    'type': 'keyword',
-                    'ignore_above': KW_IGNORE_ABOVE
-                },
-                'lower_case_sort': {
-                    'type': 'keyword',
-                    'normalizer': 'case_insensitive',
-                    'ignore_above': KW_IGNORE_ABOVE
-                }
-            }
-        }
+#   Moved to the bottom of this function to serve as the default (fall-through) case ...
+#   if type_ == ["number", "string"]:
+#       return {
+#           'type': 'text',
+#           'fields': {
+#               'value': {
+#                   'type': 'float',
+#                   'ignore_malformed': True,
+#               },
+#               'raw': {
+#                   'type': 'keyword',
+#                   'ignore_above': KW_IGNORE_ABOVE
+#               },
+#               'lower_case_sort': {
+#                   'type': 'keyword',
+#                   'normalizer': 'case_insensitive',
+#                   'ignore_above': KW_IGNORE_ABOVE
+#               }
+#           }
+#       }
 
     if type_ == 'boolean':
         return {
@@ -241,6 +243,39 @@ def schema_mapping(field, schema, top_level=False, from_array=False):
             }
         }
 
+    # Fall-through case. NOTE: default_mapping is hard-coded to False, so the default mapping below is never returned.
+    default_mapping = False
+
+    # Warnings for unmapped items; guard against duplicate warning by squirreling away
+    # the paths_for_logging in a hidden attribute (__unmapped_warnings) of this function.
+    if len(paths_for_logging) > 1:
+        if not hasattr(schema_mapping, "__unmapped_warnings"):
+            setattr(schema_mapping, "__unmapped_warnings", [])
+        paths_for_logging = ".".join([path for path in paths_for_logging if path])
+        if paths_for_logging not in schema_mapping.__unmapped_warnings:
+            schema_mapping.__unmapped_warnings.append(paths_for_logging)
+            log.warning(f"Using default mapping for field: {paths_for_logging} | type: {type_}")
+
+    if default_mapping:
+        return {
+            'type': 'text',
+            'fields': {
+                'value': {
+                    'type': 'float',
+                    'ignore_malformed': True,
+                },
+                'raw': {
+                    'type': 'keyword',
+                    'ignore_above': KW_IGNORE_ABOVE
+                },
+                'lower_case_sort': {
+                    'type': 'keyword',
+                    'normalizer': 'case_insensitive',
+                    'ignore_above': KW_IGNORE_ABOVE
+                }
+            }
+        }
+
 
 def _inject_custom_settings(*, template: dict, custom_settings: IndexSettings) -> dict:
     """ Adds our custom settings to the base template
@@ -678,7 +713,8 @@ def type_mapping(types, item_type, embed=True):
     type_info = types[item_type]
     schema = type_info.schema
     # TODO: use top_level parameter here for schema_mapping
-    mapping = schema_mapping('*', schema, from_array=False)
+    paths_for_logging = [schema.get("title", "").replace(" ", "")] if schema else []
+    mapping = schema_mapping('*', schema, from_array=False, paths_for_logging=paths_for_logging)
     if not embed:
         return mapping
 

diff --git a/snovault/schema_utils.py b/snovault/schema_utils.py
@@ -655,6 +655,16 @@ def get_all_required_properties_from_any_of(schema: dict) -> list:
         is a property name or a LIST of property names; if the "anyOf" construct looks like it is
         anything OTHER than this limited usaage, then an EXCEPTION will be raised.
         """
+        def get_possibly_required_properties_from_all_of(all_of_list: list):
+            required_properties = set()
+            if isinstance(all_of_list, list):
+                for condition in all_of_list:
+                    if "required" in condition:
+                        required_properties.update(condition["required"])
+                    elif "not" in condition and "required" in condition["not"]:
+                        required_properties.difference_update(condition["not"]["required"])
+            return list(required_properties)
+
         def raise_unsupported_usage_exception():
             raise Exception("Unsupported use of anyOf in schema.")
         required_properties = set()
@@ -677,6 +687,8 @@ def raise_unsupported_usage_exception():
                     required_properties.add(any_of_value)
                 else:
                     raise_unsupported_usage_exception()
+            elif "allOf" in any_of:
+                required_properties.update(get_possibly_required_properties_from_all_of(any_of["allOf"]))
         return list(required_properties)
 
     required_properties = set()

diff --git a/snovault/tests/test_schema_utils.py b/snovault/tests/test_schema_utils.py
@@ -496,3 +496,179 @@ def test_get_identifying_and_required_properties():
         identifying_properties, required_properties = get_identifying_and_required_properties(schema)
         assert set(identifying_properties) == {}
         assert set(required_properties) == {"some_required_property_a", "some_required_property_b", "either_require_this_property_a", "or_require_this_property_a"}
+
+
+def test_get_identifying_and_required_properties_20240828():
+
+    from snovault.schema_utils import get_identifying_and_required_properties
+
+    # Handle (possibly) required properties within an allOf within an anyOf, specifically
+    # for the Analyte properties rna_integrity_number and rna_integrity_number_instrument,
+    # which are required only if molecule is RNA. This example based on Analyte.
+    schema = {
+        "required": [
+            "molecule",
+            "molecule_detail",
+            "samples",
+            "submission_centers",
+            "submitted_id"
+        ],
+        "identifyingProperties": [
+            "accession",
+            "submitted_id",
+            "uuid"
+        ],
+        "properties": {
+            "uuid": {
+                "title": "UUID",
+                "type": "string",
+                "format": "uuid",
+                "exclude_from": [
+                    "FFedit-create"
+                ],
+                "serverDefault": "uuid4",
+                "permission": "restricted_fields",
+                "requestMethod": "POST"
+            }
+        },
+        "anyOf": [
+            {
+                "properties": {
+                    "molecule": {
+                        "not": {
+                            "contains": {
+                                "const": "DNA"
+                            }
+                        }
+                    }
+                },
+                "allOf": [
+                    {
+                        "not": {
+                            "required": [
+                                "genomic_quality_number"
+                            ]
+                        }
+                    },
+                    {
+                        "not": {
+                            "required": [
+                                "genomic_quality_number_instrument"
+                            ]
+                        }
+                    },
+                    {
+                        "not": {
+                            "required": [
+                                "genomic_quality_size_threshold"
+                            ]
+                        }
+                    },
+                    {
+                        "not": {
+                            "required": [
+                                "dna_integrity_number"
+                            ]
+                        }
+                    },
+                    {
+                        "not": {
+                            "required": [
+                                "dna_integrity_number_instrument"
+                            ]
+                        }
+                    },
+                    {
+                        "not": {
+                            "required": [
+                                "dna_quality_number"
+                            ]
+                        }
+                    },
+                    {
+                        "not": {
+                            "required": [
+                                "dna_quality_number_instrument"
+                            ]
+                        }
+                    },
+                    {
+                        "not": {
+                            "required": [
+                                "dna_quality_size_threshold"
+                            ]
+                        }
+                    },
+                    {
+                        "required": [
+                            "rna_integrity_number"
+                        ]
+                    },
+                    {
+                        "required": [
+                            "rna_integrity_number_instrument"
+                        ]
+                    }
+                ]
+            },
+            {
+                "properties": {
+                    "molecule": {
+                        "not": {
+                            "contains": {
+                                "const": "RNA"
+                            }
+                        }
+                    }
+                },
+                "allOf": [
+                    {
+                        "not": {
+                            "required": [
+                                "rna_integrity_number"
+                            ]
+                        }
+                    },
+                    {
+                        "not": {
+                            "required": [
+                                "rna_integrity_number_instrument"
+                            ]
+                        }
+                    },
+                    {
+                        "not": {
+                            "required": [
+                                "ribosomal_rna_ratio"
+                            ]
+                        }
+                    }
+                ]
+            },
+            {
+                "properties": {
+                    "molecule": {
+                        "contains": {
+                            "const": "RNA"
+                        }
+                    }
+                },
+                "allOf": [
+                    {
+                        "required": [
+                            "rna_integrity_number"
+                        ]
+                    },
+                    {
+                        "required": [
+                            "rna_integrity_number_instrument"
+                        ]
+                    }
+                ]
+            }
+        ]
+    }
+    identifying_properties, required_properties = get_identifying_and_required_properties(schema)
+    assert identifying_properties == ["accession", "submitted_id", "uuid"]
+    assert sorted(required_properties) == sorted(["molecule_detail", "molecule", "samples", "rna_integrity_number",
+                                                  "rna_integrity_number_instrument", "submitted_id", "submission_centers"])