Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Changes to elasticsearch/create_mapping.py related to nested quality-metrics/qc_values #303

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,19 @@ snovault
Change Log
----------

11.22.0
=======

* Changes to elasticsearch/create_mapping.py to support nested quality-metrics/qc_values
for smaht-portal; apply fall-through logic in create_mapping.schema_mapping if the type does
not match any other itemized type there; e.g. for ['boolean', 'integer', 'number', 'string']
for the quality-metrics/qc_values/value type.
* Change to schema_utils.py/get_identifying_and_required_properties to handle (possibly)
required properties within an allOf within an anyOf, specifically for the Analyte
properties rna_integrity_number and rna_integrity_number_instrument, which are
required only if molecule is RNA.


11.21.1
=======

Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "dcicsnovault"
version = "11.21.1"
version = "11.21.1.1b2" # TODO: To become 11.22.0
description = "Storage support for 4DN Data Portals."
authors = ["4DN-DCIC Team <[email protected]>"]
license = "MIT"
Expand Down
82 changes: 59 additions & 23 deletions snovault/elasticsearch/create_mapping.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ def determine_if_is_date_field(field, schema):
return is_date_field


def schema_mapping(field, schema, top_level=False, from_array=False):
def schema_mapping(field, schema, top_level=False, from_array=False, paths_for_logging=[]):
"""
Create the mapping for a given schema. Can handle using all fields for
objects (*), but can handle specific fields using the field parameter.
Expand Down Expand Up @@ -112,12 +112,13 @@ def schema_mapping(field, schema, top_level=False, from_array=False):

# Elasticsearch handles multiple values for a field
if type_ == 'array' and schema['items']:
return schema_mapping(field, schema['items'], from_array=True)
return schema_mapping(field, schema['items'], from_array=True, paths_for_logging=[*paths_for_logging, "[]"])

if type_ == 'object':
properties = {}
paths_for_logging = [*paths_for_logging, schema.get("title", "").replace(" ", "")]
for k, v in schema.get('properties', {}).items():
mapping = schema_mapping(k, v)
mapping = schema_mapping(k, v, paths_for_logging=[*paths_for_logging, k])
if mapping is not None:
if field == '*' or k == field:
properties[k] = mapping
Expand Down Expand Up @@ -150,25 +151,26 @@ def schema_mapping(field, schema, top_level=False, from_array=False):
}
}

if type_ == ["number", "string"]:
return {
'type': 'text',
'fields': {
'value': {
'type': 'float',
'ignore_malformed': True,
},
'raw': {
'type': 'keyword',
'ignore_above': KW_IGNORE_ABOVE
},
'lower_case_sort': {
'type': 'keyword',
'normalizer': 'case_insensitive',
'ignore_above': KW_IGNORE_ABOVE
}
}
}
# Move to bottom as the default ...
# if type_ == ["number", "string"]:
# return {
# 'type': 'text',
# 'fields': {
# 'value': {
# 'type': 'float',
# 'ignore_malformed': True,
# },
# 'raw': {
# 'type': 'keyword',
# 'ignore_above': KW_IGNORE_ABOVE
# },
# 'lower_case_sort': {
# 'type': 'keyword',
# 'normalizer': 'case_insensitive',
# 'ignore_above': KW_IGNORE_ABOVE
# }
# }
# }

if type_ == 'boolean':
return {
Expand Down Expand Up @@ -241,6 +243,39 @@ def schema_mapping(field, schema, top_level=False, from_array=False):
}
}

# Fall thru case.
default_mapping = False

# Warnings for unmapped items; guard against duplicate warning by squirreling away
# the paths_for_logging in a hidden attribute (__unmapped_warnings) of this function.
if len(paths_for_logging) > 1:
if not hasattr(schema_mapping, "__unmapped_warnings"):
setattr(schema_mapping, "__unmapped_warnings", [])
paths_for_logging = ".".join([path for path in paths_for_logging if path])
if paths_for_logging not in schema_mapping.__unmapped_warnings:
schema_mapping.__unmapped_warnings.append(paths_for_logging)
log.warning(f"Using default mapping for field: {paths_for_logging} | type: {type_}")

if default_mapping:
return {
'type': 'text',
'fields': {
'value': {
'type': 'float',
'ignore_malformed': True,
},
'raw': {
'type': 'keyword',
'ignore_above': KW_IGNORE_ABOVE
},
'lower_case_sort': {
'type': 'keyword',
'normalizer': 'case_insensitive',
'ignore_above': KW_IGNORE_ABOVE
}
}
}


def _inject_custom_settings(*, template: dict, custom_settings: IndexSettings) -> dict:
""" Adds our custom settings to the base template
Expand Down Expand Up @@ -678,7 +713,8 @@ def type_mapping(types, item_type, embed=True):
type_info = types[item_type]
schema = type_info.schema
# TODO: use top_level parameter here for schema_mapping
mapping = schema_mapping('*', schema, from_array=False)
paths_for_logging = [schema.get("title", "").replace(" ", "")] if schema else []
mapping = schema_mapping('*', schema, from_array=False, paths_for_logging=paths_for_logging)
if not embed:
return mapping

Expand Down
12 changes: 12 additions & 0 deletions snovault/schema_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -655,6 +655,16 @@ def get_all_required_properties_from_any_of(schema: dict) -> list:
is a property name or a LIST of property names; if the "anyOf" construct looks like it is
anything OTHER than this limited usage, then an EXCEPTION will be raised.
"""
def get_possibly_required_properties_from_all_of(all_of_list: list):
    """
    Collect the property names that are (possibly) required by an "allOf" list.

    Each entry of the list is inspected in order:
    - an entry with a "required" key adds its property name(s) to the result;
    - otherwise, an entry of the form {"not": {"required": ...}} removes its
      property name(s) from what has been collected so far.

    A "required" value may be a list of names or a single name given as a string.
    Entries that are not dictionaries (e.g. the JSON-schema boolean schemas
    true/false) are ignored rather than raising, as is a "not" value which is
    not a dictionary.

    :param all_of_list: The value of an "allOf" key; anything other than a
        list yields an empty result.
    :return: A list (in no particular order) of the collected property names.
    """
    def as_names(value):
        # A lone string is one property name, NOT an iterable of characters.
        if isinstance(value, str):
            return [value]
        if isinstance(value, (list, tuple, set)):
            return list(value)
        return []

    if not isinstance(all_of_list, list):
        return []
    required_properties = set()
    for condition in all_of_list:
        if not isinstance(condition, dict):
            # Boolean schemas and other non-object entries name no properties.
            continue
        if "required" in condition:
            required_properties.update(as_names(condition["required"]))
            continue
        negated = condition.get("not")
        if isinstance(negated, dict) and "required" in negated:
            required_properties.difference_update(as_names(negated["required"]))
    return list(required_properties)

def raise_unsupported_usage_exception():
raise Exception("Unsupported use of anyOf in schema.")
required_properties = set()
Expand All @@ -677,6 +687,8 @@ def raise_unsupported_usage_exception():
required_properties.add(any_of_value)
else:
raise_unsupported_usage_exception()
elif "allOf" in any_of:
required_properties.update(get_possibly_required_properties_from_all_of(any_of["allOf"]))
return list(required_properties)

required_properties = set()
Expand Down
176 changes: 176 additions & 0 deletions snovault/tests/test_schema_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -496,3 +496,179 @@ def test_get_identifying_and_required_properties():
identifying_properties, required_properties = get_identifying_and_required_properties(schema)
assert set(identifying_properties) == {}
assert set(required_properties) == {"some_required_property_a", "some_required_property_b", "either_require_this_property_a", "or_require_this_property_a"}


def test_get_identifying_and_required_properties_20240828():
    """
    Exercise get_identifying_and_required_properties with (possibly) required
    properties that sit inside an allOf nested within an anyOf.

    The schema is modelled on Analyte, where rna_integrity_number and
    rna_integrity_number_instrument are required only if molecule is RNA.
    """
    from snovault.schema_utils import get_identifying_and_required_properties

    def must_have(name):
        # allOf clause requiring the named property.
        return {"required": [name]}

    def must_not_have(name):
        # allOf clause forbidding the named property.
        return {"not": {"required": [name]}}

    dna_only_properties = [
        "genomic_quality_number",
        "genomic_quality_number_instrument",
        "genomic_quality_size_threshold",
        "dna_integrity_number",
        "dna_integrity_number_instrument",
        "dna_quality_number",
        "dna_quality_number_instrument",
        "dna_quality_size_threshold",
    ]
    rna_integrity_properties = [
        "rna_integrity_number",
        "rna_integrity_number_instrument",
    ]

    # Branch 1: molecule does not contain DNA.
    molecule_without_dna = {
        "properties": {
            "molecule": {"not": {"contains": {"const": "DNA"}}},
        },
        "allOf": (
            [must_not_have(name) for name in dna_only_properties]
            + [must_have(name) for name in rna_integrity_properties]
        ),
    }
    # Branch 2: molecule does not contain RNA.
    molecule_without_rna = {
        "properties": {
            "molecule": {"not": {"contains": {"const": "RNA"}}},
        },
        "allOf": [
            must_not_have(name)
            for name in [*rna_integrity_properties, "ribosomal_rna_ratio"]
        ],
    }
    # Branch 3: molecule contains RNA.
    molecule_with_rna = {
        "properties": {
            "molecule": {"contains": {"const": "RNA"}},
        },
        "allOf": [must_have(name) for name in rna_integrity_properties],
    }

    schema = {
        "required": [
            "molecule",
            "molecule_detail",
            "samples",
            "submission_centers",
            "submitted_id",
        ],
        "identifyingProperties": [
            "accession",
            "submitted_id",
            "uuid",
        ],
        "properties": {
            "uuid": {
                "title": "UUID",
                "type": "string",
                "format": "uuid",
                "exclude_from": ["FFedit-create"],
                "serverDefault": "uuid4",
                "permission": "restricted_fields",
                "requestMethod": "POST",
            },
        },
        "anyOf": [
            molecule_without_dna,
            molecule_without_rna,
            molecule_with_rna,
        ],
    }

    expected_identifying = ["accession", "submitted_id", "uuid"]
    expected_required = [
        "molecule_detail",
        "molecule",
        "samples",
        "rna_integrity_number",
        "rna_integrity_number_instrument",
        "submitted_id",
        "submission_centers",
    ]

    identifying_properties, required_properties = get_identifying_and_required_properties(schema)
    assert identifying_properties == expected_identifying
    assert sorted(required_properties) == sorted(expected_required)
Loading