Skip to content

Commit

Permalink
New compare functionality in portal_object_utils.
Browse files Browse the repository at this point in the history
  • Loading branch information
dmichaels-harvard committed Jan 21, 2024
1 parent fd2ee1e commit 1fc57c3
Show file tree
Hide file tree
Showing 3 changed files with 95 additions and 62 deletions.
79 changes: 43 additions & 36 deletions dcicutils/portal_object_utils.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
from functools import lru_cache
import re
from typing import List, Optional, Tuple, Union
from dcicutils.schema_utils import get_identifying_properties
from typing import Any, Callable, List, Optional, Tuple, Type, Union
from dcicutils.portal_utils import Portal
from dcicutils.schema_utils import Schema

PortalObject = Type["PortalObject"] # Forward type reference for type hints.


class PortalObject:
Expand All @@ -18,34 +20,23 @@ def data(self):

@property
@lru_cache(maxsize=1)
def schema(self):
return self._portal.get_schema(self.schema_type)

@property
@lru_cache(maxsize=1)
def schema_type(self):
def type(self):
return self._type or Portal.get_schema_type(self._data)

@property
@lru_cache(maxsize=1)
def schema_types(self):
def types(self):
return self._type or Portal.get_schema_types(self._data)

@property
@lru_cache(maxsize=1)
def schema_identifying_properties(self) -> list:
if not (schema := self.schema):
return []
return get_identifying_properties(schema)
def uuid(self) -> Optional[str]:
return self._data.get("uuid") if isinstance(self._data, dict) else None

@property
@lru_cache(maxsize=1)
def uuid(self) -> Optional[str]:
return PortalObject.get_uuid(self._data)

@staticmethod
def get_uuid(portal_object: dict) -> Optional[str]:
return portal_object.get("uuid") if isinstance(portal_object, dict) else None
def schema(self):
return self._portal.get_schema(self.type)

@property
@lru_cache(maxsize=1)
Expand All @@ -55,16 +46,18 @@ def identifying_properties(self) -> List[str]:
Implicitly include "uuid" and "identifier" properties as identifying properties if they are actually
properties in the object schema, and favor these (placed first); disfavor "aliases" (placed last); no other ordering defined.
"""
if not (schema := self.schema) or not (schema_identifying_properties := schema.get("identifyingProperties")):
return []
identifying_properties = []
for identifying_property in self.schema_identifying_properties:
for identifying_property in schema_identifying_properties:
if identifying_property not in ["uuid", "identifier", "aliases"]:
if self._data.get(identifying_property):
identifying_properties.append(identifying_property)
if self._data.get("identifier"):
identifying_properties.insert(0, "identifier")
if self._data.get("uuid"):
identifying_properties.insert(0, "uuid")
if "aliases" in self.schema_identifying_properties and self._data.get("aliases"):
if "aliases" in schema_identifying_properties and self._data.get("aliases"):
identifying_properties.append("aliases")
return identifying_properties

Expand All @@ -81,19 +74,19 @@ def identifying_paths(self) -> List[str]:
if (identifying_value := self._data.get(identifying_property)):
if identifying_property == "uuid":
identifying_paths.append(f"/{identifying_value}")
# For now at least we include the path both with and without the schema type component
# as for some identifying values it works (only) with and some it works (only) without.
# For now at least we include the path both with and without the schema type component,
# as for some identifying values, it works (only) with, and some, it works (only) without.
# For example: If we have FileSet with "accession", an identifying property, with value
# SMAFSFXF1RO4 then /SMAFSFXF1RO4 works but /FileSet/SMAFSFXF1RO4 does not; and
# conversely using "submitted_id", also an identifying property, with value
# UW_FILE-SET_COLO-829BL_HI-C_1 then /UW_FILE-SET_COLO-829BL_HI-C_1 does
# not work but /FileSet/UW_FILE-SET_COLO-829BL_HI-C_1 does work.
elif isinstance(identifying_value, list):
for identifying_value_item in identifying_value:
identifying_paths.append(f"/{self.schema_type}/{identifying_value_item}")
identifying_paths.append(f"/{self.type}/{identifying_value_item}")
identifying_paths.append(f"/{identifying_value_item}")
else:
identifying_paths.append(f"/{self.schema_type}/{identifying_value}")
identifying_paths.append(f"/{self.type}/{identifying_value}")
identifying_paths.append(f"/{identifying_value}")
return identifying_paths

Expand All @@ -104,34 +97,47 @@ def identifying_path(self) -> Optional[str]:
return identifying_paths[0]

def lookup(self, include_identifying_path: bool = False,
raw: bool = False) -> Optional[Union[Tuple[dict, str], dict]]:
raw: bool = False) -> Optional[Union[Tuple[PortalObject, str], PortalObject]]:
return self._lookup(raw=raw) if include_identifying_path else self._lookup(raw=raw)[0]

def lookup_identifying_path(self) -> Optional[str]:
    """
    Returns only the path component of the (object, path) pair produced by self._lookup(),
    i.e. the identifying path reported by the lookup of this object in the portal.
    """
    _, identifying_path = self._lookup()
    return identifying_path

def _lookup(self, raw: bool = False) -> Tuple[Optional[dict], Optional[str]]:
def _lookup(self, raw: bool = False) -> Tuple[Optional[PortalObject], Optional[str]]:
try:
for identifying_path in self.identifying_paths:
if (value := self._portal.get(identifying_path, raw=raw)) and (value.status_code == 200):
return value.json(), identifying_path
return PortalObject(self._portal, value.json(), self.type if raw else None), identifying_path
except Exception:
pass
return None, self.identifying_path

def compare(self, value: dict) -> dict:
def compare(self, value: Union[dict, PortalObject], consider_link_to: bool = False) -> dict:
"""
Compares this Portal object against the given Portal object value; noting differing values of properties
which they have in common; and properties which are in this Portal object and not in the given Portal object;
we do NOT check the converse, i.e. properties in the given Portal object which are not in this Portal object.
Returns a dictionary with a description of the differences.
"""
return PortalObject._compare(self._data, value.data if isinstance(value, PortalObject) else value)

_ARRAY_KEY_REGULAR_EXPRESSION = re.compile(r"^(#\d+)$")
def are_properties_equal(property_path: str, property_value_a: Any, property_value_b: Any) -> bool:
if property_value_a == property_value_b:
return True
nonlocal self
if (schema := self.schema) and (property_type := Schema.get_property_by_path(schema, property_path)):
if link_to := property_type.get("linkTo"):
if a := self._portal.get(f"/{link_to}/{property_value_a}", raw=True):
if (a.status_code == 200) and (a := a.json()):
if b := self._portal.get(f"/{link_to}/{property_value_b}", raw=True):
if (b.status_code == 200) and (b := b.json()):
return a == b
return False
return PortalObject._compare(self._data, value.data if isinstance(value, PortalObject) else value,
compare=are_properties_equal if consider_link_to else None)

_ARRAY_KEY_REGULAR_EXPRESSION = re.compile(rf"^({Schema._ARRAY_NAME_SUFFIX_CHAR}\d+)$")

@staticmethod
def _compare(a: dict, b: dict, _path: Optional[str] = None) -> dict:
def _compare(a: dict, b: dict, compare: Optional[Callable] = None, _path: Optional[str] = None) -> dict:
def key_to_path(key: str) -> Optional[str]: # noqa
nonlocal _path
if match := PortalObject._ARRAY_KEY_REGULAR_EXPRESSION.search(key):
Expand All @@ -149,10 +155,11 @@ def list_to_dictionary(value: list) -> dict: # noqa
diffs[path] = {"value": a[key], "missing_value": True}
else:
if isinstance(a[key], dict) and isinstance(b[key], dict):
diffs.update(PortalObject._compare(a[key], b[key], _path=path))
diffs.update(PortalObject._compare(a[key], b[key], compare=compare, _path=path))
elif isinstance(a[key], list) and isinstance(b[key], list):
diffs.update(PortalObject._compare(list_to_dictionary(a[key]),
list_to_dictionary(b[key]), _path=path))
list_to_dictionary(b[key]), compare=compare, _path=path))
elif a[key] != b[key]:
diffs[path] = {"value": a[key], "differing_value": b[key]}
if not callable(compare) or not compare(path, a[key], b[key]):
diffs[path] = {"value": a[key], "differing_value": b[key]}
return diffs
60 changes: 39 additions & 21 deletions dcicutils/schema_utils.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
import os
from typing import Any, Dict, List, Optional, Tuple
from dcicutils.misc_utils import to_camel_case


class JsonSchemaConstants:
Expand Down Expand Up @@ -187,45 +189,46 @@ def get_one_of_formats(schema: Dict[str, Any]) -> List[str]:

class Schema:

def __init__(self, schema: dict) -> None:
self._schema = schema
def __init__(self, schema: dict, schema_type: Optional[str] = None) -> None:
self._data = schema if isinstance(schema, dict) else (schema.data if isinstance(schema, Schema) else {})
self._type = (isinstance(schema_type, str) and schema_type) or Schema.type_name(self._data.get("title", ""))

def get_property_by_path(self, property_path: str) -> Optional[dict]:
@property
def data(self) -> dict:
    """
    Returns the schema dictionary held by this object (its private _data attribute).
    """
    return self._data

@property
def type(self) -> str:
    """
    Returns the schema type name held by this object (its private _type attribute).
    """
    return self._type

@staticmethod
def type_name(value: str) -> Optional[str]:  # File or other name.
    """
    Derives a type name from the given file (or other) name: removes all spaces, keeps only
    the final path component, drops everything from the last dot onward, and passes the
    result through to_camel_case. Returns None if the given value is not a string or if
    nothing remains after removing spaces and taking the final path component.
    """
    if not isinstance(value, str):
        return None
    name = os.path.basename(value.replace(" ", ""))
    if not name:
        return None
    # Strip a trailing ".extension", if any; only the last dot counts.
    stem, dot, _ = name.rpartition(".")
    return to_camel_case(stem if dot else name)

def property_by_path(self, property_path: str) -> Optional[dict]:
"""
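Returns the schema definition (dictionary) of the property identified by the given dotted
property path within this schema, or None if no such property is found.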
"""
return Schema._get_property_by_path(self._schema, property_path)
return Schema.get_property_by_path(self._data, property_path)

_ARRAY_NAME_SUFFIX_CHAR = "#"
_DOTTED_NAME_DELIMITER_CHAR = "."

@staticmethod
def _get_property_by_path(schema: dict, property_path: str) -> Optional[dict]:
def unarrayize_property_name(property_name: str) -> Tuple[str, Optional[List[int]]]:
if len(components := (property_name := property_name.strip()).split(Schema._ARRAY_NAME_SUFFIX_CHAR)) < 2:
return property_name, None
unarrayized_property_name = components[0].strip()
array_specifiers = []
for component in components[1:]:
if component.isdigit():
array_specifiers.append(int(component))
elif component == "":
array_specifiers.append(0)
else:
return property_name, None
return unarrayized_property_name, array_specifiers
def get_property_by_path(schema: dict, property_path: str) -> Optional[dict]:
if not isinstance(schema, dict) or not isinstance(property_path, str):
return None
elif not (schema_properties := schema.get("properties")):
return None
property_paths = property_path.split(Schema._DOTTED_NAME_DELIMITER_CHAR)
for property_index, property_name in enumerate(property_paths):
property_name, array_specifiers = unarrayize_property_name(property_name)
property_name, array_specifiers = Schema._unarrayize_property_name(property_name)
if not (property_value := schema_properties.get(property_name)):
return None
elif (property_type := property_value.get("type")) == "object":
property_paths_tail = Schema._DOTTED_NAME_DELIMITER_CHAR.join(property_paths[property_index + 1:])
return Schema._get_property_by_path(property_value, property_paths_tail)
return Schema.get_property_by_path(property_value, property_paths_tail)
elif (property_type := property_value.get("type")) == "array":
if not array_specifiers:
if property_index == len(property_paths) - 1:
Expand All @@ -241,5 +244,20 @@ def unarrayize_property_name(property_name: str) -> Tuple[str, Optional[List[int
if property_index == len(property_paths) - 1:
return property_value
property_paths_tail = Schema._DOTTED_NAME_DELIMITER_CHAR.join(property_paths[property_index + 1:])
return Schema._get_property_by_path(property_value, property_paths_tail)
return Schema.get_property_by_path(property_value, property_paths_tail)
return property_value

@staticmethod
def _unarrayize_property_name(property_name: str) -> Tuple[str, Optional[List[int]]]:
if len(components := (property_name := property_name.strip()).split(Schema._ARRAY_NAME_SUFFIX_CHAR)) < 2:
return property_name, None
unarrayized_property_name = components[0].strip()
array_specifiers = []
for component in components[1:]:
if component.isdigit():
array_specifiers.append(int(component))
elif component == "":
array_specifiers.append(0)
else:
return property_name, None
return unarrayized_property_name, array_specifiers
18 changes: 13 additions & 5 deletions dcicutils/structured_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ def validate(self, force: bool = False) -> None:
row_number += 1
if (validation_errors := schema.validate(data)) is not None:
for validation_error in validation_errors:
self._note_error({"src": create_dict(type=schema.name, row=row_number),
self._note_error({"src": create_dict(type=schema.type, row=row_number),
"error": validation_error}, "validation")

@property
Expand Down Expand Up @@ -168,7 +168,7 @@ def _load_reader(self, reader: RowReader, type_name: str) -> None:
if not structured_row_template: # Delay creation just so we don't reference schema if there are no rows.
if not schema and not noschema and not (schema := Schema.load_by_name(type_name, portal=self._portal)):
noschema = True
elif schema and (schema_name := schema.name):
elif schema and (schema_name := schema.type):
type_name = schema_name
structured_row_template = _StructuredRowTemplate(reader.header, schema)
structured_row = structured_row_template.create_row()
Expand Down Expand Up @@ -222,7 +222,7 @@ def create_row(self) -> dict:

def set_value(self, data: dict, column_name: str, value: str, file: Optional[str], row_number: int = -1) -> None:
if (set_value_function := self._set_value_functions.get(column_name)):
src = create_dict(type=self._schema.name if self._schema else None,
src = create_dict(type=self._schema.type if self._schema else None,
column=column_name, file=file, row=row_number)
set_value_function(data, value, src)

Expand Down Expand Up @@ -319,8 +319,8 @@ def ensure_column_consistency(column_name: str) -> None:
class Schema:

def __init__(self, schema_json: dict, portal: Optional[Portal] = None) -> None:
self.data = schema_json
self.name = Schema.type_name(schema_json.get("title", "")) if schema_json else ""
self._data = schema_json if isinstance(schema_json, dict) else {}
self._type = Schema.type_name(schema_json.get("title", ""))
self._portal = portal # Needed only to resolve linkTo references.
self._map_value_functions = {
"boolean": self._map_function_boolean,
Expand All @@ -333,6 +333,14 @@ def __init__(self, schema_json: dict, portal: Optional[Portal] = None) -> None:
self._unresolved_refs = []
self._typeinfo = self._create_typeinfo(schema_json)

@property
def data(self) -> dict:
    """
    Returns the schema JSON dictionary held by this object (its private _data attribute).
    """
    return self._data

@property
def type(self) -> str:
    """
    Returns the schema type name held by this object (its private _type attribute).
    """
    return self._type

@staticmethod
def load_by_name(name: str, portal: Portal) -> Optional[dict]:
schema_json = portal.get_schema(Schema.type_name(name)) if portal else None
Expand Down

0 comments on commit 1fc57c3

Please sign in to comment.