From 1fc57c317efb74f18002c5dba184ed44c6710a46 Mon Sep 17 00:00:00 2001 From: David Michaels Date: Sun, 21 Jan 2024 15:14:05 -0500 Subject: [PATCH] New compare functionality in portal_object_utils. --- dcicutils/portal_object_utils.py | 79 +++++++++++++++++--------------- dcicutils/schema_utils.py | 60 +++++++++++++++--------- dcicutils/structured_data.py | 18 ++++++-- 3 files changed, 95 insertions(+), 62 deletions(-) diff --git a/dcicutils/portal_object_utils.py b/dcicutils/portal_object_utils.py index caf531080..aa623176c 100644 --- a/dcicutils/portal_object_utils.py +++ b/dcicutils/portal_object_utils.py @@ -1,8 +1,10 @@ from functools import lru_cache import re -from typing import List, Optional, Tuple, Union -from dcicutils.schema_utils import get_identifying_properties +from typing import Any, Callable, List, Optional, Tuple, Type, Union from dcicutils.portal_utils import Portal +from dcicutils.schema_utils import Schema + +PortalObject = Type["PortalObject"] # Forward type reference for type hints. class PortalObject: @@ -18,34 +20,23 @@ def data(self): @property @lru_cache(maxsize=1) - def schema(self): - return self._portal.get_schema(self.schema_type) - - @property - @lru_cache(maxsize=1) - def schema_type(self): + def type(self): return self._type or Portal.get_schema_type(self._data) @property @lru_cache(maxsize=1) - def schema_types(self): + def types(self): return self._type or Portal.get_schema_types(self._data) @property @lru_cache(maxsize=1) - def schema_identifying_properties(self) -> list: - if not (schema := self.schema): - return [] - return get_identifying_properties(schema) + def uuid(self) -> Optional[str]: + return self._data.get("uuid") if isinstance(self._data, dict) else None @property @lru_cache(maxsize=1) - def uuid(self) -> Optional[str]: - return PortalObject.get_uuid(self._data) - - @staticmethod - def get_uuid(portal_object: dict) -> Optional[str]: - return portal_object.get("uuid") if isinstance(portal_object, dict) else None + def schema(self): + return self._portal.get_schema(self.type) @property @lru_cache(maxsize=1) @@ -55,8 +46,10 @@ def identifying_properties(self) -> List[str]: Implicitly include "uuid" and "identifier" properties as identifying properties if they are actually properties in the object schema, and favor these (first); defavor "aliases"; no other ordering defined. """ + if not (schema := self.schema) or not (schema_identifying_properties := schema.get("identifyingProperties")): + return [] identifying_properties = [] - for identifying_property in self.schema_identifying_properties: + for identifying_property in schema_identifying_properties: if identifying_property not in ["uuid", "identifier", "aliases"]: if self._data.get(identifying_property): identifying_properties.append(identifying_property) @@ -64,7 +57,7 @@ def identifying_properties(self) -> List[str]: identifying_properties.insert(0, "identifier") if self._data.get("uuid"): identifying_properties.insert(0, "uuid") - if "aliases" in self.schema_identifying_properties and self._data.get("aliases"): + if "aliases" in schema_identifying_properties and self._data.get("aliases"): identifying_properties.append("aliases") return identifying_properties @@ -81,8 +74,8 @@ def identifying_paths(self) -> List[str]: if (identifying_value := self._data.get(identifying_property)): if identifying_property == "uuid": identifying_paths.append(f"/{identifying_value}") - # For now at least we include the path both with and without the schema type component - # as for some identifying values it works (only) with and some it works (only) without. + # For now at least we include the path both with and without the schema type component, + # as for some identifying values, it works (only) with, and some, it works (only) without. # For example: If we have FileSet with "accession", an identifying property, with value # SMAFSFXF1RO4 then /SMAFSFXF1RO4 works but /FileSet/SMAFSFXF1RO4 does not; and # conversely using "submitted_id", also an identifying property, with value @@ -90,10 +83,10 @@ def identifying_paths(self) -> List[str]: # not work but /FileSet/UW_FILE-SET_COLO-829BL_HI-C_1 does work. elif isinstance(identifying_value, list): for identifying_value_item in identifying_value: - identifying_paths.append(f"/{self.schema_type}/{identifying_value_item}") + identifying_paths.append(f"/{self.type}/{identifying_value_item}") identifying_paths.append(f"/{identifying_value_item}") else: - identifying_paths.append(f"/{self.schema_type}/{identifying_value}") + identifying_paths.append(f"/{self.type}/{identifying_value}") identifying_paths.append(f"/{identifying_value}") return identifying_paths @@ -104,34 +97,47 @@ def identifying_path(self) -> Optional[str]: return identifying_paths[0] def lookup(self, include_identifying_path: bool = False, - raw: bool = False) -> Optional[Union[Tuple[dict, str], dict]]: + raw: bool = False) -> Optional[Union[Tuple[PortalObject, str], PortalObject]]: return self._lookup(raw=raw) if include_identifying_path else self._lookup(raw=raw)[0] def lookup_identifying_path(self) -> Optional[str]: return self._lookup()[1] - def _lookup(self, raw: bool = False) -> Tuple[Optional[dict], Optional[str]]: + def _lookup(self, raw: bool = False) -> Tuple[Optional[PortalObject], Optional[str]]: try: for identifying_path in self.identifying_paths: if (value := self._portal.get(identifying_path, raw=raw)) and (value.status_code == 200): - return value.json(), identifying_path + return PortalObject(self._portal, value.json(), self.type if raw else None), identifying_path except Exception: pass return None, self.identifying_path - def compare(self, value: dict) -> dict: + def compare(self, value: Union[dict, PortalObject], consider_link_to: bool = False) -> dict: """ Compares this Portal object against the given Portal object value; noting differences values of properites which they have in common; and properties which are in this Portal object and not in the given Portal object; we do NOT check the converse, i.e. properties in the given Portal object which are not in this Portal object. Returns a dictionary with a description of the differences. """ - return PortalObject._compare(self._data, value.data if isinstance(value, PortalObject) else value) - - _ARRAY_KEY_REGULAR_EXPRESSION = re.compile(r"^(#\d+)$") + def are_properties_equal(property_path: str, property_value_a: Any, property_value_b: Any) -> bool: + if property_value_a == property_value_b: + return True + nonlocal self + if (schema := self.schema) and (property_type := Schema.get_property_by_path(schema, property_path)): + if link_to := property_type.get("linkTo"): + if a := self._portal.get(f"/{link_to}/{property_value_a}", raw=True): + if (a.status_code == 200) and (a := a.json()): + if b := self._portal.get(f"/{link_to}/{property_value_b}", raw=True): + if (b.status_code == 200) and (b := b.json()): + return a == b + return False + return PortalObject._compare(self._data, value.data if isinstance(value, PortalObject) else value, + compare=are_properties_equal if consider_link_to else None) + + _ARRAY_KEY_REGULAR_EXPRESSION = re.compile(rf"^({Schema._ARRAY_NAME_SUFFIX_CHAR}\d+)$") @staticmethod - def _compare(a: dict, b: dict, _path: Optional[str] = None) -> dict: + def _compare(a: dict, b: dict, compare: Optional[Callable] = None, _path: Optional[str] = None) -> dict: def key_to_path(key: str) -> Optional[str]: # noqa nonlocal _path if match := PortalObject._ARRAY_KEY_REGULAR_EXPRESSION.search(key): @@ -149,10 +155,11 @@ def list_to_dictionary(value: list) -> dict: # noqa diffs[path] = {"value": a[key], "missing_value": True} else: if isinstance(a[key], dict) and isinstance(b[key], dict): - diffs.update(PortalObject._compare(a[key], b[key], _path=path)) + diffs.update(PortalObject._compare(a[key], b[key], compare=compare, _path=path)) elif isinstance(a[key], list) and isinstance(b[key], list): diffs.update(PortalObject._compare(list_to_dictionary(a[key]), - list_to_dictionary(b[key]), _path=path)) + list_to_dictionary(b[key]), compare=compare, _path=path)) elif a[key] != b[key]: - diffs[path] = {"value": a[key], "differing_value": b[key]} + if not callable(compare) or not compare(path, a[key], b[key]): + diffs[path] = {"value": a[key], "differing_value": b[key]} return diffs diff --git a/dcicutils/schema_utils.py b/dcicutils/schema_utils.py index c009b116c..07aea2183 100644 --- a/dcicutils/schema_utils.py +++ b/dcicutils/schema_utils.py @@ -1,4 +1,6 @@ +import os from typing import Any, Dict, List, Optional, Tuple +from dcicutils.misc_utils import to_camel_case class JsonSchemaConstants: @@ -187,45 +189,46 @@ def get_one_of_formats(schema: Dict[str, Any]) -> List[str]: class Schema: - def __init__(self, schema: dict) -> None: - self._schema = schema + def __init__(self, schema: dict, schema_type: Optional[str] = None) -> None: + self._data = schema if isinstance(schema, dict) else (schema.data if isinstance(schema, Schema) else {}) + self._type = (isinstance(schema_type, str) and schema_type) or Schema.type_name(self._data.get("title", "")) - def get_property_by_path(self, property_path: str) -> Optional[dict]: + @property + def data(self) -> dict: + return self._data + + @property + def type(self) -> str: + return self._type + + @staticmethod + def type_name(value: str) -> Optional[str]: # File or other name. + if isinstance(value, str) and (value := os.path.basename(value.replace(" ", ""))): + return to_camel_case(value[0:dot] if (dot := value.rfind(".")) >= 0 else value) + + def property_by_path(self, property_path: str) -> Optional[dict]: """ TODO """ - return Schema._get_property_by_path(self._schema, property_path) + return Schema.get_property_by_path(self._data, property_path) _ARRAY_NAME_SUFFIX_CHAR = "#" _DOTTED_NAME_DELIMITER_CHAR = "." @staticmethod - def _get_property_by_path(schema: dict, property_path: str) -> Optional[dict]: - def unarrayize_property_name(property_name: str) -> Tuple[str, Optional[List[int]]]: - if len(components := (property_name := property_name.strip()).split(Schema._ARRAY_NAME_SUFFIX_CHAR)) < 2: - return property_name, None - unarrayized_property_name = components[0].strip() - array_specifiers = [] - for component in components[1:]: - if component.isdigit(): - array_specifiers.append(int(component)) - elif component == "": - array_specifiers.append(0) - else: - return property_name, None - return unarrayized_property_name, array_specifiers + def get_property_by_path(schema: dict, property_path: str) -> Optional[dict]: if not isinstance(schema, dict) or not isinstance(property_path, str): return None elif not (schema_properties := schema.get("properties")): return None property_paths = property_path.split(Schema._DOTTED_NAME_DELIMITER_CHAR) for property_index, property_name in enumerate(property_paths): - property_name, array_specifiers = unarrayize_property_name(property_name) + property_name, array_specifiers = Schema._unarrayize_property_name(property_name) if not (property_value := schema_properties.get(property_name)): return None elif (property_type := property_value.get("type")) == "object": property_paths_tail = Schema._DOTTED_NAME_DELIMITER_CHAR.join(property_paths[property_index + 1:]) - return Schema._get_property_by_path(property_value, property_paths_tail) + return Schema.get_property_by_path(property_value, property_paths_tail) elif (property_type := property_value.get("type")) == "array": if not array_specifiers: if property_index == len(property_paths) - 1: @@ -241,5 +244,20 @@ def unarrayize_property_name(property_name: str) -> Tuple[str, Optional[List[int if property_index == len(property_paths) - 1: return property_value property_paths_tail = Schema._DOTTED_NAME_DELIMITER_CHAR.join(property_paths[property_index + 1:]) - return Schema._get_property_by_path(property_value, property_paths_tail) + return Schema.get_property_by_path(property_value, property_paths_tail) return property_value + + @staticmethod + def _unarrayize_property_name(property_name: str) -> Tuple[str, Optional[List[int]]]: + if len(components := (property_name := property_name.strip()).split(Schema._ARRAY_NAME_SUFFIX_CHAR)) < 2: + return property_name, None + unarrayized_property_name = components[0].strip() + array_specifiers = [] + for component in components[1:]: + if component.isdigit(): + array_specifiers.append(int(component)) + elif component == "": + array_specifiers.append(0) + else: + return property_name, None + return unarrayized_property_name, array_specifiers diff --git a/dcicutils/structured_data.py b/dcicutils/structured_data.py index a9b56724b..1fbf0ea3e 100644 --- a/dcicutils/structured_data.py +++ b/dcicutils/structured_data.py @@ -74,7 +74,7 @@ def validate(self, force: bool = False) -> None: row_number += 1 if (validation_errors := schema.validate(data)) is not None: for validation_error in validation_errors: - self._note_error({"src": create_dict(type=schema.name, row=row_number), + self._note_error({"src": create_dict(type=schema.type, row=row_number), "error": validation_error}, "validation") @property @@ -168,7 +168,7 @@ def _load_reader(self, reader: RowReader, type_name: str) -> None: if not structured_row_template: # Delay creation just so we don't reference schema if there are no rows. if not schema and not noschema and not (schema := Schema.load_by_name(type_name, portal=self._portal)): noschema = True - elif schema and (schema_name := schema.name): + elif schema and (schema_name := schema.type): type_name = schema_name structured_row_template = _StructuredRowTemplate(reader.header, schema) structured_row = structured_row_template.create_row() @@ -222,7 +222,7 @@ def create_row(self) -> dict: def set_value(self, data: dict, column_name: str, value: str, file: Optional[str], row_number: int = -1) -> None: if (set_value_function := self._set_value_functions.get(column_name)): - src = create_dict(type=self._schema.name if self._schema else None, + src = create_dict(type=self._schema.type if self._schema else None, column=column_name, file=file, row=row_number) set_value_function(data, value, src) @@ -319,8 +319,8 @@ def ensure_column_consistency(column_name: str) -> None: class Schema: def __init__(self, schema_json: dict, portal: Optional[Portal] = None) -> None: - self.data = schema_json - self.name = Schema.type_name(schema_json.get("title", "")) if schema_json else "" + self._data = schema_json if isinstance(schema_json, dict) else {} + self._type = Schema.type_name(schema_json.get("title", "")) self._portal = portal # Needed only to resolve linkTo references. self._map_value_functions = { "boolean": self._map_function_boolean, @@ -333,6 +333,14 @@ def __init__(self, schema_json: dict, portal: Optional[Portal] = None) -> None: self._unresolved_refs = [] self._typeinfo = self._create_typeinfo(schema_json) + @property + def data(self) -> dict: + return self._data + + @property + def type(self) -> str: + return self._type + @staticmethod def load_by_name(name: str, portal: Portal) -> Optional[dict]: schema_json = portal.get_schema(Schema.type_name(name)) if portal else None