Skip to content

Commit

Permalink
Merge pull request #292 from 4dn-dcic/smaht-ingestion-related
Browse files Browse the repository at this point in the history
Misc changes related to SMaHT ingestion.
  • Loading branch information
dmichaels-harvard authored Nov 2, 2023
2 parents 2b92bc2 + 7c69085 commit 15e795f
Show file tree
Hide file tree
Showing 6 changed files with 52 additions and 30 deletions.
8 changes: 8 additions & 0 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,14 @@ dcicutils
Change Log
----------

8.2.0
=====
* 2023-11-02
* Added ``SchemaManager.get_identifying_properties`` in ``bundle_utils``
which implicitly adds ``identifier`` to ``identifyingProperties``.
* Added support for ``portal_vapp`` to ``ff_utils.get_metadata``.


8.1.0
=====

Expand Down
24 changes: 8 additions & 16 deletions dcicutils/bundle_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,13 @@
from .common import AnyJsonData
from .env_utils import EnvUtils, public_env_name
from .ff_utils import get_metadata
from .lang_utils import there_are
from .misc_utils import AbstractVirtualApp, ignored, ignorable, PRINT, to_camel_case
from .sheet_utils import (
LoadTableError, prefer_number, TabbedJsonSchemas,
Header, Headers, TabbedHeaders, ParsedHeader, ParsedHeaders, TabbedParsedHeaders, SheetCellValue, TabbedSheetData,
TableSetManagerRegistry, AbstractTableSetManager, InsertsManager, TableSetManager, load_table_set,
)
from .validation_utils import SchemaManager, validate_data_against_schemas, summary_of_data_validation_errors
from .validation_utils import SchemaManager, validate_data_against_schemas


PatchPrototype = Dict
Expand Down Expand Up @@ -40,7 +39,8 @@ def __str__(self):


class ValidationProblem(Exception):
pass
def __init__(self, problems: Optional[dict] = None):
self.problems = problems


class TypeHint:
Expand Down Expand Up @@ -506,7 +506,8 @@ def raise_any_pending_problems(self):
if problems:
for problem in problems:
PRINT(f"Problem: {problem}")
raise Exception(there_are(problems, kind='problem while compiling hints', tense='past', show=False))
raise ValidationProblem(problems)
# raise Exception(there_are(problems, kind='problem while compiling hints', tense='past', show=False))

def check_tabs(self):
result = {tab_name: self.check_tab(tab_name)
Expand All @@ -522,7 +523,8 @@ def validate_ref(self, item_type, item_ref):
return True
try:
# TODO: This probably needs a cache
info = get_metadata(f"/{to_camel_case(item_type)}/{item_ref}")
info = get_metadata(f"/{to_camel_case(item_type)}/{item_ref}",
ff_env=self.portal_env, vapp=self.portal_vapp)
# Basically return True if there's a value at all,
# but still check it's not an error message that didn't get raised.
return isinstance(info, dict) and 'uuid' in info
Expand Down Expand Up @@ -653,18 +655,8 @@ def load_items(filename: str, tab_name: Optional[str] = None, escaping: Optional
# No fancy checking for things like .json, etc. for now. Only check things that came from
# spreadsheet-like data, where structural datatypes are forced into strings.
checked_items = tabbed_rows

if validate:
problems = validate_data_against_schemas(checked_items, portal_env=portal_env, portal_vapp=portal_vapp,
override_schemas=override_schemas)
error_summary = summary_of_data_validation_errors(problems)
if error_summary:
for item in error_summary:
PRINT(item)
raise Exception("Validation problems were seen.")
# TODO: Maybe connect validation here. Although another option is to just call validation separately
# once this is successfully loaded. Needs thought. However, David's validation_utils can do
# the validation if we decide to do it, it would just need to be connected up.
# -kmp 23-Oct-2023
raise NotImplementedError("Need to implement validation.")
return checked_items, problems
return checked_items
15 changes: 14 additions & 1 deletion dcicutils/ff_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -277,7 +277,7 @@ def _sls(val):
return val.lstrip('/')


def get_metadata(obj_id, key=None, ff_env=None, check_queue=False, add_on=''):
def get_metadata(obj_id, key=None, ff_env=None, check_queue=False, add_on='', vapp: Optional[VirtualApp] = None):
"""
Function to get metadata for a given obj_id (uuid or @id, most likely).
Either takes a dictionary form authentication (MUST include 'server')
Expand All @@ -290,6 +290,13 @@ def get_metadata(obj_id, key=None, ff_env=None, check_queue=False, add_on=''):
"frame=object&force_md5"
*REQUIRES ff_env if check_queue is used*
"""
if vapp:
url = f"/{obj_id}?{add_on}"
response = vapp.get(url)
if response and response.status_code in [301, 302, 303, 307, 308]:
response = response.follow()
return get_response_json(response)

auth = get_authentication_with_server(key, ff_env)
if check_queue and stuff_in_queues(ff_env, check_secondary=False):
add_on += '&datastore=database'
Expand Down Expand Up @@ -989,6 +996,12 @@ def get_schema(name, key=None, ff_env: Optional[str] = None, portal_env: Optiona
portal_env = resolve_portal_env(ff_env=ff_env, portal_env=portal_env, portal_vapp=portal_vapp)
base_url = f"profiles/{to_camel_case(name)}.json"
add_on = 'frame=raw'

# TODO
# Now that get_metadata supported portal_vapp we can do:
# return get_metadata(obj_id=base_url, key=key, ff_env=portal_env, add_on=add_on, vapp=portal_vapp)
# however this breaks test_ff_utils.test_get_schema_with_vapp and no time to fix. 2023-11-02.

if portal_vapp:
full_url = f"/{base_url}?{add_on}"
res = portal_vapp.get(full_url)
Expand Down
16 changes: 13 additions & 3 deletions dcicutils/validation_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ def fetch_schema(self, schema_name: str):
def identifying_properties(self, schema: Optional[JsonSchema] = None, schema_name: Optional[str] = None,
among: Optional[List[str]] = None):
schema = schema if schema is not None else self.fetch_schema(schema_name)
possible_identifying_properties = set(schema.get("identifyingProperties") or []) | {'uuid'}
possible_identifying_properties = set(self.get_identifying_properties(schema)) | {'uuid'}
identifying_properties = sorted(possible_identifying_properties
if among is None
else (prop
Expand All @@ -97,6 +97,16 @@ def identifying_value(cls, data_item: Dict[str, AnyJsonData], identifying_proper
f' {disjoined_list([repr(x) for x in identifying_properties])}'
f' in {json.dumps(data_item)}.')

@staticmethod
def get_identifying_properties(schema: dict) -> list:
if not schema:
return []
identifying_properties = schema.get("identifyingProperties", [])
# Implicitly add "identifier" to "identifyingProperties", if it exists.
if "identifier" not in identifying_properties and "identifier" in schema.get("properties", {}):
identifying_properties.append("identifier")
return identifying_properties


def validate_data_against_schemas(data: TabbedSheetData, *,
portal_env: Optional[str] = None,
Expand Down Expand Up @@ -196,7 +206,7 @@ def validate_data_item_against_schemas(data_item: AnyJsonData, data_type: str,
"""
errors = []

identifying_properties = schema.get("identifyingProperties", [])
identifying_properties = SchemaManager.get_identifying_properties(schema)
identifying_value = SchemaManager.identifying_value(data_item, identifying_properties)
if not identifying_value:
errors.append({
Expand Down Expand Up @@ -264,7 +274,7 @@ def summary_of_data_validation_errors(data_validation_errors: Dict,
missing_properties_count += 1
if error.get("extraneous_properties"):
extraneous_properties_count += 1
if error.get("unclassified_error_count"):
if error.get("unclassified_error"):
unclassified_error_count += 1
if error.get("exception"):
exception_count += 1
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "dcicutils"
version = "8.1.0"
version = "8.2.0"
description = "Utility package for interacting with the 4DN Data Portal and other 4DN resources"
authors = ["4DN-DCIC Team <[email protected]>"]
license = "MIT"
Expand Down
17 changes: 8 additions & 9 deletions test/test_bundle_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -635,10 +635,8 @@ def get(self, path_url):
old_count = portal_vapp.call_count
with mock.patch.object(ff_utils_module, "get_authentication_with_server",
mock_not_called("get_authentication_with_server")):
with mock.patch.object(ff_utils_module, "get_metadata",
mock_not_called("get_metadata")):
actual_items = load_items(SAMPLE_ITEMS_FOR_REAL_SCHEMAS_FILE,
tab_name='ExperimentSeq', portal_vapp=portal_vapp)
actual_items = load_items(SAMPLE_ITEMS_FOR_REAL_SCHEMAS_FILE,
tab_name='ExperimentSeq', portal_vapp=portal_vapp)

assert portal_vapp.call_count == old_count + 1
assert actual_items == expected_items
Expand Down Expand Up @@ -718,12 +716,13 @@ def test_table_checker():
flattened=True,
portal_env=mock_ff_env)
checker.check_tabs()
assert str(exc.value) == "There were 2 problems while compiling hints."
assert printed.lines == [
f"Problem: User[0].project: Unable to validate Project reference: {SAMPLE_PROJECT_UUID!r}",
(f"Problem: User[0].user_institution: Unable to validate Institution reference:"
f" {SAMPLE_INSTITUTION_UUID!r}")
expected_problems = [
f"User[0].project: Unable to validate Project reference: {SAMPLE_PROJECT_UUID!r}",
f"User[0].user_institution: Unable to validate Institution reference: {SAMPLE_INSTITUTION_UUID!r}"
]
expected_problem_lines = [f"Problem: {problem}" for problem in expected_problems]
assert exc.value.problems == expected_problems
assert printed.lines == expected_problem_lines

checker = TableChecker(SAMPLE_WORKBOOK_WITH_MATCHED_UUID_REFS,
flattened=True,
Expand Down

0 comments on commit 15e795f

Please sign in to comment.