diff --git a/CHANGELOG.rst b/CHANGELOG.rst
index d934d7947..3d867dcdb 100644
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -6,6 +6,11 @@ dcicutils
 Change Log
 ----------
 
+8.14.1
+======
+* Minor changes to the utility/troubleshooting/convenience scripts view-portal-object and update-portal-object.
+
+
 8.14.0
 ======
 * Minor updates to the view-portal-object dev/troubleshooting utility script.
diff --git a/dcicutils/scripts/update_portal_object.py b/dcicutils/scripts/update_portal_object.py
index c5b29255b..27ffcef8b 100644
--- a/dcicutils/scripts/update_portal_object.py
+++ b/dcicutils/scripts/update_portal_object.py
@@ -2,9 +2,16 @@
 # Command-line utility to update (post, patch, upsert) portal objects for SMaHT/CGAP/Fourfront.
 # ------------------------------------------------------------------------------------------------------
 # Example commands:
-# update-portal-object --post file_format.json
-# update-portal-object --upsert directory-with-schema-named-dot-json-files
-# update-portal-object --patch file-not-named-for-schema-name.json --schema UnalignedReads
+#
+# update-portal-object --load {json-file | directory-with-json-files}
+# update-portal-object --post {json-file | directory-with-json-files}
+# update-portal-object --upsert {json-file | directory-with-json-files}
+# update-portal-object --patch {json-file | directory-with-json-files}
+#
+# The specified json-file, or each file within directory-with-json-files, must contain JSON that is either
+# a list of objects, in which case the file must be named for the target schema (otherwise the --schema
+# option must be used to specify the target schema), or a dictionary of schema names, where the value
+# of each is a list of objects for that schema.
 # --------------------------------------------------------------------------------------------------
 
 import argparse
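For reference, a minimal sketch of the two input shapes the comment block above describes (schema and property names here are only illustrative):

```python
# Shape 1: a list of objects; the target schema is taken from the file name
# (e.g. file_format.json), or from --schema if the file is named otherwise.
file_format_items = [
    {"identifier": "BAM", "standard_file_extension": "bam"},
    {"identifier": "FASTQ", "standard_file_extension": "fastq.gz"},
]

# Shape 2: a dictionary keyed by schema name, where each value is a list of
# objects for that schema; no --schema option is needed in this case.
items_by_schema = {
    "FileFormat": [{"identifier": "BAM", "standard_file_extension": "bam"}],
    "CellLine": [{"code": "HELA"}],
}
```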
@@ -14,13 +21,16 @@
 import json
 import os
 import re
+import shutil
 import sys
 from typing import Callable, List, Optional, Tuple, Union
+from dcicutils.captured_output import captured_output
 from dcicutils.command_utils import yes_or_no
-from dcicutils.common import ORCHESTRATED_APPS, APP_SMAHT
+from dcicutils.common import ORCHESTRATED_APPS, APP_CGAP, APP_FOURFRONT, APP_SMAHT
 from dcicutils.ff_utils import delete_metadata, purge_metadata
-from dcicutils.misc_utils import get_error_message, PRINT
+from dcicutils.misc_utils import get_error_message, ignored, normalize_string, PRINT, to_camel_case, to_snake_case
 from dcicutils.portal_utils import Portal as PortalFromUtils
+from dcicutils.tmpfile_utils import temporary_directory
 
 
 class Portal(PortalFromUtils):
@@ -38,9 +48,10 @@ def purge_metadata(self, object_id: str) -> Optional[dict]:
 
 _DEFAULT_APP = "smaht"
 _SMAHT_ENV_ENVIRON_NAME = "SMAHT_ENV"
+_DEFAULT_INI_FILE_FOR_LOAD = "development.ini"
 
-# Schema properties to ignore (by default) for the view schema usage.
-_SCHEMAS_IGNORE_PROPERTIES = [
+# Schema properties to ignore (by default) on update.
+_IGNORE_PROPERTIES_ON_UPDATE = [
     "date_created",
     "last_modified",
     "principals_allowed",
@@ -120,11 +131,21 @@
     parser.add_argument("--post", type=str, required=False, default=None, help="POST data.")
     parser.add_argument("--patch", type=str, required=False, default=None, help="PATCH data.")
     parser.add_argument("--upsert", type=str, required=False, default=None, help="Upsert data.")
+    parser.add_argument("--load", "--loadxl", type=str, required=False, default=None,
+                        help="Load data via snovault.loadxl.")
+    parser.add_argument("--ini", type=str, required=False, default=None, help="INI file for data via snovault.loadxl.")
     parser.add_argument("--delete", type=str, required=False, default=None, help="Delete data.")
     parser.add_argument("--purge", type=str, required=False, default=None, help="Purge data.")
+    parser.add_argument("--noignore", action="store_true", required=False, default=False,
+                        help="Do not ignore standard fields on update(s).")
+    parser.add_argument("--ignore", nargs="+", help="Ignore these additional fields.")
+    parser.add_argument("--unresolved-output", "--unresolved", type=str,
+                        help="Output file to write unresolved references to for --load only.")
     parser.add_argument("--confirm", action="store_true", required=False, default=False, help="Confirm before action.")
     parser.add_argument("--verbose", action="store_true", required=False, default=False, help="Verbose output.")
     parser.add_argument("--quiet", action="store_true", required=False, default=False, help="Quiet output.")
+    parser.add_argument("--noprogress", action="store_true", required=False, default=False,
+                        help="No progress bar output for --load.")
     parser.add_argument("--debug", action="store_true", required=False, default=False, help="Debugging output.")
     args = parser.parse_args()
 
@@ -134,45 +155,49 @@ def usage(message: Optional[str] = None) -> None:
         parser.print_help()
         sys.exit(1)
 
-    if app := args.app:
-        if (app not in ORCHESTRATED_APPS) and ((app := app.lower()) not in ORCHESTRATED_APPS):
-            usage(f"ERROR: Unknown app name; must be one of: {' | '.join(ORCHESTRATED_APPS)}")
-    else:
-        app = APP_SMAHT
+    if not (args.post or args.patch or args.upsert or args.delete or args.purge or args.load):
+        usage()
 
-    portal = _create_portal(env=args.env, app=app, verbose=args.verbose, debug=args.debug)
+    if not (portal := _create_portal(env=args.env, ini=args.ini, app=args.app, load=args.load,
+                                     verbose=args.verbose, debug=args.debug, quiet=args.quiet)):
+        exit(1)
+
+    if args.load:
+        _load_data(portal=portal, load=args.load, ini_file=args.ini, explicit_schema_name=args.schema,
+                   unresolved_output=args.unresolved_output,
+                   verbose=args.verbose, debug=args.debug, noprogress=args.noprogress)
 
     if explicit_schema_name := args.schema:
         schema, explicit_schema_name = _get_schema(portal, explicit_schema_name)
         if not schema:
-            usage(f"ERROR: Unknown schema name: {args.schema}")
-
-    if not (args.post or args.patch or args.upsert or args.delete or args.purge):
-        usage()
+            usage(f"Unknown specified schema name: {args.schema}")
 
     if args.post:
         _post_or_patch_or_upsert(portal=portal,
                                  file_or_directory=args.post,
                                  explicit_schema_name=explicit_schema_name,
-                                 update_function=post_data,
+                                 update_function=_post_data,
                                  update_action_name="POST",
+                                 noignore=args.noignore, ignore=args.ignore,
                                  confirm=args.confirm, verbose=args.verbose, quiet=args.quiet, debug=args.debug)
     if args.patch:
         _post_or_patch_or_upsert(portal=portal,
                                  file_or_directory=args.patch,
                                  explicit_schema_name=explicit_schema_name,
-                                 update_function=patch_data,
+                                 update_function=_patch_data,
                                  update_action_name="PATCH",
                                  patch_delete_fields=args.delete,
+                                 noignore=args.noignore, ignore=args.ignore,
                                  confirm=args.confirm, verbose=args.verbose, quiet=args.quiet, debug=args.debug)
         args.delete = None
     if args.upsert:
         _post_or_patch_or_upsert(portal=portal,
                                  file_or_directory=args.upsert,
                                  explicit_schema_name=explicit_schema_name,
-                                 update_function=upsert_data,
+                                 update_function=_upsert_data,
                                  update_action_name="UPSERT",
                                  patch_delete_fields=args.delete,
+                                 noignore=args.noignore, ignore=args.ignore,
                                  confirm=args.confirm, verbose=args.verbose, quiet=args.quiet, debug=args.debug)
         args.delete = None
@@ -196,17 +221,10 @@ def _post_or_patch_or_upsert(portal: Portal, file_or_directory: str,
                              explicit_schema_name: str,
                              update_function: Callable, update_action_name: str,
                              patch_delete_fields: Optional[str] = None,
+                             noignore: bool = False, ignore: Optional[List[str]] = None,
                              confirm: bool = False, verbose: bool = False,
                              quiet: bool = False, debug: bool = False) -> None:
 
-    def is_schema_name_list(portal: Portal, keys: list) -> bool:
-        if isinstance(keys, list):
-            for key in keys:
-                if portal.get_schema(key) is None:
-                    return False
-            return True
-        return False
-
     def post_or_patch_or_upsert(portal: Portal, file: str, schema_name: Optional[str],
                                 patch_delete_fields: Optional[str] = None,
                                 confirm: bool = False, verbose: bool = False,
@@ -222,17 +240,20 @@ def post_or_patch_or_upsert(portal: Portal, file: str, schema_name: Optional[str],
                 _print(f"DEBUG: File ({file}) contains an object of type: {schema_name}")
             update_function(portal, data, schema_name, file=file,
                             patch_delete_fields=patch_delete_fields,
+                            noignore=noignore, ignore=ignore,
                             confirm=confirm, verbose=verbose, debug=debug)
-        elif is_schema_name_list(portal, list(data.keys())):
+        elif _is_schema_name_list(portal, list(data.keys())):
             if debug:
                 _print(f"DEBUG: File ({file}) contains a dictionary of schema names.")
             for schema_name in data:
                 if isinstance(schema_data := data[schema_name], list):
+                    schema_data = _impose_special_ordering(schema_data, schema_name)
                     if debug:
                         _print(f"DEBUG: Processing {update_action_name}s for type: {schema_name}")
                     for index, item in enumerate(schema_data):
                         update_function(portal, item, schema_name, file=file, index=index,
                                         patch_delete_fields=patch_delete_fields,
+                                        noignore=noignore, ignore=ignore,
                                         confirm=confirm, verbose=verbose, debug=debug)
                 else:
                     _print(f"WARNING: File ({file}) contains schema item which is not a list: {schema_name}")
@@ -241,9 +262,11 @@ def post_or_patch_or_upsert(portal: Portal, file: str, schema_name: Optional[str]
     elif isinstance(data, list):
         if debug:
             _print(f"DEBUG: File ({file}) contains a list of objects of type: {schema_name}")
+        data = _impose_special_ordering(data, schema_name)
         for index, item in enumerate(data):
             update_function(portal, item, schema_name, file=file, index=index,
                             patch_delete_fields=patch_delete_fields,
+                            noignore=noignore, ignore=ignore,
                             confirm=confirm, verbose=verbose, debug=debug)
     if debug:
         _print(f"DEBUG: Processing {update_action_name} file done: {file}")
@@ -276,10 +299,18 @@ def post_or_patch_or_upsert(portal: Portal, file: str, schema_name: Optional[str]
         _print(f"ERROR: Cannot find file or directory: {file_or_directory}")
 
 
-def post_data(portal: Portal, data: dict, schema_name: str,
-              file: Optional[str] = None, index: int = 0,
-              patch_delete_fields: Optional[str] = None,  # unused here
-              confirm: bool = False, verbose: bool = False, debug: bool = False) -> None:
+def _impose_special_ordering(data: List[dict], schema_name: str) -> List[dict]:
+    if schema_name == "FileFormat":
+        return sorted(data, key=lambda item: "extra_file_formats" in item)
+    return data
+
+
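A quick illustration of the FileFormat ordering rule implemented by `_impose_special_ordering` just above: `sorted` with a boolean key is stable, so items that reference `extra_file_formats` sink to the end, after the formats they may refer to (sample data here is hypothetical):

```python
items = [
    {"identifier": "bai", "extra_file_formats": ["bam"]},
    {"identifier": "bam"},
    {"identifier": "fastq"},
]
# Key is False (0) for plain formats and True (1) for referencing ones.
ordered = sorted(items, key=lambda item: "extra_file_formats" in item)
assert [item["identifier"] for item in ordered] == ["bam", "fastq", "bai"]
```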
f"?delete_fields={delete_fields}" + data = _prune_data_for_update(data, noignore=noignore, ignore=ignore) portal.patch_metadata(identifying_path, data) if debug: _print(f"DEBUG: UPSERT {schema_name} item OK: {identifying_path}") @@ -362,25 +399,359 @@ def upsert_data(portal: Portal, data: dict, schema_name: str, return -def _create_portal(env: Optional[str] = None, app: Optional[str] = None, - verbose: bool = False, debug: bool = False) -> Optional[Portal]: +def _load_data(portal: Portal, load: str, ini_file: str, explicit_schema_name: Optional[str] = None, + unresolved_output: Optional[str] = False, + verbose: bool = False, debug: bool = False, noprogress: bool = False, + _single_insert_file: Optional[str] = None) -> bool: + + import snovault.loadxl + from snovault.loadxl import load_all_gen, LoadGenWrapper + from dcicutils.progress_bar import ProgressBar + + loadxl_summary = {} + loadxl_unresolved = {} + loadxl_output = [] + loadxl_total_item_count = 0 + loadxl_total_error_count = 0 + + def loadxl(portal: Portal, inserts_directory: str, schema_names_to_load: dict): + + nonlocal LoadGenWrapper, load_all_gen, loadxl_summary, verbose, debug + nonlocal loadxl_total_item_count, loadxl_total_error_count + progress_total = sum(schema_names_to_load.values()) * 2 # loadxl does two passes + progress_bar = ProgressBar(progress_total, interrupt_exit=True) if not noprogress else None + + def decode_bytes(str_or_bytes: Union[str, bytes], *, encoding: str = "utf-8") -> str: + if not isinstance(encoding, str): + encoding = "utf-8" + if isinstance(str_or_bytes, bytes): + return str_or_bytes.decode(encoding).strip() + elif isinstance(str_or_bytes, str): + return str_or_bytes.strip() + return "" + + def loadxl_print(arg): + if arg: + loadxl_output.append(normalize_string(str(arg))) + + snovault.loadxl.print = loadxl_print + + LOADXL_RESPONSE_PATTERN = re.compile(r"^([A-Z]+):\s*([a-zA-Z\/\d_-]+)\s*(\S+)\s*(\S+)?\s*(.*)$") + LOADXL_ACTION_NAME = {"POST": "Create", "PATCH": "Update", "SKIP": "Check", + "CHECK": "Validate", "ERROR": "Error"} + current_item_type = None + current_item_count = 0 + current_item_total = 0 + + for item in LoadGenWrapper(load_all_gen(testapp=portal.vapp, inserts=inserts_directory, + docsdir=None, overwrite=True, verbose=True, + continue_on_exception=True)): + loadxl_total_item_count += 1 + item = decode_bytes(item) + match = LOADXL_RESPONSE_PATTERN.match(item) + if not match or match.re.groups < 3: + continue + if (action := LOADXL_ACTION_NAME[match.group(1).upper()]) == "Error": + loadxl_total_error_count += 1 + identifying_value = match.group(2) + # + # Example message for unresolved link ... + # + # ERROR: /22813a02-906b-4b60-b2b2-4afaea24aa28 Bad response: 422 Unprocessable Entity + # (not 200 OK or 3xx redirect for http://localhost/file_set?skip_indexing=true)b\'{"@type": + # ["ValidationFailure", "Error"], "status": "error", "code": # 422, "title": "Unprocessable Entity", + # "description": "Failed validation", "errors": [{"location": "body", "name": # "Schema: ", + # "description": "Unable to resolve link: /Library/a4e8f79f-4d47-4e85-9707-c343c940a315"}, + # {"location": "body", "name": "Schema: libraries.0", + # "description": "\\\'a4e8f79f-4d47-4e85-9707-c343c940a315\\\' not found"}]}\' + # + # OR ... 
+ # + # ERROR: /22813a02-906b-4b60-b2b2-4afaea24aa28 Bad response: 404 Not Found (not 200 OK or 3xx + # redirect for http://localhost/22813a02-906b-4b60-b2b2-4afaea24aa28)b\'{"@type": ["HTTPNotFound", + # "Error"], "status": "error", "code": 404, "title": "Not Found", "description": "The resource + # could not be found.", "detail": "debug_notfound of url http://localhost/22813a02-906b-4b60-b2b2-4afaea24aa28; # noqa + # path_info: \\\'/22813a02-906b-4b60-b2b2-4afaea24aa28\\\', context: , # noqa + # view_name: \\\'22813a02-906b-4b60-b2b2-4afaea24aa28\\\', subpath: (), traversed: (), root: + # , vroot: , vroot_path: ()"}\' # noqa + # + if (item_type := re.search(r"https?://.*/(.*)\?skip_indexing=.*", item)) and (len(item_type.groups()) == 1): # noqa + item_type = to_snake_case(item_type.group(1)) + identifying_value = f"/{to_camel_case(item_type)}{identifying_value}" + unresolved_link_error_message_prefix = "Unable to resolve link:" + if (i := item.find(unresolved_link_error_message_prefix)) > 0: + unresolved_link = item[i + len(unresolved_link_error_message_prefix):].strip() + if (i := unresolved_link.find("\"")) > 0: + if (unresolved_link := unresolved_link[0:i]): + if not loadxl_unresolved.get(unresolved_link): + loadxl_unresolved[unresolved_link] = [] + if identifying_value not in loadxl_unresolved[unresolved_link]: + loadxl_unresolved[unresolved_link].append(identifying_value) + if not item_type: + continue + else: + item_type = match.group(3) + if current_item_type != item_type: + if noprogress and debug and current_item_type is not None: + _print() + current_item_type = item_type + current_item_count = 0 + current_item_total = schema_names_to_load[item_type] + if progress_bar: + progress_bar.set_description(f"▶ {to_camel_case(current_item_type)}: {action}") + current_item_count += 1 + if loadxl_summary.get(current_item_type, None) is None: + loadxl_summary[current_item_type] = 0 + loadxl_summary[current_item_type] += 1 + if progress_bar: + progress_bar.set_progress(loadxl_total_item_count) + elif debug: + _print(f"{current_item_type}: {current_item_count} or {current_item_total} ({action})") + if progress_bar: + progress_bar.set_description("▶ Load Complete") + progress_bar.set_progress(progress_total) + if loadxl_total_item_count > loadxl_total_error_count: + _print() + + if not portal.vapp: + _print("Must using INI based Portal object with --load (use --ini option to specify an INI file).") + return False + if not os.path.isabs(load := os.path.normpath(os.path.expanduser(load))): + load = os.path.normpath(os.path.join(os.getcwd(), load)) + if not os.path.exists(load): + _print(f"Specified JSON data file not found: {load}") + return False + + if os.path.isdir(load): + inserts_directory = load + inserts_file = None + else: + inserts_directory = None + inserts_file = load + + if inserts_file: + with io.open(inserts_file, "r") as f: + try: + data = json.load(f) + except Exception: + _print(f"Cannot load JSON data from file: {inserts_file}") + return False + if isinstance(data, list): + if not (schema_name := explicit_schema_name): + if not (schema_name := _get_schema_name_from_schema_named_json_file_name(portal, inserts_file)): + _print(f"Unable to determine schema name for JSON data file: {inserts_file}") + return False + elif not (schema_name := _get_schema(portal, explicit_schema_name)[1]): + _print(f"Unknown specified schema name: {explicit_schema_name}") + return False + with temporary_directory() as tmpdir: + file_name = os.path.join(tmpdir, f"{to_snake_case(schema_name)}.json") + 
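To make the parsing above concrete, here is a sketch of how `LOADXL_RESPONSE_PATTERN` decomposes a loadxl response line; the sample line is hypothetical but follows the `ACTION: /identifying-value item-type` shape the surrounding code assumes:

```python
import re

LOADXL_RESPONSE_PATTERN = re.compile(r"^([A-Z]+):\s*([a-zA-Z\/\d_-]+)\s*(\S+)\s*(\S+)?\s*(.*)$")
sample = "POST: /22813a02-906b-4b60-b2b2-4afaea24aa28 file_format"
match = LOADXL_RESPONSE_PATTERN.match(sample)
assert match.group(1) == "POST"  # mapped to "Create" via LOADXL_ACTION_NAME
assert match.group(2) == "/22813a02-906b-4b60-b2b2-4afaea24aa28"  # identifying value
assert match.group(3) == "file_format"  # item type, in snake case
```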
+    if not portal.vapp:
+        _print("Must use an INI based Portal object with --load (use the --ini option to specify an INI file).")
+        return False
+    if not os.path.isabs(load := os.path.normpath(os.path.expanduser(load))):
+        load = os.path.normpath(os.path.join(os.getcwd(), load))
+    if not os.path.exists(load):
+        _print(f"Specified JSON data file not found: {load}")
+        return False
+
+    if os.path.isdir(load):
+        inserts_directory = load
+        inserts_file = None
+    else:
+        inserts_directory = None
+        inserts_file = load
+
+    if inserts_file:
+        with io.open(inserts_file, "r") as f:
+            try:
+                data = json.load(f)
+            except Exception:
+                _print(f"Cannot load JSON data from file: {inserts_file}")
+                return False
+        if isinstance(data, list):
+            if not (schema_name := explicit_schema_name):
+                if not (schema_name := _get_schema_name_from_schema_named_json_file_name(portal, inserts_file)):
+                    _print(f"Unable to determine schema name for JSON data file: {inserts_file}")
+                    return False
+            elif not (schema_name := _get_schema(portal, explicit_schema_name)[1]):
+                _print(f"Unknown specified schema name: {explicit_schema_name}")
+                return False
+            with temporary_directory() as tmpdir:
+                file_name = os.path.join(tmpdir, f"{to_snake_case(schema_name)}.json")
+                with io.open(file_name, "w") as f:
+                    json.dump(data, f)
+                return _load_data(portal=portal, load=tmpdir, ini_file=ini_file, explicit_schema_name=schema_name,
+                                  unresolved_output=unresolved_output,
+                                  verbose=verbose, debug=debug, noprogress=noprogress,
+                                  _single_insert_file=inserts_file)
+        elif isinstance(data, dict):
+            if schema_name := explicit_schema_name:
+                if _is_schema_name_list(portal, schema_names := list(data.keys())):
+                    _print(f"Ignoring specified --schema: {schema_name}")
+                elif not (schema_name := _get_schema(portal, schema_name)[1]):
+                    _print(f"Unknown specified schema name: {explicit_schema_name}")
+                    return False
+                else:
+                    data = {schema_name: [data]}
+            if not _is_schema_name_list(portal, schema_names := list(data.keys())):
+                # Assume simple object of type from the JSON file name.
+                if not (schema_name := _get_schema_name_from_schema_named_json_file_name(portal, inserts_file)):
+                    _print(f"Unrecognized types in JSON data file: {inserts_file}")
+                    return False
+                schema_names = [schema_name]
+                data = {schema_name: [data]}
+            with temporary_directory() as tmpdir:
+                nfiles = 0
+                for schema_name in schema_names:
+                    if not isinstance(schema_data := data[schema_name], list):
+                        _print(f"Unexpected value for data type ({schema_name})"
+                               f" in JSON data file: {inserts_file} ▶ ignoring")
+                        continue
+                    file_name = os.path.join(tmpdir, f"{to_snake_case(schema_name)}.json")
+                    with io.open(file_name, "w") as f:
+                        json.dump(schema_data, f)
+                    nfiles += 1
+                if nfiles > 0:
+                    return _load_data(portal=portal, load=tmpdir, ini_file=ini_file,
+                                      unresolved_output=unresolved_output,
+                                      verbose=verbose, debug=debug, noprogress=noprogress,
+                                      _single_insert_file=inserts_file)
+            return True
+        else:
+            _print(f"Unrecognized JSON data in file: {inserts_file}")
+            return False
+        return True
 
-    env_from_environ = None
-    if not env and (app == APP_SMAHT):
-        if env := os.environ.get(_SMAHT_ENV_ENVIRON_NAME):
-            env_from_environ = True
-    if not (portal := Portal(env, app=app) if env or app else None):
-        return None
     if verbose:
-        if (env := portal.env) or (env := os.environ(_SMAHT_ENV_ENVIRON_NAME)):
-            _print(f"Portal environment"
-                   f"{f' (from {_SMAHT_ENV_ENVIRON_NAME})' if env_from_environ else ''}: {portal.env}")
-        if portal.keys_file:
-            _print(f"Portal keys file: {portal.keys_file}")
-        if portal.key_id:
-            _print(f"Portal key prefix: {portal.key_id[0:2]}******")
-        if portal.server:
-            _print(f"Portal server: {portal.server}")
+        if _single_insert_file:
+            _print(f"Loading data into Portal (via snovault.loadxl) from file: {_single_insert_file}")
+        else:
+            _print(f"Loading data into Portal (via snovault.loadxl) from directory: {inserts_directory}")
+
+    schema_names = list(_get_schemas(portal).keys())
+    schema_snake_case_names = [to_snake_case(item) for item in schema_names]
+    schema_names_to_load = {}
+
+    copy_to_temporary_directory = False
+    for json_file_path in glob.glob(os.path.join(inserts_directory, "*.json")):
+        json_file_name = os.path.basename(json_file_path)
+        schema_name = os.path.basename(json_file_name)[:-len(".json")]
+        if (schema_name not in schema_snake_case_names) and (schema_name not in schema_names):
+            _print(f"File is not named for a known schema: {json_file_name} ▶ ignoring")
+            copy_to_temporary_directory = True
+        else:
+            try:
+                with io.open(json_file_path, "r") as f:
+                    if not isinstance(data := json.load(f), list):
+                        _print(f"Data JSON file does not contain an array: {json_file_path} ▶ ignoring")
+                        copy_to_temporary_directory = True
+                    elif (nobjects := len(data)) < 1:
+                        _print(f"Data JSON file contains no items: {json_file_path} ▶ ignoring")
+                        copy_to_temporary_directory = True
+                    else:
+                        schema_names_to_load[schema_name] = nobjects
+            except Exception:
+                _print(f"Cannot load JSON data from file: {json_file_path} ▶ ignoring")
+                copy_to_temporary_directory = True
+    if not schema_names_to_load:
+        _print(f"Directory contains no valid data: {inserts_directory}")
+        return False
+    if copy_to_temporary_directory:
+        with temporary_directory() as tmpdir:
+            if debug:
+                _print(f"Using temporary directory: {tmpdir}")
+            for json_file_path in glob.glob(os.path.join(inserts_directory, "*.json")):
+                json_file_name = os.path.basename(json_file_path)
+                schema_name = os.path.basename(json_file_name)[:-len(".json")]
+                if (schema_name in schema_snake_case_names) or (schema_name in schema_names):
+                    shutil.copy(json_file_path, tmpdir)
+            loadxl(portal=portal, inserts_directory=tmpdir, schema_names_to_load=schema_names_to_load)
+    else:
+        loadxl(portal=portal, inserts_directory=inserts_directory, schema_names_to_load=schema_names_to_load)
+
+    if verbose:
+        if _single_insert_file:
+            _print(f"Done loading data into Portal (via snovault.loadxl) from file: {_single_insert_file}")
+        else:
+            _print(f"Done loading data into Portal (via snovault.loadxl) from directory: {inserts_directory}")
+        _print(f"Total items loaded: {loadxl_total_item_count // 2}"  # TODO: straighten out this arithmetic
+               f"{f' (errors: {loadxl_total_error_count})' if loadxl_total_error_count else ''}")
+        for item in sorted(loadxl_summary.keys()):
+            _print(f"▷ {to_camel_case(item)}: {loadxl_summary[item] // 2}")  # TODO: straighten out this arithmetic
+    if loadxl_unresolved:
+        _print("✗ Unresolved references:")
+        for item in loadxl_unresolved:
+            _print(f"  ✗ {item}: {len(loadxl_unresolved[item])}")
+            for subitem in loadxl_unresolved[item]:
+                _print(f"    ▶ {subitem}")
+        if unresolved_output:
+            if not os.path.isabs(unresolved_output := os.path.normpath(os.path.expanduser(unresolved_output))):
+                unresolved_output = os.path.normpath(os.path.join(os.getcwd(), unresolved_output))
+            overwrite = True
+            if os.path.exists(unresolved_output):
+                if os.path.isdir(unresolved_output):
+                    _print(f"Unresolved output file exists as a directory: {unresolved_output}")
+                    return False
+                _print(f"Unresolved output file already exists: {unresolved_output}")
+                overwrite = yes_or_no("Do you want to overwrite this file?")
+            if overwrite:
+                with io.open(unresolved_output, "w") as f:
+                    for item in loadxl_unresolved:
+                        f.write(f"{item}\n")
+    if debug and loadxl_output:
+        _print("✗ Output from loadxl:")
+        for item in loadxl_output:
+            _print(f"  ▶ {item}")
+
+    return True
+
+
+def _is_schema_name_list(portal: Portal, keys: list) -> bool:
+    if isinstance(keys, list):
+        for key in keys:
+            if portal.get_schema(key) is None:
+                return False
+        return True
+    return False
+
+
+def _prune_data_for_update(data: dict, noignore: bool = False, ignore: Optional[List[str]] = None) -> dict:
+    ignore_these_properties = [] if noignore is True else _IGNORE_PROPERTIES_ON_UPDATE
+    if isinstance(ignore, list):
+        ignore_these_properties = ignore_these_properties + ignore
+    if not ignore_these_properties:
+        return data
+    return {key: value for key, value in data.items() if key not in ignore_these_properties}
+
+
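The pruning helper above is what the new --noignore and --ignore options drive; a minimal usage sketch (property names beyond the default ignore list are hypothetical):

```python
data = {"identifier": "BAM", "date_created": "2024-01-01", "custom_property": "xyzzy"}

_prune_data_for_update(data)                              # drops date_created (default ignore list)
_prune_data_for_update(data, noignore=True)               # keeps everything as-is
_prune_data_for_update(data, ignore=["custom_property"])  # drops date_created and custom_property
```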
_print(f"Unknown app name; must be one of: {' | '.join(ORCHESTRATED_APPS)}") + return None + elif APP_SMAHT in (env or os.environ.get(_SMAHT_ENV_ENVIRON_NAME) or ""): + app = APP_SMAHT + elif APP_CGAP in (env or ""): + app = APP_CGAP + elif APP_FOURFRONT in (env or ""): + app = APP_FOURFRONT + + if ini: + if env: + if not quiet: + _print("Ignoring --env option when --ini option is given.") + elif (app == _SMAHT_ENV_ENVIRON_NAME) and (env := os.environ.get(_SMAHT_ENV_ENVIRON_NAME)): + if not quiet: + _print(f"Ignoring SMAHT_ENV environment variable ({env}) when --ini option is given.") + if not os.path.isabs(ini_file := os.path.normpath(os.path.expanduser(ini))): + ini_file = os.path.normpath(os.path.join(os.getcwd(), ini_file)) + if not os.path.exists(ini_file): + _print(f"Specified Portal INI file not found: {ini_file}") + return None + with captured_output(not debug): + if not (portal := Portal(ini_file, app=app)): + _print(f"Cannot create INI based Portal object: {env} ({app})") + return None + else: + env_from_environ = False + if not env and app: + # If the --load option is specified, and no --ini option is specified, then do NOT default + # to using the SMAHT_ENV environment variable (if set) for an access-key based Portal + # object; rather default to the default INI file (i.e. development.ini). + if (not load) and (app == APP_SMAHT) and (env := os.environ.get(_SMAHT_ENV_ENVIRON_NAME)): + env_from_environ = True + if not env: + if not os.path.exists(ini_file := os.path.normpath(os.path.join(os.getcwd(), _DEFAULT_INI_FILE_FOR_LOAD))): + _print("Must specify --ini or --env option in order to create a Portal object.") + return None + return _create_portal(ini=ini_file, app=app, verbose=verbose, debug=debug) + if not (portal := Portal(env, app=app) if env or app else None): + _print(f"Cannot create access-key based Portal object: {env}{f' ({app})' if app else ''}") + return None + + if (ini_file := portal.ini_file): + if not quiet: + _print(f"Portal environment: {ini_file}") + elif (env := portal.env) or (env := os.environ.get(_SMAHT_ENV_ENVIRON_NAME)): + _print(f"Portal environment" + f"{f' (from {_SMAHT_ENV_ENVIRON_NAME})' if env_from_environ else ''}: {portal.env}") + if verbose: + if portal.keys_file: + _print(f"Portal keys file: {portal.keys_file}") + if portal.key_id: + _print(f"Portal key prefix: {portal.key_id[0:2]}******") + if portal.server: + _print(f"Portal server: {portal.server}") + return portal @@ -428,17 +799,21 @@ def _parse_delete_fields(value: str) -> str: def _get_schema_name_from_schema_named_json_file_name(portal: Portal, value: str) -> Optional[str]: - try: - if not value.endswith(".json"): - return None - _, schema_name = _get_schema(portal, os.path.basename(value[:-5])) - return schema_name - except Exception: - return False + if isinstance(value, str) and value: + try: + if value.endswith(".json"): + value = value[:-5] + _, schema_name = _get_schema(portal, os.path.basename(value)) + return schema_name + except Exception: + pass + return False @lru_cache(maxsize=1) def _get_schemas(portal: Portal) -> Optional[dict]: + if portal.vapp: + return portal.vapp.get("/profiles/?frame=raw").json return portal.get_schemas() diff --git a/dcicutils/scripts/view_portal_object.py b/dcicutils/scripts/view_portal_object.py index 90f459303..8696c94b6 100644 --- a/dcicutils/scripts/view_portal_object.py +++ b/dcicutils/scripts/view_portal_object.py @@ -66,7 +66,7 @@ import yaml from dcicutils.captured_output import captured_output, uncaptured_output from dcicutils.command_utils 
import yes_or_no -from dcicutils.misc_utils import get_error_message, is_uuid, PRINT +from dcicutils.misc_utils import get_error_message, is_uuid, PRINT, to_snake_case from dcicutils.portal_utils import Portal @@ -104,6 +104,9 @@ def main(): parser.add_argument("--raw", action="store_true", required=False, default=False, help="Raw output.") parser.add_argument("--inserts", action="store_true", required=False, default=False, help="Format output for subsequent inserts.") + parser.add_argument("--insert-files", action="store_true", required=False, default=False, + help="Output for to insert files.") + parser.add_argument("--ignore", nargs="+", help="Ignore these fields for --inserts.") parser.add_argument("--tree", action="store_true", required=False, default=False, help="Tree output for schemas.") parser.add_argument("--database", action="store_true", required=False, default=False, help="Read from database output.") @@ -116,18 +119,30 @@ def main(): parser.add_argument("--indent", required=False, default=False, help="Indent output.", type=int) parser.add_argument("--summary", action="store_true", required=False, default=False, help="Summary output (for schema only).") + parser.add_argument("--force", action="store_true", required=False, default=False, help="Debugging output.") parser.add_argument("--terse", action="store_true", required=False, default=False, help="Terse output.") parser.add_argument("--verbose", action="store_true", required=False, default=False, help="Verbose output.") + parser.add_argument("--noheader", action="store_true", required=False, default=False, help="Supress header output.") parser.add_argument("--debug", action="store_true", required=False, default=False, help="Debugging output.") args = parser.parse_args() portal = _create_portal(ini=args.ini, env=args.env or os.environ.get("SMAHT_ENV"), - server=args.server, app=args.app, verbose=args.verbose, debug=args.debug) + server=args.server, app=args.app, + verbose=args.verbose and not args.noheader, debug=args.debug) if not args.uuid: _print("UUID or schema or path required.") _exit(1) + if args.insert_files: + args.inserts = True + if args.output: + if not os.path.isdir(args.output): + _print(f"Specified output directory for insert files does not exist: {args.output}") + exit(1) + args.insert_files = args.output + args.output = None + if args.output: if os.path.exists(args.output): if os.path.isdir(args.output): @@ -135,7 +150,7 @@ def main(): _exit(1) elif os.path.isfile(args.output): _print(f"Specified output file already exists: {args.output}") - if not yes_or_no(f"Do you want to overwrite this file?"): + if (not args.force) and not yes_or_no(f"Do you want to overwrite this file?"): _exit(0) _output_file = io.open(args.output, "w") @@ -190,8 +205,13 @@ def main(): all=args.all, summary=args.summary, yaml=args.yaml) return - data = _get_portal_object(portal=portal, uuid=args.uuid, raw=args.raw, inserts=args.inserts, - database=args.database, check=args.bool, verbose=args.verbose) + data = _get_portal_object(portal=portal, uuid=args.uuid, raw=args.raw, database=args.database, + inserts=args.inserts, insert_files=args.insert_files, + ignore=args.ignore, check=args.bool, + force=args.force, verbose=args.verbose, debug=args.debug) + if args.insert_files: + return + if args.bool: if data: _print(f"{args.uuid}: found") @@ -241,30 +261,123 @@ def _create_portal(ini: str, env: Optional[str] = None, def _get_portal_object(portal: Portal, uuid: str, - raw: bool = False, inserts: bool = False, database: bool = False, - check: 
bool = False, verbose: bool = False) -> dict: - response = None - try: - if not uuid.startswith("/"): - path = f"/{uuid}" - else: - path = uuid - response = portal.get(path, raw=raw or inserts, database=database) - except Exception as e: - if "404" in str(e) and "not found" in str(e).lower(): - _print(f"Portal object not found at {portal.server}: {uuid}") - _exit() - _exit(f"Exception getting Portal object from {portal.server}: {uuid}\n{get_error_message(e)}") - if not response: - if check: + raw: bool = False, database: bool = False, + inserts: bool = False, insert_files: bool = False, + ignore: Optional[List[str]] = None, + check: bool = False, force: bool = False, + verbose: bool = False, debug: bool = False) -> dict: + + def prune_data(data: dict) -> dict: + nonlocal ignore + if not isinstance(ignore, list) or not ignore: + return data + return {key: value for key, value in data.items() if key not in ignore} + + def get_metadata_for_individual_result_type(uuid: str) -> Optional[dict]: # noqa + # There can be a lot of individual results for which we may need to get the actual type, + # so do this in a function we were can give verbose output feedback. + nonlocal portal, results_index, results_total, verbose + if verbose: + _print(f"Getting actual type for {results_type} result:" + f" {uuid} [{results_index} of {results_total}]", end="") + result = portal.get_metadata(uuid, raise_exception=False) + if (isinstance(result_types := result.get("@type"), list) and + result_types and (result_type := result_types[0])): # noqa + if verbose: + _print(f" -> {result_type}") + return result_type + if verbose: + _print() + return None + + def get_metadata_types(path: str) -> Optional[dict]: + nonlocal portal, debug + metadata_types = {} + try: + if verbose: + _print(f"Executing separted query to get actual metadata types for raw/inserts query.") + if ((response := portal.get(path)) and (response.status_code in [200, 307]) and + (response := response.json()) and (results := response.get("@graph"))): # noqa + for result in results: + if (result_type := result.get("@type")) and (result_uuid := result.get("uuid")): + if ((isinstance(result_type, list) and (result_type := result_type[0])) or + isinstance(result_type, str)): # noqa + metadata_types[result_uuid] = result_type + except Exception: return None - _exit(f"Null response getting Portal object from {portal.server}: {uuid}") - if response.status_code not in [200, 307]: - # TODO: Understand why the /me endpoint returns HTTP status code 307, which is only why we mention it above. - _exit(f"Invalid status code ({response.status_code}) getting Portal object from {portal.server}: {uuid}") - if not response.json: - _exit(f"Invalid JSON getting Portal object: {uuid}") - response = response.json() + return metadata_types + + def write_insert_files(response: dict) -> None: + nonlocal insert_files, force + output_directory = insert_files if isinstance(insert_files, str) else os.getcwd() + for schema_name in response: + schema_data = response[schema_name] + file_name = f"{to_snake_case(schema_name)}.json" + file_path = os.path.join(output_directory, file_name) + message_verb = "Writing" + if os.path.exists(file_path): + message_verb = "Overwriting" + if os.path.isdir(file_path): + _print(f"WARNING: Output file already exists as a directory. 
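For context on the special case handled just below: the given "uuid" argument may instead be a file of object references, one /SchemaName/identifier path per line; each line is fetched raw and grouped by schema name. A minimal sketch of that grouping, with hypothetical file contents:

```python
# Example references file contents:
#   /Donor/3039a6ca-9849-432d-ad49-2c5630bcbee7
#   /FileFormat/bam
line = "/Donor/3039a6ca-9849-432d-ad49-2c5630bcbee7"
components = line.split("/")  # ["", "Donor", "3039a6ca-9849-432d-ad49-2c5630bcbee7"]
schema_name = components[1]   # "Donor" -> result appended to response["Donor"]
```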
SKIPPING: {file_path}") + continue + if not force: + _print(f"Output file already exists: {file_path}") + if not yes_or_no(f"Overwrite this file?"): + continue + if verbose: + _print(f"{message_verb} {schema_name} (object{'s' if len(schema_data) != 1 else ''}:" + f" {len(schema_data)}) file: {file_path}") + with io.open(file_path, "w") as f: + json.dump(schema_data, f, indent=4) + + if os.path.exists(uuid) and inserts: + # Very special case: If given "uuid" (or other path) as actually a file then assume it + # contains a list of references (e.g. /Donor/3039a6ca-9849-432d-ad49-2c5630bcbee7) to fetch. + response = {} + if verbose: + _print(f"Reading references from file: {uuid}") + with io.open(uuid) as f: + for line in f: + if ((line := line.strip()) and (components := line.split("/")) and (len(components) > 1) and + (schema_name := components[1]) and (schema_name := _get_schema(portal, schema_name)[1])): # noqa + try: + if ((result := portal.get(line, raw=True, database=database)) and + (result.status_code in [200, 307]) and (result := result.json())): # noqa + if not response.get(schema_name): + response[schema_name] = [] + response[schema_name].append(result) + continue + except Exception: + pass + _print(f"Cannot get reference: {line}") + if insert_files: + write_insert_files(response) + return response + else: + response = None + try: + if not uuid.startswith("/"): + path = f"/{uuid}" + else: + path = uuid + response = portal.get(path, raw=raw or inserts, database=database) + except Exception as e: + if "404" in str(e) and "not found" in str(e).lower(): + _print(f"Portal object not found at {portal.server}: {uuid}") + _exit() + _exit(f"Exception getting Portal object from {portal.server}: {uuid}\n{get_error_message(e)}") + if not response: + if check: + return None + _exit(f"Null response getting Portal object from {portal.server}: {uuid}") + if response.status_code not in [200, 307]: + # TODO: Understand why the /me endpoint returns HTTP status code 307, which is only why we mention it above. + _exit(f"Invalid status code ({response.status_code}) getting Portal object from {portal.server}: {uuid}") + if not response.json: + _exit(f"Invalid JSON getting Portal object: {uuid}") + response = response.json() + + response_types = {} if inserts: # Format results as suitable for inserts (e.g. via update-portal-object). response.pop("schema_version", None) @@ -272,20 +385,34 @@ def _get_portal_object(portal: Portal, uuid: str, (isinstance(results_type := response.get("@type"), list) and results_type) and (isinstance(results_type := results_type[0], str) and results_type.endswith("SearchResults")) and (results_type := results_type[0:-len("SearchResults")])): # noqa - # For search results, the type (from XyzSearchResults, above) may not be precisely correct for - # each of the results; it may be the supertype (e.g. QualityMetric vs QualityMetricWorkflowRun); + # For (raw frame) search results, the type (from XyzSearchResults, above) may not be precisely correct + # for each of the results; it may be the supertype (e.g. QualityMetric vs QualityMetricWorkflowRun); # so for types which are supertypes (gotten via Portal.get_schemas_super_type_map) we actually - # lookup each result individually to determine its actual precise type. + # lookup each result individually to determine its actual precise type. Although, if we have + # more than (say) 5 results to do this for, then do a separate query (get_metadata_types) + # to get the result types all at once. 
if not ((supertypes := portal.get_schemas_super_type_map()) and (subtypes := supertypes.get(results_type))): subtypes = None response = {} + results_index = 0 + results_total = len(results) for result in results: + results_index += 1 + if debug: + print(f"Processing result: {results_index}") result.pop("schema_version", None) - if (subtypes and - (result_uuid := result.get("uuid")) and - (individual_result := portal.get_metadata(result_uuid, raise_exception=False)) and - isinstance(result_type:= individual_result.get("@type"), list) and result_type and result_type[0]): # noqa - result_type = result_type[0] + result = prune_data(result) + if (subtypes and one_or_more_objects_of_types_exists(portal, subtypes, debug=debug) and + (result_uuid := result.get("uuid"))): # noqa + # If we have more than (say) 5 results for which we need to determine that actual result type, + # then get them all at once via separate query (get_metadata_types)) which is not the raw frame. + if (results_total > 5) and (not response_types): + response_types = get_metadata_types(path) + if not (response_types and (result_type := response_types.get(result_uuid))): + if individual_result_type := get_metadata_for_individual_result_type(result_uuid): + result_type = individual_result_type + else: + result_type = results_type else: result_type = results_type if response.get(result_type): @@ -295,12 +422,59 @@ def _get_portal_object(portal: Portal, uuid: str, # Get the result as non-raw so we can get its type. elif ((response_cooked := portal.get(path, database=database)) and (isinstance(response_type := response_cooked.json().get("@type"), list) and response_type)): - response = {f"{response_type[0]}": [response]} + response = {f"{response_type[0]}": [prune_data(response)]} + if insert_files: + write_insert_files(response) +# output_directory = insert_files if isinstance(insert_files, str) else os.getcwd() +# for schema_name in response: +# schema_data = response[schema_name] +# file_name = f"{to_snake_case(schema_name)}.json" +# file_path = os.path.join(output_directory, file_name) +# message_verb = "Writing" +# if os.path.exists(file_path): +# message_verb = "Overwriting" +# if os.path.isdir(file_path): +# _print(f"WARNING: Output file already exists as a directory. 
SKIPPING: {file_path}") +# continue +# if not force: +# _print(f"Output file already exists: {file_path}") +# if not yes_or_no(f"Overwrite this file?"): +# continue +# if verbose: +# _print(f"{message_verb} {schema_name} (object{'s' if len(schema_data) != 1 else ''}:" +# f" {len(schema_data)}) file: {file_path}") +# with io.open(file_path, "w") as f: +# json.dump(schema_data, f, indent=4) elif raw: response.pop("schema_version", None) return response +def one_or_more_objects_of_types_exists(portal: Portal, schema_types: List[str], debug: bool = False) -> bool: + for schema_type in schema_types: + if one_or_more_objects_of_type_exists(portal, schema_type, debug=debug): + return True + return False + + +@lru_cache(maxsize=64) +def one_or_more_objects_of_type_exists(portal: Portal, schema_type: str, debug: bool = False) -> bool: + try: + if debug: + _print(f"Checking if there are actually any objects of type: {schema_type}") + if portal.get(f"/{schema_type}").status_code == 404: + if debug: + _print(f"No objects of type actually exist: {schema_type}") + return False + else: + if debug: + _print(f"One or more objects of type exist: {schema_type}") + except Exception as e: + _print(f"ERROR: Cannot determine if there are actually any objects of type: {schema_type}") + _print(e) + return True + + @lru_cache(maxsize=1) def _get_schemas(portal: Portal) -> Optional[dict]: return portal.get_schemas() diff --git a/poetry.lock b/poetry.lock index 0f853563a..ad7f922a2 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1994,7 +1994,7 @@ files = [ name = "tomli" version = "2.0.1" description = "A lil' TOML parser" -category = "main" +category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -2282,4 +2282,4 @@ testing = ["coverage (>=5.0.3)", "zope.event", "zope.testing"] [metadata] lock-version = "2.0" python-versions = ">=3.8.1,<3.13" -content-hash = "45bd3f78d7d134a4e8ec30f529e286d963e5612daea5287174a43d1d3069afc2" +content-hash = "bb78e9c396c24c7df9ab1768d13cd979909c31edcdf796bebbb28bb07a5720a6" diff --git a/pyproject.toml b/pyproject.toml index c47099bb5..bfe446d56 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "dcicutils" -version = "8.14.0" +version = "8.14.1" description = "Utility package for interacting with the 4DN Data Portal and other 4DN resources" authors = ["4DN-DCIC Team "] license = "MIT"