diff --git a/CHANGELOG.rst b/CHANGELOG.rst index d8b85d0fb..317a7391b 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -6,8 +6,13 @@ dcicutils Change Log ---------- +8.4.0 +===== +* More work related to SMaHT ingestion (bundle/sheet_utils, data_readers, etc). + + 8.3.0 -========= +===== * Updates for RAS to Redis API diff --git a/dcicutils/bundle_utils.py b/dcicutils/bundle_utils.py index fb76c74a5..0f9b73e1e 100644 --- a/dcicutils/bundle_utils.py +++ b/dcicutils/bundle_utils.py @@ -4,7 +4,7 @@ from .common import AnyJsonData from .env_utils import EnvUtils, public_env_name from .ff_utils import get_metadata -from .misc_utils import AbstractVirtualApp, ignored, ignorable, PRINT, to_camel_case +from .misc_utils import AbstractVirtualApp, ignored, ignorable, PRINT, remove_empty_properties, to_camel_case from .sheet_utils import ( LoadTableError, prefer_number, TabbedJsonSchemas, Header, Headers, TabbedHeaders, ParsedHeader, ParsedHeaders, TabbedParsedHeaders, SheetCellValue, TabbedSheetData, @@ -15,6 +15,7 @@ PatchPrototype = Dict TabbedPatchPrototypes = Dict[str, PatchPrototype] +ARRAY_VALUE_DELIMITER = "|" class TypeHintContext: @@ -44,7 +45,10 @@ def __init__(self, problems: Optional[dict] = None): class TypeHint: - def apply_hint(self, value): + def __init__(self): + self.is_array = False + + def apply_hint(self, value, src): return value def __str__(self): @@ -55,38 +59,41 @@ def __repr__(self): class BoolHint(TypeHint): + def __init__(self): + super().__init__() # We could use other ways to do this, such as initial substring, but this is more likely to be right. # Then again, we might want to consider accepting others like 'yes/no', 'y/n', 'on/off', '1/0'. TRUE_VALUES = ['true', 't'] FALSE_VALUES = ['false', 'f'] - def apply_hint(self, value): + def apply_hint(self, value, src): if isinstance(value, str) and value: l_value = value.lower() if l_value in self.TRUE_VALUES: return True elif l_value in self.FALSE_VALUES: return False - return super().apply_hint(value) + return super().apply_hint(value, src) class NumHint(TypeHint): - PREFERENCE_MAP = {'number': 'num', 'integer': 'int', 'float': 'float'} + PREFERENCE_MAP = {'number': 'number', 'integer': 'integer'} def __init__(self, declared_type: Optional[str] = None): if declared_type is None: - declared_type = 'num' + declared_type = 'number' self.preferred_type = self.PREFERENCE_MAP.get(declared_type) + super().__init__() - def apply_hint(self, value): + def apply_hint(self, value, src): if isinstance(value, str) and value: if self.preferred_type: return prefer_number(value, kind=self.preferred_type) else: return value - return super().apply_hint(value) + return super().apply_hint(value, src) class EnumHint(TypeHint): @@ -96,8 +103,9 @@ def __str__(self): def __init__(self, value_map): self.value_map = value_map + super().__init__() - def apply_hint(self, value): + def apply_hint(self, value, src): if isinstance(value, str): if value in self.value_map: result = self.value_map[value] @@ -112,7 +120,29 @@ def apply_hint(self, value): [only_found] = found result = self.value_map[only_found] return result - return super().apply_hint(value) + return super().apply_hint(value, src) + + +class ArrayHint(TypeHint): + def __init__(self): + super().__init__() + + def apply_hint(self, value, src): + if value is None or value == '': + return [] + if isinstance(value, str): + if not value: + return [] + return [value.strip() for value in value.split(ARRAY_VALUE_DELIMITER)] + return super().apply_hint(value, src) + + +class StringHint(TypeHint): + def 
__init__(self): + super().__init__() + + def apply_hint(self, value, src): + return str(value).strip() if value is not None else "" class RefHint(TypeHint): @@ -120,13 +150,29 @@ class RefHint(TypeHint): def __str__(self): return f"" - def __init__(self, schema_name: str, context: TypeHintContext): + def __init__(self, schema_name: str, required: bool, context: TypeHintContext): self.schema_name = schema_name self.context = context + self.required = required + super().__init__() + + def apply_hint(self, value, src): + if self.is_array and isinstance(value, str): + value = [value.strip() for value in value.split(ARRAY_VALUE_DELIMITER)] if value else [] + if self.is_array and isinstance(value, list): + for item in value: + self._apply_ref_hint(item, src) + else: + self._apply_ref_hint(value, src) + return value - def apply_hint(self, value): - if not self.context.validate_ref(item_type=self.schema_name, item_ref=value): - raise ValidationProblem(f"Unable to validate {self.schema_name} reference: {value!r}") + def _apply_ref_hint(self, value, src): + if not value and self.required: + raise ValidationProblem(f"No required reference (linkTo) value for: {self.schema_name}" + f"{f' from {src}' if src else ''}") + if value and not self.context.validate_ref(item_type=self.schema_name, item_ref=value): + raise ValidationProblem(f"Cannot resolve reference (linkTo) for: {self.schema_name}" + f"{f'/{value}' if value else ''}{f' from {src}' if src else ''}") return value @@ -228,7 +274,7 @@ def parse_sheet_headers(cls, headers: Headers) -> ParsedHeaders: def compute_patch_prototype(cls, parsed_headers: ParsedHeaders): prototype = {} for parsed_header in parsed_headers: - parsed_header0 = parsed_header[0] + parsed_header0 = parsed_header[0] if parsed_header else "" if isinstance(parsed_header0, int): raise LoadTableError(f"A header cannot begin with a numeric ref: {parsed_header0}") cls.assure_patch_prototype_shape(parent=prototype, keys=parsed_header) @@ -236,7 +282,9 @@ def compute_patch_prototype(cls, parsed_headers: ParsedHeaders): @classmethod def assure_patch_prototype_shape(cls, *, parent: Union[Dict, List], keys: ParsedHeader): - [key0, *more_keys] = keys + key0 = None + more_keys = None + [key0, *more_keys] = keys if keys else [None] key1 = more_keys[0] if more_keys else None if isinstance(key1, int): placeholder = [] @@ -288,8 +336,11 @@ def parse_item_value(cls, value: SheetCellValue, return True elif lvalue == 'false': return False - elif lvalue == 'null' or lvalue == '': + # elif lvalue == 'null' or lvalue == '': + elif lvalue == 'null': return None + elif lvalue == '': + return lvalue elif split_pipe and '|' in value: if value == '|': # Use '|' for [] return [] @@ -307,16 +358,18 @@ def parse_item_value(cls, value: SheetCellValue, @classmethod def set_path_value(cls, datum: Union[List, Dict], path: ParsedHeader, value: Any, force: bool = False): - if (value is None or value == '') and not force: + # if (value is None or value == '') and not force: + if value is None and not force: return - [key, *more_path] = path + [key, *more_path] = path if path else [None] if not more_path: datum[key] = value else: cls.set_path_value(datum[key], more_path, value) @classmethod - def find_type_hint_for_subschema(cls, subschema: Any, context: Optional[TypeHintContext] = None): + def find_type_hint_for_subschema(cls, subschema: Any, required: bool = False, + context: Optional[TypeHintContext] = None): if subschema is not None: t = subschema.get('type') if t == 'string': @@ -326,11 +379,19 @@ def 
find_type_hint_for_subschema(cls, subschema: Any, context: Optional[TypeHint return EnumHint(mapping) link_to = subschema.get('linkTo') if link_to and context.schema_exists(link_to): - return RefHint(schema_name=link_to, context=context) - elif t in ('integer', 'float', 'number'): + return RefHint(schema_name=link_to, required=required, context=context) + return StringHint() + elif t in ('integer', 'number'): return NumHint(declared_type=t) elif t == 'boolean': return BoolHint() + elif t == 'array': + array_type_hint = cls.find_type_hint_for_subschema(subschema.get("items"), + required=required, context=context) + if type(array_type_hint) is RefHint: + array_type_hint.is_array = True + return array_type_hint + return ArrayHint() @@ -344,7 +405,8 @@ def finder(subheader, subschema): if subschema.get('type') == 'object': subsubschema = subschema.get('properties', {}).get(key1) if not other_headers: - hint = cls.find_type_hint_for_subschema(subsubschema, context=context) + required = key1 and subschema and key1 in subschema.get('required', []) + hint = cls.find_type_hint_for_subschema(subsubschema, required=required, context=context) if hint: return hint else: @@ -453,7 +515,8 @@ def __init__(self, tabbed_sheet_data: TabbedSheetData, *, flattened: bool, self.portal_vapp = portal_vapp self.schema_manager: SchemaManager = SchemaManager(portal_env=portal_env, portal_vapp=portal_vapp, override_schemas=override_schemas) - self.schemas = self.schema_manager.fetch_relevant_schemas(self.tab_names) # , schemas=schemas) + schema_names_to_fetch = [key for key, value in tabbed_sheet_data.items() if value] + self.schemas = self.schema_manager.fetch_relevant_schemas(schema_names_to_fetch) self.lookup_tables_by_tab_name: Dict[str, Dict[str, Dict]] = { tab_name: self.build_lookup_table_for_tab(tab_name, rows=rows) for tab_name, rows in tabbed_sheet_data.items() @@ -474,6 +537,8 @@ def note_problem(self, problem: str): self._problems.append(problem) def build_lookup_table_for_tab(self, tab_name: str, *, rows: List[Dict]) -> Dict[str, Dict]: + if not self.schema_manager or not rows: + return {} identifying_properties = self.schema_manager.identifying_properties(schema_name=tab_name) if not identifying_properties: # Maybe issue a warning here that we're going to lose @@ -497,7 +562,15 @@ def contains_ref(self, item_type, item_ref): def resolve_ref(self, item_type, item_ref): lookup_table = self.lookup_tables_by_tab_name.get(item_type) if lookup_table: # Is it a type we're tracking? 
- return lookup_table.get(item_ref) or None + if isinstance(item_ref, list): + found = True + for item in item_ref: + if not lookup_table.get(item): + found = False + break + return True if found else None + else: + return lookup_table.get(item_ref) or None else: # Apparently some stray type not in our tables return None @@ -523,8 +596,18 @@ def validate_ref(self, item_type, item_ref): return True try: # TODO: This probably needs a cache - info = get_metadata(f"/{to_camel_case(item_type)}/{item_ref}", - ff_env=self.portal_env, vapp=self.portal_vapp) + if isinstance(item_ref, list): + found = True + for item in item_ref: + info = get_metadata(f"/{to_camel_case(item_type)}/{item}", + ff_env=self.portal_env, vapp=self.portal_vapp) + if not isinstance(info, dict) or 'uuid' not in info: + found = False + break + return found + else: + info = get_metadata(f"/{to_camel_case(item_type)}/{item_ref}", + ff_env=self.portal_env, vapp=self.portal_vapp) # Basically return True if there's a value at all, # but still check it's not an error message that didn't get raised. return isinstance(info, dict) and 'uuid' in info @@ -532,7 +615,7 @@ def validate_ref(self, item_type, item_ref): return False def schema_exists(self, schema_name: str) -> bool: - return self.schema_manager.schema_exists(schema_name) + return self.schema_manager.schema_exists(schema_name) if self.schema_manager else False def check_tab(self, tab_name: str): prototype = self.patch_prototypes_by_tab_name[tab_name] @@ -579,10 +662,15 @@ def check_flattened_row(self, row: Dict, *, tab_name: str, row_number: int, prot patch_item = copy.deepcopy(prototype) for column_number, column_value in enumerate(row.values()): parsed_value = ItemTools.parse_item_value(column_value, apply_heuristics=self.apply_heuristics) + column_name = (list(row.keys())[column_number] or "") if len(row) > column_number else "" + if column_name.endswith("#"): + if isinstance(parsed_value, str): + parsed_value = [value.strip() for value in parsed_value.split(ARRAY_VALUE_DELIMITER) if value] type_hint = type_hints[column_number] if type_hint: try: - parsed_value = type_hint.apply_hint(parsed_value) + src = f"{tab_name}{f'.{column_name}' if column_name else ''}" + parsed_value = type_hint.apply_hint(parsed_value, src) except ValidationProblem as e: headers = self.headers_by_tab_name[tab_name] column_name = headers[column_number] @@ -632,7 +720,7 @@ def create_tab_processor_state(self, tab_name: str) -> SheetState: type_hints=self.type_hints_by_tab_name[tab_name]) -check = TableChecker.check +# check = TableChecker.check def load_items(filename: str, tab_name: Optional[str] = None, escaping: Optional[bool] = None, @@ -641,9 +729,11 @@ def load_items(filename: str, tab_name: Optional[str] = None, escaping: Optional # TODO: validate= is presently False (i.e., disabled) by default while being debugged, # but for production use maybe should not be? -kmp 25-Oct-2023 validate: bool = False, + retain_empty_properties: bool = False, + sheet_order: Optional[List[str]] = None, **kwargs): annotated_data = TableSetManager.load_annotated(filename=filename, tab_name=tab_name, escaping=escaping, - prefer_number=False, **kwargs) + prefer_number=False, sheet_order=sheet_order, **kwargs) tabbed_rows = annotated_data['content'] flattened = annotated_data['flattened'] if flattened: @@ -655,6 +745,8 @@ def load_items(filename: str, tab_name: Optional[str] = None, escaping: Optional # No fancy checking for things like .json, etc. for now. 
Only check things that came from # spreadsheet-like data, where structural datatypes are forced into strings. checked_items = tabbed_rows + if not retain_empty_properties: + remove_empty_properties(checked_items) if validate: problems = validate_data_against_schemas(checked_items, portal_env=portal_env, portal_vapp=portal_vapp, override_schemas=override_schemas) diff --git a/dcicutils/data_readers.py b/dcicutils/data_readers.py new file mode 100644 index 000000000..46a07a059 --- /dev/null +++ b/dcicutils/data_readers.py @@ -0,0 +1,147 @@ +import abc +import csv +import openpyxl +from typing import Any, Generator, Iterator, List, Optional, Type, Tuple, Union +from dcicutils.misc_utils import right_trim + + +class RowReader(abc.ABC): + + def __init__(self): + self.header = None + self.location = 0 + self._warning_empty_headers = False + self._warning_extra_values = [] # Line numbers. + self.open() + + def __iter__(self) -> Iterator: + for row in self.rows: + self.location += 1 + if self.is_comment_row(row): + continue + if self.is_terminating_row(row): + break + if len(self.header) < len(row): # Row values beyond what there are headers for are ignored. + self._warning_extra_values.append(self.location) + yield {column: self.cell_value(value) for column, value in zip(self.header, row)} + + def _define_header(self, header: List[Optional[Any]]) -> None: + self.header = [] + for index, column in enumerate(header or []): + if not (column := str(column).strip() if column is not None else ""): + self._warning_empty_headers = True + break # Empty header column signals end of header. + self.header.append(column) + + def rows(self) -> Generator[Union[List[Optional[Any]], Tuple[Optional[Any], ...]], None, None]: + yield + + def is_comment_row(self, row: Union[List[Optional[Any]], Tuple[Optional[Any]]]) -> bool: + return False + + def is_terminating_row(self, row: Union[List[Optional[Any]], Tuple[Optional[Any]]]) -> bool: + return False + + def cell_value(self, value: Optional[Any]) -> Optional[Any]: + return str(value).strip() if value is not None else "" + + def open(self) -> None: + pass + + @property + def issues(self) -> Optional[List[str]]: + issues = [] + if self._warning_empty_headers: + issues.append("Empty header column encountered; ignoring it and all subsequent columns.") + if self._warning_extra_values: + issues.extend([f"Extra column values on row [{row_number}]" for row_number in self._warning_extra_values]) + return issues if issues else None + + +class ListReader(RowReader): + + def __init__(self, rows: List[List[Optional[Any]]]) -> None: + self._rows = rows + super().__init__() + + @property + def rows(self) -> Generator[List[Optional[Any]], None, None]: + for row in self._rows[1:]: + yield row + + def open(self) -> None: + if not self.header: + self._define_header(self._rows[0] if self._rows else []) + + +class CsvReader(RowReader): + + def __init__(self, file: str) -> None: + self._file = file + self._file_handle = None + self._rows = None + super().__init__() + + @property + def rows(self) -> Generator[List[Optional[Any]], None, None]: + for row in self._rows: + yield right_trim(row) + + def open(self) -> None: + if self._file_handle is None: + self._file_handle = open(self._file) + self._rows = csv.reader(self._file_handle, delimiter="\t" if self._file.endswith(".tsv") else ",") + self._define_header(right_trim(next(self._rows, []))) + + def __del__(self) -> None: + if (file_handle := self._file_handle) is not None: + self._file_handle = None + file_handle.close() + + +class 
ExcelSheetReader(RowReader): + + def __init__(self, sheet_name: str, workbook: openpyxl.workbook.workbook.Workbook) -> None: + self.sheet_name = sheet_name or "Sheet1" + self._workbook = workbook + self._rows = None + super().__init__() + + @property + def rows(self) -> Generator[Tuple[Optional[Any], ...], None, None]: + for row in self._rows(min_row=2, values_only=True): + yield right_trim(row) + + def is_terminating_row(self, row: Tuple[Optional[Any]]) -> bool: + return all(cell is None for cell in row) # Empty row signals end of data. + + def open(self) -> None: + if not self._rows: + self._rows = self._workbook[self.sheet_name].iter_rows + self._define_header(right_trim(next(self._rows(min_row=1, max_row=1, values_only=True), []))) + + +class Excel: + + def __init__(self, file: str, reader_class: Optional[Type] = None) -> None: + self._file = file + self._workbook = None + self.sheet_names = None + if isinstance(reader_class, Type) and issubclass(reader_class, ExcelSheetReader): + self._reader_class = reader_class + else: + self._reader_class = ExcelSheetReader + self.open() + + def sheet_reader(self, sheet_name: str) -> ExcelSheetReader: + return self._reader_class(sheet_name=sheet_name, workbook=self._workbook) + + def open(self) -> None: + if self._workbook is None: + self._workbook = openpyxl.load_workbook(self._file, data_only=True) + self.sheet_names = self._workbook.sheetnames or [] + + def __del__(self) -> None: + if (workbook := self._workbook) is not None: + self._workbook = None + workbook.close() diff --git a/dcicutils/misc_utils.py b/dcicutils/misc_utils.py index 97a17c21b..0484d6992 100644 --- a/dcicutils/misc_utils.py +++ b/dcicutils/misc_utils.py @@ -23,7 +23,7 @@ from collections import defaultdict from datetime import datetime as datetime_type from dateutil.parser import parse as dateutil_parse -from typing import Optional +from typing import Any, Callable, List, Optional, Tuple, Union # Is this the right place for this? I feel like this should be done in an application, not a library. 
@@ -978,6 +978,43 @@ def str_to_bool(x: Optional[str]) -> Optional[bool]: raise ValueError(f"An argument to str_or_bool must be a string or None: {x!r}") + +def to_integer(value: str, fallback: Optional[Any] = None) -> Optional[Any]: + try: + return int(value) + except Exception: + return fallback + + +def to_float(value: str, fallback: Optional[Any] = None) -> Optional[Any]: + try: + return float(value) + except Exception: + return fallback + + +def to_boolean(value: str, fallback: Optional[Any]) -> Optional[Any]: + if isinstance(value, str) and (value := value.strip().lower()): + if (lower_value := value.lower()) in ["true", "t"]: + return True + elif lower_value in ["false", "f"]: + return False + return fallback + + +def to_enum(value: str, enumerators: List[str]) -> Optional[str]: + matches = [] + if isinstance(value, str) and (value := value.strip()) and isinstance(enumerators, List): + enum_specifiers = {str(enum).lower(): enum for enum in enumerators} + if (enum_value := enum_specifiers.get(lower_value := value.lower())) is not None: + return enum_value + for enum_canonical, _ in enum_specifiers.items(): + if enum_canonical.startswith(lower_value): + matches.append(enum_canonical) + if len(matches) == 1: + return enum_specifiers[matches[0]] + return enum_specifiers[matches[0]] if len(matches) == 1 else value + + @contextlib.contextmanager def override_environ(**overrides): """ @@ -1107,6 +1144,18 @@ def remove_suffix(suffix: str, text: str, required: bool = False): return text[:len(text)-len(suffix)] + +def remove_empty_properties(data: Optional[Union[list, dict]]) -> None: + if isinstance(data, dict): + for key in list(data.keys()): + if (value := data[key]) in [None, "", {}, []]: + del data[key] + else: + remove_empty_properties(value) + elif isinstance(data, list): + for item in data: + remove_empty_properties(item) + + class ObsoleteError(Exception): pass @@ -1320,6 +1369,18 @@ def json_file_contents(filename): return json.load(fp) + +def load_json_if(s: str, is_array: bool = False, is_object: bool = False, + fallback: Optional[Any] = None) -> Optional[Any]: + if (isinstance(s, str) and + ((is_object and s.startswith("{") and s.endswith("}")) or + (is_array and s.startswith("[") and s.endswith("]")))): + try: + return json.loads(s) + except Exception: + pass + return fallback + + def camel_case_to_snake_case(s, separator='_'): """ Converts CamelCase to snake_case. @@ -1404,6 +1465,42 @@ def string_list(s): return [p for p in [part.strip() for part in s.split(",")] if p] + +def split_string(value: str, delimiter: str, escape: Optional[str] = None) -> List[str]: + """ + Splits the given string into an array of strings based on the given delimiter and an optional escape character. + """ + if not isinstance(value, str) or not (value := value.strip()): + return [] + if not isinstance(escape, str) or not escape: + return [item.strip() for item in value.split(delimiter)] + result = [] + item = r"" + escaped = False + for c in value: + if c == delimiter and not escaped: + result.append(item.strip()) + item = r"" + elif c == escape and not escaped: + escaped = True + else: + item += c + escaped = False + result.append(item.strip()) + return [item for item in result if item] + + +def right_trim(list_or_tuple: Union[List[Any], Tuple[Any]], + remove: Optional[Callable] = None) -> Union[List[Any], Tuple[Any]]: + """ + Removes trailing None (or empty string) values from the given tuple or list and returns the result; + does NOT change the given value. 
+ """ + i = len(list_or_tuple) - 1 + while i >= 0 and ((remove and remove(list_or_tuple[i])) or (not remove and list_or_tuple[i] in (None, ""))): + i -= 1 + return list_or_tuple[:i + 1] + + def is_c4_arn(arn: str) -> bool: """ Returns True iff the given (presumed) AWS ARN string value looks like it @@ -2050,6 +2147,32 @@ def merge_key_value_dict_lists(x, y): return [key_value_dict(k, v) for k, v in merged.items()] +def merge_objects(target: Union[dict, List[Any]], source: Union[dict, List[Any]], full: bool = False) -> dict: + """ + Merges the given source dictionary or list into the target dictionary or list. + This MAY well change the given target (dictionary or list) IN PLACE. + The the full argument is True then any target lists longer than the + source be will be filled out with the last element(s) of the source. + """ + if target is None: + return source + if isinstance(target, dict) and isinstance(source, dict) and source: + for key, value in source.items(): + target[key] = merge_objects(target[key], value, full) if key in target else value + elif isinstance(target, list) and isinstance(source, list) and source: + for i in range(max(len(source), len(target))): + if i < len(target): + if i < len(source): + target[i] = merge_objects(target[i], source[i], full) + elif full: + target[i] = merge_objects(target[i], source[len(source) - 1], full) + else: + target.append(source[i]) + elif source: + target = source + return target + + # Stealing topological sort classes below from python's graphlib module introduced # in v3.9 with minor refactoring. # Source: https://github.com/python/cpython/blob/3.11/Lib/graphlib.py diff --git a/dcicutils/qa_utils.py b/dcicutils/qa_utils.py index 0c9b019ed..ac18f3b19 100644 --- a/dcicutils/qa_utils.py +++ b/dcicutils/qa_utils.py @@ -3540,7 +3540,7 @@ def recurse(json1, json2, path=""): if not result: # out(f"Recursive failure at {path!r} in list comparison") pass - elif type(json1) == type(json2): + elif type(json1) is type(json2): result = json1 == json2 if not result: out(f"Failed at {path!r} due to value mismatch: {json.dumps(json1)} != {json.dumps(json2)}") diff --git a/dcicutils/s3_utils.py b/dcicutils/s3_utils.py index e34a3f2b7..ea1897beb 100644 --- a/dcicutils/s3_utils.py +++ b/dcicutils/s3_utils.py @@ -318,7 +318,7 @@ def get_key(self, keyfile_name: str = 'access_key_admin' SSECustomerKey=os.environ['S3_ENCRYPT_KEY'], SSECustomerAlgorithm='AES256') body_data: Union[bytes, str] = response['Body'].read() - auth_text: str = body_data.decode() if type(body_data) == bytes else body_data + auth_text: str = body_data.decode() if type(body_data) is bytes else body_data try: return json.loads(auth_text) except (ValueError, TypeError): diff --git a/dcicutils/sheet_utils.py b/dcicutils/sheet_utils.py index 66ce523fa..07b15a993 100644 --- a/dcicutils/sheet_utils.py +++ b/dcicutils/sheet_utils.py @@ -16,8 +16,8 @@ from tempfile import TemporaryFile, TemporaryDirectory from typing import Any, Dict, Iterable, List, Optional, Tuple, Type, Union from .common import AnyJsonData, Regexp, JsonSchema -from .lang_utils import conjoined_list, disjoined_list, maybe_pluralize # , there_are -from .misc_utils import ignored, pad_to, JsonLinesReader, remove_suffix # , PRINT, AbstractVirtualApp +from .lang_utils import conjoined_list, disjoined_list, maybe_pluralize +from .misc_utils import ignored, pad_to, JsonLinesReader, remove_suffix, to_camel_case Header = str @@ -32,6 +32,7 @@ SheetData = List[dict] TabbedSheetData = Dict[str, SheetData] TabbedJsonSchemas = Dict[str, 
JsonSchema] +CommentRow = object() class LoadFailure(Exception): @@ -69,7 +70,7 @@ def unwanted_kwargs(*, context, kwargs, context_plural=False, detailed=False): f" {maybe_pluralize(unwanted, 'keyword argument')} {conjoined_list(unwanted)}.") -def prefer_number(value: SheetCellValue, kind='num'): +def prefer_number(value: SheetCellValue, kind='number'): """ Given a string, if the string has number syntax, returns the number it represents. Otherwise, returns its argument. (It follows from this that if given an int or a float, it just returns that argument.) @@ -78,7 +79,7 @@ def prefer_number(value: SheetCellValue, kind='num'): is coerced to, but it has no effect when the argument is a number, even a number of the wrong kind. :param value: a string, int, or float - :param kind: one of 'num', 'int', or 'float' + :param kind: one of 'number' or 'integer' :returns: the argument coerced to a number of the appropriate kind, if possible, or else the argument literally """ if isinstance(value, str): # the given value might be an int or float, in which case just fall through @@ -87,12 +88,12 @@ def prefer_number(value: SheetCellValue, kind='num'): value = value ch0 = value[0] if ch0 == '+' or ch0 == '-' or ch0.isdigit(): - if kind == 'num' or kind == 'int': + if kind == 'integer': try: return int(value) except Exception: pass - if kind == 'num' or kind == 'float': + if kind == 'number': try: return float(value) except Exception: @@ -198,10 +199,10 @@ def load(cls, filename: str, **kwargs) -> TabbedSheetData: raise NotImplementedError(f".load(...) is not implemented for {cls.__name__}.") # noQA @property - def tab_names(self) -> List[str]: + def tab_names(self, order: Optional[List[str]] = None) -> List[str]: raise NotImplementedError(f".tab_names is not implemented for {self.__class__.__name__}..") # noQA - def load_content(self) -> Any: + def load_content(self, sheet_order: Optional[List[str]] = None) -> Any: raise NotImplementedError(f".load_content() is not implemented for {self.__class__.__name__}.") # noQA @@ -290,18 +291,24 @@ def _process_row(self, tab_name: str, state: Any, row: List[SheetCellValue]) -> """ raise NotImplementedError(f"._process_row(...) is not implemented for {self.__class__.__name__}.") # noQA - def load_content(self) -> AnyJsonData: - for tab_name in self.tab_names: + def load_content(self, sheet_order: Optional[List[str]] = None) -> AnyJsonData: + for tab_name in self.tab_names(sheet_order): sheet_content = [] state = self._create_tab_processor_state(tab_name) for row_data in self._raw_row_generator_for_tab_name(tab_name): processed_row_data: AnyJsonData = self._process_row(tab_name, state, row_data) + if not processed_row_data: + break + if processed_row_data is CommentRow: + continue sheet_content.append(processed_row_data) - self.content_by_tab_name[tab_name] = sheet_content + self.content_by_tab_name[tab_name.replace(" ", "")] = sheet_content return self.content_by_tab_name - def parse_cell_value(self, value: SheetCellValue) -> AnyJsonData: - return prefer_number(value) if self.prefer_number else value + def parse_cell_value(self, value: SheetCellValue, override_prefer_number: Optional[bool] = None) -> AnyJsonData: + if override_prefer_number is None: + override_prefer_number = self.prefer_number + return prefer_number(value) if override_prefer_number else value class TableSetManagerRegistry: @@ -355,8 +362,9 @@ class XlsxManager(FlattenedTableSetManager): """ This implements the mechanism to get a series of rows out of the sheets in an XLSX file. 
""" - ALLOWED_FILE_EXTENSIONS = ['.xlsx'] + TERMINATE_ON_EMPTY_ROW = True + CONVERT_VALUES_TO_STRING = True @classmethod def _all_rows(cls, sheet: Worksheet): @@ -370,9 +378,20 @@ def _all_cols(cls, sheet: Worksheet): for col in range(1, col_max + 1): yield col - @property - def tab_names(self) -> List[str]: - return self.reader_agent.sheetnames + def tab_names(self, order: Optional[List[str]] = None) -> List[str]: + def ordered_sheet_names(sheet_names: List[str]) -> List[str]: + if not order: + return sheet_names + ordered_sheet_names = [] + for item in order: + for sheet_name in sheet_names: + if to_camel_case(item.replace(" ", "")) == to_camel_case(sheet_name.replace(" ", "")): + ordered_sheet_names.append(sheet_name) + for sheet_name in sheet_names: + if sheet_name not in ordered_sheet_names: + ordered_sheet_names.append(sheet_name) + return ordered_sheet_names + return ordered_sheet_names(self.reader_agent.sheetnames) def _get_reader_agent(self) -> Workbook: return openpyxl.load_workbook(self.filename) @@ -383,21 +402,68 @@ def _raw_row_generator_for_tab_name(self, tab_name: str) -> Iterable[SheetRow]: for row in self._all_rows(sheet)) def _get_raw_row_content_tuple(self, sheet: Worksheet, row: int) -> SheetRow: - return [sheet.cell(row=row, column=col).value - for col in self._all_cols(sheet)] + # return [str(sheet.cell(row=row, column=col).value) for col in self._all_cols(sheet)] + results = [] + for col in self._all_cols(sheet): + value = sheet.cell(row=row, column=col).value +# if value is not None and XlsxManager.CONVERT_VALUES_TO_STRING: +# value = str(value).strip() + if XlsxManager.CONVERT_VALUES_TO_STRING: + value = str(value).strip() if value is not None else "" + results.append(value) + return results def _create_tab_processor_state(self, tab_name: str) -> Headers: sheet = self.reader_agent[tab_name] + """ headers: Headers = [str(sheet.cell(row=1, column=col).value) for col in self._all_cols(sheet)] - self.headers_by_tab_name[sheet.title] = headers - return headers + """ + headers = [] + for col in self._all_cols(sheet): + cell = sheet.cell(row=1, column=col).value + if cell is not None and XlsxManager.CONVERT_VALUES_TO_STRING: + cell = str(cell).strip() + headers.append(cell) + self.headers_by_tab_name[sheet.title] = XlsxManager.remove_trailing_none_values(headers) + return self.headers_by_tab_name[sheet.title] def _process_row(self, tab_name: str, headers: Headers, row_data: SheetRow) -> AnyJsonData: ignored(tab_name) - return {headers[i]: self.parse_cell_value(row_datum) + if XlsxManager.is_terminating_row(row_data): + return None + if XlsxManager.is_comment_row(row_data): + return CommentRow + if len(headers) < len(row_data): + row_data = row_data[:len(headers)] + override_prefer_number = False if XlsxManager.CONVERT_VALUES_TO_STRING else None + return {headers[i]: self.parse_cell_value(row_datum, override_prefer_number=override_prefer_number) for i, row_datum in enumerate(row_data)} + @staticmethod + def is_terminating_row(row: List[Optional[Any]]) -> bool: + # TODO: This is may change; currently an all blank row signals the end of input. + return all(cell is None or cell == "" for cell in row) and XlsxManager.TERMINATE_ON_EMPTY_ROW + + @staticmethod + def is_comment_row(row: Tuple[Optional[Any]]) -> bool: + # TODO: This will probably change; currently a row starting only with #, *, or ^ signals a comment. 
+ for cell in row: + if cell is None: + continue + if (cell := str(cell)).startswith("#") or cell.startswith("*") or cell.startswith("^"): + return True + return False + + @staticmethod + def remove_trailing_none_values(values: List[Any]) -> List[Any]: + for index in range(len(values) - 1, -1, -1): + if values[index] is not None: + break + else: + return [] + return values[:index + 1] + def infer_tab_name_from_filename(filename): return os.path.basename(filename).split('.')[0] @@ -409,8 +475,7 @@ def __init__(self, filename: str, tab_name: Optional[str] = None, **kwargs): self._tab_name = tab_name or infer_tab_name_from_filename(filename) super().__init__(filename=filename, **kwargs) - @property - def tab_names(self) -> List[str]: + def tab_names(self, order: Optional[List[str]] = None) -> List[str]: return [self._tab_name] @@ -437,8 +502,7 @@ def _wrap_inserts_data(cls, filename: str, data: AnyJsonData) -> AnyJsonData: ignored(filename) return data - @property - def tab_names(self) -> List[str]: + def tab_names(self, order: Optional[List[str]] = None) -> List[str]: return list(self.content_by_tab_name.keys()) def _get_reader_agent(self) -> Any: @@ -457,7 +521,7 @@ def extract_tabbed_headers(cls, data: TabbedSheetData) -> TabbedHeaders: result[tab] = headers return result - def load_content(self) -> Dict[str, AnyJsonData]: + def load_content(self, sheet_order: Optional[List[str]] = None) -> Dict[str, AnyJsonData]: data = self._load_inserts_data(self.filename) self.content_by_tab_name = data self.headers_by_tab_name = self.extract_tabbed_headers(data) @@ -589,10 +653,10 @@ def _escape_cell_text(cls, cell_text): def _process_row(self, tab_name: str, headers: Headers, row_data: SheetRow) -> AnyJsonData: ignored(tab_name) if self.escaping: - return {headers[i]: self.parse_cell_value(self._escape_cell_text(cell_text)) + return {headers[i]: self.parse_cell_value(self._escape_cell_text(cell_text), override_prefer_number=False) for i, cell_text in enumerate(row_data)} else: - return {headers[i]: self.parse_cell_value(cell_text) + return {headers[i]: self.parse_cell_value(cell_text, override_prefer_number=False) for i, cell_text in enumerate(row_data)} @@ -688,6 +752,7 @@ def load(cls, filename: str, tab_name: Optional[str] = None, escaping: Optional[ @classmethod def load_annotated(cls, filename: str, tab_name: Optional[str] = None, escaping: Optional[bool] = None, + retain_empty_properties: bool = False, sheet_order: Optional[List[str]] = None, **kwargs) -> Dict: """ Given a filename and various options @@ -696,7 +761,7 @@ def load_annotated(cls, filename: str, tab_name: Optional[str] = None, escaping: with maybe_unpack(filename) as filename: manager = cls.create_implementation_manager(filename=filename, tab_name=tab_name, escaping=escaping, **kwargs) - content: TabbedSheetData = manager.load_content() + content: TabbedSheetData = manager.load_content(sheet_order) return { 'filename': filename, 'content': content, @@ -709,3 +774,4 @@ def load_annotated(cls, filename: str, tab_name: Optional[str] = None, escaping: load_table_set = TableSetManager.load +load_table_annotated = TableSetManager.load_annotated diff --git a/dcicutils/validation_utils.py b/dcicutils/validation_utils.py index 9e80146a0..abe6bcb2e 100644 --- a/dcicutils/validation_utils.py +++ b/dcicutils/validation_utils.py @@ -63,10 +63,15 @@ def fetch_schema(self, schema_name: str): return override_schema schema: Optional[AnyJsonData] = self.SCHEMA_CACHE.get(schema_name) if schema is None and schema_name not in self.SCHEMA_CACHE: # If 
None is already stored, don't look it up again - schema = get_schema(schema_name, portal_env=self.portal_env, portal_vapp=self.portal_vapp) + schema_name = schema_name.replace(" ", "") + schema = SchemaManager.get_schema(schema_name, portal_env=self.portal_env, portal_vapp=self.portal_vapp) self.SCHEMA_CACHE[schema_name] = schema return schema + @staticmethod + def get_schema(name: str, portal_env: Optional[str] = None, portal_vapp: Optional[AbstractVirtualApp] = None): + return get_schema(name, portal_env=portal_env, portal_vapp=portal_vapp) + # Should not be needed, given that SCHEMA_CACHE is an instance variable. # # @classmethod @@ -86,16 +91,18 @@ def identifying_properties(self, schema: Optional[JsonSchema] = None, schema_nam return identifying_properties @classmethod # This operation doesn't actually use the schemas so is safe as a class method - def identifying_value(cls, data_item: Dict[str, AnyJsonData], identifying_properties) -> AnyJsonData: - if not identifying_properties: + def identifying_value(cls, data_item: Dict[str, AnyJsonData], + identifying_properties, raise_exception: bool = False) -> AnyJsonData: + if not identifying_properties and raise_exception: raise ValueError("No identifying properties were specified.") - for identifying_property in identifying_properties: + for identifying_property in identifying_properties or []: if identifying_property in data_item: return data_item[identifying_property] - raise ValueError(f'{there_are(identifying_properties, just_are=True)}' - f' no {maybe_pluralize(identifying_properties, "identifying property")}' - f' {disjoined_list([repr(x) for x in identifying_properties])}' - f' in {json.dumps(data_item)}.') + if raise_exception: + raise ValueError(f'{there_are(identifying_properties, just_are=True)}' + f' no {maybe_pluralize(identifying_properties, "identifying property")}' + f' {disjoined_list([repr(x) for x in identifying_properties])}' + f' in {json.dumps(data_item)}.') @staticmethod def get_identifying_properties(schema: dict) -> list: @@ -171,7 +178,9 @@ def validate_data_against_schemas(data: TabbedSheetData, *, schema_manager = SchemaManager(portal_env=portal_env, portal_vapp=portal_vapp, override_schemas=override_schemas) errors = [] - schemas = schema_manager.fetch_relevant_schemas(list(data.keys())) + + schema_names_to_fetch = [key for key, value in data.items() if value] + schemas = schema_manager.fetch_relevant_schemas(schema_names_to_fetch) for data_type in data: schema = schemas.get(data_type) diff --git a/dcicutils/zip_utils.py b/dcicutils/zip_utils.py new file mode 100644 index 000000000..e98e6ac70 --- /dev/null +++ b/dcicutils/zip_utils.py @@ -0,0 +1,79 @@ +from contextlib import contextmanager +import gzip +import os +import shutil +import tarfile +import tempfile +from typing import List, Optional, Union +import zipfile + + +@contextmanager +def unpack_zip_file_to_temporary_directory(file: str) -> str: + with temporary_directory() as tmp_directory_name: + with zipfile.ZipFile(file, "r") as zipf: + zipf.extractall(tmp_directory_name) + yield tmp_directory_name + + +@contextmanager +def unpack_tar_file_to_temporary_directory(file: str) -> str: + with temporary_directory() as tmp_directory_name: + with tarfile.open(file, "r") as tarf: + tarf.extractall(tmp_directory_name) + yield tmp_directory_name + + +def unpack_files(file: str, suffixes: Optional[List[str]] = None) -> Optional[str]: + unpack_file_to_tmp_directory = { + ".tar": unpack_tar_file_to_temporary_directory, + ".zip": unpack_zip_file_to_temporary_directory 
}.get(file[dot:]) if (dot := file.rfind(".")) > 0 else None + if unpack_file_to_tmp_directory is not None: + with unpack_file_to_tmp_directory(file) as tmp_directory_name: + for directory, _, files in os.walk(tmp_directory_name): # Ignore "." prefixed files. + for file in [file for file in files if not file.startswith(".")]: + if not suffixes or any(file.endswith(suffix) for suffix in suffixes): + yield os.path.join(directory, file) + + +@contextmanager +def unpack_gz_file_to_temporary_file(file: str, suffix: Optional[str] = None) -> str: + if (gz := file.endswith(".gz")) or file.endswith(".tgz"): # The .tgz suffix is simply short for .tar.gz. + with temporary_file(name=os.path.basename(file[:-3] if gz else file[:-4] + ".tar")) as tmp_file_name: + with open(tmp_file_name, "wb") as outputf: + with gzip.open(file, "rb") as inputf: + outputf.write(inputf.read()) + outputf.close() + yield tmp_file_name + + +@contextmanager +def temporary_directory() -> str: + try: + with tempfile.TemporaryDirectory() as tmp_directory_name: + yield tmp_directory_name + finally: + remove_temporary_directory(tmp_directory_name) + + +@contextmanager +def temporary_file(name: Optional[str] = None, suffix: Optional[str] = None, + content: Optional[Union[str, bytes, List[str]]] = None) -> str: + with temporary_directory() as tmp_directory_name: + tmp_file_name = os.path.join(tmp_directory_name, name or tempfile.mktemp(dir="")) + (suffix or "") + if content is not None: + with open(tmp_file_name, "wb" if isinstance(content, bytes) else "w") as tmp_file: + tmp_file.write("\n".join(content) if isinstance(content, list) else content) + yield tmp_file_name + + +def remove_temporary_directory(tmp_directory_name: str) -> None: + def is_temporary_directory(path: str) -> bool: + try: + tmpdir = tempfile.gettempdir() + return os.path.commonpath([path, tmpdir]) == tmpdir and os.path.exists(path) and os.path.isdir(path) + except Exception: + return False + if is_temporary_directory(tmp_directory_name): # Guard against errant deletion. + shutil.rmtree(tmp_directory_name) diff --git a/docs/source/dcicutils.rst b/docs/source/dcicutils.rst index 2d5b8058c..f8b902536 100644 --- a/docs/source/dcicutils.rst +++ b/docs/source/dcicutils.rst @@ -79,6 +79,13 @@ cloudformation_utils :members: +data_readers +^^^^^^^^^^^^ + +.. automodule:: dcicutils.data_readers + :members: + + data_utils ^^^^^^^^^^ @@ -335,3 +342,10 @@ variant_utils .. automodule:: dcicutils.variant_utils :members: + + +zip_utils +^^^^^^^^^ + +.. 
automodule:: dcicutils.zip_utils + :members: diff --git a/pyproject.toml b/pyproject.toml index 6d6847fd5..4ae62c250 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "dcicutils" -version = "8.3.0" +version = "8.4.0" description = "Utility package for interacting with the 4DN Data Portal and other 4DN resources" authors = ["4DN-DCIC Team "] license = "MIT" diff --git a/test/data_files/escaping-false.json b/test/data_files/escaping-false.json index 84ab06993..3c95926d9 100644 --- a/test/data_files/escaping-false.json +++ b/test/data_files/escaping-false.json @@ -5,56 +5,56 @@ "unquoted": "\\\\", "doublequoted": "\\\\", "singlequoted": "'\\\\'", - "overflow": null + "overflow": "" }, { "name": "formfeed", "unquoted": "\\f", "doublequoted": "\\f", "singlequoted": "'\\f'", - "overflow": null + "overflow": "" }, { "name": "newline", "unquoted": "\\n", "doublequoted": "\\n", "singlequoted": "'\\n'", - "overflow": null + "overflow": "" }, { "name": "return", "unquoted": "\\r", "doublequoted": "\\r", "singlequoted": "'\\r'", - "overflow": null + "overflow": "" }, { "name": "tab", "unquoted": "\\t", "doublequoted": "\\t", "singlequoted": "'\\t'", - "overflow": null + "overflow": "" }, { "name": "misc", "unquoted": "\\m", "doublequoted": "\\m", "singlequoted": "'\\m'", - "overflow": null + "overflow": "" }, { "name": "quote1", "unquoted": "N/A", "doublequoted": "x,,z", "singlequoted": "N/A", - "overflow": null + "overflow": "" }, { "name": "quotelong", "unquoted": "N/A", "doublequoted": "x,,z,N/A\nquotlongcontinued,", "singlequoted": "N/A", - "overflow": null + "overflow": "" }, { "name": "comma", diff --git a/test/data_files/escaping-true.json b/test/data_files/escaping-true.json index 5d6c837a6..bae122afd 100644 --- a/test/data_files/escaping-true.json +++ b/test/data_files/escaping-true.json @@ -5,56 +5,56 @@ "unquoted": "\\", "doublequoted": "\\", "singlequoted": "'\\'", - "overflow": null + "overflow": "" }, { "name": "formfeed", "unquoted": "\\f", "doublequoted": "\\f", "singlequoted": "'\\f'", - "overflow": null + "overflow": "" }, { "name": "newline", "unquoted": "\n", "doublequoted": "\n", "singlequoted": "'\n'", - "overflow": null + "overflow": "" }, { "name": "return", "unquoted": "\r", "doublequoted": "\r", "singlequoted": "'\r'", - "overflow": null + "overflow": "" }, { "name": "tab", "unquoted": "\t", "doublequoted": "\t", "singlequoted": "'\t'", - "overflow": null + "overflow": "" }, { "name": "misc", "unquoted": "\\m", "doublequoted": "\\m", "singlequoted": "'\\m'", - "overflow": null + "overflow": "" }, { "name": "quote1", "unquoted": "N/A", "doublequoted": "x,,z", "singlequoted": "N/A", - "overflow": null + "overflow": "" }, { "name": "quotelong", "unquoted": "N/A", "doublequoted": "x,,z,N/A\nquotlongcontinued,", "singlequoted": "N/A", - "overflow": null + "overflow": "" }, { "name": "comma", diff --git a/test/data_files/sample_items2.json b/test/data_files/sample_items2.json index 7e084f908..0f3b22f90 100644 --- a/test/data_files/sample_items2.json +++ b/test/data_files/sample_items2.json @@ -1,6 +1,6 @@ [ {"name": "john", "sex": "Male", "member": false}, {"name": "juan", "sex": "Male", "member": true}, - {"name": "igor", "sex": "unknown", "member": null}, + {"name": "igor", "sex": "unknown", "member": ""}, {"name": "mary", "sex": "Female", "member": true} ] diff --git a/test/test_bundle_utils.py b/test/test_bundle_utils.py index 6e1da6685..008442430 100644 --- a/test/test_bundle_utils.py +++ b/test/test_bundle_utils.py @@ -4,6 +4,7 @@ import os 
import pytest import re +from typing import List, Optional, Union from dcicutils import ( bundle_utils as bundle_utils_module, @@ -172,7 +173,7 @@ def test_item_tools_parse_item_value_basic(): # Nulls (None, None), - ('', None), ('null', None), ('Null', None), ('NULL', None), + ('', ''), ('null', None), ('Null', None), ('NULL', None), # Booleans ('true', True), ('True', True), ('TRUE', True), @@ -191,9 +192,9 @@ def test_item_tools_parse_item_value_basic(): ('|', []), # special case: lone '|' means empty ('alpha|', ['alpha']), ('7|', [7]), # special case: trailing '|' means singleton # These follow from general case of '|' as separator of items recursively parsed - ('|alpha', [None, 'alpha']), ('|alpha|', [None, 'alpha']), ('|7', [None, 7]), + ('|alpha', ['', 'alpha']), ('|alpha|', ['', 'alpha']), ('|7', ['', 7]), ('alpha|beta|gamma', ['alpha', 'beta', 'gamma']), - ('alpha|true|false|null||7|1.5', ['alpha', True, False, None, None, 7, 1.5]) + ('alpha|true|false|null||7|1.5', ['alpha', True, False, None, '', 7, 1.5]) ] for input, heuristic_result in expectations: @@ -271,6 +272,7 @@ def test_load_table_structures(): print("loaded=", json.dumps(loaded, indent=2)) expected = SAMPLE_JSON_TABS_FILE_ITEM_CONTENT print("expected=", json.dumps(expected, indent=2)) + # assert loaded == expected assert loaded == expected with pytest.raises(LoadArgumentsError) as exc: @@ -292,9 +294,13 @@ def test_load_items(): # mock_get_schema.return_value = {} with no_schemas(): - assert load_items(SAMPLE_XLSX_FILE, apply_heuristics=True) == SAMPLE_XLSX_FILE_ITEM_CONTENT - assert load_items(SAMPLE_CSV_FILE, apply_heuristics=True) == SAMPLE_CSV_FILE_ITEM_CONTENT - assert load_items(SAMPLE_TSV_FILE, apply_heuristics=True) == SAMPLE_TSV_FILE_ITEM_CONTENT + # TODO: Some work still to do WRT sorting out between None and empty strings in general. 
+ assert load_items(SAMPLE_XLSX_FILE, apply_heuristics=True) == \ + remove_empty_properties(SAMPLE_XLSX_FILE_ITEM_CONTENT) + assert remove_empty_properties(load_items(SAMPLE_CSV_FILE, apply_heuristics=True), [None, '']) == \ + SAMPLE_CSV_FILE_ITEM_CONTENT + assert remove_empty_properties(load_items(SAMPLE_TSV_FILE, apply_heuristics=True), [None, '']) == \ + SAMPLE_TSV_FILE_ITEM_CONTENT with pytest.raises(LoadArgumentsError) as exc: load_items("something.else") @@ -320,7 +326,7 @@ def test_load_items(): SAMPLE_CSV_FILE2_SHEET_NAME: [ {"name": "john", "sex": "M", "member": "false"}, {"name": "juan", "sex": "male", "member": "true"}, - {"name": "igor", "sex": "unknown", "member": None}, + {"name": "igor", "sex": "unknown", "member": ""}, {"name": "mary", "sex": "Female", "member": "t"} ] } @@ -329,7 +335,7 @@ def test_load_items(): SAMPLE_CSV_FILE2_SHEET_NAME: [ {"name": "john", "sex": "M", "member": False}, {"name": "juan", "sex": "male", "member": True}, - {"name": "igor", "sex": "unknown", "member": None}, + {"name": "igor", "sex": "unknown"}, {"name": "mary", "sex": "Female", "member": "t"} ] } @@ -338,7 +344,7 @@ def test_load_items(): "Person": [ {"name": "john", "sex": "Male", "member": False}, {"name": "juan", "sex": "Male", "member": True}, - {"name": "igor", "sex": "unknown", "member": None}, + {"name": "igor", "sex": "unknown"}, {"name": "mary", "sex": "Female", "member": True} ] } @@ -717,8 +723,10 @@ def test_table_checker(): portal_env=mock_ff_env) checker.check_tabs() expected_problems = [ - f"User[0].project: Unable to validate Project reference: {SAMPLE_PROJECT_UUID!r}", - f"User[0].user_institution: Unable to validate Institution reference: {SAMPLE_INSTITUTION_UUID!r}" + f"User[0].project: Cannot resolve reference (linkTo) for:" + f" Project/{SAMPLE_PROJECT_UUID} from User.project", + f"User[0].user_institution: Cannot resolve reference (linkTo) for:" + f" Institution/{SAMPLE_INSTITUTION_UUID} from User.user_institution" ] expected_problem_lines = [f"Problem: {problem}" for problem in expected_problems] assert exc.value.problems == expected_problems @@ -733,3 +741,19 @@ def test_table_checker(): flattened=True, portal_env=mock_ff_env) checker.check_tabs() + + +def remove_empty_properties(data: Optional[Union[list, dict]], + empty: Optional[List[Optional[str]]] = None) -> Optional[Union[list, dict]]: + if not empty: + empty = [None] + if isinstance(data, dict): + for key in list(data.keys()): + if (value := data[key]) in empty: + del data[key] + else: + remove_empty_properties(value, empty) + elif isinstance(data, list): + for item in data: + remove_empty_properties(item, empty) + return data diff --git a/test/test_misc_utils.py b/test/test_misc_utils.py index b32a6d4ce..6bf0afebe 100644 --- a/test/test_misc_utils.py +++ b/test/test_misc_utils.py @@ -31,7 +31,7 @@ ObsoleteError, CycleError, TopologicalSorter, keys_and_values_to_dict, dict_to_keys_and_values, is_c4_arn, deduplicate_list, chunked, parse_in_radix, format_in_radix, managed_property, future_datetime, MIN_DATETIME, MIN_DATETIME_UTC, INPUT, builtin_print, map_chunked, to_camel_case, json_file_contents, - pad_to, JsonLinesReader, + pad_to, JsonLinesReader, split_string, merge_objects ) from dcicutils.qa_utils import ( Occasionally, ControlledTime, override_environ as qa_override_environ, MockFileSystem, printed_output, @@ -3590,3 +3590,87 @@ def test_json_lines_reader_lists(): parsed = [line for line in JsonLinesReader(fp)] expected = [item1, item2] assert parsed == expected + + +def test_split_array_string(): + def 
split_array_string(value: str) -> List[str]: + return split_string(value, "|", "\\") + assert split_array_string(r"abc|def|ghi") == ["abc", "def", "ghi"] + assert split_array_string(r"abc\|def|ghi") == ["abc|def", "ghi"] + assert split_array_string(r"abc\\|def|ghi") == ["abc\\", "def", "ghi"] + assert split_array_string(r"abc\\\|def|ghi") == ["abc\\|def", "ghi"] + assert split_array_string(r"abc\\\|def\|ghi") == ["abc\\|def|ghi"] + assert split_array_string(r"abc\\|\\def\|ghi") == ["abc\\", "\\def|ghi"] + assert split_array_string(r"abc\\|\\def\|ghi||") == ["abc\\", "\\def|ghi"] + assert split_array_string(r"|abcdefghi|") == ["abcdefghi"] + assert split_array_string(r"|abcdefghi||xyzzy") == ["abcdefghi", "xyzzy"] + assert split_array_string(r"") == [] + assert split_array_string(None) == [] + assert split_array_string(r"|") == [] + assert split_array_string(r"\|") == ["|"] + assert split_array_string(r"\\|") == ["\\"] + + +def test_merge_objects_1(): + target = {"abc": {"def": {"ghi": None}}, "xyzzy": [None, None, None]} + source = {"xyzzy": [{"foo": None}, {"goo": None}]} + expected = {"abc": {"def": {"ghi": None}}, "xyzzy": [{"foo": None}, {"goo": None}, None]} + merge_objects(target, source, False) + assert target == expected + + +def test_merge_objects_2(): + target = {"abc": {"def": {"ghi": None}}, "xyzzy": [None, None, None]} + source = {"xyzzy": [{"foo": None}, {"goo": None}]} + expected = {"abc": {"def": {"ghi": None}}, "xyzzy": [{"foo": None}, {"goo": None}, {"goo": None}]} + merge_objects(target, source, True) + assert target == expected + + +def test_merge_objects_3(): + target = {"abc": {"def": {"ghi": None}}, "xyzzy": [None, None]} + source = {"xyzzy": [{"foo": None}, {"goo": None}]} + expected = {"abc": {"def": {"ghi": None}}, "xyzzy": [{"foo": None}, {"goo": None}]} + merge_objects(target, source, False) + assert target == expected + + +def test_merge_objects_4(): + target = {"abc": {"def": {"ghi": None}}, "xyzzy": [None, None]} + source = {"xyzzy": [{"foo": None}, {"goo": None}]} + expected = {"abc": {"def": {"ghi": None}}, "xyzzy": [{"foo": None}, {"goo": None}]} + merge_objects(target, source, True) + assert target == expected + + +def test_merge_objects_5(): + target = {"abc": {"def": {"ghi": None}}, "xyzzy": ["mno"]} + source = {"xyzzy": [{"foo": None}, {"goo": None}]} + expected = {"abc": {"def": {"ghi": None}}, "xyzzy": [{"foo": None}, {"goo": None}]} + merge_objects(target, source, False) + assert target == expected + + +def test_merge_objects_6(): + target = {"abc": {"def": {"ghi": None}}, "xyzzy": ["mno"]} + source = {"xyzzy": [{"foo": None}, {"goo": None}]} + expected = {"abc": {"def": {"ghi": None}}, "xyzzy": [{"foo": None}, {"goo": None}]} + merge_objects(target, source, True) + assert target == expected + + +def test_merge_objects_7(): + target = {"abc": {"def": {"ghi": None}}, "xyzzy": [None, None, "abc", "def", 123]} + source = {"xyzzy": [{"foo": None}, {"goo": None}, {"hoo": None}]} + expected = {"abc": {"def": {"ghi": None}}, "xyzzy": [{"foo": None}, {"goo": None}, {"hoo": None}, "def", 123]} + merge_objects(target, source, False) + assert target == expected + + +def test_merge_objects_8(): + target = {"abc": {"def": {"ghi": None}}, "xyzzy": [None, None, "abc", "def", 123]} + source = {"xyzzy": [{"foo": None}, {"goo": None}, {"hoo": None}]} + expected = {"abc": {"def": {"ghi": None}}, + "xyzzy": [{"foo": None}, {"goo": None}, {"hoo": None}, {"hoo": None}, {"hoo": None}]} + merge_objects(target, source, True) + assert target == expected diff --git 
a/test/test_qa_utils.py b/test/test_qa_utils.py index bed502166..6e9c606bf 100644 --- a/test/test_qa_utils.py +++ b/test/test_qa_utils.py @@ -568,7 +568,7 @@ def test_occasionally_errors(): error_message="something") fail_for_a_while(1) except Exception as e: - assert type(e) == SyntaxError + assert type(e) is SyntaxError assert str(e) == "something" diff --git a/test/test_sheet_utils.py b/test/test_sheet_utils.py index c012772c1..276b17b23 100644 --- a/test/test_sheet_utils.py +++ b/test/test_sheet_utils.py @@ -1,6 +1,7 @@ import json import os import pytest +from typing import Optional, Union from collections import namedtuple from dcicutils.sheet_utils import ( @@ -116,29 +117,52 @@ def test_table_set_manager_registry_manager_for_filename(): SAMPLE_XLSX_FILE = os.path.join(TEST_DIR, 'data_files/sample_items.xlsx') -SAMPLE_XLSX_FILE_RAW_CONTENT = { +backup_SAMPLE_XLSX_FILE_RAW_CONTENT = { "Sheet1": [ - {"x": 1, "y.a": 1, "y.z": 1}, - {"x": 1, "y.a": 2, "y.z": 3}, + {"x": "1", "y.a": "1", "y.z": "1"}, + {"x": "1", "y.a": "2", "y.z": "3"}, {"x": "alpha", "y.a": "beta", "y.z": "gamma|delta"}, ], "Sheet2": [ { - "name": "bill", "age": 23, - "mother.name": "mary", "mother.age": 58, - "father.name": "fred", "father.age": 63, - "friends#0.name": "sam", "friends#0.age": 22, - "friends#1.name": "arthur", "friends#1.age": 19, + "name": "bill", "age": "23", + "mother.name": "mary", "mother.age": "58", + "father.name": "fred", "father.age": "63", + "friends#0.name": "sam", "friends#0.age": "22", + "friends#1.name": "arthur", "friends#1.age": "19", }, { - "name": "joe", "age": 9, - "mother.name": "estrella", "mother.age": 35, - "father.name": "anthony", "father.age": 34, - "friends#0.name": "anders", "friends#0.age": 9, + "name": "joe", "age": "9", + "mother.name": "estrella", "mother.age": "35", + "father.name": "anthony", "father.age": "34", + "friends#0.name": "anders", "friends#0.age": "9", "friends#1.name": None, "friends#1.age": None, }, ] } +SAMPLE_XLSX_FILE_RAW_CONTENT = { + "Sheet1": [ + {"x": "1", "y.a": "1", "y.z": "1"}, + {"x": "1", "y.a": "2", "y.z": "3"}, + {"x": "alpha", "y.a": "beta", "y.z": "gamma|delta"}, + ], + "Sheet2": [ + { + "name": "bill", "age": "23", + "mother.name": "mary", "mother.age": "58", + "father.name": "fred", "father.age": "63", + "friends#0.name": "sam", "friends#0.age": "22", + "friends#1.name": "arthur", "friends#1.age": "19", + }, + { + "name": "joe", "age": "9", + "mother.name": "estrella", "mother.age": "35", + "father.name": "anthony", "father.age": "34", + "friends#0.name": "anders", "friends#0.age": "9", + "friends#1.name": "", "friends#1.age": "", + }, + ] +} SAMPLE_XLSX_FILE_INFLATED_CONTENT = { "Sheet1": [ @@ -162,7 +186,7 @@ def test_table_set_manager_registry_manager_for_filename(): "father": {"name": "anthony", "age": 34}, "friends": [ {"name": "anders", "age": 9}, - {"name": None, "age": None} + {"name": "", "age": ""} ] }, ], @@ -243,11 +267,11 @@ def test_xlsx_manager_load_csv(): def test_csv_manager_load_content(): wt = CsvManager(SAMPLE_CSV_FILE) - assert wt.load_content() == SAMPLE_CSV_FILE_RAW_CONTENT + assert convert_number_to_string(wt.load_content()) == SAMPLE_CSV_FILE_RAW_CONTENT def test_csv_manager_load(): - assert CsvManager.load(SAMPLE_CSV_FILE) == SAMPLE_CSV_FILE_RAW_CONTENT + assert convert_number_to_string(CsvManager.load(SAMPLE_CSV_FILE)) == SAMPLE_CSV_FILE_RAW_CONTENT def test_csv_manager_load_csv(): @@ -269,11 +293,11 @@ def test_csv_escaping(): def test_tsv_manager_load_content(): wt = TsvManager(SAMPLE_TSV_FILE) - assert 
wt.load_content() == SAMPLE_TSV_FILE_RAW_CONTENT + assert convert_number_to_string(wt.load_content()) == SAMPLE_TSV_FILE_RAW_CONTENT def test_tsv_manager_load(): - assert TsvManager.load(SAMPLE_TSV_FILE) == SAMPLE_TSV_FILE_RAW_CONTENT + assert convert_number_to_string(TsvManager.load(SAMPLE_TSV_FILE)) == SAMPLE_TSV_FILE_RAW_CONTENT def test_tsv_manager_load_csv(): @@ -281,3 +305,16 @@ def test_tsv_manager_load_csv(): TsvManager.load(SAMPLE_XLSX_FILE) assert str(exc.value).startswith('The TableSetManager subclass TsvManager' ' expects only .tsv or .tsv.txt filenames:') + + +def convert_number_to_string(data: Optional[Union[list, dict]]) -> Optional[Union[list, dict]]: + if isinstance(data, dict): + for key in list(data.keys()): + if isinstance(value := data[key], int): + data[key] = str(value) + else: + convert_number_to_string(value) + elif isinstance(data, list): + for item in data: + convert_number_to_string(item) + return data diff --git a/test/test_validation_utils.py b/test/test_validation_utils.py index 40878a222..9129e05c0 100644 --- a/test/test_validation_utils.py +++ b/test/test_validation_utils.py @@ -73,20 +73,20 @@ def test_schema_manager_with_schemas(schema_id): def test_schema_manager_identifying_value(): with pytest.raises(ValueError) as exc: - assert SchemaManager.identifying_value({'any': 'thing'}, []) + assert SchemaManager.identifying_value({'any': 'thing'}, [], raise_exception=True) assert str(exc.value) == "No identifying properties were specified." person_named_fred = {'age': 33, 'name': 'Fred', 'favorite-color': 'yellow'} - assert SchemaManager.identifying_value(person_named_fred, ['uuid', 'name']) == 'Fred' + assert SchemaManager.identifying_value(person_named_fred, ['uuid', 'name'], raise_exception=True) == 'Fred' person_nicknamed_fred = {'age': 33, 'nickname': 'Fred', 'favorite-color': 'yellow'} with pytest.raises(ValueError) as exc: - SchemaManager.identifying_value(person_nicknamed_fred, ['uuid', 'name']) + SchemaManager.identifying_value(person_nicknamed_fred, ['uuid', 'name'], raise_exception=True) assert str(exc.value) == ("""There are no identifying properties 'uuid' or 'name'""" """ in {"age": 33, "nickname": "Fred", "favorite-color": "yellow"}.""") with pytest.raises(ValueError) as exc: - SchemaManager.identifying_value(person_nicknamed_fred, ['name']) + SchemaManager.identifying_value(person_nicknamed_fred, ['name'], raise_exception=True) assert str(exc.value) == ("""There is no identifying property 'name'""" """ in {"age": 33, "nickname": "Fred", "favorite-color": "yellow"}.""")
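
A quick usage sketch (illustrative only, not part of the patch) of the new misc_utils helpers added in this diff; the expected values follow the tests added in test_misc_utils.py above:

    from dcicutils.misc_utils import merge_objects, remove_empty_properties, right_trim, split_string

    # split_string: split on a delimiter, honoring an optional escape character.
    assert split_string(r"abc\|def|ghi", "|", "\\") == ["abc|def", "ghi"]

    # right_trim: drops trailing None (or empty string) values; returns a new
    # list/tuple and does not modify its argument.
    assert right_trim(["a", "b", None, "", None]) == ["a", "b"]

    # merge_objects: merges source into target IN PLACE (and returns target);
    # with full=False, extra target elements are left alone.
    target = {"x": [None, None]}
    assert merge_objects(target, {"x": [{"foo": 1}]}) == {"x": [{"foo": 1}, None]}

    # remove_empty_properties: recursively deletes None/""/{}/[] values, in place.
    data = {"a": "", "b": {"c": None, "d": 1}}
    remove_empty_properties(data)
    assert data == {"b": {"d": 1}}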
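
And a minimal sketch of the new dcicutils.data_readers module; the workbook built here (the "people.xlsx" file and its Person sheet) is hypothetical, just to keep the example self-contained. Each row comes back as a dict keyed by the header row, with cell values normalized to stripped strings:

    import openpyxl
    from dcicutils.data_readers import Excel

    # Build a throwaway workbook to read back (hypothetical example data).
    workbook = openpyxl.Workbook()
    sheet = workbook.active
    sheet.title = "Person"
    sheet.append(["name", "age"])
    sheet.append(["alice", 23])
    sheet.append(["bob", 45])
    workbook.save("people.xlsx")

    excel = Excel("people.xlsx")
    for sheet_name in excel.sheet_names:
        reader = excel.sheet_reader(sheet_name)
        for row in reader:
            print(sheet_name, row)  # e.g. Person {'name': 'alice', 'age': '23'}
        print(reader.issues)  # None, or warnings (empty header columns, extra row values).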