From d65d7e5f827656d7e4f3b3108dcc85b371c64388 Mon Sep 17 00:00:00 2001 From: Kori Kuzma Date: Wed, 17 Jul 2024 09:12:31 -0400 Subject: [PATCH 1/2] refactor: remove unused code + small cleanup * Only focused on ga4gh.core + Pydantic models --- src/ga4gh/core/__init__.py | 7 +--- src/ga4gh/core/enderef.py | 10 ++--- src/ga4gh/core/identifiers.py | 71 +++++++---------------------------- src/ga4gh/core/pydantic.py | 18 --------- src/ga4gh/vrs/models.py | 7 +--- 5 files changed, 20 insertions(+), 93 deletions(-) diff --git a/src/ga4gh/core/__init__.py b/src/ga4gh/core/__init__.py index cb19f47d..3941f2e2 100644 --- a/src/ga4gh/core/__init__.py +++ b/src/ga4gh/core/__init__.py @@ -8,12 +8,12 @@ from .enderef import ga4gh_enref, ga4gh_deref from .identifiers import ( ga4gh_digest, ga4gh_identify, ga4gh_serialize, is_ga4gh_identifier, - parse_ga4gh_identifier, VrsObjectIdentifierIs, use_ga4gh_compute_identifier_when, + VrsObjectIdentifierIs, use_ga4gh_compute_identifier_when, CURIE_NAMESPACE, CURIE_SEP, GA4GH_PREFIX_SEP, GA4GH_IR_REGEXP, GA4GH_DIGEST_REGEXP, PrevVrsVersion ) from .pydantic import ( - is_pydantic_instance, is_curie_type, is_ga4gh_identifiable, is_literal, pydantic_copy + is_pydantic_instance, is_curie_type, pydantic_copy ) from . 
import entity_models, domain_models @@ -25,7 +25,6 @@ "ga4gh_identify", "ga4gh_serialize", "is_ga4gh_identifier", - "parse_ga4gh_identifier", "VrsObjectIdentifierIs", "use_ga4gh_compute_identifier_when", "CURIE_NAMESPACE", @@ -36,8 +35,6 @@ "PrevVrsVersion", "is_pydantic_instance", "is_curie_type", - "is_ga4gh_identifiable", - "is_literal", "pydantic_copy", "entity_models", "domain_models" diff --git a/src/ga4gh/core/enderef.py b/src/ga4gh/core/enderef.py index d7d4c076..7808d2b0 100644 --- a/src/ga4gh/core/enderef.py +++ b/src/ga4gh/core/enderef.py @@ -13,9 +13,7 @@ from .identifiers import ga4gh_identify, is_ga4gh_identifier from .pydantic import ( is_pydantic_instance, - is_list, is_curie_type, - is_ga4gh_identifiable, get_pydantic_root, pydantic_copy) @@ -45,7 +43,7 @@ def _enref(o): ref_att_names = cra_map.get(o.type, []) for ran in ref_att_names: v = getattr(o, ran) - if is_list(v): + if isinstance(v, list): setattr(o, ran, [_enref(o2) for o2 in v]) elif isinstance(v, str): pass @@ -60,7 +58,7 @@ def _enref(o): if not is_pydantic_instance(o): raise ValueError("Called ga4gh_enref() with non-pydantic instance") - if not is_ga4gh_identifiable(o): + if not o.is_ga4gh_identifiable(): raise ValueError("Called ga4gh_enref() with non-identifiable object") # in-place replacement on object copy @@ -88,7 +86,7 @@ def _deref(o): ref_att_names = cra_map[o.type] for ran in ref_att_names: v = getattr(o, ran) - if is_list(v): + if isinstance(v, list): setattr(o, ran, [_deref(object_store[str(curie)]) for curie in v]) elif is_ga4gh_identifier(v): v = get_pydantic_root(v) @@ -101,7 +99,7 @@ def _deref(o): if not is_pydantic_instance(o): raise ValueError("Called ga4gh_deref() with non-pydantic instance") - if not is_ga4gh_identifiable(o): + if not o.is_ga4gh_identifiable(): raise ValueError("Called ga4gh_deref() with non-identifiable object") # in-place replacement on object copy diff --git a/src/ga4gh/core/identifiers.py b/src/ga4gh/core/identifiers.py index 4503af88..83070a70 
100644 --- a/src/ga4gh/core/identifiers.py +++ b/src/ga4gh/core/identifiers.py @@ -16,19 +16,16 @@ """ import contextvars -import logging import re from contextlib import ContextDecorator from enum import Enum, IntEnum -from typing import Union, Optional -from pydantic import BaseModel, RootModel +from typing import Optional +from pydantic import BaseModel -from .pydantic import get_pydantic_root +from ga4gh.core.pydantic import get_pydantic_root __all__ = "ga4gh_digest ga4gh_identify ga4gh_serialize is_ga4gh_identifier parse_ga4gh_identifier".split() -_logger = logging.getLogger(__name__) - CURIE_NAMESPACE = "ga4gh" CURIE_SEP = ":" GA4GH_PREFIX_SEP = "." @@ -36,7 +33,7 @@ GA4GH_IR_REGEXP = re.compile(r"^ga4gh:(?P<type>[^.]+)\.(?P<digest>[0-9A-Za-z_\-]{32})$") GA4GH_DIGEST_REGEXP = re.compile(r"^[0-9A-Za-z_\-]{32}$") -ns_w_sep = CURIE_NAMESPACE + CURIE_SEP +NS_W_SEP = f"{CURIE_NAMESPACE}{CURIE_SEP}" class VrsObjectIdentifierIs(IntEnum): @@ -47,9 +44,9 @@ class VrsObjectIdentifierIs(IntEnum): GA4GH_INVALID - Compute the identifier if it is missing or is present but syntactically invalid MISSING - Only compute the identifier if missing - The default behavior is safe and ensures that the identifiers are correct, - but at a performance cost. Where the source of inputs to `ga4gh_identify` - are well controlled, for example when annotating a VCF file with VRS IDs, + The default behavior is safe and ensures that the identifiers are correct, + but at a performance cost. Where the source of inputs to `ga4gh_identify` + are well controlled, for example when annotating a VCF file with VRS IDs, using `MISSING` can improve performance. 
""" @@ -113,27 +110,7 @@ def is_ga4gh_identifier(ir): False """ - return str(get_pydantic_root(ir)).startswith(ns_w_sep) - - -def parse_ga4gh_identifier(ir): - """ - Parses a GA4GH identifier, returning a dict with type and digest components - - >>> parse_ga4gh_identifier("ga4gh:SQ.0123abcd") - {'type': 'SQ', 'digest': '0123abcd'} - - >>> parse_ga4gh_identifier("notga4gh:SQ.0123abcd") - Traceback (most recent call last): - ... - ValueError: notga4gh:SQ.0123abcd - - """ - - try: - return GA4GH_IR_REGEXP.match(str(ir)).groupdict() - except AttributeError as e: - raise ValueError(ir) from e + return str(get_pydantic_root(ir)).startswith(NS_W_SEP) def ga4gh_identify(vro, in_place: str = 'default', as_version: PrevVrsVersion | None = None) -> str | None: @@ -171,7 +148,7 @@ def ga4gh_identify(vro, in_place: str = 'default', as_version: PrevVrsVersion | obj_id = getattr(vro, "id", None) if when_rule == VrsObjectIdentifierIs.MISSING: do_compute = obj_id is None or obj_id == "" - else: # GA4GHComputeIdentifierIs.GA4GH_INVALID + else: # VrsObjectIdentifierIs.GA4GH_INVALID do_compute = not vro.has_valid_ga4gh_id() if do_compute: @@ -182,9 +159,11 @@ def ga4gh_identify(vro, in_place: str = 'default', as_version: PrevVrsVersion | return None -def ga4gh_digest(vro: BaseModel, overwrite: bool = False, as_version: PrevVrsVersion | None = None) -> str: +def ga4gh_digest(vro: BaseModel, overwrite: bool = False, as_version: PrevVrsVersion | None = None) -> str | None: """Return the GA4GH digest for the object. + Only GA4GH identifiable objects are GA4GH digestible. + If ``as_version`` is provided, other parameters are ignored and a digest is returned following the conventions of the VRS version indicated by ``as_version_``. Raises ``ValueError`` if ``as_version`` is not a ``PrevVrsVersion``. 
@@ -197,7 +176,7 @@ def ga4gh_digest(vro: BaseModel, overwrite: bool = False, as_version: PrevVrsVer """ PrevVrsVersion.validate(as_version) - if vro.is_ga4gh_identifiable(): # Only GA4GH identifiable objects are GA4GH digestible + if vro.is_ga4gh_identifiable(): if as_version is None: return vro.get_or_create_digest(overwrite) else: @@ -206,30 +185,6 @@ def ga4gh_digest(vro: BaseModel, overwrite: bool = False, as_version: PrevVrsVer return None -def replace_with_digest(val: dict) -> Union[str, dict]: - """ - If val has a digest computed, return it, else return val - """ - if isinstance(val, dict) and val.get("digest", None) is not None: - return val["digest"] - return val - - -def collapse_identifiable_values(obj: dict) -> dict: - """ - Replaces dict values with their digests if they are defined. - Does not collapse the top level object, only objects it contains. - """ - if isinstance(obj, dict): - obj = { - k: replace_with_digest(collapse_identifiable_values(obj[k])) - for k in obj.keys() - } - elif isinstance(obj, list) or isinstance(obj, set): - obj = [replace_with_digest(collapse_identifiable_values(elem)) for elem in obj] - return obj - - def ga4gh_serialize(obj: BaseModel, as_version: PrevVrsVersion | None = None) -> Optional[bytes]: """Serializes an object for use in computed digest computation. diff --git a/src/ga4gh/core/pydantic.py b/src/ga4gh/core/pydantic.py index 57e4f373..cec242c6 100644 --- a/src/ga4gh/core/pydantic.py +++ b/src/ga4gh/core/pydantic.py @@ -21,24 +21,6 @@ def getattr_in(obj, names) -> Any: return v -def is_ga4gh_identifiable(o: Any) -> bool: - """ - Determine if object is a GA4GH identifiable type. - - :param o: Object - :return: `True` if `o` is a GA4GH Identifiable Object. `False` otherwise. 
- """ - return o.is_ga4gh_identifiable() - - -def is_literal(o: Any) -> bool: - return isinstance(o, (str, int, float, complex, bool)) - - -def is_list(o: Any) -> bool: - return isinstance(o, list) - - def is_curie_type(o: Any) -> bool: """ Returns true if the object is a str-like matching the CURIE pattern. diff --git a/src/ga4gh/vrs/models.py b/src/ga4gh/vrs/models.py index 79d4e572..da0b3911 100644 --- a/src/ga4gh/vrs/models.py +++ b/src/ga4gh/vrs/models.py @@ -28,7 +28,6 @@ from pydantic import BaseModel, Field, RootModel, StringConstraints, model_serializer from ga4gh.core.pydantic import ( - is_ga4gh_identifiable, getattr_in ) from ga4gh.core.entity_models import IRI, Expression, _DomainEntity @@ -43,7 +42,6 @@ def is_coll(thing): Return True if the thing looks like a collection. This is not exhaustive, do not use in general. """ - # return hasattr(thing, '__iter__') and not isinstance(thing, str) and not inspect.isclass(thing) return type(thing) in [list, set] if is_coll(vals): for x in vals: @@ -93,7 +91,7 @@ def pydantic_class_refatt_map(): # Types directly reffable reffable_classes = list(filter( lambda c: ('id' in c.model_fields - and is_ga4gh_identifiable(c)), + and c.is_ga4gh_identifiable()), model_classes )) # Types reffable because they are a union of reffable types @@ -234,9 +232,6 @@ def is_ga4gh_identifiable(): def has_valid_ga4gh_id(self): return self.id and GA4GH_IR_REGEXP.match(self.id) is not None - def has_valid_digest(self): - return bool(self.digest) # Pydantic constraint ensures digest field value is valid - def compute_digest(self, store=True, as_version: PrevVrsVersion | None = None) -> str: """A sha512t24u digest created using the VRS Computed Identifier algorithm. 
From 740e4e6c8dd11872371d53f13ced8d4105236c8d Mon Sep 17 00:00:00 2001 From: Kori Kuzma Date: Wed, 17 Jul 2024 09:17:57 -0400 Subject: [PATCH 2/2] rm parse_ga4gh_identifier --- src/ga4gh/core/identifiers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ga4gh/core/identifiers.py b/src/ga4gh/core/identifiers.py index 83070a70..85ea214a 100644 --- a/src/ga4gh/core/identifiers.py +++ b/src/ga4gh/core/identifiers.py @@ -24,7 +24,7 @@ from ga4gh.core.pydantic import get_pydantic_root -__all__ = "ga4gh_digest ga4gh_identify ga4gh_serialize is_ga4gh_identifier parse_ga4gh_identifier".split() +__all__ = "ga4gh_digest ga4gh_identify ga4gh_serialize is_ga4gh_identifier".split() CURIE_NAMESPACE = "ga4gh" CURIE_SEP = ":"