Skip to content

Commit

Permalink
feat: add enum for as_version + add restrictions to `ga4gh_serializ…
Browse files Browse the repository at this point in the history
…e_as_version` (#431)

Building off of #382 and #427

* Add `PrevVrsVersion` enum to store the previous versions of VRS that are
supported for computing digests/identifiers
* Updates function signatures + docstrings for ga4gh
digest/serialize/identifier
* Adds restrictions to `ga4gh_serialize_as_version`
* For `SequenceLocation`: `sequenceReference` must be provided and must
be a valid `SequenceReference` obj
* For `Allele`: Only `LiteralSequenceExpression` and
`ReferenceLengthExpression` are supported and must provide a `sequence`
nonnull attribute.
  • Loading branch information
korikuzma authored Jul 17, 2024
1 parent 3347873 commit 8c01d00
Show file tree
Hide file tree
Showing 4 changed files with 135 additions and 60 deletions.
4 changes: 3 additions & 1 deletion src/ga4gh/core/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,8 @@
from .identifiers import (
ga4gh_digest, ga4gh_identify, ga4gh_serialize, is_ga4gh_identifier,
parse_ga4gh_identifier, VrsObjectIdentifierIs, use_ga4gh_compute_identifier_when,
CURIE_NAMESPACE, CURIE_SEP, GA4GH_PREFIX_SEP, GA4GH_IR_REGEXP, GA4GH_DIGEST_REGEXP
CURIE_NAMESPACE, CURIE_SEP, GA4GH_PREFIX_SEP, GA4GH_IR_REGEXP, GA4GH_DIGEST_REGEXP,
PrevVrsVersion
)
from .pydantic import (
is_pydantic_instance, is_curie_type, is_ga4gh_identifiable, is_literal, pydantic_copy
Expand All @@ -32,6 +33,7 @@
"GA4GH_PREFIX_SEP",
"GA4GH_IR_REGEXP",
"GA4GH_DIGEST_REGEXP",
"PrevVrsVersion",
"is_pydantic_instance",
"is_curie_type",
"is_ga4gh_identifiable",
Expand Down
71 changes: 41 additions & 30 deletions src/ga4gh/core/identifiers.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
import logging
import re
from contextlib import ContextDecorator
from enum import IntEnum
from enum import Enum, IntEnum
from typing import Union, Optional
from pydantic import BaseModel, RootModel

Expand Down Expand Up @@ -58,6 +58,20 @@ class VrsObjectIdentifierIs(IntEnum):
MISSING = 2


class PrevVrsVersion(str, Enum):
    """Previous VRS versions for which digests and identifiers can still be
    computed from the current VRS model.
    """

    V1_3 = "1.3"

    @classmethod
    def validate(cls, version):
        """Check that ``version`` is ``None`` or matches a supported previous version.

        Because members are ``str`` subclasses, a plain string equal to a
        member's value (e.g. ``"1.3"``) is accepted as well as the member itself.

        :param version: ``None``, a ``PrevVrsVersion`` member, or an equal value
        :raises ValueError: If ``version`` is not ``None`` and equals no member
        """
        # ``None`` means "use the current VRS version"; nothing to validate.
        if version is None:
            return

        supported = cls.__members__.values()
        if version in supported:
            return

        raise ValueError(f"Expected `PrevVrsVersion`, but got {version}")


ga4gh_compute_identifier_when = contextvars.ContextVar("ga4gh_compute_identifier_when")


Expand Down Expand Up @@ -122,9 +136,8 @@ def parse_ga4gh_identifier(ir):
raise ValueError(ir) from e


def ga4gh_identify(vro, in_place='default', as_version=None):
"""
Return the GA4GH digest-based id for the object, as a CURIE
def ga4gh_identify(vro, in_place: str = 'default', as_version: PrevVrsVersion | None = None) -> str | None:
"""Return the GA4GH digest-based id for the object, as a CURIE
(string). Returns None if object is not identifiable.
This function has three options for in_place editing of vro.id:
Expand All @@ -137,18 +150,18 @@ def ga4gh_identify(vro, in_place='default', as_version=None):
- 'never': the vro.id field will not be edited in-place,
even when empty
If 'as_version' is set to a version string, other parameters are
ignored and an identifier returned following the conventions of
the VRS version indicated by 'as_version'.
If ``as_version`` is provided, other parameters are ignored and an identifier is
returned following the conventions of the VRS version indicated by ``as_version``.
Raises ``ValueError`` if ``as_version`` is not a ``PrevVrsVersion``.
TODO update example for VRS 2.0
>>> from ga4gh.core import ga4gh_identify
>>> import ga4gh.vrs
>>> ival = ga4gh.vrs.models.SimpleInterval(start=44908821, end=44908822)
>>> location = ga4gh.vrs.models.Location(sequence_id="ga4gh:SQ.IIB53T8CNeJJdUqzn9V_JnRtQadwWCbl", interval=ival)
>>> location = ga4gh.vrs.models.SequenceLocation(start=44908821, end=44908822, sequenceReference=ga4gh.vrs.models.SequenceReference(refgetAccession="SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul"))
>>> ga4gh_identify(location)
'ga4gh:VSL.u5fspwVbQ79QkX6GHLF8tXPCAXFJqRPx'
'ga4gh:SL.4t6JnYWqHwYw9WzBT_lmWBb3tLQNalkT'
"""
PrevVrsVersion.validate(as_version)

if vro.is_ga4gh_identifiable():
when_rule = ga4gh_compute_identifier_when.get(VrsObjectIdentifierIs.ANY)
obj_id = None
Expand All @@ -169,23 +182,21 @@ def ga4gh_identify(vro, in_place='default', as_version=None):
return None


def ga4gh_digest(vro: BaseModel, overwrite=False, as_version=None):
"""
Return the GA4GH digest for the object.
If 'as_version' is set to a version string, other parameters
are ignored and a digest returned following the conventions of
the VRS version indicated by 'as_version'.
def ga4gh_digest(vro: BaseModel, overwrite: bool = False, as_version: PrevVrsVersion | None = None) -> str:
"""Return the GA4GH digest for the object.
TODO update example
If ``as_version`` is provided, other parameters are ignored and a digest is returned
following the conventions of the VRS version indicated by ``as_version``.
Raises ``ValueError`` if ``as_version`` is not a ``PrevVrsVersion``.
>>> from ga4gh.core import ga4gh_digest
>>> import ga4gh.vrs
>>> ival = ga4gh.vrs.models.SimpleInterval(start=44908821, end=44908822)
>>> location = ga4gh.vrs.models.Location(sequence_id="ga4gh:SQ.IIB53T8CNeJJdUqzn9V_JnRtQadwWCbl", interval=ival)
>>> location = ga4gh.vrs.models.SequenceLocation(start=44908821, end=44908822, sequenceReference=ga4gh.vrs.models.SequenceReference(refgetAccession="SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul"))
>>> ga4gh_digest(location)
'u5fspwVbQ79QkX6GHLF8tXPCAXFJqRPx'
'4t6JnYWqHwYw9WzBT_lmWBb3tLQNalkT'
"""
PrevVrsVersion.validate(as_version)

if vro.is_ga4gh_identifiable(): # Only GA4GH identifiable objects are GA4GH digestible
if as_version is None:
return vro.get_or_create_digest(overwrite)
Expand Down Expand Up @@ -219,14 +230,14 @@ def collapse_identifiable_values(obj: dict) -> dict:
return obj


def ga4gh_serialize(obj: BaseModel, as_version=None) -> Optional[bytes]:
"""
Serializes an object for use in computed digest computation.
def ga4gh_serialize(obj: BaseModel, as_version: PrevVrsVersion | None = None) -> Optional[bytes]:
"""Serializes an object for use in computed digest computation.
If a VRS version string is specified for the 'as_version' parameter,
the returned serialization follows the convention of the specified
VRS version.
If ``as_version`` is provided, the returned serialization follows
the conventions of the VRS version indicated by ``as_version``.
"""
PrevVrsVersion.validate(as_version)

if as_version is None:
return obj.model_dump_json().encode("utf-8")
else:
Expand Down
76 changes: 51 additions & 25 deletions src/ga4gh/vrs/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,14 @@
from enum import Enum
import inspect
import sys
from ga4gh.core import sha512t24u, GA4GH_PREFIX_SEP, CURIE_SEP, CURIE_NAMESPACE, GA4GH_IR_REGEXP
from ga4gh.core import (
sha512t24u,
GA4GH_PREFIX_SEP,
CURIE_SEP,
CURIE_NAMESPACE,
GA4GH_IR_REGEXP,
PrevVrsVersion
)
from ga4gh.core.pydantic import get_pydantic_root

from pydantic import BaseModel, Field, RootModel, StringConstraints, model_serializer
Expand Down Expand Up @@ -230,11 +237,13 @@ def has_valid_ga4gh_id(self):
def has_valid_digest(self):
return bool(self.digest) # Pydantic constraint ensures digest field value is valid

def compute_digest(self, store=True, as_version=None) -> str:
def compute_digest(self, store=True, as_version: PrevVrsVersion | None = None) -> str:
"""A sha512t24u digest created using the VRS Computed Identifier algorithm.
Stores the digest in the object if store is True. If 'as_version' is set to
a version string, other parameters are ignored and a digest returned
following the conventions of the VRS version indicated by 'as_version'.
Stores the digest in the object if ``store`` is ``True``.
If ``as_version`` is provided, other parameters are ignored and a digest is
returned following the conventions of the VRS version indicated by ``as_version``.
"""
if as_version is None:
digest = sha512t24u(self.model_dump_json().encode("utf-8"))
Expand Down Expand Up @@ -262,9 +271,9 @@ def get_or_create_ga4gh_identifier(self, in_place='default', recompute=False, as
Digests will be recalculated even if present if recompute is True.
If 'as_version' is set to a version string, other parameters are
ignored and an identifier returned following the conventions of
the VRS version indicated by 'as_version'.
If ``as_version`` is provided, other parameters are ignored and an identifier is
returned following the conventions of the VRS version indicated by
``as_version``.
"""
if as_version is not None:
return self.compute_ga4gh_identifier(as_version=as_version)
Expand All @@ -287,9 +296,9 @@ def get_or_create_ga4gh_identifier(self, in_place='default', recompute=False, as
def compute_ga4gh_identifier(self, recompute=False, as_version=None):
"""Returns a GA4GH Computed Identifier.
If 'as_version' is set to a version string, other parameters are
ignored and a computed identifier returned following the conventions
of the VRS version indicated by 'as_version'.
If ``as_version`` is provided, other parameters are ignored and a computed
identifier is returned following the conventions of the VRS version indicated by
``as_version``.
"""
if as_version is None:
self.get_or_create_digest(recompute)
Expand Down Expand Up @@ -456,12 +465,21 @@ class SequenceLocation(_Ga4ghIdentifiableObject):
)
sequence: Optional[SequenceString] = Field(None, description="The literal sequence encoded by the `sequenceReference` at these coordinates.")

def ga4gh_serialize_as_version(self, as_version):
def ga4gh_serialize_as_version(self, as_version: PrevVrsVersion):
"""This method will return a serialized string following the conventions for
SequenceLocation serialization as defined in the VRS version specified by 'as_version`."""
if as_version == '1.3':
out = list()
for value in [self.start,self.end]:
SequenceLocation serialization as defined in the VRS version specified by
``as_version``.
:raises ValueError: If ``sequenceReference`` is not a ``SequenceReference``
object; ``start`` or ``end`` are not an int or list.
"""
if as_version == PrevVrsVersion.V1_3:
if not isinstance(self.sequenceReference, SequenceReference):
err_msg = "Must provide `sequenceReference` and it must be a valid `SequenceReference`"
raise ValueError(err_msg)

out = []
for value in [self.start, self.end]:
value = get_pydantic_root(value)
if isinstance(value, int):
result = f'{{"type":"Number","value":{value}}}'
Expand All @@ -476,8 +494,6 @@ def ga4gh_serialize_as_version(self, as_version):
raise ValueError(f'{value} is not int or list.')
out.append(result)
return f'{{"interval":{{"end":{out[1]},"start":{out[0]},"type":"SequenceInterval"}},"sequence_id":"{self.sequenceReference.refgetAccession.split(".")[1]}","type":"SequenceLocation"}}'
else:
raise ValueError(f'Serializing as version {as_version} not supported for this class.')

def get_refget_accession(self):
if isinstance(self.sequenceReference, SequenceReference):
Expand All @@ -489,7 +505,7 @@ def get_refget_accession(self):

class ga4gh(_Ga4ghIdentifiableObject.ga4gh):
prefix = 'SL'
priorPrefix = {'1.3': 'VSL'}
priorPrefix = {PrevVrsVersion.V1_3.value: 'VSL'}
keys = [
'end',
'sequenceReference',
Expand Down Expand Up @@ -523,21 +539,31 @@ class Allele(_VariationBase):
..., description='An expression of the sequence state'
)

def ga4gh_serialize_as_version(self, as_version):
def ga4gh_serialize_as_version(self, as_version: PrevVrsVersion):
"""This method will return a serialized string following the conventions for
Allele serialization as defined in the VRS version specified by 'as_version`."""
Allele serialization as defined in the VRS version specified by ``as_version``.
:raises ValueError: If ``state`` is not a ``LiteralSequenceExpression`` or
``ReferenceLengthExpression``; ``state.sequence`` is null.
"""
location_digest = self.location.compute_digest(as_version=as_version)

if not isinstance(self.state, (LiteralSequenceExpression, ReferenceLengthExpression)):
err_msg = "Only `LiteralSequenceExpression` and `ReferenceLengthExpression` are supported for previous versions of VRS"
raise ValueError(err_msg)

sequence = get_pydantic_root(self.state.sequence)

if sequence is None:
raise ValueError('State sequence attribute must be defined.')
if as_version == '1.3':

if as_version == PrevVrsVersion.V1_3:
return f'{{"location":"{location_digest}","state":{{"sequence":"{sequence}","type":"LiteralSequenceExpression"}},"type":"Allele"}}'
else:
raise ValueError(f'Serializing as version {as_version} not supported for this class.')


class ga4gh(_Ga4ghIdentifiableObject.ga4gh):
prefix = 'VA'
priorPrefix = {'1.3': 'VA'}
priorPrefix = {PrevVrsVersion.V1_3.value: 'VA'}
keys = [
'location',
'state',
Expand Down
44 changes: 40 additions & 4 deletions tests/validation/test_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,19 +7,19 @@
import pytest
import yaml

from ga4gh.core import ga4gh_serialize, ga4gh_digest, ga4gh_identify
from ga4gh.core import ga4gh_serialize, ga4gh_digest, ga4gh_identify, PrevVrsVersion, entity_models
from ga4gh.vrs import models

def ga4gh_1_3_identify(*args, **kwargs):
kwargs['as_version'] = '1.3'
kwargs['as_version'] = PrevVrsVersion.V1_3
return ga4gh_identify(*args, **kwargs)

def ga4gh_1_3_digest(*args, **kwargs):
kwargs['as_version'] = '1.3'
kwargs['as_version'] = PrevVrsVersion.V1_3
return ga4gh_digest(*args, **kwargs)

def ga4gh_1_3_serialize(*args, **kwargs):
kwargs['as_version'] = '1.3'
kwargs['as_version'] = PrevVrsVersion.V1_3
return ga4gh_serialize(*args, **kwargs)

fxs = {
Expand Down Expand Up @@ -60,3 +60,39 @@ def test_validation(cls, data, fn, exp):
o = getattr(models, cls)(**data)
fx = fxs[fn]
assert fx(o) == exp


def test_prev_vrs_version():
    """Ensure that support for previous VRS digests/identifiers works correctly.

    Covers three things for each of ``ga4gh_identify``, ``ga4gh_digest`` and
    ``ga4gh_serialize``:

    * an unsupported ``as_version`` value raises ``ValueError``;
    * a ``SequenceLocation`` without a ``SequenceReference`` object raises
      ``ValueError``;
    * an ``Allele`` whose state is not a ``LiteralSequenceExpression`` or
      ``ReferenceLengthExpression``, or whose state has no ``sequence``, raises
      ``ValueError``.
    """
    loc = models.SequenceLocation(start=44908821, end=44908822, sequenceReference=models.SequenceReference(refgetAccession="SQ.IIB53T8CNeJJdUqzn9V_JnRtQadwWCbl"))

    # string representation should work as well
    ga4gh_identify(loc, as_version="1.3")

    invalid_vrs_version = "0.0"
    invalid_vrs_version_msg = f"Expected `PrevVrsVersion`, but got {invalid_vrs_version}"

    loc_no_seq_ref = models.SequenceLocation(start=44908821, end=44908822)
    loc_iri = models.SequenceLocation(start=44908821, end=44908822, sequenceReference=entity_models.IRI("sequenceReferences.json#example1"))
    allele_rle_no_seq = models.Allele(location=loc, state=models.ReferenceLengthExpression(length=11, repeatSubunitLength=3))
    allele_le = models.Allele(location=loc, state=models.LengthExpression(length=2))
    loc_seq_ref_msg = "Must provide `sequenceReference` and it must be a valid `SequenceReference`"
    for ga4gh_func in [ga4gh_identify, ga4gh_digest, ga4gh_serialize]:
        # Pass the invalid *version*, not the expected error message: passing the
        # message only matched by accident because ``match`` uses ``re.search``.
        with pytest.raises(ValueError, match=invalid_vrs_version_msg):
            ga4gh_func(loc, as_version=invalid_vrs_version)

        with pytest.raises(ValueError, match=loc_seq_ref_msg):
            ga4gh_func(loc_no_seq_ref, as_version=PrevVrsVersion.V1_3)

        with pytest.raises(ValueError, match=loc_seq_ref_msg):
            ga4gh_func(loc_iri, as_version=PrevVrsVersion.V1_3)

        with pytest.raises(ValueError, match="State sequence attribute must be defined."):
            ga4gh_func(allele_rle_no_seq, as_version=PrevVrsVersion.V1_3)

        # Once a sequence is supplied, the same RLE-based allele must succeed.
        allele_rlse_seq = allele_rle_no_seq.model_copy(deep=True)
        allele_rlse_seq.state.sequence = "C"
        assert ga4gh_func(allele_rlse_seq, as_version=PrevVrsVersion.V1_3)

        with pytest.raises(ValueError, match="Only `LiteralSequenceExpression` and `ReferenceLengthExpression` are supported for previous versions of VRS"):
            ga4gh_func(allele_le, as_version=PrevVrsVersion.V1_3)

0 comments on commit 8c01d00

Please sign in to comment.