Skip to content

Commit

Permalink
feat!: update gks-common / vrs models (#453)
Browse files Browse the repository at this point in the history
close #452
  • Loading branch information
korikuzma authored Oct 30, 2024
1 parent 213b3aa commit 08e76ef
Show file tree
Hide file tree
Showing 6 changed files with 479 additions and 156 deletions.
15 changes: 2 additions & 13 deletions src/ga4gh/core/domain_models.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,4 @@
"""GKS Common Library Domain Entity models
**This module should not be imported directly.**
Instead, users should use one of the following:
* `from ga4gh.core import domain_models`, and refer to models with the
abbreviated name, e.g., `domain_models.Gene` (recommended)
* `import ga4gh.core`, and refer to models using the fully-qualified
module name, e.g., `ga4gh.core.domain_models.Gene`
"""
"""GKS Common Library Domain Entity models"""
from enum import Enum
from typing import Literal, Union, List

Expand Down Expand Up @@ -138,5 +127,5 @@ class Gene(DomainEntity):

type: Literal["Gene"] = Field(
CommonDomainType.GENE.value,
description=f'MUST be "{CommonDomainType.GENE.value}".'
description=f'MUST be "{CommonDomainType.GENE.value}"'
)
316 changes: 265 additions & 51 deletions src/ga4gh/core/entity_models.py

Large diffs are not rendered by default.

89 changes: 63 additions & 26 deletions src/ga4gh/vrs/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@
from ga4gh.core.pydantic import (
getattr_in
)
from ga4gh.core.entity_models import IRI, Expression, Entity
from ga4gh.core.entity_models import IRI, Entity


def flatten(vals):
Expand Down Expand Up @@ -151,6 +151,13 @@ class VrsType(str, Enum):
CN_CHANGE = "CopyNumberChange"


class Orientation(str, Enum):
"""The orientation of the molecular variation component."""

FORWARD = "forward"
REVERSE_COMPLEMENT = "reverse_complement"


class ResidueAlphabet(str, Enum):
"""The interpretation of the character codes referred to by the refget accession,
where "aa" specifies an amino acid character set, and "na" specifies a nucleic acid
Expand All @@ -174,6 +181,22 @@ class CopyChange(str, Enum):
EFO_0030072 = 'EFO:0030072'


class Syntax(str, Enum):
"""The syntax used to describe the variation. The value should be one of the
supported syntaxes.
"""

HGVS_C = "hgvs.c"
HGVS_P = "hgvs.p"
HGVS_G = "hgvs.g"
HGVS_M = "hgvs.m"
HGVS_N = "hgvs.n"
HGVS_R = "hgvs.r"
HGVS_ISCN = "iscn"
GNOMAD = "gnomad"
SPDI = "spdi"


def _recurse_ga4gh_serialize(obj):
if isinstance(obj, _Ga4ghIdentifiableObject):
return obj.get_or_create_digest()
Expand Down Expand Up @@ -314,6 +337,18 @@ def get_or_create_digest(self, recompute=False) -> str:
class ga4gh(_ValueObject.ga4gh):
prefix: str

class Expression(BaseModel):
"""Representation of a variation by a specified nomenclature or syntax for a
Variation object. Common examples of expressions for the description of molecular
variation include the HGVS and ISCN nomenclatures.
"""

model_config = ConfigDict(use_enum_values=True)

syntax: Syntax = Field(..., description="The syntax used to describe the variation. The value should be one of the supported syntaxes.")
value: str = Field(..., description="The expression of the variation in the specified syntax. The value should be a valid expression in the specified syntax.")
syntax_version: Optional[str] = Field(None, description="The version of the syntax used to describe the variation. This is particularly important for HGVS expressions, as the syntax has evolved over time.")


#########################################
# vrs numerics, comparators, and ranges
Expand Down Expand Up @@ -392,7 +427,7 @@ class LengthExpression(_ValueObject):
type: Literal["LengthExpression"] = Field(
VrsType.LEN_EXPR.value, description=f'MUST be "{VrsType.LEN_EXPR.value}"'
)
length: Optional[Union[Range, int]] = None
length: Optional[Union[Range, int]] = Field(None, description="The length of the sequence.")

class ga4gh(_ValueObject.ga4gh):
keys = [
Expand All @@ -408,13 +443,13 @@ class ReferenceLengthExpression(_ValueObject):
VrsType.REF_LEN_EXPR.value, description=f'MUST be "{VrsType.REF_LEN_EXPR.value}"'
)
length: Union[Range, int] = Field(
..., description='The number of residues of the expressed sequence.'
..., description='The number of residues in the expressed sequence.'
)
sequence: Optional[SequenceString] = Field(
None, description='the `Sequence` encoded by the Reference Length Expression.'
None, description='the literal Sequence encoded by the Reference Length Expression.'
)
repeatSubunitLength: int = Field(
..., description='The number of residues of the repeat subunit.'
..., description='The number of residues in the repeat subunit.'
)

class ga4gh(_ValueObject.ga4gh):
Expand Down Expand Up @@ -452,9 +487,9 @@ class SequenceReference(_ValueObject):
type: Literal["SequenceReference"] = Field(VrsType.SEQ_REF.value, description=f'MUST be "{VrsType.SEQ_REF.value}"')
refgetAccession: Annotated[str, StringConstraints(pattern=r'^SQ.[0-9A-Za-z_\-]{32}$')] = Field(
...,
description='A `GA4GH RefGet <http://samtools.github.io/hts-specs/refget.html>` identifier for the referenced sequence, using the sha512t24u digest.',
description='A [GA4GH RefGet](http://samtools.github.io/hts-specs/refget.html) identifier for the referenced sequence, using the sha512t24u digest.',
)
residueAlphabet: Optional[ResidueAlphabet] = Field(None, description="The interpretation of the character codes referred to by the refget accession, where 'aa' specifies an amino acid character set, and 'na' specifies a nucleic acid character set.")
residueAlphabet: Optional[ResidueAlphabet] = Field(None, description='The interpretation of the character codes referred to by the refget accession, where "aa" specifies an amino acid character set, and "na" specifies a nucleic acid character set.')
circular: Optional[bool] = Field(None, description="A boolean indicating whether the molecule represented by the sequence is circular (true) or linear (false).")

class ga4gh(_ValueObject.ga4gh):
Expand All @@ -469,15 +504,15 @@ class SequenceLocation(_Ga4ghIdentifiableObject):

type: Literal["SequenceLocation"] = Field(VrsType.SEQ_LOC.value, description=f'MUST be "{VrsType.SEQ_LOC.value}"')
sequenceReference: Optional[Union[IRI, SequenceReference]] = Field(
None, description='A reference to a `Sequence` on which the location is defined.'
None, description='A reference to a Sequence on which the location is defined.'
)
start: Optional[Union[Range, int]] = Field(
None,
description='The start coordinate or range of the SequenceLocation. The minimum value of this coordinate or range is 0.',
description='The start coordinate or range of the SequenceLocation. The minimum value of this coordinate or range is 0. MUST represent a coordinate or range less than or equal to the value of `end`.',
)
end: Optional[Union[Range, int]] = Field(
None,
description='The end coordinate or range of the SequenceLocation. The minimum value of this coordinate or range is 0.',
description='The end coordinate or range of the SequenceLocation. The minimum value of this coordinate or range is 0. MUST represent a coordinate or range greater than or equal to the value of `start`.',

)
sequence: Optional[SequenceString] = Field(None, description="The literal sequence encoded by the `sequenceReference` at these coordinates.")
Expand Down Expand Up @@ -614,7 +649,7 @@ class CisPhasedBlock(_VariationBase):
type: Literal["CisPhasedBlock"] = Field(VrsType.CIS_PHASED_BLOCK.value, description=f'MUST be "{VrsType.CIS_PHASED_BLOCK.value}"')
members: List[Union[Allele, IRI]] = Field(
...,
description='A list of `Alleles` that are found in-cis on a shared molecule.',
description='A list of Alleles that are found in-cis on a shared molecule.',
min_length=2,
)
sequenceReference: Optional[SequenceReference] = Field(None, description="An optional Sequence Reference on which all of the in-cis Alleles are found. When defined, this may be used to implicitly define the `sequenceReference` attribute for each of the CisPhasedBlock member Alleles.")
Expand Down Expand Up @@ -643,7 +678,7 @@ class Adjacency(_VariationBase):
potentially with an intervening linker sequence.
"""

type: Literal["Adjacency"] = Field(VrsType.ADJACENCY.value, description=f'MUST be "{VrsType.ADJACENCY.value}"')
type: Literal["Adjacency"] = Field(VrsType.ADJACENCY.value, description=f'MUST be "{VrsType.ADJACENCY.value}".')
adjoinedSequences: List[Union[IRI, SequenceLocation]] = Field(
...,
description="The terminal sequence or pair of adjoined sequences that defines in the adjacency.",
Expand All @@ -654,7 +689,7 @@ class Adjacency(_VariationBase):
None,
description="The sequence found between adjoined sequences."
)
homology: Optional[bool] = Field(None, description="A flag indicating if coordinate ambiguity in the adjoined sequences is from sequence homology (true) or other uncertainty (false).")
homology: Optional[bool] = Field(None, description="A flag indicating if coordinate ambiguity in the adjoined sequences is from sequence homology (true) or other uncertainty, such as instrument ambiguity (false).")

class ga4gh(_Ga4ghIdentifiableObject.ga4gh):
prefix = 'AJ'
Expand All @@ -671,7 +706,7 @@ class Terminus(_VariationBase):
is not allowed and it removes the unnecessary array structure.
"""

type: Literal["Terminus"] = Field(VrsType.TERMINUS.value, description=f'MUST be "{VrsType.TERMINUS.value}"')
type: Literal["Terminus"] = Field(VrsType.TERMINUS.value, description=f'MUST be "{VrsType.TERMINUS.value}".')
location: Union[IRI, SequenceLocation] = Field(..., description="The location of the terminus.")

class ga4gh(_Ga4ghIdentifiableObject.ga4gh):
Expand All @@ -685,17 +720,19 @@ class TraversalBlock(_ValueObject):
"""A component used to describe the orientation of a molecular variation within
a DerivativeMolecule."""

model_config = ConfigDict(use_enum_values=True)

type: Literal["TraversalBlock"] = Field(
VrsType.TRAVERSAL_BLOCK.value, description=f'MUST be "{VrsType.TRAVERSAL_BLOCK.value}"'
VrsType.TRAVERSAL_BLOCK.value, description=f'MUST be "{VrsType.TRAVERSAL_BLOCK.value}".'
)
orientation: Literal["forward", "reverse_complement"] = Field(
...,
description='The orientation of the traversal block, either forward or reverse_complement.'
orientation: Optional[Orientation] = Field(
None,
description='The orientation of the molecular variation component.'
)

component: Union[IRI, Adjacency, Allele, Terminus, CisPhasedBlock] = Field(
...,
description="The component that make up the derivative molecule."
component: Optional[Union[Allele, CisPhasedBlock, Adjacency, Terminus]] = Field(
None,
description="The unoriented molecular variation component."
)

class ga4gh(_ValueObject.ga4gh):
Expand All @@ -710,13 +747,13 @@ class DerivativeMolecule(_VariationBase):
molecule composed from multiple sequence components.
"""

type: Literal["DerivativeMolecule"] = Field(VrsType.DERIVATIVE_MOL.value, description=f'MUST be "{VrsType.DERIVATIVE_MOL.value}"')
components: List[TraversalBlock] = Field(
type: Literal["DerivativeMolecule"] = Field(VrsType.DERIVATIVE_MOL.value, description=f'MUST be "{VrsType.DERIVATIVE_MOL.value}".')
components: List[Union[IRI, TraversalBlock]] = Field(
...,
description="The traversal block components that make up the derivative molecule.",
description="The molecular components that constitute the derivative molecule.",
min_length=2
)
circular: Optional[bool] = Field(None, description="A flag indicating if the derivative molecule is circular (true) or linear (false).")
circular: Optional[bool] = Field(None, description="A boolean indicating whether the molecule represented by the sequence is circular (true) or linear (false).")

class ga4gh(_Ga4ghIdentifiableObject.ga4gh):
prefix = "DM"
Expand Down Expand Up @@ -769,7 +806,7 @@ class CopyNumberChange(_CopyNumber):
type: Literal["CopyNumberChange"] = Field(VrsType.CN_CHANGE.value, description=f'MUST be "{VrsType.CN_CHANGE.value}"')
copyChange: CopyChange = Field(
...,
description='MUST be one of "EFO:0030069" (complete genomic loss), "EFO:0020073" (high-level loss), "EFO:0030068" (low-level loss), "EFO:0030067" (loss), "EFO:0030064" (regional base ploidy), "EFO:0030070" (gain), "EFO:0030071" (low-level gain), "EFO:0030072" (high-level gain).',
description='MUST be a Coding representing one of "EFO:0030069" (complete genomic loss), "EFO:0020073" (high-level loss), "EFO:0030068" (low-level loss), "EFO:0030067" (loss), "EFO:0030064" (regional base ploidy), "EFO:0030070" (gain), "EFO:0030071" (low-level gain), "EFO:0030072" (high-level gain).',
)

class ga4gh(_Ga4ghIdentifiableObject.ga4gh):
Expand Down
2 changes: 1 addition & 1 deletion submodules/vrs
Submodule vrs updated 96 files
+3 −0 .readthedocs.yaml
+1 −1 .requirements.txt
+0 −25 LICENSE
+7 −6 README.md
+47 −42 docs/source/_static/theme_overrides.css
+0 −196 docs/source/appendices/associating_annotations.rst
+0 −208 docs/source/appendices/design_decisions.rst
+0 −128 docs/source/appendices/development_process.rst
+0 −49 docs/source/appendices/equivalence.rst
+0 −54 docs/source/appendices/faq.rst
+0 −117 docs/source/appendices/future_plans.rst
+82 −56 docs/source/appendices/ga4gh_identifiers.rst
+0 −31 docs/source/appendices/getting_involved.rst
+3 −3 docs/source/appendices/glossary.rst
+0 −129 docs/source/appendices/implementations.rst
+0 −113 docs/source/appendices/mm_and_versioning.rst
+0 −135 docs/source/appendices/relationships.rst
+18 −19 docs/source/appendices/truncated_digest_collision_analysis.rst
+0 −8 docs/source/concepts/AdditionalDataTypes.rst
+6 −0 docs/source/concepts/AdditionalDataTypes/Code.rst
+6 −0 docs/source/concepts/AdditionalDataTypes/Coding.rst
+6 −0 docs/source/concepts/AdditionalDataTypes/Entity.rst
+6 −0 docs/source/concepts/AdditionalDataTypes/Extension.rst
+6 −0 docs/source/concepts/AdditionalDataTypes/IRI.rst
+99 −0 docs/source/concepts/AdditionalDataTypes/index.rst
+15 −15 docs/source/concepts/LocationAndReference/SequenceLocation.rst
+9 −9 docs/source/concepts/LocationAndReference/SequenceReference.rst
+11 −0 docs/source/concepts/LocationAndReference/index.rst
+7 −7 docs/source/concepts/MolecularVariation/Adjacency.rst
+8 −8 docs/source/concepts/MolecularVariation/Allele.rst
+12 −9 docs/source/concepts/MolecularVariation/CisPhasedBlock.rst
+13 −11 docs/source/concepts/MolecularVariation/DerivativeMolecule.rst
+1 −1 docs/source/concepts/MolecularVariation/Terminus.rst
+5 −6 docs/source/concepts/MolecularVariation/index.rst
+8 −10 docs/source/concepts/SequenceExpression/LengthExpression.rst
+4 −9 docs/source/concepts/SequenceExpression/LiteralSequenceExpression.rst
+16 −9 docs/source/concepts/SequenceExpression/ReferenceLengthExpression.rst
+8 −4 docs/source/concepts/SequenceExpression/index.rst
+8 −8 docs/source/concepts/SystemicVariation/CopyNumber.rst
+6 −6 docs/source/concepts/SystemicVariation/index.rst
+2 −3 docs/source/concepts/index.rst
+0 −6 docs/source/concepts/location/index.rst
+3 −1 docs/source/conf.py
+78 −80 docs/source/conventions/computed_identifiers.rst
+67 −139 docs/source/conventions/example.rst
+9 −10 docs/source/conventions/normalization.rst
+49 −34 docs/source/conventions/required_data.rst
+0 −1 docs/source/def
+1 −0 docs/source/def/gks.common/core-im
+1 −0 docs/source/def/gks.common/data-types
+1 −0 docs/source/def/vrs
+2 −3 docs/source/index.rst
+4 −4 docs/source/introduction.rst
+1 −1 docs/source/quickstart.rst
+0 −0 docs/source/recipes/index.rst
+2 −2 docs/source/releases/1.1.rst
+4 −4 docs/source/releases/1.2.rst
+2 −2 docs/source/releases/index.rst
+1 −1 docs/source/rst_epilog
+2 −2 docs/source/schema.rst
+2 −3 docs/source/style.rst
+9 −10 docs/source/terms_and_model.rst
+3 −3 schema/vrs/Makefile
+5 −5 schema/vrs/def/Adjacency.rst
+5 −5 schema/vrs/def/Allele.rst
+5 −5 schema/vrs/def/CisPhasedBlock.rst
+5 −5 schema/vrs/def/CopyNumber.rst
+7 −7 schema/vrs/def/CopyNumberChange.rst
+5 −5 schema/vrs/def/CopyNumberCount.rst
+5 −5 schema/vrs/def/DerivativeMolecule.rst
+4 −4 schema/vrs/def/Ga4ghIdentifiableObject.rst
+5 −5 schema/vrs/def/LengthExpression.rst
+4 −4 schema/vrs/def/LiteralSequenceExpression.rst
+6 −6 schema/vrs/def/ReferenceLengthExpression.rst
+4 −4 schema/vrs/def/SequenceExpression.rst
+5 −5 schema/vrs/def/SequenceLocation.rst
+5 −5 schema/vrs/def/SequenceReference.rst
+5 −5 schema/vrs/def/Terminus.rst
+4 −4 schema/vrs/def/TraversalBlock.rst
+4 −4 schema/vrs/def/Variation.rst
+5 −3 schema/vrs/json/Adjacency
+5 −3 schema/vrs/json/Allele
+5 −3 schema/vrs/json/CisPhasedBlock
+6 −4 schema/vrs/json/CopyNumberChange
+5 −3 schema/vrs/json/CopyNumberCount
+5 −3 schema/vrs/json/DerivativeMolecule
+7 −3 schema/vrs/json/LengthExpression
+6 −3 schema/vrs/json/LiteralSequenceExpression
+11 −0 schema/vrs/json/Location
+8 −5 schema/vrs/json/ReferenceLengthExpression
+5 −3 schema/vrs/json/SequenceLocation
+9 −6 schema/vrs/json/SequenceReference
+5 −3 schema/vrs/json/Terminus
+8 −5 schema/vrs/json/TraversalBlock
+6 −4 schema/vrs/vrs-source.yaml
+1 −1 submodules/gks-common
148 changes: 148 additions & 0 deletions tests/validation/test_schemas.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,148 @@
"""Test that VRS-Python Pydantic models match VRS and GKS-Common schemas"""

from enum import Enum
import json
from pathlib import Path

import pytest
from pydantic import BaseModel

from ga4gh.core import entity_models, domain_models
from ga4gh.vrs import models as vrs_models


class GKSSchema(str, Enum):
"""Enum for GKS schema"""

VRS = "vrs"
CORE_IM = "core-im"
DOMAIN = "domain-entities"


class GKSSchemaMapping(BaseModel):
"""Model for representing GKS Schema concrete classes, primitives, and schema"""

base_classes: set = set()
concrete_classes: set = set()
primitives: set = set()
schema: dict = {}


def _update_gks_schema_mapping(
f_path: Path, gks_schema_mapping: GKSSchemaMapping
) -> None:
"""Update ``gks_schema_mapping`` properties
:param f_path: Path to JSON Schema file
:param gks_schema_mapping: GKS schema mapping to update
"""
with f_path.open() as rf:
cls_def = json.load(rf)

spec_class = cls_def["title"]
gks_schema_mapping.schema[spec_class] = cls_def

if "properties" in cls_def:
gks_schema_mapping.concrete_classes.add(spec_class)
elif cls_def.get("type") in {"array", "integer", "string"}:
gks_schema_mapping.primitives.add(spec_class)
else:
gks_schema_mapping.base_classes.add(spec_class)


GKS_SCHEMA_MAPPING = {gks: GKSSchemaMapping() for gks in GKSSchema}
SUBMODULES_DIR = Path(__file__).parents[2] / "submodules" / "vrs"


# Get vrs classes
vrs_mapping = GKS_SCHEMA_MAPPING[GKSSchema.VRS]
for f in (SUBMODULES_DIR / "schema" / "vrs" / "json").glob("*"):
_update_gks_schema_mapping(f, vrs_mapping)


# Get core-im + domain classes
for child in (SUBMODULES_DIR / "submodules" / "gks-common" / "schema").iterdir():
mapping_key = (
GKSSchema.DOMAIN if str(child).endswith(GKSSchema.DOMAIN) else GKSSchema.CORE_IM
)
mapping = GKS_SCHEMA_MAPPING[mapping_key]
for f in (child / "json").glob("*"):
_update_gks_schema_mapping(f, mapping)


@pytest.mark.parametrize(
("gks_schema", "pydantic_models"),
[
(GKSSchema.VRS, vrs_models),
(GKSSchema.CORE_IM, entity_models),
(GKSSchema.DOMAIN, domain_models),
],
)
def test_schema_models_in_pydantic(gks_schema, pydantic_models):
"""Ensure that each schema model has corresponding Pydantic model"""
mapping = GKS_SCHEMA_MAPPING[gks_schema]
for schema_model in (
mapping.base_classes | mapping.concrete_classes | mapping.primitives
):
assert getattr(pydantic_models, schema_model, False), schema_model


@pytest.mark.parametrize(
("gks_schema", "pydantic_models"),
[
(GKSSchema.VRS, vrs_models),
(GKSSchema.CORE_IM, entity_models),
(GKSSchema.DOMAIN, domain_models),
],
)
def test_schema_class_fields(gks_schema, pydantic_models):
"""Check that each schema model properties exist and are required in corresponding
Pydantic model, and validate required properties
"""
mapping = GKS_SCHEMA_MAPPING[gks_schema]
for schema_model in mapping.concrete_classes:
schema_properties = mapping.schema[schema_model]["properties"]
pydantic_model = getattr(pydantic_models, schema_model)
assert set(pydantic_model.model_fields) == set(schema_properties), schema_model

required_schema_fields = set(mapping.schema[schema_model]["required"])

for prop, property_def in schema_properties.items():
pydantic_model_field_info = pydantic_model.model_fields[prop]
pydantic_field_required = pydantic_model_field_info.is_required()

if prop in required_schema_fields:
if prop != "type":
assert pydantic_field_required, f"{pydantic_model}.{prop}"
else:
assert not pydantic_field_required, f"{pydantic_model}.{prop}"

if "description" in property_def:
assert property_def["description"].replace("'", "\"") == pydantic_model_field_info.description.replace("'", "\""), f"{pydantic_model}.{prop}"
else:
assert pydantic_model_field_info.description is None, f"{pydantic_model}.{prop}"


def test_ga4gh_keys():
"""Ensure ga4ghDigest keys defined in schema model exist in corresponding Pydantic model"""
vrs_mapping = GKS_SCHEMA_MAPPING[GKSSchema.VRS]
for vrs_class in vrs_mapping.concrete_classes:
if (
vrs_mapping.schema[vrs_class].get("ga4ghDigest", {}).get("keys", None)
is None
):
continue

pydantic_model = getattr(vrs_models, vrs_class)

try:
pydantic_model_digest_keys = pydantic_model.ga4gh.keys
except AttributeError as e:
raise AttributeError(vrs_class) from e

assert set(pydantic_model_digest_keys) == set(
vrs_mapping.schema[vrs_class]["ga4ghDigest"]["keys"]
), vrs_class
assert pydantic_model_digest_keys == sorted(
pydantic_model.ga4gh.keys
), vrs_class
Loading

0 comments on commit 08e76ef

Please sign in to comment.