Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat!: update gks-common / vrs models #453

Merged
merged 11 commits into from
Oct 30, 2024
15 changes: 2 additions & 13 deletions src/ga4gh/core/domain_models.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,4 @@
"""GKS Common Library Domain Entity models

**This module should not be imported directly.**

Instead, users should use one of the following:

* `from ga4gh.core import domain_models`, and refer to models with the
abbreviated name, e.g., `domain_models.Gene` (recommended)

* `import ga4gh.core`, and refer to models using the fully-qualified
module name, e.g., `ga4gh.core.domain_models.Gene`
"""
"""GKS Common Library Domain Entity models"""
from enum import Enum
from typing import Literal, Union, List

Expand Down Expand Up @@ -138,5 +127,5 @@ class Gene(DomainEntity):

type: Literal["Gene"] = Field(
CommonDomainType.GENE.value,
description=f'MUST be "{CommonDomainType.GENE.value}".'
description=f'MUST be "{CommonDomainType.GENE.value}"'
)
316 changes: 265 additions & 51 deletions src/ga4gh/core/entity_models.py

Large diffs are not rendered by default.

89 changes: 63 additions & 26 deletions src/ga4gh/vrs/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@
from ga4gh.core.pydantic import (
getattr_in
)
from ga4gh.core.entity_models import IRI, Expression, Entity
from ga4gh.core.entity_models import IRI, Entity


def flatten(vals):
Expand Down Expand Up @@ -151,6 +151,13 @@ class VrsType(str, Enum):
CN_CHANGE = "CopyNumberChange"


class Orientation(str, Enum):
"""The orientation of the molecular variation component."""

FORWARD = "forward"
REVERSE_COMPLEMENT = "reverse_complement"


class ResidueAlphabet(str, Enum):
"""The interpretation of the character codes referred to by the refget accession,
where "aa" specifies an amino acid character set, and "na" specifies a nucleic acid
Expand All @@ -174,6 +181,22 @@ class CopyChange(str, Enum):
EFO_0030072 = 'EFO:0030072'


class Syntax(str, Enum):
"""The syntax used to describe the variation. The value should be one of the
supported syntaxes.
"""

HGVS_C = "hgvs.c"
HGVS_P = "hgvs.p"
HGVS_G = "hgvs.g"
HGVS_M = "hgvs.m"
HGVS_N = "hgvs.n"
HGVS_R = "hgvs.r"
HGVS_ISCN = "iscn"
GNOMAD = "gnomad"
SPDI = "spdi"


def _recurse_ga4gh_serialize(obj):
if isinstance(obj, _Ga4ghIdentifiableObject):
return obj.get_or_create_digest()
Expand Down Expand Up @@ -314,6 +337,18 @@ def get_or_create_digest(self, recompute=False) -> str:
class ga4gh(_ValueObject.ga4gh):
prefix: str

class Expression(BaseModel):
"""Representation of a variation by a specified nomenclature or syntax for a
Variation object. Common examples of expressions for the description of molecular
variation include the HGVS and ISCN nomenclatures.
"""

model_config = ConfigDict(use_enum_values=True)

syntax: Syntax = Field(..., description="The syntax used to describe the variation. The value should be one of the supported syntaxes.")
value: str = Field(..., description="The expression of the variation in the specified syntax. The value should be a valid expression in the specified syntax.")
syntax_version: Optional[str] = Field(None, description="The version of the syntax used to describe the variation. This is particularly important for HGVS expressions, as the syntax has evolved over time.")


#########################################
# vrs numerics, comparators, and ranges
Expand Down Expand Up @@ -392,7 +427,7 @@ class LengthExpression(_ValueObject):
type: Literal["LengthExpression"] = Field(
VrsType.LEN_EXPR.value, description=f'MUST be "{VrsType.LEN_EXPR.value}"'
)
length: Optional[Union[Range, int]] = None
length: Optional[Union[Range, int]] = Field(None, description="The length of the sequence.")

class ga4gh(_ValueObject.ga4gh):
keys = [
Expand All @@ -408,13 +443,13 @@ class ReferenceLengthExpression(_ValueObject):
VrsType.REF_LEN_EXPR.value, description=f'MUST be "{VrsType.REF_LEN_EXPR.value}"'
)
length: Union[Range, int] = Field(
..., description='The number of residues of the expressed sequence.'
..., description='The number of residues in the expressed sequence.'
)
sequence: Optional[SequenceString] = Field(
None, description='the `Sequence` encoded by the Reference Length Expression.'
None, description='the literal Sequence encoded by the Reference Length Expression.'
)
repeatSubunitLength: int = Field(
..., description='The number of residues of the repeat subunit.'
..., description='The number of residues in the repeat subunit.'
)

class ga4gh(_ValueObject.ga4gh):
Expand Down Expand Up @@ -452,9 +487,9 @@ class SequenceReference(_ValueObject):
type: Literal["SequenceReference"] = Field(VrsType.SEQ_REF.value, description=f'MUST be "{VrsType.SEQ_REF.value}"')
refgetAccession: Annotated[str, StringConstraints(pattern=r'^SQ.[0-9A-Za-z_\-]{32}$')] = Field(
...,
description='A `GA4GH RefGet <http://samtools.github.io/hts-specs/refget.html>` identifier for the referenced sequence, using the sha512t24u digest.',
description='A [GA4GH RefGet](http://samtools.github.io/hts-specs/refget.html) identifier for the referenced sequence, using the sha512t24u digest.',
)
residueAlphabet: Optional[ResidueAlphabet] = Field(None, description="The interpretation of the character codes referred to by the refget accession, where 'aa' specifies an amino acid character set, and 'na' specifies a nucleic acid character set.")
residueAlphabet: Optional[ResidueAlphabet] = Field(None, description='The interpretation of the character codes referred to by the refget accession, where "aa" specifies an amino acid character set, and "na" specifies a nucleic acid character set.')
circular: Optional[bool] = Field(None, description="A boolean indicating whether the molecule represented by the sequence is circular (true) or linear (false).")

class ga4gh(_ValueObject.ga4gh):
Expand All @@ -469,15 +504,15 @@ class SequenceLocation(_Ga4ghIdentifiableObject):

type: Literal["SequenceLocation"] = Field(VrsType.SEQ_LOC.value, description=f'MUST be "{VrsType.SEQ_LOC.value}"')
sequenceReference: Optional[Union[IRI, SequenceReference]] = Field(
None, description='A reference to a `Sequence` on which the location is defined.'
None, description='A reference to a Sequence on which the location is defined.'
)
start: Optional[Union[Range, int]] = Field(
None,
description='The start coordinate or range of the SequenceLocation. The minimum value of this coordinate or range is 0.',
description='The start coordinate or range of the SequenceLocation. The minimum value of this coordinate or range is 0. MUST represent a coordinate or range less than or equal to the value of `end`.',
)
end: Optional[Union[Range, int]] = Field(
None,
description='The end coordinate or range of the SequenceLocation. The minimum value of this coordinate or range is 0.',
description='The end coordinate or range of the SequenceLocation. The minimum value of this coordinate or range is 0. MUST represent a coordinate or range greater than or equal to the value of `start`.',

)
sequence: Optional[SequenceString] = Field(None, description="The literal sequence encoded by the `sequenceReference` at these coordinates.")
Expand Down Expand Up @@ -614,7 +649,7 @@ class CisPhasedBlock(_VariationBase):
type: Literal["CisPhasedBlock"] = Field(VrsType.CIS_PHASED_BLOCK.value, description=f'MUST be "{VrsType.CIS_PHASED_BLOCK.value}"')
members: List[Union[Allele, IRI]] = Field(
...,
description='A list of `Alleles` that are found in-cis on a shared molecule.',
description='A list of Alleles that are found in-cis on a shared molecule.',
min_length=2,
)
sequenceReference: Optional[SequenceReference] = Field(None, description="An optional Sequence Reference on which all of the in-cis Alleles are found. When defined, this may be used to implicitly define the `sequenceReference` attribute for each of the CisPhasedBlock member Alleles.")
Expand Down Expand Up @@ -643,7 +678,7 @@ class Adjacency(_VariationBase):
potentially with an intervening linker sequence.
"""

type: Literal["Adjacency"] = Field(VrsType.ADJACENCY.value, description=f'MUST be "{VrsType.ADJACENCY.value}"')
type: Literal["Adjacency"] = Field(VrsType.ADJACENCY.value, description=f'MUST be "{VrsType.ADJACENCY.value}".')
adjoinedSequences: List[Union[IRI, SequenceLocation]] = Field(
...,
description="The terminal sequence or pair of adjoined sequences that defines in the adjacency.",
Expand All @@ -654,7 +689,7 @@ class Adjacency(_VariationBase):
None,
description="The sequence found between adjoined sequences."
)
homology: Optional[bool] = Field(None, description="A flag indicating if coordinate ambiguity in the adjoined sequences is from sequence homology (true) or other uncertainty (false).")
homology: Optional[bool] = Field(None, description="A flag indicating if coordinate ambiguity in the adjoined sequences is from sequence homology (true) or other uncertainty, such as instrument ambiguity (false).")

class ga4gh(_Ga4ghIdentifiableObject.ga4gh):
prefix = 'AJ'
Expand All @@ -671,7 +706,7 @@ class Terminus(_VariationBase):
is not allowed and it removes the unnecessary array structure.
"""

type: Literal["Terminus"] = Field(VrsType.TERMINUS.value, description=f'MUST be "{VrsType.TERMINUS.value}"')
type: Literal["Terminus"] = Field(VrsType.TERMINUS.value, description=f'MUST be "{VrsType.TERMINUS.value}".')
location: Union[IRI, SequenceLocation] = Field(..., description="The location of the terminus.")

class ga4gh(_Ga4ghIdentifiableObject.ga4gh):
Expand All @@ -685,17 +720,19 @@ class TraversalBlock(_ValueObject):
"""A component used to describe the orientation of a molecular variation within
a DerivativeMolecule."""

model_config = ConfigDict(use_enum_values=True)

type: Literal["TraversalBlock"] = Field(
VrsType.TRAVERSAL_BLOCK.value, description=f'MUST be "{VrsType.TRAVERSAL_BLOCK.value}"'
VrsType.TRAVERSAL_BLOCK.value, description=f'MUST be "{VrsType.TRAVERSAL_BLOCK.value}".'
)
orientation: Literal["forward", "reverse_complement"] = Field(
...,
description='The orientation of the traversal block, either forward or reverse_complement.'
orientation: Optional[Orientation] = Field(
None,
description='The orientation of the molecular variation component.'
)

component: Union[IRI, Adjacency, Allele, Terminus, CisPhasedBlock] = Field(
...,
description="The component that make up the derivative molecule."
component: Optional[Union[Allele, CisPhasedBlock, Adjacency, Terminus]] = Field(
None,
description="The unoriented molecular variation component."
)

class ga4gh(_ValueObject.ga4gh):
Expand All @@ -710,13 +747,13 @@ class DerivativeMolecule(_VariationBase):
molecule composed from multiple sequence components.
"""

type: Literal["DerivativeMolecule"] = Field(VrsType.DERIVATIVE_MOL.value, description=f'MUST be "{VrsType.DERIVATIVE_MOL.value}"')
components: List[TraversalBlock] = Field(
type: Literal["DerivativeMolecule"] = Field(VrsType.DERIVATIVE_MOL.value, description=f'MUST be "{VrsType.DERIVATIVE_MOL.value}".')
components: List[Union[IRI, TraversalBlock]] = Field(
...,
description="The traversal block components that make up the derivative molecule.",
description="The molecular components that constitute the derivative molecule.",
min_length=2
)
circular: Optional[bool] = Field(None, description="A flag indicating if the derivative molecule is circular (true) or linear (false).")
circular: Optional[bool] = Field(None, description="A boolean indicating whether the molecule represented by the sequence is circular (true) or linear (false).")

class ga4gh(_Ga4ghIdentifiableObject.ga4gh):
prefix = "DM"
Expand Down Expand Up @@ -769,7 +806,7 @@ class CopyNumberChange(_CopyNumber):
type: Literal["CopyNumberChange"] = Field(VrsType.CN_CHANGE.value, description=f'MUST be "{VrsType.CN_CHANGE.value}"')
copyChange: CopyChange = Field(
...,
description='MUST be one of "EFO:0030069" (complete genomic loss), "EFO:0020073" (high-level loss), "EFO:0030068" (low-level loss), "EFO:0030067" (loss), "EFO:0030064" (regional base ploidy), "EFO:0030070" (gain), "EFO:0030071" (low-level gain), "EFO:0030072" (high-level gain).',
description='MUST be a Coding representing one of "EFO:0030069" (complete genomic loss), "EFO:0020073" (high-level loss), "EFO:0030068" (low-level loss), "EFO:0030067" (loss), "EFO:0030064" (regional base ploidy), "EFO:0030070" (gain), "EFO:0030071" (low-level gain), "EFO:0030072" (high-level gain).',
)

class ga4gh(_Ga4ghIdentifiableObject.ga4gh):
Expand Down
2 changes: 1 addition & 1 deletion submodules/vrs
Submodule vrs updated 96 files
+3 −0 .readthedocs.yaml
+1 −1 .requirements.txt
+0 −25 LICENSE
+7 −6 README.md
+47 −42 docs/source/_static/theme_overrides.css
+0 −196 docs/source/appendices/associating_annotations.rst
+0 −208 docs/source/appendices/design_decisions.rst
+0 −128 docs/source/appendices/development_process.rst
+0 −49 docs/source/appendices/equivalence.rst
+0 −54 docs/source/appendices/faq.rst
+0 −117 docs/source/appendices/future_plans.rst
+82 −56 docs/source/appendices/ga4gh_identifiers.rst
+0 −31 docs/source/appendices/getting_involved.rst
+3 −3 docs/source/appendices/glossary.rst
+0 −129 docs/source/appendices/implementations.rst
+0 −113 docs/source/appendices/mm_and_versioning.rst
+0 −135 docs/source/appendices/relationships.rst
+18 −19 docs/source/appendices/truncated_digest_collision_analysis.rst
+0 −8 docs/source/concepts/AdditionalDataTypes.rst
+6 −0 docs/source/concepts/AdditionalDataTypes/Code.rst
+6 −0 docs/source/concepts/AdditionalDataTypes/Coding.rst
+6 −0 docs/source/concepts/AdditionalDataTypes/Entity.rst
+6 −0 docs/source/concepts/AdditionalDataTypes/Extension.rst
+6 −0 docs/source/concepts/AdditionalDataTypes/IRI.rst
+99 −0 docs/source/concepts/AdditionalDataTypes/index.rst
+15 −15 docs/source/concepts/LocationAndReference/SequenceLocation.rst
+9 −9 docs/source/concepts/LocationAndReference/SequenceReference.rst
+11 −0 docs/source/concepts/LocationAndReference/index.rst
+7 −7 docs/source/concepts/MolecularVariation/Adjacency.rst
+8 −8 docs/source/concepts/MolecularVariation/Allele.rst
+12 −9 docs/source/concepts/MolecularVariation/CisPhasedBlock.rst
+13 −11 docs/source/concepts/MolecularVariation/DerivativeMolecule.rst
+1 −1 docs/source/concepts/MolecularVariation/Terminus.rst
+5 −6 docs/source/concepts/MolecularVariation/index.rst
+8 −10 docs/source/concepts/SequenceExpression/LengthExpression.rst
+4 −9 docs/source/concepts/SequenceExpression/LiteralSequenceExpression.rst
+16 −9 docs/source/concepts/SequenceExpression/ReferenceLengthExpression.rst
+8 −4 docs/source/concepts/SequenceExpression/index.rst
+8 −8 docs/source/concepts/SystemicVariation/CopyNumber.rst
+6 −6 docs/source/concepts/SystemicVariation/index.rst
+2 −3 docs/source/concepts/index.rst
+0 −6 docs/source/concepts/location/index.rst
+3 −1 docs/source/conf.py
+78 −80 docs/source/conventions/computed_identifiers.rst
+67 −139 docs/source/conventions/example.rst
+9 −10 docs/source/conventions/normalization.rst
+49 −34 docs/source/conventions/required_data.rst
+0 −1 docs/source/def
+1 −0 docs/source/def/gks.common/core-im
+1 −0 docs/source/def/gks.common/data-types
+1 −0 docs/source/def/vrs
+2 −3 docs/source/index.rst
+4 −4 docs/source/introduction.rst
+1 −1 docs/source/quickstart.rst
+0 −0 docs/source/recipes/index.rst
+2 −2 docs/source/releases/1.1.rst
+4 −4 docs/source/releases/1.2.rst
+2 −2 docs/source/releases/index.rst
+1 −1 docs/source/rst_epilog
+2 −2 docs/source/schema.rst
+2 −3 docs/source/style.rst
+9 −10 docs/source/terms_and_model.rst
+3 −3 schema/vrs/Makefile
+5 −5 schema/vrs/def/Adjacency.rst
+5 −5 schema/vrs/def/Allele.rst
+5 −5 schema/vrs/def/CisPhasedBlock.rst
+5 −5 schema/vrs/def/CopyNumber.rst
+7 −7 schema/vrs/def/CopyNumberChange.rst
+5 −5 schema/vrs/def/CopyNumberCount.rst
+5 −5 schema/vrs/def/DerivativeMolecule.rst
+4 −4 schema/vrs/def/Ga4ghIdentifiableObject.rst
+5 −5 schema/vrs/def/LengthExpression.rst
+4 −4 schema/vrs/def/LiteralSequenceExpression.rst
+6 −6 schema/vrs/def/ReferenceLengthExpression.rst
+4 −4 schema/vrs/def/SequenceExpression.rst
+5 −5 schema/vrs/def/SequenceLocation.rst
+5 −5 schema/vrs/def/SequenceReference.rst
+5 −5 schema/vrs/def/Terminus.rst
+4 −4 schema/vrs/def/TraversalBlock.rst
+4 −4 schema/vrs/def/Variation.rst
+5 −3 schema/vrs/json/Adjacency
+5 −3 schema/vrs/json/Allele
+5 −3 schema/vrs/json/CisPhasedBlock
+6 −4 schema/vrs/json/CopyNumberChange
+5 −3 schema/vrs/json/CopyNumberCount
+5 −3 schema/vrs/json/DerivativeMolecule
+7 −3 schema/vrs/json/LengthExpression
+6 −3 schema/vrs/json/LiteralSequenceExpression
+11 −0 schema/vrs/json/Location
+8 −5 schema/vrs/json/ReferenceLengthExpression
+5 −3 schema/vrs/json/SequenceLocation
+9 −6 schema/vrs/json/SequenceReference
+5 −3 schema/vrs/json/Terminus
+8 −5 schema/vrs/json/TraversalBlock
+6 −4 schema/vrs/vrs-source.yaml
+1 −1 submodules/gks-common
148 changes: 148 additions & 0 deletions tests/validation/test_schemas.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,148 @@
"""Test that VRS-Python Pydantic models match VRS and GKS-Common schemas"""

from enum import Enum
import json
from pathlib import Path

import pytest
from pydantic import BaseModel

from ga4gh.core import entity_models, domain_models
from ga4gh.vrs import models as vrs_models


class GKSSchema(str, Enum):
"""Enum for GKS schema"""

VRS = "vrs"
CORE_IM = "core-im"
DOMAIN = "domain-entities"


class GKSSchemaMapping(BaseModel):
"""Model for representing GKS Schema concrete classes, primitives, and schema"""

base_classes: set = set()
concrete_classes: set = set()
primitives: set = set()
schema: dict = {}
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

as a side note, I tried to properly add ruff to vrs-python a few months ago, but the import sort ended up introducing a circular dependency. If it's not doing that anymore, then we could probably get it added in.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@jsstevenson ah, I don't have ruff added to VRS-Python. I noticed this when working in Cat-VRS-Python, which does have ruff installed.



def _update_gks_schema_mapping(
f_path: Path, gks_schema_mapping: GKSSchemaMapping
) -> None:
"""Update ``gks_schema_mapping`` properties

:param f_path: Path to JSON Schema file
:param gks_schema_mapping: GKS schema mapping to update
"""
with f_path.open() as rf:
cls_def = json.load(rf)

spec_class = cls_def["title"]
gks_schema_mapping.schema[spec_class] = cls_def

if "properties" in cls_def:
gks_schema_mapping.concrete_classes.add(spec_class)
elif cls_def.get("type") in {"array", "integer", "string"}:
gks_schema_mapping.primitives.add(spec_class)
else:
gks_schema_mapping.base_classes.add(spec_class)


GKS_SCHEMA_MAPPING = {gks: GKSSchemaMapping() for gks in GKSSchema}
SUBMODULES_DIR = Path(__file__).parents[2] / "submodules" / "vrs"


# Get vrs classes
vrs_mapping = GKS_SCHEMA_MAPPING[GKSSchema.VRS]
for f in (SUBMODULES_DIR / "schema" / "vrs" / "json").glob("*"):
_update_gks_schema_mapping(f, vrs_mapping)


# Get core-im + domain classes
for child in (SUBMODULES_DIR / "submodules" / "gks-common" / "schema").iterdir():
mapping_key = (
GKSSchema.DOMAIN if str(child).endswith(GKSSchema.DOMAIN) else GKSSchema.CORE_IM
)
mapping = GKS_SCHEMA_MAPPING[mapping_key]
for f in (child / "json").glob("*"):
_update_gks_schema_mapping(f, mapping)


@pytest.mark.parametrize(
("gks_schema", "pydantic_models"),
[
(GKSSchema.VRS, vrs_models),
(GKSSchema.CORE_IM, entity_models),
(GKSSchema.DOMAIN, domain_models),
],
)
def test_schema_models_in_pydantic(gks_schema, pydantic_models):
"""Ensure that each schema model has corresponding Pydantic model"""
mapping = GKS_SCHEMA_MAPPING[gks_schema]
for schema_model in (
mapping.base_classes | mapping.concrete_classes | mapping.primitives
):
assert getattr(pydantic_models, schema_model, False), schema_model


@pytest.mark.parametrize(
("gks_schema", "pydantic_models"),
[
(GKSSchema.VRS, vrs_models),
(GKSSchema.CORE_IM, entity_models),
(GKSSchema.DOMAIN, domain_models),
],
)
def test_schema_class_fields(gks_schema, pydantic_models):
"""Check that each schema model properties exist and are required in corresponding
Pydantic model, and validate required properties
"""
mapping = GKS_SCHEMA_MAPPING[gks_schema]
for schema_model in mapping.concrete_classes:
schema_properties = mapping.schema[schema_model]["properties"]
pydantic_model = getattr(pydantic_models, schema_model)
assert set(pydantic_model.model_fields) == set(schema_properties), schema_model

required_schema_fields = set(mapping.schema[schema_model]["required"])

for prop, property_def in schema_properties.items():
pydantic_model_field_info = pydantic_model.model_fields[prop]
pydantic_field_required = pydantic_model_field_info.is_required()

if prop in required_schema_fields:
if prop != "type":
assert pydantic_field_required, f"{pydantic_model}.{prop}"
else:
assert not pydantic_field_required, f"{pydantic_model}.{prop}"

if "description" in property_def:
assert property_def["description"].replace("'", "\"") == pydantic_model_field_info.description.replace("'", "\""), f"{pydantic_model}.{prop}"
else:
assert pydantic_model_field_info.description is None, f"{pydantic_model}.{prop}"


def test_ga4gh_keys():
"""Ensure ga4ghDigest keys defined in schema model exist in corresponding Pydantic model"""
vrs_mapping = GKS_SCHEMA_MAPPING[GKSSchema.VRS]
for vrs_class in vrs_mapping.concrete_classes:
if (
vrs_mapping.schema[vrs_class].get("ga4ghDigest", {}).get("keys", None)
is None
):
continue

pydantic_model = getattr(vrs_models, vrs_class)

try:
pydantic_model_digest_keys = pydantic_model.ga4gh.keys
except AttributeError as e:
raise AttributeError(vrs_class) from e

assert set(pydantic_model_digest_keys) == set(
vrs_mapping.schema[vrs_class]["ga4ghDigest"]["keys"]
), vrs_class
assert pydantic_model_digest_keys == sorted(
pydantic_model.ga4gh.keys
), vrs_class
Loading
Loading