From 6b35c8e2e7ab25cefc6848349d683e4beba38541 Mon Sep 17 00:00:00 2001
From: Kori Kuzma
Date: Thu, 31 Oct 2024 14:32:54 -0400
Subject: [PATCH] feat: add cat vrs pydantic models (#4)

close #2
---
 .gitmodules                             |   4 +
 pyproject.toml                          |   6 +-
 src/ga4gh/cat_vrs/__init__.py           |   4 +
 src/ga4gh/cat_vrs/core_models.py        |  99 ++++++++++++++
 src/ga4gh/cat_vrs/profile_models.py     | 178 ++++++++++++++++++++++++
 submodules/cat_vrs                      |   1 +
 tests/validation/test_cat_vrs_schema.py | 123 ++++++++++++++++
 7 files changed, 414 insertions(+), 1 deletion(-)
 create mode 100644 .gitmodules
 create mode 100644 src/ga4gh/cat_vrs/core_models.py
 create mode 100644 src/ga4gh/cat_vrs/profile_models.py
 create mode 160000 submodules/cat_vrs
 create mode 100644 tests/validation/test_cat_vrs_schema.py

diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 0000000..ba6b9cb
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,4 @@
+[submodule "submodules/cat_vrs"]
+	path = submodules/cat_vrs
+	url = https://github.com/ga4gh/cat-vrs
+	branch = 1.x
diff --git a/pyproject.toml b/pyproject.toml
index e23228b..b95c9bb 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -31,7 +31,10 @@ keywords = [
 ]
 requires-python = ">=3.10"
 dynamic = ["version"]
-dependencies = []
+dependencies = [
+    "ga4gh.vrs~=2.0.0a12",
+    "pydantic==2.*",
+]
 
 [project.optional-dependencies]
 dev = [
@@ -134,6 +137,7 @@ ignore = [
 # S101 - assert
 # B011 - assert-false
 "tests/*" = ["ANN001", "ANN2", "ANN102", "S101", "B011"]
+"src/ga4gh/cat_vrs/*models.py" = ["ANN102"]
 
 [tool.setuptools.packages.find]
 where = ["src"]
diff --git a/src/ga4gh/cat_vrs/__init__.py b/src/ga4gh/cat_vrs/__init__.py
index fd0f4a9..cd05ec7 100644
--- a/src/ga4gh/cat_vrs/__init__.py
+++ b/src/ga4gh/cat_vrs/__init__.py
@@ -1 +1,5 @@
 """Package for Cat-VRS Python implementation"""
+
+from . import profile_models as cat_vrs_models
+
+__all__ = ["cat_vrs_models"]
diff --git a/src/ga4gh/cat_vrs/core_models.py b/src/ga4gh/cat_vrs/core_models.py
new file mode 100644
index 0000000..fb8623a
--- /dev/null
+++ b/src/ga4gh/cat_vrs/core_models.py
@@ -0,0 +1,99 @@
+"""Define Pydantic models for GA4GH categorical variation objects.
+
+See the CatVar page on
+the GA4GH website for more information.
+"""
+
+from enum import Enum
+from typing import Literal
+
+from ga4gh.core.entity_models import IRI, Coding, DomainEntity
+from ga4gh.vrs.models import CopyChange, Location, Range, Variation
+from pydantic import BaseModel, Field, RootModel, field_validator
+
+
+class Relation(str, Enum):
+    """Defined relationships between members of the categorical variant and the defining
+    context. ``sequence_liftover`` refers to variants or locations that represent a
+    congruent concept on a differing assembly of a human genome (e.g. "GRCh37" and
+    "GRCh38") or gene (e.g. Locus Reference Genomic) sequence. ``transcript_projection``
+    refers to variants or locations that occur on transcripts projected from the defined
+    genomic concept. ``codon_translation`` refers to variants or locations that
+    translate from the codon(s) represented by the defined concept.
+    """
+
+    SEQUENCE_LIFTOVER = "sequence_liftover"
+    TRANSCRIPT_PROJECTION = "transcript_projection"
+    CODON_TRANSLATION = "codon_translation"
+
+
+class DefiningContextConstraint(BaseModel):
+    """The location or location-state, congruent with other reference sequences, about
+    which categorical variation is being described.
+    """
+
+    type: Literal["DefiningContextConstraint"] = Field(
+        "DefiningContextConstraint", description="MUST be 'DefiningContextConstraint'"
+    )
+    definingContext: Variation | Location | IRI  # noqa: N815
+    relations: list[Relation] | None = Field(
+        None,
+        description="Defined relationships between members of the categorical variant and the defining context. ``sequence_liftover`` refers to variants or locations that represent a congruent concept on a differing assembly of a human genome (e.g. 'GRCh37' and 'GRCh38') or gene (e.g. Locus Reference Genomic) sequence. ``transcript_projection`` refers to variants or locations that occur on transcripts projected from the defined genomic concept. ``codon_translation`` refers to variants or locations that translate from the codon(s) represented by the defined concept.",
+    )
+
+
+class CopyCountConstraint(BaseModel):
+    """The absolute number of copies in a system"""
+
+    type: Literal["CopyCountConstraint"] = Field(
+        "CopyCountConstraint", description="MUST be 'CopyCountConstraint'"
+    )
+    copies: int | Range
+
+
+class CopyChangeConstraint(BaseModel):
+    """A representation of copy number change"""
+
+    type: Literal["CopyChangeConstraint"] = Field(
+        "CopyChangeConstraint", description="MUST be 'CopyChangeConstraint'"
+    )
+    copyChange: Coding  # noqa: N815
+
+    @field_validator("copyChange")
+    @classmethod
+    def validate_copy_change(cls, v: Coding) -> Coding:
+        """Validate copyChange property
+
+        :param v: copyChange value
+        :raises ValueError: If ``copyChange.code`` is not a valid CopyChange
+        :return: copyChange property
+        """
+        try:
+            CopyChange(v.code.root)
+        except ValueError as e:
+            err_msg = f"copyChange, {v.code.root}, not one of {[cc.value for cc in CopyChange]}"
+            raise ValueError(err_msg) from e
+        return v
+
+
+class Constraint(RootModel):
+    """Constraints are used to construct an intensional semantics of categorical variant types."""
+
+    root: DefiningContextConstraint | CopyCountConstraint | CopyChangeConstraint = (
+        Field(..., discriminator="type")
+    )
+
+
+class CategoricalVariant(DomainEntity):
+    """A representation of a categorically-defined domain for variation, in which
+    individual contextual variation instances may be members of the domain.
+    """
+
+    type: Literal["CategoricalVariant"] = Field(
+        "CategoricalVariant", description="MUST be 'CategoricalVariant'"
+    )
+    members: list[Variation | IRI] | None = Field(
+        None,
+        description="A non-exhaustive list of VRS variation contexts that satisfy the constraints of this categorical variant.",
+    )
+    constraints: list[Constraint] | None = None
diff --git a/src/ga4gh/cat_vrs/profile_models.py b/src/ga4gh/cat_vrs/profile_models.py
new file mode 100644
index 0000000..c18b612
--- /dev/null
+++ b/src/ga4gh/cat_vrs/profile_models.py
@@ -0,0 +1,178 @@
+"""Define Pydantic models for GA4GH categorical variation objects.
+
+See the CatVar page on
+the GA4GH website for more information.
+"""
+
+from enum import Enum
+
+from ga4gh.cat_vrs.core_models import (
+    CategoricalVariant,
+    Constraint,
+    CopyChangeConstraint,
+    CopyCountConstraint,
+    DefiningContextConstraint,
+    Relation,
+)
+from pydantic import BaseModel, Field, field_validator
+
+
+def _unwrap_constraint(
+    constraint: Constraint,
+) -> DefiningContextConstraint | CopyCountConstraint | CopyChangeConstraint:
+    """Return the concrete constraint wrapped by a ``Constraint`` root model
+
+    Items validated as ``list[Constraint]`` are ``Constraint`` root models, which do
+    not expose the attributes or the type of the constraint they wrap.
+
+    :param constraint: ``Constraint`` root model, or an already concrete constraint
+    :return: The concrete constraint
+    """
+    return constraint.root if isinstance(constraint, Constraint) else constraint
+
+
+def _get_relations(constraint: Constraint) -> list[Relation]:
+    """Return the ``relations`` of a constraint
+
+    :param constraint: Constraint to inspect
+    :return: ``relations`` of the wrapped constraint. Empty list if the constraint
+        type has no ``relations`` property or if ``relations`` is ``None``.
+    """
+    return getattr(_unwrap_constraint(constraint), "relations", None) or []
+
+
+class CatVrsType(str, Enum):
+    """Define CatVRS types"""
+
+    PROTEIN_SEQ_CONS = "ProteinSequenceConsequence"
+    CANONICAL_ALLELE = "CanonicalAllele"
+    CATEGORICAL_CNV = "CategoricalCnv"
+    DESCRIBED_VAR = "DescribedVariation"
+    NUMBER_COUNT = "NumberCount"
+    NUMBER_CHANGE = "NumberChange"
+    QUANTITY_VARIANCE = "QuantityVariance"
+
+
+class ProteinSequenceConsequenceProperties(BaseModel):
+    """Cat-VRS Constraints found in Protein Sequence Consequences."""
+
+    constraints: list[Constraint] = Field(..., min_length=1)
+
+    @field_validator("constraints")
+    @classmethod
+    def validate_constraints(cls, v: list[Constraint]) -> list[Constraint]:
+        """Validate constraints property
+
+        :param v: Constraints property to validate
+        :raises ValueError: If none of the ``relations`` contains
+            ``Relation.CODON_TRANSLATION.value`` exactly once.
+        :return: Constraints property
+        """
+        if not any(
+            _get_relations(constraint).count(Relation.CODON_TRANSLATION) == 1
+            for constraint in v
+        ):
+            err_msg = f"At least one `relations` in `constraints` must contain `{Relation.CODON_TRANSLATION.value}` exactly once."
+            raise ValueError(err_msg)
+
+        return v
+
+
+class ProteinSequenceConsequence(
+    ProteinSequenceConsequenceProperties, CategoricalVariant
+):
+    """A change that occurs in a protein sequence as a result of genomic changes. Due to
+    the degenerate nature of the genetic code, there are often several genomic changes
+    that can cause a protein sequence consequence.
+    The protein sequence consequence, like a :ref:`CanonicalAllele`, is defined by an
+    Allele
+    that is representative of a collection of congruent Protein Alleles that share the
+    same altered codon(s).
+    """
+
+
+class CanonicalAlleleProperties(BaseModel):
+    """Cat-VRS Constraints found in Canonical Alleles."""
+
+    constraints: list[Constraint] = Field(..., min_length=1)
+
+    @field_validator("constraints")
+    @classmethod
+    def validate_constraints(cls, v: list[Constraint]) -> list[Constraint]:
+        """Validate constraints property
+
+        :param v: Constraints property to validate
+        :raises ValueError: If none of the ``relations`` contains both
+            ``Relation.SEQUENCE_LIFTOVER`` and ``Relation.TRANSCRIPT_PROJECTION``
+            exactly once.
+        :return: Constraints property
+        """
+        if not any(
+            (
+                relations.count(Relation.SEQUENCE_LIFTOVER) == 1
+                and relations.count(Relation.TRANSCRIPT_PROJECTION) == 1
+            )
+            for relations in map(_get_relations, v)
+        ):
+            err_msg = f"At least one `relations` in `constraints` must contain {Relation.SEQUENCE_LIFTOVER.value} and {Relation.TRANSCRIPT_PROJECTION.value} exactly once."
+            raise ValueError(err_msg)
+
+        return v
+
+
+class CanonicalAllele(CanonicalAlleleProperties, CategoricalVariant):
+    """A canonical allele is defined by an
+    Allele
+    that is representative of a collection of congruent Alleles, each of which depict
+    the same nucleic acid change on different underlying reference sequences. Congruent
+    representations of an Allele often exist across different genome assemblies and
+    associated cDNA transcript representations.
+    """
+
+
+class CategoricalCnvProperties(BaseModel):
+    """Cat-VRS Constraints found in CategoricalCnvs."""
+
+    constraints: list[Constraint] = Field(..., min_length=1)
+
+    @field_validator("constraints")
+    @classmethod
+    def validate_constraints(cls, v: list[Constraint]) -> list[Constraint]:
+        """Validate constraints property
+
+        :param v: Constraints property to validate
+        :raises ValueError: If no ``DefiningContextConstraint`` with
+            ``Relation.SEQUENCE_LIFTOVER`` in ``relations`` is found in ``constraints``
+            or if neither ``CopyCountConstraint`` nor ``CopyChangeConstraint`` is found
+            in ``constraints``.
+        :return: Constraints property
+        """
+        defining_context_found = False
+        copy_found = False
+
+        for constraint in v:
+            # Inspect the wrapped constraint: the ``Constraint`` root model itself is
+            # never an instance of the concrete constraint classes
+            inner = _unwrap_constraint(constraint)
+
+            if isinstance(inner, CopyChangeConstraint | CopyCountConstraint):
+                copy_found = True
+            elif (
+                isinstance(inner, DefiningContextConstraint)
+                and Relation.SEQUENCE_LIFTOVER in (inner.relations or [])
+            ):
+                defining_context_found = True
+
+        if not defining_context_found:
+            err_msg = f"At least one item in `constraints` must be a `DefiningContextConstraint` and contain `{Relation.SEQUENCE_LIFTOVER.value}` in `relations`."
+            raise ValueError(err_msg)
+
+        if not copy_found:
+            err_msg = "At least one item in `constraints` must be a `CopyCountConstraint` or a `CopyChangeConstraint`."
+            raise ValueError(err_msg)
+
+        return v
+
+
+class CategoricalCnv(CategoricalCnvProperties, CategoricalVariant):
+    """A representation of the constraints for matching knowledge about CNVs."""
diff --git a/submodules/cat_vrs b/submodules/cat_vrs
new file mode 160000
index 0000000..1458c87
--- /dev/null
+++ b/submodules/cat_vrs
@@ -0,0 +1 @@
+Subproject commit 1458c878dc8da73f26aced068b99781acb4e6867
diff --git a/tests/validation/test_cat_vrs_schema.py b/tests/validation/test_cat_vrs_schema.py
new file mode 100644
index 0000000..830b528
--- /dev/null
+++ b/tests/validation/test_cat_vrs_schema.py
@@ -0,0 +1,123 @@
+"""Test that Cat VRS-Python Pydantic models match corresponding schemas"""
+
+import json
+from enum import Enum
+from pathlib import Path
+
+import pytest
+from ga4gh.cat_vrs import core_models, profile_models
+from pydantic import BaseModel
+
+
+class CatVrsSchema(str, Enum):
+    """Enum for Cat VRS schema"""
+
+    CORE = "core"
+    PROFILES = "profiles"
+
+
+class CatVrsSchemaMapping(BaseModel):
+    """Model for representing Cat-VRS Schema concrete classes, primitives, and schema"""
+
+    base_classes: set = set()
+    concrete_classes: set = set()
+    primitives: set = set()
+    schema: dict = {}
+
+
+def _update_cat_vrs_schema_mapping(
+    f_path: Path, cat_vrs_schema_mapping: CatVrsSchemaMapping
+) -> None:
+    """Update ``cat_vrs_schema_mapping`` properties
+
+    :param f_path: Path to JSON Schema file
+    :param cat_vrs_schema_mapping: Cat-VRS schema mapping to update
+    """
+    with f_path.open() as rf:
+        cls_def = json.load(rf)
+
+    spec_class = cls_def["title"]
+    cat_vrs_schema_mapping.schema[spec_class] = cls_def
+
+    if "properties" in cls_def:
+        cat_vrs_schema_mapping.concrete_classes.add(spec_class)
+    elif cls_def.get("type") in {"array", "integer", "string"}:
+        cat_vrs_schema_mapping.primitives.add(spec_class)
+    else:
+        cat_vrs_schema_mapping.base_classes.add(spec_class)
+
+
+CAT_VRS_SCHEMA_MAPPING = {schema: CatVrsSchemaMapping() for schema in CatVrsSchema}
+SUBMODULES_DIR = Path(__file__).parents[2] / "submodules" / "cat_vrs" / "schema"
+
+
+# Get core + profiles classes
+for child in SUBMODULES_DIR.iterdir():
+    child_str = str(child)
+    if child_str.endswith(CatVrsSchema.CORE):
+        mapping_key = CatVrsSchema.CORE
+    elif child_str.endswith(CatVrsSchema.PROFILES):
+        mapping_key = CatVrsSchema.PROFILES
+    else:
+        continue
+
+    mapping = CAT_VRS_SCHEMA_MAPPING[mapping_key]
+    for f in (child / "json").glob("*"):
+        _update_cat_vrs_schema_mapping(f, mapping)
+
+
+@pytest.mark.parametrize(
+    ("cat_vrs_schema", "pydantic_models"),
+    [
+        (CatVrsSchema.CORE, core_models),
+        (CatVrsSchema.PROFILES, profile_models),
+    ],
+)
+def test_schema_models_in_pydantic(cat_vrs_schema, pydantic_models):
+    """Ensure that each schema model has corresponding Pydantic model"""
+    mapping = CAT_VRS_SCHEMA_MAPPING[cat_vrs_schema]
+    for schema_model in (
+        mapping.base_classes | mapping.concrete_classes | mapping.primitives
+    ):
+        assert getattr(pydantic_models, schema_model, False), schema_model
+
+
+@pytest.mark.parametrize(
+    ("cat_vrs_schema", "pydantic_models"),
+    [
+        (CatVrsSchema.CORE, core_models),
+        (CatVrsSchema.PROFILES, profile_models),
+    ],
+)
+def test_schema_class_fields(cat_vrs_schema, pydantic_models):
+    """Check that each schema model properties exist and are required in corresponding
+    Pydantic model, and validate required properties
+    """
+    mapping = CAT_VRS_SCHEMA_MAPPING[cat_vrs_schema]
+    for schema_model in mapping.concrete_classes:
+        schema_properties = mapping.schema[schema_model]["properties"]
+        pydantic_model = getattr(pydantic_models, schema_model)
+        assert set(pydantic_model.model_fields) == set(schema_properties), schema_model
+
+        required_schema_fields = set(mapping.schema[schema_model]["required"])
+
+        for prop, property_def in schema_properties.items():
+            pydantic_model_field_info = pydantic_model.model_fields[prop]
+            pydantic_field_required = pydantic_model_field_info.is_required()
+
+            if prop in required_schema_fields:
+                if prop != "type":
+                    assert pydantic_field_required, f"{pydantic_model}.{prop}"
+            else:
+                assert not pydantic_field_required, f"{pydantic_model}.{prop}"
+
+            if "description" in property_def:
+                assert property_def["description"].replace(
+                    "'", '"'
+                ) == pydantic_model_field_info.description.replace(
+                    "'", '"'
+                ), f"{pydantic_model}.{prop}"
+            else:
+                assert (
+                    pydantic_model_field_info.description is None
+                ), f"{pydantic_model}.{prop}"