diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 0000000..ba6b9cb
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,4 @@
+[submodule "submodules/cat_vrs"]
+ path = submodules/cat_vrs
+ url = https://github.com/ga4gh/cat-vrs
+ branch = 1.x
diff --git a/pyproject.toml b/pyproject.toml
index e23228b..b95c9bb 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -31,7 +31,10 @@ keywords = [
]
requires-python = ">=3.10"
dynamic = ["version"]
-dependencies = []
+dependencies = [
+ "ga4gh.vrs~=2.0.0a12",
+ "pydantic==2.*",
+]
[project.optional-dependencies]
dev = [
@@ -134,6 +137,7 @@ ignore = [
# S101 - assert
# B011 - assert-false
"tests/*" = ["ANN001", "ANN2", "ANN102", "S101", "B011"]
+"src/ga4gh/cat_vrs/*models.py" = ["ANN102"]
[tool.setuptools.packages.find]
where = ["src"]
diff --git a/src/ga4gh/cat_vrs/__init__.py b/src/ga4gh/cat_vrs/__init__.py
index fd0f4a9..cd05ec7 100644
--- a/src/ga4gh/cat_vrs/__init__.py
+++ b/src/ga4gh/cat_vrs/__init__.py
@@ -1 +1,5 @@
"""Package for Cat-VRS Python implementation"""
+
+from . import profile_models as cat_vrs_models
+
+__all__ = ["cat_vrs_models"]
diff --git a/src/ga4gh/cat_vrs/core_models.py b/src/ga4gh/cat_vrs/core_models.py
new file mode 100644
index 0000000..fb8623a
--- /dev/null
+++ b/src/ga4gh/cat_vrs/core_models.py
@@ -0,0 +1,99 @@
+"""Define Pydantic models for GA4GH categorical variation objects.
+
+See the CatVar page on
+the GA4GH website for more information.
+"""
+
+from enum import Enum
+from typing import Literal
+
+from ga4gh.core.entity_models import IRI, Coding, DomainEntity
+from ga4gh.vrs.models import CopyChange, Location, Range, Variation
+from pydantic import BaseModel, Field, RootModel, field_validator
+
+
+class Relation(str, Enum):
+ """Defined relationships between members of the categorical variant and the defining
+ context. ``sequence_liftover`` refers to variants or locations that represent a
+ congruent concept on a differing assembly of a human genome (e.g. "GRCh37" and
+ "GRCh38") or gene (e.g. Locus Reference Genomic) sequence. ``transcript_projection``
+ refers to variants or locations that occur on transcripts projected from the defined
+ genomic concept. ``codon_translation`` refers to variants or locations that
+ translate from the codon(s) represented by the defined concept.
+ """
+
+ SEQUENCE_LIFTOVER = "sequence_liftover"  # congruent concept on a differing assembly or gene sequence
+ TRANSCRIPT_PROJECTION = "transcript_projection"  # occurs on a transcript projected from the genomic concept
+ CODON_TRANSLATION = "codon_translation"  # translates from the codon(s) of the defined concept
+
+
+class DefiningContextConstraint(BaseModel):
+    """The location or location-state, congruent with other reference sequences, about
+    which categorical variation is being described.
+    """
+
+    type: Literal["DefiningContextConstraint"] = Field(
+        default="DefiningContextConstraint", description="MUST be 'DefiningContextConstraint'"
+    )
+    definingContext: Variation | Location | IRI  # noqa: N815
+    relations: list[Relation] | None = Field(
+        default=None,  # optional: a constraint may omit its relations entirely
+        description="Defined relationships between members of the categorical variant and the defining context. ``sequence_liftover`` refers to variants or locations that represent a congruent concept on a differing assembly of a human genome (e.g. 'GRCh37' and 'GRCh38') or gene (e.g. Locus Reference Genomic) sequence. ``transcript_projection`` refers to variants or locations that occur on transcripts projected from the defined genomic concept. ``codon_translation`` refers to variants or locations that translate from the codon(s) represented by the defined concept.",
+    )
+
+
+class CopyCountConstraint(BaseModel):
+    """The absolute number of copies in a system"""
+
+    type: Literal["CopyCountConstraint"] = Field(
+        default="CopyCountConstraint", description="MUST be 'CopyCountConstraint'"
+    )
+    copies: int | Range  # an exact count or a VRS ``Range``
+
+
+class CopyChangeConstraint(BaseModel):
+    """A representation of copy number change"""
+
+    type: Literal["CopyChangeConstraint"] = Field(
+        default="CopyChangeConstraint", description="MUST be 'CopyChangeConstraint'"
+    )
+    copyChange: Coding  # noqa: N815
+
+    @field_validator("copyChange")
+    @classmethod
+    def validate_copy_change(cls, v: Coding) -> Coding:
+        """Check that the coding's ``code`` is a value of the VRS ``CopyChange`` enum.
+
+        :param v: copyChange value
+        :raises ValueError: If ``copyChange.code`` is not a valid CopyChange
+        :return: The copyChange value, unchanged
+        """
+        code = v.code.root
+        try:
+            CopyChange(code)  # enum lookup by value; raises ValueError when unknown
+        except ValueError as e:
+            raise ValueError(f"copyChange, {code}, not one of {[cc.value for cc in CopyChange]}") from e
+        return v
+
+
+class Constraint(RootModel):
+    """Constraints are used to construct an intensional semantics of categorical variant types."""
+
+    root: DefiningContextConstraint | CopyCountConstraint | CopyChangeConstraint = Field(
+        discriminator="type"  # no default given, so the root value stays required
+    )
+
+
+class CategoricalVariant(DomainEntity):
+    """A representation of a categorically-defined domain for variation, in which
+    individual contextual variation instances may be members of the domain.
+    """
+
+    type: Literal["CategoricalVariant"] = Field(
+        default="CategoricalVariant", description="MUST be 'CategoricalVariant'"
+    )
+    members: list[Variation | IRI] | None = Field(
+        default=None,  # optional: members may be given inline or by IRI, or omitted
+        description="A non-exhaustive list of VRS variation contexts that satisfy the constraints of this categorical variant.",
+    )
+    constraints: list[Constraint] | None = None
diff --git a/src/ga4gh/cat_vrs/profile_models.py b/src/ga4gh/cat_vrs/profile_models.py
new file mode 100644
index 0000000..c18b612
--- /dev/null
+++ b/src/ga4gh/cat_vrs/profile_models.py
@@ -0,0 +1,154 @@
+"""Define Pydantic models for GA4GH categorical variation objects.
+
+See the CatVar page on
+the GA4GH website for more information.
+"""
+
+from enum import Enum
+
+from ga4gh.cat_vrs.core_models import (
+ CategoricalVariant,
+ Constraint,
+ CopyChangeConstraint,
+ CopyCountConstraint,
+ DefiningContextConstraint,
+ Relation,
+)
+from pydantic import BaseModel, Field, field_validator
+
+
+class CatVrsType(str, Enum):
+ """Names of Cat-VRS types, as string-valued enum members"""
+
+ PROTEIN_SEQ_CONS = "ProteinSequenceConsequence"
+ CANONICAL_ALLELE = "CanonicalAllele"
+ CATEGORICAL_CNV = "CategoricalCnv"
+ DESCRIBED_VAR = "DescribedVariation"  # NOTE(review): no model of this name is visible in this module
+ NUMBER_COUNT = "NumberCount"  # NOTE(review): no model of this name is visible in this module
+ NUMBER_CHANGE = "NumberChange"  # NOTE(review): no model of this name is visible in this module
+ QUANTITY_VARIANCE = "QuantityVariance"  # NOTE(review): no model of this name is visible in this module
+
+
+class ProteinSequenceConsequenceProperties(BaseModel):
+    """Cat-VRS Constraints found in Protein Sequence Consequences."""
+
+    constraints: list[Constraint] = Field(..., min_length=1)
+
+    @field_validator("constraints")
+    @classmethod
+    def validate_constraints(cls, v: list[Constraint]) -> list[Constraint]:
+        """Require a defining context whose ``relations`` has ``codon_translation`` once.
+
+        :param v: Constraints property to validate
+        :raises ValueError: If no ``DefiningContextConstraint`` in ``v`` qualifies
+        :return: Constraints property
+        """
+        # ``Constraint`` is a RootModel that does not proxy attributes, so unwrap ``.root``.
+        if not any(
+            isinstance(c.root, DefiningContextConstraint)
+            and (c.root.relations or []).count(Relation.CODON_TRANSLATION) == 1
+            for c in v
+        ):
+            err_msg = f"At least one `relations` in `constraints` must contain `{Relation.CODON_TRANSLATION.value}` exactly once."
+            raise ValueError(err_msg)
+        return v
+
+
+class ProteinSequenceConsequence(
+ ProteinSequenceConsequenceProperties, CategoricalVariant
+):
+ """A change that occurs in a protein sequence as a result of genomic changes. Due to
+ the degenerate nature of the genetic code, there are often several genomic changes
+ that can cause a protein sequence consequence.
+ The protein sequence consequence, like a :ref:`CanonicalAllele`, is defined by an
+ ``Allele``
+ that is representative of a collection of congruent Protein Alleles that share the
+ same altered codon(s).
+ """
+
+
+class CanonicalAlleleProperties(BaseModel):
+    """Cat-VRS Constraints found in Canonical Alleles."""
+
+    constraints: list[Constraint] = Field(..., min_length=1)
+
+    @field_validator("constraints")
+    @classmethod
+    def validate_constraints(cls, v: list[Constraint]) -> list[Constraint]:
+        """Require a defining context related by liftover and transcript projection.
+
+        :param v: Constraints property to validate
+        :raises ValueError: If no ``DefiningContextConstraint`` has ``relations``
+            containing both ``Relation.SEQUENCE_LIFTOVER`` and
+            ``Relation.TRANSCRIPT_PROJECTION`` exactly once.
+        :return: Constraints property
+        """
+        # ``Constraint`` is a RootModel that does not proxy attributes, so unwrap ``.root``.
+        if not any(
+            isinstance(c.root, DefiningContextConstraint)
+            and (c.root.relations or []).count(Relation.SEQUENCE_LIFTOVER) == 1
+            and (c.root.relations or []).count(Relation.TRANSCRIPT_PROJECTION) == 1
+            for c in v
+        ):
+            err_msg = f"At least one `relations` in `constraints` must contain {Relation.SEQUENCE_LIFTOVER.value} and {Relation.TRANSCRIPT_PROJECTION.value} exactly once."
+            raise ValueError(err_msg)
+
+        return v
+
+
+class CanonicalAllele(CanonicalAlleleProperties, CategoricalVariant):
+ """A canonical allele is defined by an
+ ``Allele``
+ that is representative of a collection of congruent Alleles, each of which depict
+ the same nucleic acid change on different underlying reference sequences. Congruent
+ representations of an Allele often exist across different genome assemblies and
+ associated cDNA transcript representations.
+ """
+
+
+class CategoricalCnvProperties(BaseModel):
+    """Cat-VRS Constraints found in CategoricalCnvs."""
+
+    constraints: list[Constraint] = Field(..., min_length=1)
+
+    @field_validator("constraints")
+    @classmethod
+    def validate_constraints(cls, v: list[Constraint]) -> list[Constraint]:
+        """Validate constraints property
+
+        :param v: Constraints property to validate
+        :raises ValueError: If no ``DefiningContextConstraint`` with
+            ``Relation.SEQUENCE_LIFTOVER`` in ``relations`` is found in ``constraints``
+            or if neither ``CopyCountConstraint`` nor ``CopyChangeConstraint`` is found
+            in ``constraints``.
+        :return: Constraints property
+        """
+        # Each item is a ``Constraint`` RootModel wrapper; the concrete constraint
+        # lives on ``.root``, so type checks against the wrapper would never match.
+        roots = [constraint.root for constraint in v]
+
+        # A defining context that lists ``sequence_liftover`` is mandatory.
+        defining_context_found = any(
+            isinstance(root, DefiningContextConstraint)
+            and root.relations is not None
+            and Relation.SEQUENCE_LIFTOVER in root.relations
+            for root in roots
+        )
+        if not defining_context_found:
+            err_msg = f"At least one item in `constraints` must be a `DefiningContextConstraint` and contain `{Relation.SEQUENCE_LIFTOVER.value}` in `relations`."
+            raise ValueError(err_msg)
+
+        # At least one copy-number constraint (count or change) is also mandatory.
+        copy_found = any(
+            isinstance(root, CopyChangeConstraint | CopyCountConstraint)
+            for root in roots
+        )
+        if not copy_found:
+            err_msg = "At least one item in `constraints` must be a `CopyCountConstraint` or a `CopyChangeConstraint`."
+            raise ValueError(err_msg)
+
+        return v
+
+
+class CategoricalCnv(CategoricalCnvProperties, CategoricalVariant):
+ """A representation of the constraints for matching knowledge about copy number variants (CNVs)."""
diff --git a/submodules/cat_vrs b/submodules/cat_vrs
new file mode 160000
index 0000000..1458c87
--- /dev/null
+++ b/submodules/cat_vrs
@@ -0,0 +1 @@
+Subproject commit 1458c878dc8da73f26aced068b99781acb4e6867
diff --git a/tests/validation/test_cat_vrs_schema.py b/tests/validation/test_cat_vrs_schema.py
new file mode 100644
index 0000000..830b528
--- /dev/null
+++ b/tests/validation/test_cat_vrs_schema.py
@@ -0,0 +1,123 @@
+"""Test that Cat VRS-Python Pydantic models match corresponding schemas"""
+
+import json
+from enum import Enum
+from pathlib import Path
+
+import pytest
+from ga4gh.cat_vrs import core_models, profile_models
+from pydantic import BaseModel
+
+
+class CatVrsSchema(str, Enum):
+ """Cat-VRS schema groups; each value is matched as a suffix of a schema directory path"""
+
+ CORE = "core"  # paired with ``core_models`` in the parametrized tests
+ PROFILES = "profiles"  # paired with ``profile_models`` in the parametrized tests
+
+
+class CatVrsSchemaMapping(BaseModel):
+ """Model for representing Cat-VRS Schema concrete classes, primitives, and schema"""
+
+ base_classes: set = set()  # titles with neither ``properties`` nor a primitive ``type``
+ concrete_classes: set = set()  # titles whose definition has ``properties``
+ primitives: set = set()  # titles whose ``type`` is array, integer, or string
+ schema: dict = {}  # title -> parsed JSON definition. NOTE(review): name shadows BaseModel.schema - confirm
+
+
+def _update_cat_vrs_schema_mapping(
+    f_path: Path, cat_vrs_schema_mapping: CatVrsSchemaMapping
+) -> None:
+    """Record one JSON Schema file in ``cat_vrs_schema_mapping``, keyed by its ``title``.
+
+    :param f_path: Path to JSON Schema file
+    :param cat_vrs_schema_mapping: Cat-VRS schema mapping to update
+    """
+    definition = json.loads(f_path.read_text())
+    title = definition["title"]
+    cat_vrs_schema_mapping.schema[title] = definition
+
+    # Classify by shape: ``properties`` -> concrete, primitive ``type`` -> primitive, else base.
+    if "properties" in definition:
+        bucket = cat_vrs_schema_mapping.concrete_classes
+    elif definition.get("type") in {"array", "integer", "string"}:
+        bucket = cat_vrs_schema_mapping.primitives
+    else:
+        bucket = cat_vrs_schema_mapping.base_classes
+    bucket.add(title)
+
+
+CAT_VRS_SCHEMA_MAPPING = {schema: CatVrsSchemaMapping() for schema in CatVrsSchema}
+SUBMODULES_DIR = Path(__file__).parents[2] / "submodules" / "cat_vrs" / "schema"
+
+
+# Load the JSON Schema definitions for the core and profiles schemas. A child of
+# the schema directory is used when its path ends with a CatVrsSchema value;
+# any other child is skipped.
+for child in SUBMODULES_DIR.iterdir():
+    child_str = str(child)
+    matching_keys = [key for key in CatVrsSchema if child_str.endswith(key)]
+    if not matching_keys:
+        continue
+
+    mapping_key = matching_keys[0]
+    mapping = CAT_VRS_SCHEMA_MAPPING[mapping_key]
+    for f in (child / "json").glob("*"):
+        _update_cat_vrs_schema_mapping(f, mapping)
+
+
+@pytest.mark.parametrize(
+    ("cat_vrs_schema", "pydantic_models"),
+    [
+        (CatVrsSchema.CORE, core_models),
+        (CatVrsSchema.PROFILES, profile_models),
+    ],
+)
+def test_schema_models_in_pydantic(cat_vrs_schema, pydantic_models):
+    """Check that every schema title names an attribute of the matching Pydantic module"""
+    mapping = CAT_VRS_SCHEMA_MAPPING[cat_vrs_schema]
+    # Base classes, concrete classes, and primitives must all be present.
+    schema_titles = mapping.base_classes | mapping.concrete_classes | mapping.primitives
+    for title in schema_titles:
+        assert getattr(pydantic_models, title, False), title
+
+
+@pytest.mark.parametrize(
+    ("cat_vrs_schema", "pydantic_models"),
+    [
+        (CatVrsSchema.CORE, core_models),
+        (CatVrsSchema.PROFILES, profile_models),
+    ],
+)
+def test_schema_class_fields(cat_vrs_schema, pydantic_models):
+    """Check that each concrete schema class and its Pydantic model agree on field
+    names, required-ness, and descriptions (single vs. double quotes are ignored).
+    """
+    mapping = CAT_VRS_SCHEMA_MAPPING[cat_vrs_schema]
+    for schema_model in mapping.concrete_classes:
+        schema_def = mapping.schema[schema_model]
+        schema_properties = schema_def["properties"]
+        pydantic_model = getattr(pydantic_models, schema_model)
+        assert set(pydantic_model.model_fields) == set(schema_properties), schema_model
+
+        # ``required`` is optional in JSON Schema; a missing key means nothing is required.
+        required_schema_fields = set(schema_def.get("required", []))
+
+        for prop, property_def in schema_properties.items():
+            pydantic_model_field_info = pydantic_model.model_fields[prop]
+            pydantic_field_required = pydantic_model_field_info.is_required()
+            label = f"{pydantic_model}.{prop}"
+
+            # ``type`` is exempt from the required check even when the schema lists it.
+            if prop not in required_schema_fields:
+                assert not pydantic_field_required, label
+            elif prop != "type":
+                assert pydantic_field_required, label
+
+            description = pydantic_model_field_info.description
+            if "description" not in property_def:
+                assert description is None, label
+                continue
+            # Assert first so a missing description fails cleanly, not with AttributeError.
+            assert description is not None, label
+            assert property_def["description"].replace("'", '"') == description.replace("'", '"'), label