From 08e76ef61fdc533d9ce9fe31dfdc33507ff23487 Mon Sep 17 00:00:00 2001 From: Kori Kuzma Date: Wed, 30 Oct 2024 09:17:13 -0400 Subject: [PATCH] feat!: update gks-common / vrs models (#453) close #452 --- src/ga4gh/core/domain_models.py | 15 +- src/ga4gh/core/entity_models.py | 316 +++++++++++++++++++++++----- src/ga4gh/vrs/models.py | 89 +++++--- submodules/vrs | 2 +- tests/validation/test_schemas.py | 148 +++++++++++++ tests/validation/test_vrs_schema.py | 65 ------ 6 files changed, 479 insertions(+), 156 deletions(-) create mode 100644 tests/validation/test_schemas.py delete mode 100644 tests/validation/test_vrs_schema.py diff --git a/src/ga4gh/core/domain_models.py b/src/ga4gh/core/domain_models.py index 853da78b..6a8558fc 100644 --- a/src/ga4gh/core/domain_models.py +++ b/src/ga4gh/core/domain_models.py @@ -1,15 +1,4 @@ -"""GKS Common Library Domain Entity models - -**This module should not be imported directly.** - -Instead, users should use one of the following: - - * `from ga4gh.core import domain_models`, and refer to models with the - abbreviated name, e.g., `domain_models.Gene` (recommended) - - * `import ga4gh.core`, and refer to models using the fully-qualified - module name, e.g., `ga4gh.core.domain_models.Gene` -""" +"""GKS Common Library Domain Entity models""" from enum import Enum from typing import Literal, Union, List @@ -138,5 +127,5 @@ class Gene(DomainEntity): type: Literal["Gene"] = Field( CommonDomainType.GENE.value, - description=f'MUST be "{CommonDomainType.GENE.value}".' + description=f'MUST be "{CommonDomainType.GENE.value}"' ) diff --git a/src/ga4gh/core/entity_models.py b/src/ga4gh/core/entity_models.py index f40e9c9d..1631d266 100644 --- a/src/ga4gh/core/entity_models.py +++ b/src/ga4gh/core/entity_models.py @@ -1,24 +1,29 @@ -"""GKS Common Library Entity models +"""GKS Common Library Data Type and Entity models""" +from __future__ import annotations -**This module should not be imported directly.** - -Instead, users should use one of the following: - - * `from ga4gh.core import entity_models`, and refer to models with the - abbreviated name, e.g., `entity_models.Coding` (recommended) - - * `import ga4gh.core`, and refer to models using the fully-qualified - module name, e.g., `ga4gh.core.entity_models.Coding` -""" from abc import ABC -from typing import Any, Dict, Annotated, Optional, Union, List +import datetime +import logging +from typing import Any, Dict, Annotated, Literal, Optional, Union, List from enum import Enum -from pydantic import BaseModel, Field, RootModel, StringConstraints, ConfigDict +from pydantic import BaseModel, Field, RootModel, StringConstraints, ConfigDict, field_validator from ga4gh.core import GA4GH_IR_REGEXP +class CoreImType(str, Enum): + """Define Core Information Model Types""" + + AGENT = "Agent" + CONTRIBUTION = "Contribution" + DOCUMENT = "Document" + METHOD = "Method" + DATA_SET = "DataSet" + EVIDENCE_LINE = "EvidenceLine" + INFORMATION_ENTITY = "InformationEntity" + STUDY_GROUP = "StudyGroup" + class Relation(str, Enum): """A mapping relation between concepts as defined by the Simple Knowledge @@ -32,26 +37,28 @@ class Relation(str, Enum): RELATED_MATCH = 'relatedMatch' -class Syntax(str, Enum): - """The syntax used to describe the variation. The value should be one of the - supported syntaxes. - """ +class AgentSubtype(str, Enum): + """A specific type of agent the Agent object represents.""" + + PERSON = "person" + ORGANIZATION = "organization" + SOFTWARE = "software" + + +class Direction(str, Enum): + """Define constraints for direction""" + + SUPPORTS = "supports" + NEUTRAL = "neutral" + DISPUTES = "disputes" - HGVS_C = "hgvs.c" - HGVS_P = "hgvs.p" - HGVS_G = "hgvs.g" - HGVS_M = "hgvs.m" - HGVS_N = "hgvs.n" - HGVS_R = "hgvs.r" - HGVS_ISCN = "iscn" - GNOMAD = "gnomad" - SPDI = "spdi" ######################################### # GKS Common Abstract Utility Classes # These do not inherit from Entity and are not typed explicitly ######################################### + class Code(RootModel): """Indicates that the value is taken from a set of controlled strings defined elsewhere. Technically, a code is restricted to a string which has at least one @@ -91,7 +98,6 @@ def ga4gh_serialize(self): ) - class Coding(BaseModel): """A structured representation of a code for a defined concept in a terminology or code system. @@ -105,7 +111,7 @@ class Coding(BaseModel): ..., description="The terminology/code system that defined the code. May be reported as a free-text name (e.g. 'Sequence Ontology'), but it is preferable to provide a uri/url for the system. When the 'code' is reported as a CURIE, the 'system' should be reported as the uri that the CURIE's prefix expands to (e.g. 'http://purl.obofoundry.org/so.owl/' for the Sequence Ontology)." ) - version: Optional[str] = Field( + systemVersion: Optional[str] = Field( None, description='Version of the terminology or code system that provided the code.' ) @@ -134,50 +140,37 @@ class Extension(BaseModel): name: str = Field(..., description='A name for the Extension. Should be indicative of its meaning and/or the type of information it value represents.') value: Optional[Union[float, str, bool, Dict[str, Any], List[Any]]] = Field( - None, description='The value of the Extension - can be any primitive or structured object' + ..., description='The value of the Extension - can be any primitive or structured object' ) description: Optional[str] = Field(None, description="A description of the meaning or utility of the Extension, to explain the type of information it is meant to hold.") -class Expression(BaseModel): - """Representation of a variation by a specified nomenclature or syntax for a - Variation object. Common examples of expressions for the description of molecular - variation include the HGVS and ISCN nomenclatures. - """ - - model_config = ConfigDict(use_enum_values=True) - - syntax: Syntax = Field(..., description="The syntax used to describe the variation. The value should be one of the supported syntaxes.") - value: str = Field(..., description="The expression of the variation in the specified syntax. The value should be a valid expression in the specified syntax.") - syntax_version: Optional[str] = Field(None, description="The version of the syntax used to describe the variation. This is particularly important for HGVS expressions, as the syntax has evolved over time.") - - ######################################### # GKS Common Abstract Entity Class Definitions ######################################### -class Entity(ABC, BaseModel): - """Entity is the root class of the 'gks-common' core information model classes - - those that have identifiers and other general metadata like labels, xrefs, urls, - descriptions, etc. All common classes descend from and inherit its attributes. +class Entity(BaseModel, ABC): + """Anything that exists, has existed, or will exist. + + Abstract base class to be extended by other classes. Do NOT instantiate directly. """ id: Optional[str] = Field( None, - description="The 'logical' identifier of the entity in the system of record, e.g. a UUID. This 'id' is unique within a given system. The identified entity may have a different 'id' in a different system, or may refer to an 'id' for the shared concept in another system (e.g. a CURIE)." + description="The 'logical' identifier of the Entity in the system of record, e.g. a UUID. This 'id' is unique within a given system, but may or may not be globally unique outside the system. It is used within a system to reference an object from another." ) - type: str + type: str = Field(..., description="The name of the class that is instantiated by a data object representing the Entity.") label: Optional[str] = Field( None, - description='A primary label for the entity.' + description='A primary name for the entity.' ) description: Optional[str] = Field( None, - description='A free-text description of the entity.' + description='A free-text description of the Entity.' ) alternativeLabels: Optional[List[str]] = Field(None, description="Alternative name(s) for the Entity.") - extensions: Optional[List[Extension]] = Field(None, description="A list of extensions to the entity. Extensions are not expected to be natively understood, but may be used for pre-negotiated exchange of message attributes between systems.") + extensions: Optional[List[Extension]] = Field(None, description="A list of extensions to the Entity, that allow for capture of information not directly supported by elements defined in the model.") class DomainEntity(Entity, ABC): @@ -189,3 +182,224 @@ class DomainEntity(Entity, ABC): """ mappings: Optional[List[ConceptMapping]] = Field(None, description="A list of mappings to concepts in terminologies or code systems. Each mapping should include a coding and a relation.") + + +class Agent(Entity): + """An autonomous actor (person, organization, or software agent) that bears some + form of responsibility for an activity taking place, for the existence of an entity, + or for another agent's activity. + """ + + type: Literal["Agent"] = Field(CoreImType.AGENT.value, description=f"MUST be '{CoreImType.AGENT.value}'.") + name: Optional[str] = Field(None, description="The given name of the Agent.") + subtype: Optional[AgentSubtype] = Field(None, description="A specific type of agent the Agent object represents. Must be one of {person, organization, software}.") + + +class ActivityBase(Entity, ABC): + """Internal base class that holds shared fields for Activity model. + + This class should not be used directly. + """ + + subtype: Optional[Coding] = Field(None, description="A specific type of activity the Activity instance represents.") + date: Optional[str] = Field(None, description="The date that the Activity was completed.") + specifiedBy: Optional[List[Method]] = Field(None, description="A method that was followed in performing an Activity, that describes how it was executed.") + + @field_validator("date") + @classmethod + def date_format(cls, v: Optional[str]) -> Optional[str]: + """Check that date is YYYY-MM-DD format""" + if v: + valid_format = "%Y-%m-%d" + + try: + datetime.datetime.strptime(v, valid_format).replace( + tzinfo=datetime.timezone.utc + ).strftime(valid_format) + except ValueError: + logging.warning("`date` SHOULD be formatted as a date string in ISO format 'YYYY-MM-DD'") + return v + + +class Activity(ActivityBase): + """An action or set of actions performed by an agent, that occurs over a period of + time. Activities may use, generate, modify, move, or destroy one or more entities. + """ + + performedBy: Optional[List[Agent] ]= Field(None, description="An Agent who contributed to executing the Activity.") + + +class Contribution(ActivityBase): + """An action taken by an agent in contributing to the creation, modification, + assessment, or deprecation of a particular entity (e.g. a Statement, EvidenceLine, + DataSet, Publication, etc.) + """ + + type: Literal["Contribution"] = Field(CoreImType.CONTRIBUTION.value, description=f"MUST be '{CoreImType.CONTRIBUTION.value}'.") + contributor: Optional[List[Agent]] = Field(None, description="The agent that made the contribution.", min_length=1, max_length=1) + activityType: Optional[Coding] = Field(None, description="The specific type of activity performed or role played by an agent in making the contribution (e.g. for a publication, agents may contribute as a primary author, editor, figure designer, data generator, etc. . Values of this property may be framed as activities or as contribution roles (e.g. using terms from the Contribution Role Ontology (CRO)).") + + +class InformationEntityBase(Entity, ABC): + """Internal base class that holds shared fields for InformationEntity model. + + This class should not be used directly. + """ + + type: Literal["InformationEntity"] = Field(CoreImType.INFORMATION_ENTITY.value, description=f"MUST be {CoreImType.INFORMATION_ENTITY.value}.") + specifiedBy: Optional[Union[Method, IRI]] = Field(None, description="A specification that describes all or part of the process that led to creation of the Information Entity") + contributions: Optional[List[Contribution] ]= Field(None, description="Specific actions taken by an Agent toward the creation, modification, validation, or deprecation of an Information Entity.") + reportedIn: Optional[List[Union[Document, IRI]]] = Field(None, description="A document in which the the Information Entity is reported.") + dateAuthored: Optional[str] = Field(None, description="Indicates when the information content expressed in the Information Entity was generated.") + recordMetadata: Optional[RecordMetadata] = Field(None, description="Provenance metadata about a specific concrete record of information as encoded/serialized in a particular data set or object (as opposed to provenance about the abstract information content the encoding carries).") + + +class InformationEntity(InformationEntityBase): + """An abstract (non-physical) entity that is about something - representing the + underlying 'information content' conveyed by physical or digital information + artifacts like books, web pages, data tables, or photographs. + """ + + derivedFrom: Optional[List[InformationEntity]] = Field(None, description="Another Information Entity from which this Information Entity is derived, in whole or in part.") + + +class Document(InformationEntity): + """A collection of information, usually in a text-based or graphic human-readable + form, intended to be read and understood together as a whole. + """ + + type: Literal["Document"] = Field(CoreImType.DOCUMENT.value, description=f"Must be '{CoreImType.DOCUMENT.value}'") + subtype: Optional[Coding] = Field( + None, description="A specific type of document that a Document instance represents (e.g. 'publication', 'patent', 'pathology report')" + ) + title: Optional[str] = Field(None, description="The official title given to the document by its authors.") + urls: Optional[List[Annotated[str, StringConstraints(pattern=r"^(https?|s?ftp)://")]]] = Field( + None, description="One or more URLs from which the content of the Document can be retrieved." + ) + doi: Optional[Annotated[str, StringConstraints(pattern=r"^10\.(\d+)(\.\d+)*\/[\w\-\.]+")]] = Field( + None, + description="A [Digital Object Identifier](https://www.doi.org/the-identifier/what-is-a-doi/) for the document.", + ) + pmid: Optional[int] = Field( + None, + description="A [PubMed unique identifier](https://en.wikipedia.org/wiki/PubMed#PubMed_identifier) for the document.", + ) + + +class Method(InformationEntity): + """A set of instructions that specify how to achieve some objective.""" + + type: Literal["Method"] = Field(CoreImType.METHOD.value, description=f"MUST be '{CoreImType.METHOD.value}'.") + subtype: Optional[Coding] = Field( + None, + description="A specific type of method that a Method instance represents (e.g. 'Variant Interpretation Guideline', or 'Experimental Protocol').", + ) + license: Optional[str] = Field(None, description="A specific license that dictates legal permissions for how a method can be used (by whom, where, for what purposes, with what additional requirements, etc.).") + + +class RecordMetadata(BaseModel): + """A reusable structure that encapsulates provenance metadata about a serialized + data record or object in a particular dataset (as opposed to provenance about the + real world entity this + record or object represents). + """ + + recordIdentifier: Optional[str] = Field(None, description="The identifier of the data record or object described in this RecordMetadata object.") + recordVersion: Optional[str] = Field(None, description="The version number of the record-level artifact the object describes.") + derivedFrom: Optional[str] = Field(None, description="Another data record from which the record described here was derived, through a data ingest and/or transformation process. Value should be a string representing the identifier of the source record.") + dateRecordCreated: Optional[str] = Field(None, description="The date the record was initially created.") + contributions: Optional[List[Contribution]] = Field(None, description="Describes specific contributions made by an human or software agent to the creation, modification, or administrative management of a data record or object.") + + +class DataSet(InformationEntity): + """A collection of related data items or records that are organized together in a + common format or structure, to enable their computational manipulation as a unit. + """ + + type: Literal["DataSet"] = Field(CoreImType.DATA_SET.value, description=f"MUST be '{CoreImType.DATA_SET.value}'.") + subtype: Optional[Coding] = Field(None, description="A specific type of data set the DataSet instance represents (e.g. a 'clinical data set', a 'sequencing data set', a 'gene expression data set', a 'genome annotation data set')") + releaseDate: Optional[str] = Field(None, description="Indicates when a version of a Data Set was formally released.") + version: Optional[str] = Field(None, description="The version of the Data Set, as assigned by its creator.") + license: Optional[str] = Field(None, description="A specific license that dictates legal permissions for how a data set can be used (by whom, where, for what purposes, with what additional requirements, etc.)") + + +class EvidenceLine(InformationEntity): + """An independent, evidence-based argument that may support or refute the validity + of a specific proposition. The strength and direction of this argument is based on + an interpretation of one or more pieces of information as evidence for or against + the target proposition. + """ + + type: Literal["EvidenceLine"] = Field(CoreImType.EVIDENCE_LINE.value, description=f"Must be '{CoreImType.EVIDENCE_LINE.value}'") + hasEvidenceItems: Optional[List[InformationEntity]] = Field(None, description="An individual piece of information that was evaluated as evidence in building the argument represented by an Evidence Line.") + directionOfEvidenceProvided: Optional[Direction] = Field(None, description="The direction of support that the Evidence Line is determined to provide toward its target Proposition (supports, disputes, neutral)") + strengthOfEvidenceProvided: Optional[Union[Coding, IRI]] = Field(None, description="The strength of support that an Evidence Line is determined to provide for or against its target Proposition, evaluated relative to the direction indicated by the directionOfEvidenceProvided value.") + scoreOfEvidenceProvided: Optional[float] = Field(None, description="A quantitative score indicating the strength of support that an Evidence Line is determined to provide for or against its target Proposition, evaluated relative to the direction indicated by the directionOfEvidenceProvided value.") + + +class StatementBase(InformationEntity, ABC): + """Internal base class that holds shared fields for Statement model.""" + + predicate: str = Field(..., description="The relationship declared to hold between the subject and the object of the Statement.") + direction: Optional[Direction] = Field(None, description="A term indicating whether the Statement supports, disputes, or remains neutral w.r.t. the validity of the Proposition it evaluates.") + strength: Optional[Union[Coding, IRI]]= Field(None, description="A term used to report the strength of a Proposition's assessment in the direction indicated (i.e. how strongly supported or disputed the Proposition is believed to be). Implementers may choose to frame a strength assessment in terms of how *confident* an agent is that the Proposition is true or false, or in terms of the *strength of all evidence* they believe supports or disputes it.") + score: Optional[float] = Field(None, description="A quantitative score that indicates the strength of a Proposition's assessment in the direction indicated (i.e. how strongly supported or disputed the Proposition is believed to be). Depending on its implementation, a score may reflect how *confident* that agent is that the Proposition is true or false, or the *strength of evidence* they believe supports or disputes it.") + statementText: Optional[str] = Field(None, description="A natural-language expression of what a Statement asserts to be true.") + classification: Optional[Union[Coding, IRI]] = Field(None, description="A single term or phrase summarizing the outcome of direction and strength assessments of a Statement's proposition, in terms of a classification of its subject.") + hasEvidenceLines: Optional[List[EvidenceLine]] = Field(None, description="An evidence-based argument that supports or disputes the validity of the proposition that a Statement assesses or puts forth as true. The strength and direction of this argument (whether it supports or disputes the proposition, and how strongly) is based on an interpretation of one or more pieces of information as evidence (i.e. 'Evidence Items).") + + +class Statement(StatementBase): + """A claim of purported truth as made by a particular agent, on a particular + occasion. Statements may be used to simply put forth a possible fact (i.e. a + 'proposition') as true, or to provide a more nuanced assessment of the level of + confidence or evidence supporting a particular proposition. + """ + + subject: Dict = Field(..., description="The Entity about which the Statement is made.") + object: Dict = Field(..., description="An Entity or concept that is related to the subject of a Statement via its predicate.") + + +class StudyGroup(Entity): + """A collection of individuals or specimens from the same taxonomic class, selected + for analysis in a scientific study based on their exhibiting one or more common + characteristics (e.g. species, race, age, gender, disease state, income). May be + referred to as a 'cohort' or 'population' in specific research settings. + """ + + type: Literal["StudyGroup"] = Field(CoreImType.STUDY_GROUP.value, description=f'Must be "{CoreImType.STUDY_GROUP.value}"') + memberCount: Optional[int] = Field(None, description="The total number of individual members in the StudyGroup.") + isSubsetOf: Optional[List[StudyGroup] ]= Field(None, description="A larger StudyGroup of which this StudyGroup represents a subset.") + characteristics: Optional[List[Characteristic]] = Field(None, description="A feature or role shared by all members of the StudyGroup, representing a criterion for membership in the group.") + + +class Characteristic(BaseModel): + """An object holding a name-value pair used to describe a trait or role of an + individual member of a StudyGroup. + """ + + name: str = Field(..., description="The type of the trait or role described by the trait (e.g. 'ethnicity', 'sex', 'age', 'disease status').") + value: str = Field(..., description="The specific value(s) that the indicated traitor role holds in all population members (e.g. 'east asian', 'female', 'adolescent', 'cancer').") + valueOperator: Optional[bool] = Field(None, description="An operation that defines how to logically interpret a set of more than one Characteristic values ('AND', 'OR', 'NOT')") + + +class StudyResultBase(InformationEntityBase, ABC): + """Internal base class that holds shared fields for StudyResult model.""" + + sourceDataSet: Optional[List[DataSet]] = Field(None, description="A larger DataSet from which the content of the StudyResult was derived.", max_length=1) + ancillaryResults: Optional[Dict] = None + qualityMeasures: Optional[Dict] = None + + +class StudyResult(InformationEntityBase, ABC): + """A collection of data items from a single study that pertain to a particular + subject or experimental unit in the study, along with optional provenance + information describing how these data items were generated. + """ + + focus: Optional[Union[DomainEntity, Coding, IRI]] = Field(None, description="The specific subject or experimental unit in a Study that data in the StudyResult object is about - e.g. a particular variant in a population allele frequency dataset like ExAC or gnomAD.") + sourceDataSet: Optional[List[DataSet]] = Field(None, description="A larger DataSet from which the content of the StudyResult was derived.", max_length=1) + componentResult: Optional[List[StudyResult]] = Field(None, description="Another StudyResult comprised of data items about the same focus as its parent Result, but based on a more narrowly scoped analysis of the foundational data (e.g. an analysis based on data about a subset of the parent Results full study population) .") + studyGroup: Optional[StudyGroup] = Field(None, description="A description of a specific group or population of subjects interrogated in the ResearchStudy that produced the data captured in the StudyResult.") + ancillaryResults: Optional[Dict] = None + qualityMeasures: Optional[Dict] = None diff --git a/src/ga4gh/vrs/models.py b/src/ga4gh/vrs/models.py index fbfc02ee..5e304a3a 100644 --- a/src/ga4gh/vrs/models.py +++ b/src/ga4gh/vrs/models.py @@ -32,7 +32,7 @@ from ga4gh.core.pydantic import ( getattr_in ) -from ga4gh.core.entity_models import IRI, Expression, Entity +from ga4gh.core.entity_models import IRI, Entity def flatten(vals): @@ -151,6 +151,13 @@ class VrsType(str, Enum): CN_CHANGE = "CopyNumberChange" +class Orientation(str, Enum): + """The orientation of the molecular variation component.""" + + FORWARD = "forward" + REVERSE_COMPLEMENT = "reverse_complement" + + class ResidueAlphabet(str, Enum): """The interpretation of the character codes referred to by the refget accession, where "aa" specifies an amino acid character set, and "na" specifies a nucleic acid @@ -174,6 +181,22 @@ class CopyChange(str, Enum): EFO_0030072 = 'EFO:0030072' +class Syntax(str, Enum): + """The syntax used to describe the variation. The value should be one of the + supported syntaxes. + """ + + HGVS_C = "hgvs.c" + HGVS_P = "hgvs.p" + HGVS_G = "hgvs.g" + HGVS_M = "hgvs.m" + HGVS_N = "hgvs.n" + HGVS_R = "hgvs.r" + HGVS_ISCN = "iscn" + GNOMAD = "gnomad" + SPDI = "spdi" + + def _recurse_ga4gh_serialize(obj): if isinstance(obj, _Ga4ghIdentifiableObject): return obj.get_or_create_digest() @@ -314,6 +337,18 @@ def get_or_create_digest(self, recompute=False) -> str: class ga4gh(_ValueObject.ga4gh): prefix: str +class Expression(BaseModel): + """Representation of a variation by a specified nomenclature or syntax for a + Variation object. Common examples of expressions for the description of molecular + variation include the HGVS and ISCN nomenclatures. + """ + + model_config = ConfigDict(use_enum_values=True) + + syntax: Syntax = Field(..., description="The syntax used to describe the variation. The value should be one of the supported syntaxes.") + value: str = Field(..., description="The expression of the variation in the specified syntax. The value should be a valid expression in the specified syntax.") + syntax_version: Optional[str] = Field(None, description="The version of the syntax used to describe the variation. This is particularly important for HGVS expressions, as the syntax has evolved over time.") + ######################################### # vrs numerics, comparators, and ranges @@ -392,7 +427,7 @@ class LengthExpression(_ValueObject): type: Literal["LengthExpression"] = Field( VrsType.LEN_EXPR.value, description=f'MUST be "{VrsType.LEN_EXPR.value}"' ) - length: Optional[Union[Range, int]] = None + length: Optional[Union[Range, int]] = Field(None, description="The length of the sequence.") class ga4gh(_ValueObject.ga4gh): keys = [ @@ -408,13 +443,13 @@ class ReferenceLengthExpression(_ValueObject): VrsType.REF_LEN_EXPR.value, description=f'MUST be "{VrsType.REF_LEN_EXPR.value}"' ) length: Union[Range, int] = Field( - ..., description='The number of residues of the expressed sequence.' + ..., description='The number of residues in the expressed sequence.' ) sequence: Optional[SequenceString] = Field( - None, description='the `Sequence` encoded by the Reference Length Expression.' + None, description='the literal Sequence encoded by the Reference Length Expression.' ) repeatSubunitLength: int = Field( - ..., description='The number of residues of the repeat subunit.' + ..., description='The number of residues in the repeat subunit.' ) class ga4gh(_ValueObject.ga4gh): @@ -452,9 +487,9 @@ class SequenceReference(_ValueObject): type: Literal["SequenceReference"] = Field(VrsType.SEQ_REF.value, description=f'MUST be "{VrsType.SEQ_REF.value}"') refgetAccession: Annotated[str, StringConstraints(pattern=r'^SQ.[0-9A-Za-z_\-]{32}$')] = Field( ..., - description='A `GA4GH RefGet ` identifier for the referenced sequence, using the sha512t24u digest.', + description='A [GA4GH RefGet](http://samtools.github.io/hts-specs/refget.html) identifier for the referenced sequence, using the sha512t24u digest.', ) - residueAlphabet: Optional[ResidueAlphabet] = Field(None, description="The interpretation of the character codes referred to by the refget accession, where 'aa' specifies an amino acid character set, and 'na' specifies a nucleic acid character set.") + residueAlphabet: Optional[ResidueAlphabet] = Field(None, description='The interpretation of the character codes referred to by the refget accession, where "aa" specifies an amino acid character set, and "na" specifies a nucleic acid character set.') circular: Optional[bool] = Field(None, description="A boolean indicating whether the molecule represented by the sequence is circular (true) or linear (false).") class ga4gh(_ValueObject.ga4gh): @@ -469,15 +504,15 @@ class SequenceLocation(_Ga4ghIdentifiableObject): type: Literal["SequenceLocation"] = Field(VrsType.SEQ_LOC.value, description=f'MUST be "{VrsType.SEQ_LOC.value}"') sequenceReference: Optional[Union[IRI, SequenceReference]] = Field( - None, description='A reference to a `Sequence` on which the location is defined.' + None, description='A reference to a Sequence on which the location is defined.' ) start: Optional[Union[Range, int]] = Field( None, - description='The start coordinate or range of the SequenceLocation. The minimum value of this coordinate or range is 0.', + description='The start coordinate or range of the SequenceLocation. The minimum value of this coordinate or range is 0. MUST represent a coordinate or range less than or equal to the value of `end`.', ) end: Optional[Union[Range, int]] = Field( None, - description='The end coordinate or range of the SequenceLocation. The minimum value of this coordinate or range is 0.', + description='The end coordinate or range of the SequenceLocation. The minimum value of this coordinate or range is 0. MUST represent a coordinate or range greater than or equal to the value of `start`.', ) sequence: Optional[SequenceString] = Field(None, description="The literal sequence encoded by the `sequenceReference` at these coordinates.") @@ -614,7 +649,7 @@ class CisPhasedBlock(_VariationBase): type: Literal["CisPhasedBlock"] = Field(VrsType.CIS_PHASED_BLOCK.value, description=f'MUST be "{VrsType.CIS_PHASED_BLOCK.value}"') members: List[Union[Allele, IRI]] = Field( ..., - description='A list of `Alleles` that are found in-cis on a shared molecule.', + description='A list of Alleles that are found in-cis on a shared molecule.', min_length=2, ) sequenceReference: Optional[SequenceReference] = Field(None, description="An optional Sequence Reference on which all of the in-cis Alleles are found. When defined, this may be used to implicitly define the `sequenceReference` attribute for each of the CisPhasedBlock member Alleles.") @@ -643,7 +678,7 @@ class Adjacency(_VariationBase): potentially with an intervening linker sequence. """ - type: Literal["Adjacency"] = Field(VrsType.ADJACENCY.value, description=f'MUST be "{VrsType.ADJACENCY.value}"') + type: Literal["Adjacency"] = Field(VrsType.ADJACENCY.value, description=f'MUST be "{VrsType.ADJACENCY.value}".') adjoinedSequences: List[Union[IRI, SequenceLocation]] = Field( ..., description="The terminal sequence or pair of adjoined sequences that defines in the adjacency.", @@ -654,7 +689,7 @@ class Adjacency(_VariationBase): None, description="The sequence found between adjoined sequences." ) - homology: Optional[bool] = Field(None, description="A flag indicating if coordinate ambiguity in the adjoined sequences is from sequence homology (true) or other uncertainty (false).") + homology: Optional[bool] = Field(None, description="A flag indicating if coordinate ambiguity in the adjoined sequences is from sequence homology (true) or other uncertainty, such as instrument ambiguity (false).") class ga4gh(_Ga4ghIdentifiableObject.ga4gh): prefix = 'AJ' @@ -671,7 +706,7 @@ class Terminus(_VariationBase): is not allowed and it removes the unnecessary array structure. """ - type: Literal["Terminus"] = Field(VrsType.TERMINUS.value, description=f'MUST be "{VrsType.TERMINUS.value}"') + type: Literal["Terminus"] = Field(VrsType.TERMINUS.value, description=f'MUST be "{VrsType.TERMINUS.value}".') location: Union[IRI, SequenceLocation] = Field(..., description="The location of the terminus.") class ga4gh(_Ga4ghIdentifiableObject.ga4gh): @@ -685,17 +720,19 @@ class TraversalBlock(_ValueObject): """A component used to describe the orientation of a molecular variation within a DerivativeMolecule.""" + model_config = ConfigDict(use_enum_values=True) + type: Literal["TraversalBlock"] = Field( - VrsType.TRAVERSAL_BLOCK.value, description=f'MUST be "{VrsType.TRAVERSAL_BLOCK.value}"' + VrsType.TRAVERSAL_BLOCK.value, description=f'MUST be "{VrsType.TRAVERSAL_BLOCK.value}".' ) - orientation: Literal["forward", "reverse_complement"] = Field( - ..., - description='The orientation of the traversal block, either forward or reverse_complement.' + orientation: Optional[Orientation] = Field( + None, + description='The orientation of the molecular variation component.' ) - component: Union[IRI, Adjacency, Allele, Terminus, CisPhasedBlock] = Field( - ..., - description="The component that make up the derivative molecule." + component: Optional[Union[Allele, CisPhasedBlock, Adjacency, Terminus]] = Field( + None, + description="The unoriented molecular variation component." ) class ga4gh(_ValueObject.ga4gh): @@ -710,13 +747,13 @@ class DerivativeMolecule(_VariationBase): molecule composed from multiple sequence components. """ - type: Literal["DerivativeMolecule"] = Field(VrsType.DERIVATIVE_MOL.value, description=f'MUST be "{VrsType.DERIVATIVE_MOL.value}"') - components: List[TraversalBlock] = Field( + type: Literal["DerivativeMolecule"] = Field(VrsType.DERIVATIVE_MOL.value, description=f'MUST be "{VrsType.DERIVATIVE_MOL.value}".') + components: List[Union[IRI, TraversalBlock]] = Field( ..., - description="The traversal block components that make up the derivative molecule.", + description="The molecular components that constitute the derivative molecule.", min_length=2 ) - circular: Optional[bool] = Field(None, description="A flag indicating if the derivative molecule is circular (true) or linear (false).") + circular: Optional[bool] = Field(None, description="A boolean indicating whether the molecule represented by the sequence is circular (true) or linear (false).") class ga4gh(_Ga4ghIdentifiableObject.ga4gh): prefix = "DM" @@ -769,7 +806,7 @@ class CopyNumberChange(_CopyNumber): type: Literal["CopyNumberChange"] = Field(VrsType.CN_CHANGE.value, description=f'MUST be "{VrsType.CN_CHANGE.value}"') copyChange: CopyChange = Field( ..., - description='MUST be one of "EFO:0030069" (complete genomic loss), "EFO:0020073" (high-level loss), "EFO:0030068" (low-level loss), "EFO:0030067" (loss), "EFO:0030064" (regional base ploidy), "EFO:0030070" (gain), "EFO:0030071" (low-level gain), "EFO:0030072" (high-level gain).', + description='MUST be a Coding representing one of "EFO:0030069" (complete genomic loss), "EFO:0020073" (high-level loss), "EFO:0030068" (low-level loss), "EFO:0030067" (loss), "EFO:0030064" (regional base ploidy), "EFO:0030070" (gain), "EFO:0030071" (low-level gain), "EFO:0030072" (high-level gain).', ) class ga4gh(_Ga4ghIdentifiableObject.ga4gh): diff --git a/submodules/vrs b/submodules/vrs index a7d75410..cf968d24 160000 --- a/submodules/vrs +++ b/submodules/vrs @@ -1 +1 @@ -Subproject commit a7d75410ff774aee76514f47438fc1a13165ce6d +Subproject commit cf968d24a37cefca5a2f363f0e2f36741cd12ad5 diff --git a/tests/validation/test_schemas.py b/tests/validation/test_schemas.py new file mode 100644 index 00000000..ee0b3783 --- /dev/null +++ b/tests/validation/test_schemas.py @@ -0,0 +1,148 @@ +"""Test that VRS-Python Pydantic models match VRS and GKS-Common schemas""" + +from enum import Enum +import json +from pathlib import Path + +import pytest +from pydantic import BaseModel + +from ga4gh.core import entity_models, domain_models +from ga4gh.vrs import models as vrs_models + + +class GKSSchema(str, Enum): + """Enum for GKS schema""" + + VRS = "vrs" + CORE_IM = "core-im" + DOMAIN = "domain-entities" + + +class GKSSchemaMapping(BaseModel): + """Model for representing GKS Schema concrete classes, primitives, and schema""" + + base_classes: set = set() + concrete_classes: set = set() + primitives: set = set() + schema: dict = {} + + +def _update_gks_schema_mapping( + f_path: Path, gks_schema_mapping: GKSSchemaMapping +) -> None: + """Update ``gks_schema_mapping`` properties + + :param f_path: Path to JSON Schema file + :param gks_schema_mapping: GKS schema mapping to update + """ + with f_path.open() as rf: + cls_def = json.load(rf) + + spec_class = cls_def["title"] + gks_schema_mapping.schema[spec_class] = cls_def + + if "properties" in cls_def: + gks_schema_mapping.concrete_classes.add(spec_class) + elif cls_def.get("type") in {"array", "integer", "string"}: + gks_schema_mapping.primitives.add(spec_class) + else: + gks_schema_mapping.base_classes.add(spec_class) + + +GKS_SCHEMA_MAPPING = {gks: GKSSchemaMapping() for gks in GKSSchema} +SUBMODULES_DIR = Path(__file__).parents[2] / "submodules" / "vrs" + + +# Get vrs classes +vrs_mapping = GKS_SCHEMA_MAPPING[GKSSchema.VRS] +for f in (SUBMODULES_DIR / "schema" / "vrs" / "json").glob("*"): + _update_gks_schema_mapping(f, vrs_mapping) + + +# Get core-im + domain classes +for child in (SUBMODULES_DIR / "submodules" / "gks-common" / "schema").iterdir(): + mapping_key = ( + GKSSchema.DOMAIN if str(child).endswith(GKSSchema.DOMAIN) else GKSSchema.CORE_IM + ) + mapping = GKS_SCHEMA_MAPPING[mapping_key] + for f in (child / "json").glob("*"): + _update_gks_schema_mapping(f, mapping) + + +@pytest.mark.parametrize( + ("gks_schema", "pydantic_models"), + [ + (GKSSchema.VRS, vrs_models), + (GKSSchema.CORE_IM, entity_models), + (GKSSchema.DOMAIN, domain_models), + ], +) +def test_schema_models_in_pydantic(gks_schema, pydantic_models): + """Ensure that each schema model has corresponding Pydantic model""" + mapping = GKS_SCHEMA_MAPPING[gks_schema] + for schema_model in ( + mapping.base_classes | mapping.concrete_classes | mapping.primitives + ): + assert getattr(pydantic_models, schema_model, False), schema_model + + +@pytest.mark.parametrize( + ("gks_schema", "pydantic_models"), + [ + (GKSSchema.VRS, vrs_models), + (GKSSchema.CORE_IM, entity_models), + (GKSSchema.DOMAIN, domain_models), + ], +) +def test_schema_class_fields(gks_schema, pydantic_models): + """Check that each schema model properties exist and are required in corresponding + Pydantic model, and validate required properties + """ + mapping = GKS_SCHEMA_MAPPING[gks_schema] + for schema_model in mapping.concrete_classes: + schema_properties = mapping.schema[schema_model]["properties"] + pydantic_model = getattr(pydantic_models, schema_model) + assert set(pydantic_model.model_fields) == set(schema_properties), schema_model + + required_schema_fields = set(mapping.schema[schema_model]["required"]) + + for prop, property_def in schema_properties.items(): + pydantic_model_field_info = pydantic_model.model_fields[prop] + pydantic_field_required = pydantic_model_field_info.is_required() + + if prop in required_schema_fields: + if prop != "type": + assert pydantic_field_required, f"{pydantic_model}.{prop}" + else: + assert not pydantic_field_required, f"{pydantic_model}.{prop}" + + if "description" in property_def: + assert property_def["description"].replace("'", "\"") == pydantic_model_field_info.description.replace("'", "\""), f"{pydantic_model}.{prop}" + else: + assert pydantic_model_field_info.description is None, f"{pydantic_model}.{prop}" + + +def test_ga4gh_keys(): + """Ensure ga4ghDigest keys defined in schema model exist in corresponding Pydantic model""" + vrs_mapping = GKS_SCHEMA_MAPPING[GKSSchema.VRS] + for vrs_class in vrs_mapping.concrete_classes: + if ( + vrs_mapping.schema[vrs_class].get("ga4ghDigest", {}).get("keys", None) + is None + ): + continue + + pydantic_model = getattr(vrs_models, vrs_class) + + try: + pydantic_model_digest_keys = pydantic_model.ga4gh.keys + except AttributeError as e: + raise AttributeError(vrs_class) from e + + assert set(pydantic_model_digest_keys) == set( + vrs_mapping.schema[vrs_class]["ga4ghDigest"]["keys"] + ), vrs_class + assert pydantic_model_digest_keys == sorted( + pydantic_model.ga4gh.keys + ), vrs_class diff --git a/tests/validation/test_vrs_schema.py b/tests/validation/test_vrs_schema.py deleted file mode 100644 index a42853a4..00000000 --- a/tests/validation/test_vrs_schema.py +++ /dev/null @@ -1,65 +0,0 @@ -"""test that VRS Python model structures match VRS Schema -""" -import json -from pathlib import Path - -from ga4gh.vrs import models - -ROOT_DIR = Path(__file__).parents[2] -VRS_SCHEMA_DIR = ROOT_DIR / 'submodules' / 'vrs' / 'schema' / 'vrs' / 'json' -VRS_SCHEMA = {} - -VRS_CONCRETE_CLASSES = set() -VRS_PRIMITIVES = set() - -for f in VRS_SCHEMA_DIR.glob("*"): - with open(f, "r") as rf: - cls_def = json.load(rf) - - vrs_class = cls_def["title"] - VRS_SCHEMA[vrs_class] = cls_def - if "properties" in cls_def: - VRS_CONCRETE_CLASSES.add(vrs_class) - elif cls_def.get("type") in {"array", "int", "str"}: - VRS_PRIMITIVES.add(vrs_class) - - -NOT_IMPLEMENTED = ['Adjacency', 'Haplotype'] # Use this to skip testing of not-implemented classes - # TODO: Remove this once 2.0 models at beta - - -def test_schema_models_exist(): - """test that VRS Python covers the models defined by VRS - """ - for vrs_class in VRS_CONCRETE_CLASSES | VRS_PRIMITIVES: - if vrs_class in NOT_IMPLEMENTED: - continue - assert getattr(models, vrs_class, False) - - -def test_schema_class_fields_are_valid(): - """test that VRS Python model fields match the VRS specification - """ - for vrs_class in VRS_CONCRETE_CLASSES: - if vrs_class in NOT_IMPLEMENTED: - continue - schema_fields = set(VRS_SCHEMA[vrs_class]['properties']) - pydantic_model = getattr(models, vrs_class) - assert set(pydantic_model.model_fields) == schema_fields, vrs_class - - -def test_model_keys_are_valid(): - """test that digest keys on Value Objects are valid and sorted - """ - for vrs_class in VRS_CONCRETE_CLASSES: - if vrs_class in NOT_IMPLEMENTED: - continue - if VRS_SCHEMA[vrs_class].get('ga4ghDigest', {}).get('keys', None) is None: - continue - pydantic_model = getattr(models, vrs_class) - try: - pydantic_model_digest_keys = pydantic_model.ga4gh.keys - except AttributeError: - raise AttributeError(vrs_class) - assert set(pydantic_model_digest_keys) == set(VRS_SCHEMA[vrs_class]['ga4ghDigest']['keys']), vrs_class - assert pydantic_model_digest_keys == sorted(pydantic_model.ga4gh.keys), vrs_class