Skip to content

Commit

Permalink
feat: add oncologic disease parameter from mondo terms (#200)
Browse files Browse the repository at this point in the history
  • Loading branch information
jsstevenson authored Oct 18, 2024
1 parent b43a289 commit 8c4022a
Show file tree
Hide file tree
Showing 16 changed files with 75 additions and 20 deletions.
18 changes: 13 additions & 5 deletions src/disease/database/postgresql.py
Original file line number Diff line number Diff line change
Expand Up @@ -318,6 +318,7 @@ def _format_source_record(self, source_row: tuple) -> dict:
"src_name": source_row[5],
"merge_ref": source_row[6],
"pediatric_disease": source_row[7],
"oncologic_disease": source_row[8],
"item_type": RecordType.IDENTITY.value,
}
return {k: v for k, v in disease_record.items() if v}
Expand Down Expand Up @@ -354,6 +355,7 @@ def _format_merged_record(self, merged_row: tuple) -> dict:
"associated_with": merged_row[3],
"xrefs": merged_row[4],
"pediatric_disease": merged_row[5],
"oncologic_disease": merged_row[6],
"item_type": RecordType.MERGER.value,
}
return {k: v for k, v in merged_record.items() if v}
Expand Down Expand Up @@ -532,8 +534,8 @@ def add_source_metadata(self, src_name: SourceName, meta: SourceMeta) -> None:
self.conn.commit()

_add_record_query = b"""
INSERT INTO disease_concepts (concept_id, source, pediatric_disease)
VALUES (%s, %s, %s);
INSERT INTO disease_concepts (concept_id, source, pediatric_disease, oncologic_disease)
VALUES (%s, %s, %s, %s);
"""
_insert_label_query = (
b"INSERT INTO disease_labels (label, concept_id) VALUES (%s, %s)"
Expand All @@ -555,7 +557,12 @@ def add_record(self, record: dict, src_name: SourceName) -> None:
try:
cur.execute(
self._add_record_query,
[concept_id, src_name.value, record.get("pediatric_disease")],
[
concept_id,
src_name.value,
record.get("pediatric_disease"),
record.get("oncologic_disease"),
],
)
cur.execute(self._insert_label_query, [record["label"], concept_id])
for a in record.get("aliases", []):
Expand All @@ -571,9 +578,9 @@ def add_record(self, record: dict, src_name: SourceName) -> None:

_add_merged_record_query = b"""
INSERT INTO disease_merged (
concept_id, label, aliases, associated_with, xrefs, pediatric_disease
concept_id, label, aliases, associated_with, xrefs, pediatric_disease, oncologic_disease
)
VALUES (%s, %s, %s, %s, %s, %s);
VALUES (%s, %s, %s, %s, %s, %s, %s);
"""

def add_merged_record(self, record: dict) -> None:
Expand All @@ -591,6 +598,7 @@ def add_merged_record(self, record: dict) -> None:
record.get("associated_with"),
record.get("xrefs"),
record.get("pediatric_disease"),
record.get("oncologic_disease"),
],
)
self.conn.commit()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ SELECT dc.concept_id,
dc.source,
dc.merge_ref,
dc.pediatric_disease,
dc.oncologic_disease,
lower(dc.concept_id) AS concept_id_lowercase
FROM disease_concepts dc
FULL JOIN (
Expand Down
4 changes: 3 additions & 1 deletion src/disease/database/postgresql/create_tables.sql
Original file line number Diff line number Diff line change
Expand Up @@ -16,12 +16,14 @@ CREATE TABLE disease_merged (
aliases TEXT [],
associated_with TEXT [],
xrefs TEXT [],
pediatric_disease BOOLEAN
pediatric_disease BOOLEAN,
oncologic_disease BOOLEAN
);
-- One row per disease concept, keyed by concept_id.
CREATE TABLE disease_concepts (
concept_id VARCHAR(127) PRIMARY KEY,
-- Required; must match a name in disease_sources.
source VARCHAR(127) NOT NULL REFERENCES disease_sources (name),
-- Nullable boolean flags: no NOT NULL or DEFAULT is declared, so an omitted value is stored as NULL.
pediatric_disease BOOLEAN,
oncologic_disease BOOLEAN,
-- Optional reference to a row in disease_merged.
merge_ref VARCHAR(127) REFERENCES disease_merged (concept_id)
);
CREATE TABLE disease_labels (
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,8 @@ CREATE TABLE disease_merged (
aliases TEXT [],
associated_with TEXT [],
xrefs TEXT [],
pediatric_disease BOOLEAN
pediatric_disease BOOLEAN,
oncologic_disease BOOLEAN
);
-- Make disease_concepts.merge_ref a foreign key to disease_merged.concept_id,
-- under the explicit constraint name disease_concepts_merge_ref_fkey.
ALTER TABLE disease_concepts ADD CONSTRAINT disease_concepts_merge_ref_fkey
FOREIGN KEY (merge_ref) REFERENCES disease_merged (concept_id);
Expand Down
5 changes: 3 additions & 2 deletions src/disease/etl/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,8 +114,9 @@ def _load_disease(self, disease: dict) -> None:
else:
del disease[attr_type]

if "pediatric_disease" in disease and disease["pediatric_disease"] is None:
del disease["pediatric_disease"]
for field in ("pediatric_disease", "oncologic_disease"):
if field in disease and disease[field] is None:
del disease[field]

self._database.add_record(disease, self._src_name)
if self._store_ids:
Expand Down
2 changes: 1 addition & 1 deletion src/disease/etl/merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,7 @@ def record_order(record: dict) -> tuple:
merged_properties["xrefs"] = list({r["concept_id"] for r in records[1:]})

set_fields = ["aliases", "associated_with"]
scalar_fields = ["label", "pediatric_disease"]
scalar_fields = ["label", "pediatric_disease", "oncologic_disease"]
for record in records:
for field in set_fields:
if field in record:
Expand Down
5 changes: 5 additions & 0 deletions src/disease/etl/mondo.py
Original file line number Diff line number Diff line change
Expand Up @@ -186,6 +186,8 @@ def _transform_data(self) -> None:
diseases = self._construct_dependency_set(dag, disease_root)
peds_neoplasm_root = "MONDO:0006517"
pediatric_diseases = self._construct_dependency_set(dag, peds_neoplasm_root)
cancer_root = "MONDO:0045024"
cancers = self._construct_dependency_set(dag, cancer_root)

reader = fastobo.iter(str(self._data_file.absolute()))
for item in tqdm(reader, ncols=80, disable=self._silent):
Expand All @@ -198,4 +200,7 @@ def _transform_data(self) -> None:
if concept_id.upper() in pediatric_diseases:
params["pediatric_disease"] = True

if concept_id.upper() in cancers:
params["oncologic_disease"] = True

self._load_disease(params)
1 change: 1 addition & 0 deletions src/disease/etl/oncotree.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ def _add_disease(self, disease_node: dict) -> None:
"label": disease_node["name"],
"xrefs": [],
"associated_with": [],
"oncologic_disease": True,
}
refs = disease_node.get("externalReferences", [])
for prefix, codes in refs.items():
Expand Down
15 changes: 8 additions & 7 deletions src/disease/query.py
Original file line number Diff line number Diff line change
Expand Up @@ -319,7 +319,9 @@ def _add_disease(
:return: completed normalized response object ready to return to user
"""
disease_obj = domain_models.Disease(
id=f"normalize.disease.{record['concept_id']}", label=record["label"]
id=f"normalize.disease.{record['concept_id']}",
label=record["label"],
extensions=[],
)

source_ids = record.get("xrefs", []) + record.get("associated_with", [])
Expand All @@ -340,13 +342,12 @@ def _add_disease(
if "aliases" in record:
disease_obj.alternativeLabels = record["aliases"]

if "pediatric_disease" in record and record["pediatric_disease"] is not None:
disease_obj.extensions = [
entity_models.Extension(
name="pediatric_disease",
value=record["pediatric_disease"],
for field in ("pediatric_disease", "oncologic_disease"):
value = record.get(field)
if value is not None:
disease_obj.extensions.append(
entity_models.Extension(name=field, value=value)
)
]

response["match_type"] = match_type
response["disease"] = disease_obj
Expand Down
2 changes: 2 additions & 0 deletions src/disease/schemas.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,7 @@ class Disease(BaseModel):
xrefs: list[StrictStr] = []
associated_with: list[StrictStr] = []
pediatric_disease: bool | None = None
oncologic_disease: bool | None = None

model_config = ConfigDict(
json_schema_extra={
Expand All @@ -123,6 +124,7 @@ class Disease(BaseModel):
"xrefs": [],
"associated_with": ["umls:C0019562"],
"pediatric_disease": None,
"oncologic_disease": None,
}
}
)
Expand Down
4 changes: 4 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,10 @@ def _compare_records(actual: Disease, fixt: Disease):
if (actual.pediatric_disease is not None) and (fixt.pediatric_disease is not None):
assert actual.pediatric_disease == fixt.pediatric_disease

assert (actual.oncologic_disease is None) == (fixt.oncologic_disease is None)
if (actual.oncologic_disease is not None) and (fixt.oncologic_disease is not None):
assert actual.oncologic_disease == fixt.oncologic_disease


@pytest.fixture(scope="session")
def compare_records():
Expand Down
10 changes: 9 additions & 1 deletion tests/unit/test_merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ def neuroblastoma():
"umls:C2751421",
"umls:CN205405",
],
"pediatric": None,
"oncologic_disease": True,
}


Expand Down Expand Up @@ -107,6 +107,7 @@ def lnscc():
"kegg.disease:05223",
],
"item_type": "merger",
"oncologic_disease": True,
}


Expand All @@ -131,6 +132,7 @@ def richter():
"icd10.cm:C91.1",
],
"item_type": "merger",
"oncologic_disease": True,
}


Expand All @@ -150,6 +152,7 @@ def ped_liposarcoma():
],
"associated_with": ["umls:C0279984"],
"pediatric_disease": True,
"oncologic_disease": True,
"item_type": "merger",
}

Expand All @@ -168,6 +171,7 @@ def teratoma():
],
"associated_with": ["icdo:9080/0", "umls:C1368888"],
"item_type": "merger",
"oncologic_disease": True,
}


Expand Down Expand Up @@ -238,6 +242,10 @@ def compare_merged_records(actual, fixture):
if "pediatric_disease" in actual or "pediatric_disease" in fixture:
assert actual["pediatric_disease"] == fixture["pediatric_disease"]

assert ("oncologic_disease" in actual) == ("oncologic_disease" in fixture)
if "oncologic_disease" in actual or "oncologic_disease" in fixture:
assert actual["oncologic_disease"] == fixture["oncologic_disease"]


def test_generate_merged_record(
merge_instance,
Expand Down
5 changes: 5 additions & 0 deletions tests/unit/test_mondo.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ def neuroblastoma():
"mesh:D009447",
],
pediatric_disease=None,
oncologic_disease=True,
)


Expand All @@ -60,6 +61,7 @@ def richter_syndrome():
"umls:C0349631",
],
pediatric_disease=None,
oncologic_disease=True,
)


Expand All @@ -79,6 +81,7 @@ def pediatric_liposarcoma():
xrefs=["DOID:5695", "ncit:C8091"],
associated_with=["umls:C0279984"],
pediatric_disease=True,
oncologic_disease=True,
)


Expand All @@ -94,6 +97,7 @@ def cystic_teratoma_adult():
pediatric_disease=None,
xrefs=["ncit:C9012", "DOID:7079"],
associated_with=["umls:C1368888"],
oncologic_disease=True,
)


Expand All @@ -120,6 +124,7 @@ def nsclc():
"umls:C0007131",
"efo:0003060",
],
oncologic_disease=True,
)


Expand Down
3 changes: 3 additions & 0 deletions tests/unit/test_omim.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ def mafd2():
xrefs=[],
associated_with=[],
pediatric_disease=None,
oncologic_disease=None,
)


Expand All @@ -53,6 +54,7 @@ def acute_ll():
xrefs=[],
associated_with=[],
pediatric_disease=None,
oncologic_disease=None,
)


Expand All @@ -66,6 +68,7 @@ def lall():
xrefs=[],
associated_with=[],
pediatric_disease=None,
oncologic_disease=None,
)


Expand Down
3 changes: 3 additions & 0 deletions tests/unit/test_oncotree.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ def neuroblastoma():
xrefs=["ncit:C3270"],
associated_with=["umls:C0027819"],
pediatric_disease=None,
oncologic_disease=True,
)


Expand All @@ -35,6 +36,7 @@ def nsclc():
xrefs=["ncit:C2926"],
associated_with=["umls:C0007131"],
pediatric_disease=None,
oncologic_disease=True,
)


Expand All @@ -48,6 +50,7 @@ def ipn():
xrefs=[],
associated_with=[],
pediatric_disease=None,
oncologic_disease=True,
)


Expand Down
14 changes: 12 additions & 2 deletions tests/unit/test_query.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from datetime import datetime

import pytest
from ga4gh.core import domain_models
from ga4gh.core import domain_models, entity_models

from disease.query import InvalidParameterException, QueryHandler
from disease.schemas import MatchType, SourceName
Expand Down Expand Up @@ -81,6 +81,7 @@ def neuroblastoma():
"neural Crest tumor, malignant",
"neuroblastoma, malignant",
],
extensions=[entity_models.Extension(name="oncologic_disease", value=True)],
)


Expand All @@ -92,6 +93,7 @@ def skin_myo():
id="normalize.disease.ncit:C167370",
label="Skin Myoepithelioma",
alternativeLabels=["Cutaneous Myoepithelioma"],
extensions=[],
)


Expand Down Expand Up @@ -130,6 +132,7 @@ def mafd2():
"relation": "relatedMatch",
},
],
extensions=[],
)


Expand Down Expand Up @@ -179,9 +182,16 @@ def get_extension(extensions, name):
ped_fixture = get_extension(ext_fixture, "pediatric_disease")
assert (ped_actual is None) == (ped_fixture is None)
if ped_actual and ped_fixture:
assert set(ped_actual.value) == set(ped_fixture.value)
assert ped_actual.value == ped_fixture.value
assert ped_actual.value

onco_actual = get_extension(ext_actual, "oncologic_disease")
onco_fixture = get_extension(ext_fixture, "oncologic_disease")
assert (onco_actual is None) == (onco_fixture is None)
if onco_actual and onco_fixture:
assert onco_actual.value == onco_fixture.value
assert onco_actual.value


def test_query(query_handler):
"""Test that query returns properly-structured response."""
Expand Down

0 comments on commit 8c4022a

Please sign in to comment.