Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[DC-1022] [archived] Adds script to generate concept_ancestor_ext table. #685

Open
wants to merge 1 commit into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion data_steward/common.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
# Python imports
import os

import jinja2

# Project imports
from constants.bq_utils import VALIDATION_DATASET_REGEX
from constants.validation.participants.identity_match import REPORT_DIRECTORY_REGEX
import jinja2

# AOU required PII tables
PII_WILDCARD = 'pii*'
Expand Down Expand Up @@ -112,6 +113,8 @@
CONCEPT, CONCEPT_ANCESTOR, CONCEPT_CLASS, CONCEPT_RELATIONSHIP,
CONCEPT_SYNONYM, DOMAIN, DRUG_STRENGTH, RELATIONSHIP, VOCABULARY
]
# Name of the table that tools/generate_concept_ancestor_extension.py creates
# to hold the extended ancestor/descendant pairs.
CONCEPT_ANCESTOR_EXTENSION = 'concept_ancestor_extension'

# Achilles
ACHILLES_ANALYSIS = 'achilles_analysis'
ACHILLES_RESULTS = 'achilles_results'
Expand Down
173 changes: 173 additions & 0 deletions data_steward/tools/generate_concept_ancestor_extension.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,173 @@
"""
concept_relationship stores two types of relationships:
1) hierarchical relationships: 'Is a' / 'Subsumes', which define the hierarchy of the vocabulary;
2) associative relationships: relationships across the hierarchy, such as Drug A
'is an indication of' Condition B. The concept_ancestor table is built from the 'Is a' and 'Subsumes' relationships
recursively, so all ancestor/descendant pairs (regardless of the levels of separation) are pre-computed for us.

LOINC component concepts (e.g. Triglycerides) used to be in a subsumption relationship with lab
concepts (e.g. Triglyceride [Mass or Moles/volume] in Serum or Plasma) in the previous version of the vocabulary.
However, the OMOP vocabulary team changed this relationship from 'Subsumes' to 'Component of' to align with the
LOINC system. As a consequence, concept_ancestor is missing all of the ancestor/descendant relationships involving
LOINC component concepts.

This script generates the concept_ancestor_extension table for all the concepts in the Measurement domain using the
LOINC hierarchy.
"""

import argparse
import logging

from common import CONCEPT_ANCESTOR_EXTENSION
from utils import bq

LOGGER = logging.getLogger(__name__)

# Multi-statement SQL script (uses DECLARE / LOOP / LEAVE scripting statements)
# that builds the concept ancestor extension table. The {project}, {dataset}
# and {ancestor_extension} placeholders are filled in with str.format by
# generate_concept_ancestor_extension.
#
# The script runs in three phases:
#   1. Seed: (re)create the extension table with level-1 pairs, i.e. LOINC
#      concepts in the Measurement domain joined to their 'Subsumes' /
#      'Component of' relationships whose target is also a Measurement
#      concept, skipping self-relationships.
#   2. Expand: in a loop, extend every pair already in the table by one more
#      'Subsumes' / 'Component of' hop, count the rows that are not in the
#      table yet (same ancestor, descendant and level), stop when there are
#      none, otherwise insert them.
#   3. Collapse: replace the table with one row per ancestor/descendant pair
#      carrying the minimum and maximum levels_of_separation.
#
# NOTE(review): the `cae.ancestor_concept_id <> cae_new.descendant_concept_id`
# predicate is part of the LEFT JOIN's ON clause, so it does not filter rows
# out of descendants_next_iteration; rows whose ancestor equals their
# descendant can never match and are therefore always counted as new.
# Confirm the intent, and whether cyclic relationship data could keep the
# loop from terminating.
CONCEPT_ANCESTOR_EXT_QUERY = '''
DECLARE
num_of_new_records INT64;
-- Instantiate concept_ancestor_extension with all LONIC measurement concepts and direct descendant concepts
CREATE OR REPLACE TABLE
`{project}.{dataset}.{ancestor_extension}` ( ancestor_concept_id INT64,
descendant_concept_id INT64,
levels_of_separation INT64 ) AS (
SELECT
DISTINCT cr.concept_id_1 AS ancestor_concept_id,
cr.concept_id_2 AS descendant_concept_id,
1 AS levels_of_separation
FROM (
SELECT
concept_id AS ancestor_concept_id
FROM
`{project}.{dataset}.concept` AS c
WHERE
c.vocabulary_id = 'LOINC'
AND domain_id = 'Measurement' ) AS loinc_ids
JOIN
`{project}.{dataset}.concept_relationship` AS cr
ON
loinc_ids.ancestor_concept_id = cr.concept_id_1
AND relationship_id IN ('Subsumes',
'Component of')
AND cr.concept_id_1 <> cr.concept_id_2
JOIN
`{project}.{dataset}.concept` AS c2
ON
cr.concept_id_2 = c2.concept_id
AND c2.domain_id = 'Measurement' );
LOOP
CREATE OR REPLACE TEMP TABLE descendants_next_iteration AS (
SELECT
DISTINCT cae.ancestor_concept_id,
cr.concept_id_2 AS descendant_concept_id,
cae.levels_of_separation + 1 AS levels_of_separation
FROM
`{project}.{dataset}.{ancestor_extension}` AS cae
JOIN
`{project}.{dataset}.concept_relationship` AS cr
ON
cae.descendant_concept_id = cr.concept_id_1
AND relationship_id IN ('Subsumes',
'Component of')
AND cr.concept_id_1 <> cr.concept_id_2
JOIN
`{project}.{dataset}.concept` AS c2
ON
cr.concept_id_2 = c2.concept_id
AND c2.domain_id = 'Measurement' );
SET
num_of_new_records = (
SELECT
COUNT(*)
FROM
descendants_next_iteration AS cae_new
LEFT JOIN
`{project}.{dataset}.{ancestor_extension}` AS cae
ON
cae_new.ancestor_concept_id = cae.ancestor_concept_id
AND cae_new.descendant_concept_id = cae.descendant_concept_id
AND cae_new.levels_of_separation = cae.levels_of_separation
AND cae.ancestor_concept_id <> cae_new.descendant_concept_id
WHERE
cae.ancestor_concept_id IS NULL );
IF
num_of_new_records = 0 THEN
LEAVE
;
END IF
;
INSERT
`{project}.{dataset}.{ancestor_extension}`
SELECT
cae_new.*
FROM
descendants_next_iteration AS cae_new
LEFT JOIN
`{project}.{dataset}.{ancestor_extension}` AS cae
ON
cae_new.ancestor_concept_id = cae.ancestor_concept_id
AND cae_new.descendant_concept_id = cae.descendant_concept_id
AND cae_new.levels_of_separation = cae.levels_of_separation
AND cae.ancestor_concept_id <> cae_new.descendant_concept_id
WHERE
cae.ancestor_concept_id IS NULL;
END LOOP
;
CREATE OR REPLACE TABLE
`{project}.{dataset}.{ancestor_extension}` AS
SELECT
ancestor_concept_id,
descendant_concept_id,
MIN(levels_of_separation) AS min_levels_of_separation,
MAX(levels_of_separation) AS max_levels_of_separation
FROM
`{project}.{dataset}.{ancestor_extension}`
GROUP BY
ancestor_concept_id,
descendant_concept_id;
'''


def generate_concept_ancestor_extension(project_id, dataset_id):
    """
    Build the concept ancestor extension table for the LOINC hierarchy.

    Renders CONCEPT_ANCESTOR_EXT_QUERY for the given project and dataset,
    submits it through a client obtained from bq.get_client and waits for the
    job to finish.

    :param project_id: identifier of the project that holds the dataset
    :param dataset_id: identifier of the dataset whose concept and
        concept_relationship tables are read and in which the extension
        table is created
    :return: the value returned by the query job's result() call
    """
    bq_client = bq.get_client(project_id)
    placeholders = {
        'project': project_id,
        'dataset': dataset_id,
        'ancestor_extension': CONCEPT_ANCESTOR_EXTENSION,
    }
    sql = CONCEPT_ANCESTOR_EXT_QUERY.format(**placeholders)
    # result() blocks until the submitted job has completed.
    return bq_client.query(sql).result()


def get_args_parser():
    """
    Create the command-line parser for this script.

    Both options are required:
      -p / --project_id  stored on the namespace as ``project_id``
      -d / --dataset_id  stored on the namespace as ``ehr_dataset_id``

    :return: the configured argparse.ArgumentParser
    """
    # (option flags, namespace attribute, help text) for each required option
    option_specs = (
        (('-p', '--project_id'), 'project_id',
         'Identifies the project containing the ehr dataset and lookup dataset'
        ),
        (('-d', '--dataset_id'), 'ehr_dataset_id',
         'Identifies the dataset where the concept_ancestor_ext table is to be created.'
        ),
    )

    arg_parser = argparse.ArgumentParser()
    for flags, dest_name, help_text in option_specs:
        arg_parser.add_argument(*flags,
                                dest=dest_name,
                                action='store',
                                help=help_text,
                                required=True)
    return arg_parser


if __name__ == '__main__':
    # Script entry point: parse the command line, build the extension table
    # and log the returned job result.
    #
    # Logging must be configured explicitly here. Otherwise the logger's
    # effective level is the root default (WARNING) and the INFO record
    # emitted below is silently discarded. basicConfig is a no-op if the root
    # logger already has handlers, so an existing configuration is respected.
    logging.basicConfig(level=logging.INFO)

    args_parser = get_args_parser()
    args = args_parser.parse_args()
    concept_ancestor_ext = generate_concept_ancestor_extension(
        args.project_id, args.ehr_dataset_id)
    LOGGER.info(concept_ancestor_ext)