#!/bin/bash
#
# Run a Python processing script once per VCF shard, several shards at a time.
#
# Usage: background_process.sh <python_script> [input_path_prefix]
#
#   python_script      Started as: python3 <python_script> --input_file=<shard>
#   input_path_prefix  Expanded as <prefix>*_shard_*.vcf, so a directory has to
#                      be given with its trailing slash (e.g. input/GCF25/).
#                      Only the base name of each match is passed to the
#                      script. Defaults to the empty prefix (current directory).
#
# Environment:
#   MAX_JOBS  Upper bound on concurrently running shards (default: 6).
#
# Exit status: 0 when every shard succeeded, 1 otherwise.

# PIDs of launched jobs that have not been reaped yet.
RUNNING_PIDS=()
# Number of reaped jobs that exited with a non-zero status.
FAILED_JOBS=0

usage() {
  printf 'Usage: %s <python_script> [input_path_prefix]\n' "$0" >&2
}

#######################################
# Drop finished jobs from RUNNING_PIDS and record their exit status.
# Jobs are tracked by PID instead of grepping `ps` output: a `ps | grep <name>`
# count also includes the grep itself, this script's own command line and any
# unrelated process that happens to carry the same name.
# Globals: RUNNING_PIDS (read/written), FAILED_JOBS (written)
#######################################
reap_finished_jobs() {
  local -a alive=()
  local pid
  for pid in "${RUNNING_PIDS[@]}"; do
    if kill -0 "$pid" 2>/dev/null; then
      alive+=("$pid")
    else
      # The job is gone; `wait` returns the exit status it finished with.
      wait "$pid" || FAILED_JOBS=$((FAILED_JOBS + 1))
    fi
  done
  RUNNING_PIDS=("${alive[@]}")
}

#######################################
# Block (polling once per second) while the job limit is reached.
# Arguments: $1 - maximum number of concurrently running jobs
#######################################
sleep_while_active() {
  local max_jobs=$1
  reap_finished_jobs
  while (( ${#RUNNING_PIDS[@]} >= max_jobs )); do
    sleep 1
    reap_finished_jobs
  done
}

#######################################
# Launch one python3 process per shard and wait for all of them.
# Arguments: $1 - python script, $2 - optional input path prefix
# Returns:   0 if every job succeeded, 1 on usage error, missing shards or
#            failed jobs
#######################################
main() {
  if (( $# < 1 )); then
    usage
    return 1
  fi

  local python_path=$1
  local input_path=${2-}
  local max_jobs=${MAX_JOBS:-6}
  if ! [[ "$max_jobs" =~ ^[1-9][0-9]*$ ]]; then
    printf 'MAX_JOBS must be a positive integer, got: %s\n' "$max_jobs" >&2
    return 1
  fi

  RUNNING_PIDS=()
  FAILED_JOBS=0

  echo " $python_path - $input_path"

  # Let the shell expand the pattern instead of parsing `ls` output, so file
  # names containing whitespace survive intact.
  local -a shards=()
  local path
  for path in "$input_path"*_shard_*.vcf; do
    [[ -e "$path" ]] || continue  # pattern did not match anything
    shards+=("${path##*/}")
  done
  if (( ${#shards[@]} == 0 )); then
    printf 'No shard files match %s*_shard_*.vcf\n' "$input_path" >&2
    return 1
  fi

  local shard
  for shard in "${shards[@]}"; do
    sleep_while_active "$max_jobs"
    echo "$python_path --input_file=$shard"
    python3 "$python_path" --input_file="$shard" &
    RUNNING_PIDS+=("$!")
  done

  # Barrier: collect the status of everything that is still running.
  local pid
  for pid in "${RUNNING_PIDS[@]}"; do
    wait "$pid" || FAILED_JOBS=$((FAILED_JOBS + 1))
  done
  RUNNING_PIDS=()

  if (( FAILED_JOBS > 0 )); then
    printf '%d of %d jobs failed for %s\n' \
      "$FAILED_JOBS" "${#shards[@]}" "$python_path" >&2
    return 1
  fi
  return 0
}

# Last command of the script, so its return value becomes the exit status.
main "$@"
#!/bin/bash
#
# Download the NCBI source files used by the dbSNP import into ./input
# (relative to the current working directory).
#
# Files produced:
#   input/GCA_000001405.14_GRCh37.p13_assembly_report.txt
#   input/GCA_000001405.29_GRCh38.p14_assembly_report.txt
#   input/NAMES.txt, input/MGDEF.txt, input/MGSTY.txt
#   input/MedGenIDMappings.txt
#   input/gene_condition_source_id.txt
#   input/GCF25/GCF_000001405.25.vcf
#   input/GCF40/GCF_000001405.40.vcf
#   input/freq/freq.vcf

# Abort on the first failed command so a broken download is never silently
# renamed into place.
set -euo pipefail

readonly NCBI='https://ftp.ncbi.nlm.nih.gov'

#######################################
# Download a URL to a local file.
# --fail makes curl return non-zero on HTTP errors instead of saving the
# server's error page under the target name.
# Arguments: $1 - URL, $2 - destination path
#######################################
fetch() {
  local url=$1
  local dest=$2
  curl --fail --location --output "$dest" "$url"
}

#######################################
# Download a gzip-compressed URL and store it decompressed.
# Decompressing to stdout lets the shell redirection overwrite a file left
# behind by an earlier run.
# Arguments: $1 - URL of the .gz file, $2 - destination path (decompressed)
#######################################
fetch_gz() {
  local url=$1
  local dest=$2
  fetch "$url" "${dest}.gz"
  gunzip -c -- "${dest}.gz" > "$dest"
  rm -f -- "${dest}.gz"
}

main() {
  local assembly_dir="${NCBI}/genomes/all/GCA/000/001/405"
  local name

  mkdir -p input/GCF25 input/GCF40 input/freq
  cd input

  # Genome assembly reports.
  for name in GCA_000001405.14_GRCh37.p13 GCA_000001405.29_GRCh38.p14; do
    fetch "${assembly_dir}/${name}/${name}_assembly_report.txt" \
      "${name}_assembly_report.txt"
  done

  # MedGen tables; the .RRF files are stored with a .txt extension.
  for name in NAMES MGDEF MGSTY; do
    fetch_gz "${NCBI}/pub/medgen/${name}.RRF.gz" "${name}.txt"
  done
  fetch_gz "${NCBI}/pub/medgen/MedGenIDMappings.txt.gz" MedGenIDMappings.txt

  # This URL has no .gz suffix, and gunzip skips files without a known suffix,
  # so the file is stored exactly as downloaded.
  fetch "${NCBI}/pub/clinvar/gene_condition_source_id" \
    gene_condition_source_id.txt

  # dbSNP VCF releases and population frequencies.
  fetch_gz "${NCBI}/snp/latest_release/VCF/GCF_000001405.25.gz" \
    GCF25/GCF_000001405.25.vcf
  fetch_gz "${NCBI}/snp/latest_release/VCF/GCF_000001405.40.gz" \
    GCF40/GCF_000001405.40.vcf
  fetch_gz "${NCBI}/snp/population_frequency/latest_release/freq.vcf.gz" \
    freq/freq.vcf
}

main "$@"
#!/bin/bash
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Driver for the dbSNP import: splits the downloaded VCF files into shards,
# builds the lookup files and then processes the shards in parallel.
# All paths are relative, so run it from the directory that contains it.

# Stop at the first failing step: the shard processors read files written by
# the preparation scripts below, so continuing after a failure only produces
# follow-up errors. Only POSIX options are used so `sh run.sh` keeps working.
set -eu

# make all required directories (-p also creates the parents)
mkdir -p \
  output/GCF25 \
  output/GCF40/hg38 \
  output/GCF40/hg38alleledisease \
  output/GCF40/hg38alleledrug \
  output/GCF40/hg38alleles \
  output/GCF40/hg38freq \
  output/freq

echo "File split started"
# NOTE(review): split_files.sh is not visible here; if it relies on bash
# features it should be started with bash as well.
sh split_files.sh
echo "Splitting Completed"

# Command
echo "Running python script"

python3 scripts/process_medgen.py
python3 scripts/process_genome_assembly_report.py
python3 scripts/process_gene_condition_source.py

echo "Start background process"
# background_process.sh uses bash-only syntax (`function`, `[[ ]]`, `(( ))`),
# so it must not be run through a plain POSIX `sh`.
bash background_process.sh scripts/process_dbsnp_hg19_positions.py input/GCF25/
bash background_process.sh scripts/process_dbsnp_hg38.py input/GCF40/
bash background_process.sh scripts/process_dbsnp_freq.py input/freq/
echo "Background process completed"
+@source data: Download NCBI dbSNP data from FTP location. Refer to download.sh for details +""" + +import csv +import os +import sys +import time +from copy import deepcopy +from absl import flags +from absl import logging +from datetime import datetime as dt + +MODULE_DIR = os.path.dirname(os.path.dirname(__file__)) + +# Setup path for import from data/util +# or set `export PYTHONPATH="./:/data/util"` in bash +_SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) +sys.path.append(_SCRIPT_DIR) +_DATA_DIR = _SCRIPT_DIR.split('/data/')[0] +sys.path.append(os.path.join(_DATA_DIR, 'data/util')) + +import file_util +from counters import Counters + +# for local testing purpose only +# from Utils.counters import Counters +# import Utils.file_util as file_util + +_FLAGS = flags.FLAGS + +flags.DEFINE_string('input_file', 'freq_shard_aa.vcf', + 'Input file to process. Mandatory to pass this argument') +flags.DEFINE_string('output_dir', 'output/freq', 'Output directory for generated files.') +flags.DEFINE_string('input_dir', 'input/freq', 'Input directory where .vcf files downloaded.') + +_FLAGS(sys.argv) + +CSV_DICT = { + 'dcid_gv': '', + 'dcid': '', + 'name': '', + 'alleleFrequency': '', + 'alternativeAllele': '', + 'genotypeHeterozygousFrequency': '', + 'genotypeHomozygousAlternativeFrequency': '', + 'genotypeHomozygousReferenceFrequency': '', + 'hardyWeinbergEquationPValue': '', + 'isGlobalPopulation': '', + 'measuredPopulation': '', + 'referenceAllele': '', + 'rsID': '', + 'sampleSize': '' +} + +COLUMN_CODES_DICT = { + 9: { + 'name': 'European', + 'isGlobal': False + }, + 10: { + 'name': 'African Others', + 'isGlobal': False + }, + 11: { + 'name': 'East Asian', + 'isGlobal': False + }, + 12: { + 'name': 'African American', + 'isGlobal': False + }, + 13: { + 'name': 'Latin American 1', + 'isGlobal': False + }, + 14: { + 'name': 'Latin American 2', + 'isGlobal': False + }, + 15: { + 'name': 'Other Asian', + 'isGlobal': False + }, + 16: { + 'name': 'South Asian', + 
def process_input_csv(input_file: str, output_freq_file_path: str) -> None:
    """ Row by row processing of NCBI dbSNP freq input file

    Every data line yields up to one output row per population column listed
    in COLUMN_CODES_DICT. Header lines (starting with '#'), lines without an
    rsID ('.') and lines too short to carry the fixed columns are skipped.

    Args:
        input_file (str): file path to process
        output_freq_file_path (str): output file path to save cleaned csv
    """
    # newline='' is what the csv module expects for files it writes to.
    with open(output_freq_file_path, 'w', newline='') as output_hg38_freq:
        writer_hg38_freq = csv.DictWriter(output_hg38_freq, CSV_DICT, extrasaction='ignore')
        writer_hg38_freq.writeheader()
        counters = Counters()
        counters.add_counter('total', file_util.file_estimate_num_rows(input_file))

        with open(input_file, 'r') as input_file_csv:
            for line in input_file_csv:
                # skip header / meta rows
                if line.startswith('#'):
                    continue

                input_row = line.rstrip('\n').split('\t')
                # Blank or truncated lines have no ID/REF/ALT columns.
                if len(input_row) < 5:
                    continue

                rsID = input_row[2]
                if rsID == '.':
                    continue
                dcid_gv = f'bio/{rsID}'
                refAllele = input_row[3]
                altAllele = input_row[4]

                for column, population in COLUMN_CODES_DICT.items():
                    # Population column missing from this line.
                    if column >= len(input_row):
                        continue
                    freq_value = input_row[column]
                    if freq_value.count(':') != 5:
                        # Skip this record as it cannot be unpacked to the
                        # "AN:AC:HWEP:GR:GV:GA" format.
                        continue

                    population_name = population['name']
                    # CSV_DICT only holds strings, a shallow copy is enough.
                    row = dict(CSV_DICT)
                    row['dcid_gv'] = dcid_gv
                    row['rsID'] = rsID
                    row['referenceAllele'] = refAllele
                    row['alternativeAllele'] = altAllele
                    row['dcid'] = f"bio/{rsID}_{population_name.replace(' ', '_')}"
                    row['name'] = f'"{rsID} {population_name} Population Frequency"'
                    row['isGlobalPopulation'] = population['isGlobal']
                    row['measuredPopulation'] = population_name
                    # Defaults that stay in place when the sample size is 0.
                    row['genotypeHeterozygousFrequency'] = "0.00000"
                    row['genotypeHomozygousAlternativeFrequency'] = "0.00000"
                    row['genotypeHomozygousReferenceFrequency'] = "0.00000"

                    try:
                        row = parse_freq_row(freq_value, refAllele, altAllele, row)
                    except Exception as e:
                        # Best effort: report the value that could not be
                        # parsed and carry on with the next population.
                        logging.warning('Skipping %s for %s: %s', freq_value, rsID, e)
                        continue
                    writer_hg38_freq.writerow(row)

                counters.add_counter('processed', 1)


def parse_freq_row(freq_value, refAllele, altAllele, row):
    """ parser for freq input value for the given row dict

    Fills 'alleleFrequency', 'hardyWeinbergEquationPValue' and 'sampleSize',
    plus the three genotype frequencies when the sample size is not 0.

    Args:
        freq_value (str): freq string in the format "AN:AC:HWEP:GR:GV:GA".
        refAllele (str): referenceAllele
        altAllele (str): alternativeAllele, comma separated when there is
            more than one
        row (dict): row dict to update

    Returns:
        dict: the same row dict, updated in place

    Raises:
        TypeError, ValueError, IndexError: when a field that is used as a
            number is not an integer, or when there are more allele counts
            than alternative alleles.
    """
    # Integer looking fields become int, everything else stays a string.
    AN, AC, HWEP, GR, GV, GA = [
        int(x) if x.lstrip("-").isdigit() else x for x in freq_value.split(':')
    ]
    if isinstance(AC, str):
        # Several alternative alleles: one comma separated count per allele.
        alt_counts = AC.split(',')
        alt_names = altAllele.split(',')
        if AN != 0:
            # Plain arithmetic; the counts come from the input file and must
            # never be handed to eval().
            ref_val = (AN - sum(int(count) for count in alt_counts)) / AN
            alt_values = [f"{(int(count) / AN):.5f}" for count in alt_counts]
        else:
            ref_val = 0.0
            alt_values = ["0.00000"] * len(alt_counts)
        alt_val_lst = [f"{alt_names[idx]}:{value}" for idx, value in enumerate(alt_values)]
        row['alleleFrequency'] = f"{refAllele}:{ref_val:.5f}, {', '.join(alt_val_lst)}"
    elif AN == 0:
        row['alleleFrequency'] = f"{refAllele}:0.00000, {altAllele}:0.00000"
    else:
        row['alleleFrequency'] = f"{refAllele}:{((AN-AC)/AN):.5f}, {altAllele}:{(AC/AN):.5f}"

    if AN != 0:
        # NOTE(review): if GR/GV/GA are not plain integers (for example comma
        # separated lists) they are still strings here and the divisions raise
        # TypeError, which makes the caller skip the row - confirm intended.
        half_sample = AN / 2
        row['genotypeHeterozygousFrequency'] = f"{(GV / half_sample):.5f}"
        row['genotypeHomozygousAlternativeFrequency'] = f"{(GA / half_sample):.5f}"
        row['genotypeHomozygousReferenceFrequency'] = f"{(GR / half_sample):.5f}"

    row['hardyWeinbergEquationPValue'] = HWEP
    row['sampleSize'] = AN
    return row


def main(input_file: str) -> None:
    """ Main method

    Args:
        input_file (str): name of the shard inside the --input_dir directory
    """
    start_time = time.time()
    logging.set_verbosity('info')
    logging.info(f"Freq processing input file {input_file} - {dt.now()}")
    # os.path.join copes with an empty MODULE_DIR (script started through a
    # relative path) and with absolute --input_dir/--output_dir values.
    input_file_path = os.path.join(MODULE_DIR, _FLAGS.input_dir, input_file)

    # Only the extension is replaced, dots elsewhere in the name are kept.
    output_file = os.path.splitext(input_file)[0] + '.csv'
    output_csv = os.path.join(MODULE_DIR, _FLAGS.output_dir, output_file)
    logging.info(f"output_csv {output_csv}")

    process_input_csv(input_file_path, output_csv)
    logging.info(f"Time taken to process {((time.time() - start_time)/60):.2f} - {dt.now()}")


if __name__ == '__main__':
    main(_FLAGS.input_file)
def load_json():
    """ Load the hg19 genome assembly report into HG19_REFSEQ_DICT

    Reads the json file named by hg19_genome_assembly_file_name from the
    --json_dir directory and maps every 'refSeqAccession' to its 'dcid'.
    """
    # os.path.join copes with an empty MODULE_DIR and an absolute --json_dir.
    hg19_file_path = os.path.join(MODULE_DIR, _FLAGS.json_dir, hg19_genome_assembly_file_name)

    with open(hg19_file_path, 'r') as f:
        for hg in json.load(f):
            HG19_REFSEQ_DICT[hg['refSeqAccession']] = hg['dcid']


def parse_hg19_row(input_row, hg19_ref_seq):
    """ Build the output row for one hg19 VCF line

    Args:
        input_row (list): tab separated fields of the line; index 1 is used
            as position and index 2 as rsID
        hg19_ref_seq (str): dcid looked up for the first field of the line,
            or None when it is not part of the assembly report

    Returns:
        dict: row dict with the keys of CSV_DICT
    """
    # CSV_DICT only holds strings, a shallow copy is enough.
    current_row = dict(CSV_DICT)
    current_row['dcid'] = f'bio/{input_row[2]}'
    current_row['name'] = input_row[2]
    if hg19_ref_seq:
        current_row['dcid_pos'] = f'{hg19_ref_seq}_{input_row[1]}'
        current_row['name_pos'] = f'"hg19 {hg19_ref_seq.replace("bio/hg19_", "")} {input_row[1]}"'
        current_row['inChromosome'] = hg19_ref_seq
    # NOTE(review): assumes position and rsID are filled in for every row,
    # also when the reference sequence is unknown - confirm.
    current_row['position'] = input_row[1]
    current_row['rsID'] = input_row[2]
    return current_row


def main(input_file_name: str) -> None:
    """ Main method

    Args:
        input_file_name (str): name of the shard inside the --input_dir
            directory
    """
    logging.set_verbosity('info')
    start_time = time.time()

    load_json()
    input_file = os.path.join(MODULE_DIR, _FLAGS.input_dir, input_file_name)
    logging.info(f"HG19 processing input file {input_file}")
    # Only the extension is replaced, dots elsewhere in the name are kept.
    output_file = os.path.splitext(input_file_name)[0] + '.csv'
    output_csv = os.path.join(MODULE_DIR, _FLAGS.output_dir, output_file)
    logging.info(f"output_csv {output_csv}")

    counters = Counters()
    counters.add_counter('total', file_util.file_estimate_num_rows(input_file))

    # newline='' is what the csv module expects for files it writes to.
    with open(input_file, 'r') as input_file_csv, \
            open(output_csv, 'w', newline='') as output_file_csv:
        writer = csv.DictWriter(output_file_csv, CSV_DICT)
        # write header
        writer.writeheader()
        for line in input_file_csv:
            # skip header / meta rows
            if line.startswith('#'):
                continue
            input_row = line.rstrip('\n').split('\t')
            # Blank or truncated lines have no position/ID columns.
            if len(input_row) < 3:
                continue
            hg19_ref_seq = HG19_REFSEQ_DICT.get(input_row[0])
            writer.writerow(parse_hg19_row(input_row, hg19_ref_seq))
            counters.add_counter('processed', 1)

    logging.info(f"Time taken to process {((time.time() - start_time)/60):.2f}")


if __name__ == '__main__':
    main(_FLAGS.input_file)
b/scripts/biomedical/NCBI_dbSNP/scripts/process_dbsnp_hg38.py new file mode 100644 index 000000000..c82fe7292 --- /dev/null +++ b/scripts/biomedical/NCBI_dbSNP/scripts/process_dbsnp_hg38.py @@ -0,0 +1,944 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Author: Pradeep Kumar Krishnaswamy +Date: 13-Oct-2024 +Name: process_dbsnp_hg38 +Description: cleaning the NCBI dbSNP HG38 input file. +@source data: Download NCBI dbSNP data from FTP location. Refer to download.sh for details +""" + +import csv +import os +import sys +import re +import json +import struct +import typing +import time +from copy import deepcopy +from absl import flags +from absl import logging +from datetime import datetime as dt + +MODULE_DIR = os.path.dirname(os.path.dirname(__file__)) + +# Setup path for import from data/util +# or set `export PYTHONPATH="./:/data/util"` in bash +_SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) +sys.path.append(_SCRIPT_DIR) +_DATA_DIR = _SCRIPT_DIR.split('/data/')[0] +sys.path.append(os.path.join(_DATA_DIR, 'data/util')) + +import file_util +from counters import Counters + +# for local testing purpose only +# from Utils.counters import Counters +# import Utils.file_util as file_util + +_FLAGS = flags.FLAGS +# flag dict +flags.DEFINE_string('input_file', 'gcf40_shard_aa.vcf', + 'Input file to process. 
Mandatory to pass this argument') +flags.DEFINE_string('output_dir', 'output/GCF40', 'Output directory for generated files.') +flags.DEFINE_string('input_dir', 'input/GCF40', 'Input directory where .vcf files downloaded.') +flags.DEFINE_string('mapping_file_dir', 'output', 'path of the cui_dcid_mapping.csv file.') +flags.DEFINE_string('json_dir', 'output', 'Directory of json file generated from genome_assembly') +flags.DEFINE_string( + 'gene_id_dcid_mapping', 'ncbi_gene_id_dcid_mapping.csv', + 'Please specify the path to the "ncbi_gene_id_dcid_mapping.csv" file generated in Gene import. If not provided, the script will default to the current working directory.' +) + +_FLAGS(sys.argv) + +# Declare Universal Variables +_BASE_32_MAP = [ + '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'b', 'c', 'd', 'f', 'g', 'h', 'j', 'k', 'l', + 'm', 'n', 'p', 'q', 'r', 's', 't', 'v', 'w', 'x', 'y', 'z', 'e' +] +_NUM_BITS_32 = 5 +_LONG_ID_LEN = 13 + +HG38_DICT = { + 'dcid': '', + 'name': '', + 'dcid_pos': '', + 'name_pos': '', + 'chrom': '', + 'position': '', + 'alleleOrigin': '', + 'alternativeAllele': '', + 'dbSNPBuildID': '', + 'geneID': '', + 'geneID_2': '', + 'referenceAllele': '', + 'rsID': '', + 'suspectReasonCode': '', + 'variantClass': '', + 'genotypesAvailable': False, + 'hasNonSynonymousFrameShift': False, + 'hasNonSynonymousMissenseMutation': False, + 'hasNonSynonymousNonsenseMutation': False, + 'hasSynonymousMutation': False, + 'isCommonVariant': False, + 'isInAcceptorSpliceSite': False, + 'isInDonorSpliceSite': False, + 'isInIntron': False, + 'isInFivePrimeGeneRegion': False, + 'isInFivePrimeUTR': False, + 'isInThreePrimeGeneRegion': False, + 'isInThreePrimeUTR': False, + 'isPublished': False +} +HG38_ALLELES_DICT = { + 'dcid': '', + 'dcid_allele': '', + 'name_allele': '', + 'CLNHGVS': '', + 'variant': '', + 'ARUP_Laboratories\x2c_Molecular_Genetics_and_Genomics\x2cARUP_Laboratories': '', + 'arupLaboratoriesMolecularGeneticsAndGenomicsArupLaboratoriesID': '', + 
'HGMD': '', + 'OMIM': '', + 'PharmGKB': '', + 'PharmGKB_Clinical_Annotation': '', + 'UniProtKB': '' +} +HG38_ALLELES_DISEASE_DICT = { + 'dcid': '', + 'dcid_allele': '', + 'arupLaboratoriesMolecularGeneticsAndGenomicsArupLaboratoriesID': '', + 'geneticTestingRegistryID': '', + 'humanGeneMutationDatabaseID': '', + 'omimID': '', + 'pharmGKBID': '', + 'uniProtID': '', + 'dcid_disease': '', + 'name_disease': '', + 'experimentalFactorOntologyID': '', + 'geneReviewsID': '', + 'humanPhenotypeOntologyID': '', + 'medicalGeneticsSummariesID': '', + 'medicalSubjectHeadingID': '', + 'officeOfRareDiseasesId': '', + 'orphaNumber': '', + 'snomedCT': '', + 'medGenID': '', + 'dcid_disease_allele_association': '', + 'name_disease_allele_association': '', + 'CLNORIGIN': '', + 'CLNSIG': '', + 'CLNREVSTAT': '', + 'geneID': '', + 'geneticTestingRegistryID': '', + 'pharmGKBID': '', + 'CLNACC': '' +} +HG38_ALLELES_DRUG_DICT = { + 'dcid': '', + 'dcid_allele': '', + 'arupLaboratoriesMolecularGeneticsAndGenomicsArupLaboratoriesID': '', + 'geneticTestingRegistryID': '', + 'humanGeneMutationDatabaseID': '', + 'omimID': '', + 'pharmGKBID': '', + 'uniProtID': '', + 'dcid_disease': '', + 'name_disease': '', + 'compound_dcid': '', + 'experimentalFactorOntologyID': '', + 'geneReviewsID': '', + 'humanPhenotypeOntologyID': '', + 'medicalGeneticsSummariesID': '', + 'medicalSubjectHeadingID': '', + 'officeOfRareDiseasesId': '', + 'orphaNumber': '', + 'snomedCT': '', + 'medGenID': '', + 'dcid_disease_allele_association': '', + 'name_disease_allele_association': '', + 'CLNORIGIN': '', + 'CLNSIG': '', + 'CLNREVSTAT': '', + 'geneID': '', + 'geneticTestingRegistryID': '', + 'pharmGKBID': '', + 'CLNACC': '' +} +HG38_FREQ_DICT = { + 'dcid': '', + 'dcid_freq': '', + 'name_freq': '', + 'alleleFrequency': '', + 'measuredPopulation': '', + 'rsID': '' +} + +CUI_DCID_MAPPING_DICT = {} + +dbsnp_hg38_file_name = 'hg38/{0}' +dbsnp_hg38_alleles_file_name = 'hg38alleles/{0}' +dbsnp_hg38_allele_disease_file_name = 
'hg38alleledisease/{0}' +dbsnp_hg38_allele_drug_file_name = 'hg38alleledrug/{0}' +dbsnp_hg38_freq_file_name = 'hg38freq/{0}' +cui_dict_mapping_file_name = 'cui_dcid_mappings.csv' +writer_hg38 = None +writer_hg38_alleles = None +writer_hg38_allele_disease = None +writer_hg38_allele_drug = None +writer_hg38_freq = None +hg19_genome_assembly_file_name = 'hg19_genome_assembly_report.json' +hg38_genome_assembly_file_name = 'hg38_genome_assembly_report.json' +gene_id_dcid_mapping_file_name = 'ncbi_gene_id_dcid_mapping.csv' +HG19_REFSEQ_DICT = {} +HG38_DCID_DICT = {} +GENE_ID_DCID_MAPPING = {} +DB_NOT_AVAILABLE = set() + +HG38_FLAG_PROPS = { + 'NSF': 'hasNonSynonymousFrameShift', + 'NSM': 'hasNonSynonymousMissenseMutation', + 'NSN': 'hasNonSynonymousNonsenseMutation', + 'SYN': 'hasSynonymousMutation', + 'U3': 'isInThreePrimeUTR', + 'U5': 'isInFivePrimeUTR', + 'ASS': 'isInAcceptorSpliceSite', + 'DSS': 'isInDonorSpliceSite', + 'INT': 'isInIntron', + 'R3': 'isInThreePrimeGeneRegion', + 'R5': 'isInFivePrimeGeneRegion', + 'GNO': 'genotypesAvailable', + 'PUB': 'isPublished', + 'COMMON': 'isCommonVariant', + 'PM': 'isPublished' +} + +# database dict +HG38_DB_DICT = { + 'ARUP_Laboratories\x2c_Molecular_Genetics_and_Genomics\x2cARUP_Laboratories': + 'arupLaboratoriesMolecularGeneticsAndGenomicsArupLaboratoriesID', + 'Genetic_Testing_Registry_(GTR)': + 'geneticTestingRegistryID', + 'HGMD': + 'humanGeneMutationDatabaseID', + 'OMIM': + 'omimID', + 'PharmGKB': + 'pharmGKBID', + 'PharmGKB_Clinical_Annotation': + 'pharmGKBID', + 'UniProtKB': + 'uniProtID' +} + +# database to column mapping + +HG38_DB_COL_MAPPING = { + 'MedGen': 'medGenID', + 'Orphanet': 'orphaNumber', + 'OMIM': 'omimID', + 'SNOMED_CT': 'snomedCT', + 'MeSH': 'medicalSubjectHeadingID', + 'Gene': 'geneID', + 'EFO': 'experimentalFactorOntologyID', + 'GeneReviews': 'geneReviewsID', + 'GeneReviews\\x2c': 'geneReviewsID', + 'Genetics_Home_Reference': 'geneticsHomeReferenceID', + 'Genetic_Testing_Registry_(GTR)': 
'geneticTestingRegistryID', + 'Medical_Genetics_Summaries': 'medicalGeneticsSummariesID', + 'Office_of_Rare_Diseases': 'officeOfRareDiseasesId', + 'PharmGKB': 'pharmGKBID', + 'PharmGKB_Clinical_Annotation': 'pharmGKBID' +} + +# clinicalSignificance dict +HG38_SIG_DICT = { + '0': 'dcs:ClinSigUncertain', # Uncertain significance + '1': 'dcs:ClinSigNotProvided', # not provided + '2': 'dcs:ClinSigBenign', # Benign + '3': 'dcs:ClinSigLikelyBenign', # Likely benign + '4': 'dcs:ClinSigLikelyPathogenic', # Likely pathogenic + '5': 'dcs:ClinSigPathogenic', # Pathogenic + '6': 'dcs:ClinSigDrugResponse', # Drug response + '8': 'dcs:ClinSigConfersSensitivity', # Confers sensitivity + '9': 'dcs:ClinSigRiskFactor', # Risk factor + '10': 'dcs:ClinSigAssociation', # Association + '11': 'dcs:ClinSigProtective', # Protective + '12': 'dcs:ClinSigConflictingPathogenicity', # Conflicting interpretations of pathogenicity + '13': 'dcs:ClinSigAffects', # Affects + '14': 'dcs:ClinSigAssociationNotFound', # Association not found + '15': 'dcs:ClinSigBenign, dcs:ClinSigLikelyBenign', # Benign/Likely bengin + '16': 'dcs:ClinSigPathogenic, dcs:ClinSigLikelyPathogenic', # Pathogenic/Likely pathogenic + '17': 'dcs:ClinSigConflicting', # Conflicting data from submitters + '18': 'dcs:ClinSigPathogenic, dcs:ClinSigLowPenetrance', # Pathogenic, low penetrance + '19': 'dcs:ClinSigPathogenic, dcs:ClinSigLowPenetrance', # Pathogenic, low penetrance + '20': 'dcs:ClinSigEstablishedRiskAllele', # Established risk allele + '21': 'dcs:ClinSigLikelyRiskAllele', # Likely risk allele + '22': 'dcs:ClinSigUncertainRiskAllele', # Uncertain risk allele + '255': 'dcs:ClinSigOther' # other +} + +# enums + +SAO_ENUM = { + '0': 'dcs:VariantAlleleOriginUnspecified', + '1': 'dcs:VariantAlleleOriginGermline', + '2': 'dcs:VariantAlleleOriginSomatic', + '3': 'dcs:VariantAlleleOriginGermline, dcs:VariantAlleleOriginSomatic' +} + +VC_ENUM = { + '1': 'dcs:VariationTypeSNV', + 'SNV': 'dcs:VariationTypeSNV', + '2': 
'dcs:VariationTypeDIV', + 'DIV': 'dcs:VariationTypeDIV', + '3': 'dcs:VariationTypeHeterozygous', + 'HETEROZYGOUS': 'dcs:VariationTypeHeterozygous', + '4': 'dcs:VariationTypeSTR', + 'STR': 'dcs:VariationTypeSTR', + '5': 'dcs:VariationTypeNamed', + 'NAMED': 'dcs:VariationTypeNamed', + '6': 'dcs:VariationTypeNoVariation', + 'NO VARIATION': 'dcs:VariationTypeNoVariation', + '7': 'dcs:VariationTypeMixed', + 'MIXED': 'dcs:VariationTypeMixed', + '8': 'dcs:VariationTypeMNV', + 'MNV': 'dcs:VariationTypeMNV', + '9': 'dcs:VariationTypeException', + 'Exception': 'dcs:VariationTypeException', + 'INS': 'dcs:VariationTypeINS', + 'DEL': 'dcs:VariationTypeDEL', + 'INDEL': 'dcs:VariationTypeINDEL' +} + +REVIEW_STATUS_ENUM = { + 'no_assertion': 'dcs:ClinVarReviewStatusNoAssertion', # No asserition provided by submitter + 'no_assertion_criteria_provided': 'dcs:ClinVarReviewStatusNoAssertion', + 'no_criteria': + 'dcs:ClinVarReviewStatusNoCriteria', # No assertion criteria provided by submitter + 'no_assertion_criteria_provided': 'dcs:ClinVarReviewStatusNoCriteria', + 'no_assertion': 'dcs:ClinVarReviewStatusNoCriteria', + 'no_assertion_provided': 'dcs:ClinVarReviewStatusNoCriteria', + 'Single': 'dcs:ClinVarReviewStatusSingleSubmitter', # Classified by single submitter + '_single_submitter': 'dcs:ClinVarReviewStatusSingleSubmitter', + 'single_submitter': 'dcs:ClinVarReviewStatusSingleSubmitter', + 'mult': 'dcs:ClinVarReviewStatusMultipleSubmitters', # Classified by multiple submitters + '_multiple_submitters': 'dcs:ClinVarReviewStatusMultipleSubmitters', + 'multiple_submitters': 'dcs:ClinVarReviewStatusMultipleSubmitters', + 'conf': + 'dcs:ClinVarReviewStatusConflictingInterpretations', # Criteria provided conflicting interpretations + 'conflicting_interpretations': 'dcs:ClinVarReviewStatusConflictingInterpretations', + '_conflicting_interpretations': 'dcs:ClinVarReviewStatusConflictingInterpretations', + 'exp': 'dcs:ClinVarReviewStatusReviewed', # Reviewed by expert panel + 
'reviewed_by_expert_panel': 'dcs:ClinVarReviewStatusReviewed', + 'guideline': 'dcs:ClinVarReviewStatusPracticeGuideline', # Practice guideline + 'practice_guideline': 'dcs:ClinVarReviewStatusPracticeGuideline', + 'criteria_provided': 'dcs:ClinVarReviewStatusCriteriaProvided', + 'no_conflicts': 'dcs:ClinVarReviewStatusNoConflicts', + '_no_conflicts': 'dcs:ClinVarReviewStatusNoConflicts', + 'non_interpretation_for_the_single_variant': 'dcs:ClinVarReviewStatusNoInterpretation', + 'no_interpretation_for_the_single_varian': 'dcs:ClinVarReviewStatusNoInterpretation' +} + + +def load_json(hg38_file_path: str, hg19_file_path: str, gene_id_dcid_mapping_path) -> None: + global HG19_REFSEQ_DICT, HG38_DCID_DICT, GENE_ID_DCID_MAPPING + start_time = time.time() + hg19_dict = None + with open(hg19_file_path, 'r') as f: + hg19_dict = json.load(f) + for hg in hg19_dict: + HG19_REFSEQ_DICT[hg['refSeqAccession']] = hg['dcid'] + + hg38_dict = None + with open(hg38_file_path, 'r') as f: + hg38_dict = json.load(f) + for hg in hg38_dict: + HG38_DCID_DICT[hg['refSeqAccession']] = [hg['dcid'], hg['name']] + + with open(gene_id_dcid_mapping_path) as f: + next(f) # Skip the header + reader = csv.reader(f, skipinitialspace=True) + GENE_ID_DCID_MAPPING = dict(reader) + + logging.info(f"Count of GENE_ID_DCID_MAPPING loaded {len(GENE_ID_DCID_MAPPING)}") + logging.info(f"Time take to load mapping files {int((time.time() - start_time))} sec") + + +def load_mapping_data(file_path: str) -> None: + global CUI_DCID_MAPPING_DICT + with open(file_path, 'r') as csv_file: + next(csv_file) + for input_row in csv_file: + line = input_row.split(',') + CUI_DCID_MAPPING_DICT[line[1]] = { + "dcid": line[0], + 'name': line[2], + 'is_drug_response': line[3] + } + logging.info(f"CUI DCID MAPPING records {len(CUI_DCID_MAPPING_DICT)}") + + +def process_input_csv(input_file: str, dbsnp_hg38_file_path: str, dbsnp_hg38_alleles_file_path: str, + dbsnp_hg38_allele_disease_file_path: str, + 
dbsnp_hg38_allele_drug_file_path: str, + dbsnp_hg38_freq_file_path: str) -> None: + + global CUI_DCID_MAPPING_DICT, writer_hg38, writer_hg38_alleles, writer_hg38_allele_disease, writer_hg38_allele_drug, writer_hg38_freq + + # open all output file and write header + with open(dbsnp_hg38_file_path, 'w') as output_hg38, open( + dbsnp_hg38_alleles_file_path, 'w') as output_hg38_alleles, open( + dbsnp_hg38_allele_disease_file_path, 'w') as output_hg38_allele_disease, open( + dbsnp_hg38_allele_drug_file_path, + 'w') as output_hg38_allele_drug, open(dbsnp_hg38_freq_file_path, + 'w') as output_hg38_freq: + writer_hg38 = csv.DictWriter(output_hg38, HG38_DICT, extrasaction='ignore') + writer_hg38.writeheader() + + writer_hg38_alleles = csv.DictWriter(output_hg38_alleles, + HG38_ALLELES_DICT, + extrasaction='ignore') + writer_hg38_alleles.writeheader() + + writer_hg38_allele_disease = csv.DictWriter(output_hg38_allele_disease, + HG38_ALLELES_DISEASE_DICT, + extrasaction='ignore') + writer_hg38_allele_disease.writeheader() + + writer_hg38_allele_drug = csv.DictWriter(output_hg38_allele_drug, + HG38_ALLELES_DRUG_DICT, + extrasaction='ignore') + writer_hg38_allele_drug.writeheader() + + writer_hg38_freq = csv.DictWriter(output_hg38_freq, HG38_FREQ_DICT, extrasaction='ignore') + writer_hg38_freq.writeheader() + + counters = Counters() + counters.add_counter('total', file_util.file_estimate_num_rows(input_file)) + + with open(input_file, 'r') as input_file_csv: + for line in input_file_csv: + # skip row + if line[0] == '#': + continue + + # core process starts here + else: + input_row = line.replace('\n', '').split('\t') + + # dcid + dcid = f'bio/{input_row[2]}' + rsID = input_row[2] + hg38_dcid = None + if input_row[0] in HG38_DCID_DICT: + hg38_dcid = HG38_DCID_DICT[input_row[0]] + + hg38_row, dict_info = parse_hg38_row(input_row, dcid, rsID, hg38_dcid) + + # process hg38_freq + process_hg38_freq(dcid, input_row[3], input_row[4], dict_info, rsID, + writer_hg38_freq) + + # Process 
def writeGeneInfo(value, prop, row):
    """Store the genes named in a GENEINFO-style INFO value on ``row[prop]``.

    Args:
        value: '|'-separated entries. Only the text before the first ':' of
            each entry is used.
        prop: key of ``row`` that receives the result.
        row: dict that is mutated in place.

    Returns:
        None. ``row[prop]`` is set to a comma-joined string of
        ``dcid:bio/<text>`` references, or '' when no usable entry exists.
    """
    gene_dcids = []
    for entry in value.split('|'):
        symbol = entry.split(':', maxsplit=1)[0].strip()
        # An empty value or a trailing '|' would otherwise emit the bogus
        # reference 'dcid:bio/'.
        if not symbol:
            continue
        gene_dcids.append(f'dcid:bio/{symbol}')

    row[prop] = ','.join(gene_dcids)
def write_reason_code(value):
    """Decode a suspect-reason-code bit field into enum references.

    The caller passes ``int(dict_info['SSR'])``. Each known bit maps to one
    ``dcs:VariantSuspectReasonCodes*`` value; 0 means "unspecified".

    Args:
        value: non-negative integer bit field.

    Returns:
        List of enum reference strings, highest bit first. Empty when no
        known bit is set or when ``value`` is not a usable integer.
    """
    # (bit, enum) pairs, ordered high-to-low to keep the output order stable.
    known_flags = (
        (1024, 'dcs:VariantSuspectReasonCodesOther'),
        (16, 'dcs:VariantSuspectReasonCodes1kgFailed'),
        (8, 'dcs:VariantSuspectReasonCodesParaEST'),
        (4, 'dcs:VariantSuspectReasonCodesOldAlign'),
        (2, 'dcs:VariantSuspectReasonCodesByEST'),
        (1, 'dcs:VariantSuspectReasonCodesParalog'),
    )

    if not isinstance(value, int) or value < 0:
        logging.info(f"Error parsing Reason Code {value}")
        return []

    if value == 0:
        return ['dcs:VariantSuspectReasonCodesUnspecified']

    line = [name for bit, name in known_flags if value & bit]

    # Bits 32..512 carry no enum and are ignored without comment, as before.
    # Anything at or above 2048 is outside the known range: report it rather
    # than letting it bleed into the lower flags.
    leftover = value & ~0x7FF
    if leftover:
        logging.info(f'Suspect Reason Code Error: value = {leftover}, original = {value}')
    return line
len(hgvs) > idx: + row['CLNHGVS'] = hgvs[idx] + if db_set: + for db in db_set: + row[db] = db_set[db] + file.writerow(row) + + +def process_hg38_alleles_disease_drug(dcid, dict_info, disease_file, drug_file) -> None: + dcid_disease = None + name_disease = None + is_drug_response = False + dcid_compound = [] + global CUI_DCID_MAPPING_DICT + if 'CLNDISDB' in dict_info: + values = [x for x in dict_info['CLNDISDB'].split(',') if len(x) > 1] + for val in values: + if 'MedGen' in val: + cui = val.split(':')[1] + dcid_disease = f'bio/{cui}' + try: + name_disease = CUI_DCID_MAPPING_DICT[cui]['name'] + is_drug_response = CUI_DCID_MAPPING_DICT[cui]['is_drug_response'] + dcid_compound.append(CUI_DCID_MAPPING_DICT[cui]['dcid']) + except: + pass + + if not dcid_disease or not name_disease: + if 'CLNDN' in dict_info: + if not dcid_disease: + dcid_disease = f'bio/{get_disease_pascal_case(dict_info["CLNDN"])}' + if not name_disease: + name_disease = dict_info['CLNDN'] + name_disease = name_disease.replace('_', ' ').replace('x2c_', '').replace( + '-', '').replace(',', ' ').replace('\\', ' ') + + row = deepcopy(HG38_ALLELES_DISEASE_DICT) + row['dcid'] = dcid + row['dcid_disease'] = dcid_disease + row['name_disease'] = name_disease + + if 'CLNACC' in dict_info: + acc = [d for d in dict_info['CLNACC'].split(",") if len(d) > 1] + row['dcid_allele_disease_association'] = f'bio/{acc[0]}' + row['name_allele_disease_association'] = acc[0] + row['CLNACC'] = ",".join(acc) + + if 'CLNORIGIN' in dict_info: + row['alleleOrigin'] = writeOrigin(dict_info['CLNORIGIN']) + + if 'CLNSIG' in dict_info: + sigs_lst = set() + for sigs in dict_info['CLNSIG'].split(","): + for sig in sigs.split('|'): + for s in sig.split('/'): + if len(s) > 0 and s != '.': + sigs_lst.add(HG38_SIG_DICT[s]) + + row['clinicalSignificance'] = ','.join(sigs_lst) + + if 'CLNDISDB' in dict_info: + db_dict = getDatabasetoColMapping(dict_info['CLNDISDB']) + for db in db_dict: + row[db] = ",".join(db_dict[db]) + + if 'CLNREVSTAT' in 
def getDatabasetoColMapping(value):
    """Group the ``db:id`` cross references of a CLNDISDB value by output column.

    ``value`` is split on ','; each entry is further split on '\\' or, failing
    that, on '/'. Every resulting ``db:id`` pair is routed to a column:

    * ``Human_Phenotype_Ontology`` -> ``humanPhenotypeOntologyID`` (raw id)
    * ``MeSH`` -> ``medicalSubjectHeadingID`` (id prefixed with ``bio/``)
    * ``Gene`` -> ``geneID`` (id translated through GENE_ID_DCID_MAPPING;
      unknown ids are logged and dropped)
    * anything else -> ``HG38_DB_COL_MAPPING[db]`` when present, otherwise
      the database name itself

    Args:
        value: raw comma-separated string; '.' and empty entries are ignored.

    Returns:
        dict mapping column name to the list of collected values.
    """
    result_dict = {}
    for entry in value.split(','):
        if not entry or entry == '.':
            continue

        if '\\' in entry:
            refs = entry.split('\\')
        elif '/' in entry:
            refs = entry.split('/')
        else:
            refs = [entry]

        for ref in refs:
            if ':' not in ref:
                continue
            db, val = ref.split(':', maxsplit=1)

            if db == 'Human_Phenotype_Ontology':
                column, item = 'humanPhenotypeOntologyID', val
            elif db == 'MeSH':
                column, item = 'medicalSubjectHeadingID', f'bio/{val}'
            elif db == 'Gene':
                if val not in GENE_ID_DCID_MAPPING:
                    logging.info(f"Gene {val} not available in GENE_ID_DCID_MAPPING")
                    continue
                column, item = 'geneID', GENE_ID_DCID_MAPPING[val]
            else:
                # Unknown databases keep their own name as the column.
                column, item = HG38_DB_COL_MAPPING.get(db, db), val

            # One accumulation path for every column. The previous code
            # checked for a 'Gene' key but stored under 'geneID', so each new
            # gene replaced the list instead of extending it.
            result_dict.setdefault(column, []).append(item)

    return result_dict
def generate_short_id(input_str):
    """Encode the fingerprint of ``input_str`` as a short string.

    The 64-bit fingerprint is consumed from the low bits upward: each step
    looks up ``_BASE_32_MAP`` with the low five bits, then shifts the
    fingerprint right by ``_NUM_BITS_32``. Encoding stops after
    ``_LONG_ID_LEN`` characters or as soon as the remaining value is zero.
    """
    remaining = robust_farm_fingerprint_64(input_str)
    chars = []
    emitted = 0
    while emitted < _LONG_ID_LEN:
        chars.append(_BASE_32_MAP[remaining & 0x1f])
        emitted += 1
        remaining >>= _NUM_BITS_32
        if remaining == 0:
            break
    return ''.join(chars)
+ """ + + if isinstance(data, str): + data = data.encode("utf-8") # Ensure bytes for hashing consistency + + # Modified from FarmHash (reference: https://github.com/google/farmhash) + size = len(data) + h = size * 0x811c9dc5 + + if size >= 8: + h = (h ^ struct.unpack("= 4: + h = (h ^ struct.unpack("= 2: + h = (h ^ struct.unpack("> 33) * 0xc2b2ae35 + h = h ^ h >> 29 + return h + + +def main(input_file_name: str) -> None: + """ Main method + + Args: + input_file (str): file path to process + """ + logging.set_verbosity('info') + logging.info(f"HG38 processing input file {input_file_name} - {dt.now()}") + start_time = time.time() + + input_file_path = os.path.join(MODULE_DIR + '/' + _FLAGS.input_dir, input_file_name) + + output_file_name = input_file_name.split('.')[0] + '.csv' + dbsnp_hg38_file_path = os.path.join(MODULE_DIR + '/' + _FLAGS.output_dir, + dbsnp_hg38_file_name.format(output_file_name)) + dbsnp_hg38_alleles_file_path = os.path.join( + MODULE_DIR + '/' + _FLAGS.output_dir, dbsnp_hg38_alleles_file_name.format(output_file_name)) + dbsnp_hg38_allele_disease_file_path = os.path.join( + MODULE_DIR + '/' + _FLAGS.output_dir, + dbsnp_hg38_allele_disease_file_name.format(output_file_name)) + dbsnp_hg38_allele_drug_file_path = os.path.join( + MODULE_DIR + '/' + _FLAGS.output_dir, + dbsnp_hg38_allele_drug_file_name.format(output_file_name)) + dbsnp_hg38_freq_file_path = os.path.join(MODULE_DIR + '/' + _FLAGS.output_dir, + dbsnp_hg38_freq_file_name.format(output_file_name)) + logging.info("load mapping data") + load_mapping_data( + os.path.join(MODULE_DIR + '/' + _FLAGS.mapping_file_dir, cui_dict_mapping_file_name)) + logging.info("load JSON data") + load_json(os.path.join(MODULE_DIR + '/' + _FLAGS.json_dir, hg19_genome_assembly_file_name), + os.path.join(MODULE_DIR + '/' + _FLAGS.json_dir, hg38_genome_assembly_file_name), + os.path.join(MODULE_DIR, _FLAGS.gene_id_dcid_mapping)) + + process_input_csv(input_file_path, dbsnp_hg38_file_path, 
def main() -> None:
    """Convert the gene_condition_source_id table into the association CSV.

    Reads the tab-separated input file under ``<MODULE_DIR>/<input_dir>``,
    skips its first (header) line, and writes one CSV row per data line to
    ``<MODULE_DIR>/<output_dir>/<output_csv_file_name>``.

    Columns read from each input row (0-based): 1 and 2 supply the gene
    symbol (column 1 wins and marks the row causal), 3 the disease id, 4 the
    disease name, 5 the source name, 8 the last-updated date.
    """
    # os.path.join with separate components keeps the path relative when
    # MODULE_DIR is '' instead of silently turning it into '/<input_dir>'.
    input_csv = os.path.join(MODULE_DIR, _FLAGS.input_dir, gene_condition_source_id_file_name)
    output_csv = os.path.join(MODULE_DIR, _FLAGS.output_dir, output_csv_file_name)

    # newline='' is what the csv module expects for files it writes to.
    with open(input_csv, 'r') as input_file_csv, \
            open(output_csv, 'w', newline='') as output_file_csv:
        writer = csv.DictWriter(output_file_csv, CSV_DICT)
        writer.writeheader()

        # skip first row; the default keeps an empty file from raising
        next(input_file_csv, None)
        for line in input_file_csv:
            # Blank lines (e.g. a trailing newline) carry no record.
            if not line.strip():
                continue

            input_row = line.rstrip('\n').split('\t')
            current_row = copy.deepcopy(CSV_DICT)

            # Column 1 takes precedence over column 2 and marks the row causal.
            isCausal = len(input_row[1]) > 0
            geneSymbol = input_row[1] if isCausal else input_row[2]

            current_row['dcid'] = f'bio/{input_row[3]}_{geneSymbol}'
            current_row['dcid_disease'] = f'bio/{input_row[3]}'
            current_row['dcid_gene'] = f'bio/{geneSymbol}'
            current_row['name'] = f'"{input_row[4]} and {geneSymbol} Association"'
            current_row['isCausal'] = isCausal
            current_row['sourceName'] = input_row[5]
            if len(input_row[8]) > 0:
                try:
                    LastUpdated = ds.parse(input_row[8])
                    current_row['LastUpdated'] = LastUpdated.strftime('%Y-%m-%d')
                except (ValueError, OverflowError):
                    # Unparseable dates are reported and left empty.
                    print(f"LastUpdated date format issue {input_row[8]}")

            # write to output
            writer.writerow(current_row)
def main(input_file: str, output_file: str, json_file_name: str, assembly_type: str) -> None:
    """Convert one genome assembly report into a CSV and a JSON lookup file.

    Args:
        input_file: report file name under ``<MODULE_DIR>/<input_dir>``.
        output_file: CSV file name written under ``<MODULE_DIR>/<output_dir>``.
        json_file_name: JSON file name written under the same output dir.
        assembly_type: prefix used in generated dcids (the script calls this
            with 'hg19' and 'hg38').

    Lines starting with '#' and blank lines are skipped. Every other line is
    split on tabs; columns are read by position (0 name, 1 role, 2 assigned
    molecule, 4 GenBank accession, 6 RefSeq accession, 7 assembly unit,
    8 length, 9 UCSC-style name).

    Raises:
        KeyError: when a row's role is missing from SEQUENCE_ROLE_DICT.
    """
    # Separate join components keep the path relative when MODULE_DIR is ''.
    input_csv = os.path.join(MODULE_DIR, _FLAGS.input_dir, input_file)
    output_csv = os.path.join(MODULE_DIR, _FLAGS.output_dir, output_file)
    json_file_path = os.path.join(MODULE_DIR, _FLAGS.output_dir, json_file_name)

    genome_assembly_json = []
    # newline='' is what the csv module expects for files it writes to.
    with open(input_csv, 'r') as input_file_csv, \
            open(output_csv, 'w', newline='') as output_file_csv:
        writer = csv.DictWriter(output_file_csv, CSV_DICT)
        writer.writeheader()

        for line in input_file_csv:
            # skip comment/header rows and blank lines
            if line.startswith('#') or not line.strip():
                continue

            input_row = line.rstrip('\n').split('\t')
            current_row = copy.deepcopy(CSV_DICT)
            # synonym: column 0 is the sequence name. Column 1 is the role
            # (it is the SEQUENCE_ROLE_DICT key below), so it must not be
            # reused here.
            current_row['Sequence-Name'] = input_row[0]
            # dnaSequenceRole
            current_row['Sequence-Role'] = SEQUENCE_ROLE_DICT[input_row[1]]
            # inChromosome
            # NOTE(review): this stores the row's own UCSC-based dcid rather
            # than one derived from column 2, and the len > 1 test skips
            # single-character molecules -- confirm this is intended.
            if len(input_row[2]) > 1 and input_row[2] != input_row[0]:
                current_row['Assigned-Molecule'] = f'bio/{assembly_type}_{input_row[9]}'

            current_row['GenBank-Accn'] = input_row[4]
            current_row['RefSeq-Accn'] = input_row[6]
            current_row['Assembly-Unit'] = input_row[7]
            current_row['Sequence-Length'] = input_row[8]
            current_row['UCSC-style-name'] = input_row[9]
            dcid = f"bio/{assembly_type}_{input_row[9]}"
            current_row['dcid'] = dcid
            current_row['dcid_quantity'] = f'BasePairs{input_row[8]}'
            current_row['name_quantity'] = f'"BasePairs {input_row[8]}"'

            # write to output
            writer.writerow(current_row)

            # collect the entry for the JSON lookup file
            genome_assembly_json.append({
                "dcid": dcid,
                "name": input_row[9],
                "refSeqAccession": input_row[6]
            })

    with open(json_file_path, 'w', encoding='utf-8') as json_file:
        json.dump(genome_assembly_json, json_file, ensure_ascii=False, indent=1)

    print(f"Assembly file {assembly_type} completed.")
+ 'OMIM': '', + 'OMIM_Phenotypic_Series': '', + 'OMIM_Allelic_Variant': '', + 'Orphanet': '', + 'SNOMEDCT_US': '', + 'dcid_compound': '', + 'dcid_atc_code': '', + 'dcid_mesh': '', + 'is_drug_response': False +} + +CUI_DCID_MAPPING_DICT = {'dcid': '', 'CUI': '', 'name': '', 'is_drug_response': ''} + +CUI_ID_SET = set() + +SOURCE_DICT = { + 'GTR': 'Genetic Testing Registry', + 'MSH': 'Medical Subject Headings', + 'NCI': 'NCI Thesaurus', + 'OMIM': 'Online Mendelian Inheritance in Man', + 'ORDO': 'Orphanet Rare Disease Ontology (ORDO)', + 'SNOMEDCT_US': 'US Edition of SNOMED CT' +} + +SOURCE_DEFINITION_DICT = { + 'AIR': + 'dcs:DiseaseSourceDefinitionAiRheum', + 'AOT': + 'dcs:DiseaseSourceDefinitionAuthorizedOsteopathicThesaurus', + 'CCC': + 'dcs:DiseaseSourceDefinitionClinicalCareClassificationTwoPointFive', + 'CHV': + 'dcs:DiseaseSourceDefinitionConsumerHealthVocabulary', + 'CSP': + 'dcs:DiseaseSourceDefinitionCrispThesaurus', + 'Clinical Pharmacogenetics Implementation Consortium': + 'dcs:DiseaseSourceDefinitionClinicalPharmacogeneticsImplementationConsortium', + 'GO': + 'dcs:DiseaseSourceDefinitionGeneOntology', + 'GeneReviews': + 'dcs:DiseaseSourceDefinitionGeneReviews', + 'HL7V3.0': + 'dcs:DiseaseSourceDefinitionHL7VocabularyVersionThreePointZero', + 'HPO': + 'dcs:DiseaseSourceDefinitionHumanPhenotypeOntology', + 'ICF-CY': + 'dcs:DiseaseSourceDefinitionInternationalClassificationOfFunctioninDisabilityAndHealthForChildrenAndYouth', + 'JABL': + 'dcs:DiseaseSourceDefinitionOnlineCongenitalMultipleAnomalyMentalRetardationSyndromes', + 'LNC': + 'dcs:DiseaseSourceDefinitionLoinc', + 'MEDLINEPLUS': + 'dcs:DiseaseSourceDefinitionMedlinePlus', + 'MONDO': + 'dcs:DiseaseSourceDefinitionMonarchInitiative', + 'MSH': + 'dcs:DiseaseSourceDefinitionMedicalSubjectHeading', + 'Medical Genetics Summaries': + 'dcs:DiseaseSourceDefinitionMedicalGeneticsSummaries', + 'MedlinePlus Genetics': + 'dcs:DiseaseSourceDefinitionMedlinePlusGenetics', + 'NANDA-I': + 
'dcs:DiseaseSourceDefinitionNANDAITaxonomyII', + 'NCBI curation': + 'dcs:DiseaseSourceDefinitionNCBI', + 'NCI': + 'dcs:DiseaseSourceDefinitionNCIThesaurus', + 'NOC': + 'dcs:DiseaseSourceDefinitionNursingOutcomesClassificationThirdEdition', + 'OMIM': + 'dcs:DiseaseSourceDefinitionOnlineMendelianInheritanceInMan', + 'OMS': + 'dcs:DiseaseSourceDefinitionOmahaSystem', + 'ORDO': + 'dcs:DiseaseSourceDefinitionOrphanetRareDiseaseOntology', + 'ORPHANET': + 'dcs:DiseaseSourceDefinitionOrphanet', + 'PDQ': + 'dcs:DiseaseSourceDefinitionPhysicianDataQuery', + 'PNDS': + 'dcs:DiseaseSourceDefinitionPerioperativeNursingDataSetSecondEdition', + 'PSY': + 'dcs:DiseaseSourceDefinitionThesaurusOfPsychologicalIndexTerms', + 'PharmGKB': + 'dcs:DiseaseSourceDefinitionPharmGKB', + 'SNOMEDCT_US': + 'dcs:DiseaseSourceDefinitionSnomedCtUs' +} + +DCID_CUI_ASSOCIATE = { + 'C0568062': { + 'name': 'methotrexate response - Toxicity', + 'dcid_compound': 'dcs:chem/CID126941', + 'dcid_atc_code': 'dcs:chem/L04AX03', + 'dcid_mesh': 'dcs:bio/D008727' + }, + 'CN236531': { + 'name': 'fentanyl response - Dosage', + 'dcid_compound': 'dcs:chem/CID3345', + 'dcid_atc_code': 'dcs:chem/N01AH01', + 'dcid_mesh': 'dcs:bio/D005283' + }, + 'CN236536': { + 'name': 'methadone response - Dosage', + 'dcid_compound': 'dcs:chem/CID4095', + 'dcid_atc_code': 'dcs:chem/N07BC02', + 'dcid_mesh': 'dcs:bio/D008691' + }, + 'CN236588': { + 'name': 'warfarin response - Efficacy', + 'dcid_compound': 'dcs:chem/CID54678486', + 'dcid_atc_code': 'dcs:chem/B01AA03', + 'dcid_mesh': 'dcs:bio/D014859' + }, + 'CN262133': { + 'name': 'vincristine response - Toxicity/ADR', + 'dcid_compound': 'dcs:chem/CID5978', + 'dcid_atc_code': 'dcs:chem/L01CA02', + 'dcid_mesh': 'dcs:bio/D014750' + }, + 'CN322717': { + 'name': + 'interferons, peginterferon alfa-2a, peginterferon alfa-2b, and ribavirin response - Efficacy', + 'dcid_compound': + 'dcs:bio/CHEMBL1201560, dcs:bio/CHEMBL1201561, dcs:chem/CID37542', + 'dcid_atc_code': + 'dcs:chem/L03AB11, 
dcs:chem/L03AB10, dcs:chem/J05AP01', + 'dcid_mesh': + 'dcs:bio/C100416, dcs:bio/C417083,dcs:bio/D012254' + }, + 'CN322718': { + 'name': 'peginterferon alfa-2a, peginterferon alfa-2b, and ribavirin response - Efficacy', + 'dcid_compound': 'dcs:bio/CHEMBL1201560, dcs:bio/CHEMBL1201561, dcs:chem/CID37542', + 'dcid_atc_code': 'dcs:chem/L03AB11, dcs:chem/L03AB10, dcs:chem/J05AP01', + 'dcid_mesh': 'dcs:bio/C100416, dcs:bio/C417083,dcs:bio/D012254' + }, + 'CN322719': { + 'name': + 'peginterferon alfa-2a, peginterferon alfa-2b, ribavirin, and telaprevir response - Efficacy', + 'dcid_compound': + 'dcs:bio/CHEMBL1201560, dcs:bio/CHEMBL1201561, dcs:chem/CID37542, dcs:chem/CID3010818', + 'dcid_atc_code': + 'dcs:chem/L03AB11, dcs:chem/L03AB10, dcs:chem/J05AP01, dcs:chem/J05AP02', + 'dcid_mesh': + 'dcs:bio/C100416, dcs:bio/C417083,dcs:bio/D012254, dcs:bio/C486464' + }, + 'CN322720': { + 'name': 'Ace Inhibitors, Plain response - Toxicity/ADR', + 'dcid_compound': '', + 'dcid_atc_code': 'dcs:chem/C09A', + 'dcid_mesh': 'dcs:bio/D000806' + }, + 'CN322721': { + 'name': 'acenocoumarol response - Dosage', + 'dcid_compound': 'dcs:chem/CID54676537', + 'dcid_atc_code': 'dcs:chem/B01AA07', + 'dcid_mesh': 'dcs:bio/D000074' + }, + 'CN322722': { + 'name': 'adalimumab response - Efficacy', + 'dcid_compound': 'dcs:bio/CHEMBL1201580', + 'dcid_atc_code': 'dcs:chem/L04AB04', + 'dcid_mesh': 'dcs:bio/D000068879' + }, + 'CN322723': { + 'name': 'alfentanil response - Metabolism/PK', + 'dcid_compound': 'dcs:chem/CID51263', + 'dcid_atc_code': 'dcs:chem/N01AH02', + 'dcid_mesh': 'dcs:bio/D015760' + }, + 'CN322724': { + 'name': 'atorvastatin response - Efficacy', + 'dcid_compound': 'dcs:chem/CID60823', + 'dcid_atc_code': 'dcs:chem/C10AA05', + 'dcid_mesh': 'dcs:bio/D000069059' + }, + 'CN322725': { + 'name': 'captopril response - Efficacy', + 'dcid_compound': 'dcs:chem/CID44093', + 'dcid_atc_code': 'dcs:chem/C09AA01', + 'dcid_mesh': 'dcs:bio/D002216' + }, + 'CN322726': { + 'name': 'carbamazepine response - 
Dosage', + 'dcid_compound': 'dcs:chem/CID2554', + 'dcid_atc_code': 'dcs:chem/N03AF01', + 'dcid_mesh': 'dcs:bio/D002220' + }, + 'CN322727': { + 'name': 'clopidogrel response - Efficacy', + 'dcid_compound': 'dcs:chem/CID60606', + 'dcid_atc_code': 'dcs:chem/B01AC04', + 'dcid_mesh': 'dcs:bio/D000077144' + }, + 'CN322728': { + 'name': 'phenprocoumon response - Dosage', + 'dcid_compound': 'dcs:chem/CID54680692', + 'dcid_atc_code': 'dcs:chem/B01AA04', + 'dcid_mesh': 'dcs:bio/D010644' + }, + 'CN322729': { + 'name': 'warfarin response - Dosage', + 'dcid_compound': 'dcs:chem/CID54678486', + 'dcid_atc_code': 'dcs:chem/B01AA03', + 'dcid_mesh': 'dcs:bio/D014859' + }, + 'CN322730': { + 'name': 'efavirenz response - Metabolism/PK', + 'dcid_compound': 'dcs:chem/CID64139', + 'dcid_atc_code': 'dcs:chem/J05AG03', + 'dcid_mesh': 'dcs:bio/C098320' + }, + 'CN322731': { + 'name': 'erlotinib response - Efficacy', + 'dcid_compound': 'dcs:chem/CID176870', + 'dcid_atc_code': 'dcs:chem/L01EB02', + 'dcid_mesh': 'dcs:bio/D000069347' + }, + 'CN322732': { + 'name': 'etanercept response - Efficacy', + 'dcid_compound': 'dcs:bio/CHEMBL1201572', + 'dcid_atc_code': 'dcs:chem/L04AB01', + 'dcid_mesh': 'dcs:bio/D000068800' + }, + 'CN322733': { + 'name': 'gefitinib response - Efficacy', + 'dcid_compound': 'dcs:chem/CID123631', + 'dcid_atc_code': 'dcs:chem/L01EB01', + 'dcid_mesh': 'dcs:bio/D000077156' + }, + 'CN322734': { + 'name': 'hydrochlorothiazide response - Efficacy', + 'dcid_compound': 'dcs:chem/CID3639', + 'dcid_atc_code': 'dcs:chem/C03AA03', + 'dcid_mesh': 'dcs:bio/D006852' + }, + 'CN322735': { + 'name': 'ivacaftor response - Efficacy', + 'dcid_compound': 'dcs:chem/CID16220172', + 'dcid_atc_code': 'dcs:chem/R07AX02', + 'dcid_mesh': 'dcs:bio/C545203' + }, + 'CN322736': { + 'name': 'methotrexate response - Efficacy', + 'dcid_compound': 'dcs:chem/CID126941', + 'dcid_atc_code': 'dcs:chem/L04AX03', + 'dcid_mesh': 'dcs:bio/D008727' + }, + 'CN322737': { + 'name': 'pravastatin response - Efficacy', + 
def get_pascal_case(s: str, sep=None):
    """Upper-case the first character of ``s``, or of each ``sep``-separated part.

    When ``sep`` is truthy and occurs in ``s``, the string is split on it, the
    first character of every part is upper-cased, and the parts are joined
    without a separator. Otherwise only the first character of ``s`` is
    upper-cased. The remaining characters are left as they are.
    """

    def _capitalize_first(text):
        return text[:1].upper() + text[1:]

    if not sep or sep not in s:
        return _capitalize_first(s)
    return "".join(_capitalize_first(part) for part in s.split(sep))
+ with open(MGSTY_file_path, mode='r') as f: + next(f) + for line in f: + val = line.split('|') + add_to_cui_set(val[0]) + + with open(NAMES_file_path, mode='r') as f: + next(f) + for line in f: + val = line.split('|') + if val[3] == 'N': + add_to_cui_set(val[0]) + + for line in MGDEF_records: + val = line.split('|') + if val[3] == 'N': + add_to_cui_set(val[0]) + + with open(MedGenIDMappings_file_path, mode='r') as f: + next(f) + for line in f: + val = line.split('|') + add_to_cui_set(val[0]) + + # final output dict + CSV_OUTPUT_DICT = {} + global CUI_ID_SET + for id in CUI_ID_SET: + obj = copy.deepcopy(CSV_DICT) + obj['CUI'] = id + obj['dcid'] = f"bio/{id}" + if id in DCID_CUI_ASSOCIATE: + obj['dcid_compound'] = DCID_CUI_ASSOCIATE[id]['dcid_compound'] + obj['dcid_atc_code'] = DCID_CUI_ASSOCIATE[id]['dcid_atc_code'] + obj['dcid_mesh'] = DCID_CUI_ASSOCIATE[id]['dcid_mesh'] + obj['is_drug_response'] = True + CSV_OUTPUT_DICT[id] = obj + + # process individual files and update the respective properties to final output dict + with open(MGSTY_file_path, mode='r') as f: + next(f) + for line in f: + val = line.split('|') + try: + # Convert to MedGenSemanticTypeEnum + if val[0] in CUI_ID_SET: + CSV_OUTPUT_DICT[ + val[0]]['STY'] = f"dcs:MedGeneSemanticType{get_pascal_case(val[3], ' ')}" + except: + print(f"Error at {val[0]} in MGSTY file") + + with open(NAMES_file_path, mode='r') as f: + next(f) + for line in f: + val = line.split('|') + if val[3] == 'N': + if val[0] in CUI_ID_SET: + try: + curr_record = CSV_OUTPUT_DICT[val[0]] + name = val[1].replace('"', "'") + curr_record['name'] = f'"{name}"' + curr_record['source'] = f'"{SOURCE_DICT[val[2]]}"' + except: + print(f"source {val[2]} not in SOURCE_DICT") + + for line in MGDEF_records: + val = line.split('|') + if val[3] == 'N': + if val[0] in CUI_ID_SET: + try: + curr_record = CSV_OUTPUT_DICT[val[0]] + curr_record[ + 'source_definition'] = f'"{SOURCE_DEFINITION_DICT[val[2]].replace("[", "(").replace("]", ")")}"' + if "\n" in 
def add_to_cui_set(val: str) -> None:
    """Record the identifier carried by ``val`` in the module-level CUI_ID_SET.

    Only the text before the first space is stored; a value that contains no
    space is stored unchanged.

    Args:
        val: raw identifier field read from one of the input files.
    """
    # partition() yields the whole string as its first item when no space is
    # present, so both cases of the original branch collapse into one call.
    # The set is mutated in place, never rebound, so no `global` is needed.
    leading_token, _, _ = val.partition(' ')
    CUI_ID_SET.add(leading_token)
#!/bin/bash
#
# Split the dbSNP VCF files into line-based shards.
#
# Usage: split_files.sh [INPUT_DIR]
#   INPUT_DIR  directory that holds the GCF25/, GCF40/ and freq/
#              sub-directories (default: "input", relative to the current
#              working directory).
#
# Each shard is written next to its source file as <prefix><suffix>.vcf, where
# <suffix> is the alphabetic suffix generated by split(1). A split(1) that
# supports --additional-suffix (GNU coreutils) is required.
#
# Exit status: 0 when every file was split, 1 when at least one split failed.

# Failures are checked explicitly instead of using `set -e`, so that the
# remaining files are still attempted when an earlier one fails.
set -u

#######################################
# Split one VCF file into shards with a fixed maximum number of lines.
# Arguments:
#   $1 - label printed in the progress message
#   $2 - maximum number of lines per shard
#   $3 - path of the VCF file to split
#   $4 - output prefix of the shards (directory + file name stem)
# Outputs:
#   progress message on stdout, error message on stderr
# Returns:
#   0 on success, non-zero if the input is missing or split(1) fails
#######################################
split_vcf() {
  local label=$1
  local lines_per_shard=$2
  local vcf_path=$3
  local shard_prefix=$4

  echo "File split ${label} started"
  if [[ ! -f "${vcf_path}" ]]; then
    printf 'split_files: input file not found: %s\n' "${vcf_path}" >&2
    return 1
  fi
  split -l "${lines_per_shard}" --additional-suffix=.vcf -- \
    "${vcf_path}" "${shard_prefix}"
}

#######################################
# Split the three VCF inputs found under the input directory.
# Arguments:
#   $1 - input directory (optional, default "input")
# Returns:
#   0 when all splits succeeded, 1 otherwise
#######################################
main() {
  local input_dir=${1:-input}
  local status=0

  # The label now matches the file that is actually split (.25, not .24).
  split_vcf "GCF_000001405.25" 15000000 \
    "${input_dir}/GCF25/GCF_000001405.25.vcf" \
    "${input_dir}/GCF25/gcf25_shard_" || status=1

  split_vcf "GCF_000001405.40" 15000000 \
    "${input_dir}/GCF40/GCF_000001405.40.vcf" \
    "${input_dir}/GCF40/gcf40_shard_" || status=1

  split_vcf "freq" 30000000 \
    "${input_dir}/freq/freq.vcf" \
    "${input_dir}/freq/freq_shard_" || status=1

  return "${status}"
}

main "$@"
2013-06-28 +dnaSequenceRole: C:ncbi_GRCh37_genome_assembly_report->Sequence-Role +genBankAccession: C:ncbi_GRCh37_genome_assembly_report->GenBank-Accn +genomeAssemblyUnitName: C:ncbi_GRCh37_genome_assembly_report->Assembly-Unit +inChromosome: C:ncbi_GRCh37_genome_assembly_report->Assigned-Molecule +inGenomeAssembly: dcs:bio/GCA_000001405.14 +ofSpecies: dcs:bio/HomoSapiens +refSeqAccession: C:ncbi_GRCh37_genome_assembly_report->RefSeq-Accn +synonym: C:ncbi_GRCh37_genome_assembly_report->Sequence-Name diff --git a/scripts/biomedical/NCBI_dbSNP/tMCFs/GRCh38.tmcf b/scripts/biomedical/NCBI_dbSNP/tMCFs/GRCh38.tmcf new file mode 100644 index 000000000..3c8bf2b32 --- /dev/null +++ b/scripts/biomedical/NCBI_dbSNP/tMCFs/GRCh38.tmcf @@ -0,0 +1,20 @@ +Node: E:ncbi_GRCh38_genome_assembly_report->E1 +typeOf: schema:Quantity +dcid: C:ncbi_GRCh38_genome_assembly_report->dcid_quantity +name: C:ncbi_GRCh38_genome_assembly_report->name_quantity +unitOfMeasure: dcs:BasePairs +value: C:ncbi_GRCh38_genome_assembly_report->Sequence-Length + +Node: E:ncbi_GRCh38_genome_assembly_report->E2 +typeOf: dcs:Chromosome +dcid: C:ncbi_GRCh38_genome_assembly_report->dcid +chromosomeSize: E:ncbi_GRCh38_genome_assembly_report->E1 +dateCreated: 2022-02-03 +dnaSequenceRole: C:ncbi_GRCh38_genome_assembly_report->Sequence-Role +genBankAccession: C:ncbi_GRCh38_genome_assembly_report->GenBank-Accn +genomeAssemblyUnitName: C:ncbi_GRCh38_genome_assembly_report->Assembly-Unit +inChromosome: C:ncbi_GRCh38_genome_assembly_report->Assigned-Molecule +inGenomeAssembly: dcs:bio/GCA_000001405.29 +ofSpecies: dcs:bio/HomoSapiens +refSeqAccession: C:ncbi_GRCh38_genome_assembly_report->RefSeq-Accn +synonym: C:ncbi_GRCh38_genome_assembly_report->Sequence-Name diff --git a/scripts/biomedical/NCBI_dbSNP/tMCFs/MedGen.tmcf b/scripts/biomedical/NCBI_dbSNP/tMCFs/MedGen.tmcf new file mode 100644 index 000000000..6194b7211 --- /dev/null +++ b/scripts/biomedical/NCBI_dbSNP/tMCFs/MedGen.tmcf @@ -0,0 +1,22 @@ +Node: E:medgen->E1
+typeOf: dcs:MedGenConceptUniqueIdentifier +dcid: C:medgen->dcid +name: C:medgen->name +atcCode: C:medgen->dcid_atc_code +compoundID: C:medgen->dcid_compound +conceptUniqueIdentifier: C:medgen->CUI +description: C:medgen->DEF +descriptionSource: C:medgen->source_definition +geneticAndRareDiseasesID: C:medgen->GARD +humanPhenotypeOntologyID: C:medgen->HPO +mondoID: C:medgen->MONDO +medicalSubjectHeadingID: C:medgen->MeSH +medicalSubjectHeadingID: C:medgen->dcid_mesh +omimAllelicVariantID: C:medgen->OMIM_Allelic_Variant +omimID: C:medgen->OMIM +omimPhenotypicSeriesID: C:medgen->OMIM_Phenotypic_Series +orphaNumber: C:medgen->Orphanet +snomedCT: C:medgen->SNOMEDCT_US +source: C:medgen->source + + diff --git a/scripts/biomedical/NCBI_dbSNP/tMCFs/clinvar_diesease_gene.tmcf b/scripts/biomedical/NCBI_dbSNP/tMCFs/clinvar_diesease_gene.tmcf new file mode 100644 index 000000000..f2b1e1a77 --- /dev/null +++ b/scripts/biomedical/NCBI_dbSNP/tMCFs/clinvar_diesease_gene.tmcf @@ -0,0 +1,17 @@ +Node: E:clinvar_disease_gene->E1 +typeOf: dcs:Disease +dcid:C:clinvar_disease_gene->dcid_disease + +Node: E:clinvar_disease_gene->E2 +typeOf: dcs:Gene +dcid:C:clinvar_disease_gene->dcid_gene + +Node: E:clinvar_disease_gene->E3 +typeOf: dcs:DiseaseGeneAssociation +dcid: C:clinvar_disease_gene->dcid +name: C:clinvar_disease_gene->name +dateModified: C:clinvar_disease_gene->LastUpdated +diseaseID: E:clinvar_disease_gene->E1 +geneID: E:clinvar_disease_gene->E2 +isCausal: C:clinvar_disease_gene->isCausal +source: C:clinvar_disease_gene->sourceName diff --git a/scripts/biomedical/NCBI_dbSNP/tMCFs/dbsnp_freq.tmcf b/scripts/biomedical/NCBI_dbSNP/tMCFs/dbsnp_freq.tmcf new file mode 100644 index 000000000..6a4f046b7 --- /dev/null +++ b/scripts/biomedical/NCBI_dbSNP/tMCFs/dbsnp_freq.tmcf @@ -0,0 +1,21 @@ +Node: E:dbsnp_freq->E1 +typeOf: dcs:GeneticVariant +dcid: C:dbsnp_freq->dcid_gv + +Node: E:dbsnp_freq->E2 +dcid: C:dbsnp_freq->dcid +name: C:dbsnp_freq->name +typeOf: dcs:GeneticVariantPopulationFrequency
+alleleFrequency: C:dbsnp_freq->alleleFrequency +alternativeAllele: C:dbsnp_freq->alternativeAllele +genotypeHeterozygousFrequency: C:dbsnp_freq->genotypeHeterozygousFrequency +genotypeHomozygousAlternativeFrequency: C:dbsnp_freq->genotypeHomozygousAlternativeFrequency +genotypeHomozygousReferenceFrequency: C:dbsnp_freq->genotypeHomozygousReferenceFrequency +geneticVariantID: E:dbsnp_freq->E1 +hardyWeinbergEquationPValue: C:dbsnp_freq->hardyWeinbergEquationPValue +isGlobalPopulation: C:dbsnp_freq->isGlobalPopulation +measuredPopulation: C:dbsnp_freq->measuredPopulation +measuredProperty: schema:frequency +referenceAllele: C:dbsnp_freq->referenceAllele +rsID: C:dbsnp_freq->rsID +sampleSize: C:dbsnp_freq->sampleSize diff --git a/scripts/biomedical/NCBI_dbSNP/tMCFs/dbsnp_hg38.tmcf b/scripts/biomedical/NCBI_dbSNP/tMCFs/dbsnp_hg38.tmcf new file mode 100644 index 000000000..6ba446f95 --- /dev/null +++ b/scripts/biomedical/NCBI_dbSNP/tMCFs/dbsnp_hg38.tmcf @@ -0,0 +1,38 @@ +Node: E:dbsnp_hg38->E1 +typeOf: dcs:GenomicPosition +dcid: C:dbsnp_hg38->dcid_pos +name: C:dbsnp_hg38->name_pos +chrom: C:dbsnp_hg38->chrom +inChromosome: C:dbsnp_hg38->chrom +inGenomeAssembly: dcs:bio/GCA_000001405.29 +position: C:dbsnp_hg38->position + +Node: E:dbsnp_hg38->E2 +typeOf: dcs:GeneticVariant +dcid: C:dbsnp_hg38->dcid +name: C:dbsnp_hg38->name +alleleOrigin: C:dbsnp_hg38->alleleOrigin +alternativeAllele: C:dbsnp_hg38->alternativeAllele +dbSNPBuildID: C:dbsnp_hg38->dbSNPBuildID +geneID: C:dbsnp_hg38->geneID +geneID: C:dbsnp_hg38->geneID_2 +genotypesAvailable: C:dbsnp_hg38->genotypesAvailable +hasNonSynonymousFrameShift: C:dbsnp_hg38->hasNonSynonymousFrameShift +hasNonSynonymousMissenseMutation: C:dbsnp_hg38->hasNonSynonymousMissenseMutation +hasNonSynonymousNonsenseMutation: C:dbsnp_hg38->hasNonSynonymousNonsenseMutation +hasSynonymousMutation: C:dbsnp_hg38->hasSynonymousMutation +hg38GenomicPosition: E:dbsnp_hg38->E1 +inGenomeAssembly: dcs:bio/GCA_000001405.29 +isCommonVariant:
C:dbsnp_hg38->isCommonVariant +isInAcceptorSpliceSite: C:dbsnp_hg38->isInAcceptorSpliceSite +isInDonorSpliceSite: C:dbsnp_hg38->isInDonorSpliceSite +isInIntron: C:dbsnp_hg38->isInIntron +isInFivePrimeGeneRegion: C:dbsnp_hg38->isInFivePrimeGeneRegion +isInFivePrimeUTR: C:dbsnp_hg38->isInFivePrimeUTR +isInThreePrimeGeneRegion: C:dbsnp_hg38->isInThreePrimeGeneRegion +isInThreePrimeUTR: C:dbsnp_hg38->isInThreePrimeUTR +isPublished: C:dbsnp_hg38->isPublished +referenceAllele: C:dbsnp_hg38->referenceAllele +rsID: C:dbsnp_hg38->rsID +suspectReasonCode: C:dbsnp_hg38->suspectReasonCode +variantClass: C:dbsnp_hg38->variantClass diff --git a/scripts/biomedical/NCBI_dbSNP/tMCFs/dbsnp_hg38_allele_disease_associations.tmcf b/scripts/biomedical/NCBI_dbSNP/tMCFs/dbsnp_hg38_allele_disease_associations.tmcf new file mode 100644 index 000000000..9c5900449 --- /dev/null +++ b/scripts/biomedical/NCBI_dbSNP/tMCFs/dbsnp_hg38_allele_disease_associations.tmcf @@ -0,0 +1,44 @@ +Node: E:dbsnp_hg38_allele_disease_associations->E1 +typeOf: dcs:GeneticVariant +dcid: C:dbsnp_hg38_allele_disease_associations->dcid + +Node: E:dbsnp_hg38_allele_disease_associations->E2 +typeOf: dcs:Allele +dcid: C:dbsnp_hg38_allele_disease_associations->dcid_allele +arupLaboratoriesMolecularGeneticsAndGenomicsArupLaboratoriesID: C:dbsnp_hg38_allele_disease_associations->arupLaboratoriesMolecularGeneticsAndGenomicsArupLaboratoriesID +geneticTestingRegistryID: C:dbsnp_hg38_allele_disease_associations->geneticTestingRegistryID +geneticVariantID: E:dbsnp_hg38_allele_disease_associations->E1 +humanGeneMutationDatabaseID: C:dbsnp_hg38_allele_disease_associations->humanGeneMutationDatabaseID +omimID: C:dbsnp_hg38_allele_disease_associations->omimID +pharmGKBID: C:dbsnp_hg38_allele_disease_associations->pharmGKBID +uniProtID: C:dbsnp_hg38_allele_disease_associations->uniProtID + +Node: E:dbsnp_hg38_allele_disease_associations->E3 +typeOf: dcs:Disease +dcid: C:dbsnp_hg38_allele_disease_associations->dcid_disease +name:
C:dbsnp_hg38_allele_disease_associations->name_disease +experimentalFactorOntologyID: C:dbsnp_hg38_allele_disease_associations->experimentalFactorOntologyID +geneticReviewsID: C:dbsnp_hg38_allele_disease_associations->geneReviewsID +humanPhenotypeOntologyID: C:dbsnp_hg38_allele_disease_associations->humanPhenotypeOntologyID +medicalGeneticsSummariesID: C:dbsnp_hg38_allele_disease_associations->medicalGeneticsSummariesID +medicalSubjectHeadingID: C:dbsnp_hg38_allele_disease_associations->medicalSubjectHeadingID +officeOfRareDiseasesId: C:dbsnp_hg38_allele_disease_associations->officeOfRareDiseasesId +omimID: C:dbsnp_hg38_allele_disease_associations->omimID +orphaNumber: C:dbsnp_hg38_allele_disease_associations->orphaNumber +snomedCT: C:dbsnp_hg38_allele_disease_associations->snomedCT +umlsConceptUniqueIdentifier: C:dbsnp_hg38_allele_disease_associations->medGenID + +Node: E:dbsnp_hg38_allele_disease_associations->E4 +typeOf: dcs:DiseaseAlleleAssociation +dcid: C:dbsnp_hg38_allele_disease_associations->dcid_disease_allele_association +name: C:dbsnp_hg38_allele_disease_associations->name_disease_allele_association +alleleID: E:dbsnp_hg38_allele_disease_associations->E2 +alleleOrigin: C:dbsnp_hg38_allele_disease_associations->CLNORIGIN +clinicalSignificance: C:dbsnp_hg38_allele_disease_associations->CLNSIG +clinVarReviewStatus: C:dbsnp_hg38_allele_disease_associations->CLNREVSTAT +diseaseID: E:dbsnp_hg38_allele_disease_associations->E3 +geneID: C:dbsnp_hg38_allele_disease_associations->geneID +geneticVariantID: E:dbsnp_hg38_allele_disease_associations->E1 +geneticTestingRegistryID: C:dbsnp_hg38_allele_disease_associations->geneticTestingRegistryID +pharmGKBID: C:dbsnp_hg38_allele_disease_associations->pharmGKBID +referenceClinVarRecord: C:dbsnp_hg38_allele_disease_associations->CLNACC diff --git a/scripts/biomedical/NCBI_dbSNP/tMCFs/dbsnp_hg38_alleles.tmcf b/scripts/biomedical/NCBI_dbSNP/tMCFs/dbsnp_hg38_alleles.tmcf new file mode 100644 index 000000000..e55497a93 --- 
/dev/null +++ b/scripts/biomedical/NCBI_dbSNP/tMCFs/dbsnp_hg38_alleles.tmcf @@ -0,0 +1,11 @@ +Node: E:dbsnp_hg38_alleles->E1 +typeOf: dcs:GeneticVariant +dcid: C:dbsnp_hg38_alleles->dcid + +Node: E:dbsnp_hg38_alleles->E2 +typeOf: dcs:Allele +dcid: C:dbsnp_hg38_alleles->dcid_allele +name: C:dbsnp_hg38_alleles->name_allele +geneticVariantID: E:dbsnp_hg38_alleles->E1 +hgvsNomenclature: C:dbsnp_hg38_alleles->CLNHGVS +variant: C:dbsnp_hg38_alleles->variant \ No newline at end of file diff --git a/scripts/biomedical/NCBI_dbSNP/tMCFs/dbsnp_hg38_alllele_drug_response_associations.tmcf b/scripts/biomedical/NCBI_dbSNP/tMCFs/dbsnp_hg38_alllele_drug_response_associations.tmcf new file mode 100644 index 000000000..3751da8fd --- /dev/null +++ b/scripts/biomedical/NCBI_dbSNP/tMCFs/dbsnp_hg38_alllele_drug_response_associations.tmcf @@ -0,0 +1,46 @@ +Node: E:dbsnp_hg38_allele_drug_response_associations->E1 +typeOf: dcs:GeneticVariant +dcid: C:dbsnp_hg38_allele_drug_response_associations->dcid + +Node: E:dbsnp_hg38_allele_drug_response_associations->E2 +typeOf: dcs:Allele +dcid: C:dbsnp_hg38_allele_drug_response_associations->dcid_allele +arupLaboratoriesMolecularGeneticsAndGenomicsArupLaboratoriesID: C:dbsnp_hg38_allele_drug_response_associations->arupLaboratoriesMolecularGeneticsAndGenomicsArupLaboratoriesID +geneticTestingRegistryID: C:dbsnp_hg38_allele_drug_response_associations->geneticTestingRegistryID +geneticVariantID: E:dbsnp_hg38_allele_drug_response_associations->E1 +humanGeneMutationDatabaseID: C:dbsnp_hg38_allele_drug_response_associations->humanGeneMutationDatabaseID +omimID: C:dbsnp_hg38_allele_drug_response_associations->omimID +pharmGKBID: C:dbsnp_hg38_allele_drug_response_associations->pharmGKBID +uniProtID: C:dbsnp_hg38_allele_drug_response_associations->uniProtID + +Node: E:dbsnp_hg38_allele_drug_response_associations->E3 +typeOf: dcs:DrugResponse +dcid: C:dbsnp_hg38_allele_drug_response_associations->dcid_disease +name:
C:dbsnp_hg38_allele_drug_response_associations->name_disease +compoundID: C:dbsnp_hg38_allele_drug_response_associations->compound_dcid +experimentalFactorOntologyID: C:dbsnp_hg38_allele_drug_response_associations->experimentalFactorOntologyID +geneticReviewsID: C:dbsnp_hg38_allele_drug_response_associations->geneReviewsID +humanPhenotypeOntologyID: C:dbsnp_hg38_allele_drug_response_associations->humanPhenotypeOntologyID +medicalGeneticsSummariesID: C:dbsnp_hg38_allele_drug_response_associations->medicalGeneticsSummariesID +medicalSubjectHeadingID: C:dbsnp_hg38_allele_drug_response_associations->medicalSubjectHeadingID +officeOfRareDiseasesId: C:dbsnp_hg38_allele_drug_response_associations->officeOfRareDiseasesId +omimID: C:dbsnp_hg38_allele_drug_response_associations->omimID +orphaNumber: C:dbsnp_hg38_allele_drug_response_associations->orphaNumber +snomedCT: C:dbsnp_hg38_allele_drug_response_associations->snomedCT +umlsConceptUniqueIdentifier: C:dbsnp_hg38_allele_drug_response_associations->medGenID + +Node: E:dbsnp_hg38_allele_drug_response_associations->E4 +typeOf: dcs:DiseaseAlleleAssociation +dcid: C:dbsnp_hg38_allele_drug_response_associations->dcid_disease_allele_association +name: C:dbsnp_hg38_allele_drug_response_associations->name_disease_allele_association +alleleID: E:dbsnp_hg38_allele_drug_response_associations->E2 +alleleOrigin: C:dbsnp_hg38_allele_drug_response_associations->CLNORIGIN +clinicalSignificance: C:dbsnp_hg38_allele_drug_response_associations->CLNSIG +clinVarReviewStatus: C:dbsnp_hg38_allele_drug_response_associations->CLNREVSTAT +compoundID: C:dbsnp_hg38_allele_drug_response_associations->compound_dcid +diseaseID: E:dbsnp_hg38_allele_drug_response_associations->E3 +geneID: C:dbsnp_hg38_allele_drug_response_associations->geneID +geneticVariantID: E:dbsnp_hg38_allele_drug_response_associations->E1 +geneticTestingRegistryID: C:dbsnp_hg38_allele_drug_response_associations->geneticTestingRegistryID +pharmGKBID: 
#!/bin/bash
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Author: Pradeep Kumar Krishnaswamy
# Date: 23/08/2024
# Name: tests
# Description: Downloads the Data Commons Java import tool and runs its
# `genmcf` command on every tMCF + CSV pair of the NCBI dbSNP import,
# writing one report directory per pair under lint/.
#
# All paths are relative: run the script from the directory that contains
# tMCFs/ and output/. Requires wget and java on PATH.
#
# Exit status: 0 when the download and every genmcf run succeeded, 1 otherwise.

# Failures are checked explicitly instead of using `set -e`, so that the
# remaining tMCF + CSV pairs are still linted when one genmcf run fails.
set -uo pipefail

# Release tag and jar file name of the Data Commons import tool.
readonly TOOL_RELEASE="0.1-alpha.1k"
readonly TOOL_JAR="datacommons-import-tool-0.1-alpha.1-jar-with-dependencies.jar"
readonly TOOL_URL="https://github.com/datacommonsorg/import/releases/download/${TOOL_RELEASE}/${TOOL_JAR}"

# Print an error message on stderr and abort the script.
die() {
  printf 'tests.sh: %s\n' "$*" >&2
  exit 1
}

#######################################
# Download a fresh copy of the import tool jar into ./tmp.
# Globals:
#   TOOL_URL (read)
# Returns:
#   0 on success; aborts the script if the directory or download fails
#######################################
download_tool() {
  rm -rf tmp
  mkdir -p tmp || die "cannot create tmp directory"
  wget -P tmp "${TOOL_URL}" || die "download failed: ${TOOL_URL}"
}

#######################################
# Run `genmcf` for one tMCF file and its CSV file(s).
# Globals:
#   TOOL_JAR (read)
# Arguments:
#   $1   - tMCF file name inside tMCFs/
#   $2   - report directory name inside lint/
#   $3.. - CSV file(s) to pair with the tMCF
# Returns:
#   exit status of the java command
#######################################
run_genmcf() {
  local tmcf=$1
  local report_dir=$2
  shift 2
  java -jar "tmp/${TOOL_JAR}" genmcf "tMCFs/${tmcf}" "$@" -n 20 \
    -o "lint/${report_dir}"
}

main() {
  local failed=0

  download_tool
  mkdir -p lint || die "cannot create lint directory"

  run_genmcf GRCh37.tmcf GRCh37 \
    output/ncbi_GRCh37_genome_assembly_report.csv || failed=1
  run_genmcf GRCh38.tmcf GRCh38 \
    output/ncbi_GRCh38_genome_assembly_report.csv || failed=1
  run_genmcf MedGen.tmcf medgen \
    output/medgen.csv || failed=1
  run_genmcf clinvar_diesease_gene.tmcf clinvar_diesease_gene \
    output/clinvar_diesease_gene.csv || failed=1

  # The CSV globs are intentionally unquoted so that every shard is passed.
  run_genmcf hg19_positions.tmcf GCF25 \
    output/GCF25/*.csv || failed=1
  run_genmcf dbsnp_freq.tmcf freq \
    output/freq/*.csv || failed=1
  run_genmcf dbsnp_hg38.tmcf GCF40/hg38 \
    output/GCF40/hg38/*.csv || failed=1
  run_genmcf dbsnp_hg38_alleles.tmcf GCF40/hg38alleles \
    output/GCF40/hg38alleles/*.csv || failed=1
  run_genmcf dbsnp_hg38_allele_disease_associations.tmcf \
    GCF40/hg38alleledisease \
    output/GCF40/hg38alleledisease/*.csv || failed=1
  run_genmcf dbsnp_hg38_alllele_drug_response_associations.tmcf \
    GCF40/hg38alleledrug \
    output/GCF40/hg38alleledrug/*.csv || failed=1
  run_genmcf dbsnp_hg38_freq.tmcf GCF40/hg38freq \
    output/GCF40/hg38freq/*.csv || failed=1

  if (( failed )); then
    die "one or more genmcf runs exited with a non-zero status"
  fi
}

main "$@"