Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

NCBI dbSNP #1116

Draft
wants to merge 1 commit into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
54 changes: 54 additions & 0 deletions scripts/biomedical/NCBI_dbSNP/background_process.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
#!/bin/bash
# Sleep as long as N jobs of given name are running in background

if [ $# -lt 1 ]; then
echo "Usage: $0 <python script path> <input file path>"
exit 1
fi

PYTHON_PATH=$1
INPUT_PATH=$2

echo " $PYTHON_PATH - $INPUT_PATH"

# Count the number of background jobs
function num_jobs {
local name="$PYTHON_PATH"; shift;
if [[ "$name" == "" ]]; then
echo $(jobs -r | wc -l)
else
echo $(ps -ef | egrep "$name" | wc -l)
fi
}

# Get the number of cores
function num_cores {
[[ -z "$cores" ]] && cores=$(cat /proc/cpuinfo | grep "processor" | wc -l)
cores=${cores:-"10"}
echo $cores
}

# Sleep while there are atleast N background jobs
function sleep_while_active {
max_jobs=6
job_name="$PYTHON_PATH"
max_jobs=${max_jobs:-$(num_cores)}
j=$(num_jobs $job_name);
echo "No of jobs $j and job name $job_name"
while (( $j > ${max_jobs:-0} )); do
sleep 1;
j=$(num_jobs $job_name);
done;
}

# Run processes in background
csv_files=$(ls $INPUT_PATH*_shard_*.vcf | xargs -n 1 basename)
#echo "files $csv_files"
for file in $csv_files; do
# Run the process per file in background
# python $2--input_file=$file & sleep_while_active
echo "$PYTHON_PATH --input_file=$file"
python3 $PYTHON_PATH --input_file=$file & sleep_while_active
done
wait

47 changes: 47 additions & 0 deletions scripts/biomedical/NCBI_dbSNP/download.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
#!/bin/bash

mkdir -p input; cd input

# download genome assemblies files
curl -L -O https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/001/405/GCA_000001405.14_GRCh37.p13/GCA_000001405.14_GRCh37.p13_assembly_report.txt

curl -L -O https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/001/405/GCA_000001405.29_GRCh38.p14/GCA_000001405.29_GRCh38.p14_assembly_report.txt

curl -L -O https://ftp.ncbi.nlm.nih.gov/pub/medgen/NAMES.RRF.gz
gunzip NAMES.RRF.gz
mv NAMES.RRF NAMES.txt

curl -L -O https://ftp.ncbi.nlm.nih.gov/pub/medgen/MGDEF.RRF.gz
gunzip MGDEF.RRF.gz
mv MGDEF.RRF MGDEF.txt

curl -L -O https://ftp.ncbi.nlm.nih.gov/pub/medgen/MGSTY.RRF.gz
gunzip MGSTY.RRF.gz
mv MGSTY.RRF MGSTY.txt

curl -L -O https://ftp.ncbi.nlm.nih.gov/pub/medgen/MedGenIDMappings.txt.gz
gunzip MedGenIDMappings.txt.gz

curl -L -O https://ftp.ncbi.nlm.nih.gov/pub/clinvar/gene_condition_source_id
gunzip gene_condition_source_id
mv gene_condition_source_id gene_condition_source_id.txt

mkdir -p GCF25; cd GCF25

curl -L -O https://ftp.ncbi.nlm.nih.gov/snp/latest_release/VCF/GCF_000001405.25.gz
gunzip GCF_000001405.25.gz
mv GCF_000001405.25 GCF_000001405.25.vcf

cd ..
mkdir -p GCF40; cd GCF40

curl -L -O https://ftp.ncbi.nlm.nih.gov/snp/latest_release/VCF/GCF_000001405.40.gz
gunzip GCF_000001405.40.gz
mv GCF_000001405.40 GCF_000001405.40.vcf

cd ..
mkdir -p freq; cd freq

curl -L -O https://ftp.ncbi.nlm.nih.gov/snp/population_frequency/latest_release/freq.vcf.gz
gunzip freq.vcf.gz
cd ..
43 changes: 43 additions & 0 deletions scripts/biomedical/NCBI_dbSNP/run.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

#!/bin/bash

# make all required directories
mkdir -p output
mkdir -p output/GCF25
mkdir -p output/GCF40
mkdir -p output/GCF40/hg38
mkdir -p output/GCF40/hg38alleledisease
mkdir -p output/GCF40/hg38alleledrug
mkdir -p output/GCF40/hg38alleles
mkdir -p output/GCF40/hg38freq
mkdir -p output/freq

echo "File split started"
sh split_files.sh
echo "Splitting Completed"

# Command
echo "Running python script"

python3 scripts/process_medgen.py
python3 scripts/process_genome_assembly_report.py
python3 scripts/process_gene_condition_source.py

echo "Start background process"
sh background_process.sh scripts/process_dbsnp_hg19_positions.py input/GCF25/
sh background_process.sh scripts/process_dbsnp_hg38.py input/GCF40/
sh background_process.sh scripts/process_dbsnp_freq.py input/freq/
echo "Background process completed"
249 changes: 249 additions & 0 deletions scripts/biomedical/NCBI_dbSNP/scripts/process_dbsnp_freq.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,249 @@
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Author: Pradeep Kumar Krishnaswamy
Date: 13-Oct-2024
Name: process_dbsnp_freq
Description: cleaning the NCBI dbSNP freq input file.
@source data: Download NCBI dbSNP data from FTP location. Refer to download.sh for details
"""

import csv
import os
import sys
import time
from copy import deepcopy
from absl import flags
from absl import logging
from datetime import datetime as dt

MODULE_DIR = os.path.dirname(os.path.dirname(__file__))

# Setup path for import from data/util
# or set `export PYTHONPATH="./:<repo>/data/util"` in bash
_SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
sys.path.append(_SCRIPT_DIR)
_DATA_DIR = _SCRIPT_DIR.split('/data/')[0]
sys.path.append(os.path.join(_DATA_DIR, 'data/util'))

import file_util
from counters import Counters

# for local testing purpose only
# from Utils.counters import Counters
# import Utils.file_util as file_util

_FLAGS = flags.FLAGS

flags.DEFINE_string('input_file', 'freq_shard_aa.vcf',
'Input file to process. Mandatory to pass this argument')
flags.DEFINE_string('output_dir', 'output/freq', 'Output directory for generated files.')
flags.DEFINE_string('input_dir', 'input/freq', 'Input directory where .vcf files downloaded.')

_FLAGS(sys.argv)

CSV_DICT = {
'dcid_gv': '',
'dcid': '',
'name': '',
'alleleFrequency': '',
'alternativeAllele': '',
'genotypeHeterozygousFrequency': '',
'genotypeHomozygousAlternativeFrequency': '',
'genotypeHomozygousReferenceFrequency': '',
'hardyWeinbergEquationPValue': '',
'isGlobalPopulation': '',
'measuredPopulation': '',
'referenceAllele': '',
'rsID': '',
'sampleSize': ''
}

COLUMN_CODES_DICT = {
9: {
'name': 'European',
'isGlobal': False
},
10: {
'name': 'African Others',
'isGlobal': False
},
11: {
'name': 'East Asian',
'isGlobal': False
},
12: {
'name': 'African American',
'isGlobal': False
},
13: {
'name': 'Latin American 1',
'isGlobal': False
},
14: {
'name': 'Latin American 2',
'isGlobal': False
},
15: {
'name': 'Other Asian',
'isGlobal': False
},
16: {
'name': 'South Asian',
'isGlobal': False
},
17: {
'name': 'Other',
'isGlobal': False
},
18: {
'name': 'African',
'isGlobal': False
},
19: {
'name': 'Asian',
'isGlobal': False
},
20: {
'name': 'Total',
'isGlobal': True
}
}


def process_input_csv(input_file: str, output_freq_file_path: str) -> None:
""" Row by row processing of NCBI dbSNP freq input file
Args:
input_file (str): file path to process
output_freq_file_path (str): output file path to save cleaned csv
"""
with open(output_freq_file_path, 'w') as output_hg38_freq:
writer_hg38_freq = csv.DictWriter(output_hg38_freq, CSV_DICT, extrasaction='ignore')
writer_hg38_freq.writeheader()
counters = Counters()
counters.add_counter('total', file_util.file_estimate_num_rows(input_file))

with open(input_file, 'r') as input_file_csv:
for line in input_file_csv:
# skip row
if line[0] == '#':
continue
# process this row
else:
input_row = line.replace('\n', '').split('\t')

dciv_gv = f'bio/{input_row[2]}'
rsID = input_row[2]
#print(rsID, end='\r')
if rsID == '.':
continue
refAllele = input_row[3]
altAllele = input_row[4]

for i in range(9, 21):
try:
row = deepcopy(CSV_DICT)
row['dcid_gv'] = dciv_gv
#print(dciv_gv, end='\r')
row['rsID'] = rsID
row['referenceAllele'] = refAllele
row['alternativeAllele'] = altAllele
row['dcid'] = f"bio/{rsID}_{COLUMN_CODES_DICT[i]['name'].replace(' ', '_')}"
row['name'] = f'"{rsID} {COLUMN_CODES_DICT[i]["name"]} Population Frequency"'
row['isGlobalPopulation'] = COLUMN_CODES_DICT[i]['isGlobal']
row['measuredPopulation'] = COLUMN_CODES_DICT[i]['name']
row['genotypeHeterozygousFrequency'] = "0.00000"
row['genotypeHomozygousAlternativeFrequency'] = "0.00000"
row['genotypeHomozygousReferenceFrequency'] = "0.00000"
if input_row[i].count(':') != 5:
# Skip this record as it cannot be unpacked to desired format
continue
# column format “AN:AC:HWEP:GR:GV:GA”.
row = parse_freq_row(input_row[i], refAllele, altAllele, row)
writer_hg38_freq.writerow(row)

except Exception as e:
print(input_row[i], e, rsID)
counters.add_counter('processed', 1)


def parse_freq_row(freq_value, refAllele, altAllele, row):
""" parser for freq input value for the given row dict

Args:
freq_value (_type_): freq string e.g. format “AN:AC:HWEP:GR:GV:GA”.
refAllele (_type_): referenceAllele
altAllele (_type_): alternativeAllele
row (_type_): input row dict

Returns:
_type_: row dict
"""
AN, AC, HWEP, GR, GV, GA = [
int(x) if x.lstrip("-").isdigit() else x for x in freq_value.split(':')
]
if isinstance(AC, str):
ref_val = 0.0
alt_alleles = AC.split(',')
if AN != 0:
ref_val = eval(f"({AN}-{'-'.join(alt_alleles)})/{AN}")

alt_lst = altAllele.split(',')
alt_val_lst = []

for idx, alt_allele in enumerate(alt_alleles):
alt_val = "0.00000"
if AN != 0:
alt_val = f"{(int(alt_allele)/AN):.5f}"
alt_val_lst.append(f"{alt_lst[idx]}:{alt_val}")
row['alleleFrequency'] = f"{refAllele}:{ref_val:.5f}, {', '.join(alt_val_lst)}"

else:
if AN == 0:
row['alleleFrequency'] = f"{refAllele}:0.00000, {altAllele}:0.00000"

else:
row['alleleFrequency'] = f"{refAllele}:{((AN-AC)/AN):.5f}, {altAllele}:{(AC/AN):.5f}"

if AN != 0:
row['genotypeHeterozygousFrequency'] = f"{(GV / (AN / 2)):.5f}"
row['genotypeHomozygousAlternativeFrequency'] = f"{(GA / (AN / 2)):.5f}"
row['genotypeHomozygousReferenceFrequency'] = f"{(GR / (AN / 2)):.5f}"

row['hardyWeinbergEquationPValue'] = HWEP
row['sampleSize'] = AN
return row


def main(input_file: str) -> None:
""" Main method

Args:
input_file (str): file path to process
"""
start_time = time.time()
logging.set_verbosity('info')
logging.info(f"Freq processing input file {input_file} - {dt.now()}")
input_file_path = os.path.join(MODULE_DIR + '/' + _FLAGS.input_dir, input_file)

output_file = input_file.split('.')[0] + '.csv'
output_csv = os.path.join(MODULE_DIR + '/' + _FLAGS.output_dir, output_file)
logging.info(f"output_csv {output_csv}")

process_input_csv(input_file_path, output_csv)
logging.info(f"Time taken to process {((time.time() - start_time)/60):.2f} - {dt.now()}")


if __name__ == '__main__':
main(_FLAGS.input_file)
Loading
Loading