Skip to content

Commit

Permalink
added bayesdel download & update testdata & minor changes
Browse files Browse the repository at this point in the history
  • Loading branch information
MarvinDo committed Oct 25, 2023
1 parent 4b5abff commit f03137f
Show file tree
Hide file tree
Showing 10 changed files with 198 additions and 218 deletions.
35 changes: 33 additions & 2 deletions data/download_data.sh
Original file line number Diff line number Diff line change
Expand Up @@ -338,8 +338,10 @@ rm priors_hg19.vcf.gz.tbi
cd $dbs
mkdir -p SpliceAI
cd SpliceAI
wget https://download.imgag.de/ahsturm1/spliceai_scores_2022_02_09_GRCh38.vcf.gz
tabix -p vcf spliceai_scores_2022_02_09_GRCh38.vcf.gz
wget https://download.imgag.de/public/splicing/spliceai_scores_2023_10_24_GRCh38.vcf.gz -O spliceai_scores_2023_10_24_GRCh38.vcf.gz --no-check-certificate
tabix -C -m 9 -p vcf spliceai_scores_2023_10_24_GRCh38.vcf.gz
#wget https://download.imgag.de/ahsturm1/spliceai_scores_2022_02_09_GRCh38.vcf.gz
#tabix -p vcf spliceai_scores_2022_02_09_GRCh38.vcf.gz
'


Expand Down Expand Up @@ -434,6 +436,35 @@ wget https://www.omim.org/static/omim/data/mim2gene.txt

# download bayesDEL
#wget -O bayesdel.tgz https://drive.google.com/file/d/0Byvs2ppGNyXlN0xvSzA4LUgybzg/view?usp=share_link&resourcekey=0-ULPKwYu4hPGuMZ-eY1Z2Tw
#curl -L 'https://drive.google.com/file/d/0Byvs2ppGNyXlN0xvSzA4LUgybzg/view?usp=drive_link&resourcekey=0-ULPKwYu4hPGuMZ-eY1Z2Tw&confirm=t' > bayesdel.tgz

#https://drive.google.com/file/d/0Byvs2ppGNyXlN0xvSzA4LUgybzg/view?usp=drive_link&resourcekey=0-ULPKwYu4hPGuMZ-eY1Z2Tw

#https://drive.google.com/uc?id=0Byvs2ppGNyXlN0xvSzA4LUgybzg&export=download

cd $dbs
mkdir -p BayesDEL
cd BayesDEL

bayesdel_file=bayesdel_240817_noaf
curl -L 'https://drive.google.com/file/d/0Byvs2ppGNyXlN0xvSzA4LUgybzg/view?usp=drive_link&resourcekey=0-ULPKwYu4hPGuMZ-eY1Z2Tw&confirm=t' > $bayesdel_file.tgz

mkdir -p $bayesdel_file && tar -xvzf $bayesdel_file.tgz -C $bayesdel_file --strip-components 1
rm $bayesdel_file.tgz

rm $bayesdel_file/*.tbi

python3 $tools/db_converter_bayesdel.py -i $bayesdel_file -o $bayesdel_file.vcf

#$ngsbits/VcfCheck -lines 0 -in $bayesdel_file.vcf -ref $data/genomes/GRCh37.fa

$ngsbits/VcfSort -in $bayesdel_file.vcf -out $bayesdel_file.vcf
#$ngsbits/VcfLeftNormalize -in $bayesdel_file.vcf -stream -ref $data/genomes/GRCh37.fa -out $bayesdel_file.vcf.2
#$ngsbits/VcfStreamSort -in $bayesdel_file.vcf.2 -out $bayesdel_file.vcf
#awk -v OFS="\t" '!/##/ {$9=$10=""}1' $bayesdel_file.vcf |sed 's/^\s\+//g' > $bayesdel_file.vcf.2 # remove SAMPLE and FORMAT columns from vcf as they are added by vcfsort
#mv -f $bayesdel_file.vcf.2 $bayesdel_file.vcf
#bgzip $bayesdel_file.vcf
#$ngsbits/VcfCheck -in $bayesdel_file.vcf.gz -ref $data/genomes/GRCh37.fa



Expand Down
1 change: 1 addition & 0 deletions resources/backups/database_dumper/dump_database.sh
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ echo $DATE > most_recent_dump.txt
#mysqldump --quick -h SRV011.img.med.uni-tuebingen.de -P 3306 -u ahdoebm1 -p --column-statistics=0 --no-tablespaces --no-create-info -r /mnt/storage2/users/ahdoebm1/HerediVar/resources/backups/database_dumper/init_db/criterium_new.sql HerediVar_ahdoebm1 classification_criterium_strength classification_criterium
#mysqldump --quick -h SRV011.img.med.uni-tuebingen.de -P 3306 -u ahdoebm1 -p --column-statistics=0 --no-tablespaces --no-create-info -r /mnt/storage2/users/ahdoebm1/HerediVar/resources/backups/database_dumper/init_db/criterium_old.sql HerediVar_ahdoebm1_test classification_criterium_strength classification_criterium

#mysqldump --quick -h sql.img.med.uni-tuebingen.de -P 3306 -u ahdoebm1 -p --column-statistics=0 --no-tablespaces --no-create-info -r /mnt/storage2/users/ahdoebm1/HerediVar/resources/backups/database_dumper/temp.sql HerediVar_ahdoebm1 classification_scheme classification_criterium classification_criterium_strength mutually_exclusive_criteria

#mysql -h SRV011.img.med.uni-tuebingen.de -P 3306 -u ahdoebm1 -p HerediVar_ahdoebm1_test < /mnt/storage2/users/ahdoebm1/HerediVar/resources/backups/database_dumper/init_db/criterium_new.sql

Expand Down
4 changes: 2 additions & 2 deletions resources/classification_schemes/ClinGen_BRCA1_v1.0.0.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"classification_scheme_id": "13",
"name": "enigma-brca1_v4",
"classification_scheme_id": "",
"name": "enigma-brca1",
"display_name": "ClinGen ENIGMA BRCA1 v1.0.0",
"type": "acmg-enigma-brca1",
"reference": "https://cspec.genome.network/cspec/ui/svi/doc/GN092",
Expand Down
2 changes: 1 addition & 1 deletion resources/classification_schemes/ClinGen_BRCA2_v1.0.0.json
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
{
"classification_scheme_id": "14",
"classification_scheme_id": "",
"name": "enigma-brca2",
"display_name": "ClinGen ENIGMA BRCA2 v1.0.0",
"type": "acmg-enigma-brca2",
Expand Down
3 changes: 2 additions & 1 deletion src/annotation_service/annotation_jobs/spliceai_job.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,8 @@ def annotate_missing_spliceai(self, input_vcf_path, output_vcf_path):
#returncode, stderr, stdout = functions.execute_command(["ls", "-l", "/tmp"], "ls")
#print(stdout)
spliceai_code, spliceai_stderr, spliceai_stdout = self.annotate_spliceai_algorithm(input_vcf_path, output_vcf_path)
errors.append(spliceai_stderr)
if 'ERROR' in spliceai_stderr:
errors.append(spliceai_stderr)

# need to insert some code here to merge the newly annotated variants and previously
# annotated ones from the db if there are files which contain more than one variant!
Expand Down
40 changes: 25 additions & 15 deletions src/annotation_service/heredicare_interface.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,12 +65,22 @@ def get_bearer(self):
else:
resp = requests.post(url, auth=auth, data=data)
if resp.status_code != 200: # bearer is None
message = "ERROR: HerediCare API client credentials endpoint returned an HTTP " + str(resp.status_code) + " error: " + resp.text
message = "ERROR: HerediCare API client credentials endpoint returned an HTTP " + str(resp.status_code) + " error: " + self.extract_error_message(resp.text)
status = "error"
else:
bearer = resp.json()["access_token"]
return bearer, status, message

def extract_error_message(self, error_text):
result = error_text
cropped_text = functions.find_between(error_text, prefix="<div id=\"reasons\"", postfix="(</div>|$)")
if cropped_text is not None:
result = cropped_text
else:
cropped_text = functions.find_between(error_text, prefix="<title>", postfix="</title>")
if cropped_text is not None:
result = cropped_text
return result


def update_token(self, timestamp):
Expand Down Expand Up @@ -111,7 +121,7 @@ def get_vid_list(self):
message = "ERROR: HerediCare API get vid list endpoint returned an HTTP 401, unauthorized error. Attempting retry."
status = "retry"
elif resp.status_code != 200: # any other kind of error
message = "ERROR: HerediCare API get vid list endpoint returned an HTTP " + str(resp.status_code) + " error: " + resp.text
message = "ERROR: HerediCare API get vid list endpoint returned an HTTP " + str(resp.status_code) + " error: " + + self.extract_error_message(resp.text)
status = "error"
else: # request was successful
vids = resp.json()['items']
Expand Down Expand Up @@ -159,7 +169,7 @@ def get_variant(self, vid):
message = "ERROR: HerediCare API get vid list endpoint returned an HTTP 401, unauthorized error. Attempting retry."
status = "retry"
elif resp.status_code != 200:
message = "ERROR: HerediCare API get variant details endpoint returned an HTTP " + str(resp.status_code) + " error: " + resp.text
message = "ERROR: HerediCare API get variant details endpoint returned an HTTP " + str(resp.status_code) + " error: " + self.extract_error_message(resp.text)
status = "error"
else: # success
raw_variant = resp.json()
Expand All @@ -186,18 +196,18 @@ def convert_heredicare_variant_raw(self, raw_variant):



if __name__ == "__main__":
functions.read_dotenv()

vids, status, message = heredicare_interface.get_vid_list()

for vid_raw in vids:
vid = vid_raw['record_id']
variant, status, message = heredicare_interface.get_variant(vid)

if variant["PATH_TF"] != "-1":
print(variant)
break
#if __name__ == "__main__":
# functions.read_dotenv()
#
# vids, status, message = heredicare_interface.get_vid_list()
#
# for vid_raw in vids:
# vid = vid_raw['record_id']
# variant, status, message = heredicare_interface.get_variant(vid)
#
# if variant["PATH_TF"] != "-1":
# print(variant)
# break



Expand Down
6 changes: 5 additions & 1 deletion src/common/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -646,4 +646,8 @@ def sort_classes(a, b):
return 1
elif a_importance < b_importance:
return -1
return 0
return 0

def reverse_seq(seq):
seq = seq.upper().replace('A', 't').replace('T', 'a').replace('G', 'c').replace('C', 'c')
return seq.upper()
11 changes: 6 additions & 5 deletions src/frontend_celery/start_keycloak.sh
Original file line number Diff line number Diff line change
Expand Up @@ -59,8 +59,12 @@ keycloak_path=$tools/keycloak

if [ "${WEBAPP_ENV}" == "dev" ]
then
export NO_PROXY=srv020.img.med.uni-tuebingen.de
$keycloak_path/bin/kc.sh start-dev --hostname srv020.img.med.uni-tuebingen.de --http-port 5050 #--features=admin-fine-grained-authz # --log-level debug
set -o allexport
extension=env_
source $root/common/.$extension$WEBAPP_ENV
set +o allexport
export NO_PROXY=$KEYCLOAK_HOST
$keycloak_path/bin/kc.sh start-dev --hostname $KEYCLOAK_HOST --http-port $KEYCLOAK_PORT #--features=admin-fine-grained-authz # --log-level debug
fi

if [ "${WEBAPP_ENV}" == "prod" ]
Expand All @@ -69,6 +73,3 @@ then
fi





213 changes: 22 additions & 191 deletions src/frontend_celery/tests/data/heredivar_test_data.sql

Large diffs are not rendered by default.

101 changes: 101 additions & 0 deletions src/tools/db_converter_bayesdel.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
import sys
from os import path
sys.path.append(path.dirname(path.dirname(path.abspath(__file__))))
import argparse
from gc import collect
from ntpath import join
import common.functions as functions
from os import listdir
from os.path import isfile, join, abspath
import gzip
import pandas as pd

parser = argparse.ArgumentParser(description="")
parser.add_argument("-i", "--input", default="", help="path to input folder")
parser.add_argument("-o", "--output", default="", help="output file path. If not given will default to stdout")
#parser.add_argument("-t", "--transcripts", default="", help="The path to the ensembl gff3 file (eg. http://ftp.ensembl.org/pub/current_gff3/homo_sapiens/Homo_sapiens.GRCh38.110.gff3.gz)")

args = parser.parse_args()

if args.output != "":
sys.stdout = open(args.output, 'w')

if args.input != "":
input_path = args.input
else:
input_path = sys.stdin

#transcript_path = args.transcripts

info_headers = ["##INFO=<ID=BayesDEL_noAF,Number=1,Type=Float,Description=\"Missense variant functional predictions by BayesDel tool (Feng 2017) used without allele frequency. Score bigger or equal to 0.16: damaging; Score smaller than 0.16: tolerated.\">"]
functions.write_vcf_header(info_headers)


#def read_transcripts(path):
# transcripts = []
# with open(path, 'r') as file:
# for line in file:
# line = line.strip()
# if line.startswith('#') or line == '':
# continue
#
# parts = line.split('\t')
# biotype = parts[2]
# chrom = parts[0]
# start = int(parts[3])
# end = int(parts[4])
# orientation = parts[6]
# info = parts[8]
#
# if biotype in ['gene', 'ncRNA_gene', 'pseudogene']:
# transcript_id = functions.find_between(info, prefix="ID=gene:", postfix="(;|$)")
# if transcript_id is None:
# continue
#
# transcripts.append([transcript_id, orientation, chrom, start, end])
#
# result = pd.DataFrame(transcripts, columns=['transcript', 'orientation', 'chrom', 'start', 'end'])
# return result


#transcripts = read_transcripts(transcript_path)
#functions.eprint(transcripts)

bayesdel_files = [abspath(join(input_path, f)) for f in listdir(input_path) if isfile(join(input_path, f))]


functions.eprint(len(bayesdel_files))

for bayesdel_file in bayesdel_files:
functions.eprint(bayesdel_file)
file=gzip.open(bayesdel_file, 'rb')
for line in file:
line = line.decode('utf-8')
line = line.strip()

if line.startswith('#') or line == '':
continue
##CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO
parts = line.split('\t')
chrom = parts[0]
pos = parts[1]
ref = parts[2]
alt = parts[3]
#possible_transcripts = transcripts[(transcripts['chrom'] == chrom) & (transcripts['start'] <= int(pos)) & (transcripts['end'] >= int(pos))]
#if len(possible_transcripts) == 1 :
# orientation = possible_transcripts.iloc[0]['orientation']
# if orientation == '-':
# ref = functions.reverse_seq(ref)
# alt = functions.reverse_seq(alt)
# #functions.eprint("reversed: " + str(pos))

vcf_parts = [chrom, pos, '.', ref, alt, '.', '.', "BayesDEL_noAF=" + str(parts[4])]
vcf_line = '\t'.join(vcf_parts)
print(vcf_line)


file.close()




0 comments on commit f03137f

Please sign in to comment.