added bayesdel download & update testdata & minor changes

GC-HBOC · Oct 25, 2023 · f03137f · f03137f
1 parent 4b5abff
commit f03137f
Show file tree

Hide file tree

Showing 10 changed files with 198 additions and 218 deletions.
diff --git a/data/download_data.sh b/data/download_data.sh
@@ -338,8 +338,10 @@ rm priors_hg19.vcf.gz.tbi
 cd $dbs
 mkdir -p SpliceAI
 cd SpliceAI
-wget https://download.imgag.de/ahsturm1/spliceai_scores_2022_02_09_GRCh38.vcf.gz
-tabix -p vcf spliceai_scores_2022_02_09_GRCh38.vcf.gz
+wget https://download.imgag.de/public/splicing/spliceai_scores_2023_10_24_GRCh38.vcf.gz -O spliceai_scores_2023_10_24_GRCh38.vcf.gz --no-check-certificate
+tabix -C -m 9 -p vcf spliceai_scores_2023_10_24_GRCh38.vcf.gz
+#wget https://download.imgag.de/ahsturm1/spliceai_scores_2022_02_09_GRCh38.vcf.gz
+#tabix -p vcf spliceai_scores_2022_02_09_GRCh38.vcf.gz
 '
 
 
@@ -434,6 +436,35 @@ wget https://www.omim.org/static/omim/data/mim2gene.txt
 
 # download bayesDEL
 #wget -O bayesdel.tgz https://drive.google.com/file/d/0Byvs2ppGNyXlN0xvSzA4LUgybzg/view?usp=share_link&resourcekey=0-ULPKwYu4hPGuMZ-eY1Z2Tw
+#curl -L 'https://drive.google.com/file/d/0Byvs2ppGNyXlN0xvSzA4LUgybzg/view?usp=drive_link&resourcekey=0-ULPKwYu4hPGuMZ-eY1Z2Tw&confirm=t' > bayesdel.tgz
+
+#https://drive.google.com/file/d/0Byvs2ppGNyXlN0xvSzA4LUgybzg/view?usp=drive_link&resourcekey=0-ULPKwYu4hPGuMZ-eY1Z2Tw
+
+#https://drive.google.com/uc?id=0Byvs2ppGNyXlN0xvSzA4LUgybzg&export=download
+
+cd $dbs
+mkdir -p BayesDEL
+cd BayesDEL
+
+bayesdel_file=bayesdel_240817_noaf
+curl -L 'https://drive.google.com/file/d/0Byvs2ppGNyXlN0xvSzA4LUgybzg/view?usp=drive_link&resourcekey=0-ULPKwYu4hPGuMZ-eY1Z2Tw&confirm=t' > $bayesdel_file.tgz
+
+mkdir -p $bayesdel_file && tar -xvzf $bayesdel_file.tgz -C $bayesdel_file --strip-components 1
+rm $bayesdel_file.tgz
+
+rm $bayesdel_file/*.tbi
+
+python3 $tools/db_converter_bayesdel.py -i $bayesdel_file -o $bayesdel_file.vcf
+
+#$ngsbits/VcfCheck -lines 0 -in $bayesdel_file.vcf -ref $data/genomes/GRCh37.fa
+
+$ngsbits/VcfSort -in $bayesdel_file.vcf -out $bayesdel_file.vcf
+#$ngsbits/VcfLeftNormalize -in $bayesdel_file.vcf -stream -ref $data/genomes/GRCh37.fa -out $bayesdel_file.vcf.2
+#$ngsbits/VcfStreamSort -in $bayesdel_file.vcf.2 -out $bayesdel_file.vcf
+#awk -v OFS="\t" '!/##/ {$9=$10=""}1' $bayesdel_file.vcf |sed 's/^\s\+//g' > $bayesdel_file.vcf.2 # remove SAMPLE and FORMAT columns from vcf as they are added by vcfsort
+#mv -f $bayesdel_file.vcf.2 $bayesdel_file.vcf
+#bgzip $bayesdel_file.vcf
+#$ngsbits/VcfCheck -in $bayesdel_file.vcf.gz -ref $data/genomes/GRCh37.fa
 
 
 

diff --git a/resources/backups/database_dumper/dump_database.sh b/resources/backups/database_dumper/dump_database.sh
@@ -35,6 +35,7 @@ echo $DATE > most_recent_dump.txt
 #mysqldump --quick -h SRV011.img.med.uni-tuebingen.de -P 3306 -u ahdoebm1 -p --column-statistics=0 --no-tablespaces --no-create-info -r /mnt/storage2/users/ahdoebm1/HerediVar/resources/backups/database_dumper/init_db/criterium_new.sql HerediVar_ahdoebm1 classification_criterium_strength classification_criterium
 #mysqldump --quick -h SRV011.img.med.uni-tuebingen.de -P 3306 -u ahdoebm1 -p --column-statistics=0 --no-tablespaces --no-create-info -r /mnt/storage2/users/ahdoebm1/HerediVar/resources/backups/database_dumper/init_db/criterium_old.sql HerediVar_ahdoebm1_test classification_criterium_strength classification_criterium
 
+#mysqldump --quick -h sql.img.med.uni-tuebingen.de -P 3306 -u ahdoebm1 -p --column-statistics=0 --no-tablespaces --no-create-info -r /mnt/storage2/users/ahdoebm1/HerediVar/resources/backups/database_dumper/temp.sql HerediVar_ahdoebm1 classification_scheme classification_criterium classification_criterium_strength mutually_exclusive_criteria
 
 #mysql -h SRV011.img.med.uni-tuebingen.de -P 3306 -u ahdoebm1 -p HerediVar_ahdoebm1_test < /mnt/storage2/users/ahdoebm1/HerediVar/resources/backups/database_dumper/init_db/criterium_new.sql
 

diff --git a/resources/classification_schemes/ClinGen_BRCA1_v1.0.0.json b/resources/classification_schemes/ClinGen_BRCA1_v1.0.0.json
@@ -1,6 +1,6 @@
 {
-    "classification_scheme_id": "13",
-    "name": "enigma-brca1_v4",
+    "classification_scheme_id": "",
+    "name": "enigma-brca1",
     "display_name": "ClinGen ENIGMA BRCA1 v1.0.0",
     "type": "acmg-enigma-brca1",
     "reference": "https://cspec.genome.network/cspec/ui/svi/doc/GN092",

diff --git a/resources/classification_schemes/ClinGen_BRCA2_v1.0.0.json b/resources/classification_schemes/ClinGen_BRCA2_v1.0.0.json
@@ -1,5 +1,5 @@
 {
-    "classification_scheme_id": "14",
+    "classification_scheme_id": "",
     "name": "enigma-brca2",
     "display_name": "ClinGen ENIGMA BRCA2 v1.0.0",
     "type": "acmg-enigma-brca2",

diff --git a/src/annotation_service/annotation_jobs/spliceai_job.py b/src/annotation_service/annotation_jobs/spliceai_job.py
@@ -75,7 +75,8 @@ def annotate_missing_spliceai(self, input_vcf_path, output_vcf_path):
             #returncode, stderr, stdout = functions.execute_command(["ls", "-l", "/tmp"], "ls")
             #print(stdout)
             spliceai_code, spliceai_stderr, spliceai_stdout = self.annotate_spliceai_algorithm(input_vcf_path, output_vcf_path)
-            errors.append(spliceai_stderr)
+            if 'ERROR' in spliceai_stderr:
+                errors.append(spliceai_stderr)
 
         # need to insert some code here to merge the newly annotated variants and previously 
         # annotated ones from the db if there are files which contain more than one variant! 

diff --git a/src/annotation_service/heredicare_interface.py b/src/annotation_service/heredicare_interface.py
@@ -65,12 +65,22 @@ def get_bearer(self):
         else:
             resp = requests.post(url, auth=auth, data=data)
             if resp.status_code != 200: # bearer is None
-                message = "ERROR: HerediCare API client credentials endpoint returned an HTTP " + str(resp.status_code) + " error: " + resp.text
+                message = "ERROR: HerediCare API client credentials endpoint returned an HTTP " + str(resp.status_code) + " error: " + self.extract_error_message(resp.text)
                 status = "error"
             else:
                 bearer = resp.json()["access_token"]
         return bearer, status, message
 
+    def extract_error_message(self, error_text):
+        result = error_text
+        cropped_text = functions.find_between(error_text, prefix="<div id=\"reasons\"", postfix="(</div>|$)")
+        if cropped_text is not None:
+            result = cropped_text
+        else:
+            cropped_text = functions.find_between(error_text, prefix="<title>", postfix="</title>")
+            if cropped_text is not None:
+                result = cropped_text
+        return result
 
 
     def update_token(self, timestamp):
@@ -111,7 +121,7 @@ def get_vid_list(self):
             message = "ERROR: HerediCare API get vid list endpoint returned an HTTP 401, unauthorized error. Attempting retry."
             status = "retry"
         elif resp.status_code != 200: # any other kind of error
-            message = "ERROR: HerediCare API get vid list endpoint returned an HTTP " + str(resp.status_code) + " error: " + resp.text
+            message = "ERROR: HerediCare API get vid list endpoint returned an HTTP " + str(resp.status_code) + " error: " + + self.extract_error_message(resp.text)
             status = "error"
         else: # request was successful
             vids = resp.json()['items']
@@ -159,7 +169,7 @@ def get_variant(self, vid):
             message = "ERROR: HerediCare API get vid list endpoint returned an HTTP 401, unauthorized error. Attempting retry."
             status = "retry"
         elif resp.status_code != 200:
-            message = "ERROR: HerediCare API get variant details endpoint returned an HTTP " + str(resp.status_code) + " error: " + resp.text
+            message = "ERROR: HerediCare API get variant details endpoint returned an HTTP " + str(resp.status_code) + " error: " + self.extract_error_message(resp.text)
             status = "error"
         else: # success
             raw_variant = resp.json()
@@ -186,18 +196,18 @@ def convert_heredicare_variant_raw(self, raw_variant):
 
 
 
-if __name__ == "__main__":
-    functions.read_dotenv()
-
-    vids, status, message = heredicare_interface.get_vid_list()
-
-    for vid_raw in vids:
-        vid = vid_raw['record_id']
-        variant, status, message = heredicare_interface.get_variant(vid)
-
-        if variant["PATH_TF"] != "-1":
-            print(variant)
-            break
+#if __name__ == "__main__":
+#    functions.read_dotenv()
+#
+#    vids, status, message = heredicare_interface.get_vid_list()
+#
+#    for vid_raw in vids:
+#        vid = vid_raw['record_id']
+#        variant, status, message = heredicare_interface.get_variant(vid)
+#
+#        if variant["PATH_TF"] != "-1":
+#            print(variant)
+#            break
 
 
 

diff --git a/src/common/functions.py b/src/common/functions.py
@@ -646,4 +646,8 @@ def sort_classes(a, b):
         return 1
     elif a_importance < b_importance:
         return -1
-    return 0
+    return 0
+
+def reverse_seq(seq):
+    seq = seq.upper().replace('A', 't').replace('T', 'a').replace('G', 'c').replace('C', 'c')
+    return seq.upper()
diff --git a/src/frontend_celery/start_keycloak.sh b/src/frontend_celery/start_keycloak.sh
@@ -59,8 +59,12 @@ keycloak_path=$tools/keycloak
 
 if [ "${WEBAPP_ENV}" == "dev" ]
 then
-    export NO_PROXY=srv020.img.med.uni-tuebingen.de
-    $keycloak_path/bin/kc.sh start-dev --hostname srv020.img.med.uni-tuebingen.de --http-port 5050 #--features=admin-fine-grained-authz # --log-level debug
+    set -o allexport
+    extension=env_
+    source $root/common/.$extension$WEBAPP_ENV
+    set +o allexport
+    export NO_PROXY=$KEYCLOAK_HOST
+    $keycloak_path/bin/kc.sh start-dev --hostname $KEYCLOAK_HOST --http-port $KEYCLOAK_PORT #--features=admin-fine-grained-authz # --log-level debug
 fi
 
 if [ "${WEBAPP_ENV}" == "prod" ]
@@ -69,6 +73,3 @@ then
 fi
 
 
-
-
-
diff --git a/src/frontend_celery/tests/data/heredivar_test_data.sql b/src/frontend_celery/tests/data/heredivar_test_data.sql
diff --git a/src/tools/db_converter_bayesdel.py b/src/tools/db_converter_bayesdel.py
@@ -0,0 +1,101 @@
+import sys
+from os import path
+sys.path.append(path.dirname(path.dirname(path.abspath(__file__))))
+import argparse
+from gc import collect
+from ntpath import join
+import common.functions as functions
+from os import listdir
+from os.path import isfile, join, abspath
+import gzip
+import pandas as pd
+
+parser = argparse.ArgumentParser(description="")
+parser.add_argument("-i", "--input",  default="", help="path to input folder")
+parser.add_argument("-o", "--output", default="", help="output file path. If not given will default to stdout")
+#parser.add_argument("-t", "--transcripts", default="", help="The path to the ensembl gff3 file (eg. http://ftp.ensembl.org/pub/current_gff3/homo_sapiens/Homo_sapiens.GRCh38.110.gff3.gz)")
+
+args = parser.parse_args()
+
+if args.output != "":
+    sys.stdout = open(args.output, 'w')
+
+if args.input != "":
+    input_path = args.input
+else:
+    input_path = sys.stdin
+
+#transcript_path = args.transcripts
+
+info_headers = ["##INFO=<ID=BayesDEL_noAF,Number=1,Type=Float,Description=\"Missense variant functional predictions by BayesDel tool (Feng 2017) used without allele frequency. Score bigger or equal to 0.16: damaging; Score smaller than 0.16: tolerated.\">"]
+functions.write_vcf_header(info_headers)
+
+
+#def read_transcripts(path):
+#    transcripts = []
+#    with open(path, 'r') as file:
+#        for line in file:
+#            line = line.strip() 
+#            if line.startswith('#') or line == '':
+#                continue
+#            
+#            parts = line.split('\t')
+#            biotype = parts[2]
+#            chrom = parts[0]
+#            start = int(parts[3])
+#            end = int(parts[4])
+#            orientation = parts[6]
+#            info = parts[8]
+#            
+#            if biotype in ['gene', 'ncRNA_gene', 'pseudogene']:
+#                transcript_id = functions.find_between(info, prefix="ID=gene:", postfix="(;|$)")
+#                if transcript_id is None:
+#                    continue
+#
+#                transcripts.append([transcript_id, orientation, chrom, start, end])
+#    
+#    result = pd.DataFrame(transcripts, columns=['transcript', 'orientation', 'chrom', 'start', 'end'])
+#    return result
+
+
+#transcripts = read_transcripts(transcript_path)
+#functions.eprint(transcripts)
+
+bayesdel_files = [abspath(join(input_path, f)) for f in listdir(input_path) if isfile(join(input_path, f))]
+
+
+functions.eprint(len(bayesdel_files))
+
+for bayesdel_file in bayesdel_files:
+    functions.eprint(bayesdel_file)
+    file=gzip.open(bayesdel_file, 'rb')
+    for line in file:
+        line = line.decode('utf-8')
+        line = line.strip()
+
+        if line.startswith('#') or line == '':
+            continue
+        ##CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO
+        parts = line.split('\t')
+        chrom = parts[0]
+        pos = parts[1]
+        ref = parts[2]
+        alt = parts[3]
+        #possible_transcripts = transcripts[(transcripts['chrom'] == chrom) & (transcripts['start'] <= int(pos)) & (transcripts['end'] >= int(pos))]
+        #if len(possible_transcripts) == 1 :
+        #    orientation = possible_transcripts.iloc[0]['orientation']
+        #    if orientation == '-':
+        #        ref = functions.reverse_seq(ref)
+        #        alt = functions.reverse_seq(alt)
+        #        #functions.eprint("reversed: " + str(pos))
+
+        vcf_parts = [chrom, pos, '.', ref, alt, '.', '.', "BayesDEL_noAF=" + str(parts[4])]
+        vcf_line = '\t'.join(vcf_parts)
+        print(vcf_line)
+
+
+    file.close()
+
+
+
+