updating august 25 2021

Phillip-a-richmond · Aug 25, 2021 · 01dbe30 · 01dbe30
1 parent b719bec
commit 01dbe30
Show file tree

Hide file tree

Showing 9 changed files with 510 additions and 23 deletions.
diff --git a/CreateAltPositionGnomAD.py b/CreateAltPositionGnomAD.py
@@ -0,0 +1,110 @@
+import sys, argparse
+
+def GetOptions():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("-I","--Infile",help="Input gnomAD SV bed file", type=str, required=True)
+    parser.add_argument("-O","--OutPrefix",help="Output prefix for gnomAD altered files", type=str, required=True)
+    args = parser.parse_args()
+    infilename=args.Infile
+    outfileprefix=args.OutPrefix
+    return infilename,outfileprefix
+
+
+# Purpose of this function is to take in the full gnomadSV bed file, extract the insertion calls, and then list the alternate coordinates for those insertion calls
+# Reasoning behind this is that when we run MELT, it looks like the annotation is pinging the alternative loci, and not providing these useful annotation flags, e.g. an, ac, af
+
+def ParseInputWrite_INS_shifted(infilename,outfileprefix):
+    infile = open(infilename,'r')
+    outfile = open("%s.ins.shifted.bed"%outfileprefix, 'w')
+    # Make an outfile with chr* for the chrom names
+    outfile2 = open("%s.ins.shifted.chr.bed"%outfileprefix, 'w')
+
+    # Read through the file, grab the columns you want
+    # Select the INS calls
+    # write output to new file
+
+    # Columns we want here are for the alternate mapping locus of the insertion calls
+    # which in this case are shifted by...what seems to be the size of an LTR?
+    # In our input, these columns are relevant:
+    # shifted coords: chr2 (col[7]), pos2 (col[16]), end2 (col[10])
+    # variant identifier: name (col[3])
+    # SV length: svlen (col[30])
+    # SV type: svtype (col[31])
+    # allele freq: AN (col[34]), AC (col[35]), AF (col[36]), NHOMALT (col[40]), POPMAXAF (col[69])
+    # So together we'll make a new file that has: 
+    # chr2,pos2,end2,name,svlen,svtype,AN,AC,AF,NHOMALT,POPMAXAF
+    for line in infile.readlines():
+        cols = line.strip('\n').split('\t')
+        chr2=cols[7]
+        pos2=cols[16]
+        end2=cols[10]
+        name=cols[3]
+        svlen=cols[30]
+        svtype=cols[31]
+        an=cols[35]
+        ac=cols[36]
+        af=cols[37]
+        nhomalt=cols[41]
+        popmaxaf=cols[70]
+        # catch header
+        if line[0] == '#':
+            #print('%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s'%(chr2,pos2,end2,name,svlen,svtype,an,ac,af,nhomalt,popmaxaf))
+            outfile.write('%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n'%(chr2,pos2,end2,name,svlen,svtype,an,ac,af,nhomalt,popmaxaf))
+        # keep only the insertions
+        if svtype=='INS':
+            #print('%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s'%(chr2,pos2,end2,name,svlen,svtype,an,ac,af,nhomalt,popmaxaf))
+            outfile.write('%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n'%(chr2,pos2,end2,name,svlen,svtype,an,ac,af,nhomalt,popmaxaf))
+            outfile2.write('chr%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n'%(chr2,pos2,end2,name,svlen,svtype,an,ac,af,nhomalt,popmaxaf))
+
+
+
+# Purpose of this function is to input the gnomadSV bed file, extract informative columns, and print the output.
+# Reasoning behind this is that the annotations provided by AnnotSV don't include useful info like allele frequency, etc.
+def ParseInputWrite_ALL_reformat(infilename,outfileprefix):
+    infile = open(infilename,'r')
+    outfile = open("%s.reformatted.bed"%outfileprefix,'w')
+    outfile2 = open("%s.reformatted.chr.bed"%outfileprefix, 'w')
+
+
+    # Read through the file, grab the columns you want
+    # write output to new file
+
+    # Columns we want here are primary position columns, sv len, and allele freqs
+    # In our input, these columns are relevant:
+    # coords: chr (col[0]), start (col[1]), end (col[2])
+    # variant identifier: name (col[3])
+    # SV length: svlen (col[30])
+    # SV type: svtype (col[31])
+    # allele freq: AN (col[34]), AC (col[35]), AF (col[36]), NHOMALT (col[40]), POPMAXAF (col[69])
+    # So together we'll make a new file that has: 
+    # chr2,pos2,end2,name,svlen,svtype,AN,AC,AF,NHOMALT,POPMAXAF
+    for line in infile.readlines():
+        cols = line.strip('\n').split('\t')
+        chrom=cols[0]
+        start=cols[1]
+        end=cols[2]
+        name=cols[3]
+        svlen=cols[30]
+        svtype=cols[31]
+        an=cols[35]
+        ac=cols[36]
+        af=cols[37]
+        nhomalt=cols[41]
+        popmaxaf=cols[70]
+        #print('%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s'%(chrom,start,end,name,svlen,svtype,an,ac,af,nhomalt,popmaxaf))
+        outfile.write('%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n'%(chrom,start,end,name,svlen,svtype,an,ac,af,nhomalt,popmaxaf))
+        outfile2.write('chr%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n'%(chrom,start,end,name,svlen,svtype,an,ac,af,nhomalt,popmaxaf))
+
+
+
+def Main():
+    Infilename,Outfileprefix = GetOptions()
+    print("Infile: %s\nOutfilePrefix: %s\n"%(Infilename,Outfileprefix))
+    ParseInputWrite_INS_shifted(Infilename,Outfileprefix)
+    ParseInputWrite_ALL_reformat(Infilename,Outfileprefix)
+
+if __name__=="__main__":
+    Main()
+
+
+
diff --git a/CreateNewAnnotations.sh b/CreateNewAnnotations.sh
@@ -0,0 +1,94 @@
+# This script details how I went from the downloaded GRCh37 file, into GRCh38 files.
+
+# The sub-files in this process are for either alternative coordinates with insertions (ins.shifted) or just reformatted
+# This script is meant to run from:
+cd /mnt/common/DATABASES/REFERENCES/GRCh37/GNOMADSV/V2.1/
+
+# Step 1 - run python script to split out and reformat 2 sub files, shifted and reformatted
+python CreateAltPositionGnomAD.py -I gnomad_v2.1_sv.sites.bed -O gnomad_v2.1_sv.sites.bed.altered
+
+# This makes these files:
+#-rwxrwx--- 1 prichmond domain users  12M Aug 25 14:03 gnomad_v2.1_sv.sites.bed.altered.ins.shifted.bed
+#-rwxrwx--- 1 prichmond domain users  12M Aug 25 14:03 gnomad_v2.1_sv.sites.bed.altered.ins.shifted.chr.bed
+#-rwxrwx--- 1 prichmond domain users   96 Aug 25 14:03 gnomad_v2.1_sv.sites.bed.altered.ins.shifted.chr.bed.header
+#drwxrwx--- 2 prichmond domain users 1.2K Aug 25 14:03 .
+#-rwxrwx--- 1 prichmond domain users  42M Aug 25 14:03 gnomad_v2.1_sv.sites.bed.altered.reformatted.bed
+#-rwxrwx--- 1 prichmond domain users  43M Aug 25 14:03 gnomad_v2.1_sv.sites.bed.altered.reformatted.chr.bed
+#-rwxrwx--- 1 prichmond domain users  36M Aug 25 14:03 gnomad_v2.1_sv.sites.bed.altered.reformatted.chr.bed.header
+
+# Step 2 - liftOver
+# FOr details on this go look at that script
+sh /mnt/common/Precision/LiftOver/RunLiftOver.sh
+
+# This made these files:
+# In: /mnt/common/DATABASES/REFERENCES/GRCh38/GNOMADSV/V2.1/
+# gnomad_v2.1_sv.sites.altered.ins.shifted.chr.hg38.bed
+# gnomad_v2.1_sv.sites.altered.reformatted.chr.hg38.bed
+
+
+# Step 3 - Re-header lifted-over files
+cat /mnt/common/DATABASES/REFERENCES/GRCh37/GNOMADSV/V2.1/gnomad_v2.1_sv.sites.bed.altered.reformatted.chr.bed.header \
+	/mnt/common/DATABASES/REFERENCES/GRCh38/GNOMADSV/V2.1/gnomad_v2.1_sv.sites.altered.reformatted.chr.hg38.bed \
+	> /mnt/common/DATABASES/REFERENCES/GRCh38/GNOMADSV/V2.1/gnomad_v2.1_sv.sites.bed.altered.reformatted.chr.hg38.headered.bed
+
+
+cat /mnt/common/DATABASES/REFERENCES/GRCh37/GNOMADSV/V2.1/gnomad_v2.1_sv.sites.bed.altered.ins.shifted.chr.bed.header \
+	/mnt/common/DATABASES/REFERENCES/GRCh38/GNOMADSV/V2.1/gnomad_v2.1_sv.sites.altered.ins.shifted.chr.hg38.bed \
+	> /mnt/common/DATABASES/REFERENCES/GRCh38/GNOMADSV/V2.1/gnomad_v2.1_sv.sites.bed.altered.ins.shifted.chr.hg38.headered.bed
+
+
+# Step 4 trim the 'chr'
+sed -e 's/^chr//g' /mnt/common/DATABASES/REFERENCES/GRCh38/GNOMADSV/V2.1/gnomad_v2.1_sv.sites.bed.altered.ins.shifted.chr.hg38.headered.bed \
+	> /mnt/common/DATABASES/REFERENCES/GRCh38/GNOMADSV/V2.1/gnomad_v2.1_sv.sites.bed.altered.ins.shifted.hg38.bed
+
+sed -e 's/^chr//g' /mnt/common/DATABASES/REFERENCES/GRCh38/GNOMADSV/V2.1/gnomad_v2.1_sv.sites.bed.altered.reformatted.chr.hg38.headered.bed \
+	> /mnt/common/DATABASES/REFERENCES/GRCh38/GNOMADSV/V2.1/gnomad_v2.1_sv.sites.bed.altered.reformatted.hg38.bed
+
+# Step 5 split into sub-files for AnnotSV
+# Full Reformatted
+# gnomAD_AF
+cut -f1,2,3,9 /mnt/common/DATABASES/REFERENCES/GRCh38/GNOMADSV/V2.1/gnomad_v2.1_sv.sites.bed.altered.reformatted.hg38.bed > \
+	/mnt/common/DATABASES/REFERENCES/GRCh38/GNOMADSV/V2.1/UserGNOMADSV-AF.bed
+# gnomAD_NHOMALT
+cut -f1,2,3,10 /mnt/common/DATABASES/REFERENCES/GRCh38/GNOMADSV/V2.1/gnomad_v2.1_sv.sites.bed.altered.reformatted.hg38.bed > \
+	/mnt/common/DATABASES/REFERENCES/GRCh38/GNOMADSV/V2.1/UserGNOMADSV-NHOMALT.bed
+
+
+# Shifted Insertions
+# gnomAD_AF
+cut -f1,2,3,9 /mnt/common/DATABASES/REFERENCES/GRCh38/GNOMADSV/V2.1/gnomad_v2.1_sv.sites.bed.altered.reformatted.hg38.bed > \
+        /mnt/common/DATABASES/REFERENCES/GRCh38/GNOMADSV/V2.1/UserGNOMADSV-AF-insertions.bed
+# gnomAD_NHOMALT
+cut -f1,2,3,10 /mnt/common/DATABASES/REFERENCES/GRCh38/GNOMADSV/V2.1/gnomad_v2.1_sv.sites.bed.altered.reformatted.hg38.bed > \
+        /mnt/common/DATABASES/REFERENCES/GRCh38/GNOMADSV/V2.1/UserGNOMADSV-NHOMALT-insertions.bed
+
+
+# Step 6 copy over to AnnotSV
+ANNOTSVDIR=/mnt/common/Precision/AnnotSV/share/AnnotSV/Annotations_Human/Users/GRCh38/
+# FtIncludedInSV
+# Full reformatted
+cp /mnt/common/DATABASES/REFERENCES/GRCh38/GNOMADSV/V2.1/UserGNOMADSV-AF.bed  \
+	$ANNOTSVDIR/FtIncludedInSV/UserGNOMADSV-AF.bed
+cp /mnt/common/DATABASES/REFERENCES/GRCh38/GNOMADSV/V2.1/UserGNOMADSV-NHOMALT.bed  \
+	$ANNOTSVDIR/FtIncludedInSV/UserGNOMADSV-NHOMALT.bed
+
+# Shifted insertions
+cp /mnt/common/DATABASES/REFERENCES/GRCh38/GNOMADSV/V2.1/UserGNOMADSV-AF-insertions.bed  \
+	$ANNOTSVDIR/FtIncludedInSV/UserGNOMADSV-AF-insertions.bed
+cp /mnt/common/DATABASES/REFERENCES/GRCh38/GNOMADSV/V2.1/UserGNOMADSV-NHOMALT-insertions.bed  \
+	$ANNOTSVDIR/FtIncludedInSV/UserGNOMADSV-NHOMALT-insertions.bed
+
+#SVincludedInFt
+# Full reformatted
+cp /mnt/common/DATABASES/REFERENCES/GRCh38/GNOMADSV/V2.1/UserGNOMADSV-AF.bed  \
+	$ANNOTSVDIR/SVincludedInFt/UserGNOMADSV-AF.bed
+cp /mnt/common/DATABASES/REFERENCES/GRCh38/GNOMADSV/V2.1/UserGNOMADSV-NHOMALT.bed  \
+	$ANNOTSVDIR/SVincludedInFt/UserGNOMADSV-NHOMALT.bed
+
+# Shifted insertions
+cp /mnt/common/DATABASES/REFERENCES/GRCh38/GNOMADSV/V2.1/UserGNOMADSV-AF-insertions.bed  \
+	$ANNOTSVDIR/SVincludedInFt/UserGNOMADSV-AF-insertions.bed
+cp /mnt/common/DATABASES/REFERENCES/GRCh38/GNOMADSV/V2.1/UserGNOMADSV-NHOMALT-insertions.bed  \
+	$ANNOTSVDIR/SVincludedInFt/UserGNOMADSV-NHOMALT-insertions.bed
+
+
diff --git a/TemplateScripts/AnnotSV.sh b/TemplateScripts/AnnotSV.sh
@@ -0,0 +1,32 @@
+# Load BCFTools/BEDTools
+source /mnt/common/WASSERMAN_SOFTWARE/AnnotateVariants/opt/miniconda3/etc/profile.d/conda.sh
+conda activate /mnt/common/WASSERMAN_SOFTWARE/AnnotateVariants/opt/AnnotateVariantsEnvironment/
+
+# BGZip the sub VCFs
+bgzip ALU.final_comp.vcf
+tabix ALU.final_comp.vcf.gz
+
+bgzip HERVK.final_comp.vcf
+tabix HERVK.final_comp.vcf.gz
+
+bgzip SVA.final_comp.vcf
+tabix SVA.final_comp.vcf.gz
+
+bgzip LINE1.final_comp.vcf
+tabix LINE1.final_comp.vcf.gz
+
+# Concatenate the MELT files
+bcftools concat -a \
+       ALU.final_comp.vcf.gz \
+       HERVK.final_comp.vcf.gz \
+       SVA.final_comp.vcf.gz \
+       LINE1.final_comp.vcf.gz \
+	-o MergedMEI.vcf
+
+export ANNOTSV=/mnt/common/Precision/AnnotSV/
+$ANNOTSV/bin/AnnotSV -SVinputFile MergedMEI.vcf \
+	-genomeBuild GRCh38 \
+	-overlap 80 \
+	-reciprocal yes \
+       	-outputFile MergedMEI.annotated.tsv 
+
diff --git a/TemplateScripts/MELT_AnnotSV.sh b/TemplateScripts/MELT_AnnotSV.sh
@@ -0,0 +1,76 @@
+#!/bin/bash
+
+#SBATCH [email protected]
+#SBATCH --mail-type=ALL
+
+## CPU Usage
+#SBATCH --mem=8G
+#SBATCH --cpus-per-task=1
+#SBATCH --time=1:00:00
+#SBATCH --nodes=1
+
+## Output and Stderr
+#SBATCH --output=%x-%j.out
+#SBATCH --error=%x-%j.error
+
+
+##########
+# Set up #
+##########
+
+# Load environment 
+ANNOTATE_VARIANTS_DIR=/mnt/common/WASSERMAN_SOFTWARE/AnnotateVariants/
+source $ANNOTATE_VARIANTS_DIR/opt/miniconda3/etc/profile.d/conda.sh
+conda activate $ANNOTATE_VARIANTS_DIR/opt/AnnotateVariantsEnvironment/
+
+# Number of threads
+NSLOTS=$SLURM_CPUS_PER_TASK
+
+# Here working dir is the MELT subdirectory, assumes you have run MELT
+WORKING_DIR=/mnt/scratch/Public/TESTING/GenomicsPipelineTest/MELT/
+
+## Set working space
+cd $WORKING_DIR
+
+#### GRCh38 #### 
+echo "GRCh38 genome"
+GENOME=GRCh38
+FASTA_DIR=/mnt/common/DATABASES/REFERENCES/GRCh38/GENOME/1000G/
+FASTA_FILE=GRCh38_full_analysis_set_plus_decoy_hla.fa
+FAMILY_ID=EPGEN029
+
+# BGZip the sub VCFs
+bgzip ALU.final_comp.vcf
+tabix ALU.final_comp.vcf.gz
+
+bgzip HERVK.final_comp.vcf
+tabix HERVK.final_comp.vcf.gz
+
+bgzip SVA.final_comp.vcf
+tabix SVA.final_comp.vcf.gz
+
+bgzip LINE1.final_comp.vcf
+tabix LINE1.final_comp.vcf.gz
+
+# Concatenate the MELT files
+bcftools concat -a \
+       ALU.final_comp.vcf.gz \
+       HERVK.final_comp.vcf.gz \
+       SVA.final_comp.vcf.gz \
+       LINE1.final_comp.vcf.gz \
+	-o MergedMEI.vcf
+
+export ANNOTSV=/mnt/common/Precision/AnnotSV/
+# Annotate with targetd approach for epilepsy genes
+$ANNOTSV/bin/AnnotSV -SVinputFile MergedMEI.vcf \
+	-genomeBuild $GENOME \
+        -candidateGenesFile /mnt/scratch/Precision/EPGEN/PROCESS/EPGEN_Genes.txt \
+        -candidateGenesFiltering yes \
+       	-outputFile ${FAMILY_ID}_MergedMEI.epilepsygenes.annotated.tsv 
+
+$ANNOTSV/bin/AnnotSV -SVinputFile MergedMEI.vcf \
+	-genomeBuild $GENOME \
+       	-outputFile ${FAMILY_ID}_MergedMEI.annotated.tsv 
+
+
+
diff --git a/TemplateScripts/RareDiseasePipeline_NonTrio.sh b/TemplateScripts/RareDiseasePipeline_NonTrio.sh
@@ -101,6 +101,9 @@ PEDMAKER=$ANNOTATE_VARIANTS_DIR/PipelineScripts/MakePED.py
 EHDN_DIR_VAR='\/mnt\/common\/Precision\/ExpansionHunterDenovo\/ExpansionHunterDenovo-v0.9.0-linux_x86_64\/'
 ANNOVAR_DIR_VAR='\/mnt\/common\/WASSERMAN_SOFTWARE\/annovar\/'
 EHDN_BACKGROUND_DIR_VAR='\/mnt\/common\/DATABASES\/REFERENCES\/GRCh38\/ExpansionHunterDeNovo\/EHdn_v0.9.0_1000G_hg38\/'
+SMOOVE_SIF_VAR='\/mnt\/common\/Precision\/Smoove\/smoove_latest.sif'
+ANNOTSV_DIR_VAR='\/mnt\/common\/Precision\/AnnotSV\/'
+
 
 # For EHdn
 MINICONDA_DIR_VAR='\/mnt\/common\/Precision\/Miniconda2\/'
@@ -453,9 +456,32 @@ sed -i "s/annovar_dir/$ANNOVAR_DIR_VAR/g" ${WORKING_DIR}/${FAMILY_ID}_${STEP9_TE
 sed -i "s/ehdn_dir_var/$EHDN_DIR_VAR/g"  ${WORKING_DIR}/${FAMILY_ID}_${STEP9_TEMPLATE}
 sed -i "s/ehdn_background_dir/$EHDN_BACKGROUND_DIR_VAR/g"  ${WORKING_DIR}/${FAMILY_ID}_${STEP9_TEMPLATE}
 
-exit
+###########
 # Step 10 #
-# ExpansionHunter 3
+###########
+
+# Smoove + AnnotSV
+STEP10_TEMPLATE=RunSmooveAnnotSV_Template.sh
+
+## Input: dupremoved.sorted.bam, all of them in the working directory
+## Output: Annotated Smoove TSV
+
+## Copy the template
+cp ${TEMPLATE_DIR}/$STEP10_TEMPLATE ${WORKING_DIR}/${FAMILY_ID}_${STEP10_TEMPLATE}
+## Copy the smoove.sh script (no edits required)
+cp ${TEMPLATE_DIR}/smoove.sh ${WORKING_DIR}/smoove.sh
+
+
+## Edit the template
+sed -i "s/annotate_variants_dir/$ANNOTATE_VARIANTS_DIR_VAR/g" ${WORKING_DIR}/${FAMILY_ID}_${STEP10_TEMPLATE}
+sed -i "s/working_dir/$WORKING_DIR_VAR/g" ${WORKING_DIR}/${FAMILY_ID}_${STEP10_TEMPLATE}
+sed -i "s/email_address/$EMAIL/g" ${WORKING_DIR}/${FAMILY_ID}_${STEP10_TEMPLATE}
+sed -i "s/fasta_dir/$FASTA_DIR_VAR/g" ${WORKING_DIR}/${FAMILY_ID}_${STEP10_TEMPLATE}
+sed -i "s/fasta_file/$FASTA_FILE/g" ${WORKING_DIR}/${FAMILY_ID}_${STEP10_TEMPLATE}
+sed -i "s/genome_build/$GENOME_BUILD/g" ${WORKING_DIR}/${FAMILY_ID}_${STEP10_TEMPLATE}
+sed -i "s/family_id/$FAMILY_ID/g" ${WORKING_DIR}/${FAMILY_ID}_${STEP10_TEMPLATE}
+sed -i "s/smoove_sif/$SMOOVE_SIF_VAR/g" ${WORKING_DIR}/${FAMILY_ID}_${STEP10_TEMPLATE}
+sed -i "s/annotsv_dir/$ANNOTSV_DIR_VAR/g" ${WORKING_DIR}/${FAMILY_ID}_${STEP10_TEMPLATE}