Skip to content

Commit

Permalink
updating august 25 2021
Browse files Browse the repository at this point in the history
  • Loading branch information
Phillip Richmond committed Aug 25, 2021
1 parent b719bec commit 01dbe30
Show file tree
Hide file tree
Showing 9 changed files with 510 additions and 23 deletions.
110 changes: 110 additions & 0 deletions CreateAltPositionGnomAD.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
import sys, argparse

def GetOptions():
parser = argparse.ArgumentParser()
parser.add_argument("-I","--Infile",help="Input gnomAD SV bed file", type=str, required=True)
parser.add_argument("-O","--OutPrefix",help="Output prefix for gnomAD altered files", type=str, required=True)
args = parser.parse_args()
infilename=args.Infile
outfileprefix=args.OutPrefix
return infilename,outfileprefix


# Purpose of this function is to take in the full gnomadSV bed file, extract the insertion calls, and then list the alternate coordinates for those insertion calls
# Reasoning behind this is that when we run MELT, it looks like the annotation is pinging the alternative loci, and not providing these useful annotation flags, e.g. an, ac, af

def ParseInputWrite_INS_shifted(infilename,outfileprefix):
infile = open(infilename,'r')
outfile = open("%s.ins.shifted.bed"%outfileprefix, 'w')
# Make an outfile with chr* for the chrom names
outfile2 = open("%s.ins.shifted.chr.bed"%outfileprefix, 'w')

# Read through the file, grab the columns you want
# Select the INS calls
# write output to new file

# Columns we want here are for the alternate mapping locus of the insertion calls
# which in this case are shifted by...what seems to be the size of an LTR?
# In our input, these columns are relevant:
# shifted coords: chr2 (col[7]), pos2 (col[16]), end2 (col[10])
# variant identifier: name (col[3])
# SV length: svlen (col[30])
# SV type: svtype (col[31])
# allele freq: AN (col[34]), AC (col[35]), AF (col[36]), NHOMALT (col[40]), POPMAXAF (col[69])
# So together we'll make a new file that has:
# chr2,pos2,end2,name,svlen,svtype,AN,AC,AF,NHOMALT,POPMAXAF
for line in infile.readlines():
cols = line.strip('\n').split('\t')
chr2=cols[7]
pos2=cols[16]
end2=cols[10]
name=cols[3]
svlen=cols[30]
svtype=cols[31]
an=cols[35]
ac=cols[36]
af=cols[37]
nhomalt=cols[41]
popmaxaf=cols[70]
# catch header
if line[0] == '#':
#print('%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s'%(chr2,pos2,end2,name,svlen,svtype,an,ac,af,nhomalt,popmaxaf))
outfile.write('%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n'%(chr2,pos2,end2,name,svlen,svtype,an,ac,af,nhomalt,popmaxaf))
# keep only the insertions
if svtype=='INS':
#print('%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s'%(chr2,pos2,end2,name,svlen,svtype,an,ac,af,nhomalt,popmaxaf))
outfile.write('%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n'%(chr2,pos2,end2,name,svlen,svtype,an,ac,af,nhomalt,popmaxaf))
outfile2.write('chr%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n'%(chr2,pos2,end2,name,svlen,svtype,an,ac,af,nhomalt,popmaxaf))



# Purpose of this function is to input the gnomadSV bed file, extract informative columns, and print the output.
# Reasoning behind this is that the annotations provided by AnnotSV don't include useful info like allele frequency, etc.
def ParseInputWrite_ALL_reformat(infilename,outfileprefix):
infile = open(infilename,'r')
outfile = open("%s.reformatted.bed"%outfileprefix,'w')
outfile2 = open("%s.reformatted.chr.bed"%outfileprefix, 'w')


# Read through the file, grab the columns you want
# write output to new file

# Columns we want here are primary position columns, sv len, and allele freqs
# In our input, these columns are relevant:
# coords: chr (col[0]), start (col[1]), end (col[2])
# variant identifier: name (col[3])
# SV length: svlen (col[30])
# SV type: svtype (col[31])
# allele freq: AN (col[34]), AC (col[35]), AF (col[36]), NHOMALT (col[40]), POPMAXAF (col[69])
# So together we'll make a new file that has:
# chr2,pos2,end2,name,svlen,svtype,AN,AC,AF,NHOMALT,POPMAXAF
for line in infile.readlines():
cols = line.strip('\n').split('\t')
chrom=cols[0]
start=cols[1]
end=cols[2]
name=cols[3]
svlen=cols[30]
svtype=cols[31]
an=cols[35]
ac=cols[36]
af=cols[37]
nhomalt=cols[41]
popmaxaf=cols[70]
#print('%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s'%(chrom,start,end,name,svlen,svtype,an,ac,af,nhomalt,popmaxaf))
outfile.write('%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n'%(chrom,start,end,name,svlen,svtype,an,ac,af,nhomalt,popmaxaf))
outfile2.write('chr%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n'%(chrom,start,end,name,svlen,svtype,an,ac,af,nhomalt,popmaxaf))



def Main():
Infilename,Outfileprefix = GetOptions()
print("Infile: %s\nOutfilePrefix: %s\n"%(Infilename,Outfileprefix))
ParseInputWrite_INS_shifted(Infilename,Outfileprefix)
ParseInputWrite_ALL_reformat(Infilename,Outfileprefix)

if __name__=="__main__":
Main()



94 changes: 94 additions & 0 deletions CreateNewAnnotations.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
# This script details how I went from the downloaded GRCh37 file, into GRCh38 files.

# The sub-files in this process are for either alternative coordinates with insertions (ins.shifted) or just reformatted
# This script is meant to run from:
cd /mnt/common/DATABASES/REFERENCES/GRCh37/GNOMADSV/V2.1/

# Step 1 - run python script to split out and reformat 2 sub files, shifted and reformatted
python CreateAltPositionGnomAD.py -I gnomad_v2.1_sv.sites.bed -O gnomad_v2.1_sv.sites.bed.altered

# This makes these files:
#-rwxrwx--- 1 prichmond domain users 12M Aug 25 14:03 gnomad_v2.1_sv.sites.bed.altered.ins.shifted.bed
#-rwxrwx--- 1 prichmond domain users 12M Aug 25 14:03 gnomad_v2.1_sv.sites.bed.altered.ins.shifted.chr.bed
#-rwxrwx--- 1 prichmond domain users 96 Aug 25 14:03 gnomad_v2.1_sv.sites.bed.altered.ins.shifted.chr.bed.header
#drwxrwx--- 2 prichmond domain users 1.2K Aug 25 14:03 .
#-rwxrwx--- 1 prichmond domain users 42M Aug 25 14:03 gnomad_v2.1_sv.sites.bed.altered.reformatted.bed
#-rwxrwx--- 1 prichmond domain users 43M Aug 25 14:03 gnomad_v2.1_sv.sites.bed.altered.reformatted.chr.bed
#-rwxrwx--- 1 prichmond domain users 36M Aug 25 14:03 gnomad_v2.1_sv.sites.bed.altered.reformatted.chr.bed.header

# Step 2 - liftOver
# FOr details on this go look at that script
sh /mnt/common/Precision/LiftOver/RunLiftOver.sh

# This made these files:
# In: /mnt/common/DATABASES/REFERENCES/GRCh38/GNOMADSV/V2.1/
# gnomad_v2.1_sv.sites.altered.ins.shifted.chr.hg38.bed
# gnomad_v2.1_sv.sites.altered.reformatted.chr.hg38.bed


# Step 3 - Re-header lifted-over files
cat /mnt/common/DATABASES/REFERENCES/GRCh37/GNOMADSV/V2.1/gnomad_v2.1_sv.sites.bed.altered.reformatted.chr.bed.header \
/mnt/common/DATABASES/REFERENCES/GRCh38/GNOMADSV/V2.1/gnomad_v2.1_sv.sites.altered.reformatted.chr.hg38.bed \
> /mnt/common/DATABASES/REFERENCES/GRCh38/GNOMADSV/V2.1/gnomad_v2.1_sv.sites.bed.altered.reformatted.chr.hg38.headered.bed


cat /mnt/common/DATABASES/REFERENCES/GRCh37/GNOMADSV/V2.1/gnomad_v2.1_sv.sites.bed.altered.ins.shifted.chr.bed.header \
/mnt/common/DATABASES/REFERENCES/GRCh38/GNOMADSV/V2.1/gnomad_v2.1_sv.sites.altered.ins.shifted.chr.hg38.bed \
> /mnt/common/DATABASES/REFERENCES/GRCh38/GNOMADSV/V2.1/gnomad_v2.1_sv.sites.bed.altered.ins.shifted.chr.hg38.headered.bed


# Step 4 trim the 'chr'
sed -e 's/^chr//g' /mnt/common/DATABASES/REFERENCES/GRCh38/GNOMADSV/V2.1/gnomad_v2.1_sv.sites.bed.altered.ins.shifted.chr.hg38.headered.bed \
> /mnt/common/DATABASES/REFERENCES/GRCh38/GNOMADSV/V2.1/gnomad_v2.1_sv.sites.bed.altered.ins.shifted.hg38.bed

sed -e 's/^chr//g' /mnt/common/DATABASES/REFERENCES/GRCh38/GNOMADSV/V2.1/gnomad_v2.1_sv.sites.bed.altered.reformatted.chr.hg38.headered.bed \
> /mnt/common/DATABASES/REFERENCES/GRCh38/GNOMADSV/V2.1/gnomad_v2.1_sv.sites.bed.altered.reformatted.hg38.bed

# Step 5 split into sub-files for AnnotSV
# Full Reformatted
# gnomAD_AF
cut -f1,2,3,9 /mnt/common/DATABASES/REFERENCES/GRCh38/GNOMADSV/V2.1/gnomad_v2.1_sv.sites.bed.altered.reformatted.hg38.bed > \
/mnt/common/DATABASES/REFERENCES/GRCh38/GNOMADSV/V2.1/UserGNOMADSV-AF.bed
# gnomAD_NHOMALT
cut -f1,2,3,10 /mnt/common/DATABASES/REFERENCES/GRCh38/GNOMADSV/V2.1/gnomad_v2.1_sv.sites.bed.altered.reformatted.hg38.bed > \
/mnt/common/DATABASES/REFERENCES/GRCh38/GNOMADSV/V2.1/UserGNOMADSV-NHOMALT.bed


# Shifted Insertions
# gnomAD_AF
cut -f1,2,3,9 /mnt/common/DATABASES/REFERENCES/GRCh38/GNOMADSV/V2.1/gnomad_v2.1_sv.sites.bed.altered.reformatted.hg38.bed > \
/mnt/common/DATABASES/REFERENCES/GRCh38/GNOMADSV/V2.1/UserGNOMADSV-AF-insertions.bed
# gnomAD_NHOMALT
cut -f1,2,3,10 /mnt/common/DATABASES/REFERENCES/GRCh38/GNOMADSV/V2.1/gnomad_v2.1_sv.sites.bed.altered.reformatted.hg38.bed > \
/mnt/common/DATABASES/REFERENCES/GRCh38/GNOMADSV/V2.1/UserGNOMADSV-NHOMALT-insertions.bed


# Step 6 copy over to AnnotSV
ANNOTSVDIR=/mnt/common/Precision/AnnotSV/share/AnnotSV/Annotations_Human/Users/GRCh38/
# FtIncludedInSV
# Full reformatted
cp /mnt/common/DATABASES/REFERENCES/GRCh38/GNOMADSV/V2.1/UserGNOMADSV-AF.bed \
$ANNOTSVDIR/FtIncludedInSV/UserGNOMADSV-AF.bed
cp /mnt/common/DATABASES/REFERENCES/GRCh38/GNOMADSV/V2.1/UserGNOMADSV-NHOMALT.bed \
$ANNOTSVDIR/FtIncludedInSV/UserGNOMADSV-NHOMALT.bed

# Shifted insertions
cp /mnt/common/DATABASES/REFERENCES/GRCh38/GNOMADSV/V2.1/UserGNOMADSV-AF-insertions.bed \
$ANNOTSVDIR/FtIncludedInSV/UserGNOMADSV-AF-insertions.bed
cp /mnt/common/DATABASES/REFERENCES/GRCh38/GNOMADSV/V2.1/UserGNOMADSV-NHOMALT-insertions.bed \
$ANNOTSVDIR/FtIncludedInSV/UserGNOMADSV-NHOMALT-insertions.bed

#SVincludedInFt
# Full reformatted
cp /mnt/common/DATABASES/REFERENCES/GRCh38/GNOMADSV/V2.1/UserGNOMADSV-AF.bed \
$ANNOTSVDIR/SVincludedInFt/UserGNOMADSV-AF.bed
cp /mnt/common/DATABASES/REFERENCES/GRCh38/GNOMADSV/V2.1/UserGNOMADSV-NHOMALT.bed \
$ANNOTSVDIR/SVincludedInFt/UserGNOMADSV-NHOMALT.bed

# Shifted insertions
cp /mnt/common/DATABASES/REFERENCES/GRCh38/GNOMADSV/V2.1/UserGNOMADSV-AF-insertions.bed \
$ANNOTSVDIR/SVincludedInFt/UserGNOMADSV-AF-insertions.bed
cp /mnt/common/DATABASES/REFERENCES/GRCh38/GNOMADSV/V2.1/UserGNOMADSV-NHOMALT-insertions.bed \
$ANNOTSVDIR/SVincludedInFt/UserGNOMADSV-NHOMALT-insertions.bed


32 changes: 32 additions & 0 deletions TemplateScripts/AnnotSV.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# Load BCFTools/BEDTools
source /mnt/common/WASSERMAN_SOFTWARE/AnnotateVariants/opt/miniconda3/etc/profile.d/conda.sh
conda activate /mnt/common/WASSERMAN_SOFTWARE/AnnotateVariants/opt/AnnotateVariantsEnvironment/

# BGZip the sub VCFs
bgzip ALU.final_comp.vcf
tabix ALU.final_comp.vcf.gz

bgzip HERVK.final_comp.vcf
tabix HERVK.final_comp.vcf.gz

bgzip SVA.final_comp.vcf
tabix SVA.final_comp.vcf.gz

bgzip LINE1.final_comp.vcf
tabix LINE1.final_comp.vcf.gz

# Concatenate the MELT files
bcftools concat -a \
ALU.final_comp.vcf.gz \
HERVK.final_comp.vcf.gz \
SVA.final_comp.vcf.gz \
LINE1.final_comp.vcf.gz \
-o MergedMEI.vcf

export ANNOTSV=/mnt/common/Precision/AnnotSV/
$ANNOTSV/bin/AnnotSV -SVinputFile MergedMEI.vcf \
-genomeBuild GRCh38 \
-overlap 80 \
-reciprocal yes \
-outputFile MergedMEI.annotated.tsv

76 changes: 76 additions & 0 deletions TemplateScripts/MELT_AnnotSV.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
#!/bin/bash

#SBATCH [email protected]
#SBATCH --mail-type=ALL

## CPU Usage
#SBATCH --mem=8G
#SBATCH --cpus-per-task=1
#SBATCH --time=1:00:00
#SBATCH --nodes=1

## Output and Stderr
#SBATCH --output=%x-%j.out
#SBATCH --error=%x-%j.error


##########
# Set up #
##########

# Load environment
ANNOTATE_VARIANTS_DIR=/mnt/common/WASSERMAN_SOFTWARE/AnnotateVariants/
source $ANNOTATE_VARIANTS_DIR/opt/miniconda3/etc/profile.d/conda.sh
conda activate $ANNOTATE_VARIANTS_DIR/opt/AnnotateVariantsEnvironment/

# Number of threads
NSLOTS=$SLURM_CPUS_PER_TASK

# Here working dir is the MELT subdirectory, assumes you have run MELT
WORKING_DIR=/mnt/scratch/Public/TESTING/GenomicsPipelineTest/MELT/

## Set working space
cd $WORKING_DIR

#### GRCh38 ####
echo "GRCh38 genome"
GENOME=GRCh38
FASTA_DIR=/mnt/common/DATABASES/REFERENCES/GRCh38/GENOME/1000G/
FASTA_FILE=GRCh38_full_analysis_set_plus_decoy_hla.fa
FAMILY_ID=EPGEN029

# BGZip the sub VCFs
bgzip ALU.final_comp.vcf
tabix ALU.final_comp.vcf.gz

bgzip HERVK.final_comp.vcf
tabix HERVK.final_comp.vcf.gz

bgzip SVA.final_comp.vcf
tabix SVA.final_comp.vcf.gz

bgzip LINE1.final_comp.vcf
tabix LINE1.final_comp.vcf.gz

# Concatenate the MELT files
bcftools concat -a \
ALU.final_comp.vcf.gz \
HERVK.final_comp.vcf.gz \
SVA.final_comp.vcf.gz \
LINE1.final_comp.vcf.gz \
-o MergedMEI.vcf

export ANNOTSV=/mnt/common/Precision/AnnotSV/
# Annotate with targetd approach for epilepsy genes
$ANNOTSV/bin/AnnotSV -SVinputFile MergedMEI.vcf \
-genomeBuild $GENOME \
-candidateGenesFile /mnt/scratch/Precision/EPGEN/PROCESS/EPGEN_Genes.txt \
-candidateGenesFiltering yes \
-outputFile ${FAMILY_ID}_MergedMEI.epilepsygenes.annotated.tsv

$ANNOTSV/bin/AnnotSV -SVinputFile MergedMEI.vcf \
-genomeBuild $GENOME \
-outputFile ${FAMILY_ID}_MergedMEI.annotated.tsv



30 changes: 28 additions & 2 deletions TemplateScripts/RareDiseasePipeline_NonTrio.sh
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,9 @@ PEDMAKER=$ANNOTATE_VARIANTS_DIR/PipelineScripts/MakePED.py
EHDN_DIR_VAR='\/mnt\/common\/Precision\/ExpansionHunterDenovo\/ExpansionHunterDenovo-v0.9.0-linux_x86_64\/'
ANNOVAR_DIR_VAR='\/mnt\/common\/WASSERMAN_SOFTWARE\/annovar\/'
EHDN_BACKGROUND_DIR_VAR='\/mnt\/common\/DATABASES\/REFERENCES\/GRCh38\/ExpansionHunterDeNovo\/EHdn_v0.9.0_1000G_hg38\/'
SMOOVE_SIF_VAR='\/mnt\/common\/Precision\/Smoove\/smoove_latest.sif'
ANNOTSV_DIR_VAR='\/mnt\/common\/Precision\/AnnotSV\/'


# For EHdn
MINICONDA_DIR_VAR='\/mnt\/common\/Precision\/Miniconda2\/'
Expand Down Expand Up @@ -453,9 +456,32 @@ sed -i "s/annovar_dir/$ANNOVAR_DIR_VAR/g" ${WORKING_DIR}/${FAMILY_ID}_${STEP9_TE
sed -i "s/ehdn_dir_var/$EHDN_DIR_VAR/g" ${WORKING_DIR}/${FAMILY_ID}_${STEP9_TEMPLATE}
sed -i "s/ehdn_background_dir/$EHDN_BACKGROUND_DIR_VAR/g" ${WORKING_DIR}/${FAMILY_ID}_${STEP9_TEMPLATE}

exit
###########
# Step 10 #
# ExpansionHunter 3
###########

# Smoove + AnnotSV
STEP10_TEMPLATE=RunSmooveAnnotSV_Template.sh

## Input: dupremoved.sorted.bam, all of them in the working directory
## Output: Annotated Smoove TSV

## Copy the template
cp ${TEMPLATE_DIR}/$STEP10_TEMPLATE ${WORKING_DIR}/${FAMILY_ID}_${STEP10_TEMPLATE}
## Copy the smoove.sh script (no edits required)
cp ${TEMPLATE_DIR}/smoove.sh ${WORKING_DIR}/smoove.sh


## Edit the template
sed -i "s/annotate_variants_dir/$ANNOTATE_VARIANTS_DIR_VAR/g" ${WORKING_DIR}/${FAMILY_ID}_${STEP10_TEMPLATE}
sed -i "s/working_dir/$WORKING_DIR_VAR/g" ${WORKING_DIR}/${FAMILY_ID}_${STEP10_TEMPLATE}
sed -i "s/email_address/$EMAIL/g" ${WORKING_DIR}/${FAMILY_ID}_${STEP10_TEMPLATE}
sed -i "s/fasta_dir/$FASTA_DIR_VAR/g" ${WORKING_DIR}/${FAMILY_ID}_${STEP10_TEMPLATE}
sed -i "s/fasta_file/$FASTA_FILE/g" ${WORKING_DIR}/${FAMILY_ID}_${STEP10_TEMPLATE}
sed -i "s/genome_build/$GENOME_BUILD/g" ${WORKING_DIR}/${FAMILY_ID}_${STEP10_TEMPLATE}
sed -i "s/family_id/$FAMILY_ID/g" ${WORKING_DIR}/${FAMILY_ID}_${STEP10_TEMPLATE}
sed -i "s/smoove_sif/$SMOOVE_SIF_VAR/g" ${WORKING_DIR}/${FAMILY_ID}_${STEP10_TEMPLATE}
sed -i "s/annotsv_dir/$ANNOTSV_DIR_VAR/g" ${WORKING_DIR}/${FAMILY_ID}_${STEP10_TEMPLATE}



Expand Down
Loading

0 comments on commit 01dbe30

Please sign in to comment.