-
Notifications
You must be signed in to change notification settings - Fork 4
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Phillip Richmond
committed
Aug 25, 2021
1 parent
b719bec
commit 01dbe30
Showing
9 changed files
with
510 additions
and
23 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,110 @@ | ||
import sys, argparse | ||
|
||
def GetOptions(): | ||
parser = argparse.ArgumentParser() | ||
parser.add_argument("-I","--Infile",help="Input gnomAD SV bed file", type=str, required=True) | ||
parser.add_argument("-O","--OutPrefix",help="Output prefix for gnomAD altered files", type=str, required=True) | ||
args = parser.parse_args() | ||
infilename=args.Infile | ||
outfileprefix=args.OutPrefix | ||
return infilename,outfileprefix | ||
|
||
|
||
# Purpose of this function is to take in the full gnomadSV bed file, extract the insertion calls, and then list the alternate coordinates for those insertion calls | ||
# Reasoning behind this is that when we run MELT, it looks like the annotation is pinging the alternative loci, and not providing these useful annotation flags, e.g. an, ac, af | ||
|
||
def ParseInputWrite_INS_shifted(infilename,outfileprefix): | ||
infile = open(infilename,'r') | ||
outfile = open("%s.ins.shifted.bed"%outfileprefix, 'w') | ||
# Make an outfile with chr* for the chrom names | ||
outfile2 = open("%s.ins.shifted.chr.bed"%outfileprefix, 'w') | ||
|
||
# Read through the file, grab the columns you want | ||
# Select the INS calls | ||
# write output to new file | ||
|
||
# Columns we want here are for the alternate mapping locus of the insertion calls | ||
# which in this case are shifted by...what seems to be the size of an LTR? | ||
# In our input, these columns are relevant: | ||
# shifted coords: chr2 (col[7]), pos2 (col[16]), end2 (col[10]) | ||
# variant identifier: name (col[3]) | ||
# SV length: svlen (col[30]) | ||
# SV type: svtype (col[31]) | ||
# allele freq: AN (col[34]), AC (col[35]), AF (col[36]), NHOMALT (col[40]), POPMAXAF (col[69]) | ||
# So together we'll make a new file that has: | ||
# chr2,pos2,end2,name,svlen,svtype,AN,AC,AF,NHOMALT,POPMAXAF | ||
for line in infile.readlines(): | ||
cols = line.strip('\n').split('\t') | ||
chr2=cols[7] | ||
pos2=cols[16] | ||
end2=cols[10] | ||
name=cols[3] | ||
svlen=cols[30] | ||
svtype=cols[31] | ||
an=cols[35] | ||
ac=cols[36] | ||
af=cols[37] | ||
nhomalt=cols[41] | ||
popmaxaf=cols[70] | ||
# catch header | ||
if line[0] == '#': | ||
#print('%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s'%(chr2,pos2,end2,name,svlen,svtype,an,ac,af,nhomalt,popmaxaf)) | ||
outfile.write('%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n'%(chr2,pos2,end2,name,svlen,svtype,an,ac,af,nhomalt,popmaxaf)) | ||
# keep only the insertions | ||
if svtype=='INS': | ||
#print('%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s'%(chr2,pos2,end2,name,svlen,svtype,an,ac,af,nhomalt,popmaxaf)) | ||
outfile.write('%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n'%(chr2,pos2,end2,name,svlen,svtype,an,ac,af,nhomalt,popmaxaf)) | ||
outfile2.write('chr%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n'%(chr2,pos2,end2,name,svlen,svtype,an,ac,af,nhomalt,popmaxaf)) | ||
|
||
|
||
|
||
# Purpose of this function is to input the gnomadSV bed file, extract informative columns, and print the output. | ||
# Reasoning behind this is that the annotations provided by AnnotSV don't include useful info like allele frequency, etc. | ||
def ParseInputWrite_ALL_reformat(infilename,outfileprefix): | ||
infile = open(infilename,'r') | ||
outfile = open("%s.reformatted.bed"%outfileprefix,'w') | ||
outfile2 = open("%s.reformatted.chr.bed"%outfileprefix, 'w') | ||
|
||
|
||
# Read through the file, grab the columns you want | ||
# write output to new file | ||
|
||
# Columns we want here are primary position columns, sv len, and allele freqs | ||
# In our input, these columns are relevant: | ||
# coords: chr (col[0]), start (col[1]), end (col[2]) | ||
# variant identifier: name (col[3]) | ||
# SV length: svlen (col[30]) | ||
# SV type: svtype (col[31]) | ||
# allele freq: AN (col[34]), AC (col[35]), AF (col[36]), NHOMALT (col[40]), POPMAXAF (col[69]) | ||
# So together we'll make a new file that has: | ||
# chr2,pos2,end2,name,svlen,svtype,AN,AC,AF,NHOMALT,POPMAXAF | ||
for line in infile.readlines(): | ||
cols = line.strip('\n').split('\t') | ||
chrom=cols[0] | ||
start=cols[1] | ||
end=cols[2] | ||
name=cols[3] | ||
svlen=cols[30] | ||
svtype=cols[31] | ||
an=cols[35] | ||
ac=cols[36] | ||
af=cols[37] | ||
nhomalt=cols[41] | ||
popmaxaf=cols[70] | ||
#print('%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s'%(chrom,start,end,name,svlen,svtype,an,ac,af,nhomalt,popmaxaf)) | ||
outfile.write('%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n'%(chrom,start,end,name,svlen,svtype,an,ac,af,nhomalt,popmaxaf)) | ||
outfile2.write('chr%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n'%(chrom,start,end,name,svlen,svtype,an,ac,af,nhomalt,popmaxaf)) | ||
|
||
|
||
|
||
def Main(): | ||
Infilename,Outfileprefix = GetOptions() | ||
print("Infile: %s\nOutfilePrefix: %s\n"%(Infilename,Outfileprefix)) | ||
ParseInputWrite_INS_shifted(Infilename,Outfileprefix) | ||
ParseInputWrite_ALL_reformat(Infilename,Outfileprefix) | ||
|
||
if __name__=="__main__": | ||
Main() | ||
|
||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,94 @@ | ||
# This script details how I went from the downloaded GRCh37 file, into GRCh38 files. | ||
|
||
# The sub-files in this process are for either alternative coordinates with insertions (ins.shifted) or just reformatted | ||
# This script is meant to run from: | ||
cd /mnt/common/DATABASES/REFERENCES/GRCh37/GNOMADSV/V2.1/ | ||
|
||
# Step 1 - run python script to split out and reformat 2 sub files, shifted and reformatted | ||
python CreateAltPositionGnomAD.py -I gnomad_v2.1_sv.sites.bed -O gnomad_v2.1_sv.sites.bed.altered | ||
|
||
# This makes these files: | ||
#-rwxrwx--- 1 prichmond domain users 12M Aug 25 14:03 gnomad_v2.1_sv.sites.bed.altered.ins.shifted.bed | ||
#-rwxrwx--- 1 prichmond domain users 12M Aug 25 14:03 gnomad_v2.1_sv.sites.bed.altered.ins.shifted.chr.bed | ||
#-rwxrwx--- 1 prichmond domain users 96 Aug 25 14:03 gnomad_v2.1_sv.sites.bed.altered.ins.shifted.chr.bed.header | ||
#drwxrwx--- 2 prichmond domain users 1.2K Aug 25 14:03 . | ||
#-rwxrwx--- 1 prichmond domain users 42M Aug 25 14:03 gnomad_v2.1_sv.sites.bed.altered.reformatted.bed | ||
#-rwxrwx--- 1 prichmond domain users 43M Aug 25 14:03 gnomad_v2.1_sv.sites.bed.altered.reformatted.chr.bed | ||
#-rwxrwx--- 1 prichmond domain users 36M Aug 25 14:03 gnomad_v2.1_sv.sites.bed.altered.reformatted.chr.bed.header | ||
|
||
# Step 2 - liftOver | ||
# FOr details on this go look at that script | ||
sh /mnt/common/Precision/LiftOver/RunLiftOver.sh | ||
|
||
# This made these files: | ||
# In: /mnt/common/DATABASES/REFERENCES/GRCh38/GNOMADSV/V2.1/ | ||
# gnomad_v2.1_sv.sites.altered.ins.shifted.chr.hg38.bed | ||
# gnomad_v2.1_sv.sites.altered.reformatted.chr.hg38.bed | ||
|
||
|
||
# Step 3 - Re-header lifted-over files | ||
cat /mnt/common/DATABASES/REFERENCES/GRCh37/GNOMADSV/V2.1/gnomad_v2.1_sv.sites.bed.altered.reformatted.chr.bed.header \ | ||
/mnt/common/DATABASES/REFERENCES/GRCh38/GNOMADSV/V2.1/gnomad_v2.1_sv.sites.altered.reformatted.chr.hg38.bed \ | ||
> /mnt/common/DATABASES/REFERENCES/GRCh38/GNOMADSV/V2.1/gnomad_v2.1_sv.sites.bed.altered.reformatted.chr.hg38.headered.bed | ||
|
||
|
||
cat /mnt/common/DATABASES/REFERENCES/GRCh37/GNOMADSV/V2.1/gnomad_v2.1_sv.sites.bed.altered.ins.shifted.chr.bed.header \ | ||
/mnt/common/DATABASES/REFERENCES/GRCh38/GNOMADSV/V2.1/gnomad_v2.1_sv.sites.altered.ins.shifted.chr.hg38.bed \ | ||
> /mnt/common/DATABASES/REFERENCES/GRCh38/GNOMADSV/V2.1/gnomad_v2.1_sv.sites.bed.altered.ins.shifted.chr.hg38.headered.bed | ||
|
||
|
||
# Step 4 trim the 'chr' | ||
sed -e 's/^chr//g' /mnt/common/DATABASES/REFERENCES/GRCh38/GNOMADSV/V2.1/gnomad_v2.1_sv.sites.bed.altered.ins.shifted.chr.hg38.headered.bed \ | ||
> /mnt/common/DATABASES/REFERENCES/GRCh38/GNOMADSV/V2.1/gnomad_v2.1_sv.sites.bed.altered.ins.shifted.hg38.bed | ||
|
||
sed -e 's/^chr//g' /mnt/common/DATABASES/REFERENCES/GRCh38/GNOMADSV/V2.1/gnomad_v2.1_sv.sites.bed.altered.reformatted.chr.hg38.headered.bed \ | ||
> /mnt/common/DATABASES/REFERENCES/GRCh38/GNOMADSV/V2.1/gnomad_v2.1_sv.sites.bed.altered.reformatted.hg38.bed | ||
|
||
# Step 5 split into sub-files for AnnotSV | ||
# Full Reformatted | ||
# gnomAD_AF | ||
cut -f1,2,3,9 /mnt/common/DATABASES/REFERENCES/GRCh38/GNOMADSV/V2.1/gnomad_v2.1_sv.sites.bed.altered.reformatted.hg38.bed > \ | ||
/mnt/common/DATABASES/REFERENCES/GRCh38/GNOMADSV/V2.1/UserGNOMADSV-AF.bed | ||
# gnomAD_NHOMALT | ||
cut -f1,2,3,10 /mnt/common/DATABASES/REFERENCES/GRCh38/GNOMADSV/V2.1/gnomad_v2.1_sv.sites.bed.altered.reformatted.hg38.bed > \ | ||
/mnt/common/DATABASES/REFERENCES/GRCh38/GNOMADSV/V2.1/UserGNOMADSV-NHOMALT.bed | ||
|
||
|
||
# Shifted Insertions | ||
# gnomAD_AF | ||
cut -f1,2,3,9 /mnt/common/DATABASES/REFERENCES/GRCh38/GNOMADSV/V2.1/gnomad_v2.1_sv.sites.bed.altered.reformatted.hg38.bed > \ | ||
/mnt/common/DATABASES/REFERENCES/GRCh38/GNOMADSV/V2.1/UserGNOMADSV-AF-insertions.bed | ||
# gnomAD_NHOMALT | ||
cut -f1,2,3,10 /mnt/common/DATABASES/REFERENCES/GRCh38/GNOMADSV/V2.1/gnomad_v2.1_sv.sites.bed.altered.reformatted.hg38.bed > \ | ||
/mnt/common/DATABASES/REFERENCES/GRCh38/GNOMADSV/V2.1/UserGNOMADSV-NHOMALT-insertions.bed | ||
|
||
|
||
# Step 6 copy over to AnnotSV | ||
ANNOTSVDIR=/mnt/common/Precision/AnnotSV/share/AnnotSV/Annotations_Human/Users/GRCh38/ | ||
# FtIncludedInSV | ||
# Full reformatted | ||
cp /mnt/common/DATABASES/REFERENCES/GRCh38/GNOMADSV/V2.1/UserGNOMADSV-AF.bed \ | ||
$ANNOTSVDIR/FtIncludedInSV/UserGNOMADSV-AF.bed | ||
cp /mnt/common/DATABASES/REFERENCES/GRCh38/GNOMADSV/V2.1/UserGNOMADSV-NHOMALT.bed \ | ||
$ANNOTSVDIR/FtIncludedInSV/UserGNOMADSV-NHOMALT.bed | ||
|
||
# Shifted insertions | ||
cp /mnt/common/DATABASES/REFERENCES/GRCh38/GNOMADSV/V2.1/UserGNOMADSV-AF-insertions.bed \ | ||
$ANNOTSVDIR/FtIncludedInSV/UserGNOMADSV-AF-insertions.bed | ||
cp /mnt/common/DATABASES/REFERENCES/GRCh38/GNOMADSV/V2.1/UserGNOMADSV-NHOMALT-insertions.bed \ | ||
$ANNOTSVDIR/FtIncludedInSV/UserGNOMADSV-NHOMALT-insertions.bed | ||
|
||
#SVincludedInFt | ||
# Full reformatted | ||
cp /mnt/common/DATABASES/REFERENCES/GRCh38/GNOMADSV/V2.1/UserGNOMADSV-AF.bed \ | ||
$ANNOTSVDIR/SVincludedInFt/UserGNOMADSV-AF.bed | ||
cp /mnt/common/DATABASES/REFERENCES/GRCh38/GNOMADSV/V2.1/UserGNOMADSV-NHOMALT.bed \ | ||
$ANNOTSVDIR/SVincludedInFt/UserGNOMADSV-NHOMALT.bed | ||
|
||
# Shifted insertions | ||
cp /mnt/common/DATABASES/REFERENCES/GRCh38/GNOMADSV/V2.1/UserGNOMADSV-AF-insertions.bed \ | ||
$ANNOTSVDIR/SVincludedInFt/UserGNOMADSV-AF-insertions.bed | ||
cp /mnt/common/DATABASES/REFERENCES/GRCh38/GNOMADSV/V2.1/UserGNOMADSV-NHOMALT-insertions.bed \ | ||
$ANNOTSVDIR/SVincludedInFt/UserGNOMADSV-NHOMALT-insertions.bed | ||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,32 @@ | ||
# Load BCFTools/BEDTools | ||
source /mnt/common/WASSERMAN_SOFTWARE/AnnotateVariants/opt/miniconda3/etc/profile.d/conda.sh | ||
conda activate /mnt/common/WASSERMAN_SOFTWARE/AnnotateVariants/opt/AnnotateVariantsEnvironment/ | ||
|
||
# BGZip the sub VCFs | ||
bgzip ALU.final_comp.vcf | ||
tabix ALU.final_comp.vcf.gz | ||
|
||
bgzip HERVK.final_comp.vcf | ||
tabix HERVK.final_comp.vcf.gz | ||
|
||
bgzip SVA.final_comp.vcf | ||
tabix SVA.final_comp.vcf.gz | ||
|
||
bgzip LINE1.final_comp.vcf | ||
tabix LINE1.final_comp.vcf.gz | ||
|
||
# Concatenate the MELT files | ||
bcftools concat -a \ | ||
ALU.final_comp.vcf.gz \ | ||
HERVK.final_comp.vcf.gz \ | ||
SVA.final_comp.vcf.gz \ | ||
LINE1.final_comp.vcf.gz \ | ||
-o MergedMEI.vcf | ||
|
||
export ANNOTSV=/mnt/common/Precision/AnnotSV/ | ||
$ANNOTSV/bin/AnnotSV -SVinputFile MergedMEI.vcf \ | ||
-genomeBuild GRCh38 \ | ||
-overlap 80 \ | ||
-reciprocal yes \ | ||
-outputFile MergedMEI.annotated.tsv | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,76 @@ | ||
#!/bin/bash | ||
|
||
#SBATCH [email protected] | ||
#SBATCH --mail-type=ALL | ||
|
||
## CPU Usage | ||
#SBATCH --mem=8G | ||
#SBATCH --cpus-per-task=1 | ||
#SBATCH --time=1:00:00 | ||
#SBATCH --nodes=1 | ||
|
||
## Output and Stderr | ||
#SBATCH --output=%x-%j.out | ||
#SBATCH --error=%x-%j.error | ||
|
||
|
||
########## | ||
# Set up # | ||
########## | ||
|
||
# Load environment | ||
ANNOTATE_VARIANTS_DIR=/mnt/common/WASSERMAN_SOFTWARE/AnnotateVariants/ | ||
source $ANNOTATE_VARIANTS_DIR/opt/miniconda3/etc/profile.d/conda.sh | ||
conda activate $ANNOTATE_VARIANTS_DIR/opt/AnnotateVariantsEnvironment/ | ||
|
||
# Number of threads | ||
NSLOTS=$SLURM_CPUS_PER_TASK | ||
|
||
# Here working dir is the MELT subdirectory, assumes you have run MELT | ||
WORKING_DIR=/mnt/scratch/Public/TESTING/GenomicsPipelineTest/MELT/ | ||
|
||
## Set working space | ||
cd $WORKING_DIR | ||
|
||
#### GRCh38 #### | ||
echo "GRCh38 genome" | ||
GENOME=GRCh38 | ||
FASTA_DIR=/mnt/common/DATABASES/REFERENCES/GRCh38/GENOME/1000G/ | ||
FASTA_FILE=GRCh38_full_analysis_set_plus_decoy_hla.fa | ||
FAMILY_ID=EPGEN029 | ||
|
||
# BGZip the sub VCFs | ||
bgzip ALU.final_comp.vcf | ||
tabix ALU.final_comp.vcf.gz | ||
|
||
bgzip HERVK.final_comp.vcf | ||
tabix HERVK.final_comp.vcf.gz | ||
|
||
bgzip SVA.final_comp.vcf | ||
tabix SVA.final_comp.vcf.gz | ||
|
||
bgzip LINE1.final_comp.vcf | ||
tabix LINE1.final_comp.vcf.gz | ||
|
||
# Concatenate the MELT files | ||
bcftools concat -a \ | ||
ALU.final_comp.vcf.gz \ | ||
HERVK.final_comp.vcf.gz \ | ||
SVA.final_comp.vcf.gz \ | ||
LINE1.final_comp.vcf.gz \ | ||
-o MergedMEI.vcf | ||
|
||
export ANNOTSV=/mnt/common/Precision/AnnotSV/ | ||
# Annotate with targetd approach for epilepsy genes | ||
$ANNOTSV/bin/AnnotSV -SVinputFile MergedMEI.vcf \ | ||
-genomeBuild $GENOME \ | ||
-candidateGenesFile /mnt/scratch/Precision/EPGEN/PROCESS/EPGEN_Genes.txt \ | ||
-candidateGenesFiltering yes \ | ||
-outputFile ${FAMILY_ID}_MergedMEI.epilepsygenes.annotated.tsv | ||
|
||
$ANNOTSV/bin/AnnotSV -SVinputFile MergedMEI.vcf \ | ||
-genomeBuild $GENOME \ | ||
-outputFile ${FAMILY_ID}_MergedMEI.annotated.tsv | ||
|
||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.