-
Notifications
You must be signed in to change notification settings - Fork 0
/
VariantAnnotationComs.txt
323 lines (233 loc) · 11 KB
/
VariantAnnotationComs.txt
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
######## example bash script to annotate variants #########################
#Start with unrelated EUR exome file
#use plink to excluded variants which failed QC and people who opt out, as well as mtDNA
module load 2019
module load PLINK/1.9b_6.17-x86_64-GCC-8.3.0
plink --bfile unrelated_wes_200k_EUR --exclude defaultVariantExclusions.txt --not-chr 25 --remove defaultSubjectExclusions.txt --make-bed --out e200k
#get frequency of variants and identify the minor allele
plink --bfile e200k --freq --out e200k
gzip e200k*
########### MAKE SETID for pLOF category ################################################
#format into VEP format
zcat e200k.frq.gz | tail -n +2 | awk '{if ($5>0) print $2,$1,$2,$3,$4,$5,$6}' | sed 's/:/\t/1' | sed 's/:/\t/1' | awk '{print $1,$2,$5,$7,$6,".","."}' > SNPs4VEP.txt
#annotate with VEP to LOF variants
vep --cache -i SNPs4VEP.txt --vcf --fields "Allele,Consequence,Gene" -o SNPanno.vcf
filter_vep -i SNPanno.vcf -o LOF.txt -filter "Consequence is start_lost or Consequence is stop_lost or Consequence is frameshift_variant or Consequence is stop_gained or Consequence is splice_donor_variant or Consequence is splice_acceptor_variant or Consequence is transcript_ablation" --only_matched
#get LOF variants
tail -n +5 LOF.txt | awk '{print $3}' > LOFvariants.txt
tail -n +5 ${i}LOF.txt | cut -f8 | cut -d'|' -f1 | sed 's/CSQ=//g' > LOFtestedalleles.txt
paste LOFvariants.txt LOFtestedalleles.txt > LOFvariantsAlleles.txt
#make a wide format snpmap
tail -n +5 LOF.txt | awk -v OFS="" '{print $3,"|",$8}' | sed 's/,/|/g' | cut -d'|' -f 1,$(seq 4 3 1200 | tr '\n' ',' | sed 's/,$//g') | sed 's/|/ /g' > VEPsnpmapwide.txt
#make long format snpmap
cat > makelongVEP.R << END
args = commandArgs(trailingOnly=TRUE)
#read in wide map with the right number of columns
a<-read.table(args[1], header = FALSE, sep = " ", col.names = paste0("V",seq_len(args[2])), fill = TRUE)
#make sure gene and snp names are not factors
a<-data.frame(lapply(a, as.character), stringsAsFactors=FALSE)
#melt the data to gene snp for skat.setID
library(data.table)
b<- melt(a, id.vars="V1")
rm(a)
#Remove the NAs because lots of columns will be empty
b<-b[!b$value=="",]
#get genes and snp, order by gene and make there be one gene per row
b<-b[,c(3,1)]
b<-b[order(b$value),]
#remove duplicate lines
b<-unique(b)
fwrite(b,file=args[3], col.names=F, row.names=F, sep="\t", na=NA, quote=F)
END
#find how many columns in wide format
col=$(awk '{print NF}' VEPsnpmapwide.txt | sort -nu | tail -n 1)
echo $col
Rscript makelongVEP.R VEPsnpmapwide.txt $col VEPsnpmaplong.txt
#give the full allele tested name
sed 's/:/\t/g' LOFvariantsAlleles.txt | sed 's/_/\t/g' | awk '{l=length($4)-length($5);if (l==1) print $1,$2,$3,$4,$4,$5; else print $1,$2,$3,$4,$5,$5}' | awk '{if ($6=="-") print $1,$2,$3,$4,$3; else print $1,$2,$3,$4,$5}' | sed 's/ /:/1' | sed 's/ /:/1' | sed 's/ /_/1' > LOFvariantsAlleles.txt1
mv LOFvariantsAlleles.txt incorrectA2LOFvariantsAlleles.txt
mv LOFvariantsAlleles.txt1 LOFvariantsAlleles.txt
#get rare variants when the restricted to people included in SKAT-O analysis
#make this file by re-running above plink command except restrict to people of interest
cat e200knoMissing.frq | tail -n +2 | awk '{if ($5>0) print}' | awk '{if ($5<0.01) print $2}' > rarevariants.txt
#make setID
zcat VEPsnpmaplong.txt.gz | grep -w -f rarevariants.txt > rareSKAT.setID
#change snp names that are too long
awk '{print $2}' rareSKAT.setID | grep -e ".\{35\}" | wc -l
#963
#lets change their names to longsnp1...longsnp995
awk '{print $2}' rareSKAT.setID | grep -e ".\{35\}" > tmp1
for i in {1..963}; do echo "longsnp${i}" >> tmp2; done
paste tmp1 tmp2 > longsnpnamescode.txt
rm tmp1
rm tmp2
#change the names in the skat setid
awk '{print $1}' longsnpnamescode.txt > toolongsnpnames.txt
awk '{print $2}' longsnpnamescode.txt > trimmedsnpnames.txt
awk '
FILENAME == ARGV[1] { listA[$1] = FNR; next }
FILENAME == ARGV[2] { listB[FNR] = $1; next }
{
for (i = 1; i <= NF; i++) {
if ($i in listA) {
$i = listB[listA[$i]]
}
}
print
}
' toolongsnpnames.txt trimmedsnpnames.txt rareSKAT.setID > tmp.txt
mv tmp.txt rareSKAT.setID
####################################### MAKE setID for HiCpLOF ########################
#annotate LOF variants with LOFTEE
awk '{print $1}' LOFvariantsAlleles.txt > tmp
grep -w -f tmp SNPs4VEP.txt > LOFSNPs4VEP.txt
vep --cache -i LOFSNPs4VEP.txt --vcf --fields "Allele,Consequence,Gene,LoF,LoF_filter,LoF_flags,LoF_info" -o LOFSNPannoLOFTEE.vcf --dir_plugin loftee --plugin LoF,loftee_path:loftee,human_ancestor_fa:human_ancestor.fa.gz,conservation_file:phylocsf_gerp.sql
gzip LOFSNPannoLOFTEE.vcf
gzip LOFSNPs4VEP.txt
#Make snpmap
#filter for the variants where they have HIC LOF
filter_vep -i LOFSNPannoLOFTEE.vcf -o HICLOFSNP.txt -filter "LoF is HC" --only_matched
#get the variants
tail -n +9 LOFSNPannoLOFTEE.vcf | awk '{print $3,$5}' > HICLOFvariantsAlleles.txt
#make the snpmap
#get snp name and csq section then get every 4th cell (gene id)
tail -n +9 HICLOFSNP.txt | awk '{print $3,"|",$8}' | sed 's/,/|/g' | cut -d'|' -f1,$(seq 4 7 1900 | tr '\n' ',' | sed 's/,$//g') | sed 's/|/ /g' | sed 's/ / /g' > HICLOFmapwide.txt
#see how many columns
col=$(awk '{print NF}' HICLOFmapwide.txt | sort -nu | tail -n 1)
echo $col
Rscript makelongVEP.R HICLOFmapwide.txt $col HICLOFmaplong.txt
#make setID
zcat HICLOFmaplong.txt.gz | grep -w -f rarevariants.txt > rareSKAT.setID
#lets see if the names are too long
awk '{print $2}' rareSKAT.setID | grep -e ".\{35\}" | wc -l
#698
#lets change their names to longsnp1...longsnp995
awk '{print $2}' rareSKAT.setID | grep -e ".\{35\}" > tmp1
for i in {1..698}; do echo "longsnp${i}" >> tmp2; done
paste tmp1 tmp2 > longsnpnamescode.txt
rm tmp1
rm tmp2
#change the names in the skat setid
awk '{print $1}' longsnpnamescode.txt > toolongsnpnames.txt
awk '{print $2}' longsnpnamescode.txt > trimmedsnpnames.txt
awk '
FILENAME == ARGV[1] { listA[$1] = FNR; next }
FILENAME == ARGV[2] { listB[FNR] = $1; next }
{
for (i = 1; i <= NF; i++) {
if ($i in listA) {
$i = listB[listA[$i]]
}
}
print
}
' toolongsnpnames.txt trimmedsnpnames.txt rareSKAT.setID > tmp.txt
mv tmp.txt rareSKAT.setID
###################### MAKE pLOF+REVEL setID ######################################
#annotate variants
vep --cache -i ${1}_SNPs4VEP.txt -o SNPanno.vcf --plugin REVEL,new_tabbed_revel_grch38only.tsv.gz
#get missense variants
zcat *_SNPanno.vcf.gz | grep "REVEL=" | cut -f1,4,14 | sed 's/IMPACT.*REVEL=//g' | uniq > AllREVELmissense.txt
#get REVEL >=0.5 variants
awk '{if ($3>=0.5) print $1,$2}' AllREVELmissense.txt | sort -k2 | uniq > 5REVELmissense.txt
#add LOF variants
zcat VEPsnpmaplong.txt.gz > LOFrevel5.txt
awk '{print $2,$1}' 5REVELmissense.txt >> LOFrevel5.txt
sed -i 's/\t/ /g' LOFrevel5.txt
sort LOFrevel5.txt | uniq > LOFrevel5.txt1
mv LOFrevel5.txt1 LOFrevel5.txt
#get tested variants
grep -w -f rarevariants.txt LOFrevel5.txt | awk '{print $2}' | sort | uniq > tmp
#get alleles
zcat e200k.frq.gz | tail -n +2 | awk '{print $2,$3}' | grep -w -f tmp > REVELLOFvariantsAlleles.txt
rm tmp
grep -w -f rarevariants.txt LOFrevel5.txt > rareSKAT.setID
#lets see if the names are too long
awk '{print $2}' rareSKAT.setID | grep -e ".\{35\}" | wc -l
#963
#lets change their names to longsnp1...longsnp995
awk '{print $2}' rareSKAT.setID | grep -e ".\{35\}" > tmp1
for i in {1..963}; do echo "longsnp${i}" >> tmp2; done
paste tmp1 tmp2 > longsnpnamescode.txt
rm tmp1
rm tmp2
#change the names in the skat setid
awk '{print $1}' longsnpnamescode.txt > toolongsnpnames.txt
awk '{print $2}' longsnpnamescode.txt > trimmedsnpnames.txt
awk '
FILENAME == ARGV[1] { listA[$1] = FNR; next }
FILENAME == ARGV[2] { listB[FNR] = $1; next }
{
for (i = 1; i <= NF; i++) {
if ($i in listA) {
$i = listB[listA[$i]]
}
}
print
}
' toolongsnpnames.txt trimmedsnpnames.txt rareSKAT.setID > tmp.txt
mv tmp.txt rareSKAT.setID
########################### MAKE pLOF+Missense setID ################################
#combined missense and pLOF
awk '{print $1,$2}' AllREVELmissense.txt | sort -k2 | uniq > missense.txt
zcat VEPsnpmaplong.txt.gz > LOFmissense.txt
awk '{print $2,$1}' missense.txt >> LOFmissense.txt
sed -i 's/\t/ /g' LOFmissense.txt
sort LOFmissense.txt | uniq > LOFmissense.txt1
mv LOFmissense.txt1 LOFmissense.txt
#get tested variants
grep -w -f rarevariants.txt LOFmissense.txt | awk '{print $2}' | sort | uniq > tmp
#get alleles
zcat e200k.frq.gz | tail -n +2 | awk '{print $2,$3}' | grep -w -f tmp > REVELLOFvariantsAlleles.txt
rm tmp
#overlap rare variants with LOFmissense variants
grep -w -f rarevariants.txt LOFmissense.txt > rareSKAT.setID
#lets see if the names are too long
awk '{print $2}' rareSKAT.setID | grep -e ".\{35\}" | wc -l
#963
#lets change their names to longsnp1...longsnp995
awk '{print $2}' rareSKAT.setID | grep -e ".\{35\}" > tmp1
for i in {1..963}; do echo "longsnp${i}" >> tmp2; done
paste tmp1 tmp2 > longsnpnamescode.txt
rm tmp1
rm tmp2
#change the names in the skat setid
awk '{print $1}' longsnpnamescode.txt > toolongsnpnames.txt
awk '{print $2}' longsnpnamescode.txt > trimmedsnpnames.txt
awk '
FILENAME == ARGV[1] { listA[$1] = FNR; next }
FILENAME == ARGV[2] { listB[FNR] = $1; next }
{
for (i = 1; i <= NF; i++) {
if ($i in listA) {
$i = listB[listA[$i]]
}
}
print
}
' toolongsnpnames.txt trimmedsnpnames.txt rareSKAT.setID > tmp.txt
mv tmp.txt rareSKAT.setID
########################### MAKE synonymous variants setID ######################
#filter for the variants where they have synonymous
filter_vep -i SNPanno.vcf.gz -o LOF.txt -filter "Consequence is synonymous_variant" --only_matched
tail -n +5 LOF.txt | awk '{print $3}' > LOFvariants.txt
#get the test alleles
tail -n +5 LOF.txt | cut -f8 | cut -d'|' -f1 | sed 's/CSQ=//g' > LOFtestedalleles.txt
paste LOFvariants.txt LOFtestedalleles.txt > LOFvariantsAlleles.txt
mv LOFvariantsAlleles.txt SYNvariantsAlleles.txt
#make the snpmap
tail -n +5 LOF.txt | awk -v OFS="" '{print $3,"|",$8}' | sed 's/,/|/g' | cut -d'|' -f 1,$(seq 4 3 1200 | tr '\n' ',' | sed 's/,$//g') | sed 's/|/ /g' > VEPsnpmapwide.txt
#see how many columns
col=$(awk '{print NF}' VEPsnpmapwide.txt | sort -nu | tail -n 1)
echo $col
#95
cut -d' ' -f$col VEPsnpmapwide.txt | sort | uniq
Rscript makelongVEP.R VEPsnpmapwide.txt $col VEPsnpmaplong.txt
gzip VEPsnpmapwide.txt
gzip VEPsnpmaplong.txt
##give the full allele tested name
sed 's/:/\t/g' SYNvariantsAlleles.txt | sed 's/_/\t/g' | awk '{l=length($4)-length($5);if (l==1) print $1,$2,$3,$4,$4,$5; else print $1,$2,$3,$4,$5,$5}' | awk '{if ($6=="-") print $1,$2,$3,$4,$3; else print $1,$2,$3,$4,$5}' | sed 's/ /:/1' | sed 's/ /:/1' | sed 's/ /_/1' > SYNvariantsAlleles.txt1
mv SYNvariantsAlleles.txt incorrectA2SYNvariantsAlleles.txt
mv SYNvariantsAlleles.txt1 SYNvariantsAlleles.txt
#make setID
zcat VEPsnpmaplong.txt.gz | grep -w -f rarevariants.txt > rareSKAT.setID