-
Notifications
You must be signed in to change notification settings - Fork 0
/
SNV_plotter.R
359 lines (301 loc) · 12.2 KB
/
SNV_plotter.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
# QC Script re: RedDog Het/Hom ratios
#
# A QC script for summarising and plotting
# het and hom SNPs location in excluded
# and included regions
# Import required packages
library(vcfR)
library(ape)
library(dplyr)
library(stringr)
library(tidyr)
library(reshape2)
library(data.table)
library(ggplot2)
#------------------------------------------------
# Last updated Aug 20 2017
#
# TO DO:
# - reshape the summary table to be counts for
# each type and location and add in ratios
# for each
# - work out why the second plot
# (commented out) crashes the pdf, or change
# to ggplot
#
#------------------------------------------------
#----------------------Example usage:------------
#
# After sourcing the script provide the parameters below and run.
#
# ref_id is what is listed in the RedDog output (the reference genome),
# refernce genome must be supplied as fasta. Excluded regions should be
# supplied as a csv with one region per line and no headers.
#
#-----Example parameters:---------
#
# working_dir_path <- "/Users/zdyson/Dropbox/Folder/15052017_QC_Script"
#
# vcf_path <- "vcf"
#
# ids <- c("20087_8#59","20087_8#60","20087_8#63",
# "20087_8#64","20087_8#68","20087_8#69",
# "20087_8#70","20087_8#72")
#
# ref_id <- "AL513382"
#
# ref_genome_path <- "test_data_reference/Typhi_CT18.gbk.fasta"
#
# excluded_regions_path <- "test_data_excluded_regions/CT18_final_repeats_phage_forParseSNPTable.csv"
#
#--------To run:------------------
#
# plot_hets(working_dir_path, vcf_path, ids, ref_id, ref_genome_path, excluded_regions_path)
#
#--------------------------------
plot_hets <- function(working_dir_path, vcf_path, ids, ref_id, ref_genome_path, excluded_regions_path){
# Set working directory
setwd(working_dir_path)
# Mapped isolates to be examined
list_of_ids <- ids
# Set chromosome that reads have been
# mapped to ino order to produce the
# vcf files
reference_id <- ref_id
# Parse reference and annotations
ref_genome <- ape::read.dna(ref_genome_path,
format = "fasta")
excluded_regions <- read.table(
excluded_regions_path,
sep=",")
colnames(excluded_regions) <- c("region_start","region_end")
# Initialise graphical devices
pdf("compiled_snp_distributions_in_excluded_and_non_excluded_regions.pdf", width=15, height=20)
# Analyse the het/hom SNPs for each isolate
for (name in 1:length(list_of_ids)){ # should multithread this with for each
# Parse het and hom vcf files
vcf_het <- read.vcfR(
paste0(vcf_path, "/",
list_of_ids[name],"_",
reference_id,"_het.vcf"),
verbose = TRUE )
vcf_hom <- read.vcfR(
paste0(vcf_path, "/",
list_of_ids[name],"_",
reference_id,"_q30.vcf"),
verbose = TRUE )
# Create tables of het and hom snps from vcf
het_vcf_df <- tbl_df(vcf_het@fix)
hom_vcf_df <- tbl_df(vcf_hom@fix)
# create new columns for snp type + code and location in (un)excluded region(s)
hom_vcf_df <- mutate(hom_vcf_df, SNP_type="homozygous",
SNP_code="1",SNP_region="undetermined")
het_vcf_df <- mutate(het_vcf_df, SNP_type="heterozygous",
SNP_code="2",SNP_region="undetermined")
# merge het and hom snps into a single table
het_hom_vcf_df <- rbind(het_vcf_df, hom_vcf_df)
# remove indels
het_hom_vcf_df <- het_hom_vcf_df %>%
filter(!grepl('INDEL', INFO))
# Get details of isolate mapping
isolate_name <- list_of_ids[name]
chromosome_id <- het_hom_vcf_df$CHROM[1]
# Pull out DP4 and calculate prop reads mapped to reference and altenrate
# and output a tidy summary table of this information
het_hom_vcf_df <- het_hom_vcf_df %>%
# Extract and create a new column for DP4 from INFO
mutate(DP4=sub(".*?DP4=(.*?);.*", "\\1",
het_hom_vcf_df$INFO)) %>%
# Create a new column with the name of the isolate
mutate(Isolate_ID=isolate_name) %>%
# Split DP4 across 4 columns
separate(DP4, c("DP4_fwd_ref","DP4_rev_ref",
"DP4_fwd_alt","DP4_rev_alt"),
sep=",") %>%
# Calculate the total proportion of reads mapped
# to each SNP
mutate(total_DP4=(as.numeric(DP4_fwd_ref)+
as.numeric(DP4_rev_ref)+
as.numeric(DP4_fwd_alt)+
as.numeric(DP4_rev_alt))) %>%
# Determine proportion of reads mapped to ref and alt
mutate(prop_ref=
as.numeric(((as.numeric(DP4_fwd_ref)+
as.numeric(DP4_rev_ref))/
as.numeric(total_DP4))),
prop_alt=
as.numeric(((as.numeric(DP4_fwd_alt)+
as.numeric(DP4_rev_alt))/
as.numeric(total_DP4)))) %>%
# Remove commas from ALT column
mutate(ALT=gsub(",", ".", ALT)) %>%
# Select required columns for tidy data frame
select(Isolate_ID, CHROM, POS, REF, ALT,
prop_ref, prop_alt, SNP_region,
SNP_type, SNP_code) %>%
# Reorder data frame by SNP position
arrange(as.numeric(POS))
# build vector of excluded regions
all_excuded_regions <- NULL
for (excluded_region in 1:nrow(excluded_regions)){
all_excuded_regions <- c(all_excuded_regions, c( excluded_regions[excluded_region,1]: excluded_regions[excluded_region,2]))
}
# Annotate SNPs from excluded regions and create tables
for (snp in 1:nrow(het_hom_vcf_df)) {
if (as.numeric(het_hom_vcf_df[snp,3]) %in% all_excuded_regions){
het_hom_vcf_df[snp,8] <- "excluded"
}else{
het_hom_vcf_df[snp,8] <- "included"
}
}
# Create an included regions only summary table
het_hom_vcf_included_only_df <- het_hom_vcf_df[het_hom_vcf_df$SNP_region == "included",]
# Create an included regions only het SNP summary table
het_hom_vcf_included_het_only_df <-
het_hom_vcf_df[het_hom_vcf_df$SNP_region == "included" &
het_hom_vcf_df$SNP_type=="heterozygous",]
# Create an included regions only hom SNP summary table
het_hom_vcf_included_hom_only_df <-
het_hom_vcf_df[het_hom_vcf_df$SNP_region == "included" &
het_hom_vcf_df$SNP_type=="homozygous",]
#--------Pannel figure of all SNPs------------------
par(mar=c(4,4,4,4))
par(oma=c(2,2,2,2))
par(mai=c(1,1,1,1))
par(mfrow=c(2,2))
# Plot all SNPs
plot(het_hom_vcf_df$prop_ref~het_hom_vcf_df$POS,
xlim=c(0,length(ref_genome)), ylim=c(0,1),
col="black",
xlab="Position in genome",
ylab="Percentage of reads",
main=paste("Distribution of all SNPs in",
isolate_name))
points(het_hom_vcf_df$prop_alt~het_hom_vcf_df$POS, col="red")
# Plot all included SNPs
plot(het_hom_vcf_included_only_df$prop_ref~het_hom_vcf_included_only_df$POS,
xlim=c(0,length(ref_genome)), ylim=c(0,1),
col="black",
xlab="Position in genome",
ylab="Percentage of reads",
main=paste("Distribution of all non-excluded SNPs in",
isolate_name))
points(het_hom_vcf_included_only_df$prop_alt~het_hom_vcf_included_only_df$POS,
col="red")
# Plot only included het SNPs
plot(
het_hom_vcf_included_het_only_df$prop_ref~het_hom_vcf_included_het_only_df$POS,
xlim=c(0,length(ref_genome)), ylim=c(0,1),
col="black",
xlab="Position in genome",
ylab="Percentage of reads",
main=paste("Distribution of all non-exclued heterozygous SNPs in",
isolate_name))
points(
het_hom_vcf_included_het_only_df$prop_alt~het_hom_vcf_included_het_only_df$POS,
col="red")
# Plot only included hom SNPs
plot(het_hom_vcf_included_hom_only_df$prop_ref~het_hom_vcf_included_hom_only_df$POS,
xlim=c(0,length(ref_genome)), ylim=c(0,1),
col="black",
xlab="Position in genome",
ylab="Percentage of reads",
main=paste("Distribution of all non-excluded homozygous SNPs in",
isolate_name))
points(het_hom_vcf_included_hom_only_df$prop_alt~het_hom_vcf_included_hom_only_df$POS,
col="red")
#-----------------Figure of SNP density--------------------------
# Set up plot layout
layout(matrix(c(1:4), nrow=2, byrow=TRUE),
widths=c(0.80,0.20), heights=c(0.20,0.80))
par(mar=c(2,2,0,0))
par(oma=c(0,0,4,0))
# Plot top histogram
par(mai=c(0,0.4,0,0))
hom_density <- NULL
het_density <- NULL
if (length(hom_vcf_df$POS) > 1){
hom_density <- density(as.numeric(hom_vcf_df$POS))
}
if (length(het_vcf_df$POS) > 1){
het_density <- density(as.numeric(het_vcf_df$POS))
}
plot(hom_density, col="green",
xlim=c(0,length(ref_genome)),
ylim=(c(0,max(c(hom_density$y, het_density$y)))),
type="l",
lwd=2,
main="",
xlab="",
ylab="",
bty="n", xaxt="n", yaxt="n")
lines(het_density, col="orange",
xlim=c(0,length(ref_genome)), type="l",
lwd=2)
text("top", "hets", col="orange")
text("top", "homs", col="green")
# empty
plot(1:10,1:10,bty="n", xaxt="n", yaxt="n", xlab="", ylab="",col="white")
# Plot all SNPs
par(mai=c(0.4,0.4,0,0))
plot(het_hom_vcf_df$prop_ref~het_hom_vcf_df$POS,
xlim=c(0,length(ref_genome)), ylim=c(0,1),
col="black",
xlab="Position in genome",
ylab="Percentage of reads",
main="")
points(het_hom_vcf_df$prop_alt~het_hom_vcf_df$POS, col="red")
# Plot right histogram
par(mai=c(0.4,0,0,0))
prop_ref_density <- density(het_hom_vcf_df$prop_ref)
plot(prop_ref_density$y, prop_ref_density$x, col="black",
type="l", ylim=c(0,1), lwd=2,xlab="",ylab="",bty="n",
xaxt="n", yaxt="n")
prop_alt_density <- density(het_hom_vcf_df$prop_alt)
lines(prop_alt_density$y, prop_alt_density$x, col="red",
lwd=2)
title(paste("Distribution of all SNPs in",isolate_name), outer=T)
# Write the summary tables to file (fwrite does a better job with large tables)
fwrite(het_hom_vcf_df,
paste0(isolate_name,"_processed_vcf_files_long_format.csv"),
quote=F, row.names=F)
# Merge loci without SNPs into data tables
all_loci <- tbl_df(1:length(ref_genome))
colnames(all_loci) <- "POS"
het_hom_vcf_df <- merge(het_hom_vcf_df, all_loci, by="POS", all=T)
het_hom_vcf_df$SNP_type[is.na(
het_hom_vcf_df$SNP_type)] <- "no_snp"
het_hom_vcf_df$SNP_code[is.na(
het_hom_vcf_df$SNP_code)] <- "0"
het_hom_vcf_df$Isolate_ID[is.na(
het_hom_vcf_df$Isolate_ID)] <- isolate_name
het_hom_vcf_df$CHROM[is.na(
het_hom_vcf_df$CHROM)] <- chromosome_id
# Sort by SNP position
het_hom_vcf_df <- arrange(het_hom_vcf_df, as.numeric(POS))
# Create a matrix of SNPs
het_hom_vcf_df$POS <- factor(het_hom_vcf_df$POS, c(1:length(ref_genome)))
het_hom_matrix <- dcast(data=het_hom_vcf_df, formula=POS~Isolate_ID)
# Write the summary tables to file (fwrite does a better job with large tables)
fwrite(het_hom_matrix,
paste0(isolate_name,"_processed_vcf_matrix.csv"),
quote=F)
}
# Switch off graphical device and write plots to file
dev.off()
# Compile data files
compiled_data <- do.call("rbind",
lapply(list.files(pattern="*long_format.csv"),
read.csv, header=T))
# Compile matrix by lines (as strings) to save memory
compiled_matrix <- do.call("cbind",
lapply(list.files(pattern="*vcf_matrix.csv"),
read.csv, header=T, row.names=1, check.names=F))
# Summarise SNP counts by region and type
summary_table <- summarise(group_by(compiled_data, Isolate_ID, SNP_type, SNP_region), count=n())
# Write compiled and summarised data tables to file
fwrite(compiled_data, "compiled_data.csv", quote=F)
fwrite(compiled_matrix, "compiled_matrix.csv", quote =F,row.names=T)
fwrite(summary_table, "compiled_summary_table.csv", quote=F)
print("Finished")
}