-
Notifications
You must be signed in to change notification settings - Fork 0
/
VisualiseChangesInGeneFreqs.Rmd
88 lines (70 loc) · 4.81 KB
/
VisualiseChangesInGeneFreqs.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
---
title: "R Notebook"
output: html_notebook
---
```{r}
# read in gene presence absence matrix
gene_presence_absence <- readRDS(file = "gene_presence_absence.rds")
```
```{r}
# create time-point specific gene presence absence matrices
Mass_year_dict <- c(rep(2001,length(Mass_Samples_accCodes[Mass_Samples_accCodes$`Year of Isolation`==2001,]$`Isolate Name`)),
rep(2004,length(Mass_Samples_accCodes[Mass_Samples_accCodes$`Year of Isolation`==2004,]$`Isolate Name`)),
rep(2007,length(Mass_Samples_accCodes[Mass_Samples_accCodes$`Year of Isolation`==2007,]$`Isolate Name`)))
names(Mass_year_dict) <- c(Mass_Samples_accCodes[Mass_Samples_accCodes$`Year of Isolation`==2001,]$`Isolate Name`, Mass_Samples_accCodes[Mass_Samples_accCodes$`Year of Isolation`==2004,]$`Isolate Name`, Mass_Samples_accCodes[Mass_Samples_accCodes$`Year of Isolation`==2007,]$`Isolate Name`)
# empty gene presence absence matrix
gene_presence_absence_2001 <- data.frame(matrix(0, nrow = length(cls_files)+1, ncol = length(Mass_Samples_accCodes[Mass_Samples_accCodes$`Year of Isolation`==2001,]$`Isolate Name`)+1))
gene_presence_absence_2001[1,-1] <- (Mass_Samples_accCodes[Mass_Samples_accCodes$`Year of Isolation`==2001,]$`Isolate Name`)
gene_presence_absence_2001[-1,1] <- gene_presence_absence[-1,1]
gene_presence_absence_2001[-1,-1] <- gene_presence_absence[-1,c(FALSE,Mass_year_dict[unlist(gene_presence_absence[1,-1])]==2001)]
gene_presence_absence_2004 <- data.frame(matrix(0, nrow = length(cls_files)+1, ncol = length(Mass_Samples_accCodes[Mass_Samples_accCodes$`Year of Isolation`==2004,]$`Isolate Name`)+1))
gene_presence_absence_2004[1,-1] <- (Mass_Samples_accCodes[Mass_Samples_accCodes$`Year of Isolation`==2004,]$`Isolate Name`)
gene_presence_absence_2004[-1,1] <- gene_presence_absence[-1,1]
gene_presence_absence_2004[-1,-1] <- gene_presence_absence[-1,c(FALSE,Mass_year_dict[unlist(gene_presence_absence[1,-1])]==2004)]
gene_presence_absence_2007 <- data.frame(matrix(0, nrow = length(cls_files)+1, ncol = length(Mass_Samples_accCodes[Mass_Samples_accCodes$`Year of Isolation`==2007,]$`Isolate Name`)+1))
gene_presence_absence_2007[1,-1] <- (Mass_Samples_accCodes[Mass_Samples_accCodes$`Year of Isolation`==2007,]$`Isolate Name`)
gene_presence_absence_2007[-1,1] <- gene_presence_absence[-1,1]
gene_presence_absence_2007[-1,-1] <- gene_presence_absence[-1,c(FALSE,Mass_year_dict[unlist(gene_presence_absence[1,-1])]==2007)]
```
### A bit of data exploration ---
# Investigating the relationship of gene frequency and the change in frequency over time
```{r}
# calculate cluster freqs per time point
gene_cluster_freqs_1 <- rep(0, length(cls_files))
for(i in 1:length(cls_files)){
gene_cluster_freqs_1[i] <- sum(as.numeric(gene_presence_absence_2001[i+1,-1]==1))/length(gene_presence_absence_2001[1,-1])
}
#gene_cluster_freqs_1 <- rep(0, length(cls_files))
#calculate_rowSum <- function(data_row){
#print(sum(as.numeric(data_row==1)))
# sum(as.numeric(data_row==1))
#}
#gene_cluster_freqs_1 <- apply(gene_presence_absence_2001[-1,-1],1,calculate_rowSum)
gene_cluster_freqs_2 <- rep(0, length(cls_files))
for(i in 1:length(cls_files)){
gene_cluster_freqs_2[i] <- sum(as.numeric(gene_presence_absence_2004[i+1,-1]==1))/length(gene_presence_absence_2004[1,-1])
}
gene_cluster_freqs_3 <- rep(0, length(cls_files))
for(i in 1:length(cls_files)){
gene_cluster_freqs_3[i] <- sum(as.numeric(gene_presence_absence_2007[i+1,-1]==1))/length(gene_presence_absence_2007[1,-1])
}
```
```{r}
plot(1:1500, gene_cluster_freqs_1[sort.list(gene_cluster_freqs_1)[1:1500]],type = "b", lty = 1)
lines(1:1500, gene_cluster_freqs_2[sort.list(gene_cluster_freqs_1)[1:1500]], col = "#E69F00",type = "b")
lines(1:1500, gene_cluster_freqs_3[sort.list(gene_cluster_freqs_1)[1:1500]], col = "#56B4E9", type = "b")
legend(1, 1, legend=c("2001", "2004", "2007"),col=c("black", "#E69F00","#56B4E9"), lty=1, cex=1)
#plot(1:length(gene_cluster_freqs_1), gene_cluster_freqs_3[sort.list(gene_cluster_freqs_1)])
```
```{r}
# Plotting the changes in gene cluster frequencies between the 1st time point and the 2nd and 3rd, respectively, including a rolling mean
# I thought maybe there would be a correlation between the gene frequency and it variability. Well, there isn't.
library(zoo)
plot(1:1500, (gene_cluster_freqs_1-gene_cluster_freqs_2)[sort.list(gene_cluster_freqs_1)[1:1500]],type = "b", lty = 1)
lines(rollmean((gene_cluster_freqs_1-gene_cluster_freqs_2), 50), col = "#E69F00", lwd = 5)
legend(1, 0.15, legend=c("2001 minus 2004", "Rollmean"),col=c("black", "#E69F00"), lty=1, cex=1)
plot(1:1500, (gene_cluster_freqs_1-gene_cluster_freqs_3)[sort.list(gene_cluster_freqs_1)[1:1500]],type = "b", lty = 1)
lines(rollmean((gene_cluster_freqs_1-gene_cluster_freqs_3), 50), col = "#56B4E9", lwd = 5)
legend(1, 0.15, legend=c("2001 minus 2007", "Rollmean"),col=c("black", "#56B4E9"), lty=1, cex=1)
```
###