-
Notifications
You must be signed in to change notification settings - Fork 0
/
homebrew_DGE.R
173 lines (140 loc) · 5.04 KB
/
homebrew_DGE.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
# Develop home-made method for differential gene expression
# Compare results with EdgeR
# Use data from old project
# Load required packages
library(tidyverse)
library(ggplot2)
library(MASS)
library(lmtest)
# Set working directory
setwd('/Users/jamesboot/Documents/GitHubEnterprise/RNAseq/')
# Specify data and meta directory
dataDir <- '/Users/jamesboot/Documents/9.Genome Centre Files/YungYao_Analysis/STAR'
metaDir <- '/Users/jamesboot/Documents/9.Genome Centre Files/YungYao_Analysis/SRA'
# Find files
inFiles <- list.files(path = dataDir,
pattern = '*_ReadsPerGene.out.tab',
full.names = T)
# Get sample names from file names
samples <- gsub(
'_ReadsPerGene.out.tab',
'',
list.files(path = dataDir,
pattern = '*_ReadsPerGene.out.tab')
)
# Load files
rawDat <- lapply(inFiles, function(x){
read.delim(x,
sep = '\t',
skip = 4,
header = F)
})
# Rename elements of list
names(rawDat) <- samples
# Tidy up each of the dataframes in list
# Use col 4 of the dataframe, see link below for reason
# https://www.biostars.org/p/285757/
# https://www.biostars.org/p/407788/
for (x in samples) {
row.names(rawDat[[x]]) <- rawDat[[x]]$V1
rawDat[[x]] <- rawDat[[x]][, c(1,4)]
colnames(rawDat[[x]]) <- c('GeneID', x)
}
# Merge list elements
countMat <- rawDat %>%
reduce(left_join, by = "GeneID")
# Make gene names row names
row.names(countMat) <- countMat$GeneID
countMat$GeneID <- NULL
# Need to update the sample names (colnames)
GSE159_SRA <- read.delim(file.path(metaDir, 'GSE159_SraRunTable.txt'),
sep = ',')[, c('Run', 'Sample.Name')]
GSE159_SampNames <- read.csv(file.path(metaDir, 'GSE159_SampNames.csv'),
header = F,
col.names = c(
'SRA.Sample',
'Real.Sample'
))
GSE189_SRA <- read.delim(file.path(metaDir, 'GSE189_SraRunTable.txt'),
sep = ',')[, c('Run', 'Sample.Name')]
GSE189_SampNames <- read.csv(file.path(metaDir, 'GSE189_SampNames.csv'),
header = F,
col.names = c(
'SRA.Sample',
'Real.Sample'
))
All_SRA <- rbind(GSE159_SRA, GSE189_SRA)
colnames(All_SRA) <- c('Run', 'SRA.Sample')
All_SampNames <- rbind(GSE159_SampNames, GSE189_SampNames)
SampleNameLookup <- list(All_SRA, All_SampNames) %>%
reduce(left_join, by = "SRA.Sample")
colnames(countMat) <-
plyr::mapvalues(colnames(countMat),
as.character(SampleNameLookup$Run),
as.character(SampleNameLookup$Real.Sample))
# Subset counts matrix to samples of itnerest
soi <- c("A_ctrl_0", "A_mut_0", "B_ctrl_0", "B_mut_0", "C_ctrl_0", "C_mut_0")
CountMat <- countMat[, soi]
# Create a meta data dataframe
meta <- data.frame('Sample' = colnames(CountMat),
'Group' = rep(c('CORR', 'DMD'), 3))
# Plot distribution of counts
CountMatLog <- log10(CountMat)
ggplot(CountMatLog, aes(x=A_ctrl_0)) + geom_histogram()
ggplot(CountMat, aes(x=A_ctrl_0)) + geom_histogram(binwidth = 10)
# Filter - remove genes with max counts of 10 across all samples
maximums <- apply(CountMat, MARGIN = 1, max)
filtCountMat <- CountMat[maximums >= 10, ]
# Normalisation (convert to counts per million)
# Calculate normalisation factors
normFactors <- apply(filtCountMat, MARGIN = 2, function(x){
1e6/sum(x)
})
# Apply normalisation factors
normCountMat <- filtCountMat
for (x in 1:ncol(normCountMat)) {
normCountMat[, x] <- normCountMat[, x]*normFactors[x]
}
# Perform differential expression with glm poisson
# Test on a single row
# Use group from meta data as grouping factor
Group <- meta$Group
# Fit models
fit0 <- glm.nb(unlist(normCountMat[1, ]) ~ 1)
fit1 <- glm.nb(unlist(normCountMat[1, ]) ~ Group)
# Perform likelihood ratio test for differences in models
lrtResult <- lrtest(fit1, fit0)
lrtResult$`Pr(>Chisq)`[2]
# Apply to whole data
prelimRes <- apply(normCountMat, 1,
function(y) {
# Fit null model
fit0 <- glm.nb(y ~ 1)
# Fit proposed model
fit1 <- glm.nb(y ~ Group)
# Compare fit of models
lrtResult <- lrtest(fit1, fit0)
# Return p-value
return(lrtResult$`Pr(>Chisq)`[2])
})
# Calculate adjusted p-values
results <- data.frame(Gene = names(prelimRes),
PVal = prelimRes,
FDR = p.adjust(prelimRes, method = 'fdr')
)
# Extract FDR <0.05
sigDEGs <- results[(results$FDR < 0.05), ]
sort(sigDEGs$FDR, decreasing = F)
# Look at expression of results
normCountMat[sigDEGs,]
# Now perform analysis in edegR to compare
library(edgeR)
y <- DGEList(counts=normCountMat,group=Group)
y <- calcNormFactors(y)
design <- model.matrix(~Group)
y <- estimateDisp(y,design)
fit <- glmFit(y,design)
lrt <- glmLRT(fit,coef=2)
edgeR_res <- topTags(lrt, p.value = 0.05, n = 1000)[['table']]
# Are sigDEGs all contains within EdgeR DEGs
sigDEGs %in% edgerDEGs