-
Notifications
You must be signed in to change notification settings - Fork 1
/
combineEstimates.R
144 lines (119 loc) · 4.85 KB
/
combineEstimates.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
library(kdensity)
library(GoFKernel)
library(getopt)
spec = matrix(c(
"mcmcSummary" , "m", 1, "character",
"sampleTime" , "s", 1, "double",
"latentBound" , "b", 1, "double",
"timeUnit" , "t", 1, "double",
"lastSample" , "l", 1, "double",
"genes" , "g", 1, "integer"
), byrow=TRUE, ncol=4)
opt = getopt(spec);
if (is.null(opt$mcmcSummary)) {
stop("MCMC summary file is not provided. Exiting")
} else {
inFile <- opt$mcmcSummary
}
if (is.null(opt$sampleTime)) {
stop("The sample time is not provided. Exiting")
} else {
lowerInt <- opt$sampleTime
}
if (is.null(opt$latentBound)) {
stop("The latent bound is not provided. Exiting")
} else {
upperInt <- opt$latentBound
}
if (is.null(opt$timeUnit)) {
stop("The time unit is not provided. Exiting")
} else {
timeUnit <- opt$timeUnit
}
if (is.null(opt$lastSample)) {
stop("The time of the last sample is not provided. Exiting")
} else {
timeZero <- opt$lastSample
}
if (is.null(opt$genes)) {
stop("The number of genes is not provided. Exiting")
} else {
numGenes <- opt$genes
}
# args = commandArgs(trailingOnly = TRUE)
# if(length(args) != 6) {
# stop("Incorrect number of arguments")
# } else {
#
# # MCMC file
# inFile <- args[1]
#
# # Integration bounds
# lowerInt <- as.numeric(args[2])
# upperInt <- as.numeric(args[3])
#
# # Used to convert to calendar time, HIVtree reports in backwards time
# # with a different time scale
# timeUnit <- as.numeric(args[4])
# timeZero <- as.numeric(args[5])
#
# # Number of genes in the input file
# numGenes <- as.numeric(args[6])
# }
times<- read.csv(inFile)
if ((dim(times)[2] / 2) != numGenes) {
stop("The number of genes reported does not match the number of columns in the input file. There should be twice as many columns at genes, one column for the posterior and one for the prior of each gene.")
}
maxGenes <- 10
post <- data.frame(nrow = maxGenes)
prior <- data.frame(nrow = maxGenes)
# Give error message if too many genes
if (numGenes > 10) {
stop("Too many sequences. There must be 10 or fewer genes.")
}
# Calculates a KDE for the priors and posteriors
for (i in 1:numGenes) {
if (! all(is.na(times[ , i]))) {
post[i] <- kdensity(times[, i], na.rm = TRUE, normalize = FALSE)
prior[i] <- kdensity(times[, i + numGenes], na.rm = TRUE, normalize = FALSE)
} else {
# The data is missing
post[i] <- function(x) 1
prior[i] <- function(x) 1
}
}
# There are less than 10 genes being analyzed
for (i in (numGenes+1):maxGenes) {
post[i] <- function(x) 1
prior[i] <- function(x) 1
}
# Converts between the time scale used in HIVtree to calendar time
adjTime <- function (time) {
timeZero - time * timeUnit
}
# Calculates normalization constant
constant <- integrate(function(x) post[[1]](x) * post[[2]](x) * post[[3]](x) * post[[4]](x) * post[[5]](x)
* post[[6]](x) * post[[7]](x) * post[[8]](x) * post[[9]](x) * post[[10]](x) / prior[[1]](x) /
prior[[2]](x) / prior[[3]](x) / prior[[4]](x) / prior[[5]](x) / prior[[6]](x) /
prior[[7]](x) / prior[[8]](x) / prior[[9]](x) / prior[[10]](x), lower = lowerInt, upper =
upperInt)$value
dist <- function (x) {
integrate(function (x) post[[1]](x) * post[[2]](x) * post[[3]](x) * post[[4]](x) * post[[5]](x)
* post[[6]](x) * post[[7]](x) * post[[8]](x) * post[[9]](x) * post[[10]](x) / prior[[1]](x) /
prior[[2]](x) / prior[[3]](x) / prior[[4]](x) / prior[[5]](x) / prior[[6]](x) /
prior[[7]](x) / prior[[8]](x) / prior[[9]](x) / prior[[10]](x) /constant, lower = lowerInt, upper = x)[[1]]
}
invFunc <- inverse(dist, lower = lowerInt, upper = upperInt)
# Finds the 95% credible set (equal tail probability)
lowerEst <- invFunc(.025)
upperEst <- invFunc(.975)
# Finds mean of combined posterior
meanEst <- integrate(function(x) x * post[[1]](x) * post[[2]](x) * post[[3]](x) * post[[4]](x) * post[[5]](x)
* post[[6]](x) * post[[7]](x) * post[[8]](x) * post[[9]](x) * post[[10]](x) / prior[[1]](x) /
prior[[2]](x) / prior[[3]](x) / prior[[4]](x) / prior[[5]](x) / prior[[6]](x) /
prior[[7]](x) / prior[[8]](x) / prior[[9]](x) / prior[[10]](x) /constant, lower = lowerInt, upper = upperInt)[[1]]
print(paste("Mean:", adjTime(meanEst), ", 95% Credible Interval:", adjTime(upperEst),"-" , adjTime(lowerEst), sep = " "))
#plot(function (x) post[[1]](x) * post[[2]](x) * post[[3]](x) * post[[4]](x) * post[[5]](x)
# * post[[6]](x) * post[[7]](x) * post[[8]](x) * post[[9]](x) * post[[10]](x) / prior[[1]](x) /
# prior[[2]](x) / prior[[3]](x) / prior[[4]](x) / prior[[5]](x) / prior[[6]](x) /
# prior[[7]](x) / prior[[8]](x) / prior[[9]](x) / prior[[10]](x) /constant, xlim = c(lowerInt, upperInt), ylab = "Posterior Density", xlab = "Integration Time")