-
Notifications
You must be signed in to change notification settings - Fork 26
/
boxplot.with.outlier.label.r
236 lines (189 loc) · 9.79 KB
/
boxplot.with.outlier.label.r
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
# some helpful threads
# https://stat.ethz.ch/pipermail/r-help/2008-September/172641.html
# http://tolstoy.newcastle.edu.au/R/e4/help/08/02/4875.html
# http://tolstoy.newcastle.edu.au/R/e2/help/07/01/8598.html
# http://www.r-statistics.com/wp-content/uploads/2011/01/boxplot-add-label-for-outliers.r.txt
# last updated: 31.10.2011
# This is instead of the 20.6.11 version...
boxplot.with.outlier.label <- function(y, label_name, ..., spread_text = T, data, plot = T, range = 1.5, label.col = "blue", push_text_right = 1.3, # enlarge push_text_right in order to push the text labels further from their point
segement_width_as_percent_of_label_dist = .45, # Change this if you want to have the line closer to the label (range should be between 0 to 1
jitter_if_duplicate = T, jitter_only_positive_duplicates = F)
{
# notes - this functions doesn't work if there are any missing values in the data.
# You must pre-process the data to make sure it is "complete".
# change log:
# 19.04.2011 - added support to "names" and "at" parameters.
# jitter_if_duplicate - will jitter (Actually just add a bit of numbers) so to be able to decide on which location to plot the label when having identical variables...
require(plyr) # for is.formula and ddply
# a function to jitter data in case of ties in Y's
jitter.duplicate <- function(x, only_positive = F)
{
if(only_positive) {
ss <- x > 0
} else {
ss <- T
}
ss_dup <- duplicated(x[ss])
# ss <- ss & ss_dup
temp_length <- length(x[ss][ss_dup])
x[ss][ss_dup] <- x[ss][ss_dup] + seq(from = 0.00001, to = 0.00002, length.out = temp_length)
x
}
# jitter.duplicate(c(1:5))
# jitter.duplicate(c(1:5,5,2))
# duplicated(jitter.duplicate(c(1:5,5,2)))
# jitter.duplicate(c(0,0,1:5,5,2))
# duplicated(jitter.duplicate(c(0,0,1:5,5,2)))
# handle cases where
if(jitter_if_duplicate) {
# warning("duplicate jutter of values in y is ON")
if(!missing(data)) { #e.g: we DO have data
# if(exists("y") && is.formula(y)) { # F && NULL # F & NULL
y_name <- as.character(substitute(y)) # I could have also used as.list(match.call())
# credit to Uwe Ligges and Marc Schwartz for the help
# https://mail.google.com/mail/?shva=1#inbox/12dd7ca2f9bfbc39
if(length(y_name) > 1) { # then it is a formula (for example: "~", "y", "x"
model_frame_y <- model.frame(y, data = data)
temp_y <- model_frame_y[,1]
temp_y <- jitter.duplicate(temp_y, jitter_only_positive_duplicates) # notice that the default of the function is to work only with positive values...
# the_txt <- paste(names(model_frame_y)[1], "temp_y", sep = "<<-") # wrong...
the_txt <- paste("data['",names(model_frame_y)[1],"'] <- temp_y", sep = "")
eval(parse(text = the_txt)) # jutter out y var so to be able to handle identical values.
} else { # this isn't a formula
data[,y_name] <- jitter.duplicate(data[,y_name], jitter_only_positive_duplicates)
y <- data[,y_name] # this will make it possible for boxplot(y, data) to work later (since it is not supposed to work with data when it's not a formula, but now it does :))
}
} else { # there is no "data"
if(is.formula(y)) { # if(exists("y") && is.formula(y)) { # F && NULL # F & NULL
temp_y <- model.frame(y)[,1]
temp_y <- jitter.duplicate(temp_y, jitter_only_positive_duplicates) # notice that the default of the function is to work only with positive values...
temp_y_name <- names(model.frame(y))[1] # we must extract the "names" before introducing a new enbironment (or there will be an error)
environment(y) <- new.env()
assign(temp_y_name, temp_y, environment(y))
# Credit and thanks for doing this goes to Niels Richard Hansen (2 Jan 30, 2011)
# http://r.789695.n4.nabble.com/environment-question-changing-variables-from-a-formula-through-model-frame-td3246608.html
# warning("Your original variable (in the global environemnt) was just jittered.") # maybe I should add a user input before doing this....
# the_txt <- paste(names(model_frame_y)[1], "temp_y", sep = "<<-")
# eval(parse(text = the_txt)) # jutter out y var so to be able to handle identical values.
} else {
y <- jitter.duplicate(y, jitter_only_positive_duplicates)
}
}
}
# the_txt <- paste("print(",names(model_frame_y)[1], ")")
# eval(parse(text = the_txt)) # jutter out y var so to be able to handle identical values.
# print(ls())
# y should be a formula of the type: y~x, y~a*b
# or it could be simply y
if(missing(data)) {
boxdata <- boxplot(y, plot = plot,range = range ,...)
} else {
boxdata <- boxplot(y, plot = plot,data = data, range = range ,...)
}
if(length(boxdata$names) == 1 && boxdata$names =="") boxdata$names <- 1 # this is for cases of type: boxplot(y) (when there is no dependent group)
if(length(boxdata$out) == 0 ) {
warning("No outliers detected for this boxplot")
return(invisible())
}
if(!missing(data)) attach(data) # this might lead to problams I should check out for alternatives for using attach here...
# creating a data.frame with information from the boxplot output about the outliers (location and group)
boxdata_group_name <- factor(boxdata$group)
levels(boxdata_group_name) <- boxdata$names[as.numeric(levels(boxdata_group_name))] # the subseting is for cases where we have some sub groups with no outliers
if(!is.null(list(...)$at)) { # if the user chose to use the "at" parameter, then we would like the function to still function (added on 19.04.2011)
boxdata$group <- list(...)$at[boxdata$group]
}
boxdata_outlier_df <- data.frame(group = boxdata_group_name, y = boxdata$out, x = boxdata$group)
# Let's extract the x,y variables from the formula:
if(is.formula(y))
{
model_frame_y <- model.frame(y)
# old solution: (which caused problems if we used the names parameter when using a 2 way formula... (since the order of the names is different then the levels order we get from using factor)
# y <- model_frame_y[,1]
# x <- model_frame_y[,-1]
y <- model_frame_y[,1]
x <- model_frame_y[,-1]
if(!is.null(dim(x))) { # then x is a matrix/data.frame of the type x1*x2*..and so on - and we should merge all the variations...
x <- apply(x,1, paste, collapse = ".")
}
} else {
# if(missing(x)) x <- rep(1, length(y))
x <- rep(1, length(y)) # we do this in case y comes as a vector and without x
}
# and put all the variables (x, y, and outlier label name) into one data.frame
DATA <- data.frame(label_name, x ,y)
if(!is.null(list(...)$names)) { # if the user chose to use the names parameter, then we would like the function to still function (added on 19.04.2011)
DATA$x <- factor(DATA$x, levels = unique(DATA$x))
levels(DATA$x) = list(...)$names # enable us to handle when the user adds the "names" parameter # fixed on 19.04.11 # notice that DATA$x must be of the "correct" order (that's why I used split above
# warning("Careful, the use of the 'names' parameter is experimental. If you notice any errors please e-mail me at: [email protected]")
}
if(!missing(data)) detach(data) # we don't need to have "data" attached anymore.
# let's only keep the rows with our outliers
boxplot.outlier.data <- function(xx, y_name = "y")
{
y <- xx[,y_name]
boxplot_range <- range(boxplot.stats(y, coef = range )$stats)
ss <- (y < boxplot_range[1]) | (y > boxplot_range[2])
return(xx[ss,])
}
outlier_df <-ddply(DATA, .(x), boxplot.outlier.data)
# create propor x/y locations to handle over-laping dots...
if(spread_text) {
# credit: Greg Snow
require(TeachingDemos)
temp_x <- boxdata_outlier_df[,"x"]
temp_y1 <- boxdata_outlier_df[,"y"]
temp_y2 <- temp_y1
for(i in unique(temp_x))
{
tmp <- temp_x == i
temp_y2[ tmp ] <- spread.labs( temp_y2[ tmp ], 1.3*strheight('A'), maxiter=6000, stepsize = 0.05) #, min=0 )
}
}
# max(strwidth(c("asa", "a"))
# move_text_right <- max(strwidth(outlier_df[,"label_name"]))
# plotting the outlier labels :) (I wish there was a non-loop wise way for doing this)
for(i in seq_len(dim(boxdata_outlier_df)[1]))
{
# ss <- (outlier_df[,"x"] %in% boxdata_outlier_df[i,]$group) & (outlier_df[,"y"] %in% boxdata_outlier_df[i,]$y)
# if(jitter_if_duplicate) {
# ss <- (outlier_df[,"x"] %in% boxdata_outlier_df[i,]$group) & closest.number(outlier_df[,"y"] boxdata_outlier_df[i,]$y)
# } else {
ss <- (outlier_df[,"x"] %in% boxdata_outlier_df[i,]$group) & (outlier_df[,"y"] %in% boxdata_outlier_df[i,]$y)
# }
current_label <- outlier_df[ss,"label_name"]
temp_x <- boxdata_outlier_df[i,"x"]
temp_y <- boxdata_outlier_df[i,"y"]
# cbind(boxdata_outlier_df, temp_y2)
# outlier_df
if(spread_text) {
temp_y_new <- temp_y2[i] # not ss
move_text_right <- strwidth(current_label) * push_text_right
text( temp_x+move_text_right, temp_y_new, current_label, col = label.col)
# strwidth
segments( temp_x+(move_text_right/6), temp_y, temp_x+(move_text_right*segement_width_as_percent_of_label_dist), temp_y_new )
} else {
text(temp_x, temp_y, current_label, pos = 4, col = label.col)
}
}
# outputing some of the information we collected
invisible(list(boxdata = boxdata, boxdata_outlier_df = boxdata_outlier_df, outlier_df=outlier_df))
}
########################################
### examples to see that it works
# library(plyr)
# library(TeachingDemos)
# source("http://www.r-statistics.com/wp-content/uploads/2011/01/boxplot-with-outlier-label-r.txt") # Load the function
# set.seed(210)
# n <- 20
# y <- rnorm(n)
# x1 <- sample(letters[1:3], n,T)
# lab_y <- sample(letters, n)
# boxplot.with.outlier.label(y~x1, lab_y, push_text_right = 1.5, range = .3)
# data.frame(y, x1, lab_y)
# set.seed(10)
# x2 <- sample(letters[1:3], n,T)
# boxplot.with.outlier.label(y~x1*x2, lab_y, push_text_right = 1.5, range = .3)
# data.frame(y, x1, x2, lab_y)
# y1 <- y
# y1[1] <- NA
# boxplot.with.outlier.label(y1~x1, lab_y, push_text_right = 1.5, range = .3)