siegel.tukey.r

siegel.tukey <- function(x, y, id.col = FALSE, adjust.median = F, 
    rnd = -1, alternative = "two.sided", mu = 0, paired = FALSE, 
    exact = FALSE, correct = TRUE, conf.int = FALSE, conf.level = 0.95) {
    ###### published on:
    #   http://www.r-statistics.com/2010/02/siegel-tukey-a-non-parametric-test-for-equality-in-variability-r-code/
    ## Main author of the function:  Daniel Malter
    
    # x: a vector of data
    
    # y: Group indicator (if id.col=TRUE); data of the second
    #   group (if
    # id.col=FALSE). If y is the group indicator it MUST take 0
    #   or 1 to indicate
    # the groups, and x must contain the data for both groups.
    
    # id.col: If TRUE (default), then x is the data column and y
    #   is the ID column,
    # indicating the groups. If FALSE, x and y are both data
    #   columns. id.col must
    # be FALSE only if both data columns are of the same length.
    
    # adjust.median: Should between-group differences in medians
    #   be leveled before
    # performing the test? In certain cases, the Siegel-Tukey
    #   test is susceptible
    # to median differences and may indicate significant
    #   differences in
    # variability that, in reality, stem from differences in
    #   medians.
    
    # rnd: Should the data be rounded and, if so, to which
    #   decimal? The default
    # (-1) uses the data as is. Otherwise, rnd must be a
    #   non-negative integer.
    # Typically, this option is not needed. However,
    #   occasionally, differences in
    # the precision with which certain functions return values
    #   cause the merging
    # of two data frames to fail within the siegel.tukey
    #   function. Only then
    # rounding is necessary. This operation should not be
    #   performed if it affects
    # the ranks of observations.
    
    # … arguments passed on to the Wilcoxon test. See
    #   ?wilcox.test
    
    # Value: Among other output, the function returns the data,
    #   the Siegel-Tukey
    # ranks, the associated Wilcoxon’s W and the p-value for a
    #   Wilcoxon test on
    # tie-adjusted Siegel-Tukey ranks (i.e., it performs and
    #   returns a
    # Siegel-Tukey test). If significant, the group with the
    #   smaller rank sum has
    # greater variability.
    
    # References: Sidney Siegel and John Wilder Tukey (1960) “A
    #   nonparametric sum
    # of ranks procedure for relative spread in unpaired
    #   samples.” Journal of the
    # American Statistical Association. See also, David J.
    #   Sheskin (2004)
    # ”Handbook of parametric and nonparametric statistical
    #   procedures.” 3rd
    # edition. Chapman and Hall/CRC. Boca Raton, FL.
    
    # Notes: The Siegel-Tukey test has relatively low power and
    #   may, under certain
    # conditions, indicate significance due to differences in
    #   medians rather than
    # differences in variabilities (consider using the argument
    #   adjust.median).
    
    # Output (in this order)
    
    # 1. Group medians (after median adjustment if specified)
    # 2. Wilcoxon-test for between-group differences in medians
    #   (after the median
    # adjustment if specified)
    # 3. Data, group membership, and the Siegel-Tukey ranks
    # 4. Mean Siegel-Tukey rank by group (smaller values indicate
    #   greater
    # variability)
    # 5. Siegel-Tukey test (Wilcoxon test on tie-adjusted
    #   Siegel-Tukey ranks)
    
    is.formula <- function(x) class(x) == "formula"
    
    if (is.formula(x)) {
        y <- do.call(c, list(as.name(all.vars(x)[2])), envir = parent.frame(2))
        x <- do.call(c, list(as.name(all.vars(x)[1])), envir = parent.frame(2))  # I am using parent.frame(2) since if the name of the variable in the equation is 'x', then we will mistakenly get the function in here instead of the vector.
        id.col <- TRUE
        # print(x)
        # print(ls.str())
        # data=data.frame(c(x,y),rep(c(0,1),c(length(x),length(y))))
        # print(data)
    }
    
    if (id.col == FALSE) {
        data = data.frame(c(x, y), rep(c(0, 1), c(length(x), length(y))))
    } else {
        data = data.frame(x, y)
    }
    names(data) = c("x", "y")
    data = data[order(data$x), ]
    if (rnd > -1) {
        data$x = round(data$x, rnd)
    }
    
    if (adjust.median == T) {
        cat("\n", "Adjusting medians...", "\n", sep = "")
        data$x[data$y == 0] = data$x[data$y == 0] - (median(data$x[data$y == 
            0]))
        data$x[data$y == 1] = data$x[data$y == 1] - (median(data$x[data$y == 
            1]))
    }
    cat("\n", "Median of group 1 = ", median(data$x[data$y == 0]), 
        "\n", sep = "")
    cat("Median of group 2 = ", median(data$x[data$y == 1]), "\n", 
        "\n", sep = "")
    cat("Testing median differences...", "\n")
    print(wilcox.test(data$x[data$y == 0], data$x[data$y == 1]))
    
    # The following must be done for the case when id.col==F
    x <- data$x
    y <- data$y
    
    cat("Performing Siegel-Tukey rank transformation...", "\n", 
        "\n")
    
    
    sort.x <- sort(data$x)
    sort.id <- data$y[order(data$x)]
    
    data.matrix <- data.frame(sort.x, sort.id)
    
    base1 <- c(1, 4)
    iterator1 <- matrix(seq(from = 1, to = length(x), by = 4)) - 
        1
    rank1 <- apply(iterator1, 1, function(x) x + base1)
    
    iterator2 <- matrix(seq(from = 2, to = length(x), by = 4))
    base2 <- c(0, 1)
    rank2 <- apply(iterator2, 1, function(x) x + base2)
    
    #print(rank1)
    #print(rank2)
    
    if (length(rank1) == length(rank2)) {
        rank <- c(rank1[1:floor(length(x)/2)], rev(rank2[1:ceiling(length(x)/2)]))
    } else {
        rank <- c(rank1[1:ceiling(length(x)/2)], rev(rank2[1:floor(length(x)/2)]))
    }
    
    
    unique.ranks <- tapply(rank, sort.x, mean)
    unique.x <- as.numeric(as.character(names(unique.ranks)))
    
    rank.matrix <- data.frame(unique.x, unique.ranks)
    
    ST.matrix <- merge(data.matrix, rank.matrix, by.x = "sort.x", 
        by.y = "unique.x")
    
    print(ST.matrix)
    
    cat("\n", "Performing Siegel-Tukey test...", "\n", sep = "")
    
    ranks0 <- ST.matrix$unique.ranks[ST.matrix$sort.id == 0]
    ranks1 <- ST.matrix$unique.ranks[ST.matrix$sort.id == 1]
    
    cat("\n", "Mean rank of group 0: ", mean(ranks0), "\n", sep = "")
    cat("Mean rank of group 1: ", mean(ranks1), "\n", sep = "")
    
    print(wilcox.test(ranks0, ranks1, alternative = alternative, 
        mu = mu, paired = paired, exact = exact, correct = correct, 
        conf.int = conf.int, conf.level = conf.level))
} 


if(F) {

#Example:
 
### 1
x=c(4,4,5,5,6,6)
y=c(0,0,1,9,10,10)
siegel.tukey(x,y, F)
siegel.tukey(x,y) #same as above

### 2
# example for a non equal number of cases:
x=c(4,4,5,5,6,6)
y=c(0,0,1,9,10)
siegel.tukey(x,y,F)

### 3
x <- c(33, 62, 84, 85, 88, 93, 97, 4, 16, 48, 51, 66, 98)
id <- c(0,0,0,0,0,0,0,1,1,1,1,1,1)
siegel.tukey(x,id,T)
siegel.tukey(x~id) # from now on, this also works as a function...
siegel.tukey(x,id,T,adjust.median=F,exact=T)

### 4
x<-c(177,200,227,230,232,268,272,297,47,105,126,142,158,172,197,220,225,230,262,270)
id<-c(rep(0,8),rep(1,12))
siegel.tukey(x,id,T,adjust.median=T)


### 5
x=c(33,62,84,85,88,93,97)
y=c(4,16,48,51,66,98) 
siegel.tukey(x,y)

### 6
x<-c(0,0,1,4,4,5,5,6,6,9,10,10)
id<-c(0,0,0,1,1,1,1,1,1,0,0,0)
siegel.tukey(x,id,T)

### 7
x <- c(85,106,96, 105, 104, 108, 86)
id<-c(0,0,1,1,1,1,1)
siegel.tukey(x,id,T)

}