-
-
Notifications
You must be signed in to change notification settings - Fork 16
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add coefficient of variation function #236
Changes from 16 commits
3ca9b46
9be7427
37da910
25d0a3f
b586bb0
82fbcc0
9e85075
34c0f9b
4f94216
ac39f96
98d4752
cc84f46
4d7675f
c2e7fe3
471df7f
22083dc
f75e0f7
0273879
694dcc5
78f1e26
76b151e
6ae9050
cb353c4
d11f029
fa10fbc
53c701a
63b4edd
243109e
0d11c8a
d9cb700
3626721
21ea852
a4cff3d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,161 @@ | ||
# distribution_mode ---------------------------------- | ||
|
||
#' Compute mode for a statistical distribution | ||
#' | ||
#' @param x An atomic vector, a list, or a data frame. | ||
#' | ||
#' @return | ||
#' | ||
#' The value that appears most frequently in the provided data. | ||
#' The returned data structure will be the same as the entered one. | ||
#' | ||
#' @seealso For continuous variables, the | ||
#' **Highest Maximum a Posteriori probability estimate (MAP)** may be | ||
#' a more useful way to estimate the most commonly-observed value | ||
#' than the mode. See [bayestestR::map_estimate()]. | ||
#' | ||
#' @examples | ||
#' | ||
#' distribution_mode(c(1, 2, 3, 3, 4, 5)) | ||
#' distribution_mode(c(1.5, 2.3, 3.7, 3.7, 4.0, 5)) | ||
#' | ||
#' @export | ||
distribution_mode <- function(x) { | ||
# TODO: Add support for weights, trim, binned (method) | ||
uniqv <- unique(x) | ||
tab <- tabulate(match(x, uniqv)) | ||
idx <- which.max(tab) | ||
uniqv[idx] | ||
} | ||
|
||
#' Compute the coefficient of variation | ||
#' | ||
#' Compute the coefficient of variation (CV, ratio of the standard deviation to | ||
#' the mean, \eqn{\sigma/\mu}) for a set of numeric values. | ||
#' | ||
#' @return The computed coefficient of variation for `x`. | ||
#' @export | ||
#' | ||
#' @examples | ||
#' coef_var(1:10) | ||
#' coef_var(c(1:10, 100), method = "median_mad") | ||
#' coef_var(c(1:10, 100), method = "qcd") | ||
#' coef_var(mu = 10, sigma = 20) | ||
#' coef_var(mu = 10, sigma = 20, method = "unbiased", n = 30) | ||
#' cv(1:10) | ||
coef_var <- function(x, ...) { | ||
UseMethod("coef_var") | ||
} | ||
|
||
#' @name cv | ||
#' @rdname coef_var | ||
#' @export | ||
cv <- coef_var | ||
|
||
#' @name distribution_cv | ||
#' @rdname coef_var | ||
#' @export | ||
distribution_cv <- coef_var | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If we remove alias |
||
|
||
#' @export | ||
coef_var.default <- function(x, verbose = TRUE, ...) { | ||
if (verbose) { | ||
warning(insight::format_message( | ||
paste0("Can't compute the coefficient of variation objects of class '", class(x)[1], "'.") | ||
), call. = FALSE) | ||
} | ||
NULL | ||
} | ||
|
||
#' @param x A numeric vector, or vector of values than can be converted to numeric. | ||
mattansb marked this conversation as resolved.
Show resolved
Hide resolved
|
||
#' @param mu A numeric vector of mean values to use to compute the coefficient | ||
#' of variation. If supplied, `x` is not used to compute the mean. | ||
#' @param sigma A numeric vector of standard deviation values to use to compute the coefficient | ||
#' of variation. If supplied, `x` is not used to compute the SD. | ||
#' @param method Method to use to compute the CV. Can be `"standard"` to compute | ||
#' by dividing the standard deviation by the mean, `"unbiased"` for the | ||
#' unbiased estimator for normally distributed data, or one of two robust | ||
#' alternatives: `"median_mad"` to divide the median by the [stats::mad()], | ||
#' or `"qcd"` (quartile coefficient of dispersion, interquartile range divided | ||
#' by the sum of the quartiles \[twice the midhinge\]: \eqn{(Q_3 - Q_1)/(Q_3 + Q_1)}. | ||
#' @param trim the fraction (0 to 0.5) of values to be trimmed from | ||
#' each end of `x` before the mean and standard deviation (or alternatvies) | ||
#' are computed. Values of `trim` outside the range of (0 to 0.5) are taken | ||
#' as the nearest endpoint. | ||
#' @param na.rm Logical. Should `NA` values be removed before computing (`TRUE`) | ||
#' or not (`FALSE`, default)? | ||
#' @param n If `method = "unbiased"` and both `mu` and `sigma` are provided (not | ||
#' computed from `x`), what sample size to use to adjust the computed CV | ||
#' for small-sample bias? | ||
#' @param ... Further arguments passed to computation functions. | ||
#' | ||
#' @rdname coef_var | ||
#' | ||
#' @export | ||
coef_var.numeric <- function(x, mu = NULL, sigma = NULL, | ||
method = c("standard", "unbiased", "median_mad", "qcd"), | ||
trim = 0, na.rm = FALSE, n = NULL, ...) { | ||
# TODO: Support weights | ||
method <- match.arg(method, choices = c("standard", "unbiased", "median_mad", "qcd")) | ||
if (is.null(mu) || is.null(sigma)) { | ||
if (isTRUE(na.rm)) { | ||
mattansb marked this conversation as resolved.
Show resolved
Hide resolved
|
||
x <- .drop_na(x) | ||
} | ||
n <- length(x) | ||
x <- .trim_values(x, trim = trim, n = n) | ||
} | ||
if (is.null(mu)) { | ||
mu <- switch( | ||
method, | ||
standard = , unbiased = mean(x, ...), | ||
median_mad = stats::median(x, ...), | ||
qcd = unname(sum(stats::quantile(x, probs = c(.25, .75), ...))) | ||
) | ||
} | ||
if (is.null(sigma)) { | ||
sigma <- switch( | ||
method, | ||
standard = , unbiased = stats::sd(x, ...), | ||
median_mad = stats::mad(x, center = mu, ...), | ||
qcd = unname(diff(stats::quantile(x, probs = c(.25, .75), ...))) | ||
) | ||
} | ||
out <- sigma / mu | ||
if (method == "unbiased") { | ||
if (is.null(n)) { | ||
stop(insight::format_message( | ||
"A value for `n` must be provided when `method = \"unbiased\"` and both `mu` and `sigma` are provided." | ||
), call. = FALSE) | ||
} | ||
# from DescTools::CoefVar | ||
out <- out * (1 - 1 / (4 * (n - 1)) + 1 / n * out^2 + 1 / (2 * (n - 1)^2)) | ||
} | ||
return(out) | ||
} | ||
|
||
|
||
|
||
|
||
# descriptives helpers | ||
|
||
.drop_na <- function(x) { | ||
x[!is.na(x)] | ||
} | ||
|
||
.trim_values <- function(x, trim = 0, n = NULL, weights = NULL) { | ||
# TODO: Support weights | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @bwiernik Can this already be merged, or do you also want to work in this PR on supporting weights here? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We should at least print a message if |
||
if (!is.numeric(trim) || length(trim) != 1L) { | ||
stop("`trim` must be a single numeric value.", call. = FALSE) | ||
} | ||
if (is.null(NULL)) { | ||
n <- length(x) | ||
} | ||
if (trim > 0 && n) { | ||
if (anyNA(x)) return(NA_real_) | ||
if (trim >= 0.5) return(stats::median(x, na.rm = FALSE)) | ||
lo <- floor(n * trim) + 1 | ||
hi <- n + 1 - lo | ||
x <- sort.int(x, partial = unique(c(lo, hi)))[lo:hi] | ||
} | ||
x | ||
} |
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,10 @@ | ||
test_that("coefficient of variation works", { | ||
expect_equal(coef_var(1:10), 0.5504819) | ||
expect_equal(coef_var(1:10, method = "unbiased"), 0.55527) | ||
expect_equal(coef_var(c(1:10, 100), method = "median_mad"), 0.7413) | ||
expect_equal(coef_var(c(1:10, 100), method = "qcd"), 0.4166667) | ||
expect_equal(coef_var(mu = 10, sigma = 20), 2) | ||
expect_equal(coef_var(mu = 10, sigma = 20, method = "unbiased", n = 30), 2.250614) | ||
expect_equal(cv(1:10), 0.5504819) | ||
expect_equal(distribution_cv(1:10), 0.5504819) | ||
}) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I am wondering if we should support this alias. It is quite non-descriptive, and can be easily confused with cross-validation.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yeah I'm torn. "CV" is by far the most common way to write the stat name (analogous to "SD"), but yeah it's not that descriptive and is easily confused with cross validation.
I guess I lean toward not including. Thoughts @DominiqueMakowski @mattansb @strengejacke ?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@DominiqueMakowski @mattansb @strengejacke @etiennebacher WDYT?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
coef_var
is explicit and short enough to me, and I agree thatcv
can be confused with cross validationThere was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
agree, we should remove this alias.