easystats · IndrajeetPatil · Sep 15, 2022 · Aug 28, 2022 · Aug 28, 2022 · Aug 29, 2022
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -62,7 +62,7 @@ VignetteBuilder:
 Encoding: UTF-8
 Language: en-US
 Roxygen: list(markdown = TRUE)
-RoxygenNote: 7.2.1
+RoxygenNote: 7.2.1.9000
 Config/testthat/edition: 3
 Config/Needs/website:
     rstudio/bslib,

diff --git a/NAMESPACE b/NAMESPACE
@@ -20,6 +20,8 @@ S3method(center,factor)
 S3method(center,grouped_df)
 S3method(center,logical)
 S3method(center,numeric)
+S3method(coef_var,default)
+S3method(coef_var,numeric)
 S3method(convert_na_to,character)
 S3method(convert_na_to,data.frame)
 S3method(convert_na_to,default)
@@ -163,6 +165,7 @@ export(center)
 export(centre)
 export(change_code)
 export(change_scale)
+export(coef_var)
 export(coerce_to_numeric)
 export(colnames_to_row)
 export(column_as_rownames)
@@ -201,6 +204,7 @@ export(degroup)
 export(demean)
 export(describe_distribution)
 export(detrend)
+export(distribution_coef_var)
 export(distribution_mode)
 export(empty_columns)
 export(empty_rows)

diff --git a/R/describe_distribution.R b/R/describe_distribution.R
@@ -448,7 +448,7 @@ describe_distribution.grouped_df <- function(x,
     verbose = verbose
   )
 
-  out <- do.call(rbind, lapply(1:length(groups), function(i) {
+  out <- do.call(rbind, lapply(seq_along(groups), function(i) {
     d <- describe_distribution.data.frame(
       groups[[i]][select],
       centrality = centrality,
@@ -511,27 +511,3 @@ print.parameters_distribution <- function(x, digits = 2, ...) {
   )
   out[[1]]
 }
-
-# distribution_mode ----------------------------------
-
-#' Compute mode for a statistical distribution
-#'
-#' @param x An atomic vector, a list, or a data frame.
-#'
-#' @return
-#'
-#' The value that appears most frequently in the provided data.
-#' The returned data structure will be the same as the entered one.
-#'
-#' @examples
-#'
-#' distribution_mode(c(1, 2, 3, 3, 4, 5))
-#' distribution_mode(c(1.5, 2.3, 3.7, 3.7, 4.0, 5))
-#'
-#' @export
-distribution_mode <- function(x) {
-  uniqv <- unique(x)
-  tab <- tabulate(match(x, uniqv))
-  idx <- which.max(tab)
-  uniqv[idx]
-}
diff --git a/R/descriptives.R b/R/descriptives.R
@@ -0,0 +1,179 @@
+# distribution_mode ----------------------------------
+
+#' Compute mode for a statistical distribution
+#'
+#' @param x An atomic vector, a list, or a data frame.
+#'
+#' @return
+#'
+#' The value that appears most frequently in the provided data.
+#' The returned data structure will be the same as the entered one.
+#'
+#' @seealso For continuous variables, the
+#'   **Highest Maximum a Posteriori probability estimate (MAP)** may be
+#'   a more useful way to estimate the most commonly-observed value
+#'   than the mode. See [bayestestR::map_estimate()].
+#'
+#' @examples
+#'
+#' distribution_mode(c(1, 2, 3, 3, 4, 5))
+#' distribution_mode(c(1.5, 2.3, 3.7, 3.7, 4.0, 5))
+#'
+#' @export
+distribution_mode <- function(x) {
+  # TODO: Add support for weights, trim, binned (method)
+  # Count how often each distinct value occurs, in order of first appearance.
+  candidates <- unique(x)
+  counts <- tabulate(match(x, candidates))
+  # `which.max()` picks the first maximum, so ties go to the earliest value.
+  candidates[which.max(counts)]
+}
+
+#' Compute the coefficient of variation
+#'
+#' Compute the coefficient of variation (CV, ratio of the standard deviation to
+#' the mean, \eqn{\sigma/\mu}) for a set of numeric values.
+#'
+#' @return The computed coefficient of variation for `x`.
+#' @export
+#'
+#' @examples
+#' coef_var(1:10)
+#' coef_var(c(1:10, 100), method = "median_mad")
+#' coef_var(c(1:10, 100), method = "qcd")
+#' coef_var(mu = 10, sigma = 20)
+#' coef_var(mu = 10, sigma = 20, method = "unbiased", n = 30)
+coef_var <- function(x, ...) {
+  # S3 generic: the computation lives in the class-specific methods
+  # (`coef_var.numeric()`, with `coef_var.default()` as the fallback).
+  UseMethod("coef_var")
+}
+# `distribution_coef_var()` is the very same function object as `coef_var()`,
+# exported under a second name and documented on the `coef_var` help page.
+#' @name distribution_cv
+#' @rdname coef_var
+#' @export
+distribution_coef_var <- coef_var
+
+#' @export
+coef_var.default <- function(x, verbose = TRUE, ...) {
+  # Fallback for classes without a dedicated method: nothing can be computed,
+  # so optionally warn (naming the first class of `x`) and return `NULL`
+  # instead of raising an error.
+  if (verbose) {
+    msg <- paste0(
+      "Can't compute the coefficient of variation for objects of class '",
+      class(x)[1], "'."
+    )
+    warning(insight::format_message(msg), call. = FALSE)
+  }
+  NULL
+}
+
+#' @param x A numeric vector of ratio scale (see details), or vector of values that can be coerced to one.
+#' @param mu A numeric vector of mean values to use to compute the coefficient
+#'   of variation. If supplied, `x` is not used to compute the mean.
+#' @param sigma A numeric vector of standard deviation values to use to compute the coefficient
+#'   of variation. If supplied, `x` is not used to compute the SD.
+#' @param method Method to use to compute the CV. Can be `"standard"` to compute
+#'   by dividing the standard deviation by the mean, `"unbiased"` for the
+#'   unbiased estimator for normally distributed data, or one of two robust
+#'   alternatives: `"median_mad"` to divide the [stats::mad()] by the median,
+#'   or `"qcd"` (quartile coefficient of dispersion, interquartile range divided
+#'   by the sum of the quartiles \[twice the midhinge\]: \eqn{(Q_3 - Q_1)/(Q_3 + Q_1)}).
+#' @param trim the fraction (0 to 0.5) of values to be trimmed from
+#'   each end of `x` before the mean and standard deviation (or alternatives)
+#'   are computed. Values of `trim` outside the range of (0 to 0.5) are taken
+#'   as the nearest endpoint.
+#' @param na.rm Logical. Should `NA` values be removed before computing (`TRUE`)
+#'   or not (`FALSE`, default)?
+#' @param n If `method = "unbiased"` and both `mu` and `sigma` are provided (not
+#'   computed from `x`), what sample size to use to adjust the computed CV
+#'   for small-sample bias?
+#' @param ... Further arguments passed to computation functions.
+#'
+#' @details
+#' CV is only applicable to values taken on a ratio scale: values that have a
+#' *fixed* meaningfully defined 0 (which is either the lowest or highest
+#' possible value), and for which ratios between them are interpretable. For
+#' example, how many sandwiches have I eaten this week? 0 means "none" and 20
+#' sandwiches is 4 times more than 5 sandwiches. If I were to center the number
+#' of sandwiches, it will no longer be on a ratio scale (0 is no longer "none",
+#' it is the mean, and the ratio between 4 and -2 is not meaningful). Scaling a
+#' ratio scale still results in a ratio scale. So I can redefine the question
+#' as "how many half sandwiches did I eat this week?" (= sandwiches * 2) and 0
+#' would still mean "none", and 20 half-sandwiches is still 4 times more than
+#' 5 half-sandwiches.
+#'
+#' This means that CV is **NOT** invariant to shifting, but it is invariant to scaling:
+#' ```{r}
+#' sandwiches <- c(0, 4, 15, 0, 0, 5, 2, 7)
+#' coef_var(sandwiches)
+#'
+#' coef_var(sandwiches / 2) # same
+#'
+#' coef_var(sandwiches + 4) # different! 0 is no longer meaningful!
+#' ```
+#'
+#' @rdname coef_var
+#'
+#' @export
+coef_var.numeric <- function(x, mu = NULL, sigma = NULL,
+                             method = c("standard", "unbiased", "median_mad", "qcd"),
+                             trim = 0, na.rm = FALSE, n = NULL, ...) {
+  # TODO: Support weights
+  # `x` is optional when `mu` and `sigma` are both supplied, so the ratio-scale
+  # check must not touch a missing `x`. Values on both sides of zero mean that
+  # 0 is not an endpoint of the scale, in which case the CV is not meaningful.
+  if (!missing(x) && all(c(-1, 1) %in% sign(x))) {
+    stop("CV only applicable for ratio scale variables", call. = FALSE)
+  }
+  method <- match.arg(method, choices = c("standard", "unbiased", "median_mad", "qcd"))
+
+  # Prepare `x` only if at least one of the two quantities is estimated from it.
+  if (is.null(mu) || is.null(sigma)) {
+    if (isTRUE(na.rm)) {
+      x <- .drop_na(x)
+    }
+    # Any user-supplied `n` is replaced by the number of values in `x`
+    # (after optional NA removal, before trimming).
+    n <- length(x)
+    x <- .trim_values(x, trim = trim, n = n)
+  }
+
+  # Denominator: mean, median, or (for "qcd") the sum of the two quartiles.
+  if (is.null(mu)) {
+    mu <- switch(
+      method,
+      standard = , unbiased = mean(x, ...),
+      median_mad = stats::median(x, ...),
+      qcd = unname(sum(stats::quantile(x, probs = c(.25, .75), ...)))
+    )
+  }
+
+  # Numerator: SD, MAD around `mu`, or (for "qcd") the interquartile range.
+  if (is.null(sigma)) {
+    sigma <- switch(
+      method,
+      standard = , unbiased = stats::sd(x, ...),
+      median_mad = stats::mad(x, center = mu, ...),
+      qcd = unname(diff(stats::quantile(x, probs = c(.25, .75), ...)))
+    )
+  }
+
+  out <- sigma / mu
+
+  if (method == "unbiased") {
+    # `n` is still NULL only when both `mu` and `sigma` were supplied and the
+    # caller did not pass a sample size.
+    if (is.null(n)) {
+      stop(insight::format_message(
+        "A value for `n` must be provided when `method = \"unbiased\"` and both `mu` and `sigma` are provided."
+      ), call. = FALSE)
+    }
+    # from DescTools::CoefVar
+    out <- out * (1 - 1 / (4 * (n - 1)) + 1 / n * out^2 + 1 / (2 * (n - 1)^2))
+  }
+
+  out
+}
+
+
+
+
+# descriptives helpers
+
+# Return `x` with all missing (`NA`) elements removed.
+.drop_na <- function(x) {
+  keep <- !is.na(x)
+  x[keep]
+}
+
+# Trim a fraction of observations from both ends of a vector.
+#
+# `x` is the vector to trim, `trim` the fraction (a single number) removed from
+# each end, and `n` the number of observations (defaults to `length(x)`).
+# `weights` is accepted but not used yet. Returns `x` unchanged when `trim <= 0`
+# or there are no observations, `NA_real_` when trimming is requested and `x`
+# contains missing values, the median of `x` when `trim >= 0.5`, and otherwise
+# the values left between the two trimmed tails.
+.trim_values <- function(x, trim = 0, n = NULL, weights = NULL) {
+  # TODO: Support weights
+  if (!is.numeric(trim) || length(trim) != 1L) {
+    stop("`trim` must be a single numeric value.", call. = FALSE)
+  }
+  # Only fall back to `length(x)` when the caller did not supply `n`.
+  if (is.null(n)) {
+    n <- length(x)
+  }
+  if (trim > 0 && n > 0) {
+    if (anyNA(x)) return(NA_real_)
+    if (trim >= 0.5) return(stats::median(x, na.rm = FALSE))
+    # Positions of the first and last observation that survive the trimming.
+    lo <- floor(n * trim) + 1
+    hi <- n + 1 - lo
+    # A partial sort is enough to put the right values in positions lo..hi;
+    # the kept values themselves are not guaranteed to be fully ordered.
+    x <- sort.int(x, partial = unique(c(lo, hi)))[lo:hi]
+  }
+  x
+}
diff --git a/man/coef_var.Rd b/man/coef_var.Rd
diff --git a/man/distribution_mode.Rd b/man/distribution_mode.Rd
diff --git a/tests/testthat/test-coef_var.R b/tests/testthat/test-coef_var.R
@@ -0,0 +1,9 @@
+test_that("coefficient of variation works", {
+  # CV estimated from data, one expectation per `method`
+  clean <- 1:10
+  with_outlier <- c(1:10, 100)
+  expect_equal(coef_var(clean), 0.5504818826)
+  expect_equal(coef_var(clean, method = "unbiased"), 0.5552700246)
+  expect_equal(coef_var(with_outlier, method = "median_mad"), 0.7413)
+  expect_equal(coef_var(with_outlier, method = "qcd"), 0.4166666667)
+
+  # CV from pre-computed moments, without passing `x`
+  expect_equal(coef_var(mu = 10, sigma = 20), 2)
+  expect_equal(coef_var(mu = 10, sigma = 20, method = "unbiased", n = 30), 2.250614348)
+
+  # the `distribution_*` alias gives the same answer as `coef_var()`
+  expect_equal(distribution_coef_var(clean), 0.5504818826)
+})