Draft new data_summary() function (#482)

* Draft new `data_summary()` function * check if we can avoid duplicated code * pkgdown * fix * fixes * lintr * add tests * code style * desc, news * fix * add print method and snapshot test * add test * correct english form * test * include NA, sort output * add test * meaningful code comments * add test * allow n() * docs * only n() * informative error for incorrect length, add "value_at()" function * pkgdown * docs, test, examples, xref * add test * fix tests * fix * lintr * lintr * lintr * lintr (reduce complexity) * fix * fix * address comments * use text_concatenate * remove value_at, move code into dev-folder * update docs * update test
easystats · Mar 4, 2024 · 7a1f372 · 7a1f372
1 parent 83f9703
commit 7a1f372
Show file tree

Hide file tree

Showing 15 changed files with 805 additions and 138 deletions.
diff --git a/.dev/test-value_at.R b/.dev/test-value_at.R
@@ -0,0 +1,12 @@
+test_that("value_at", {
+  data(efc, package = "datawizard")
+  expect_equal(value_at(efc$e42dep, 5), 4, ignore_attr = TRUE)
+  expect_equal(value_at(efc$c12hour, 4), NA_real_, ignore_attr = TRUE)
+  expect_equal(value_at(efc$c12hour, 4, remove_na = TRUE), 168, ignore_attr = TRUE)
+  expect_equal(value_at(efc$c12hour, 5:7), efc$c12hour[5:7], ignore_attr = TRUE)
+  expect_equal(value_at(efc$e42dep, 123456, default = 55), 55, ignore_attr = TRUE)
+  expect_null(value_at(efc$e42dep, 123456))
+  expect_null(value_at(efc$e42dep, NULL))
+  expect_error(value_at(efc$e42dep, NA), regex = "`position` can't")
+  expect_error(value_at(efc$e42dep, c(3, NA)), regex = "`position` can't")
+})
diff --git a/.dev/value_at.R b/.dev/value_at.R
@@ -0,0 +1,52 @@
+#' @title Find the value(s) at a specific position in a variable
+#' @name value_at
+#'
+#' @description This function can be used to extract one or more values at a
+#' specific position in a variable.
+#'
+#' @param x A vector or factor.
+#' @param position An integer or a vector of integers, indicating the position(s)
+#' of the value(s) to be returned. Negative values are counted from the end of
+#' the vector. If `NA`, an error is thrown.
+#' @param remove_na Logical, if `TRUE`, missing values are removed before
+#' computing the position. If `FALSE`, missing values are included in the
+#' computation.
+#' @param default The value to be returned if the position is out of range.
+#'
+#' @seealso `data_summary()` to use `value_at()` inside a `data_summary()` call.
+#'
+#' @return A vector with the value(s) at the specified position(s).
+#'
+#' @examples
+#' data(mtcars)
+#' # 5th value
+#' value_at(mtcars$mpg, 5)
+#' # last value
+#' value_at(mtcars$mpg, -1)
+#' # out of range, return default
+#' value_at(mtcars$mpg, 150)
+#' # return 2nd and fifth value
+#' value_at(mtcars$mpg, c(2, 5))
+#' @export
+value_at <- function(x, position = 1, default = NULL, remove_na = FALSE) {
+  if (remove_na) {
+    x <- x[!is.na(x)]
+  }
+  n <- length(x)
+  unlist(lapply(position, .values_at, x = x, n = n, default = default), use.names = FALSE)
+}
+
+# helper ----
+
+.values_at <- function(x, position, n, default) {
+  if (is.na(position)) {
+    insight::format_error("`position` can't be `NA`.")
+  }
+  if (position < 0L) {
+    position <- position + n + 1
+  }
+  if (position <= 0 || position > n) {
+    return(default)
+  }
+  x[position]
+}
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Type: Package
 Package: datawizard
 Title: Easy Data Wrangling and Statistical Transformations
-Version: 0.9.1.4
+Version: 0.9.1.5
 Authors@R: c(
     person("Indrajeet", "Patil", , "[email protected]", role = "aut",
            comment = c(ORCID = "0000-0003-1995-6531", Twitter = "@patilindrajeets")),

diff --git a/NAMESPACE b/NAMESPACE
@@ -52,6 +52,10 @@ S3method(data_modify,data.frame)
 S3method(data_modify,default)
 S3method(data_modify,grouped_df)
 S3method(data_peek,data.frame)
+S3method(data_summary,data.frame)
+S3method(data_summary,default)
+S3method(data_summary,grouped_df)
+S3method(data_summary,matrix)
 S3method(data_tabulate,data.frame)
 S3method(data_tabulate,default)
 S3method(data_tabulate,grouped_df)
@@ -90,6 +94,7 @@ S3method(plot,visualisation_recipe)
 S3method(print,data_codebook)
 S3method(print,data_seek)
 S3method(print,dw_data_peek)
+S3method(print,dw_data_summary)
 S3method(print,dw_data_tabulate)
 S3method(print,dw_data_tabulates)
 S3method(print,dw_data_xtabulate)
@@ -249,6 +254,7 @@ export(data_rotate)
 export(data_seek)
 export(data_select)
 export(data_separate)
+export(data_summary)
 export(data_tabulate)
 export(data_to_long)
 export(data_to_wide)

diff --git a/NEWS.md b/NEWS.md
@@ -1,5 +1,9 @@
 # datawizard 0.9.2
 
+NEW FUNCTIONS
+
+* `data_summary()`, to compute summary statistics of (grouped) data frames.
+
 CHANGES
 
 * `data_modify()` gets three new arguments, `.at`, `.if` and `.modify`, to modify

diff --git a/R/adjust.R b/R/adjust.R
@@ -124,11 +124,11 @@ adjust <- function(data,
       predictors[predictors == predictors_num] <- paste0("s(", predictors_num, ")")
     }
     formula_predictors <- paste(c("1", predictors), collapse = " + ")
-    formula <- paste(var, "~", formula_predictors)
+    model_formula <- paste(var, "~", formula_predictors)
 
     x <- .model_adjust_for(
       data = data[unique(c(var, effect, facs))],
-      formula,
+      model_formula = model_formula,
       multilevel = multilevel,
       additive = additive,
       bayesian = bayesian,
@@ -148,7 +148,7 @@ data_adjust <- adjust
 
 #' @keywords internal
 .model_adjust_for <- function(data,
-                              formula,
+                              model_formula,
                               multilevel = FALSE,
                               additive = FALSE,
                               bayesian = FALSE,
@@ -159,32 +159,28 @@ data_adjust <- adjust
     # Bayesian
     if (bayesian) {
       insight::check_if_installed("rstanarm")
-      model <- rstanarm::stan_gamm4(stats::as.formula(formula), random = formula_random, data = data, refresh = 0)
+      model <- rstanarm::stan_gamm4(stats::as.formula(model_formula), random = formula_random, data = data, refresh = 0)
       # Frequentist
     } else {
       insight::check_if_installed("gamm4")
-      model <- gamm4::gamm4(stats::as.formula(formula), random = formula_random, data = data)
+      model <- gamm4::gamm4(stats::as.formula(model_formula), random = formula_random, data = data)
     }
 
     # Linear -------------------------
-  } else {
+  } else if (bayesian) {
     # Bayesian
-    if (bayesian) {
-      insight::check_if_installed("rstanarm")
-      if (multilevel) {
-        model <- rstanarm::stan_lmer(paste(formula, formula_random), data = data, refresh = 0)
-      } else {
-        model <- rstanarm::stan_glm(formula, data = data, refresh = 0)
-      }
-      # Frequentist
+    insight::check_if_installed("rstanarm")
+    if (multilevel) {
+      model <- rstanarm::stan_lmer(paste(model_formula, formula_random), data = data, refresh = 0)
     } else {
-      if (multilevel) {
-        insight::check_if_installed("lme4")
-        model <- lme4::lmer(paste(formula, formula_random), data = data)
-      } else {
-        model <- stats::lm(formula, data = data)
-      }
+      model <- rstanarm::stan_glm(model_formula, data = data, refresh = 0)
     }
+  } else if (multilevel) {
+    # Frequentist
+    insight::check_if_installed("lme4")
+    model <- lme4::lmer(paste(model_formula, formula_random), data = data)
+  } else {
+    model <- stats::lm(model_formula, data = data)
   }
 
   adjusted <- insight::get_residuals(model)

diff --git a/R/assign_labels.R b/R/assign_labels.R
@@ -79,7 +79,7 @@ assign_labels.numeric <- function(x, variable = NULL, values = NULL, ...) {
       attr(x, "label") <- variable
     } else {
       insight::format_error(
-        "Variable labels (argument `variable`) must be provided as a single character string, e.g. `variable = \"mylabel\"`."
+        "Variable labels (argument `variable`) must be provided as a single character string, e.g. `variable = \"mylabel\"`." # nolint
       )
     }
   }
@@ -88,13 +88,13 @@ assign_labels.numeric <- function(x, variable = NULL, values = NULL, ...) {
   if (!is.null(values)) {
     # extract unique values
     unique_values <- as.vector(sort(stats::na.omit(unique(x))))
-    labels <- NULL
+    value_labels <- NULL
 
     # do we have a names vector for "values"?
     # else check if number of labels and values match
     if (is.null(names(values))) {
       if (length(values) == length(unique_values)) {
-        labels <- stats::setNames(unique_values, values)
+        value_labels <- stats::setNames(unique_values, values)
       } else {
         insight::format_error(
           "Cannot add labels. Number of unique values and number of value labels are not equal.",
@@ -114,11 +114,11 @@ assign_labels.numeric <- function(x, variable = NULL, values = NULL, ...) {
 
       if (length(values)) {
         # we need to switch names and values
-        labels <- stats::setNames(coerce_to_numeric(names(values)), values)
+        value_labels <- stats::setNames(coerce_to_numeric(names(values)), values)
       }
     }
 
-    attr(x, "labels") <- labels
+    attr(x, "labels") <- value_labels
   }
 
   x

diff --git a/R/categorize.R b/R/categorize.R
@@ -145,28 +145,8 @@ categorize.numeric <- function(x,
                                labels = NULL,
                                verbose = TRUE,
                                ...) {
-  # check arguments
-  if (is.character(split)) {
-    split <- match.arg(
-      split,
-      choices = c(
-        "median", "mean", "quantile", "equal_length", "equal_range",
-        "equal", "equal_distance", "range", "distance"
-      )
-    )
-  }
-
-  if (is.character(split) && split %in% c("quantile", "equal_length") && is.null(n_groups)) {
-    insight::format_error(
-      "Recoding based on quantiles or equal-sized groups requires the `n_groups` argument to be specified."
-    )
-  }
-
-  if (is.character(split) && split == "equal_range" && is.null(n_groups) && is.null(range)) {
-    insight::format_error(
-      "Recoding into groups with equal range requires either the `range` or `n_groups` argument to be specified."
-    )
-  }
+  # sanity check
+  split <- .sanitize_split_arg(split, n_groups, range)
 
   # handle aliases
   if (identical(split, "equal_length")) split <- "length"
@@ -221,28 +201,7 @@ categorize.numeric <- function(x,
   original_x[!is.na(original_x)] <- out
 
   # turn into factor?
-  if (!is.null(labels)) {
-    if (length(labels) == length(unique(out))) {
-      original_x <- as.factor(original_x)
-      levels(original_x) <- labels
-    } else if (length(labels) == 1 && labels %in% c("mean", "median")) {
-      original_x <- as.factor(original_x)
-      no_na_x <- original_x[!is.na(original_x)]
-      if (labels == "mean") {
-        labels <- stats::aggregate(x, list(no_na_x), FUN = mean, na.rm = TRUE)$x
-      } else {
-        labels <- stats::aggregate(x, list(no_na_x), FUN = stats::median, na.rm = TRUE)$x
-      }
-      levels(original_x) <- insight::format_value(labels, ...)
-    } else if (isTRUE(verbose)) {
-      insight::format_warning(
-        "Argument `labels` and levels of the recoded variable are not of the same length.",
-        "Variable will not be converted to factor."
-      )
-    }
-  }
-
-  original_x
+  .original_x_to_factor(original_x, x, labels, out, verbose, ...)
 }
 
 
@@ -283,15 +242,15 @@ categorize.data.frame <- function(x,
   # create the new variables and updates "select", so new variables are processed
   if (!isFALSE(append)) {
     # process arguments
-    args <- .process_append(
+    my_args <- .process_append(
       x,
       select,
       append,
       append_suffix = "_r"
     )
     # update processed arguments
-    x <- args$x
-    select <- args$select
+    x <- my_args$x
+    select <- my_args$select
   }
 
   x[select] <- lapply(
@@ -342,15 +301,15 @@ categorize.grouped_df <- function(x,
   # create the new variables and updates "select", so new variables are processed
   if (!isFALSE(append)) {
     # process arguments
-    args <- .process_append(
+    my_args <- .process_append(
       x,
       select,
       append,
       append_suffix = "_r"
     )
     # update processed arguments
-    x <- args$x
-    select <- args$select
+    x <- my_args$x
+    select <- my_args$select
   }
 
   x <- as.data.frame(x)
@@ -387,3 +346,56 @@ categorize.grouped_df <- function(x,
   }
   seq(lowest, max(x), by = range)
 }
+
+
+.sanitize_split_arg <- function(split, n_groups, range) {
+  # check arguments
+  if (is.character(split)) {
+    split <- match.arg(
+      split,
+      choices = c(
+        "median", "mean", "quantile", "equal_length", "equal_range",
+        "equal", "equal_distance", "range", "distance"
+      )
+    )
+  }
+
+  if (is.character(split) && split %in% c("quantile", "equal_length") && is.null(n_groups)) {
+    insight::format_error(
+      "Recoding based on quantiles or equal-sized groups requires the `n_groups` argument to be specified."
+    )
+  }
+
+  if (is.character(split) && split == "equal_range" && is.null(n_groups) && is.null(range)) {
+    insight::format_error(
+      "Recoding into groups with equal range requires either the `range` or `n_groups` argument to be specified."
+    )
+  }
+
+  split
+}
+
+
+.original_x_to_factor <- function(original_x, x, labels, out, verbose, ...) {
+  if (!is.null(labels)) {
+    if (length(labels) == length(unique(out))) {
+      original_x <- as.factor(original_x)
+      levels(original_x) <- labels
+    } else if (length(labels) == 1 && labels %in% c("mean", "median")) {
+      original_x <- as.factor(original_x)
+      no_na_x <- original_x[!is.na(original_x)]
+      if (labels == "mean") {
+        labels <- stats::aggregate(x, list(no_na_x), FUN = mean, na.rm = TRUE)$x
+      } else {
+        labels <- stats::aggregate(x, list(no_na_x), FUN = stats::median, na.rm = TRUE)$x
+      }
+      levels(original_x) <- insight::format_value(labels, ...)
+    } else if (isTRUE(verbose)) {
+      insight::format_warning(
+        "Argument `labels` and levels of the recoded variable are not of the same length.",
+        "Variable will not be converted to factor."
+      )
+    }
+  }
+  original_x
+}