diff --git a/R/data_summary.R b/R/data_summary.R index 3b02762ec..8d89b93ae 100644 --- a/R/data_summary.R +++ b/R/data_summary.R @@ -8,6 +8,9 @@ #' @param by Optional character string, indicating the name of a variable in `x`. #' If supplied, the data will be split by this variable and summary statistics #' will be computed for each group. +#' @param include_na Logical, if `TRUE`, missing values are included as a level +#' in the grouping variable. If `FALSE`, missing values are omitted from the +#' grouping variable. #' @param ... One or more named expressions that define the new variable name #' and the function to compute the summary statistic. Example: #' `mean_sepal_width = mean(Sepal.Width)`. The expression can also be provided @@ -42,8 +45,8 @@ data_summary <- function(x, ...) { #' @export -data_summary.matrix <- function(x, ..., by = NULL) { - data_summary(as.data.frame(x), ..., by = by) +data_summary.matrix <- function(x, ..., by = NULL, include_na = TRUE) { + data_summary(as.data.frame(x), ..., by = by, include_na = include_na) } @@ -55,7 +58,7 @@ data_summary.default <- function(x, ...) { #' @rdname data_summary #' @export -data_summary.data.frame <- function(x, ..., by = NULL) { +data_summary.data.frame <- function(x, ..., by = NULL, include_na = TRUE) { dots <- eval(substitute(alist(...))) # do we have any expression at all? @@ -81,8 +84,15 @@ data_summary.data.frame <- function(x, ..., by = NULL) { .misspelled_string(colnames(x), by_not_found, "Possibly misspelled?") ) } - # split data - split_data <- split(x, x[by]) + # split data, add NA levels, if requested + l <- lapply(x[by], function(i) { + if (include_na) { + addNA(i) + } else { + i + } + }) + split_data <- split(x, l, drop = TRUE) out <- lapply(split_data, function(s) { # no data for combination? Return NULL if (nrow(s) == 0) { @@ -100,6 +110,9 @@ data_summary.data.frame <- function(x, ..., by = NULL) { }) out <- do.call(rbind, out) } + # sort data + out <- data_arrange(out, select = by) + # data attributes class(out) <- c("dw_data_summary", "data.frame") rownames(out) <- NULL out @@ -107,7 +120,7 @@ data_summary.data.frame <- function(x, ..., by = NULL) { #' @export -data_summary.grouped_df <- function(x, ..., by = NULL) { +data_summary.grouped_df <- function(x, ..., by = NULL, include_na = TRUE) { # extract group variables grps <- attr(x, "groups", exact = TRUE) group_variables <- data_remove(grps, ".rows") @@ -118,7 +131,7 @@ data_summary.grouped_df <- function(x, ..., by = NULL) { # remove information specific to grouped df's attr(x, "groups") <- NULL class(x) <- "data.frame" - data_summary(x, ..., by = by) + data_summary(x, ..., by = by, include_na = include_na) } @@ -172,6 +185,6 @@ print.dw_data_summary <- function(x, ...) { if (nrow(x) == 0) { cat("No matches found.\n") } else { - cat(insight::export_table(x, ...)) + cat(insight::export_table(x, missing = "", ...)) } } diff --git a/man/data_summary.Rd b/man/data_summary.Rd index 6c0e89744..0602dc7da 100644 --- a/man/data_summary.Rd +++ b/man/data_summary.Rd @@ -7,7 +7,7 @@ \usage{ data_summary(x, ...) -\method{data_summary}{data.frame}(x, ..., by = NULL) +\method{data_summary}{data.frame}(x, ..., by = NULL, include_na = TRUE) } \arguments{ \item{x}{A (grouped) data frame.} @@ -20,6 +20,10 @@ as a character string, e.g. \code{"mean_sepal_width = mean(Sepal.Width)"}.} \item{by}{Optional character string, indicating the name of a variable in \code{x}. If supplied, the data will be split by this variable and summary statistics will be computed for each group.} + +\item{include_na}{Logical, if \code{TRUE}, missing values are included as a level +in the grouping variable. If \code{FALSE}, missing values are omitted from the +grouping variable.} } \value{ A data frame with the requested summary statistics. diff --git a/tests/testthat/_snaps/data_summary.md b/tests/testthat/_snaps/data_summary.md index 65d9e82f2..44b7f3f4c 100644 --- a/tests/testthat/_snaps/data_summary.md +++ b/tests/testthat/_snaps/data_summary.md @@ -10,3 +10,46 @@ 1 | 4 | 26.27 | 5.41 1 | 5 | 21.38 | 6.66 +# data_summary, with NA + + Code + print(out) + Output + c172code | MW + ---------------- + 1 | 87.12 + 2 | 94.05 + 3 | 75.00 + | 47.80 + +--- + + Code + print(out) + Output + c172code | MW + ---------------- + 1 | 87.12 + 2 | 94.05 + 3 | 75.00 + +--- + + Code + print(out) + Output + e42dep | c172code | MW + -------------------------- + 1 | 2 | 17.00 + 2 | 2 | 34.25 + 3 | 1 | 39.50 + 3 | 2 | 52.44 + 3 | 3 | 52.00 + 3 | | 84.00 + 4 | 1 | 134.75 + 4 | 2 | 119.26 + 4 | 3 | 88.80 + 4 | | 43.29 + | 2 | + | | 7.00 + diff --git a/tests/testthat/test-data_summary.R b/tests/testthat/test-data_summary.R index b82d78095..23eb02a7c 100644 --- a/tests/testthat/test-data_summary.R +++ b/tests/testthat/test-data_summary.R @@ -146,6 +146,18 @@ test_that("data_summary, print", { }) +test_that("data_summary, with NA", { + data(efc, package = "datawizard") + out <- data_summary(efc, MW = mean(c12hour, na.rm = TRUE), by = "c172code") + expect_snapshot(print(out)) + out <- data_summary(efc, MW = mean(c12hour, na.rm = TRUE), by = "c172code", include_na = FALSE) + expect_snapshot(print(out)) + # sorting for multiple groups + out <- data_summary(efc, MW = mean(c12hour, na.rm = TRUE), by = c("e42dep", "c172code")) + expect_snapshot(print(out)) +}) + + test_that("data_summary, inside functions", { foo1 <- function(x, ...) { datawizard::data_summary(x, ..., by = "Species")