include NA, sort output

easystats · Mar 3, 2024 · ce578d5 · ce578d5
1 parent b3c7628
commit ce578d5
Show file tree

Hide file tree

Showing 4 changed files with 81 additions and 9 deletions.
diff --git a/R/data_summary.R b/R/data_summary.R
@@ -8,6 +8,9 @@
 #' @param by Optional character string, indicating the name of a variable in `x`.
 #' If supplied, the data will be split by this variable and summary statistics
 #' will be computed for each group.
+#' @param include_na Logical, if `TRUE`, missing values are included as a level
+#' in the grouping variable. If `FALSE`, missing values are omitted from the
+#' grouping variable.
 #' @param ... One or more named expressions that define the new variable name
 #' and the function to compute the summary statistic. Example:
 #' `mean_sepal_width = mean(Sepal.Width)`. The expression can also be provided
@@ -42,8 +45,8 @@ data_summary <- function(x, ...) {
 
 
 #' @export
-data_summary.matrix <- function(x, ..., by = NULL) {
-  data_summary(as.data.frame(x), ..., by = by)
+data_summary.matrix <- function(x, ..., by = NULL, include_na = TRUE) {
+  data_summary(as.data.frame(x), ..., by = by, include_na = include_na)
 }
 
 
@@ -55,7 +58,7 @@ data_summary.default <- function(x, ...) {
 
 #' @rdname data_summary
 #' @export
-data_summary.data.frame <- function(x, ..., by = NULL) {
+data_summary.data.frame <- function(x, ..., by = NULL, include_na = TRUE) {
   dots <- eval(substitute(alist(...)))
 
   # do we have any expression at all?
@@ -81,8 +84,15 @@ data_summary.data.frame <- function(x, ..., by = NULL) {
         .misspelled_string(colnames(x), by_not_found, "Possibly misspelled?")
       )
     }
-    # split data
-    split_data <- split(x, x[by])
+    # split data, add NA levels, if requested
+    l <- lapply(x[by], function(i) {
+      if (include_na) {
+        addNA(i)
+      } else {
+        i
+      }
+    })
+    split_data <- split(x, l, drop = TRUE)
     out <- lapply(split_data, function(s) {
       # no data for combination? Return NULL
       if (nrow(s) == 0) {
@@ -100,14 +110,17 @@ data_summary.data.frame <- function(x, ..., by = NULL) {
     })
     out <- do.call(rbind, out)
   }
+  # sort data
+  out <- data_arrange(out, select = by)
+  # data attributes
   class(out) <- c("dw_data_summary", "data.frame")
   rownames(out) <- NULL
   out
 }
 
 
 #' @export
-data_summary.grouped_df <- function(x, ..., by = NULL) {
+data_summary.grouped_df <- function(x, ..., by = NULL, include_na = TRUE) {
   # extract group variables
   grps <- attr(x, "groups", exact = TRUE)
   group_variables <- data_remove(grps, ".rows")
@@ -118,7 +131,7 @@ data_summary.grouped_df <- function(x, ..., by = NULL) {
   # remove information specific to grouped df's
   attr(x, "groups") <- NULL
   class(x) <- "data.frame"
-  data_summary(x, ..., by = by)
+  data_summary(x, ..., by = by, include_na = include_na)
 }
 
 
@@ -172,6 +185,6 @@ print.dw_data_summary <- function(x, ...) {
   if (nrow(x) == 0) {
     cat("No matches found.\n")
   } else {
-    cat(insight::export_table(x, ...))
+    cat(insight::export_table(x, missing = "<NA>", ...))
   }
 }
diff --git a/man/data_summary.Rd b/man/data_summary.Rd
diff --git a/tests/testthat/_snaps/data_summary.md b/tests/testthat/_snaps/data_summary.md
@@ -10,3 +10,46 @@
        1 |    4 | 26.27 | 5.41
        1 |    5 | 21.38 | 6.66
 
+# data_summary, with NA
+
+    Code
+      print(out)
+    Output
+      c172code |    MW
+      ----------------
+             1 | 87.12
+             2 | 94.05
+             3 | 75.00
+          <NA> | 47.80
+
+---
+
+    Code
+      print(out)
+    Output
+      c172code |    MW
+      ----------------
+             1 | 87.12
+             2 | 94.05
+             3 | 75.00
+
+---
+
+    Code
+      print(out)
+    Output
+      e42dep | c172code |     MW
+      --------------------------
+      1      |        2 |  17.00
+      2      |        2 |  34.25
+      3      |        1 |  39.50
+      3      |        2 |  52.44
+      3      |        3 |  52.00
+      3      |     <NA> |  84.00
+      4      |        1 | 134.75
+      4      |        2 | 119.26
+      4      |        3 |  88.80
+      4      |     <NA> |  43.29
+      <NA>   |        2 |   <NA>
+      <NA>   |     <NA> |   7.00
+
diff --git a/tests/testthat/test-data_summary.R b/tests/testthat/test-data_summary.R
@@ -146,6 +146,18 @@ test_that("data_summary, print", {
 })
 
 
+test_that("data_summary, with NA", {
+  data(efc, package = "datawizard")
+  out <- data_summary(efc, MW = mean(c12hour, na.rm = TRUE), by = "c172code")
+  expect_snapshot(print(out))
+  out <- data_summary(efc, MW = mean(c12hour, na.rm = TRUE), by = "c172code", include_na = FALSE)
+  expect_snapshot(print(out))
+  # sorting for multiple groups
+  out <- data_summary(efc, MW = mean(c12hour, na.rm = TRUE), by = c("e42dep", "c172code"))
+  expect_snapshot(print(out))
+})
+
+
 test_that("data_summary, inside functions", {
   foo1 <- function(x, ...) {
     datawizard::data_summary(x, ..., by = "Species")