From e2d6091ecec3be3857910f21d2576e0375e3ddbb Mon Sep 17 00:00:00 2001 From: Daniel Date: Sat, 22 Jun 2024 15:53:38 +0200 Subject: [PATCH 1/6] Unexpected missing values in `data_tabulate()` Fixes #514 --- R/data_summary.R | 14 +++++++------- R/data_tabulate.R | 26 ++++++++++++------------- R/data_xtabulate.R | 6 +++--- man/data_summary.Rd | 4 ++-- man/data_tabulate.Rd | 12 ++++++------ tests/testthat/test-data_summary.R | 2 +- tests/testthat/test-data_tabulate.R | 30 ++++++++++++++--------------- 7 files changed, 47 insertions(+), 47 deletions(-) diff --git a/R/data_summary.R b/R/data_summary.R index 8d15f8483..61cb3a04c 100644 --- a/R/data_summary.R +++ b/R/data_summary.R @@ -8,7 +8,7 @@ #' @param by Optional character string, indicating the name of a variable in `x`. #' If supplied, the data will be split by this variable and summary statistics #' will be computed for each group. -#' @param include_na Logical. If `TRUE`, missing values are included as a level +#' @param remove_na Logical. If `TRUE`, missing values are included as a level #' in the grouping variable. If `FALSE`, missing values are omitted from the #' grouping variable. #' @param ... One or more named expressions that define the new variable name @@ -57,8 +57,8 @@ data_summary <- function(x, ...) { #' @export -data_summary.matrix <- function(x, ..., by = NULL, include_na = TRUE) { - data_summary(as.data.frame(x), ..., by = by, include_na = include_na) +data_summary.matrix <- function(x, ..., by = NULL, remove_na = FALSE) { + data_summary(as.data.frame(x), ..., by = by, remove_na = remove_na) } @@ -70,7 +70,7 @@ data_summary.default <- function(x, ...) { #' @rdname data_summary #' @export -data_summary.data.frame <- function(x, ..., by = NULL, include_na = TRUE) { +data_summary.data.frame <- function(x, ..., by = NULL, remove_na = FALSE) { dots <- eval(substitute(alist(...))) # do we have any expression at all? @@ -103,7 +103,7 @@ data_summary.data.frame <- function(x, ..., by = NULL, include_na = TRUE) { } # split data, add NA levels, if requested l <- lapply(x[by], function(i) { - if (include_na && anyNA(i)) { + if (remove_na && anyNA(i)) { addNA(i) } else { i @@ -137,7 +137,7 @@ data_summary.data.frame <- function(x, ..., by = NULL, include_na = TRUE) { #' @export -data_summary.grouped_df <- function(x, ..., by = NULL, include_na = TRUE) { +data_summary.grouped_df <- function(x, ..., by = NULL, remove_na = FALSE) { # extract group variables grps <- attr(x, "groups", exact = TRUE) group_variables <- data_remove(grps, ".rows") @@ -148,7 +148,7 @@ data_summary.grouped_df <- function(x, ..., by = NULL, include_na = TRUE) { # remove information specific to grouped df's attr(x, "groups") <- NULL class(x) <- "data.frame" - data_summary(x, ..., by = by, include_na = include_na) + data_summary(x, ..., by = by, remove_na = remove_na) } diff --git a/R/data_tabulate.R b/R/data_tabulate.R index 74f4f2e03..08b03e706 100644 --- a/R/data_tabulate.R +++ b/R/data_tabulate.R @@ -15,7 +15,7 @@ #' factor levels are dropped from the frequency table. #' @param name Optional character string, which includes the name that is used #' for printing. -#' @param include_na Logical, if `TRUE`, missing values are included in the +#' @param remove_na Logical, if `TRUE`, missing values are included in the #' frequency or crosstable, else missing values are omitted. #' @param collapse Logical, if `TRUE` collapses multiple tables into one larger #' table for printing. This affects only printing, not the returned object. @@ -40,7 +40,7 @@ #' (missing) values by default. The first column indicates values of `x`, the #' first row indicates values of `by` (including missing values). The last row #' and column contain the total frequencies for each row and column, respectively. -#' Setting `include_na = FALSE` will omit missing values from the crosstable. +#' Setting `remove_na = FALSE` will omit missing values from the crosstable. #' Setting `proportions` to `"row"` or `"column"` will add row or column #' percentages. Setting `proportions` to `"full"` will add relative frequencies #' for the full table. @@ -62,7 +62,7 @@ #' data_tabulate(efc$c172code) #' #' # drop missing values -#' data_tabulate(efc$c172code, include_na = FALSE) +#' data_tabulate(efc$c172code, remove_na = FALSE) #' #' # data frame #' data_tabulate(efc, c("e42dep", "c172code")) @@ -109,7 +109,7 @@ #' efc$c172code, #' by = efc$e16sex, #' proportions = "column", -#' include_na = FALSE +#' remove_na = FALSE #' ) #' #' # round percentages @@ -133,7 +133,7 @@ data_tabulate.default <- function(x, by = NULL, drop_levels = FALSE, weights = NULL, - include_na = TRUE, + remove_na = FALSE, proportions = NULL, name = NULL, verbose = TRUE, @@ -163,7 +163,7 @@ data_tabulate.default <- function(x, x, by = by, weights = weights, - include_na = include_na, + remove_na = remove_na, proportions = proportions, obj_name = obj_name, group_variable = group_variable @@ -172,12 +172,12 @@ data_tabulate.default <- function(x, # frequency table if (is.null(weights)) { - if (include_na) { + if (remove_na) { freq_table <- tryCatch(table(addNA(x)), error = function(e) NULL) } else { freq_table <- tryCatch(table(x), error = function(e) NULL) } - } else if (include_na) { + } else if (remove_na) { # weighted frequency table, including NA freq_table <- tryCatch( stats::xtabs( @@ -218,7 +218,7 @@ data_tabulate.default <- function(x, out$`Raw %` <- 100 * out$N / sum(out$N) # if we have missing values, we add a row with NA - if (include_na) { + if (remove_na) { out$`Valid %` <- c(100 * out$N[-nrow(out)] / sum(out$N[-nrow(out)]), NA) valid_n <- sum(out$N[-length(out$N)], na.rm = TRUE) } else { @@ -271,7 +271,7 @@ data_tabulate.data.frame <- function(x, by = NULL, drop_levels = FALSE, weights = NULL, - include_na = TRUE, + remove_na = FALSE, proportions = NULL, collapse = FALSE, verbose = TRUE, @@ -297,7 +297,7 @@ data_tabulate.data.frame <- function(x, proportions = proportions, drop_levels = drop_levels, weights = weights, - include_na = include_na, + remove_na = remove_na, name = i, verbose = verbose, ... @@ -326,7 +326,7 @@ data_tabulate.grouped_df <- function(x, proportions = NULL, drop_levels = FALSE, weights = NULL, - include_na = TRUE, + remove_na = FALSE, collapse = FALSE, verbose = TRUE, ...) { @@ -362,7 +362,7 @@ data_tabulate.grouped_df <- function(x, verbose = verbose, drop_levels = drop_levels, weights = weights, - include_na = include_na, + remove_na = remove_na, by = by, proportions = proportions, group_variable = group_variable, diff --git a/R/data_xtabulate.R b/R/data_xtabulate.R index 5c387ff95..81ba26c59 100644 --- a/R/data_xtabulate.R +++ b/R/data_xtabulate.R @@ -3,7 +3,7 @@ .crosstable <- function(x, by, weights = NULL, - include_na = TRUE, + remove_na = FALSE, proportions = NULL, obj_name = NULL, group_variable = NULL) { @@ -12,12 +12,12 @@ } # frequency table if (is.null(weights)) { - if (include_na) { + if (remove_na) { x_table <- tryCatch(table(addNA(x), addNA(by)), error = function(e) NULL) } else { x_table <- tryCatch(table(x, by), error = function(e) NULL) } - } else if (include_na) { + } else if (remove_na) { # weighted frequency table, including NA x_table <- tryCatch( stats::xtabs( diff --git a/man/data_summary.Rd b/man/data_summary.Rd index ccbf4c524..70d2e2576 100644 --- a/man/data_summary.Rd +++ b/man/data_summary.Rd @@ -7,7 +7,7 @@ \usage{ data_summary(x, ...) -\method{data_summary}{data.frame}(x, ..., by = NULL, include_na = TRUE) +\method{data_summary}{data.frame}(x, ..., by = NULL, remove_na = FALSE) } \arguments{ \item{x}{A (grouped) data frame.} @@ -22,7 +22,7 @@ summary function \code{n()} can be used to count the number of observations.} If supplied, the data will be split by this variable and summary statistics will be computed for each group.} -\item{include_na}{Logical. If \code{TRUE}, missing values are included as a level +\item{remove_na}{Logical. If \code{TRUE}, missing values are included as a level in the grouping variable. If \code{FALSE}, missing values are omitted from the grouping variable.} } diff --git a/man/data_tabulate.Rd b/man/data_tabulate.Rd index 3f17bb21c..58a0c2954 100644 --- a/man/data_tabulate.Rd +++ b/man/data_tabulate.Rd @@ -14,7 +14,7 @@ data_tabulate(x, ...) by = NULL, drop_levels = FALSE, weights = NULL, - include_na = TRUE, + remove_na = FALSE, proportions = NULL, name = NULL, verbose = TRUE, @@ -30,7 +30,7 @@ data_tabulate(x, ...) by = NULL, drop_levels = FALSE, weights = NULL, - include_na = TRUE, + remove_na = FALSE, proportions = NULL, collapse = FALSE, verbose = TRUE, @@ -62,7 +62,7 @@ factor levels are dropped from the frequency table.} \item{weights}{Optional numeric vector of weights. Must be of the same length as \code{x}. If \code{weights} is supplied, weighted frequencies are calculated.} -\item{include_na}{Logical, if \code{TRUE}, missing values are included in the +\item{remove_na}{Logical, if \code{TRUE}, missing values are included in the frequency or crosstable, else missing values are omitted.} \item{proportions}{Optional character string, indicating the type of @@ -173,7 +173,7 @@ If \code{by} is supplied, a crosstable is created. The crosstable includes \verb (missing) values by default. The first column indicates values of \code{x}, the first row indicates values of \code{by} (including missing values). The last row and column contain the total frequencies for each row and column, respectively. -Setting \code{include_na = FALSE} will omit missing values from the crosstable. +Setting \code{remove_na = FALSE} will omit missing values from the crosstable. Setting \code{proportions} to \code{"row"} or \code{"column"} will add row or column percentages. Setting \code{proportions} to \code{"full"} will add relative frequencies for the full table. @@ -189,7 +189,7 @@ data(efc) data_tabulate(efc$c172code) # drop missing values -data_tabulate(efc$c172code, include_na = FALSE) +data_tabulate(efc$c172code, remove_na = FALSE) # data frame data_tabulate(efc, c("e42dep", "c172code")) @@ -236,7 +236,7 @@ data_tabulate( efc$c172code, by = efc$e16sex, proportions = "column", - include_na = FALSE + remove_na = FALSE ) # round percentages diff --git a/tests/testthat/test-data_summary.R b/tests/testthat/test-data_summary.R index 746d4c51a..c8fac4a88 100644 --- a/tests/testthat/test-data_summary.R +++ b/tests/testthat/test-data_summary.R @@ -175,7 +175,7 @@ test_that("data_summary, with NA", { data(efc, package = "datawizard") out <- data_summary(efc, MW = mean(c12hour, na.rm = TRUE), by = "c172code") expect_snapshot(print(out)) - out <- data_summary(efc, MW = mean(c12hour, na.rm = TRUE), by = "c172code", include_na = FALSE) + out <- data_summary(efc, MW = mean(c12hour, na.rm = TRUE), by = "c172code", remove_na = FALSE) expect_snapshot(print(out)) # sorting for multiple groups out <- data_summary(efc, MW = mean(c12hour, na.rm = TRUE), by = c("e42dep", "c172code")) diff --git a/tests/testthat/test-data_tabulate.R b/tests/testthat/test-data_tabulate.R index 5108e29c2..6c443a860 100644 --- a/tests/testthat/test-data_tabulate.R +++ b/tests/testthat/test-data_tabulate.R @@ -287,11 +287,11 @@ test_that("data_tabulate exclude/include missing values", { efc$e16sex[sample.int(nrow(efc), 5)] <- NA out <- data_tabulate(efc$c172code) expect_identical(out$N, c(8L, 66L, 16L, 10L)) - out <- data_tabulate(efc$c172code, include_na = FALSE) + out <- data_tabulate(efc$c172code, remove_na = FALSE) expect_identical(out$N, c(8L, 66L, 16L)) out <- data_tabulate(efc$c172code, weights = efc$weights) expect_identical(out$N, c(10, 67, 15, 13)) - out <- data_tabulate(efc$c172code, include_na = FALSE, weights = efc$weights) + out <- data_tabulate(efc$c172code, remove_na = FALSE, weights = efc$weights) expect_identical(out$N, c(10, 67, 15)) }) @@ -305,17 +305,17 @@ test_that("data_tabulate, cross tables", { efc$e16sex[sample.int(nrow(efc), 5)] <- NA expect_snapshot(print(data_tabulate(efc$c172code, by = efc$e16sex, proportions = "full"))) - expect_snapshot(print(data_tabulate(efc$c172code, by = efc$e16sex, proportions = "full", include_na = FALSE))) + expect_snapshot(print(data_tabulate(efc$c172code, by = efc$e16sex, proportions = "full", remove_na = FALSE))) expect_snapshot(print(data_tabulate(efc$c172code, by = efc$e16sex, proportions = "full", weights = efc$weights))) - expect_snapshot(print(data_tabulate(efc$c172code, by = efc$e16sex, proportions = "full", include_na = FALSE, weights = efc$weights))) # nolint + expect_snapshot(print(data_tabulate(efc$c172code, by = efc$e16sex, proportions = "full", remove_na = FALSE, weights = efc$weights))) # nolint expect_snapshot(print(data_tabulate(efc, "c172code", by = efc$e16sex, proportions = "row"))) - expect_snapshot(print(data_tabulate(efc, "c172code", by = efc$e16sex, proportions = "row", include_na = FALSE))) + expect_snapshot(print(data_tabulate(efc, "c172code", by = efc$e16sex, proportions = "row", remove_na = FALSE))) expect_snapshot(print(data_tabulate(efc, "c172code", by = efc$e16sex, proportions = "row", weights = efc$weights))) - expect_snapshot(print(data_tabulate(efc, "c172code", by = efc$e16sex, proportions = "row", include_na = FALSE, weights = efc$weights))) # nolint + expect_snapshot(print(data_tabulate(efc, "c172code", by = efc$e16sex, proportions = "row", remove_na = FALSE, weights = efc$weights))) # nolint expect_snapshot(print(data_tabulate(efc, "c172code", by = "e16sex", proportions = "column"))) - expect_snapshot(print(data_tabulate(efc, "c172code", by = "e16sex", proportions = "column", include_na = FALSE))) + expect_snapshot(print(data_tabulate(efc, "c172code", by = "e16sex", proportions = "column", remove_na = FALSE))) expect_snapshot(print(data_tabulate(efc, "c172code", by = "e16sex", proportions = "column", weights = "weights"))) - expect_snapshot(print(data_tabulate(efc, "c172code", by = "e16sex", proportions = "column", include_na = FALSE, weights = "weights"))) # nolint + expect_snapshot(print(data_tabulate(efc, "c172code", by = "e16sex", proportions = "column", remove_na = FALSE, weights = "weights"))) # nolint }) test_that("data_tabulate, cross tables, HTML", { @@ -326,11 +326,11 @@ test_that("data_tabulate, cross tables, HTML", { efc$e16sex[sample.int(nrow(efc), 5)] <- NA expect_s3_class(print_html(data_tabulate(efc$c172code, by = efc$e16sex, proportions = "full")), "gt_tbl") - expect_s3_class(print_html(data_tabulate(efc$c172code, by = efc$e16sex, proportions = "full", include_na = FALSE)), "gt_tbl") # nolint + expect_s3_class(print_html(data_tabulate(efc$c172code, by = efc$e16sex, proportions = "full", remove_na = FALSE)), "gt_tbl") # nolint expect_s3_class(print_html(data_tabulate(efc$c172code, by = efc$e16sex, proportions = "full", weights = efc$weights)), "gt_tbl") # nolint - expect_s3_class(print_html(data_tabulate(efc$c172code, by = efc$e16sex, proportions = "full", include_na = FALSE, weights = efc$weights)), "gt_tbl") # nolint + expect_s3_class(print_html(data_tabulate(efc$c172code, by = efc$e16sex, proportions = "full", remove_na = FALSE, weights = efc$weights)), "gt_tbl") # nolint expect_s3_class(print_html(data_tabulate(efc, "c172code", by = efc$e16sex, proportions = "row")), "gt_tbl") - expect_s3_class(print_html(data_tabulate(efc, "c172code", by = efc$e16sex, proportions = "row", include_na = FALSE, weights = efc$weights)), "gt_tbl") # nolint + expect_s3_class(print_html(data_tabulate(efc, "c172code", by = efc$e16sex, proportions = "row", remove_na = FALSE, weights = efc$weights)), "gt_tbl") # nolint }) test_that("data_tabulate, cross tables, grouped df", { @@ -377,9 +377,9 @@ test_that("data_tabulate, cross tables, markdown", { efc$e16sex[sample.int(nrow(efc), 5)] <- NA expect_snapshot(print_md(data_tabulate(efc$c172code, by = efc$e16sex, proportions = "full"))) - expect_snapshot(print_md(data_tabulate(efc$c172code, by = efc$e16sex, proportions = "full", include_na = FALSE))) + expect_snapshot(print_md(data_tabulate(efc$c172code, by = efc$e16sex, proportions = "full", remove_na = FALSE))) expect_snapshot(print_md(data_tabulate(efc$c172code, by = efc$e16sex, proportions = "full", weights = efc$weights))) - expect_snapshot(print_md(data_tabulate(efc$c172code, by = efc$e16sex, proportions = "full", include_na = FALSE, weights = efc$weights))) # nolint + expect_snapshot(print_md(data_tabulate(efc$c172code, by = efc$e16sex, proportions = "full", remove_na = FALSE, weights = efc$weights))) # nolint }) @@ -389,12 +389,12 @@ test_that("data_tabulate, validate against table", { data(mtcars) # frequency table out1 <- as.data.frame(table(mtcars$cyl)) - out2 <- data_tabulate(mtcars$cyl, include_na = FALSE) + out2 <- data_tabulate(mtcars$cyl, remove_na = FALSE) expect_identical(out1$Freq, out2$N) # crosstable out1 <- data_arrange(as.data.frame(table(mtcars$cyl, mtcars$gear)), c("Var1", "Var2")) out2 <- data_rename(data_to_long( - as.data.frame(data_tabulate(mtcars$cyl, by = mtcars$gear, include_na = FALSE)), 2:4, + as.data.frame(data_tabulate(mtcars$cyl, by = mtcars$gear, remove_na = FALSE)), 2:4, names_to = "Var2", values_to = "Freq" ), "mtcars$cyl", "Var1") out1[[2]] <- as.character(out1[[2]]) From f9aa53e10ebacce86235b83e4a939145befed2d6 Mon Sep 17 00:00:00 2001 From: Daniel Date: Sat, 22 Jun 2024 15:57:34 +0200 Subject: [PATCH 2/6] reverse usage --- R/data_summary.R | 6 +-- R/data_tabulate.R | 30 +++++++------- R/data_xtabulate.R | 20 ++++----- man/data_tabulate.Rd | 4 +- tests/testthat/_snaps/data_summary.new.md | 50 +++++++++++++++++++++++ 5 files changed, 80 insertions(+), 30 deletions(-) create mode 100644 tests/testthat/_snaps/data_summary.new.md diff --git a/R/data_summary.R b/R/data_summary.R index 61cb3a04c..f77ebe013 100644 --- a/R/data_summary.R +++ b/R/data_summary.R @@ -103,10 +103,10 @@ data_summary.data.frame <- function(x, ..., by = NULL, remove_na = FALSE) { } # split data, add NA levels, if requested l <- lapply(x[by], function(i) { - if (remove_na && anyNA(i)) { - addNA(i) - } else { + if (remove_na || !anyNA(i)) { i + } else { + addNA(i) } }) split_data <- split(x, l, drop = TRUE) diff --git a/R/data_tabulate.R b/R/data_tabulate.R index 08b03e706..4c0aa10b3 100644 --- a/R/data_tabulate.R +++ b/R/data_tabulate.R @@ -62,7 +62,7 @@ #' data_tabulate(efc$c172code) #' #' # drop missing values -#' data_tabulate(efc$c172code, remove_na = FALSE) +#' data_tabulate(efc$c172code, remove_na = TRUE) #' #' # data frame #' data_tabulate(efc, c("e42dep", "c172code")) @@ -109,7 +109,7 @@ #' efc$c172code, #' by = efc$e16sex, #' proportions = "column", -#' remove_na = FALSE +#' remove_na = TRUE #' ) #' #' # round percentages @@ -173,29 +173,29 @@ data_tabulate.default <- function(x, # frequency table if (is.null(weights)) { if (remove_na) { - freq_table <- tryCatch(table(addNA(x)), error = function(e) NULL) - } else { freq_table <- tryCatch(table(x), error = function(e) NULL) + } else { + freq_table <- tryCatch(table(addNA(x)), error = function(e) NULL) } } else if (remove_na) { - # weighted frequency table, including NA + # weighted frequency table, excluding NA freq_table <- tryCatch( stats::xtabs( weights ~ x, - data = data.frame(weights = weights, x = addNA(x)), - na.action = stats::na.pass, - addNA = TRUE + data = data.frame(weights = weights, x = x), + na.action = stats::na.omit, + addNA = FALSE ), error = function(e) NULL ) } else { - # weighted frequency table, excluding NA + # weighted frequency table, including NA freq_table <- tryCatch( stats::xtabs( weights ~ x, - data = data.frame(weights = weights, x = x), - na.action = stats::na.omit, - addNA = FALSE + data = data.frame(weights = weights, x = addNA(x)), + na.action = stats::na.pass, + addNA = TRUE ), error = function(e) NULL ) @@ -219,11 +219,11 @@ data_tabulate.default <- function(x, out$`Raw %` <- 100 * out$N / sum(out$N) # if we have missing values, we add a row with NA if (remove_na) { - out$`Valid %` <- c(100 * out$N[-nrow(out)] / sum(out$N[-nrow(out)]), NA) - valid_n <- sum(out$N[-length(out$N)], na.rm = TRUE) - } else { out$`Valid %` <- 100 * out$N / sum(out$N) valid_n <- sum(out$N, na.rm = TRUE) + } else { + out$`Valid %` <- c(100 * out$N[-nrow(out)] / sum(out$N[-nrow(out)]), NA) + valid_n <- sum(out$N[-length(out$N)], na.rm = TRUE) } out$`Cumulative %` <- cumsum(out$`Valid %`) diff --git a/R/data_xtabulate.R b/R/data_xtabulate.R index 81ba26c59..7943cbab9 100644 --- a/R/data_xtabulate.R +++ b/R/data_xtabulate.R @@ -13,29 +13,29 @@ # frequency table if (is.null(weights)) { if (remove_na) { - x_table <- tryCatch(table(addNA(x), addNA(by)), error = function(e) NULL) - } else { x_table <- tryCatch(table(x, by), error = function(e) NULL) + } else { + x_table <- tryCatch(table(addNA(x), addNA(by)), error = function(e) NULL) } } else if (remove_na) { - # weighted frequency table, including NA + # weighted frequency table, excluding NA x_table <- tryCatch( stats::xtabs( weights ~ x + by, - data = data.frame(weights = weights, x = addNA(x), by = addNA(by)), - na.action = stats::na.pass, - addNA = TRUE + data = data.frame(weights = weights, x = x, by = by), + na.action = stats::na.omit, + addNA = FALSE ), error = function(e) NULL ) } else { - # weighted frequency table, excluding NA + # weighted frequency table, including NA x_table <- tryCatch( stats::xtabs( weights ~ x + by, - data = data.frame(weights = weights, x = x, by = by), - na.action = stats::na.omit, - addNA = FALSE + data = data.frame(weights = weights, x = addNA(x), by = addNA(by)), + na.action = stats::na.pass, + addNA = TRUE ), error = function(e) NULL ) diff --git a/man/data_tabulate.Rd b/man/data_tabulate.Rd index 58a0c2954..6f572a05a 100644 --- a/man/data_tabulate.Rd +++ b/man/data_tabulate.Rd @@ -189,7 +189,7 @@ data(efc) data_tabulate(efc$c172code) # drop missing values -data_tabulate(efc$c172code, remove_na = FALSE) +data_tabulate(efc$c172code, remove_na = TRUE) # data frame data_tabulate(efc, c("e42dep", "c172code")) @@ -236,7 +236,7 @@ data_tabulate( efc$c172code, by = efc$e16sex, proportions = "column", - remove_na = FALSE + remove_na = TRUE ) # round percentages diff --git a/tests/testthat/_snaps/data_summary.new.md b/tests/testthat/_snaps/data_summary.new.md new file mode 100644 index 000000000..070c31ce8 --- /dev/null +++ b/tests/testthat/_snaps/data_summary.new.md @@ -0,0 +1,50 @@ +# data_summary, print + + Code + print(out) + Output + am | gear | MW | SD + ------------------------ + 0 | 3 | 16.11 | 3.37 + 0 | 4 | 21.05 | 3.07 + 1 | 4 | 26.27 | 5.41 + 1 | 5 | 21.38 | 6.66 + +# data_summary, with NA + + Code + print(out) + Output + c172code | MW + ---------------- + 1 | 87.12 + 2 | 94.05 + 3 | 75.00 + +--- + + Code + print(out) + Output + c172code | MW + ---------------- + 1 | 87.12 + 2 | 94.05 + 3 | 75.00 + +--- + + Code + print(out) + Output + e42dep | c172code | MW + -------------------------- + 1 | 2 | 17.00 + 2 | 2 | 34.25 + 3 | 1 | 39.50 + 3 | 2 | 52.44 + 3 | 3 | 52.00 + 4 | 1 | 134.75 + 4 | 2 | 119.26 + 4 | 3 | 88.80 + From 388d9a037a65a495dcb55701a5a0ea7c9093dbd2 Mon Sep 17 00:00:00 2001 From: Daniel Date: Sat, 22 Jun 2024 16:00:26 +0200 Subject: [PATCH 3/6] fix tests (FALSE -> TRUE) --- tests/testthat/_snaps/data_summary.new.md | 50 ----------------------- tests/testthat/_snaps/data_tabulate.md | 16 ++++---- tests/testthat/test-data_summary.R | 2 +- tests/testthat/test-data_tabulate.R | 30 +++++++------- 4 files changed, 24 insertions(+), 74 deletions(-) delete mode 100644 tests/testthat/_snaps/data_summary.new.md diff --git a/tests/testthat/_snaps/data_summary.new.md b/tests/testthat/_snaps/data_summary.new.md deleted file mode 100644 index 070c31ce8..000000000 --- a/tests/testthat/_snaps/data_summary.new.md +++ /dev/null @@ -1,50 +0,0 @@ -# data_summary, print - - Code - print(out) - Output - am | gear | MW | SD - ------------------------ - 0 | 3 | 16.11 | 3.37 - 0 | 4 | 21.05 | 3.07 - 1 | 4 | 26.27 | 5.41 - 1 | 5 | 21.38 | 6.66 - -# data_summary, with NA - - Code - print(out) - Output - c172code | MW - ---------------- - 1 | 87.12 - 2 | 94.05 - 3 | 75.00 - ---- - - Code - print(out) - Output - c172code | MW - ---------------- - 1 | 87.12 - 2 | 94.05 - 3 | 75.00 - ---- - - Code - print(out) - Output - e42dep | c172code | MW - -------------------------- - 1 | 2 | 17.00 - 2 | 2 | 34.25 - 3 | 1 | 39.50 - 3 | 2 | 52.44 - 3 | 3 | 52.00 - 4 | 1 | 134.75 - 4 | 2 | 119.26 - 4 | 3 | 88.80 - diff --git a/tests/testthat/_snaps/data_tabulate.md b/tests/testthat/_snaps/data_tabulate.md index 59a20dc01..ffde63088 100644 --- a/tests/testthat/_snaps/data_tabulate.md +++ b/tests/testthat/_snaps/data_tabulate.md @@ -259,7 +259,7 @@ Code print(data_tabulate(efc$c172code, by = efc$e16sex, proportions = "full", - include_na = FALSE)) + remove_na = TRUE)) Output efc$c172code | male | female | Total -------------+------------+------------+------ @@ -288,7 +288,7 @@ Code print(data_tabulate(efc$c172code, by = efc$e16sex, proportions = "full", - include_na = FALSE, weights = efc$weights)) + remove_na = TRUE, weights = efc$weights)) Output efc$c172code | male | female | Total -------------+------------+------------+------ @@ -317,7 +317,7 @@ Code print(data_tabulate(efc, "c172code", by = efc$e16sex, proportions = "row", - include_na = FALSE)) + remove_na = TRUE)) Output c172code | male | female | Total ---------+------------+------------+------ @@ -348,7 +348,7 @@ Code print(data_tabulate(efc, "c172code", by = efc$e16sex, proportions = "row", - include_na = FALSE, weights = efc$weights)) + remove_na = TRUE, weights = efc$weights)) Output c172code | male | female | Total ---------+------------+------------+------ @@ -378,7 +378,7 @@ Code print(data_tabulate(efc, "c172code", by = "e16sex", proportions = "column", - include_na = FALSE)) + remove_na = TRUE)) Output c172code | male | female | Total ---------+------------+------------+------ @@ -409,7 +409,7 @@ Code print(data_tabulate(efc, "c172code", by = "e16sex", proportions = "column", - include_na = FALSE, weights = "weights")) + remove_na = TRUE, weights = "weights")) Output c172code | male | female | Total ---------+------------+------------+------ @@ -497,7 +497,7 @@ Code print_md(data_tabulate(efc$c172code, by = efc$e16sex, proportions = "full", - include_na = FALSE)) + remove_na = TRUE)) Output [1] "|efc$c172code | male| female| Total|" [2] "|:------------|----------:|----------:|-----:|" @@ -534,7 +534,7 @@ Code print_md(data_tabulate(efc$c172code, by = efc$e16sex, proportions = "full", - include_na = FALSE, weights = efc$weights)) + remove_na = TRUE, weights = efc$weights)) Output [1] "|efc$c172code | male| female| Total|" [2] "|:------------|----------:|----------:|-----:|" diff --git a/tests/testthat/test-data_summary.R b/tests/testthat/test-data_summary.R index c8fac4a88..c60b142d2 100644 --- a/tests/testthat/test-data_summary.R +++ b/tests/testthat/test-data_summary.R @@ -175,7 +175,7 @@ test_that("data_summary, with NA", { data(efc, package = "datawizard") out <- data_summary(efc, MW = mean(c12hour, na.rm = TRUE), by = "c172code") expect_snapshot(print(out)) - out <- data_summary(efc, MW = mean(c12hour, na.rm = TRUE), by = "c172code", remove_na = FALSE) + out <- data_summary(efc, MW = mean(c12hour, na.rm = TRUE), by = "c172code", remove_na = TRUE) expect_snapshot(print(out)) # sorting for multiple groups out <- data_summary(efc, MW = mean(c12hour, na.rm = TRUE), by = c("e42dep", "c172code")) diff --git a/tests/testthat/test-data_tabulate.R b/tests/testthat/test-data_tabulate.R index 6c443a860..9848d42b9 100644 --- a/tests/testthat/test-data_tabulate.R +++ b/tests/testthat/test-data_tabulate.R @@ -287,11 +287,11 @@ test_that("data_tabulate exclude/include missing values", { efc$e16sex[sample.int(nrow(efc), 5)] <- NA out <- data_tabulate(efc$c172code) expect_identical(out$N, c(8L, 66L, 16L, 10L)) - out <- data_tabulate(efc$c172code, remove_na = FALSE) + out <- data_tabulate(efc$c172code, remove_na = TRUE) expect_identical(out$N, c(8L, 66L, 16L)) out <- data_tabulate(efc$c172code, weights = efc$weights) expect_identical(out$N, c(10, 67, 15, 13)) - out <- data_tabulate(efc$c172code, remove_na = FALSE, weights = efc$weights) + out <- data_tabulate(efc$c172code, remove_na = TRUE, weights = efc$weights) expect_identical(out$N, c(10, 67, 15)) }) @@ -305,17 +305,17 @@ test_that("data_tabulate, cross tables", { efc$e16sex[sample.int(nrow(efc), 5)] <- NA expect_snapshot(print(data_tabulate(efc$c172code, by = efc$e16sex, proportions = "full"))) - expect_snapshot(print(data_tabulate(efc$c172code, by = efc$e16sex, proportions = "full", remove_na = FALSE))) + expect_snapshot(print(data_tabulate(efc$c172code, by = efc$e16sex, proportions = "full", remove_na = TRUE))) expect_snapshot(print(data_tabulate(efc$c172code, by = efc$e16sex, proportions = "full", weights = efc$weights))) - expect_snapshot(print(data_tabulate(efc$c172code, by = efc$e16sex, proportions = "full", remove_na = FALSE, weights = efc$weights))) # nolint + expect_snapshot(print(data_tabulate(efc$c172code, by = efc$e16sex, proportions = "full", remove_na = TRUE, weights = efc$weights))) # nolint expect_snapshot(print(data_tabulate(efc, "c172code", by = efc$e16sex, proportions = "row"))) - expect_snapshot(print(data_tabulate(efc, "c172code", by = efc$e16sex, proportions = "row", remove_na = FALSE))) + expect_snapshot(print(data_tabulate(efc, "c172code", by = efc$e16sex, proportions = "row", remove_na = TRUE))) expect_snapshot(print(data_tabulate(efc, "c172code", by = efc$e16sex, proportions = "row", weights = efc$weights))) - expect_snapshot(print(data_tabulate(efc, "c172code", by = efc$e16sex, proportions = "row", remove_na = FALSE, weights = efc$weights))) # nolint + expect_snapshot(print(data_tabulate(efc, "c172code", by = efc$e16sex, proportions = "row", remove_na = TRUE, weights = efc$weights))) # nolint expect_snapshot(print(data_tabulate(efc, "c172code", by = "e16sex", proportions = "column"))) - expect_snapshot(print(data_tabulate(efc, "c172code", by = "e16sex", proportions = "column", remove_na = FALSE))) + expect_snapshot(print(data_tabulate(efc, "c172code", by = "e16sex", proportions = "column", remove_na = TRUE))) expect_snapshot(print(data_tabulate(efc, "c172code", by = "e16sex", proportions = "column", weights = "weights"))) - expect_snapshot(print(data_tabulate(efc, "c172code", by = "e16sex", proportions = "column", remove_na = FALSE, weights = "weights"))) # nolint + expect_snapshot(print(data_tabulate(efc, "c172code", by = "e16sex", proportions = "column", remove_na = TRUE, weights = "weights"))) # nolint }) test_that("data_tabulate, cross tables, HTML", { @@ -326,11 +326,11 @@ test_that("data_tabulate, cross tables, HTML", { efc$e16sex[sample.int(nrow(efc), 5)] <- NA expect_s3_class(print_html(data_tabulate(efc$c172code, by = efc$e16sex, proportions = "full")), "gt_tbl") - expect_s3_class(print_html(data_tabulate(efc$c172code, by = efc$e16sex, proportions = "full", remove_na = FALSE)), "gt_tbl") # nolint + expect_s3_class(print_html(data_tabulate(efc$c172code, by = efc$e16sex, proportions = "full", remove_na = TRUE)), "gt_tbl") # nolint expect_s3_class(print_html(data_tabulate(efc$c172code, by = efc$e16sex, proportions = "full", weights = efc$weights)), "gt_tbl") # nolint - expect_s3_class(print_html(data_tabulate(efc$c172code, by = efc$e16sex, proportions = "full", remove_na = FALSE, weights = efc$weights)), "gt_tbl") # nolint + expect_s3_class(print_html(data_tabulate(efc$c172code, by = efc$e16sex, proportions = "full", remove_na = TRUE, weights = efc$weights)), "gt_tbl") # nolint expect_s3_class(print_html(data_tabulate(efc, "c172code", by = efc$e16sex, proportions = "row")), "gt_tbl") - expect_s3_class(print_html(data_tabulate(efc, "c172code", by = efc$e16sex, proportions = "row", remove_na = FALSE, weights = efc$weights)), "gt_tbl") # nolint + expect_s3_class(print_html(data_tabulate(efc, "c172code", by = efc$e16sex, proportions = "row", remove_na = TRUE, weights = efc$weights)), "gt_tbl") # nolint }) test_that("data_tabulate, cross tables, grouped df", { @@ -377,9 +377,9 @@ test_that("data_tabulate, cross tables, markdown", { efc$e16sex[sample.int(nrow(efc), 5)] <- NA expect_snapshot(print_md(data_tabulate(efc$c172code, by = efc$e16sex, proportions = "full"))) - expect_snapshot(print_md(data_tabulate(efc$c172code, by = efc$e16sex, proportions = "full", remove_na = FALSE))) + expect_snapshot(print_md(data_tabulate(efc$c172code, by = efc$e16sex, proportions = "full", remove_na = TRUE))) expect_snapshot(print_md(data_tabulate(efc$c172code, by = efc$e16sex, proportions = "full", weights = efc$weights))) - expect_snapshot(print_md(data_tabulate(efc$c172code, by = efc$e16sex, proportions = "full", remove_na = FALSE, weights = efc$weights))) # nolint + expect_snapshot(print_md(data_tabulate(efc$c172code, by = efc$e16sex, proportions = "full", remove_na = TRUE, weights = efc$weights))) # nolint }) @@ -389,12 +389,12 @@ test_that("data_tabulate, validate against table", { data(mtcars) # frequency table out1 <- as.data.frame(table(mtcars$cyl)) - out2 <- data_tabulate(mtcars$cyl, remove_na = FALSE) + out2 <- data_tabulate(mtcars$cyl, remove_na = TRUE) expect_identical(out1$Freq, out2$N) # crosstable out1 <- data_arrange(as.data.frame(table(mtcars$cyl, mtcars$gear)), c("Var1", "Var2")) out2 <- data_rename(data_to_long( - as.data.frame(data_tabulate(mtcars$cyl, by = mtcars$gear, remove_na = FALSE)), 2:4, + as.data.frame(data_tabulate(mtcars$cyl, by = mtcars$gear, remove_na = TRUE)), 2:4, names_to = "Var2", values_to = "Freq" ), "mtcars$cyl", "Var1") out1[[2]] <- as.character(out1[[2]]) From acf8cc45e1bdd375ab9ababcc9da5fcba478568b Mon Sep 17 00:00:00 2001 From: Daniel Date: Sun, 23 Jun 2024 10:02:20 +0200 Subject: [PATCH 4/6] docs, news --- DESCRIPTION | 2 +- NEWS.md | 5 +++++ R/data_summary.R | 6 +++--- R/data_tabulate.R | 2 +- man/data_summary.Rd | 6 +++--- man/data_tabulate.Rd | 2 +- 6 files changed, 14 insertions(+), 9 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 0a7fb9594..c56e3f9dd 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Type: Package Package: datawizard Title: Easy Data Wrangling and Statistical Transformations -Version: 0.11.0.3 +Version: 0.11.0.4 Authors@R: c( person("Indrajeet", "Patil", , "patilindrajeet.science@gmail.com", role = "aut", comment = c(ORCID = "0000-0003-1995-6531", Twitter = "@patilindrajeets")), diff --git a/NEWS.md b/NEWS.md index ada1f08ea..8e5001f77 100644 --- a/NEWS.md +++ b/NEWS.md @@ -2,6 +2,11 @@ BREAKING CHANGES +* The argument `include_na` in `data_tabulate()` and `data_summary()` has been + renamed into `remove_na`. Consequently, to mimic former behaviour, `FALSE` and + `TRUE` need to be switched (i.e. `remove_na = TRUE` is equivalent to the former + `include_na 0 FALSE`). + * Class names for objects returned by `data_tabulate()` have been changed to `datawizard_table` and `datawizard_crosstable` (resp. the plural forms, `*_tables`), to provide a clearer and more consistent naming scheme. diff --git a/R/data_summary.R b/R/data_summary.R index f77ebe013..7662d0c94 100644 --- a/R/data_summary.R +++ b/R/data_summary.R @@ -8,9 +8,9 @@ #' @param by Optional character string, indicating the name of a variable in `x`. #' If supplied, the data will be split by this variable and summary statistics #' will be computed for each group. -#' @param remove_na Logical. If `TRUE`, missing values are included as a level -#' in the grouping variable. If `FALSE`, missing values are omitted from the -#' grouping variable. +#' @param remove_na Logical. If `TRUE`, missing values are omitted from the +#' grouping variable. If `FALSE` (default), missing values are included as a +#' level in the grouping variable. #' @param ... One or more named expressions that define the new variable name #' and the function to compute the summary statistic. Example: #' `mean_sepal_width = mean(Sepal.Width)`. The expression can also be provided diff --git a/R/data_tabulate.R b/R/data_tabulate.R index 4c0aa10b3..952f921d9 100644 --- a/R/data_tabulate.R +++ b/R/data_tabulate.R @@ -15,7 +15,7 @@ #' factor levels are dropped from the frequency table. #' @param name Optional character string, which includes the name that is used #' for printing. -#' @param remove_na Logical, if `TRUE`, missing values are included in the +#' @param remove_na Logical, if `FALSE`, missing values are included in the #' frequency or crosstable, else missing values are omitted. #' @param collapse Logical, if `TRUE` collapses multiple tables into one larger #' table for printing. This affects only printing, not the returned object. diff --git a/man/data_summary.Rd b/man/data_summary.Rd index 70d2e2576..24cfa1a9f 100644 --- a/man/data_summary.Rd +++ b/man/data_summary.Rd @@ -22,9 +22,9 @@ summary function \code{n()} can be used to count the number of observations.} If supplied, the data will be split by this variable and summary statistics will be computed for each group.} -\item{remove_na}{Logical. If \code{TRUE}, missing values are included as a level -in the grouping variable. If \code{FALSE}, missing values are omitted from the -grouping variable.} +\item{remove_na}{Logical. If \code{TRUE}, missing values are omitted from the +grouping variable. If \code{FALSE} (default), missing values are included as a +level in the grouping variable.} } \value{ A data frame with the requested summary statistics. diff --git a/man/data_tabulate.Rd b/man/data_tabulate.Rd index 6f572a05a..2feadf3a9 100644 --- a/man/data_tabulate.Rd +++ b/man/data_tabulate.Rd @@ -62,7 +62,7 @@ factor levels are dropped from the frequency table.} \item{weights}{Optional numeric vector of weights. Must be of the same length as \code{x}. If \code{weights} is supplied, weighted frequencies are calculated.} -\item{remove_na}{Logical, if \code{TRUE}, missing values are included in the +\item{remove_na}{Logical, if \code{FALSE}, missing values are included in the frequency or crosstable, else missing values are omitted.} \item{proportions}{Optional character string, indicating the type of From 51c8b5f588e3697f71d49e25f30aab61fd4c1597 Mon Sep 17 00:00:00 2001 From: Daniel Date: Sun, 23 Jun 2024 11:17:25 +0200 Subject: [PATCH 5/6] Update NEWS.md Co-authored-by: Indrajeet Patil --- NEWS.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index 8e5001f77..0954f1214 100644 --- a/NEWS.md +++ b/NEWS.md @@ -5,7 +5,7 @@ BREAKING CHANGES * The argument `include_na` in `data_tabulate()` and `data_summary()` has been renamed into `remove_na`. Consequently, to mimic former behaviour, `FALSE` and `TRUE` need to be switched (i.e. `remove_na = TRUE` is equivalent to the former - `include_na 0 FALSE`). + `include_na = FALSE`). * Class names for objects returned by `data_tabulate()` have been changed to `datawizard_table` and `datawizard_crosstable` (resp. the plural forms, From 17f8736c0c3354c20cf98adf60d9622e9d1b229b Mon Sep 17 00:00:00 2001 From: Daniel Date: Sun, 23 Jun 2024 12:42:50 +0200 Subject: [PATCH 6/6] add comment --- R/data_tabulate.R | 4 ++++ R/data_xtabulate.R | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/R/data_tabulate.R b/R/data_tabulate.R index 952f921d9..e94fc5d55 100644 --- a/R/data_tabulate.R +++ b/R/data_tabulate.R @@ -173,6 +173,10 @@ data_tabulate.default <- function(x, # frequency table if (is.null(weights)) { if (remove_na) { + # we have a `.default` and a `.data.frame` method for `data_tabulate()`. + # since this is the default, `x` can be an object which cannot be used + # with `table()`, that's why we add `tryCatch()` here. Below we give an + # informative error message for non-supported objects. freq_table <- tryCatch(table(x), error = function(e) NULL) } else { freq_table <- tryCatch(table(addNA(x)), error = function(e) NULL) diff --git a/R/data_xtabulate.R b/R/data_xtabulate.R index 7943cbab9..08be1eeca 100644 --- a/R/data_xtabulate.R +++ b/R/data_xtabulate.R @@ -12,6 +12,10 @@ } # frequency table if (is.null(weights)) { + # we have a `.default` and a `.data.frame` method for `data_tabulate()`. + # since this is the default, `x` can be an object which cannot be used + # with `table()`, that's why we add `tryCatch()` here. Below we give an + # informative error message for non-supported objects. if (remove_na) { x_table <- tryCatch(table(x, by), error = function(e) NULL) } else {