From be6e2bffb59fbcc21ee92bb07225989fd94f12e4 Mon Sep 17 00:00:00 2001 From: Daniel Date: Tue, 13 Feb 2024 22:43:03 +0100 Subject: [PATCH] `data_tabulate()` gains `by` argument for crosstables (#481) * `data_tabluate()` gains `by` argument for crosstables * fix * fix * fix * fix * fix * fix * fix * fix * fix * fix * version * fix * update tests * fixes update tests * docs, add print_html methods * update news * code structure * fixes * add tests * print markdown method * add tests for markdown print * lintr * align values in tables * tests for HTML * add test * use same column as rowname * Update NEWS.md Co-authored-by: Etienne Bacher <52219252+etiennebacher@users.noreply.github.com> * address comments * cell -> full * update snapshots * one more test --------- Co-authored-by: Etienne Bacher <52219252+etiennebacher@users.noreply.github.com> --- DESCRIPTION | 2 +- NAMESPACE | 6 + NEWS.md | 5 +- R/data_arrange.R | 6 +- R/data_tabulate.R | 160 ++++++++++-- R/data_xtabulate.R | 347 +++++++++++++++++++++++++ inst/WORDLIST | 2 + man/data_tabulate.Rd | 61 ++++- tests/testthat/_snaps/data_tabulate.md | 319 ++++++++++++++++++++++- tests/testthat/test-data_tabulate.R | 158 ++++++++++- 10 files changed, 1024 insertions(+), 42 deletions(-) create mode 100644 R/data_xtabulate.R diff --git a/DESCRIPTION b/DESCRIPTION index 21699fa07..048d15065 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Type: Package Package: datawizard Title: Easy Data Wrangling and Statistical Transformations -Version: 0.9.1.3 +Version: 0.9.1.4 Authors@R: c( person("Indrajeet", "Patil", , "patilindrajeet.science@gmail.com", role = "aut", comment = c(ORCID = "0000-0003-1995-6531", Twitter = "@patilindrajeets")), diff --git a/NAMESPACE b/NAMESPACE index a2a911db3..e89863807 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -67,6 +67,7 @@ S3method(describe_distribution,numeric) S3method(format,data_codebook) S3method(format,dw_data_peek) S3method(format,dw_data_tabulate) +S3method(format,dw_data_xtabulate) S3method(format,dw_groupmeans) S3method(format,parameters_distribution) S3method(kurtosis,data.frame) @@ -91,6 +92,8 @@ S3method(print,data_seek) S3method(print,dw_data_peek) S3method(print,dw_data_tabulate) S3method(print,dw_data_tabulates) +S3method(print,dw_data_xtabulate) +S3method(print,dw_data_xtabulates) S3method(print,dw_groupmeans) S3method(print,dw_groupmeans_list) S3method(print,dw_transformer) @@ -102,10 +105,13 @@ S3method(print_html,data_codebook) S3method(print_html,dw_data_peek) S3method(print_html,dw_data_tabulate) S3method(print_html,dw_data_tabulates) +S3method(print_html,dw_data_xtabulate) +S3method(print_html,dw_data_xtabulates) S3method(print_md,data_codebook) S3method(print_md,dw_data_peek) S3method(print_md,dw_data_tabulate) S3method(print_md,dw_data_tabulates) +S3method(print_md,dw_data_xtabulate) S3method(ranktransform,data.frame) S3method(ranktransform,factor) S3method(ranktransform,grouped_df) diff --git a/NEWS.md b/NEWS.md index ffb41206d..968c9d417 100644 --- a/NEWS.md +++ b/NEWS.md @@ -5,7 +5,10 @@ CHANGES * `data_modify()` gets three new arguments, `.at`, `.if` and `.modify`, to modify variables at specific positions or based on logical conditions. -* `data_tabulate()` gets a `weights` argument, to compute weighted frequency tables. +* `data_tabulate()` was revised and gets several new arguments: a `weights` + argument, to compute weighted frequency tables. `include_na` allows to include + or omit missing values from the table. Furthermore, a `by` argument was added, + to compute crosstables (#479, #481). # datawizard 0.9.1 diff --git a/R/data_arrange.R b/R/data_arrange.R index 5666fde42..58c8a5ab3 100644 --- a/R/data_arrange.R +++ b/R/data_arrange.R @@ -46,8 +46,8 @@ data_arrange.default <- function(data, select = NULL, safe = TRUE) { dont_exist <- select[which(!select %in% names(data))] if (length(dont_exist) > 0) { - if (!safe) { - insight::format_error( + if (safe) { + insight::format_warning( paste0( "The following column(s) don't exist in the dataset: ", text_concatenate(dont_exist), "." @@ -55,7 +55,7 @@ data_arrange.default <- function(data, select = NULL, safe = TRUE) { .misspelled_string(names(data), dont_exist, "Possibly misspelled?") ) } else { - insight::format_warning( + insight::format_error( paste0( "The following column(s) don't exist in the dataset: ", text_concatenate(dont_exist), "." diff --git a/R/data_tabulate.R b/R/data_tabulate.R index 4d3bb810b..92432a602 100644 --- a/R/data_tabulate.R +++ b/R/data_tabulate.R @@ -1,20 +1,30 @@ -#' @title Create frequency tables of variables +#' @title Create frequency and crosstables of variables #' @name data_tabulate #' -#' @description This function creates frequency tables of variables, including -#' the number of levels/values as well as the distribution of raw, valid and -#' cumulative percentages. +#' @description This function creates frequency or crosstables of variables, +#' including the number of levels/values as well as the distribution of raw, +#' valid and cumulative percentages. For crosstables, row, column and cell +#' percentages can be calculated. #' #' @param x A (grouped) data frame, a vector or factor. -#' @param drop_levels Logical, if `TRUE`, factor levels that do not occur in +#' @param by Optional vector or factor. If supplied, a crosstable is created. +#' If `x` is a data frame, `by` can also be a character string indicating the +#' name of a variable in `x`. +#' @param drop_levels Logical, if `FALSE`, factor levels that do not occur in #' the data are included in the table (with frequency of zero), else unused #' factor levels are dropped from the frequency table. #' @param name Optional character string, which includes the name that is used #' for printing. +#' @param include_na Logical, if `TRUE`, missing values are included in the +#' frequency or crosstable, else missing values are omitted. #' @param collapse Logical, if `TRUE` collapses multiple tables into one larger #' table for printing. This affects only printing, not the returned object. #' @param weights Optional numeric vector of weights. Must be of the same length #' as `x`. If `weights` is supplied, weighted frequencies are calculated. +#' @param proportions Optional character string, indicating the type of +#' percentages to be calculated. Only applies to crosstables, i.e. when `by` is +#' not `NULL`. Can be `"row"` (row percentages), `"column"` (column percentages) +#' or `"full"` (to calculate relative frequencies for the full table). #' @param ... not used. #' @inheritParams find_columns #' @@ -22,11 +32,16 @@ #' as data frame per variable. #' #' @examplesIf requireNamespace("poorman") +#' # frequency tables ------- +#' # ------------------------ #' data(efc) #' #' # vector/factor #' data_tabulate(efc$c172code) #' +#' # drop missing values +#' data_tabulate(efc$c172code, include_na = FALSE) +#' #' # data frame #' data_tabulate(efc, c("e42dep", "c172code")) #' @@ -54,6 +69,30 @@ #' efc$weights <- abs(rnorm(n = nrow(efc), mean = 1, sd = 0.5)) #' data_tabulate(efc$e42dep, weights = efc$weights) #' +#' # crosstables ------ +#' # ------------------ +#' +#' # add some missing values +#' set.seed(123) +#' efc$e16sex[sample.int(nrow(efc), 5)] <- NA +#' +#' data_tabulate(efc, "c172code", by = "e16sex") +#' +#' # add row and column percentages +#' data_tabulate(efc, "c172code", by = "e16sex", proportions = "row") +#' data_tabulate(efc, "c172code", by = "e16sex", proportions = "column") +#' +#' # omit missing values +#' data_tabulate( +#' efc$c172code, +#' by = efc$e16sex, +#' proportions = "column", +#' include_na = FALSE +#' ) +#' +#' # round percentages +#' out <- data_tabulate(efc, "c172code", by = "e16sex", proportions = "column") +#' print(out, digits = 0) #' @export data_tabulate <- function(x, ...) { UseMethod("data_tabulate") @@ -62,7 +101,15 @@ data_tabulate <- function(x, ...) { #' @rdname data_tabulate #' @export -data_tabulate.default <- function(x, drop_levels = FALSE, weights = NULL, name = NULL, verbose = TRUE, ...) { +data_tabulate.default <- function(x, + by = NULL, + drop_levels = FALSE, + weights = NULL, + include_na = TRUE, + proportions = NULL, + name = NULL, + verbose = TRUE, + ...) { # save label attribute, before it gets lost... var_label <- attr(x, "label", exact = TRUE) @@ -78,25 +125,52 @@ data_tabulate.default <- function(x, drop_levels = FALSE, weights = NULL, name = x <- droplevels(x) } - # check for correct length of weights - must be equal to "x" - if (!is.null(weights) && length(weights) != length(x)) { - insight::format_error("Length of `weights` must be equal to length of `x`.") + # validate "weights" + weights <- .validate_table_weights(weights, x) + + # we go into another function for crosstables here... + if (!is.null(by)) { + by <- .validate_by(by, x) + return(.crosstable( + x, + by = by, + weights = weights, + include_na = include_na, + proportions = proportions, + obj_name = obj_name, + group_variable = group_variable + )) } # frequency table if (is.null(weights)) { - freq_table <- tryCatch(table(addNA(x)), error = function(e) NULL) - } else { - # weighted frequency table + if (include_na) { + freq_table <- tryCatch(table(addNA(x)), error = function(e) NULL) + } else { + freq_table <- tryCatch(table(x), error = function(e) NULL) + } + } else if (include_na) { + # weighted frequency table, including NA freq_table <- tryCatch( stats::xtabs( weights ~ x, - data = data.frame(weights = weights, x = x), + data = data.frame(weights = weights, x = addNA(x)), na.action = stats::na.pass, addNA = TRUE ), error = function(e) NULL ) + } else { + # weighted frequency table, excluding NA + freq_table <- tryCatch( + stats::xtabs( + weights ~ x, + data = data.frame(weights = weights, x = x), + na.action = stats::na.omit, + addNA = FALSE + ), + error = function(e) NULL + ) } if (is.null(freq_table)) { @@ -115,7 +189,14 @@ data_tabulate.default <- function(x, drop_levels = FALSE, weights = NULL, name = } out$`Raw %` <- 100 * out$N / sum(out$N) - out$`Valid %` <- c(100 * out$N[-nrow(out)] / sum(out$N[-nrow(out)]), NA) + # if we have missing values, we add a row with NA + if (include_na) { + out$`Valid %` <- c(100 * out$N[-nrow(out)] / sum(out$N[-nrow(out)]), NA) + valid_n <- sum(out$N[-length(out$N)], na.rm = TRUE) + } else { + out$`Valid %` <- 100 * out$N / sum(out$N) + valid_n <- sum(out$N, na.rm = TRUE) + } out$`Cumulative %` <- cumsum(out$`Valid %`) # add information about variable/group names @@ -144,7 +225,7 @@ data_tabulate.default <- function(x, drop_levels = FALSE, weights = NULL, name = attr(out, "weights") <- weights attr(out, "total_n") <- sum(out$N, na.rm = TRUE) - attr(out, "valid_n") <- sum(out$N[-length(out$N)], na.rm = TRUE) + attr(out, "valid_n") <- valid_n class(out) <- c("dw_data_tabulate", "data.frame") @@ -159,9 +240,12 @@ data_tabulate.data.frame <- function(x, exclude = NULL, ignore_case = FALSE, regex = FALSE, - collapse = FALSE, + by = NULL, drop_levels = FALSE, weights = NULL, + include_na = TRUE, + proportions = NULL, + collapse = FALSE, verbose = TRUE, ...) { # evaluate arguments @@ -172,11 +256,31 @@ data_tabulate.data.frame <- function(x, regex = regex, verbose = verbose ) + + # validate "by" + by <- .validate_by(by, x) + # validate "weights" + weights <- .validate_table_weights(weights, x) + out <- lapply(select, function(i) { - data_tabulate(x[[i]], drop_levels = drop_levels, weights = weights, name = i, verbose = verbose, ...) + data_tabulate( + x[[i]], + by = by, + proportions = proportions, + drop_levels = drop_levels, + weights = weights, + include_na = include_na, + name = i, + verbose = verbose, + ... + ) }) - class(out) <- c("dw_data_tabulates", "list") + if (is.null(by)) { + class(out) <- c("dw_data_tabulates", "list") + } else { + class(out) <- c("dw_data_xtabulates", "list") + } attr(out, "collapse") <- isTRUE(collapse) attr(out, "is_weighted") <- !is.null(weights) @@ -190,10 +294,13 @@ data_tabulate.grouped_df <- function(x, exclude = NULL, ignore_case = FALSE, regex = FALSE, - verbose = TRUE, - collapse = FALSE, + by = NULL, + proportions = NULL, drop_levels = FALSE, weights = NULL, + include_na = TRUE, + collapse = FALSE, + verbose = TRUE, ...) { # works only for dplyr >= 0.8.0 grps <- attr(x, "groups", exact = TRUE) @@ -210,6 +317,7 @@ data_tabulate.grouped_df <- function(x, ) x <- as.data.frame(x) + out <- list() for (i in seq_along(grps)) { rows <- grps[[i]] @@ -227,11 +335,18 @@ data_tabulate.grouped_df <- function(x, verbose = verbose, drop_levels = drop_levels, weights = weights, + include_na = include_na, + by = by, + proportions = proportions, group_variable = group_variable, ... )) } - class(out) <- c("dw_data_tabulates", "list") + if (is.null(by)) { + class(out) <- c("dw_data_tabulates", "list") + } else { + class(out) <- c("dw_data_xtabulates", "list") + } attr(out, "collapse") <- isTRUE(collapse) attr(out, "is_weighted") <- !is.null(weights) @@ -239,8 +354,6 @@ data_tabulate.grouped_df <- function(x, } - - # methods -------------------- #' @importFrom insight print_html @@ -286,7 +399,6 @@ format.dw_data_tabulate <- function(x, format = "text", big_mark = NULL, ...) { } - #' @export print.dw_data_tabulate <- function(x, big_mark = NULL, ...) { a <- attributes(x) diff --git a/R/data_xtabulate.R b/R/data_xtabulate.R new file mode 100644 index 000000000..c8e622e2d --- /dev/null +++ b/R/data_xtabulate.R @@ -0,0 +1,347 @@ +# helper to compute crosstables -------------- + +.crosstable <- function(x, + by, + weights = NULL, + include_na = TRUE, + proportions = NULL, + obj_name = NULL, + group_variable = NULL) { + if (!is.null(proportions)) { + proportions <- match.arg(proportions, c("row", "column", "full")) + } + # frequency table + if (is.null(weights)) { + if (include_na) { + x_table <- tryCatch(table(addNA(x), addNA(by)), error = function(e) NULL) + } else { + x_table <- tryCatch(table(x, by), error = function(e) NULL) + } + } else if (include_na) { + # weighted frequency table, including NA + x_table <- tryCatch( + stats::xtabs( + weights ~ x + by, + data = data.frame(weights = weights, x = addNA(x), by = addNA(by)), + na.action = stats::na.pass, + addNA = TRUE + ), + error = function(e) NULL + ) + } else { + # weighted frequency table, excluding NA + x_table <- tryCatch( + stats::xtabs( + weights ~ x + by, + data = data.frame(weights = weights, x = x, by = by), + na.action = stats::na.omit, + addNA = FALSE + ), + error = function(e) NULL + ) + } + + if (is.null(x_table)) { + insight::format_warning(paste0("Can't compute cross tables for objects of class `", class(x)[1], "`.")) + return(NULL) + } + + out <- as.data.frame(stats::ftable(x_table)) + colnames(out) <- c("Value", "by", "N") + total_n <- sum(out$N, na.rm = TRUE) + + # we want to round N for weighted frequencies + if (!is.null(weights)) { + out$N <- round(out$N) + total_n <- round(total_n) + } + + out <- data_to_wide(out, values_from = "N", names_from = "by") + + # use variable name as column name + if (!is.null(obj_name)) { + colnames(out)[1] <- obj_name + } + + # for grouped data frames, add info about grouping variables + if (!is.null(group_variable)) { + var_info <- toString(lapply(colnames(group_variable), function(i) { + sprintf("%s (%s)", i, group_variable[[i]]) + })) + out <- cbind(out[1], data.frame(Group = var_info, stringsAsFactors = FALSE), out[-1]) + } + + attr(out, "total_n") <- total_n + attr(out, "weights") <- weights + attr(out, "proportions") <- proportions + + class(out) <- c("dw_data_xtabulate", "data.frame") + + out +} + + +# methods --------------------- + + +#' @export +format.dw_data_xtabulate <- function(x, format = "text", digits = 1, big_mark = NULL, ...) { + # convert to character manually, else, for large numbers, + # format_table() returns scientific notation + x <- as.data.frame(x) + + # remove group variable + x$Group <- NULL + + # compute total N for rows and colummns + total_n <- attributes(x)$total_n + total_column <- rowSums(x[, -1], na.rm = TRUE) + total_row <- c(colSums(x[, -1], na.rm = TRUE), total_n) + + # proportions? + props <- attributes(x)$proportions + + if (!is.null(props)) { + # we copy x to tmp, because when we create strings with "sprintf()", the + # variable is coerced to character, and in subsequent iterations of the loop, + # mathemathical operations are not possible anymore + tmp <- x + if (identical(props, "row")) { + for (i in seq_len(nrow(x))) { + tmp[i, -1] <- paste( + format(x[i, -1]), + format(sprintf("(%.*f%%)", digits, 100 * x[i, -1] / sum(x[i, -1], na.rm = TRUE)), justify = "right") + ) + } + } else if (identical(props, "column")) { + for (i in seq_len(ncol(x))[-1]) { + tmp[, i] <- paste( + format(x[, i]), + format(sprintf("(%.*f%%)", digits, 100 * x[, i] / sum(x[, i], na.rm = TRUE)), justify = "right") + ) + } + } else if (identical(props, "full")) { + for (i in seq_len(ncol(x))[-1]) { + tmp[, i] <- paste( + format(x[, i]), + format(sprintf("(%.*f%%)", digits, 100 * x[, i] / total_n), justify = "right") + ) + } + } + # copy back final result + x <- tmp + } + + x[] <- lapply(x, as.character) + + # format data frame + ftab <- insight::format_table(x, ...) + # replace empty cells with NA + ftab[] <- lapply(ftab, function(i) { + i[i == ""] <- ifelse(identical(format, "text"), "", "(NA)") # nolint + i + }) + # Remove ".00" from numbers + ftab$Total <- gsub("\\.00$", "", as.character(total_column)) + # for text format, insert "empty row" before last total row + if (identical(format, "text") || identical(format, "markdown")) { + empty_row <- as.data.frame(t(data.frame( + rep("", ncol(ftab)), + c("Total", as.character(total_row)), + stringsAsFactors = FALSE + ))) + } else { + empty_row <- as.data.frame(t(data.frame( + c("Total", as.character(total_row)), + stringsAsFactors = FALSE + ))) + } + colnames(empty_row) <- colnames(ftab) + ftab <- rbind(ftab, empty_row) + ftab[nrow(ftab), ] <- gsub("\\.00$", "", ftab[nrow(ftab), ]) + + # insert big marks? + ftab$Total <- .add_commas_in_numbers(ftab$Total, big_mark) + ftab[nrow(ftab), ] <- .add_commas_in_numbers(ftab[nrow(ftab), ], big_mark) + + # also format NA column name + colnames(ftab)[colnames(ftab) == "NA"] <- ifelse(identical(format, "text"), "", "(NA)") + + ftab +} + + +#' @export +print.dw_data_xtabulate <- function(x, big_mark = NULL, ...) { + # grouped data? if yes, add information on grouping factor + if (is.null(x[["Group"]])) { + caption <- NULL + } else { + caption <- paste0("Grouped by ", x[["Group"]][1]) + x$Group <- NULL + } + + # print table + cat(insight::export_table( + format(x, big_mark = big_mark, ...), + cross = "+", + missing = "", + caption = caption, + empty_line = "-" + )) + invisible(x) +} + + +#' @export +print_md.dw_data_xtabulate <- function(x, big_mark = NULL, ...) { + # grouped data? if yes, add information on grouping factor + if (is.null(x[["Group"]])) { + caption <- NULL + } else { + caption <- paste0("Grouped by ", x[["Group"]][1]) + x$Group <- NULL + } + + # print table + insight::export_table( + format(x, format = "markdown", big_mark = big_mark, ...), + cross = "+", + missing = "", + caption = caption, + empty_line = "-", + format = "markdown" + ) +} + + +#' @export +print_html.dw_data_xtabulate <- function(x, big_mark = NULL, ...) { + # grouped data? if yes, add information on grouping factor + if (!is.null(x[["Group"]])) { + x$groups <- paste0("Grouped by ", x[["Group"]][1]) + x$Group <- NULL + } + + # print table + insight::export_table( + format(x, big_mark = big_mark, format = "html", ...), + missing = "(NA)", + format = "html", + group_by = "groups" + ) +} + + +#' @export +print.dw_data_xtabulates <- function(x, big_mark = NULL, ...) { + for (i in seq_along(x)) { + print(x[[i]], big_mark = big_mark, ...) + cat("\n") + } + invisible(x) +} + + +#' @export +print_html.dw_data_xtabulates <- function(x, big_mark = NULL, ...) { + if (length(x) == 1) { + print_html(x[[1]], big_mark = big_mark, ...) + } else { + x <- lapply(x, function(i) { + # grouped data? if yes, add information on grouping factor + if (!is.null(i[["Group"]])) { + i$groups <- paste0("Grouped by ", i[["Group"]][1]) + i$Group <- NULL + } + format(i, format = "html", big_mark = big_mark, ...) + }) + + out <- do.call(rbind, x) + + # print table + insight::export_table( + out, + missing = "(NA)", + format = "html", + group_by = "groups" + ) + } +} + + +# helper --------------------- + +.validate_by <- function(by, x) { + if (!is.null(by)) { + if (is.character(by)) { + # If "by" is a character string, must be of length 1 + if (length(by) > 1) { + insight::format_error( + "If `by` is a string indicating a variable name, `by` must be of length 1.", + "You may use `data_group()` to group by multiple variables, then call `data_tabulate()`." + ) + } + # if "by" is a character, "x" must be a data frame + if (!is.data.frame(x)) { + insight::format_error("If `by` is a string indicating a variable name, `x` must be a data frame.") + } + # is "by" a column in "x"? + if (!by %in% colnames(x)) { + insight::format_error(sprintf( + "The variable specified in `by` was not found in `x`. %s", + .misspelled_string(names(x), by, "Possibly misspelled?") + )) + } + by <- x[[by]] + } + # is "by" of same length as "x"? + if (is.data.frame(x) && length(by) != nrow(x)) { + insight::format_error("Length of `by` must be equal to number of rows in `x`.") # nolint + } + if (!is.data.frame(x) && length(by) != length(x)) { + insight::format_error("Length of `by` must be equal to length of `x`.") # nolint + } + if (!is.factor(by)) { + # coerce "by" to factor, including labels + by <- to_factor(by, labels_to_levels = TRUE, verbose = FALSE) + } + } + + by +} + + +.validate_table_weights <- function(weights, x) { + if (!is.null(weights)) { + if (is.character(weights)) { + # If "weights" is a character string, must be of length 1 + if (length(weights) > 1) { + insight::format_error( + "If `weights` is a string indicating a variable name, `weights` must be of length 1." + ) + } + # if "weights" is a character, "x" must be a data frame + if (!is.data.frame(x)) { + insight::format_error("If `weights` is a string indicating a variable name, `x` must be a data frame.") # nolint + } + # is "by" a column in "x"? + if (!weights %in% colnames(x)) { + insight::format_error(sprintf( + "The variable specified in `weights` was not found in `x`. %s", + .misspelled_string(names(x), weights, "Possibly misspelled?") + )) + } + weights <- x[[weights]] + } + # is "by" of same length as "x"? + if (is.data.frame(x) && length(weights) != nrow(x)) { + insight::format_error("Length of `weights` must be equal to number of rows in `x`.") # nolint + } + if (!is.data.frame(x) && length(weights) != length(x)) { + insight::format_error("Length of `weights` must be equal to length of `x`.") # nolint + } + } + + weights +} diff --git a/inst/WORDLIST b/inst/WORDLIST index d48430478..5b12a2523 100644 --- a/inst/WORDLIST +++ b/inst/WORDLIST @@ -50,6 +50,8 @@ bmwiernik codebook codebooks coercible +crosstable +crosstables csv de doi diff --git a/man/data_tabulate.Rd b/man/data_tabulate.Rd index 5332f386f..0a155ee22 100644 --- a/man/data_tabulate.Rd +++ b/man/data_tabulate.Rd @@ -4,14 +4,17 @@ \alias{data_tabulate} \alias{data_tabulate.default} \alias{data_tabulate.data.frame} -\title{Create frequency tables of variables} +\title{Create frequency and crosstables of variables} \usage{ data_tabulate(x, ...) \method{data_tabulate}{default}( x, + by = NULL, drop_levels = FALSE, weights = NULL, + include_na = TRUE, + proportions = NULL, name = NULL, verbose = TRUE, ... @@ -23,9 +26,12 @@ data_tabulate(x, ...) exclude = NULL, ignore_case = FALSE, regex = FALSE, - collapse = FALSE, + by = NULL, drop_levels = FALSE, weights = NULL, + include_na = TRUE, + proportions = NULL, + collapse = FALSE, verbose = TRUE, ... ) @@ -35,13 +41,25 @@ data_tabulate(x, ...) \item{...}{not used.} -\item{drop_levels}{Logical, if \code{TRUE}, factor levels that do not occur in +\item{by}{Optional vector or factor. If supplied, a crosstable is created. +If \code{x} is a data frame, \code{by} can also be a character string indicating the +name of a variable in \code{x}.} + +\item{drop_levels}{Logical, if \code{FALSE}, factor levels that do not occur in the data are included in the table (with frequency of zero), else unused factor levels are dropped from the frequency table.} \item{weights}{Optional numeric vector of weights. Must be of the same length as \code{x}. If \code{weights} is supplied, weighted frequencies are calculated.} +\item{include_na}{Logical, if \code{TRUE}, missing values are included in the +frequency or crosstable, else missing values are omitted.} + +\item{proportions}{Optional character string, indicating the type of +percentages to be calculated. Only applies to crosstables, i.e. when \code{by} is +not \code{NULL}. Can be \code{"row"} (row percentages), \code{"column"} (column percentages) +or \code{"full"} (to calculate relative frequencies for the full table).} + \item{name}{Optional character string, which includes the name that is used for printing.} @@ -104,17 +122,23 @@ A data frame, or a list of data frames, with one frequency table as data frame per variable. } \description{ -This function creates frequency tables of variables, including -the number of levels/values as well as the distribution of raw, valid and -cumulative percentages. +This function creates frequency or crosstables of variables, +including the number of levels/values as well as the distribution of raw, +valid and cumulative percentages. For crosstables, row, column and cell +percentages can be calculated. } \examples{ \dontshow{if (requireNamespace("poorman")) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} +# frequency tables ------- +# ------------------------ data(efc) # vector/factor data_tabulate(efc$c172code) +# drop missing values +data_tabulate(efc$c172code, include_na = FALSE) + # data frame data_tabulate(efc, c("e42dep", "c172code")) @@ -141,5 +165,30 @@ print(data_tabulate(x), big_mark = "") set.seed(123) efc$weights <- abs(rnorm(n = nrow(efc), mean = 1, sd = 0.5)) data_tabulate(efc$e42dep, weights = efc$weights) + +# crosstables ------ +# ------------------ + +# add some missing values +set.seed(123) +efc$e16sex[sample.int(nrow(efc), 5)] <- NA + +data_tabulate(efc, "c172code", by = "e16sex") + +# add row and column percentages +data_tabulate(efc, "c172code", by = "e16sex", proportions = "row") +data_tabulate(efc, "c172code", by = "e16sex", proportions = "column") + +# omit missing values +data_tabulate( + efc$c172code, + by = efc$e16sex, + proportions = "column", + include_na = FALSE +) + +# round percentages +out <- data_tabulate(efc, "c172code", by = "e16sex", proportions = "column") +print(out, digits = 0) \dontshow{\}) # examplesIf} } diff --git a/tests/testthat/_snaps/data_tabulate.md b/tests/testthat/_snaps/data_tabulate.md index 7f482ad86..e4b3e8628 100644 --- a/tests/testthat/_snaps/data_tabulate.md +++ b/tests/testthat/_snaps/data_tabulate.md @@ -50,8 +50,9 @@ | 4 | 67 | 63.81 | 67.00 | 100.00 | | 5 | 4.76 | | ---------+-------+----+-------+---------+------------- - e16sex | 1 | 50 | 47.62 | 100.00 | 100.00 - | 2 | 55 | 52.38 | | + e16sex | 1 | 50 | 47.62 | 47.62 | 47.62 + | 2 | 55 | 52.38 | 52.38 | 100.00 + | | 0 | 0.00 | | ------------------------------------------------------ --- @@ -69,9 +70,10 @@ [8] "| | 4| 67| 63.81| 67.00| 100.00|" [9] "| | (NA)| 5| 4.76| (NA)| (NA)|" [10] "| | | | | | |" - [11] "|e16sex | 1| 50| 47.62| 100.00| 100.00|" - [12] "| | 2| 55| 52.38| (NA)| (NA)|" - [13] "| | | | | | |" + [11] "|e16sex | 1| 50| 47.62| 47.62| 47.62|" + [12] "| | 2| 55| 52.38| 52.38| 100.00|" + [13] "| | (NA)| 0| 0.00| (NA)| (NA)|" + [14] "| | | | | | |" attr(,"format") [1] "pipe" attr(,"class") @@ -239,3 +241,310 @@ | | | 2 | 3.70 | | ------------------------------------------------------------------- +# data_tabulate, cross tables + + Code + print(data_tabulate(efc$c172code, by = efc$e16sex, proportions = "full")) + Output + efc$c172code | male | female | | Total + -------------+------------+------------+----------+------ + 1 | 5 (5.0%) | 2 (2.0%) | 1 (1.0%) | 8 + 2 | 31 (31.0%) | 33 (33.0%) | 2 (2.0%) | 66 + 3 | 4 (4.0%) | 11 (11.0%) | 1 (1.0%) | 16 + | 5 (5.0%) | 4 (4.0%) | 1 (1.0%) | 10 + -------------+------------+------------+----------+------ + Total | 45 | 50 | 5 | 100 + +--- + + Code + print(data_tabulate(efc$c172code, by = efc$e16sex, proportions = "full", + include_na = FALSE)) + Output + efc$c172code | male | female | Total + -------------+------------+------------+------ + 1 | 5 (5.8%) | 2 (2.3%) | 7 + 2 | 31 (36.0%) | 33 (38.4%) | 64 + 3 | 4 (4.7%) | 11 (12.8%) | 15 + -------------+------------+------------+------ + Total | 40 | 46 | 86 + +--- + + Code + print(data_tabulate(efc$c172code, by = efc$e16sex, proportions = "full", + weights = efc$weights)) + Output + efc$c172code | male | female | | Total + -------------+------------+------------+----------+------ + 1 | 5 (4.8%) | 3 (2.9%) | 2 (1.9%) | 10 + 2 | 32 (30.5%) | 32 (30.5%) | 3 (2.9%) | 67 + 3 | 3 (2.9%) | 11 (10.5%) | 1 (1.0%) | 15 + | 8 (7.6%) | 5 (4.8%) | 1 (1.0%) | 14 + -------------+------------+------------+----------+------ + Total | 48 | 51 | 7 | 105 + +--- + + Code + print(data_tabulate(efc$c172code, by = efc$e16sex, proportions = "full", + include_na = FALSE, weights = efc$weights)) + Output + efc$c172code | male | female | Total + -------------+------------+------------+------ + 1 | 5 (5.8%) | 3 (3.5%) | 8 + 2 | 32 (37.2%) | 32 (37.2%) | 64 + 3 | 3 (3.5%) | 11 (12.8%) | 14 + -------------+------------+------------+------ + Total | 40 | 46 | 86 + +--- + + Code + print(data_tabulate(efc, "c172code", by = efc$e16sex, proportions = "row")) + Output + c172code | male | female | | Total + ---------+------------+------------+-----------+------ + 1 | 5 (62.5%) | 2 (25.0%) | 1 (12.5%) | 8 + 2 | 31 (47.0%) | 33 (50.0%) | 2 (3.0%) | 66 + 3 | 4 (25.0%) | 11 (68.8%) | 1 (6.2%) | 16 + | 5 (50.0%) | 4 (40.0%) | 1 (10.0%) | 10 + ---------+------------+------------+-----------+------ + Total | 45 | 50 | 5 | 100 + + +--- + + Code + print(data_tabulate(efc, "c172code", by = efc$e16sex, proportions = "row", + include_na = FALSE)) + Output + c172code | male | female | Total + ---------+------------+------------+------ + 1 | 5 (71.4%) | 2 (28.6%) | 7 + 2 | 31 (48.4%) | 33 (51.6%) | 64 + 3 | 4 (26.7%) | 11 (73.3%) | 15 + ---------+------------+------------+------ + Total | 40 | 46 | 86 + + +--- + + Code + print(data_tabulate(efc, "c172code", by = efc$e16sex, proportions = "row", + weights = efc$weights)) + Output + c172code | male | female | | Total + ---------+------------+------------+-----------+------ + 1 | 5 (50.0%) | 3 (30.0%) | 2 (20.0%) | 10 + 2 | 32 (47.8%) | 32 (47.8%) | 3 (4.5%) | 67 + 3 | 3 (20.0%) | 11 (73.3%) | 1 (6.7%) | 15 + | 8 (57.1%) | 5 (35.7%) | 1 (7.1%) | 14 + ---------+------------+------------+-----------+------ + Total | 48 | 51 | 7 | 105 + + +--- + + Code + print(data_tabulate(efc, "c172code", by = efc$e16sex, proportions = "row", + include_na = FALSE, weights = efc$weights)) + Output + c172code | male | female | Total + ---------+------------+------------+------ + 1 | 5 (62.5%) | 3 (37.5%) | 8 + 2 | 32 (50.0%) | 32 (50.0%) | 64 + 3 | 3 (21.4%) | 11 (78.6%) | 14 + ---------+------------+------------+------ + Total | 40 | 46 | 86 + + +--- + + Code + print(data_tabulate(efc, "c172code", by = "e16sex", proportions = "column")) + Output + c172code | male | female | | Total + ---------+------------+------------+-----------+------ + 1 | 5 (11.1%) | 2 (4.0%) | 1 (20.0%) | 8 + 2 | 31 (68.9%) | 33 (66.0%) | 2 (40.0%) | 66 + 3 | 4 (8.9%) | 11 (22.0%) | 1 (20.0%) | 16 + | 5 (11.1%) | 4 (8.0%) | 1 (20.0%) | 10 + ---------+------------+------------+-----------+------ + Total | 45 | 50 | 5 | 100 + + +--- + + Code + print(data_tabulate(efc, "c172code", by = "e16sex", proportions = "column", + include_na = FALSE)) + Output + c172code | male | female | Total + ---------+------------+------------+------ + 1 | 5 (12.5%) | 2 (4.3%) | 7 + 2 | 31 (77.5%) | 33 (71.7%) | 64 + 3 | 4 (10.0%) | 11 (23.9%) | 15 + ---------+------------+------------+------ + Total | 40 | 46 | 86 + + +--- + + Code + print(data_tabulate(efc, "c172code", by = "e16sex", proportions = "column", + weights = "weights")) + Output + c172code | male | female | | Total + ---------+------------+------------+-----------+------ + 1 | 5 (10.4%) | 3 (5.9%) | 2 (28.6%) | 10 + 2 | 32 (66.7%) | 32 (62.7%) | 3 (42.9%) | 67 + 3 | 3 (6.2%) | 11 (21.6%) | 1 (14.3%) | 15 + | 8 (16.7%) | 5 (9.8%) | 1 (14.3%) | 14 + ---------+------------+------------+-----------+------ + Total | 48 | 51 | 7 | 105 + + +--- + + Code + print(data_tabulate(efc, "c172code", by = "e16sex", proportions = "column", + include_na = FALSE, weights = "weights")) + Output + c172code | male | female | Total + ---------+------------+------------+------ + 1 | 5 (12.5%) | 3 (6.5%) | 8 + 2 | 32 (80.0%) | 32 (69.6%) | 64 + 3 | 3 (7.5%) | 11 (23.9%) | 14 + ---------+------------+------------+------ + Total | 40 | 46 | 86 + + +# data_tabulate, cross tables, grouped df + + Code + print(data_tabulate(grp, "c172code", by = "e16sex", proportions = "row")) + Output + Grouped by e42dep (1) + + c172code | male | | Total + ---------+------------+------------+------ + 2 | 2 (100.0%) | 0 (0.0%) | 2 + | 0 (NaN%) | 0 (NaN%) | 0 + ---------+------------+------------+------ + Total | 2 | 0 | 2 + + Grouped by e42dep (2) + + c172code | male | female | | Total + ---------+-----------+-----------+-----------+------ + 2 | 2 (50.0%) | 2 (50.0%) | 0 (0.0%) | 4 + | 0 (NaN%) | 0 (NaN%) | 0 (NaN%) | 0 + ---------+-----------+-----------+-----------+------ + Total | 2 | 2 | 0 | 4 + + Grouped by e42dep (3) + + c172code | male | female | | Total + ---------+-----------+------------+-----------+------ + 1 | 2 (50.0%) | 2 (50.0%) | 0 (0.0%) | 4 + 2 | 4 (25.0%) | 11 (68.8%) | 1 (6.2%) | 16 + 3 | 1 (16.7%) | 5 (83.3%) | 0 (0.0%) | 6 + | 1 (50.0%) | 0 (0.0%) | 1 (50.0%) | 2 + ---------+-----------+------------+-----------+------ + Total | 8 | 18 | 2 | 28 + + Grouped by e42dep (4) + + c172code | male | female | | Total + ---------+------------+------------+-----------+------ + 1 | 3 (75.0%) | 0 (0.0%) | 1 (25.0%) | 4 + 2 | 23 (54.8%) | 18 (42.9%) | 1 (2.4%) | 42 + 3 | 3 (30.0%) | 6 (60.0%) | 1 (10.0%) | 10 + | 3 (42.9%) | 4 (57.1%) | 0 (0.0%) | 7 + ---------+------------+------------+-----------+------ + Total | 32 | 28 | 3 | 63 + + Grouped by e42dep (NA) + + c172code | male | female | | Total + ---------+------------+------------+------------+------ + 2 | 0 (0.0%) | 2 (100.0%) | 0 (0.0%) | 2 + | 1 (100.0%) | 0 (0.0%) | 0 (0.0%) | 1 + ---------+------------+------------+------------+------ + Total | 1 | 2 | 0 | 3 + + +# data_tabulate, cross tables, markdown + + Code + print_md(data_tabulate(efc$c172code, by = efc$e16sex, proportions = "full")) + Output + [1] "|efc$c172code | male| female| (NA) | Total|" + [2] "|:------------|----------:|----------:|:--------|-----:|" + [3] "|1 | 5 (5.0%)| 2 (2.0%)|1 (1.0%) | 8|" + [4] "|2 | 31 (31.0%)| 33 (33.0%)|2 (2.0%) | 66|" + [5] "|3 | 4 (4.0%)| 11 (11.0%)|1 (1.0%) | 16|" + [6] "| | 5 (5.0%)| 4 (4.0%)|1 (1.0%) | 10|" + [7] "| | | | | |" + [8] "|Total | 45| 50| 5 | 100|" + attr(,"format") + [1] "pipe" + attr(,"class") + [1] "knitr_kable" "character" + +--- + + Code + print_md(data_tabulate(efc$c172code, by = efc$e16sex, proportions = "full", + include_na = FALSE)) + Output + [1] "|efc$c172code | male| female| Total|" + [2] "|:------------|----------:|----------:|-----:|" + [3] "|1 | 5 (5.8%)| 2 (2.3%)| 7|" + [4] "|2 | 31 (36.0%)| 33 (38.4%)| 64|" + [5] "|3 | 4 (4.7%)| 11 (12.8%)| 15|" + [6] "| | | | |" + [7] "|Total | 40| 46| 86|" + attr(,"format") + [1] "pipe" + attr(,"class") + [1] "knitr_kable" "character" + +--- + + Code + print_md(data_tabulate(efc$c172code, by = efc$e16sex, proportions = "full", + weights = efc$weights)) + Output + [1] "|efc$c172code | male| female| (NA) | Total|" + [2] "|:------------|----------:|----------:|:--------|-----:|" + [3] "|1 | 5 (4.8%)| 3 (2.9%)|2 (1.9%) | 10|" + [4] "|2 | 32 (30.5%)| 32 (30.5%)|3 (2.9%) | 67|" + [5] "|3 | 3 (2.9%)| 11 (10.5%)|1 (1.0%) | 15|" + [6] "| | 8 (7.6%)| 5 (4.8%)|1 (1.0%) | 14|" + [7] "| | | | | |" + [8] "|Total | 48| 51| 7 | 105|" + attr(,"format") + [1] "pipe" + attr(,"class") + [1] "knitr_kable" "character" + +--- + + Code + print_md(data_tabulate(efc$c172code, by = efc$e16sex, proportions = "full", + include_na = FALSE, weights = efc$weights)) + Output + [1] "|efc$c172code | male| female| Total|" + [2] "|:------------|----------:|----------:|-----:|" + [3] "|1 | 5 (5.8%)| 3 (3.5%)| 8|" + [4] "|2 | 32 (37.2%)| 32 (37.2%)| 64|" + [5] "|3 | 3 (3.5%)| 11 (12.8%)| 14|" + [6] "| | | | |" + [7] "|Total | 40| 46| 86|" + attr(,"format") + [1] "pipe" + attr(,"class") + [1] "knitr_kable" "character" + diff --git a/tests/testthat/test-data_tabulate.R b/tests/testthat/test-data_tabulate.R index 4390c1af5..1ca47a967 100644 --- a/tests/testthat/test-data_tabulate.R +++ b/tests/testthat/test-data_tabulate.R @@ -1,6 +1,5 @@ -data(efc) - test_that("data_tabulate factor", { + data(efc, package = "datawizard") x <- data_tabulate(efc$e42dep) expect_identical(as.vector(x$Value), as.vector(sort(unique( addNA(efc$e42dep) @@ -17,6 +16,7 @@ test_that("data_tabulate factor", { test_that("data_tabulate numeric", { + data(efc, package = "datawizard") x <- data_tabulate(efc$neg_c_7) expect_identical(as.vector(x$Value), as.vector(sort(unique( addNA(efc$neg_c_7) @@ -32,6 +32,14 @@ test_that("data_tabulate numeric", { }) +test_that("data_tabulate, HTML", { + skip_if_not_installed("gt") + data(efc, package = "datawizard") + expect_s3_class(print_html(data_tabulate(efc$c172code)), "gt_tbl") + expect_s3_class(print_html(data_tabulate(efc, "c172code")), "gt_tbl") +}) + + test_that("data_tabulate, weights", { data(efc, package = "datawizard") set.seed(123) @@ -62,6 +70,7 @@ test_that("data_tabulate, weights", { test_that("data_tabulate data.frame", { + data(efc, package = "datawizard") x <- data_tabulate(efc, c("e16sex", "c172code")) expect_s3_class(x, "list") expect_length(x, 2L) @@ -116,6 +125,12 @@ test_that("data_tabulate data.frame", { }) +test_that("data_tabulate unsupported class", { + data(mtcars) + expect_warning(data_tabulate(lm(mpg ~ hp, data = mtcars)), regex = "Can't compute frequency tables") +}) + + test_that("data_tabulate print", { set.seed(123) x <- sample.int(3, 1e6, TRUE) @@ -138,11 +153,13 @@ test_that("data_tabulate print", { test_that("data_tabulate print", { + data(efc, package = "datawizard") expect_snapshot(data_tabulate(efc$e42dep)) }) test_that("data_tabulate print multiple", { + data(efc, package = "datawizard") expect_snapshot(data_tabulate(efc, c("c172code", "e16sex"))) }) @@ -157,6 +174,7 @@ test_that("data_tabulate big numbers", { test_that("data_tabulate print multiple, collapse", { skip_if_not(packageVersion("insight") > "0.17.0", "insight must be >= 0.17.0") + data(efc, package = "datawizard") expect_snapshot(data_tabulate(efc, c("c172code", "e16sex"), collapse = TRUE)) }) @@ -164,6 +182,7 @@ test_that("data_tabulate print multiple, collapse", { test_that("data_tabulate grouped data.frame", { skip_if_not_installed("poorman") + data(efc, package = "datawizard") x <- data_tabulate(poorman::group_by(efc, e16sex), "c172code") expect_s3_class(x, "list") expect_length(x, 2L) @@ -213,11 +232,13 @@ test_that("data_tabulate grouped data.frame", { test_that("data_tabulate print grouped data", { skip_if_not_installed("poorman") + data(efc, package = "datawizard") expect_snapshot(data_tabulate(poorman::group_by(efc, e16sex), "c172code")) }) test_that("data_tabulate print, collapse groups", { skip_if_not_installed("poorman") + data(efc, package = "datawizard") expect_snapshot( data_tabulate(poorman::group_by(efc, e16sex), "c172code", collapse = TRUE) ) @@ -225,6 +246,7 @@ test_that("data_tabulate print, collapse groups", { test_that("data_tabulate print, collapse groups, drop levels", { skip_if_not_installed("poorman") + data(efc, package = "datawizard") expect_snapshot( data_tabulate( poorman::group_by(efc, e16sex), @@ -235,11 +257,143 @@ test_that("data_tabulate print, collapse groups, drop levels", { ) }) +test_that("data_tabulate drop levels", { + x <- factor(rep(letters[1:3], 3), levels = letters[1:5]) + out1 <- data_tabulate(x, drop_levels = FALSE) + out2 <- data_tabulate(x, drop_levels = TRUE) + expect_identical(out1$N, c(3L, 3L, 3L, 0L, 0L, 0L)) + expect_identical(as.character(out1$Value), c("a", "b", "c", "d", "e", NA)) + expect_identical(out2$N, c(3L, 3L, 3L, 0L)) + expect_identical(as.character(out2$Value), c("a", "b", "c", NA)) +}) + # select helpers ------------------------------ test_that("data_tabulate regex", { + data(mtcars) expect_identical( data_tabulate(mtcars, select = "arb", regex = TRUE), data_tabulate(mtcars, select = "carb") ) }) + + +# missing values ------------------------------ + +test_that("data_tabulate exclude/include missing values", { + data(efc, package = "datawizard") + set.seed(123) + efc$weights <- abs(rnorm(n = nrow(efc), mean = 1, sd = 0.5)) + efc$e16sex[sample.int(nrow(efc), 5)] <- NA + out <- data_tabulate(efc$c172code) + expect_identical(out$N, c(8L, 66L, 16L, 10L)) + out <- data_tabulate(efc$c172code, include_na = FALSE) + expect_identical(out$N, c(8L, 66L, 16L)) + out <- data_tabulate(efc$c172code, weights = efc$weights) + expect_identical(out$N, c(10, 67, 15, 13)) + out <- data_tabulate(efc$c172code, include_na = FALSE, weights = efc$weights) + expect_identical(out$N, c(10, 67, 15)) +}) + + +# cross tables ------------------------------ +test_that("data_tabulate, cross tables", { + data(efc, package = "datawizard") + set.seed(123) + efc$weights <- abs(rnorm(n = nrow(efc), mean = 1, sd = 0.5)) + efc$e16sex[sample.int(nrow(efc), 5)] <- NA + + expect_snapshot(print(data_tabulate(efc$c172code, by = efc$e16sex, proportions = "full"))) + expect_snapshot(print(data_tabulate(efc$c172code, by = efc$e16sex, proportions = "full", include_na = FALSE))) + expect_snapshot(print(data_tabulate(efc$c172code, by = efc$e16sex, proportions = "full", weights = efc$weights))) + expect_snapshot(print(data_tabulate(efc$c172code, by = efc$e16sex, proportions = "full", include_na = FALSE, weights = efc$weights))) # nolint + expect_snapshot(print(data_tabulate(efc, "c172code", by = efc$e16sex, proportions = "row"))) + expect_snapshot(print(data_tabulate(efc, "c172code", by = efc$e16sex, proportions = "row", include_na = FALSE))) + expect_snapshot(print(data_tabulate(efc, "c172code", by = efc$e16sex, proportions = "row", weights = efc$weights))) + expect_snapshot(print(data_tabulate(efc, "c172code", by = efc$e16sex, proportions = "row", include_na = FALSE, weights = efc$weights))) # nolint + expect_snapshot(print(data_tabulate(efc, "c172code", by = "e16sex", proportions = "column"))) + expect_snapshot(print(data_tabulate(efc, "c172code", by = "e16sex", proportions = "column", include_na = FALSE))) + expect_snapshot(print(data_tabulate(efc, "c172code", by = "e16sex", proportions = "column", weights = "weights"))) + expect_snapshot(print(data_tabulate(efc, "c172code", by = "e16sex", proportions = "column", include_na = FALSE, weights = "weights"))) # nolint +}) + +test_that("data_tabulate, cross tables, HTML", { + skip_if_not_installed("gt") + data(efc, package = "datawizard") + set.seed(123) + efc$weights <- abs(rnorm(n = nrow(efc), mean = 1, sd = 0.5)) + efc$e16sex[sample.int(nrow(efc), 5)] <- NA + + expect_s3_class(print_html(data_tabulate(efc$c172code, by = efc$e16sex, proportions = "full")), "gt_tbl") + expect_s3_class(print_html(data_tabulate(efc$c172code, by = efc$e16sex, proportions = "full", include_na = FALSE)), "gt_tbl") # nolint + expect_s3_class(print_html(data_tabulate(efc$c172code, by = efc$e16sex, proportions = "full", weights = efc$weights)), "gt_tbl") # nolint + expect_s3_class(print_html(data_tabulate(efc$c172code, by = efc$e16sex, proportions = "full", include_na = FALSE, weights = efc$weights)), "gt_tbl") # nolint + expect_s3_class(print_html(data_tabulate(efc, "c172code", by = efc$e16sex, proportions = "row")), "gt_tbl") + expect_s3_class(print_html(data_tabulate(efc, "c172code", by = efc$e16sex, proportions = "row", include_na = FALSE, weights = efc$weights)), "gt_tbl") # nolint +}) + +test_that("data_tabulate, cross tables, grouped df", { + data(efc, package = "datawizard") + set.seed(123) + efc$weights <- abs(rnorm(n = nrow(efc), mean = 1, sd = 0.5)) + efc$e16sex[sample.int(nrow(efc), 5)] <- NA + grp <- data_group(efc, "e42dep") + expect_snapshot(print(data_tabulate(grp, "c172code", by = "e16sex", proportions = "row"))) +}) + +test_that("data_tabulate, cross tables, errors by", { + data(efc, package = "datawizard") + set.seed(123) + efc$weights <- abs(rnorm(n = nrow(efc), mean = 1, sd = 0.5)) + efc$e16sex[sample.int(nrow(efc), 5)] <- NA + expect_error(data_tabulate(efc$c172code, by = "e16sex"), regex = "If `by` is a string") + expect_error(data_tabulate(efc$c172code, by = efc$e16sex[-1]), regex = "Length of `by`") + expect_error(data_tabulate(efc, "c172code", by = efc$e16sex[-1]), regex = "Length of `by`") + expect_error(data_tabulate(efc, "c172code", by = "c16sex"), regex = "not found") + expect_error(data_tabulate(efc, "c172code", by = c("e16sex", "e42dep")), regex = "You may use") +}) + +test_that("data_tabulate, cross tables, errors weights", { + data(efc, package = "datawizard") + set.seed(123) + efc$weights <- abs(rnorm(n = nrow(efc), mean = 1, sd = 0.5)) + efc$e16sex[sample.int(nrow(efc), 5)] <- NA + expect_error(data_tabulate(efc$c172code, weights = "weights"), regex = "If `weights`") + expect_error(data_tabulate(efc$c172code, weights = efc$weights[-1]), regex = "Length of `weights`") + expect_error(data_tabulate(efc, "c172code", weights = efc$weights[-1]), regex = "Length of `weights`") + expect_error(data_tabulate(efc, "c172code", weights = "weigths"), regex = "not found") + expect_error(data_tabulate(efc, "c172code", weights = c("e16sex", "e42dep")), regex = "length 1") +}) + + +# markdown ------------------------- + +test_that("data_tabulate, cross tables, markdown", { + data(efc, package = "datawizard") + set.seed(123) + efc$weights <- abs(rnorm(n = nrow(efc), mean = 1, sd = 0.5)) + efc$e16sex[sample.int(nrow(efc), 5)] <- NA + + expect_snapshot(print_md(data_tabulate(efc$c172code, by = efc$e16sex, proportions = "full"))) + expect_snapshot(print_md(data_tabulate(efc$c172code, by = efc$e16sex, proportions = "full", include_na = FALSE))) + expect_snapshot(print_md(data_tabulate(efc$c172code, by = efc$e16sex, proportions = "full", weights = efc$weights))) + expect_snapshot(print_md(data_tabulate(efc$c172code, by = efc$e16sex, proportions = "full", include_na = FALSE, weights = efc$weights))) # nolint +}) + +# validate against table ------------------------- + +test_that("data_tabulate, validate against table", { + data(mtcars) + # frequency table + out1 <- as.data.frame(table(mtcars$cyl)) + out2 <- data_tabulate(mtcars$cyl, include_na = FALSE) + expect_identical(out1$Freq, out2$N) + # crosstable + out1 <- data_arrange(as.data.frame(table(mtcars$cyl, mtcars$gear)), c("Var1", "Var2")) + out2 <- data_rename(data_to_long( + as.data.frame(data_tabulate(mtcars$cyl, by = mtcars$gear, include_na = FALSE)), 2:4, + names_to = "Var2", values_to = "Freq" + ), "mtcars$cyl", "Var1") + out1[[2]] <- as.character(out1[[2]]) + expect_equal(out1, out2, ignore_attr = TRUE) +})