diff --git a/NAMESPACE b/NAMESPACE index a2a911db3..e539c8c79 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -67,6 +67,7 @@ S3method(describe_distribution,numeric) S3method(format,data_codebook) S3method(format,dw_data_peek) S3method(format,dw_data_tabulate) +S3method(format,dw_data_xtabulate) S3method(format,dw_groupmeans) S3method(format,parameters_distribution) S3method(kurtosis,data.frame) @@ -91,6 +92,8 @@ S3method(print,data_seek) S3method(print,dw_data_peek) S3method(print,dw_data_tabulate) S3method(print,dw_data_tabulates) +S3method(print,dw_data_xtabulate) +S3method(print,dw_data_xtabulates) S3method(print,dw_groupmeans) S3method(print,dw_groupmeans_list) S3method(print,dw_transformer) diff --git a/R/data_arrange.R b/R/data_arrange.R index 5666fde42..58c8a5ab3 100644 --- a/R/data_arrange.R +++ b/R/data_arrange.R @@ -46,8 +46,8 @@ data_arrange.default <- function(data, select = NULL, safe = TRUE) { dont_exist <- select[which(!select %in% names(data))] if (length(dont_exist) > 0) { - if (!safe) { - insight::format_error( + if (safe) { + insight::format_warning( paste0( "The following column(s) don't exist in the dataset: ", text_concatenate(dont_exist), "." @@ -55,7 +55,7 @@ data_arrange.default <- function(data, select = NULL, safe = TRUE) { .misspelled_string(names(data), dont_exist, "Possibly misspelled?") ) } else { - insight::format_warning( + insight::format_error( paste0( "The following column(s) don't exist in the dataset: ", text_concatenate(dont_exist), "." diff --git a/R/data_tabulate.R b/R/data_tabulate.R index 4d3bb810b..42f71d8a6 100644 --- a/R/data_tabulate.R +++ b/R/data_tabulate.R @@ -6,11 +6,16 @@ #' cumulative percentages. #' #' @param x A (grouped) data frame, a vector or factor. +#' @param by Optional vector or factor. If supplied, a crosstable is created. +#' If `x` is a data frame, `by` can also be a character string indicating the +#' name of a variable in `x`. #' @param drop_levels Logical, if `TRUE`, factor levels that do not occur in #' the data are included in the table (with frequency of zero), else unused #' factor levels are dropped from the frequency table. #' @param name Optional character string, which includes the name that is used #' for printing. +#' @param include_na Logical, if `TRUE`, missing values are included in the +#' frequency or cross table, else missing values are omitted. #' @param collapse Logical, if `TRUE` collapses multiple tables into one larger #' table for printing. This affects only printing, not the returned object. #' @param weights Optional numeric vector of weights. Must be of the same length @@ -62,7 +67,15 @@ data_tabulate <- function(x, ...) { #' @rdname data_tabulate #' @export -data_tabulate.default <- function(x, drop_levels = FALSE, weights = NULL, name = NULL, verbose = TRUE, ...) { +data_tabulate.default <- function(x, + by = NULL, + drop_levels = FALSE, + weights = NULL, + include_na = TRUE, + proportions = NULL, + name = NULL, + verbose = TRUE, + ...) { # save label attribute, before it gets lost... var_label <- attr(x, "label", exact = TRUE) @@ -83,20 +96,42 @@ data_tabulate.default <- function(x, drop_levels = FALSE, weights = NULL, name = insight::format_error("Length of `weights` must be equal to length of `x`.") } + # we go into another function for crosstables here... + if (!is.null(by)) { + by <- .validate_by(by, x) + return(.crosstable(x, by, weights, proportions, obj_name, group_variable, ...)) + } + # frequency table if (is.null(weights)) { - freq_table <- tryCatch(table(addNA(x)), error = function(e) NULL) + if (include_na) { + freq_table <- tryCatch(table(addNA(x)), error = function(e) NULL) + } else { + freq_table <- tryCatch(table(x), error = function(e) NULL) + } } else { # weighted frequency table - freq_table <- tryCatch( - stats::xtabs( - weights ~ x, - data = data.frame(weights = weights, x = x), - na.action = stats::na.pass, - addNA = TRUE - ), - error = function(e) NULL - ) + if (include_na) { + freq_table <- tryCatch( + stats::xtabs( + weights ~ x, + data = data.frame(weights = weights, x = addNA(x)), + na.action = stats::na.pass, + addNA = TRUE + ), + error = function(e) NULL + ) + } else { + freq_table <- tryCatch( + stats::xtabs( + weights ~ x, + data = data.frame(weights = weights, x = x), + na.action = stats::na.omit, + addNA = FALSE + ), + error = function(e) NULL + ) + } } if (is.null(freq_table)) { @@ -159,9 +194,12 @@ data_tabulate.data.frame <- function(x, exclude = NULL, ignore_case = FALSE, regex = FALSE, - collapse = FALSE, + by = NULL, drop_levels = FALSE, weights = NULL, + include_na = TRUE, + proportions = NULL, + collapse = FALSE, verbose = TRUE, ...) { # evaluate arguments @@ -172,11 +210,24 @@ data_tabulate.data.frame <- function(x, regex = regex, verbose = verbose ) + # validate "by" + by <- .validate_by(by, x) + out <- lapply(select, function(i) { - data_tabulate(x[[i]], drop_levels = drop_levels, weights = weights, name = i, verbose = verbose, ...) + data_tabulate( + x[[i]], + by = by, + proportions = proportions, + drop_levels = drop_levels, + weights = weights, + include_na = include_na, + name = i, + verbose = verbose, + ... + ) }) - class(out) <- c("dw_data_tabulates", "list") + class(out) <- ifelse(is.null(by), c("dw_data_tabulates", "list"), c("dw_data_xtabulates", "list")) attr(out, "collapse") <- isTRUE(collapse) attr(out, "is_weighted") <- !is.null(weights) @@ -190,10 +241,13 @@ data_tabulate.grouped_df <- function(x, exclude = NULL, ignore_case = FALSE, regex = FALSE, - verbose = TRUE, - collapse = FALSE, + by = NULL, + proportions = NULL, drop_levels = FALSE, weights = NULL, + include_na = TRUE, + collapse = FALSE, + verbose = TRUE, ...) { # works only for dplyr >= 0.8.0 grps <- attr(x, "groups", exact = TRUE) @@ -210,6 +264,7 @@ data_tabulate.grouped_df <- function(x, ) x <- as.data.frame(x) + out <- list() for (i in seq_along(grps)) { rows <- grps[[i]] @@ -227,11 +282,14 @@ data_tabulate.grouped_df <- function(x, verbose = verbose, drop_levels = drop_levels, weights = weights, + include_na = include_na, + by = by, + proportions = proportions, group_variable = group_variable, ... )) } - class(out) <- c("dw_data_tabulates", "list") + class(out) <- ifelse(is.null(by), c("dw_data_tabulates", "list"), c("dw_data_xtabulates", "list")) attr(out, "collapse") <- isTRUE(collapse) attr(out, "is_weighted") <- !is.null(weights) @@ -239,8 +297,6 @@ data_tabulate.grouped_df <- function(x, } - - # methods -------------------- #' @importFrom insight print_html @@ -286,7 +342,6 @@ format.dw_data_tabulate <- function(x, format = "text", big_mark = NULL, ...) { } - #' @export print.dw_data_tabulate <- function(x, big_mark = NULL, ...) { a <- attributes(x) diff --git a/R/data_xtabulate.R b/R/data_xtabulate.R new file mode 100644 index 000000000..078b4b98f --- /dev/null +++ b/R/data_xtabulate.R @@ -0,0 +1,240 @@ +# helper to compute crosstables -------------- + +.crosstable <- function(x, + by, + weights = NULL, + include_na = TRUE, + proportions = NULL, + obj_name = NULL, + group_variable = NULL, + ...) { + if (!is.null(proportions)) { + proportions <- match.arg(proportions, c("row", "column", "cell")) + } + # frequency table + if (is.null(weights)) { + if (include_na) { + x_table <- tryCatch(table(addNA(x), addNA(by)), error = function(e) NULL) + } else { + x_table <- tryCatch(table(x, by), error = function(e) NULL) + } + } else { + # weighted frequency table + if (include_na) { + x_table <- tryCatch( + stats::xtabs( + weights ~ x + by, + data = data.frame(weights = weights, x = addNA(x), by = addNA(by)), + na.action = stats::na.pass, + addNA = TRUE + ), + error = function(e) NULL + ) + } else { + x_table <- tryCatch( + stats::xtabs( + weights ~ x + by, + data = data.frame(weights = weights, x = x, by = by), + na.action = stats::na.omit, + addNA = FALSE + ), + error = function(e) NULL + ) + } + } + + if (is.null(x_table)) { + insight::format_warning(paste0("Can't compute cross tables for objects of class `", class(x)[1], "`.")) + return(NULL) + } + + out <- as.data.frame(stats::ftable(x_table)) + colnames(out) <- c("Value", "by", "N") + + total_n <- sum(out$N, na.rm = TRUE) + valid_n <- sum(out$N[!is.na(out$N)], na.rm = TRUE) + + out <- data_to_wide(out, values_from = "N", names_from = "by") + + # use variable name as column name + if (!is.null(obj_name)) { + colnames(out)[1] <- obj_name + } + + # for grouped data frames, add info about grouping variables + if (!is.null(group_variable)) { + var_info <- toString(lapply(colnames(group_variable), function(i) { + sprintf("%s (%s)", i, group_variable[[i]]) + })) + out <- cbind(out[1], data.frame(Group = var_info, stringsAsFactors = FALSE), out[-1]) + } + + attr(out, "total_n") <- total_n + attr(out, "valid_n") <- valid_n + attr(out, "weights") <- weights + attr(out, "proportions") <- proportions + + class(out) <- c("dw_data_xtabulate", "data.frame") + + out +} + + +#' @export +format.dw_data_xtabulate <- function(x, format = "text", digits = 1, big_mark = NULL, ...) { + # convert to character manually, else, for large numbers, + # format_table() returns scientific notation + x <- as.data.frame(x) + + # compute total N for rows and colummns + total_n <- attributes(x)$total_n + total_column <- rowSums(x[, -1], na.rm = TRUE) + total_row <- c(colSums(x[, -1], na.rm = TRUE), total_n) + + # proportions? + props <- attributes(x)$proportions + + if (!is.null(props)) { + # we copy x to tmp, because when we create strings with "sprintf()", the + # variable is coerced to character, and in subsequent iterations of the loop, + # mathemathical operations are not possible anymore + tmp <- x + if (identical(props, "row")) { + for (i in seq_len(nrow(x))) { + tmp[i, -1] <- sprintf( + "%s (%.*f%%)", + x[i, -1], + digits, + 100 * x[i, -1] / sum(x[i, -1], na.rm = TRUE) + ) + } + } else if (identical(props, "column")) { + for (i in seq_len(ncol(x))[-1]) { + tmp[, i] <- sprintf( + "%s (%.*f%%)", + x[, i], + digits, + 100 * x[, i] / sum(x[, i], na.rm = TRUE) + ) + } + } else if (identical(props, "cell")) { + for (i in seq_len(ncol(x))[-1]) { + tmp[, i] <- sprintf( + "%s (%.*f%%)", + x[, i], + digits, + 100 * x[, i] / total_n + ) + } + } + # copy back final result + x <- tmp + } + + x[] <- lapply(x, as.character) + + # format data frame + ftab <- insight::format_table(x, ...) + ftab[] <- lapply(ftab, function(i) { + i[i == ""] <- ifelse(identical(format, "text"), "", "(NA)") # nolint + i + }) + ftab$Total <- gsub("\\.00$", "", as.character(total_column)) + sub <- as.data.frame(t(data.frame( + rep("", ncol(ftab)), + c("Total", as.character(total_row)), + stringsAsFactors = FALSE + ))) + colnames(sub) <- colnames(ftab) + ftab <- rbind(ftab, sub) + ftab[nrow(ftab), ] <- gsub("\\.00$", "", ftab[nrow(ftab), ]) + + # insert big marks? + ftab$Total <- .add_commas_in_numbers(ftab$Total, big_mark) + ftab[nrow(ftab), ] <- .add_commas_in_numbers(ftab[nrow(ftab), ], big_mark) + + ftab +} + + +#' @export +print.dw_data_xtabulate <- function(x, big_mark = NULL, ...) { + a <- attributes(x) + a$total_n <- .add_commas_in_numbers(a$total_n, big_mark) + a$valid_n <- .add_commas_in_numbers(a$valid_n, big_mark) + + # grouped data? if yes, add information on grouping factor + if (is.null(x[["Group"]])) { + caption <- NULL + } else { + caption <- paste0("Grouped by ", x[["Group"]][1]) + } + + # summary of total and valid N (we may add mean/sd as well?) + summary_line <- sprintf( + "\ntotal N=%s valid N=%s%s\n", + a$total_n, + a$valid_n, + ifelse(is.null(a$weights), "", " (weighted)") + ) + + # print table + cat(insight::export_table( + format(x, big_mark = big_mark, ...), + cross = "+", + missing = "", + footer = summary_line, + caption = caption, + empty_line = "-" + )) + invisible(x) +} + + +#' @export +print.dw_data_xtabulates <- function(x, big_mark = NULL, ...) { + for (i in seq_along(x)) { + print(x[[i]], big_mark = big_mark, ...) + cat("\n") + } + invisible(x) +} + + +# helper --------------------- + +.validate_by <- function(by, x) { + if (!is.null(by)) { + if (is.character(by)) { + # If "by" is a character string, must be of length 1 + if (length(by) > 1) { + insight::format_error( + "If argument `by` is a character indicating a variable name, `by` must be of length 1.", + "You may use `data_group()` to group by multiple variables, then call `data_tabulate()`." + ) + } + # if "by" is a character, "x" must be a data frame + if (!is.data.frame(x)) { + insight::format_error("If argument `by` is a character indicating a variable name, `x` must be a data frame.") + } + # is "by" a column in "x"? + if (!by %in% colnames(x)) { + insight::format_error(sprintf( + "The variable specified in `by` was not found in `x`. %s", + .misspelled_string(names(x), by, "Possibly misspelled?") + )) + } + by <- x[[by]] + } + # is "by" of same length as "x"? + if (is.data.frame(x) && length(by) != nrow(x)) { + insight::format_error("The variable specified in `by` must have the same length as rows in `x`.") # nolint + } + if (!is.factor(by)) { + # coerce "by" to factor, including labels + by <- to_factor(by, labels_to_levels = TRUE, verbose = FALSE) + } + } + + by +} diff --git a/man/data_tabulate.Rd b/man/data_tabulate.Rd index 5332f386f..5d966b676 100644 --- a/man/data_tabulate.Rd +++ b/man/data_tabulate.Rd @@ -10,8 +10,11 @@ data_tabulate(x, ...) \method{data_tabulate}{default}( x, + by = NULL, drop_levels = FALSE, weights = NULL, + include_na = TRUE, + proportions = NULL, name = NULL, verbose = TRUE, ... @@ -23,9 +26,12 @@ data_tabulate(x, ...) exclude = NULL, ignore_case = FALSE, regex = FALSE, - collapse = FALSE, + by = NULL, drop_levels = FALSE, weights = NULL, + include_na = TRUE, + proportions = NULL, + collapse = FALSE, verbose = TRUE, ... ) @@ -35,6 +41,10 @@ data_tabulate(x, ...) \item{...}{not used.} +\item{by}{Optional vector or factor. If supplied, a crosstable is created. +If \code{x} is a data frame, \code{by} can also be a character string indicating the +name of a variable in \code{x}.} + \item{drop_levels}{Logical, if \code{TRUE}, factor levels that do not occur in the data are included in the table (with frequency of zero), else unused factor levels are dropped from the frequency table.} @@ -42,6 +52,9 @@ factor levels are dropped from the frequency table.} \item{weights}{Optional numeric vector of weights. Must be of the same length as \code{x}. If \code{weights} is supplied, weighted frequencies are calculated.} +\item{include_na}{Logical, if \code{TRUE}, missing values are included in the +frequency or cross table, else missing values are omitted.} + \item{name}{Optional character string, which includes the name that is used for printing.}