diff --git a/DESCRIPTION b/DESCRIPTION index 2ed1150ae..0a7fb9594 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Type: Package Package: datawizard Title: Easy Data Wrangling and Statistical Transformations -Version: 0.11.0.2 +Version: 0.11.0.3 Authors@R: c( person("Indrajeet", "Patil", , "patilindrajeet.science@gmail.com", role = "aut", comment = c(ORCID = "0000-0003-1995-6531", Twitter = "@patilindrajeets")), diff --git a/NAMESPACE b/NAMESPACE index 5926d19ab..a08798db1 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,5 +1,7 @@ # Generated by roxygen2: do not edit by hand +S3method(as.data.frame,datawizard_crosstabs) +S3method(as.data.frame,datawizard_tables) S3method(as.double,parameters_kurtosis) S3method(as.double,parameters_skewness) S3method(as.double,parameters_smoothness) @@ -69,9 +71,9 @@ S3method(describe_distribution,grouped_df) S3method(describe_distribution,list) S3method(describe_distribution,numeric) S3method(format,data_codebook) +S3method(format,datawizard_crosstab) +S3method(format,datawizard_table) S3method(format,dw_data_peek) -S3method(format,dw_data_tabulate) -S3method(format,dw_data_xtabulate) S3method(format,dw_groupmeans) S3method(format,parameters_distribution) S3method(kurtosis,data.frame) @@ -93,12 +95,12 @@ S3method(normalize,numeric) S3method(plot,visualisation_recipe) S3method(print,data_codebook) S3method(print,data_seek) +S3method(print,datawizard_crosstab) +S3method(print,datawizard_crosstabs) +S3method(print,datawizard_table) +S3method(print,datawizard_tables) S3method(print,dw_data_peek) S3method(print,dw_data_summary) -S3method(print,dw_data_tabulate) -S3method(print,dw_data_tabulates) -S3method(print,dw_data_xtabulate) -S3method(print,dw_data_xtabulates) S3method(print,dw_groupmeans) S3method(print,dw_groupmeans_list) S3method(print,dw_transformer) @@ -107,16 +109,16 @@ S3method(print,parameters_kurtosis) S3method(print,parameters_skewness) S3method(print,visualisation_recipe) S3method(print_html,data_codebook) +S3method(print_html,datawizard_crosstab) +S3method(print_html,datawizard_crosstabs) +S3method(print_html,datawizard_table) +S3method(print_html,datawizard_tables) S3method(print_html,dw_data_peek) -S3method(print_html,dw_data_tabulate) -S3method(print_html,dw_data_tabulates) -S3method(print_html,dw_data_xtabulate) -S3method(print_html,dw_data_xtabulates) S3method(print_md,data_codebook) +S3method(print_md,datawizard_crosstab) +S3method(print_md,datawizard_table) +S3method(print_md,datawizard_tables) S3method(print_md,dw_data_peek) -S3method(print_md,dw_data_tabulate) -S3method(print_md,dw_data_tabulates) -S3method(print_md,dw_data_xtabulate) S3method(ranktransform,data.frame) S3method(ranktransform,factor) S3method(ranktransform,grouped_df) diff --git a/NEWS.md b/NEWS.md index 1e49e91a4..ada1f08ea 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,10 +1,21 @@ # datawizard 0.11.0.1 -## Changes +BREAKING CHANGES + +* Class names for objects returned by `data_tabulate()` have been changed to + `datawizard_table` and `datawizard_crosstable` (resp. the plural forms, + `*_tables`), to provide a clearer and more consistent naming scheme. + +CHANGES * `data_select()` can directly rename selected variables when a named vector is provided in `select`, e.g. `data_select(mtcars, c(new1 = "mpg", new2 = "cyl"))`. +* `data_tabulate()` gains an `as.data.frame()` method, to return the frequency + table as a data frame. The structure of the returned object is a nested data + frame, where the first column contains name of the variable for which + frequencies were calculated, and the second column contains the frequency table. + # datawizard 0.11.0 BREAKING CHANGES diff --git a/R/data_tabulate.R b/R/data_tabulate.R index 6a26a39c9..74f4f2e03 100644 --- a/R/data_tabulate.R +++ b/R/data_tabulate.R @@ -28,6 +28,13 @@ #' @param ... not used. #' @inheritParams extract_column_names #' +#' @details +#' There is an `as.data.frame()` method, to return the frequency tables as a +#' data frame. The structure of the returned object is a nested data frame, +#' where the first column contains name of the variable for which frequencies +#' were calculated, and the second column is a list column that contains the +#' frequency tables as data frame. See 'Examples'. +#' #' @section Crosstables: #' If `by` is supplied, a crosstable is created. The crosstable includes `` #' (missing) values by default. The first column indicates values of `x`, the @@ -108,6 +115,12 @@ #' # round percentages #' out <- data_tabulate(efc, "c172code", by = "e16sex", proportions = "column") #' print(out, digits = 0) +#' +#' # coerce to data frames +#' result <- data_tabulate(efc, "c172code", by = "e16sex") +#' as.data.frame(result) +#' as.data.frame(result)$table +#' as.data.frame(result, add_total = TRUE)$table #' @export data_tabulate <- function(x, ...) { UseMethod("data_tabulate") @@ -242,7 +255,7 @@ data_tabulate.default <- function(x, attr(out, "total_n") <- sum(out$N, na.rm = TRUE) attr(out, "valid_n") <- valid_n - class(out) <- c("dw_data_tabulate", "data.frame") + class(out) <- c("datawizard_table", "data.frame") out } @@ -292,9 +305,9 @@ data_tabulate.data.frame <- function(x, }) if (is.null(by)) { - class(out) <- c("dw_data_tabulates", "list") + class(out) <- c("datawizard_tables", "list") } else { - class(out) <- c("dw_data_xtabulates", "list") + class(out) <- c("datawizard_crosstabs", "list") } attr(out, "collapse") <- isTRUE(collapse) attr(out, "is_weighted") <- !is.null(weights) @@ -357,9 +370,9 @@ data_tabulate.grouped_df <- function(x, )) } if (is.null(by)) { - class(out) <- c("dw_data_tabulates", "list") + class(out) <- c("datawizard_tables", "list") } else { - class(out) <- c("dw_data_xtabulates", "list") + class(out) <- c("datawizard_crosstabs", "list") } attr(out, "collapse") <- isTRUE(collapse) attr(out, "is_weighted") <- !is.null(weights) @@ -380,8 +393,64 @@ insight::print_html insight::print_md +#' @rdname data_tabulate +#' @param add_total For crosstables (i.e. when `by` is not `NULL`), a row and +#' column with the total N values are added to the data frame. `add_total` has +#' no effect in `as.data.frame()` for simple frequency tables. +#' @inheritParams base::as.data.frame +#' @export +as.data.frame.datawizard_tables <- function(x, + row.names = NULL, + optional = FALSE, + ..., + stringsAsFactors = FALSE, + add_total = FALSE) { + # extract variables of frequencies + selected_vars <- unlist(lapply(x, function(i) attributes(i)$varname)) + # coerce to data frame, remove rownames + data_frames <- lapply(x, function(i) { + # the `format()` methods for objects returned by `data_tabulate()` call + # `as.data.frame()` - we have to pay attention to avoid infinite iterations + # here. At the moment, this is no problem, as objects we have at this stage + # are of class "datawizard_table" or "datawizard_crosstab", while this + # `as.data.frame()` method is only called for "datawizard_tables" (the plural) + # form). Else, we would need to modify the class attribute here, + # e.g. class(i) <- "data.frame" + if (add_total) { + # to add the total column and row, we simply can call `format()` + out <- as.data.frame(format(i)) + for (cols in 2:ncol(out)) { + # since "format()" returns a character matrix, we want to convert + # the columns to numeric. We have to exclude the first column, as the + # first column is character, due to the added "Total" value. + out[[cols]] <- as.numeric(out[[cols]]) + } + # after formatting, we have a "separator" row for nicer printing. + # this should also be removed + out <- remove_empty_rows(out) + } else { + out <- as.data.frame(i) + } + rownames(out) <- NULL + out + }) + # create nested data frame + result <- data.frame( + var = selected_vars, + table = I(data_frames), + stringsAsFactors = stringsAsFactors + ) + # consider additional arguments + rownames(result) <- row.names + result +} + +#' @export +as.data.frame.datawizard_crosstabs <- as.data.frame.datawizard_tables + + #' @export -format.dw_data_tabulate <- function(x, format = "text", big_mark = NULL, ...) { +format.datawizard_table <- function(x, format = "text", big_mark = NULL, ...) { # convert to character manually, else, for large numbers, # format_table() returns scientific notation x <- as.data.frame(x) @@ -414,7 +483,7 @@ format.dw_data_tabulate <- function(x, format = "text", big_mark = NULL, ...) { #' @export -print.dw_data_tabulate <- function(x, big_mark = NULL, ...) { +print.datawizard_table <- function(x, big_mark = NULL, ...) { a <- attributes(x) # "table" header with variable label/name, and type @@ -456,7 +525,7 @@ print.dw_data_tabulate <- function(x, big_mark = NULL, ...) { #' @export -print_html.dw_data_tabulate <- function(x, big_mark = NULL, ...) { +print_html.datawizard_table <- function(x, big_mark = NULL, ...) { a <- attributes(x) # "table" header with variable label/name, and type @@ -486,7 +555,7 @@ print_html.dw_data_tabulate <- function(x, big_mark = NULL, ...) { #' @export -print_md.dw_data_tabulate <- function(x, big_mark = NULL, ...) { +print_md.datawizard_table <- function(x, big_mark = NULL, ...) { a <- attributes(x) # "table" header with variable label/name, and type @@ -516,7 +585,7 @@ print_md.dw_data_tabulate <- function(x, big_mark = NULL, ...) { #' @export -print.dw_data_tabulates <- function(x, big_mark = NULL, ...) { +print.datawizard_tables <- function(x, big_mark = NULL, ...) { # check if we have weights is_weighted <- isTRUE(attributes(x)$is_weighted) @@ -555,7 +624,7 @@ print.dw_data_tabulates <- function(x, big_mark = NULL, ...) { #' @export -print_html.dw_data_tabulates <- function(x, big_mark = NULL, ...) { +print_html.datawizard_tables <- function(x, big_mark = NULL, ...) { # check if we have weights is_weighted <- isTRUE(attributes(x)$is_weighted) @@ -584,7 +653,7 @@ print_html.dw_data_tabulates <- function(x, big_mark = NULL, ...) { #' @export -print_md.dw_data_tabulates <- function(x, big_mark = NULL, ...) { +print_md.datawizard_tables <- function(x, big_mark = NULL, ...) { # check if we have weights is_weighted <- isTRUE(attributes(x)$is_weighted) diff --git a/R/data_xtabulate.R b/R/data_xtabulate.R index 3cb25d62b..5c387ff95 100644 --- a/R/data_xtabulate.R +++ b/R/data_xtabulate.R @@ -74,8 +74,9 @@ attr(out, "total_n") <- total_n attr(out, "weights") <- weights attr(out, "proportions") <- proportions + attr(out, "varname") <- obj_name - class(out) <- c("dw_data_xtabulate", "data.frame") + class(out) <- c("datawizard_crosstab", "data.frame") out } @@ -85,7 +86,7 @@ #' @export -format.dw_data_xtabulate <- function(x, format = "text", digits = 1, big_mark = NULL, ...) { +format.datawizard_crosstab <- function(x, format = "text", digits = 1, big_mark = NULL, ...) { # convert to character manually, else, for large numbers, # format_table() returns scientific notation x <- as.data.frame(x) @@ -178,7 +179,7 @@ format.dw_data_xtabulate <- function(x, format = "text", digits = 1, big_mark = #' @export -print.dw_data_xtabulate <- function(x, big_mark = NULL, ...) { +print.datawizard_crosstab <- function(x, big_mark = NULL, ...) { # grouped data? if yes, add information on grouping factor if (is.null(x[["Group"]])) { caption <- NULL @@ -200,7 +201,7 @@ print.dw_data_xtabulate <- function(x, big_mark = NULL, ...) { #' @export -print_md.dw_data_xtabulate <- function(x, big_mark = NULL, ...) { +print_md.datawizard_crosstab <- function(x, big_mark = NULL, ...) { # grouped data? if yes, add information on grouping factor if (is.null(x[["Group"]])) { caption <- NULL @@ -222,7 +223,7 @@ print_md.dw_data_xtabulate <- function(x, big_mark = NULL, ...) { #' @export -print_html.dw_data_xtabulate <- function(x, big_mark = NULL, ...) { +print_html.datawizard_crosstab <- function(x, big_mark = NULL, ...) { # grouped data? if yes, add information on grouping factor if (!is.null(x[["Group"]])) { x$groups <- paste0("Grouped by ", x[["Group"]][1]) @@ -240,7 +241,7 @@ print_html.dw_data_xtabulate <- function(x, big_mark = NULL, ...) { #' @export -print.dw_data_xtabulates <- function(x, big_mark = NULL, ...) { +print.datawizard_crosstabs <- function(x, big_mark = NULL, ...) { for (i in seq_along(x)) { print(x[[i]], big_mark = big_mark, ...) cat("\n") @@ -250,7 +251,7 @@ print.dw_data_xtabulates <- function(x, big_mark = NULL, ...) { #' @export -print_html.dw_data_xtabulates <- function(x, big_mark = NULL, ...) { +print_html.datawizard_crosstabs <- function(x, big_mark = NULL, ...) { if (length(x) == 1) { print_html(x[[1]], big_mark = big_mark, ...) } else { diff --git a/man/data_tabulate.Rd b/man/data_tabulate.Rd index b744c1f1b..3f17bb21c 100644 --- a/man/data_tabulate.Rd +++ b/man/data_tabulate.Rd @@ -4,6 +4,7 @@ \alias{data_tabulate} \alias{data_tabulate.default} \alias{data_tabulate.data.frame} +\alias{as.data.frame.datawizard_tables} \title{Create frequency and crosstables of variables} \usage{ data_tabulate(x, ...) @@ -35,6 +36,15 @@ data_tabulate(x, ...) verbose = TRUE, ... ) + +\method{as.data.frame}{datawizard_tables}( + x, + row.names = NULL, + optional = FALSE, + ..., + stringsAsFactors = FALSE, + add_total = FALSE +) } \arguments{ \item{x}{A (grouped) data frame, a vector or factor.} @@ -116,6 +126,24 @@ functions (see 'Details'), this argument may be used as workaround.} \item{collapse}{Logical, if \code{TRUE} collapses multiple tables into one larger table for printing. This affects only printing, not the returned object.} + +\item{row.names}{\code{NULL} or a character vector giving the row + names for the data frame. Missing values are not allowed.} + +\item{optional}{logical. If \code{TRUE}, setting row names and + converting column names (to syntactic names: see + \code{\link[base]{make.names}}) is optional. Note that all of \R's + \pkg{base} package \code{as.data.frame()} methods use + \code{optional} only for column names treatment, basically with the + meaning of \code{\link[base]{data.frame}(*, check.names = !optional)}. + See also the \code{make.names} argument of the \code{matrix} method.} + +\item{stringsAsFactors}{logical: should the character vector be converted + to a factor?} + +\item{add_total}{For crosstables (i.e. when \code{by} is not \code{NULL}), a row and +column with the total N values are added to the data frame. \code{add_total} has +no effect in \code{as.data.frame()} for simple frequency tables.} } \value{ A data frame, or a list of data frames, with one frequency table @@ -127,6 +155,13 @@ including the number of levels/values as well as the distribution of raw, valid and cumulative percentages. For crosstables, row, column and cell percentages can be calculated. } +\details{ +There is an \code{as.data.frame()} method, to return the frequency tables as a +data frame. The structure of the returned object is a nested data frame, +where the first column contains name of the variable for which frequencies +were calculated, and the second column is a list column that contains the +frequency tables as data frame. See 'Examples'. +} \note{ There are \code{print_html()} and \code{print_md()} methods available for printing frequency or crosstables in HTML and markdown format, e.g. @@ -207,5 +242,11 @@ data_tabulate( # round percentages out <- data_tabulate(efc, "c172code", by = "e16sex", proportions = "column") print(out, digits = 0) + +# coerce to data frames +result <- data_tabulate(efc, "c172code", by = "e16sex") +as.data.frame(result) +as.data.frame(result)$table +as.data.frame(result, add_total = TRUE)$table \dontshow{\}) # examplesIf} } diff --git a/tests/testthat/test-data_tabulate.R b/tests/testthat/test-data_tabulate.R index 39f5d44c6..5108e29c2 100644 --- a/tests/testthat/test-data_tabulate.R +++ b/tests/testthat/test-data_tabulate.R @@ -81,7 +81,7 @@ test_that("data_tabulate data.frame", { "Variable", "Value", "N", "Raw %", "Valid %", "Cumulative %" ), - class = c("dw_data_tabulate", "data.frame"), + class = c("datawizard_table", "data.frame"), row.names = 1:3, type = "numeric", varname = "e16sex", @@ -99,7 +99,7 @@ test_that("data_tabulate data.frame", { "Variable", "Value", "N", "Raw %", "Valid %", "Cumulative %" ), - class = c("dw_data_tabulate", "data.frame"), + class = c("datawizard_table", "data.frame"), row.names = 1:4, type = "numeric", varname = "c172code", @@ -139,7 +139,7 @@ test_that("data_tabulate print", { attributes(out), list( names = c("Variable", "Value", "N", "Raw %", "Valid %", "Cumulative %"), - class = c("dw_data_tabulate", "data.frame"), + class = c("datawizard_table", "data.frame"), row.names = 1:4, type = "integer", varname = "Large Number", @@ -197,7 +197,7 @@ test_that("data_tabulate grouped data.frame", { "Valid %", "Cumulative %" ), - class = c("dw_data_tabulate", "data.frame"), + class = c("datawizard_table", "data.frame"), row.names = 1:4, type = "numeric", varname = "c172code", @@ -268,6 +268,7 @@ test_that("data_tabulate drop levels", { # select helpers ------------------------------ + test_that("data_tabulate regex", { data(mtcars) expect_identical( @@ -296,6 +297,7 @@ test_that("data_tabulate exclude/include missing values", { # cross tables ------------------------------ + test_that("data_tabulate, cross tables", { data(efc, package = "datawizard") set.seed(123) @@ -380,6 +382,7 @@ test_that("data_tabulate, cross tables, markdown", { expect_snapshot(print_md(data_tabulate(efc$c172code, by = efc$e16sex, proportions = "full", include_na = FALSE, weights = efc$weights))) # nolint }) + # validate against table ------------------------- test_that("data_tabulate, validate against table", { @@ -405,3 +408,68 @@ test_that("data_tabulate, correct 0% for proportions", { expect_identical(format(out[[1]])[[4]], c("0 (0%)", "0 (0%)", "0 (0%)", "0 (0%)", "", "0")) expect_snapshot(print(out[[1]])) }) + + +# coercing to data frame ------------------------- + +test_that("data_tabulate, as.data.frame, frequency tables", { + data(mtcars) + # frequency table + x <- data_tabulate(mtcars$cyl) + out <- as.data.frame(x) + expect_named(out, c("Variable", "Value", "N", "Raw %", "Valid %", "Cumulative %")) + expect_identical(out$Variable, c("mtcars$cyl", "mtcars$cyl", "mtcars$cyl", "mtcars$cyl")) + expect_false(any(vapply(out[2:ncol(out)], is.character, logical(1)))) + # frequency tables + x <- data_tabulate(mtcars, select = c("cyl", "am")) + out <- as.data.frame(x) + expect_named(out, c("var", "table")) + expect_equal(vapply(out, class, character(1)), c("character", "AsIs"), ignore_attr = TRUE) + expect_length(out$table, 2L) + expect_named(out$table[[1]], c("Variable", "Value", "N", "Raw %", "Valid %", "Cumulative %")) + expect_identical(out$table[[1]]$Variable, c("cyl", "cyl", "cyl", "cyl")) + expect_false(any(vapply(out$table[[1]][2:ncol(out$table[[1]])], is.character, logical(1)))) +}) + + +test_that("data_tabulate, as.data.frame, cross tables", { + data(mtcars) + # cross table + x <- data_tabulate(mtcars, "cyl", by = "am") + out <- as.data.frame(x) + expect_named(out, c("var", "table")) + expect_equal(vapply(out, class, character(1)), c("character", "AsIs"), ignore_attr = TRUE) + expect_length(out$table, 1L) + expect_named(out$table[[1]], c("cyl", "0", "1", "NA")) + expect_identical(nrow(out$table[[1]]), 4L) + # cross tables + x <- data_tabulate(mtcars, c("cyl", "vs"), by = "am") + out <- as.data.frame(x) + expect_named(out, c("var", "table")) + expect_equal(vapply(out, class, character(1)), c("character", "AsIs"), ignore_attr = TRUE) + expect_length(out$table, 2L) + expect_named(out$table[[1]], c("cyl", "0", "1", "NA")) + expect_identical(nrow(out$table[[1]]), 4L) +}) + + +test_that("data_tabulate, as.data.frame, cross tables with total N", { + # cross table, with total + x <- data_tabulate(mtcars, "cyl", by = "am") + out <- as.data.frame(x, add_total = TRUE) + expect_named(out, c("var", "table")) + expect_equal(vapply(out, class, character(1)), c("character", "AsIs"), ignore_attr = TRUE) + expect_length(out$table, 1L) + expect_named(out$table[[1]], c("cyl", "0", "1", "", "Total")) + expect_identical(nrow(out$table[[1]]), 5L) + expect_identical(out$table[[1]]$cyl, c("4", "6", "8", NA, "Total")) + # cross tables, with total + x <- data_tabulate(mtcars, c("cyl", "vs"), by = "am") + out <- as.data.frame(x, add_total = TRUE) + expect_named(out, c("var", "table")) + expect_equal(vapply(out, class, character(1)), c("character", "AsIs"), ignore_attr = TRUE) + expect_length(out$table, 2L) + expect_named(out$table[[1]], c("cyl", "0", "1", "", "Total")) + expect_identical(nrow(out$table[[1]]), 5L) + expect_identical(out$table[[1]]$cyl, c("4", "6", "8", NA, "Total")) +})