From bab9a673893673b61516fa8e76d2bf501919aa4f Mon Sep 17 00:00:00 2001 From: Daniel Date: Sat, 22 Jun 2024 11:21:23 +0200 Subject: [PATCH 01/15] Define data frame method for `dw_data_xtabulates` object? Fixes #516 --- NAMESPACE | 2 ++ R/data_tabulate.R | 32 ++++++++++++++++++++++++++++++++ man/data_tabulate.Rd | 5 +++++ 3 files changed, 39 insertions(+) diff --git a/NAMESPACE b/NAMESPACE index 5926d19ab..cdaac1b63 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -227,6 +227,8 @@ export(column_as_rownames) export(contr.deviation) export(convert_na_to) export(convert_to_na) +export(data.frame.dw_data_tabulates) +export(data.frame.dw_data_xtabulate) export(data_addprefix) export(data_addsuffix) export(data_adjust) diff --git a/R/data_tabulate.R b/R/data_tabulate.R index 6a26a39c9..5bb5d521f 100644 --- a/R/data_tabulate.R +++ b/R/data_tabulate.R @@ -25,6 +25,7 @@ #' percentages to be calculated. Only applies to crosstables, i.e. when `by` is #' not `NULL`. Can be `"row"` (row percentages), `"column"` (column percentages) #' or `"full"` (to calculate relative frequencies for the full table). +#' @param add_total Add total. #' @param ... not used. #' @inheritParams extract_column_names #' @@ -380,6 +381,37 @@ insight::print_html insight::print_md +#' @rdname data_tabulate +#' @export +data.frame.dw_data_tabulates <- function(x, add_total = FALSE, ...) { + # extract variables of frequencies + selected_vars <- lapply(x, function(i) attributes(i)$varname) + # coerce to data frame, remove rownames + data_frames <- lapply(x, function(i) { + if (add_total) { + out <- as.data.frame(format(i)) + for (i in 2:ncol(out)) { + out[[i]] <- as.numeric(out[[i]]) + } + out <- remove_empty_rows(out) + } else { + out <- as.data.frame(i) + } + rownames(out) <- NULL + out + }) + # create nested data frame + data.frame( + var = selected_vars, + table = I(data_frames), + stringsAsFactors = FALSE + ) +} + +#' @export +data.frame.dw_data_xtabulate <- data.frame.dw_data_tabulates + + #' @export format.dw_data_tabulate <- function(x, format = "text", big_mark = NULL, ...) { # convert to character manually, else, for large numbers, diff --git a/man/data_tabulate.Rd b/man/data_tabulate.Rd index b744c1f1b..2ea9b531e 100644 --- a/man/data_tabulate.Rd +++ b/man/data_tabulate.Rd @@ -4,6 +4,7 @@ \alias{data_tabulate} \alias{data_tabulate.default} \alias{data_tabulate.data.frame} +\alias{data.frame.dw_data_tabulates} \title{Create frequency and crosstables of variables} \usage{ data_tabulate(x, ...) @@ -35,6 +36,8 @@ data_tabulate(x, ...) verbose = TRUE, ... ) + +data.frame.dw_data_tabulates(x, add_total = FALSE, ...) } \arguments{ \item{x}{A (grouped) data frame, a vector or factor.} @@ -116,6 +119,8 @@ functions (see 'Details'), this argument may be used as workaround.} \item{collapse}{Logical, if \code{TRUE} collapses multiple tables into one larger table for printing. This affects only printing, not the returned object.} + +\item{add_total}{Add total.} } \value{ A data frame, or a list of data frames, with one frequency table From cf6b6a35fa56eac75f4a7a67a953f187694d232e Mon Sep 17 00:00:00 2001 From: Daniel Date: Sat, 22 Jun 2024 11:39:57 +0200 Subject: [PATCH 02/15] fix --- NAMESPACE | 4 ++-- R/data_tabulate.R | 4 ++-- man/data_tabulate.Rd | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index cdaac1b63..2427deab0 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,5 +1,7 @@ # Generated by roxygen2: do not edit by hand +S3method(as.data.frame,dw_data_tabulates) +S3method(as.data.frame,dw_data_xtabulate) S3method(as.double,parameters_kurtosis) S3method(as.double,parameters_skewness) S3method(as.double,parameters_smoothness) @@ -227,8 +229,6 @@ export(column_as_rownames) export(contr.deviation) export(convert_na_to) export(convert_to_na) -export(data.frame.dw_data_tabulates) -export(data.frame.dw_data_xtabulate) export(data_addprefix) export(data_addsuffix) export(data_adjust) diff --git a/R/data_tabulate.R b/R/data_tabulate.R index 5bb5d521f..94e75e629 100644 --- a/R/data_tabulate.R +++ b/R/data_tabulate.R @@ -383,7 +383,7 @@ insight::print_md #' @rdname data_tabulate #' @export -data.frame.dw_data_tabulates <- function(x, add_total = FALSE, ...) { +as.data.frame.dw_data_tabulates <- function(x, add_total = FALSE, ...) { # extract variables of frequencies selected_vars <- lapply(x, function(i) attributes(i)$varname) # coerce to data frame, remove rownames @@ -409,7 +409,7 @@ data.frame.dw_data_tabulates <- function(x, add_total = FALSE, ...) { } #' @export -data.frame.dw_data_xtabulate <- data.frame.dw_data_tabulates +as.data.frame.dw_data_xtabulate <- as.data.frame.dw_data_tabulates #' @export diff --git a/man/data_tabulate.Rd b/man/data_tabulate.Rd index 2ea9b531e..59ca21ef4 100644 --- a/man/data_tabulate.Rd +++ b/man/data_tabulate.Rd @@ -4,7 +4,7 @@ \alias{data_tabulate} \alias{data_tabulate.default} \alias{data_tabulate.data.frame} -\alias{data.frame.dw_data_tabulates} +\alias{as.data.frame.dw_data_tabulates} \title{Create frequency and crosstables of variables} \usage{ data_tabulate(x, ...) @@ -37,7 +37,7 @@ data_tabulate(x, ...) ... ) -data.frame.dw_data_tabulates(x, add_total = FALSE, ...) +\method{as.data.frame}{dw_data_tabulates}(x, add_total = FALSE, ...) } \arguments{ \item{x}{A (grouped) data frame, a vector or factor.} From ba08078ba31478b7f6dfb12f7688d020a0662c5d Mon Sep 17 00:00:00 2001 From: Daniel Date: Sat, 22 Jun 2024 11:44:12 +0200 Subject: [PATCH 03/15] fix --- R/data_xtabulate.R | 1 + 1 file changed, 1 insertion(+) diff --git a/R/data_xtabulate.R b/R/data_xtabulate.R index 3cb25d62b..7be1a309d 100644 --- a/R/data_xtabulate.R +++ b/R/data_xtabulate.R @@ -74,6 +74,7 @@ attr(out, "total_n") <- total_n attr(out, "weights") <- weights attr(out, "proportions") <- proportions + attr(out, "varname") <- obj_name class(out) <- c("dw_data_xtabulate", "data.frame") From 02adebc37c05deadc27a89c5d4559aae79bc863f Mon Sep 17 00:00:00 2001 From: Daniel Date: Sat, 22 Jun 2024 14:30:52 +0200 Subject: [PATCH 04/15] rename class attributes --- NAMESPACE | 22 +++++++++--------- R/data_tabulate.R | 35 +++++++++++++++-------------- R/data_xtabulate.R | 4 ++-- man/data_tabulate.Rd | 4 ++-- tests/testthat/test-data_tabulate.R | 8 +++---- 5 files changed, 37 insertions(+), 36 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 2427deab0..25157eae8 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,7 +1,7 @@ # Generated by roxygen2: do not edit by hand -S3method(as.data.frame,dw_data_tabulates) -S3method(as.data.frame,dw_data_xtabulate) +S3method(as.data.frame,datawizard_crosstabs) +S3method(as.data.frame,datawizard_tables) S3method(as.double,parameters_kurtosis) S3method(as.double,parameters_skewness) S3method(as.double,parameters_smoothness) @@ -71,8 +71,8 @@ S3method(describe_distribution,grouped_df) S3method(describe_distribution,list) S3method(describe_distribution,numeric) S3method(format,data_codebook) +S3method(format,datawizard_table) S3method(format,dw_data_peek) -S3method(format,dw_data_tabulate) S3method(format,dw_data_xtabulate) S3method(format,dw_groupmeans) S3method(format,parameters_distribution) @@ -95,12 +95,12 @@ S3method(normalize,numeric) S3method(plot,visualisation_recipe) S3method(print,data_codebook) S3method(print,data_seek) +S3method(print,datawizard_crosstabs) +S3method(print,datawizard_table) +S3method(print,datawizard_tables) S3method(print,dw_data_peek) S3method(print,dw_data_summary) -S3method(print,dw_data_tabulate) -S3method(print,dw_data_tabulates) S3method(print,dw_data_xtabulate) -S3method(print,dw_data_xtabulates) S3method(print,dw_groupmeans) S3method(print,dw_groupmeans_list) S3method(print,dw_transformer) @@ -109,15 +109,15 @@ S3method(print,parameters_kurtosis) S3method(print,parameters_skewness) S3method(print,visualisation_recipe) S3method(print_html,data_codebook) +S3method(print_html,datawizard_crosstabs) +S3method(print_html,datawizard_table) +S3method(print_html,datawizard_tables) S3method(print_html,dw_data_peek) -S3method(print_html,dw_data_tabulate) -S3method(print_html,dw_data_tabulates) S3method(print_html,dw_data_xtabulate) -S3method(print_html,dw_data_xtabulates) S3method(print_md,data_codebook) +S3method(print_md,datawizard_table) +S3method(print_md,datawizard_tables) S3method(print_md,dw_data_peek) -S3method(print_md,dw_data_tabulate) -S3method(print_md,dw_data_tabulates) S3method(print_md,dw_data_xtabulate) S3method(ranktransform,data.frame) S3method(ranktransform,factor) diff --git a/R/data_tabulate.R b/R/data_tabulate.R index 94e75e629..2319b5933 100644 --- a/R/data_tabulate.R +++ b/R/data_tabulate.R @@ -243,7 +243,7 @@ data_tabulate.default <- function(x, attr(out, "total_n") <- sum(out$N, na.rm = TRUE) attr(out, "valid_n") <- valid_n - class(out) <- c("dw_data_tabulate", "data.frame") + class(out) <- c("datawizard_table", "data.frame") out } @@ -293,9 +293,9 @@ data_tabulate.data.frame <- function(x, }) if (is.null(by)) { - class(out) <- c("dw_data_tabulates", "list") + class(out) <- c("datawizard_tables", "list") } else { - class(out) <- c("dw_data_xtabulates", "list") + class(out) <- c("datawizard_crosstabs", "list") } attr(out, "collapse") <- isTRUE(collapse) attr(out, "is_weighted") <- !is.null(weights) @@ -358,9 +358,9 @@ data_tabulate.grouped_df <- function(x, )) } if (is.null(by)) { - class(out) <- c("dw_data_tabulates", "list") + class(out) <- c("datawizard_tables", "list") } else { - class(out) <- c("dw_data_xtabulates", "list") + class(out) <- c("datawizard_crosstabs", "list") } attr(out, "collapse") <- isTRUE(collapse) attr(out, "is_weighted") <- !is.null(weights) @@ -383,15 +383,16 @@ insight::print_md #' @rdname data_tabulate #' @export -as.data.frame.dw_data_tabulates <- function(x, add_total = FALSE, ...) { +as.data.frame.datawizard_tables <- function(x, add_total = FALSE, ...) { # extract variables of frequencies - selected_vars <- lapply(x, function(i) attributes(i)$varname) + selected_vars <- unlist(lapply(x, function(i) attributes(i)$varname)) # coerce to data frame, remove rownames data_frames <- lapply(x, function(i) { + class(i) <- "data.frame" if (add_total) { out <- as.data.frame(format(i)) - for (i in 2:ncol(out)) { - out[[i]] <- as.numeric(out[[i]]) + for (cols in 2:ncol(out)) { + out[[cols]] <- as.numeric(out[[cols]]) } out <- remove_empty_rows(out) } else { @@ -409,11 +410,11 @@ as.data.frame.dw_data_tabulates <- function(x, add_total = FALSE, ...) { } #' @export -as.data.frame.dw_data_xtabulate <- as.data.frame.dw_data_tabulates +as.data.frame.datawizard_crosstabs <- as.data.frame.datawizard_tables #' @export -format.dw_data_tabulate <- function(x, format = "text", big_mark = NULL, ...) { +format.datawizard_table <- function(x, format = "text", big_mark = NULL, ...) { # convert to character manually, else, for large numbers, # format_table() returns scientific notation x <- as.data.frame(x) @@ -446,7 +447,7 @@ format.dw_data_tabulate <- function(x, format = "text", big_mark = NULL, ...) { #' @export -print.dw_data_tabulate <- function(x, big_mark = NULL, ...) { +print.datawizard_table <- function(x, big_mark = NULL, ...) { a <- attributes(x) # "table" header with variable label/name, and type @@ -488,7 +489,7 @@ print.dw_data_tabulate <- function(x, big_mark = NULL, ...) { #' @export -print_html.dw_data_tabulate <- function(x, big_mark = NULL, ...) { +print_html.datawizard_table <- function(x, big_mark = NULL, ...) { a <- attributes(x) # "table" header with variable label/name, and type @@ -518,7 +519,7 @@ print_html.dw_data_tabulate <- function(x, big_mark = NULL, ...) { #' @export -print_md.dw_data_tabulate <- function(x, big_mark = NULL, ...) { +print_md.datawizard_table <- function(x, big_mark = NULL, ...) { a <- attributes(x) # "table" header with variable label/name, and type @@ -548,7 +549,7 @@ print_md.dw_data_tabulate <- function(x, big_mark = NULL, ...) { #' @export -print.dw_data_tabulates <- function(x, big_mark = NULL, ...) { +print.datawizard_tables <- function(x, big_mark = NULL, ...) { # check if we have weights is_weighted <- isTRUE(attributes(x)$is_weighted) @@ -587,7 +588,7 @@ print.dw_data_tabulates <- function(x, big_mark = NULL, ...) { #' @export -print_html.dw_data_tabulates <- function(x, big_mark = NULL, ...) { +print_html.datawizard_tables <- function(x, big_mark = NULL, ...) { # check if we have weights is_weighted <- isTRUE(attributes(x)$is_weighted) @@ -616,7 +617,7 @@ print_html.dw_data_tabulates <- function(x, big_mark = NULL, ...) { #' @export -print_md.dw_data_tabulates <- function(x, big_mark = NULL, ...) { +print_md.datawizard_tables <- function(x, big_mark = NULL, ...) { # check if we have weights is_weighted <- isTRUE(attributes(x)$is_weighted) diff --git a/R/data_xtabulate.R b/R/data_xtabulate.R index 7be1a309d..1fe64f69a 100644 --- a/R/data_xtabulate.R +++ b/R/data_xtabulate.R @@ -241,7 +241,7 @@ print_html.dw_data_xtabulate <- function(x, big_mark = NULL, ...) { #' @export -print.dw_data_xtabulates <- function(x, big_mark = NULL, ...) { +print.datawizard_crosstabs <- function(x, big_mark = NULL, ...) { for (i in seq_along(x)) { print(x[[i]], big_mark = big_mark, ...) cat("\n") @@ -251,7 +251,7 @@ print.dw_data_xtabulates <- function(x, big_mark = NULL, ...) { #' @export -print_html.dw_data_xtabulates <- function(x, big_mark = NULL, ...) { +print_html.datawizard_crosstabs <- function(x, big_mark = NULL, ...) { if (length(x) == 1) { print_html(x[[1]], big_mark = big_mark, ...) } else { diff --git a/man/data_tabulate.Rd b/man/data_tabulate.Rd index 59ca21ef4..7c2657278 100644 --- a/man/data_tabulate.Rd +++ b/man/data_tabulate.Rd @@ -4,7 +4,7 @@ \alias{data_tabulate} \alias{data_tabulate.default} \alias{data_tabulate.data.frame} -\alias{as.data.frame.dw_data_tabulates} +\alias{as.data.frame.datawizard_tables} \title{Create frequency and crosstables of variables} \usage{ data_tabulate(x, ...) @@ -37,7 +37,7 @@ data_tabulate(x, ...) ... ) -\method{as.data.frame}{dw_data_tabulates}(x, add_total = FALSE, ...) +\method{as.data.frame}{datawizard_tables}(x, add_total = FALSE, ...) } \arguments{ \item{x}{A (grouped) data frame, a vector or factor.} diff --git a/tests/testthat/test-data_tabulate.R b/tests/testthat/test-data_tabulate.R index 39f5d44c6..059ab5a7f 100644 --- a/tests/testthat/test-data_tabulate.R +++ b/tests/testthat/test-data_tabulate.R @@ -81,7 +81,7 @@ test_that("data_tabulate data.frame", { "Variable", "Value", "N", "Raw %", "Valid %", "Cumulative %" ), - class = c("dw_data_tabulate", "data.frame"), + class = c("datawizard_table", "data.frame"), row.names = 1:3, type = "numeric", varname = "e16sex", @@ -99,7 +99,7 @@ test_that("data_tabulate data.frame", { "Variable", "Value", "N", "Raw %", "Valid %", "Cumulative %" ), - class = c("dw_data_tabulate", "data.frame"), + class = c("datawizard_table", "data.frame"), row.names = 1:4, type = "numeric", varname = "c172code", @@ -139,7 +139,7 @@ test_that("data_tabulate print", { attributes(out), list( names = c("Variable", "Value", "N", "Raw %", "Valid %", "Cumulative %"), - class = c("dw_data_tabulate", "data.frame"), + class = c("datawizard_table", "data.frame"), row.names = 1:4, type = "integer", varname = "Large Number", @@ -197,7 +197,7 @@ test_that("data_tabulate grouped data.frame", { "Valid %", "Cumulative %" ), - class = c("dw_data_tabulate", "data.frame"), + class = c("datawizard_table", "data.frame"), row.names = 1:4, type = "numeric", varname = "c172code", From e9b3734a76f3dbfe4f860565c30f55a1e6054200 Mon Sep 17 00:00:00 2001 From: Daniel Date: Sat, 22 Jun 2024 14:32:14 +0200 Subject: [PATCH 05/15] rename class attr --- NAMESPACE | 8 ++++---- R/data_xtabulate.R | 10 +++++----- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 25157eae8..a08798db1 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -71,9 +71,9 @@ S3method(describe_distribution,grouped_df) S3method(describe_distribution,list) S3method(describe_distribution,numeric) S3method(format,data_codebook) +S3method(format,datawizard_crosstab) S3method(format,datawizard_table) S3method(format,dw_data_peek) -S3method(format,dw_data_xtabulate) S3method(format,dw_groupmeans) S3method(format,parameters_distribution) S3method(kurtosis,data.frame) @@ -95,12 +95,12 @@ S3method(normalize,numeric) S3method(plot,visualisation_recipe) S3method(print,data_codebook) S3method(print,data_seek) +S3method(print,datawizard_crosstab) S3method(print,datawizard_crosstabs) S3method(print,datawizard_table) S3method(print,datawizard_tables) S3method(print,dw_data_peek) S3method(print,dw_data_summary) -S3method(print,dw_data_xtabulate) S3method(print,dw_groupmeans) S3method(print,dw_groupmeans_list) S3method(print,dw_transformer) @@ -109,16 +109,16 @@ S3method(print,parameters_kurtosis) S3method(print,parameters_skewness) S3method(print,visualisation_recipe) S3method(print_html,data_codebook) +S3method(print_html,datawizard_crosstab) S3method(print_html,datawizard_crosstabs) S3method(print_html,datawizard_table) S3method(print_html,datawizard_tables) S3method(print_html,dw_data_peek) -S3method(print_html,dw_data_xtabulate) S3method(print_md,data_codebook) +S3method(print_md,datawizard_crosstab) S3method(print_md,datawizard_table) S3method(print_md,datawizard_tables) S3method(print_md,dw_data_peek) -S3method(print_md,dw_data_xtabulate) S3method(ranktransform,data.frame) S3method(ranktransform,factor) S3method(ranktransform,grouped_df) diff --git a/R/data_xtabulate.R b/R/data_xtabulate.R index 1fe64f69a..5c387ff95 100644 --- a/R/data_xtabulate.R +++ b/R/data_xtabulate.R @@ -76,7 +76,7 @@ attr(out, "proportions") <- proportions attr(out, "varname") <- obj_name - class(out) <- c("dw_data_xtabulate", "data.frame") + class(out) <- c("datawizard_crosstab", "data.frame") out } @@ -86,7 +86,7 @@ #' @export -format.dw_data_xtabulate <- function(x, format = "text", digits = 1, big_mark = NULL, ...) { +format.datawizard_crosstab <- function(x, format = "text", digits = 1, big_mark = NULL, ...) { # convert to character manually, else, for large numbers, # format_table() returns scientific notation x <- as.data.frame(x) @@ -179,7 +179,7 @@ format.dw_data_xtabulate <- function(x, format = "text", digits = 1, big_mark = #' @export -print.dw_data_xtabulate <- function(x, big_mark = NULL, ...) { +print.datawizard_crosstab <- function(x, big_mark = NULL, ...) { # grouped data? if yes, add information on grouping factor if (is.null(x[["Group"]])) { caption <- NULL @@ -201,7 +201,7 @@ print.dw_data_xtabulate <- function(x, big_mark = NULL, ...) { #' @export -print_md.dw_data_xtabulate <- function(x, big_mark = NULL, ...) { +print_md.datawizard_crosstab <- function(x, big_mark = NULL, ...) { # grouped data? if yes, add information on grouping factor if (is.null(x[["Group"]])) { caption <- NULL @@ -223,7 +223,7 @@ print_md.dw_data_xtabulate <- function(x, big_mark = NULL, ...) { #' @export -print_html.dw_data_xtabulate <- function(x, big_mark = NULL, ...) { +print_html.datawizard_crosstab <- function(x, big_mark = NULL, ...) { # grouped data? if yes, add information on grouping factor if (!is.null(x[["Group"]])) { x$groups <- paste0("Grouped by ", x[["Group"]][1]) From 43864c9d471d0d53e737ff28f9c64ffcd3969560 Mon Sep 17 00:00:00 2001 From: Daniel Date: Sat, 22 Jun 2024 14:47:07 +0200 Subject: [PATCH 06/15] news, version, docs --- DESCRIPTION | 2 +- NEWS.md | 5 +++++ R/data_tabulate.R | 19 +++++++++++++++++-- man/data_tabulate.Rd | 17 ++++++++++++++++- 4 files changed, 39 insertions(+), 4 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 2ed1150ae..0a7fb9594 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Type: Package Package: datawizard Title: Easy Data Wrangling and Statistical Transformations -Version: 0.11.0.2 +Version: 0.11.0.3 Authors@R: c( person("Indrajeet", "Patil", , "patilindrajeet.science@gmail.com", role = "aut", comment = c(ORCID = "0000-0003-1995-6531", Twitter = "@patilindrajeets")), diff --git a/NEWS.md b/NEWS.md index 1e49e91a4..69c228291 100644 --- a/NEWS.md +++ b/NEWS.md @@ -5,6 +5,11 @@ * `data_select()` can directly rename selected variables when a named vector is provided in `select`, e.g. `data_select(mtcars, c(new1 = "mpg", new2 = "cyl"))`. +* `data_tabulate()` gains an `as.data.frame()` method, to return the frequency + table as a data frame. The structure of the returned object is a nested data + frame, where the first column contains name of the variable for which + frequencies were calculated, and the second column contains the frequency table. + # datawizard 0.11.0 BREAKING CHANGES diff --git a/R/data_tabulate.R b/R/data_tabulate.R index 2319b5933..18b7c0a8d 100644 --- a/R/data_tabulate.R +++ b/R/data_tabulate.R @@ -25,10 +25,19 @@ #' percentages to be calculated. Only applies to crosstables, i.e. when `by` is #' not `NULL`. Can be `"row"` (row percentages), `"column"` (column percentages) #' or `"full"` (to calculate relative frequencies for the full table). -#' @param add_total Add total. +#' @param add_total For crosstables (i.e. when `by` is not `NULL`), a row and +#' column with the total N values are added to the data frame. `add_total` has +#' no effect in `as.data.frame()` for simple frequency tables. #' @param ... not used. #' @inheritParams extract_column_names #' +#' @details +#' There is an `as.data.frame()` method, to return the frequency tables as a +#' data frame. The structure of the returned object is a nested data frame, +#' where the first column contains name of the variable for which frequencies +#' were calculated, and the second column is a list column, contains the +#' frequency tables as data frame. See 'Examples'. +#' #' @section Crosstables: #' If `by` is supplied, a crosstable is created. The crosstable includes `` #' (missing) values by default. The first column indicates values of `x`, the @@ -109,6 +118,12 @@ #' # round percentages #' out <- data_tabulate(efc, "c172code", by = "e16sex", proportions = "column") #' print(out, digits = 0) +#' +#' # coerce to data frames +#' result <- data_tabulate(efc, "c172code", by = "e16sex") +#' as.data.frame(result) +#' as.data.frame(result)$table +#' as.data.frame(result, add_total = TRUE)$table #' @export data_tabulate <- function(x, ...) { UseMethod("data_tabulate") @@ -388,7 +403,7 @@ as.data.frame.datawizard_tables <- function(x, add_total = FALSE, ...) { selected_vars <- unlist(lapply(x, function(i) attributes(i)$varname)) # coerce to data frame, remove rownames data_frames <- lapply(x, function(i) { - class(i) <- "data.frame" + # class(i) <- "data.frame" if (add_total) { out <- as.data.frame(format(i)) for (cols in 2:ncol(out)) { diff --git a/man/data_tabulate.Rd b/man/data_tabulate.Rd index 7c2657278..3020c8c35 100644 --- a/man/data_tabulate.Rd +++ b/man/data_tabulate.Rd @@ -120,7 +120,9 @@ functions (see 'Details'), this argument may be used as workaround.} \item{collapse}{Logical, if \code{TRUE} collapses multiple tables into one larger table for printing. This affects only printing, not the returned object.} -\item{add_total}{Add total.} +\item{add_total}{For crosstables (i.e. when \code{by} is not \code{NULL}), a row and +column with the total N values are added to the data frame. \code{add_total} has +no effect in \code{as.data.frame()} for simple frequency tables.} } \value{ A data frame, or a list of data frames, with one frequency table @@ -132,6 +134,13 @@ including the number of levels/values as well as the distribution of raw, valid and cumulative percentages. For crosstables, row, column and cell percentages can be calculated. } +\details{ +There is an \code{as.data.frame()} method, to return the frequency tables as a +data frame. The structure of the returned object is a nested data frame, +where the first column contains name of the variable for which frequencies +were calculated, and the second column is a list column, contains the +frequency tables as data frame. See 'Examples'. +} \note{ There are \code{print_html()} and \code{print_md()} methods available for printing frequency or crosstables in HTML and markdown format, e.g. @@ -212,5 +221,11 @@ data_tabulate( # round percentages out <- data_tabulate(efc, "c172code", by = "e16sex", proportions = "column") print(out, digits = 0) + +# coerce to data frames +result <- data_tabulate(efc, "c172code", by = "e16sex") +as.data.frame(result) +as.data.frame(result)$table +as.data.frame(result, add_total = TRUE)$table \dontshow{\}) # examplesIf} } From 0b859a833e214c91d0d7f4a3c71c7429b60443e2 Mon Sep 17 00:00:00 2001 From: Daniel Date: Sat, 22 Jun 2024 14:59:22 +0200 Subject: [PATCH 07/15] add tests --- tests/testthat/test-data_tabulate.R | 56 +++++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) diff --git a/tests/testthat/test-data_tabulate.R b/tests/testthat/test-data_tabulate.R index 059ab5a7f..d9141aa75 100644 --- a/tests/testthat/test-data_tabulate.R +++ b/tests/testthat/test-data_tabulate.R @@ -405,3 +405,59 @@ test_that("data_tabulate, correct 0% for proportions", { expect_identical(format(out[[1]])[[4]], c("0 (0%)", "0 (0%)", "0 (0%)", "0 (0%)", "", "0")) expect_snapshot(print(out[[1]])) }) + + +# coercing to data frame ------------------------- + +test_that("data_tabulate, as.data.frame", { + data(mtcars) + # frequency table + x <- data_tabulate(mtcars$cyl) + out <- as.data.frame(x) + expect_named(out, c("Variable", "Value", "N", "Raw %", "Valid %", "Cumulative %")) + expect_identical(out$Variable, c("mtcars$cyl", "mtcars$cyl", "mtcars$cyl", "mtcars$cyl")) + expect_false(any(vapply(out[2:ncol(out)], is.character, logical(1)))) + # frequency tables + x <- data_tabulate(mtcars, select = c("cyl", "am")) + out <- as.data.frame(x) + expect_named(out, c("var", "table")) + expect_equal(vapply(out, class, character(1)), c("character", "AsIs"), ignore_attr = TRUE) + expect_length(out$table, 2L) + expect_named(out$table[[1]], c("Variable", "Value", "N", "Raw %", "Valid %", "Cumulative %")) + expect_identical(out$table[[1]]$Variable, c("cyl", "cyl", "cyl", "cyl")) + expect_false(any(vapply(out$table[[1]][2:ncol(out$table[[1]])], is.character, logical(1)))) + # cross table + x <- data_tabulate(mtcars, "cyl", by = "am") + out <- as.data.frame(x) + expect_named(out, c("var", "table")) + expect_equal(vapply(out, class, character(1)), c("character", "AsIs"), ignore_attr = TRUE) + expect_length(out$table, 1L) + expect_named(out$table[[1]], c("cyl", "0", "1", "NA")) + expect_identical(nrow(out$table[[1]]), 4L) + # cross table, with total + x <- data_tabulate(mtcars, "cyl", by = "am") + out <- as.data.frame(x, add_total = TRUE) + expect_named(out, c("var", "table")) + expect_equal(vapply(out, class, character(1)), c("character", "AsIs"), ignore_attr = TRUE) + expect_length(out$table, 1L) + expect_named(out$table[[1]], c("cyl", "0", "1", "", "Total")) + expect_identical(nrow(out$table[[1]]), 5L) + expect_identical(out$table[[1]]$cyl, c("4", "6", "8", NA, "Total")) + # cross tables + x <- data_tabulate(mtcars, c("cyl", "vs"), by = "am") + out <- as.data.frame(x) + expect_named(out, c("var", "table")) + expect_equal(vapply(out, class, character(1)), c("character", "AsIs"), ignore_attr = TRUE) + expect_length(out$table, 2L) + expect_named(out$table[[1]], c("cyl", "0", "1", "NA")) + expect_identical(nrow(out$table[[1]]), 4L) + # cross tables, with total + x <- data_tabulate(mtcars, c("cyl", "vs"), by = "am") + out <- as.data.frame(x, add_total = TRUE) + expect_named(out, c("var", "table")) + expect_equal(vapply(out, class, character(1)), c("character", "AsIs"), ignore_attr = TRUE) + expect_length(out$table, 2L) + expect_named(out$table[[1]], c("cyl", "0", "1", "", "Total")) + expect_identical(nrow(out$table[[1]]), 5L) + expect_identical(out$table[[1]]$cyl, c("4", "6", "8", NA, "Total")) +}) From 953adc3bf03a8a2fdc50328edf12718f201b1e2b Mon Sep 17 00:00:00 2001 From: Daniel Date: Sat, 22 Jun 2024 15:03:24 +0200 Subject: [PATCH 08/15] comply with S3 generic --- R/data_tabulate.R | 21 +++++++++++++++------ man/data_tabulate.Rd | 23 ++++++++++++++++++++++- 2 files changed, 37 insertions(+), 7 deletions(-) diff --git a/R/data_tabulate.R b/R/data_tabulate.R index 18b7c0a8d..9a053508e 100644 --- a/R/data_tabulate.R +++ b/R/data_tabulate.R @@ -25,9 +25,6 @@ #' percentages to be calculated. Only applies to crosstables, i.e. when `by` is #' not `NULL`. Can be `"row"` (row percentages), `"column"` (column percentages) #' or `"full"` (to calculate relative frequencies for the full table). -#' @param add_total For crosstables (i.e. when `by` is not `NULL`), a row and -#' column with the total N values are added to the data frame. `add_total` has -#' no effect in `as.data.frame()` for simple frequency tables. #' @param ... not used. #' @inheritParams extract_column_names #' @@ -397,8 +394,17 @@ insight::print_md #' @rdname data_tabulate +#' @param add_total For crosstables (i.e. when `by` is not `NULL`), a row and +#' column with the total N values are added to the data frame. `add_total` has +#' no effect in `as.data.frame()` for simple frequency tables. +#' @inheritParams base::as.data.frame #' @export -as.data.frame.datawizard_tables <- function(x, add_total = FALSE, ...) { +as.data.frame.datawizard_tables <- function(x, + row.names = NULL, + optional = FALSE, + ..., + stringsAsFactors = FALSE, + add_total = FALSE) { # extract variables of frequencies selected_vars <- unlist(lapply(x, function(i) attributes(i)$varname)) # coerce to data frame, remove rownames @@ -417,11 +423,14 @@ as.data.frame.datawizard_tables <- function(x, add_total = FALSE, ...) { out }) # create nested data frame - data.frame( + result <- data.frame( var = selected_vars, table = I(data_frames), - stringsAsFactors = FALSE + stringsAsFactors = stringsAsFactors ) + # consider additional arguments + rownames(result) <- row.names + result } #' @export diff --git a/man/data_tabulate.Rd b/man/data_tabulate.Rd index 3020c8c35..1d8c77acd 100644 --- a/man/data_tabulate.Rd +++ b/man/data_tabulate.Rd @@ -37,7 +37,14 @@ data_tabulate(x, ...) ... ) -\method{as.data.frame}{datawizard_tables}(x, add_total = FALSE, ...) +\method{as.data.frame}{datawizard_tables}( + x, + row.names = NULL, + optional = FALSE, + ..., + stringsAsFactors = FALSE, + add_total = FALSE +) } \arguments{ \item{x}{A (grouped) data frame, a vector or factor.} @@ -120,6 +127,20 @@ functions (see 'Details'), this argument may be used as workaround.} \item{collapse}{Logical, if \code{TRUE} collapses multiple tables into one larger table for printing. This affects only printing, not the returned object.} +\item{row.names}{\code{NULL} or a character vector giving the row + names for the data frame. Missing values are not allowed.} + +\item{optional}{logical. If \code{TRUE}, setting row names and + converting column names (to syntactic names: see + \code{\link[base]{make.names}}) is optional. Note that all of \R's + \pkg{base} package \code{as.data.frame()} methods use + \code{optional} only for column names treatment, basically with the + meaning of \code{\link[base]{data.frame}(*, check.names = !optional)}. + See also the \code{make.names} argument of the \code{matrix} method.} + +\item{stringsAsFactors}{logical: should the character vector be converted + to a factor?} + \item{add_total}{For crosstables (i.e. when \code{by} is not \code{NULL}), a row and column with the total N values are added to the data frame. \code{add_total} has no effect in \code{as.data.frame()} for simple frequency tables.} From 6f73045169209e06a2c1cc2129196b3cee205ed7 Mon Sep 17 00:00:00 2001 From: Daniel Date: Sat, 22 Jun 2024 15:06:55 +0200 Subject: [PATCH 09/15] news --- NEWS.md | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index 69c228291..ada1f08ea 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,6 +1,12 @@ # datawizard 0.11.0.1 -## Changes +BREAKING CHANGES + +* Class names for objects returned by `data_tabulate()` have been changed to + `datawizard_table` and `datawizard_crosstable` (resp. the plural forms, + `*_tables`), to provide a clearer and more consistent naming scheme. + +CHANGES * `data_select()` can directly rename selected variables when a named vector is provided in `select`, e.g. `data_select(mtcars, c(new1 = "mpg", new2 = "cyl"))`. From f7105c0d6e6ddad4f54a9ee78159e08189f1add6 Mon Sep 17 00:00:00 2001 From: Daniel Date: Sat, 22 Jun 2024 15:09:00 +0200 Subject: [PATCH 10/15] split tests into smaller pieces --- tests/testthat/test-data_tabulate.R | 30 ++++++++++++++++++++--------- 1 file changed, 21 insertions(+), 9 deletions(-) diff --git a/tests/testthat/test-data_tabulate.R b/tests/testthat/test-data_tabulate.R index d9141aa75..5108e29c2 100644 --- a/tests/testthat/test-data_tabulate.R +++ b/tests/testthat/test-data_tabulate.R @@ -268,6 +268,7 @@ test_that("data_tabulate drop levels", { # select helpers ------------------------------ + test_that("data_tabulate regex", { data(mtcars) expect_identical( @@ -296,6 +297,7 @@ test_that("data_tabulate exclude/include missing values", { # cross tables ------------------------------ + test_that("data_tabulate, cross tables", { data(efc, package = "datawizard") set.seed(123) @@ -380,6 +382,7 @@ test_that("data_tabulate, cross tables, markdown", { expect_snapshot(print_md(data_tabulate(efc$c172code, by = efc$e16sex, proportions = "full", include_na = FALSE, weights = efc$weights))) # nolint }) + # validate against table ------------------------- test_that("data_tabulate, validate against table", { @@ -409,7 +412,7 @@ test_that("data_tabulate, correct 0% for proportions", { # coercing to data frame ------------------------- -test_that("data_tabulate, as.data.frame", { +test_that("data_tabulate, as.data.frame, frequency tables", { data(mtcars) # frequency table x <- data_tabulate(mtcars$cyl) @@ -426,6 +429,11 @@ test_that("data_tabulate, as.data.frame", { expect_named(out$table[[1]], c("Variable", "Value", "N", "Raw %", "Valid %", "Cumulative %")) expect_identical(out$table[[1]]$Variable, c("cyl", "cyl", "cyl", "cyl")) expect_false(any(vapply(out$table[[1]][2:ncol(out$table[[1]])], is.character, logical(1)))) +}) + + +test_that("data_tabulate, as.data.frame, cross tables", { + data(mtcars) # cross table x <- data_tabulate(mtcars, "cyl", by = "am") out <- as.data.frame(x) @@ -434,6 +442,18 @@ test_that("data_tabulate, as.data.frame", { expect_length(out$table, 1L) expect_named(out$table[[1]], c("cyl", "0", "1", "NA")) expect_identical(nrow(out$table[[1]]), 4L) + # cross tables + x <- data_tabulate(mtcars, c("cyl", "vs"), by = "am") + out <- as.data.frame(x) + expect_named(out, c("var", "table")) + expect_equal(vapply(out, class, character(1)), c("character", "AsIs"), ignore_attr = TRUE) + expect_length(out$table, 2L) + expect_named(out$table[[1]], c("cyl", "0", "1", "NA")) + expect_identical(nrow(out$table[[1]]), 4L) +}) + + +test_that("data_tabulate, as.data.frame, cross tables with total N", { # cross table, with total x <- data_tabulate(mtcars, "cyl", by = "am") out <- as.data.frame(x, add_total = TRUE) @@ -443,14 +463,6 @@ test_that("data_tabulate, as.data.frame", { expect_named(out$table[[1]], c("cyl", "0", "1", "", "Total")) expect_identical(nrow(out$table[[1]]), 5L) expect_identical(out$table[[1]]$cyl, c("4", "6", "8", NA, "Total")) - # cross tables - x <- data_tabulate(mtcars, c("cyl", "vs"), by = "am") - out <- as.data.frame(x) - expect_named(out, c("var", "table")) - expect_equal(vapply(out, class, character(1)), c("character", "AsIs"), ignore_attr = TRUE) - expect_length(out$table, 2L) - expect_named(out$table[[1]], c("cyl", "0", "1", "NA")) - expect_identical(nrow(out$table[[1]]), 4L) # cross tables, with total x <- data_tabulate(mtcars, c("cyl", "vs"), by = "am") out <- as.data.frame(x, add_total = TRUE) From f71a09f80ee450e6010346d754805ded8efb1a92 Mon Sep 17 00:00:00 2001 From: Daniel Date: Sat, 22 Jun 2024 15:10:33 +0200 Subject: [PATCH 11/15] typo --- R/data_tabulate.R | 2 +- man/data_tabulate.Rd | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/R/data_tabulate.R b/R/data_tabulate.R index 9a053508e..fbebf67d6 100644 --- a/R/data_tabulate.R +++ b/R/data_tabulate.R @@ -32,7 +32,7 @@ #' There is an `as.data.frame()` method, to return the frequency tables as a #' data frame. The structure of the returned object is a nested data frame, #' where the first column contains name of the variable for which frequencies -#' were calculated, and the second column is a list column, contains the +#' were calculated, and the second column is a list column that contains the #' frequency tables as data frame. See 'Examples'. #' #' @section Crosstables: diff --git a/man/data_tabulate.Rd b/man/data_tabulate.Rd index 1d8c77acd..3f17bb21c 100644 --- a/man/data_tabulate.Rd +++ b/man/data_tabulate.Rd @@ -159,7 +159,7 @@ percentages can be calculated. There is an \code{as.data.frame()} method, to return the frequency tables as a data frame. The structure of the returned object is a nested data frame, where the first column contains name of the variable for which frequencies -were calculated, and the second column is a list column, contains the +were calculated, and the second column is a list column that contains the frequency tables as data frame. See 'Examples'. } \note{ From 99807e54f9e49aa776a81cf36ea137aa93c6ee07 Mon Sep 17 00:00:00 2001 From: Daniel Date: Sat, 22 Jun 2024 15:15:28 +0200 Subject: [PATCH 12/15] add comment --- R/data_tabulate.R | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/R/data_tabulate.R b/R/data_tabulate.R index fbebf67d6..a713109a2 100644 --- a/R/data_tabulate.R +++ b/R/data_tabulate.R @@ -409,7 +409,13 @@ as.data.frame.datawizard_tables <- function(x, selected_vars <- unlist(lapply(x, function(i) attributes(i)$varname)) # coerce to data frame, remove rownames data_frames <- lapply(x, function(i) { - # class(i) <- "data.frame" + # the `format()` methods for objects returned by `data_tabuldate()` call + # `as.data.frame()` - we have to pay attention to avoid infinite iterations + # here. At the moment, this is no problem, as objects we have at this stage + # are of class "datawizard_table" or "datawizard_crosstab", while this + # `as.data.frame()` method is only called for "datawizard_tables" (the plural) + # form). Else, we would need to modify the class attribute here, + # e.g. class(i) <- "data.frame" if (add_total) { out <- as.data.frame(format(i)) for (cols in 2:ncol(out)) { From 9d1aaf44abc9ecfb82f3c03585bfabf1396acdcc Mon Sep 17 00:00:00 2001 From: Daniel Date: Sat, 22 Jun 2024 15:15:45 +0200 Subject: [PATCH 13/15] typo --- R/data_tabulate.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/data_tabulate.R b/R/data_tabulate.R index a713109a2..d61c3da6a 100644 --- a/R/data_tabulate.R +++ b/R/data_tabulate.R @@ -409,7 +409,7 @@ as.data.frame.datawizard_tables <- function(x, selected_vars <- unlist(lapply(x, function(i) attributes(i)$varname)) # coerce to data frame, remove rownames data_frames <- lapply(x, function(i) { - # the `format()` methods for objects returned by `data_tabuldate()` call + # the `format()` methods for objects returned by `data_tabulate()` call # `as.data.frame()` - we have to pay attention to avoid infinite iterations # here. At the moment, this is no problem, as objects we have at this stage # are of class "datawizard_table" or "datawizard_crosstab", while this From 28f5766bfd3da0c9b808b33e43b5ef37b57d3028 Mon Sep 17 00:00:00 2001 From: Daniel Date: Sat, 22 Jun 2024 15:17:44 +0200 Subject: [PATCH 14/15] comment --- R/data_tabulate.R | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/R/data_tabulate.R b/R/data_tabulate.R index d61c3da6a..cadc3df49 100644 --- a/R/data_tabulate.R +++ b/R/data_tabulate.R @@ -417,8 +417,12 @@ as.data.frame.datawizard_tables <- function(x, # form). Else, we would need to modify the class attribute here, # e.g. class(i) <- "data.frame" if (add_total) { + # to add the total column and row, we simply can call `format()` out <- as.data.frame(format(i)) for (cols in 2:ncol(out)) { + # since "format()" returns a character matrix, we want to convert + # the columns to numeric. We have to exclude the first column, as the + # first column is character, due to the added "Total" value. out[[cols]] <- as.numeric(out[[cols]]) } out <- remove_empty_rows(out) From 2465b7eba4b2d924ed601e77f48f924428245f0b Mon Sep 17 00:00:00 2001 From: Daniel Date: Sat, 22 Jun 2024 15:18:25 +0200 Subject: [PATCH 15/15] comment --- R/data_tabulate.R | 2 ++ 1 file changed, 2 insertions(+) diff --git a/R/data_tabulate.R b/R/data_tabulate.R index cadc3df49..74f4f2e03 100644 --- a/R/data_tabulate.R +++ b/R/data_tabulate.R @@ -425,6 +425,8 @@ as.data.frame.datawizard_tables <- function(x, # first column is character, due to the added "Total" value. out[[cols]] <- as.numeric(out[[cols]]) } + # after formatting, we have a "separator" row for nicer printing. + # this should also be removed out <- remove_empty_rows(out) } else { out <- as.data.frame(i)