From a58a03134426a480f7fda1c9ac8a4d0d23553764 Mon Sep 17 00:00:00 2001 From: Daniel Date: Mon, 23 Dec 2024 16:29:02 +0100 Subject: [PATCH] Fix data tab (#577) * Fix `data_tabulate()` for multiple `by` in HTML * add * news * fix * comments * fix * fix * styler * styler * ... * fix * fix * add test * fix --- DESCRIPTION | 2 +- NAMESPACE | 1 + NEWS.md | 9 +- R/data_xtabulate.R | 235 +++++++++++++++++-------- R/select_nse.R | 12 +- tests/testthat/_snaps/data_tabulate.md | 71 +++----- tests/testthat/test-data_tabulate.R | 2 + 7 files changed, 201 insertions(+), 131 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 68cfb6741..034c823ed 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Type: Package Package: datawizard Title: Easy Data Wrangling and Statistical Transformations -Version: 0.13.0.19 +Version: 0.13.0.20 Authors@R: c( person("Indrajeet", "Patil", , "patilindrajeet.science@gmail.com", role = "aut", comment = c(ORCID = "0000-0003-1995-6531")), diff --git a/NAMESPACE b/NAMESPACE index 7e97817b9..61c3fe388 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -116,6 +116,7 @@ S3method(print_html,datawizard_tables) S3method(print_html,dw_data_peek) S3method(print_md,data_codebook) S3method(print_md,datawizard_crosstab) +S3method(print_md,datawizard_crosstabs) S3method(print_md,datawizard_table) S3method(print_md,datawizard_tables) S3method(print_md,dw_data_peek) diff --git a/NEWS.md b/NEWS.md index 35e549ffa..f332643fb 100644 --- a/NEWS.md +++ b/NEWS.md @@ -4,18 +4,23 @@ BREAKING CHANGES AND DEPRECATIONS * *datawizard* now requires R >= 4.0 (#515). -* Argument `drop_na` in `data_match()` is deprecated now. Please use +* Argument `drop_na` in `data_match()` is deprecated now. Please use `remove_na` instead. * In `data_rename()` (#567): - argument `pattern` is deprecated. Use `select` instead. - - argument `safe` is deprecated. The function now errors when `select` + - argument `safe` is deprecated. The function now errors when `select` contains unknown column names. - when `replacement` is `NULL`, an error is now thrown (previously, column indices were used as new names). - if `select` (previously `pattern`) is a named vector, then all elements must be named, e.g. `c(length = "Sepal.Length", "Sepal.Width")` errors. +* `print()` methods for `data_tabulate()` with multiple sub-tables (i.e. when + length of `by` was > 1) were revised. Now, an integrated table instead of + multiple tables is returned. Furthermore, `print_html()` did not work, which + was also fixed now. + CHANGES * The `select` argument, which is available in different functions to select diff --git a/R/data_xtabulate.R b/R/data_xtabulate.R index c9595eccf..6bd015171 100644 --- a/R/data_xtabulate.R +++ b/R/data_xtabulate.R @@ -90,18 +90,23 @@ #' @export -format.datawizard_crosstab <- function(x, format = "text", digits = 1, big_mark = NULL, ...) { +format.datawizard_crosstab <- function(x, + format = "text", + digits = 1, + big_mark = NULL, + include_total_row = TRUE, + ...) { # convert to character manually, else, for large numbers, # format_table() returns scientific notation x <- as.data.frame(x) - # remove group variable - x$Group <- NULL + # find numeric columns, only for these we need row/column sums + numeric_columns <- vapply(x, is.numeric, logical(1)) - # compute total N for rows and colummns + # compute total N for rows and columns total_n <- attributes(x)$total_n - total_column <- rowSums(x[, -1], na.rm = TRUE) - total_row <- c(colSums(x[, -1], na.rm = TRUE), total_n) + total_column <- rowSums(x[numeric_columns], na.rm = TRUE) + total_row <- c(colSums(x[numeric_columns], na.rm = TRUE), total_n) # proportions? props <- attributes(x)$proportions @@ -113,16 +118,16 @@ format.datawizard_crosstab <- function(x, format = "text", digits = 1, big_mark tmp <- x if (identical(props, "row")) { for (i in seq_len(nrow(x))) { - row_sum <- sum(x[i, -1], na.rm = TRUE) + row_sum <- sum(x[i, numeric_columns], na.rm = TRUE) if (row_sum == 0) { row_sum_string <- "(0%)" } else { - row_sum_string <- sprintf("(%.*f%%)", digits, 100 * x[i, -1] / row_sum) + row_sum_string <- sprintf("(%.*f%%)", digits, 100 * x[i, numeric_columns] / row_sum) } - tmp[i, -1] <- paste(format(x[i, -1]), format(row_sum_string, justify = "right")) + tmp[i, numeric_columns] <- paste(format(x[i, numeric_columns]), format(row_sum_string, justify = "right")) } } else if (identical(props, "column")) { - for (i in seq_len(ncol(x))[-1]) { + for (i in seq_len(ncol(x))[numeric_columns]) { col_sum <- sum(x[, i], na.rm = TRUE) if (col_sum == 0) { col_sum_string <- "(0%)" @@ -132,7 +137,7 @@ format.datawizard_crosstab <- function(x, format = "text", digits = 1, big_mark tmp[, i] <- paste(format(x[, i]), format(col_sum_string, justify = "right")) } } else if (identical(props, "full")) { - for (i in seq_len(ncol(x))[-1]) { + for (i in seq_len(ncol(x))[numeric_columns]) { tmp[, i] <- paste( format(x[, i]), format(sprintf("(%.*f%%)", digits, 100 * x[, i] / total_n), justify = "right") @@ -154,22 +159,29 @@ format.datawizard_crosstab <- function(x, format = "text", digits = 1, big_mark }) # Remove ".00" from numbers ftab$Total <- gsub("\\.00$", "", as.character(total_column)) - # for text format, insert "empty row" before last total row - if (identical(format, "text") || identical(format, "markdown")) { - empty_row <- as.data.frame(t(data.frame( - rep("", ncol(ftab)), - c("Total", as.character(total_row)), - stringsAsFactors = FALSE - ))) - } else { - empty_row <- as.data.frame(t(data.frame( - c("Total", as.character(total_row)), - stringsAsFactors = FALSE - ))) + + # add final total row to each sub-table. For multiple, collapsed table + # (i.e. when length of `by` > 1), we don't want multiple total rows in the + # table, so we would set include_total_row = FALSE for objects of class + # `datawizard_crosstabs` (note plural s!) + if (include_total_row) { + # for text format, insert "empty row" before last total row + if (identical(format, "text") || identical(format, "markdown")) { + empty_row <- as.data.frame(t(data.frame( + rep("", ncol(ftab)), + c("Total", as.character(total_row)), + stringsAsFactors = FALSE + ))) + } else { + empty_row <- as.data.frame(t(data.frame( + c("Total", as.character(total_row)), + stringsAsFactors = FALSE + ))) + } + colnames(empty_row) <- colnames(ftab) + ftab <- rbind(ftab, empty_row) + ftab[nrow(ftab), ] <- gsub("\\.00$", "", ftab[nrow(ftab), ]) } - colnames(empty_row) <- colnames(ftab) - ftab <- rbind(ftab, empty_row) - ftab[nrow(ftab), ] <- gsub("\\.00$", "", ftab[nrow(ftab), ]) # insert big marks? ftab$Total <- .add_commas_in_numbers(ftab$Total, big_mark) @@ -182,31 +194,30 @@ format.datawizard_crosstab <- function(x, format = "text", digits = 1, big_mark } + +# print, datawizard_crosstab --------------------- + + #' @export print.datawizard_crosstab <- function(x, big_mark = NULL, ...) { - # grouped data? if yes, add information on grouping factor - if (is.null(x[["Group"]])) { - caption <- NULL - } else { - caption <- paste0("Grouped by ", x[["Group"]][1]) - x$Group <- NULL - } - - # print table - cat(insight::export_table( - format(x, big_mark = big_mark, ...), - cross = "+", - missing = "", - caption = caption, - empty_line = "-", - ... - )) + .print_text_table(x, big_mark, format = "text", ...) invisible(x) } #' @export print_md.datawizard_crosstab <- function(x, big_mark = NULL, ...) { + .print_text_table(x, big_mark, format = "markdown", ...) +} + + +#' @export +print_html.datawizard_crosstab <- function(x, big_mark = NULL, ...) { + .print_text_table(x, big_mark, format = "html", ...) +} + + +.print_text_table <- function(x, big_mark = NULL, format = "text", ...) { # grouped data? if yes, add information on grouping factor if (is.null(x[["Group"]])) { caption <- NULL @@ -215,50 +226,57 @@ print_md.datawizard_crosstab <- function(x, big_mark = NULL, ...) { x$Group <- NULL } - # print table - insight::export_table( - format(x, format = "markdown", big_mark = big_mark, ...), - cross = "+", - missing = "", + # prepare table arguments + fun_args <- list( + format(x, big_mark = big_mark, format = format, ...), caption = caption, - empty_line = "-", - format = "markdown" + format = format ) -} - - -#' @export -print_html.datawizard_crosstab <- function(x, big_mark = NULL, ...) { - # grouped data? if yes, add information on grouping factor - if (!is.null(x[["Group"]])) { - x$groups <- paste0("Grouped by ", x[["Group"]][1]) - x$Group <- NULL + if (format != "html") { + fun_args$cross <- "+" + fun_args$empty_line <- "-" } + if (format == "text") { + fun_args$missing <- "" + } else { + fun_args$missing <- "(NA)" + } + out <- do.call(insight::export_table, c(fun_args, list(...))) # print table - insight::export_table( - format(x, big_mark = big_mark, format = "html", ...), - missing = "(NA)", - format = "html", - by = "groups" - ) + if (identical(format, "text")) { + cat(out) + } else { + out + } } +# print, datawizard_crosstabs --------------------- + + #' @export print.datawizard_crosstabs <- function(x, big_mark = NULL, ...) { - for (i in seq_along(x)) { - print(x[[i]], big_mark = big_mark, ...) - cat("\n") - } + .print_text_tables(x, big_mark, format = "text", ...) invisible(x) } +#' @export +print_md.datawizard_crosstabs <- function(x, big_mark = NULL, ...) { + .print_text_tables(x, big_mark, format = "markdown", ...) +} + + #' @export print_html.datawizard_crosstabs <- function(x, big_mark = NULL, ...) { + .print_text_tables(x, big_mark, format = "html", ...) +} + + +.print_text_tables <- function(x, big_mark = NULL, format = "text", ...) { if (length(x) == 1) { - print_html(x[[1]], big_mark = big_mark, ...) + .print_text_table(x[[1]], big_mark = big_mark, format = format, ...) } else { x <- lapply(x, function(i) { # grouped data? if yes, add information on grouping factor @@ -266,24 +284,89 @@ print_html.datawizard_crosstabs <- function(x, big_mark = NULL, ...) { i$groups <- paste0("Grouped by ", i[["Group"]][1]) i$Group <- NULL } - format(i, format = "html", big_mark = big_mark, ...) + # if we don't have the gt-grouping variable "groups" yet, we use it now + # for grouping. Else, we use a new column named "Variable", to avoid + # overwriting the groups-variable from grouped data frames + if (is.null(i$groups) && identical(format, "html")) { + grp_variable <- "groups" + } else { + grp_variable <- "Variable" + } + # first variable differs for each data frame, so we harmonize it here + i[[grp_variable]] <- colnames(i)[1] + colnames(i)[1] <- "Value" + # move column to first position + i <- data_relocate(i, select = grp_variable, before = 1) + # format data frame + format(i, format = format, big_mark = big_mark, include_total_row = FALSE, ...) }) + # now bind, but we need to check for equal number of columns + if (all(lengths(x) == max(length(x)))) { + out <- do.call(rbind, x) + } else { + # if not all tables have identical columns, we can use "data_merge()", + # which safely row-binds all data frames. However, the column order can be + # messed up, so we save column order here and restore it later + col_order <- colnames(x[[which.max(lengths(x))]]) + out <- data_merge(x, join = "bind")[col_order] + } - out <- do.call(rbind, x) + # split tables for grouped data frames + if (!is.null(out$groups)) { + out <- split(out, out$groups) + out <- lapply(out, function(subtable) { + # for text and markdown, if we split tables, we remove the "groups" + # variable. we need to keep it for HTML tables. + if (!identical(format, "html")) { + attr(subtable, "table_caption") <- c(unique(subtable$groups), "blue") + subtable$groups <- NULL + } + # remove duplicated names + for (grpvars in c("Variable", "Group")) { + if (!is.null(subtable[[grpvars]])) { + subtable[[grpvars]][duplicated(subtable[[grpvars]])] <- "" + } + } + subtable + }) + # no splitting of grouped data frames into list for HTML format, + # because splitting is done by the `by` argument later + if (identical(format, "html")) { + out <- do.call(rbind, out) + } + } - # print table - insight::export_table( + # prepare table arguments + fun_args <- list( out, - missing = "(NA)", - format = "html", + format = format, by = "groups" ) + if (format != "html") { + fun_args$cross <- "+" + fun_args$empty_line <- "-" + } + if (format == "text") { + fun_args$missing <- "" + } else { + fun_args$missing <- "(NA)" + } + out <- do.call(insight::export_table, c(fun_args, list(...))) + + # print table + if (identical(format, "text")) { + cat(out) + } else { + out + } } } + # helper --------------------- + .validate_by <- function(by, x) { if (!is.null(by)) { if (is.character(by)) { diff --git a/R/select_nse.R b/R/select_nse.R index a085a4ce3..aa9632eb0 100644 --- a/R/select_nse.R +++ b/R/select_nse.R @@ -198,13 +198,11 @@ } # small helper, to avoid duplicated code -.action_if_not_found <- function( - x, - columns, - matches, - verbose, - ifnotfound -) { +.action_if_not_found <- function(x, + columns, + matches, + verbose, + ifnotfound) { msg <- paste0( "Following variable(s) were not found: ", toString(x[is.na(matches)]) diff --git a/tests/testthat/_snaps/data_tabulate.md b/tests/testthat/_snaps/data_tabulate.md index ffde63088..323658ccc 100644 --- a/tests/testthat/_snaps/data_tabulate.md +++ b/tests/testthat/_snaps/data_tabulate.md @@ -311,7 +311,6 @@ | 5 (50.0%) | 4 (40.0%) | 1 (10.0%) | 10 ---------+------------+------------+-----------+------ Total | 45 | 50 | 5 | 100 - --- @@ -326,7 +325,6 @@ 3 | 4 (26.7%) | 11 (73.3%) | 15 ---------+------------+------------+------ Total | 40 | 46 | 86 - --- @@ -342,7 +340,6 @@ | 8 (57.1%) | 5 (35.7%) | 1 (7.1%) | 14 ---------+------------+------------+-----------+------ Total | 48 | 51 | 7 | 105 - --- @@ -357,7 +354,6 @@ 3 | 3 (21.4%) | 11 (78.6%) | 14 ---------+------------+------------+------ Total | 40 | 46 | 86 - --- @@ -372,7 +368,6 @@ | 5 (11.1%) | 4 (8.0%) | 1 (20.0%) | 10 ---------+------------+------------+-----------+------ Total | 45 | 50 | 5 | 100 - --- @@ -387,7 +382,6 @@ 3 | 4 (10.0%) | 11 (23.9%) | 15 ---------+------------+------------+------ Total | 40 | 46 | 86 - --- @@ -403,7 +397,6 @@ | 8 (16.7%) | 5 (9.8%) | 1 (14.3%) | 14 ---------+------------+------------+-----------+------ Total | 48 | 51 | 7 | 105 - --- @@ -418,7 +411,6 @@ 3 | 3 (7.5%) | 11 (23.9%) | 14 ---------+------------+------------+------ Total | 40 | 46 | 86 - # data_tabulate, cross tables, grouped df @@ -427,53 +419,42 @@ Output Grouped by e42dep (1) - c172code | male | | Total - ---------+------------+------------+------ - 2 | 2 (100.0%) | 0 (0.0%) | 2 - | 0 (0%) | 0 (0%) | 0 - ---------+------------+------------+------ - Total | 2 | 0 | 2 + Variable | Value | male | female | | Total + ---------+-------+------------+--------+------------+------ + c172code | 2 | 2 (100.0%) | | 0 (0.0%) | 2 + | NA | 0 (0%) | | 0 (0%) | 0 Grouped by e42dep (2) - c172code | male | female | | Total - ---------+-----------+-----------+-----------+------ - 2 | 2 (50.0%) | 2 (50.0%) | 0 (0.0%) | 4 - | 0 (0%) | 0 (0%) | 0 (0%) | 0 - ---------+-----------+-----------+-----------+------ - Total | 2 | 2 | 0 | 4 + Variable | Value | male | female | | Total + ---------+-------+-----------+-----------+-----------+------ + c172code | 2 | 2 (50.0%) | 2 (50.0%) | 0 (0.0%) | 4 + | NA | 0 (0%) | 0 (0%) | 0 (0%) | 0 Grouped by e42dep (3) - c172code | male | female | | Total - ---------+-----------+------------+-----------+------ - 1 | 2 (50.0%) | 2 (50.0%) | 0 (0.0%) | 4 - 2 | 4 (25.0%) | 11 (68.8%) | 1 (6.2%) | 16 - 3 | 1 (16.7%) | 5 (83.3%) | 0 (0.0%) | 6 - | 1 (50.0%) | 0 (0.0%) | 1 (50.0%) | 2 - ---------+-----------+------------+-----------+------ - Total | 8 | 18 | 2 | 28 + Variable | Value | male | female | | Total + ---------+-------+-----------+------------+-----------+------ + c172code | 1 | 2 (50.0%) | 2 (50.0%) | 0 (0.0%) | 4 + | 2 | 4 (25.0%) | 11 (68.8%) | 1 (6.2%) | 16 + | 3 | 1 (16.7%) | 5 (83.3%) | 0 (0.0%) | 6 + | NA | 1 (50.0%) | 0 (0.0%) | 1 (50.0%) | 2 Grouped by e42dep (4) - c172code | male | female | | Total - ---------+------------+------------+-----------+------ - 1 | 3 (75.0%) | 0 (0.0%) | 1 (25.0%) | 4 - 2 | 23 (54.8%) | 18 (42.9%) | 1 (2.4%) | 42 - 3 | 3 (30.0%) | 6 (60.0%) | 1 (10.0%) | 10 - | 3 (42.9%) | 4 (57.1%) | 0 (0.0%) | 7 - ---------+------------+------------+-----------+------ - Total | 32 | 28 | 3 | 63 + Variable | Value | male | female | | Total + ---------+-------+------------+------------+-----------+------ + c172code | 1 | 3 (75.0%) | 0 (0.0%) | 1 (25.0%) | 4 + | 2 | 23 (54.8%) | 18 (42.9%) | 1 (2.4%) | 42 + | 3 | 3 (30.0%) | 6 (60.0%) | 1 (10.0%) | 10 + | NA | 3 (42.9%) | 4 (57.1%) | 0 (0.0%) | 7 Grouped by e42dep (NA) - c172code | male | female | | Total - ---------+------------+------------+------------+------ - 2 | 0 (0.0%) | 2 (100.0%) | 0 (0.0%) | 2 - | 1 (100.0%) | 0 (0.0%) | 0 (0.0%) | 1 - ---------+------------+------------+------------+------ - Total | 1 | 2 | 0 | 3 - + Variable | Value | male | female | | Total + ---------+-------+------------+------------+------------+------ + c172code | 2 | 0 (0.0%) | 2 (100.0%) | 0 (0.0%) | 2 + | NA | 1 (100.0%) | 0 (0.0%) | 0 (0.0%) | 1 # data_tabulate, cross tables, markdown @@ -485,7 +466,7 @@ [3] "|1 | 5 (5.0%)| 2 (2.0%)|1 (1.0%) | 8|" [4] "|2 | 31 (31.0%)| 33 (33.0%)|2 (2.0%) | 66|" [5] "|3 | 4 (4.0%)| 11 (11.0%)|1 (1.0%) | 16|" - [6] "| | 5 (5.0%)| 4 (4.0%)|1 (1.0%) | 10|" + [6] "|(NA) | 5 (5.0%)| 4 (4.0%)|1 (1.0%) | 10|" [7] "| | | | | |" [8] "|Total | 45| 50| 5 | 100|" attr(,"format") @@ -522,7 +503,7 @@ [3] "|1 | 5 (4.8%)| 3 (2.9%)|2 (1.9%) | 10|" [4] "|2 | 32 (30.5%)| 32 (30.5%)|3 (2.9%) | 67|" [5] "|3 | 3 (2.9%)| 11 (10.5%)|1 (1.0%) | 15|" - [6] "| | 8 (7.6%)| 5 (4.8%)|1 (1.0%) | 14|" + [6] "|(NA) | 8 (7.6%)| 5 (4.8%)|1 (1.0%) | 14|" [7] "| | | | | |" [8] "|Total | 48| 51| 7 | 105|" attr(,"format") diff --git a/tests/testthat/test-data_tabulate.R b/tests/testthat/test-data_tabulate.R index 74c3c2f23..64cc45883 100644 --- a/tests/testthat/test-data_tabulate.R +++ b/tests/testthat/test-data_tabulate.R @@ -339,6 +339,8 @@ test_that("data_tabulate, cross tables, grouped df", { efc$e16sex[sample.int(nrow(efc), 5)] <- NA grp <- data_group(efc, "e42dep") expect_snapshot(print(data_tabulate(grp, "c172code", by = "e16sex", proportions = "row"))) + skip_if_not_installed("gt") + expect_s3_class(print_html(data_tabulate(grp, "c172code", by = "e16sex", proportions = "row")), "gt_tbl") # nolint }) test_that("data_tabulate, cross tables, errors by", {