Fix data tab (#577)

* Fix `data_tabulate()` for multiple `by` in HTML * add * news * fix * comments * fix * fix * styler * styler * ... * fix * fix * add test * fix
easystats · Dec 23, 2024 · a58a031 · a58a031
1 parent 3533bba
commit a58a031
Show file tree

Hide file tree

Showing 7 changed files with 201 additions and 131 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Type: Package
 Package: datawizard
 Title: Easy Data Wrangling and Statistical Transformations
-Version: 0.13.0.19
+Version: 0.13.0.20
 Authors@R: c(
     person("Indrajeet", "Patil", , "[email protected]", role = "aut",
            comment = c(ORCID = "0000-0003-1995-6531")),

diff --git a/NAMESPACE b/NAMESPACE
@@ -116,6 +116,7 @@ S3method(print_html,datawizard_tables)
 S3method(print_html,dw_data_peek)
 S3method(print_md,data_codebook)
 S3method(print_md,datawizard_crosstab)
+S3method(print_md,datawizard_crosstabs)
 S3method(print_md,datawizard_table)
 S3method(print_md,datawizard_tables)
 S3method(print_md,dw_data_peek)

diff --git a/NEWS.md b/NEWS.md
@@ -4,18 +4,23 @@ BREAKING CHANGES AND DEPRECATIONS
 
 * *datawizard* now requires R >= 4.0 (#515).
 
-* Argument `drop_na` in `data_match()` is deprecated now. Please use 
+* Argument `drop_na` in `data_match()` is deprecated now. Please use
   `remove_na` instead.
 
 * In `data_rename()` (#567):
   - argument `pattern` is deprecated. Use `select` instead.
-  - argument `safe` is deprecated. The function now errors when `select` 
+  - argument `safe` is deprecated. The function now errors when `select`
     contains unknown column names.
   - when `replacement` is `NULL`, an error is now thrown (previously, column
     indices were used as new names).
   - if `select` (previously `pattern`) is a named vector, then all elements
     must be named, e.g. `c(length = "Sepal.Length", "Sepal.Width")` errors.
 
+* `print()` methods for `data_tabulate()` with multiple sub-tables (i.e. when
+  length of `by` was > 1) were revised. Now, an integrated table instead of
+  multiple tables is returned. Furthermore, `print_html()` did not work, which
+  was also fixed now.
+
 CHANGES
 
 * The `select` argument, which is available in different functions to select

diff --git a/R/data_xtabulate.R b/R/data_xtabulate.R
@@ -90,18 +90,23 @@
 
 
 #' @export
-format.datawizard_crosstab <- function(x, format = "text", digits = 1, big_mark = NULL, ...) {
+format.datawizard_crosstab <- function(x,
+                                       format = "text",
+                                       digits = 1,
+                                       big_mark = NULL,
+                                       include_total_row = TRUE,
+                                       ...) {
   # convert to character manually, else, for large numbers,
   # format_table() returns scientific notation
   x <- as.data.frame(x)
 
-  # remove group variable
-  x$Group <- NULL
+  # find numeric columns, only for these we need row/column sums
+  numeric_columns <- vapply(x, is.numeric, logical(1))
 
-  # compute total N for rows and colummns
+  # compute total N for rows and columns
   total_n <- attributes(x)$total_n
-  total_column <- rowSums(x[, -1], na.rm = TRUE)
-  total_row <- c(colSums(x[, -1], na.rm = TRUE), total_n)
+  total_column <- rowSums(x[numeric_columns], na.rm = TRUE)
+  total_row <- c(colSums(x[numeric_columns], na.rm = TRUE), total_n)
 
   # proportions?
   props <- attributes(x)$proportions
@@ -113,16 +118,16 @@ format.datawizard_crosstab <- function(x, format = "text", digits = 1, big_mark
     tmp <- x
     if (identical(props, "row")) {
       for (i in seq_len(nrow(x))) {
-        row_sum <- sum(x[i, -1], na.rm = TRUE)
+        row_sum <- sum(x[i, numeric_columns], na.rm = TRUE)
         if (row_sum == 0) {
           row_sum_string <- "(0%)"
         } else {
-          row_sum_string <- sprintf("(%.*f%%)", digits, 100 * x[i, -1] / row_sum)
+          row_sum_string <- sprintf("(%.*f%%)", digits, 100 * x[i, numeric_columns] / row_sum)
         }
-        tmp[i, -1] <- paste(format(x[i, -1]), format(row_sum_string, justify = "right"))
+        tmp[i, numeric_columns] <- paste(format(x[i, numeric_columns]), format(row_sum_string, justify = "right"))
       }
     } else if (identical(props, "column")) {
-      for (i in seq_len(ncol(x))[-1]) {
+      for (i in seq_len(ncol(x))[numeric_columns]) {
         col_sum <- sum(x[, i], na.rm = TRUE)
         if (col_sum == 0) {
           col_sum_string <- "(0%)"
@@ -132,7 +137,7 @@ format.datawizard_crosstab <- function(x, format = "text", digits = 1, big_mark
         tmp[, i] <- paste(format(x[, i]), format(col_sum_string, justify = "right"))
       }
     } else if (identical(props, "full")) {
-      for (i in seq_len(ncol(x))[-1]) {
+      for (i in seq_len(ncol(x))[numeric_columns]) {
         tmp[, i] <- paste(
           format(x[, i]),
           format(sprintf("(%.*f%%)", digits, 100 * x[, i] / total_n), justify = "right")
@@ -154,22 +159,29 @@ format.datawizard_crosstab <- function(x, format = "text", digits = 1, big_mark
   })
   # Remove ".00" from numbers
   ftab$Total <- gsub("\\.00$", "", as.character(total_column))
-  # for text format, insert "empty row" before last total row
-  if (identical(format, "text") || identical(format, "markdown")) {
-    empty_row <- as.data.frame(t(data.frame(
-      rep("", ncol(ftab)),
-      c("Total", as.character(total_row)),
-      stringsAsFactors = FALSE
-    )))
-  } else {
-    empty_row <- as.data.frame(t(data.frame(
-      c("Total", as.character(total_row)),
-      stringsAsFactors = FALSE
-    )))
+
+  # add final total row to each sub-table. For multiple, collapsed table
+  # (i.e. when length of `by` > 1), we don't want multiple total rows in the
+  # table, so we would set include_total_row = FALSE for objects of class
+  # `datawizard_crosstabs` (note plural s!)
+  if (include_total_row) {
+    # for text format, insert "empty row" before last total row
+    if (identical(format, "text") || identical(format, "markdown")) {
+      empty_row <- as.data.frame(t(data.frame(
+        rep("", ncol(ftab)),
+        c("Total", as.character(total_row)),
+        stringsAsFactors = FALSE
+      )))
+    } else {
+      empty_row <- as.data.frame(t(data.frame(
+        c("Total", as.character(total_row)),
+        stringsAsFactors = FALSE
+      )))
+    }
+    colnames(empty_row) <- colnames(ftab)
+    ftab <- rbind(ftab, empty_row)
+    ftab[nrow(ftab), ] <- gsub("\\.00$", "", ftab[nrow(ftab), ])
   }
-  colnames(empty_row) <- colnames(ftab)
-  ftab <- rbind(ftab, empty_row)
-  ftab[nrow(ftab), ] <- gsub("\\.00$", "", ftab[nrow(ftab), ])
 
   # insert big marks?
   ftab$Total <- .add_commas_in_numbers(ftab$Total, big_mark)
@@ -182,31 +194,30 @@ format.datawizard_crosstab <- function(x, format = "text", digits = 1, big_mark
 }
 
 
+
+# print, datawizard_crosstab ---------------------
+
+
 #' @export
 print.datawizard_crosstab <- function(x, big_mark = NULL, ...) {
-  # grouped data? if yes, add information on grouping factor
-  if (is.null(x[["Group"]])) {
-    caption <- NULL
-  } else {
-    caption <- paste0("Grouped by ", x[["Group"]][1])
-    x$Group <- NULL
-  }
-
-  # print table
-  cat(insight::export_table(
-    format(x, big_mark = big_mark, ...),
-    cross = "+",
-    missing = "<NA>",
-    caption = caption,
-    empty_line = "-",
-    ...
-  ))
+  .print_text_table(x, big_mark, format = "text", ...)
   invisible(x)
 }
 
 
 #' @export
 print_md.datawizard_crosstab <- function(x, big_mark = NULL, ...) {
+  .print_text_table(x, big_mark, format = "markdown", ...)
+}
+
+
+#' @export
+print_html.datawizard_crosstab <- function(x, big_mark = NULL, ...) {
+  .print_text_table(x, big_mark, format = "html", ...)
+}
+
+
+.print_text_table <- function(x, big_mark = NULL, format = "text", ...) {
   # grouped data? if yes, add information on grouping factor
   if (is.null(x[["Group"]])) {
     caption <- NULL
@@ -215,75 +226,147 @@ print_md.datawizard_crosstab <- function(x, big_mark = NULL, ...) {
     x$Group <- NULL
   }
 
-  # print table
-  insight::export_table(
-    format(x, format = "markdown", big_mark = big_mark, ...),
-    cross = "+",
-    missing = "<NA>",
+  # prepare table arguments
+  fun_args <- list(
+    format(x, big_mark = big_mark, format = format, ...),
     caption = caption,
-    empty_line = "-",
-    format = "markdown"
+    format = format
   )
-}
-
-
-#' @export
-print_html.datawizard_crosstab <- function(x, big_mark = NULL, ...) {
-  # grouped data? if yes, add information on grouping factor
-  if (!is.null(x[["Group"]])) {
-    x$groups <- paste0("Grouped by ", x[["Group"]][1])
-    x$Group <- NULL
+  if (format != "html") {
+    fun_args$cross <- "+"
+    fun_args$empty_line <- "-"
   }
+  if (format == "text") {
+    fun_args$missing <- "<NA>"
+  } else {
+    fun_args$missing <- "(NA)"
+  }
+  out <- do.call(insight::export_table, c(fun_args, list(...)))
 
   # print table
-  insight::export_table(
-    format(x, big_mark = big_mark, format = "html", ...),
-    missing = "(NA)",
-    format = "html",
-    by = "groups"
-  )
+  if (identical(format, "text")) {
+    cat(out)
+  } else {
+    out
+  }
 }
 
 
+# print, datawizard_crosstabs ---------------------
+
+
 #' @export
 print.datawizard_crosstabs <- function(x, big_mark = NULL, ...) {
-  for (i in seq_along(x)) {
-    print(x[[i]], big_mark = big_mark, ...)
-    cat("\n")
-  }
+  .print_text_tables(x, big_mark, format = "text", ...)
   invisible(x)
 }
 
 
+#' @export
+print_md.datawizard_crosstabs <- function(x, big_mark = NULL, ...) {
+  .print_text_tables(x, big_mark, format = "markdown", ...)
+}
+
+
 #' @export
 print_html.datawizard_crosstabs <- function(x, big_mark = NULL, ...) {
+  .print_text_tables(x, big_mark, format = "html", ...)
+}
+
+
+.print_text_tables <- function(x, big_mark = NULL, format = "text", ...) {
   if (length(x) == 1) {
-    print_html(x[[1]], big_mark = big_mark, ...)
+    .print_text_table(x[[1]], big_mark = big_mark, format = format, ...)
   } else {
     x <- lapply(x, function(i) {
       # grouped data? if yes, add information on grouping factor
       if (!is.null(i[["Group"]])) {
         i$groups <- paste0("Grouped by ", i[["Group"]][1])
         i$Group <- NULL
       }
-      format(i, format = "html", big_mark = big_mark, ...)
+      # if we don't have the gt-grouping variable "groups" yet, we use it now
+      # for grouping. Else, we use a new column named "Variable", to avoid
+      # overwriting the groups-variable from grouped data frames
+      if (is.null(i$groups) && identical(format, "html")) {
+        grp_variable <- "groups"
+      } else {
+        grp_variable <- "Variable"
+      }
+      # first variable differs for each data frame, so we harmonize it here
+      i[[grp_variable]] <- colnames(i)[1]
+      colnames(i)[1] <- "Value"
+      # move column to first position
+      i <- data_relocate(i, select = grp_variable, before = 1)
+      # format data frame
+      format(i, format = format, big_mark = big_mark, include_total_row = FALSE, ...)
     })
+    # now bind, but we need to check for equal number of columns
+    if (all(lengths(x) == max(length(x)))) {
+      out <- do.call(rbind, x)
+    } else {
+      # if not all tables have identical columns, we can use "data_merge()",
+      # which safely row-binds all data frames. However, the column order can be
+      # messed up, so we save column order here and restore it later
+      col_order <- colnames(x[[which.max(lengths(x))]])
+      out <- data_merge(x, join = "bind")[col_order]
+    }
 
-    out <- do.call(rbind, x)
+    # split tables for grouped data frames
+    if (!is.null(out$groups)) {
+      out <- split(out, out$groups)
+      out <- lapply(out, function(subtable) {
+        # for text and markdown, if we split tables, we remove the "groups"
+        # variable. we need to keep it for HTML tables.
+        if (!identical(format, "html")) {
+          attr(subtable, "table_caption") <- c(unique(subtable$groups), "blue")
+          subtable$groups <- NULL
+        }
+        # remove duplicated names
+        for (grpvars in c("Variable", "Group")) {
+          if (!is.null(subtable[[grpvars]])) {
+            subtable[[grpvars]][duplicated(subtable[[grpvars]])] <- ""
+          }
+        }
+        subtable
+      })
+      # no splitting of grouped data frames into list for HTML format,
+      # because splitting is done by the `by` argument later
+      if (identical(format, "html")) {
+        out <- do.call(rbind, out)
+      }
+    }
 
-    # print table
-    insight::export_table(
+    # prepare table arguments
+    fun_args <- list(
       out,
-      missing = "(NA)",
-      format = "html",
+      format = format,
       by = "groups"
     )
+    if (format != "html") {
+      fun_args$cross <- "+"
+      fun_args$empty_line <- "-"
+    }
+    if (format == "text") {
+      fun_args$missing <- "<NA>"
+    } else {
+      fun_args$missing <- "(NA)"
+    }
+    out <- do.call(insight::export_table, c(fun_args, list(...)))
+
+    # print table
+    if (identical(format, "text")) {
+      cat(out)
+    } else {
+      out
+    }
   }
 }
 
 
+
 # helper ---------------------
 
+
 .validate_by <- function(by, x) {
   if (!is.null(by)) {
     if (is.character(by)) {

diff --git a/R/select_nse.R b/R/select_nse.R
@@ -198,13 +198,11 @@
 }
 
 # small helper, to avoid duplicated code
-.action_if_not_found <- function(
-  x,
-  columns,
-  matches,
-  verbose,
-  ifnotfound
-) {
+.action_if_not_found <- function(x,
+                                 columns,
+                                 matches,
+                                 verbose,
+                                 ifnotfound) {
   msg <- paste0(
     "Following variable(s) were not found: ",
     toString(x[is.na(matches)])