From be6e2bffb59fbcc21ee92bb07225989fd94f12e4 Mon Sep 17 00:00:00 2001
From: Daniel <mail@danielluedecke.de>
Date: Tue, 13 Feb 2024 22:43:03 +0100
Subject: [PATCH] `data_tabulate()` gains `by` argument for crosstables (#481)

* `data_tabluate()` gains `by` argument for crosstables

* fix

* fix

* fix

* fix

* fix

* fix

* fix

* fix

* fix

* fix

* version

* fix

* update tests

* fixes update tests

* docs, add print_html methods

* update news

* code structure

* fixes

* add tests

* print markdown method

* add tests for markdown print

* lintr

* align values in tables

* tests for HTML

* add test

* use same column as rowname

* Update NEWS.md

Co-authored-by: Etienne Bacher <52219252+etiennebacher@users.noreply.github.com>

* address comments

* cell -> full

* update snapshots

* one more test

---------

Co-authored-by: Etienne Bacher <52219252+etiennebacher@users.noreply.github.com>
---
 DESCRIPTION                            |   2 +-
 NAMESPACE                              |   6 +
 NEWS.md                                |   5 +-
 R/data_arrange.R                       |   6 +-
 R/data_tabulate.R                      | 160 ++++++++++--
 R/data_xtabulate.R                     | 347 +++++++++++++++++++++++++
 inst/WORDLIST                          |   2 +
 man/data_tabulate.Rd                   |  61 ++++-
 tests/testthat/_snaps/data_tabulate.md | 319 ++++++++++++++++++++++-
 tests/testthat/test-data_tabulate.R    | 158 ++++++++++-
 10 files changed, 1024 insertions(+), 42 deletions(-)
 create mode 100644 R/data_xtabulate.R

diff --git a/DESCRIPTION b/DESCRIPTION
index 21699fa07..048d15065 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,7 +1,7 @@
 Type: Package
 Package: datawizard
 Title: Easy Data Wrangling and Statistical Transformations
-Version: 0.9.1.3
+Version: 0.9.1.4
 Authors@R: c(
     person("Indrajeet", "Patil", , "patilindrajeet.science@gmail.com", role = "aut",
            comment = c(ORCID = "0000-0003-1995-6531", Twitter = "@patilindrajeets")),
diff --git a/NAMESPACE b/NAMESPACE
index a2a911db3..e89863807 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -67,6 +67,7 @@ S3method(describe_distribution,numeric)
 S3method(format,data_codebook)
 S3method(format,dw_data_peek)
 S3method(format,dw_data_tabulate)
+S3method(format,dw_data_xtabulate)
 S3method(format,dw_groupmeans)
 S3method(format,parameters_distribution)
 S3method(kurtosis,data.frame)
@@ -91,6 +92,8 @@ S3method(print,data_seek)
 S3method(print,dw_data_peek)
 S3method(print,dw_data_tabulate)
 S3method(print,dw_data_tabulates)
+S3method(print,dw_data_xtabulate)
+S3method(print,dw_data_xtabulates)
 S3method(print,dw_groupmeans)
 S3method(print,dw_groupmeans_list)
 S3method(print,dw_transformer)
@@ -102,10 +105,13 @@ S3method(print_html,data_codebook)
 S3method(print_html,dw_data_peek)
 S3method(print_html,dw_data_tabulate)
 S3method(print_html,dw_data_tabulates)
+S3method(print_html,dw_data_xtabulate)
+S3method(print_html,dw_data_xtabulates)
 S3method(print_md,data_codebook)
 S3method(print_md,dw_data_peek)
 S3method(print_md,dw_data_tabulate)
 S3method(print_md,dw_data_tabulates)
+S3method(print_md,dw_data_xtabulate)
 S3method(ranktransform,data.frame)
 S3method(ranktransform,factor)
 S3method(ranktransform,grouped_df)
diff --git a/NEWS.md b/NEWS.md
index ffb41206d..968c9d417 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -5,7 +5,10 @@ CHANGES
 * `data_modify()` gets three new arguments, `.at`, `.if` and `.modify`, to modify
   variables at specific positions or based on logical conditions.
 
-* `data_tabulate()` gets a `weights` argument, to compute weighted frequency tables.
+* `data_tabulate()` was revised and gets several new arguments: a `weights`
+  argument, to compute weighted frequency tables. `include_na` allows to include
+  or omit missing values from the table. Furthermore, a `by` argument was added,
+  to compute crosstables (#479, #481).
 
 # datawizard 0.9.1
 
diff --git a/R/data_arrange.R b/R/data_arrange.R
index 5666fde42..58c8a5ab3 100644
--- a/R/data_arrange.R
+++ b/R/data_arrange.R
@@ -46,8 +46,8 @@ data_arrange.default <- function(data, select = NULL, safe = TRUE) {
   dont_exist <- select[which(!select %in% names(data))]
 
   if (length(dont_exist) > 0) {
-    if (!safe) {
-      insight::format_error(
+    if (safe) {
+      insight::format_warning(
         paste0(
           "The following column(s) don't exist in the dataset: ",
           text_concatenate(dont_exist), "."
@@ -55,7 +55,7 @@ data_arrange.default <- function(data, select = NULL, safe = TRUE) {
         .misspelled_string(names(data), dont_exist, "Possibly misspelled?")
       )
     } else {
-      insight::format_warning(
+      insight::format_error(
         paste0(
           "The following column(s) don't exist in the dataset: ",
           text_concatenate(dont_exist), "."
diff --git a/R/data_tabulate.R b/R/data_tabulate.R
index 4d3bb810b..92432a602 100644
--- a/R/data_tabulate.R
+++ b/R/data_tabulate.R
@@ -1,20 +1,30 @@
-#' @title Create frequency tables of variables
+#' @title Create frequency and crosstables of variables
 #' @name data_tabulate
 #'
-#' @description This function creates frequency tables of variables, including
-#' the number of levels/values as well as the distribution of raw, valid and
-#' cumulative percentages.
+#' @description This function creates frequency or crosstables of variables,
+#' including the number of levels/values as well as the distribution of raw,
+#' valid and cumulative percentages. For crosstables, row, column  and cell
+#' percentages can be calculated.
 #'
 #' @param x A (grouped) data frame, a vector or factor.
-#' @param drop_levels Logical, if `TRUE`, factor levels that do not occur in
+#' @param by Optional vector or factor. If supplied, a crosstable is created.
+#' If `x` is a data frame, `by` can also be a character string indicating the
+#' name of a variable in `x`.
+#' @param drop_levels Logical, if `FALSE`, factor levels that do not occur in
 #' the data are included in the table (with frequency of zero), else unused
 #' factor levels are dropped from the frequency table.
 #' @param name Optional character string, which includes the name that is used
 #' for printing.
+#' @param include_na Logical, if `TRUE`, missing values are included in the
+#' frequency or crosstable, else missing values are omitted.
 #' @param collapse Logical, if `TRUE` collapses multiple tables into one larger
 #' table for printing. This affects only printing, not the returned object.
 #' @param weights Optional numeric vector of weights. Must be of the same length
 #' as `x`. If `weights` is supplied, weighted frequencies are calculated.
+#' @param proportions Optional character string, indicating the type of
+#' percentages to be calculated. Only applies to crosstables, i.e. when `by` is
+#' not `NULL`. Can be `"row"` (row percentages), `"column"` (column percentages)
+#' or `"full"` (to calculate relative frequencies for the full table).
 #' @param ... not used.
 #' @inheritParams find_columns
 #'
@@ -22,11 +32,16 @@
 #' as data frame per variable.
 #'
 #' @examplesIf requireNamespace("poorman")
+#' # frequency tables -------
+#' # ------------------------
 #' data(efc)
 #'
 #' # vector/factor
 #' data_tabulate(efc$c172code)
 #'
+#' # drop missing values
+#' data_tabulate(efc$c172code, include_na = FALSE)
+#'
 #' # data frame
 #' data_tabulate(efc, c("e42dep", "c172code"))
 #'
@@ -54,6 +69,30 @@
 #' efc$weights <- abs(rnorm(n = nrow(efc), mean = 1, sd = 0.5))
 #' data_tabulate(efc$e42dep, weights = efc$weights)
 #'
+#' # crosstables ------
+#' # ------------------
+#'
+#' # add some missing values
+#' set.seed(123)
+#' efc$e16sex[sample.int(nrow(efc), 5)] <- NA
+#'
+#' data_tabulate(efc, "c172code", by = "e16sex")
+#'
+#' # add row and column percentages
+#' data_tabulate(efc, "c172code", by = "e16sex", proportions = "row")
+#' data_tabulate(efc, "c172code", by = "e16sex", proportions = "column")
+#'
+#' # omit missing values
+#' data_tabulate(
+#'   efc$c172code,
+#'   by = efc$e16sex,
+#'   proportions = "column",
+#'   include_na = FALSE
+#' )
+#'
+#' # round percentages
+#' out <- data_tabulate(efc, "c172code", by = "e16sex", proportions = "column")
+#' print(out, digits = 0)
 #' @export
 data_tabulate <- function(x, ...) {
   UseMethod("data_tabulate")
@@ -62,7 +101,15 @@ data_tabulate <- function(x, ...) {
 
 #' @rdname data_tabulate
 #' @export
-data_tabulate.default <- function(x, drop_levels = FALSE, weights = NULL, name = NULL, verbose = TRUE, ...) {
+data_tabulate.default <- function(x,
+                                  by = NULL,
+                                  drop_levels = FALSE,
+                                  weights = NULL,
+                                  include_na = TRUE,
+                                  proportions = NULL,
+                                  name = NULL,
+                                  verbose = TRUE,
+                                  ...) {
   # save label attribute, before it gets lost...
   var_label <- attr(x, "label", exact = TRUE)
 
@@ -78,25 +125,52 @@ data_tabulate.default <- function(x, drop_levels = FALSE, weights = NULL, name =
     x <- droplevels(x)
   }
 
-  # check for correct length of weights - must be equal to "x"
-  if (!is.null(weights) && length(weights) != length(x)) {
-    insight::format_error("Length of `weights` must be equal to length of `x`.")
+  # validate "weights"
+  weights <- .validate_table_weights(weights, x)
+
+  # we go into another function for crosstables here...
+  if (!is.null(by)) {
+    by <- .validate_by(by, x)
+    return(.crosstable(
+      x,
+      by = by,
+      weights = weights,
+      include_na = include_na,
+      proportions = proportions,
+      obj_name = obj_name,
+      group_variable = group_variable
+    ))
   }
 
   # frequency table
   if (is.null(weights)) {
-    freq_table <- tryCatch(table(addNA(x)), error = function(e) NULL)
-  } else {
-    # weighted frequency table
+    if (include_na) {
+      freq_table <- tryCatch(table(addNA(x)), error = function(e) NULL)
+    } else {
+      freq_table <- tryCatch(table(x), error = function(e) NULL)
+    }
+  } else if (include_na) {
+    # weighted frequency table, including NA
     freq_table <- tryCatch(
       stats::xtabs(
         weights ~ x,
-        data = data.frame(weights = weights, x = x),
+        data = data.frame(weights = weights, x = addNA(x)),
         na.action = stats::na.pass,
         addNA = TRUE
       ),
       error = function(e) NULL
     )
+  } else {
+    # weighted frequency table, excluding NA
+    freq_table <- tryCatch(
+      stats::xtabs(
+        weights ~ x,
+        data = data.frame(weights = weights, x = x),
+        na.action = stats::na.omit,
+        addNA = FALSE
+      ),
+      error = function(e) NULL
+    )
   }
 
   if (is.null(freq_table)) {
@@ -115,7 +189,14 @@ data_tabulate.default <- function(x, drop_levels = FALSE, weights = NULL, name =
   }
 
   out$`Raw %` <- 100 * out$N / sum(out$N)
-  out$`Valid %` <- c(100 * out$N[-nrow(out)] / sum(out$N[-nrow(out)]), NA)
+  # if we have missing values, we add a row with NA
+  if (include_na) {
+    out$`Valid %` <- c(100 * out$N[-nrow(out)] / sum(out$N[-nrow(out)]), NA)
+    valid_n <- sum(out$N[-length(out$N)], na.rm = TRUE)
+  } else {
+    out$`Valid %` <- 100 * out$N / sum(out$N)
+    valid_n <- sum(out$N, na.rm = TRUE)
+  }
   out$`Cumulative %` <- cumsum(out$`Valid %`)
 
   # add information about variable/group names
@@ -144,7 +225,7 @@ data_tabulate.default <- function(x, drop_levels = FALSE, weights = NULL, name =
   attr(out, "weights") <- weights
 
   attr(out, "total_n") <- sum(out$N, na.rm = TRUE)
-  attr(out, "valid_n") <- sum(out$N[-length(out$N)], na.rm = TRUE)
+  attr(out, "valid_n") <- valid_n
 
   class(out) <- c("dw_data_tabulate", "data.frame")
 
@@ -159,9 +240,12 @@ data_tabulate.data.frame <- function(x,
                                      exclude = NULL,
                                      ignore_case = FALSE,
                                      regex = FALSE,
-                                     collapse = FALSE,
+                                     by = NULL,
                                      drop_levels = FALSE,
                                      weights = NULL,
+                                     include_na = TRUE,
+                                     proportions = NULL,
+                                     collapse = FALSE,
                                      verbose = TRUE,
                                      ...) {
   # evaluate arguments
@@ -172,11 +256,31 @@ data_tabulate.data.frame <- function(x,
     regex = regex,
     verbose = verbose
   )
+
+  # validate "by"
+  by <- .validate_by(by, x)
+  # validate "weights"
+  weights <- .validate_table_weights(weights, x)
+
   out <- lapply(select, function(i) {
-    data_tabulate(x[[i]], drop_levels = drop_levels, weights = weights, name = i, verbose = verbose, ...)
+    data_tabulate(
+      x[[i]],
+      by = by,
+      proportions = proportions,
+      drop_levels = drop_levels,
+      weights = weights,
+      include_na = include_na,
+      name = i,
+      verbose = verbose,
+      ...
+    )
   })
 
-  class(out) <- c("dw_data_tabulates", "list")
+  if (is.null(by)) {
+    class(out) <- c("dw_data_tabulates", "list")
+  } else {
+    class(out) <- c("dw_data_xtabulates", "list")
+  }
   attr(out, "collapse") <- isTRUE(collapse)
   attr(out, "is_weighted") <- !is.null(weights)
 
@@ -190,10 +294,13 @@ data_tabulate.grouped_df <- function(x,
                                      exclude = NULL,
                                      ignore_case = FALSE,
                                      regex = FALSE,
-                                     verbose = TRUE,
-                                     collapse = FALSE,
+                                     by = NULL,
+                                     proportions = NULL,
                                      drop_levels = FALSE,
                                      weights = NULL,
+                                     include_na = TRUE,
+                                     collapse = FALSE,
+                                     verbose = TRUE,
                                      ...) {
   # works only for dplyr >= 0.8.0
   grps <- attr(x, "groups", exact = TRUE)
@@ -210,6 +317,7 @@ data_tabulate.grouped_df <- function(x,
   )
 
   x <- as.data.frame(x)
+
   out <- list()
   for (i in seq_along(grps)) {
     rows <- grps[[i]]
@@ -227,11 +335,18 @@ data_tabulate.grouped_df <- function(x,
       verbose = verbose,
       drop_levels = drop_levels,
       weights = weights,
+      include_na = include_na,
+      by = by,
+      proportions = proportions,
       group_variable = group_variable,
       ...
     ))
   }
-  class(out) <- c("dw_data_tabulates", "list")
+  if (is.null(by)) {
+    class(out) <- c("dw_data_tabulates", "list")
+  } else {
+    class(out) <- c("dw_data_xtabulates", "list")
+  }
   attr(out, "collapse") <- isTRUE(collapse)
   attr(out, "is_weighted") <- !is.null(weights)
 
@@ -239,8 +354,6 @@ data_tabulate.grouped_df <- function(x,
 }
 
 
-
-
 # methods --------------------
 
 #' @importFrom insight print_html
@@ -286,7 +399,6 @@ format.dw_data_tabulate <- function(x, format = "text", big_mark = NULL, ...) {
 }
 
 
-
 #' @export
 print.dw_data_tabulate <- function(x, big_mark = NULL, ...) {
   a <- attributes(x)
diff --git a/R/data_xtabulate.R b/R/data_xtabulate.R
new file mode 100644
index 000000000..c8e622e2d
--- /dev/null
+++ b/R/data_xtabulate.R
@@ -0,0 +1,347 @@
+# helper to compute crosstables --------------
+
+.crosstable <- function(x,
+                        by,
+                        weights = NULL,
+                        include_na = TRUE,
+                        proportions = NULL,
+                        obj_name = NULL,
+                        group_variable = NULL) {
+  if (!is.null(proportions)) {
+    proportions <- match.arg(proportions, c("row", "column", "full"))
+  }
+  # frequency table
+  if (is.null(weights)) {
+    if (include_na) {
+      x_table <- tryCatch(table(addNA(x), addNA(by)), error = function(e) NULL)
+    } else {
+      x_table <- tryCatch(table(x, by), error = function(e) NULL)
+    }
+  } else if (include_na) {
+    # weighted frequency table, including NA
+    x_table <- tryCatch(
+      stats::xtabs(
+        weights ~ x + by,
+        data = data.frame(weights = weights, x = addNA(x), by = addNA(by)),
+        na.action = stats::na.pass,
+        addNA = TRUE
+      ),
+      error = function(e) NULL
+    )
+  } else {
+    # weighted frequency table, excluding NA
+    x_table <- tryCatch(
+      stats::xtabs(
+        weights ~ x + by,
+        data = data.frame(weights = weights, x = x, by = by),
+        na.action = stats::na.omit,
+        addNA = FALSE
+      ),
+      error = function(e) NULL
+    )
+  }
+
+  if (is.null(x_table)) {
+    insight::format_warning(paste0("Can't compute cross tables for objects of class `", class(x)[1], "`."))
+    return(NULL)
+  }
+
+  out <- as.data.frame(stats::ftable(x_table))
+  colnames(out) <- c("Value", "by", "N")
+  total_n <- sum(out$N, na.rm = TRUE)
+
+  # we want to round N for weighted frequencies
+  if (!is.null(weights)) {
+    out$N <- round(out$N)
+    total_n <- round(total_n)
+  }
+
+  out <- data_to_wide(out, values_from = "N", names_from = "by")
+
+  # use variable name as column name
+  if (!is.null(obj_name)) {
+    colnames(out)[1] <- obj_name
+  }
+
+  # for grouped data frames, add info about grouping variables
+  if (!is.null(group_variable)) {
+    var_info <- toString(lapply(colnames(group_variable), function(i) {
+      sprintf("%s (%s)", i, group_variable[[i]])
+    }))
+    out <- cbind(out[1], data.frame(Group = var_info, stringsAsFactors = FALSE), out[-1])
+  }
+
+  attr(out, "total_n") <- total_n
+  attr(out, "weights") <- weights
+  attr(out, "proportions") <- proportions
+
+  class(out) <- c("dw_data_xtabulate", "data.frame")
+
+  out
+}
+
+
+# methods ---------------------
+
+
+#' @export
+format.dw_data_xtabulate <- function(x, format = "text", digits = 1, big_mark = NULL, ...) {
+  # convert to character manually, else, for large numbers,
+  # format_table() returns scientific notation
+  x <- as.data.frame(x)
+
+  # remove group variable
+  x$Group <- NULL
+
+  # compute total N for rows and colummns
+  total_n <- attributes(x)$total_n
+  total_column <- rowSums(x[, -1], na.rm = TRUE)
+  total_row <- c(colSums(x[, -1], na.rm = TRUE), total_n)
+
+  # proportions?
+  props <- attributes(x)$proportions
+
+  if (!is.null(props)) {
+    # we copy x to tmp, because when we create strings with "sprintf()", the
+    # variable is coerced to character, and in subsequent iterations of the loop,
+    # mathemathical operations are not possible anymore
+    tmp <- x
+    if (identical(props, "row")) {
+      for (i in seq_len(nrow(x))) {
+        tmp[i, -1] <- paste(
+          format(x[i, -1]),
+          format(sprintf("(%.*f%%)", digits, 100 * x[i, -1] / sum(x[i, -1], na.rm = TRUE)), justify = "right")
+        )
+      }
+    } else if (identical(props, "column")) {
+      for (i in seq_len(ncol(x))[-1]) {
+        tmp[, i] <- paste(
+          format(x[, i]),
+          format(sprintf("(%.*f%%)", digits, 100 * x[, i] / sum(x[, i], na.rm = TRUE)), justify = "right")
+        )
+      }
+    } else if (identical(props, "full")) {
+      for (i in seq_len(ncol(x))[-1]) {
+        tmp[, i] <- paste(
+          format(x[, i]),
+          format(sprintf("(%.*f%%)", digits, 100 * x[, i] / total_n), justify = "right")
+        )
+      }
+    }
+    # copy back final result
+    x <- tmp
+  }
+
+  x[] <- lapply(x, as.character)
+
+  # format data frame
+  ftab <- insight::format_table(x, ...)
+  # replace empty cells with NA
+  ftab[] <- lapply(ftab, function(i) {
+    i[i == ""] <- ifelse(identical(format, "text"), "<NA>", "(NA)") # nolint
+    i
+  })
+  # Remove ".00" from numbers
+  ftab$Total <- gsub("\\.00$", "", as.character(total_column))
+  # for text format, insert "empty row" before last total row
+  if (identical(format, "text") || identical(format, "markdown")) {
+    empty_row <- as.data.frame(t(data.frame(
+      rep("", ncol(ftab)),
+      c("Total", as.character(total_row)),
+      stringsAsFactors = FALSE
+    )))
+  } else {
+    empty_row <- as.data.frame(t(data.frame(
+      c("Total", as.character(total_row)),
+      stringsAsFactors = FALSE
+    )))
+  }
+  colnames(empty_row) <- colnames(ftab)
+  ftab <- rbind(ftab, empty_row)
+  ftab[nrow(ftab), ] <- gsub("\\.00$", "", ftab[nrow(ftab), ])
+
+  # insert big marks?
+  ftab$Total <- .add_commas_in_numbers(ftab$Total, big_mark)
+  ftab[nrow(ftab), ] <- .add_commas_in_numbers(ftab[nrow(ftab), ], big_mark)
+
+  # also format NA column name
+  colnames(ftab)[colnames(ftab) == "NA"] <- ifelse(identical(format, "text"), "<NA>", "(NA)")
+
+  ftab
+}
+
+
+#' @export
+print.dw_data_xtabulate <- function(x, big_mark = NULL, ...) {
+  # grouped data? if yes, add information on grouping factor
+  if (is.null(x[["Group"]])) {
+    caption <- NULL
+  } else {
+    caption <- paste0("Grouped by ", x[["Group"]][1])
+    x$Group <- NULL
+  }
+
+  # print table
+  cat(insight::export_table(
+    format(x, big_mark = big_mark, ...),
+    cross = "+",
+    missing = "<NA>",
+    caption = caption,
+    empty_line = "-"
+  ))
+  invisible(x)
+}
+
+
+#' @export
+print_md.dw_data_xtabulate <- function(x, big_mark = NULL, ...) {
+  # grouped data? if yes, add information on grouping factor
+  if (is.null(x[["Group"]])) {
+    caption <- NULL
+  } else {
+    caption <- paste0("Grouped by ", x[["Group"]][1])
+    x$Group <- NULL
+  }
+
+  # print table
+  insight::export_table(
+    format(x, format = "markdown", big_mark = big_mark, ...),
+    cross = "+",
+    missing = "<NA>",
+    caption = caption,
+    empty_line = "-",
+    format = "markdown"
+  )
+}
+
+
+#' @export
+print_html.dw_data_xtabulate <- function(x, big_mark = NULL, ...) {
+  # grouped data? if yes, add information on grouping factor
+  if (!is.null(x[["Group"]])) {
+    x$groups <- paste0("Grouped by ", x[["Group"]][1])
+    x$Group <- NULL
+  }
+
+  # print table
+  insight::export_table(
+    format(x, big_mark = big_mark, format = "html", ...),
+    missing = "(NA)",
+    format = "html",
+    group_by = "groups"
+  )
+}
+
+
+#' @export
+print.dw_data_xtabulates <- function(x, big_mark = NULL, ...) {
+  for (i in seq_along(x)) {
+    print(x[[i]], big_mark = big_mark, ...)
+    cat("\n")
+  }
+  invisible(x)
+}
+
+
+#' @export
+print_html.dw_data_xtabulates <- function(x, big_mark = NULL, ...) {
+  if (length(x) == 1) {
+    print_html(x[[1]], big_mark = big_mark, ...)
+  } else {
+    x <- lapply(x, function(i) {
+      # grouped data? if yes, add information on grouping factor
+      if (!is.null(i[["Group"]])) {
+        i$groups <- paste0("Grouped by ", i[["Group"]][1])
+        i$Group <- NULL
+      }
+      format(i, format = "html", big_mark = big_mark, ...)
+    })
+
+    out <- do.call(rbind, x)
+
+    # print table
+    insight::export_table(
+      out,
+      missing = "(NA)",
+      format = "html",
+      group_by = "groups"
+    )
+  }
+}
+
+
+# helper ---------------------
+
+.validate_by <- function(by, x) {
+  if (!is.null(by)) {
+    if (is.character(by)) {
+      # If "by" is a character string, must be of length 1
+      if (length(by) > 1) {
+        insight::format_error(
+          "If `by` is a string indicating a variable name, `by` must be of length 1.",
+          "You may use `data_group()` to group by multiple variables, then call `data_tabulate()`."
+        )
+      }
+      # if "by" is a character, "x" must be a data frame
+      if (!is.data.frame(x)) {
+        insight::format_error("If `by` is a string indicating a variable name, `x` must be a data frame.")
+      }
+      # is "by" a column in "x"?
+      if (!by %in% colnames(x)) {
+        insight::format_error(sprintf(
+          "The variable specified in `by` was not found in `x`. %s",
+          .misspelled_string(names(x), by, "Possibly misspelled?")
+        ))
+      }
+      by <- x[[by]]
+    }
+    # is "by" of same length as "x"?
+    if (is.data.frame(x) && length(by) != nrow(x)) {
+      insight::format_error("Length of `by` must be equal to number of rows in `x`.") # nolint
+    }
+    if (!is.data.frame(x) && length(by) != length(x)) {
+      insight::format_error("Length of `by` must be equal to length of `x`.") # nolint
+    }
+    if (!is.factor(by)) {
+      # coerce "by" to factor, including labels
+      by <- to_factor(by, labels_to_levels = TRUE, verbose = FALSE)
+    }
+  }
+
+  by
+}
+
+
+.validate_table_weights <- function(weights, x) {
+  if (!is.null(weights)) {
+    if (is.character(weights)) {
+      # If "weights" is a character string, must be of length 1
+      if (length(weights) > 1) {
+        insight::format_error(
+          "If `weights` is a string indicating a variable name, `weights` must be of length 1."
+        )
+      }
+      # if "weights" is a character, "x" must be a data frame
+      if (!is.data.frame(x)) {
+        insight::format_error("If `weights` is a string indicating a variable name, `x` must be a data frame.") # nolint
+      }
+      # is "by" a column in "x"?
+      if (!weights %in% colnames(x)) {
+        insight::format_error(sprintf(
+          "The variable specified in `weights` was not found in `x`. %s",
+          .misspelled_string(names(x), weights, "Possibly misspelled?")
+        ))
+      }
+      weights <- x[[weights]]
+    }
+    # is "by" of same length as "x"?
+    if (is.data.frame(x) && length(weights) != nrow(x)) {
+      insight::format_error("Length of `weights` must be equal to number of rows in `x`.") # nolint
+    }
+    if (!is.data.frame(x) && length(weights) != length(x)) {
+      insight::format_error("Length of `weights` must be equal to length of `x`.") # nolint
+    }
+  }
+
+  weights
+}
diff --git a/inst/WORDLIST b/inst/WORDLIST
index d48430478..5b12a2523 100644
--- a/inst/WORDLIST
+++ b/inst/WORDLIST
@@ -50,6 +50,8 @@ bmwiernik
 codebook
 codebooks
 coercible
+crosstable
+crosstables
 csv
 de
 doi
diff --git a/man/data_tabulate.Rd b/man/data_tabulate.Rd
index 5332f386f..0a155ee22 100644
--- a/man/data_tabulate.Rd
+++ b/man/data_tabulate.Rd
@@ -4,14 +4,17 @@
 \alias{data_tabulate}
 \alias{data_tabulate.default}
 \alias{data_tabulate.data.frame}
-\title{Create frequency tables of variables}
+\title{Create frequency and crosstables of variables}
 \usage{
 data_tabulate(x, ...)
 
 \method{data_tabulate}{default}(
   x,
+  by = NULL,
   drop_levels = FALSE,
   weights = NULL,
+  include_na = TRUE,
+  proportions = NULL,
   name = NULL,
   verbose = TRUE,
   ...
@@ -23,9 +26,12 @@ data_tabulate(x, ...)
   exclude = NULL,
   ignore_case = FALSE,
   regex = FALSE,
-  collapse = FALSE,
+  by = NULL,
   drop_levels = FALSE,
   weights = NULL,
+  include_na = TRUE,
+  proportions = NULL,
+  collapse = FALSE,
   verbose = TRUE,
   ...
 )
@@ -35,13 +41,25 @@ data_tabulate(x, ...)
 
 \item{...}{not used.}
 
-\item{drop_levels}{Logical, if \code{TRUE}, factor levels that do not occur in
+\item{by}{Optional vector or factor. If supplied, a crosstable is created.
+If \code{x} is a data frame, \code{by} can also be a character string indicating the
+name of a variable in \code{x}.}
+
+\item{drop_levels}{Logical, if \code{FALSE}, factor levels that do not occur in
 the data are included in the table (with frequency of zero), else unused
 factor levels are dropped from the frequency table.}
 
 \item{weights}{Optional numeric vector of weights. Must be of the same length
 as \code{x}. If \code{weights} is supplied, weighted frequencies are calculated.}
 
+\item{include_na}{Logical, if \code{TRUE}, missing values are included in the
+frequency or crosstable, else missing values are omitted.}
+
+\item{proportions}{Optional character string, indicating the type of
+percentages to be calculated. Only applies to crosstables, i.e. when \code{by} is
+not \code{NULL}. Can be \code{"row"} (row percentages), \code{"column"} (column percentages)
+or \code{"full"} (to calculate relative frequencies for the full table).}
+
 \item{name}{Optional character string, which includes the name that is used
 for printing.}
 
@@ -104,17 +122,23 @@ A data frame, or a list of data frames, with one frequency table
 as data frame per variable.
 }
 \description{
-This function creates frequency tables of variables, including
-the number of levels/values as well as the distribution of raw, valid and
-cumulative percentages.
+This function creates frequency or crosstables of variables,
+including the number of levels/values as well as the distribution of raw,
+valid and cumulative percentages. For crosstables, row, column  and cell
+percentages can be calculated.
 }
 \examples{
 \dontshow{if (requireNamespace("poorman")) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
+# frequency tables -------
+# ------------------------
 data(efc)
 
 # vector/factor
 data_tabulate(efc$c172code)
 
+# drop missing values
+data_tabulate(efc$c172code, include_na = FALSE)
+
 # data frame
 data_tabulate(efc, c("e42dep", "c172code"))
 
@@ -141,5 +165,30 @@ print(data_tabulate(x), big_mark = "")
 set.seed(123)
 efc$weights <- abs(rnorm(n = nrow(efc), mean = 1, sd = 0.5))
 data_tabulate(efc$e42dep, weights = efc$weights)
+
+# crosstables ------
+# ------------------
+
+# add some missing values
+set.seed(123)
+efc$e16sex[sample.int(nrow(efc), 5)] <- NA
+
+data_tabulate(efc, "c172code", by = "e16sex")
+
+# add row and column percentages
+data_tabulate(efc, "c172code", by = "e16sex", proportions = "row")
+data_tabulate(efc, "c172code", by = "e16sex", proportions = "column")
+
+# omit missing values
+data_tabulate(
+  efc$c172code,
+  by = efc$e16sex,
+  proportions = "column",
+  include_na = FALSE
+)
+
+# round percentages
+out <- data_tabulate(efc, "c172code", by = "e16sex", proportions = "column")
+print(out, digits = 0)
 \dontshow{\}) # examplesIf}
 }
diff --git a/tests/testthat/_snaps/data_tabulate.md b/tests/testthat/_snaps/data_tabulate.md
index 7f482ad86..e4b3e8628 100644
--- a/tests/testthat/_snaps/data_tabulate.md
+++ b/tests/testthat/_snaps/data_tabulate.md
@@ -50,8 +50,9 @@
                |     4 | 67 | 63.81 |   67.00 |       100.00
                |  <NA> |  5 |  4.76 |    <NA> |         <NA>
       ---------+-------+----+-------+---------+-------------
-      e16sex   |     1 | 50 | 47.62 |  100.00 |       100.00
-               |     2 | 55 | 52.38 |    <NA> |         <NA>
+      e16sex   |     1 | 50 | 47.62 |   47.62 |        47.62
+               |     2 | 55 | 52.38 |   52.38 |       100.00
+               |  <NA> |  0 |  0.00 |    <NA> |         <NA>
       ------------------------------------------------------
 
 ---
@@ -69,9 +70,10 @@
        [8] "|         |     4| 67| 63.81|   67.00|       100.00|"
        [9] "|         |  (NA)|  5|  4.76|    (NA)|         (NA)|"
       [10] "|         |      |   |      |        |             |"
-      [11] "|e16sex   |     1| 50| 47.62|  100.00|       100.00|"
-      [12] "|         |     2| 55| 52.38|    (NA)|         (NA)|"
-      [13] "|         |      |   |      |        |             |"
+      [11] "|e16sex   |     1| 50| 47.62|   47.62|        47.62|"
+      [12] "|         |     2| 55| 52.38|   52.38|       100.00|"
+      [13] "|         |  (NA)|  0|  0.00|    (NA)|         (NA)|"
+      [14] "|         |      |   |      |        |             |"
       attr(,"format")
       [1] "pipe"
       attr(,"class")
@@ -239,3 +241,310 @@
                |            |  <NA> |  2 |  3.70 |    <NA> |         <NA>
       -------------------------------------------------------------------
 
+# data_tabulate, cross tables
+
+    Code
+      print(data_tabulate(efc$c172code, by = efc$e16sex, proportions = "full"))
+    Output
+      efc$c172code |       male |     female |     <NA> | Total
+      -------------+------------+------------+----------+------
+      1            |  5  (5.0%) |  2  (2.0%) | 1 (1.0%) |     8
+      2            | 31 (31.0%) | 33 (33.0%) | 2 (2.0%) |    66
+      3            |  4  (4.0%) | 11 (11.0%) | 1 (1.0%) |    16
+      <NA>         |  5  (5.0%) |  4  (4.0%) | 1 (1.0%) |    10
+      -------------+------------+------------+----------+------
+      Total        |         45 |         50 |        5 |   100
+
+---
+
+    Code
+      print(data_tabulate(efc$c172code, by = efc$e16sex, proportions = "full",
+      include_na = FALSE))
+    Output
+      efc$c172code |       male |     female | Total
+      -------------+------------+------------+------
+      1            |  5  (5.8%) |  2  (2.3%) |     7
+      2            | 31 (36.0%) | 33 (38.4%) |    64
+      3            |  4  (4.7%) | 11 (12.8%) |    15
+      -------------+------------+------------+------
+      Total        |         40 |         46 |    86
+
+---
+
+    Code
+      print(data_tabulate(efc$c172code, by = efc$e16sex, proportions = "full",
+      weights = efc$weights))
+    Output
+      efc$c172code |       male |     female |     <NA> | Total
+      -------------+------------+------------+----------+------
+      1            |  5  (4.8%) |  3  (2.9%) | 2 (1.9%) |    10
+      2            | 32 (30.5%) | 32 (30.5%) | 3 (2.9%) |    67
+      3            |  3  (2.9%) | 11 (10.5%) | 1 (1.0%) |    15
+      <NA>         |  8  (7.6%) |  5  (4.8%) | 1 (1.0%) |    14
+      -------------+------------+------------+----------+------
+      Total        |         48 |         51 |        7 |   105
+
+---
+
+    Code
+      print(data_tabulate(efc$c172code, by = efc$e16sex, proportions = "full",
+      include_na = FALSE, weights = efc$weights))
+    Output
+      efc$c172code |       male |     female | Total
+      -------------+------------+------------+------
+      1            |  5  (5.8%) |  3  (3.5%) |     8
+      2            | 32 (37.2%) | 32 (37.2%) |    64
+      3            |  3  (3.5%) | 11 (12.8%) |    14
+      -------------+------------+------------+------
+      Total        |         40 |         46 |    86
+
+---
+
+    Code
+      print(data_tabulate(efc, "c172code", by = efc$e16sex, proportions = "row"))
+    Output
+      c172code |       male |     female |      <NA> | Total
+      ---------+------------+------------+-----------+------
+      1        |  5 (62.5%) |  2 (25.0%) | 1 (12.5%) |     8
+      2        | 31 (47.0%) | 33 (50.0%) | 2  (3.0%) |    66
+      3        |  4 (25.0%) | 11 (68.8%) | 1  (6.2%) |    16
+      <NA>     |  5 (50.0%) |  4 (40.0%) | 1 (10.0%) |    10
+      ---------+------------+------------+-----------+------
+      Total    |         45 |         50 |         5 |   100
+      
+
+---
+
+    Code
+      print(data_tabulate(efc, "c172code", by = efc$e16sex, proportions = "row",
+      include_na = FALSE))
+    Output
+      c172code |       male |     female | Total
+      ---------+------------+------------+------
+      1        |  5 (71.4%) |  2 (28.6%) |     7
+      2        | 31 (48.4%) | 33 (51.6%) |    64
+      3        |  4 (26.7%) | 11 (73.3%) |    15
+      ---------+------------+------------+------
+      Total    |         40 |         46 |    86
+      
+
+---
+
+    Code
+      print(data_tabulate(efc, "c172code", by = efc$e16sex, proportions = "row",
+      weights = efc$weights))
+    Output
+      c172code |       male |     female |      <NA> | Total
+      ---------+------------+------------+-----------+------
+      1        |  5 (50.0%) |  3 (30.0%) | 2 (20.0%) |    10
+      2        | 32 (47.8%) | 32 (47.8%) | 3  (4.5%) |    67
+      3        |  3 (20.0%) | 11 (73.3%) | 1  (6.7%) |    15
+      <NA>     |  8 (57.1%) |  5 (35.7%) | 1  (7.1%) |    14
+      ---------+------------+------------+-----------+------
+      Total    |         48 |         51 |         7 |   105
+      
+
+---
+
+    Code
+      print(data_tabulate(efc, "c172code", by = efc$e16sex, proportions = "row",
+      include_na = FALSE, weights = efc$weights))
+    Output
+      c172code |       male |     female | Total
+      ---------+------------+------------+------
+      1        |  5 (62.5%) |  3 (37.5%) |     8
+      2        | 32 (50.0%) | 32 (50.0%) |    64
+      3        |  3 (21.4%) | 11 (78.6%) |    14
+      ---------+------------+------------+------
+      Total    |         40 |         46 |    86
+      
+
+---
+
+    Code
+      print(data_tabulate(efc, "c172code", by = "e16sex", proportions = "column"))
+    Output
+      c172code |       male |     female |      <NA> | Total
+      ---------+------------+------------+-----------+------
+      1        |  5 (11.1%) |  2  (4.0%) | 1 (20.0%) |     8
+      2        | 31 (68.9%) | 33 (66.0%) | 2 (40.0%) |    66
+      3        |  4  (8.9%) | 11 (22.0%) | 1 (20.0%) |    16
+      <NA>     |  5 (11.1%) |  4  (8.0%) | 1 (20.0%) |    10
+      ---------+------------+------------+-----------+------
+      Total    |         45 |         50 |         5 |   100
+      
+
+---
+
+    Code
+      print(data_tabulate(efc, "c172code", by = "e16sex", proportions = "column",
+        include_na = FALSE))
+    Output
+      c172code |       male |     female | Total
+      ---------+------------+------------+------
+      1        |  5 (12.5%) |  2  (4.3%) |     7
+      2        | 31 (77.5%) | 33 (71.7%) |    64
+      3        |  4 (10.0%) | 11 (23.9%) |    15
+      ---------+------------+------------+------
+      Total    |         40 |         46 |    86
+      
+
+---
+
+    Code
+      print(data_tabulate(efc, "c172code", by = "e16sex", proportions = "column",
+        weights = "weights"))
+    Output
+      c172code |       male |     female |      <NA> | Total
+      ---------+------------+------------+-----------+------
+      1        |  5 (10.4%) |  3  (5.9%) | 2 (28.6%) |    10
+      2        | 32 (66.7%) | 32 (62.7%) | 3 (42.9%) |    67
+      3        |  3  (6.2%) | 11 (21.6%) | 1 (14.3%) |    15
+      <NA>     |  8 (16.7%) |  5  (9.8%) | 1 (14.3%) |    14
+      ---------+------------+------------+-----------+------
+      Total    |         48 |         51 |         7 |   105
+      
+
+---
+
+    Code
+      print(data_tabulate(efc, "c172code", by = "e16sex", proportions = "column",
+        include_na = FALSE, weights = "weights"))
+    Output
+      c172code |       male |     female | Total
+      ---------+------------+------------+------
+      1        |  5 (12.5%) |  3  (6.5%) |     8
+      2        | 32 (80.0%) | 32 (69.6%) |    64
+      3        |  3  (7.5%) | 11 (23.9%) |    14
+      ---------+------------+------------+------
+      Total    |         40 |         46 |    86
+      
+
+# data_tabulate, cross tables, grouped df
+
+    Code
+      print(data_tabulate(grp, "c172code", by = "e16sex", proportions = "row"))
+    Output
+      Grouped by e42dep (1)
+      
+      c172code |       male |       <NA> | Total
+      ---------+------------+------------+------
+      2        | 2 (100.0%) | 0   (0.0%) |     2
+      <NA>     |   0 (NaN%) |   0 (NaN%) |     0
+      ---------+------------+------------+------
+      Total    |          2 |          0 |     2
+      
+      Grouped by e42dep (2)
+      
+      c172code |      male |    female |      <NA> | Total
+      ---------+-----------+-----------+-----------+------
+      2        | 2 (50.0%) | 2 (50.0%) | 0  (0.0%) |     4
+      <NA>     |  0 (NaN%) |  0 (NaN%) |  0 (NaN%) |     0
+      ---------+-----------+-----------+-----------+------
+      Total    |         2 |         2 |         0 |     4
+      
+      Grouped by e42dep (3)
+      
+      c172code |      male |     female |      <NA> | Total
+      ---------+-----------+------------+-----------+------
+      1        | 2 (50.0%) |  2 (50.0%) | 0  (0.0%) |     4
+      2        | 4 (25.0%) | 11 (68.8%) | 1  (6.2%) |    16
+      3        | 1 (16.7%) |  5 (83.3%) | 0  (0.0%) |     6
+      <NA>     | 1 (50.0%) |  0  (0.0%) | 1 (50.0%) |     2
+      ---------+-----------+------------+-----------+------
+      Total    |         8 |         18 |         2 |    28
+      
+      Grouped by e42dep (4)
+      
+      c172code |       male |     female |      <NA> | Total
+      ---------+------------+------------+-----------+------
+      1        |  3 (75.0%) |  0  (0.0%) | 1 (25.0%) |     4
+      2        | 23 (54.8%) | 18 (42.9%) | 1  (2.4%) |    42
+      3        |  3 (30.0%) |  6 (60.0%) | 1 (10.0%) |    10
+      <NA>     |  3 (42.9%) |  4 (57.1%) | 0  (0.0%) |     7
+      ---------+------------+------------+-----------+------
+      Total    |         32 |         28 |         3 |    63
+      
+      Grouped by e42dep (NA)
+      
+      c172code |       male |     female |       <NA> | Total
+      ---------+------------+------------+------------+------
+      2        | 0   (0.0%) | 2 (100.0%) | 0   (0.0%) |     2
+      <NA>     | 1 (100.0%) | 0   (0.0%) | 0   (0.0%) |     1
+      ---------+------------+------------+------------+------
+      Total    |          1 |          2 |          0 |     3
+      
+
+# data_tabulate, cross tables, markdown
+
+    Code
+      print_md(data_tabulate(efc$c172code, by = efc$e16sex, proportions = "full"))
+    Output
+      [1] "|efc$c172code |       male|     female|    (NA) | Total|"
+      [2] "|:------------|----------:|----------:|:--------|-----:|"
+      [3] "|1            |  5  (5.0%)|  2  (2.0%)|1 (1.0%) |     8|"
+      [4] "|2            | 31 (31.0%)| 33 (33.0%)|2 (2.0%) |    66|"
+      [5] "|3            |  4  (4.0%)| 11 (11.0%)|1 (1.0%) |    16|"
+      [6] "|<NA>         |  5  (5.0%)|  4  (4.0%)|1 (1.0%) |    10|"
+      [7] "|             |           |           |         |      |"
+      [8] "|Total        |         45|         50|       5 |   100|"
+      attr(,"format")
+      [1] "pipe"
+      attr(,"class")
+      [1] "knitr_kable" "character"  
+
+---
+
+    Code
+      print_md(data_tabulate(efc$c172code, by = efc$e16sex, proportions = "full",
+      include_na = FALSE))
+    Output
+      [1] "|efc$c172code |       male|     female| Total|"
+      [2] "|:------------|----------:|----------:|-----:|"
+      [3] "|1            |  5  (5.8%)|  2  (2.3%)|     7|"
+      [4] "|2            | 31 (36.0%)| 33 (38.4%)|    64|"
+      [5] "|3            |  4  (4.7%)| 11 (12.8%)|    15|"
+      [6] "|             |           |           |      |"
+      [7] "|Total        |         40|         46|    86|"
+      attr(,"format")
+      [1] "pipe"
+      attr(,"class")
+      [1] "knitr_kable" "character"  
+
+---
+
+    Code
+      print_md(data_tabulate(efc$c172code, by = efc$e16sex, proportions = "full",
+      weights = efc$weights))
+    Output
+      [1] "|efc$c172code |       male|     female|    (NA) | Total|"
+      [2] "|:------------|----------:|----------:|:--------|-----:|"
+      [3] "|1            |  5  (4.8%)|  3  (2.9%)|2 (1.9%) |    10|"
+      [4] "|2            | 32 (30.5%)| 32 (30.5%)|3 (2.9%) |    67|"
+      [5] "|3            |  3  (2.9%)| 11 (10.5%)|1 (1.0%) |    15|"
+      [6] "|<NA>         |  8  (7.6%)|  5  (4.8%)|1 (1.0%) |    14|"
+      [7] "|             |           |           |         |      |"
+      [8] "|Total        |         48|         51|       7 |   105|"
+      attr(,"format")
+      [1] "pipe"
+      attr(,"class")
+      [1] "knitr_kable" "character"  
+
+---
+
+    Code
+      print_md(data_tabulate(efc$c172code, by = efc$e16sex, proportions = "full",
+      include_na = FALSE, weights = efc$weights))
+    Output
+      [1] "|efc$c172code |       male|     female| Total|"
+      [2] "|:------------|----------:|----------:|-----:|"
+      [3] "|1            |  5  (5.8%)|  3  (3.5%)|     8|"
+      [4] "|2            | 32 (37.2%)| 32 (37.2%)|    64|"
+      [5] "|3            |  3  (3.5%)| 11 (12.8%)|    14|"
+      [6] "|             |           |           |      |"
+      [7] "|Total        |         40|         46|    86|"
+      attr(,"format")
+      [1] "pipe"
+      attr(,"class")
+      [1] "knitr_kable" "character"  
+
diff --git a/tests/testthat/test-data_tabulate.R b/tests/testthat/test-data_tabulate.R
index 4390c1af5..1ca47a967 100644
--- a/tests/testthat/test-data_tabulate.R
+++ b/tests/testthat/test-data_tabulate.R
@@ -1,6 +1,5 @@
-data(efc)
-
 test_that("data_tabulate factor", {
+  data(efc, package = "datawizard")
   x <- data_tabulate(efc$e42dep)
   expect_identical(as.vector(x$Value), as.vector(sort(unique(
     addNA(efc$e42dep)
@@ -17,6 +16,7 @@ test_that("data_tabulate factor", {
 
 
 test_that("data_tabulate numeric", {
+  data(efc, package = "datawizard")
   x <- data_tabulate(efc$neg_c_7)
   expect_identical(as.vector(x$Value), as.vector(sort(unique(
     addNA(efc$neg_c_7)
@@ -32,6 +32,14 @@ test_that("data_tabulate numeric", {
 })
 
 
+test_that("data_tabulate, HTML", {
+  skip_if_not_installed("gt")
+  data(efc, package = "datawizard")
+  expect_s3_class(print_html(data_tabulate(efc$c172code)), "gt_tbl")
+  expect_s3_class(print_html(data_tabulate(efc, "c172code")), "gt_tbl")
+})
+
+
 test_that("data_tabulate, weights", {
   data(efc, package = "datawizard")
   set.seed(123)
@@ -62,6 +70,7 @@ test_that("data_tabulate, weights", {
 
 
 test_that("data_tabulate data.frame", {
+  data(efc, package = "datawizard")
   x <- data_tabulate(efc, c("e16sex", "c172code"))
   expect_s3_class(x, "list")
   expect_length(x, 2L)
@@ -116,6 +125,12 @@ test_that("data_tabulate data.frame", {
 })
 
 
+test_that("data_tabulate unsupported class", {
+  data(mtcars)
+  expect_warning(data_tabulate(lm(mpg ~ hp, data = mtcars)), regex = "Can't compute frequency tables")
+})
+
+
 test_that("data_tabulate print", {
   set.seed(123)
   x <- sample.int(3, 1e6, TRUE)
@@ -138,11 +153,13 @@ test_that("data_tabulate print", {
 
 
 test_that("data_tabulate print", {
+  data(efc, package = "datawizard")
   expect_snapshot(data_tabulate(efc$e42dep))
 })
 
 
 test_that("data_tabulate print multiple", {
+  data(efc, package = "datawizard")
   expect_snapshot(data_tabulate(efc, c("c172code", "e16sex")))
 })
 
@@ -157,6 +174,7 @@ test_that("data_tabulate big numbers", {
 
 test_that("data_tabulate print multiple, collapse", {
   skip_if_not(packageVersion("insight") > "0.17.0", "insight must be >= 0.17.0")
+  data(efc, package = "datawizard")
   expect_snapshot(data_tabulate(efc, c("c172code", "e16sex"), collapse = TRUE))
 })
 
@@ -164,6 +182,7 @@ test_that("data_tabulate print multiple, collapse", {
 
 test_that("data_tabulate grouped data.frame", {
   skip_if_not_installed("poorman")
+  data(efc, package = "datawizard")
   x <- data_tabulate(poorman::group_by(efc, e16sex), "c172code")
   expect_s3_class(x, "list")
   expect_length(x, 2L)
@@ -213,11 +232,13 @@ test_that("data_tabulate grouped data.frame", {
 
 test_that("data_tabulate print grouped data", {
   skip_if_not_installed("poorman")
+  data(efc, package = "datawizard")
   expect_snapshot(data_tabulate(poorman::group_by(efc, e16sex), "c172code"))
 })
 
 test_that("data_tabulate print, collapse groups", {
   skip_if_not_installed("poorman")
+  data(efc, package = "datawizard")
   expect_snapshot(
     data_tabulate(poorman::group_by(efc, e16sex), "c172code", collapse = TRUE)
   )
@@ -225,6 +246,7 @@ test_that("data_tabulate print, collapse groups", {
 
 test_that("data_tabulate print, collapse groups, drop levels", {
   skip_if_not_installed("poorman")
+  data(efc, package = "datawizard")
   expect_snapshot(
     data_tabulate(
       poorman::group_by(efc, e16sex),
@@ -235,11 +257,143 @@ test_that("data_tabulate print, collapse groups, drop levels", {
   )
 })
 
+test_that("data_tabulate drop levels", {
+  x <- factor(rep(letters[1:3], 3), levels = letters[1:5])
+  out1 <- data_tabulate(x, drop_levels = FALSE)
+  out2 <- data_tabulate(x, drop_levels = TRUE)
+  expect_identical(out1$N, c(3L, 3L, 3L, 0L, 0L, 0L))
+  expect_identical(as.character(out1$Value), c("a", "b", "c", "d", "e", NA))
+  expect_identical(out2$N, c(3L, 3L, 3L, 0L))
+  expect_identical(as.character(out2$Value), c("a", "b", "c", NA))
+})
+
 
 # select helpers ------------------------------
 test_that("data_tabulate regex", {
+  data(mtcars)
   expect_identical(
     data_tabulate(mtcars, select = "arb", regex = TRUE),
     data_tabulate(mtcars, select = "carb")
   )
 })
+
+
+# missing values ------------------------------
+
+test_that("data_tabulate exclude/include missing values", {
+  data(efc, package = "datawizard")
+  set.seed(123)
+  efc$weights <- abs(rnorm(n = nrow(efc), mean = 1, sd = 0.5))
+  efc$e16sex[sample.int(nrow(efc), 5)] <- NA
+  out <- data_tabulate(efc$c172code)
+  expect_identical(out$N, c(8L, 66L, 16L, 10L))
+  out <- data_tabulate(efc$c172code, include_na = FALSE)
+  expect_identical(out$N, c(8L, 66L, 16L))
+  out <- data_tabulate(efc$c172code, weights = efc$weights)
+  expect_identical(out$N, c(10, 67, 15, 13))
+  out <- data_tabulate(efc$c172code, include_na = FALSE, weights = efc$weights)
+  expect_identical(out$N, c(10, 67, 15))
+})
+
+
+# cross tables ------------------------------
+test_that("data_tabulate, cross tables", {
+  data(efc, package = "datawizard")
+  set.seed(123)
+  efc$weights <- abs(rnorm(n = nrow(efc), mean = 1, sd = 0.5))
+  efc$e16sex[sample.int(nrow(efc), 5)] <- NA
+
+  expect_snapshot(print(data_tabulate(efc$c172code, by = efc$e16sex, proportions = "full")))
+  expect_snapshot(print(data_tabulate(efc$c172code, by = efc$e16sex, proportions = "full", include_na = FALSE)))
+  expect_snapshot(print(data_tabulate(efc$c172code, by = efc$e16sex, proportions = "full", weights = efc$weights)))
+  expect_snapshot(print(data_tabulate(efc$c172code, by = efc$e16sex, proportions = "full", include_na = FALSE, weights = efc$weights))) # nolint
+  expect_snapshot(print(data_tabulate(efc, "c172code", by = efc$e16sex, proportions = "row")))
+  expect_snapshot(print(data_tabulate(efc, "c172code", by = efc$e16sex, proportions = "row", include_na = FALSE)))
+  expect_snapshot(print(data_tabulate(efc, "c172code", by = efc$e16sex, proportions = "row", weights = efc$weights)))
+  expect_snapshot(print(data_tabulate(efc, "c172code", by = efc$e16sex, proportions = "row", include_na = FALSE, weights = efc$weights))) # nolint
+  expect_snapshot(print(data_tabulate(efc, "c172code", by = "e16sex", proportions = "column")))
+  expect_snapshot(print(data_tabulate(efc, "c172code", by = "e16sex", proportions = "column", include_na = FALSE)))
+  expect_snapshot(print(data_tabulate(efc, "c172code", by = "e16sex", proportions = "column", weights = "weights")))
+  expect_snapshot(print(data_tabulate(efc, "c172code", by = "e16sex", proportions = "column", include_na = FALSE, weights = "weights"))) # nolint
+})
+
+test_that("data_tabulate, cross tables, HTML", {
+  skip_if_not_installed("gt")
+  data(efc, package = "datawizard")
+  set.seed(123)
+  efc$weights <- abs(rnorm(n = nrow(efc), mean = 1, sd = 0.5))
+  efc$e16sex[sample.int(nrow(efc), 5)] <- NA
+
+  expect_s3_class(print_html(data_tabulate(efc$c172code, by = efc$e16sex, proportions = "full")), "gt_tbl")
+  expect_s3_class(print_html(data_tabulate(efc$c172code, by = efc$e16sex, proportions = "full", include_na = FALSE)), "gt_tbl") # nolint
+  expect_s3_class(print_html(data_tabulate(efc$c172code, by = efc$e16sex, proportions = "full", weights = efc$weights)), "gt_tbl") # nolint
+  expect_s3_class(print_html(data_tabulate(efc$c172code, by = efc$e16sex, proportions = "full", include_na = FALSE, weights = efc$weights)), "gt_tbl") # nolint
+  expect_s3_class(print_html(data_tabulate(efc, "c172code", by = efc$e16sex, proportions = "row")), "gt_tbl")
+  expect_s3_class(print_html(data_tabulate(efc, "c172code", by = efc$e16sex, proportions = "row", include_na = FALSE, weights = efc$weights)), "gt_tbl") # nolint
+})
+
+test_that("data_tabulate, cross tables, grouped df", {
+  data(efc, package = "datawizard")
+  set.seed(123)
+  efc$weights <- abs(rnorm(n = nrow(efc), mean = 1, sd = 0.5))
+  efc$e16sex[sample.int(nrow(efc), 5)] <- NA
+  grp <- data_group(efc, "e42dep")
+  expect_snapshot(print(data_tabulate(grp, "c172code", by = "e16sex", proportions = "row")))
+})
+
+test_that("data_tabulate, cross tables, errors by", {
+  data(efc, package = "datawizard")
+  set.seed(123)
+  efc$weights <- abs(rnorm(n = nrow(efc), mean = 1, sd = 0.5))
+  efc$e16sex[sample.int(nrow(efc), 5)] <- NA
+  expect_error(data_tabulate(efc$c172code, by = "e16sex"), regex = "If `by` is a string")
+  expect_error(data_tabulate(efc$c172code, by = efc$e16sex[-1]), regex = "Length of `by`")
+  expect_error(data_tabulate(efc, "c172code", by = efc$e16sex[-1]), regex = "Length of `by`")
+  expect_error(data_tabulate(efc, "c172code", by = "c16sex"), regex = "not found")
+  expect_error(data_tabulate(efc, "c172code", by = c("e16sex", "e42dep")), regex = "You may use")
+})
+
+test_that("data_tabulate, cross tables, errors weights", {
+  data(efc, package = "datawizard")
+  set.seed(123)
+  efc$weights <- abs(rnorm(n = nrow(efc), mean = 1, sd = 0.5))
+  efc$e16sex[sample.int(nrow(efc), 5)] <- NA
+  expect_error(data_tabulate(efc$c172code, weights = "weights"), regex = "If `weights`")
+  expect_error(data_tabulate(efc$c172code, weights = efc$weights[-1]), regex = "Length of `weights`")
+  expect_error(data_tabulate(efc, "c172code", weights = efc$weights[-1]), regex = "Length of `weights`")
+  expect_error(data_tabulate(efc, "c172code", weights = "weigths"), regex = "not found")
+  expect_error(data_tabulate(efc, "c172code", weights = c("e16sex", "e42dep")), regex = "length 1")
+})
+
+
+# markdown -------------------------
+
+test_that("data_tabulate, cross tables, markdown", {
+  data(efc, package = "datawizard")
+  set.seed(123)
+  efc$weights <- abs(rnorm(n = nrow(efc), mean = 1, sd = 0.5))
+  efc$e16sex[sample.int(nrow(efc), 5)] <- NA
+
+  expect_snapshot(print_md(data_tabulate(efc$c172code, by = efc$e16sex, proportions = "full")))
+  expect_snapshot(print_md(data_tabulate(efc$c172code, by = efc$e16sex, proportions = "full", include_na = FALSE)))
+  expect_snapshot(print_md(data_tabulate(efc$c172code, by = efc$e16sex, proportions = "full", weights = efc$weights)))
+  expect_snapshot(print_md(data_tabulate(efc$c172code, by = efc$e16sex, proportions = "full", include_na = FALSE, weights = efc$weights))) # nolint
+})
+
+# validate against table -------------------------
+
+test_that("data_tabulate, validate against table", {
+  data(mtcars)
+  # frequency table
+  out1 <- as.data.frame(table(mtcars$cyl))
+  out2 <- data_tabulate(mtcars$cyl, include_na = FALSE)
+  expect_identical(out1$Freq, out2$N)
+  # crosstable
+  out1 <- data_arrange(as.data.frame(table(mtcars$cyl, mtcars$gear)), c("Var1", "Var2"))
+  out2 <- data_rename(data_to_long(
+    as.data.frame(data_tabulate(mtcars$cyl, by = mtcars$gear, include_na = FALSE)), 2:4,
+    names_to = "Var2", values_to = "Freq"
+  ), "mtcars$cyl", "Var1")
+  out1[[2]] <- as.character(out1[[2]])
+  expect_equal(out1, out2, ignore_attr = TRUE)
+})