From c5fdc834f68d2b3ee8668712df05b1f191a77195 Mon Sep 17 00:00:00 2001
From: Daniel <mail@danielluedecke.de>
Date: Tue, 13 Feb 2024 14:05:35 +0100
Subject: [PATCH] fixes update tests

---
 R/data_tabulate.R                      |   9 +-
 R/data_xtabulate.R                     |  43 ++++-
 tests/testthat/_snaps/data_tabulate.md | 234 +++++++++++++++++++++++++
 tests/testthat/test-data_tabulate.R    |  71 ++++++++
 4 files changed, 349 insertions(+), 8 deletions(-)

diff --git a/R/data_tabulate.R b/R/data_tabulate.R
index 2f083e698..82e479489 100644
--- a/R/data_tabulate.R
+++ b/R/data_tabulate.R
@@ -91,10 +91,8 @@ data_tabulate.default <- function(x,
     x <- droplevels(x)
   }
 
-  # check for correct length of weights - must be equal to "x"
-  if (!is.null(weights) && length(weights) != length(x)) {
-    insight::format_error("Length of `weights` must be equal to length of `x`.")
-  }
+  # validate "weights"
+  weights <- .validate_tableweights(weights, x)
 
   # we go into another function for crosstables here...
   if (!is.null(by)) {
@@ -225,8 +223,11 @@ data_tabulate.data.frame <- function(x,
     regex = regex,
     verbose = verbose
   )
+
   # validate "by"
   by <- .validate_by(by, x)
+  # validate "weights"
+  weights <- .validate_tableweights(weights, x)
 
   out <- lapply(select, function(i) {
     data_tabulate(
diff --git a/R/data_xtabulate.R b/R/data_xtabulate.R
index 5dd7b239e..7599672b2 100644
--- a/R/data_xtabulate.R
+++ b/R/data_xtabulate.R
@@ -202,13 +202,13 @@ print.dw_data_xtabulates <- function(x, big_mark = NULL, ...) {
       # If "by" is a character string, must be of length 1
       if (length(by) > 1) {
         insight::format_error(
-          "If argument `by` is a character indicating a variable name, `by` must be of length 1.",
+          "If `by` is a string indicating a variable name, `by` must be of length 1.",
           "You may use `data_group()` to group by multiple variables, then call `data_tabulate()`."
         )
       }
       # if "by" is a character, "x" must be a data frame
       if (!is.data.frame(x)) {
-        insight::format_error("If argument `by` is a character indicating a variable name, `x` must be a data frame.")
+        insight::format_error("If `by` is a string indicating a variable name, `x` must be a data frame.")
       }
       # is "by" a column in "x"?
       if (!by %in% colnames(x)) {
@@ -221,10 +221,10 @@ print.dw_data_xtabulates <- function(x, big_mark = NULL, ...) {
     }
     # is "by" of same length as "x"?
     if (is.data.frame(x) && length(by) != nrow(x)) {
-      insight::format_error("The variable specified in `by` must have the same length as rows in `x`.") # nolint
+      insight::format_error("Length of `by` must be equal to number of rows in `x`.") # nolint
     }
     if (!is.data.frame(x) && length(by) != length(x)) {
-      insight::format_error("The variable specified in `by` must have the same length as `x`.") # nolint
+      insight::format_error("Length of `by` must be equal to length of `x`.") # nolint
     }
     if (!is.factor(by)) {
       # coerce "by" to factor, including labels
@@ -234,3 +234,38 @@ print.dw_data_xtabulates <- function(x, big_mark = NULL, ...) {
 
   by
 }
+
+
+.validate_tableweights <- function(weights, x) {
+  if (!is.null(weights)) {
+    if (is.character(weights)) {
+      # If "weights" is a character string, must be of length 1
+      if (length(weights) > 1) {
+        insight::format_error(
+          "If `weights` is a string indicating a variable name, `weights` must be of length 1."
+        )
+      }
+      # if "weights" is a character, "x" must be a data frame
+      if (!is.data.frame(x)) {
+        insight::format_error("If `weights` is a string indicating a variable name, `x` must be a data frame.") # nolint
+      }
+      # is "by" a column in "x"?
+      if (!weights %in% colnames(x)) {
+        insight::format_error(sprintf(
+          "The variable specified in `weights` was not found in `x`. %s",
+          .misspelled_string(names(x), weights, "Possibly misspelled?")
+        ))
+      }
+      weights <- x[[weights]]
+    }
+    # is "by" of same length as "x"?
+    if (is.data.frame(x) && length(weights) != nrow(x)) {
+      insight::format_error("Length of `weights` must be equal to number of rows in `x`.") # nolint
+    }
+    if (!is.data.frame(x) && length(weights) != length(x)) {
+      insight::format_error("Length of `weights` must be equal to length of `x`.") # nolint
+    }
+  }
+
+  weights
+}
diff --git a/tests/testthat/_snaps/data_tabulate.md b/tests/testthat/_snaps/data_tabulate.md
index 6fcf1d93b..2564735ce 100644
--- a/tests/testthat/_snaps/data_tabulate.md
+++ b/tests/testthat/_snaps/data_tabulate.md
@@ -241,3 +241,237 @@
                |            |  <NA> |  2 |  3.70 |    <NA> |         <NA>
       -------------------------------------------------------------------
 
+# data_tabulate, cross tables
+
+    Code
+      print(data_tabulate(efc$c172code, by = efc$e16sex, proportions = "cell"))
+    Output
+      efc$c172code |       male |     female |       NA | Total
+      -------------+------------+------------+----------+------
+      1            |   5 (5.0%) |   2 (2.0%) | 1 (1.0%) |     8
+      2            | 31 (31.0%) | 33 (33.0%) | 2 (2.0%) |    66
+      3            |   4 (4.0%) | 11 (11.0%) | 1 (1.0%) |    16
+      <NA>         |   5 (5.0%) |   4 (4.0%) | 1 (1.0%) |    10
+      -------------+------------+------------+----------+------
+      Total        |         45 |         50 |        5 |   100
+
+---
+
+    Code
+      print(data_tabulate(efc$c172code, by = efc$e16sex, proportions = "cell",
+      include_na = FALSE))
+    Output
+      efc$c172code |       male |     female | Total
+      -------------+------------+------------+------
+      1            |   5 (5.8%) |   2 (2.3%) |     7
+      2            | 31 (36.0%) | 33 (38.4%) |    64
+      3            |   4 (4.7%) | 11 (12.8%) |    15
+      -------------+------------+------------+------
+      Total        |         40 |         46 |    86
+
+---
+
+    Code
+      print(data_tabulate(efc$c172code, by = efc$e16sex, proportions = "cell",
+      weights = efc$weights))
+    Output
+      efc$c172code |       male |     female |       NA | Total
+      -------------+------------+------------+----------+------
+      1            |   5 (4.8%) |   3 (2.9%) | 2 (1.9%) |    10
+      2            | 32 (30.5%) | 32 (30.5%) | 3 (2.9%) |    67
+      3            |   3 (2.9%) | 11 (10.5%) | 1 (1.0%) |    15
+      <NA>         |   8 (7.6%) |   5 (4.8%) | 1 (1.0%) |    14
+      -------------+------------+------------+----------+------
+      Total        |         48 |         51 |        7 |   105
+
+---
+
+    Code
+      print(data_tabulate(efc$c172code, by = efc$e16sex, proportions = "cell",
+      include_na = FALSE, weights = efc$weights))
+    Output
+      efc$c172code |       male |     female | Total
+      -------------+------------+------------+------
+      1            |   5 (5.8%) |   3 (3.5%) |     8
+      2            | 32 (37.2%) | 32 (37.2%) |    64
+      3            |   3 (3.5%) | 11 (12.8%) |    14
+      -------------+------------+------------+------
+      Total        |         40 |         46 |    86
+
+---
+
+    Code
+      print(data_tabulate(efc, "c172code", by = efc$e16sex, proportions = "row"))
+    Output
+      c172code |       male |     female |        NA | Total
+      ---------+------------+------------+-----------+------
+      1        |  5 (62.5%) |  2 (25.0%) | 1 (12.5%) |     8
+      2        | 31 (47.0%) | 33 (50.0%) |  2 (3.0%) |    66
+      3        |  4 (25.0%) | 11 (68.8%) |  1 (6.2%) |    16
+      <NA>     |  5 (50.0%) |  4 (40.0%) | 1 (10.0%) |    10
+      ---------+------------+------------+-----------+------
+      Total    |         45 |         50 |         5 |   100
+      
+
+---
+
+    Code
+      print(data_tabulate(efc, "c172code", by = efc$e16sex, proportions = "row",
+      include_na = FALSE))
+    Output
+      c172code |       male |     female | Total
+      ---------+------------+------------+------
+      1        |  5 (71.4%) |  2 (28.6%) |     7
+      2        | 31 (48.4%) | 33 (51.6%) |    64
+      3        |  4 (26.7%) | 11 (73.3%) |    15
+      ---------+------------+------------+------
+      Total    |         40 |         46 |    86
+      
+
+---
+
+    Code
+      print(data_tabulate(efc, "c172code", by = efc$e16sex, proportions = "row",
+      weights = efc$weights))
+    Output
+      c172code |       male |     female |        NA | Total
+      ---------+------------+------------+-----------+------
+      1        |  5 (50.0%) |  3 (30.0%) | 2 (20.0%) |    10
+      2        | 32 (47.8%) | 32 (47.8%) |  3 (4.5%) |    67
+      3        |  3 (20.0%) | 11 (73.3%) |  1 (6.7%) |    15
+      <NA>     |  8 (57.1%) |  5 (35.7%) |  1 (7.1%) |    14
+      ---------+------------+------------+-----------+------
+      Total    |         48 |         51 |         7 |   105
+      
+
+---
+
+    Code
+      print(data_tabulate(efc, "c172code", by = efc$e16sex, proportions = "row",
+      include_na = FALSE, weights = efc$weights))
+    Output
+      c172code |       male |     female | Total
+      ---------+------------+------------+------
+      1        |  5 (62.5%) |  3 (37.5%) |     8
+      2        | 32 (50.0%) | 32 (50.0%) |    64
+      3        |  3 (21.4%) | 11 (78.6%) |    14
+      ---------+------------+------------+------
+      Total    |         40 |         46 |    86
+      
+
+---
+
+    Code
+      print(data_tabulate(efc, "c172code", by = "e16sex", proportions = "column"))
+    Output
+      c172code |       male |     female |        NA | Total
+      ---------+------------+------------+-----------+------
+      1        |  5 (11.1%) |   2 (4.0%) | 1 (20.0%) |     8
+      2        | 31 (68.9%) | 33 (66.0%) | 2 (40.0%) |    66
+      3        |   4 (8.9%) | 11 (22.0%) | 1 (20.0%) |    16
+      <NA>     |  5 (11.1%) |   4 (8.0%) | 1 (20.0%) |    10
+      ---------+------------+------------+-----------+------
+      Total    |         45 |         50 |         5 |   100
+      
+
+---
+
+    Code
+      print(data_tabulate(efc, "c172code", by = "e16sex", proportions = "column",
+        include_na = FALSE))
+    Output
+      c172code |       male |     female | Total
+      ---------+------------+------------+------
+      1        |  5 (12.5%) |   2 (4.3%) |     7
+      2        | 31 (77.5%) | 33 (71.7%) |    64
+      3        |  4 (10.0%) | 11 (23.9%) |    15
+      ---------+------------+------------+------
+      Total    |         40 |         46 |    86
+      
+
+---
+
+    Code
+      print(data_tabulate(efc, "c172code", by = "e16sex", proportions = "column",
+        weights = "weights"))
+    Output
+      c172code |       male |     female |        NA | Total
+      ---------+------------+------------+-----------+------
+      1        |  5 (10.4%) |   3 (5.9%) | 2 (28.6%) |    10
+      2        | 32 (66.7%) | 32 (62.7%) | 3 (42.9%) |    67
+      3        |   3 (6.2%) | 11 (21.6%) | 1 (14.3%) |    15
+      <NA>     |  8 (16.7%) |   5 (9.8%) | 1 (14.3%) |    14
+      ---------+------------+------------+-----------+------
+      Total    |         48 |         51 |         7 |   105
+      
+
+---
+
+    Code
+      print(data_tabulate(efc, "c172code", by = "e16sex", proportions = "column",
+        include_na = FALSE, weights = "weights"))
+    Output
+      c172code |       male |     female | Total
+      ---------+------------+------------+------
+      1        |  5 (12.5%) |   3 (6.5%) |     8
+      2        | 32 (80.0%) | 32 (69.6%) |    64
+      3        |   3 (7.5%) | 11 (23.9%) |    14
+      ---------+------------+------------+------
+      Total    |         40 |         46 |    86
+      
+
+# data_tabulate, cross tables, grouped df
+
+    Code
+      print(data_tabulate(grp, "c172code", by = "e16sex", proportions = "row"))
+    Output
+      Grouped by e42dep (1)
+      
+      c172code |       male |       NA | Total
+      ---------+------------+----------+------
+      2        | 2 (100.0%) | 0 (0.0%) |     2
+      <NA>     |   0 (NaN%) | 0 (NaN%) |     0
+      ---------+------------+----------+------
+      Total    |          2 |        0 |     2
+      
+      Grouped by e42dep (2)
+      
+      c172code |      male |    female |       NA | Total
+      ---------+-----------+-----------+----------+------
+      2        | 2 (50.0%) | 2 (50.0%) | 0 (0.0%) |     4
+      <NA>     |  0 (NaN%) |  0 (NaN%) | 0 (NaN%) |     0
+      ---------+-----------+-----------+----------+------
+      Total    |         2 |         2 |        0 |     4
+      
+      Grouped by e42dep (3)
+      
+      c172code |      male |     female |        NA | Total
+      ---------+-----------+------------+-----------+------
+      1        | 2 (50.0%) |  2 (50.0%) |  0 (0.0%) |     4
+      2        | 4 (25.0%) | 11 (68.8%) |  1 (6.2%) |    16
+      3        | 1 (16.7%) |  5 (83.3%) |  0 (0.0%) |     6
+      <NA>     | 1 (50.0%) |   0 (0.0%) | 1 (50.0%) |     2
+      ---------+-----------+------------+-----------+------
+      Total    |         8 |         18 |         2 |    28
+      
+      Grouped by e42dep (4)
+      
+      c172code |       male |     female |        NA | Total
+      ---------+------------+------------+-----------+------
+      1        |  3 (75.0%) |   0 (0.0%) | 1 (25.0%) |     4
+      2        | 23 (54.8%) | 18 (42.9%) |  1 (2.4%) |    42
+      3        |  3 (30.0%) |  6 (60.0%) | 1 (10.0%) |    10
+      <NA>     |  3 (42.9%) |  4 (57.1%) |  0 (0.0%) |     7
+      ---------+------------+------------+-----------+------
+      Total    |         32 |         28 |         3 |    63
+      
+      Grouped by e42dep (NA)
+      
+      c172code |       male |     female |       NA | Total
+      ---------+------------+------------+----------+------
+      2        |   0 (0.0%) | 2 (100.0%) | 0 (0.0%) |     2
+      <NA>     | 1 (100.0%) |   0 (0.0%) | 0 (0.0%) |     1
+      ---------+------------+------------+----------+------
+      Total    |          1 |          2 |        0 |     3
+      
+
diff --git a/tests/testthat/test-data_tabulate.R b/tests/testthat/test-data_tabulate.R
index cd6e6df3f..7ed76e7c9 100644
--- a/tests/testthat/test-data_tabulate.R
+++ b/tests/testthat/test-data_tabulate.R
@@ -252,3 +252,74 @@ test_that("data_tabulate regex", {
     data_tabulate(mtcars, select = "carb")
   )
 })
+
+
+test_that("data_tabulate exclude/include missing values", {
+  data(efc, package = "datawizard")
+  set.seed(123)
+  efc$weights <- abs(rnorm(n = nrow(efc), mean = 1, sd = 0.5))
+  efc$e16sex[sample.int(nrow(efc), 5)] <- NA
+  out <- data_tabulate(efc$c172code)
+  expect_identical(out$N, c(8L, 66L, 16L, 10L))
+  out <- data_tabulate(efc$c172code, include_na = FALSE)
+  expect_identical(out$N, c(8L, 66L, 16L))
+  out <- data_tabulate(efc$c172code, weights = efc$weights)
+  expect_identical(out$N, c(10, 67, 15, 13))
+  out <- data_tabulate(efc$c172code, include_na = FALSE, weights = efc$weights)
+  expect_identical(out$N, c(10, 67, 15))
+})
+
+
+# cross tables ------------------------------
+test_that("data_tabulate, cross tables", {
+  data(efc, package = "datawizard")
+  set.seed(123)
+  efc$weights <- abs(rnorm(n = nrow(efc), mean = 1, sd = 0.5))
+  efc$e16sex[sample.int(nrow(efc), 5)] <- NA
+
+  expect_snapshot(print(data_tabulate(efc$c172code, by = efc$e16sex, proportions = "cell")))
+  expect_snapshot(print(data_tabulate(efc$c172code, by = efc$e16sex, proportions = "cell", include_na = FALSE)))
+  expect_snapshot(print(data_tabulate(efc$c172code, by = efc$e16sex, proportions = "cell", weights = efc$weights)))
+  expect_snapshot(print(data_tabulate(efc$c172code, by = efc$e16sex, proportions = "cell", include_na = FALSE, weights = efc$weights))) # nolint
+  expect_snapshot(print(data_tabulate(efc, "c172code", by = efc$e16sex, proportions = "row")))
+  expect_snapshot(print(data_tabulate(efc, "c172code", by = efc$e16sex, proportions = "row", include_na = FALSE)))
+  expect_snapshot(print(data_tabulate(efc, "c172code", by = efc$e16sex, proportions = "row", weights = efc$weights)))
+  expect_snapshot(print(data_tabulate(efc, "c172code", by = efc$e16sex, proportions = "row", include_na = FALSE, weights = efc$weights))) # nolint
+  expect_snapshot(print(data_tabulate(efc, "c172code", by = "e16sex", proportions = "column")))
+  expect_snapshot(print(data_tabulate(efc, "c172code", by = "e16sex", proportions = "column", include_na = FALSE)))
+  expect_snapshot(print(data_tabulate(efc, "c172code", by = "e16sex", proportions = "column", weights = "weights")))
+  expect_snapshot(print(data_tabulate(efc, "c172code", by = "e16sex", proportions = "column", include_na = FALSE, weights = "weights"))) # nolint
+})
+
+test_that("data_tabulate, cross tables, grouped df", {
+  data(efc, package = "datawizard")
+  set.seed(123)
+  efc$weights <- abs(rnorm(n = nrow(efc), mean = 1, sd = 0.5))
+  efc$e16sex[sample.int(nrow(efc), 5)] <- NA
+  grp <- data_group(efc, "e42dep")
+  expect_snapshot(print(data_tabulate(grp, "c172code", by = "e16sex", proportions = "row")))
+})
+
+test_that("data_tabulate, cross tables, errors", {
+  data(efc, package = "datawizard")
+  set.seed(123)
+  efc$weights <- abs(rnorm(n = nrow(efc), mean = 1, sd = 0.5))
+  efc$e16sex[sample.int(nrow(efc), 5)] <- NA
+  expect_error(data_tabulate(efc$c172code, by = "e16sex"), regex = "If `by` is a string")
+  expect_error(data_tabulate(efc$c172code, by = efc$e16sex[-1]), regex = "Length of `by`")
+  expect_error(data_tabulate(efc, "c172code", by = efc$e16sex[-1]), regex = "Length of `by`")
+  expect_error(data_tabulate(efc, "c172code", by = "c16sex"), regex = "not found")
+  expect_error(data_tabulate(efc, "c172code", by = c("e16sex", "e42dep")), regex = "You may use")
+})
+
+test_that("data_tabulate, cross tables, errors", {
+  data(efc, package = "datawizard")
+  set.seed(123)
+  efc$weights <- abs(rnorm(n = nrow(efc), mean = 1, sd = 0.5))
+  efc$e16sex[sample.int(nrow(efc), 5)] <- NA
+  expect_error(data_tabulate(efc$c172code, weights = "weights"), regex = "If `weights`")
+  expect_error(data_tabulate(efc$c172code, weights = efc$weights[-1]), regex = "Length of `weights`")
+  expect_error(data_tabulate(efc, "c172code", weights = efc$weights[-1]), regex = "Length of `weights`")
+  expect_error(data_tabulate(efc, "c172code", weights = "weigths"), regex = "not found")
+  expect_error(data_tabulate(efc, "c172code", weights = c("e16sex", "e42dep")), regex = "length 1")
+})