From ab9f0064c35050359247688f48583349704aa8f7 Mon Sep 17 00:00:00 2001
From: rempsyc <remi.theriault@mail.mcgill.ca>
Date: Mon, 11 Nov 2024 21:25:58 +0100
Subject: [PATCH] Suggestion of new function: `describe_missing()` Fixes #454

---
 NAMESPACE                                  |   1 +
 R/describe_missing.R                       | 118 ++++
 man/describe_missing.Rd                    |  86 +++
 tests/testthat/_snaps/data_codebook.new.md | 705 +++++++++++++++++++++
 tests/testthat/_snaps/describe_missing.md  |  38 ++
 tests/testthat/test-describe_missing.R     |  26 +
 6 files changed, 974 insertions(+)
 create mode 100644 R/describe_missing.R
 create mode 100644 man/describe_missing.Rd
 create mode 100644 tests/testthat/_snaps/data_codebook.new.md
 create mode 100644 tests/testthat/_snaps/describe_missing.md
 create mode 100644 tests/testthat/test-describe_missing.R

diff --git a/NAMESPACE b/NAMESPACE
index 7e97817b9..e463f7261 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -267,6 +267,7 @@ export(data_write)
 export(degroup)
 export(demean)
 export(describe_distribution)
+export(describe_missing)
 export(detrend)
 export(distribution_coef_var)
 export(distribution_mode)
diff --git a/R/describe_missing.R b/R/describe_missing.R
new file mode 100644
index 000000000..2f2f0da9c
--- /dev/null
+++ b/R/describe_missing.R
@@ -0,0 +1,118 @@
+#' @title Describe Missing Values in Data According to Guidelines
+#'
+#' @description Provides a detailed description of missing values in a data frame.
+#' This function reports both absolute and percentage missing values of specified
+#' column lists or scales, following recommended guidelines. Some authors recommend
+#' reporting item-level missingness per scale, as well as a participant's maximum
+#' number of missing items by scale. For example, Parent (2013) writes:
+#'
+#' *I recommend that authors (a) state their tolerance level for missing data by scale
+#' or subscale (e.g., "We calculated means for all subscales on which participants gave
+#' at least 75% complete data") and then (b) report the individual missingness rates
+#' by scale per data point (i.e., the number of missing values out of all data points
+#' on that scale for all participants) and the maximum by participant (e.g., "For Attachment
+#' Anxiety, a total of 4 missing data points out of 100 were observed, with no participant
+#' missing more than a single data point").*
+#'
+#' @param data The data frame to be analyzed.
+#' @param vars Variable (or lists of variables) to check for missing values (NAs).
+#' @param scales The scale names to check for missing values (as a character vector).
+#' @keywords missing values NA guidelines
+#' @return A data frame with the following columns:
+#'  - `var`: Variables selected.
+#'  - `items`: Number of items for selected variables.
+#'  - `na`: Number of missing cell values for those variables (e.g., 2 missing
+#'  values for the first participant + 2 missing values for the second participant
+#'  = total of 4 missing values).
+#'  - `cells`: Total number of cells (i.e., number of participants multiplied by
+#'  the number of variables, `items`).
+#'  - `na_percent`: The percentage of missing values (`na` divided by `cells`).
+#'  - `na_max`: The number of missing values for the participant with the most
+#'  missing values for the selected variables.
+#'  - `na_max_percent`: The amount of missing values for the participant with
+#'  the most missing values for the selected variables, as a percentage
+#'  (i.e., `na_max` divided by the number of selected variables, `items`).
+#'  - `all_na`: The number of participants missing 100% of items for that scale
+#'  (the selected variables).
+#'
+#' @export
+#' @references Parent, M. C. (2013). Handling item-level missing
+#' data: Simpler is just as good. *The Counseling Psychologist*,
+#' *41*(4), 568-600. https://doi.org/10.1177%2F0011000012445176
+#' @examples
+#' # Use the entire data frame
+#' describe_missing(airquality)
+#'
+#' # Use selected columns explicitly
+#' describe_missing(airquality,
+#'   vars = list(
+#'     c("Ozone", "Solar.R", "Wind"),
+#'     c("Temp", "Month", "Day")
+#'   )
+#' )
+#'
+#' # If the questionnaire items start with the same name, e.g.,
+#' set.seed(15)
+#' fun <- function() {
+#'   c(sample(c(NA, 1:10), replace = TRUE), NA, NA, NA)
+#' }
+#' df <- data.frame(
+#'   ID = c("idz", NA),
+#'   open_1 = fun(), open_2 = fun(), open_3 = fun(),
+#'   extrovert_1 = fun(), extrovert_2 = fun(), extrovert_3 = fun(),
+#'   agreeable_1 = fun(), agreeable_2 = fun(), agreeable_3 = fun()
+#' )
+#'
+#' # One can list the scale names directly:
+#' describe_missing(df, scales = c("ID", "open", "extrovert", "agreeable"))
+
+
+
+describe_missing <- function(data, vars = NULL, scales = NULL) {
+  # Overall summary across every column of `data`; this is the sole result
+  # when neither `vars` nor `scales` is supplied.
+  na_df <- .describe_missing(data)
+  # Test for NULL instead of missing() so that an explicit `vars = NULL` or
+  # `scales = NULL` behaves exactly like leaving the argument out.
+  if (is.null(vars) && is.null(scales)) {
+    return(na_df)
+  }
+  if (!is.null(vars)) {
+    # `vars` wins over `scales`; a bare vector is treated as a single group.
+    groups <- if (is.list(vars)) vars else list(vars)
+  } else {
+    # One group per scale: all columns whose names start with the scale name.
+    groups <- lapply(scales, function(x) {
+      # NOTE: the name is interpolated into a regex, so metacharacters apply.
+      grep(paste0("^", x), names(data), value = TRUE)
+    })
+  }
+  # One summary row per group, then the overall row relabelled "Total".
+  na_list <- lapply(groups, function(x) {
+    .describe_missing(data[, x, drop = FALSE])
+  })
+  na_df$var <- "Total"
+  do.call(rbind, c(na_list, list(na_df)))
+}
+
+.describe_missing <- function(data) {
+  # Internal helper: one-row missingness summary for all columns of `data`.
+  na_mask <- is.na(data) # logical matrix, computed once and reused below
+  na_by_row <- rowSums(na_mask)
+  items <- ncol(data)
+  cells <- nrow(data) * items
+  na <- sum(na_mask)
+  na_max <- max(na_by_row)
+  data.frame(
+    # Label is "<first column>:<last column>" of the selection.
+    var = paste0(names(data)[1], ":", names(data)[items]),
+    items = items,
+    na = na,
+    cells = cells,
+    na_percent = round(na / cells * 100, 2),
+    na_max = na_max,
+    na_max_percent = round(na_max / items * 100, 2),
+    # Rows whose NA count equals the number of columns are entirely missing.
+    all_na = sum(na_by_row == items)
+  )
+}
diff --git a/man/describe_missing.Rd b/man/describe_missing.Rd
new file mode 100644
index 000000000..c206a23ce
--- /dev/null
+++ b/man/describe_missing.Rd
@@ -0,0 +1,86 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/describe_missing.R
+\name{describe_missing}
+\alias{describe_missing}
+\title{Describe Missing Values in Data According to Guidelines}
+\usage{
+describe_missing(data, vars = NULL, scales = NULL)
+}
+\arguments{
+\item{data}{The data frame to be analyzed.}
+
+\item{vars}{Variable (or lists of variables) to check for missing values (NAs).}
+
+\item{scales}{The scale names to check for missing values (as a character vector).}
+}
+\value{
+A data frame with the following columns:
+\itemize{
+\item \code{var}: Variables selected.
+\item \code{items}: Number of items for selected variables.
+\item \code{na}: Number of missing cell values for those variables (e.g., 2 missing
+values for the first participant + 2 missing values for the second participant
+= total of 4 missing values).
+\item \code{cells}: Total number of cells (i.e., number of participants multiplied by
+the number of variables, \code{items}).
+\item \code{na_percent}: The percentage of missing values (\code{na} divided by \code{cells}).
+\item \code{na_max}: The number of missing values for the participant with the most
+missing values for the selected variables.
+\item \code{na_max_percent}: The amount of missing values for the participant with
+the most missing values for the selected variables, as a percentage
+(i.e., \code{na_max} divided by the number of selected variables, \code{items}).
+\item \code{all_na}: The number of participants missing 100\% of items for that scale
+(the selected variables).
+}
+}
+\description{
+Provides a detailed description of missing values in a data frame.
+This function reports both absolute and percentage missing values of specified
+column lists or scales, following recommended guidelines. Some authors recommend
+reporting item-level missingness per scale, as well as a participant's maximum
+number of missing items by scale. For example, Parent (2013) writes:
+
+\emph{I recommend that authors (a) state their tolerance level for missing data by scale
+or subscale (e.g., "We calculated means for all subscales on which participants gave
+at least 75\% complete data") and then (b) report the individual missingness rates
+by scale per data point (i.e., the number of missing values out of all data points
+on that scale for all participants) and the maximum by participant (e.g., "For Attachment
+Anxiety, a total of 4 missing data points out of 100 were observed, with no participant
+missing more than a single data point").}
+}
+\examples{
+# Use the entire data frame
+describe_missing(airquality)
+
+# Use selected columns explicitly
+describe_missing(airquality,
+  vars = list(
+    c("Ozone", "Solar.R", "Wind"),
+    c("Temp", "Month", "Day")
+  )
+)
+
+# If the questionnaire items start with the same name, e.g.,
+set.seed(15)
+fun <- function() {
+  c(sample(c(NA, 1:10), replace = TRUE), NA, NA, NA)
+}
+df <- data.frame(
+  ID = c("idz", NA),
+  open_1 = fun(), open_2 = fun(), open_3 = fun(),
+  extrovert_1 = fun(), extrovert_2 = fun(), extrovert_3 = fun(),
+  agreeable_1 = fun(), agreeable_2 = fun(), agreeable_3 = fun()
+)
+
+# One can list the scale names directly:
+describe_missing(df, scales = c("ID", "open", "extrovert", "agreeable"))
+}
+\references{
+Parent, M. C. (2013). Handling item-level missing
+data: Simpler is just as good. \emph{The Counseling Psychologist},
+\emph{41}(4), 568-600. https://doi.org/10.1177\%2F0011000012445176
+}
+\keyword{NA}
+\keyword{guidelines}
+\keyword{missing}
+\keyword{values}
diff --git a/tests/testthat/_snaps/data_codebook.new.md b/tests/testthat/_snaps/data_codebook.new.md
new file mode 100644
index 000000000..2ba496ef4
--- /dev/null
+++ b/tests/testthat/_snaps/data_codebook.new.md
@@ -0,0 +1,705 @@
+# data_codebook iris
+
+    Code
+      data_codebook(iris)
+    Output
+      iris (150 rows and 5 variables, 5 shown)
+      
+      ID | Name         | Type        | Missings |     Values |          N
+      ---+--------------+-------------+----------+------------+-----------
+      1  | Sepal.Length | numeric     | 0 (0.0%) | [4.3, 7.9] |        150
+      ---+--------------+-------------+----------+------------+-----------
+      2  | Sepal.Width  | numeric     | 0 (0.0%) |   [2, 4.4] |        150
+      ---+--------------+-------------+----------+------------+-----------
+      3  | Petal.Length | numeric     | 0 (0.0%) |   [1, 6.9] |        150
+      ---+--------------+-------------+----------+------------+-----------
+      4  | Petal.Width  | numeric     | 0 (0.0%) | [0.1, 2.5] |        150
+      ---+--------------+-------------+----------+------------+-----------
+      5  | Species      | categorical | 0 (0.0%) |     setosa | 50 (33.3%)
+         |              |             |          | versicolor | 50 (33.3%)
+         |              |             |          |  virginica | 50 (33.3%)
+      --------------------------------------------------------------------
+
+# data_codebook iris, reordered
+
+    Code
+      data_codebook(iris[c(1, 2, 5, 3, 4)])
+    Output
+      iris[c(1, 2, 5, 3, 4)] (150 rows and 5 variables, 5 shown)
+      
+      ID | Name         | Type        | Missings |     Values |          N
+      ---+--------------+-------------+----------+------------+-----------
+      1  | Sepal.Length | numeric     | 0 (0.0%) | [4.3, 7.9] |        150
+      ---+--------------+-------------+----------+------------+-----------
+      2  | Sepal.Width  | numeric     | 0 (0.0%) |   [2, 4.4] |        150
+      ---+--------------+-------------+----------+------------+-----------
+      3  | Species      | categorical | 0 (0.0%) |     setosa | 50 (33.3%)
+         |              |             |          | versicolor | 50 (33.3%)
+         |              |             |          |  virginica | 50 (33.3%)
+      ---+--------------+-------------+----------+------------+-----------
+      4  | Petal.Length | numeric     | 0 (0.0%) |   [1, 6.9] |        150
+      ---+--------------+-------------+----------+------------+-----------
+      5  | Petal.Width  | numeric     | 0 (0.0%) | [0.1, 2.5] |        150
+      --------------------------------------------------------------------
+
+# data_codebook NaN and Inf
+
+    Code
+      data_codebook(d)
+    Output
+      d (9 rows and 1 variables, 1 shown)
+      
+      ID | Name | Type    |  Missings | Values |         N
+      ---+------+---------+-----------+--------+----------
+      1  | x    | numeric | 2 (22.2%) |      1 | 3 (42.9%)
+         |      |         |           |      2 | 1 (14.3%)
+         |      |         |           |      4 | 2 (28.6%)
+         |      |         |           |    Inf | 1 (14.3%)
+      ----------------------------------------------------
+
+---
+
+    Code
+      data_codebook(d)
+    Output
+      d (102 rows and 1 variables, 1 shown)
+      
+      ID | Name | Type    | Missings |  Values |           N
+      ---+------+---------+----------+---------+------------
+      1  | x    | numeric | 0 (0.0%) | [1, 15] | 102 (98.1%)
+         |      |         |          |     Inf |   2 ( 1.9%)
+      ------------------------------------------------------
+
+---
+
+    Code
+      data_codebook(d, range_at = 100)
+    Output
+      d (102 rows and 1 variables, 1 shown)
+      
+      ID | Name | Type    | Missings | Values |          N
+      ---+------+---------+----------+--------+-----------
+      1  | x    | numeric | 0 (0.0%) |      1 |  4 ( 4.0%)
+         |      |         |          |      2 |  5 ( 5.0%)
+         |      |         |          |      3 |  6 ( 6.0%)
+         |      |         |          |      4 |  5 ( 5.0%)
+         |      |         |          |      5 |  8 ( 8.0%)
+         |      |         |          |      6 | 10 (10.0%)
+         |      |         |          |      7 |  6 ( 6.0%)
+         |      |         |          |      8 |  3 ( 3.0%)
+         |      |         |          |      9 | 13 (13.0%)
+         |      |         |          |     10 |  7 ( 7.0%)
+         |      |         |          |  (...) |           
+      ----------------------------------------------------
+
+---
+
+    Code
+      data_codebook(d, range_at = 100, max_values = 4)
+    Output
+      d (102 rows and 1 variables, 1 shown)
+      
+      ID | Name | Type    | Missings | Values |        N
+      ---+------+---------+----------+--------+---------
+      1  | x    | numeric | 0 (0.0%) |      1 | 4 (4.0%)
+         |      |         |          |      2 | 5 (5.0%)
+         |      |         |          |      3 | 6 (6.0%)
+         |      |         |          |      4 | 5 (5.0%)
+         |      |         |          |  (...) |         
+      --------------------------------------------------
+
+# data_codebook iris, select
+
+    Code
+      data_codebook(iris, select = starts_with("Sepal"))
+    Output
+      iris (150 rows and 5 variables, 2 shown)
+      
+      ID | Name         | Type    | Missings |     Values |   N
+      ---+--------------+---------+----------+------------+----
+      1  | Sepal.Length | numeric | 0 (0.0%) | [4.3, 7.9] | 150
+      ---+--------------+---------+----------+------------+----
+      2  | Sepal.Width  | numeric | 0 (0.0%) |   [2, 4.4] | 150
+      ---------------------------------------------------------
+
+# data_codebook iris, select, ID
+
+    Code
+      data_codebook(iris, select = starts_with("Petal"))
+    Output
+      iris (150 rows and 5 variables, 2 shown)
+      
+      ID | Name         | Type    | Missings |     Values |   N
+      ---+--------------+---------+----------+------------+----
+      3  | Petal.Length | numeric | 0 (0.0%) |   [1, 6.9] | 150
+      ---+--------------+---------+----------+------------+----
+      4  | Petal.Width  | numeric | 0 (0.0%) | [0.1, 2.5] | 150
+      ---------------------------------------------------------
+
+# data_codebook efc
+
+    Code
+      print(data_codebook(efc), table_width = Inf)
+    Output
+      efc (100 rows and 5 variables, 5 shown)
+      
+      ID | Name     | Label                                    | Type        |   Missings |   Values | Value Labels                    |          N
+      ---+----------+------------------------------------------+-------------+------------+----------+---------------------------------+-----------
+      1  | c12hour  | average number of hours of care per week | numeric     |   2 (2.0%) | [5, 168] |                                 |         98
+      ---+----------+------------------------------------------+-------------+------------+----------+---------------------------------+-----------
+      2  | e16sex   | elder's gender                           | numeric     |   0 (0.0%) |        1 | male                            | 46 (46.0%)
+         |          |                                          |             |            |        2 | female                          | 54 (54.0%)
+      ---+----------+------------------------------------------+-------------+------------+----------+---------------------------------+-----------
+      3  | e42dep   | elder's dependency                       | categorical |   3 (3.0%) |        1 | independent                     |  2 ( 2.1%)
+         |          |                                          |             |            |        2 | slightly dependent              |  4 ( 4.1%)
+         |          |                                          |             |            |        3 | moderately dependent            | 28 (28.9%)
+         |          |                                          |             |            |        4 | severely dependent              | 63 (64.9%)
+      ---+----------+------------------------------------------+-------------+------------+----------+---------------------------------+-----------
+      4  | c172code | carer's level of education               | numeric     | 10 (10.0%) |        1 | low level of education          |  8 ( 8.9%)
+         |          |                                          |             |            |        2 | intermediate level of education | 66 (73.3%)
+         |          |                                          |             |            |        3 | high level of education         | 16 (17.8%)
+      ---+----------+------------------------------------------+-------------+------------+----------+---------------------------------+-----------
+      5  | neg_c_7  | Negative impact with 7 items             | numeric     |   3 (3.0%) |  [7, 28] |                                 |         97
+      ---------------------------------------------------------------------------------------------------------------------------------------------
+
+---
+
+    Code
+      print(data_codebook(efc), table_width = "auto", remove_duplicates = FALSE)
+    Output
+      efc (100 rows and 5 variables, 5 shown)
+      
+      ID | Name     | Label                                    | Type       
+      ---+----------+------------------------------------------+------------
+      1  | c12hour  | average number of hours of care per week | numeric    
+      ---+----------+------------------------------------------+------------
+      2  | e16sex   | elder's gender                           | numeric    
+      ---+----------+------------------------------------------+------------
+      ---+----------+------------------------------------------+------------
+      3  | e42dep   | elder's dependency                       | categorical
+      ---+----------+------------------------------------------+------------
+      ---+----------+------------------------------------------+------------
+      ---+----------+------------------------------------------+------------
+      ---+----------+------------------------------------------+------------
+      4  | c172code | carer's level of education               | numeric    
+      ---+----------+------------------------------------------+------------
+      ---+----------+------------------------------------------+------------
+      ---+----------+------------------------------------------+------------
+      5  | neg_c_7  | Negative impact with 7 items             | numeric    
+      ----------------------------------------------------------------------
+      
+      ID |   Missings |   Values | Value Labels                    |          N
+      ---+------------+----------+---------------------------------+-----------
+      1  |   2 (2.0%) | [5, 168] |                                 |         98
+      ---+------------+----------+---------------------------------+-----------
+      2  |   0 (0.0%) |        1 | male                            | 46 (46.0%)
+         |            |        2 | female                          | 54 (54.0%)
+      ---+------------+----------+---------------------------------+-----------
+      3  |   3 (3.0%) |        1 | independent                     |  2 ( 2.1%)
+         |            |        2 | slightly dependent              |  4 ( 4.1%)
+         |            |        3 | moderately dependent            | 28 (28.9%)
+         |            |        4 | severely dependent              | 63 (64.9%)
+      ---+------------+----------+---------------------------------+-----------
+      4  | 10 (10.0%) |        1 | low level of education          |  8 ( 8.9%)
+         |            |        2 | intermediate level of education | 66 (73.3%)
+         |            |        3 | high level of education         | 16 (17.8%)
+      ---+------------+----------+---------------------------------+-----------
+      5  |   3 (3.0%) |  [7, 28] |                                 |         97
+      -------------------------------------------------------------------------
+
+---
+
+    Code
+      print(data_codebook(efc), table_width = "auto", remove_duplicates = TRUE)
+    Output
+      efc (100 rows and 5 variables, 5 shown)
+      
+      ID | Name     | Label                                    | Type       
+      ---+----------+------------------------------------------+------------
+      1  | c12hour  | average number of hours of care per week | numeric    
+      ---+----------+------------------------------------------+------------
+      2  | e16sex   | elder's gender                           | numeric    
+      ---+----------+------------------------------------------+------------
+      ---+----------+------------------------------------------+------------
+      3  | e42dep   | elder's dependency                       | categorical
+      ---+----------+------------------------------------------+------------
+      ---+----------+------------------------------------------+------------
+      ---+----------+------------------------------------------+------------
+      ---+----------+------------------------------------------+------------
+      4  | c172code | carer's level of education               | numeric    
+      ---+----------+------------------------------------------+------------
+      ---+----------+------------------------------------------+------------
+      ---+----------+------------------------------------------+------------
+      5  | neg_c_7  | Negative impact with 7 items             | numeric    
+      ----------------------------------------------------------------------
+      
+      ID |   Missings |   Values | Value Labels                    |          N
+      ---+------------+----------+---------------------------------+-----------
+      1  |   2 (2.0%) | [5, 168] |                                 |         98
+      ---+------------+----------+---------------------------------+-----------
+      2  |   0 (0.0%) |        1 | male                            | 46 (46.0%)
+         |            |        2 | female                          | 54 (54.0%)
+      ---+------------+----------+---------------------------------+-----------
+      3  |   3 (3.0%) |        1 | independent                     |  2 ( 2.1%)
+         |            |        2 | slightly dependent              |  4 ( 4.1%)
+         |            |        3 | moderately dependent            | 28 (28.9%)
+         |            |        4 | severely dependent              | 63 (64.9%)
+      ---+------------+----------+---------------------------------+-----------
+      4  | 10 (10.0%) |        1 | low level of education          |  8 ( 8.9%)
+         |            |        2 | intermediate level of education | 66 (73.3%)
+         |            |        3 | high level of education         | 16 (17.8%)
+      ---+------------+----------+---------------------------------+-----------
+      5  |   3 (3.0%) |  [7, 28] |                                 |         97
+      -------------------------------------------------------------------------
+
+# data_codebook efc, variable_label_width
+
+    Code
+      print(out, table_width = Inf)
+    Output
+      efc (100 rows and 5 variables, 5 shown)
+      
+      ID | Name     | Label                        | Type        |   Missings |   Values | Value Labels                    |          N
+      ---+----------+------------------------------+-------------+------------+----------+---------------------------------+-----------
+      1  | c12hour  | average number of hours of   | numeric     |   2 (2.0%) | [5, 168] |                                 |         98
+         |          | care per week                |             |            |          |                                 |           
+      ---+----------+------------------------------+-------------+------------+----------+---------------------------------+-----------
+      2  | e16sex   | elder's gender               | numeric     |   0 (0.0%) |        1 | male                            | 46 (46.0%)
+         |          |                              |             |            |        2 | female                          | 54 (54.0%)
+      ---+----------+------------------------------+-------------+------------+----------+---------------------------------+-----------
+      3  | e42dep   | elder's dependency           | categorical |   3 (3.0%) |        1 | independent                     |  2 ( 2.1%)
+         |          |                              |             |            |        2 | slightly dependent              |  4 ( 4.1%)
+         |          |                              |             |            |        3 | moderately dependent            | 28 (28.9%)
+         |          |                              |             |            |        4 | severely dependent              | 63 (64.9%)
+      ---+----------+------------------------------+-------------+------------+----------+---------------------------------+-----------
+      4  | c172code | carer's level of education   | numeric     | 10 (10.0%) |        1 | low level of education          |  8 ( 8.9%)
+         |          |                              |             |            |        2 | intermediate level of education | 66 (73.3%)
+         |          |                              |             |            |        3 | high level of education         | 16 (17.8%)
+      ---+----------+------------------------------+-------------+------------+----------+---------------------------------+-----------
+      5  | neg_c_7  | Negative impact with 7 items | numeric     |   3 (3.0%) |  [7, 28] |                                 |         97
+      ---------------------------------------------------------------------------------------------------------------------------------
+
+---
+
+    Code
+      print(out, table_width = "auto", remove_duplicates = FALSE)
+    Output
+      efc (100 rows and 5 variables, 5 shown)
+      
+      ID | Name     | Label                        | Type        |   Missings
+      ---+----------+------------------------------+-------------+-----------
+      1  | c12hour  | average number of hours of   | numeric     |   2 (2.0%)
+         |          | care per week                |             |           
+      ---+----------+------------------------------+-------------+-----------
+      2  | e16sex   | elder's gender               | numeric     |   0 (0.0%)
+      ---+----------+------------------------------+-------------+-----------
+      ---+----------+------------------------------+-------------+-----------
+      3  | e42dep   | elder's dependency           | categorical |   3 (3.0%)
+      ---+----------+------------------------------+-------------+-----------
+      ---+----------+------------------------------+-------------+-----------
+      ---+----------+------------------------------+-------------+-----------
+      ---+----------+------------------------------+-------------+-----------
+      4  | c172code | carer's level of education   | numeric     | 10 (10.0%)
+      ---+----------+------------------------------+-------------+-----------
+      ---+----------+------------------------------+-------------+-----------
+      ---+----------+------------------------------+-------------+-----------
+      5  | neg_c_7  | Negative impact with 7 items | numeric     |   3 (3.0%)
+      -----------------------------------------------------------------------
+      
+      ID |   Values | Value Labels                    |          N
+      ---+----------+---------------------------------+-----------
+      1  | [5, 168] |                                 |         98
+      ---+----------+---------------------------------+-----------
+      ---+----------+---------------------------------+-----------
+      2  |        1 | male                            | 46 (46.0%)
+         |        2 | female                          | 54 (54.0%)
+      ---+----------+---------------------------------+-----------
+      3  |        1 | independent                     |  2 ( 2.1%)
+         |        2 | slightly dependent              |  4 ( 4.1%)
+         |        3 | moderately dependent            | 28 (28.9%)
+         |        4 | severely dependent              | 63 (64.9%)
+      ---+----------+---------------------------------+-----------
+      4  |        1 | low level of education          |  8 ( 8.9%)
+         |        2 | intermediate level of education | 66 (73.3%)
+         |        3 | high level of education         | 16 (17.8%)
+      ---+----------+---------------------------------+-----------
+      5  |  [7, 28] |                                 |         97
+      ------------------------------------------------------------
+
+---
+
+    Code
+      print(out, table_width = "auto", remove_duplicates = TRUE)
+    Output
+      efc (100 rows and 5 variables, 5 shown)
+      
+      ID | Name     | Label                        | Type        |   Missings
+      ---+----------+------------------------------+-------------+-----------
+      1  | c12hour  | average number of hours of   | numeric     |   2 (2.0%)
+         |          | care per week                |             |           
+      ---+----------+------------------------------+-------------+-----------
+      2  | e16sex   | elder's gender               | numeric     |   0 (0.0%)
+      ---+----------+------------------------------+-------------+-----------
+      ---+----------+------------------------------+-------------+-----------
+      3  | e42dep   | elder's dependency           | categorical |   3 (3.0%)
+      ---+----------+------------------------------+-------------+-----------
+      ---+----------+------------------------------+-------------+-----------
+      ---+----------+------------------------------+-------------+-----------
+      ---+----------+------------------------------+-------------+-----------
+      4  | c172code | carer's level of education   | numeric     | 10 (10.0%)
+      ---+----------+------------------------------+-------------+-----------
+      ---+----------+------------------------------+-------------+-----------
+      ---+----------+------------------------------+-------------+-----------
+      5  | neg_c_7  | Negative impact with 7 items | numeric     |   3 (3.0%)
+      -----------------------------------------------------------------------
+      
+      ID |   Values | Value Labels                    |          N
+      ---+----------+---------------------------------+-----------
+      1  | [5, 168] |                                 |         98
+      ---+----------+---------------------------------+-----------
+      ---+----------+---------------------------------+-----------
+      2  |        1 | male                            | 46 (46.0%)
+         |        2 | female                          | 54 (54.0%)
+      ---+----------+---------------------------------+-----------
+      3  |        1 | independent                     |  2 ( 2.1%)
+         |        2 | slightly dependent              |  4 ( 4.1%)
+         |        3 | moderately dependent            | 28 (28.9%)
+         |        4 | severely dependent              | 63 (64.9%)
+      ---+----------+---------------------------------+-----------
+      4  |        1 | low level of education          |  8 ( 8.9%)
+         |        2 | intermediate level of education | 66 (73.3%)
+         |        3 | high level of education         | 16 (17.8%)
+      ---+----------+---------------------------------+-----------
+      5  |  [7, 28] |                                 |         97
+      ------------------------------------------------------------
+
+# data_codebook efc, value_label_width
+
+    Code
+      print(out, table_width = Inf)
+    Output
+      efc (100 rows and 5 variables, 5 shown)
+      
+      ID | Name     | Label                        | Type        |   Missings |   Values | Value Labels     |          N
+      ---+----------+------------------------------+-------------+------------+----------+------------------+-----------
+      1  | c12hour  | average number of hours of   | numeric     |   2 (2.0%) | [5, 168] |                  |         98
+         |          | care per week                |             |            |          |                  |           
+      ---+----------+------------------------------+-------------+------------+----------+------------------+-----------
+      2  | e16sex   | elder's gender               | numeric     |   0 (0.0%) |        1 | male             | 46 (46.0%)
+         |          |                              |             |            |        2 | female           | 54 (54.0%)
+      ---+----------+------------------------------+-------------+------------+----------+------------------+-----------
+      3  | e42dep   | elder's dependency           | categorical |   3 (3.0%) |        1 | independent      |  2 ( 2.1%)
+         |          |                              |             |            |        2 | slightly...      |  4 ( 4.1%)
+         |          |                              |             |            |        3 | moderately...    | 28 (28.9%)
+         |          |                              |             |            |        4 | severely...      | 63 (64.9%)
+      ---+----------+------------------------------+-------------+------------+----------+------------------+-----------
+      4  | c172code | carer's level of education   | numeric     | 10 (10.0%) |        1 | low level of...  |  8 ( 8.9%)
+         |          |                              |             |            |        2 | intermediate...  | 66 (73.3%)
+         |          |                              |             |            |        3 | high level of... | 16 (17.8%)
+      ---+----------+------------------------------+-------------+------------+----------+------------------+-----------
+      5  | neg_c_7  | Negative impact with 7 items | numeric     |   3 (3.0%) |  [7, 28] |                  |         97
+      ------------------------------------------------------------------------------------------------------------------
+
+---
+
+    Code
+      print(out, table_width = "auto", remove_duplicates = FALSE)
+    Output
+      efc (100 rows and 5 variables, 5 shown)
+      
+      ID | Name     | Label                        | Type        |   Missings
+      ---+----------+------------------------------+-------------+-----------
+      1  | c12hour  | average number of hours of   | numeric     |   2 (2.0%)
+         |          | care per week                |             |           
+      ---+----------+------------------------------+-------------+-----------
+      2  | e16sex   | elder's gender               | numeric     |   0 (0.0%)
+      ---+----------+------------------------------+-------------+-----------
+      ---+----------+------------------------------+-------------+-----------
+      3  | e42dep   | elder's dependency           | categorical |   3 (3.0%)
+      ---+----------+------------------------------+-------------+-----------
+      ---+----------+------------------------------+-------------+-----------
+      ---+----------+------------------------------+-------------+-----------
+      ---+----------+------------------------------+-------------+-----------
+      4  | c172code | carer's level of education   | numeric     | 10 (10.0%)
+      ---+----------+------------------------------+-------------+-----------
+      ---+----------+------------------------------+-------------+-----------
+      ---+----------+------------------------------+-------------+-----------
+      5  | neg_c_7  | Negative impact with 7 items | numeric     |   3 (3.0%)
+      -----------------------------------------------------------------------
+      
+      ID |   Values | Value Labels     |          N
+      ---+----------+------------------+-----------
+      1  | [5, 168] |                  |         98
+      ---+----------+------------------+-----------
+      ---+----------+------------------+-----------
+      2  |        1 | male             | 46 (46.0%)
+         |        2 | female           | 54 (54.0%)
+      ---+----------+------------------+-----------
+      3  |        1 | independent      |  2 ( 2.1%)
+         |        2 | slightly...      |  4 ( 4.1%)
+         |        3 | moderately...    | 28 (28.9%)
+         |        4 | severely...      | 63 (64.9%)
+      ---+----------+------------------+-----------
+      4  |        1 | low level of...  |  8 ( 8.9%)
+         |        2 | intermediate...  | 66 (73.3%)
+         |        3 | high level of... | 16 (17.8%)
+      ---+----------+------------------+-----------
+      5  |  [7, 28] |                  |         97
+      ---------------------------------------------
+
+---
+
+    Code
+      print(out, table_width = "auto", remove_duplicates = TRUE)
+    Output
+      efc (100 rows and 5 variables, 5 shown)
+      
+      ID | Name     | Label                        | Type        |   Missings
+      ---+----------+------------------------------+-------------+-----------
+      1  | c12hour  | average number of hours of   | numeric     |   2 (2.0%)
+         |          | care per week                |             |           
+      ---+----------+------------------------------+-------------+-----------
+      2  | e16sex   | elder's gender               | numeric     |   0 (0.0%)
+      ---+----------+------------------------------+-------------+-----------
+      ---+----------+------------------------------+-------------+-----------
+      3  | e42dep   | elder's dependency           | categorical |   3 (3.0%)
+      ---+----------+------------------------------+-------------+-----------
+      ---+----------+------------------------------+-------------+-----------
+      ---+----------+------------------------------+-------------+-----------
+      ---+----------+------------------------------+-------------+-----------
+      4  | c172code | carer's level of education   | numeric     | 10 (10.0%)
+      ---+----------+------------------------------+-------------+-----------
+      ---+----------+------------------------------+-------------+-----------
+      ---+----------+------------------------------+-------------+-----------
+      5  | neg_c_7  | Negative impact with 7 items | numeric     |   3 (3.0%)
+      -----------------------------------------------------------------------
+      
+      ID |   Values | Value Labels     |          N
+      ---+----------+------------------+-----------
+      1  | [5, 168] |                  |         98
+      ---+----------+------------------+-----------
+      ---+----------+------------------+-----------
+      2  |        1 | male             | 46 (46.0%)
+         |        2 | female           | 54 (54.0%)
+      ---+----------+------------------+-----------
+      3  |        1 | independent      |  2 ( 2.1%)
+         |        2 | slightly...      |  4 ( 4.1%)
+         |        3 | moderately...    | 28 (28.9%)
+         |        4 | severely...      | 63 (64.9%)
+      ---+----------+------------------+-----------
+      4  |        1 | low level of...  |  8 ( 8.9%)
+         |        2 | intermediate...  | 66 (73.3%)
+         |        3 | high level of... | 16 (17.8%)
+      ---+----------+------------------+-----------
+      5  |  [7, 28] |                  |         97
+      ---------------------------------------------
+
+# data_codebook truncated data
+
+    Code
+      data_codebook(d, max_values = 5)
+    Output
+      d (100 rows and 2 variables, 2 shown)
+      
+      ID | Name | Type      | Missings |  Values |        N
+      ---+------+-----------+----------+---------+---------
+      1  | a    | integer   | 0 (0.0%) | [1, 15] |      100
+      ---+------+-----------+----------+---------+---------
+      2  | b    | character | 0 (0.0%) |       a | 4 (4.0%)
+         |      |           |          |       b | 3 (3.0%)
+         |      |           |          |       c | 5 (5.0%)
+         |      |           |          |       d | 4 (4.0%)
+         |      |           |          |       e | 3 (3.0%)
+         |      |           |          |   (...) |         
+      -----------------------------------------------------
+
+# data_codebook mixed numeric lengths
+
+    Code
+      data_codebook(d)
+    Output
+      d (100 rows and 2 variables, 2 shown)
+      
+      ID | Name | Type    | Missings |  Values |          N
+      ---+------+---------+----------+---------+-----------
+      1  | a    | integer | 0 (0.0%) |       1 | 28 (28.0%)
+         |      |         |          |       2 | 26 (26.0%)
+         |      |         |          |       3 | 29 (29.0%)
+         |      |         |          |       4 | 17 (17.0%)
+      ---+------+---------+----------+---------+-----------
+      2  | b    | integer | 0 (0.0%) | [5, 15] |        100
+      -----------------------------------------------------
+
+# data_codebook mixed range_at
+
+    Code
+      data_codebook(d, range_at = 3)
+    Output
+      d (100 rows and 2 variables, 2 shown)
+      
+      ID | Name | Type    | Missings |  Values |   N
+      ---+------+---------+----------+---------+----
+      1  | a    | integer | 0 (0.0%) |  [1, 4] | 100
+      ---+------+---------+----------+---------+----
+      2  | b    | integer | 0 (0.0%) | [5, 15] | 100
+      ----------------------------------------------
+
+# data_codebook logicals
+
+    Code
+      data_codebook(d)
+    Output
+      d (100 rows and 3 variables, 3 shown)
+      
+      ID | Name | Type      | Missings |  Values |          N
+      ---+------+-----------+----------+---------+-----------
+      1  | a    | integer   | 0 (0.0%) | [1, 15] |        100
+      ---+------+-----------+----------+---------+-----------
+      2  | b    | character | 0 (0.0%) |       a | 26 (26.0%)
+         |      |           |          |       b | 38 (38.0%)
+         |      |           |          |       c | 36 (36.0%)
+      ---+------+-----------+----------+---------+-----------
+      3  | c    | logical   | 0 (0.0%) |   FALSE | 42 (42.0%)
+         |      |           |          |    TRUE | 58 (58.0%)
+      -------------------------------------------------------
+
+# data_codebook labelled data exceptions
+
+    Code
+      data_codebook(d)
+    Output
+      d (100 rows and 3 variables, 3 shown)
+      
+      ID | Name | Type    |   Missings | Values | Value Labels |          N
+      ---+------+---------+------------+--------+--------------+-----------
+      1  | f1   | integer | 17 (17.0%) |      1 | One          | 21 (25.3%)
+         |      |         |            |      2 | Two          | 20 (24.1%)
+         |      |         |            |      3 | Three        | 23 (27.7%)
+         |      |         |            |      5 | Five         | 19 (22.9%)
+      ---+------+---------+------------+--------+--------------+-----------
+      2  | f2   | integer |   0 (0.0%) |      1 | One          | 25 (25.0%)
+         |      |         |            |      2 | Two          | 20 (20.0%)
+         |      |         |            |      3 | Three        | 14 (14.0%)
+         |      |         |            |      4 | 4            | 17 (17.0%)
+         |      |         |            |      5 | Five         | 24 (24.0%)
+      ---+------+---------+------------+--------+--------------+-----------
+      3  | f3   | integer |   0 (0.0%) |      1 | One          | 21 (21.0%)
+         |      |         |            |      2 | Two          | 24 (24.0%)
+         |      |         |            |      3 | Three        | 16 (16.0%)
+         |      |         |            |      4 | Four         | 14 (14.0%)
+         |      |         |            |      5 | Five         | 25 (25.0%)
+      ---------------------------------------------------------------------
+
+# data_codebook labelled data factors
+
+    Code
+      data_codebook(d)
+    Output
+      d (100 rows and 3 variables, 3 shown)
+      
+      ID | Name | Type        | Missings | Values | Value Labels |          N
+      ---+------+-------------+----------+--------+--------------+-----------
+      1  | f1   | categorical | 0 (0.0%) |      a | A            | 35 (35.0%)
+         |      |             |          |      b | Bee          | 32 (32.0%)
+         |      |             |          |      c | Cee          | 33 (33.0%)
+      ---+------+-------------+----------+--------+--------------+-----------
+      2  | f2   | categorical | 0 (0.0%) |      a | A            | 30 (30.0%)
+         |      |             |          |      b | Bee          | 38 (38.0%)
+         |      |             |          |      c | Cee          | 32 (32.0%)
+      ---+------+-------------+----------+--------+--------------+-----------
+      3  | f3   | categorical | 0 (0.0%) |      a | A            | 23 (23.0%)
+         |      |             |          |      b | Bee          | 28 (28.0%)
+         |      |             |          |      c | Cee          | 49 (49.0%)
+      -----------------------------------------------------------------------
+
+# data_codebook works with numbers < 1
+
+    Code
+      data_codebook(d)
+    Output
+      d (6 rows and 2 variables, 2 shown)
+      
+      ID | Name | Type    | Missings | Values |         N
+      ---+------+---------+----------+--------+----------
+      1  | a    | numeric | 0 (0.0%) |      1 | 2 (33.3%)
+         |      |         |          |      2 | 2 (33.3%)
+         |      |         |          |      3 | 2 (33.3%)
+      ---+------+---------+----------+--------+----------
+      2  | b    | numeric | 0 (0.0%) |      0 | 3 (50.0%)
+         |      |         |          |      1 | 2 (33.3%)
+         |      |         |          |      2 | 1 (16.7%)
+      ---------------------------------------------------
+
+# data_codebook, big marks
+
+    Code
+      data_codebook(d)
+    Output
+      d (1,000,000 rows and 2 variables, 2 shown)
+      
+      ID | Name | Type        | Missings | Values |               N
+      ---+------+-------------+----------+--------+----------------
+      1  | f1   | categorical | 0 (0.0%) |      a | 333,238 (33.3%)
+         |      |             |          |      b | 332,910 (33.3%)
+         |      |             |          |      c | 333,852 (33.4%)
+      ---+------+-------------+----------+--------+----------------
+      2  | f2   | categorical | 0 (0.0%) |      1 | 333,285 (33.3%)
+         |      |             |          |      2 | 333,358 (33.3%)
+         |      |             |          |      3 | 333,357 (33.3%)
+      -------------------------------------------------------------
+
+# data_codebook, tagged NA
+
+    Code
+      data_codebook(data.frame(x))
+    Output
+      data.frame(x) (26 rows and 1 variables, 1 shown)
+      
+      ID | Name | Type    |   Missings | Values | Value Labels |         N
+      ---+------+---------+------------+--------+--------------+----------
+      1  | x    | numeric | 12 (46.2%) |      1 | Agreement    | 4 (15.4%)
+         |      |         |            |      2 | 2            | 4 (15.4%)
+         |      |         |            |      3 | 3            | 4 (15.4%)
+         |      |         |            |      4 | Disagreement | 2 ( 7.7%)
+         |      |         |            |  NA(a) | Refused      | 4 (15.4%)
+         |      |         |            |  NA(c) | First        | 5 (19.2%)
+         |      |         |            |  NA(z) | Not home     | 3 (11.5%)
+      --------------------------------------------------------------------
+
+---
+
+    Code
+      data_codebook(data.frame(x))
+    Output
+      data.frame(x) (23 rows and 1 variables, 1 shown)
+      
+      ID | Name | Type    |  Missings | Values | Value Labels |         N
+      ---+------+---------+-----------+--------+--------------+----------
+      1  | x    | numeric | 9 (39.1%) |      1 | Agreement    | 4 (17.4%)
+         |      |         |           |      2 | 2            | 4 (17.4%)
+         |      |         |           |      3 | 3            | 4 (17.4%)
+         |      |         |           |      4 | Disagreement | 2 ( 8.7%)
+         |      |         |           |  NA(a) | Refused      | 4 (17.4%)
+         |      |         |           |  NA(c) | First        | 5 (21.7%)
+      -------------------------------------------------------------------
+
+# data_codebook, negative label values #334
+
+    Code
+      data_codebook(data.frame(x1, x2))
+    Output
+      data.frame(x1, x2) (4 rows and 2 variables, 2 shown)
+      
+      ID | Name | Type    | Missings | Values | Value Labels |         N
+      ---+------+---------+----------+--------+--------------+----------
+      1  | x1   | integer | 0 (0.0%) |      1 | Agreement    | 1 (25.0%)
+         |      |         |          |      2 | 2            | 1 (25.0%)
+         |      |         |          |      3 | 3            | 1 (25.0%)
+         |      |         |          |      4 | Disagreement | 1 (25.0%)
+      ---+------+---------+----------+--------+--------------+----------
+      2  | x2   | numeric | 0 (0.0%) |     -9 | Missing      | 1 (25.0%)
+         |      |         |          |      1 | Agreement    | 1 (25.0%)
+         |      |         |          |      2 | 2            | 1 (25.0%)
+         |      |         |          |      3 | 3            | 1 (25.0%)
+      ------------------------------------------------------------------
+
diff --git a/tests/testthat/_snaps/describe_missing.md b/tests/testthat/_snaps/describe_missing.md
new file mode 100644
index 000000000..04080c190
--- /dev/null
+++ b/tests/testthat/_snaps/describe_missing.md
@@ -0,0 +1,38 @@
+# describe_missing
+
+    Code
+      describe_missing(airquality)
+    Output
+              var items na cells na_percent na_max na_max_percent all_na
+      1 Ozone:Day     6 44   918       4.79      2          33.33      0
+
+---
+
+    Code
+      describe_missing(airquality, vars = list(c("Ozone", "Solar.R", "Wind"), c(
+        "Temp", "Month", "Day")))
+    Output
+               var items na cells na_percent na_max na_max_percent all_na
+      1 Ozone:Wind     3 44   459       9.59      2          66.67      0
+      2   Temp:Day     3  0   459       0.00      0           0.00      0
+      3      Total     6 44   918       4.79      2          33.33      0
+
+---
+
+    Code
+      df <- data.frame(ID = c("idz", NA), scale1_Q1 = fun(), scale1_Q2 = fun(),
+      scale1_Q3 = fun(), scale2_Q1 = fun(), scale2_Q2 = fun(), scale2_Q3 = fun(),
+      scale3_Q1 = fun(), scale3_Q2 = fun(), scale3_Q3 = fun())
+
+---
+
+    Code
+      describe_missing(df, scales = c("ID", "scale1", "scale2", "scale3"))
+    Output
+                        var items na cells na_percent na_max na_max_percent all_na
+      1               ID:ID     1  7    14      50.00      1            100      7
+      2 scale1_Q1:scale1_Q3     3 11    42      26.19      3            100      3
+      3 scale2_Q1:scale2_Q3     3 17    42      40.48      3            100      3
+      4 scale3_Q1:scale3_Q3     3 10    42      23.81      3            100      3
+      5               Total    10 45   140      32.14     10            100      2
+
diff --git a/tests/testthat/test-describe_missing.R b/tests/testthat/test-describe_missing.R
new file mode 100644
index 000000000..27d44c386
--- /dev/null
+++ b/tests/testthat/test-describe_missing.R
@@ -0,0 +1,26 @@
+test_that("describe_missing", {
+  expect_snapshot(describe_missing(airquality)) # all columns as one group
+
+  # Group columns explicitly: one character vector per group
+  expect_snapshot(describe_missing(airquality,
+    vars = list(
+      c("Ozone", "Solar.R", "Wind"),
+      c("Temp", "Month", "Day")
+    )
+  ))
+
+  # Simulated items share a scale-name prefix; the seed keeps the NAs stable
+  set.seed(15)
+  fun <- function() {
+    c(sample(c(NA, 1:10), replace = TRUE), NA, NA, NA) # 11 draws + 3 NAs
+  }
+  expect_snapshot(df <- data.frame(
+    ID = c("idz", NA),
+    scale1_Q1 = fun(), scale1_Q2 = fun(), scale1_Q3 = fun(),
+    scale2_Q1 = fun(), scale2_Q2 = fun(), scale2_Q3 = fun(),
+    scale3_Q1 = fun(), scale3_Q2 = fun(), scale3_Q3 = fun()
+  )) # assignment prints nothing, so this snapshot records the code only
+
+  # Pass the shared name prefixes as `scales` to get one row per scale:
+  expect_snapshot(describe_missing(df, scales = c("ID", "scale1", "scale2", "scale3")))
+})