From ab9f0064c35050359247688f48583349704aa8f7 Mon Sep 17 00:00:00 2001 From: rempsyc Date: Mon, 11 Nov 2024 21:25:58 +0100 Subject: [PATCH] Suggestion of new function: `describe_missing()` Fixes #454 --- NAMESPACE | 1 + R/describe_missing.R | 118 ++++ man/describe_missing.Rd | 86 +++ tests/testthat/_snaps/data_codebook.new.md | 705 +++++++++++++++++++++ tests/testthat/_snaps/describe_missing.md | 38 ++ tests/testthat/test-describe_missing.R | 26 + 6 files changed, 974 insertions(+) create mode 100644 R/describe_missing.R create mode 100644 man/describe_missing.Rd create mode 100644 tests/testthat/_snaps/data_codebook.new.md create mode 100644 tests/testthat/_snaps/describe_missing.md create mode 100644 tests/testthat/test-describe_missing.R diff --git a/NAMESPACE b/NAMESPACE index 7e97817b9..e463f7261 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -267,6 +267,7 @@ export(data_write) export(degroup) export(demean) export(describe_distribution) +export(describe_missing) export(detrend) export(distribution_coef_var) export(distribution_mode) diff --git a/R/describe_missing.R b/R/describe_missing.R new file mode 100644 index 000000000..2f2f0da9c --- /dev/null +++ b/R/describe_missing.R @@ -0,0 +1,118 @@ +#' @title Describe Missing Values in Data According to Guidelines +#' +#' @description Provides a detailed description of missing values in a data frame. +#' This function reports both absolute and percentage missing values of specified +#' column lists or scales, following recommended guidelines. Some authors recommend +#' reporting item-level missingness per scale, as well as a participant's maximum +#' number of missing items by scale. 
For example, Parent (2013) writes: +#' +#' *I recommend that authors (a) state their tolerance level for missing data by scale +#' or subscale (e.g., "We calculated means for all subscales on which participants gave +#' at least 75% complete data") and then (b) report the individual missingness rates +#' by scale per data point (i.e., the number of missing values out of all data points +#' on that scale for all participants) and the maximum by participant (e.g., "For Attachment +#' Anxiety, a total of 4 missing data points out of 100 were observed, with no participant +#' missing more than a single data point").* +#' +#' @param data The data frame to be analyzed. +#' @param vars Variable (or lists of variables) to check for missing values (NAs). +#' @param scales The scale names to check for missing values (as a character vector). +#' @keywords missing values NA guidelines +#' @return A dataframe with the following columns: +#' - `var`: Variables selected. +#' - `items`: Number of items for selected variables. +#' - `na`: Number of missing cell values for those variables (e.g., 2 missing +#' values for the first participant + 2 missing values for the second participant +#' = total of 4 missing values). +#' - `cells`: Total number of cells (i.e., number of participants multiplied by +#' the number of variables, `items`). +#' - `na_percent`: The percentage of missing values (`na` divided by `cells`). +#' - `na_max`: The number of missing values for the participant with the most +#' missing values for the selected variables. +#' - `na_max_percent`: The amount of missing values for the participant with +#' the most missing values for the selected variables, as a percentage +#' (i.e., `na_max` divided by the number of selected variables, `items`). +#' - `all_na`: The number of participants missing 100% of items for that scale +#' (the selected variables). +#' +#' @export +#' @references Parent, M. C. (2013). Handling item-level missing +#' data: Simpler is just as good. 
*The Counseling Psychologist*,
+#' *41*(4), 568-600. https://doi.org/10.1177%2F0011000012445176
+#' @examples
+#' # Use the entire data frame
+#' describe_missing(airquality)
+#'
+#' # Use selected columns explicitly
+#' describe_missing(airquality,
+#'   vars = list(
+#'     c("Ozone", "Solar.R", "Wind"),
+#'     c("Temp", "Month", "Day")
+#'   )
+#' )
+#'
+#' # If the questionnaire items start with the same name, e.g.,
+#' set.seed(15)
+#' fun <- function() {
+#'   c(sample(c(NA, 1:10), replace = TRUE), NA, NA, NA)
+#' }
+#' df <- data.frame(
+#'   ID = c("idz", NA),
+#'   open_1 = fun(), open_2 = fun(), open_3 = fun(),
+#'   extrovert_1 = fun(), extrovert_2 = fun(), extrovert_3 = fun(),
+#'   agreeable_1 = fun(), agreeable_2 = fun(), agreeable_3 = fun()
+#' )
+#'
+#' # One can list the scale names directly:
+#' describe_missing(df, scales = c("ID", "open", "extrovert", "agreeable"))
+describe_missing <- function(data, vars = NULL, scales = NULL) {
+  # Use `is.null()` rather than `missing()`: the documented defaults are `NULL`,
+  # so passing `vars = NULL` explicitly must behave like omitting the argument.
+  use_all <- is.null(vars) && is.null(scales)
+  if (!is.null(vars)) {
+    # `vars` takes precedence over `scales` when both are supplied.
+    groups <- if (is.list(vars)) vars else list(vars)
+  } else if (!is.null(scales)) {
+    # Each scale name is used as a prefix pattern on the column names.
+    groups <- lapply(scales, function(x) {
+      grep(paste0("^", x), names(data), value = TRUE)
+    })
+  }
+  # Summary over the whole data frame; relabelled "Total" when groups exist.
+  na_df <- .describe_missing(data)
+  if (!use_all) {
+    na_list <- lapply(groups, function(x) {
+      .describe_missing(data[, x, drop = FALSE])
+    })
+    na_df$var <- "Total"
+    na_df <- do.call(rbind, c(na_list, list(na_df)))
+  }
+  na_df
+}
+
+# Internal helper: one-row missingness summary for all columns of `data`.
+.describe_missing <- function(data) {
+  # Missing cells per row, computed once and reused below.
+  na_by_row <- rowSums(is.na(data))
+  var <- paste0(names(data)[1], ":", names(data)[ncol(data)])
+  items <- ncol(data)
+  na <- sum(is.na(data))
+  cells <- nrow(data) * ncol(data)
+  na_percent <- round(na / cells * 100, 2)
+  # The `0` floor avoids the `max()` warning and `-Inf` on zero-row input.
+  na_max <- max(na_by_row, 0)
+  na_max_percent <- round(na_max / items * 100, 2)
+  # Vectorised row check instead of `apply()`, which coerces to a matrix.
+  all_na <- sum(na_by_row == items)
+
+  data.frame(
+    var = var,
+    items = items,
+    na = na,
+    cells = cells,
+    na_percent = na_percent,
+    na_max = na_max,
+    na_max_percent = na_max_percent,
+    all_na = all_na
+  )
+}
diff --git a/man/describe_missing.Rd b/man/describe_missing.Rd
new file mode 100644
index 000000000..c206a23ce
--- /dev/null
+++ b/man/describe_missing.Rd
@@ -0,0 +1,86 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/describe_missing.R
+\name{describe_missing}
+\alias{describe_missing}
+\title{Describe Missing Values in Data According to Guidelines}
+\usage{
+describe_missing(data, vars = NULL, scales = NULL)
+}
+\arguments{
+\item{data}{The data frame to be analyzed.}
+
+\item{vars}{Variable (or lists of variables) to check for missing values (NAs).}
+
+\item{scales}{The scale names to check for missing values (as a character vector).}
+}
+\value{
+A dataframe with the following columns:
+\itemize{
+\item \code{var}: Variables selected.
+\item \code{items}: Number of items for selected variables.
+\item \code{na}: Number of missing cell values for those variables (e.g., 2 missing
+values for the first participant + 2 missing values for the second participant
+= total of 4 missing values).
+\item \code{cells}: Total number of cells (i.e., number of participants multiplied by
+the number of variables, \code{items}).
+\item \code{na_percent}: The percentage of missing values (\code{na} divided by \code{cells}).
+\item \code{na_max}: The number of missing values for the participant with the most
+missing values for the selected variables.
+\item \code{na_max_percent}: The amount of missing values for the participant with
+the most missing values for the selected variables, as a percentage
+(i.e., \code{na_max} divided by the number of selected variables, \code{items}).
+\item \code{all_na}: The number of participants missing 100\% of items for that scale
+(the selected variables).
+} +} +\description{ +Provides a detailed description of missing values in a data frame. +This function reports both absolute and percentage missing values of specified +column lists or scales, following recommended guidelines. Some authors recommend +reporting item-level missingness per scale, as well as a participant's maximum +number of missing items by scale. For example, Parent (2013) writes: + +\emph{I recommend that authors (a) state their tolerance level for missing data by scale +or subscale (e.g., "We calculated means for all subscales on which participants gave +at least 75\% complete data") and then (b) report the individual missingness rates +by scale per data point (i.e., the number of missing values out of all data points +on that scale for all participants) and the maximum by participant (e.g., "For Attachment +Anxiety, a total of 4 missing data points out of 100 were observed, with no participant +missing more than a single data point").} +} +\examples{ +# Use the entire data frame +describe_missing(airquality) + +# Use selected columns explicitly +describe_missing(airquality, + vars = list( + c("Ozone", "Solar.R", "Wind"), + c("Temp", "Month", "Day") + ) +) + +# If the questionnaire items start with the same name, e.g., +set.seed(15) +fun <- function() { + c(sample(c(NA, 1:10), replace = TRUE), NA, NA, NA) +} +df <- data.frame( + ID = c("idz", NA), + open_1 = fun(), open_2 = fun(), open_3 = fun(), + extrovert_1 = fun(), extrovert_2 = fun(), extrovert_3 = fun(), + agreeable_1 = fun(), agreeable_2 = fun(), agreeable_3 = fun() +) + +# One can list the scale names directly: +describe_missing(df, scales = c("ID", "open", "extrovert", "agreeable")) +} +\references{ +Parent, M. C. (2013). Handling item-level missing +data: Simpler is just as good. \emph{The Counseling Psychologist}, +\emph{41}(4), 568-600. 
https://doi.org/10.1177\%2F0011000012445176 +} +\keyword{NA} +\keyword{guidelines} +\keyword{missing} +\keyword{values} diff --git a/tests/testthat/_snaps/data_codebook.new.md b/tests/testthat/_snaps/data_codebook.new.md new file mode 100644 index 000000000..2ba496ef4 --- /dev/null +++ b/tests/testthat/_snaps/data_codebook.new.md @@ -0,0 +1,705 @@ +# data_codebook iris + + Code + data_codebook(iris) + Output + iris (150 rows and 5 variables, 5 shown) + + ID | Name | Type | Missings | Values | N + ---+--------------+-------------+----------+------------+----------- + 1 | Sepal.Length | numeric | 0 (0.0%) | [4.3, 7.9] | 150 + ---+--------------+-------------+----------+------------+----------- + 2 | Sepal.Width | numeric | 0 (0.0%) | [2, 4.4] | 150 + ---+--------------+-------------+----------+------------+----------- + 3 | Petal.Length | numeric | 0 (0.0%) | [1, 6.9] | 150 + ---+--------------+-------------+----------+------------+----------- + 4 | Petal.Width | numeric | 0 (0.0%) | [0.1, 2.5] | 150 + ---+--------------+-------------+----------+------------+----------- + 5 | Species | categorical | 0 (0.0%) | setosa | 50 (33.3%) + | | | | versicolor | 50 (33.3%) + | | | | virginica | 50 (33.3%) + -------------------------------------------------------------------- + +# data_codebook iris, reordered + + Code + data_codebook(iris[c(1, 2, 5, 3, 4)]) + Output + iris[c(1, 2, 5, 3, 4)] (150 rows and 5 variables, 5 shown) + + ID | Name | Type | Missings | Values | N + ---+--------------+-------------+----------+------------+----------- + 1 | Sepal.Length | numeric | 0 (0.0%) | [4.3, 7.9] | 150 + ---+--------------+-------------+----------+------------+----------- + 2 | Sepal.Width | numeric | 0 (0.0%) | [2, 4.4] | 150 + ---+--------------+-------------+----------+------------+----------- + 3 | Species | categorical | 0 (0.0%) | setosa | 50 (33.3%) + | | | | versicolor | 50 (33.3%) + | | | | virginica | 50 (33.3%) + 
---+--------------+-------------+----------+------------+----------- + 4 | Petal.Length | numeric | 0 (0.0%) | [1, 6.9] | 150 + ---+--------------+-------------+----------+------------+----------- + 5 | Petal.Width | numeric | 0 (0.0%) | [0.1, 2.5] | 150 + -------------------------------------------------------------------- + +# data_codebook NaN and Inf + + Code + data_codebook(d) + Output + d (9 rows and 1 variables, 1 shown) + + ID | Name | Type | Missings | Values | N + ---+------+---------+-----------+--------+---------- + 1 | x | numeric | 2 (22.2%) | 1 | 3 (42.9%) + | | | | 2 | 1 (14.3%) + | | | | 4 | 2 (28.6%) + | | | | Inf | 1 (14.3%) + ---------------------------------------------------- + +--- + + Code + data_codebook(d) + Output + d (102 rows and 1 variables, 1 shown) + + ID | Name | Type | Missings | Values | N + ---+------+---------+----------+---------+------------ + 1 | x | numeric | 0 (0.0%) | [1, 15] | 102 (98.1%) + | | | | Inf | 2 ( 1.9%) + ------------------------------------------------------ + +--- + + Code + data_codebook(d, range_at = 100) + Output + d (102 rows and 1 variables, 1 shown) + + ID | Name | Type | Missings | Values | N + ---+------+---------+----------+--------+----------- + 1 | x | numeric | 0 (0.0%) | 1 | 4 ( 4.0%) + | | | | 2 | 5 ( 5.0%) + | | | | 3 | 6 ( 6.0%) + | | | | 4 | 5 ( 5.0%) + | | | | 5 | 8 ( 8.0%) + | | | | 6 | 10 (10.0%) + | | | | 7 | 6 ( 6.0%) + | | | | 8 | 3 ( 3.0%) + | | | | 9 | 13 (13.0%) + | | | | 10 | 7 ( 7.0%) + | | | | (...) | + ---------------------------------------------------- + +--- + + Code + data_codebook(d, range_at = 100, max_values = 4) + Output + d (102 rows and 1 variables, 1 shown) + + ID | Name | Type | Missings | Values | N + ---+------+---------+----------+--------+--------- + 1 | x | numeric | 0 (0.0%) | 1 | 4 (4.0%) + | | | | 2 | 5 (5.0%) + | | | | 3 | 6 (6.0%) + | | | | 4 | 5 (5.0%) + | | | | (...) 
| + -------------------------------------------------- + +# data_codebook iris, select + + Code + data_codebook(iris, select = starts_with("Sepal")) + Output + iris (150 rows and 5 variables, 2 shown) + + ID | Name | Type | Missings | Values | N + ---+--------------+---------+----------+------------+---- + 1 | Sepal.Length | numeric | 0 (0.0%) | [4.3, 7.9] | 150 + ---+--------------+---------+----------+------------+---- + 2 | Sepal.Width | numeric | 0 (0.0%) | [2, 4.4] | 150 + --------------------------------------------------------- + +# data_codebook iris, select, ID + + Code + data_codebook(iris, select = starts_with("Petal")) + Output + iris (150 rows and 5 variables, 2 shown) + + ID | Name | Type | Missings | Values | N + ---+--------------+---------+----------+------------+---- + 3 | Petal.Length | numeric | 0 (0.0%) | [1, 6.9] | 150 + ---+--------------+---------+----------+------------+---- + 4 | Petal.Width | numeric | 0 (0.0%) | [0.1, 2.5] | 150 + --------------------------------------------------------- + +# data_codebook efc + + Code + print(data_codebook(efc), table_width = Inf) + Output + efc (100 rows and 5 variables, 5 shown) + + ID | Name | Label | Type | Missings | Values | Value Labels | N + ---+----------+------------------------------------------+-------------+------------+----------+---------------------------------+----------- + 1 | c12hour | average number of hours of care per week | numeric | 2 (2.0%) | [5, 168] | | 98 + ---+----------+------------------------------------------+-------------+------------+----------+---------------------------------+----------- + 2 | e16sex | elder's gender | numeric | 0 (0.0%) | 1 | male | 46 (46.0%) + | | | | | 2 | female | 54 (54.0%) + ---+----------+------------------------------------------+-------------+------------+----------+---------------------------------+----------- + 3 | e42dep | elder's dependency | categorical | 3 (3.0%) | 1 | independent | 2 ( 2.1%) + | | | | | 2 | slightly dependent | 4 ( 
4.1%) + | | | | | 3 | moderately dependent | 28 (28.9%) + | | | | | 4 | severely dependent | 63 (64.9%) + ---+----------+------------------------------------------+-------------+------------+----------+---------------------------------+----------- + 4 | c172code | carer's level of education | numeric | 10 (10.0%) | 1 | low level of education | 8 ( 8.9%) + | | | | | 2 | intermediate level of education | 66 (73.3%) + | | | | | 3 | high level of education | 16 (17.8%) + ---+----------+------------------------------------------+-------------+------------+----------+---------------------------------+----------- + 5 | neg_c_7 | Negative impact with 7 items | numeric | 3 (3.0%) | [7, 28] | | 97 + --------------------------------------------------------------------------------------------------------------------------------------------- + +--- + + Code + print(data_codebook(efc), table_width = "auto", remove_duplicates = FALSE) + Output + efc (100 rows and 5 variables, 5 shown) + + ID | Name | Label | Type + ---+----------+------------------------------------------+------------ + 1 | c12hour | average number of hours of care per week | numeric + ---+----------+------------------------------------------+------------ + 2 | e16sex | elder's gender | numeric + ---+----------+------------------------------------------+------------ + ---+----------+------------------------------------------+------------ + 3 | e42dep | elder's dependency | categorical + ---+----------+------------------------------------------+------------ + ---+----------+------------------------------------------+------------ + ---+----------+------------------------------------------+------------ + ---+----------+------------------------------------------+------------ + 4 | c172code | carer's level of education | numeric + ---+----------+------------------------------------------+------------ + ---+----------+------------------------------------------+------------ + 
---+----------+------------------------------------------+------------ + 5 | neg_c_7 | Negative impact with 7 items | numeric + ---------------------------------------------------------------------- + + ID | Missings | Values | Value Labels | N + ---+------------+----------+---------------------------------+----------- + 1 | 2 (2.0%) | [5, 168] | | 98 + ---+------------+----------+---------------------------------+----------- + 2 | 0 (0.0%) | 1 | male | 46 (46.0%) + | | 2 | female | 54 (54.0%) + ---+------------+----------+---------------------------------+----------- + 3 | 3 (3.0%) | 1 | independent | 2 ( 2.1%) + | | 2 | slightly dependent | 4 ( 4.1%) + | | 3 | moderately dependent | 28 (28.9%) + | | 4 | severely dependent | 63 (64.9%) + ---+------------+----------+---------------------------------+----------- + 4 | 10 (10.0%) | 1 | low level of education | 8 ( 8.9%) + | | 2 | intermediate level of education | 66 (73.3%) + | | 3 | high level of education | 16 (17.8%) + ---+------------+----------+---------------------------------+----------- + 5 | 3 (3.0%) | [7, 28] | | 97 + ------------------------------------------------------------------------- + +--- + + Code + print(data_codebook(efc), table_width = "auto", remove_duplicates = TRUE) + Output + efc (100 rows and 5 variables, 5 shown) + + ID | Name | Label | Type + ---+----------+------------------------------------------+------------ + 1 | c12hour | average number of hours of care per week | numeric + ---+----------+------------------------------------------+------------ + 2 | e16sex | elder's gender | numeric + ---+----------+------------------------------------------+------------ + ---+----------+------------------------------------------+------------ + 3 | e42dep | elder's dependency | categorical + ---+----------+------------------------------------------+------------ + ---+----------+------------------------------------------+------------ + 
---+----------+------------------------------------------+------------ + ---+----------+------------------------------------------+------------ + 4 | c172code | carer's level of education | numeric + ---+----------+------------------------------------------+------------ + ---+----------+------------------------------------------+------------ + ---+----------+------------------------------------------+------------ + 5 | neg_c_7 | Negative impact with 7 items | numeric + ---------------------------------------------------------------------- + + ID | Missings | Values | Value Labels | N + ---+------------+----------+---------------------------------+----------- + 1 | 2 (2.0%) | [5, 168] | | 98 + ---+------------+----------+---------------------------------+----------- + 2 | 0 (0.0%) | 1 | male | 46 (46.0%) + | | 2 | female | 54 (54.0%) + ---+------------+----------+---------------------------------+----------- + 3 | 3 (3.0%) | 1 | independent | 2 ( 2.1%) + | | 2 | slightly dependent | 4 ( 4.1%) + | | 3 | moderately dependent | 28 (28.9%) + | | 4 | severely dependent | 63 (64.9%) + ---+------------+----------+---------------------------------+----------- + 4 | 10 (10.0%) | 1 | low level of education | 8 ( 8.9%) + | | 2 | intermediate level of education | 66 (73.3%) + | | 3 | high level of education | 16 (17.8%) + ---+------------+----------+---------------------------------+----------- + 5 | 3 (3.0%) | [7, 28] | | 97 + ------------------------------------------------------------------------- + +# data_codebook efc, variable_label_width + + Code + print(out, table_width = Inf) + Output + efc (100 rows and 5 variables, 5 shown) + + ID | Name | Label | Type | Missings | Values | Value Labels | N + ---+----------+------------------------------+-------------+------------+----------+---------------------------------+----------- + 1 | c12hour | average number of hours of | numeric | 2 (2.0%) | [5, 168] | | 98 + | | care per week | | | | | + 
---+----------+------------------------------+-------------+------------+----------+---------------------------------+----------- + 2 | e16sex | elder's gender | numeric | 0 (0.0%) | 1 | male | 46 (46.0%) + | | | | | 2 | female | 54 (54.0%) + ---+----------+------------------------------+-------------+------------+----------+---------------------------------+----------- + 3 | e42dep | elder's dependency | categorical | 3 (3.0%) | 1 | independent | 2 ( 2.1%) + | | | | | 2 | slightly dependent | 4 ( 4.1%) + | | | | | 3 | moderately dependent | 28 (28.9%) + | | | | | 4 | severely dependent | 63 (64.9%) + ---+----------+------------------------------+-------------+------------+----------+---------------------------------+----------- + 4 | c172code | carer's level of education | numeric | 10 (10.0%) | 1 | low level of education | 8 ( 8.9%) + | | | | | 2 | intermediate level of education | 66 (73.3%) + | | | | | 3 | high level of education | 16 (17.8%) + ---+----------+------------------------------+-------------+------------+----------+---------------------------------+----------- + 5 | neg_c_7 | Negative impact with 7 items | numeric | 3 (3.0%) | [7, 28] | | 97 + --------------------------------------------------------------------------------------------------------------------------------- + +--- + + Code + print(out, table_width = "auto", remove_duplicates = FALSE) + Output + efc (100 rows and 5 variables, 5 shown) + + ID | Name | Label | Type | Missings + ---+----------+------------------------------+-------------+----------- + 1 | c12hour | average number of hours of | numeric | 2 (2.0%) + | | care per week | | + ---+----------+------------------------------+-------------+----------- + 2 | e16sex | elder's gender | numeric | 0 (0.0%) + ---+----------+------------------------------+-------------+----------- + ---+----------+------------------------------+-------------+----------- + 3 | e42dep | elder's dependency | categorical | 3 (3.0%) + 
---+----------+------------------------------+-------------+----------- + ---+----------+------------------------------+-------------+----------- + ---+----------+------------------------------+-------------+----------- + ---+----------+------------------------------+-------------+----------- + 4 | c172code | carer's level of education | numeric | 10 (10.0%) + ---+----------+------------------------------+-------------+----------- + ---+----------+------------------------------+-------------+----------- + ---+----------+------------------------------+-------------+----------- + 5 | neg_c_7 | Negative impact with 7 items | numeric | 3 (3.0%) + ----------------------------------------------------------------------- + + ID | Values | Value Labels | N + ---+----------+---------------------------------+----------- + 1 | [5, 168] | | 98 + ---+----------+---------------------------------+----------- + ---+----------+---------------------------------+----------- + 2 | 1 | male | 46 (46.0%) + | 2 | female | 54 (54.0%) + ---+----------+---------------------------------+----------- + 3 | 1 | independent | 2 ( 2.1%) + | 2 | slightly dependent | 4 ( 4.1%) + | 3 | moderately dependent | 28 (28.9%) + | 4 | severely dependent | 63 (64.9%) + ---+----------+---------------------------------+----------- + 4 | 1 | low level of education | 8 ( 8.9%) + | 2 | intermediate level of education | 66 (73.3%) + | 3 | high level of education | 16 (17.8%) + ---+----------+---------------------------------+----------- + 5 | [7, 28] | | 97 + ------------------------------------------------------------ + +--- + + Code + print(out, table_width = "auto", remove_duplicates = TRUE) + Output + efc (100 rows and 5 variables, 5 shown) + + ID | Name | Label | Type | Missings + ---+----------+------------------------------+-------------+----------- + 1 | c12hour | average number of hours of | numeric | 2 (2.0%) + | | care per week | | + ---+----------+------------------------------+-------------+----------- 
+ 2 | e16sex | elder's gender | numeric | 0 (0.0%) + ---+----------+------------------------------+-------------+----------- + ---+----------+------------------------------+-------------+----------- + 3 | e42dep | elder's dependency | categorical | 3 (3.0%) + ---+----------+------------------------------+-------------+----------- + ---+----------+------------------------------+-------------+----------- + ---+----------+------------------------------+-------------+----------- + ---+----------+------------------------------+-------------+----------- + 4 | c172code | carer's level of education | numeric | 10 (10.0%) + ---+----------+------------------------------+-------------+----------- + ---+----------+------------------------------+-------------+----------- + ---+----------+------------------------------+-------------+----------- + 5 | neg_c_7 | Negative impact with 7 items | numeric | 3 (3.0%) + ----------------------------------------------------------------------- + + ID | Values | Value Labels | N + ---+----------+---------------------------------+----------- + 1 | [5, 168] | | 98 + ---+----------+---------------------------------+----------- + ---+----------+---------------------------------+----------- + 2 | 1 | male | 46 (46.0%) + | 2 | female | 54 (54.0%) + ---+----------+---------------------------------+----------- + 3 | 1 | independent | 2 ( 2.1%) + | 2 | slightly dependent | 4 ( 4.1%) + | 3 | moderately dependent | 28 (28.9%) + | 4 | severely dependent | 63 (64.9%) + ---+----------+---------------------------------+----------- + 4 | 1 | low level of education | 8 ( 8.9%) + | 2 | intermediate level of education | 66 (73.3%) + | 3 | high level of education | 16 (17.8%) + ---+----------+---------------------------------+----------- + 5 | [7, 28] | | 97 + ------------------------------------------------------------ + +# data_codebook efc, value_label_width + + Code + print(out, table_width = Inf) + Output + efc (100 rows and 5 variables, 5 shown) + + ID | 
Name | Label | Type | Missings | Values | Value Labels | N + ---+----------+------------------------------+-------------+------------+----------+------------------+----------- + 1 | c12hour | average number of hours of | numeric | 2 (2.0%) | [5, 168] | | 98 + | | care per week | | | | | + ---+----------+------------------------------+-------------+------------+----------+------------------+----------- + 2 | e16sex | elder's gender | numeric | 0 (0.0%) | 1 | male | 46 (46.0%) + | | | | | 2 | female | 54 (54.0%) + ---+----------+------------------------------+-------------+------------+----------+------------------+----------- + 3 | e42dep | elder's dependency | categorical | 3 (3.0%) | 1 | independent | 2 ( 2.1%) + | | | | | 2 | slightly... | 4 ( 4.1%) + | | | | | 3 | moderately... | 28 (28.9%) + | | | | | 4 | severely... | 63 (64.9%) + ---+----------+------------------------------+-------------+------------+----------+------------------+----------- + 4 | c172code | carer's level of education | numeric | 10 (10.0%) | 1 | low level of... | 8 ( 8.9%) + | | | | | 2 | intermediate... | 66 (73.3%) + | | | | | 3 | high level of... 
| 16 (17.8%) + ---+----------+------------------------------+-------------+------------+----------+------------------+----------- + 5 | neg_c_7 | Negative impact with 7 items | numeric | 3 (3.0%) | [7, 28] | | 97 + ------------------------------------------------------------------------------------------------------------------ + +--- + + Code + print(out, table_width = "auto", remove_duplicates = FALSE) + Output + efc (100 rows and 5 variables, 5 shown) + + ID | Name | Label | Type | Missings + ---+----------+------------------------------+-------------+----------- + 1 | c12hour | average number of hours of | numeric | 2 (2.0%) + | | care per week | | + ---+----------+------------------------------+-------------+----------- + 2 | e16sex | elder's gender | numeric | 0 (0.0%) + ---+----------+------------------------------+-------------+----------- + ---+----------+------------------------------+-------------+----------- + 3 | e42dep | elder's dependency | categorical | 3 (3.0%) + ---+----------+------------------------------+-------------+----------- + ---+----------+------------------------------+-------------+----------- + ---+----------+------------------------------+-------------+----------- + ---+----------+------------------------------+-------------+----------- + 4 | c172code | carer's level of education | numeric | 10 (10.0%) + ---+----------+------------------------------+-------------+----------- + ---+----------+------------------------------+-------------+----------- + ---+----------+------------------------------+-------------+----------- + 5 | neg_c_7 | Negative impact with 7 items | numeric | 3 (3.0%) + ----------------------------------------------------------------------- + + ID | Values | Value Labels | N + ---+----------+------------------+----------- + 1 | [5, 168] | | 98 + ---+----------+------------------+----------- + ---+----------+------------------+----------- + 2 | 1 | male | 46 (46.0%) + | 2 | female | 54 (54.0%) + 
---+----------+------------------+----------- + 3 | 1 | independent | 2 ( 2.1%) + | 2 | slightly... | 4 ( 4.1%) + | 3 | moderately... | 28 (28.9%) + | 4 | severely... | 63 (64.9%) + ---+----------+------------------+----------- + 4 | 1 | low level of... | 8 ( 8.9%) + | 2 | intermediate... | 66 (73.3%) + | 3 | high level of... | 16 (17.8%) + ---+----------+------------------+----------- + 5 | [7, 28] | | 97 + --------------------------------------------- + +--- + + Code + print(out, table_width = "auto", remove_duplicates = TRUE) + Output + efc (100 rows and 5 variables, 5 shown) + + ID | Name | Label | Type | Missings + ---+----------+------------------------------+-------------+----------- + 1 | c12hour | average number of hours of | numeric | 2 (2.0%) + | | care per week | | + ---+----------+------------------------------+-------------+----------- + 2 | e16sex | elder's gender | numeric | 0 (0.0%) + ---+----------+------------------------------+-------------+----------- + ---+----------+------------------------------+-------------+----------- + 3 | e42dep | elder's dependency | categorical | 3 (3.0%) + ---+----------+------------------------------+-------------+----------- + ---+----------+------------------------------+-------------+----------- + ---+----------+------------------------------+-------------+----------- + ---+----------+------------------------------+-------------+----------- + 4 | c172code | carer's level of education | numeric | 10 (10.0%) + ---+----------+------------------------------+-------------+----------- + ---+----------+------------------------------+-------------+----------- + ---+----------+------------------------------+-------------+----------- + 5 | neg_c_7 | Negative impact with 7 items | numeric | 3 (3.0%) + ----------------------------------------------------------------------- + + ID | Values | Value Labels | N + ---+----------+------------------+----------- + 1 | [5, 168] | | 98 + ---+----------+------------------+----------- + 
---+----------+------------------+----------- + 2 | 1 | male | 46 (46.0%) + | 2 | female | 54 (54.0%) + ---+----------+------------------+----------- + 3 | 1 | independent | 2 ( 2.1%) + | 2 | slightly... | 4 ( 4.1%) + | 3 | moderately... | 28 (28.9%) + | 4 | severely... | 63 (64.9%) + ---+----------+------------------+----------- + 4 | 1 | low level of... | 8 ( 8.9%) + | 2 | intermediate... | 66 (73.3%) + | 3 | high level of... | 16 (17.8%) + ---+----------+------------------+----------- + 5 | [7, 28] | | 97 + --------------------------------------------- + +# data_codebook truncated data + + Code + data_codebook(d, max_values = 5) + Output + d (100 rows and 2 variables, 2 shown) + + ID | Name | Type | Missings | Values | N + ---+------+-----------+----------+---------+--------- + 1 | a | integer | 0 (0.0%) | [1, 15] | 100 + ---+------+-----------+----------+---------+--------- + 2 | b | character | 0 (0.0%) | a | 4 (4.0%) + | | | | b | 3 (3.0%) + | | | | c | 5 (5.0%) + | | | | d | 4 (4.0%) + | | | | e | 3 (3.0%) + | | | | (...) 
| + ----------------------------------------------------- + +# data_codebook mixed numeric lengths + + Code + data_codebook(d) + Output + d (100 rows and 2 variables, 2 shown) + + ID | Name | Type | Missings | Values | N + ---+------+---------+----------+---------+----------- + 1 | a | integer | 0 (0.0%) | 1 | 28 (28.0%) + | | | | 2 | 26 (26.0%) + | | | | 3 | 29 (29.0%) + | | | | 4 | 17 (17.0%) + ---+------+---------+----------+---------+----------- + 2 | b | integer | 0 (0.0%) | [5, 15] | 100 + ----------------------------------------------------- + +# data_codebook mixed range_at + + Code + data_codebook(d, range_at = 3) + Output + d (100 rows and 2 variables, 2 shown) + + ID | Name | Type | Missings | Values | N + ---+------+---------+----------+---------+---- + 1 | a | integer | 0 (0.0%) | [1, 4] | 100 + ---+------+---------+----------+---------+---- + 2 | b | integer | 0 (0.0%) | [5, 15] | 100 + ---------------------------------------------- + +# data_codebook logicals + + Code + data_codebook(d) + Output + d (100 rows and 3 variables, 3 shown) + + ID | Name | Type | Missings | Values | N + ---+------+-----------+----------+---------+----------- + 1 | a | integer | 0 (0.0%) | [1, 15] | 100 + ---+------+-----------+----------+---------+----------- + 2 | b | character | 0 (0.0%) | a | 26 (26.0%) + | | | | b | 38 (38.0%) + | | | | c | 36 (36.0%) + ---+------+-----------+----------+---------+----------- + 3 | c | logical | 0 (0.0%) | FALSE | 42 (42.0%) + | | | | TRUE | 58 (58.0%) + ------------------------------------------------------- + +# data_codebook labelled data exceptions + + Code + data_codebook(d) + Output + d (100 rows and 3 variables, 3 shown) + + ID | Name | Type | Missings | Values | Value Labels | N + ---+------+---------+------------+--------+--------------+----------- + 1 | f1 | integer | 17 (17.0%) | 1 | One | 21 (25.3%) + | | | | 2 | Two | 20 (24.1%) + | | | | 3 | Three | 23 (27.7%) + | | | | 5 | Five | 19 (22.9%) + 
---+------+---------+------------+--------+--------------+----------- + 2 | f2 | integer | 0 (0.0%) | 1 | One | 25 (25.0%) + | | | | 2 | Two | 20 (20.0%) + | | | | 3 | Three | 14 (14.0%) + | | | | 4 | 4 | 17 (17.0%) + | | | | 5 | Five | 24 (24.0%) + ---+------+---------+------------+--------+--------------+----------- + 3 | f3 | integer | 0 (0.0%) | 1 | One | 21 (21.0%) + | | | | 2 | Two | 24 (24.0%) + | | | | 3 | Three | 16 (16.0%) + | | | | 4 | Four | 14 (14.0%) + | | | | 5 | Five | 25 (25.0%) + --------------------------------------------------------------------- + +# data_codebook labelled data factors + + Code + data_codebook(d) + Output + d (100 rows and 3 variables, 3 shown) + + ID | Name | Type | Missings | Values | Value Labels | N + ---+------+-------------+----------+--------+--------------+----------- + 1 | f1 | categorical | 0 (0.0%) | a | A | 35 (35.0%) + | | | | b | Bee | 32 (32.0%) + | | | | c | Cee | 33 (33.0%) + ---+------+-------------+----------+--------+--------------+----------- + 2 | f2 | categorical | 0 (0.0%) | a | A | 30 (30.0%) + | | | | b | Bee | 38 (38.0%) + | | | | c | Cee | 32 (32.0%) + ---+------+-------------+----------+--------+--------------+----------- + 3 | f3 | categorical | 0 (0.0%) | a | A | 23 (23.0%) + | | | | b | Bee | 28 (28.0%) + | | | | c | Cee | 49 (49.0%) + ----------------------------------------------------------------------- + +# data_codebook works with numbers < 1 + + Code + data_codebook(d) + Output + d (6 rows and 2 variables, 2 shown) + + ID | Name | Type | Missings | Values | N + ---+------+---------+----------+--------+---------- + 1 | a | numeric | 0 (0.0%) | 1 | 2 (33.3%) + | | | | 2 | 2 (33.3%) + | | | | 3 | 2 (33.3%) + ---+------+---------+----------+--------+---------- + 2 | b | numeric | 0 (0.0%) | 0 | 3 (50.0%) + | | | | 1 | 2 (33.3%) + | | | | 2 | 1 (16.7%) + --------------------------------------------------- + +# data_codebook, big marks + + Code + data_codebook(d) + Output + d (1,000,000 rows and 
2 variables, 2 shown) + + ID | Name | Type | Missings | Values | N + ---+------+-------------+----------+--------+---------------- + 1 | f1 | categorical | 0 (0.0%) | a | 333,238 (33.3%) + | | | | b | 332,910 (33.3%) + | | | | c | 333,852 (33.4%) + ---+------+-------------+----------+--------+---------------- + 2 | f2 | categorical | 0 (0.0%) | 1 | 333,285 (33.3%) + | | | | 2 | 333,358 (33.3%) + | | | | 3 | 333,357 (33.3%) + ------------------------------------------------------------- + +# data_codebook, tagged NA + + Code + data_codebook(data.frame(x)) + Output + data.frame(x) (26 rows and 1 variables, 1 shown) + + ID | Name | Type | Missings | Values | Value Labels | N + ---+------+---------+------------+--------+--------------+---------- + 1 | x | numeric | 12 (46.2%) | 1 | Agreement | 4 (15.4%) + | | | | 2 | 2 | 4 (15.4%) + | | | | 3 | 3 | 4 (15.4%) + | | | | 4 | Disagreement | 2 ( 7.7%) + | | | | NA(a) | Refused | 4 (15.4%) + | | | | NA(c) | First | 5 (19.2%) + | | | | NA(z) | Not home | 3 (11.5%) + -------------------------------------------------------------------- + +--- + + Code + data_codebook(data.frame(x)) + Output + data.frame(x) (23 rows and 1 variables, 1 shown) + + ID | Name | Type | Missings | Values | Value Labels | N + ---+------+---------+-----------+--------+--------------+---------- + 1 | x | numeric | 9 (39.1%) | 1 | Agreement | 4 (17.4%) + | | | | 2 | 2 | 4 (17.4%) + | | | | 3 | 3 | 4 (17.4%) + | | | | 4 | Disagreement | 2 ( 8.7%) + | | | | NA(a) | Refused | 4 (17.4%) + | | | | NA(c) | First | 5 (21.7%) + ------------------------------------------------------------------- + +# data_codebook, negative label values #334 + + Code + data_codebook(data.frame(x1, x2)) + Output + data.frame(x1, x2) (4 rows and 2 variables, 2 shown) + + ID | Name | Type | Missings | Values | Value Labels | N + ---+------+---------+----------+--------+--------------+---------- + 1 | x1 | integer | 0 (0.0%) | 1 | Agreement | 1 (25.0%) + | | | | 2 | 2 | 1 (25.0%) + | 
| | | 3 | 3 | 1 (25.0%) + | | | | 4 | Disagreement | 1 (25.0%) + ---+------+---------+----------+--------+--------------+---------- + 2 | x2 | numeric | 0 (0.0%) | -9 | Missing | 1 (25.0%) + | | | | 1 | Agreement | 1 (25.0%) + | | | | 2 | 2 | 1 (25.0%) + | | | | 3 | 3 | 1 (25.0%) + ------------------------------------------------------------------ + diff --git a/tests/testthat/_snaps/describe_missing.md b/tests/testthat/_snaps/describe_missing.md new file mode 100644 index 000000000..04080c190 --- /dev/null +++ b/tests/testthat/_snaps/describe_missing.md @@ -0,0 +1,38 @@ +# describe_missing + + Code + describe_missing(airquality) + Output + var items na cells na_percent na_max na_max_percent all_na + 1 Ozone:Day 6 44 918 4.79 2 33.33 0 + +--- + + Code + describe_missing(airquality, vars = list(c("Ozone", "Solar.R", "Wind"), c( + "Temp", "Month", "Day"))) + Output + var items na cells na_percent na_max na_max_percent all_na + 1 Ozone:Wind 3 44 459 9.59 2 66.67 0 + 2 Temp:Day 3 0 459 0.00 0 0.00 0 + 3 Total 6 44 918 4.79 2 33.33 0 + +--- + + Code + df <- data.frame(ID = c("idz", NA), scale1_Q1 = fun(), scale1_Q2 = fun(), + scale1_Q3 = fun(), scale2_Q1 = fun(), scale2_Q2 = fun(), scale2_Q3 = fun(), + scale3_Q1 = fun(), scale3_Q2 = fun(), scale3_Q3 = fun()) + +--- + + Code + describe_missing(df, scales = c("ID", "scale1", "scale2", "scale3")) + Output + var items na cells na_percent na_max na_max_percent all_na + 1 ID:ID 1 7 14 50.00 1 100 7 + 2 scale1_Q1:scale1_Q3 3 11 42 26.19 3 100 3 + 3 scale2_Q1:scale2_Q3 3 17 42 40.48 3 100 3 + 4 scale3_Q1:scale3_Q3 3 10 42 23.81 3 100 3 + 5 Total 10 45 140 32.14 10 100 2 + diff --git a/tests/testthat/test-describe_missing.R b/tests/testthat/test-describe_missing.R new file mode 100644 index 000000000..27d44c386 --- /dev/null +++ b/tests/testthat/test-describe_missing.R @@ -0,0 +1,26 @@ +test_that("describe_missing", { + expect_snapshot(describe_missing(airquality)) + + # Use selected columns explicitly + 
expect_snapshot(describe_missing(airquality, + vars = list( + c("Ozone", "Solar.R", "Wind"), + c("Temp", "Month", "Day") + ) + )) + + # If the questionnaire items start with the same name, e.g., + set.seed(15) + fun <- function() { + c(sample(c(NA, 1:10), replace = TRUE), NA, NA, NA) + } + expect_snapshot(df <- data.frame( + ID = c("idz", NA), + scale1_Q1 = fun(), scale1_Q2 = fun(), scale1_Q3 = fun(), + scale2_Q1 = fun(), scale2_Q2 = fun(), scale2_Q3 = fun(), + scale3_Q1 = fun(), scale3_Q2 = fun(), scale3_Q3 = fun() + )) + + # One can list the scale names directly: + expect_snapshot(describe_missing(df, scales = c("ID", "scale1", "scale2", "scale3"))) +})