From f8799004397a94fbd89dfb0ec613ae4a6be22da9 Mon Sep 17 00:00:00 2001 From: rempsyc Date: Mon, 11 Nov 2024 12:02:25 +0100 Subject: [PATCH 01/10] Suggestion of new function: `describe_missing()` Fixes #454 --- DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index ba821b0ba..2325c062d 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Type: Package Package: datawizard Title: Easy Data Wrangling and Statistical Transformations -Version: 0.13.0.12 +Version: 0.13.0.13 Authors@R: c( person("Indrajeet", "Patil", , "patilindrajeet.science@gmail.com", role = "aut", comment = c(ORCID = "0000-0003-1995-6531")), From ab9f0064c35050359247688f48583349704aa8f7 Mon Sep 17 00:00:00 2001 From: rempsyc Date: Mon, 11 Nov 2024 21:25:58 +0100 Subject: [PATCH 02/10] Suggestion of new function: `describe_missing()` Fixes #454 --- NAMESPACE | 1 + R/describe_missing.R | 118 ++++ man/describe_missing.Rd | 86 +++ tests/testthat/_snaps/data_codebook.new.md | 705 +++++++++++++++++++++ tests/testthat/_snaps/describe_missing.md | 38 ++ tests/testthat/test-describe_missing.R | 26 + 6 files changed, 974 insertions(+) create mode 100644 R/describe_missing.R create mode 100644 man/describe_missing.Rd create mode 100644 tests/testthat/_snaps/data_codebook.new.md create mode 100644 tests/testthat/_snaps/describe_missing.md create mode 100644 tests/testthat/test-describe_missing.R diff --git a/NAMESPACE b/NAMESPACE index 7e97817b9..e463f7261 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -267,6 +267,7 @@ export(data_write) export(degroup) export(demean) export(describe_distribution) +export(describe_missing) export(detrend) export(distribution_coef_var) export(distribution_mode) diff --git a/R/describe_missing.R b/R/describe_missing.R new file mode 100644 index 000000000..2f2f0da9c --- /dev/null +++ b/R/describe_missing.R @@ -0,0 +1,118 @@ +#' @title Describe Missing Values in Data According to Guidelines +#' +#' @description Provides a 
detailed description of missing values in a data frame. +#' This function reports both absolute and percentage missing values of specified +#' column lists or scales, following recommended guidelines. Some authors recommend +#' reporting item-level missingness per scale, as well as a participant's maximum +#' number of missing items by scale. For example, Parent (2013) writes: +#' +#' *I recommend that authors (a) state their tolerance level for missing data by scale +#' or subscale (e.g., "We calculated means for all subscales on which participants gave +#' at least 75% complete data") and then (b) report the individual missingness rates +#' by scale per data point (i.e., the number of missing values out of all data points +#' on that scale for all participants) and the maximum by participant (e.g., "For Attachment +#' Anxiety, a total of 4 missing data points out of 100 were observed, with no participant +#' missing more than a single data point").* +#' +#' @param data The data frame to be analyzed. +#' @param vars Variable (or lists of variables) to check for missing values (NAs). +#' @param scales The scale names to check for missing values (as a character vector). +#' @keywords missing values NA guidelines +#' @return A dataframe with the following columns: +#' - `var`: Variables selected. +#' - `items`: Number of items for selected variables. +#' - `na`: Number of missing cell values for those variables (e.g., 2 missing +#' values for the first participant + 2 missing values for the second participant +#' = total of 4 missing values). +#' - `cells`: Total number of cells (i.e., number of participants multiplied by +#' the number of variables, `items`). +#' - `na_percent`: The percentage of missing values (`na` divided by `cells`). +#' - `na_max`: The number of missing values for the participant with the most +#' missing values for the selected variables. 
+#' - `na_max_percent`: The amount of missing values for the participant with
+#' the most missing values for the selected variables, as a percentage
+#' (i.e., `na_max` divided by the number of selected variables, `items`).
+#' - `all_na`: The number of participants missing 100% of items for that scale
+#' (the selected variables).
+#'
+#' @export
+#' @references Parent, M. C. (2013). Handling item-level missing
+#' data: Simpler is just as good. *The Counseling Psychologist*,
+#' *41*(4), 568-600. https://doi.org/10.1177%2F0011000012445176
+#' @examples
+#' # Use the entire data frame
+#' describe_missing(airquality)
+#'
+#' # Use selected columns explicitly
+#' describe_missing(airquality,
+#'   vars = list(
+#'     c("Ozone", "Solar.R", "Wind"),
+#'     c("Temp", "Month", "Day")
+#'   )
+#' )
+#'
+#' # If the questionnaire items start with the same name, e.g.,
+#' set.seed(15)
+#' fun <- function() {
+#'   c(sample(c(NA, 1:10), replace = TRUE), NA, NA, NA)
+#' }
+#' df <- data.frame(
+#'   ID = c("idz", NA),
+#'   open_1 = fun(), open_2 = fun(), open_3 = fun(),
+#'   extrovert_1 = fun(), extrovert_2 = fun(), extrovert_3 = fun(),
+#'   agreeable_1 = fun(), agreeable_2 = fun(), agreeable_3 = fun()
+#' )
+#'
+#' # One can list the scale names directly:
+#' describe_missing(df, scales = c("ID", "open", "extrovert", "agreeable"))
+describe_missing <- function(data, vars = NULL, scales = NULL) {
+  if (!is.data.frame(data)) {
+    stop("`data` must be a data frame.", call. = FALSE)
+  }
+
+  # Test with is.null() rather than missing(): an explicit `vars = NULL` or
+  # `scales = NULL` (e.g. forwarded by a wrapper) must behave like the default.
+  # Without any grouping, return a single summary row for the whole data frame.
+  if (is.null(vars) && is.null(scales)) {
+    return(.describe_missing(data))
+  }
+
+  if (!is.null(vars)) {
+    # `vars` takes precedence over `scales` when both are supplied. A bare
+    # character vector is treated as one group of columns.
+    groups <- if (is.list(vars)) vars else list(vars)
+  } else {
+    # A scale is the set of columns whose names start with the scale name.
+    groups <- lapply(scales, function(x) {
+      grep(paste0("^", x), names(data), value = TRUE)
+    })
+    # A scale without any matching column would otherwise produce a
+    # meaningless zero-column summary row ("NA:", NaN percentages).
+    unmatched <- scales[lengths(groups) == 0L]
+    if (length(unmatched) > 0L) {
+      stop(
+        "No column names start with: ", toString(unmatched),
+        call. = FALSE
+      )
+    }
+  }
+
+  # One row per group, followed by a "Total" row for the whole data frame.
+  na_list <- lapply(groups, function(x) {
+    .describe_missing(data[, x, drop = FALSE])
+  })
+  na_total <- .describe_missing(data)
+  na_total$var <- "Total"
+  do.call(rbind, c(na_list, list(na_total)))
+}
+
+# Summarise the missingness of all columns of `data` as a one-row data frame
+# with the columns documented for `describe_missing()`. The `var` label is
+# built from the first and the last column name ("first:last").
+.describe_missing <- function(data) {
+  items <- ncol(data)
+  # Build the NA mask once and derive every statistic from it; this avoids
+  # apply() over the rows of a data frame, which coerces it to a matrix.
+  na_mask <- is.na(data)
+  na_by_row <- rowSums(na_mask)
+  na <- sum(na_mask)
+  cells <- nrow(data) * items
+  # max() of an empty vector is -Inf (with a warning); report 0 for data
+  # without rows instead.
+  na_max <- if (length(na_by_row) > 0L) max(na_by_row) else 0
+
+  data.frame(
+    var = paste0(names(data)[1], ":", names(data)[items]),
+    items = items,
+    na = na,
+    cells = cells,
+    na_percent = round(na / cells * 100, 2),
+    na_max = na_max,
+    na_max_percent = round(na_max / items * 100, 2),
+    # Rows in which every selected column is missing.
+    all_na = sum(na_by_row == items)
+  )
+}
diff --git a/man/describe_missing.Rd b/man/describe_missing.Rd
new file mode 100644
index 000000000..c206a23ce
--- /dev/null
+++ b/man/describe_missing.Rd
@@ -0,0 +1,86 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/describe_missing.R
+\name{describe_missing}
+\alias{describe_missing}
+\title{Describe Missing Values in Data According to Guidelines}
+\usage{
+describe_missing(data, vars = NULL, scales = NULL)
+}
+\arguments{
+\item{data}{The data frame to be analyzed.}
+
+\item{vars}{Variable (or lists of variables) to check for missing values (NAs).}
+
+\item{scales}{The scale names to check for missing values (as a character vector).}
+}
+\value{
+A dataframe with the following columns:
+\itemize{
+\item \code{var}: Variables selected.
+\item \code{items}: Number of items for selected variables.
+\item \code{na}: Number of missing cell values for those variables (e.g., 2 missing
+values for the first participant + 2 missing values for the second participant
+= total of 4 missing values).
+\item \code{cells}: Total number of cells (i.e., number of participants multiplied by
+the number of variables, \code{items}).
+\item \code{na_percent}: The percentage of missing values (\code{na} divided by \code{cells}).
+\item \code{na_max}: The number of missing values for the participant with the most +missing values for the selected variables. +\item \code{na_max_percent}: The amount of missing values for the participant with +the most missing values for the selected variables, as a percentage +(i.e., \code{na_max} divided by the number of selected variables, \code{items}). +\item \code{all_na}: The number of participants missing 100\% of items for that scale +(the selected variables). +} +} +\description{ +Provides a detailed description of missing values in a data frame. +This function reports both absolute and percentage missing values of specified +column lists or scales, following recommended guidelines. Some authors recommend +reporting item-level missingness per scale, as well as a participant's maximum +number of missing items by scale. For example, Parent (2013) writes: + +\emph{I recommend that authors (a) state their tolerance level for missing data by scale +or subscale (e.g., "We calculated means for all subscales on which participants gave +at least 75\% complete data") and then (b) report the individual missingness rates +by scale per data point (i.e., the number of missing values out of all data points +on that scale for all participants) and the maximum by participant (e.g., "For Attachment +Anxiety, a total of 4 missing data points out of 100 were observed, with no participant +missing more than a single data point").} +} +\examples{ +# Use the entire data frame +describe_missing(airquality) + +# Use selected columns explicitly +describe_missing(airquality, + vars = list( + c("Ozone", "Solar.R", "Wind"), + c("Temp", "Month", "Day") + ) +) + +# If the questionnaire items start with the same name, e.g., +set.seed(15) +fun <- function() { + c(sample(c(NA, 1:10), replace = TRUE), NA, NA, NA) +} +df <- data.frame( + ID = c("idz", NA), + open_1 = fun(), open_2 = fun(), open_3 = fun(), + extrovert_1 = fun(), extrovert_2 = fun(), extrovert_3 = fun(), + agreeable_1 = 
fun(), agreeable_2 = fun(), agreeable_3 = fun() +) + +# One can list the scale names directly: +describe_missing(df, scales = c("ID", "open", "extrovert", "agreeable")) +} +\references{ +Parent, M. C. (2013). Handling item-level missing +data: Simpler is just as good. \emph{The Counseling Psychologist}, +\emph{41}(4), 568-600. https://doi.org/10.1177\%2F0011000012445176 +} +\keyword{NA} +\keyword{guidelines} +\keyword{missing} +\keyword{values} diff --git a/tests/testthat/_snaps/data_codebook.new.md b/tests/testthat/_snaps/data_codebook.new.md new file mode 100644 index 000000000..2ba496ef4 --- /dev/null +++ b/tests/testthat/_snaps/data_codebook.new.md @@ -0,0 +1,705 @@ +# data_codebook iris + + Code + data_codebook(iris) + Output + iris (150 rows and 5 variables, 5 shown) + + ID | Name | Type | Missings | Values | N + ---+--------------+-------------+----------+------------+----------- + 1 | Sepal.Length | numeric | 0 (0.0%) | [4.3, 7.9] | 150 + ---+--------------+-------------+----------+------------+----------- + 2 | Sepal.Width | numeric | 0 (0.0%) | [2, 4.4] | 150 + ---+--------------+-------------+----------+------------+----------- + 3 | Petal.Length | numeric | 0 (0.0%) | [1, 6.9] | 150 + ---+--------------+-------------+----------+------------+----------- + 4 | Petal.Width | numeric | 0 (0.0%) | [0.1, 2.5] | 150 + ---+--------------+-------------+----------+------------+----------- + 5 | Species | categorical | 0 (0.0%) | setosa | 50 (33.3%) + | | | | versicolor | 50 (33.3%) + | | | | virginica | 50 (33.3%) + -------------------------------------------------------------------- + +# data_codebook iris, reordered + + Code + data_codebook(iris[c(1, 2, 5, 3, 4)]) + Output + iris[c(1, 2, 5, 3, 4)] (150 rows and 5 variables, 5 shown) + + ID | Name | Type | Missings | Values | N + ---+--------------+-------------+----------+------------+----------- + 1 | Sepal.Length | numeric | 0 (0.0%) | [4.3, 7.9] | 150 + 
---+--------------+-------------+----------+------------+----------- + 2 | Sepal.Width | numeric | 0 (0.0%) | [2, 4.4] | 150 + ---+--------------+-------------+----------+------------+----------- + 3 | Species | categorical | 0 (0.0%) | setosa | 50 (33.3%) + | | | | versicolor | 50 (33.3%) + | | | | virginica | 50 (33.3%) + ---+--------------+-------------+----------+------------+----------- + 4 | Petal.Length | numeric | 0 (0.0%) | [1, 6.9] | 150 + ---+--------------+-------------+----------+------------+----------- + 5 | Petal.Width | numeric | 0 (0.0%) | [0.1, 2.5] | 150 + -------------------------------------------------------------------- + +# data_codebook NaN and Inf + + Code + data_codebook(d) + Output + d (9 rows and 1 variables, 1 shown) + + ID | Name | Type | Missings | Values | N + ---+------+---------+-----------+--------+---------- + 1 | x | numeric | 2 (22.2%) | 1 | 3 (42.9%) + | | | | 2 | 1 (14.3%) + | | | | 4 | 2 (28.6%) + | | | | Inf | 1 (14.3%) + ---------------------------------------------------- + +--- + + Code + data_codebook(d) + Output + d (102 rows and 1 variables, 1 shown) + + ID | Name | Type | Missings | Values | N + ---+------+---------+----------+---------+------------ + 1 | x | numeric | 0 (0.0%) | [1, 15] | 102 (98.1%) + | | | | Inf | 2 ( 1.9%) + ------------------------------------------------------ + +--- + + Code + data_codebook(d, range_at = 100) + Output + d (102 rows and 1 variables, 1 shown) + + ID | Name | Type | Missings | Values | N + ---+------+---------+----------+--------+----------- + 1 | x | numeric | 0 (0.0%) | 1 | 4 ( 4.0%) + | | | | 2 | 5 ( 5.0%) + | | | | 3 | 6 ( 6.0%) + | | | | 4 | 5 ( 5.0%) + | | | | 5 | 8 ( 8.0%) + | | | | 6 | 10 (10.0%) + | | | | 7 | 6 ( 6.0%) + | | | | 8 | 3 ( 3.0%) + | | | | 9 | 13 (13.0%) + | | | | 10 | 7 ( 7.0%) + | | | | (...) 
| + ---------------------------------------------------- + +--- + + Code + data_codebook(d, range_at = 100, max_values = 4) + Output + d (102 rows and 1 variables, 1 shown) + + ID | Name | Type | Missings | Values | N + ---+------+---------+----------+--------+--------- + 1 | x | numeric | 0 (0.0%) | 1 | 4 (4.0%) + | | | | 2 | 5 (5.0%) + | | | | 3 | 6 (6.0%) + | | | | 4 | 5 (5.0%) + | | | | (...) | + -------------------------------------------------- + +# data_codebook iris, select + + Code + data_codebook(iris, select = starts_with("Sepal")) + Output + iris (150 rows and 5 variables, 2 shown) + + ID | Name | Type | Missings | Values | N + ---+--------------+---------+----------+------------+---- + 1 | Sepal.Length | numeric | 0 (0.0%) | [4.3, 7.9] | 150 + ---+--------------+---------+----------+------------+---- + 2 | Sepal.Width | numeric | 0 (0.0%) | [2, 4.4] | 150 + --------------------------------------------------------- + +# data_codebook iris, select, ID + + Code + data_codebook(iris, select = starts_with("Petal")) + Output + iris (150 rows and 5 variables, 2 shown) + + ID | Name | Type | Missings | Values | N + ---+--------------+---------+----------+------------+---- + 3 | Petal.Length | numeric | 0 (0.0%) | [1, 6.9] | 150 + ---+--------------+---------+----------+------------+---- + 4 | Petal.Width | numeric | 0 (0.0%) | [0.1, 2.5] | 150 + --------------------------------------------------------- + +# data_codebook efc + + Code + print(data_codebook(efc), table_width = Inf) + Output + efc (100 rows and 5 variables, 5 shown) + + ID | Name | Label | Type | Missings | Values | Value Labels | N + ---+----------+------------------------------------------+-------------+------------+----------+---------------------------------+----------- + 1 | c12hour | average number of hours of care per week | numeric | 2 (2.0%) | [5, 168] | | 98 + 
---+----------+------------------------------------------+-------------+------------+----------+---------------------------------+----------- + 2 | e16sex | elder's gender | numeric | 0 (0.0%) | 1 | male | 46 (46.0%) + | | | | | 2 | female | 54 (54.0%) + ---+----------+------------------------------------------+-------------+------------+----------+---------------------------------+----------- + 3 | e42dep | elder's dependency | categorical | 3 (3.0%) | 1 | independent | 2 ( 2.1%) + | | | | | 2 | slightly dependent | 4 ( 4.1%) + | | | | | 3 | moderately dependent | 28 (28.9%) + | | | | | 4 | severely dependent | 63 (64.9%) + ---+----------+------------------------------------------+-------------+------------+----------+---------------------------------+----------- + 4 | c172code | carer's level of education | numeric | 10 (10.0%) | 1 | low level of education | 8 ( 8.9%) + | | | | | 2 | intermediate level of education | 66 (73.3%) + | | | | | 3 | high level of education | 16 (17.8%) + ---+----------+------------------------------------------+-------------+------------+----------+---------------------------------+----------- + 5 | neg_c_7 | Negative impact with 7 items | numeric | 3 (3.0%) | [7, 28] | | 97 + --------------------------------------------------------------------------------------------------------------------------------------------- + +--- + + Code + print(data_codebook(efc), table_width = "auto", remove_duplicates = FALSE) + Output + efc (100 rows and 5 variables, 5 shown) + + ID | Name | Label | Type + ---+----------+------------------------------------------+------------ + 1 | c12hour | average number of hours of care per week | numeric + ---+----------+------------------------------------------+------------ + 2 | e16sex | elder's gender | numeric + ---+----------+------------------------------------------+------------ + ---+----------+------------------------------------------+------------ + 3 | e42dep | elder's dependency | categorical + 
---+----------+------------------------------------------+------------ + ---+----------+------------------------------------------+------------ + ---+----------+------------------------------------------+------------ + ---+----------+------------------------------------------+------------ + 4 | c172code | carer's level of education | numeric + ---+----------+------------------------------------------+------------ + ---+----------+------------------------------------------+------------ + ---+----------+------------------------------------------+------------ + 5 | neg_c_7 | Negative impact with 7 items | numeric + ---------------------------------------------------------------------- + + ID | Missings | Values | Value Labels | N + ---+------------+----------+---------------------------------+----------- + 1 | 2 (2.0%) | [5, 168] | | 98 + ---+------------+----------+---------------------------------+----------- + 2 | 0 (0.0%) | 1 | male | 46 (46.0%) + | | 2 | female | 54 (54.0%) + ---+------------+----------+---------------------------------+----------- + 3 | 3 (3.0%) | 1 | independent | 2 ( 2.1%) + | | 2 | slightly dependent | 4 ( 4.1%) + | | 3 | moderately dependent | 28 (28.9%) + | | 4 | severely dependent | 63 (64.9%) + ---+------------+----------+---------------------------------+----------- + 4 | 10 (10.0%) | 1 | low level of education | 8 ( 8.9%) + | | 2 | intermediate level of education | 66 (73.3%) + | | 3 | high level of education | 16 (17.8%) + ---+------------+----------+---------------------------------+----------- + 5 | 3 (3.0%) | [7, 28] | | 97 + ------------------------------------------------------------------------- + +--- + + Code + print(data_codebook(efc), table_width = "auto", remove_duplicates = TRUE) + Output + efc (100 rows and 5 variables, 5 shown) + + ID | Name | Label | Type + ---+----------+------------------------------------------+------------ + 1 | c12hour | average number of hours of care per week | numeric + 
---+----------+------------------------------------------+------------ + 2 | e16sex | elder's gender | numeric + ---+----------+------------------------------------------+------------ + ---+----------+------------------------------------------+------------ + 3 | e42dep | elder's dependency | categorical + ---+----------+------------------------------------------+------------ + ---+----------+------------------------------------------+------------ + ---+----------+------------------------------------------+------------ + ---+----------+------------------------------------------+------------ + 4 | c172code | carer's level of education | numeric + ---+----------+------------------------------------------+------------ + ---+----------+------------------------------------------+------------ + ---+----------+------------------------------------------+------------ + 5 | neg_c_7 | Negative impact with 7 items | numeric + ---------------------------------------------------------------------- + + ID | Missings | Values | Value Labels | N + ---+------------+----------+---------------------------------+----------- + 1 | 2 (2.0%) | [5, 168] | | 98 + ---+------------+----------+---------------------------------+----------- + 2 | 0 (0.0%) | 1 | male | 46 (46.0%) + | | 2 | female | 54 (54.0%) + ---+------------+----------+---------------------------------+----------- + 3 | 3 (3.0%) | 1 | independent | 2 ( 2.1%) + | | 2 | slightly dependent | 4 ( 4.1%) + | | 3 | moderately dependent | 28 (28.9%) + | | 4 | severely dependent | 63 (64.9%) + ---+------------+----------+---------------------------------+----------- + 4 | 10 (10.0%) | 1 | low level of education | 8 ( 8.9%) + | | 2 | intermediate level of education | 66 (73.3%) + | | 3 | high level of education | 16 (17.8%) + ---+------------+----------+---------------------------------+----------- + 5 | 3 (3.0%) | [7, 28] | | 97 + ------------------------------------------------------------------------- + +# data_codebook efc, 
variable_label_width + + Code + print(out, table_width = Inf) + Output + efc (100 rows and 5 variables, 5 shown) + + ID | Name | Label | Type | Missings | Values | Value Labels | N + ---+----------+------------------------------+-------------+------------+----------+---------------------------------+----------- + 1 | c12hour | average number of hours of | numeric | 2 (2.0%) | [5, 168] | | 98 + | | care per week | | | | | + ---+----------+------------------------------+-------------+------------+----------+---------------------------------+----------- + 2 | e16sex | elder's gender | numeric | 0 (0.0%) | 1 | male | 46 (46.0%) + | | | | | 2 | female | 54 (54.0%) + ---+----------+------------------------------+-------------+------------+----------+---------------------------------+----------- + 3 | e42dep | elder's dependency | categorical | 3 (3.0%) | 1 | independent | 2 ( 2.1%) + | | | | | 2 | slightly dependent | 4 ( 4.1%) + | | | | | 3 | moderately dependent | 28 (28.9%) + | | | | | 4 | severely dependent | 63 (64.9%) + ---+----------+------------------------------+-------------+------------+----------+---------------------------------+----------- + 4 | c172code | carer's level of education | numeric | 10 (10.0%) | 1 | low level of education | 8 ( 8.9%) + | | | | | 2 | intermediate level of education | 66 (73.3%) + | | | | | 3 | high level of education | 16 (17.8%) + ---+----------+------------------------------+-------------+------------+----------+---------------------------------+----------- + 5 | neg_c_7 | Negative impact with 7 items | numeric | 3 (3.0%) | [7, 28] | | 97 + --------------------------------------------------------------------------------------------------------------------------------- + +--- + + Code + print(out, table_width = "auto", remove_duplicates = FALSE) + Output + efc (100 rows and 5 variables, 5 shown) + + ID | Name | Label | Type | Missings + ---+----------+------------------------------+-------------+----------- + 1 | c12hour | 
average number of hours of | numeric | 2 (2.0%) + | | care per week | | + ---+----------+------------------------------+-------------+----------- + 2 | e16sex | elder's gender | numeric | 0 (0.0%) + ---+----------+------------------------------+-------------+----------- + ---+----------+------------------------------+-------------+----------- + 3 | e42dep | elder's dependency | categorical | 3 (3.0%) + ---+----------+------------------------------+-------------+----------- + ---+----------+------------------------------+-------------+----------- + ---+----------+------------------------------+-------------+----------- + ---+----------+------------------------------+-------------+----------- + 4 | c172code | carer's level of education | numeric | 10 (10.0%) + ---+----------+------------------------------+-------------+----------- + ---+----------+------------------------------+-------------+----------- + ---+----------+------------------------------+-------------+----------- + 5 | neg_c_7 | Negative impact with 7 items | numeric | 3 (3.0%) + ----------------------------------------------------------------------- + + ID | Values | Value Labels | N + ---+----------+---------------------------------+----------- + 1 | [5, 168] | | 98 + ---+----------+---------------------------------+----------- + ---+----------+---------------------------------+----------- + 2 | 1 | male | 46 (46.0%) + | 2 | female | 54 (54.0%) + ---+----------+---------------------------------+----------- + 3 | 1 | independent | 2 ( 2.1%) + | 2 | slightly dependent | 4 ( 4.1%) + | 3 | moderately dependent | 28 (28.9%) + | 4 | severely dependent | 63 (64.9%) + ---+----------+---------------------------------+----------- + 4 | 1 | low level of education | 8 ( 8.9%) + | 2 | intermediate level of education | 66 (73.3%) + | 3 | high level of education | 16 (17.8%) + ---+----------+---------------------------------+----------- + 5 | [7, 28] | | 97 + 
------------------------------------------------------------ + +--- + + Code + print(out, table_width = "auto", remove_duplicates = TRUE) + Output + efc (100 rows and 5 variables, 5 shown) + + ID | Name | Label | Type | Missings + ---+----------+------------------------------+-------------+----------- + 1 | c12hour | average number of hours of | numeric | 2 (2.0%) + | | care per week | | + ---+----------+------------------------------+-------------+----------- + 2 | e16sex | elder's gender | numeric | 0 (0.0%) + ---+----------+------------------------------+-------------+----------- + ---+----------+------------------------------+-------------+----------- + 3 | e42dep | elder's dependency | categorical | 3 (3.0%) + ---+----------+------------------------------+-------------+----------- + ---+----------+------------------------------+-------------+----------- + ---+----------+------------------------------+-------------+----------- + ---+----------+------------------------------+-------------+----------- + 4 | c172code | carer's level of education | numeric | 10 (10.0%) + ---+----------+------------------------------+-------------+----------- + ---+----------+------------------------------+-------------+----------- + ---+----------+------------------------------+-------------+----------- + 5 | neg_c_7 | Negative impact with 7 items | numeric | 3 (3.0%) + ----------------------------------------------------------------------- + + ID | Values | Value Labels | N + ---+----------+---------------------------------+----------- + 1 | [5, 168] | | 98 + ---+----------+---------------------------------+----------- + ---+----------+---------------------------------+----------- + 2 | 1 | male | 46 (46.0%) + | 2 | female | 54 (54.0%) + ---+----------+---------------------------------+----------- + 3 | 1 | independent | 2 ( 2.1%) + | 2 | slightly dependent | 4 ( 4.1%) + | 3 | moderately dependent | 28 (28.9%) + | 4 | severely dependent | 63 (64.9%) + 
---+----------+---------------------------------+----------- + 4 | 1 | low level of education | 8 ( 8.9%) + | 2 | intermediate level of education | 66 (73.3%) + | 3 | high level of education | 16 (17.8%) + ---+----------+---------------------------------+----------- + 5 | [7, 28] | | 97 + ------------------------------------------------------------ + +# data_codebook efc, value_label_width + + Code + print(out, table_width = Inf) + Output + efc (100 rows and 5 variables, 5 shown) + + ID | Name | Label | Type | Missings | Values | Value Labels | N + ---+----------+------------------------------+-------------+------------+----------+------------------+----------- + 1 | c12hour | average number of hours of | numeric | 2 (2.0%) | [5, 168] | | 98 + | | care per week | | | | | + ---+----------+------------------------------+-------------+------------+----------+------------------+----------- + 2 | e16sex | elder's gender | numeric | 0 (0.0%) | 1 | male | 46 (46.0%) + | | | | | 2 | female | 54 (54.0%) + ---+----------+------------------------------+-------------+------------+----------+------------------+----------- + 3 | e42dep | elder's dependency | categorical | 3 (3.0%) | 1 | independent | 2 ( 2.1%) + | | | | | 2 | slightly... | 4 ( 4.1%) + | | | | | 3 | moderately... | 28 (28.9%) + | | | | | 4 | severely... | 63 (64.9%) + ---+----------+------------------------------+-------------+------------+----------+------------------+----------- + 4 | c172code | carer's level of education | numeric | 10 (10.0%) | 1 | low level of... | 8 ( 8.9%) + | | | | | 2 | intermediate... | 66 (73.3%) + | | | | | 3 | high level of... 
| 16 (17.8%) + ---+----------+------------------------------+-------------+------------+----------+------------------+----------- + 5 | neg_c_7 | Negative impact with 7 items | numeric | 3 (3.0%) | [7, 28] | | 97 + ------------------------------------------------------------------------------------------------------------------ + +--- + + Code + print(out, table_width = "auto", remove_duplicates = FALSE) + Output + efc (100 rows and 5 variables, 5 shown) + + ID | Name | Label | Type | Missings + ---+----------+------------------------------+-------------+----------- + 1 | c12hour | average number of hours of | numeric | 2 (2.0%) + | | care per week | | + ---+----------+------------------------------+-------------+----------- + 2 | e16sex | elder's gender | numeric | 0 (0.0%) + ---+----------+------------------------------+-------------+----------- + ---+----------+------------------------------+-------------+----------- + 3 | e42dep | elder's dependency | categorical | 3 (3.0%) + ---+----------+------------------------------+-------------+----------- + ---+----------+------------------------------+-------------+----------- + ---+----------+------------------------------+-------------+----------- + ---+----------+------------------------------+-------------+----------- + 4 | c172code | carer's level of education | numeric | 10 (10.0%) + ---+----------+------------------------------+-------------+----------- + ---+----------+------------------------------+-------------+----------- + ---+----------+------------------------------+-------------+----------- + 5 | neg_c_7 | Negative impact with 7 items | numeric | 3 (3.0%) + ----------------------------------------------------------------------- + + ID | Values | Value Labels | N + ---+----------+------------------+----------- + 1 | [5, 168] | | 98 + ---+----------+------------------+----------- + ---+----------+------------------+----------- + 2 | 1 | male | 46 (46.0%) + | 2 | female | 54 (54.0%) + 
---+----------+------------------+----------- + 3 | 1 | independent | 2 ( 2.1%) + | 2 | slightly... | 4 ( 4.1%) + | 3 | moderately... | 28 (28.9%) + | 4 | severely... | 63 (64.9%) + ---+----------+------------------+----------- + 4 | 1 | low level of... | 8 ( 8.9%) + | 2 | intermediate... | 66 (73.3%) + | 3 | high level of... | 16 (17.8%) + ---+----------+------------------+----------- + 5 | [7, 28] | | 97 + --------------------------------------------- + +--- + + Code + print(out, table_width = "auto", remove_duplicates = TRUE) + Output + efc (100 rows and 5 variables, 5 shown) + + ID | Name | Label | Type | Missings + ---+----------+------------------------------+-------------+----------- + 1 | c12hour | average number of hours of | numeric | 2 (2.0%) + | | care per week | | + ---+----------+------------------------------+-------------+----------- + 2 | e16sex | elder's gender | numeric | 0 (0.0%) + ---+----------+------------------------------+-------------+----------- + ---+----------+------------------------------+-------------+----------- + 3 | e42dep | elder's dependency | categorical | 3 (3.0%) + ---+----------+------------------------------+-------------+----------- + ---+----------+------------------------------+-------------+----------- + ---+----------+------------------------------+-------------+----------- + ---+----------+------------------------------+-------------+----------- + 4 | c172code | carer's level of education | numeric | 10 (10.0%) + ---+----------+------------------------------+-------------+----------- + ---+----------+------------------------------+-------------+----------- + ---+----------+------------------------------+-------------+----------- + 5 | neg_c_7 | Negative impact with 7 items | numeric | 3 (3.0%) + ----------------------------------------------------------------------- + + ID | Values | Value Labels | N + ---+----------+------------------+----------- + 1 | [5, 168] | | 98 + ---+----------+------------------+----------- + 
---+----------+------------------+----------- + 2 | 1 | male | 46 (46.0%) + | 2 | female | 54 (54.0%) + ---+----------+------------------+----------- + 3 | 1 | independent | 2 ( 2.1%) + | 2 | slightly... | 4 ( 4.1%) + | 3 | moderately... | 28 (28.9%) + | 4 | severely... | 63 (64.9%) + ---+----------+------------------+----------- + 4 | 1 | low level of... | 8 ( 8.9%) + | 2 | intermediate... | 66 (73.3%) + | 3 | high level of... | 16 (17.8%) + ---+----------+------------------+----------- + 5 | [7, 28] | | 97 + --------------------------------------------- + +# data_codebook truncated data + + Code + data_codebook(d, max_values = 5) + Output + d (100 rows and 2 variables, 2 shown) + + ID | Name | Type | Missings | Values | N + ---+------+-----------+----------+---------+--------- + 1 | a | integer | 0 (0.0%) | [1, 15] | 100 + ---+------+-----------+----------+---------+--------- + 2 | b | character | 0 (0.0%) | a | 4 (4.0%) + | | | | b | 3 (3.0%) + | | | | c | 5 (5.0%) + | | | | d | 4 (4.0%) + | | | | e | 3 (3.0%) + | | | | (...) 
| + ----------------------------------------------------- + +# data_codebook mixed numeric lengths + + Code + data_codebook(d) + Output + d (100 rows and 2 variables, 2 shown) + + ID | Name | Type | Missings | Values | N + ---+------+---------+----------+---------+----------- + 1 | a | integer | 0 (0.0%) | 1 | 28 (28.0%) + | | | | 2 | 26 (26.0%) + | | | | 3 | 29 (29.0%) + | | | | 4 | 17 (17.0%) + ---+------+---------+----------+---------+----------- + 2 | b | integer | 0 (0.0%) | [5, 15] | 100 + ----------------------------------------------------- + +# data_codebook mixed range_at + + Code + data_codebook(d, range_at = 3) + Output + d (100 rows and 2 variables, 2 shown) + + ID | Name | Type | Missings | Values | N + ---+------+---------+----------+---------+---- + 1 | a | integer | 0 (0.0%) | [1, 4] | 100 + ---+------+---------+----------+---------+---- + 2 | b | integer | 0 (0.0%) | [5, 15] | 100 + ---------------------------------------------- + +# data_codebook logicals + + Code + data_codebook(d) + Output + d (100 rows and 3 variables, 3 shown) + + ID | Name | Type | Missings | Values | N + ---+------+-----------+----------+---------+----------- + 1 | a | integer | 0 (0.0%) | [1, 15] | 100 + ---+------+-----------+----------+---------+----------- + 2 | b | character | 0 (0.0%) | a | 26 (26.0%) + | | | | b | 38 (38.0%) + | | | | c | 36 (36.0%) + ---+------+-----------+----------+---------+----------- + 3 | c | logical | 0 (0.0%) | FALSE | 42 (42.0%) + | | | | TRUE | 58 (58.0%) + ------------------------------------------------------- + +# data_codebook labelled data exceptions + + Code + data_codebook(d) + Output + d (100 rows and 3 variables, 3 shown) + + ID | Name | Type | Missings | Values | Value Labels | N + ---+------+---------+------------+--------+--------------+----------- + 1 | f1 | integer | 17 (17.0%) | 1 | One | 21 (25.3%) + | | | | 2 | Two | 20 (24.1%) + | | | | 3 | Three | 23 (27.7%) + | | | | 5 | Five | 19 (22.9%) + 
---+------+---------+------------+--------+--------------+----------- + 2 | f2 | integer | 0 (0.0%) | 1 | One | 25 (25.0%) + | | | | 2 | Two | 20 (20.0%) + | | | | 3 | Three | 14 (14.0%) + | | | | 4 | 4 | 17 (17.0%) + | | | | 5 | Five | 24 (24.0%) + ---+------+---------+------------+--------+--------------+----------- + 3 | f3 | integer | 0 (0.0%) | 1 | One | 21 (21.0%) + | | | | 2 | Two | 24 (24.0%) + | | | | 3 | Three | 16 (16.0%) + | | | | 4 | Four | 14 (14.0%) + | | | | 5 | Five | 25 (25.0%) + --------------------------------------------------------------------- + +# data_codebook labelled data factors + + Code + data_codebook(d) + Output + d (100 rows and 3 variables, 3 shown) + + ID | Name | Type | Missings | Values | Value Labels | N + ---+------+-------------+----------+--------+--------------+----------- + 1 | f1 | categorical | 0 (0.0%) | a | A | 35 (35.0%) + | | | | b | Bee | 32 (32.0%) + | | | | c | Cee | 33 (33.0%) + ---+------+-------------+----------+--------+--------------+----------- + 2 | f2 | categorical | 0 (0.0%) | a | A | 30 (30.0%) + | | | | b | Bee | 38 (38.0%) + | | | | c | Cee | 32 (32.0%) + ---+------+-------------+----------+--------+--------------+----------- + 3 | f3 | categorical | 0 (0.0%) | a | A | 23 (23.0%) + | | | | b | Bee | 28 (28.0%) + | | | | c | Cee | 49 (49.0%) + ----------------------------------------------------------------------- + +# data_codebook works with numbers < 1 + + Code + data_codebook(d) + Output + d (6 rows and 2 variables, 2 shown) + + ID | Name | Type | Missings | Values | N + ---+------+---------+----------+--------+---------- + 1 | a | numeric | 0 (0.0%) | 1 | 2 (33.3%) + | | | | 2 | 2 (33.3%) + | | | | 3 | 2 (33.3%) + ---+------+---------+----------+--------+---------- + 2 | b | numeric | 0 (0.0%) | 0 | 3 (50.0%) + | | | | 1 | 2 (33.3%) + | | | | 2 | 1 (16.7%) + --------------------------------------------------- + +# data_codebook, big marks + + Code + data_codebook(d) + Output + d (1,000,000 rows and 
2 variables, 2 shown) + + ID | Name | Type | Missings | Values | N + ---+------+-------------+----------+--------+---------------- + 1 | f1 | categorical | 0 (0.0%) | a | 333,238 (33.3%) + | | | | b | 332,910 (33.3%) + | | | | c | 333,852 (33.4%) + ---+------+-------------+----------+--------+---------------- + 2 | f2 | categorical | 0 (0.0%) | 1 | 333,285 (33.3%) + | | | | 2 | 333,358 (33.3%) + | | | | 3 | 333,357 (33.3%) + ------------------------------------------------------------- + +# data_codebook, tagged NA + + Code + data_codebook(data.frame(x)) + Output + data.frame(x) (26 rows and 1 variables, 1 shown) + + ID | Name | Type | Missings | Values | Value Labels | N + ---+------+---------+------------+--------+--------------+---------- + 1 | x | numeric | 12 (46.2%) | 1 | Agreement | 4 (15.4%) + | | | | 2 | 2 | 4 (15.4%) + | | | | 3 | 3 | 4 (15.4%) + | | | | 4 | Disagreement | 2 ( 7.7%) + | | | | NA(a) | Refused | 4 (15.4%) + | | | | NA(c) | First | 5 (19.2%) + | | | | NA(z) | Not home | 3 (11.5%) + -------------------------------------------------------------------- + +--- + + Code + data_codebook(data.frame(x)) + Output + data.frame(x) (23 rows and 1 variables, 1 shown) + + ID | Name | Type | Missings | Values | Value Labels | N + ---+------+---------+-----------+--------+--------------+---------- + 1 | x | numeric | 9 (39.1%) | 1 | Agreement | 4 (17.4%) + | | | | 2 | 2 | 4 (17.4%) + | | | | 3 | 3 | 4 (17.4%) + | | | | 4 | Disagreement | 2 ( 8.7%) + | | | | NA(a) | Refused | 4 (17.4%) + | | | | NA(c) | First | 5 (21.7%) + ------------------------------------------------------------------- + +# data_codebook, negative label values #334 + + Code + data_codebook(data.frame(x1, x2)) + Output + data.frame(x1, x2) (4 rows and 2 variables, 2 shown) + + ID | Name | Type | Missings | Values | Value Labels | N + ---+------+---------+----------+--------+--------------+---------- + 1 | x1 | integer | 0 (0.0%) | 1 | Agreement | 1 (25.0%) + | | | | 2 | 2 | 1 (25.0%) + | 
| | | 3 | 3 | 1 (25.0%) + | | | | 4 | Disagreement | 1 (25.0%) + ---+------+---------+----------+--------+--------------+---------- + 2 | x2 | numeric | 0 (0.0%) | -9 | Missing | 1 (25.0%) + | | | | 1 | Agreement | 1 (25.0%) + | | | | 2 | 2 | 1 (25.0%) + | | | | 3 | 3 | 1 (25.0%) + ------------------------------------------------------------------ + diff --git a/tests/testthat/_snaps/describe_missing.md b/tests/testthat/_snaps/describe_missing.md new file mode 100644 index 000000000..04080c190 --- /dev/null +++ b/tests/testthat/_snaps/describe_missing.md @@ -0,0 +1,38 @@ +# describe_missing + + Code + describe_missing(airquality) + Output + var items na cells na_percent na_max na_max_percent all_na + 1 Ozone:Day 6 44 918 4.79 2 33.33 0 + +--- + + Code + describe_missing(airquality, vars = list(c("Ozone", "Solar.R", "Wind"), c( + "Temp", "Month", "Day"))) + Output + var items na cells na_percent na_max na_max_percent all_na + 1 Ozone:Wind 3 44 459 9.59 2 66.67 0 + 2 Temp:Day 3 0 459 0.00 0 0.00 0 + 3 Total 6 44 918 4.79 2 33.33 0 + +--- + + Code + df <- data.frame(ID = c("idz", NA), scale1_Q1 = fun(), scale1_Q2 = fun(), + scale1_Q3 = fun(), scale2_Q1 = fun(), scale2_Q2 = fun(), scale2_Q3 = fun(), + scale3_Q1 = fun(), scale3_Q2 = fun(), scale3_Q3 = fun()) + +--- + + Code + describe_missing(df, scales = c("ID", "scale1", "scale2", "scale3")) + Output + var items na cells na_percent na_max na_max_percent all_na + 1 ID:ID 1 7 14 50.00 1 100 7 + 2 scale1_Q1:scale1_Q3 3 11 42 26.19 3 100 3 + 3 scale2_Q1:scale2_Q3 3 17 42 40.48 3 100 3 + 4 scale3_Q1:scale3_Q3 3 10 42 23.81 3 100 3 + 5 Total 10 45 140 32.14 10 100 2 + diff --git a/tests/testthat/test-describe_missing.R b/tests/testthat/test-describe_missing.R new file mode 100644 index 000000000..27d44c386 --- /dev/null +++ b/tests/testthat/test-describe_missing.R @@ -0,0 +1,26 @@ +test_that("describe_missing", { + expect_snapshot(describe_missing(airquality)) + + # Use selected columns explicitly + 
expect_snapshot(describe_missing(airquality, + vars = list( + c("Ozone", "Solar.R", "Wind"), + c("Temp", "Month", "Day") + ) + )) + + # If the questionnaire items start with the same name, e.g., + set.seed(15) + fun <- function() { + c(sample(c(NA, 1:10), replace = TRUE), NA, NA, NA) + } + expect_snapshot(df <- data.frame( + ID = c("idz", NA), + scale1_Q1 = fun(), scale1_Q2 = fun(), scale1_Q3 = fun(), + scale2_Q1 = fun(), scale2_Q2 = fun(), scale2_Q3 = fun(), + scale3_Q1 = fun(), scale3_Q2 = fun(), scale3_Q3 = fun() + )) + + # One can list the scale names directly: + expect_snapshot(describe_missing(df, scales = c("ID", "scale1", "scale2", "scale3"))) +}) From 218b7f4b453ae36ca073cb6f4d05041318ed8f3f Mon Sep 17 00:00:00 2001 From: rempsyc Date: Mon, 11 Nov 2024 21:46:27 +0100 Subject: [PATCH 03/10] styler, update dic --- R/describe_missing.R | 5 +---- inst/WORDLIST | 14 +++++--------- pkgdown/_pkgdown.yaml | 1 + 3 files changed, 7 insertions(+), 13 deletions(-) diff --git a/R/describe_missing.R b/R/describe_missing.R index 2f2f0da9c..6563d7fe5 100644 --- a/R/describe_missing.R +++ b/R/describe_missing.R @@ -65,9 +65,6 @@ #' #' # One can list the scale names directly: #' describe_missing(df, scales = c("ID", "open", "extrovert", "agreeable")) - - - describe_missing <- function(data, vars = NULL, scales = NULL) { classes <- lapply(data, class) if (missing(vars) & missing(scales)) { @@ -104,7 +101,7 @@ describe_missing <- function(data, vars = NULL, scales = NULL) { na_max <- max(rowSums(is.na(data))) na_max_percent <- round(na_max / items * 100, 2) all_na <- sum(apply(data, 1, function(x) all(is.na(x)))) - + data.frame( var = var, items = items, diff --git a/inst/WORDLIST b/inst/WORDLIST index eda7dc71c..08ec792c0 100644 --- a/inst/WORDLIST +++ b/inst/WORDLIST @@ -8,14 +8,13 @@ CMD Carle Catran Crosstables -Dhaliwal -Disaggregating DOI De -Dom +Dhaliwal +Disaggregating EFC -Enders EUROFAMCARE +Enders Fairbrother GLMM Gelman @@ -54,7 +53,6 @@ Winsorizing al behaviour 
behaviours -bmwiernik codebook codebooks coercible @@ -77,7 +75,6 @@ joss labelled labelling leptokurtic -lifecycle lm lme meaned @@ -88,7 +85,6 @@ modelling nd panelr partialization -patilindrajeets platykurtic poorman pre @@ -102,7 +98,6 @@ recodes recoding recodings relevel -rempsyc reproducibility rescale rescaled @@ -111,7 +106,8 @@ rio rowid sd stackexchange -strengejacke +subscale +subscales tailedness th tibble diff --git a/pkgdown/_pkgdown.yaml b/pkgdown/_pkgdown.yaml index 31ec901d0..7422638cf 100644 --- a/pkgdown/_pkgdown.yaml +++ b/pkgdown/_pkgdown.yaml @@ -66,6 +66,7 @@ reference: - data_tabulate - data_peek - data_seek + - describe_missing - means_by_group - contains("distribution") - kurtosis From ebaeb6885a25eb428ca8e2eed986f93b23808978 Mon Sep 17 00:00:00 2001 From: rempsyc Date: Mon, 11 Nov 2024 22:06:53 +0100 Subject: [PATCH 04/10] Suggestion of new function: `describe_missing()` Fixes #454 --- R/describe_missing.R | 8 ++++---- tests/testthat/_snaps/describe_missing.md | 7 ------- tests/testthat/test-describe_missing.R | 11 ++++++----- 3 files changed, 10 insertions(+), 16 deletions(-) diff --git a/R/describe_missing.R b/R/describe_missing.R index 6563d7fe5..1d0cc7e11 100644 --- a/R/describe_missing.R +++ b/R/describe_missing.R @@ -67,7 +67,7 @@ #' describe_missing(df, scales = c("ID", "open", "extrovert", "agreeable")) describe_missing <- function(data, vars = NULL, scales = NULL) { classes <- lapply(data, class) - if (missing(vars) & missing(scales)) { + if (missing(vars) && missing(scales)) { vars.internal <- names(data) } else if (!missing(scales)) { vars.internal <- lapply(scales, function(x) { @@ -81,7 +81,7 @@ describe_missing <- function(data, vars = NULL, scales = NULL) { vars.internal <- list(vars.internal) } na_df <- .describe_missing(data) - if (!missing(vars) | !missing(scales)) { + if (!missing(vars) || !missing(scales)) { na_list <- lapply(vars.internal, function(x) { data_subset <- data[, x, drop = FALSE] 
.describe_missing(data_subset) @@ -93,7 +93,7 @@ describe_missing <- function(data, vars = NULL, scales = NULL) { } .describe_missing <- function(data) { - var <- paste0(names(data)[1], ":", names(data)[ncol(data)]) + my_var <- paste0(names(data)[1], ":", names(data)[ncol(data)]) items <- ncol(data) na <- sum(is.na(data)) cells <- nrow(data) * ncol(data) @@ -103,7 +103,7 @@ describe_missing <- function(data, vars = NULL, scales = NULL) { all_na <- sum(apply(data, 1, function(x) all(is.na(x)))) data.frame( - var = var, + var = my_var, items = items, na = na, cells = cells, diff --git a/tests/testthat/_snaps/describe_missing.md b/tests/testthat/_snaps/describe_missing.md index 04080c190..14d910bf0 100644 --- a/tests/testthat/_snaps/describe_missing.md +++ b/tests/testthat/_snaps/describe_missing.md @@ -17,13 +17,6 @@ 2 Temp:Day 3 0 459 0.00 0 0.00 0 3 Total 6 44 918 4.79 2 33.33 0 ---- - - Code - df <- data.frame(ID = c("idz", NA), scale1_Q1 = fun(), scale1_Q2 = fun(), - scale1_Q3 = fun(), scale2_Q1 = fun(), scale2_Q2 = fun(), scale2_Q3 = fun(), - scale3_Q1 = fun(), scale3_Q2 = fun(), scale3_Q3 = fun()) - --- Code diff --git a/tests/testthat/test-describe_missing.R b/tests/testthat/test-describe_missing.R index 27d44c386..2159082be 100644 --- a/tests/testthat/test-describe_missing.R +++ b/tests/testthat/test-describe_missing.R @@ -14,13 +14,14 @@ test_that("describe_missing", { fun <- function() { c(sample(c(NA, 1:10), replace = TRUE), NA, NA, NA) } - expect_snapshot(df <- data.frame( + + # One can list the scale names directly: + df <- data.frame( ID = c("idz", NA), scale1_Q1 = fun(), scale1_Q2 = fun(), scale1_Q3 = fun(), scale2_Q1 = fun(), scale2_Q2 = fun(), scale2_Q3 = fun(), - scale3_Q1 = fun(), scale3_Q2 = fun(), scale3_Q3 = fun() - )) - - # One can list the scale names directly: + scale3_Q1 = fun(), scale3_Q2 = fun(), scale3_Q3 = fun(), + stringsAsFactors = FALSE + ) expect_snapshot(describe_missing(df, scales = c("ID", "scale1", "scale2", "scale3"))) }) From 
c3c1302701610b2d81078aa2e9b54daa1c8c0171 Mon Sep 17 00:00:00 2001 From: rempsyc Date: Mon, 11 Nov 2024 22:22:14 +0100 Subject: [PATCH 05/10] news.md --- NEWS.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/NEWS.md b/NEWS.md index 8fc8a29ca..5bebe01c9 100644 --- a/NEWS.md +++ b/NEWS.md @@ -5,6 +5,10 @@ BREAKING CHANGES * Argument `drop_na` in `data_match()` is deprecated now. Please use `remove_na` instead. +NEW FUNCTIONS + +* `describe_missing()`, to comprehensively report on missing values in a data frame. + CHANGES * The `select` argument, which is available in different functions to select From fbdd26d46e12898e4e05a480365fa1b6e76d4f79 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9mi=20Th=C3=A9riault?= <13123390+rempsyc@users.noreply.github.com> Date: Mon, 16 Dec 2024 19:08:19 -0500 Subject: [PATCH 06/10] Update R/describe_missing.R Co-authored-by: Etienne Bacher <52219252+etiennebacher@users.noreply.github.com> --- R/describe_missing.R | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/R/describe_missing.R b/R/describe_missing.R index 1d0cc7e11..dbf59ae9f 100644 --- a/R/describe_missing.R +++ b/R/describe_missing.R @@ -21,9 +21,7 @@ #' @return A dataframe with the following columns: #' - `var`: Variables selected. #' - `items`: Number of items for selected variables. -#' - `na`: Number of missing cell values for those variables (e.g., 2 missing -#' values for the first participant + 2 missing values for the second participant -#' = total of 4 missing values). +#' - `na`: Number of missing values for those variables. #' - `cells`: Total number of cells (i.e., number of participants multiplied by #' the number of variables, `items`). #' - `na_percent`: The percentage of missing values (`na` divided by `cells`). 
From 72041f5b90c3ed4ff3fd51f27defa810afcb9a35 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9mi=20Th=C3=A9riault?= <13123390+rempsyc@users.noreply.github.com> Date: Mon, 16 Dec 2024 22:36:14 -0500 Subject: [PATCH 07/10] address comments and suggestions --- R/describe_missing.R | 118 ++++++++++++++++++++++++---------------- inst/WORDLIST | 5 ++ man/describe_missing.Rd | 90 +++++++++++++++++------------- 3 files changed, 129 insertions(+), 84 deletions(-) diff --git a/R/describe_missing.R b/R/describe_missing.R index dbf59ae9f..cd9c23231 100644 --- a/R/describe_missing.R +++ b/R/describe_missing.R @@ -2,9 +2,19 @@ #' #' @description Provides a detailed description of missing values in a data frame. #' This function reports both absolute and percentage missing values of specified -#' column lists or scales, following recommended guidelines. Some authors recommend -#' reporting item-level missingness per scale, as well as a participant's maximum -#' number of missing items by scale. For example, Parent (2013) writes: +#' column lists or scales, following recommended guidelines. +#' +#' @details +#' In psychology, it is common to ask participants to answer questionnaires in +#' which people answer several questions about a specific topic. For example, +#' people could answer 10 different questions about how extroversioned they are. +#' In turn, researchers calculate the average for those 10 questions (called +#' items). These questionnaires are called (e.g., Likert) "scales" (such as the +#' Rosenberg Self-Esteem Scale, also known as the RSES). +#' +#' Some authors recommend reporting item-level missingness per scale, as well +#' as a participant's maximum number of missing items by scale. 
For example, +#' Parent (2013) writes: #' #' *I recommend that authors (a) state their tolerance level for missing data by scale #' or subscale (e.g., "We calculated means for all subscales on which participants gave @@ -16,22 +26,28 @@ #' #' @param data The data frame to be analyzed. #' @param vars Variable (or lists of variables) to check for missing values (NAs). -#' @param scales The scale names to check for missing values (as a character vector). -#' @keywords missing values NA guidelines +#' @param scales If you rely on composite scores such as psychological scales +#' or questionnaires, you can provide the shared suffix among those variables +#' (as a character vector). This is useful if the variables you want to check +#' the average of all start with the same name (e.g., `varx`), such as is +#' commonly the case for Likert scales (such as `varx_1`, `varx_2`, `varx_3`, +#' etc.). #' @return A dataframe with the following columns: -#' - `var`: Variables selected. -#' - `items`: Number of items for selected variables. -#' - `na`: Number of missing values for those variables. -#' - `cells`: Total number of cells (i.e., number of participants multiplied by -#' the number of variables, `items`). -#' - `na_percent`: The percentage of missing values (`na` divided by `cells`). -#' - `na_max`: The number of missing values for the participant with the most +#' - `variable`: Variables selected. +#' - `n_columns`: Number of items for selected variables. +#' - `n_missing`: Number of missing values for those variables (NA stands for Not +#' Available). +#' - `n_cells`: Total number of cells (i.e., number of participants multiplied by +#' the number of columns, `n_columns`). +#' - `missing_percent`: The percentage of missing values (`na` divided by `cells`). +#' - `missing_max`: The number of missing values for the participant with the most #' missing values for the selected variables. 
-#' - `na_max_percent`: The amount of missing values for the participant with +#' - `missing_max_percent`: The amount of missing values for the participant with #' the most missing values for the selected variables, as a percentage -#' (i.e., `na_max` divided by the number of selected variables, `items`). -#' - `all_na`: The number of participants missing 100% of items for that scale +#' (i.e., `missing_max` divided by the number of selected columns, `n_columns`). +#' - `all_missing`: The number of participants missing 100% of items for that scale #' (the selected variables). +#' @param ... Arguments passed down to other functions. Currently not used. #' #' @export #' @references Parent, M. C. (2013). Handling item-level missing @@ -39,59 +55,68 @@ #' *41*(4), 568-600. https://doi.org/10.1177%2F0011000012445176 #' @examples #' # Use the entire data frame -#' describe_missing(airquality) -#' -#' # Use selected columns explicitly -#' describe_missing(airquality, -#' vars = list( -#' c("Ozone", "Solar.R", "Wind"), -#' c("Temp", "Month", "Day") -#' ) -#' ) -#' -#' # If the questionnaire items start with the same name, e.g., #' set.seed(15) #' fun <- function() { #' c(sample(c(NA, 1:10), replace = TRUE), NA, NA, NA) #' } #' df <- data.frame( #' ID = c("idz", NA), -#' open_1 = fun(), open_2 = fun(), open_3 = fun(), -#' extrovert_1 = fun(), extrovert_2 = fun(), extrovert_3 = fun(), -#' agreeable_1 = fun(), agreeable_2 = fun(), agreeable_3 = fun() +#' openness_1 = fun(), openness_2 = fun(), openness_3 = fun(), +#' extroversion_1 = fun(), extroversion_2 = fun(), extroversion_3 = fun(), +#' agreeableness_1 = fun(), agreeableness_2 = fun(), agreeableness_3 = fun() #' ) +#' describe_missing(df) #' -#' # One can list the scale names directly: -#' describe_missing(df, scales = c("ID", "open", "extrovert", "agreeable")) -describe_missing <- function(data, vars = NULL, scales = NULL) { - classes <- lapply(data, class) - if (missing(vars) && missing(scales)) { +#' # If the 
questionnaire items start with the same name, +#' # one can list the scale names directly: +#' describe_missing(df, scales = c("ID", "openness", "extroversion", "agreeableness")) +#' +#' # Otherwise you can provide nested columns manually: +#' describe_missing(df, +#' select = list( +#' c("ID"), +#' c("openness_1", "openness_2", "openness_3"), +#' c("extroversion_1", "extroversion_2", "extroversion_3"), +#' c("agreeableness_1", "agreeableness_2", "agreeableness_3") +#' ) +#' ) +#' + +describe_missing <- function(data, select = NULL, scales = NULL, ...) { + vars <- select + if (!is.null(vars) && missing(scales)) { vars.internal <- names(data) } else if (!missing(scales)) { vars.internal <- lapply(scales, function(x) { grep(paste0("^", x), names(data), value = TRUE) }) + } else if (is.null(vars) && missing(scales)){ + vars <- as.list(names(data)) } - if (!missing(vars)) { + if (!is.null(vars)) { vars.internal <- vars } if (!is.list(vars.internal)) { vars.internal <- list(vars.internal) } na_df <- .describe_missing(data) - if (!missing(vars) || !missing(scales)) { + if (!is.null(vars) || !missing(scales)) { na_list <- lapply(vars.internal, function(x) { data_subset <- data[, x, drop = FALSE] .describe_missing(data_subset) }) - na_df$var <- "Total" + na_df$variable <- "Total" na_df <- do.call(rbind, c(na_list, list(na_df))) } na_df } .describe_missing <- function(data) { - my_var <- paste0(names(data)[1], ":", names(data)[ncol(data)]) + if (ncol(data) > 1) { + my_var <- paste0(names(data)[1], ":", names(data)[ncol(data)]) + } else { + my_var <- names(data) + } items <- ncol(data) na <- sum(is.na(data)) cells <- nrow(data) * ncol(data) @@ -101,13 +126,14 @@ describe_missing <- function(data, vars = NULL, scales = NULL) { all_na <- sum(apply(data, 1, function(x) all(is.na(x)))) data.frame( - var = my_var, - items = items, - na = na, - cells = cells, - na_percent = na_percent, - na_max = na_max, - na_max_percent = na_max_percent, - all_na = all_na + variable = my_var, + 
n_columns = items, + n_missing = na, + n_cells = cells, + missing_percent = na_percent, + complete_percent = 100 - na_percent, + missing_max = na_max, + missing_max_percent = na_max_percent, + all_missing = all_na ) } diff --git a/inst/WORDLIST b/inst/WORDLIST index a8b4ff08d..221363971 100644 --- a/inst/WORDLIST +++ b/inst/WORDLIST @@ -78,6 +78,7 @@ labelling leptokurtic lm lme +macOS meaned mesokurtic midhinge @@ -91,6 +92,8 @@ poorman pre pth px +quartile +quartiles readr readxl recode @@ -107,6 +110,8 @@ rio rowid sd stackexchange +subscale +subscales tailedness th tibble diff --git a/man/describe_missing.Rd b/man/describe_missing.Rd index c206a23ce..6d8d92bec 100644 --- a/man/describe_missing.Rd +++ b/man/describe_missing.Rd @@ -4,41 +4,57 @@ \alias{describe_missing} \title{Describe Missing Values in Data According to Guidelines} \usage{ -describe_missing(data, vars = NULL, scales = NULL) +describe_missing(data, select = NULL, scales = NULL, ...) } \arguments{ \item{data}{The data frame to be analyzed.} -\item{vars}{Variable (or lists of variables) to check for missing values (NAs).} +\item{scales}{If you rely on composite scores such as psychological scales +or questionnaires, you can provide the shared suffix among those variables +(as a character vector). This is useful if the variables you want to check +the average of all start with the same name (e.g., \code{varx}), such as is +commonly the case for Likert scales (such as \code{varx_1}, \code{varx_2}, \code{varx_3}, +etc.).} + +\item{...}{Arguments passed down to other functions. Currently not used.} -\item{scales}{The scale names to check for missing values (as a character vector).} +\item{vars}{Variable (or lists of variables) to check for missing values (NAs).} } \value{ A dataframe with the following columns: \itemize{ -\item \code{var}: Variables selected. -\item \code{items}: Number of items for selected variables. 
-\item \code{na}: Number of missing cell values for those variables (e.g., 2 missing -values for the first participant + 2 missing values for the second participant -= total of 4 missing values). -\item \code{cells}: Total number of cells (i.e., number of participants multiplied by -the number of variables, \code{items}). -\item \code{na_percent}: The percentage of missing values (\code{na} divided by \code{cells}). -\item \code{na_max}: The number of missing values for the participant with the most +\item \code{variable}: Variables selected. +\item \code{n_columns}: Number of items for selected variables. +\item \code{n_missing}: Number of missing values for those variables (NA stands for Not +Available). +\item \code{n_cells}: Total number of cells (i.e., number of participants multiplied by +the number of columns, \code{n_columns}). +\item \code{missing_percent}: The percentage of missing values (\code{na} divided by \code{cells}). +\item \code{missing_max}: The number of missing values for the participant with the most missing values for the selected variables. -\item \code{na_max_percent}: The amount of missing values for the participant with +\item \code{missing_max_percent}: The amount of missing values for the participant with the most missing values for the selected variables, as a percentage -(i.e., \code{na_max} divided by the number of selected variables, \code{items}). -\item \code{all_na}: The number of participants missing 100\% of items for that scale +(i.e., \code{missing_max} divided by the number of selected columns, \code{n_columns}). +\item \code{all_missing}: The number of participants missing 100\% of items for that scale (the selected variables). } } \description{ Provides a detailed description of missing values in a data frame. This function reports both absolute and percentage missing values of specified -column lists or scales, following recommended guidelines. 
Some authors recommend -reporting item-level missingness per scale, as well as a participant's maximum -number of missing items by scale. For example, Parent (2013) writes: +column lists or scales, following recommended guidelines. +} +\details{ +In psychology, it is common to ask participants to answer questionnaires in +which people answer several questions about a specific topic. For example, +people could answer 10 different questions about how extroversioned they are. +In turn, researchers calculate the average for those 10 questions (called +items). These questionnaires are called (e.g., Likert) "scales" (such as the +Rosenberg Self-Esteem Scale, also known as the RSES). + +Some authors recommend reporting item-level missingness per scale, as well +as a participant's maximum number of missing items by scale. For example, +Parent (2013) writes: \emph{I recommend that authors (a) state their tolerance level for missing data by scale or subscale (e.g., "We calculated means for all subscales on which participants gave @@ -50,37 +66,35 @@ missing more than a single data point").} } \examples{ # Use the entire data frame -describe_missing(airquality) - -# Use selected columns explicitly -describe_missing(airquality, - vars = list( - c("Ozone", "Solar.R", "Wind"), - c("Temp", "Month", "Day") - ) -) - -# If the questionnaire items start with the same name, e.g., set.seed(15) fun <- function() { c(sample(c(NA, 1:10), replace = TRUE), NA, NA, NA) } df <- data.frame( ID = c("idz", NA), - open_1 = fun(), open_2 = fun(), open_3 = fun(), - extrovert_1 = fun(), extrovert_2 = fun(), extrovert_3 = fun(), - agreeable_1 = fun(), agreeable_2 = fun(), agreeable_3 = fun() + openness_1 = fun(), openness_2 = fun(), openness_3 = fun(), + extroversion_1 = fun(), extroversion_2 = fun(), extroversion_3 = fun(), + agreeableness_1 = fun(), agreeableness_2 = fun(), agreeableness_3 = fun() +) +describe_missing(df) + +# If the questionnaire items start with the same name, +# one can list the 
scale names directly: +describe_missing(df, scales = c("ID", "openness", "extroversion", "agreeableness")) + +# Otherwise you can provide nested columns manually: +describe_missing(df, + select = list( + c("ID"), + c("openness_1", "openness_2", "openness_3"), + c("extroversion_1", "extroversion_2", "extroversion_3"), + c("agreeableness_1", "agreeableness_2", "agreeableness_3") + ) ) -# One can list the scale names directly: -describe_missing(df, scales = c("ID", "open", "extrovert", "agreeable")) } \references{ Parent, M. C. (2013). Handling item-level missing data: Simpler is just as good. \emph{The Counseling Psychologist}, \emph{41}(4), 568-600. https://doi.org/10.1177\%2F0011000012445176 } -\keyword{NA} -\keyword{guidelines} -\keyword{missing} -\keyword{values} From 835b3bb6e83efa282c384f75fd90d81a399fb83a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9mi=20Th=C3=A9riault?= <13123390+rempsyc@users.noreply.github.com> Date: Mon, 16 Dec 2024 22:50:38 -0500 Subject: [PATCH 08/10] update snapshots, wordlist, lintrs, styler, note --- R/describe_missing.R | 7 +- inst/WORDLIST | 2 + man/describe_missing.Rd | 6 +- tests/testthat/_snaps/data_codebook.new.md | 705 --------------------- tests/testthat/_snaps/describe_missing.md | 56 +- 5 files changed, 52 insertions(+), 724 deletions(-) delete mode 100644 tests/testthat/_snaps/data_codebook.new.md diff --git a/R/describe_missing.R b/R/describe_missing.R index cd9c23231..8c171d58e 100644 --- a/R/describe_missing.R +++ b/R/describe_missing.R @@ -7,7 +7,7 @@ #' @details #' In psychology, it is common to ask participants to answer questionnaires in #' which people answer several questions about a specific topic. For example, -#' people could answer 10 different questions about how extroversioned they are. +#' people could answer 10 different questions about how extroverted they are. #' In turn, researchers calculate the average for those 10 questions (called #' items). 
These questionnaires are called (e.g., Likert) "scales" (such as the #' Rosenberg Self-Esteem Scale, also known as the RSES). @@ -25,7 +25,7 @@ #' missing more than a single data point").* #' #' @param data The data frame to be analyzed. -#' @param vars Variable (or lists of variables) to check for missing values (NAs). +#' @param select Variable (or lists of variables) to check for missing values (NAs). #' @param scales If you rely on composite scores such as psychological scales #' or questionnaires, you can provide the shared suffix among those variables #' (as a character vector). This is useful if the variables you want to check @@ -81,7 +81,6 @@ #' ) #' ) #' - describe_missing <- function(data, select = NULL, scales = NULL, ...) { vars <- select if (!is.null(vars) && missing(scales)) { @@ -90,7 +89,7 @@ describe_missing <- function(data, select = NULL, scales = NULL, ...) { vars.internal <- lapply(scales, function(x) { grep(paste0("^", x), names(data), value = TRUE) }) - } else if (is.null(vars) && missing(scales)){ + } else if (is.null(vars) && missing(scales)) { vars <- as.list(names(data)) } if (!is.null(vars)) { diff --git a/inst/WORDLIST b/inst/WORDLIST index 221363971..bf52882d2 100644 --- a/inst/WORDLIST +++ b/inst/WORDLIST @@ -26,6 +26,7 @@ Heisig Herrington Hoffmann Joanes +Likert Llabre Lumley MADs @@ -34,6 +35,7 @@ Minitab ORCID PSU Posteriori +RSES Ranktransform Recode Recoding diff --git a/man/describe_missing.Rd b/man/describe_missing.Rd index 6d8d92bec..2213a97ae 100644 --- a/man/describe_missing.Rd +++ b/man/describe_missing.Rd @@ -9,6 +9,8 @@ describe_missing(data, select = NULL, scales = NULL, ...) \arguments{ \item{data}{The data frame to be analyzed.} +\item{select}{Variable (or lists of variables) to check for missing values (NAs).} + \item{scales}{If you rely on composite scores such as psychological scales or questionnaires, you can provide the shared suffix among those variables (as a character vector). 
This is useful if the variables you want to check @@ -17,8 +19,6 @@ commonly the case for Likert scales (such as \code{varx_1}, \code{varx_2}, \code etc.).} \item{...}{Arguments passed down to other functions. Currently not used.} - -\item{vars}{Variable (or lists of variables) to check for missing values (NAs).} } \value{ A dataframe with the following columns: @@ -47,7 +47,7 @@ column lists or scales, following recommended guidelines. \details{ In psychology, it is common to ask participants to answer questionnaires in which people answer several questions about a specific topic. For example, -people could answer 10 different questions about how extroversioned they are. +people could answer 10 different questions about how extroverted they are. In turn, researchers calculate the average for those 10 questions (called items). These questionnaires are called (e.g., Likert) "scales" (such as the Rosenberg Self-Esteem Scale, also known as the RSES). diff --git a/tests/testthat/_snaps/data_codebook.new.md b/tests/testthat/_snaps/data_codebook.new.md deleted file mode 100644 index 2ba496ef4..000000000 --- a/tests/testthat/_snaps/data_codebook.new.md +++ /dev/null @@ -1,705 +0,0 @@ -# data_codebook iris - - Code - data_codebook(iris) - Output - iris (150 rows and 5 variables, 5 shown) - - ID | Name | Type | Missings | Values | N - ---+--------------+-------------+----------+------------+----------- - 1 | Sepal.Length | numeric | 0 (0.0%) | [4.3, 7.9] | 150 - ---+--------------+-------------+----------+------------+----------- - 2 | Sepal.Width | numeric | 0 (0.0%) | [2, 4.4] | 150 - ---+--------------+-------------+----------+------------+----------- - 3 | Petal.Length | numeric | 0 (0.0%) | [1, 6.9] | 150 - ---+--------------+-------------+----------+------------+----------- - 4 | Petal.Width | numeric | 0 (0.0%) | [0.1, 2.5] | 150 - ---+--------------+-------------+----------+------------+----------- - 5 | Species | categorical | 0 (0.0%) | setosa | 50 (33.3%) - | | | 
| versicolor | 50 (33.3%) - | | | | virginica | 50 (33.3%) - -------------------------------------------------------------------- - -# data_codebook iris, reordered - - Code - data_codebook(iris[c(1, 2, 5, 3, 4)]) - Output - iris[c(1, 2, 5, 3, 4)] (150 rows and 5 variables, 5 shown) - - ID | Name | Type | Missings | Values | N - ---+--------------+-------------+----------+------------+----------- - 1 | Sepal.Length | numeric | 0 (0.0%) | [4.3, 7.9] | 150 - ---+--------------+-------------+----------+------------+----------- - 2 | Sepal.Width | numeric | 0 (0.0%) | [2, 4.4] | 150 - ---+--------------+-------------+----------+------------+----------- - 3 | Species | categorical | 0 (0.0%) | setosa | 50 (33.3%) - | | | | versicolor | 50 (33.3%) - | | | | virginica | 50 (33.3%) - ---+--------------+-------------+----------+------------+----------- - 4 | Petal.Length | numeric | 0 (0.0%) | [1, 6.9] | 150 - ---+--------------+-------------+----------+------------+----------- - 5 | Petal.Width | numeric | 0 (0.0%) | [0.1, 2.5] | 150 - -------------------------------------------------------------------- - -# data_codebook NaN and Inf - - Code - data_codebook(d) - Output - d (9 rows and 1 variables, 1 shown) - - ID | Name | Type | Missings | Values | N - ---+------+---------+-----------+--------+---------- - 1 | x | numeric | 2 (22.2%) | 1 | 3 (42.9%) - | | | | 2 | 1 (14.3%) - | | | | 4 | 2 (28.6%) - | | | | Inf | 1 (14.3%) - ---------------------------------------------------- - ---- - - Code - data_codebook(d) - Output - d (102 rows and 1 variables, 1 shown) - - ID | Name | Type | Missings | Values | N - ---+------+---------+----------+---------+------------ - 1 | x | numeric | 0 (0.0%) | [1, 15] | 102 (98.1%) - | | | | Inf | 2 ( 1.9%) - ------------------------------------------------------ - ---- - - Code - data_codebook(d, range_at = 100) - Output - d (102 rows and 1 variables, 1 shown) - - ID | Name | Type | Missings | Values | N - 
---+------+---------+----------+--------+----------- - 1 | x | numeric | 0 (0.0%) | 1 | 4 ( 4.0%) - | | | | 2 | 5 ( 5.0%) - | | | | 3 | 6 ( 6.0%) - | | | | 4 | 5 ( 5.0%) - | | | | 5 | 8 ( 8.0%) - | | | | 6 | 10 (10.0%) - | | | | 7 | 6 ( 6.0%) - | | | | 8 | 3 ( 3.0%) - | | | | 9 | 13 (13.0%) - | | | | 10 | 7 ( 7.0%) - | | | | (...) | - ---------------------------------------------------- - ---- - - Code - data_codebook(d, range_at = 100, max_values = 4) - Output - d (102 rows and 1 variables, 1 shown) - - ID | Name | Type | Missings | Values | N - ---+------+---------+----------+--------+--------- - 1 | x | numeric | 0 (0.0%) | 1 | 4 (4.0%) - | | | | 2 | 5 (5.0%) - | | | | 3 | 6 (6.0%) - | | | | 4 | 5 (5.0%) - | | | | (...) | - -------------------------------------------------- - -# data_codebook iris, select - - Code - data_codebook(iris, select = starts_with("Sepal")) - Output - iris (150 rows and 5 variables, 2 shown) - - ID | Name | Type | Missings | Values | N - ---+--------------+---------+----------+------------+---- - 1 | Sepal.Length | numeric | 0 (0.0%) | [4.3, 7.9] | 150 - ---+--------------+---------+----------+------------+---- - 2 | Sepal.Width | numeric | 0 (0.0%) | [2, 4.4] | 150 - --------------------------------------------------------- - -# data_codebook iris, select, ID - - Code - data_codebook(iris, select = starts_with("Petal")) - Output - iris (150 rows and 5 variables, 2 shown) - - ID | Name | Type | Missings | Values | N - ---+--------------+---------+----------+------------+---- - 3 | Petal.Length | numeric | 0 (0.0%) | [1, 6.9] | 150 - ---+--------------+---------+----------+------------+---- - 4 | Petal.Width | numeric | 0 (0.0%) | [0.1, 2.5] | 150 - --------------------------------------------------------- - -# data_codebook efc - - Code - print(data_codebook(efc), table_width = Inf) - Output - efc (100 rows and 5 variables, 5 shown) - - ID | Name | Label | Type | Missings | Values | Value Labels | N - 
---+----------+------------------------------------------+-------------+------------+----------+---------------------------------+----------- - 1 | c12hour | average number of hours of care per week | numeric | 2 (2.0%) | [5, 168] | | 98 - ---+----------+------------------------------------------+-------------+------------+----------+---------------------------------+----------- - 2 | e16sex | elder's gender | numeric | 0 (0.0%) | 1 | male | 46 (46.0%) - | | | | | 2 | female | 54 (54.0%) - ---+----------+------------------------------------------+-------------+------------+----------+---------------------------------+----------- - 3 | e42dep | elder's dependency | categorical | 3 (3.0%) | 1 | independent | 2 ( 2.1%) - | | | | | 2 | slightly dependent | 4 ( 4.1%) - | | | | | 3 | moderately dependent | 28 (28.9%) - | | | | | 4 | severely dependent | 63 (64.9%) - ---+----------+------------------------------------------+-------------+------------+----------+---------------------------------+----------- - 4 | c172code | carer's level of education | numeric | 10 (10.0%) | 1 | low level of education | 8 ( 8.9%) - | | | | | 2 | intermediate level of education | 66 (73.3%) - | | | | | 3 | high level of education | 16 (17.8%) - ---+----------+------------------------------------------+-------------+------------+----------+---------------------------------+----------- - 5 | neg_c_7 | Negative impact with 7 items | numeric | 3 (3.0%) | [7, 28] | | 97 - --------------------------------------------------------------------------------------------------------------------------------------------- - ---- - - Code - print(data_codebook(efc), table_width = "auto", remove_duplicates = FALSE) - Output - efc (100 rows and 5 variables, 5 shown) - - ID | Name | Label | Type - ---+----------+------------------------------------------+------------ - 1 | c12hour | average number of hours of care per week | numeric - ---+----------+------------------------------------------+------------ - 2 | 
e16sex | elder's gender | numeric - ---+----------+------------------------------------------+------------ - ---+----------+------------------------------------------+------------ - 3 | e42dep | elder's dependency | categorical - ---+----------+------------------------------------------+------------ - ---+----------+------------------------------------------+------------ - ---+----------+------------------------------------------+------------ - ---+----------+------------------------------------------+------------ - 4 | c172code | carer's level of education | numeric - ---+----------+------------------------------------------+------------ - ---+----------+------------------------------------------+------------ - ---+----------+------------------------------------------+------------ - 5 | neg_c_7 | Negative impact with 7 items | numeric - ---------------------------------------------------------------------- - - ID | Missings | Values | Value Labels | N - ---+------------+----------+---------------------------------+----------- - 1 | 2 (2.0%) | [5, 168] | | 98 - ---+------------+----------+---------------------------------+----------- - 2 | 0 (0.0%) | 1 | male | 46 (46.0%) - | | 2 | female | 54 (54.0%) - ---+------------+----------+---------------------------------+----------- - 3 | 3 (3.0%) | 1 | independent | 2 ( 2.1%) - | | 2 | slightly dependent | 4 ( 4.1%) - | | 3 | moderately dependent | 28 (28.9%) - | | 4 | severely dependent | 63 (64.9%) - ---+------------+----------+---------------------------------+----------- - 4 | 10 (10.0%) | 1 | low level of education | 8 ( 8.9%) - | | 2 | intermediate level of education | 66 (73.3%) - | | 3 | high level of education | 16 (17.8%) - ---+------------+----------+---------------------------------+----------- - 5 | 3 (3.0%) | [7, 28] | | 97 - ------------------------------------------------------------------------- - ---- - - Code - print(data_codebook(efc), table_width = "auto", remove_duplicates = TRUE) - Output - efc 
(100 rows and 5 variables, 5 shown) - - ID | Name | Label | Type - ---+----------+------------------------------------------+------------ - 1 | c12hour | average number of hours of care per week | numeric - ---+----------+------------------------------------------+------------ - 2 | e16sex | elder's gender | numeric - ---+----------+------------------------------------------+------------ - ---+----------+------------------------------------------+------------ - 3 | e42dep | elder's dependency | categorical - ---+----------+------------------------------------------+------------ - ---+----------+------------------------------------------+------------ - ---+----------+------------------------------------------+------------ - ---+----------+------------------------------------------+------------ - 4 | c172code | carer's level of education | numeric - ---+----------+------------------------------------------+------------ - ---+----------+------------------------------------------+------------ - ---+----------+------------------------------------------+------------ - 5 | neg_c_7 | Negative impact with 7 items | numeric - ---------------------------------------------------------------------- - - ID | Missings | Values | Value Labels | N - ---+------------+----------+---------------------------------+----------- - 1 | 2 (2.0%) | [5, 168] | | 98 - ---+------------+----------+---------------------------------+----------- - 2 | 0 (0.0%) | 1 | male | 46 (46.0%) - | | 2 | female | 54 (54.0%) - ---+------------+----------+---------------------------------+----------- - 3 | 3 (3.0%) | 1 | independent | 2 ( 2.1%) - | | 2 | slightly dependent | 4 ( 4.1%) - | | 3 | moderately dependent | 28 (28.9%) - | | 4 | severely dependent | 63 (64.9%) - ---+------------+----------+---------------------------------+----------- - 4 | 10 (10.0%) | 1 | low level of education | 8 ( 8.9%) - | | 2 | intermediate level of education | 66 (73.3%) - | | 3 | high level of education | 16 (17.8%) - 
---+------------+----------+---------------------------------+----------- - 5 | 3 (3.0%) | [7, 28] | | 97 - ------------------------------------------------------------------------- - -# data_codebook efc, variable_label_width - - Code - print(out, table_width = Inf) - Output - efc (100 rows and 5 variables, 5 shown) - - ID | Name | Label | Type | Missings | Values | Value Labels | N - ---+----------+------------------------------+-------------+------------+----------+---------------------------------+----------- - 1 | c12hour | average number of hours of | numeric | 2 (2.0%) | [5, 168] | | 98 - | | care per week | | | | | - ---+----------+------------------------------+-------------+------------+----------+---------------------------------+----------- - 2 | e16sex | elder's gender | numeric | 0 (0.0%) | 1 | male | 46 (46.0%) - | | | | | 2 | female | 54 (54.0%) - ---+----------+------------------------------+-------------+------------+----------+---------------------------------+----------- - 3 | e42dep | elder's dependency | categorical | 3 (3.0%) | 1 | independent | 2 ( 2.1%) - | | | | | 2 | slightly dependent | 4 ( 4.1%) - | | | | | 3 | moderately dependent | 28 (28.9%) - | | | | | 4 | severely dependent | 63 (64.9%) - ---+----------+------------------------------+-------------+------------+----------+---------------------------------+----------- - 4 | c172code | carer's level of education | numeric | 10 (10.0%) | 1 | low level of education | 8 ( 8.9%) - | | | | | 2 | intermediate level of education | 66 (73.3%) - | | | | | 3 | high level of education | 16 (17.8%) - ---+----------+------------------------------+-------------+------------+----------+---------------------------------+----------- - 5 | neg_c_7 | Negative impact with 7 items | numeric | 3 (3.0%) | [7, 28] | | 97 - --------------------------------------------------------------------------------------------------------------------------------- - ---- - - Code - print(out, table_width = "auto", 
remove_duplicates = FALSE) - Output - efc (100 rows and 5 variables, 5 shown) - - ID | Name | Label | Type | Missings - ---+----------+------------------------------+-------------+----------- - 1 | c12hour | average number of hours of | numeric | 2 (2.0%) - | | care per week | | - ---+----------+------------------------------+-------------+----------- - 2 | e16sex | elder's gender | numeric | 0 (0.0%) - ---+----------+------------------------------+-------------+----------- - ---+----------+------------------------------+-------------+----------- - 3 | e42dep | elder's dependency | categorical | 3 (3.0%) - ---+----------+------------------------------+-------------+----------- - ---+----------+------------------------------+-------------+----------- - ---+----------+------------------------------+-------------+----------- - ---+----------+------------------------------+-------------+----------- - 4 | c172code | carer's level of education | numeric | 10 (10.0%) - ---+----------+------------------------------+-------------+----------- - ---+----------+------------------------------+-------------+----------- - ---+----------+------------------------------+-------------+----------- - 5 | neg_c_7 | Negative impact with 7 items | numeric | 3 (3.0%) - ----------------------------------------------------------------------- - - ID | Values | Value Labels | N - ---+----------+---------------------------------+----------- - 1 | [5, 168] | | 98 - ---+----------+---------------------------------+----------- - ---+----------+---------------------------------+----------- - 2 | 1 | male | 46 (46.0%) - | 2 | female | 54 (54.0%) - ---+----------+---------------------------------+----------- - 3 | 1 | independent | 2 ( 2.1%) - | 2 | slightly dependent | 4 ( 4.1%) - | 3 | moderately dependent | 28 (28.9%) - | 4 | severely dependent | 63 (64.9%) - ---+----------+---------------------------------+----------- - 4 | 1 | low level of education | 8 ( 8.9%) - | 2 | intermediate level of 
education | 66 (73.3%) - | 3 | high level of education | 16 (17.8%) - ---+----------+---------------------------------+----------- - 5 | [7, 28] | | 97 - ------------------------------------------------------------ - ---- - - Code - print(out, table_width = "auto", remove_duplicates = TRUE) - Output - efc (100 rows and 5 variables, 5 shown) - - ID | Name | Label | Type | Missings - ---+----------+------------------------------+-------------+----------- - 1 | c12hour | average number of hours of | numeric | 2 (2.0%) - | | care per week | | - ---+----------+------------------------------+-------------+----------- - 2 | e16sex | elder's gender | numeric | 0 (0.0%) - ---+----------+------------------------------+-------------+----------- - ---+----------+------------------------------+-------------+----------- - 3 | e42dep | elder's dependency | categorical | 3 (3.0%) - ---+----------+------------------------------+-------------+----------- - ---+----------+------------------------------+-------------+----------- - ---+----------+------------------------------+-------------+----------- - ---+----------+------------------------------+-------------+----------- - 4 | c172code | carer's level of education | numeric | 10 (10.0%) - ---+----------+------------------------------+-------------+----------- - ---+----------+------------------------------+-------------+----------- - ---+----------+------------------------------+-------------+----------- - 5 | neg_c_7 | Negative impact with 7 items | numeric | 3 (3.0%) - ----------------------------------------------------------------------- - - ID | Values | Value Labels | N - ---+----------+---------------------------------+----------- - 1 | [5, 168] | | 98 - ---+----------+---------------------------------+----------- - ---+----------+---------------------------------+----------- - 2 | 1 | male | 46 (46.0%) - | 2 | female | 54 (54.0%) - ---+----------+---------------------------------+----------- - 3 | 1 | independent | 2 ( 
2.1%) - | 2 | slightly dependent | 4 ( 4.1%) - | 3 | moderately dependent | 28 (28.9%) - | 4 | severely dependent | 63 (64.9%) - ---+----------+---------------------------------+----------- - 4 | 1 | low level of education | 8 ( 8.9%) - | 2 | intermediate level of education | 66 (73.3%) - | 3 | high level of education | 16 (17.8%) - ---+----------+---------------------------------+----------- - 5 | [7, 28] | | 97 - ------------------------------------------------------------ - -# data_codebook efc, value_label_width - - Code - print(out, table_width = Inf) - Output - efc (100 rows and 5 variables, 5 shown) - - ID | Name | Label | Type | Missings | Values | Value Labels | N - ---+----------+------------------------------+-------------+------------+----------+------------------+----------- - 1 | c12hour | average number of hours of | numeric | 2 (2.0%) | [5, 168] | | 98 - | | care per week | | | | | - ---+----------+------------------------------+-------------+------------+----------+------------------+----------- - 2 | e16sex | elder's gender | numeric | 0 (0.0%) | 1 | male | 46 (46.0%) - | | | | | 2 | female | 54 (54.0%) - ---+----------+------------------------------+-------------+------------+----------+------------------+----------- - 3 | e42dep | elder's dependency | categorical | 3 (3.0%) | 1 | independent | 2 ( 2.1%) - | | | | | 2 | slightly... | 4 ( 4.1%) - | | | | | 3 | moderately... | 28 (28.9%) - | | | | | 4 | severely... | 63 (64.9%) - ---+----------+------------------------------+-------------+------------+----------+------------------+----------- - 4 | c172code | carer's level of education | numeric | 10 (10.0%) | 1 | low level of... | 8 ( 8.9%) - | | | | | 2 | intermediate... | 66 (73.3%) - | | | | | 3 | high level of... 
| 16 (17.8%) - ---+----------+------------------------------+-------------+------------+----------+------------------+----------- - 5 | neg_c_7 | Negative impact with 7 items | numeric | 3 (3.0%) | [7, 28] | | 97 - ------------------------------------------------------------------------------------------------------------------ - ---- - - Code - print(out, table_width = "auto", remove_duplicates = FALSE) - Output - efc (100 rows and 5 variables, 5 shown) - - ID | Name | Label | Type | Missings - ---+----------+------------------------------+-------------+----------- - 1 | c12hour | average number of hours of | numeric | 2 (2.0%) - | | care per week | | - ---+----------+------------------------------+-------------+----------- - 2 | e16sex | elder's gender | numeric | 0 (0.0%) - ---+----------+------------------------------+-------------+----------- - ---+----------+------------------------------+-------------+----------- - 3 | e42dep | elder's dependency | categorical | 3 (3.0%) - ---+----------+------------------------------+-------------+----------- - ---+----------+------------------------------+-------------+----------- - ---+----------+------------------------------+-------------+----------- - ---+----------+------------------------------+-------------+----------- - 4 | c172code | carer's level of education | numeric | 10 (10.0%) - ---+----------+------------------------------+-------------+----------- - ---+----------+------------------------------+-------------+----------- - ---+----------+------------------------------+-------------+----------- - 5 | neg_c_7 | Negative impact with 7 items | numeric | 3 (3.0%) - ----------------------------------------------------------------------- - - ID | Values | Value Labels | N - ---+----------+------------------+----------- - 1 | [5, 168] | | 98 - ---+----------+------------------+----------- - ---+----------+------------------+----------- - 2 | 1 | male | 46 (46.0%) - | 2 | female | 54 (54.0%) - 
---+----------+------------------+----------- - 3 | 1 | independent | 2 ( 2.1%) - | 2 | slightly... | 4 ( 4.1%) - | 3 | moderately... | 28 (28.9%) - | 4 | severely... | 63 (64.9%) - ---+----------+------------------+----------- - 4 | 1 | low level of... | 8 ( 8.9%) - | 2 | intermediate... | 66 (73.3%) - | 3 | high level of... | 16 (17.8%) - ---+----------+------------------+----------- - 5 | [7, 28] | | 97 - --------------------------------------------- - ---- - - Code - print(out, table_width = "auto", remove_duplicates = TRUE) - Output - efc (100 rows and 5 variables, 5 shown) - - ID | Name | Label | Type | Missings - ---+----------+------------------------------+-------------+----------- - 1 | c12hour | average number of hours of | numeric | 2 (2.0%) - | | care per week | | - ---+----------+------------------------------+-------------+----------- - 2 | e16sex | elder's gender | numeric | 0 (0.0%) - ---+----------+------------------------------+-------------+----------- - ---+----------+------------------------------+-------------+----------- - 3 | e42dep | elder's dependency | categorical | 3 (3.0%) - ---+----------+------------------------------+-------------+----------- - ---+----------+------------------------------+-------------+----------- - ---+----------+------------------------------+-------------+----------- - ---+----------+------------------------------+-------------+----------- - 4 | c172code | carer's level of education | numeric | 10 (10.0%) - ---+----------+------------------------------+-------------+----------- - ---+----------+------------------------------+-------------+----------- - ---+----------+------------------------------+-------------+----------- - 5 | neg_c_7 | Negative impact with 7 items | numeric | 3 (3.0%) - ----------------------------------------------------------------------- - - ID | Values | Value Labels | N - ---+----------+------------------+----------- - 1 | [5, 168] | | 98 - ---+----------+------------------+----------- - 
---+----------+------------------+----------- - 2 | 1 | male | 46 (46.0%) - | 2 | female | 54 (54.0%) - ---+----------+------------------+----------- - 3 | 1 | independent | 2 ( 2.1%) - | 2 | slightly... | 4 ( 4.1%) - | 3 | moderately... | 28 (28.9%) - | 4 | severely... | 63 (64.9%) - ---+----------+------------------+----------- - 4 | 1 | low level of... | 8 ( 8.9%) - | 2 | intermediate... | 66 (73.3%) - | 3 | high level of... | 16 (17.8%) - ---+----------+------------------+----------- - 5 | [7, 28] | | 97 - --------------------------------------------- - -# data_codebook truncated data - - Code - data_codebook(d, max_values = 5) - Output - d (100 rows and 2 variables, 2 shown) - - ID | Name | Type | Missings | Values | N - ---+------+-----------+----------+---------+--------- - 1 | a | integer | 0 (0.0%) | [1, 15] | 100 - ---+------+-----------+----------+---------+--------- - 2 | b | character | 0 (0.0%) | a | 4 (4.0%) - | | | | b | 3 (3.0%) - | | | | c | 5 (5.0%) - | | | | d | 4 (4.0%) - | | | | e | 3 (3.0%) - | | | | (...) 
| - ----------------------------------------------------- - -# data_codebook mixed numeric lengths - - Code - data_codebook(d) - Output - d (100 rows and 2 variables, 2 shown) - - ID | Name | Type | Missings | Values | N - ---+------+---------+----------+---------+----------- - 1 | a | integer | 0 (0.0%) | 1 | 28 (28.0%) - | | | | 2 | 26 (26.0%) - | | | | 3 | 29 (29.0%) - | | | | 4 | 17 (17.0%) - ---+------+---------+----------+---------+----------- - 2 | b | integer | 0 (0.0%) | [5, 15] | 100 - ----------------------------------------------------- - -# data_codebook mixed range_at - - Code - data_codebook(d, range_at = 3) - Output - d (100 rows and 2 variables, 2 shown) - - ID | Name | Type | Missings | Values | N - ---+------+---------+----------+---------+---- - 1 | a | integer | 0 (0.0%) | [1, 4] | 100 - ---+------+---------+----------+---------+---- - 2 | b | integer | 0 (0.0%) | [5, 15] | 100 - ---------------------------------------------- - -# data_codebook logicals - - Code - data_codebook(d) - Output - d (100 rows and 3 variables, 3 shown) - - ID | Name | Type | Missings | Values | N - ---+------+-----------+----------+---------+----------- - 1 | a | integer | 0 (0.0%) | [1, 15] | 100 - ---+------+-----------+----------+---------+----------- - 2 | b | character | 0 (0.0%) | a | 26 (26.0%) - | | | | b | 38 (38.0%) - | | | | c | 36 (36.0%) - ---+------+-----------+----------+---------+----------- - 3 | c | logical | 0 (0.0%) | FALSE | 42 (42.0%) - | | | | TRUE | 58 (58.0%) - ------------------------------------------------------- - -# data_codebook labelled data exceptions - - Code - data_codebook(d) - Output - d (100 rows and 3 variables, 3 shown) - - ID | Name | Type | Missings | Values | Value Labels | N - ---+------+---------+------------+--------+--------------+----------- - 1 | f1 | integer | 17 (17.0%) | 1 | One | 21 (25.3%) - | | | | 2 | Two | 20 (24.1%) - | | | | 3 | Three | 23 (27.7%) - | | | | 5 | Five | 19 (22.9%) - 
---+------+---------+------------+--------+--------------+----------- - 2 | f2 | integer | 0 (0.0%) | 1 | One | 25 (25.0%) - | | | | 2 | Two | 20 (20.0%) - | | | | 3 | Three | 14 (14.0%) - | | | | 4 | 4 | 17 (17.0%) - | | | | 5 | Five | 24 (24.0%) - ---+------+---------+------------+--------+--------------+----------- - 3 | f3 | integer | 0 (0.0%) | 1 | One | 21 (21.0%) - | | | | 2 | Two | 24 (24.0%) - | | | | 3 | Three | 16 (16.0%) - | | | | 4 | Four | 14 (14.0%) - | | | | 5 | Five | 25 (25.0%) - --------------------------------------------------------------------- - -# data_codebook labelled data factors - - Code - data_codebook(d) - Output - d (100 rows and 3 variables, 3 shown) - - ID | Name | Type | Missings | Values | Value Labels | N - ---+------+-------------+----------+--------+--------------+----------- - 1 | f1 | categorical | 0 (0.0%) | a | A | 35 (35.0%) - | | | | b | Bee | 32 (32.0%) - | | | | c | Cee | 33 (33.0%) - ---+------+-------------+----------+--------+--------------+----------- - 2 | f2 | categorical | 0 (0.0%) | a | A | 30 (30.0%) - | | | | b | Bee | 38 (38.0%) - | | | | c | Cee | 32 (32.0%) - ---+------+-------------+----------+--------+--------------+----------- - 3 | f3 | categorical | 0 (0.0%) | a | A | 23 (23.0%) - | | | | b | Bee | 28 (28.0%) - | | | | c | Cee | 49 (49.0%) - ----------------------------------------------------------------------- - -# data_codebook works with numbers < 1 - - Code - data_codebook(d) - Output - d (6 rows and 2 variables, 2 shown) - - ID | Name | Type | Missings | Values | N - ---+------+---------+----------+--------+---------- - 1 | a | numeric | 0 (0.0%) | 1 | 2 (33.3%) - | | | | 2 | 2 (33.3%) - | | | | 3 | 2 (33.3%) - ---+------+---------+----------+--------+---------- - 2 | b | numeric | 0 (0.0%) | 0 | 3 (50.0%) - | | | | 1 | 2 (33.3%) - | | | | 2 | 1 (16.7%) - --------------------------------------------------- - -# data_codebook, big marks - - Code - data_codebook(d) - Output - d (1,000,000 rows and 
2 variables, 2 shown) - - ID | Name | Type | Missings | Values | N - ---+------+-------------+----------+--------+---------------- - 1 | f1 | categorical | 0 (0.0%) | a | 333,238 (33.3%) - | | | | b | 332,910 (33.3%) - | | | | c | 333,852 (33.4%) - ---+------+-------------+----------+--------+---------------- - 2 | f2 | categorical | 0 (0.0%) | 1 | 333,285 (33.3%) - | | | | 2 | 333,358 (33.3%) - | | | | 3 | 333,357 (33.3%) - ------------------------------------------------------------- - -# data_codebook, tagged NA - - Code - data_codebook(data.frame(x)) - Output - data.frame(x) (26 rows and 1 variables, 1 shown) - - ID | Name | Type | Missings | Values | Value Labels | N - ---+------+---------+------------+--------+--------------+---------- - 1 | x | numeric | 12 (46.2%) | 1 | Agreement | 4 (15.4%) - | | | | 2 | 2 | 4 (15.4%) - | | | | 3 | 3 | 4 (15.4%) - | | | | 4 | Disagreement | 2 ( 7.7%) - | | | | NA(a) | Refused | 4 (15.4%) - | | | | NA(c) | First | 5 (19.2%) - | | | | NA(z) | Not home | 3 (11.5%) - -------------------------------------------------------------------- - ---- - - Code - data_codebook(data.frame(x)) - Output - data.frame(x) (23 rows and 1 variables, 1 shown) - - ID | Name | Type | Missings | Values | Value Labels | N - ---+------+---------+-----------+--------+--------------+---------- - 1 | x | numeric | 9 (39.1%) | 1 | Agreement | 4 (17.4%) - | | | | 2 | 2 | 4 (17.4%) - | | | | 3 | 3 | 4 (17.4%) - | | | | 4 | Disagreement | 2 ( 8.7%) - | | | | NA(a) | Refused | 4 (17.4%) - | | | | NA(c) | First | 5 (21.7%) - ------------------------------------------------------------------- - -# data_codebook, negative label values #334 - - Code - data_codebook(data.frame(x1, x2)) - Output - data.frame(x1, x2) (4 rows and 2 variables, 2 shown) - - ID | Name | Type | Missings | Values | Value Labels | N - ---+------+---------+----------+--------+--------------+---------- - 1 | x1 | integer | 0 (0.0%) | 1 | Agreement | 1 (25.0%) - | | | | 2 | 2 | 1 (25.0%) - | 
| | | 3 | 3 | 1 (25.0%) - | | | | 4 | Disagreement | 1 (25.0%) - ---+------+---------+----------+--------+--------------+---------- - 2 | x2 | numeric | 0 (0.0%) | -9 | Missing | 1 (25.0%) - | | | | 1 | Agreement | 1 (25.0%) - | | | | 2 | 2 | 1 (25.0%) - | | | | 3 | 3 | 1 (25.0%) - ------------------------------------------------------------------ - diff --git a/tests/testthat/_snaps/describe_missing.md b/tests/testthat/_snaps/describe_missing.md index 14d910bf0..f86ae4b92 100644 --- a/tests/testthat/_snaps/describe_missing.md +++ b/tests/testthat/_snaps/describe_missing.md @@ -3,8 +3,22 @@ Code describe_missing(airquality) Output - var items na cells na_percent na_max na_max_percent all_na - 1 Ozone:Day 6 44 918 4.79 2 33.33 0 + variable n_columns n_missing n_cells missing_percent complete_percent + 1 Ozone 1 37 153 24.18 75.82 + 2 Solar.R 1 7 153 4.58 95.42 + 3 Wind 1 0 153 0.00 100.00 + 4 Temp 1 0 153 0.00 100.00 + 5 Month 1 0 153 0.00 100.00 + 6 Day 1 0 153 0.00 100.00 + 7 Total 6 44 918 4.79 95.21 + missing_max missing_max_percent all_missing + 1 1 100.00 37 + 2 1 100.00 7 + 3 0 0.00 0 + 4 0 0.00 0 + 5 0 0.00 0 + 6 0 0.00 0 + 7 2 33.33 0 --- @@ -12,20 +26,38 @@ describe_missing(airquality, vars = list(c("Ozone", "Solar.R", "Wind"), c( "Temp", "Month", "Day"))) Output - var items na cells na_percent na_max na_max_percent all_na - 1 Ozone:Wind 3 44 459 9.59 2 66.67 0 - 2 Temp:Day 3 0 459 0.00 0 0.00 0 - 3 Total 6 44 918 4.79 2 33.33 0 + variable n_columns n_missing n_cells missing_percent complete_percent + 1 Ozone 1 37 153 24.18 75.82 + 2 Solar.R 1 7 153 4.58 95.42 + 3 Wind 1 0 153 0.00 100.00 + 4 Temp 1 0 153 0.00 100.00 + 5 Month 1 0 153 0.00 100.00 + 6 Day 1 0 153 0.00 100.00 + 7 Total 6 44 918 4.79 95.21 + missing_max missing_max_percent all_missing + 1 1 100.00 37 + 2 1 100.00 7 + 3 0 0.00 0 + 4 0 0.00 0 + 5 0 0.00 0 + 6 0 0.00 0 + 7 2 33.33 0 --- Code describe_missing(df, scales = c("ID", "scale1", "scale2", "scale3")) Output - var items na cells 
na_percent na_max na_max_percent all_na - 1 ID:ID 1 7 14 50.00 1 100 7 - 2 scale1_Q1:scale1_Q3 3 11 42 26.19 3 100 3 - 3 scale2_Q1:scale2_Q3 3 17 42 40.48 3 100 3 - 4 scale3_Q1:scale3_Q3 3 10 42 23.81 3 100 3 - 5 Total 10 45 140 32.14 10 100 2 + variable n_columns n_missing n_cells missing_percent + 1 ID 1 7 14 50.00 + 2 scale1_Q1:scale1_Q3 3 11 42 26.19 + 3 scale2_Q1:scale2_Q3 3 17 42 40.48 + 4 scale3_Q1:scale3_Q3 3 10 42 23.81 + 5 Total 10 45 140 32.14 + complete_percent missing_max missing_max_percent all_missing + 1 50.00 1 100 7 + 2 73.81 3 100 3 + 3 59.52 3 100 3 + 4 76.19 3 100 3 + 5 67.86 10 100 2 From e8d393d066f1d9383b71bf77d7e8c1d91d55a4c7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9mi=20Th=C3=A9riault?= <13123390+rempsyc@users.noreply.github.com> Date: Thu, 19 Dec 2024 14:38:56 -0500 Subject: [PATCH 09/10] rework describe_missing --- DESCRIPTION | 2 +- NEWS.md | 2 +- R/describe_missing.R | 176 +++++++++------------- inst/WORDLIST | 4 - man/describe_missing.Rd | 154 +++++++++++-------- tests/testthat/_snaps/describe_missing.md | 94 ++++++------ tests/testthat/test-describe_missing.R | 45 ++++-- 7 files changed, 244 insertions(+), 233 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 68cfb6741..034c823ed 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Type: Package Package: datawizard Title: Easy Data Wrangling and Statistical Transformations -Version: 0.13.0.19 +Version: 0.13.0.20 Authors@R: c( person("Indrajeet", "Patil", , "patilindrajeet.science@gmail.com", role = "aut", comment = c(ORCID = "0000-0003-1995-6531")), diff --git a/NEWS.md b/NEWS.md index b84e403f6..0e9dc28a9 100644 --- a/NEWS.md +++ b/NEWS.md @@ -18,7 +18,7 @@ BREAKING CHANGES AND DEPRECATIONS NEW FUNCTIONS -* `describe_missing()`, to comprehensively report on missing values in a data frame. +* `describe_missing()`, to report on missing values in a data frame. 
CHANGES diff --git a/R/describe_missing.R b/R/describe_missing.R index 8c171d58e..8d215e366 100644 --- a/R/describe_missing.R +++ b/R/describe_missing.R @@ -2,59 +2,27 @@ #' #' @description Provides a detailed description of missing values in a data frame. #' This function reports both absolute and percentage missing values of specified -#' column lists or scales, following recommended guidelines. +#' variables. #' -#' @details -#' In psychology, it is common to ask participants to answer questionnaires in -#' which people answer several questions about a specific topic. For example, -#' people could answer 10 different questions about how extroverted they are. -#' In turn, researchers calculate the average for those 10 questions (called -#' items). These questionnaires are called (e.g., Likert) "scales" (such as the -#' Rosenberg Self-Esteem Scale, also known as the RSES). -#' -#' Some authors recommend reporting item-level missingness per scale, as well -#' as a participant's maximum number of missing items by scale. For example, -#' Parent (2013) writes: -#' -#' *I recommend that authors (a) state their tolerance level for missing data by scale -#' or subscale (e.g., "We calculated means for all subscales on which participants gave -#' at least 75% complete data") and then (b) report the individual missingness rates -#' by scale per data point (i.e., the number of missing values out of all data points -#' on that scale for all participants) and the maximum by participant (e.g., "For Attachment -#' Anxiety, a total of 4 missing data points out of 100 were observed, with no participant -#' missing more than a single data point").* -#' -#' @param data The data frame to be analyzed. -#' @param select Variable (or lists of variables) to check for missing values (NAs). -#' @param scales If you rely on composite scores such as psychological scales -#' or questionnaires, you can provide the shared suffix among those variables -#' (as a character vector). 
This is useful if the variables you want to check -#' the average of all start with the same name (e.g., `varx`), such as is -#' commonly the case for Likert scales (such as `varx_1`, `varx_2`, `varx_3`, -#' etc.). +#' @inheritParams extract_column_names +#' @param by Optional character string, indicating the names of one or more +#' variables in the data frame. If supplied, the data will be split by these +#' variables and summary statistics will be computed for each group. Useful +#' for survey data by first reshaping the data to the long format. +#' @param sort Logical. Whether to sort the result from highest to lowest +#' percentage of missing data. #' @return A dataframe with the following columns: #' - `variable`: Variables selected. -#' - `n_columns`: Number of items for selected variables. -#' - `n_missing`: Number of missing values for those variables (NA stands for Not -#' Available). -#' - `n_cells`: Total number of cells (i.e., number of participants multiplied by -#' the number of columns, `n_columns`). -#' - `missing_percent`: The percentage of missing values (`na` divided by `cells`). -#' - `missing_max`: The number of missing values for the participant with the most -#' missing values for the selected variables. -#' - `missing_max_percent`: The amount of missing values for the participant with -#' the most missing values for the selected variables, as a percentage -#' (i.e., `missing_max` divided by the number of selected columns, `n_columns`). -#' - `all_missing`: The number of participants missing 100% of items for that scale -#' (the selected variables). +#' - `n_missing`: Number of missing values. +#' - `missing_percent`: Percentage of missing values. +#' - `complete_percent`: Percentage of non-missing values. #' @param ... Arguments passed down to other functions. Currently not used. #' #' @export -#' @references Parent, M. C. (2013). Handling item-level missing -#' data: Simpler is just as good. 
*The Counseling Psychologist*, -#' *41*(4), 568-600. https://doi.org/10.1177%2F0011000012445176 #' @examples -#' # Use the entire data frame +#' describe_missing(airquality) +#' +#' # Survey data #' set.seed(15) #' fun <- function() { #' c(sample(c(NA, 1:10), replace = TRUE), NA, NA, NA) @@ -65,74 +33,78 @@ #' extroversion_1 = fun(), extroversion_2 = fun(), extroversion_3 = fun(), #' agreeableness_1 = fun(), agreeableness_2 = fun(), agreeableness_3 = fun() #' ) -#' describe_missing(df) #' -#' # If the questionnaire items start with the same name, -#' # one can list the scale names directly: -#' describe_missing(df, scales = c("ID", "openness", "extroversion", "agreeableness")) +#' df_long <- reshape_longer( +#' df, +#' select = -1, +#' names_sep = "_", +#' names_to = c("dimension", "item")) #' -#' # Otherwise you can provide nested columns manually: -#' describe_missing(df, -#' select = list( -#' c("ID"), -#' c("openness_1", "openness_2", "openness_3"), -#' c("extroversion_1", "extroversion_2", "extroversion_3"), -#' c("agreeableness_1", "agreeableness_2", "agreeableness_3") -#' ) -#' ) +#' describe_missing( +#' df_long, +#' select = -c(1, 3), +#' by = "dimension") #' -describe_missing <- function(data, select = NULL, scales = NULL, ...) { - vars <- select - if (!is.null(vars) && missing(scales)) { - vars.internal <- names(data) - } else if (!missing(scales)) { - vars.internal <- lapply(scales, function(x) { - grep(paste0("^", x), names(data), value = TRUE) - }) - } else if (is.null(vars) && missing(scales)) { - vars <- as.list(names(data)) - } - if (!is.null(vars)) { - vars.internal <- vars +describe_missing <- function(data, + select = NULL, + exclude = NULL, + ignore_case = FALSE, + regex = FALSE, + verbose = TRUE, + by = NULL, + sort = FALSE, + ...) { + if (!is.null(select) || !is.null(exclude)) { + data <- data_select( + data = data, + select = select, + exclude = exclude, + ignore_case = ignore_case, + regex = regex, + verbose = verbose, + ... 
+ ) } - if (!is.list(vars.internal)) { - vars.internal <- list(vars.internal) - } - na_df <- .describe_missing(data) - if (!is.null(vars) || !missing(scales)) { - na_list <- lapply(vars.internal, function(x) { + if (!is.null(by)) { + if (!by %in% names(data)) { + stop("The 'by' column does not exist in the data.") + } + grouped_data <- split(data, data[[by]]) + na_list <- lapply(names(grouped_data), function(group_name) { + group <- grouped_data[[group_name]] + # Identify columns to analyze (exclude the 'by' column) + cols_to_analyze <- setdiff(names(group), by) + group_na_list <- lapply(cols_to_analyze, function(x) { + data_subset <- group[, x, drop = FALSE] + .describe_missing(data_subset) + }) + group_na_df <- do.call(rbind, group_na_list) + group_na_df$variable <- group_name + group_na_df + }) + } else { + na_list <- lapply(names(data), function(x) { data_subset <- data[, x, drop = FALSE] .describe_missing(data_subset) }) - na_df$variable <- "Total" - na_df <- do.call(rbind, c(na_list, list(na_df))) } + na_df <- do.call(rbind, na_list) + if (isTRUE(sort)) { + na_df <- na_df[order(-na_df$missing_percent), ] + } + na_df_tot <- .describe_missing(data) + na_df_tot$variable <- "Total" + na_df <- rbind(na_df, na_df_tot) na_df } .describe_missing <- function(data) { - if (ncol(data) > 1) { - my_var <- paste0(names(data)[1], ":", names(data)[ncol(data)]) - } else { - my_var <- names(data) - } - items <- ncol(data) - na <- sum(is.na(data)) - cells <- nrow(data) * ncol(data) - na_percent <- round(na / cells * 100, 2) - na_max <- max(rowSums(is.na(data))) - na_max_percent <- round(na_max / items * 100, 2) - all_na <- sum(apply(data, 1, function(x) all(is.na(x)))) - + n_missing <- sum(is.na(data)) + missing_percent <- round(n_missing / (nrow(data) * ncol(data)) * 100, 2) data.frame( - variable = my_var, - n_columns = items, - n_missing = na, - n_cells = cells, - missing_percent = na_percent, - complete_percent = 100 - na_percent, - missing_max = na_max, - 
missing_max_percent = na_max_percent, - all_missing = all_na + variable = names(data)[1], + n_missing = n_missing, + missing_percent = missing_percent, + complete_percent = 100 - missing_percent ) } diff --git a/inst/WORDLIST b/inst/WORDLIST index bf52882d2..bbafb3bd2 100644 --- a/inst/WORDLIST +++ b/inst/WORDLIST @@ -26,7 +26,6 @@ Heisig Herrington Hoffmann Joanes -Likert Llabre Lumley MADs @@ -35,7 +34,6 @@ Minitab ORCID PSU Posteriori -RSES Ranktransform Recode Recoding @@ -112,8 +110,6 @@ rio rowid sd stackexchange -subscale -subscales tailedness th tibble diff --git a/man/describe_missing.Rd b/man/describe_missing.Rd index 2213a97ae..bda13d11c 100644 --- a/man/describe_missing.Rd +++ b/man/describe_missing.Rd @@ -4,19 +4,86 @@ \alias{describe_missing} \title{Describe Missing Values in Data According to Guidelines} \usage{ -describe_missing(data, select = NULL, scales = NULL, ...) +describe_missing( + data, + select = NULL, + exclude = NULL, + ignore_case = FALSE, + regex = FALSE, + verbose = TRUE, + by = NULL, + sort = FALSE, + ... +) } \arguments{ -\item{data}{The data frame to be analyzed.} +\item{data}{A data frame.} + +\item{select}{Variables that will be included when performing the required +tasks. Can be either +\itemize{ +\item a variable specified as a literal variable name (e.g., \code{column_name}), +\item a string with the variable name (e.g., \code{"column_name"}), a character +vector of variable names (e.g., \code{c("col1", "col2", "col3")}), or a +character vector of variable names including ranges specified via \code{:} +(e.g., \code{c("col1:col3", "col5")}), +\item for some functions, like \code{data_select()} or \code{data_rename()}, \code{select} can +be a named character vector. In this case, the names are used to rename +the columns in the output data frame. See 'Details' in the related +functions to see where this option applies. 
+\item a formula with variable names (e.g., \code{~column_1 + column_2}), +\item a vector of positive integers, giving the positions counting from the left +(e.g. \code{1} or \code{c(1, 3, 5)}), +\item a vector of negative integers, giving the positions counting from the +right (e.g., \code{-1} or \code{-1:-3}), +\item one of the following select-helpers: \code{starts_with()}, \code{ends_with()}, +\code{contains()}, a range using \code{:}, or \code{regex()}. \code{starts_with()}, +\code{ends_with()}, and \code{contains()} accept several patterns, e.g +\code{starts_with("Sep", "Petal")}. \code{regex()} can be used to define regular +expression patterns. +\item a function testing for logical conditions, e.g. \code{is.numeric()} (or +\code{is.numeric}), or any user-defined function that selects the variables +for which the function returns \code{TRUE} (like: \code{foo <- function(x) mean(x) > 3}), +\item ranges specified via literal variable names, select-helpers (except +\code{regex()}) and (user-defined) functions can be negated, i.e. return +non-matching elements, when prefixed with a \code{-}, e.g. \code{-ends_with()}, +\code{-is.numeric} or \code{-(Sepal.Width:Petal.Length)}. \strong{Note:} Negation means +that matches are \emph{excluded}, and thus, the \code{exclude} argument can be +used alternatively. For instance, \code{select=-ends_with("Length")} (with +\code{-}) is equivalent to \code{exclude=ends_with("Length")} (no \code{-}). In case +negation should not work as expected, use the \code{exclude} argument instead. +} + +If \code{NULL}, selects all columns. Patterns that found no matches are silently +ignored, e.g. \code{extract_column_names(iris, select = c("Species", "Test"))} +will just return \code{"Species"}.} + +\item{exclude}{See \code{select}, however, column names matched by the pattern +from \code{exclude} will be excluded instead of selected. 
If \code{NULL} (the default), +excludes no columns.} + +\item{ignore_case}{Logical, if \code{TRUE} and when one of the select-helpers or +a regular expression is used in \code{select}, ignores lower/upper case in the +search pattern when matching against variable names.} + +\item{regex}{Logical, if \code{TRUE}, the search pattern from \code{select} will be +treated as regular expression. When \code{regex = TRUE}, select \emph{must} be a +character string (or a variable containing a character string) and is not +allowed to be one of the supported select-helpers or a character vector +of length > 1. \code{regex = TRUE} is comparable to using one of the two +select-helpers, \code{select = contains()} or \code{select = regex()}, however, +since the select-helpers may not work when called from inside other +functions (see 'Details'), this argument may be used as workaround.} -\item{select}{Variable (or lists of variables) to check for missing values (NAs).} +\item{verbose}{Toggle warnings.} -\item{scales}{If you rely on composite scores such as psychological scales -or questionnaires, you can provide the shared suffix among those variables -(as a character vector). This is useful if the variables you want to check -the average of all start with the same name (e.g., \code{varx}), such as is -commonly the case for Likert scales (such as \code{varx_1}, \code{varx_2}, \code{varx_3}, -etc.).} +\item{by}{Optional character string, indicating the names of one or more +variables in the data frame. If supplied, the data will be split by these +variables and summary statistics will be computed for each group. Useful +for survey data by first reshaping the data to the long format.} + +\item{sort}{Logical. Whether to sort the result from highest to lowest +percentage of missing data.} \item{...}{Arguments passed down to other functions. Currently not used.} } @@ -24,48 +91,20 @@ etc.).} A dataframe with the following columns: \itemize{ \item \code{variable}: Variables selected. 
-\item \code{n_columns}: Number of items for selected variables. -\item \code{n_missing}: Number of missing values for those variables (NA stands for Not -Available). -\item \code{n_cells}: Total number of cells (i.e., number of participants multiplied by -the number of columns, \code{n_columns}). -\item \code{missing_percent}: The percentage of missing values (\code{na} divided by \code{cells}). -\item \code{missing_max}: The number of missing values for the participant with the most -missing values for the selected variables. -\item \code{missing_max_percent}: The amount of missing values for the participant with -the most missing values for the selected variables, as a percentage -(i.e., \code{missing_max} divided by the number of selected columns, \code{n_columns}). -\item \code{all_missing}: The number of participants missing 100\% of items for that scale -(the selected variables). +\item \code{n_missing}: Number of missing values. +\item \code{missing_percent}: Percentage of missing values. +\item \code{complete_percent}: Percentage of non-missing values. } } \description{ Provides a detailed description of missing values in a data frame. This function reports both absolute and percentage missing values of specified -column lists or scales, following recommended guidelines. -} -\details{ -In psychology, it is common to ask participants to answer questionnaires in -which people answer several questions about a specific topic. For example, -people could answer 10 different questions about how extroverted they are. -In turn, researchers calculate the average for those 10 questions (called -items). These questionnaires are called (e.g., Likert) "scales" (such as the -Rosenberg Self-Esteem Scale, also known as the RSES). - -Some authors recommend reporting item-level missingness per scale, as well -as a participant's maximum number of missing items by scale. 
For example, -Parent (2013) writes: - -\emph{I recommend that authors (a) state their tolerance level for missing data by scale -or subscale (e.g., "We calculated means for all subscales on which participants gave -at least 75\% complete data") and then (b) report the individual missingness rates -by scale per data point (i.e., the number of missing values out of all data points -on that scale for all participants) and the maximum by participant (e.g., "For Attachment -Anxiety, a total of 4 missing data points out of 100 were observed, with no participant -missing more than a single data point").} +variables. } \examples{ -# Use the entire data frame +describe_missing(airquality) + +# Survey data set.seed(15) fun <- function() { c(sample(c(NA, 1:10), replace = TRUE), NA, NA, NA) @@ -76,25 +115,16 @@ df <- data.frame( extroversion_1 = fun(), extroversion_2 = fun(), extroversion_3 = fun(), agreeableness_1 = fun(), agreeableness_2 = fun(), agreeableness_3 = fun() ) -describe_missing(df) -# If the questionnaire items start with the same name, -# one can list the scale names directly: -describe_missing(df, scales = c("ID", "openness", "extroversion", "agreeableness")) +df_long <- reshape_longer( + df, + select = -1, + names_sep = "_", + names_to = c("dimension", "item")) -# Otherwise you can provide nested columns manually: -describe_missing(df, - select = list( - c("ID"), - c("openness_1", "openness_2", "openness_3"), - c("extroversion_1", "extroversion_2", "extroversion_3"), - c("agreeableness_1", "agreeableness_2", "agreeableness_3") - ) -) +describe_missing( + df_long, + select = -c(1, 3), + by = "dimension") } -\references{ -Parent, M. C. (2013). Handling item-level missing -data: Simpler is just as good. \emph{The Counseling Psychologist}, -\emph{41}(4), 568-600. 
https://doi.org/10.1177\%2F0011000012445176 -} diff --git a/tests/testthat/_snaps/describe_missing.md b/tests/testthat/_snaps/describe_missing.md index f86ae4b92..5c598e2de 100644 --- a/tests/testthat/_snaps/describe_missing.md +++ b/tests/testthat/_snaps/describe_missing.md @@ -1,63 +1,61 @@ # describe_missing Code - describe_missing(airquality) + describe_missing(airquality2) Output - variable n_columns n_missing n_cells missing_percent complete_percent - 1 Ozone 1 37 153 24.18 75.82 - 2 Solar.R 1 7 153 4.58 95.42 - 3 Wind 1 0 153 0.00 100.00 - 4 Temp 1 0 153 0.00 100.00 - 5 Month 1 0 153 0.00 100.00 - 6 Day 1 0 153 0.00 100.00 - 7 Total 6 44 918 4.79 95.21 - missing_max missing_max_percent all_missing - 1 1 100.00 37 - 2 1 100.00 7 - 3 0 0.00 0 - 4 0 0.00 0 - 5 0 0.00 0 - 6 0 0.00 0 - 7 2 33.33 0 + variable n_missing missing_percent complete_percent + 1 Solar.R 7 4.58 95.42 + 2 Wind 0 0.00 100.00 + 3 Temp 0 0.00 100.00 + 4 Month 0 0.00 100.00 + 5 Day 0 0.00 100.00 + 6 Ozone 37 24.18 75.82 + 7 Total 44 4.79 95.21 --- Code - describe_missing(airquality, vars = list(c("Ozone", "Solar.R", "Wind"), c( - "Temp", "Month", "Day"))) + describe_missing(airquality2, sort = TRUE) Output - variable n_columns n_missing n_cells missing_percent complete_percent - 1 Ozone 1 37 153 24.18 75.82 - 2 Solar.R 1 7 153 4.58 95.42 - 3 Wind 1 0 153 0.00 100.00 - 4 Temp 1 0 153 0.00 100.00 - 5 Month 1 0 153 0.00 100.00 - 6 Day 1 0 153 0.00 100.00 - 7 Total 6 44 918 4.79 95.21 - missing_max missing_max_percent all_missing - 1 1 100.00 37 - 2 1 100.00 7 - 3 0 0.00 0 - 4 0 0.00 0 - 5 0 0.00 0 - 6 0 0.00 0 - 7 2 33.33 0 + variable n_missing missing_percent complete_percent + 6 Ozone 37 24.18 75.82 + 1 Solar.R 7 4.58 95.42 + 2 Wind 0 0.00 100.00 + 3 Temp 0 0.00 100.00 + 4 Month 0 0.00 100.00 + 5 Day 0 0.00 100.00 + 11 Total 44 4.79 95.21 --- Code - describe_missing(df, scales = c("ID", "scale1", "scale2", "scale3")) + describe_missing(airquality2, select = "Ozone:Temp") Output - variable 
n_columns n_missing n_cells missing_percent - 1 ID 1 7 14 50.00 - 2 scale1_Q1:scale1_Q3 3 11 42 26.19 - 3 scale2_Q1:scale2_Q3 3 17 42 40.48 - 4 scale3_Q1:scale3_Q3 3 10 42 23.81 - 5 Total 10 45 140 32.14 - complete_percent missing_max missing_max_percent all_missing - 1 50.00 1 100 7 - 2 73.81 3 100 3 - 3 59.52 3 100 3 - 4 76.19 3 100 3 - 5 67.86 10 100 2 + variable n_missing missing_percent complete_percent + 1 Ozone 37 24.18 75.82 + 2 Day 0 0.00 100.00 + 3 Month 0 0.00 100.00 + 4 Temp 0 0.00 100.00 + 5 Total 37 6.05 93.95 + +--- + + Code + describe_missing(airquality2, exclude = "Ozone:Temp") + Output + variable n_missing missing_percent complete_percent + 1 Solar.R 7 4.58 95.42 + 2 Wind 0 0.00 100.00 + 3 Total 7 2.29 97.71 + +--- + + Code + describe_missing(df_long, select = -c(1, 3), by = "dimension") + Output + variable n_missing missing_percent complete_percent + 1 agreeableness 10 23.81 76.19 + 2 extroversion 17 40.48 59.52 + 3 openness 11 26.19 73.81 + 4 Total 38 15.08 84.92 diff --git a/tests/testthat/test-describe_missing.R b/tests/testthat/test-describe_missing.R index 2159082be..9c3dba43d 100644 --- a/tests/testthat/test-describe_missing.R +++ b/tests/testthat/test-describe_missing.R @@ -1,27 +1,42 @@ test_that("describe_missing", { - expect_snapshot(describe_missing(airquality)) + airquality2 <- cbind(airquality[2:6], airquality[1]) - # Use selected columns explicitly - expect_snapshot(describe_missing(airquality, - vars = list( - c("Ozone", "Solar.R", "Wind"), - c("Temp", "Month", "Day") - ) + expect_snapshot(describe_missing(airquality2)) + + expect_snapshot(describe_missing(airquality2, sort = TRUE)) + + expect_snapshot(describe_missing( + airquality2, + select = "Ozone:Temp" )) - # If the questionnaire items start with the same name, e.g., + expect_snapshot(describe_missing( + airquality2, + exclude = "Ozone:Temp" + )) + + # Testing the 'by' argument for survey scales set.seed(15) fun <- function() { c(sample(c(NA, 1:10), replace = TRUE), NA, NA, 
NA) } - - # One can list the scale names directly: df <- data.frame( ID = c("idz", NA), - scale1_Q1 = fun(), scale1_Q2 = fun(), scale1_Q3 = fun(), - scale2_Q1 = fun(), scale2_Q2 = fun(), scale2_Q3 = fun(), - scale3_Q1 = fun(), scale3_Q2 = fun(), scale3_Q3 = fun(), - stringsAsFactors = FALSE + openness_1 = fun(), openness_2 = fun(), openness_3 = fun(), + extroversion_1 = fun(), extroversion_2 = fun(), extroversion_3 = fun(), + agreeableness_1 = fun(), agreeableness_2 = fun(), agreeableness_3 = fun() ) - expect_snapshot(describe_missing(df, scales = c("ID", "scale1", "scale2", "scale3"))) + + # Pivot and group using datawizard + df_long <- reshape_longer(df, + select = -1, + names_sep = "_", + names_to = c("dimension", "item") + ) + + # Run describe_missing with 'by' argument + expect_snapshot(describe_missing( + df_long, + select = -c(1, 3), by = "dimension" + )) }) From f26f247b28dd7c8bc1401c93829dac0a4e181888 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9mi=20Th=C3=A9riault?= <13123390+rempsyc@users.noreply.github.com> Date: Thu, 19 Dec 2024 14:50:28 -0500 Subject: [PATCH 10/10] styler, lints --- R/describe_missing.R | 20 +++++++++++--------- man/describe_missing.Rd | 6 ++++-- tests/testthat/test-describe_missing.R | 3 ++- 3 files changed, 17 insertions(+), 12 deletions(-) diff --git a/R/describe_missing.R b/R/describe_missing.R index 8d215e366..620d14da1 100644 --- a/R/describe_missing.R +++ b/R/describe_missing.R @@ -38,12 +38,14 @@ #' df, #' select = -1, #' names_sep = "_", -#' names_to = c("dimension", "item")) +#' names_to = c("dimension", "item") +#' ) #' #' describe_missing( #' df_long, #' select = -c(1, 3), -#' by = "dimension") +#' by = "dimension" +#' ) #' describe_missing <- function(data, select = NULL, @@ -65,9 +67,14 @@ describe_missing <- function(data, ... 
) } - if (!is.null(by)) { + if (is.null(by)) { + na_list <- lapply(names(data), function(x) { + data_subset <- data[, x, drop = FALSE] + .describe_missing(data_subset) + }) + } else { if (!by %in% names(data)) { - stop("The 'by' column does not exist in the data.") + stop("The 'by' column does not exist in the data.", call. = FALSE) } grouped_data <- split(data, data[[by]]) na_list <- lapply(names(grouped_data), function(group_name) { @@ -82,11 +89,6 @@ describe_missing <- function(data, group_na_df$variable <- group_name group_na_df }) - } else { - na_list <- lapply(names(data), function(x) { - data_subset <- data[, x, drop = FALSE] - .describe_missing(data_subset) - }) } na_df <- do.call(rbind, na_list) if (isTRUE(sort)) { diff --git a/man/describe_missing.Rd b/man/describe_missing.Rd index bda13d11c..daf863738 100644 --- a/man/describe_missing.Rd +++ b/man/describe_missing.Rd @@ -120,11 +120,13 @@ df_long <- reshape_longer( df, select = -1, names_sep = "_", - names_to = c("dimension", "item")) + names_to = c("dimension", "item") +) describe_missing( df_long, select = -c(1, 3), - by = "dimension") + by = "dimension" +) } diff --git a/tests/testthat/test-describe_missing.R b/tests/testthat/test-describe_missing.R index 9c3dba43d..d26758cca 100644 --- a/tests/testthat/test-describe_missing.R +++ b/tests/testthat/test-describe_missing.R @@ -24,7 +24,8 @@ test_that("describe_missing", { ID = c("idz", NA), openness_1 = fun(), openness_2 = fun(), openness_3 = fun(), extroversion_1 = fun(), extroversion_2 = fun(), extroversion_3 = fun(), - agreeableness_1 = fun(), agreeableness_2 = fun(), agreeableness_3 = fun() + agreeableness_1 = fun(), agreeableness_2 = fun(), agreeableness_3 = fun(), + stringsAsFactors = FALSE ) # Pivot and group using datawizard