From e8d393d066f1d9383b71bf77d7e8c1d91d55a4c7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9mi=20Th=C3=A9riault?= <13123390+rempsyc@users.noreply.github.com> Date: Thu, 19 Dec 2024 14:38:56 -0500 Subject: [PATCH] rework describe_missing --- DESCRIPTION | 2 +- NEWS.md | 2 +- R/describe_missing.R | 176 +++++++++------------- inst/WORDLIST | 4 - man/describe_missing.Rd | 154 +++++++++++-------- tests/testthat/_snaps/describe_missing.md | 94 ++++++------ tests/testthat/test-describe_missing.R | 45 ++++-- 7 files changed, 244 insertions(+), 233 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 68cfb6741..034c823ed 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Type: Package Package: datawizard Title: Easy Data Wrangling and Statistical Transformations -Version: 0.13.0.19 +Version: 0.13.0.20 Authors@R: c( person("Indrajeet", "Patil", , "patilindrajeet.science@gmail.com", role = "aut", comment = c(ORCID = "0000-0003-1995-6531")), diff --git a/NEWS.md b/NEWS.md index b84e403f6..0e9dc28a9 100644 --- a/NEWS.md +++ b/NEWS.md @@ -18,7 +18,7 @@ BREAKING CHANGES AND DEPRECATIONS NEW FUNCTIONS -* `describe_missing()`, to comprehensively report on missing values in a data frame. +* `describe_missing()`, to report on missing values in a data frame. CHANGES diff --git a/R/describe_missing.R b/R/describe_missing.R index 8c171d58e..8d215e366 100644 --- a/R/describe_missing.R +++ b/R/describe_missing.R @@ -2,59 +2,27 @@ #' #' @description Provides a detailed description of missing values in a data frame. #' This function reports both absolute and percentage missing values of specified -#' column lists or scales, following recommended guidelines. +#' variables. #' -#' @details -#' In psychology, it is common to ask participants to answer questionnaires in -#' which people answer several questions about a specific topic. For example, -#' people could answer 10 different questions about how extroverted they are. 
-#' In turn, researchers calculate the average for those 10 questions (called -#' items). These questionnaires are called (e.g., Likert) "scales" (such as the -#' Rosenberg Self-Esteem Scale, also known as the RSES). -#' -#' Some authors recommend reporting item-level missingness per scale, as well -#' as a participant's maximum number of missing items by scale. For example, -#' Parent (2013) writes: -#' -#' *I recommend that authors (a) state their tolerance level for missing data by scale -#' or subscale (e.g., "We calculated means for all subscales on which participants gave -#' at least 75% complete data") and then (b) report the individual missingness rates -#' by scale per data point (i.e., the number of missing values out of all data points -#' on that scale for all participants) and the maximum by participant (e.g., "For Attachment -#' Anxiety, a total of 4 missing data points out of 100 were observed, with no participant -#' missing more than a single data point").* -#' -#' @param data The data frame to be analyzed. -#' @param select Variable (or lists of variables) to check for missing values (NAs). -#' @param scales If you rely on composite scores such as psychological scales -#' or questionnaires, you can provide the shared suffix among those variables -#' (as a character vector). This is useful if the variables you want to check -#' the average of all start with the same name (e.g., `varx`), such as is -#' commonly the case for Likert scales (such as `varx_1`, `varx_2`, `varx_3`, -#' etc.). +#' @inheritParams extract_column_names +#' @param by Optional character string, indicating the name of a single +#' variable in the data frame. If supplied, the data will be split by this +#' variable and missing values will be summarized for each group. Useful +#' for survey data after first reshaping the data to the long format. +#' @param sort Logical. Whether to sort the result from highest to lowest +#' percentage of missing data. 
#' @return A dataframe with the following columns: #' - `variable`: Variables selected. -#' - `n_columns`: Number of items for selected variables. -#' - `n_missing`: Number of missing values for those variables (NA stands for Not -#' Available). -#' - `n_cells`: Total number of cells (i.e., number of participants multiplied by -#' the number of columns, `n_columns`). -#' - `missing_percent`: The percentage of missing values (`na` divided by `cells`). -#' - `missing_max`: The number of missing values for the participant with the most -#' missing values for the selected variables. -#' - `missing_max_percent`: The amount of missing values for the participant with -#' the most missing values for the selected variables, as a percentage -#' (i.e., `missing_max` divided by the number of selected columns, `n_columns`). -#' - `all_missing`: The number of participants missing 100% of items for that scale -#' (the selected variables). +#' - `n_missing`: Number of missing values. +#' - `missing_percent`: Percentage of missing values. +#' - `complete_percent`: Percentage of non-missing values. #' @param ... Arguments passed down to other functions. Currently not used. #' #' @export -#' @references Parent, M. C. (2013). Handling item-level missing -#' data: Simpler is just as good. *The Counseling Psychologist*, -#' *41*(4), 568-600. 
https://doi.org/10.1177%2F0011000012445176 #' @examples -#' # Use the entire data frame +#' describe_missing(airquality) +#' +#' # Survey data #' set.seed(15) #' fun <- function() { #' c(sample(c(NA, 1:10), replace = TRUE), NA, NA, NA) @@ -65,74 +33,78 @@ #' extroversion_1 = fun(), extroversion_2 = fun(), extroversion_3 = fun(), #' agreeableness_1 = fun(), agreeableness_2 = fun(), agreeableness_3 = fun() #' ) -#' describe_missing(df) #' -#' # If the questionnaire items start with the same name, -#' # one can list the scale names directly: -#' describe_missing(df, scales = c("ID", "openness", "extroversion", "agreeableness")) +#' df_long <- reshape_longer( +#' df, +#' select = -1, +#' names_sep = "_", +#' names_to = c("dimension", "item")) #' -#' # Otherwise you can provide nested columns manually: -#' describe_missing(df, -#' select = list( -#' c("ID"), -#' c("openness_1", "openness_2", "openness_3"), -#' c("extroversion_1", "extroversion_2", "extroversion_3"), -#' c("agreeableness_1", "agreeableness_2", "agreeableness_3") -#' ) -#' ) +#' describe_missing( +#' df_long, +#' select = -c(1, 3), +#' by = "dimension") #' -describe_missing <- function(data, select = NULL, scales = NULL, ...) { - vars <- select - if (!is.null(vars) && missing(scales)) { - vars.internal <- names(data) - } else if (!missing(scales)) { - vars.internal <- lapply(scales, function(x) { - grep(paste0("^", x), names(data), value = TRUE) - }) - } else if (is.null(vars) && missing(scales)) { - vars <- as.list(names(data)) - } - if (!is.null(vars)) { - vars.internal <- vars +describe_missing <- function(data, + select = NULL, + exclude = NULL, + ignore_case = FALSE, + regex = FALSE, + verbose = TRUE, + by = NULL, + sort = FALSE, + ...) { + if (!is.null(select) || !is.null(exclude)) { + data <- data_select( + data = data, + select = select, + exclude = exclude, + ignore_case = ignore_case, + regex = regex, + verbose = verbose, + ... 
+ ) } - if (!is.list(vars.internal)) { - vars.internal <- list(vars.internal) - } - na_df <- .describe_missing(data) - if (!is.null(vars) || !missing(scales)) { - na_list <- lapply(vars.internal, function(x) { + if (!is.null(by)) { + if (!by %in% names(data)) { + stop("The 'by' column does not exist in the data.") + } + grouped_data <- split(data, data[[by]]) + na_list <- lapply(names(grouped_data), function(group_name) { + group <- grouped_data[[group_name]] + # Identify columns to analyze (exclude the 'by' column) + cols_to_analyze <- setdiff(names(group), by) + group_na_list <- lapply(cols_to_analyze, function(x) { + data_subset <- group[, x, drop = FALSE] + .describe_missing(data_subset) + }) + group_na_df <- do.call(rbind, group_na_list) + group_na_df$variable <- group_name + group_na_df + }) + } else { + na_list <- lapply(names(data), function(x) { data_subset <- data[, x, drop = FALSE] .describe_missing(data_subset) }) - na_df$variable <- "Total" - na_df <- do.call(rbind, c(na_list, list(na_df))) } + na_df <- do.call(rbind, na_list) + if (isTRUE(sort)) { + na_df <- na_df[order(-na_df$missing_percent), ] + } + na_df_tot <- .describe_missing(data) + na_df_tot$variable <- "Total" + na_df <- rbind(na_df, na_df_tot) na_df } .describe_missing <- function(data) { - if (ncol(data) > 1) { - my_var <- paste0(names(data)[1], ":", names(data)[ncol(data)]) - } else { - my_var <- names(data) - } - items <- ncol(data) - na <- sum(is.na(data)) - cells <- nrow(data) * ncol(data) - na_percent <- round(na / cells * 100, 2) - na_max <- max(rowSums(is.na(data))) - na_max_percent <- round(na_max / items * 100, 2) - all_na <- sum(apply(data, 1, function(x) all(is.na(x)))) - + n_missing <- sum(is.na(data)) + missing_percent <- round(n_missing / (nrow(data) * ncol(data)) * 100, 2) data.frame( - variable = my_var, - n_columns = items, - n_missing = na, - n_cells = cells, - missing_percent = na_percent, - complete_percent = 100 - na_percent, - missing_max = na_max, - 
missing_max_percent = na_max_percent, - all_missing = all_na + variable = names(data)[1], + n_missing = n_missing, + missing_percent = missing_percent, + complete_percent = 100 - missing_percent ) } diff --git a/inst/WORDLIST b/inst/WORDLIST index bf52882d2..bbafb3bd2 100644 --- a/inst/WORDLIST +++ b/inst/WORDLIST @@ -26,7 +26,6 @@ Heisig Herrington Hoffmann Joanes -Likert Llabre Lumley MADs @@ -35,7 +34,6 @@ Minitab ORCID PSU Posteriori -RSES Ranktransform Recode Recoding @@ -112,8 +110,6 @@ rio rowid sd stackexchange -subscale -subscales tailedness th tibble diff --git a/man/describe_missing.Rd b/man/describe_missing.Rd index 2213a97ae..bda13d11c 100644 --- a/man/describe_missing.Rd +++ b/man/describe_missing.Rd @@ -4,19 +4,86 @@ \alias{describe_missing} \title{Describe Missing Values in Data According to Guidelines} \usage{ -describe_missing(data, select = NULL, scales = NULL, ...) +describe_missing( + data, + select = NULL, + exclude = NULL, + ignore_case = FALSE, + regex = FALSE, + verbose = TRUE, + by = NULL, + sort = FALSE, + ... +) } \arguments{ -\item{data}{The data frame to be analyzed.} +\item{data}{A data frame.} + +\item{select}{Variables that will be included when performing the required +tasks. Can be either +\itemize{ +\item a variable specified as a literal variable name (e.g., \code{column_name}), +\item a string with the variable name (e.g., \code{"column_name"}), a character +vector of variable names (e.g., \code{c("col1", "col2", "col3")}), or a +character vector of variable names including ranges specified via \code{:} +(e.g., \code{c("col1:col3", "col5")}), +\item for some functions, like \code{data_select()} or \code{data_rename()}, \code{select} can +be a named character vector. In this case, the names are used to rename +the columns in the output data frame. See 'Details' in the related +functions to see where this option applies. 
+\item a formula with variable names (e.g., \code{~column_1 + column_2}), +\item a vector of positive integers, giving the positions counting from the left +(e.g. \code{1} or \code{c(1, 3, 5)}), +\item a vector of negative integers, giving the positions counting from the +right (e.g., \code{-1} or \code{-1:-3}), +\item one of the following select-helpers: \code{starts_with()}, \code{ends_with()}, +\code{contains()}, a range using \code{:}, or \code{regex()}. \code{starts_with()}, +\code{ends_with()}, and \code{contains()} accept several patterns, e.g +\code{starts_with("Sep", "Petal")}. \code{regex()} can be used to define regular +expression patterns. +\item a function testing for logical conditions, e.g. \code{is.numeric()} (or +\code{is.numeric}), or any user-defined function that selects the variables +for which the function returns \code{TRUE} (like: \code{foo <- function(x) mean(x) > 3}), +\item ranges specified via literal variable names, select-helpers (except +\code{regex()}) and (user-defined) functions can be negated, i.e. return +non-matching elements, when prefixed with a \code{-}, e.g. \code{-ends_with()}, +\code{-is.numeric} or \code{-(Sepal.Width:Petal.Length)}. \strong{Note:} Negation means +that matches are \emph{excluded}, and thus, the \code{exclude} argument can be +used alternatively. For instance, \code{select=-ends_with("Length")} (with +\code{-}) is equivalent to \code{exclude=ends_with("Length")} (no \code{-}). In case +negation should not work as expected, use the \code{exclude} argument instead. +} + +If \code{NULL}, selects all columns. Patterns that found no matches are silently +ignored, e.g. \code{extract_column_names(iris, select = c("Species", "Test"))} +will just return \code{"Species"}.} + +\item{exclude}{See \code{select}, however, column names matched by the pattern +from \code{exclude} will be excluded instead of selected. 
If \code{NULL} (the default), +excludes no columns.} + +\item{ignore_case}{Logical, if \code{TRUE} and when one of the select-helpers or +a regular expression is used in \code{select}, ignores lower/upper case in the +search pattern when matching against variable names.} + +\item{regex}{Logical, if \code{TRUE}, the search pattern from \code{select} will be +treated as regular expression. When \code{regex = TRUE}, select \emph{must} be a +character string (or a variable containing a character string) and is not +allowed to be one of the supported select-helpers or a character vector +of length > 1. \code{regex = TRUE} is comparable to using one of the two +select-helpers, \code{select = contains()} or \code{select = regex()}, however, +since the select-helpers may not work when called from inside other +functions (see 'Details'), this argument may be used as workaround.} -\item{select}{Variable (or lists of variables) to check for missing values (NAs).} +\item{verbose}{Toggle warnings.} -\item{scales}{If you rely on composite scores such as psychological scales -or questionnaires, you can provide the shared suffix among those variables -(as a character vector). This is useful if the variables you want to check -the average of all start with the same name (e.g., \code{varx}), such as is -commonly the case for Likert scales (such as \code{varx_1}, \code{varx_2}, \code{varx_3}, -etc.).} +\item{by}{Optional character string, indicating the name of a single +variable in the data frame. If supplied, the data will be split by this +variable and missing values will be summarized for each group. Useful +for survey data after first reshaping the data to the long format.} + +\item{sort}{Logical. Whether to sort the result from highest to lowest +percentage of missing data.} \item{...}{Arguments passed down to other functions. Currently not used.} } @@ -24,48 +91,20 @@ etc.).} A dataframe with the following columns: \itemize{ \item \code{variable}: Variables selected. 
-\item \code{n_columns}: Number of items for selected variables. -\item \code{n_missing}: Number of missing values for those variables (NA stands for Not -Available). -\item \code{n_cells}: Total number of cells (i.e., number of participants multiplied by -the number of columns, \code{n_columns}). -\item \code{missing_percent}: The percentage of missing values (\code{na} divided by \code{cells}). -\item \code{missing_max}: The number of missing values for the participant with the most -missing values for the selected variables. -\item \code{missing_max_percent}: The amount of missing values for the participant with -the most missing values for the selected variables, as a percentage -(i.e., \code{missing_max} divided by the number of selected columns, \code{n_columns}). -\item \code{all_missing}: The number of participants missing 100\% of items for that scale -(the selected variables). +\item \code{n_missing}: Number of missing values. +\item \code{missing_percent}: Percentage of missing values. +\item \code{complete_percent}: Percentage of non-missing values. } } \description{ Provides a detailed description of missing values in a data frame. This function reports both absolute and percentage missing values of specified -column lists or scales, following recommended guidelines. -} -\details{ -In psychology, it is common to ask participants to answer questionnaires in -which people answer several questions about a specific topic. For example, -people could answer 10 different questions about how extroverted they are. -In turn, researchers calculate the average for those 10 questions (called -items). These questionnaires are called (e.g., Likert) "scales" (such as the -Rosenberg Self-Esteem Scale, also known as the RSES). - -Some authors recommend reporting item-level missingness per scale, as well -as a participant's maximum number of missing items by scale. 
For example, -Parent (2013) writes: - -\emph{I recommend that authors (a) state their tolerance level for missing data by scale -or subscale (e.g., "We calculated means for all subscales on which participants gave -at least 75\% complete data") and then (b) report the individual missingness rates -by scale per data point (i.e., the number of missing values out of all data points -on that scale for all participants) and the maximum by participant (e.g., "For Attachment -Anxiety, a total of 4 missing data points out of 100 were observed, with no participant -missing more than a single data point").} +variables. } \examples{ -# Use the entire data frame +describe_missing(airquality) + +# Survey data set.seed(15) fun <- function() { c(sample(c(NA, 1:10), replace = TRUE), NA, NA, NA) @@ -76,25 +115,16 @@ df <- data.frame( extroversion_1 = fun(), extroversion_2 = fun(), extroversion_3 = fun(), agreeableness_1 = fun(), agreeableness_2 = fun(), agreeableness_3 = fun() ) -describe_missing(df) -# If the questionnaire items start with the same name, -# one can list the scale names directly: -describe_missing(df, scales = c("ID", "openness", "extroversion", "agreeableness")) +df_long <- reshape_longer( + df, + select = -1, + names_sep = "_", + names_to = c("dimension", "item")) -# Otherwise you can provide nested columns manually: -describe_missing(df, - select = list( - c("ID"), - c("openness_1", "openness_2", "openness_3"), - c("extroversion_1", "extroversion_2", "extroversion_3"), - c("agreeableness_1", "agreeableness_2", "agreeableness_3") - ) -) +describe_missing( + df_long, + select = -c(1, 3), + by = "dimension") } -\references{ -Parent, M. C. (2013). Handling item-level missing -data: Simpler is just as good. \emph{The Counseling Psychologist}, -\emph{41}(4), 568-600. 
https://doi.org/10.1177\%2F0011000012445176 -} diff --git a/tests/testthat/_snaps/describe_missing.md b/tests/testthat/_snaps/describe_missing.md index f86ae4b92..5c598e2de 100644 --- a/tests/testthat/_snaps/describe_missing.md +++ b/tests/testthat/_snaps/describe_missing.md @@ -1,63 +1,61 @@ # describe_missing Code - describe_missing(airquality) + describe_missing(airquality2) Output - variable n_columns n_missing n_cells missing_percent complete_percent - 1 Ozone 1 37 153 24.18 75.82 - 2 Solar.R 1 7 153 4.58 95.42 - 3 Wind 1 0 153 0.00 100.00 - 4 Temp 1 0 153 0.00 100.00 - 5 Month 1 0 153 0.00 100.00 - 6 Day 1 0 153 0.00 100.00 - 7 Total 6 44 918 4.79 95.21 - missing_max missing_max_percent all_missing - 1 1 100.00 37 - 2 1 100.00 7 - 3 0 0.00 0 - 4 0 0.00 0 - 5 0 0.00 0 - 6 0 0.00 0 - 7 2 33.33 0 + variable n_missing missing_percent complete_percent + 1 Solar.R 7 4.58 95.42 + 2 Wind 0 0.00 100.00 + 3 Temp 0 0.00 100.00 + 4 Month 0 0.00 100.00 + 5 Day 0 0.00 100.00 + 6 Ozone 37 24.18 75.82 + 7 Total 44 4.79 95.21 --- Code - describe_missing(airquality, vars = list(c("Ozone", "Solar.R", "Wind"), c( - "Temp", "Month", "Day"))) + describe_missing(airquality2, sort = TRUE) Output - variable n_columns n_missing n_cells missing_percent complete_percent - 1 Ozone 1 37 153 24.18 75.82 - 2 Solar.R 1 7 153 4.58 95.42 - 3 Wind 1 0 153 0.00 100.00 - 4 Temp 1 0 153 0.00 100.00 - 5 Month 1 0 153 0.00 100.00 - 6 Day 1 0 153 0.00 100.00 - 7 Total 6 44 918 4.79 95.21 - missing_max missing_max_percent all_missing - 1 1 100.00 37 - 2 1 100.00 7 - 3 0 0.00 0 - 4 0 0.00 0 - 5 0 0.00 0 - 6 0 0.00 0 - 7 2 33.33 0 + variable n_missing missing_percent complete_percent + 6 Ozone 37 24.18 75.82 + 1 Solar.R 7 4.58 95.42 + 2 Wind 0 0.00 100.00 + 3 Temp 0 0.00 100.00 + 4 Month 0 0.00 100.00 + 5 Day 0 0.00 100.00 + 11 Total 44 4.79 95.21 --- Code - describe_missing(df, scales = c("ID", "scale1", "scale2", "scale3")) + describe_missing(airquality2, select = "Ozone:Temp") Output - variable 
n_columns n_missing n_cells missing_percent - 1 ID 1 7 14 50.00 - 2 scale1_Q1:scale1_Q3 3 11 42 26.19 - 3 scale2_Q1:scale2_Q3 3 17 42 40.48 - 4 scale3_Q1:scale3_Q3 3 10 42 23.81 - 5 Total 10 45 140 32.14 - complete_percent missing_max missing_max_percent all_missing - 1 50.00 1 100 7 - 2 73.81 3 100 3 - 3 59.52 3 100 3 - 4 76.19 3 100 3 - 5 67.86 10 100 2 + variable n_missing missing_percent complete_percent + 1 Ozone 37 24.18 75.82 + 2 Day 0 0.00 100.00 + 3 Month 0 0.00 100.00 + 4 Temp 0 0.00 100.00 + 5 Total 37 6.05 93.95 + +--- + + Code + describe_missing(airquality2, exclude = "Ozone:Temp") + Output + variable n_missing missing_percent complete_percent + 1 Solar.R 7 4.58 95.42 + 2 Wind 0 0.00 100.00 + 3 Total 7 2.29 97.71 + +--- + + Code + describe_missing(df_long, select = -c(1, 3), by = "dimension") + Output + variable n_missing missing_percent complete_percent + 1 agreeableness 10 23.81 76.19 + 2 extroversion 17 40.48 59.52 + 3 openness 11 26.19 73.81 + 4 Total 38 15.08 84.92 diff --git a/tests/testthat/test-describe_missing.R b/tests/testthat/test-describe_missing.R index 2159082be..9c3dba43d 100644 --- a/tests/testthat/test-describe_missing.R +++ b/tests/testthat/test-describe_missing.R @@ -1,27 +1,42 @@ test_that("describe_missing", { - expect_snapshot(describe_missing(airquality)) + airquality2 <- cbind(airquality[2:6], airquality[1]) - # Use selected columns explicitly - expect_snapshot(describe_missing(airquality, - vars = list( - c("Ozone", "Solar.R", "Wind"), - c("Temp", "Month", "Day") - ) + expect_snapshot(describe_missing(airquality2)) + + expect_snapshot(describe_missing(airquality2, sort = TRUE)) + + expect_snapshot(describe_missing( + airquality2, + select = "Ozone:Temp" )) - # If the questionnaire items start with the same name, e.g., + expect_snapshot(describe_missing( + airquality2, + exclude = "Ozone:Temp" + )) + + # Testing the 'by' argument for survey scales set.seed(15) fun <- function() { c(sample(c(NA, 1:10), replace = TRUE), NA, NA, 
NA) } - - # One can list the scale names directly: df <- data.frame( ID = c("idz", NA), - scale1_Q1 = fun(), scale1_Q2 = fun(), scale1_Q3 = fun(), - scale2_Q1 = fun(), scale2_Q2 = fun(), scale2_Q3 = fun(), - scale3_Q1 = fun(), scale3_Q2 = fun(), scale3_Q3 = fun(), - stringsAsFactors = FALSE + openness_1 = fun(), openness_2 = fun(), openness_3 = fun(), + extroversion_1 = fun(), extroversion_2 = fun(), extroversion_3 = fun(), + agreeableness_1 = fun(), agreeableness_2 = fun(), agreeableness_3 = fun() ) - expect_snapshot(describe_missing(df, scales = c("ID", "scale1", "scale2", "scale3"))) + + # Pivot and group using datawizard + df_long <- reshape_longer(df, + select = -1, + names_sep = "_", + names_to = c("dimension", "item") + ) + + # Run describe_missing with 'by' argument + expect_snapshot(describe_missing( + df_long, + select = -c(1, 3), by = "dimension" + )) })