Skip to content

Commit

Permalink
address comments and suggestions
Browse files Browse the repository at this point in the history
  • Loading branch information
rempsyc committed Dec 17, 2024
1 parent fbdd26d commit 72041f5
Show file tree
Hide file tree
Showing 3 changed files with 129 additions and 84 deletions.
118 changes: 72 additions & 46 deletions R/describe_missing.R
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,19 @@
#'
#' @description Provides a detailed description of missing values in a data frame.
#' This function reports both absolute and percentage missing values of specified
#' column lists or scales, following recommended guidelines. Some authors recommend
#' reporting item-level missingness per scale, as well as a participant's maximum
#' number of missing items by scale. For example, Parent (2013) writes:
#' column lists or scales, following recommended guidelines.
#'
#' @details
#' In psychology, it is common to ask participants to answer questionnaires in
#' which people answer several questions about a specific topic. For example,
#' people could answer 10 different questions about how extroverted they are.
#' In turn, researchers calculate the average for those 10 questions (called
#' items). These questionnaires are called (e.g., Likert) "scales" (such as the
#' Rosenberg Self-Esteem Scale, also known as the RSES).
#'
#' Some authors recommend reporting item-level missingness per scale, as well
#' as a participant's maximum number of missing items by scale. For example,
#' Parent (2013) writes:
#'
#' *I recommend that authors (a) state their tolerance level for missing data by scale
#' or subscale (e.g., "We calculated means for all subscales on which participants gave
Expand All @@ -16,82 +26,97 @@
#'
#' @param data The data frame to be analyzed.
#' @param vars Variable (or lists of variables) to check for missing values (NAs).
#' @param scales The scale names to check for missing values (as a character vector).
#' @keywords missing values NA guidelines
#' @param scales If you rely on composite scores such as psychological scales
#' or questionnaires, you can provide the shared prefix among those variables
#' (as a character vector). This is useful if the variables you want to check
#' the average of all start with the same name (e.g., `varx`), such as is
#' commonly the case for Likert scales (such as `varx_1`, `varx_2`, `varx_3`,
#' etc.).
#' @return A dataframe with the following columns:
#' - `var`: Variables selected.
#' - `items`: Number of items for selected variables.
#' - `na`: Number of missing values for those variables.
#' - `cells`: Total number of cells (i.e., number of participants multiplied by
#' the number of variables, `items`).
#' - `na_percent`: The percentage of missing values (`na` divided by `cells`).
#' - `na_max`: The number of missing values for the participant with the most
#' - `variable`: Variables selected.
#' - `n_columns`: Number of items for selected variables.
#' - `n_missing`: Number of missing values for those variables (NA stands for Not
#' Available).
#' - `n_cells`: Total number of cells (i.e., number of participants multiplied by
#' the number of columns, `n_columns`).
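#' - `missing_percent`: The percentage of missing values (`n_missing` divided by `n_cells`).
#' - `complete_percent`: The percentage of non-missing values (100 minus `missing_percent`).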
#' - `missing_max`: The number of missing values for the participant with the most
#' missing values for the selected variables.
#' - `na_max_percent`: The amount of missing values for the participant with
#' - `missing_max_percent`: The amount of missing values for the participant with
#' the most missing values for the selected variables, as a percentage
#' (i.e., `na_max` divided by the number of selected variables, `items`).
#' - `all_na`: The number of participants missing 100% of items for that scale
#' (i.e., `missing_max` divided by the number of selected columns, `n_columns`).
#' - `all_missing`: The number of participants missing 100% of items for that scale
#' (the selected variables).
#' @param ... Arguments passed down to other functions. Currently not used.
#'
#' @export
#' @references Parent, M. C. (2013). Handling item-level missing
#' data: Simpler is just as good. *The Counseling Psychologist*,
#' *41*(4), 568-600. https://doi.org/10.1177/0011000012445176
#' @examples
#' # Use the entire data frame
#' describe_missing(airquality)
#'
#' # Use selected columns explicitly
#' describe_missing(airquality,
#' vars = list(
#' c("Ozone", "Solar.R", "Wind"),
#' c("Temp", "Month", "Day")
#' )
#' )
#'
#' # If the questionnaire items start with the same name, e.g.,
#' set.seed(15)
#' fun <- function() {
#' c(sample(c(NA, 1:10), replace = TRUE), NA, NA, NA)
#' }
#' df <- data.frame(
#' ID = c("idz", NA),
#' open_1 = fun(), open_2 = fun(), open_3 = fun(),
#' extrovert_1 = fun(), extrovert_2 = fun(), extrovert_3 = fun(),
#' agreeable_1 = fun(), agreeable_2 = fun(), agreeable_3 = fun()
#' openness_1 = fun(), openness_2 = fun(), openness_3 = fun(),
#' extroversion_1 = fun(), extroversion_2 = fun(), extroversion_3 = fun(),
#' agreeableness_1 = fun(), agreeableness_2 = fun(), agreeableness_3 = fun()
#' )
#' describe_missing(df)
#'
#' # One can list the scale names directly:
#' describe_missing(df, scales = c("ID", "open", "extrovert", "agreeable"))
describe_missing <- function(data, vars = NULL, scales = NULL) {
classes <- lapply(data, class)
if (missing(vars) && missing(scales)) {
#' # If the questionnaire items start with the same name,
#' # one can list the scale names directly:
#' describe_missing(df, scales = c("ID", "openness", "extroversion", "agreeableness"))
#'
#' # Otherwise you can provide nested columns manually:
#' describe_missing(df,
#' select = list(
#' c("ID"),
#' c("openness_1", "openness_2", "openness_3"),
#' c("extroversion_1", "extroversion_2", "extroversion_3"),
#' c("agreeableness_1", "agreeableness_2", "agreeableness_3")
#' )
#' )
#'

describe_missing <- function(data, select = NULL, scales = NULL, ...) {
vars <- select
if (!is.null(vars) && missing(scales)) {
vars.internal <- names(data)
} else if (!missing(scales)) {
vars.internal <- lapply(scales, function(x) {
grep(paste0("^", x), names(data), value = TRUE)
})
} else if (is.null(vars) && missing(scales)){

Check warning on line 93 in R/describe_missing.R

View workflow job for this annotation

GitHub Actions / lint-changed-files / lint-changed-files

file=R/describe_missing.R,line=93,col=47,[brace_linter] There should be a space before an opening curly brace.

Check warning on line 93 in R/describe_missing.R

View workflow job for this annotation

GitHub Actions / lint-changed-files / lint-changed-files

file=R/describe_missing.R,line=93,col=47,[paren_body_linter] Put a space between a right parenthesis and a body expression.

Check warning on line 93 in R/describe_missing.R

View workflow job for this annotation

GitHub Actions / lint / lint

file=R/describe_missing.R,line=93,col=47,[brace_linter] There should be a space before an opening curly brace.

Check warning on line 93 in R/describe_missing.R

View workflow job for this annotation

GitHub Actions / lint / lint

file=R/describe_missing.R,line=93,col=47,[paren_body_linter] Put a space between a right parenthesis and a body expression.
vars <- as.list(names(data))
}
if (!missing(vars)) {
if (!is.null(vars)) {
vars.internal <- vars
}
if (!is.list(vars.internal)) {
vars.internal <- list(vars.internal)
}
na_df <- .describe_missing(data)
if (!missing(vars) || !missing(scales)) {
if (!is.null(vars) || !missing(scales)) {
na_list <- lapply(vars.internal, function(x) {
data_subset <- data[, x, drop = FALSE]
.describe_missing(data_subset)
})
na_df$var <- "Total"
na_df$variable <- "Total"
na_df <- do.call(rbind, c(na_list, list(na_df)))
}
na_df
}

.describe_missing <- function(data) {
my_var <- paste0(names(data)[1], ":", names(data)[ncol(data)])
if (ncol(data) > 1) {
my_var <- paste0(names(data)[1], ":", names(data)[ncol(data)])
} else {
my_var <- names(data)
}
items <- ncol(data)
na <- sum(is.na(data))
cells <- nrow(data) * ncol(data)
Expand All @@ -101,13 +126,14 @@ describe_missing <- function(data, vars = NULL, scales = NULL) {
all_na <- sum(apply(data, 1, function(x) all(is.na(x))))

data.frame(
var = my_var,
items = items,
na = na,
cells = cells,
na_percent = na_percent,
na_max = na_max,
na_max_percent = na_max_percent,
all_na = all_na
variable = my_var,
n_columns = items,
n_missing = na,
n_cells = cells,
missing_percent = na_percent,
complete_percent = 100 - na_percent,
missing_max = na_max,
missing_max_percent = na_max_percent,
all_missing = all_na
)
}
5 changes: 5 additions & 0 deletions inst/WORDLIST
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,7 @@ labelling
leptokurtic
lm
lme
macOS
meaned
mesokurtic
midhinge
Expand All @@ -91,6 +92,8 @@ poorman
pre
pth
px
quartile
quartiles
readr
readxl
recode
Expand All @@ -107,6 +110,8 @@ rio
rowid
sd
stackexchange
subscale
subscales
tailedness
th
tibble
Expand Down
90 changes: 52 additions & 38 deletions man/describe_missing.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit 72041f5

Please sign in to comment.