diff --git a/.dev/copy_files_to_other_repos.R b/.dev/copy_files_to_other_repos.R deleted file mode 100644 index 2239f7314..000000000 --- a/.dev/copy_files_to_other_repos.R +++ /dev/null @@ -1,23 +0,0 @@ - -library(here) -library(magrittr) -library(fs) -library(purrr) - -home_dir <- here::dr_here() - -common_root <- here::here("Documents", "GitHub") - -pkg_with_change <- "datawizard" -pkg_without_change <- easystats:::.packages_on_cran() %>% setdiff(pkg_with_change) - -copy_from_folder <- map_chr(pkg_with_change, ~here::here(common_root, .x, ".github", "workflows")) -copy_to_folders <- map_chr(pkg_without_change, ~here::here(common_root, .x, ".github", "workflows")) - -file_name <- "R-CMD-check-strict.yaml" - -purrr::walk2( - .x = here::here(copy_from_folder, file_name), - .y = here::here(copy_to_folders, file_name), - .f = ~ fs::file_copy(.x, .y, overwrite = TRUE), -) diff --git a/.github/workflows/html-5-check.yaml b/.github/workflows/html-5-check.yaml index 1439a3228..f25b4eeaf 100644 --- a/.github/workflows/html-5-check.yaml +++ b/.github/workflows/html-5-check.yaml @@ -6,8 +6,8 @@ on: pull_request: branches: [main, master] -name: HTML5 check +name: html-5-check jobs: - HTML5-check: + html-5-check: uses: easystats/workflows/.github/workflows/html-5-check.yaml@main diff --git a/.github/workflows/update-to-latest-easystats.yaml b/.github/workflows/update-to-latest-easystats.yaml new file mode 100644 index 000000000..7718d763a --- /dev/null +++ b/.github/workflows/update-to-latest-easystats.yaml @@ -0,0 +1,10 @@ +on: + schedule: + # Check for dependency updates once a month + - cron: "0 0 1 * *" + +name: update-to-latest-easystats + +jobs: + update-to-latest-easystats: + uses: easystats/workflows/.github/workflows/update-to-latest-easystats.yaml@main diff --git a/.lintr b/.lintr index 7fa66fe1c..8aebdfc14 100644 --- a/.lintr +++ b/.lintr @@ -13,6 +13,7 @@ linters: linters_with_defaults( todo_comment_linter = NULL, undesirable_function_linter(c("mapply" = NA, "sapply" = NA, "setwd" = NA)), undesirable_operator_linter = NULL, + if_not_else_linter(exceptions = character(0L)), unnecessary_concatenation_linter(allow_single_expression = FALSE), defaults = linters_with_tags(tags = NULL) ) diff --git a/DESCRIPTION b/DESCRIPTION index 1144e0b80..048d15065 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Type: Package Package: datawizard Title: Easy Data Wrangling and Statistical Transformations -Version: 0.8.0.13 +Version: 0.9.1.4 Authors@R: c( person("Indrajeet", "Patil", , "patilindrajeet.science@gmail.com", role = "aut", comment = c(ORCID = "0000-0003-1995-6531", Twitter = "@patilindrajeets")), @@ -33,13 +33,14 @@ BugReports: https://github.com/easystats/datawizard/issues Depends: R (>= 3.6) Imports: - insight (>= 0.19.3.2), + insight (>= 0.19.8), stats, utils Suggests: bayestestR, boot, brms, + curl, data.table, dplyr (>= 1.0), effectsize, @@ -62,7 +63,7 @@ Suggests: rmarkdown, rstanarm, see, - testthat (>= 3.1.6), + testthat (>= 3.2.0), tibble, tidyr, withr @@ -71,7 +72,7 @@ VignetteBuilder: Encoding: UTF-8 Language: en-US Roxygen: list(markdown = TRUE) -RoxygenNote: 7.2.3.9000 +RoxygenNote: 7.3.1 Config/testthat/edition: 3 Config/testthat/parallel: true Config/Needs/website: diff --git a/NAMESPACE b/NAMESPACE index 7809afcf4..e89863807 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -67,6 +67,7 @@ S3method(describe_distribution,numeric) S3method(format,data_codebook) S3method(format,dw_data_peek) S3method(format,dw_data_tabulate) +S3method(format,dw_data_xtabulate) S3method(format,dw_groupmeans) S3method(format,parameters_distribution) S3method(kurtosis,data.frame) @@ -91,6 +92,8 @@ S3method(print,data_seek) S3method(print,dw_data_peek) S3method(print,dw_data_tabulate) S3method(print,dw_data_tabulates) +S3method(print,dw_data_xtabulate) +S3method(print,dw_data_xtabulates) S3method(print,dw_groupmeans) S3method(print,dw_groupmeans_list) S3method(print,dw_transformer) @@ -102,10 +105,13 @@ S3method(print_html,data_codebook) S3method(print_html,dw_data_peek) S3method(print_html,dw_data_tabulate) S3method(print_html,dw_data_tabulates) +S3method(print_html,dw_data_xtabulate) +S3method(print_html,dw_data_xtabulates) S3method(print_md,data_codebook) S3method(print_md,dw_data_peek) S3method(print_md,dw_data_tabulate) S3method(print_md,dw_data_tabulates) +S3method(print_md,dw_data_xtabulate) S3method(ranktransform,data.frame) S3method(ranktransform,factor) S3method(ranktransform,grouped_df) @@ -167,7 +173,9 @@ S3method(to_factor,Date) S3method(to_factor,character) S3method(to_factor,data.frame) S3method(to_factor,default) +S3method(to_factor,double) S3method(to_factor,factor) +S3method(to_factor,haven_labelled) S3method(to_factor,logical) S3method(to_factor,numeric) S3method(to_numeric,Date) @@ -179,6 +187,7 @@ S3method(to_numeric,data.frame) S3method(to_numeric,default) S3method(to_numeric,double) S3method(to_numeric,factor) +S3method(to_numeric,haven_labelled) S3method(to_numeric,logical) S3method(to_numeric,numeric) S3method(unnormalize,data.frame) diff --git a/NEWS.md b/NEWS.md index c747eb8dc..968c9d417 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,4 +1,34 @@ -# datawizard (devel) +# datawizard 0.9.2 + +CHANGES + +* `data_modify()` gets three new arguments, `.at`, `.if` and `.modify`, to modify + variables at specific positions or based on logical conditions. + +* `data_tabulate()` was revised and gets several new arguments: a `weights` + argument, to compute weighted frequency tables. `include_na` allows to include + or omit missing values from the table. Furthermore, a `by` argument was added, + to compute crosstables (#479, #481). + +# datawizard 0.9.1 + +CHANGES + +* `rescale()` gains `multiply` and `add` arguments, to expand ranges by a given + factor or value. + +* `to_factor()` and `to_numeric()` now support class `haven_labelled`. + +BUG FIXES + +* `to_numeric()` now correctly deals with inversed factor levels when + `preserve_levels = TRUE`. + +* `to_numeric()` inversed order of value labels when `dummy_factors = FALSE`. + +* `convert_to_na()` now preserves attributes for factors when `drop_levels = TRUE`. + +# datawizard 0.9.0 NEW FUNCTIONS @@ -41,6 +71,10 @@ BUG FIXES * Fixed issues in `data_write()` when writing labelled data into SPSS format and vectors were of different type as value labels. +* Fixed issues in `data_write()` when writing labelled data into SPSS format + for character vectors with missing value labels, but existing variable + labels. + * Fixed issue in `recode_into()` with probably wrong case number printed in the warning when several recode patterns match to one case. diff --git a/R/categorize.R b/R/categorize.R index 0df7edd4c..d440069d8 100644 --- a/R/categorize.R +++ b/R/categorize.R @@ -192,11 +192,11 @@ categorize.numeric <- function(x, breaks <- split } else { breaks <- switch(split, - "median" = stats::median(x), - "mean" = mean(x), - "length" = n_groups, - "quantile" = stats::quantile(x, probs = seq_len(n_groups) / n_groups), - "range" = .equal_range(x, range, n_groups, lowest), + median = stats::median(x), + mean = mean(x), + length = n_groups, + quantile = stats::quantile(x, probs = seq_len(n_groups) / n_groups), + range = .equal_range(x, range, n_groups, lowest), NULL ) } diff --git a/R/contrs.R b/R/contrs.R index 2e5638cea..bea9999a3 100644 --- a/R/contrs.R +++ b/R/contrs.R @@ -32,58 +32,59 @@ #' #' @seealso [stats::contr.sum()] #' -#' @examples -#' if (FALSE) { -#' data("mtcars") +#' @examplesIf !identical(Sys.getenv("IN_PKGDOWN"), "true") +#' \donttest{ +#' data("mtcars") #' -#' mtcars <- data_modify(mtcars, cyl = factor(cyl)) +#' mtcars <- data_modify(mtcars, cyl = factor(cyl)) #' -#' c.treatment <- cbind(Intercept = 1, contrasts(mtcars$cyl)) -#' solve(c.treatment) -#' #> 4 6 8 -#' #> Intercept 1 0 0 # mean of the 1st level -#' #> 6 -1 1 0 # 2nd level - 1st level -#' #> 8 -1 0 1 # 3rd level - 1st level +#' c.treatment <- cbind(Intercept = 1, contrasts(mtcars$cyl)) +#' solve(c.treatment) +#' #> 4 6 8 +#' #> Intercept 1 0 0 # mean of the 1st level +#' #> 6 -1 1 0 # 2nd level - 1st level +#' #> 8 -1 0 1 # 3rd level - 1st level #' -#' contrasts(mtcars$cyl) <- contr.sum -#' c.sum <- cbind(Intercept = 1, contrasts(mtcars$cyl)) -#' solve(c.sum) -#' #> 4 6 8 -#' #> Intercept 0.333 0.333 0.333 # overall mean -#' #> 0.667 -0.333 -0.333 # deviation of 1st from overall mean -#' #> -0.333 0.667 -0.333 # deviation of 2nd from overall mean +#' contrasts(mtcars$cyl) <- contr.sum +#' c.sum <- cbind(Intercept = 1, contrasts(mtcars$cyl)) +#' solve(c.sum) +#' #> 4 6 8 +#' #> Intercept 0.333 0.333 0.333 # overall mean +#' #> 0.667 -0.333 -0.333 # deviation of 1st from overall mean +#' #> -0.333 0.667 -0.333 # deviation of 2nd from overall mean #' #' -#' contrasts(mtcars$cyl) <- contr.deviation -#' c.deviation <- cbind(Intercept = 1, contrasts(mtcars$cyl)) -#' solve(c.deviation) -#' #> 4 6 8 -#' #> Intercept 0.333 0.333 0.333 # overall mean -#' #> 6 -1.000 1.000 0.000 # 2nd level - 1st level -#' #> 8 -1.000 0.000 1.000 # 3rd level - 1st level +#' contrasts(mtcars$cyl) <- contr.deviation +#' c.deviation <- cbind(Intercept = 1, contrasts(mtcars$cyl)) +#' solve(c.deviation) +#' #> 4 6 8 +#' #> Intercept 0.333 0.333 0.333 # overall mean +#' #> 6 -1.000 1.000 0.000 # 2nd level - 1st level +#' #> 8 -1.000 0.000 1.000 # 3rd level - 1st level #' -#' ## With Interactions ----------------------------------------- -#' mtcars <- data_modify(mtcars, am = C(am, contr = contr.deviation)) -#' mtcars <- data_arrange(mtcars, select = c("cyl", "am")) +#' ## With Interactions ----------------------------------------- +#' mtcars <- data_modify(mtcars, am = C(am, contr = contr.deviation)) +#' mtcars <- data_arrange(mtcars, select = c("cyl", "am")) #' -#' mm <- unique(model.matrix(~ cyl * am, data = mtcars)) -#' rownames(mm) <- c( -#' "cyl4.am0", "cyl4.am1", "cyl6.am0", -#' "cyl6.am1", "cyl8.am0", "cyl8.am1" -#' ) +#' mm <- unique(model.matrix(~ cyl * am, data = mtcars)) +#' rownames(mm) <- c( +#' "cyl4.am0", "cyl4.am1", "cyl6.am0", +#' "cyl6.am1", "cyl8.am0", "cyl8.am1" +#' ) #' -#' solve(mm) -#' #> cyl4.am0 cyl4.am1 cyl6.am0 cyl6.am1 cyl8.am0 cyl8.am1 -#' #> (Intercept) 0.167 0.167 0.167 0.167 0.167 0.167 # overall mean -#' #> cyl6 -0.500 -0.500 0.500 0.500 0.000 0.000 # cyl MAIN eff: 2nd - 1st -#' #> cyl8 -0.500 -0.500 0.000 0.000 0.500 0.500 # cyl MAIN eff: 2nd - 1st -#' #> am1 -0.333 0.333 -0.333 0.333 -0.333 0.333 # am MAIN eff -#' #> cyl6:am1 1.000 -1.000 -1.000 1.000 0.000 0.000 -#' #> cyl8:am1 1.000 -1.000 0.000 0.000 -1.000 1.000 +#' solve(mm) +#' #> cyl4.am0 cyl4.am1 cyl6.am0 cyl6.am1 cyl8.am0 cyl8.am1 +#' #> (Intercept) 0.167 0.167 0.167 0.167 0.167 0.167 # overall mean +#' #> cyl6 -0.500 -0.500 0.500 0.500 0.000 0.000 # cyl MAIN eff: 2nd - 1st +#' #> cyl8 -0.500 -0.500 0.000 0.000 0.500 0.500 # cyl MAIN eff: 2nd - 1st +#' #> am1 -0.333 0.333 -0.333 0.333 -0.333 0.333 # am MAIN eff +#' #> cyl6:am1 1.000 -1.000 -1.000 1.000 0.000 0.000 +#' #> cyl8:am1 1.000 -1.000 0.000 0.000 -1.000 1.000 #' } #' #' @export -contr.deviation <- function(n, base = 1, +contr.deviation <- function(n, + base = 1, contrasts = TRUE, sparse = FALSE) { cont <- stats::contr.treatment(n, diff --git a/R/convert_to_na.R b/R/convert_to_na.R index 116c71c35..0e95b7c5f 100644 --- a/R/convert_to_na.R +++ b/R/convert_to_na.R @@ -105,7 +105,11 @@ convert_to_na.factor <- function(x, na = NULL, drop_levels = FALSE, verbose = TR # drop unused labels value_labels <- attr(x, "labels", exact = TRUE) if (is.factor(x) && isTRUE(drop_levels)) { + # save label attribute + variable_label <- attr(x, "label", exact = TRUE) x <- droplevels(x) + # droplevels() discards attributes, so we need to re-assign them + attr(x, "label") <- variable_label } attr(x, "labels") <- value_labels[!value_labels %in% na] } diff --git a/R/data_arrange.R b/R/data_arrange.R index 5666fde42..58c8a5ab3 100644 --- a/R/data_arrange.R +++ b/R/data_arrange.R @@ -46,8 +46,8 @@ data_arrange.default <- function(data, select = NULL, safe = TRUE) { dont_exist <- select[which(!select %in% names(data))] if (length(dont_exist) > 0) { - if (!safe) { - insight::format_error( + if (safe) { + insight::format_warning( paste0( "The following column(s) don't exist in the dataset: ", text_concatenate(dont_exist), "." @@ -55,7 +55,7 @@ data_arrange.default <- function(data, select = NULL, safe = TRUE) { .misspelled_string(names(data), dont_exist, "Possibly misspelled?") ) } else { - insight::format_warning( + insight::format_error( paste0( "The following column(s) don't exist in the dataset: ", text_concatenate(dont_exist), "." diff --git a/R/data_codebook.R b/R/data_codebook.R index 077dfd6d5..7608f08d5 100644 --- a/R/data_codebook.R +++ b/R/data_codebook.R @@ -456,8 +456,14 @@ print_md.data_codebook <- function(x, ...) { # need to remove this one x$Prop <- NULL align <- c( - "ID" = "l", "Name" = "l", "Label" = "l", "Type" = "l", "Missings" = "r", - "Values" = "r", "Value Labels" = "l", "N" = "r" + ID = "l", + Name = "l", + Label = "l", + Type = "l", + Missings = "r", + Values = "r", + `Value Labels` = "l", + N = "r" ) align <- align[colnames(x)] paste0(unname(align), collapse = "") diff --git a/R/data_extract.R b/R/data_extract.R index df38dcbde..b5613309c 100644 --- a/R/data_extract.R +++ b/R/data_extract.R @@ -114,10 +114,10 @@ data_extract.data.frame <- function(data, # chose which matched variables to extract select <- switch(extract, - "first" = select[1L], - "last" = select[length(select)], - "odd" = select[seq(1L, length(select), 2L)], - "even" = select[seq(2L, length(select), 2L)], + first = select[1L], + last = select[length(select)], + odd = select[seq(1L, length(select), 2L)], + even = select[seq(2L, length(select), 2L)], select ) diff --git a/R/data_match.R b/R/data_match.R index 6b931c292..c03b3f222 100644 --- a/R/data_match.R +++ b/R/data_match.R @@ -109,17 +109,17 @@ data_match <- function(x, to, match = "and", return_indices = FALSE, drop_na = T # evaluate match <- match.arg(tolower(match), c("and", "&", "&&", "or", "|", "||", "!", "not")) match <- switch(match, - "&" = , - "&&" = , - "and" = "and", - "!" = , - "not" = "not", + `&` = , + `&&` = , + and = "and", + `!` = , + not = "not", "or" ) - # sanity check + # validation check shared_columns <- intersect(colnames(x), colnames(to)) - if (is.null(shared_columns) || length(shared_columns) == 0) { + if (is.null(shared_columns) || length(shared_columns) == 0L) { insight::format_error( "None of the columns from the data frame with matching conditions were found in `x`." ) @@ -179,9 +179,9 @@ data_filter <- function(x, ...) { #' @export data_filter.data.frame <- function(x, ...) { out <- x - dots <- match.call(expand.dots = FALSE)$`...` + dots <- match.call(expand.dots = FALSE)[["..."]] - if (any(nchar(names(dots)) > 0)) { + if (any(nzchar(names(dots), keepNA = TRUE))) { insight::format_error( "Filtering did not work. Please check if you need `==` (instead of `=`) for comparison." ) @@ -207,7 +207,7 @@ data_filter.data.frame <- function(x, ...) { symbol <- dots[[i]] # evaluate, we may have a variable with filter expression eval_symbol <- .dynEval(symbol, ifnotfound = NULL) - # sanity check: is variable named like a function? + # validation check: is variable named like a function? if (is.function(eval_symbol)) { eval_symbol <- .dynGet(symbol, ifnotfound = NULL) } @@ -277,15 +277,14 @@ data_filter.data.frame <- function(x, ...) { #' @export data_filter.grouped_df <- function(x, ...) { - # works only for dplyr >= 0.8.0 grps <- attr(x, "groups", exact = TRUE) grps <- grps[[".rows"]] - dots <- match.call(expand.dots = FALSE)$`...` + dots <- match.call(expand.dots = FALSE)[["..."]] out <- lapply(grps, function(grp) { - args <- list(x[grp, ]) - args <- c(args, dots) - do.call("data_filter.data.frame", args) + arguments <- list(x[grp, ]) + arguments <- c(arguments, dots) + do.call("data_filter.data.frame", arguments) }) out <- do.call(rbind, out) diff --git a/R/data_merge.R b/R/data_merge.R index c8a90cb0e..58be12f8a 100644 --- a/R/data_merge.R +++ b/R/data_merge.R @@ -274,13 +274,13 @@ data_merge.data.frame <- function(x, y, join = "left", by = NULL, id = NULL, ver all_columns <- union(colnames(x), colnames(y)) out <- switch(join, - "full" = merge(x, y, all = TRUE, sort = FALSE, by = by), - "left" = merge(x, y, all.x = TRUE, sort = FALSE, by = by), - "right" = merge(x, y, all.y = TRUE, sort = FALSE, by = by), - "inner" = merge(x, y, sort = FALSE, by = by), - "semi" = x[x[[by]] %in% y[[by]], , drop = FALSE], - "anti" = x[!x[[by]] %in% y[[by]], , drop = FALSE], - "bind" = .bind_data_frames(x, y) + full = merge(x, y, all = TRUE, sort = FALSE, by = by), + left = merge(x, y, all.x = TRUE, sort = FALSE, by = by), + right = merge(x, y, all.y = TRUE, sort = FALSE, by = by), + inner = merge(x, y, sort = FALSE, by = by), + semi = x[x[[by]] %in% y[[by]], , drop = FALSE], + anti = x[!x[[by]] %in% y[[by]], , drop = FALSE], + bind = .bind_data_frames(x, y) ) diff --git a/R/data_modify.R b/R/data_modify.R index 9fa9a3b9a..40a186736 100644 --- a/R/data_modify.R +++ b/R/data_modify.R @@ -23,8 +23,21 @@ #' - Using `NULL` as right-hand side removes a variable from the data frame. #' Example: `Petal.Width = NULL`. #' -#' Note that newly created variables can be used in subsequent expressions. -#' See also 'Examples'. +#' Note that newly created variables can be used in subsequent expressions, +#' including `.at` or `.if`. See also 'Examples'. +#' +#' @param .at A character vector of variable names that should be modified. This +#' argument is used in combination with the `.modify` argument. Note that only one +#' of `.at` or `.if` can be provided, but not both at the same time. Newly created +#' variables in `...` can also be selected, see 'Examples'. +#' @param .if A function that returns `TRUE` for columns in the data frame where +#' `.if` applies. This argument is used in combination with the `.modify` argument. +#' Note that only one of `.at` or `.if` can be provided, but not both at the same +#' time. Newly created variables in `...` can also be selected, see 'Examples'. +#' @param .modify A function that modifies the variables defined in `.at` or `.if`. +#' This argument is used in combination with either the `.at` or the `.if` argument. +#' Note that the modified variable (i.e. the result from `.modify`) must be either +#' of length 1 or of same length as the input variable. #' #' @note `data_modify()` can also be used inside functions. However, it is #' recommended to pass the recode-expression as character vector or list of @@ -91,6 +104,32 @@ #' #' new_exp <- c("SW_double = 2 * Sepal.Width", "SW_fraction = SW_double / 10") #' foo(iris, new_exp) +#' +#' # modify at specific positions or if condition is met +#' d <- iris[1:5, ] +#' data_modify(d, .at = "Species", .modify = as.numeric) +#' data_modify(d, .if = is.factor, .modify = as.numeric) +#' +#' # can be combined with dots +#' data_modify(d, new_length = Petal.Length * 2, .at = "Species", .modify = as.numeric) +#' +#' # new variables used in `.at` or `.if` +#' data_modify( +#' d, +#' new_length = Petal.Length * 2, +#' .at = c("Petal.Length", "new_length"), +#' .modify = round +#' ) +#' +#' # combine "data_find()" and ".at" argument +#' out <- data_modify( +#' d, +#' .at = data_find(d, select = starts_with("Sepal")), +#' .modify = as.factor +#' ) +#' # "Sepal.Length" and "Sepal.Width" are now factors +#' str(out) +#' #' @export data_modify <- function(data, ...) { UseMethod("data_modify") @@ -101,114 +140,121 @@ data_modify.default <- function(data, ...) { insight::format_error("`data` must be a data frame.") } +#' @rdname data_modify #' @export -data_modify.data.frame <- function(data, ...) { +data_modify.data.frame <- function(data, ..., .if = NULL, .at = NULL, .modify = NULL) { dots <- eval(substitute(alist(...))) - column_names <- colnames(data) - # we check for character vector of expressions, in which case - # "dots" should be unnamed - if (is.null(names(dots))) { - # if we have multiple strings, concatenate them to a character vector - # and put it into a list... - if (length(dots) > 1) { - if (all(vapply(dots, is.character, logical(1)))) { - dots <- list(unlist(dots)) - } else { - insight::format_error("You cannot mix string and literal representation of expressions.") + # check if we have dots, or only at/modify ---- + + if (length(dots)) { + # we check for character vector of expressions, in which case + # "dots" should be unnamed + if (is.null(names(dots))) { + # if we have multiple strings, concatenate them to a character vector + # and put it into a list... + if (length(dots) > 1) { + if (all(vapply(dots, is.character, logical(1)))) { + dots <- list(unlist(dots)) + } else { + insight::format_error("You cannot mix string and literal representation of expressions.") + } + } + # expression is given as character string, e.g. + # a <- "double_SepWidth = 2 * Sepal.Width" + # data_modify(iris, a) + # or as character vector, e.g. + # data_modify(iris, c("var_a = Sepal.Width / 10", "var_b = Sepal.Width * 10")) + character_symbol <- tryCatch(.dynEval(dots[[1]]), error = function(e) NULL) + # do we have a character vector? Then we can proceed + if (is.character(character_symbol)) { + dots <- lapply(character_symbol, function(s) { + # turn value from character vector into expression + str2lang(.dynEval(s)) + }) + names(dots) <- vapply(dots, function(n) insight::safe_deparse(n[[2]]), character(1)) } } - # expression is given as character string, e.g. - # a <- "double_SepWidth = 2 * Sepal.Width" - # data_modify(iris, a) - # or as character vector, e.g. - # data_modify(iris, c("var_a = Sepal.Width / 10", "var_b = Sepal.Width * 10")) - character_symbol <- tryCatch(.dynEval(dots[[1]]), error = function(e) NULL) - # do we have a character vector? Then we can proceed - if (is.character(character_symbol)) { - dots <- lapply(character_symbol, function(s) { - # turn value from character vector into expression - str2lang(.dynEval(s)) - }) - names(dots) <- vapply(dots, function(n) insight::safe_deparse(n[[2]]), character(1)) - } - } - for (i in seq_along(dots)) { - # iterate expressions for new variables - symbol <- dots[[i]] + for (i in seq_along(dots)) { + # iterate expressions for new variables + symbol <- dots[[i]] - # expression is given as character string in a variable, but named, e.g. - # a <- "2 * Sepal.Width" - # data_modify(iris, double_SepWidth = a) - # we reconstruct the symbol as if it were provided as literal expression. - # However, we need to check that we don't have a character vector, - # like: data_modify(iris, new_var = "a") - # this one should be recycled instead. - if (!is.character(symbol)) { - eval_symbol <- .dynEval(symbol, ifnotfound = NULL) - if (is.character(eval_symbol)) { - symbol <- try(str2lang(paste0(names(dots)[i], " = ", eval_symbol)), silent = TRUE) - # we may have the edge-case of having a function that returns a character - # vector, like "new_var = sample(letters[1:3])". In this case, "eval_symbol" - # is of type character, but no symbol, thus str2lang() above creates a - # wrong pattern. We then take "eval_symbol" as character input. - if (inherits(symbol, "try-error")) { - symbol <- str2lang(paste0( - names(dots)[i], - " = c(", paste0("\"", eval_symbol, "\"", collapse = ","), ")" - )) + # expression is given as character string in a variable, but named, e.g. + # a <- "2 * Sepal.Width" + # data_modify(iris, double_SepWidth = a) + # we reconstruct the symbol as if it were provided as literal expression. + # However, we need to check that we don't have a character vector, + # like: data_modify(iris, new_var = "a") + # this one should be recycled instead. + if (!is.character(symbol)) { + eval_symbol <- .dynEval(symbol, ifnotfound = NULL) + if (is.character(eval_symbol)) { + symbol <- try(str2lang(paste0(names(dots)[i], " = ", eval_symbol)), silent = TRUE) + # we may have the edge-case of having a function that returns a character + # vector, like "new_var = sample(letters[1:3])". In this case, "eval_symbol" + # is of type character, but no symbol, thus str2lang() above creates a + # wrong pattern. We then take "eval_symbol" as character input. + if (inherits(symbol, "try-error")) { + symbol <- str2lang(paste0( + names(dots)[i], + " = c(", paste0("\"", eval_symbol, "\"", collapse = ","), ")" + )) + } } } - } - # finally, we can evaluate expression and get values for new variables - new_variable <- try(with(data, eval(symbol)), silent = TRUE) + # finally, we can evaluate expression and get values for new variables + new_variable <- try(with(data, eval(symbol)), silent = TRUE) - # successful, or any errors, like misspelled variable name? - if (inherits(new_variable, "try-error")) { - # in which step did error happen? - step_number <- switch(as.character(i), - "1" = "the first expression", - "2" = "the second expression", - "3" = "the third expression", - paste("expression", i) - ) - step_msg <- paste0("There was an error in ", step_number, ".") - # try to find out which variable was the cause for the error - error_msg <- attributes(new_variable)$condition$message - if (grepl("object '(.*)' not found", error_msg)) { - error_var <- gsub("object '(.*)' not found", "\\1", error_msg) + # successful, or any errors, like misspelled variable name? + if (inherits(new_variable, "try-error")) { + # in which step did error happen? + step_number <- switch(as.character(i), + "1" = "the first expression", + "2" = "the second expression", + "3" = "the third expression", + paste("expression", i) + ) + step_msg <- paste0("There was an error in ", step_number, ".") + # try to find out which variable was the cause for the error + error_msg <- attributes(new_variable)$condition$message + if (grepl("object '(.*)' not found", error_msg)) { + error_var <- gsub("object '(.*)' not found", "\\1", error_msg) + insight::format_error( + paste0(step_msg, " Variable \"", error_var, "\" was not found in the dataset or in the environment."), + .misspelled_string(colnames(data), error_var, "Possibly misspelled or not yet defined?") + ) + } else { + insight::format_error(paste0( + step_msg, " ", insight::format_capitalize(error_msg), + ". Possibly misspelled or not yet defined?" + )) + } + } + + # give informative error when new variable doesn't match number of rows + if (!is.null(new_variable) && length(new_variable) != nrow(data) && (nrow(data) %% length(new_variable)) != 0) { insight::format_error( - paste0(step_msg, " Variable \"", error_var, "\" was not found in the dataset or in the environment."), - .misspelled_string(colnames(data), error_var, "Possibly misspelled or not yet defined?") + "New variable has not the same length as the other variables in the data frame and cannot be recycled." ) - } else { - insight::format_error(paste0( - step_msg, " ", insight::format_capitalize(error_msg), - ". Possibly misspelled or not yet defined?" - )) } - } - # give informative error when new variable doesn't match number of rows - if (!is.null(new_variable) && length(new_variable) != nrow(data) && (nrow(data) %% length(new_variable)) != 0) { - insight::format_error( - "New variable has not the same length as the other variables in the data frame and cannot be recycled." - ) + data[[names(dots)[i]]] <- new_variable } - - data[[names(dots)[i]]] <- new_variable } + # check if we have at/modify ---- + data <- .modify_at(data, .at, .if, .modify) + data } #' @export -data_modify.grouped_df <- function(data, ...) { +data_modify.grouped_df <- function(data, ..., .if = NULL, .at = NULL, .modify = NULL) { # we need to evaluate dots here, and pass them with "do.call" to # the data.frame method later... - dots <- match.call(expand.dots = FALSE)$`...` + dots <- match.call(expand.dots = FALSE)[["..."]] # works only for dplyr >= 0.8.0 grps <- attr(data, "groups", exact = TRUE) @@ -262,8 +308,70 @@ data_modify.grouped_df <- function(data, ...) { data[rows, ] <- data_modify.data.frame(data[rows, ], ...) } + # check if we have at/modify ---- + data <- .modify_at(data, .at, .if, .modify) + # set back attributes and class data <- .replace_attrs(data, attr_data) class(data) <- class_attr data } + + +# helper ------------- + +.modify_at <- function(data, .at, .if, .modify) { + # check if ".at" or ".if" is defined, but not ".modify" + if (is.null(.modify)) { + if (!is.null(.at) || !is.null(.if)) { + insight::format_error("You need to specify `.modify` when using `.at` or `.if`.") + } + return(data) + } + # make sure "modify" is a function + if (!is.function(.modify)) { + insight::format_error("`.modify` must be a function.") + } + # make sure either .at or .if is defined, not both + if (!is.null(.at) && !is.null(.if)) { + insight::format_error("You cannot use both `.at` and `.if` at the same time.") + } + # make sure at least one of .at or .if is defined + if (is.null(.at) && is.null(.if)) { + insight::format_error("You need to specify either `.at` or `.if`.") + } + + column_names <- colnames(data) + + # if we have ".if" defined, specify ".at" + if (!is.null(.if)) { + .at <- column_names[vapply(data, .if, logical(1))] + } + # check for valid defined column names + if (!all(.at %in% column_names)) { + not_found <- .at[!.at %in% column_names] + insight::format_error( + paste0( + "Variable", + ifelse(length(not_found) > 1, "s ", " "), + text_concatenate(not_found, enclose = "\""), + ifelse(length(not_found) > 1, " were", " was"), + " not found in the dataset." + ), + .misspelled_string(column_names, not_found, "Possibly misspelled or not yet defined?") + ) + } + for (i in .at) { + result <- tryCatch(.modify(data[[i]]), warning = function(e) e, error = function(e) e) + if (inherits(result, c("error", "warning"))) { + insight::format_error( + paste0("Error in modifying variable \"", i, "\": ", result$message), + "Please check if you correctly specified the `.modify` function." + ) + } else { + data[[i]] <- result + } + } + + data +} diff --git a/R/data_partition.R b/R/data_partition.R index ce4bc1d38..60eb57e35 100644 --- a/R/data_partition.R +++ b/R/data_partition.R @@ -52,7 +52,7 @@ data_partition <- function(data, verbose = TRUE, training_proportion = proportion, ...) { - # Sanity checks + # validation checks data <- .coerce_to_dataframe(data) if (sum(proportion) > 1) { diff --git a/R/data_read.R b/R/data_read.R index 2c3061570..b02b7ca87 100644 --- a/R/data_read.R +++ b/R/data_read.R @@ -97,14 +97,14 @@ data_read <- function(path, # read data out <- switch(file_type, - "txt" = , - "csv" = .read_text(path, encoding, verbose, ...), - "xls" = , - "xlsx" = .read_excel(path, encoding, verbose, ...), - "sav" = , - "por" = .read_spss(path, encoding, convert_factors, verbose, ...), - "dta" = .read_stata(path, encoding, convert_factors, verbose, ...), - "sas7bdat" = .read_sas(path, path_catalog, encoding, convert_factors, verbose, ...), + txt = , + csv = .read_text(path, encoding, verbose, ...), + xls = , + xlsx = .read_excel(path, encoding, verbose, ...), + sav = , + por = .read_spss(path, encoding, convert_factors, verbose, ...), + dta = .read_stata(path, encoding, convert_factors, verbose, ...), + sas7bdat = .read_sas(path, path_catalog, encoding, convert_factors, verbose, ...), .read_unknown(path, convert_factors, verbose, ...) ) @@ -175,7 +175,10 @@ data_read <- function(path, value_labels <- value_labels[value_labels %in% unique(i)] # guess variable type - if (!is.character(i)) { + if (is.character(i)) { + # we need this to drop haven-specific class attributes + i <- as.character(i) + } else { # if all values are labelled, we assume factor. Use labels as levels if (!is.null(value_labels) && length(value_labels) == insight::n_unique(i)) { if (is.numeric(i)) { @@ -189,9 +192,6 @@ data_read <- function(path, # else, fall back to numeric i <- as.numeric(i) } - } else { - # we need this to drop haven-specific class attributes - i <- as.character(i) } # drop unused value labels @@ -302,9 +302,7 @@ data_read <- function(path, # not. Else, tell user. if (!is.data.frame(out)) { tmp <- tryCatch(as.data.frame(out, stringsAsFactors = FALSE), error = function(e) NULL) - if (!is.null(tmp)) { - out <- tmp - } else { + if (is.null(tmp)) { if (verbose) { insight::format_warning( paste0("Imported file is no data frame, but of class \"", class(out)[1], "\"."), @@ -312,6 +310,8 @@ data_read <- function(path, ) } return(out) + } else { + out <- tmp } } diff --git a/R/data_rescale.R b/R/data_rescale.R index ce5059160..85ff885c6 100644 --- a/R/data_rescale.R +++ b/R/data_rescale.R @@ -1,15 +1,26 @@ #' Rescale Variables to a New Range #' -#' Rescale variables to a new range. -#' Can also be used to reverse-score variables (change the keying/scoring direction). +#' Rescale variables to a new range. Can also be used to reverse-score variables +#' (change the keying/scoring direction), or to expand a range. #' #' @inheritParams categorize #' @inheritParams find_columns #' @inheritParams standardize.data.frame #' -#' @param to Numeric vector of length 2 giving the new range that the variable will have after rescaling. -#' To reverse-score a variable, the range should be given with the maximum value first. -#' See examples. +#' @param to Numeric vector of length 2 giving the new range that the variable +#' will have after rescaling. To reverse-score a variable, the range should +#' be given with the maximum value first. See examples. +#' @param multiply If not `NULL`, `to` is ignored and `multiply` will be used, +#' giving the factor by which the actual range of `x` should be expanded. +#' For example, if a vector ranges from 5 to 15 and `multiply = 1.1`, the current +#' range of 10 will be expanded by the factor of 1.1, giving a new range of +#' 11. Thus, the rescaled vector would range from 4.5 to 15.5. +#' @param add A vector of length 1 or 2. If not `NULL`, `to` is ignored and `add` +#' will be used, giving the amount by which the minimum and maximum of the +#' actual range of `x` should be expanded. For example, if a vector ranges from +#' 5 to 15 and `add = 1`, the range will be expanded from 4 to 16. If `add` is +#' of length 2, then the first value is used for the lower bound and the second +#' value for the upper bound. #' @param range Initial (old) range of values. If `NULL`, will take the range of #' the input vector (`range(x)`). #' @param ... Arguments passed to or from other methods. @@ -37,6 +48,21 @@ #' "Sepal.Length" = c(0, 1), #' "Petal.Length" = c(-1, 0) #' ))) +#' +#' # "expand" ranges by a factor or a given value +#' x <- 5:15 +#' x +#' # both will expand the range by 10% +#' rescale(x, multiply = 1.1) +#' rescale(x, add = 0.5) +#' +#' # expand range by different values +#' rescale(x, add = c(1, 3)) +#' +#' # Specify list of multipliers +#' d <- data.frame(x = 5:15, y = 5:15) +#' rescale(d, multiply = list(x = 1.1, y = 0.5)) +#' #' @inherit data_rename #' #' @return A rescaled object. @@ -75,6 +101,8 @@ rescale.default <- function(x, verbose = TRUE, ...) { #' @export rescale.numeric <- function(x, to = c(0, 100), + multiply = NULL, + add = NULL, range = NULL, verbose = TRUE, ...) { @@ -91,6 +119,9 @@ rescale.numeric <- function(x, range <- c(min(x, na.rm = TRUE), max(x, na.rm = TRUE)) } + # check if user specified "multiply" or "add", and then update "to" + to <- .update_to(x, to, multiply, add) + # called from "makepredictcal()"? Then we have additional arguments dot_args <- list(...) required_dot_args <- c("min_value", "max_value", "new_min", "new_max") @@ -144,6 +175,8 @@ rescale.grouped_df <- function(x, select = NULL, exclude = NULL, to = c(0, 100), + multiply = NULL, + add = NULL, range = NULL, append = FALSE, ignore_case = FALSE, @@ -188,6 +221,8 @@ rescale.grouped_df <- function(x, select = select, exclude = exclude, to = to, + multiply = multiply, + add = add, range = range, append = FALSE, # need to set to FALSE here, else variable will be doubled add_transform_class = FALSE, @@ -207,6 +242,8 @@ rescale.data.frame <- function(x, select = NULL, exclude = NULL, to = c(0, 100), + multiply = NULL, + add = NULL, range = NULL, append = FALSE, ignore_case = FALSE, @@ -245,9 +282,61 @@ rescale.data.frame <- function(x, if (!is.list(to)) { to <- stats::setNames(rep(list(to), length(select)), select) } + # Transform the 'multiply' so that it is a list now + if (!is.null(multiply) && !is.list(multiply)) { + multiply <- stats::setNames(rep(list(multiply), length(select)), select) + } + # Transform the 'add' so that it is a list now + if (!is.null(add) && !is.list(add)) { + add <- stats::setNames(rep(list(add), length(select)), select) + } + # update "to" if user specified "multiply" or "add" + to[] <- lapply(names(to), function(i) { + .update_to(x[[i]], to[[i]], multiply[[i]], add[[i]]) + }) x[select] <- as.data.frame(sapply(select, function(n) { rescale(x[[n]], to = to[[n]], range = range[[n]], add_transform_class = FALSE) }, simplify = FALSE)) x } + + +# helper ---------------------------------------------------------------------- + +# expand the new target range by multiplying or adding +.update_to <- function(x, to, multiply, add) { + # check if user specified "multiply" or "add", and if not, return "to" + if (is.null(multiply) && is.null(add)) { + return(to) + } + # only one of "multiply" or "add" can be specified + if (!is.null(multiply) && !is.null(add)) { + insight::format_error("Only one of `multiply` or `add` can be specified.") + } + # multiply? If yes, calculate the "add" value + if (!is.null(multiply)) { + # check for correct length + if (length(multiply) > 1) { + insight::format_error("The length of `multiply` must be 1.") + } + add <- (diff(range(x, na.rm = TRUE)) * (multiply - 1)) / 2 + } + # add? + if (!is.null(add)) { + # add must be of length 1 or 2 + if (length(add) > 2) { + insight::format_error("The length of `add` must be 1 or 2.") + } + # if add is of length 2, then the first value is used for the lower bound + # and the second value for the upper bound + if (length(add) == 2) { + add_low <- add[1] + add_high <- add[2] + } else { + add_low <- add_high <- add + } + to <- c(min(x, na.rm = TRUE) - add_low, max(x, na.rm = TRUE) + add_high) + } + to +} diff --git a/R/data_reverse.R b/R/data_reverse.R index b9615417e..2fc9ef493 100644 --- a/R/data_reverse.R +++ b/R/data_reverse.R @@ -93,24 +93,24 @@ reverse.numeric <- function(x, } # old minimum and maximum - min <- min(range) - max <- max(range) + min_value <- min(range) + max_value <- max(range) # check if a valid range (i.e. vector of length 2) is provided if (length(range) > 2) { insight::format_error( "`range` must be a numeric vector of length two, indicating lowest and highest value of the required range.", - sprintf("Did you want to provide `range = c(%g, %g)`?", min, max) + sprintf("Did you want to provide `range = c(%g, %g)`?", min_value, max_value) ) } - new_min <- max - new_max <- min + new_min <- max_value + new_max <- min_value - out <- as.vector((new_max - new_min) / (max - min) * (x - min) + new_min) + out <- as.vector((new_max - new_min) / (max_value - min_value) * (x - min_value) + new_min) # labelled data? - out <- .set_back_labels(out, x) + out <- .set_back_labels(out, x, reverse_values = TRUE) out } @@ -134,7 +134,9 @@ reverse.factor <- function(x, range = NULL, verbose = TRUE, ...) { # save for later use original_x <- x - if (!is.null(range)) { + if (is.null(range)) { + old_levels <- levels(x) + } else { # no missing values allowed if (anyNA(range)) { insight::format_error("`range` is not allowed to have missing values.") @@ -180,8 +182,6 @@ reverse.factor <- function(x, range = NULL, verbose = TRUE, ...) { } old_levels <- range x <- factor(x, levels = range) - } else { - old_levels <- levels(x) } int_x <- as.integer(x) @@ -189,7 +189,7 @@ reverse.factor <- function(x, range = NULL, verbose = TRUE, ...) { x <- factor(rev_x, levels = seq_len(length(old_levels)), labels = old_levels) # labelled data? - x <- .set_back_labels(x, original_x) + x <- .set_back_labels(x, original_x, reverse_values = TRUE) x } @@ -225,7 +225,7 @@ reverse.grouped_df <- function(x, # create the new variables and updates "select", so new variables are processed if (!isFALSE(append)) { # process arguments - args <- .process_append( + arguments <- .process_append( x, select, append, @@ -233,8 +233,8 @@ reverse.grouped_df <- function(x, preserve_value_labels = TRUE ) # update processed arguments - x <- args$x - select <- args$select + x <- arguments$x + select <- arguments$select } x <- as.data.frame(x) @@ -279,7 +279,7 @@ reverse.data.frame <- function(x, # create the new variables and updates "select", so new variables are processed if (!isFALSE(append)) { # process arguments - args <- .process_append( + arguments <- .process_append( x, select, append, @@ -287,8 +287,8 @@ reverse.data.frame <- function(x, preserve_value_labels = TRUE ) # update processed arguments - x <- args$x - select <- args$select + x <- arguments$x + select <- arguments$select } # Transform the range so that it is a list now diff --git a/R/data_rotate.R b/R/data_rotate.R index f9a3389d3..2bb39a85d 100644 --- a/R/data_rotate.R +++ b/R/data_rotate.R @@ -56,7 +56,7 @@ data_rotate <- function(data, rownames = NULL, colnames = FALSE, verbose = TRUE) # warning after possible removal of columns if (verbose && insight::n_unique(vapply(data, typeof, FUN.VALUE = character(1L))) > 1L) { - insight::format_warning("Your data frame contains mixed types of data. After transposition, all variables will be transformed into characters.") + insight::format_warning("Your data frame contains mixed types of data. After transposition, all variables will be transformed into characters.") # nolint } # rotate data frame by 90 degrees diff --git a/R/data_seek.R b/R/data_seek.R index c0a56ab8f..17463878d 100644 --- a/R/data_seek.R +++ b/R/data_seek.R @@ -63,7 +63,7 @@ data_seek <- function(data, pattern, seek = c("names", "labels"), fuzzy = FALSE) # check valid args seek <- intersect(seek, c("names", "labels", "values", "levels", "column_names", "columns", "all")) if (is.null(seek) || !length(seek)) { - insight::format_error("`seek` must be one of \"names\", \"labels\", \"values\", a combination of these options, or \"all\".") + insight::format_error("`seek` must be one of \"names\", \"labels\", \"values\", a combination of these options, or \"all\".") # nolint } pos1 <- pos2 <- pos3 <- NULL @@ -71,7 +71,7 @@ data_seek <- function(data, pattern, seek = c("names", "labels"), fuzzy = FALSE) pos <- unlist(lapply(pattern, function(search_pattern) { # search in variable names? if (any(seek %in% c("names", "columns", "column_names", "all"))) { - pos1 <- which(grepl(search_pattern, colnames(data))) + pos1 <- grep(search_pattern, colnames(data)) # find in near distance? if (fuzzy) { pos1 <- c(pos1, .fuzzy_grep(x = colnames(data), pattern = search_pattern)) @@ -80,15 +80,15 @@ data_seek <- function(data, pattern, seek = c("names", "labels"), fuzzy = FALSE) # search in variable labels? if (any(seek %in% c("labels", "all"))) { - labels <- insight::compact_character(unlist(lapply(data, attr, which = "label", exact = TRUE))) - if (!is.null(labels) && length(labels)) { - found <- grepl(search_pattern, labels) - pos2 <- match(names(labels)[found], colnames(data)) + var_labels <- insight::compact_character(unlist(lapply(data, attr, which = "label", exact = TRUE))) + if (!is.null(var_labels) && length(var_labels)) { + found <- grepl(search_pattern, var_labels) + pos2 <- match(names(var_labels)[found], colnames(data)) # find in near distanc? if (fuzzy) { - found <- .fuzzy_grep(x = labels, pattern = search_pattern) + found <- .fuzzy_grep(x = var_labels, pattern = search_pattern) if (length(found)) { - pos2 <- c(pos2, match(names(labels)[found], colnames(data))) + pos2 <- c(pos2, match(names(var_labels)[found], colnames(data))) } } } @@ -129,7 +129,7 @@ data_seek <- function(data, pattern, seek = c("names", "labels"), fuzzy = FALSE) pos <- unique(pos) # variable labels of matching variables - labels <- vapply( + var_labels <- vapply( colnames(data[pos]), function(i) { l <- attr(data[[i]], "label", exact = TRUE) @@ -145,7 +145,7 @@ data_seek <- function(data, pattern, seek = c("names", "labels"), fuzzy = FALSE) out <- data.frame( index = pos, column = colnames(data)[pos], - labels = labels, + labels = var_labels, stringsAsFactors = FALSE ) # no row names diff --git a/R/data_separate.R b/R/data_separate.R index eb6d571f1..53243fb33 100644 --- a/R/data_separate.R +++ b/R/data_separate.R @@ -229,9 +229,9 @@ data_separate <- function(data, l <- l[!vapply(l, function(i) all(is.na(i)), TRUE)] # define number of new columns, based on user-choice n_cols <- switch(guess_columns, - "min" = min(l, na.rm = TRUE), - "max" = max(l, na.rm = TRUE), - "mode" = distribution_mode(l), + min = min(l, na.rm = TRUE), + max = max(l, na.rm = TRUE), + mode = distribution_mode(l), ) # tell user if (verbose && insight::n_unique(l) != 1 && !is.numeric(separator)) { @@ -254,7 +254,7 @@ data_separate <- function(data, # catch error if (is.null(separated_columns)) { insight::format_error( - "Something went wrong. Probably the number of provided column names did not match number of newly created columns?" + "Something went wrong. Probably the number of provided column names did not match number of newly created columns?" # nolint ) } @@ -264,14 +264,12 @@ data_separate <- function(data, # if no column names provided, use standard names if (is.null(new_columns[[sep_column]])) { new_column_names <- paste0(sep_column, "_", seq_along(out)) - } else { + } else if (make_unique_colnames) { # if we have multiple columns that were separated, we avoid duplicated # column names of created variables by appending name of original column - if (make_unique_colnames) { - new_column_names <- paste0(sep_column, "_", new_columns[[sep_column]]) - } else { - new_column_names <- new_columns[[sep_column]] - } + new_column_names <- paste0(sep_column, "_", new_columns[[sep_column]]) + } else { + new_column_names <- new_columns[[sep_column]] } colnames(out) <- new_column_names @@ -338,29 +336,27 @@ data_separate <- function(data, out <- rep(NA_character_, times = n_cols) } else if (n_values > n_cols) { # we have more values than required - drop extra columns - if (extra == "drop_left") { - out <- i[(n_values - n_cols + 1):n_values] - } else if (extra == "drop_right") { - out <- i[1:n_cols] - } else if (extra == "merge_left") { - out <- paste(i[1:(n_values - n_cols + 1)], collapse = " ") - out <- c(out, i[(n_values - n_cols + 2):n_values]) - } else { - out <- i[1:(n_cols - 1)] - out <- c(out, paste(i[n_cols:n_values], collapse = " ")) - } + out <- switch(extra, + drop_left = i[(n_values - n_cols + 1):n_values], + drop_right = i[1:n_cols], + merge_left = { + tmp <- paste(i[1:(n_values - n_cols + 1)], collapse = " ") + c(tmp, i[(n_values - n_cols + 2):n_values]) + }, + { + tmp <- i[1:(n_cols - 1)] + c(tmp, paste(i[n_cols:n_values], collapse = " ")) + } + ) warn_extra <- TRUE } else if (n_values < n_cols) { # we have fewer values than required - fill columns - if (fill == "left") { - out <- c(rep(NA_character_, times = n_cols - n_values), i) - } else if (fill == "right") { - out <- c(i, rep(NA_character_, times = n_cols - n_values)) - } else if (fill == "value_left") { - out <- c(rep(i[1], times = n_cols - n_values), i) - } else { - out <- c(i, rep(i[length(i)], times = n_cols - n_values)) - } + out <- switch(fill, + left = c(rep(NA_character_, times = n_cols - n_values), i), + right = c(i, rep(NA_character_, times = n_cols - n_values)), + value_left = c(rep(i[1], times = n_cols - n_values), i), + c(i, rep(i[length(i)], times = n_cols - n_values)) + ) warn_fill <- TRUE } else { out <- i @@ -374,10 +370,10 @@ data_separate <- function(data, "`", sep_column, "`", " returned more columns than expected after splitting. ", switch(extra, - "drop_left" = "Left-most columns have been dropped.", - "drop_right" = "Right-most columns have been dropped.", - "merge_left" = "Left-most columns have been merged together.", - "merge_right" = "Right-most columns have been merged together." + drop_left = "Left-most columns have been dropped.", + drop_right = "Right-most columns have been dropped.", + merge_left = "Left-most columns have been merged together.", + merge_right = "Right-most columns have been merged together." ) )) } @@ -386,10 +382,10 @@ data_separate <- function(data, "`", sep_column, "`", "returned fewer columns than expected after splitting. ", switch(fill, - "left" = "Left-most columns were filled with `NA`.", - "right" = "Right-most columns were filled with `NA`.", - "value_left" = "Left-most columns were filled with first value.", - "value_right" = "Right-most columns were filled with last value." + left = "Left-most columns were filled with `NA`.", + right = "Right-most columns were filled with `NA`.", + value_left = "Left-most columns were filled with first value.", + value_right = "Right-most columns were filled with last value." ) )) } diff --git a/R/data_tabulate.R b/R/data_tabulate.R index 71b694eb8..04f205ec8 100644 --- a/R/data_tabulate.R +++ b/R/data_tabulate.R @@ -1,30 +1,62 @@ -#' @title Create frequency tables of variables +#' @title Create frequency and crosstables of variables #' @name data_tabulate #' -#' @description This function creates frequency tables of variables, including -#' the number of levels/values as well as the distribution of raw, valid and -#' cumulative percentages. +#' @description This function creates frequency or crosstables of variables, +#' including the number of levels/values as well as the distribution of raw, +#' valid and cumulative percentages. For crosstables, row, column and cell +#' percentages can be calculated. #' #' @param x A (grouped) data frame, a vector or factor. -#' @param drop_levels Logical, if `TRUE`, factor levels that do not occur in +#' @param by Optional vector or factor. If supplied, a crosstable is created. +#' If `x` is a data frame, `by` can also be a character string indicating the +#' name of a variable in `x`. +#' @param drop_levels Logical, if `FALSE`, factor levels that do not occur in #' the data are included in the table (with frequency of zero), else unused #' factor levels are dropped from the frequency table. #' @param name Optional character string, which includes the name that is used #' for printing. +#' @param include_na Logical, if `TRUE`, missing values are included in the +#' frequency or crosstable, else missing values are omitted. #' @param collapse Logical, if `TRUE` collapses multiple tables into one larger #' table for printing. This affects only printing, not the returned object. +#' @param weights Optional numeric vector of weights. Must be of the same length +#' as `x`. If `weights` is supplied, weighted frequencies are calculated. +#' @param proportions Optional character string, indicating the type of +#' percentages to be calculated. Only applies to crosstables, i.e. when `by` is +#' not `NULL`. Can be `"row"` (row percentages), `"column"` (column percentages) +#' or `"full"` (to calculate relative frequencies for the full table). #' @param ... not used. #' @inheritParams find_columns #' +#' @section Crosstables: +#' If `by` is supplied, a crosstable is created. The crosstable includes `` +#' (missing) values by default. The first column indicates values of `x`, the +#' first row indicates values of `by` (including missing values). The last row +#' and column contain the total frequencies for each row and column, respectively. +#' Setting `include_na = FALSE` will omit missing values from the crosstable. +#' Setting `proportions` to `"row"` or `"column"` will add row or column +#' percentages. Setting `proportions` to `"full"` will add relative frequencies +#' for the full table. +#' +#' @note +#' There are `print_html()` and `print_md()` methods available for printing +#' frequency or crosstables in HTML and markdown format, e.g. +#' `print_html(data_tabulate(x))`. +#' #' @return A data frame, or a list of data frames, with one frequency table #' as data frame per variable. #' #' @examplesIf requireNamespace("poorman") +#' # frequency tables ------- +#' # ------------------------ #' data(efc) #' #' # vector/factor #' data_tabulate(efc$c172code) #' +#' # drop missing values +#' data_tabulate(efc$c172code, include_na = FALSE) +#' #' # data frame #' data_tabulate(efc, c("e42dep", "c172code")) #' @@ -46,6 +78,36 @@ #' #' # to remove the big mark, use "print(..., big_mark = "")" #' print(data_tabulate(x), big_mark = "") +#' +#' # weighted frequencies +#' set.seed(123) +#' efc$weights <- abs(rnorm(n = nrow(efc), mean = 1, sd = 0.5)) +#' data_tabulate(efc$e42dep, weights = efc$weights) +#' +#' # crosstables ------ +#' # ------------------ +#' +#' # add some missing values +#' set.seed(123) +#' efc$e16sex[sample.int(nrow(efc), 5)] <- NA +#' +#' data_tabulate(efc, "c172code", by = "e16sex") +#' +#' # add row and column percentages +#' data_tabulate(efc, "c172code", by = "e16sex", proportions = "row") +#' data_tabulate(efc, "c172code", by = "e16sex", proportions = "column") +#' +#' # omit missing values +#' data_tabulate( +#' efc$c172code, +#' by = efc$e16sex, +#' proportions = "column", +#' include_na = FALSE +#' ) +#' +#' # round percentages +#' out <- data_tabulate(efc, "c172code", by = "e16sex", proportions = "column") +#' print(out, digits = 0) #' @export data_tabulate <- function(x, ...) { UseMethod("data_tabulate") @@ -54,7 +116,15 @@ data_tabulate <- function(x, ...) { #' @rdname data_tabulate #' @export -data_tabulate.default <- function(x, drop_levels = FALSE, name = NULL, verbose = TRUE, ...) { +data_tabulate.default <- function(x, + by = NULL, + drop_levels = FALSE, + weights = NULL, + include_na = TRUE, + proportions = NULL, + name = NULL, + verbose = TRUE, + ...) { # save label attribute, before it gets lost... var_label <- attr(x, "label", exact = TRUE) @@ -70,8 +140,53 @@ data_tabulate.default <- function(x, drop_levels = FALSE, name = NULL, verbose = x <- droplevels(x) } + # validate "weights" + weights <- .validate_table_weights(weights, x) + + # we go into another function for crosstables here... + if (!is.null(by)) { + by <- .validate_by(by, x) + return(.crosstable( + x, + by = by, + weights = weights, + include_na = include_na, + proportions = proportions, + obj_name = obj_name, + group_variable = group_variable + )) + } + # frequency table - freq_table <- tryCatch(table(addNA(x)), error = function(e) NULL) + if (is.null(weights)) { + if (include_na) { + freq_table <- tryCatch(table(addNA(x)), error = function(e) NULL) + } else { + freq_table <- tryCatch(table(x), error = function(e) NULL) + } + } else if (include_na) { + # weighted frequency table, including NA + freq_table <- tryCatch( + stats::xtabs( + weights ~ x, + data = data.frame(weights = weights, x = addNA(x)), + na.action = stats::na.pass, + addNA = TRUE + ), + error = function(e) NULL + ) + } else { + # weighted frequency table, excluding NA + freq_table <- tryCatch( + stats::xtabs( + weights ~ x, + data = data.frame(weights = weights, x = x), + na.action = stats::na.omit, + addNA = FALSE + ), + error = function(e) NULL + ) + } if (is.null(freq_table)) { insight::format_warning(paste0("Can't compute frequency tables for objects of class `", class(x)[1], "`.")) @@ -83,8 +198,20 @@ data_tabulate.default <- function(x, drop_levels = FALSE, name = NULL, verbose = replacement = c("Value", "N") ) + # we want to round N for weighted frequencies + if (!is.null(weights)) { + out$N <- round(out$N) + } + out$`Raw %` <- 100 * out$N / sum(out$N) - out$`Valid %` <- c(100 * out$N[-nrow(out)] / sum(out$N[-nrow(out)]), NA) + # if we have missing values, we add a row with NA + if (include_na) { + out$`Valid %` <- c(100 * out$N[-nrow(out)] / sum(out$N[-nrow(out)]), NA) + valid_n <- sum(out$N[-length(out$N)], na.rm = TRUE) + } else { + out$`Valid %` <- 100 * out$N / sum(out$N) + valid_n <- sum(out$N, na.rm = TRUE) + } out$`Cumulative %` <- cumsum(out$`Valid %`) # add information about variable/group names @@ -110,9 +237,10 @@ data_tabulate.default <- function(x, drop_levels = FALSE, name = NULL, verbose = attr(out, "object") <- obj_name attr(out, "group_variable") <- group_variable attr(out, "duplicate_varnames") <- duplicated(out$Variable) + attr(out, "weights") <- weights attr(out, "total_n") <- sum(out$N, na.rm = TRUE) - attr(out, "valid_n") <- sum(out$N[-length(out$N)], na.rm = TRUE) + attr(out, "valid_n") <- valid_n class(out) <- c("dw_data_tabulate", "data.frame") @@ -127,8 +255,12 @@ data_tabulate.data.frame <- function(x, exclude = NULL, ignore_case = FALSE, regex = FALSE, - collapse = FALSE, + by = NULL, drop_levels = FALSE, + weights = NULL, + include_na = TRUE, + proportions = NULL, + collapse = FALSE, verbose = TRUE, ...) { # evaluate arguments @@ -139,12 +271,33 @@ data_tabulate.data.frame <- function(x, regex = regex, verbose = verbose ) + + # validate "by" + by <- .validate_by(by, x) + # validate "weights" + weights <- .validate_table_weights(weights, x) + out <- lapply(select, function(i) { - data_tabulate(x[[i]], drop_levels = drop_levels, name = i, verbose = verbose, ...) + data_tabulate( + x[[i]], + by = by, + proportions = proportions, + drop_levels = drop_levels, + weights = weights, + include_na = include_na, + name = i, + verbose = verbose, + ... + ) }) - class(out) <- c("dw_data_tabulates", "list") + if (is.null(by)) { + class(out) <- c("dw_data_tabulates", "list") + } else { + class(out) <- c("dw_data_xtabulates", "list") + } attr(out, "collapse") <- isTRUE(collapse) + attr(out, "is_weighted") <- !is.null(weights) out } @@ -156,9 +309,13 @@ data_tabulate.grouped_df <- function(x, exclude = NULL, ignore_case = FALSE, regex = FALSE, - verbose = TRUE, - collapse = FALSE, + by = NULL, + proportions = NULL, drop_levels = FALSE, + weights = NULL, + include_na = TRUE, + collapse = FALSE, + verbose = TRUE, ...) { # works only for dplyr >= 0.8.0 grps <- attr(x, "groups", exact = TRUE) @@ -175,14 +332,15 @@ data_tabulate.grouped_df <- function(x, ) x <- as.data.frame(x) + out <- list() for (i in seq_along(grps)) { rows <- grps[[i]] # save information about grouping factors - if (!is.null(group_variables)) { - group_variable <- group_variables[i, , drop = FALSE] - } else { + if (is.null(group_variables)) { group_variable <- NULL + } else { + group_variable <- group_variables[i, , drop = FALSE] } out <- c(out, data_tabulate( data_filter(x, rows), @@ -191,19 +349,26 @@ data_tabulate.grouped_df <- function(x, ignore_case = ignore_case, verbose = verbose, drop_levels = drop_levels, + weights = weights, + include_na = include_na, + by = by, + proportions = proportions, group_variable = group_variable, ... )) } - class(out) <- c("dw_data_tabulates", "list") + if (is.null(by)) { + class(out) <- c("dw_data_tabulates", "list") + } else { + class(out) <- c("dw_data_xtabulates", "list") + } attr(out, "collapse") <- isTRUE(collapse) + attr(out, "is_weighted") <- !is.null(weights) out } - - # methods -------------------- #' @importFrom insight print_html @@ -226,7 +391,7 @@ format.dw_data_tabulate <- function(x, format = "text", big_mark = NULL, ...) { # format data frame ftab <- insight::format_table(x, ...) ftab[] <- lapply(ftab, function(i) { - i[i == ""] <- ifelse(identical(format, "text"), "", "(NA)") + i[i == ""] <- ifelse(identical(format, "text"), "", "(NA)") # nolint i }) ftab$N <- gsub("\\.00$", "", ftab$N) @@ -249,7 +414,6 @@ format.dw_data_tabulate <- function(x, format = "text", big_mark = NULL, ...) { } - #' @export print.dw_data_tabulate <- function(x, big_mark = NULL, ...) { a <- attributes(x) @@ -270,7 +434,12 @@ print.dw_data_tabulate <- function(x, big_mark = NULL, ...) { a$valid_n <- .add_commas_in_numbers(a$valid_n, big_mark) # summary of total and valid N (we may add mean/sd as well?) - summary_line <- sprintf("# total N=%s valid N=%s\n\n", a$total_n, a$valid_n) + summary_line <- sprintf( + "# total N=%s valid N=%s%s\n\n", + a$total_n, + a$valid_n, + ifelse(is.null(a$weights), "", " (weighted)") + ) cat(insight::print_color(summary_line, "blue")) # remove information that goes into the header/footer @@ -295,7 +464,12 @@ print_html.dw_data_tabulate <- function(x, big_mark = NULL, ...) { caption <- .table_header(x, "html") # summary of total and valid N (we may add mean/sd as well?) - footer <- sprintf("total N=%i valid N=%i\n\n", a$total_n, a$valid_n) + footer <- sprintf( + "total N=%i valid N=%i%s", + a$total_n, + a$valid_n, + ifelse(is.null(a$weights), "", " (weighted)") + ) # remove information that goes into the header/footer x$Variable <- NULL @@ -320,7 +494,12 @@ print_md.dw_data_tabulate <- function(x, big_mark = NULL, ...) { caption <- .table_header(x, "markdown") # summary of total and valid N (we may add mean/sd as well?) - footer <- sprintf("total N=%i valid N=%i\n\n", a$total_n, a$valid_n) + footer <- sprintf( + "total N=%i valid N=%i%s\n\n", + a$total_n, + a$valid_n, + ifelse(is.null(a$weights), "", " (weighted)") + ) # remove information that goes into the header/footer x$Variable <- NULL @@ -339,6 +518,9 @@ print_md.dw_data_tabulate <- function(x, big_mark = NULL, ...) { #' @export print.dw_data_tabulates <- function(x, big_mark = NULL, ...) { + # check if we have weights + is_weighted <- isTRUE(attributes(x)$is_weighted) + a <- attributes(x) if (!isTRUE(a$collapse) || length(x) == 1) { for (i in seq_along(x)) { @@ -347,16 +529,20 @@ print.dw_data_tabulates <- function(x, big_mark = NULL, ...) { } } else { x <- lapply(x, function(i) { - attr <- attributes(i) + i_attr <- attributes(i) i <- format(i, format = "text", big_mark = big_mark, ...) - i$Variable[attr$duplicate_varnames] <- "" - if (!is.null(i$Group)) i$Group[attr$duplicate_varnames] <- "" + i$Variable[i_attr$duplicate_varnames] <- "" + if (!is.null(i$Group)) i$Group[i_attr$duplicate_varnames] <- "" i[nrow(i) + 1, ] <- "" i }) out <- do.call(rbind, x) - cat(insight::print_color("# Frequency Table\n\n", "blue")) + if (is_weighted) { + cat(insight::print_color("# Frequency Table (weighted)\n\n", "blue")) + } else { + cat(insight::print_color("# Frequency Table\n\n", "blue")) + } # print table cat(insight::export_table( @@ -371,13 +557,16 @@ print.dw_data_tabulates <- function(x, big_mark = NULL, ...) { #' @export print_html.dw_data_tabulates <- function(x, big_mark = NULL, ...) { + # check if we have weights + is_weighted <- isTRUE(attributes(x)$is_weighted) + if (length(x) == 1) { print_html(x[[1]], big_mark = big_mark, ...) } else { x <- lapply(x, function(i) { - attr <- attributes(i) + i_attr <- attributes(i) i <- format(i, format = "html", big_mark = big_mark, ...) - i$Variable[attr$duplicate_varnames] <- "" + i$Variable[i_attr$duplicate_varnames] <- "" i }) @@ -387,7 +576,7 @@ print_html.dw_data_tabulates <- function(x, big_mark = NULL, ...) { insight::export_table( out, missing = "", - caption = "Frequency Table", + caption = ifelse(is_weighted, "Frequency Table (weighted)", "Frequency Table"), format = "html", group_by = "Group" ) @@ -397,14 +586,17 @@ print_html.dw_data_tabulates <- function(x, big_mark = NULL, ...) { #' @export print_md.dw_data_tabulates <- function(x, big_mark = NULL, ...) { + # check if we have weights + is_weighted <- isTRUE(attributes(x)$is_weighted) + if (length(x) == 1) { print_md(x[[1]], big_mark = big_mark, ...) } else { x <- lapply(x, function(i) { - attr <- attributes(i) + i_attr <- attributes(i) i <- format(i, format = "markdown", big_mark = big_mark, ...) - i$Variable[attr$duplicate_varnames] <- "" - if (!is.null(i$Group)) i$Group[attr$duplicate_varnames] <- "" + i$Variable[i_attr$duplicate_varnames] <- "" + if (!is.null(i$Group)) i$Group[i_attr$duplicate_varnames] <- "" i[nrow(i) + 1, ] <- "" i }) @@ -417,7 +609,7 @@ print_md.dw_data_tabulates <- function(x, big_mark = NULL, ...) { missing = "(NA)", empty_line = "-", format = "markdown", - title = "Frequency Table" + title = ifelse(is_weighted, "Frequency Table (weighted)", "Frequency Table") ) } } @@ -489,14 +681,14 @@ print_md.dw_data_tabulates <- function(x, big_mark = NULL, ...) { } switch(vt, - "ord" = "ordinal", - "fct" = "categorical", - "dbl" = "numeric", - "int" = "integer", - "chr" = "character", - "lbl" = "labelled", - "cpl" = "complex", - "lgl" = "logical", + ord = "ordinal", + fct = "categorical", + dbl = "numeric", + int = "integer", + chr = "character", + lbl = "labelled", + cpl = "complex", + lgl = "logical", vt ) } diff --git a/R/data_write.R b/R/data_write.R index b8d710d2e..7e4e543d6 100644 --- a/R/data_write.R +++ b/R/data_write.R @@ -19,13 +19,13 @@ data_write <- function(data, # check file type, so we know the target dta format file_type <- .file_ext(path) type <- switch(file_type, - "txt" = , - "csv" = "csv", - "sav" = , - "por" = "spss", - "zsav" = "zspss", - "dta" = "stata", - "xpt" = "sas", + txt = , + csv = "csv", + sav = , + por = "spss", + zsav = "zspss", + dta = "stata", + xpt = "sas", "unknown" ) @@ -142,9 +142,13 @@ data_write <- function(data, # character requires special preparation to save value labels # haven:::vec_cast_named requires "x" and "labels" to be of same type if (is.character(i)) { + # only prepare value labels when these are not NULL + if (!is.null(value_labels)) { + value_labels <- stats::setNames(as.character(value_labels), names(value_labels)) + } haven::labelled( x = i, - labels = stats::setNames(as.character(value_labels), names(value_labels)), + labels = value_labels, label = variable_label ) } else { diff --git a/R/data_xtabulate.R b/R/data_xtabulate.R new file mode 100644 index 000000000..c8e622e2d --- /dev/null +++ b/R/data_xtabulate.R @@ -0,0 +1,347 @@ +# helper to compute crosstables -------------- + +.crosstable <- function(x, + by, + weights = NULL, + include_na = TRUE, + proportions = NULL, + obj_name = NULL, + group_variable = NULL) { + if (!is.null(proportions)) { + proportions <- match.arg(proportions, c("row", "column", "full")) + } + # frequency table + if (is.null(weights)) { + if (include_na) { + x_table <- tryCatch(table(addNA(x), addNA(by)), error = function(e) NULL) + } else { + x_table <- tryCatch(table(x, by), error = function(e) NULL) + } + } else if (include_na) { + # weighted frequency table, including NA + x_table <- tryCatch( + stats::xtabs( + weights ~ x + by, + data = data.frame(weights = weights, x = addNA(x), by = addNA(by)), + na.action = stats::na.pass, + addNA = TRUE + ), + error = function(e) NULL + ) + } else { + # weighted frequency table, excluding NA + x_table <- tryCatch( + stats::xtabs( + weights ~ x + by, + data = data.frame(weights = weights, x = x, by = by), + na.action = stats::na.omit, + addNA = FALSE + ), + error = function(e) NULL + ) + } + + if (is.null(x_table)) { + insight::format_warning(paste0("Can't compute cross tables for objects of class `", class(x)[1], "`.")) + return(NULL) + } + + out <- as.data.frame(stats::ftable(x_table)) + colnames(out) <- c("Value", "by", "N") + total_n <- sum(out$N, na.rm = TRUE) + + # we want to round N for weighted frequencies + if (!is.null(weights)) { + out$N <- round(out$N) + total_n <- round(total_n) + } + + out <- data_to_wide(out, values_from = "N", names_from = "by") + + # use variable name as column name + if (!is.null(obj_name)) { + colnames(out)[1] <- obj_name + } + + # for grouped data frames, add info about grouping variables + if (!is.null(group_variable)) { + var_info <- toString(lapply(colnames(group_variable), function(i) { + sprintf("%s (%s)", i, group_variable[[i]]) + })) + out <- cbind(out[1], data.frame(Group = var_info, stringsAsFactors = FALSE), out[-1]) + } + + attr(out, "total_n") <- total_n + attr(out, "weights") <- weights + attr(out, "proportions") <- proportions + + class(out) <- c("dw_data_xtabulate", "data.frame") + + out +} + + +# methods --------------------- + + +#' @export +format.dw_data_xtabulate <- function(x, format = "text", digits = 1, big_mark = NULL, ...) { + # convert to character manually, else, for large numbers, + # format_table() returns scientific notation + x <- as.data.frame(x) + + # remove group variable + x$Group <- NULL + + # compute total N for rows and colummns + total_n <- attributes(x)$total_n + total_column <- rowSums(x[, -1], na.rm = TRUE) + total_row <- c(colSums(x[, -1], na.rm = TRUE), total_n) + + # proportions? + props <- attributes(x)$proportions + + if (!is.null(props)) { + # we copy x to tmp, because when we create strings with "sprintf()", the + # variable is coerced to character, and in subsequent iterations of the loop, + # mathemathical operations are not possible anymore + tmp <- x + if (identical(props, "row")) { + for (i in seq_len(nrow(x))) { + tmp[i, -1] <- paste( + format(x[i, -1]), + format(sprintf("(%.*f%%)", digits, 100 * x[i, -1] / sum(x[i, -1], na.rm = TRUE)), justify = "right") + ) + } + } else if (identical(props, "column")) { + for (i in seq_len(ncol(x))[-1]) { + tmp[, i] <- paste( + format(x[, i]), + format(sprintf("(%.*f%%)", digits, 100 * x[, i] / sum(x[, i], na.rm = TRUE)), justify = "right") + ) + } + } else if (identical(props, "full")) { + for (i in seq_len(ncol(x))[-1]) { + tmp[, i] <- paste( + format(x[, i]), + format(sprintf("(%.*f%%)", digits, 100 * x[, i] / total_n), justify = "right") + ) + } + } + # copy back final result + x <- tmp + } + + x[] <- lapply(x, as.character) + + # format data frame + ftab <- insight::format_table(x, ...) + # replace empty cells with NA + ftab[] <- lapply(ftab, function(i) { + i[i == ""] <- ifelse(identical(format, "text"), "", "(NA)") # nolint + i + }) + # Remove ".00" from numbers + ftab$Total <- gsub("\\.00$", "", as.character(total_column)) + # for text format, insert "empty row" before last total row + if (identical(format, "text") || identical(format, "markdown")) { + empty_row <- as.data.frame(t(data.frame( + rep("", ncol(ftab)), + c("Total", as.character(total_row)), + stringsAsFactors = FALSE + ))) + } else { + empty_row <- as.data.frame(t(data.frame( + c("Total", as.character(total_row)), + stringsAsFactors = FALSE + ))) + } + colnames(empty_row) <- colnames(ftab) + ftab <- rbind(ftab, empty_row) + ftab[nrow(ftab), ] <- gsub("\\.00$", "", ftab[nrow(ftab), ]) + + # insert big marks? + ftab$Total <- .add_commas_in_numbers(ftab$Total, big_mark) + ftab[nrow(ftab), ] <- .add_commas_in_numbers(ftab[nrow(ftab), ], big_mark) + + # also format NA column name + colnames(ftab)[colnames(ftab) == "NA"] <- ifelse(identical(format, "text"), "", "(NA)") + + ftab +} + + +#' @export +print.dw_data_xtabulate <- function(x, big_mark = NULL, ...) { + # grouped data? if yes, add information on grouping factor + if (is.null(x[["Group"]])) { + caption <- NULL + } else { + caption <- paste0("Grouped by ", x[["Group"]][1]) + x$Group <- NULL + } + + # print table + cat(insight::export_table( + format(x, big_mark = big_mark, ...), + cross = "+", + missing = "", + caption = caption, + empty_line = "-" + )) + invisible(x) +} + + +#' @export +print_md.dw_data_xtabulate <- function(x, big_mark = NULL, ...) { + # grouped data? if yes, add information on grouping factor + if (is.null(x[["Group"]])) { + caption <- NULL + } else { + caption <- paste0("Grouped by ", x[["Group"]][1]) + x$Group <- NULL + } + + # print table + insight::export_table( + format(x, format = "markdown", big_mark = big_mark, ...), + cross = "+", + missing = "", + caption = caption, + empty_line = "-", + format = "markdown" + ) +} + + +#' @export +print_html.dw_data_xtabulate <- function(x, big_mark = NULL, ...) { + # grouped data? if yes, add information on grouping factor + if (!is.null(x[["Group"]])) { + x$groups <- paste0("Grouped by ", x[["Group"]][1]) + x$Group <- NULL + } + + # print table + insight::export_table( + format(x, big_mark = big_mark, format = "html", ...), + missing = "(NA)", + format = "html", + group_by = "groups" + ) +} + + +#' @export +print.dw_data_xtabulates <- function(x, big_mark = NULL, ...) { + for (i in seq_along(x)) { + print(x[[i]], big_mark = big_mark, ...) + cat("\n") + } + invisible(x) +} + + +#' @export +print_html.dw_data_xtabulates <- function(x, big_mark = NULL, ...) { + if (length(x) == 1) { + print_html(x[[1]], big_mark = big_mark, ...) + } else { + x <- lapply(x, function(i) { + # grouped data? if yes, add information on grouping factor + if (!is.null(i[["Group"]])) { + i$groups <- paste0("Grouped by ", i[["Group"]][1]) + i$Group <- NULL + } + format(i, format = "html", big_mark = big_mark, ...) + }) + + out <- do.call(rbind, x) + + # print table + insight::export_table( + out, + missing = "(NA)", + format = "html", + group_by = "groups" + ) + } +} + + +# helper --------------------- + +.validate_by <- function(by, x) { + if (!is.null(by)) { + if (is.character(by)) { + # If "by" is a character string, must be of length 1 + if (length(by) > 1) { + insight::format_error( + "If `by` is a string indicating a variable name, `by` must be of length 1.", + "You may use `data_group()` to group by multiple variables, then call `data_tabulate()`." + ) + } + # if "by" is a character, "x" must be a data frame + if (!is.data.frame(x)) { + insight::format_error("If `by` is a string indicating a variable name, `x` must be a data frame.") + } + # is "by" a column in "x"? + if (!by %in% colnames(x)) { + insight::format_error(sprintf( + "The variable specified in `by` was not found in `x`. %s", + .misspelled_string(names(x), by, "Possibly misspelled?") + )) + } + by <- x[[by]] + } + # is "by" of same length as "x"? + if (is.data.frame(x) && length(by) != nrow(x)) { + insight::format_error("Length of `by` must be equal to number of rows in `x`.") # nolint + } + if (!is.data.frame(x) && length(by) != length(x)) { + insight::format_error("Length of `by` must be equal to length of `x`.") # nolint + } + if (!is.factor(by)) { + # coerce "by" to factor, including labels + by <- to_factor(by, labels_to_levels = TRUE, verbose = FALSE) + } + } + + by +} + + +.validate_table_weights <- function(weights, x) { + if (!is.null(weights)) { + if (is.character(weights)) { + # If "weights" is a character string, must be of length 1 + if (length(weights) > 1) { + insight::format_error( + "If `weights` is a string indicating a variable name, `weights` must be of length 1." + ) + } + # if "weights" is a character, "x" must be a data frame + if (!is.data.frame(x)) { + insight::format_error("If `weights` is a string indicating a variable name, `x` must be a data frame.") # nolint + } + # is "by" a column in "x"? + if (!weights %in% colnames(x)) { + insight::format_error(sprintf( + "The variable specified in `weights` was not found in `x`. %s", + .misspelled_string(names(x), weights, "Possibly misspelled?") + )) + } + weights <- x[[weights]] + } + # is "by" of same length as "x"? + if (is.data.frame(x) && length(weights) != nrow(x)) { + insight::format_error("Length of `weights` must be equal to number of rows in `x`.") # nolint + } + if (!is.data.frame(x) && length(weights) != length(x)) { + insight::format_error("Length of `weights` must be equal to length of `x`.") # nolint + } + } + + weights +} diff --git a/R/describe_distribution.R b/R/describe_distribution.R index e01243388..37850299a 100644 --- a/R/describe_distribution.R +++ b/R/describe_distribution.R @@ -82,18 +82,18 @@ describe_distribution.list <- function(x, num_el <- which(vapply(x, is.numeric, FUN.VALUE = logical(1L))) # get elements names as is - # ex: list(mtcars$mpg, mtcars$cyl) -> c("mtcars$mpg", "mtcars$cyl") + # ex: `list(mtcars$mpg, mtcars$cyl) -> c("mtcars$mpg", "mtcars$cyl")` nm <- vapply(sys.call()[[2]], insight::safe_deparse, FUN.VALUE = character(1L))[-1] - if (!isTRUE(include_factors)) { - x <- x[num_el] + if (isTRUE(include_factors)) { + x <- x[c(num_el, factor_el)] if (length(nm) != 0) { - nm <- nm[num_el] + nm <- nm[c(num_el, factor_el)] } } else { - x <- x[c(num_el, factor_el)] + x <- x[num_el] if (length(nm) != 0) { - nm <- nm[c(num_el, factor_el)] + nm <- nm[num_el] } } @@ -123,12 +123,12 @@ describe_distribution.list <- function(x, })) - if (!is.null(names(x))) { - empty_names <- which(names(x) == "") + if (is.null(names(x))) { + new_names <- nm + } else { + empty_names <- which(!nzchar(names(x), keepNA = TRUE)) new_names <- names(x) new_names[empty_names] <- nm[empty_names] - } else { - new_names <- nm } out$Variable <- new_names @@ -230,7 +230,7 @@ describe_distribution.numeric <- function(x, out$n <- length(x) out$n_Missing <- n_missing - out$`.temp` <- NULL + out$.temp <- NULL class(out) <- unique(c("parameters_distribution", "see_parameters_distribution", class(out))) attr(out, "data") <- x diff --git a/R/labels_to_levels.R b/R/labels_to_levels.R index be1a8d8b7..c1ff97a16 100644 --- a/R/labels_to_levels.R +++ b/R/labels_to_levels.R @@ -58,7 +58,7 @@ labels_to_levels.data.frame <- function(x, regex = FALSE, verbose = TRUE, ...) { - # sanity check, return as is for complete factor + # validation check, return as is for complete factor if (all(vapply(x, is.factor, TRUE))) { return(x) } @@ -79,7 +79,7 @@ labels_to_levels.data.frame <- function(x, # create the new variables and updates "select", so new variables are processed if (!isFALSE(append)) { # process arguments - args <- .process_append( + arguments <- .process_append( x, select, append, @@ -89,8 +89,8 @@ labels_to_levels.data.frame <- function(x, keep_character = FALSE ) # update processed arguments - x <- args$x - select <- args$select + x <- arguments$x + select <- arguments$select } x[select] <- lapply( diff --git a/R/means_by_group.R b/R/means_by_group.R index 1d3f6fd52..faa73eba6 100644 --- a/R/means_by_group.R +++ b/R/means_by_group.R @@ -60,7 +60,7 @@ means_by_group.numeric <- function(x, weights = NULL, digits = NULL, ...) { - # sanity check for arguments + # validation check for arguments # "group" must be provided if (is.null(group)) { diff --git a/R/normalize.R b/R/normalize.R index d8efb6c80..ba2eee40d 100644 --- a/R/normalize.R +++ b/R/normalize.R @@ -202,15 +202,15 @@ normalize.grouped_df <- function(x, # create the new variables and updates "select", so new variables are processed if (!isFALSE(append)) { # process arguments - args <- .process_append( + my_args <- .process_append( x, select, append, append_suffix = "_n" ) # update processed arguments - x <- args$x - select <- args$select + x <- my_args$x + select <- my_args$select } x <- as.data.frame(x) @@ -274,15 +274,15 @@ normalize.data.frame <- function(x, # create the new variables and updates "select", so new variables are processed if (!isFALSE(append)) { # process arguments - args <- .process_append( + my_args <- .process_append( x, select, append, append_suffix = "_n" ) # update processed arguments - x <- args$x - select <- args$select + x <- my_args$x + select <- my_args$select } x[select] <- lapply( diff --git a/R/select_nse.R b/R/select_nse.R index 8d312f7b1..118d40b15 100644 --- a/R/select_nse.R +++ b/R/select_nse.R @@ -96,11 +96,11 @@ type <- typeof(x) out <- switch(type, - "integer" = x, - "double" = as.integer(x), - "character" = .select_char(data, x, ignore_case, regex = regex, verbose), - "symbol" = .select_symbol(data, x, ignore_case, regex = regex, verbose), - "language" = .eval_call(data, x, ignore_case, regex = regex, verbose), + integer = x, + double = as.integer(x), + character = .select_char(data, x, ignore_case, regex = regex, verbose), + symbol = .select_symbol(data, x, ignore_case, regex = regex, verbose), + language = .eval_call(data, x, ignore_case, regex = regex, verbose), insight::format_error(paste0( "Expressions of type <", typeof(x), "> cannot be evaluated for use when subsetting." @@ -254,13 +254,13 @@ `[` = .select_square_bracket(x, data, ignore_case, regex, verbose), `$` = .select_dollar(x, data, ignore_case, regex, verbose), `~` = .select_tilde(x, data, ignore_case, regex, verbose), - "list" = .select_list(x, data, ignore_case, regex, verbose), - "names" = .select_names(x, data, ignore_case, regex, verbose), - "starts_with" = , - "ends_with" = , - "matches" = , - "contains" = , - "regex" = .select_helper(x, data, ignore_case, regex, verbose), + list = .select_list(x, data, ignore_case, regex, verbose), + names = .select_names(x, data, ignore_case, regex, verbose), + starts_with = , + ends_with = , + matches = , + contains = , + regex = .select_helper(x, data, ignore_case, regex, verbose), .select_context(x, data, ignore_case, regex, verbose) ) } @@ -369,10 +369,10 @@ helper <- insight::safe_deparse(lst_expr[[1]]) rgx <- switch(helper, - "starts_with" = paste0("^(", collapsed_patterns, ")"), - "ends_with" = paste0("(", collapsed_patterns, ")$"), - "contains" = paste0("(", collapsed_patterns, ")"), - "regex" = collapsed_patterns, + starts_with = paste0("^(", collapsed_patterns, ")"), + ends_with = paste0("(", collapsed_patterns, ")$"), + contains = paste0("(", collapsed_patterns, ")"), + regex = collapsed_patterns, insight::format_error("There is no select helper called '", helper, "'.") ) grep(rgx, colnames(data), ignore.case = ignore_case) diff --git a/R/skewness_kurtosis.R b/R/skewness_kurtosis.R index 13ad4e422..e0da83c54 100644 --- a/R/skewness_kurtosis.R +++ b/R/skewness_kurtosis.R @@ -474,16 +474,16 @@ summary.parameters_kurtosis <- function(object, test = FALSE, ...) { } switch(type, - "1" = , - "I" = , - "classic" = "1", - "2" = , - "II" = , - "SPSS" = , - "SAS" = "2", - "3" = , - "III" = , - "Minitab" = "3" + `1` = , + I = , + classic = "1", + `2` = , + II = , + SPSS = , + SAS = "2", + `3` = , + III = , + Minitab = "3" ) } diff --git a/R/to_factor.R b/R/to_factor.R index 82cabaaad..c31580072 100644 --- a/R/to_factor.R +++ b/R/to_factor.R @@ -79,6 +79,12 @@ to_factor.character <- to_factor.numeric #' @export to_factor.Date <- to_factor.numeric +#' @export +to_factor.haven_labelled <- to_factor.numeric + +#' @export +to_factor.double <- to_factor.numeric + #' @rdname to_factor #' @export to_factor.data.frame <- function(x, @@ -89,7 +95,7 @@ to_factor.data.frame <- function(x, regex = FALSE, verbose = TRUE, ...) { - # sanity check, return as is for complete factor + # validation check, return as is for complete factor if (all(vapply(x, is.factor, FUN.VALUE = logical(1L)))) { return(x) } diff --git a/R/to_numeric.R b/R/to_numeric.R index 3d3f1c7a5..c43956399 100644 --- a/R/to_numeric.R +++ b/R/to_numeric.R @@ -17,6 +17,12 @@ #' @inheritParams find_columns #' @inheritParams categorize #' +#' @note By default, `to_numeric()` converts factors into "binary" dummies, i.e. +#' each factor level is converted into a separate column filled with a binary +#' 0-1 value. If only one column is required, use `dummy_factors = FALSE`. If +#' you want to preserve the original factor levels (in case these represent +#' numeric values), use `preserve_levels = TRUE`. +#' #' @section Selection of variables - `select` argument: #' For most functions that have a `select` argument the complete input data #' frame is returned, even when `select` only selects a range of variables. @@ -34,6 +40,8 @@ #' x <- as.factor(mtcars$gear) #' to_numeric(x, dummy_factors = FALSE) #' to_numeric(x, dummy_factors = FALSE, preserve_levels = TRUE) +#' # same as: +#' coerce_to_numeric(x) #' #' @return A data frame of numeric variables. #' @@ -69,12 +77,12 @@ to_numeric.data.frame <- function(x, regex = FALSE, verbose = TRUE, ...) { - # sanity check, return as is for complete numeric + # validation check, return as is for complete numeric if (all(vapply(x, is.numeric, FUN.VALUE = logical(1L)))) { return(x) } - attr <- attributes(x) + df_attr <- attributes(x) # evaluate arguments select <- .select_nse(select, @@ -91,7 +99,7 @@ to_numeric.data.frame <- function(x, # drop numerics, when append is not FALSE select <- colnames(x[select])[!vapply(x[select], is.numeric, FUN.VALUE = logical(1L))] # process arguments - args <- .process_append( + fun_args <- .process_append( x, select, append, @@ -99,8 +107,8 @@ to_numeric.data.frame <- function(x, keep_factors = TRUE ) # update processed arguments - x <- args$x - select <- args$select + x <- fun_args$x + select <- fun_args$select } out <- sapply( @@ -129,7 +137,7 @@ to_numeric.data.frame <- function(x, } # due to the special handling of dummy factors, we need to take care - # of appending the data here again. usually, "args$x" includes the appended + # of appending the data here again. usually, "fun_args$x" includes the appended # data, which does not work here... if (!isFALSE(append)) { @@ -141,14 +149,14 @@ to_numeric.data.frame <- function(x, } # add back custom attributes - out <- .replace_attrs(out, attr) + out <- .replace_attrs(out, df_attr) out } #' @export to_numeric.numeric <- function(x, verbose = TRUE, ...) { - .set_back_labels(as.numeric(x), x) + .set_back_labels(as.numeric(x), x, reverse_values = FALSE) } #' @export @@ -157,6 +165,9 @@ to_numeric.double <- to_numeric.numeric #' @export to_numeric.logical <- to_numeric.numeric +#' @export +to_numeric.haven_labelled <- to_numeric.numeric + #' @export to_numeric.Date <- function(x, verbose = TRUE, ...) { if (verbose) { @@ -208,24 +219,29 @@ to_numeric.factor <- function(x, # if the first observation was missing, add NA row and bind data frame if (i == 1 && na_values[i] == 1) { out <- rbind(NA, out) - } else { + } else if (na_values[i] == rows_x) { # if the last observation was NA, add NA row to data frame - if (na_values[i] == rows_x) { - out <- rbind(out, NA) - } else { - # else, pick rows from beginning to current NA value, add NA, - # and rbind the remaining rows - out <- rbind(out[1:(na_values[i] - 1), ], NA, out[na_values[i]:nrow(out), ]) - } + out <- rbind(out, NA) + } else { + # else, pick rows from beginning to current NA value, add NA, + # and rbind the remaining rows + out <- rbind(out[1:(na_values[i] - 1), ], NA, out[na_values[i]:nrow(out), ]) } } rownames(out) <- NULL } names(out) <- levels(x) } else if (preserve_levels) { - out <- .set_back_labels(as.numeric(as.character(x)), x) + if (is.unsorted(levels(x))) { + x_inverse <- rep(NA_real_, length(x)) + for (i in 1:nlevels(x)) { + x_inverse[x == levels(x)[i]] <- as.numeric(levels(x)[nlevels(x) - i + 1]) + } + x <- factor(x_inverse) + } + out <- .set_back_labels(as.numeric(as.character(x)), x, reverse_values = FALSE) } else { - out <- .set_back_labels(as.numeric(x), x) + out <- .set_back_labels(as.numeric(x), x, reverse_values = FALSE) } # shift to requested starting value diff --git a/R/utils.R b/R/utils.R index 0f84466a9..663af028e 100644 --- a/R/utils.R +++ b/R/utils.R @@ -210,7 +210,7 @@ nchar_open <- nchar(open) nchar_close <- nchar(close) - # Sanity checks + # validation checks stopifnot(exprs = { is.character(fmt) length(fmt) == 1L diff --git a/R/utils_labels.R b/R/utils_labels.R index a7f4fa2c3..64b517086 100644 --- a/R/utils_labels.R +++ b/R/utils_labels.R @@ -3,12 +3,29 @@ # to the transformed vector #' @keywords internal -.set_back_labels <- function(new, old, include_values = TRUE) { +.set_back_labels <- function(new, old, include_values = TRUE, reverse_values = FALSE) { # labelled data? attr(new, "label") <- attr(old, "label", exact = TRUE) - labels <- attr(old, "labels", exact = TRUE) - if (isTRUE(include_values) && !is.null(labels)) { - attr(new, "labels") <- stats::setNames(rev(labels), names(labels)) + value_labels <- attr(old, "labels", exact = TRUE) + # "include_values" is used to preserve value labels + if (isTRUE(include_values) && !is.null(value_labels)) { + if (reverse_values) { + # reverse values? Used for "reverse_scale()" + attr(new, "labels") <- stats::setNames(rev(value_labels), names(value_labels)) + } else { + # keep value oder? Used for "to_numeric()" + if (is.numeric(new)) { + if (any(grepl("[^0-9]", value_labels))) { + # if we have any non-numeric characters, convert to numeric + attr(new, "labels") <- stats::setNames(as.numeric(as.factor(value_labels)), names(value_labels)) + } else { + # if we have numeric, or "numeric character" (like "1", "2", "3" etc.) + attr(new, "labels") <- stats::setNames(as.numeric(value_labels), names(value_labels)) + } + } else { + attr(new, "labels") <- stats::setNames(value_labels, names(value_labels)) + } + } } else if (isFALSE(include_values)) { attr(new, "labels") <- NULL } @@ -30,7 +47,7 @@ # check positions of matching values and levels levels_in_labs <- stats::na.omit(match(value_labels, levels(x))) labs_in_levels <- stats::na.omit(match(levels(x), value_labels)) - # sanity check - if labelled values and levels don't match + # validation check - if labelled values and levels don't match if (!length(levels_in_labs) || !length(labs_in_levels)) { if (verbose) { insight::format_alert( diff --git a/R/utils_standardize_center.R b/R/utils_standardize_center.R index 0220fdd04..67c345d2e 100644 --- a/R/utils_standardize_center.R +++ b/R/utils_standardize_center.R @@ -30,7 +30,7 @@ vals <- x[valid_x] } - # Sanity checks + # validation checks check <- .check_standardize_numeric(x, name = NULL, verbose = verbose, reference = reference, center = center) if (is.factor(vals) || is.character(vals)) { diff --git a/R/weighted_mean_median_sd_mad.R b/R/weighted_mean_median_sd_mad.R index 86f685307..8965e8006 100644 --- a/R/weighted_mean_median_sd_mad.R +++ b/R/weighted_mean_median_sd_mad.R @@ -49,7 +49,7 @@ weighted_median <- function(x, weights = NULL, remove_na = TRUE, verbose = TRUE, weights <- complete$weights[order] rw <- cumsum(weights) / sum(weights) - # sanity check + # validation check if (all(is.na(rw))) { return(NA_real_) } diff --git a/cran-comments.md b/cran-comments.md index 6f845d45c..a4f3b7bd5 100644 --- a/cran-comments.md +++ b/cran-comments.md @@ -4,12 +4,11 @@ ## revdepcheck results -We checked 16 reverse dependencies, comparing R CMD check results across CRAN and dev versions of this package. +We checked 17 reverse dependencies, comparing R CMD check results across CRAN and dev versions of this package. * We saw 0 new problems * We failed to check 0 packages -## Other comments +## Comments -This release fixes the issue with `package_version()` reported by Kurt Hornik -on June 14th. +Second submission of 0.9.1 following email exchange on timeout of vignette rendering in one of the pre-tests. diff --git a/inst/WORDLIST b/inst/WORDLIST index 01e138dd1..5b12a2523 100644 --- a/inst/WORDLIST +++ b/inst/WORDLIST @@ -50,6 +50,8 @@ bmwiernik codebook codebooks coercible +crosstable +crosstables csv de doi @@ -62,6 +64,7 @@ ggplot's https interpretability interpretable +inversed joss labelled labelling diff --git a/man/contr.deviation.Rd b/man/contr.deviation.Rd index fd5de7a4b..d9d378429 100644 --- a/man/contr.deviation.Rd +++ b/man/contr.deviation.Rd @@ -49,55 +49,56 @@ the differences the \strong{A} and \strong{B} group means from the overall mean, respectively. } \examples{ -if (FALSE) { - data("mtcars") +\dontshow{if (!identical(Sys.getenv("IN_PKGDOWN"), "true")) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} +\donttest{ +data("mtcars") - mtcars <- data_modify(mtcars, cyl = factor(cyl)) +mtcars <- data_modify(mtcars, cyl = factor(cyl)) - c.treatment <- cbind(Intercept = 1, contrasts(mtcars$cyl)) - solve(c.treatment) - #> 4 6 8 - #> Intercept 1 0 0 # mean of the 1st level - #> 6 -1 1 0 # 2nd level - 1st level - #> 8 -1 0 1 # 3rd level - 1st level +c.treatment <- cbind(Intercept = 1, contrasts(mtcars$cyl)) +solve(c.treatment) +#> 4 6 8 +#> Intercept 1 0 0 # mean of the 1st level +#> 6 -1 1 0 # 2nd level - 1st level +#> 8 -1 0 1 # 3rd level - 1st level - contrasts(mtcars$cyl) <- contr.sum - c.sum <- cbind(Intercept = 1, contrasts(mtcars$cyl)) - solve(c.sum) - #> 4 6 8 - #> Intercept 0.333 0.333 0.333 # overall mean - #> 0.667 -0.333 -0.333 # deviation of 1st from overall mean - #> -0.333 0.667 -0.333 # deviation of 2nd from overall mean +contrasts(mtcars$cyl) <- contr.sum +c.sum <- cbind(Intercept = 1, contrasts(mtcars$cyl)) +solve(c.sum) +#> 4 6 8 +#> Intercept 0.333 0.333 0.333 # overall mean +#> 0.667 -0.333 -0.333 # deviation of 1st from overall mean +#> -0.333 0.667 -0.333 # deviation of 2nd from overall mean - contrasts(mtcars$cyl) <- contr.deviation - c.deviation <- cbind(Intercept = 1, contrasts(mtcars$cyl)) - solve(c.deviation) - #> 4 6 8 - #> Intercept 0.333 0.333 0.333 # overall mean - #> 6 -1.000 1.000 0.000 # 2nd level - 1st level - #> 8 -1.000 0.000 1.000 # 3rd level - 1st level +contrasts(mtcars$cyl) <- contr.deviation +c.deviation <- cbind(Intercept = 1, contrasts(mtcars$cyl)) +solve(c.deviation) +#> 4 6 8 +#> Intercept 0.333 0.333 0.333 # overall mean +#> 6 -1.000 1.000 0.000 # 2nd level - 1st level +#> 8 -1.000 0.000 1.000 # 3rd level - 1st level - ## With Interactions ----------------------------------------- - mtcars <- data_modify(mtcars, am = C(am, contr = contr.deviation)) - mtcars <- data_arrange(mtcars, select = c("cyl", "am")) +## With Interactions ----------------------------------------- +mtcars <- data_modify(mtcars, am = C(am, contr = contr.deviation)) +mtcars <- data_arrange(mtcars, select = c("cyl", "am")) - mm <- unique(model.matrix(~ cyl * am, data = mtcars)) - rownames(mm) <- c( - "cyl4.am0", "cyl4.am1", "cyl6.am0", - "cyl6.am1", "cyl8.am0", "cyl8.am1" - ) +mm <- unique(model.matrix(~ cyl * am, data = mtcars)) +rownames(mm) <- c( + "cyl4.am0", "cyl4.am1", "cyl6.am0", + "cyl6.am1", "cyl8.am0", "cyl8.am1" +) - solve(mm) - #> cyl4.am0 cyl4.am1 cyl6.am0 cyl6.am1 cyl8.am0 cyl8.am1 - #> (Intercept) 0.167 0.167 0.167 0.167 0.167 0.167 # overall mean - #> cyl6 -0.500 -0.500 0.500 0.500 0.000 0.000 # cyl MAIN eff: 2nd - 1st - #> cyl8 -0.500 -0.500 0.000 0.000 0.500 0.500 # cyl MAIN eff: 2nd - 1st - #> am1 -0.333 0.333 -0.333 0.333 -0.333 0.333 # am MAIN eff - #> cyl6:am1 1.000 -1.000 -1.000 1.000 0.000 0.000 - #> cyl8:am1 1.000 -1.000 0.000 0.000 -1.000 1.000 +solve(mm) +#> cyl4.am0 cyl4.am1 cyl6.am0 cyl6.am1 cyl8.am0 cyl8.am1 +#> (Intercept) 0.167 0.167 0.167 0.167 0.167 0.167 # overall mean +#> cyl6 -0.500 -0.500 0.500 0.500 0.000 0.000 # cyl MAIN eff: 2nd - 1st +#> cyl8 -0.500 -0.500 0.000 0.000 0.500 0.500 # cyl MAIN eff: 2nd - 1st +#> am1 -0.333 0.333 -0.333 0.333 -0.333 0.333 # am MAIN eff +#> cyl6:am1 1.000 -1.000 -1.000 1.000 0.000 0.000 +#> cyl8:am1 1.000 -1.000 0.000 0.000 -1.000 1.000 } - +\dontshow{\}) # examplesIf} } \seealso{ \code{\link[stats:contrast]{stats::contr.sum()}} diff --git a/man/data_modify.Rd b/man/data_modify.Rd index 7f13a8c08..8cac82205 100644 --- a/man/data_modify.Rd +++ b/man/data_modify.Rd @@ -2,9 +2,12 @@ % Please edit documentation in R/data_modify.R \name{data_modify} \alias{data_modify} +\alias{data_modify.data.frame} \title{Create new variables in a data frame} \usage{ data_modify(data, ...) + +\method{data_modify}{data.frame}(data, ..., .if = NULL, .at = NULL, .modify = NULL) } \arguments{ \item{data}{A data frame} @@ -29,8 +32,23 @@ character vector is provided, you may not add further elements to \code{...}. Example: \code{Petal.Width = NULL}. } -Note that newly created variables can be used in subsequent expressions. -See also 'Examples'.} +Note that newly created variables can be used in subsequent expressions, +including \code{.at} or \code{.if}. See also 'Examples'.} + +\item{.if}{A function that returns \code{TRUE} for columns in the data frame where +\code{.if} applies. This argument is used in combination with the \code{.modify} argument. +Note that only one of \code{.at} or \code{.if} can be provided, but not both at the same +time. Newly created variables in \code{...} can also be selected, see 'Examples'.} + +\item{.at}{A character vector of variable names that should be modified. This +argument is used in combination with the \code{.modify} argument. Note that only one +of \code{.at} or \code{.if} can be provided, but not both at the same time. Newly created +variables in \code{...} can also be selected, see 'Examples'.} + +\item{.modify}{A function that modifies the variables defined in \code{.at} or \code{.if}. +This argument is used in combination with either the \code{.at} or the \code{.if} argument. +Note that the modified variable (i.e. the result from \code{.modify}) must be either +of length 1 or of same length as the input variable.} } \description{ Create new variables or modify existing variables in a data frame. Unlike \code{base::transform()}, \code{data_modify()} @@ -103,4 +121,30 @@ foo(iris, "var_a = Sepal.Width / 10") new_exp <- c("SW_double = 2 * Sepal.Width", "SW_fraction = SW_double / 10") foo(iris, new_exp) + +# modify at specific positions or if condition is met +d <- iris[1:5, ] +data_modify(d, .at = "Species", .modify = as.numeric) +data_modify(d, .if = is.factor, .modify = as.numeric) + +# can be combined with dots +data_modify(d, new_length = Petal.Length * 2, .at = "Species", .modify = as.numeric) + +# new variables used in `.at` or `.if` +data_modify( + d, + new_length = Petal.Length * 2, + .at = c("Petal.Length", "new_length"), + .modify = round +) + +# combine "data_find()" and ".at" argument +out <- data_modify( + d, + .at = data_find(d, select = starts_with("Sepal")), + .modify = as.factor +) +# "Sepal.Length" and "Sepal.Width" are now factors +str(out) + } diff --git a/man/data_tabulate.Rd b/man/data_tabulate.Rd index a69feb83a..34961481a 100644 --- a/man/data_tabulate.Rd +++ b/man/data_tabulate.Rd @@ -4,11 +4,21 @@ \alias{data_tabulate} \alias{data_tabulate.default} \alias{data_tabulate.data.frame} -\title{Create frequency tables of variables} +\title{Create frequency and crosstables of variables} \usage{ data_tabulate(x, ...) -\method{data_tabulate}{default}(x, drop_levels = FALSE, name = NULL, verbose = TRUE, ...) +\method{data_tabulate}{default}( + x, + by = NULL, + drop_levels = FALSE, + weights = NULL, + include_na = TRUE, + proportions = NULL, + name = NULL, + verbose = TRUE, + ... +) \method{data_tabulate}{data.frame}( x, @@ -16,8 +26,12 @@ data_tabulate(x, ...) exclude = NULL, ignore_case = FALSE, regex = FALSE, - collapse = FALSE, + by = NULL, drop_levels = FALSE, + weights = NULL, + include_na = TRUE, + proportions = NULL, + collapse = FALSE, verbose = TRUE, ... ) @@ -27,10 +41,25 @@ data_tabulate(x, ...) \item{...}{not used.} -\item{drop_levels}{Logical, if \code{TRUE}, factor levels that do not occur in +\item{by}{Optional vector or factor. If supplied, a crosstable is created. +If \code{x} is a data frame, \code{by} can also be a character string indicating the +name of a variable in \code{x}.} + +\item{drop_levels}{Logical, if \code{FALSE}, factor levels that do not occur in the data are included in the table (with frequency of zero), else unused factor levels are dropped from the frequency table.} +\item{weights}{Optional numeric vector of weights. Must be of the same length +as \code{x}. If \code{weights} is supplied, weighted frequencies are calculated.} + +\item{include_na}{Logical, if \code{TRUE}, missing values are included in the +frequency or crosstable, else missing values are omitted.} + +\item{proportions}{Optional character string, indicating the type of +percentages to be calculated. Only applies to crosstables, i.e. when \code{by} is +not \code{NULL}. Can be \code{"row"} (row percentages), \code{"column"} (column percentages) +or \code{"full"} (to calculate relative frequencies for the full table).} + \item{name}{Optional character string, which includes the name that is used for printing.} @@ -93,17 +122,40 @@ A data frame, or a list of data frames, with one frequency table as data frame per variable. } \description{ -This function creates frequency tables of variables, including -the number of levels/values as well as the distribution of raw, valid and -cumulative percentages. +This function creates frequency or crosstables of variables, +including the number of levels/values as well as the distribution of raw, +valid and cumulative percentages. For crosstables, row, column and cell +percentages can be calculated. +} +\note{ +There are \code{print_html()} and \code{print_md()} methods available for printing +frequency or crosstables in HTML and markdown format, e.g. +\code{print_html(data_tabulate(x))}. +} +\section{Crosstables}{ + +If \code{by} is supplied, a crosstable is created. The crosstable includes \verb{} +(missing) values by default. The first column indicates values of \code{x}, the +first row indicates values of \code{by} (including missing values). The last row +and column contain the total frequencies for each row and column, respectively. +Setting \code{include_na = FALSE} will omit missing values from the crosstable. +Setting \code{proportions} to \code{"row"} or \code{"column"} will add row or column +percentages. Setting \code{proportions} to \code{"full"} will add relative frequencies +for the full table. } + \examples{ \dontshow{if (requireNamespace("poorman")) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} +# frequency tables ------- +# ------------------------ data(efc) # vector/factor data_tabulate(efc$c172code) +# drop missing values +data_tabulate(efc$c172code, include_na = FALSE) + # data frame data_tabulate(efc, c("e42dep", "c172code")) @@ -125,5 +177,35 @@ data_tabulate(x, name = "Large Number") # to remove the big mark, use "print(..., big_mark = "")" print(data_tabulate(x), big_mark = "") + +# weighted frequencies +set.seed(123) +efc$weights <- abs(rnorm(n = nrow(efc), mean = 1, sd = 0.5)) +data_tabulate(efc$e42dep, weights = efc$weights) + +# crosstables ------ +# ------------------ + +# add some missing values +set.seed(123) +efc$e16sex[sample.int(nrow(efc), 5)] <- NA + +data_tabulate(efc, "c172code", by = "e16sex") + +# add row and column percentages +data_tabulate(efc, "c172code", by = "e16sex", proportions = "row") +data_tabulate(efc, "c172code", by = "e16sex", proportions = "column") + +# omit missing values +data_tabulate( + efc$c172code, + by = efc$e16sex, + proportions = "column", + include_na = FALSE +) + +# round percentages +out <- data_tabulate(efc, "c172code", by = "e16sex", proportions = "column") +print(out, digits = 0) \dontshow{\}) # examplesIf} } diff --git a/man/datawizard-package.Rd b/man/datawizard-package.Rd index a9ea35f0a..ca3dc7e59 100644 --- a/man/datawizard-package.Rd +++ b/man/datawizard-package.Rd @@ -3,7 +3,6 @@ \docType{package} \name{datawizard-package} \alias{datawizard-package} -\alias{_PACKAGE} \alias{datawizard} \title{datawizard: Easy Data Wrangling and Statistical Transformations} \description{ diff --git a/man/rescale.Rd b/man/rescale.Rd index fc8b0f2bf..83cc3d64d 100644 --- a/man/rescale.Rd +++ b/man/rescale.Rd @@ -11,13 +11,23 @@ rescale(x, ...) change_scale(x, ...) -\method{rescale}{numeric}(x, to = c(0, 100), range = NULL, verbose = TRUE, ...) +\method{rescale}{numeric}( + x, + to = c(0, 100), + multiply = NULL, + add = NULL, + range = NULL, + verbose = TRUE, + ... +) \method{rescale}{data.frame}( x, select = NULL, exclude = NULL, to = c(0, 100), + multiply = NULL, + add = NULL, range = NULL, append = FALSE, ignore_case = FALSE, @@ -31,9 +41,22 @@ change_scale(x, ...) \item{...}{Arguments passed to or from other methods.} -\item{to}{Numeric vector of length 2 giving the new range that the variable will have after rescaling. -To reverse-score a variable, the range should be given with the maximum value first. -See examples.} +\item{to}{Numeric vector of length 2 giving the new range that the variable +will have after rescaling. To reverse-score a variable, the range should +be given with the maximum value first. See examples.} + +\item{multiply}{If not \code{NULL}, \code{to} is ignored and \code{multiply} will be used, +giving the factor by which the actual range of \code{x} should be expanded. +For example, if a vector ranges from 5 to 15 and \code{multiply = 1.1}, the current +range of 10 will be expanded by the factor of 1.1, giving a new range of +11. Thus, the rescaled vector would range from 4.5 to 15.5.} + +\item{add}{A vector of length 1 or 2. If not \code{NULL}, \code{to} is ignored and \code{add} +will be used, giving the amount by which the minimum and maximum of the +actual range of \code{x} should be expanded. For example, if a vector ranges from +5 to 15 and \code{add = 1}, the range will be expanded from 4 to 16. If \code{add} is +of length 2, then the first value is used for the lower bound and the second +value for the upper bound.} \item{range}{Initial (old) range of values. If \code{NULL}, will take the range of the input vector (\code{range(x)}).} @@ -103,8 +126,8 @@ functions (see 'Details'), this argument may be used as workaround.} A rescaled object. } \description{ -Rescale variables to a new range. -Can also be used to reverse-score variables (change the keying/scoring direction). +Rescale variables to a new range. Can also be used to reverse-score variables +(change the keying/scoring direction), or to expand a range. } \section{Selection of variables - the \code{select} argument}{ @@ -138,6 +161,21 @@ head(rescale(iris, to = list( "Sepal.Length" = c(0, 1), "Petal.Length" = c(-1, 0) ))) + +# "expand" ranges by a factor or a given value +x <- 5:15 +x +# both will expand the range by 10\% +rescale(x, multiply = 1.1) +rescale(x, add = 0.5) + +# expand range by different values +rescale(x, add = c(1, 3)) + +# Specify list of multipliers +d <- data.frame(x = 5:15, y = 5:15) +rescale(d, multiply = list(x = 1.1, y = 0.5)) + } \seealso{ See \code{\link[=makepredictcall.dw_transformer]{makepredictcall.dw_transformer()}} for use in model formulas. diff --git a/man/to_numeric.Rd b/man/to_numeric.Rd index 540f4f65a..7c78b1ba6 100644 --- a/man/to_numeric.Rd +++ b/man/to_numeric.Rd @@ -106,6 +106,13 @@ Convert data to numeric by converting characters to factors and factors to either numeric levels or dummy variables. The "counterpart" to convert variables into factors is \code{to_factor()}. } +\note{ +By default, \code{to_numeric()} converts factors into "binary" dummies, i.e. +each factor level is converted into a separate column filled with a binary +0-1 value. If only one column is required, use \code{dummy_factors = FALSE}. If +you want to preserve the original factor levels (in case these represent +numeric values), use \code{preserve_levels = TRUE}. +} \section{Selection of variables - \code{select} argument}{ For most functions that have a \code{select} argument the complete input data @@ -125,5 +132,7 @@ to_numeric(head(ToothGrowth), dummy_factors = FALSE) x <- as.factor(mtcars$gear) to_numeric(x, dummy_factors = FALSE) to_numeric(x, dummy_factors = FALSE, preserve_levels = TRUE) +# same as: +coerce_to_numeric(x) } diff --git a/tests/testthat/_snaps/data_rescale.md b/tests/testthat/_snaps/data_rescale.md index 4091116e3..e02d7eb14 100644 --- a/tests/testthat/_snaps/data_rescale.md +++ b/tests/testthat/_snaps/data_rescale.md @@ -2,7 +2,7 @@ Code head(rescale(iris, to = c(0, 1))) - Message + Message Variables of class `factor` can't be rescaled and remain unchanged. Output Sepal.Length Sepal.Width Petal.Length Petal.Width Species @@ -30,7 +30,7 @@ Code head(rescale(iris, to = list(Sepal.Length = c(0, 1), Petal.Length = c(-1, 0)))) - Message + Message Variables of class `factor` can't be rescaled and remain unchanged. Output Sepal.Length Sepal.Width Petal.Length Petal.Width Species diff --git a/tests/testthat/_snaps/data_separate.md b/tests/testthat/_snaps/data_separate.md index e46ee77c2..c3176c7b3 100644 --- a/tests/testthat/_snaps/data_separate.md +++ b/tests/testthat/_snaps/data_separate.md @@ -123,7 +123,7 @@ Code data_separate(d_sep, guess_columns = "mode", select = NULL) - Message + Message Column `x` had different number of values after splitting. Variable was split into 3 columns. `x` returned more columns than expected after splitting. Right-most diff --git a/tests/testthat/_snaps/data_tabulate.md b/tests/testthat/_snaps/data_tabulate.md index 77c128c40..e4b3e8628 100644 --- a/tests/testthat/_snaps/data_tabulate.md +++ b/tests/testthat/_snaps/data_tabulate.md @@ -1,3 +1,84 @@ +# data_tabulate, weights + + Code + print(data_tabulate(efc$e42dep, weights = efc$weights)) + Output + elder's dependency (efc$e42dep) + # total N=105 valid N=100 (weighted) + + Value | N | Raw % | Valid % | Cumulative % + ------+----+-------+---------+------------- + 1 | 3 | 2.86 | 3.00 | 3.00 + 2 | 4 | 3.81 | 4.00 | 7.00 + 3 | 26 | 24.76 | 26.00 | 33.00 + 4 | 67 | 63.81 | 67.00 | 100.00 + | 5 | 4.76 | | + +--- + + Code + print_md(data_tabulate(efc$e42dep, weights = efc$weights)) + Output + [1] "Table: elder's dependency (efc$e42dep) (categorical)" + [2] "" + [3] "|Value | N| Raw %| Valid %| Cumulative %|" + [4] "|:-----|--:|-----:|-------:|------------:|" + [5] "|1 | 3| 2.86| 3.00| 3.00|" + [6] "|2 | 4| 3.81| 4.00| 7.00|" + [7] "|3 | 26| 24.76| 26.00| 33.00|" + [8] "|4 | 67| 63.81| 67.00| 100.00|" + [9] "|(NA) | 5| 4.76| (NA)| (NA)|" + [10] "total N=105 valid N=100 (weighted)\n\n" + attr(,"format") + [1] "pipe" + attr(,"class") + [1] "knitr_kable" "character" + +--- + + Code + print(data_tabulate(efc, c("e42dep", "e16sex"), collapse = TRUE, weights = efc$ + weights)) + Output + # Frequency Table (weighted) + + Variable | Value | N | Raw % | Valid % | Cumulative % + ---------+-------+----+-------+---------+------------- + e42dep | 1 | 3 | 2.86 | 3.00 | 3.00 + | 2 | 4 | 3.81 | 4.00 | 7.00 + | 3 | 26 | 24.76 | 26.00 | 33.00 + | 4 | 67 | 63.81 | 67.00 | 100.00 + | | 5 | 4.76 | | + ---------+-------+----+-------+---------+------------- + e16sex | 1 | 50 | 47.62 | 47.62 | 47.62 + | 2 | 55 | 52.38 | 52.38 | 100.00 + | | 0 | 0.00 | | + ------------------------------------------------------ + +--- + + Code + print_md(data_tabulate(efc, c("e42dep", "e16sex"), weights = efc$weights)) + Output + [1] "Table: Frequency Table (weighted)" + [2] "" + [3] "|Variable | Value| N| Raw %| Valid %| Cumulative %|" + [4] "|:--------|-----:|--:|-----:|-------:|------------:|" + [5] "|e42dep | 1| 3| 2.86| 3.00| 3.00|" + [6] "| | 2| 4| 3.81| 4.00| 7.00|" + [7] "| | 3| 26| 24.76| 26.00| 33.00|" + [8] "| | 4| 67| 63.81| 67.00| 100.00|" + [9] "| | (NA)| 5| 4.76| (NA)| (NA)|" + [10] "| | | | | | |" + [11] "|e16sex | 1| 50| 47.62| 47.62| 47.62|" + [12] "| | 2| 55| 52.38| 52.38| 100.00|" + [13] "| | (NA)| 0| 0.00| (NA)| (NA)|" + [14] "| | | | | | |" + attr(,"format") + [1] "pipe" + attr(,"class") + [1] "knitr_kable" "character" + # data_tabulate print Code @@ -160,3 +241,310 @@ | | | 2 | 3.70 | | ------------------------------------------------------------------- +# data_tabulate, cross tables + + Code + print(data_tabulate(efc$c172code, by = efc$e16sex, proportions = "full")) + Output + efc$c172code | male | female | | Total + -------------+------------+------------+----------+------ + 1 | 5 (5.0%) | 2 (2.0%) | 1 (1.0%) | 8 + 2 | 31 (31.0%) | 33 (33.0%) | 2 (2.0%) | 66 + 3 | 4 (4.0%) | 11 (11.0%) | 1 (1.0%) | 16 + | 5 (5.0%) | 4 (4.0%) | 1 (1.0%) | 10 + -------------+------------+------------+----------+------ + Total | 45 | 50 | 5 | 100 + +--- + + Code + print(data_tabulate(efc$c172code, by = efc$e16sex, proportions = "full", + include_na = FALSE)) + Output + efc$c172code | male | female | Total + -------------+------------+------------+------ + 1 | 5 (5.8%) | 2 (2.3%) | 7 + 2 | 31 (36.0%) | 33 (38.4%) | 64 + 3 | 4 (4.7%) | 11 (12.8%) | 15 + -------------+------------+------------+------ + Total | 40 | 46 | 86 + +--- + + Code + print(data_tabulate(efc$c172code, by = efc$e16sex, proportions = "full", + weights = efc$weights)) + Output + efc$c172code | male | female | | Total + -------------+------------+------------+----------+------ + 1 | 5 (4.8%) | 3 (2.9%) | 2 (1.9%) | 10 + 2 | 32 (30.5%) | 32 (30.5%) | 3 (2.9%) | 67 + 3 | 3 (2.9%) | 11 (10.5%) | 1 (1.0%) | 15 + | 8 (7.6%) | 5 (4.8%) | 1 (1.0%) | 14 + -------------+------------+------------+----------+------ + Total | 48 | 51 | 7 | 105 + +--- + + Code + print(data_tabulate(efc$c172code, by = efc$e16sex, proportions = "full", + include_na = FALSE, weights = efc$weights)) + Output + efc$c172code | male | female | Total + -------------+------------+------------+------ + 1 | 5 (5.8%) | 3 (3.5%) | 8 + 2 | 32 (37.2%) | 32 (37.2%) | 64 + 3 | 3 (3.5%) | 11 (12.8%) | 14 + -------------+------------+------------+------ + Total | 40 | 46 | 86 + +--- + + Code + print(data_tabulate(efc, "c172code", by = efc$e16sex, proportions = "row")) + Output + c172code | male | female | | Total + ---------+------------+------------+-----------+------ + 1 | 5 (62.5%) | 2 (25.0%) | 1 (12.5%) | 8 + 2 | 31 (47.0%) | 33 (50.0%) | 2 (3.0%) | 66 + 3 | 4 (25.0%) | 11 (68.8%) | 1 (6.2%) | 16 + | 5 (50.0%) | 4 (40.0%) | 1 (10.0%) | 10 + ---------+------------+------------+-----------+------ + Total | 45 | 50 | 5 | 100 + + +--- + + Code + print(data_tabulate(efc, "c172code", by = efc$e16sex, proportions = "row", + include_na = FALSE)) + Output + c172code | male | female | Total + ---------+------------+------------+------ + 1 | 5 (71.4%) | 2 (28.6%) | 7 + 2 | 31 (48.4%) | 33 (51.6%) | 64 + 3 | 4 (26.7%) | 11 (73.3%) | 15 + ---------+------------+------------+------ + Total | 40 | 46 | 86 + + +--- + + Code + print(data_tabulate(efc, "c172code", by = efc$e16sex, proportions = "row", + weights = efc$weights)) + Output + c172code | male | female | | Total + ---------+------------+------------+-----------+------ + 1 | 5 (50.0%) | 3 (30.0%) | 2 (20.0%) | 10 + 2 | 32 (47.8%) | 32 (47.8%) | 3 (4.5%) | 67 + 3 | 3 (20.0%) | 11 (73.3%) | 1 (6.7%) | 15 + | 8 (57.1%) | 5 (35.7%) | 1 (7.1%) | 14 + ---------+------------+------------+-----------+------ + Total | 48 | 51 | 7 | 105 + + +--- + + Code + print(data_tabulate(efc, "c172code", by = efc$e16sex, proportions = "row", + include_na = FALSE, weights = efc$weights)) + Output + c172code | male | female | Total + ---------+------------+------------+------ + 1 | 5 (62.5%) | 3 (37.5%) | 8 + 2 | 32 (50.0%) | 32 (50.0%) | 64 + 3 | 3 (21.4%) | 11 (78.6%) | 14 + ---------+------------+------------+------ + Total | 40 | 46 | 86 + + +--- + + Code + print(data_tabulate(efc, "c172code", by = "e16sex", proportions = "column")) + Output + c172code | male | female | | Total + ---------+------------+------------+-----------+------ + 1 | 5 (11.1%) | 2 (4.0%) | 1 (20.0%) | 8 + 2 | 31 (68.9%) | 33 (66.0%) | 2 (40.0%) | 66 + 3 | 4 (8.9%) | 11 (22.0%) | 1 (20.0%) | 16 + | 5 (11.1%) | 4 (8.0%) | 1 (20.0%) | 10 + ---------+------------+------------+-----------+------ + Total | 45 | 50 | 5 | 100 + + +--- + + Code + print(data_tabulate(efc, "c172code", by = "e16sex", proportions = "column", + include_na = FALSE)) + Output + c172code | male | female | Total + ---------+------------+------------+------ + 1 | 5 (12.5%) | 2 (4.3%) | 7 + 2 | 31 (77.5%) | 33 (71.7%) | 64 + 3 | 4 (10.0%) | 11 (23.9%) | 15 + ---------+------------+------------+------ + Total | 40 | 46 | 86 + + +--- + + Code + print(data_tabulate(efc, "c172code", by = "e16sex", proportions = "column", + weights = "weights")) + Output + c172code | male | female | | Total + ---------+------------+------------+-----------+------ + 1 | 5 (10.4%) | 3 (5.9%) | 2 (28.6%) | 10 + 2 | 32 (66.7%) | 32 (62.7%) | 3 (42.9%) | 67 + 3 | 3 (6.2%) | 11 (21.6%) | 1 (14.3%) | 15 + | 8 (16.7%) | 5 (9.8%) | 1 (14.3%) | 14 + ---------+------------+------------+-----------+------ + Total | 48 | 51 | 7 | 105 + + +--- + + Code + print(data_tabulate(efc, "c172code", by = "e16sex", proportions = "column", + include_na = FALSE, weights = "weights")) + Output + c172code | male | female | Total + ---------+------------+------------+------ + 1 | 5 (12.5%) | 3 (6.5%) | 8 + 2 | 32 (80.0%) | 32 (69.6%) | 64 + 3 | 3 (7.5%) | 11 (23.9%) | 14 + ---------+------------+------------+------ + Total | 40 | 46 | 86 + + +# data_tabulate, cross tables, grouped df + + Code + print(data_tabulate(grp, "c172code", by = "e16sex", proportions = "row")) + Output + Grouped by e42dep (1) + + c172code | male | | Total + ---------+------------+------------+------ + 2 | 2 (100.0%) | 0 (0.0%) | 2 + | 0 (NaN%) | 0 (NaN%) | 0 + ---------+------------+------------+------ + Total | 2 | 0 | 2 + + Grouped by e42dep (2) + + c172code | male | female | | Total + ---------+-----------+-----------+-----------+------ + 2 | 2 (50.0%) | 2 (50.0%) | 0 (0.0%) | 4 + | 0 (NaN%) | 0 (NaN%) | 0 (NaN%) | 0 + ---------+-----------+-----------+-----------+------ + Total | 2 | 2 | 0 | 4 + + Grouped by e42dep (3) + + c172code | male | female | | Total + ---------+-----------+------------+-----------+------ + 1 | 2 (50.0%) | 2 (50.0%) | 0 (0.0%) | 4 + 2 | 4 (25.0%) | 11 (68.8%) | 1 (6.2%) | 16 + 3 | 1 (16.7%) | 5 (83.3%) | 0 (0.0%) | 6 + | 1 (50.0%) | 0 (0.0%) | 1 (50.0%) | 2 + ---------+-----------+------------+-----------+------ + Total | 8 | 18 | 2 | 28 + + Grouped by e42dep (4) + + c172code | male | female | | Total + ---------+------------+------------+-----------+------ + 1 | 3 (75.0%) | 0 (0.0%) | 1 (25.0%) | 4 + 2 | 23 (54.8%) | 18 (42.9%) | 1 (2.4%) | 42 + 3 | 3 (30.0%) | 6 (60.0%) | 1 (10.0%) | 10 + | 3 (42.9%) | 4 (57.1%) | 0 (0.0%) | 7 + ---------+------------+------------+-----------+------ + Total | 32 | 28 | 3 | 63 + + Grouped by e42dep (NA) + + c172code | male | female | | Total + ---------+------------+------------+------------+------ + 2 | 0 (0.0%) | 2 (100.0%) | 0 (0.0%) | 2 + | 1 (100.0%) | 0 (0.0%) | 0 (0.0%) | 1 + ---------+------------+------------+------------+------ + Total | 1 | 2 | 0 | 3 + + +# data_tabulate, cross tables, markdown + + Code + print_md(data_tabulate(efc$c172code, by = efc$e16sex, proportions = "full")) + Output + [1] "|efc$c172code | male| female| (NA) | Total|" + [2] "|:------------|----------:|----------:|:--------|-----:|" + [3] "|1 | 5 (5.0%)| 2 (2.0%)|1 (1.0%) | 8|" + [4] "|2 | 31 (31.0%)| 33 (33.0%)|2 (2.0%) | 66|" + [5] "|3 | 4 (4.0%)| 11 (11.0%)|1 (1.0%) | 16|" + [6] "| | 5 (5.0%)| 4 (4.0%)|1 (1.0%) | 10|" + [7] "| | | | | |" + [8] "|Total | 45| 50| 5 | 100|" + attr(,"format") + [1] "pipe" + attr(,"class") + [1] "knitr_kable" "character" + +--- + + Code + print_md(data_tabulate(efc$c172code, by = efc$e16sex, proportions = "full", + include_na = FALSE)) + Output + [1] "|efc$c172code | male| female| Total|" + [2] "|:------------|----------:|----------:|-----:|" + [3] "|1 | 5 (5.8%)| 2 (2.3%)| 7|" + [4] "|2 | 31 (36.0%)| 33 (38.4%)| 64|" + [5] "|3 | 4 (4.7%)| 11 (12.8%)| 15|" + [6] "| | | | |" + [7] "|Total | 40| 46| 86|" + attr(,"format") + [1] "pipe" + attr(,"class") + [1] "knitr_kable" "character" + +--- + + Code + print_md(data_tabulate(efc$c172code, by = efc$e16sex, proportions = "full", + weights = efc$weights)) + Output + [1] "|efc$c172code | male| female| (NA) | Total|" + [2] "|:------------|----------:|----------:|:--------|-----:|" + [3] "|1 | 5 (4.8%)| 3 (2.9%)|2 (1.9%) | 10|" + [4] "|2 | 32 (30.5%)| 32 (30.5%)|3 (2.9%) | 67|" + [5] "|3 | 3 (2.9%)| 11 (10.5%)|1 (1.0%) | 15|" + [6] "| | 8 (7.6%)| 5 (4.8%)|1 (1.0%) | 14|" + [7] "| | | | | |" + [8] "|Total | 48| 51| 7 | 105|" + attr(,"format") + [1] "pipe" + attr(,"class") + [1] "knitr_kable" "character" + +--- + + Code + print_md(data_tabulate(efc$c172code, by = efc$e16sex, proportions = "full", + include_na = FALSE, weights = efc$weights)) + Output + [1] "|efc$c172code | male| female| Total|" + [2] "|:------------|----------:|----------:|-----:|" + [3] "|1 | 5 (5.8%)| 3 (3.5%)| 8|" + [4] "|2 | 32 (37.2%)| 32 (37.2%)| 64|" + [5] "|3 | 3 (3.5%)| 11 (12.8%)| 14|" + [6] "| | | | |" + [7] "|Total | 40| 46| 86|" + attr(,"format") + [1] "pipe" + attr(,"class") + [1] "knitr_kable" "character" + diff --git a/tests/testthat/_snaps/describe_distribution.md b/tests/testthat/_snaps/describe_distribution.md index 5c582859d..0261ab9b1 100644 --- a/tests/testthat/_snaps/describe_distribution.md +++ b/tests/testthat/_snaps/describe_distribution.md @@ -16,3 +16,12 @@ ----------------------------------------------------------- | | [VC, OJ] | 0 | -2.07 | 60 | 0 +# describe_distribution formatting + + Code + format(x) + Output + Mean | SD | IQR | Range | Quartiles | Skewness | Kurtosis | n | n_Missing + -------------------------------------------------------------------------------------- + 3.06 | 0.44 | 0.52 | [2.00, 4.40] | 2.80, 3.30 | 0.32 | 0.23 | 150 | 0 + diff --git a/tests/testthat/test-assign_labels.R b/tests/testthat/test-assign_labels.R index fd74705ef..300ae643f 100644 --- a/tests/testthat/test-assign_labels.R +++ b/tests/testthat/test-assign_labels.R @@ -16,7 +16,7 @@ test_that("assign_labels, named values", { out <- assign_labels( x, variable = "Labelled factor", - values = c(`a` = "low", `b` = "mid", `c` = "high") + values = c(a = "low", b = "mid", c = "high") ) expect_identical(attributes(out)$label, "Labelled factor") expect_identical(attributes(out)$labels, c(low = "a", mid = "b", high = "c")) diff --git a/tests/testthat/test-attributes.R b/tests/testthat/test-attributes.R index 4eb261a0e..2fc88ecc9 100644 --- a/tests/testthat/test-attributes.R +++ b/tests/testthat/test-attributes.R @@ -123,6 +123,10 @@ test_that("convert_to_na, attributes preserved", { attr(x, "myattri") <- "I'm here" x2 <- convert_to_na(x, na = 2, verbose = FALSE) expect_identical(attr(x2, "myattri", exact = TRUE), "I'm here") + # label attribute is preserved + attr(x$Species, "label") <- "Species Variable" + x2 <- convert_to_na(x, na = "setosa", drop_levels = TRUE, verbose = FALSE) + expect_identical(attributes(x$Species)$label, "Species Variable") }) diff --git a/tests/testthat/test-data_codebook.R b/tests/testthat/test-data_codebook.R index daaf1f77b..26a67ccf6 100644 --- a/tests/testthat/test-data_codebook.R +++ b/tests/testthat/test-data_codebook.R @@ -159,9 +159,11 @@ test_that("data_codebook, tagged NA", { 1:4, haven::tagged_na("a", "c", "z") ), labels = c( - "Agreement" = 1, "Disagreement" = 4, - "First" = haven::tagged_na("c"), "Refused" = haven::tagged_na("a"), - "Not home" = haven::tagged_na("z") + Agreement = 1, + Disagreement = 4, + First = haven::tagged_na("c"), + Refused = haven::tagged_na("a"), + `Not home` = haven::tagged_na("z") ) ) expect_snapshot(data_codebook(data.frame(x))) @@ -174,9 +176,11 @@ test_that("data_codebook, tagged NA", { 1:4, haven::tagged_na("a", "c") ), labels = c( - "Agreement" = 1, "Disagreement" = 4, - "First" = haven::tagged_na("c"), "Refused" = haven::tagged_na("a"), - "Not home" = haven::tagged_na("z") + Agreement = 1, + Disagreement = 4, + First = haven::tagged_na("c"), + Refused = haven::tagged_na("a"), + `Not home` = haven::tagged_na("z") ) ) expect_snapshot(data_codebook(data.frame(x))) @@ -187,11 +191,11 @@ test_that("data_codebook, negative label values #334", { skip_if_not_installed("haven") x1 <- haven::labelled( x = 1:4, - labels = c("Agreement" = 1, "Disagreement" = 4, "Missing" = -9) + labels = c(Agreement = 1, Disagreement = 4, Missing = -9) ) x2 <- haven::labelled( x = c(1:3, -9), - labels = c("Agreement" = 1, "Disagreement" = 4, "Missing" = -9) + labels = c(Agreement = 1, Disagreement = 4, Missing = -9) ) expect_snapshot(data_codebook(data.frame(x1, x2))) }) diff --git a/tests/testthat/test-data_merge.R b/tests/testthat/test-data_merge.R index b98f4b8c8..ef48355bb 100644 --- a/tests/testthat/test-data_merge.R +++ b/tests/testthat/test-data_merge.R @@ -250,9 +250,9 @@ test_that("join data frames in a list", { # join empty data frames ----------------------- -x <- data.frame("x" = character(), stringsAsFactors = FALSE) -y <- data.frame("x" = character(), stringsAsFactors = FALSE) -z <- data.frame("y" = character(), stringsAsFactors = FALSE) +x <- data.frame(x = character(), stringsAsFactors = FALSE) +y <- data.frame(x = character(), stringsAsFactors = FALSE) +z <- data.frame(y = character(), stringsAsFactors = FALSE) test_that("join empty data frames", { expect_identical(dim(data_merge(x, y, join = "left")), c(0L, 1L)) diff --git a/tests/testthat/test-data_modify.R b/tests/testthat/test-data_modify.R index 661993ca5..9bb0a92d6 100644 --- a/tests/testthat/test-data_modify.R +++ b/tests/testthat/test-data_modify.R @@ -490,3 +490,63 @@ test_that("data_modify works with functions that return character vectors", { out <- data_modify(iris, grp = sample(letters[1:3], nrow(iris), TRUE)) expect_identical(head(out$grp), c("c", "c", "c", "b", "c", "b")) }) + + +test_that("data_modify .if/.at arguments", { + data(iris) + d <- iris[1:5, ] + # validate results + out <- data_modify(d, .at = "Species", .modify = as.numeric) + expect_identical(out$Species, c(1, 1, 1, 1, 1)) + out <- data_modify(d, .if = is.factor, .modify = as.numeric) + expect_identical(out$Species, c(1, 1, 1, 1, 1)) + out <- data_modify(d, new_length = Petal.Length * 2, .at = "Species", .modify = as.numeric) + expect_identical(out$Species, c(1, 1, 1, 1, 1)) + expect_named(out, c( + "Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width", + "Species", "new_length" + )) + # .at and .if cannot be used at same timne + expect_error( + data_modify(d, .at = "Species", .if = is.factor, .modify = as.numeric), + regex = "You cannot use both" + ) + # modify must be a function + expect_error( + data_modify(d, .at = "Species", .modify = "a"), + regex = "`.modify` must" + ) + # unknown variable + expect_error( + data_modify(d, .at = c("Species", "Test"), .modify = as.numeric), + regex = "Variable \"Test\"" + ) + # unknown variables + expect_error( + data_modify(d, .at = c("Species", "Hi", "Test"), .modify = as.numeric), + regex = "Variables \"Hi\" and \"Test\"" + ) + # one of .at or .if must be specified + expect_error( + data_modify(d, .modify = as.numeric), + regex = "You need to specify" + ) + # function not applicable to factors + expect_error( + data_modify(d, .at = "Species", .modify = function(x) 2 / y + x), + regex = "Error in modifying variable" + ) + # function not applicable to factors + expect_error( + data_modify(d, .at = "Species", .modify = function(x) 2 * x), + regex = "Error in modifying variable" + ) + # .modify needs to be specified + expect_error( + data_modify(d, .at = "Species", .if = is.factor), + regex = "You need to specify" + ) + # newly created variables are processed by if/at + out <- data_modify(d, new_length = Petal.Length * 2, .if = is.numeric, .modify = round) + expect_equal(out$new_length, c(3, 3, 3, 3, 3), ignore_attr = TRUE) +}) diff --git a/tests/testthat/test-data_partition.R b/tests/testthat/test-data_partition.R index f9119f046..7443465a8 100644 --- a/tests/testthat/test-data_partition.R +++ b/tests/testthat/test-data_partition.R @@ -10,7 +10,7 @@ test_that("data_partition works as expected", { expect_snapshot(data_partition(letters, seed = 123)) - # sanity checks + # validation checks expect_warning( data_partition(iris, 0.7, row_id = "Species"), diff --git a/tests/testthat/test-data_read.R b/tests/testthat/test-data_read.R index b73a16052..fd4884deb 100644 --- a/tests/testthat/test-data_read.R +++ b/tests/testthat/test-data_read.R @@ -1,6 +1,3 @@ -skip_on_cran() -skip_if_offline() - skip_if_not_installed("httr") skip_if_not_installed("readxl") skip_if_not_installed("haven") @@ -8,6 +5,11 @@ skip_if_not_installed("readr") skip_if_not_installed("data.table") skip_if_not_installed("rio") +skip_on_cran() + +skip_if_not_installed("curl") +skip_if_offline() + # csv ------------------------- test_that("data_read - csv", { diff --git a/tests/testthat/test-data_rescale.R b/tests/testthat/test-data_rescale.R index 2c24f0687..9caf3ee16 100644 --- a/tests/testthat/test-data_rescale.R +++ b/tests/testthat/test-data_rescale.R @@ -37,8 +37,8 @@ test_that("rescale works as expected", { expect_snapshot( head(rescale(iris, to = list( - "Sepal.Length" = c(0, 1), - "Petal.Length" = c(-1, 0) + Sepal.Length = c(0, 1), + Petal.Length = c(-1, 0) ))) ) }) @@ -109,3 +109,67 @@ test_that("data_rescale regex", { ignore_attr = TRUE ) }) + + +# expanding range ------------------------------ +test_that("data_rescale can expand range", { + # for vectors + x <- 5:15 + expect_equal( + rescale(x, multiply = 1.1), + c(4.5, 5.6, 6.7, 7.8, 8.9, 10, 11.1, 12.2, 13.3, 14.4, 15.5), + ignore_attr = TRUE + ) + expect_equal(rescale(x, multiply = 1.1), rescale(x, add = 0.5), ignore_attr = TRUE) + expect_error(rescale(x, multiply = 0.9, add = 1), regex = "Only one of") + expect_error(rescale(x, multiply = c(1.2, 1.4)), regex = "The length of") + + # different values for add + expect_equal( + rescale(x, add = c(1, 3)), + c(4, 5.4, 6.8, 8.2, 9.6, 11, 12.4, 13.8, 15.2, 16.6, 18), + ignore_attr = TRUE + ) + expect_error(rescale(x, add = 1:3), regex = "The length of") + + # works with NA + expect_equal( + rescale(rep(NA_real_, 3), multiply = 1.1), + rep(NA_real_, 3), + ignore_attr = TRUE + ) + expect_equal( + rescale(rep(NA_real_, 3), add = 2), + rep(NA_real_, 3), + ignore_attr = TRUE + ) + + # for data frames + d <- data.frame(x = 5:15, y = 5:15) + expect_equal( + rescale(d, multiply = 1.1), + rescale(d, add = 0.5), + ignore_attr = TRUE + ) + expect_equal( + rescale(d, multiply = list(x = 1.1, y = 0.5)), + rescale(d, add = list(x = 0.5, y = -2.5)), + ignore_attr = TRUE + ) + # data frames accept multiple add-values per column + out <- rescale(d, add = list(x = c(1, 3), y = c(2, 4))) + expect_equal( + out$x, + rescale(d$x, add = c(1, 3)), + ignore_attr = TRUE + ) + expect_equal( + out$y, + rescale(d$y, add = c(2, 4)), + ignore_attr = TRUE + ) + + expect_error(rescale(d, multiply = 0.9, add = 1), regex = "Only one of") + expect_error(rescale(d, multiply = list(x = 0.9, y = 2), add = list(y = 1)), regex = "Only one of") + expect_error(rescale(d, multiply = c(0.9, 1.5)), regex = "The length of") +}) diff --git a/tests/testthat/test-data_reverse.R b/tests/testthat/test-data_reverse.R index b1fe1f73e..794591904 100644 --- a/tests/testthat/test-data_reverse.R +++ b/tests/testthat/test-data_reverse.R @@ -335,16 +335,20 @@ test_that("reverse_scale warns if single value to reverse", { test_that("reverse_scale select helpers", { data(iris) out <- rescale(iris, to = list( - "Sepal.Length" = c(0, 1), - "Petal.Length" = c(-1, 0) + Sepal.Length = c(0, 1), + Petal.Length = c(-1, 0) ), select = ends_with("length")) expect_identical(out$Sepal.Length, iris$Sepal.Length, tolerance = 1e-3) - out <- rescale(iris, to = list( - "Sepal.Length" = c(0, 1), - "Petal.Length" = c(-1, 0) - ), select = ends_with("length"), ignore_case = TRUE) + out <- rescale(iris, + to = list( + Sepal.Length = c(0, 1), + Petal.Length = c(-1, 0) + ), + select = ends_with("length"), + ignore_case = TRUE + ) expect_identical(head(out$Sepal.Length), c(0.22222, 0.16667, 0.11111, 0.08333, 0.19444, 0.30556), tolerance = 1e-3) }) diff --git a/tests/testthat/test-data_tabulate.R b/tests/testthat/test-data_tabulate.R index bf8010b3f..1ca47a967 100644 --- a/tests/testthat/test-data_tabulate.R +++ b/tests/testthat/test-data_tabulate.R @@ -1,6 +1,5 @@ -data(efc) - test_that("data_tabulate factor", { + data(efc, package = "datawizard") x <- data_tabulate(efc$e42dep) expect_identical(as.vector(x$Value), as.vector(sort(unique( addNA(efc$e42dep) @@ -17,6 +16,7 @@ test_that("data_tabulate factor", { test_that("data_tabulate numeric", { + data(efc, package = "datawizard") x <- data_tabulate(efc$neg_c_7) expect_identical(as.vector(x$Value), as.vector(sort(unique( addNA(efc$neg_c_7) @@ -32,7 +32,45 @@ test_that("data_tabulate numeric", { }) +test_that("data_tabulate, HTML", { + skip_if_not_installed("gt") + data(efc, package = "datawizard") + expect_s3_class(print_html(data_tabulate(efc$c172code)), "gt_tbl") + expect_s3_class(print_html(data_tabulate(efc, "c172code")), "gt_tbl") +}) + + +test_that("data_tabulate, weights", { + data(efc, package = "datawizard") + set.seed(123) + efc$weights <- abs(rnorm(n = nrow(efc), mean = 1, sd = 0.5)) + # vector/factor + out1 <- data_tabulate(efc$e42dep, weights = efc$weights) + out2 <- data_tabulate(efc$e42dep) + expect_equal(out1$N, c(3, 4, 26, 67, 5), ignore_attr = TRUE) + expect_equal(out2$N, c(2L, 4L, 28L, 63L, 3L), ignore_attr = TRUE) + expect_equal( + out1$N, + round(xtabs(efc$weights ~ efc$e42dep, addNA = TRUE)), + ignore_attr = TRUE + ) + # data frames + out <- data_tabulate(efc, c("e42dep", "e16sex"), weights = efc$weights) + expect_equal(out[[1]]$N, out1$N, ignore_attr = TRUE) + # mismatch of lengths + w <- c(efc$weights, 1) + expect_error(data_tabulate(efc$e42dep, weights = w), regex = "Length of `weights`") + # correct table footer + expect_snapshot(print(data_tabulate(efc$e42dep, weights = efc$weights))) + expect_snapshot(print_md(data_tabulate(efc$e42dep, weights = efc$weights))) + # correct table caption + expect_snapshot(print(data_tabulate(efc, c("e42dep", "e16sex"), collapse = TRUE, weights = efc$weights))) + expect_snapshot(print_md(data_tabulate(efc, c("e42dep", "e16sex"), weights = efc$weights))) +}) + + test_that("data_tabulate data.frame", { + data(efc, package = "datawizard") x <- data_tabulate(efc, c("e16sex", "c172code")) expect_s3_class(x, "list") expect_length(x, 2L) @@ -87,9 +125,15 @@ test_that("data_tabulate data.frame", { }) +test_that("data_tabulate unsupported class", { + data(mtcars) + expect_warning(data_tabulate(lm(mpg ~ hp, data = mtcars)), regex = "Can't compute frequency tables") +}) + + test_that("data_tabulate print", { set.seed(123) - x <- sample(1:3, 1e6, TRUE) + x <- sample.int(3, 1e6, TRUE) out <- data_tabulate(x, name = "Large Number") expect_identical( attributes(out), @@ -109,18 +153,20 @@ test_that("data_tabulate print", { test_that("data_tabulate print", { + data(efc, package = "datawizard") expect_snapshot(data_tabulate(efc$e42dep)) }) test_that("data_tabulate print multiple", { + data(efc, package = "datawizard") expect_snapshot(data_tabulate(efc, c("c172code", "e16sex"))) }) test_that("data_tabulate big numbers", { set.seed(123) - x <- sample(1:5, size = 1e7, TRUE) + x <- sample.int(5, size = 1e7, TRUE) expect_snapshot(data_tabulate(x)) expect_snapshot(print(data_tabulate(x), big_mark = "-")) }) @@ -128,6 +174,7 @@ test_that("data_tabulate big numbers", { test_that("data_tabulate print multiple, collapse", { skip_if_not(packageVersion("insight") > "0.17.0", "insight must be >= 0.17.0") + data(efc, package = "datawizard") expect_snapshot(data_tabulate(efc, c("c172code", "e16sex"), collapse = TRUE)) }) @@ -135,6 +182,7 @@ test_that("data_tabulate print multiple, collapse", { test_that("data_tabulate grouped data.frame", { skip_if_not_installed("poorman") + data(efc, package = "datawizard") x <- data_tabulate(poorman::group_by(efc, e16sex), "c172code") expect_s3_class(x, "list") expect_length(x, 2L) @@ -184,11 +232,13 @@ test_that("data_tabulate grouped data.frame", { test_that("data_tabulate print grouped data", { skip_if_not_installed("poorman") + data(efc, package = "datawizard") expect_snapshot(data_tabulate(poorman::group_by(efc, e16sex), "c172code")) }) test_that("data_tabulate print, collapse groups", { skip_if_not_installed("poorman") + data(efc, package = "datawizard") expect_snapshot( data_tabulate(poorman::group_by(efc, e16sex), "c172code", collapse = TRUE) ) @@ -196,6 +246,7 @@ test_that("data_tabulate print, collapse groups", { test_that("data_tabulate print, collapse groups, drop levels", { skip_if_not_installed("poorman") + data(efc, package = "datawizard") expect_snapshot( data_tabulate( poorman::group_by(efc, e16sex), @@ -206,11 +257,143 @@ test_that("data_tabulate print, collapse groups, drop levels", { ) }) +test_that("data_tabulate drop levels", { + x <- factor(rep(letters[1:3], 3), levels = letters[1:5]) + out1 <- data_tabulate(x, drop_levels = FALSE) + out2 <- data_tabulate(x, drop_levels = TRUE) + expect_identical(out1$N, c(3L, 3L, 3L, 0L, 0L, 0L)) + expect_identical(as.character(out1$Value), c("a", "b", "c", "d", "e", NA)) + expect_identical(out2$N, c(3L, 3L, 3L, 0L)) + expect_identical(as.character(out2$Value), c("a", "b", "c", NA)) +}) + # select helpers ------------------------------ test_that("data_tabulate regex", { + data(mtcars) expect_identical( data_tabulate(mtcars, select = "arb", regex = TRUE), data_tabulate(mtcars, select = "carb") ) }) + + +# missing values ------------------------------ + +test_that("data_tabulate exclude/include missing values", { + data(efc, package = "datawizard") + set.seed(123) + efc$weights <- abs(rnorm(n = nrow(efc), mean = 1, sd = 0.5)) + efc$e16sex[sample.int(nrow(efc), 5)] <- NA + out <- data_tabulate(efc$c172code) + expect_identical(out$N, c(8L, 66L, 16L, 10L)) + out <- data_tabulate(efc$c172code, include_na = FALSE) + expect_identical(out$N, c(8L, 66L, 16L)) + out <- data_tabulate(efc$c172code, weights = efc$weights) + expect_identical(out$N, c(10, 67, 15, 13)) + out <- data_tabulate(efc$c172code, include_na = FALSE, weights = efc$weights) + expect_identical(out$N, c(10, 67, 15)) +}) + + +# cross tables ------------------------------ +test_that("data_tabulate, cross tables", { + data(efc, package = "datawizard") + set.seed(123) + efc$weights <- abs(rnorm(n = nrow(efc), mean = 1, sd = 0.5)) + efc$e16sex[sample.int(nrow(efc), 5)] <- NA + + expect_snapshot(print(data_tabulate(efc$c172code, by = efc$e16sex, proportions = "full"))) + expect_snapshot(print(data_tabulate(efc$c172code, by = efc$e16sex, proportions = "full", include_na = FALSE))) + expect_snapshot(print(data_tabulate(efc$c172code, by = efc$e16sex, proportions = "full", weights = efc$weights))) + expect_snapshot(print(data_tabulate(efc$c172code, by = efc$e16sex, proportions = "full", include_na = FALSE, weights = efc$weights))) # nolint + expect_snapshot(print(data_tabulate(efc, "c172code", by = efc$e16sex, proportions = "row"))) + expect_snapshot(print(data_tabulate(efc, "c172code", by = efc$e16sex, proportions = "row", include_na = FALSE))) + expect_snapshot(print(data_tabulate(efc, "c172code", by = efc$e16sex, proportions = "row", weights = efc$weights))) + expect_snapshot(print(data_tabulate(efc, "c172code", by = efc$e16sex, proportions = "row", include_na = FALSE, weights = efc$weights))) # nolint + expect_snapshot(print(data_tabulate(efc, "c172code", by = "e16sex", proportions = "column"))) + expect_snapshot(print(data_tabulate(efc, "c172code", by = "e16sex", proportions = "column", include_na = FALSE))) + expect_snapshot(print(data_tabulate(efc, "c172code", by = "e16sex", proportions = "column", weights = "weights"))) + expect_snapshot(print(data_tabulate(efc, "c172code", by = "e16sex", proportions = "column", include_na = FALSE, weights = "weights"))) # nolint +}) + +test_that("data_tabulate, cross tables, HTML", { + skip_if_not_installed("gt") + data(efc, package = "datawizard") + set.seed(123) + efc$weights <- abs(rnorm(n = nrow(efc), mean = 1, sd = 0.5)) + efc$e16sex[sample.int(nrow(efc), 5)] <- NA + + expect_s3_class(print_html(data_tabulate(efc$c172code, by = efc$e16sex, proportions = "full")), "gt_tbl") + expect_s3_class(print_html(data_tabulate(efc$c172code, by = efc$e16sex, proportions = "full", include_na = FALSE)), "gt_tbl") # nolint + expect_s3_class(print_html(data_tabulate(efc$c172code, by = efc$e16sex, proportions = "full", weights = efc$weights)), "gt_tbl") # nolint + expect_s3_class(print_html(data_tabulate(efc$c172code, by = efc$e16sex, proportions = "full", include_na = FALSE, weights = efc$weights)), "gt_tbl") # nolint + expect_s3_class(print_html(data_tabulate(efc, "c172code", by = efc$e16sex, proportions = "row")), "gt_tbl") + expect_s3_class(print_html(data_tabulate(efc, "c172code", by = efc$e16sex, proportions = "row", include_na = FALSE, weights = efc$weights)), "gt_tbl") # nolint +}) + +test_that("data_tabulate, cross tables, grouped df", { + data(efc, package = "datawizard") + set.seed(123) + efc$weights <- abs(rnorm(n = nrow(efc), mean = 1, sd = 0.5)) + efc$e16sex[sample.int(nrow(efc), 5)] <- NA + grp <- data_group(efc, "e42dep") + expect_snapshot(print(data_tabulate(grp, "c172code", by = "e16sex", proportions = "row"))) +}) + +test_that("data_tabulate, cross tables, errors by", { + data(efc, package = "datawizard") + set.seed(123) + efc$weights <- abs(rnorm(n = nrow(efc), mean = 1, sd = 0.5)) + efc$e16sex[sample.int(nrow(efc), 5)] <- NA + expect_error(data_tabulate(efc$c172code, by = "e16sex"), regex = "If `by` is a string") + expect_error(data_tabulate(efc$c172code, by = efc$e16sex[-1]), regex = "Length of `by`") + expect_error(data_tabulate(efc, "c172code", by = efc$e16sex[-1]), regex = "Length of `by`") + expect_error(data_tabulate(efc, "c172code", by = "c16sex"), regex = "not found") + expect_error(data_tabulate(efc, "c172code", by = c("e16sex", "e42dep")), regex = "You may use") +}) + +test_that("data_tabulate, cross tables, errors weights", { + data(efc, package = "datawizard") + set.seed(123) + efc$weights <- abs(rnorm(n = nrow(efc), mean = 1, sd = 0.5)) + efc$e16sex[sample.int(nrow(efc), 5)] <- NA + expect_error(data_tabulate(efc$c172code, weights = "weights"), regex = "If `weights`") + expect_error(data_tabulate(efc$c172code, weights = efc$weights[-1]), regex = "Length of `weights`") + expect_error(data_tabulate(efc, "c172code", weights = efc$weights[-1]), regex = "Length of `weights`") + expect_error(data_tabulate(efc, "c172code", weights = "weigths"), regex = "not found") + expect_error(data_tabulate(efc, "c172code", weights = c("e16sex", "e42dep")), regex = "length 1") +}) + + +# markdown ------------------------- + +test_that("data_tabulate, cross tables, markdown", { + data(efc, package = "datawizard") + set.seed(123) + efc$weights <- abs(rnorm(n = nrow(efc), mean = 1, sd = 0.5)) + efc$e16sex[sample.int(nrow(efc), 5)] <- NA + + expect_snapshot(print_md(data_tabulate(efc$c172code, by = efc$e16sex, proportions = "full"))) + expect_snapshot(print_md(data_tabulate(efc$c172code, by = efc$e16sex, proportions = "full", include_na = FALSE))) + expect_snapshot(print_md(data_tabulate(efc$c172code, by = efc$e16sex, proportions = "full", weights = efc$weights))) + expect_snapshot(print_md(data_tabulate(efc$c172code, by = efc$e16sex, proportions = "full", include_na = FALSE, weights = efc$weights))) # nolint +}) + +# validate against table ------------------------- + +test_that("data_tabulate, validate against table", { + data(mtcars) + # frequency table + out1 <- as.data.frame(table(mtcars$cyl)) + out2 <- data_tabulate(mtcars$cyl, include_na = FALSE) + expect_identical(out1$Freq, out2$N) + # crosstable + out1 <- data_arrange(as.data.frame(table(mtcars$cyl, mtcars$gear)), c("Var1", "Var2")) + out2 <- data_rename(data_to_long( + as.data.frame(data_tabulate(mtcars$cyl, by = mtcars$gear, include_na = FALSE)), 2:4, + names_to = "Var2", values_to = "Freq" + ), "mtcars$cyl", "Var1") + out1[[2]] <- as.character(out1[[2]]) + expect_equal(out1, out2, ignore_attr = TRUE) +}) diff --git a/tests/testthat/test-data_to_factor.R b/tests/testthat/test-data_to_factor.R index 45859d724..e45423bd5 100644 --- a/tests/testthat/test-data_to_factor.R +++ b/tests/testthat/test-data_to_factor.R @@ -22,7 +22,7 @@ test_that("to_factor", { # numeric, partially labelled test_that("to_factor", { x <- c(10, 11, 12) - attr(x, "labels") <- c("ten" = 10, "twelve" = 12) + attr(x, "labels") <- c(ten = 10, twelve = 12) expect_message( expect_identical( to_factor(x), @@ -82,12 +82,14 @@ test_that("to_factor regex", { # SPSS file, many value labels ----------------------------------- -skip_on_cran() -skip_if_offline() - skip_if_not_installed("httr") skip_if_not_installed("haven") +skip_on_cran() + +skip_if_not_installed("curl") +skip_if_offline() + # Output validated against SPSS output from original dataset test_that("data_read, convert many labels correctly", { @@ -139,3 +141,24 @@ test_that("data_read, convert many labels correctly", { expect_snapshot(data_tabulate(to_factor(d$c12c))) unlink(temp_file) }) + + +test_that("to_factor works with haven_labelled, convert many labels correctly", { + skip_if_not_installed("withr") + withr::with_tempfile("temp_file", fileext = ".sav", code = { + request <- httr::GET("https://raw.github.com/easystats/circus/main/data/EFC.sav") + httr::stop_for_status(request) + writeBin(httr::content(request, type = "raw"), temp_file) + + d <- haven::read_spss(temp_file) + x <- to_factor(d$c172code) + expect_identical( + levels(x), + c( + "low level of education", + "intermediate level of education", + "high level of education" + ) + ) + }) +}) diff --git a/tests/testthat/test-data_to_numeric.R b/tests/testthat/test-data_to_numeric.R index 68eb8f0dc..464c35e8d 100644 --- a/tests/testthat/test-data_to_numeric.R +++ b/tests/testthat/test-data_to_numeric.R @@ -44,7 +44,6 @@ test_that("convert factor to numeric", { expect_snapshot(to_numeric(f)) }) - test_that("convert factor to numeric", { expect_identical(to_numeric(c("abc", "xyz")), c(1, 2)) expect_identical(to_numeric(c("123", "789")), c(123, 789)) @@ -52,7 +51,6 @@ test_that("convert factor to numeric", { expect_identical(to_numeric(c("1L", "2e-3", "ABC")), c(1, 2, 3)) }) - test_that("convert factor to numeric, dummy factors", { expect_identical( to_numeric(c("abc", "xyz"), dummy_factors = TRUE), @@ -66,7 +64,6 @@ test_that("convert factor to numeric, dummy factors", { ) }) - test_that("convert factor to numeric, append", { data(efc) expect_identical( @@ -94,13 +91,11 @@ test_that("convert factor to numeric, append", { ) }) - test_that("convert factor to numeric, all numeric", { data(mtcars) expect_identical(to_numeric(mtcars), mtcars) }) - test_that("convert factor to numeric, dummy factors, with NA", { x1 <- factor(rep(c("a", "b"), 3)) x2 <- factor(c("a", NA_character_, "a", "b", "a", "b")) @@ -153,6 +148,20 @@ test_that("convert factor to numeric, dummy factors, with NA", { expect_identical(nrow(to_numeric(x7, dummy_factors = TRUE)), length(x7)) }) +test_that("to_numeric, inverse factor levels", { + f <- c(0, 0, 1, 1, 1, 0) + x1 <- factor(f, levels = c(0, 1)) + x2 <- factor(f, levels = c(1, 0)) + out <- to_numeric(x1, dummy_factors = FALSE, preserve_levels = FALSE) + expect_identical(out, c(1, 1, 2, 2, 2, 1)) + out <- to_numeric(x2, dummy_factors = FALSE, preserve_levels = FALSE) + expect_identical(out, c(2, 2, 1, 1, 1, 2)) + out <- to_numeric(x1, dummy_factors = FALSE, preserve_levels = TRUE) + expect_identical(out, c(0, 0, 1, 1, 1, 0)) + out <- to_numeric(x2, dummy_factors = FALSE, preserve_levels = TRUE) + expect_identical(out, c(1, 1, 0, 0, 0, 1)) +}) + # select helpers ------------------------------ test_that("to_numeric regex", { expect_identical( @@ -160,3 +169,54 @@ test_that("to_numeric regex", { to_numeric(mtcars, select = "mpg") ) }) + + +test_that("to_numeric works with haven_labelled, convert many labels correctly", { + skip_on_cran() + skip_if_not_installed("httr") + skip_if_not_installed("haven") + skip_if_not_installed("withr") + skip_if_not_installed("curl") + skip_if_offline() + + withr::with_tempfile("temp_file", fileext = ".sav", code = { + request <- httr::GET("https://raw.github.com/easystats/circus/main/data/EFC.sav") + httr::stop_for_status(request) + writeBin(httr::content(request, type = "raw"), temp_file) + + d <- haven::read_spss(temp_file) + x <- to_numeric(d$c172code) + expect_identical(as.vector(table(x)), c(180L, 506L, 156L)) + }) +}) + + +test_that("to_numeric preserves correct label order", { + x <- factor(c(1, 2, 3, 4)) + x <- assign_labels(x, values = c("one", "two", "three", "four")) + out <- to_numeric(x, dummy_factors = FALSE) + expect_identical( + attributes(out)$labels, + c(one = 1, two = 2, three = 3, four = 4) + ) + # correctly reverse scale + out <- to_numeric(reverse_scale(x), dummy_factors = FALSE) + expect_identical( + attributes(out)$labels, + c(one = 4, two = 3, three = 2, four = 1) + ) + # factor with alphabetical values + x <- factor(letters[1:4]) + x <- assign_labels(x, values = c("one", "two", "three", "four")) + out <- to_numeric(x, dummy_factors = FALSE) + expect_identical( + attributes(out)$labels, + c(one = 1, two = 2, three = 3, four = 4) + ) + # correctly reverse scale + out <- to_numeric(reverse_scale(x), dummy_factors = FALSE) + expect_identical( + attributes(out)$labels, + c(one = 4, two = 3, three = 2, four = 1) + ) +}) diff --git a/tests/testthat/test-data_write.R b/tests/testthat/test-data_write.R index a51931bd7..fecdc4767 100644 --- a/tests/testthat/test-data_write.R +++ b/tests/testthat/test-data_write.R @@ -1,10 +1,12 @@ -skip_on_cran() -skip_if_offline() - skip_if_not_installed("httr") skip_if_not_installed("haven") skip_if_not_installed("readr") +skip_on_cran() + +skip_if_not_installed("curl") +skip_if_offline() + # prepare data set --------------- data(efc) @@ -15,114 +17,113 @@ d$e42dep <- droplevels(d$e42dep) # SPSS ------------------------------------- -tmp <- tempfile(fileext = ".sav") -on.exit(unlink(tmp)) - test_that("data_write, SPSS", { - expect_message(data_write(d, tmp)) - d2 <- data_read(tmp, verbose = FALSE) - - expect_equal( - to_factor(d, select = c("e16sex", "c172code")), - d2, - ignore_attr = TRUE - ) + skip_if_not_installed("withr") + withr::with_tempfile("tmp", fileext = ".sav", code = { + expect_message(data_write(d, tmp)) + d2 <- data_read(tmp, verbose = FALSE) + expect_equal( + to_factor(d, select = c("e16sex", "c172code")), + d2, + ignore_attr = TRUE + ) + }) }) -tmp <- tempfile(fileext = ".sav") -on.exit(unlink(tmp)) - test_that("data_write, SPSS, mixed types of labelled vectors", { - d <- data.frame( - a = 1:3, - b = letters[1:3], - c = factor(letters[1:3]), - d = as.Date(c("2022-01-01", "2022-02-01", "2022-03-01")), - e = c(TRUE, FALSE, FALSE), - stringsAsFactors = FALSE - ) - - # Date and Logical cannot be labelled - d$a <- assign_labels(d$a, variable = "First", values = c("one", "two", "three")) - d$b <- assign_labels(d$b, variable = "Second", values = c("A", "B", "C")) - d$c <- assign_labels(d$c, variable = "Third", values = c("ey", "bee", "see")) - - # expect message, but no error - expect_message(data_write(d, "test.sav"), regex = "Preparing") + skip_if_not_installed("withr") + withr::with_tempfile("tmp", fileext = ".sav", code = { + d <- data.frame( + a = 1:3, + b = letters[1:3], + c = factor(letters[1:3]), + d = as.Date(c("2022-01-01", "2022-02-01", "2022-03-01")), + e = c(TRUE, FALSE, FALSE), + stringsAsFactors = FALSE + ) + + # Date and Logical cannot be labelled + d$a <- assign_labels(d$a, variable = "First", values = c("one", "two", "three")) + d$b <- assign_labels(d$b, variable = "Second", values = c("A", "B", "C")) + d$c <- assign_labels(d$c, variable = "Third", values = c("ey", "bee", "see")) + + expect_message(data_write(d, tmp), regex = "Preparing") + }) }) # Stata ------------------------------------- -tmp <- tempfile(fileext = ".dta") -on.exit(unlink(tmp)) - test_that("data_write, Stata", { - data_write(d, tmp, verbose = FALSE) - d2 <- data_read(tmp, verbose = FALSE) - - expect_equal( - to_factor(d, select = c("e16sex", "c172code")), - d2, - ignore_attr = TRUE - ) + skip_if_not_installed("withr") + withr::with_tempfile("tmp", fileext = ".dta", code = { + data_write(d, tmp, verbose = FALSE) + d2 <- data_read(tmp, verbose = FALSE) + + expect_equal( + to_factor(d, select = c("e16sex", "c172code")), + d2, + ignore_attr = TRUE + ) + }) }) # csv ------------------------- -tmp <- tempfile(fileext = ".csv") -on.exit(unlink(tmp)) - test_that("data_write, CSV, keep numeric", { - data_write(d, tmp) - d2 <- data_read(tmp) - - expect_equal( - to_numeric(d, dummy_factors = FALSE, preserve_levels = TRUE), - d2, - ignore_attr = TRUE - ) + skip_if_not_installed("withr") + withr::with_tempfile("tmp", fileext = ".csv", code = { + data_write(d, tmp) + d2 <- data_read(tmp) + + expect_equal( + to_numeric(d, dummy_factors = FALSE, preserve_levels = TRUE), + d2, + ignore_attr = TRUE + ) + }) }) test_that("data_write, CSV, convert to factor", { - data_write(d, tmp, convert_factors = TRUE) - d2 <- data_read(tmp) - out <- to_factor(d, select = c("e16sex", "c172code")) - out$e16sex <- as.character(out$e16sex) - out$c172code <- as.character(out$c172code) - out$e42dep <- as.numeric(as.character(out$e42dep)) - - expect_equal(out, d2, ignore_attr = TRUE) + skip_if_not_installed("withr") + withr::with_tempfile("tmp", fileext = ".csv", code = { + data_write(d, tmp, convert_factors = TRUE) + d2 <- data_read(tmp) + out <- to_factor(d, select = c("e16sex", "c172code")) + out$e16sex <- as.character(out$e16sex) + out$c172code <- as.character(out$c172code) + out$e42dep <- as.numeric(as.character(out$e42dep)) + expect_equal(out, d2, ignore_attr = TRUE) + }) }) -# main file -tmp <- tempfile(fileext = ".csv") -on.exit(unlink(tmp)) - -# file for labels -fpath <- dirname(tmp) -fname <- sub("\\.csv$", "", basename(tmp)) -tmp2 <- paste0(fpath, .Platform$file.sep, fname, "_labels.csv") -on.exit(unlink(tmp2)) - test_that("data_write, CSV, create labels file", { - data(efc) - expect_message(data_write(efc, tmp, save_labels = TRUE)) - d <- data_read(tmp2) - - expect_identical(d$variable[2], "e16sex") - expect_identical(d$label[2], "elder's gender") - expect_identical(d$labels[2], "1=male; 2=female") - - expect_message(data_write(efc, tmp, save_labels = TRUE, delimiter = ";")) - d <- data_read(tmp2) - expect_identical(d$variable[2], "e16sex") - expect_identical(d$label[2], "elder's gender") - expect_identical(d$labels[2], "1=male; 2=female") + skip_if_not_installed("withr") + withr::with_tempfile("tmp", fileext = ".csv", code = { + # file for labels + fpath <- dirname(tmp) + fname <- sub("\\.csv$", "", basename(tmp)) + tmp2 <- paste0(fpath, .Platform$file.sep, fname, "_labels.csv") + on.exit(unlink(tmp2)) + + data(efc) + expect_message(data_write(efc, tmp, save_labels = TRUE)) + d <- data_read(tmp2) + + expect_identical(d$variable[2], "e16sex") + expect_identical(d$label[2], "elder's gender") + expect_identical(d$labels[2], "1=male; 2=female") + + expect_message(data_write(efc, tmp, save_labels = TRUE, delimiter = ";")) + d <- data_read(tmp2) + expect_identical(d$variable[2], "e16sex") + expect_identical(d$label[2], "elder's gender") + expect_identical(d$labels[2], "1=male; 2=female") + }) }) @@ -132,3 +133,23 @@ test_that("data_write, no file extension", { expect_error(data_write(d, "mytestfile")) expect_error(data_write(d, NULL)) }) + + +# writing character vector works for missing value labels ------------------ + +test_that("data_write, existing variable label but missing value labels", { + skip_if_not_installed("withr") + withr::with_tempfile("tmp", fileext = ".sav", code = { + d <- data.frame( + a = letters[1:3], + stringsAsFactors = FALSE + ) + d$a <- assign_labels(d$a, variable = "First") + # expect message, but no error + expect_message(data_write(d, tmp), regex = "Preparing") + + # check if data is really the same + d2 <- data_read(tmp, verbose = FALSE) + expect_identical(d2, d) + }) +}) diff --git a/tests/testthat/test-describe_distribution.R b/tests/testthat/test-describe_distribution.R index 5976286ae..83d2abb33 100644 --- a/tests/testthat/test-describe_distribution.R +++ b/tests/testthat/test-describe_distribution.R @@ -4,8 +4,8 @@ test_that("describe_distribution - numeric: works with basic numeric vector", { skip_if_not_installed("bayestestR") x <- describe_distribution(mtcars$mpg) - expect_equal(dim(x), c(1, 9)) - expect_equal(round(x$Mean), 20) + expect_identical(dim(x), c(1L, 9L)) + expect_identical(round(x$Mean), 20) }) test_that("describe_distribution - numeric: correctly handles missing values", { @@ -15,8 +15,8 @@ test_that("describe_distribution - numeric: correctly handles missing values", { test <- mtcars$mpg test[1] <- NA with_missing <- describe_distribution(test) - expect_equal(with_missing$n, 31) - expect_equal(with_missing$n_Missing, 1) + expect_identical(with_missing$n, 31L) + expect_identical(with_missing$n_Missing, 1L) expect_false(with_missing$Mean == no_missing$Mean) }) @@ -24,7 +24,7 @@ test_that("describe_distribution - numeric: works with quartiles", { skip_if_not_installed("bayestestR") x <- describe_distribution(mtcars$mpg, quartiles = TRUE) - expect_equal(dim(x), c(1, 11)) + expect_identical(dim(x), c(1L, 11L)) expect_true("Q1" %in% names(x)) expect_true("Q3" %in% names(x)) }) @@ -33,7 +33,7 @@ test_that("describe_distribution - numeric: works with range", { skip_if_not_installed("bayestestR") x <- describe_distribution(mtcars$mpg, range = FALSE) - expect_equal(dim(x), c(1, 7)) + expect_identical(dim(x), c(1L, 7L)) expect_false("min" %in% names(x)) expect_false("max" %in% names(x)) }) @@ -53,8 +53,8 @@ test_that("describe_distribution - data frame: works with basic data frame", { skip_if_not_installed("bayestestR") x <- describe_distribution(mtcars) - expect_equal(dim(x), c(11, 10)) - expect_equal(round(x[1, "Mean"]), 20) + expect_identical(dim(x), c(11L, 10L)) + expect_identical(round(x[1, "Mean"]), 20) }) test_that("describe_distribution - data frame: correctly handles missing values", { @@ -64,8 +64,8 @@ test_that("describe_distribution - data frame: correctly handles missing values" test <- mtcars test[1, ] <- NA with_missing <- describe_distribution(test) - expect_equal(unique(with_missing$n), 31) - expect_equal(unique(with_missing$n_Missing), 1) + expect_identical(unique(with_missing$n), 31L) + expect_identical(unique(with_missing$n_Missing), 1L) expect_false(unique(with_missing$Mean == no_missing$Mean)) }) @@ -73,7 +73,7 @@ test_that("describe_distribution - data frame: works with quartiles", { skip_if_not_installed("bayestestR") x <- describe_distribution(mtcars, quartiles = TRUE) - expect_equal(dim(x), c(11, 12)) + expect_identical(dim(x), c(11L, 12L)) expect_true("Q1" %in% names(x)) expect_true("Q3" %in% names(x)) }) @@ -82,7 +82,7 @@ test_that("describe_distribution - data frame: works with range", { skip_if_not_installed("bayestestR") x <- describe_distribution(mtcars, range = FALSE) - expect_equal(dim(x), c(11, 8)) + expect_identical(dim(x), c(11L, 8L)) expect_false("min" %in% names(x)) expect_false("max" %in% names(x)) }) @@ -120,14 +120,14 @@ test_that("describe_distribution - list: works with basic list", { named <- describe_distribution(list(foo = mtcars$mpg, foo2 = mtcars$cyl)) mix <- describe_distribution(list(foo = mtcars$mpg, mtcars$cyl)) - expect_equal(dim(stored), c(2, 10)) - expect_equal(round(stored$Mean), c(20, 6)) - expect_equal(dim(unnamed), c(2, 10)) - expect_equal(round(unnamed$Mean), c(20, 6)) - expect_equal(dim(named), c(2, 10)) - expect_equal(round(named$Mean), c(20, 6)) - expect_equal(dim(mix), c(2, 10)) - expect_equal(round(mix$Mean), c(20, 6)) + expect_identical(dim(stored), c(2L, 10L)) + expect_identical(round(stored$Mean), c(20, 6)) + expect_identical(dim(unnamed), c(2L, 10L)) + expect_identical(round(unnamed$Mean), c(20, 6)) + expect_identical(dim(named), c(2L, 10L)) + expect_identical(round(named$Mean), c(20, 6)) + expect_identical(dim(mix), c(2L, 10L)) + expect_identical(round(mix$Mean), c(20, 6)) }) test_that("describe_distribution - list: works with include_factors", { @@ -140,14 +140,14 @@ test_that("describe_distribution - list: works with include_factors", { x2 <- describe_distribution(list(mtcars$mpg, factor(mtcars$cyl)), include_factors = TRUE ) - expect_equal(dim(x2), c(2, 10)) - expect_equal(x2$Variable, c("mtcars$mpg", "factor(mtcars$cyl)")) + expect_identical(dim(x2), c(2L, 10L)) + expect_identical(x2$Variable, c("mtcars$mpg", "factor(mtcars$cyl)")) x3 <- describe_distribution(list(mtcars$mpg, foo = factor(mtcars$cyl)), include_factors = TRUE ) - expect_equal(dim(x3), c(2, 10)) - expect_equal(x3$Variable, c("mtcars$mpg", "foo")) + expect_identical(dim(x3), c(2L, 10L)) + expect_identical(x3$Variable, c("mtcars$mpg", "foo")) }) test_that("describe_distribution - list: correctly removes character elements", { @@ -167,10 +167,10 @@ test_that("describe_distribution - list: correctly handles variable names", { named <- describe_distribution(list(foo = mtcars$mpg, foo2 = mtcars$cyl)) mix <- describe_distribution(list(foo = mtcars$mpg, mtcars$cyl)) - expect_equal(stored$Variable, c("Var_1", "Var_2")) - expect_equal(unnamed$Variable, c("mtcars$mpg", "mtcars$cyl")) - expect_equal(named$Variable, c("foo", "foo2")) - expect_equal(mix$Variable, c("foo", "mtcars$cyl")) + expect_identical(stored$Variable, c("Var_1", "Var_2")) + expect_identical(unnamed$Variable, c("mtcars$mpg", "mtcars$cyl")) + expect_identical(named$Variable, c("foo", "foo2")) + expect_identical(mix$Variable, c("foo", "mtcars$cyl")) }) test_that("describe_distribution - list: correctly handles missing values", { @@ -182,8 +182,8 @@ test_that("describe_distribution - list: correctly handles missing values", { test[1] <- NA test2[1] <- NA with_missing <- describe_distribution(list(test, test2)) - expect_equal(unique(with_missing$n), 31) - expect_equal(unique(with_missing$n_Missing), 1) + expect_identical(unique(with_missing$n), 31L) + expect_identical(unique(with_missing$n_Missing), 1L) expect_false(unique(with_missing$Mean == no_missing$Mean)) }) @@ -191,7 +191,7 @@ test_that("describe_distribution - list: works with quartiles", { skip_if_not_installed("bayestestR") x <- describe_distribution(list(mtcars$mpg, mtcars$cyl), quartiles = TRUE) - expect_equal(dim(x), c(2, 12)) + expect_identical(dim(x), c(2L, 12L)) expect_true("Q1" %in% names(x)) expect_true("Q3" %in% names(x)) }) @@ -200,7 +200,7 @@ test_that("describe_distribution - list: works with range", { skip_if_not_installed("bayestestR") x <- describe_distribution(list(mtcars$mpg, mtcars$cyl), range = FALSE) - expect_equal(dim(x), c(2, 8)) + expect_identical(dim(x), c(2L, 8L)) expect_false("min" %in% names(x)) expect_false("max" %in% names(x)) }) @@ -215,7 +215,7 @@ test_that("describe_distribution - select", { data(iris) out <- describe_distribution(iris, select = starts_with("Petal")) - expect_equal(out$Variable, c("Petal.Length", "Petal.Width")) + expect_identical(out$Variable, c("Petal.Length", "Petal.Width")) expect_equal(out$Mean, c(3.758000, 1.199333), tolerance = 1e-3) expect_null(describe_distribution(iris, select = "Species")) @@ -235,12 +235,12 @@ test_that("describe_distribution - grouped df", { x <- data_group(iris, Species) out <- describe_distribution(x, select = starts_with("Petal")) - expect_equal(out$.group, c( + expect_identical(out$.group, c( "Species=setosa", "Species=setosa", "Species=versicolor", "Species=versicolor", "Species=virginica", "Species=virginica" )) - expect_equal(out$Variable, c( + expect_identical(out$Variable, c( "Petal.Length", "Petal.Width", "Petal.Length", "Petal.Width", "Petal.Length", "Petal.Width" @@ -250,23 +250,22 @@ test_that("describe_distribution - grouped df", { # distribution_mode -------------------------- - test_that("distribution_mode works as expected", { skip_if_not_installed("bayestestR") # atomic vector - expect_equal(distribution_mode(c(1, 2, 3, 3, 4, 5)), 3) - expect_equal(distribution_mode(c(1, 2, 3, 3, 4, 4, 5)), 3) - expect_equal(distribution_mode(c(1.5, 2.3, 3.7, 3.7, 4.0, 5)), 3.7) + expect_identical(distribution_mode(c(1, 2, 3, 3, 4, 5)), 3) + expect_identical(distribution_mode(c(1, 2, 3, 3, 4, 4, 5)), 3) + expect_identical(distribution_mode(c(1.5, 2.3, 3.7, 3.7, 4.0, 5)), 3.7) # list - expect_equal(distribution_mode(list(1, 2, 3, 3, 4, 5)), list(3)) + expect_identical(distribution_mode(list(1, 2, 3, 3, 4, 5)), list(3)) # scalar - expect_equal(distribution_mode("a"), "a") + expect_identical(distribution_mode("a"), "a") # empty - expect_null(distribution_mode(c())) + expect_null(distribution_mode(NULL)) }) # select helpers ------------------------------ @@ -275,6 +274,15 @@ test_that("describe_distribution regex", { expect_equal( describe_distribution(mtcars, select = "pg", regex = TRUE), - describe_distribution(mtcars, select = "mpg") + describe_distribution(mtcars, select = "mpg"), + ignore_attr = TRUE ) }) + +# formatting ------------------------------ +test_that("describe_distribution formatting", { + skip_if_not_installed("bayestestR") + data(iris) + x <- describe_distribution(iris$Sepal.Width, quartiles = TRUE) + expect_snapshot(format(x)) +}) diff --git a/tests/testthat/test-labels_to_levels.R b/tests/testthat/test-labels_to_levels.R index 866154c8f..87a0418b9 100644 --- a/tests/testthat/test-labels_to_levels.R +++ b/tests/testthat/test-labels_to_levels.R @@ -28,6 +28,18 @@ test_that("labels_to_levels, factor, error on no labels", { expect_error(labels_to_levels(iris), regex = "Could not change factor") }) +test_that("labels_to_levels, data frame, append", { + data(efc) + out <- labels_to_levels(efc, append = "_ll") + expect_named(out, c("c12hour", "e16sex", "e42dep", "c172code", "neg_c_7", "e42dep_ll")) +}) + +test_that("labels_to_levels, data frame, append", { + data(iris) + d <- as.data.frame(lapply(iris, as.factor)) + expect_identical(labels_to_levels(d), d) +}) + test_that("labels_to_levels, factor, data frame", { data(efc) out <- labels_to_levels(efc) diff --git a/tests/testthat/test-standardize_models.R b/tests/testthat/test-standardize_models.R index dcaa832d5..d8f117cb9 100644 --- a/tests/testthat/test-standardize_models.R +++ b/tests/testthat/test-standardize_models.R @@ -58,19 +58,21 @@ test_that("transformations", { effectsize::standardize_parameters(fit_exp, method = "refit")[2, 2], unname(coef(fit_scale1)[2]), tolerance = 1e-4 + ignore_attr = TRUE ) expect_equal( effectsize::standardize_parameters(fit_exp, method = "basic")[2, 2], unname(coef(fit_scale2)[2]), tolerance = 1e-4 + ignore_attr = TRUE ) skip_if_not_installed("insight", minimum_version = "0.10.0") d <- data.frame( time = as.factor(c(1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5)), group = c(1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2), - sum = c(0, 5, 10, 15, 20, 0, 20, 25, 45, 50, 0, 5, 10, 15, 20, 0, 20, 25, 45, 50, 0, 5, 10, 15, 20, 0, 20, 25, 45, 50) + sum = c(0, 5, 10, 15, 20, 0, 20, 25, 45, 50, 0, 5, 10, 15, 20, 0, 20, 25, 45, 50, 0, 5, 10, 15, 20, 0, 20, 25, 45, 50) # nolint ) m <- lm(log(sum + 1) ~ as.numeric(time) * group, data = d) @@ -113,12 +115,14 @@ test_that("weights", { stdREFIT[[2]], effectsize::standardize_parameters(m, method = "posthoc")[[2]], tolerance = 1e-4 + ignore_attr = TRUE ) expect_equal( stdREFIT[[2]], effectsize::standardize_parameters(m, method = "basic")[[2]], tolerance = 1e-4 + ignore_attr = TRUE ) }) @@ -293,9 +297,12 @@ test_that("offsets", { m <- glm(cyl ~ hp + offset(wt), family = poisson(), data = mtcars) - expect_warning({ - mz <- standardize(m) - }, regexp = NA) + expect_warning( + { + mz <- standardize(m) + }, + regexp = NA + ) par1 <- parameters::model_parameters(mz) par2 <- effectsize::standardize_parameters(m, method = "basic") @@ -311,15 +318,12 @@ test_that("brms", { skip_if_not_installed("brms") invisible( - capture.output( - { - mod <- brms::brm( - mpg ~ hp, - data = mtcars, - refresh = 0, chains = 1, silent = 2 - ) - } - ) + capture.output({ + mod <- brms::brm(mpg ~ hp, + data = mtcars, + refresh = 0, chains = 1, silent = 2 + ) + }) ) expect_warning( diff --git a/vignettes/standardize_data.Rmd b/vignettes/standardize_data.Rmd index acbaa5ccc..7b0e33bab 100644 --- a/vignettes/standardize_data.Rmd +++ b/vignettes/standardize_data.Rmd @@ -26,10 +26,18 @@ pkgs <- c( "see", "ggplot2", "parameters", - "lme4" + "lme4", + "curl" ) +pkg_available <- all(vapply(pkgs, requireNamespace, quietly = TRUE, FUN.VALUE = logical(1L))) -if (!all(vapply(pkgs, requireNamespace, quietly = TRUE, FUN.VALUE = logical(1L)))) { +if (pkg_available) { + net_available <- curl::has_internet() +} else { + net_available <- FALSE +} + +if (!pkg_available || !net_available) { knitr::opts_chunk$set(eval = FALSE) } ```