diff --git a/R/data_rotate.R b/R/data_rotate.R index f9a3389d3..2bb39a85d 100644 --- a/R/data_rotate.R +++ b/R/data_rotate.R @@ -56,7 +56,7 @@ data_rotate <- function(data, rownames = NULL, colnames = FALSE, verbose = TRUE) # warning after possible removal of columns if (verbose && insight::n_unique(vapply(data, typeof, FUN.VALUE = character(1L))) > 1L) { - insight::format_warning("Your data frame contains mixed types of data. After transposition, all variables will be transformed into characters.") + insight::format_warning("Your data frame contains mixed types of data. After transposition, all variables will be transformed into characters.") # nolint } # rotate data frame by 90 degrees diff --git a/R/data_seek.R b/R/data_seek.R index c0a56ab8f..17463878d 100644 --- a/R/data_seek.R +++ b/R/data_seek.R @@ -63,7 +63,7 @@ data_seek <- function(data, pattern, seek = c("names", "labels"), fuzzy = FALSE) # check valid args seek <- intersect(seek, c("names", "labels", "values", "levels", "column_names", "columns", "all")) if (is.null(seek) || !length(seek)) { - insight::format_error("`seek` must be one of \"names\", \"labels\", \"values\", a combination of these options, or \"all\".") + insight::format_error("`seek` must be one of \"names\", \"labels\", \"values\", a combination of these options, or \"all\".") # nolint } pos1 <- pos2 <- pos3 <- NULL @@ -71,7 +71,7 @@ data_seek <- function(data, pattern, seek = c("names", "labels"), fuzzy = FALSE) pos <- unlist(lapply(pattern, function(search_pattern) { # search in variable names? if (any(seek %in% c("names", "columns", "column_names", "all"))) { - pos1 <- which(grepl(search_pattern, colnames(data))) + pos1 <- grep(search_pattern, colnames(data)) # find in near distance? if (fuzzy) { pos1 <- c(pos1, .fuzzy_grep(x = colnames(data), pattern = search_pattern)) @@ -80,15 +80,15 @@ data_seek <- function(data, pattern, seek = c("names", "labels"), fuzzy = FALSE) # search in variable labels? if (any(seek %in% c("labels", "all"))) { - labels <- insight::compact_character(unlist(lapply(data, attr, which = "label", exact = TRUE))) - if (!is.null(labels) && length(labels)) { - found <- grepl(search_pattern, labels) - pos2 <- match(names(labels)[found], colnames(data)) + var_labels <- insight::compact_character(unlist(lapply(data, attr, which = "label", exact = TRUE))) + if (!is.null(var_labels) && length(var_labels)) { + found <- grepl(search_pattern, var_labels) + pos2 <- match(names(var_labels)[found], colnames(data)) # find in near distanc? if (fuzzy) { - found <- .fuzzy_grep(x = labels, pattern = search_pattern) + found <- .fuzzy_grep(x = var_labels, pattern = search_pattern) if (length(found)) { - pos2 <- c(pos2, match(names(labels)[found], colnames(data))) + pos2 <- c(pos2, match(names(var_labels)[found], colnames(data))) } } } @@ -129,7 +129,7 @@ data_seek <- function(data, pattern, seek = c("names", "labels"), fuzzy = FALSE) pos <- unique(pos) # variable labels of matching variables - labels <- vapply( + var_labels <- vapply( colnames(data[pos]), function(i) { l <- attr(data[[i]], "label", exact = TRUE) @@ -145,7 +145,7 @@ data_seek <- function(data, pattern, seek = c("names", "labels"), fuzzy = FALSE) out <- data.frame( index = pos, column = colnames(data)[pos], - labels = labels, + labels = var_labels, stringsAsFactors = FALSE ) # no row names diff --git a/R/data_separate.R b/R/data_separate.R index c5e83183d..53243fb33 100644 --- a/R/data_separate.R +++ b/R/data_separate.R @@ -254,7 +254,7 @@ data_separate <- function(data, # catch error if (is.null(separated_columns)) { insight::format_error( - "Something went wrong. Probably the number of provided column names did not match number of newly created columns?" + "Something went wrong. Probably the number of provided column names did not match number of newly created columns?" # nolint ) } @@ -264,14 +264,12 @@ data_separate <- function(data, # if no column names provided, use standard names if (is.null(new_columns[[sep_column]])) { new_column_names <- paste0(sep_column, "_", seq_along(out)) - } else { + } else if (make_unique_colnames) { # if we have multiple columns that were separated, we avoid duplicated # column names of created variables by appending name of original column - if (make_unique_colnames) { - new_column_names <- paste0(sep_column, "_", new_columns[[sep_column]]) - } else { - new_column_names <- new_columns[[sep_column]] - } + new_column_names <- paste0(sep_column, "_", new_columns[[sep_column]]) + } else { + new_column_names <- new_columns[[sep_column]] } colnames(out) <- new_column_names @@ -338,29 +336,27 @@ data_separate <- function(data, out <- rep(NA_character_, times = n_cols) } else if (n_values > n_cols) { # we have more values than required - drop extra columns - if (extra == "drop_left") { - out <- i[(n_values - n_cols + 1):n_values] - } else if (extra == "drop_right") { - out <- i[1:n_cols] - } else if (extra == "merge_left") { - out <- paste(i[1:(n_values - n_cols + 1)], collapse = " ") - out <- c(out, i[(n_values - n_cols + 2):n_values]) - } else { - out <- i[1:(n_cols - 1)] - out <- c(out, paste(i[n_cols:n_values], collapse = " ")) - } + out <- switch(extra, + drop_left = i[(n_values - n_cols + 1):n_values], + drop_right = i[1:n_cols], + merge_left = { + tmp <- paste(i[1:(n_values - n_cols + 1)], collapse = " ") + c(tmp, i[(n_values - n_cols + 2):n_values]) + }, + { + tmp <- i[1:(n_cols - 1)] + c(tmp, paste(i[n_cols:n_values], collapse = " ")) + } + ) warn_extra <- TRUE } else if (n_values < n_cols) { # we have fewer values than required - fill columns - if (fill == "left") { - out <- c(rep(NA_character_, times = n_cols - n_values), i) - } else if (fill == "right") { - out <- c(i, rep(NA_character_, times = n_cols - n_values)) - } else if (fill == "value_left") { - out <- c(rep(i[1], times = n_cols - n_values), i) - } else { - out <- c(i, rep(i[length(i)], times = n_cols - n_values)) - } + out <- switch(fill, + left = c(rep(NA_character_, times = n_cols - n_values), i), + right = c(i, rep(NA_character_, times = n_cols - n_values)), + value_left = c(rep(i[1], times = n_cols - n_values), i), + c(i, rep(i[length(i)], times = n_cols - n_values)) + ) warn_fill <- TRUE } else { out <- i diff --git a/R/describe_distribution.R b/R/describe_distribution.R index 00a8a2475..37850299a 100644 --- a/R/describe_distribution.R +++ b/R/describe_distribution.R @@ -82,18 +82,18 @@ describe_distribution.list <- function(x, num_el <- which(vapply(x, is.numeric, FUN.VALUE = logical(1L))) # get elements names as is - # ex: list(mtcars$mpg, mtcars$cyl) -> c("mtcars$mpg", "mtcars$cyl") + # ex: `list(mtcars$mpg, mtcars$cyl) -> c("mtcars$mpg", "mtcars$cyl")` nm <- vapply(sys.call()[[2]], insight::safe_deparse, FUN.VALUE = character(1L))[-1] - if (!isTRUE(include_factors)) { - x <- x[num_el] + if (isTRUE(include_factors)) { + x <- x[c(num_el, factor_el)] if (length(nm) != 0) { - nm <- nm[num_el] + nm <- nm[c(num_el, factor_el)] } } else { - x <- x[c(num_el, factor_el)] + x <- x[num_el] if (length(nm) != 0) { - nm <- nm[c(num_el, factor_el)] + nm <- nm[num_el] } } @@ -123,12 +123,12 @@ describe_distribution.list <- function(x, })) - if (!is.null(names(x))) { - empty_names <- which(names(x) == "") + if (is.null(names(x))) { + new_names <- nm + } else { + empty_names <- which(!nzchar(names(x), keepNA = TRUE)) new_names <- names(x) new_names[empty_names] <- nm[empty_names] - } else { - new_names <- nm } out$Variable <- new_names diff --git a/R/labels_to_levels.R b/R/labels_to_levels.R index ed63e0583..c1ff97a16 100644 --- a/R/labels_to_levels.R +++ b/R/labels_to_levels.R @@ -79,7 +79,7 @@ labels_to_levels.data.frame <- function(x, # create the new variables and updates "select", so new variables are processed if (!isFALSE(append)) { # process arguments - args <- .process_append( + arguments <- .process_append( x, select, append, @@ -89,8 +89,8 @@ labels_to_levels.data.frame <- function(x, keep_character = FALSE ) # update processed arguments - x <- args$x - select <- args$select + x <- arguments$x + select <- arguments$select } x[select] <- lapply( diff --git a/R/to_numeric.R b/R/to_numeric.R index 8bfcac6bc..c43956399 100644 --- a/R/to_numeric.R +++ b/R/to_numeric.R @@ -17,6 +17,12 @@ #' @inheritParams find_columns #' @inheritParams categorize #' +#' @note By default, `to_numeric()` converts factors into "binary" dummies, i.e. +#' each factor level is converted into a separate column filled with a binary +#' 0-1 value. If only one column is required, use `dummy_factors = FALSE`. If +#' you want to preserve the original factor levels (in case these represent +#' numeric values), use `preserve_levels = TRUE`. +#' #' @section Selection of variables - `select` argument: #' For most functions that have a `select` argument the complete input data #' frame is returned, even when `select` only selects a range of variables. @@ -34,6 +40,8 @@ #' x <- as.factor(mtcars$gear) #' to_numeric(x, dummy_factors = FALSE) #' to_numeric(x, dummy_factors = FALSE, preserve_levels = TRUE) +#' # same as: +#' coerce_to_numeric(x) #' #' @return A data frame of numeric variables. #' @@ -211,15 +219,13 @@ to_numeric.factor <- function(x, # if the first observation was missing, add NA row and bind data frame if (i == 1 && na_values[i] == 1) { out <- rbind(NA, out) - } else { + } else if (na_values[i] == rows_x) { # if the last observation was NA, add NA row to data frame - if (na_values[i] == rows_x) { - out <- rbind(out, NA) - } else { - # else, pick rows from beginning to current NA value, add NA, - # and rbind the remaining rows - out <- rbind(out[1:(na_values[i] - 1), ], NA, out[na_values[i]:nrow(out), ]) - } + out <- rbind(out, NA) + } else { + # else, pick rows from beginning to current NA value, add NA, + # and rbind the remaining rows + out <- rbind(out[1:(na_values[i] - 1), ], NA, out[na_values[i]:nrow(out), ]) } } rownames(out) <- NULL diff --git a/man/to_numeric.Rd b/man/to_numeric.Rd index 540f4f65a..7c78b1ba6 100644 --- a/man/to_numeric.Rd +++ b/man/to_numeric.Rd @@ -106,6 +106,13 @@ Convert data to numeric by converting characters to factors and factors to either numeric levels or dummy variables. The "counterpart" to convert variables into factors is \code{to_factor()}. } +\note{ +By default, \code{to_numeric()} converts factors into "binary" dummies, i.e. +each factor level is converted into a separate column filled with a binary +0-1 value. If only one column is required, use \code{dummy_factors = FALSE}. If +you want to preserve the original factor levels (in case these represent +numeric values), use \code{preserve_levels = TRUE}. +} \section{Selection of variables - \code{select} argument}{ For most functions that have a \code{select} argument the complete input data @@ -125,5 +132,7 @@ to_numeric(head(ToothGrowth), dummy_factors = FALSE) x <- as.factor(mtcars$gear) to_numeric(x, dummy_factors = FALSE) to_numeric(x, dummy_factors = FALSE, preserve_levels = TRUE) +# same as: +coerce_to_numeric(x) } diff --git a/tests/testthat/_snaps/describe_distribution.md b/tests/testthat/_snaps/describe_distribution.md index 5c582859d..0261ab9b1 100644 --- a/tests/testthat/_snaps/describe_distribution.md +++ b/tests/testthat/_snaps/describe_distribution.md @@ -16,3 +16,12 @@ ----------------------------------------------------------- | | [VC, OJ] | 0 | -2.07 | 60 | 0 +# describe_distribution formatting + + Code + format(x) + Output + Mean | SD | IQR | Range | Quartiles | Skewness | Kurtosis | n | n_Missing + -------------------------------------------------------------------------------------- + 3.06 | 0.44 | 0.52 | [2.00, 4.40] | 2.80, 3.30 | 0.32 | 0.23 | 150 | 0 + diff --git a/tests/testthat/test-describe_distribution.R b/tests/testthat/test-describe_distribution.R index 5976286ae..83d2abb33 100644 --- a/tests/testthat/test-describe_distribution.R +++ b/tests/testthat/test-describe_distribution.R @@ -4,8 +4,8 @@ test_that("describe_distribution - numeric: works with basic numeric vector", { skip_if_not_installed("bayestestR") x <- describe_distribution(mtcars$mpg) - expect_equal(dim(x), c(1, 9)) - expect_equal(round(x$Mean), 20) + expect_identical(dim(x), c(1L, 9L)) + expect_identical(round(x$Mean), 20) }) test_that("describe_distribution - numeric: correctly handles missing values", { @@ -15,8 +15,8 @@ test_that("describe_distribution - numeric: correctly handles missing values", { test <- mtcars$mpg test[1] <- NA with_missing <- describe_distribution(test) - expect_equal(with_missing$n, 31) - expect_equal(with_missing$n_Missing, 1) + expect_identical(with_missing$n, 31L) + expect_identical(with_missing$n_Missing, 1L) expect_false(with_missing$Mean == no_missing$Mean) }) @@ -24,7 +24,7 @@ test_that("describe_distribution - numeric: works with quartiles", { skip_if_not_installed("bayestestR") x <- describe_distribution(mtcars$mpg, quartiles = TRUE) - expect_equal(dim(x), c(1, 11)) + expect_identical(dim(x), c(1L, 11L)) expect_true("Q1" %in% names(x)) expect_true("Q3" %in% names(x)) }) @@ -33,7 +33,7 @@ test_that("describe_distribution - numeric: works with range", { skip_if_not_installed("bayestestR") x <- describe_distribution(mtcars$mpg, range = FALSE) - expect_equal(dim(x), c(1, 7)) + expect_identical(dim(x), c(1L, 7L)) expect_false("min" %in% names(x)) expect_false("max" %in% names(x)) }) @@ -53,8 +53,8 @@ test_that("describe_distribution - data frame: works with basic data frame", { skip_if_not_installed("bayestestR") x <- describe_distribution(mtcars) - expect_equal(dim(x), c(11, 10)) - expect_equal(round(x[1, "Mean"]), 20) + expect_identical(dim(x), c(11L, 10L)) + expect_identical(round(x[1, "Mean"]), 20) }) test_that("describe_distribution - data frame: correctly handles missing values", { @@ -64,8 +64,8 @@ test_that("describe_distribution - data frame: correctly handles missing values" test <- mtcars test[1, ] <- NA with_missing <- describe_distribution(test) - expect_equal(unique(with_missing$n), 31) - expect_equal(unique(with_missing$n_Missing), 1) + expect_identical(unique(with_missing$n), 31L) + expect_identical(unique(with_missing$n_Missing), 1L) expect_false(unique(with_missing$Mean == no_missing$Mean)) }) @@ -73,7 +73,7 @@ test_that("describe_distribution - data frame: works with quartiles", { skip_if_not_installed("bayestestR") x <- describe_distribution(mtcars, quartiles = TRUE) - expect_equal(dim(x), c(11, 12)) + expect_identical(dim(x), c(11L, 12L)) expect_true("Q1" %in% names(x)) expect_true("Q3" %in% names(x)) }) @@ -82,7 +82,7 @@ test_that("describe_distribution - data frame: works with range", { skip_if_not_installed("bayestestR") x <- describe_distribution(mtcars, range = FALSE) - expect_equal(dim(x), c(11, 8)) + expect_identical(dim(x), c(11L, 8L)) expect_false("min" %in% names(x)) expect_false("max" %in% names(x)) }) @@ -120,14 +120,14 @@ test_that("describe_distribution - list: works with basic list", { named <- describe_distribution(list(foo = mtcars$mpg, foo2 = mtcars$cyl)) mix <- describe_distribution(list(foo = mtcars$mpg, mtcars$cyl)) - expect_equal(dim(stored), c(2, 10)) - expect_equal(round(stored$Mean), c(20, 6)) - expect_equal(dim(unnamed), c(2, 10)) - expect_equal(round(unnamed$Mean), c(20, 6)) - expect_equal(dim(named), c(2, 10)) - expect_equal(round(named$Mean), c(20, 6)) - expect_equal(dim(mix), c(2, 10)) - expect_equal(round(mix$Mean), c(20, 6)) + expect_identical(dim(stored), c(2L, 10L)) + expect_identical(round(stored$Mean), c(20, 6)) + expect_identical(dim(unnamed), c(2L, 10L)) + expect_identical(round(unnamed$Mean), c(20, 6)) + expect_identical(dim(named), c(2L, 10L)) + expect_identical(round(named$Mean), c(20, 6)) + expect_identical(dim(mix), c(2L, 10L)) + expect_identical(round(mix$Mean), c(20, 6)) }) test_that("describe_distribution - list: works with include_factors", { @@ -140,14 +140,14 @@ test_that("describe_distribution - list: works with include_factors", { x2 <- describe_distribution(list(mtcars$mpg, factor(mtcars$cyl)), include_factors = TRUE ) - expect_equal(dim(x2), c(2, 10)) - expect_equal(x2$Variable, c("mtcars$mpg", "factor(mtcars$cyl)")) + expect_identical(dim(x2), c(2L, 10L)) + expect_identical(x2$Variable, c("mtcars$mpg", "factor(mtcars$cyl)")) x3 <- describe_distribution(list(mtcars$mpg, foo = factor(mtcars$cyl)), include_factors = TRUE ) - expect_equal(dim(x3), c(2, 10)) - expect_equal(x3$Variable, c("mtcars$mpg", "foo")) + expect_identical(dim(x3), c(2L, 10L)) + expect_identical(x3$Variable, c("mtcars$mpg", "foo")) }) test_that("describe_distribution - list: correctly removes character elements", { @@ -167,10 +167,10 @@ test_that("describe_distribution - list: correctly handles variable names", { named <- describe_distribution(list(foo = mtcars$mpg, foo2 = mtcars$cyl)) mix <- describe_distribution(list(foo = mtcars$mpg, mtcars$cyl)) - expect_equal(stored$Variable, c("Var_1", "Var_2")) - expect_equal(unnamed$Variable, c("mtcars$mpg", "mtcars$cyl")) - expect_equal(named$Variable, c("foo", "foo2")) - expect_equal(mix$Variable, c("foo", "mtcars$cyl")) + expect_identical(stored$Variable, c("Var_1", "Var_2")) + expect_identical(unnamed$Variable, c("mtcars$mpg", "mtcars$cyl")) + expect_identical(named$Variable, c("foo", "foo2")) + expect_identical(mix$Variable, c("foo", "mtcars$cyl")) }) test_that("describe_distribution - list: correctly handles missing values", { @@ -182,8 +182,8 @@ test_that("describe_distribution - list: correctly handles missing values", { test[1] <- NA test2[1] <- NA with_missing <- describe_distribution(list(test, test2)) - expect_equal(unique(with_missing$n), 31) - expect_equal(unique(with_missing$n_Missing), 1) + expect_identical(unique(with_missing$n), 31L) + expect_identical(unique(with_missing$n_Missing), 1L) expect_false(unique(with_missing$Mean == no_missing$Mean)) }) @@ -191,7 +191,7 @@ test_that("describe_distribution - list: works with quartiles", { skip_if_not_installed("bayestestR") x <- describe_distribution(list(mtcars$mpg, mtcars$cyl), quartiles = TRUE) - expect_equal(dim(x), c(2, 12)) + expect_identical(dim(x), c(2L, 12L)) expect_true("Q1" %in% names(x)) expect_true("Q3" %in% names(x)) }) @@ -200,7 +200,7 @@ test_that("describe_distribution - list: works with range", { skip_if_not_installed("bayestestR") x <- describe_distribution(list(mtcars$mpg, mtcars$cyl), range = FALSE) - expect_equal(dim(x), c(2, 8)) + expect_identical(dim(x), c(2L, 8L)) expect_false("min" %in% names(x)) expect_false("max" %in% names(x)) }) @@ -215,7 +215,7 @@ test_that("describe_distribution - select", { data(iris) out <- describe_distribution(iris, select = starts_with("Petal")) - expect_equal(out$Variable, c("Petal.Length", "Petal.Width")) + expect_identical(out$Variable, c("Petal.Length", "Petal.Width")) expect_equal(out$Mean, c(3.758000, 1.199333), tolerance = 1e-3) expect_null(describe_distribution(iris, select = "Species")) @@ -235,12 +235,12 @@ test_that("describe_distribution - grouped df", { x <- data_group(iris, Species) out <- describe_distribution(x, select = starts_with("Petal")) - expect_equal(out$.group, c( + expect_identical(out$.group, c( "Species=setosa", "Species=setosa", "Species=versicolor", "Species=versicolor", "Species=virginica", "Species=virginica" )) - expect_equal(out$Variable, c( + expect_identical(out$Variable, c( "Petal.Length", "Petal.Width", "Petal.Length", "Petal.Width", "Petal.Length", "Petal.Width" @@ -250,23 +250,22 @@ test_that("describe_distribution - grouped df", { # distribution_mode -------------------------- - test_that("distribution_mode works as expected", { skip_if_not_installed("bayestestR") # atomic vector - expect_equal(distribution_mode(c(1, 2, 3, 3, 4, 5)), 3) - expect_equal(distribution_mode(c(1, 2, 3, 3, 4, 4, 5)), 3) - expect_equal(distribution_mode(c(1.5, 2.3, 3.7, 3.7, 4.0, 5)), 3.7) + expect_identical(distribution_mode(c(1, 2, 3, 3, 4, 5)), 3) + expect_identical(distribution_mode(c(1, 2, 3, 3, 4, 4, 5)), 3) + expect_identical(distribution_mode(c(1.5, 2.3, 3.7, 3.7, 4.0, 5)), 3.7) # list - expect_equal(distribution_mode(list(1, 2, 3, 3, 4, 5)), list(3)) + expect_identical(distribution_mode(list(1, 2, 3, 3, 4, 5)), list(3)) # scalar - expect_equal(distribution_mode("a"), "a") + expect_identical(distribution_mode("a"), "a") # empty - expect_null(distribution_mode(c())) + expect_null(distribution_mode(NULL)) }) # select helpers ------------------------------ @@ -275,6 +274,15 @@ test_that("describe_distribution regex", { expect_equal( describe_distribution(mtcars, select = "pg", regex = TRUE), - describe_distribution(mtcars, select = "mpg") + describe_distribution(mtcars, select = "mpg"), + ignore_attr = TRUE ) }) + +# formatting ------------------------------ +test_that("describe_distribution formatting", { + skip_if_not_installed("bayestestR") + data(iris) + x <- describe_distribution(iris$Sepal.Width, quartiles = TRUE) + expect_snapshot(format(x)) +}) diff --git a/tests/testthat/test-labels_to_levels.R b/tests/testthat/test-labels_to_levels.R index 866154c8f..87a0418b9 100644 --- a/tests/testthat/test-labels_to_levels.R +++ b/tests/testthat/test-labels_to_levels.R @@ -28,6 +28,18 @@ test_that("labels_to_levels, factor, error on no labels", { expect_error(labels_to_levels(iris), regex = "Could not change factor") }) +test_that("labels_to_levels, data frame, append", { + data(efc) + out <- labels_to_levels(efc, append = "_ll") + expect_named(out, c("c12hour", "e16sex", "e42dep", "c172code", "neg_c_7", "e42dep_ll")) +}) + +test_that("labels_to_levels, data frame, append", { + data(iris) + d <- as.data.frame(lapply(iris, as.factor)) + expect_identical(labels_to_levels(d), d) +}) + test_that("labels_to_levels, factor, data frame", { data(efc) out <- labels_to_levels(efc)