From 7db36d6d558152aa1d1d0baaa2f5a245119db863 Mon Sep 17 00:00:00 2001 From: Daniel Date: Sat, 8 Jun 2024 18:25:36 +0200 Subject: [PATCH] Allow named character vector in `select`, to rename variables (#512) * Allow named character vector in `select`, to rename variables * typo * tests * Update R/select_nse.R Co-authored-by: Etienne Bacher <52219252+etiennebacher@users.noreply.github.com> * Update NEWS.md Co-authored-by: Etienne Bacher <52219252+etiennebacher@users.noreply.github.com> * lintr * add allow_rename * fix tests * error on duplicated names * Update tests/testthat/test-data_select.R Co-authored-by: Etienne Bacher <52219252+etiennebacher@users.noreply.github.com> * Update tests/testthat/test-data_select.R Co-authored-by: Etienne Bacher <52219252+etiennebacher@users.noreply.github.com> * Update R/select_nse.R Co-authored-by: Etienne Bacher <52219252+etiennebacher@users.noreply.github.com> * add test --------- Co-authored-by: Etienne Bacher <52219252+etiennebacher@users.noreply.github.com> --- DESCRIPTION | 2 +- NEWS.md | 5 ++++ R/data_select.R | 6 ++++ R/extract_column_names.R | 7 +++++ R/row_means.R | 6 ++-- R/select_nse.R | 27 ++++++++++++++++- R/to_factor.R | 6 ++-- R/unnormalize.R | 18 ++++++------ man/extract_column_names.Rd | 7 +++++ tests/testthat/test-data_select.R | 48 ++++++++++++++++++++++++++++--- 10 files changed, 111 insertions(+), 21 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 44785810d..2ed1150ae 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Type: Package Package: datawizard Title: Easy Data Wrangling and Statistical Transformations -Version: 0.11.0.1 +Version: 0.11.0.2 Authors@R: c( person("Indrajeet", "Patil", , "patilindrajeet.science@gmail.com", role = "aut", comment = c(ORCID = "0000-0003-1995-6531", Twitter = "@patilindrajeets")), diff --git a/NEWS.md b/NEWS.md index 4b346e9cb..1e49e91a4 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,10 @@ # datawizard 0.11.0.1 +## Changes + +* `data_select()` can directly rename selected variables when a named vector + is provided in `select`, e.g. `data_select(mtcars, c(new1 = "mpg", new2 = "cyl"))`. + # datawizard 0.11.0 BREAKING CHANGES diff --git a/R/data_select.R b/R/data_select.R index c59d41b72..0f62ba398 100644 --- a/R/data_select.R +++ b/R/data_select.R @@ -13,6 +13,7 @@ data_select <- function(data, exclude, ignore_case = ignore_case, regex = regex, + allow_rename = TRUE, verbose = FALSE ) @@ -28,6 +29,11 @@ data_select <- function(data, out <- data[columns] + # for named character vectors, we offer the service to directly rename the columns + if (!is.null(names(columns))) { + colnames(out) <- names(columns) + } + # add back attributes out <- .replace_attrs(out, a) out diff --git a/R/extract_column_names.R b/R/extract_column_names.R index f1cd68be2..b89173a8c 100644 --- a/R/extract_column_names.R +++ b/R/extract_column_names.R @@ -63,6 +63,10 @@ #' #' @details #' +#' Specifically for `data_select()`, `select` can also be a named character +#' vector. In this case, the names are used to rename the columns in the +#' output data frame. See 'Examples'. +#' #' Note that it is possible to either pass an entire select helper or only the #' pattern inside a select helper as a function argument: #' @@ -124,6 +128,9 @@ #' # find numeric with mean > 3.5 #' numeric_mean_35 <- function(x) is.numeric(x) && mean(x, na.rm = TRUE) > 3.5 #' extract_column_names(iris, numeric_mean_35) +#' +#' # rename returned columns for "data_select()" +#' head(data_select(mtcars, c(`Miles per Gallon` = "mpg", Cylinders = "cyl"))) #' @export extract_column_names <- function(data, select = NULL, diff --git a/R/row_means.R b/R/row_means.R index fdcaa49fd..4d2876c6a 100644 --- a/R/row_means.R +++ b/R/row_means.R @@ -110,7 +110,9 @@ row_means <- function(data, } # proceed here if min_valid is not NULL - if (!is.null(min_valid)) { + if (is.null(min_valid)) { + out <- rowMeans(data, na.rm = remove_na) + } else { # is 'min_valid' indicating a proportion? decimals <- min_valid %% 1 if (decimals != 0) { @@ -126,8 +128,6 @@ row_means <- function(data, to_na <- rowSums(is.na(data)) > ncol(data) - min_valid out <- rowMeans(data, na.rm = TRUE) out[to_na] <- NA - } else { - out <- rowMeans(data, na.rm = remove_na) } # round, if requested diff --git a/R/select_nse.R b/R/select_nse.R index eb1c6012a..8f9eba096 100644 --- a/R/select_nse.R +++ b/R/select_nse.R @@ -2,7 +2,8 @@ # https://github.com/nathaneastwood/poorman/blob/master/R/select_positions.R .select_nse <- function(select, data, exclude, ignore_case, regex = FALSE, - remove_group_var = FALSE, verbose = FALSE) { + remove_group_var = FALSE, allow_rename = FALSE, + verbose = FALSE) { .check_data(data) columns <- colnames(data) @@ -75,6 +76,30 @@ out <- setdiff(out, grp_vars) } + # for named character vectors, we offer the service to rename the columns + if (allow_rename && typeof(expr_select) == "language") { + # safe evaluation of the expression, to get the named vector from "select" + new_names <- tryCatch(eval(expr_select), error = function(e) NULL) + # check if we really have a named vector + if (!is.null(new_names) && !is.null(names(new_names))) { + # if so, copy names + all_names <- names(new_names) + # if some of the elements don't have a name, we set the value as name + names(new_names)[!nzchar(all_names)] <- new_names[!nzchar(all_names)] + # after inclusion and exclusion, the original values in "select" + # may have changed, so we check that we only add names of valid values + out <- stats::setNames(out, names(new_names)[new_names %in% out]) + # check if we have any duplicated names, and if so, give an error + if (anyDuplicated(names(out)) > 0) { + insight::format_error(paste0( + "Following names are duplicated after renaming: ", + text_concatenate(names(out)[duplicated(names(out))], enclose = "`"), + ". Using duplicated names is no good practice and therefore discouraged. Please provide unique names." + )) + } + } + } + out } diff --git a/R/to_factor.R b/R/to_factor.R index 8fa46d404..f0c59ed61 100644 --- a/R/to_factor.R +++ b/R/to_factor.R @@ -115,7 +115,7 @@ to_factor.data.frame <- function(x, # drop factors, when append is not FALSE select <- colnames(x[select])[!vapply(x[select], is.factor, FUN.VALUE = logical(1L))] # process arguments - args <- .process_append( + my_args <- .process_append( x, select, append, @@ -125,8 +125,8 @@ to_factor.data.frame <- function(x, preserve_value_labels = TRUE ) # update processed arguments - x <- args$x - select <- args$select + x <- my_args$x + select <- my_args$select } x[select] <- lapply(x[select], to_factor, verbose = verbose, ...) diff --git a/R/unnormalize.R b/R/unnormalize.R index ea41dcd61..694af9d8f 100644 --- a/R/unnormalize.R +++ b/R/unnormalize.R @@ -24,7 +24,12 @@ unnormalize.numeric <- function(x, verbose = TRUE, ...) { dots <- match.call(expand.dots = FALSE)[["..."]] grp_attr_dw <- eval(dots$grp_attr_dw, envir = parent.frame(1L)) - if (!is.null(grp_attr_dw)) { + if (is.null(grp_attr_dw)) { + include_bounds <- attr(x, "include_bounds") + min_value <- attr(x, "min_value") + range_difference <- attr(x, "range_difference") + to_range <- attr(x, "to_range") + } else { names(grp_attr_dw) <- gsub(".*\\.", "", names(grp_attr_dw)) include_bounds <- grp_attr_dw["include_bounds"] min_value <- grp_attr_dw["min_value"] @@ -33,11 +38,6 @@ unnormalize.numeric <- function(x, verbose = TRUE, ...) { if (is.na(to_range)) { to_range <- NULL } - } else { - include_bounds <- attr(x, "include_bounds") - min_value <- attr(x, "min_value") - range_difference <- attr(x, "range_difference") - to_range <- attr(x, "to_range") } if (is.null(min_value) || is.null(range_difference)) { @@ -78,10 +78,10 @@ unnormalize.data.frame <- function(x, dots <- match.call(expand.dots = FALSE)[["..."]] - if (!is.null(dots$grp_attr_dw)) { - grp_attr_dw <- eval(dots$grp_attr_dw, envir = parent.frame(1L)) - } else { + if (is.null(dots$grp_attr_dw)) { grp_attr_dw <- NULL + } else { + grp_attr_dw <- eval(dots$grp_attr_dw, envir = parent.frame(1L)) } for (i in select) { diff --git a/man/extract_column_names.Rd b/man/extract_column_names.Rd index b8c646789..6805d9569 100644 --- a/man/extract_column_names.Rd +++ b/man/extract_column_names.Rd @@ -125,6 +125,10 @@ columns. match a certain search pattern, while \code{data_select()} returns the found data. } \details{ +Specifically for \code{data_select()}, \code{select} can also be a named character +vector. In this case, the names are used to rename the columns in the +output data frame. See 'Examples'. + Note that it is possible to either pass an entire select helper or only the pattern inside a select helper as a function argument: @@ -182,6 +186,9 @@ extract_column_names(iris, starts_with("Sepal"), exclude = contains("Width")) # find numeric with mean > 3.5 numeric_mean_35 <- function(x) is.numeric(x) && mean(x, na.rm = TRUE) > 3.5 extract_column_names(iris, numeric_mean_35) + +# rename returned columns for "data_select()" +head(data_select(mtcars, c(`Miles per Gallon` = "mpg", Cylinders = "cyl"))) } \seealso{ \itemize{ diff --git a/tests/testthat/test-data_select.R b/tests/testthat/test-data_select.R index 2557a1f7b..6b78ec602 100644 --- a/tests/testthat/test-data_select.R +++ b/tests/testthat/test-data_select.R @@ -106,22 +106,22 @@ test_that("data_select works with user-defined select-functions", { test_that("data_select works with negated select-functions", { expect_identical( data_select(iris, -is.numeric()), - iris[sapply(iris, function(i) !is.numeric(i))] + iris[sapply(iris, function(i) !is.numeric(i))] # nolint ) expect_identical( data_select(iris, -is.numeric), - iris[sapply(iris, function(i) !is.numeric(i))] + iris[sapply(iris, function(i) !is.numeric(i))] # nolint ) expect_identical( data_select(iris, -is.factor()), - iris[sapply(iris, function(i) !is.factor(i))] + iris[sapply(iris, function(i) !is.factor(i))] # nolint ) expect_identical( data_select(iris, -is.factor), - iris[sapply(iris, function(i) !is.factor(i))] + iris[sapply(iris, function(i) !is.factor(i))] # nolint ) expect_identical(data_select(iris, -is.logical), iris) @@ -402,3 +402,43 @@ test_that("old solution still works", { c("Sepal.Length", "Sepal.Width") ) }) + +test_that("data_select renames variables on the fly", { + data(mtcars) + expect_named( + data_select(mtcars, c(new = "mpg", old = "cyl", hoho = "wt")), + c("new", "old", "hoho") + ) + expect_named( + data_select(mtcars, c(new = "mpg", "cyl", hoho = "wt")), + c("new", "cyl", "hoho") + ) + expect_named( + data_select(mtcars, c("mpg", "cyl", "wt")), + c("mpg", "cyl", "wt") + ) + # don't fail for non-existing columns + expect_named( + data_select(mtcars, c(new = "mpg", "cyl", hoho = "wt", test = "grea")), + c("new", "cyl", "hoho") + ) + # check that excluded variables don't cause troubles + expect_named( + data_select(mtcars, c(new = "mpg", "cyl", hoho = "wt"), exclude = "wt"), + c("new", "cyl") + ) + # error when names are not unique + expect_error( + data_select(mtcars, c(new = "mpg", old = "cyl", new = "wt")), # nolint + regex = "Following names are duplicated" + ) + expect_error( + data_select(mtcars, c(new = "mpg", "cyl", cyl = "wt")), # nolint + regex = "Following names are duplicated" + ) + # when new name is used in exclude, it should be ignored + expect_named( + data_select(mtcars, c(drat = "mpg"), exclude = "drat"), + "drat" + ) +})