Allow named character vector in select, to rename variables (#512)

* Allow named character vector in `select`, to rename variables * typo * tests * Update R/select_nse.R Co-authored-by: Etienne Bacher <[email protected]> * Update NEWS.md Co-authored-by: Etienne Bacher <[email protected]> * lintr * add allow_rename * fix tests * error on duplicated names * Update tests/testthat/test-data_select.R Co-authored-by: Etienne Bacher <[email protected]> * Update tests/testthat/test-data_select.R Co-authored-by: Etienne Bacher <[email protected]> * Update R/select_nse.R Co-authored-by: Etienne Bacher <[email protected]> * add test --------- Co-authored-by: Etienne Bacher <[email protected]>
easystats · Jun 8, 2024 · 7db36d6 · 7db36d6
1 parent 1fafecd
commit 7db36d6
Show file tree

Hide file tree

Showing 10 changed files with 111 additions and 21 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Type: Package
 Package: datawizard
 Title: Easy Data Wrangling and Statistical Transformations
-Version: 0.11.0.1
+Version: 0.11.0.2
 Authors@R: c(
     person("Indrajeet", "Patil", , "[email protected]", role = "aut",
            comment = c(ORCID = "0000-0003-1995-6531", Twitter = "@patilindrajeets")),

diff --git a/NEWS.md b/NEWS.md
@@ -1,5 +1,10 @@
 # datawizard 0.11.0.1
 
+## Changes
+
+* `data_select()` can directly rename selected variables when a named vector
+  is provided in `select`, e.g. `data_select(mtcars, c(new1 = "mpg", new2 = "cyl"))`.
+
 # datawizard 0.11.0
 
 BREAKING CHANGES

diff --git a/R/data_select.R b/R/data_select.R
@@ -13,6 +13,7 @@ data_select <- function(data,
     exclude,
     ignore_case = ignore_case,
     regex = regex,
+    allow_rename = TRUE,
     verbose = FALSE
   )
 
@@ -28,6 +29,11 @@ data_select <- function(data,
 
   out <- data[columns]
 
+  # for named character vectors, we offer the service to directly rename the columns
+  if (!is.null(names(columns))) {
+    colnames(out) <- names(columns)
+  }
+
   # add back attributes
   out <- .replace_attrs(out, a)
   out

diff --git a/R/extract_column_names.R b/R/extract_column_names.R
@@ -63,6 +63,10 @@
 #'
 #' @details
 #'
+#' Specifically for `data_select()`, `select` can also be a named character
+#' vector. In this case, the names are used to rename the columns in the
+#' output data frame. See 'Examples'.
+#'
 #' Note that it is possible to either pass an entire select helper or only the
 #' pattern inside a select helper as a function argument:
 #'
@@ -124,6 +128,9 @@
 #' # find numeric with mean > 3.5
 #' numeric_mean_35 <- function(x) is.numeric(x) && mean(x, na.rm = TRUE) > 3.5
 #' extract_column_names(iris, numeric_mean_35)
+#'
+#' # rename returned columns for "data_select()"
+#' head(data_select(mtcars, c(`Miles per Gallon` = "mpg", Cylinders = "cyl")))
 #' @export
 extract_column_names <- function(data,
                                  select = NULL,

diff --git a/R/row_means.R b/R/row_means.R
@@ -110,7 +110,9 @@ row_means <- function(data,
   }
 
   # proceed here if min_valid is not NULL
-  if (!is.null(min_valid)) {
+  if (is.null(min_valid)) {
+    out <- rowMeans(data, na.rm = remove_na)
+  } else {
     # is 'min_valid' indicating a proportion?
     decimals <- min_valid %% 1
     if (decimals != 0) {
@@ -126,8 +128,6 @@ row_means <- function(data,
     to_na <- rowSums(is.na(data)) > ncol(data) - min_valid
     out <- rowMeans(data, na.rm = TRUE)
     out[to_na] <- NA
-  } else {
-    out <- rowMeans(data, na.rm = remove_na)
   }
 
   # round, if requested

diff --git a/R/select_nse.R b/R/select_nse.R
@@ -2,7 +2,8 @@
 # https://github.com/nathaneastwood/poorman/blob/master/R/select_positions.R
 
 .select_nse <- function(select, data, exclude, ignore_case, regex = FALSE,
-                        remove_group_var = FALSE, verbose = FALSE) {
+                        remove_group_var = FALSE, allow_rename = FALSE,
+                        verbose = FALSE) {
   .check_data(data)
   columns <- colnames(data)
 
@@ -75,6 +76,30 @@
     out <- setdiff(out, grp_vars)
   }
 
+  # for named character vectors, we offer the service to rename the columns
+  if (allow_rename && typeof(expr_select) == "language") {
+    # safe evaluation of the expression, to get the named vector from "select"
+    new_names <- tryCatch(eval(expr_select), error = function(e) NULL)
+    # check if we really have a named vector
+    if (!is.null(new_names) && !is.null(names(new_names))) {
+      # if so, copy names
+      all_names <- names(new_names)
+      # if some of the elements don't have a name, we set the value as name
+      names(new_names)[!nzchar(all_names)] <- new_names[!nzchar(all_names)]
+      # after inclusion and exclusion, the original values in "select"
+      # may have changed, so we check that we only add names of valid values
+      out <- stats::setNames(out, names(new_names)[new_names %in% out])
+      # check if we have any duplicated names, and if so, give an error
+      if (anyDuplicated(names(out)) > 0) {
+        insight::format_error(paste0(
+          "Following names are duplicated after renaming: ",
+          text_concatenate(names(out)[duplicated(names(out))], enclose = "`"),
+          ". Using duplicated names is no good practice and therefore discouraged. Please provide unique names."
+        ))
+      }
+    }
+  }
+
   out
 }
 

diff --git a/R/to_factor.R b/R/to_factor.R
@@ -115,7 +115,7 @@ to_factor.data.frame <- function(x,
     # drop factors, when append is not FALSE
     select <- colnames(x[select])[!vapply(x[select], is.factor, FUN.VALUE = logical(1L))]
     # process arguments
-    args <- .process_append(
+    my_args <- .process_append(
       x,
       select,
       append,
@@ -125,8 +125,8 @@ to_factor.data.frame <- function(x,
       preserve_value_labels = TRUE
     )
     # update processed arguments
-    x <- args$x
-    select <- args$select
+    x <- my_args$x
+    select <- my_args$select
   }
 
   x[select] <- lapply(x[select], to_factor, verbose = verbose, ...)

diff --git a/R/unnormalize.R b/R/unnormalize.R
@@ -24,7 +24,12 @@ unnormalize.numeric <- function(x, verbose = TRUE, ...) {
   dots <- match.call(expand.dots = FALSE)[["..."]]
   grp_attr_dw <- eval(dots$grp_attr_dw, envir = parent.frame(1L))
 
-  if (!is.null(grp_attr_dw)) {
+  if (is.null(grp_attr_dw)) {
+    include_bounds <- attr(x, "include_bounds")
+    min_value <- attr(x, "min_value")
+    range_difference <- attr(x, "range_difference")
+    to_range <- attr(x, "to_range")
+  } else {
     names(grp_attr_dw) <- gsub(".*\\.", "", names(grp_attr_dw))
     include_bounds <- grp_attr_dw["include_bounds"]
     min_value <- grp_attr_dw["min_value"]
@@ -33,11 +38,6 @@ unnormalize.numeric <- function(x, verbose = TRUE, ...) {
     if (is.na(to_range)) {
       to_range <- NULL
     }
-  } else {
-    include_bounds <- attr(x, "include_bounds")
-    min_value <- attr(x, "min_value")
-    range_difference <- attr(x, "range_difference")
-    to_range <- attr(x, "to_range")
   }
 
   if (is.null(min_value) || is.null(range_difference)) {
@@ -78,10 +78,10 @@ unnormalize.data.frame <- function(x,
 
   dots <- match.call(expand.dots = FALSE)[["..."]]
 
-  if (!is.null(dots$grp_attr_dw)) {
-    grp_attr_dw <- eval(dots$grp_attr_dw, envir = parent.frame(1L))
-  } else {
+  if (is.null(dots$grp_attr_dw)) {
     grp_attr_dw <- NULL
+  } else {
+    grp_attr_dw <- eval(dots$grp_attr_dw, envir = parent.frame(1L))
   }
 
   for (i in select) {

diff --git a/man/extract_column_names.Rd b/man/extract_column_names.Rd
diff --git a/tests/testthat/test-data_select.R b/tests/testthat/test-data_select.R
@@ -106,22 +106,22 @@ test_that("data_select works with user-defined select-functions", {
 test_that("data_select works with negated select-functions", {
   expect_identical(
     data_select(iris, -is.numeric()),
-    iris[sapply(iris, function(i) !is.numeric(i))]
+    iris[sapply(iris, function(i) !is.numeric(i))] # nolint
   )
 
   expect_identical(
     data_select(iris, -is.numeric),
-    iris[sapply(iris, function(i) !is.numeric(i))]
+    iris[sapply(iris, function(i) !is.numeric(i))] # nolint
   )
 
   expect_identical(
     data_select(iris, -is.factor()),
-    iris[sapply(iris, function(i) !is.factor(i))]
+    iris[sapply(iris, function(i) !is.factor(i))] # nolint
   )
 
   expect_identical(
     data_select(iris, -is.factor),
-    iris[sapply(iris, function(i) !is.factor(i))]
+    iris[sapply(iris, function(i) !is.factor(i))] # nolint
   )
 
   expect_identical(data_select(iris, -is.logical), iris)
@@ -402,3 +402,43 @@ test_that("old solution still works", {
     c("Sepal.Length", "Sepal.Width")
   )
 })
+
+test_that("data_select renames variables on the fly", {
+  data(mtcars)
+  expect_named(
+    data_select(mtcars, c(new = "mpg", old = "cyl", hoho = "wt")),
+    c("new", "old", "hoho")
+  )
+  expect_named(
+    data_select(mtcars, c(new = "mpg", "cyl", hoho = "wt")),
+    c("new", "cyl", "hoho")
+  )
+  expect_named(
+    data_select(mtcars, c("mpg", "cyl", "wt")),
+    c("mpg", "cyl", "wt")
+  )
+  # don't fail for non-existing columns
+  expect_named(
+    data_select(mtcars, c(new = "mpg", "cyl", hoho = "wt", test = "grea")),
+    c("new", "cyl", "hoho")
+  )
+  # check that excluded variables don't cause troubles
+  expect_named(
+    data_select(mtcars, c(new = "mpg", "cyl", hoho = "wt"), exclude = "wt"),
+    c("new", "cyl")
+  )
+  # error when names are not unique
+  expect_error(
+    data_select(mtcars, c(new = "mpg", old = "cyl", new = "wt")), # nolint
+    regex = "Following names are duplicated"
+  )
+  expect_error(
+    data_select(mtcars, c(new = "mpg", "cyl", cyl = "wt")), # nolint
+    regex = "Following names are duplicated"
+  )
+  # when new name is used in exclude, it should be ignored
+  expect_named(
+    data_select(mtcars, c(drat = "mpg"), exclude = "drat"),
+    "drat"
+  )
+})