Skip to content

Commit

Permalink
Allow named character vector in select, to rename variables (#512)
Browse files Browse the repository at this point in the history
* Allow named character vector in `select`, to rename variables

* typo

* tests

* Update R/select_nse.R

Co-authored-by: Etienne Bacher <[email protected]>

* Update NEWS.md

Co-authored-by: Etienne Bacher <[email protected]>

* lintr

* add allow_rename

* fix tests

* error on duplicated names

* Update tests/testthat/test-data_select.R

Co-authored-by: Etienne Bacher <[email protected]>

* Update tests/testthat/test-data_select.R

Co-authored-by: Etienne Bacher <[email protected]>

* Update R/select_nse.R

Co-authored-by: Etienne Bacher <[email protected]>

* add test

---------

Co-authored-by: Etienne Bacher <[email protected]>
  • Loading branch information
strengejacke and etiennebacher authored Jun 8, 2024
1 parent 1fafecd commit 7db36d6
Show file tree
Hide file tree
Showing 10 changed files with 111 additions and 21 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Type: Package
Package: datawizard
Title: Easy Data Wrangling and Statistical Transformations
Version: 0.11.0.1
Version: 0.11.0.2
Authors@R: c(
person("Indrajeet", "Patil", , "[email protected]", role = "aut",
comment = c(ORCID = "0000-0003-1995-6531", Twitter = "@patilindrajeets")),
Expand Down
5 changes: 5 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,10 @@
# datawizard 0.11.0.1

## Changes

* `data_select()` can directly rename selected variables when a named vector
is provided in `select`, e.g. `data_select(mtcars, c(new1 = "mpg", new2 = "cyl"))`.

# datawizard 0.11.0

BREAKING CHANGES
Expand Down
6 changes: 6 additions & 0 deletions R/data_select.R
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ data_select <- function(data,
exclude,
ignore_case = ignore_case,
regex = regex,
allow_rename = TRUE,
verbose = FALSE
)

Expand All @@ -28,6 +29,11 @@ data_select <- function(data,

out <- data[columns]

# for named character vectors, we offer the service to directly rename the columns
if (!is.null(names(columns))) {
colnames(out) <- names(columns)
}

# add back attributes
out <- .replace_attrs(out, a)
out
Expand Down
7 changes: 7 additions & 0 deletions R/extract_column_names.R
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,10 @@
#'
#' @details
#'
#' Specifically for `data_select()`, `select` can also be a named character
#' vector. In this case, the names are used to rename the columns in the
#' output data frame. See 'Examples'.
#'
#' Note that it is possible to either pass an entire select helper or only the
#' pattern inside a select helper as a function argument:
#'
Expand Down Expand Up @@ -124,6 +128,9 @@
#' # find numeric with mean > 3.5
#' numeric_mean_35 <- function(x) is.numeric(x) && mean(x, na.rm = TRUE) > 3.5
#' extract_column_names(iris, numeric_mean_35)
#'
#' # rename returned columns for "data_select()"
#' head(data_select(mtcars, c(`Miles per Gallon` = "mpg", Cylinders = "cyl")))
#' @export
extract_column_names <- function(data,
select = NULL,
Expand Down
6 changes: 3 additions & 3 deletions R/row_means.R
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,9 @@ row_means <- function(data,
}

# proceed here if min_valid is not NULL
if (!is.null(min_valid)) {
if (is.null(min_valid)) {
out <- rowMeans(data, na.rm = remove_na)
} else {
# is 'min_valid' indicating a proportion?
decimals <- min_valid %% 1
if (decimals != 0) {
Expand All @@ -126,8 +128,6 @@ row_means <- function(data,
to_na <- rowSums(is.na(data)) > ncol(data) - min_valid
out <- rowMeans(data, na.rm = TRUE)
out[to_na] <- NA
} else {
out <- rowMeans(data, na.rm = remove_na)
}

# round, if requested
Expand Down
27 changes: 26 additions & 1 deletion R/select_nse.R
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,8 @@
# https://github.com/nathaneastwood/poorman/blob/master/R/select_positions.R

.select_nse <- function(select, data, exclude, ignore_case, regex = FALSE,
remove_group_var = FALSE, verbose = FALSE) {
remove_group_var = FALSE, allow_rename = FALSE,
verbose = FALSE) {
.check_data(data)
columns <- colnames(data)

Expand Down Expand Up @@ -75,6 +76,30 @@
out <- setdiff(out, grp_vars)
}

# for named character vectors, we offer the service to rename the columns
if (allow_rename && typeof(expr_select) == "language") {
# safe evaluation of the expression, to get the named vector from "select"
new_names <- tryCatch(eval(expr_select), error = function(e) NULL)
# check if we really have a named vector
if (!is.null(new_names) && !is.null(names(new_names))) {
# if so, copy names
all_names <- names(new_names)
# if some of the elements don't have a name, we set the value as name
names(new_names)[!nzchar(all_names)] <- new_names[!nzchar(all_names)]
# after inclusion and exclusion, the original values in "select"
# may have changed, so we check that we only add names of valid values
out <- stats::setNames(out, names(new_names)[new_names %in% out])
# check if we have any duplicated names, and if so, give an error
if (anyDuplicated(names(out)) > 0) {
insight::format_error(paste0(
"Following names are duplicated after renaming: ",
text_concatenate(names(out)[duplicated(names(out))], enclose = "`"),
". Using duplicated names is no good practice and therefore discouraged. Please provide unique names."
))
}
}
}

out
}

Expand Down
6 changes: 3 additions & 3 deletions R/to_factor.R
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,7 @@ to_factor.data.frame <- function(x,
# drop factors, when append is not FALSE
select <- colnames(x[select])[!vapply(x[select], is.factor, FUN.VALUE = logical(1L))]
# process arguments
args <- .process_append(
my_args <- .process_append(
x,
select,
append,
Expand All @@ -125,8 +125,8 @@ to_factor.data.frame <- function(x,
preserve_value_labels = TRUE
)
# update processed arguments
x <- args$x
select <- args$select
x <- my_args$x
select <- my_args$select
}

x[select] <- lapply(x[select], to_factor, verbose = verbose, ...)
Expand Down
18 changes: 9 additions & 9 deletions R/unnormalize.R
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,12 @@ unnormalize.numeric <- function(x, verbose = TRUE, ...) {
dots <- match.call(expand.dots = FALSE)[["..."]]
grp_attr_dw <- eval(dots$grp_attr_dw, envir = parent.frame(1L))

if (!is.null(grp_attr_dw)) {
if (is.null(grp_attr_dw)) {
include_bounds <- attr(x, "include_bounds")
min_value <- attr(x, "min_value")
range_difference <- attr(x, "range_difference")
to_range <- attr(x, "to_range")
} else {
names(grp_attr_dw) <- gsub(".*\\.", "", names(grp_attr_dw))
include_bounds <- grp_attr_dw["include_bounds"]
min_value <- grp_attr_dw["min_value"]
Expand All @@ -33,11 +38,6 @@ unnormalize.numeric <- function(x, verbose = TRUE, ...) {
if (is.na(to_range)) {
to_range <- NULL
}
} else {
include_bounds <- attr(x, "include_bounds")
min_value <- attr(x, "min_value")
range_difference <- attr(x, "range_difference")
to_range <- attr(x, "to_range")
}

if (is.null(min_value) || is.null(range_difference)) {
Expand Down Expand Up @@ -78,10 +78,10 @@ unnormalize.data.frame <- function(x,

dots <- match.call(expand.dots = FALSE)[["..."]]

if (!is.null(dots$grp_attr_dw)) {
grp_attr_dw <- eval(dots$grp_attr_dw, envir = parent.frame(1L))
} else {
if (is.null(dots$grp_attr_dw)) {
grp_attr_dw <- NULL
} else {
grp_attr_dw <- eval(dots$grp_attr_dw, envir = parent.frame(1L))
}

for (i in select) {
Expand Down
7 changes: 7 additions & 0 deletions man/extract_column_names.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

48 changes: 44 additions & 4 deletions tests/testthat/test-data_select.R
Original file line number Diff line number Diff line change
Expand Up @@ -106,22 +106,22 @@ test_that("data_select works with user-defined select-functions", {
test_that("data_select works with negated select-functions", {
expect_identical(
data_select(iris, -is.numeric()),
iris[sapply(iris, function(i) !is.numeric(i))]
iris[sapply(iris, function(i) !is.numeric(i))] # nolint
)

expect_identical(
data_select(iris, -is.numeric),
iris[sapply(iris, function(i) !is.numeric(i))]
iris[sapply(iris, function(i) !is.numeric(i))] # nolint
)

expect_identical(
data_select(iris, -is.factor()),
iris[sapply(iris, function(i) !is.factor(i))]
iris[sapply(iris, function(i) !is.factor(i))] # nolint
)

expect_identical(
data_select(iris, -is.factor),
iris[sapply(iris, function(i) !is.factor(i))]
iris[sapply(iris, function(i) !is.factor(i))] # nolint
)

expect_identical(data_select(iris, -is.logical), iris)
Expand Down Expand Up @@ -402,3 +402,43 @@ test_that("old solution still works", {
c("Sepal.Length", "Sepal.Width")
)
})

test_that("data_select renames variables on the fly", {
data(mtcars)
expect_named(
data_select(mtcars, c(new = "mpg", old = "cyl", hoho = "wt")),
c("new", "old", "hoho")
)
expect_named(
data_select(mtcars, c(new = "mpg", "cyl", hoho = "wt")),
c("new", "cyl", "hoho")
)
expect_named(
data_select(mtcars, c("mpg", "cyl", "wt")),
c("mpg", "cyl", "wt")
)
# don't fail for non-existing columns
expect_named(
data_select(mtcars, c(new = "mpg", "cyl", hoho = "wt", test = "grea")),
c("new", "cyl", "hoho")
)
# check that excluded variables don't cause troubles
expect_named(
data_select(mtcars, c(new = "mpg", "cyl", hoho = "wt"), exclude = "wt"),
c("new", "cyl")
)
# error when names are not unique
expect_error(
data_select(mtcars, c(new = "mpg", old = "cyl", new = "wt")), # nolint
regex = "Following names are duplicated"
)
expect_error(
data_select(mtcars, c(new = "mpg", "cyl", cyl = "wt")), # nolint
regex = "Following names are duplicated"
)
# when new name is used in exclude, it should be ignored
expect_named(
data_select(mtcars, c(drat = "mpg"), exclude = "drat"),
"drat"
)
})

0 comments on commit 7db36d6

Please sign in to comment.