Allow glue-styled pattern for data_rename() (#563)

* Allow curl-styled pattern for `data_rename()` * add test * news * lintr * curl -> glue * allow alias * add options * typo * address comments * remove # in comment * fix
easystats · Nov 27, 2024 · b727cb6 · b727cb6
1 parent 9c8fcc0
commit b727cb6
Show file tree

Hide file tree

Showing 18 changed files with 401 additions and 92 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Type: Package
 Package: datawizard
 Title: Easy Data Wrangling and Statistical Transformations
-Version: 0.13.0.13
+Version: 0.13.0.14
 Authors@R: c(
     person("Indrajeet", "Patil", , "[email protected]", role = "aut",
            comment = c(ORCID = "0000-0003-1995-6531")),

diff --git a/NEWS.md b/NEWS.md
@@ -19,8 +19,11 @@ CHANGES
 * `data_read()` no longer shows warning about forthcoming breaking changes
   in upstream packages when reading `.RData` files.
 
-* `data_modify()` now recognizes `n()`, for example to create an index for data groups
-  with `1:n()` (#535).
+* `data_modify()` now recognizes `n()`, for example to create an index for data
+  groups with `1:n()` (#535).
+
+* The `replacement` argument in `data_rename()` now supports glue-styled
+  tokens (#563).
 
 BUG FIXES
 

diff --git a/R/data_rename.R b/R/data_rename.R
@@ -10,18 +10,43 @@
 #'   pipe-workflow.
 #'
 #' @param data A data frame, or an object that can be coerced to a data frame.
-#' @param pattern Character vector. For `data_rename()`, indicates columns that
-#'   should be selected for renaming. Can be `NULL` (in which case all columns
-#'   are selected). For `data_addprefix()` or `data_addsuffix()`, a character
-#'   string, which will be added as prefix or suffix to the column names. For
-#'   `data_rename()`, `pattern` can also be a named vector. In this case, names
-#'   are used as values for the `replacement` argument (i.e. `pattern` can be a
-#'   character vector using `<new name> = "<old name>"` and argument `replacement`
-#'   will be ignored then).
-#' @param replacement Character vector. Indicates the new name of the columns
-#'   selected in `pattern`. Can be `NULL` (in which case column are numbered
-#'   in sequential order). If not `NULL`, `pattern` and `replacement` must be
-#'   of the same length. If `pattern` is a named vector, `replacement` is ignored.
+#' @param pattern Character vector.
+#'   - For `data_addprefix()` or `data_addsuffix()`, a character string, which
+#'     will be added as prefix or suffix to the column names.
+#'   - For `data_rename()`, indicates columns that should be selected for
+#'     renaming. Can be `NULL` (in which case all columns are selected).
+#'     `pattern` can also be a named vector. In this case, names are used as
+#'     values for the `replacement` argument (i.e. `pattern` can be a character
+#'     vector using `<new name> = "<old name>"` and argument `replacement` will
+#'     be ignored then).
+#' @param replacement Character vector. Can be one of the following:
+#'   - A character vector that indicates the new names of the columns selected
+#'     in `pattern`. `pattern` and `replacement` must be of the same length.
+#'   - `NULL`, in which case columns are numbered in sequential order.
+#'   - A string (i.e. character vector of length 1) with a "glue" styled pattern.
+#'     Currently supported tokens are:
+#'     - `{col}` which will be replaced by the column name, i.e. the
+#'       corresponding value in `pattern`.
+#'     - `{n}` which will be replaced by the position (1, 2, ...) of the column
+#'       among the columns that are renamed.
+#'     - `{letter}` will be replaced by alphabetical letters in sequential order.
+#'       If more than 26 letters are required, letters are repeated, but have
+#'       sequential numeric indices (e.g., `a1` to `z1`, followed by `a2` to `z2`).
+#'     - Finally, the name of a user-defined object that is available in the
+#'       environment can be used. Note that the object's name is not allowed to
+#'       be one of the pre-defined tokens, `"col"`, `"n"` and `"letter"`.
+#'
+#'     An example for the use of tokens is...
+#'     ```r
+#'     data_rename(
+#'       mtcars,
+#'       pattern = c("am", "vs"),
+#'       replacement = "new_name_from_{col}"
+#'     )
+#'     ```
+#'     ... which would return new column names `new_name_from_am` and
+#'     `new_name_from_vs`. See 'Examples'.
+#'
+#' If `pattern` is a named vector, `replacement` is ignored.
 #' @param rows Vector of row names.
 #' @param safe Do not throw error if for instance the variable to be
 #'   renamed/removed doesn't exist.
@@ -45,13 +70,26 @@
 #'
 #' # Change all
 #' head(data_rename(iris, replacement = paste0("Var", 1:5)))
+#'
+#' # Use glue-styled patterns
+#' head(data_rename(mtcars[1:3], c("mpg", "cyl", "disp"), "formerly_{col}"))
+#' head(data_rename(mtcars[1:3], c("mpg", "cyl", "disp"), "{col}_is_column_{n}"))
+#' head(data_rename(mtcars[1:3], c("mpg", "cyl", "disp"), "new_{letter}"))
+#'
+#' # User-defined glue-styled patterns from objects in environment
+#' x <- c("hi", "there", "!")
+#' head(data_rename(mtcars[1:3], c("mpg", "cyl", "disp"), "col_{x}"))
 #' @seealso
-#' - Functions to rename stuff: [data_rename()], [data_rename_rows()], [data_addprefix()], [data_addsuffix()]
-#' - Functions to reorder or remove columns: [data_reorder()], [data_relocate()], [data_remove()]
-#' - Functions to reshape, pivot or rotate data frames: [data_to_long()], [data_to_wide()], [data_rotate()]
+#' - Functions to rename stuff: [data_rename()], [data_rename_rows()],
+#'   [data_addprefix()], [data_addsuffix()]
+#' - Functions to reorder or remove columns: [data_reorder()], [data_relocate()],
+#'   [data_remove()]
+#' - Functions to reshape, pivot or rotate data frames: [data_to_long()],
+#'   [data_to_wide()], [data_rotate()]
 #' - Functions to recode data: [rescale()], [reverse()], [categorize()],
 #'   [recode_values()], [slide()]
-#' - Functions to standardize, normalize, rank-transform: [center()], [standardize()], [normalize()], [ranktransform()], [winsorize()]
+#' - Functions to standardize, normalize, rank-transform: [center()], [standardize()],
+#'   [normalize()], [ranktransform()], [winsorize()]
 #' - Split and merge data frames: [data_partition()], [data_merge()]
 #' - Functions to find or select columns: [data_select()], [extract_column_names()]
 #' - Functions to filter rows: [data_match()], [data_filter()]
@@ -122,14 +160,17 @@ data_rename <- function(data,
     }
   }
 
+  # check if we have "glue" styled replacement-string
+  glue_style <- length(replacement) == 1 && grepl("{", replacement, fixed = TRUE)
+
   if (length(replacement) > length(pattern) && verbose) {
     insight::format_alert(
       paste0(
         "There are more names in `replacement` than in `pattern`. The last ",
         length(replacement) - length(pattern), " names of `replacement` are not used."
       )
     )
-  } else if (length(replacement) < length(pattern) && verbose) {
+  } else if (length(replacement) < length(pattern) && verbose && !glue_style) {
     insight::format_alert(
       paste0(
         "There are more names in `pattern` than in `replacement`. The last ",
@@ -138,6 +179,11 @@ data_rename <- function(data,
     )
   }
 
+  # if we have glue-styled replacement-string, create replacement pattern now
+  if (glue_style) {
+    replacement <- .glue_replacement(pattern, replacement)
+  }
+
   for (i in seq_along(pattern)) {
     if (!is.na(replacement[i])) {
       data <- .data_rename(data, pattern[i], replacement[i], safe, verbose)
@@ -167,6 +213,84 @@ data_rename <- function(data,
 }
 
 
+.glue_replacement <- function(pattern, replacement) {
+  # Expands a single "glue"-styled `replacement` string into one new column
+  # name per element of `pattern`, and returns these names as a character
+  # vector of the same length as `pattern`. The following tokens are accepted:
+  # - {col}: replaced by the name of the column (the value in `pattern`)
+  # - {letter}: replaced by lower-case letters, in sequential order
+  # - {n}: replaced by the position of the column among those to be renamed
+  # - {<object>}: any other token is looked up as an object via `.dynEval()`;
+  #   the object must provide exactly one value per column to rename
+  n_columns <- length(pattern)
+  out <- rep_len("", n_columns)
+
+  # for alphabetical letters, we append a numeric index if we have more than
+  # 26 columns to rename ("a1" to "z1", followed by "a2" to "z2", ...)
+  if (n_columns > 26) {
+    n_rounds <- ceiling(n_columns / 26)
+    long_letters <- paste0(
+      rep.int(letters, times = n_rounds),
+      rep(seq_len(n_rounds), each = 26)
+    )
+  } else {
+    long_letters <- letters
+  }
+  long_letters <- long_letters[seq_len(n_columns)]
+
+  for (i in seq_along(out)) {
+    out[i] <- replacement
+    # replace the pre-defined tokens. We use fixed (literal) matching, so that
+    # *every* occurrence of a token is replaced (a greedy regular expression
+    # like "(.*)(\\{col\\})(.*)" only replaces the last one), and so that
+    # special characters in column names, such as backslashes, are inserted
+    # as they are instead of being interpreted as back-references.
+    out[i] <- gsub("{col}", pattern[i], out[i], fixed = TRUE)
+    out[i] <- gsub("{n}", as.character(i), out[i], fixed = TRUE)
+    out[i] <- gsub("{letter}", long_letters[i], out[i], fixed = TRUE)
+    # extract all remaining, non-standard tokens. These are treated as names
+    # of user-defined objects, which must contain one value for each column
+    # that is renamed. Repeated tokens only need to be processed once.
+    tokens <- unique(unlist(
+      regmatches(out[i], gregexpr("\\{([^}]*)\\}", out[i])),
+      use.names = FALSE
+    ))
+    for (token in tokens) {
+      # evaluate token-object from the environment
+      values <- .dynEval(
+        str2lang(gsub("\\{(.*)\\}", "\\1", token)),
+        ifnotfound = insight::format_error(paste0(
+          "The object `", token, "` was not found. Please check if it really exists."
+        ))
+      )
+      # check for correct length
+      if (length(values) != length(pattern)) {
+        insight::format_error(paste0(
+          "The number of values provided in `", token, "` (", length(values),
+          " values) do not match the number of columns to rename (",
+          length(pattern), " columns)."
+        ))
+      }
+      # replace token with the i-th value from the object
+      if (length(values) > 0) {
+        out[i] <- gsub(token, values[i], out[i], fixed = TRUE)
+      }
+    }
+  }
+  out
+}
+
+
 # Row.names ----------------------------------------------------------------
 
 #' @rdname data_rename

diff --git a/man/categorize.Rd b/man/categorize.Rd
diff --git a/man/data_match.Rd b/man/data_match.Rd
diff --git a/man/data_merge.Rd b/man/data_merge.Rd
diff --git a/man/data_partition.Rd b/man/data_partition.Rd