Merge branch 'main' into rempsyc/issue454

easystats · Nov 22, 2024 · 357dbbc · 357dbbc
2 parents c3c1302 + 2741cdc
commit 357dbbc
Show file tree

Hide file tree

Showing 6 changed files with 114 additions and 8 deletions.
diff --git a/NEWS.md b/NEWS.md
@@ -23,6 +23,9 @@ CHANGES
 * `data_read()` no longer shows warning about forthcoming breaking changes
   in upstream packages when reading `.RData` files.
 
+* `data_modify()` now recognizes `n()`, for example to create an index for data groups
+  with `1:n()` (#535).
+
 BUG FIXES
 
 * `describe_distribution()` no longer errors if the sample was too sparse to compute

diff --git a/R/data_modify.R b/R/data_modify.R
@@ -22,6 +22,9 @@
 #'   character vector is provided, you may not add further elements to `...`.
 #' - Using `NULL` as right-hand side removes a variable from the data frame.
 #'   Example: `Petal.Width = NULL`.
+#' - For data frames (including grouped ones), the function `n()` can be used to count the
+#'   number of observations and thereby, for instance, create index values by
+#'   using `id = 1:n()` or `id = 3:(n()+2)` and similar.
 #'
 #' Note that newly created variables can be used in subsequent expressions,
 #' including `.at` or `.if`. See also 'Examples'.
@@ -92,7 +95,8 @@
 #'   grouped_efc,
 #'   c12hour_c = center(c12hour),
 #'   c12hour_z = c12hour_c / sd(c12hour, na.rm = TRUE),
-#'   c12hour_z2 = standardize(c12hour)
+#'   c12hour_z2 = standardize(c12hour),
+#'   id = 1:n()
 #' )
 #' head(new_efc)
 #'
@@ -145,6 +149,11 @@ data_modify.default <- function(data, ...) {
 data_modify.data.frame <- function(data, ..., .if = NULL, .at = NULL, .modify = NULL) {
   dots <- eval(substitute(alist(...)))
 
+  # error for data frames with no rows...
+  if (nrow(data) == 0) {
+    insight::format_error("`data` is an empty data frame. `data_modify()` only works for data frames with at least one row.") # nolint
+  }
+
   # check if we have dots, or only at/modify ----
 
   if (length(dots)) {
@@ -201,6 +210,10 @@ data_modify.grouped_df <- function(data, ..., .if = NULL, .at = NULL, .modify =
   # the data.frame method later...
   dots <- match.call(expand.dots = FALSE)[["..."]]
 
+  # error for data frames with no rows...
+  if (nrow(data) == 0) {
+    insight::format_error("`data` is an empty data frame. `data_modify()` only works for data frames with at least one row.") # nolint
+  }
 
   grps <- attr(data, "groups", exact = TRUE)
   grps <- grps[[".rows"]]
@@ -352,8 +365,12 @@ data_modify.grouped_df <- function(data, ..., .if = NULL, .at = NULL, .modify =
   # finally, we can evaluate expression and get values for new variables
   symbol_string <- insight::safe_deparse(symbol)
   if (!is.null(symbol_string) && all(symbol_string == "n()")) {
-    # "special" functions
+    # "special" functions - using "n()" just returns number of rows
     new_variable <- nrow(data)
+  } else if (!is.null(symbol_string) && length(symbol_string) == 1 && grepl("\\bn\\(\\)", symbol_string)) {
+    # "special" functions, like "1:n()" or similar - but not "1:fun()"
+    symbol_string <- str2lang(gsub("n()", "nrow(data)", symbol_string, fixed = TRUE))
+    new_variable <- try(with(data, eval(symbol_string)), silent = TRUE)
   } else {
     # default evaluation of expression
     new_variable <- try(with(data, eval(symbol)), silent = TRUE)

diff --git a/R/standardize.models.R b/R/standardize.models.R
@@ -78,6 +78,14 @@ standardize.default <- function(x,
     return(x)
   }
 
+  # check model formula. Some notations don't work when standardizing data
+  insight::formula_ok(
+    x,
+    action = "error",
+    prefix_msg = "Model cannot be standardized.",
+    verbose = verbose
+  )
+
   data_std <- NULL # needed to avoid note
   .standardize_models(x,
     robust = robust, two_sd = two_sd,

diff --git a/man/data_modify.Rd b/man/data_modify.Rd
diff --git a/tests/testthat/test-data_modify.R b/tests/testthat/test-data_modify.R
@@ -353,6 +353,16 @@ test_that("data_modify errors for non df", {
 })
 
 
+test_that("data_modify errors for empty data frames", {
+  data(mtcars)
+  x <- mtcars[1, ]
+  expect_error(
+    data_modify(x[-1, ], new_var = 5),
+    regex = "empty data frame"
+  )
+})
+
+
 test_that("data_modify errors for non df", {
   data(efc)
   a <- "center(c22hour)" # <---------------- error in variable name
@@ -492,6 +502,20 @@ test_that("data_modify works with functions that return character vectors", {
 })
 
 
+test_that("data_modify 1:n() and similar works in (grouped) data frames", {
+  data(mtcars)
+  out <- data_modify(mtcars, Trials = 1:n()) # nolint
+  expect_identical(out$Trials, 1:32)
+  x <- data_group(mtcars, "gear")
+  out <- data_modify(x, Trials = 1:n()) # nolint
+  expect_identical(out$Trials[out$gear == 3], 1:15)
+  expect_identical(out$Trials[out$gear == 4], 1:12)
+  out <- data_modify(x, Trials = 3:(n() + 2))
+  expect_identical(out$Trials[out$gear == 3], 3:17)
+  expect_identical(out$Trials[out$gear == 4], 3:14)
+})
+
+
 test_that("data_modify .if/.at arguments", {
   data(iris)
   d <- iris[1:5, ]
@@ -550,3 +574,31 @@ test_that("data_modify .if/.at arguments", {
   out <- data_modify(d, new_length = Petal.Length * 2, .if = is.numeric, .modify = round)
   expect_equal(out$new_length, c(3, 3, 3, 3, 3), ignore_attr = TRUE)
 })
+
+
+skip_if_not_installed("withr")
+
+withr::with_environment(
+  new.env(),
+  test_that("data_modify 1:n() and similar works in (grouped) data frames inside function calls", {
+    data(mtcars)
+    x <- data_group(mtcars, "gear")
+
+    foo <- function(d) {
+      out <- data_modify(d, Trials = 1:n()) # nolint
+      out$Trials
+    }
+    expect_identical(
+      foo(x),
+      c(
+        1L, 2L, 3L, 1L, 2L, 3L, 4L, 4L, 5L, 6L, 7L, 5L, 6L, 7L, 8L,
+        9L, 10L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 11L, 1L, 2L, 3L,
+        4L, 5L, 12L
+      )
+    )
+  })
+)
+
+test_that("data_modify errors on non-defined function", {
+  expect_error(data_modify(iris, Species = foo()))
+})
diff --git a/tests/testthat/test-standardize_models.R b/tests/testthat/test-standardize_models.R
@@ -31,6 +31,29 @@ test_that("standardize | errors", {
 })
 
 
+test_that("standardize | problematic formulas", {
+  data(mtcars)
+  m <- lm(mpg ~ hp, data = mtcars)
+  expect_equal(
+    coef(standardise(m)),
+    c(`(Intercept)` = -3.14935717633686e-17, hp = -0.776168371826586),
+    tolerance = 1e-4
+  )
+
+  colnames(mtcars)[1] <- "1_mpg"
+  m <- lm(`1_mpg` ~ hp, data = mtcars)
+  expect_error(standardise(m), regex = "Looks like")
+
+  # works interactive only
+  # data(mtcars)
+  # m <- lm(mtcars$mpg ~ mtcars$hp)
+  # expect_error(standardise(m), regex = "model formulas")
+
+  m <- lm(mtcars[, 1] ~ hp, data = mtcars)
+  expect_error(standardise(m), regex = "indexed data")
+})
+
+
 # Transformations ---------------------------------------------------------
 test_that("transformations", {
   skip_if_not_installed("effectsize")
@@ -206,15 +229,14 @@ test_that("standardize non-Gaussian response", {
 # variables evaluated in the environment $$$ ------------------------------
 test_that("variables evaluated in the environment", {
   m <- lm(mtcars$mpg ~ mtcars$cyl + am, data = mtcars)
-  w <- capture_warnings(standardize(m))
-  expect_true(any(grepl("mtcars$mpg", w, fixed = TRUE)))
+  w <- capture_error(standardize(m))
+  expect_true(any(grepl("Using `$`", w, fixed = TRUE)))
 
   ## Note:
   # No idea why this is suddenly not giving a warning on older R versions.
   m <- lm(mtcars$mpg ~ mtcars$cyl + mtcars$am, data = mtcars)
-  warns <- capture_warnings(standardize(m))
-  expect_true(any(grepl("mtcars$mpg", warns, fixed = TRUE)))
-  expect_true(any(grepl("No variables", warns, fixed = TRUE)))
+  w <- capture_error(standardize(m))
+  expect_true(any(grepl("Using `$`", w, fixed = TRUE)))
 })