easystats · etiennebacher · Mar 23, 2024 · Mar 20, 2024 · Mar 20, 2024 · Mar 20, 2024
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Type: Package
 Package: datawizard
 Title: Easy Data Wrangling and Statistical Transformations
-Version: 0.9.1.7
+Version: 0.9.1.8
 Authors@R: c(
     person("Indrajeet", "Patil", , "[email protected]", role = "aut",
            comment = c(ORCID = "0000-0003-1995-6531", Twitter = "@patilindrajeets")),

diff --git a/NAMESPACE b/NAMESPACE
@@ -249,6 +249,7 @@ export(data_remove)
 export(data_rename)
 export(data_rename_rows)
 export(data_reorder)
+export(data_replicate)
 export(data_restoretype)
 export(data_rotate)
 export(data_seek)

diff --git a/NEWS.md b/NEWS.md
@@ -12,6 +12,9 @@ NEW FUNCTIONS
 
 * `data_summary()`, to compute summary statistics of (grouped) data frames.
 
+* `data_replicate()`, to expand a data frame by replicating rows based on another
+  variable that contains the counts of replications per row.
+
 CHANGES
 
 * `data_modify()` gets three new arguments, `.at`, `.if` and `.modify`, to modify

diff --git a/R/data_replicate.R b/R/data_replicate.R
@@ -0,0 +1,120 @@
+#' @title Expand (i.e. replicate rows) a data frame
+#' @name data_replicate
+#'
+#' @description
+#' Expand a data frame by replicating rows based on another variable that
+#' contains the counts of replications per row.
+#'
+#' @param data A data frame.
+#' @param expand The name of the column that contains the counts of replications
+#' for each row. Can also be a numeric value, indicating the position of that
+#' column. Note that the variable indicated by `expand` must be an integer vector.
+#' @param remove_na Logical. If `TRUE`, missing values in the column
+#' provided in `expand` are removed from the data frame. If `FALSE` and `expand`
+#' contains missing values, the function will throw an error.
+#' @param ... Currently not used.
+#' @inheritParams find_columns
+#'
+#' @return A dataframe with each row replicated as many times as defined in `expand`.
+#'
+#' @examples
+#' data(mtcars)
+#' data_replicate(head(mtcars), "carb")
+#' @export
+data_replicate <- function(data,
+                           expand = NULL,
+                           select = NULL,
+                           exclude = NULL,
+                           remove_na = FALSE,
+                           ignore_case = FALSE,
+                           verbose = TRUE,
+                           regex = FALSE,
+                           ...) {
+  # we need a name for the new column
+  if (is.null(expand)) {
+    insight::format_error(
+      "No column that should be used to expand the data frame was provided. Please use `expand` to define a column."
+    )
+  }
+
+  # only one column name
+  if (length(expand) > 1) {
+    insight::format_error(
+      "Please provide only a single string for `expand`, no character vector with multiple values."
+    )
+  }
+
+  # check if numerics, and if so, use column name
+  if (is.numeric(expand)) {
+    expand <- colnames(data)[expand]
+  }
+
+  # check if in data
+  if (!expand %in% colnames(data)) {
+    insight::format_error(
+      "The column provided in `expand` does not exist in the data frame.",
+      .misspelled_string(colnames(data), expand, "Possibly misspelled?")
+    )
+  }
+
+  # check that "expand" contains no Inf
+  if (any(is.infinite(data[[expand]]))) {
+    insight::format_error(
+      "The column provided in `expand` contains infinite values. Please provide a column that does not contain infinite values." # nolint
+    )
+  }
+
+  # check that "expand" is integer
+  if (!.is_integer(data[[expand]])) {
+    insight::format_error(
+      "The column provided in `expand` is not of type integer. Please provide a column that contains integer values." # nolint
+    )
+  }
+
+  # evaluate select/exclude, may be select-helpers
+  select <- .select_nse(select,
+    data,
+    exclude,
+    ignore_case,
+    regex = regex,
+    verbose = verbose
+  )
+
+  # extract variable that contains the counts of replicates
+  replicates <- data[[expand]]
+  # we can remove that column now
+  data[[expand]] <- NULL
+
+  # also remove "expand" from "select" string
+  select <- setdiff(select, expand)
+
+  # if user doesn't want to remove "NA", but replicates contain "NA",
+  # give informative error here
+  if (!remove_na && anyNA(replicates)) {
+    insight::format_error(
+      "The column provided in `expand` contains missing values, but `remove_na` is set to `FALSE`.",
+      "Please set `remove_na` to `TRUE` or remove the missing values from the `expand` variable."
+    )
+  }
+
+  # remove rows where "expand" is NA
+  data <- data[!is.na(replicates), ]
+  replicates <- replicates[!is.na(replicates)]
+
+  # fin
+  as.data.frame(do.call(cbind, lapply(data[select], rep.int, times = replicates)))
+}
+
+
+# is.integer(c(1, 2)) -> FALSE
+# all(c(1, 2) %% 1 == 0) -> TRUE
+.is_integer <- function(x, remove_na = TRUE) {
+  if (remove_na) {
+    x <- x[!is.na(x)]
+  }
+  tryCatch(
+    all(x %% 1 == 0),
+    warning = function(w) is.integer(x),
+    error = function(e) FALSE
+  )
+}
diff --git a/_pkgdown.yaml b/_pkgdown.yaml
@@ -20,6 +20,7 @@ reference:
       - data_partition
       - data_rotate
       - data_group
+      - data_replicate
       - data_duplicated
       - data_unique
 

diff --git a/man/data_replicate.Rd b/man/data_replicate.Rd
diff --git a/tests/testthat/test-data_replicate.R b/tests/testthat/test-data_replicate.R
@@ -0,0 +1,54 @@
+test_that("data_replicate: simple use case", {
+  data(mtcars)
+  d <- head(mtcars)
+  out <- data_replicate(d, "carb")
+  expect_identical(dim(out), c(13L, 10L))
+  expect_identical(out$disp, c(160, 160, 160, 160, 160, 160, 160, 160, 108, 258, 360, 360, 225))
+  expect_named(out, c("mpg", "cyl", "disp", "hp", "drat", "wt", "qsec", "vs", "am", "gear"))
+
+  out <- data_replicate(d, 11)
+  expect_identical(dim(out), c(13L, 10L))
+  expect_identical(out$disp, c(160, 160, 160, 160, 160, 160, 160, 160, 108, 258, 360, 360, 225))
+  expect_named(out, c("mpg", "cyl", "disp", "hp", "drat", "wt", "qsec", "vs", "am", "gear"))
+
+  d$mpg[5] <- NA
+  out <- data_replicate(d, "carb")
+  expect_identical(dim(out), c(13L, 10L))
+  expect_identical(out$mpg, c(21, 21, 21, 21, 21, 21, 21, 21, 22.8, 21.4, NA, NA, 18.1))
+  expect_named(out, c("mpg", "cyl", "disp", "hp", "drat", "wt", "qsec", "vs", "am", "gear"))
+
+  d$carb[3] <- NA
+  out <- data_replicate(d, "carb", remove_na = TRUE)
+  expect_identical(dim(out), c(12L, 10L))
+  expect_identical(out$mpg, c(21, 21, 21, 21, 21, 21, 21, 21, 21.4, NA, NA, 18.1))
+  expect_named(out, c("mpg", "cyl", "disp", "hp", "drat", "wt", "qsec", "vs", "am", "gear"))
+
+  out <- data_replicate(d, "carb", select = c("disp", "hp"), remove_na = TRUE)
+  expect_identical(dim(out), c(12L, 2L))
+  expect_identical(out$disp, c(160, 160, 160, 160, 160, 160, 160, 160, 258, 360, 360, 225))
+  expect_named(out, c("disp", "hp"))
+
+  d <- data.frame(
+    a = c("a", "b", "c"),
+    b = 1:3,
+    rep = c(3, 2, 4),
+    stringsAsFactors = FALSE
+  )
+  out <- data_replicate(d, "rep")
+  expect_identical(out$a, c("a", "a", "a", "b", "b", "c", "c", "c", "c"))
+})
+
+
+test_that("data_replicate: errors", {
+  data(mtcars)
+  d <- head(mtcars)
+  expect_error(data_replicate(d), regex = "No column")
+  expect_error(data_replicate(d, expand = c("mpg", "gear")), regex = "a single string")
+  expect_error(data_replicate(d, expand = "geas"), regex = "The column provided")
+  expect_error(data_replicate(d, expand = "qsec"), regex = "The column provided")
+  d$carb[3] <- NA
+  expect_error(data_replicate(d, "carb"), regex = "missing values")
+  d <- head(mtcars)
+  d$carb[3] <- Inf
+  expect_error(data_replicate(d, "carb"), regex = "infinite values")
+})