easystats · etiennebacher · Mar 23, 2024 · Mar 20, 2024 · Mar 20, 2024 · Mar 20, 2024
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Type: Package
 Package: datawizard
 Title: Easy Data Wrangling and Statistical Transformations
-Version: 0.9.1.7
+Version: 0.9.1.8
 Authors@R: c(
     person("Indrajeet", "Patil", , "[email protected]", role = "aut",
            comment = c(ORCID = "0000-0003-1995-6531", Twitter = "@patilindrajeets")),

diff --git a/NAMESPACE b/NAMESPACE
@@ -233,6 +233,7 @@ export(data_adjust)
 export(data_arrange)
 export(data_codebook)
 export(data_duplicated)
+export(data_expand)
 export(data_extract)
 export(data_filter)
 export(data_find)

diff --git a/NEWS.md b/NEWS.md
@@ -12,6 +12,9 @@ NEW FUNCTIONS
 
 * `data_summary()`, to compute summary statistics of (grouped) data frames.
 
+* `data_expand()`, to expand a data frame by replicating rows based on another
+  variable that contains the counts of replications per row.
+
 CHANGES
 
 * `data_modify()` gets three new arguments, `.at`, `.if` and `.modify`, to modify

diff --git a/R/data_expand.R b/R/data_expand.R
@@ -0,0 +1,88 @@
+#' @title Expand (i.e. replicate rows) a data frame
+#' @name data_expand
+#'
+#' @description
+#' Expand a data frame by replicating rows based on another variable that
+#' contains the counts of replications per row.
+#'
+#' @param data A data frame.
+#' @param expand The name of the column that contains the counts of replications
+#' for each row.
+#' @param remove_na Logical, if `TRUE`, missing values (`NA`) in the column
+#' provided in `expand` are removed from the data frame. If `FALSE` and `expand`
+#' contains missing values, the function will throw an error.
+#' @param ... Currently not used.
+#' @inheritParams find_columns
+#'
+#' @return `data`, with each row replicated as many times as defined in `expand`.
+#'
+#' @examples
+#' data(mtcars)
+#' data_expand(head(mtcars), "carb")
+#' @export
+data_expand <- function(data,
+                        expand = NULL,
+                        select = NULL,
+                        exclude = NULL,
+                        remove_na = FALSE,
+                        ignore_case = FALSE,
+                        verbose = TRUE,
+                        regex = FALSE,
+                        ...) {
+  # we need a name for the new column
+  if (is.null(expand)) {
+    insight::format_error(
+      "No column that should be used to expand the data frame was provided. Please use `expand` to define a column."
+    )
+  }
+
+  # only one column name
+  if (length(expand) > 1) {
+    insight::format_error(
+      "Please provide only a single string for `expand`, no character vector with multiple values."
+    )
+  }
+
+  # check if in data
+  if (!expand %in% colnames(data)) {
+    insight::format_error(
+      "The column provided in `expand` does not exist in the data frame.",
+      .misspelled_string(colnames(data), expand, "Possibly misspelled?")
+    )
+  }
+
+  # evaluate select/exclude, may be select-helpers
+  select <- .select_nse(select,
+    data,
+    exclude,
+    ignore_case,
+    regex = regex,
+    verbose = verbose
+  )
+
+  # extract variable that contains the counts of replicates
+  replicates <- data[[expand]]
+  # we can remove that column now
+  data[[expand]] <- NULL
+
+  # also remove "expand" from "select" string
+  select <- setdiff(select, expand)
+
+  # if user doesn't want to remove "NA", but replicates contain "NA",
+  # give informative error here
+  if (!remove_na && anyNA(replicates)) {
+    insight::format_error(
+      "The column provided in `expand` contains missing values, but `remove_na` is set to `FALSE`.",
+      "Please set `remove_na` to `TRUE` or remove the missing values from the data frame."
+    )
+  }
+
+  # remove rows where "expand" is NA
+  data <- data[!is.na(replicates), ]
+  replicates <- replicates[!is.na(replicates)]
+
+  # fin
+  as.data.frame(do.call(cbind, lapply(data[select], function(variable) {
+    unlist(Map(rep, variable, replicates), use.names = FALSE)
+  })))
+}
diff --git a/_pkgdown.yaml b/_pkgdown.yaml
@@ -20,6 +20,7 @@ reference:
       - data_partition
       - data_rotate
       - data_group
+      - data_expand
       - data_duplicated
       - data_unique
 

diff --git a/man/data_expand.Rd b/man/data_expand.Rd
diff --git a/tests/testthat/test-data_expand.R b/tests/testthat/test-data_expand.R
@@ -0,0 +1,35 @@
+test_that("data_expand: simple use case", {
+  data(mtcars)
+  d <- as.data.frame(head(mtcars))
+  out <- data_expand(d, "carb")
+  expect_identical(dim(out), c(13L, 10L))
+  expect_identical(out$disp, c(160, 160, 160, 160, 160, 160, 160, 160, 108, 258, 360, 360, 225))
+  expect_named(out, c("mpg", "cyl", "disp", "hp", "drat", "wt", "qsec", "vs", "am", "gear"))
+
+  d$mpg[5] <- NA
+  out <- data_expand(d, "carb")
+  expect_identical(dim(out), c(13L, 10L))
+  expect_identical(out$mpg, c(21, 21, 21, 21, 21, 21, 21, 21, 22.8, 21.4, NA, NA, 18.1))
+  expect_named(out, c("mpg", "cyl", "disp", "hp", "drat", "wt", "qsec", "vs", "am", "gear"))
+
+  d$carb[3] <- NA
+  out <- data_expand(d, "carb", remove_na = TRUE)
+  expect_identical(dim(out), c(12L, 10L))
+  expect_identical(out$mpg, c(21, 21, 21, 21, 21, 21, 21, 21, 21.4, NA, NA, 18.1))
+  expect_named(out, c("mpg", "cyl", "disp", "hp", "drat", "wt", "qsec", "vs", "am", "gear"))
+
+  out <- data_expand(d, "carb", select = c("disp", "hp"), remove_na = TRUE)
+  expect_identical(dim(out), c(12L, 2L))
+  expect_identical(out$disp, c(160, 160, 160, 160, 160, 160, 160, 160, 258, 360, 360, 225))
+  expect_named(out, c("disp", "hp"))
+})
+
+test_that("data_expand: errors", {
+  data(mtcars)
+  d <- as.data.frame(head(mtcars))
+  expect_error(data_expand(d), regex = "No column")
+  expect_error(data_expand(d, expand = c("mpg", "gear")), regex = "a single string")
+  expect_error(data_expand(d, expand = "geas"), regex = "The column provided")
+  d$carb[3] <- NA
+  expect_error(data_expand(d, "carb"), regex = "missing values")
+})