Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement data_replicate() #488

Merged
merged 17 commits into from
Mar 23, 2024
Merged
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Type: Package
Package: datawizard
Title: Easy Data Wrangling and Statistical Transformations
Version: 0.9.1.7
Version: 0.9.1.8
Authors@R: c(
person("Indrajeet", "Patil", , "[email protected]", role = "aut",
comment = c(ORCID = "0000-0003-1995-6531", Twitter = "@patilindrajeets")),
Expand Down
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -249,6 +249,7 @@ export(data_remove)
export(data_rename)
export(data_rename_rows)
export(data_reorder)
export(data_replicate)
export(data_restoretype)
export(data_rotate)
export(data_seek)
Expand Down
3 changes: 3 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,9 @@ NEW FUNCTIONS

* `data_summary()`, to compute summary statistics of (grouped) data frames.

* `data_replicate()`, to expand a data frame by replicating rows based on another
variable that contains the counts of replications per row.

CHANGES

* `data_modify()` gets three new arguments, `.at`, `.if` and `.modify`, to modify
Expand Down
86 changes: 86 additions & 0 deletions R/data_replicate.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
#' @title Expand (i.e. replicate rows) a data frame
#' @name data_replicate
#'
#' @description
#' Expand a data frame by replicating rows based on another variable that
#' contains the counts of replications per row.
#'
#' @param data A data frame.
#' @param expand The name of the column that contains the counts of replications
#' for each row.
#' @param remove_na Logical. If `TRUE`, missing values in the column
#' provided in `expand` are removed from the data frame. If `FALSE` and `expand`
#' contains missing values, the function will throw an error.
#' @param ... Currently not used.
#' @inheritParams find_columns
#'
#' @return A dataframe with each row replicated as many times as defined in `expand`.
#'
#' @examples
#' data(mtcars)
#' data_replicate(head(mtcars), "carb")
#' @export
data_replicate <- function(data,
expand = NULL,
select = NULL,
exclude = NULL,
remove_na = FALSE,
ignore_case = FALSE,
verbose = TRUE,
regex = FALSE,
...) {
# we need a name for the new column
if (is.null(expand)) {
insight::format_error(
"No column that should be used to expand the data frame was provided. Please use `expand` to define a column."
)
}

# only one column name
if (length(expand) > 1) {
insight::format_error(
"Please provide only a single string for `expand`, no character vector with multiple values."
)
}

# check if in data
if (!expand %in% colnames(data)) {
insight::format_error(
"The column provided in `expand` does not exist in the data frame.",
.misspelled_string(colnames(data), expand, "Possibly misspelled?")
)
}

# evaluate select/exclude, may be select-helpers
select <- .select_nse(select,
data,
exclude,
ignore_case,
regex = regex,
verbose = verbose
)

# extract variable that contains the counts of replicates
replicates <- data[[expand]]
# we can remove that column now
data[[expand]] <- NULL

# also remove "expand" from "select" string
select <- setdiff(select, expand)

# if user doesn't want to remove "NA", but replicates contain "NA",
# give informative error here
if (!remove_na && anyNA(replicates)) {
insight::format_error(
"The column provided in `expand` contains missing values, but `remove_na` is set to `FALSE`.",
"Please set `remove_na` to `TRUE` or remove the missing values from the `expand` variable."
)
}

# remove rows where "expand" is NA
data <- data[!is.na(replicates), ]
replicates <- replicates[!is.na(replicates)]

# fin
as.data.frame(do.call(cbind, lapply(data[select], rep.int, times = replicates)))
}
1 change: 1 addition & 0 deletions _pkgdown.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ reference:
- data_partition
- data_rotate
- data_group
- data_replicate
- data_duplicated
- data_unique

Expand Down
92 changes: 92 additions & 0 deletions man/data_replicate.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

35 changes: 35 additions & 0 deletions tests/testthat/test-data_replicate.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
test_that("data_replicate: simple use case", {
data(mtcars)
d <- head(mtcars)
out <- data_replicate(d, "carb")
expect_identical(dim(out), c(13L, 10L))
expect_identical(out$disp, c(160, 160, 160, 160, 160, 160, 160, 160, 108, 258, 360, 360, 225))
expect_named(out, c("mpg", "cyl", "disp", "hp", "drat", "wt", "qsec", "vs", "am", "gear"))

d$mpg[5] <- NA
out <- data_replicate(d, "carb")
expect_identical(dim(out), c(13L, 10L))
expect_identical(out$mpg, c(21, 21, 21, 21, 21, 21, 21, 21, 22.8, 21.4, NA, NA, 18.1))
expect_named(out, c("mpg", "cyl", "disp", "hp", "drat", "wt", "qsec", "vs", "am", "gear"))

d$carb[3] <- NA
out <- data_replicate(d, "carb", remove_na = TRUE)
expect_identical(dim(out), c(12L, 10L))
expect_identical(out$mpg, c(21, 21, 21, 21, 21, 21, 21, 21, 21.4, NA, NA, 18.1))
expect_named(out, c("mpg", "cyl", "disp", "hp", "drat", "wt", "qsec", "vs", "am", "gear"))

out <- data_replicate(d, "carb", select = c("disp", "hp"), remove_na = TRUE)
expect_identical(dim(out), c(12L, 2L))
expect_identical(out$disp, c(160, 160, 160, 160, 160, 160, 160, 160, 258, 360, 360, 225))
expect_named(out, c("disp", "hp"))
})

test_that("data_replicate: errors", {
data(mtcars)
d <- head(mtcars)
expect_error(data_replicate(d), regex = "No column")
expect_error(data_replicate(d, expand = c("mpg", "gear")), regex = "a single string")
expect_error(data_replicate(d, expand = "geas"), regex = "The column provided")
d$carb[3] <- NA
expect_error(data_replicate(d, "carb"), regex = "missing values")
})