Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement data_replicate() #488

Merged
merged 17 commits into from
Mar 23, 2024
Merged
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Type: Package
Package: datawizard
Title: Easy Data Wrangling and Statistical Transformations
Version: 0.9.1.7
Version: 0.9.1.8
Authors@R: c(
person("Indrajeet", "Patil", , "[email protected]", role = "aut",
comment = c(ORCID = "0000-0003-1995-6531", Twitter = "@patilindrajeets")),
Expand Down
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -249,6 +249,7 @@ export(data_remove)
export(data_rename)
export(data_rename_rows)
export(data_reorder)
export(data_replicate)
export(data_restoretype)
export(data_rotate)
export(data_seek)
Expand Down
3 changes: 3 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,9 @@ NEW FUNCTIONS

* `data_summary()`, to compute summary statistics of (grouped) data frames.

* `data_replicate()`, to expand a data frame by replicating rows based on another
variable that contains the counts of replications per row.

CHANGES

* `data_modify()` gets three new arguments, `.at`, `.if` and `.modify`, to modify
Expand Down
120 changes: 120 additions & 0 deletions R/data_replicate.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
#' @title Expand (i.e. replicate rows) a data frame
#' @name data_replicate
#'
#' @description
#' Expand a data frame by replicating rows based on another variable that
#' contains the counts of replications per row.
#'
#' @param data A data frame.
#' @param expand The name of the column that contains the counts of replications
#' for each row. Can also be a numeric value, indicating the position of that
#' column. Note that the variable indicated by `expand` must be an integer vector.
#' @param remove_na Logical. If `TRUE`, missing values in the column
#' provided in `expand` are removed from the data frame. If `FALSE` and `expand`
#' contains missing values, the function will throw an error.
#' @param ... Currently not used.
#' @inheritParams find_columns
#'
#' @return A dataframe with each row replicated as many times as defined in `expand`.
#'
#' @examples
#' data(mtcars)
#' data_replicate(head(mtcars), "carb")
#' @export
data_replicate <- function(data,
expand = NULL,
select = NULL,
exclude = NULL,
remove_na = FALSE,
ignore_case = FALSE,
verbose = TRUE,
regex = FALSE,
...) {
# we need a name for the new column
if (is.null(expand)) {
insight::format_error(
"No column that should be used to expand the data frame was provided. Please use `expand` to define a column."
)
}

# only one column name
if (length(expand) > 1) {
insight::format_error(
"Please provide only a single string for `expand`, no character vector with multiple values."
)
}

# check if numerics, and if so, use column name
if (is.numeric(expand)) {
expand <- colnames(data)[expand]
}

# check if in data
if (!expand %in% colnames(data)) {
insight::format_error(
"The column provided in `expand` does not exist in the data frame.",
.misspelled_string(colnames(data), expand, "Possibly misspelled?")
)
}

# check that "expand" contains no Inf
if (any(is.infinite(data[[expand]]))) {
insight::format_error(
"The column provided in `expand` contains infinite values. Please provide a column that does not contain infinite values." # nolint
)
}

# check that "expand" is integer
if (!.is_integer(data[[expand]])) {
insight::format_error(
"The column provided in `expand` is not of type integer. Please provide a column that contains integer values." # nolint
)
}

# evaluate select/exclude, may be select-helpers
select <- .select_nse(select,
data,
exclude,
ignore_case,
regex = regex,
verbose = verbose
)

# extract variable that contains the counts of replicates
replicates <- data[[expand]]
# we can remove that column now
data[[expand]] <- NULL

# also remove "expand" from "select" string
select <- setdiff(select, expand)

# if user doesn't want to remove "NA", but replicates contain "NA",
# give informative error here
if (!remove_na && anyNA(replicates)) {
insight::format_error(
"The column provided in `expand` contains missing values, but `remove_na` is set to `FALSE`.",
"Please set `remove_na` to `TRUE` or remove the missing values from the `expand` variable."
)
}

# remove rows where "expand" is NA
data <- data[!is.na(replicates), ]
replicates <- replicates[!is.na(replicates)]

# fin
as.data.frame(do.call(cbind, lapply(data[select], rep.int, times = replicates)))
}


# is.integer(c(1, 2)) -> FALSE
# all(c(1, 2) %% 1 == 0) -> TRUE
.is_integer <- function(x, remove_na = TRUE) {
strengejacke marked this conversation as resolved.
Show resolved Hide resolved
if (remove_na) {
x <- x[!is.na(x)]
}
tryCatch(
all(x %% 1 == 0),
warning = function(w) is.integer(x),
etiennebacher marked this conversation as resolved.
Show resolved Hide resolved
error = function(e) FALSE
)
}
1 change: 1 addition & 0 deletions _pkgdown.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ reference:
- data_partition
- data_rotate
- data_group
- data_replicate
- data_duplicated
- data_unique

Expand Down
93 changes: 93 additions & 0 deletions man/data_replicate.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

54 changes: 54 additions & 0 deletions tests/testthat/test-data_replicate.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
test_that("data_replicate: simple use case", {
data(mtcars)
d <- head(mtcars)
out <- data_replicate(d, "carb")
expect_identical(dim(out), c(13L, 10L))
expect_identical(out$disp, c(160, 160, 160, 160, 160, 160, 160, 160, 108, 258, 360, 360, 225))
expect_named(out, c("mpg", "cyl", "disp", "hp", "drat", "wt", "qsec", "vs", "am", "gear"))

out <- data_replicate(d, 11)
expect_identical(dim(out), c(13L, 10L))
expect_identical(out$disp, c(160, 160, 160, 160, 160, 160, 160, 160, 108, 258, 360, 360, 225))
expect_named(out, c("mpg", "cyl", "disp", "hp", "drat", "wt", "qsec", "vs", "am", "gear"))

d$mpg[5] <- NA
out <- data_replicate(d, "carb")
expect_identical(dim(out), c(13L, 10L))
expect_identical(out$mpg, c(21, 21, 21, 21, 21, 21, 21, 21, 22.8, 21.4, NA, NA, 18.1))
expect_named(out, c("mpg", "cyl", "disp", "hp", "drat", "wt", "qsec", "vs", "am", "gear"))

d$carb[3] <- NA
out <- data_replicate(d, "carb", remove_na = TRUE)
expect_identical(dim(out), c(12L, 10L))
expect_identical(out$mpg, c(21, 21, 21, 21, 21, 21, 21, 21, 21.4, NA, NA, 18.1))
expect_named(out, c("mpg", "cyl", "disp", "hp", "drat", "wt", "qsec", "vs", "am", "gear"))

out <- data_replicate(d, "carb", select = c("disp", "hp"), remove_na = TRUE)
expect_identical(dim(out), c(12L, 2L))
expect_identical(out$disp, c(160, 160, 160, 160, 160, 160, 160, 160, 258, 360, 360, 225))
expect_named(out, c("disp", "hp"))

d <- data.frame(
a = c("a", "b", "c"),
b = 1:3,
rep = c(3, 2, 4),
stringsAsFactors = FALSE
)
out <- data_replicate(d, "rep")
expect_identical(out$a, c("a", "a", "a", "b", "b", "c", "c", "c", "c"))
})


test_that("data_replicate: errors", {
data(mtcars)
d <- head(mtcars)
expect_error(data_replicate(d), regex = "No column")
expect_error(data_replicate(d, expand = c("mpg", "gear")), regex = "a single string")
expect_error(data_replicate(d, expand = "geas"), regex = "The column provided")
expect_error(data_replicate(d, expand = "qsec"), regex = "The column provided")
d$carb[3] <- NA
expect_error(data_replicate(d, "carb"), regex = "missing values")
d <- head(mtcars)
d$carb[3] <- Inf
expect_error(data_replicate(d, "carb"), regex = "infinite values")
})
Loading