From 410ea7a5dcf733dd7e76a368d230f9fe4c5e0728 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Tue, 1 Oct 2024 10:00:07 +0200 Subject: [PATCH 01/19] Update `DESCRIPTION` to use latest 'easystats' dependencies (#542) [create-pull-request] automated change Co-authored-by: IndrajeetPatil <11330453+IndrajeetPatil@users.noreply.github.com> --- DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 67e87fd6c..dbf2318f7 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -33,7 +33,7 @@ BugReports: https://github.com/easystats/datawizard/issues Depends: R (>= 3.6) Imports: - insight (>= 0.20.3), + insight (>= 0.20.4), stats, utils Suggests: From 07b20ec0be4b558e69eadb983581a47604419037 Mon Sep 17 00:00:00 2001 From: Daniel Date: Wed, 2 Oct 2024 07:20:21 +0200 Subject: [PATCH 02/19] Remove Twitter-tag from Authors@R (#545) Remove Tweitter-tag from Authors@R --- DESCRIPTION | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index dbf2318f7..3d1a0081a 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -4,19 +4,19 @@ Title: Easy Data Wrangling and Statistical Transformations Version: 0.12.3.4 Authors@R: c( person("Indrajeet", "Patil", , "patilindrajeet.science@gmail.com", role = "aut", - comment = c(ORCID = "0000-0003-1995-6531", Twitter = "@patilindrajeets")), + comment = c(ORCID = "0000-0003-1995-6531")), person("Etienne", "Bacher", , "etienne.bacher@protonmail.com", role = c("aut", "cre"), comment = c(ORCID = "0000-0002-9271-5075")), person("Dominique", "Makowski", , "dom.makowski@gmail.com", role = "aut", - comment = c(ORCID = "0000-0001-5375-9967", Twitter = "@Dom_Makowski")), + comment = c(ORCID = "0000-0001-5375-9967")), person("Daniel", "Lüdecke", , "d.luedecke@uke.de", role = "aut", - comment = c(ORCID = "0000-0002-8895-3206", Twitter = "@strengejacke")), + comment = c(ORCID = "0000-0002-8895-3206")), person("Mattan S.", "Ben-Shachar", , "matanshm@post.bgu.ac.il", role = "aut", comment = c(ORCID = "0000-0002-4287-4801")), person("Brenton M.", "Wiernik", , "brenton@wiernik.org", role = "aut", - comment = c(ORCID = "0000-0001-9560-6336", Twitter = "@bmwiernik")), + comment = c(ORCID = "0000-0001-9560-6336")), person("Rémi", "Thériault", , "remi.theriault@mail.mcgill.ca", role = "ctb", - comment = c(ORCID = "0000-0003-4315-6788", Twitter = "@rempsyc")), + comment = c(ORCID = "0000-0003-4315-6788")), person("Thomas J.", "Faulkenberry", , "faulkenberry@tarleton.edu", role = "rev"), person("Robert", "Garrett", , "rcg4@illinois.edu", role = "rev") ) From ea2f16f98927c8265b3302eafbe0298484bc5622 Mon Sep 17 00:00:00 2001 From: Daniel Date: Wed, 2 Oct 2024 16:32:57 +0200 Subject: [PATCH 03/19] Default values of `to_numeric()` (#547) Fixes #544 --- R/to_numeric.R | 18 +++++++++--------- man/datawizard-package.Rd | 10 +++++----- man/to_numeric.Rd | 16 ++++++++-------- tests/testthat/_snaps/data_to_numeric.md | 4 ++-- tests/testthat/test-data_to_numeric.R | 8 ++++---- 5 files changed, 28 insertions(+), 28 deletions(-) diff --git a/R/to_numeric.R b/R/to_numeric.R index e38e12e80..3e75bccbd 100644 --- a/R/to_numeric.R +++ b/R/to_numeric.R @@ -17,11 +17,11 @@ #' @inheritParams extract_column_names #' @inheritParams categorize #' -#' @note By default, `to_numeric()` converts factors into "binary" dummies, i.e. +#' @note When factors should be converted into multiple "binary" dummies, i.e. 
#' each factor level is converted into a separate column filled with a binary -#' 0-1 value. If only one column is required, use `dummy_factors = FALSE`. If -#' you want to preserve the original factor levels (in case these represent -#' numeric values), use `preserve_levels = TRUE`. +#' 0-1 value, set `dummy_factors = TRUE`. If you want to preserve the original +#' factor levels (in case these represent numeric values), use +#' `preserve_levels = TRUE`. #' #' @section Selection of variables - `select` argument: #' For most functions that have a `select` argument the complete input data @@ -34,12 +34,12 @@ #' #' @examples #' to_numeric(head(ToothGrowth)) -#' to_numeric(head(ToothGrowth), dummy_factors = FALSE) +#' to_numeric(head(ToothGrowth), dummy_factors = TRUE) #' #' # factors #' x <- as.factor(mtcars$gear) -#' to_numeric(x, dummy_factors = FALSE) -#' to_numeric(x, dummy_factors = FALSE, preserve_levels = TRUE) +#' to_numeric(x) +#' to_numeric(x, preserve_levels = TRUE) #' # same as: #' coerce_to_numeric(x) #' @@ -69,7 +69,7 @@ to_numeric.default <- function(x, verbose = TRUE, ...) { to_numeric.data.frame <- function(x, select = NULL, exclude = NULL, - dummy_factors = TRUE, + dummy_factors = FALSE, preserve_levels = FALSE, lowest = NULL, append = FALSE, @@ -191,7 +191,7 @@ to_numeric.POSIXlt <- to_numeric.Date #' @export to_numeric.factor <- function(x, - dummy_factors = TRUE, + dummy_factors = FALSE, preserve_levels = FALSE, lowest = NULL, verbose = TRUE, diff --git a/man/datawizard-package.Rd b/man/datawizard-package.Rd index db38bc334..d389df6ac 100644 --- a/man/datawizard-package.Rd +++ b/man/datawizard-package.Rd @@ -33,16 +33,16 @@ Useful links: Authors: \itemize{ - \item Indrajeet Patil \email{patilindrajeet.science@gmail.com} (\href{https://orcid.org/0000-0003-1995-6531}{ORCID}) (@patilindrajeets) - \item Dominique Makowski \email{dom.makowski@gmail.com} (\href{https://orcid.org/0000-0001-5375-9967}{ORCID}) (@Dom_Makowski) - \item Daniel Lüdecke \email{d.luedecke@uke.de} (\href{https://orcid.org/0000-0002-8895-3206}{ORCID}) (@strengejacke) + \item Indrajeet Patil \email{patilindrajeet.science@gmail.com} (\href{https://orcid.org/0000-0003-1995-6531}{ORCID}) + \item Dominique Makowski \email{dom.makowski@gmail.com} (\href{https://orcid.org/0000-0001-5375-9967}{ORCID}) + \item Daniel Lüdecke \email{d.luedecke@uke.de} (\href{https://orcid.org/0000-0002-8895-3206}{ORCID}) \item Mattan S. Ben-Shachar \email{matanshm@post.bgu.ac.il} (\href{https://orcid.org/0000-0002-4287-4801}{ORCID}) - \item Brenton M. Wiernik \email{brenton@wiernik.org} (\href{https://orcid.org/0000-0001-9560-6336}{ORCID}) (@bmwiernik) + \item Brenton M. Wiernik \email{brenton@wiernik.org} (\href{https://orcid.org/0000-0001-9560-6336}{ORCID}) } Other contributors: \itemize{ - \item Rémi Thériault \email{remi.theriault@mail.mcgill.ca} (\href{https://orcid.org/0000-0003-4315-6788}{ORCID}) (@rempsyc) [contributor] + \item Rémi Thériault \email{remi.theriault@mail.mcgill.ca} (\href{https://orcid.org/0000-0003-4315-6788}{ORCID}) [contributor] \item Thomas J. Faulkenberry \email{faulkenberry@tarleton.edu} [reviewer] \item Robert Garrett \email{rcg4@illinois.edu} [reviewer] } diff --git a/man/to_numeric.Rd b/man/to_numeric.Rd index 7478c9579..634906e4a 100644 --- a/man/to_numeric.Rd +++ b/man/to_numeric.Rd @@ -11,7 +11,7 @@ to_numeric(x, ...) 
x, select = NULL, exclude = NULL, - dummy_factors = TRUE, + dummy_factors = FALSE, preserve_levels = FALSE, lowest = NULL, append = FALSE, @@ -107,11 +107,11 @@ either numeric levels or dummy variables. The "counterpart" to convert variables into factors is \code{to_factor()}. } \note{ -By default, \code{to_numeric()} converts factors into "binary" dummies, i.e. +When factors should be converted into multiple "binary" dummies, i.e. each factor level is converted into a separate column filled with a binary -0-1 value. If only one column is required, use \code{dummy_factors = FALSE}. If -you want to preserve the original factor levels (in case these represent -numeric values), use \code{preserve_levels = TRUE}. +0-1 value, set \code{dummy_factors = TRUE}. If you want to preserve the original +factor levels (in case these represent numeric values), use +\code{preserve_levels = TRUE}. } \section{Selection of variables - \code{select} argument}{ @@ -126,12 +126,12 @@ to also include the original variables in the returned data frame. \examples{ to_numeric(head(ToothGrowth)) -to_numeric(head(ToothGrowth), dummy_factors = FALSE) +to_numeric(head(ToothGrowth), dummy_factors = TRUE) # factors x <- as.factor(mtcars$gear) -to_numeric(x, dummy_factors = FALSE) -to_numeric(x, dummy_factors = FALSE, preserve_levels = TRUE) +to_numeric(x) +to_numeric(x, preserve_levels = TRUE) # same as: coerce_to_numeric(x) diff --git a/tests/testthat/_snaps/data_to_numeric.md b/tests/testthat/_snaps/data_to_numeric.md index 42cb00b67..e963890a5 100644 --- a/tests/testthat/_snaps/data_to_numeric.md +++ b/tests/testthat/_snaps/data_to_numeric.md @@ -1,7 +1,7 @@ # convert data frame to numeric Code - to_numeric(head(ToothGrowth)) + to_numeric(head(ToothGrowth), dummy_factors = TRUE) Output len supp.OJ supp.VC dose 1 4.2 0 1 0.5 @@ -27,7 +27,7 @@ # convert factor to numeric Code - to_numeric(f) + to_numeric(f, dummy_factors = TRUE) Output a c i s t 1 0 0 0 1 0 diff --git a/tests/testthat/test-data_to_numeric.R b/tests/testthat/test-data_to_numeric.R index 464c35e8d..816591ac0 100644 --- a/tests/testthat/test-data_to_numeric.R +++ b/tests/testthat/test-data_to_numeric.R @@ -1,5 +1,5 @@ test_that("convert data frame to numeric", { - expect_snapshot(to_numeric(head(ToothGrowth))) + expect_snapshot(to_numeric(head(ToothGrowth), dummy_factors = TRUE)) expect_snapshot(to_numeric(head(ToothGrowth), dummy_factors = FALSE)) }) @@ -41,7 +41,7 @@ test_that("convert character to numeric lowest", { test_that("convert factor to numeric", { f <- factor(substring("statistics", 1:10, 1:10)) - expect_snapshot(to_numeric(f)) + expect_snapshot(to_numeric(f, dummy_factors = TRUE)) }) test_that("convert factor to numeric", { @@ -67,12 +67,12 @@ test_that("convert factor to numeric, dummy factors", { test_that("convert factor to numeric, append", { data(efc) expect_identical( - colnames(to_numeric(efc)), + colnames(to_numeric(efc, dummy_factors = TRUE)), c("c12hour", "e16sex", "e42dep.1", "e42dep.2", "e42dep.3", "e42dep.4", "c172code", "neg_c_7"), ignore_attr = TRUE ) expect_identical( - colnames(to_numeric(efc, append = TRUE)), + colnames(to_numeric(efc, dummy_factors = TRUE, append = TRUE)), c( "c12hour", "e16sex", "e42dep", "c172code", "neg_c_7", "e42dep_n", "e42dep_n.1", "e42dep_n.2", "e42dep_n.3", "e42dep_n.4" From 77d67f4dcfae2929de382687943e80a380272ab0 Mon Sep 17 00:00:00 2001 From: Daniel Date: Wed, 2 Oct 2024 18:03:09 +0200 Subject: [PATCH 04/19] categorize() add labels="range" option (#549) * categorize() add labels="range" option 
Fixes #548 * add test, docs * lintr * add option to decide on exclusive/inclusive breaks * docs --- R/categorize.R | 69 ++++++++++++++++++++++------- man/categorize.Rd | 27 +++++++++-- tests/testthat/_snaps/categorize.md | 47 ++++++++++++++++++++ tests/testthat/test-categorize.R | 24 ++++++++-- 4 files changed, 144 insertions(+), 23 deletions(-) create mode 100644 tests/testthat/_snaps/categorize.md diff --git a/R/categorize.R b/R/categorize.R index a6562ab68..9f8dd7505 100644 --- a/R/categorize.R +++ b/R/categorize.R @@ -31,10 +31,18 @@ #' for numeric variables, the minimum of the original input is preserved. For #' factors, the default minimum is `1`. For `split = "equal_range"`, the #' default minimum is always `1`, unless specified otherwise in `lowest`. +#' @param breaks Character, indicating whether breaks for categorizing data are +#' `"inclusive"` (values indicate the _upper_ bound of the _previous_ group or +#' interval) or `"exclusive"` (values indicate the _lower_ bound of the _next_ +#' group or interval to begin). Use `labels = "range"` to make this behaviour +#' easier to see. #' @param labels Character vector of value labels. If not `NULL`, `categorize()` #' will returns factors instead of numeric variables, with `labels` used -#' for labelling the factor levels. Can also be `"mean"` or `"median"` for a -#' factor with labels as the mean/median of each groups. +#' for labelling the factor levels. Can also be `"mean"`, `"median"`, +#' `"range"` or `"observed"` for a factor with labels as the mean/median, +#' the requested range (even if not all values of that range are present in +#' the data) or observed range (range of the actual recoded values) of each +#' group. See 'Examples'. #' @param append Logical or string. If `TRUE`, recoded or converted variables #' get new column names and are appended (column bind) to `x`, thus returning #' both the original and the recoded variables. The new columns get a suffix, @@ -53,7 +61,7 @@ #' #' # Splits and breaks (cut-off values) #' -#' Breaks are in general _exclusive_, this means that these values indicate +#' Breaks are by default _exclusive_, this means that these values indicate #' the lower bound of the next group or interval to begin. Take a simple #' example, a numeric variable with values from 1 to 9. The median would be 5, #' thus the first interval ranges from 1-4 and is recoded into 1, while 5-9 @@ -63,6 +71,9 @@ #' from 1 to 3 belong to the first interval and are recoded into 1 (because #' the next interval starts at 3.67), 4 to 6 into 2 and 7 to 9 into 3. #' +#' The opposite behaviour can be achieved using `breaks = "inclusive"`, in which +#' case +#' #' # Recoding into groups with equal size or range #' #' `split = "equal_length"` and `split = "equal_range"` try to divide the @@ -119,6 +130,13 @@ #' x <- sample(1:10, size = 30, replace = TRUE) #' categorize(x, "equal_length", n_groups = 3, labels = "mean") #' categorize(x, "equal_length", n_groups = 3, labels = "median") +#' +#' # cut numeric into groups with the requested range as a label name +#' # each category has the same range, and labels indicate this range +#' categorize(mtcars$mpg, "equal_length", n_groups = 5, labels = "range") +#' # in this example, each category has the same range, but labels only refer +#' # to the ranges of the actual values (present in the data) inside each group +#' categorize(mtcars$mpg, "equal_length", n_groups = 5, labels = "observed") #' @export categorize <- function(x, ...) 
{ UseMethod("categorize") @@ -142,6 +160,7 @@ categorize.numeric <- function(x, n_groups = NULL, range = NULL, lowest = 1, + breaks = "exclusive", labels = NULL, verbose = TRUE, ...) { @@ -152,6 +171,9 @@ categorize.numeric <- function(x, if (identical(split, "equal_length")) split <- "length" if (identical(split, "equal_range")) split <- "range" + # check for valid values + breaks <- match.arg(breaks, c("exclusive", "inclusive")) + # save original_x <- x @@ -169,9 +191,9 @@ categorize.numeric <- function(x, } if (is.numeric(split)) { - breaks <- split + category_splits <- split } else { - breaks <- switch(split, + category_splits <- switch(split, median = stats::median(x), mean = mean(x), length = n_groups, @@ -182,15 +204,18 @@ categorize.numeric <- function(x, } # complete ranges, including minimum and maximum - if (!identical(split, "length")) breaks <- unique(c(min(x), breaks, max(x))) + if (!identical(split, "length")) { + category_splits <- unique(c(min(x), category_splits, max(x))) + } # recode into groups out <- droplevels(cut( x, - breaks = breaks, + breaks = category_splits, include.lowest = TRUE, - right = FALSE + right = identical(breaks, "inclusive") )) + cut_result <- out levels(out) <- 1:nlevels(out) # fix lowest value, add back into original vector @@ -201,7 +226,7 @@ categorize.numeric <- function(x, original_x[!is.na(original_x)] <- out # turn into factor? - .original_x_to_factor(original_x, x, labels, out, verbose, ...) + .original_x_to_factor(original_x, x, cut_result, labels, out, verbose, ...) } @@ -223,6 +248,7 @@ categorize.data.frame <- function(x, n_groups = NULL, range = NULL, lowest = 1, + breaks = "exclusive", labels = NULL, append = FALSE, ignore_case = FALSE, @@ -260,6 +286,7 @@ categorize.data.frame <- function(x, n_groups = n_groups, range = range, lowest = lowest, + breaks = breaks, labels = labels, verbose = verbose, ... @@ -276,6 +303,7 @@ categorize.grouped_df <- function(x, n_groups = NULL, range = NULL, lowest = 1, + breaks = "exclusive", labels = NULL, append = FALSE, ignore_case = FALSE, @@ -319,6 +347,7 @@ categorize.grouped_df <- function(x, n_groups = n_groups, range = range, lowest = lowest, + breaks = breaks, labels = labels, select = select, exclude = exclude, @@ -375,20 +404,26 @@ categorize.grouped_df <- function(x, } -.original_x_to_factor <- function(original_x, x, labels, out, verbose, ...) { +.original_x_to_factor <- function(original_x, x, cut_result, labels, out, verbose, ...) { if (!is.null(labels)) { if (length(labels) == length(unique(out))) { original_x <- as.factor(original_x) levels(original_x) <- labels - } else if (length(labels) == 1 && labels %in% c("mean", "median")) { + } else if (length(labels) == 1 && labels %in% c("mean", "median", "range", "observed")) { original_x <- as.factor(original_x) no_na_x <- original_x[!is.na(original_x)] - if (labels == "mean") { - labels <- stats::aggregate(x, list(no_na_x), FUN = mean, na.rm = TRUE)$x - } else { - labels <- stats::aggregate(x, list(no_na_x), FUN = stats::median, na.rm = TRUE)$x - } - levels(original_x) <- insight::format_value(labels, ...) 
+ out <- switch(labels, + mean = stats::aggregate(x, list(no_na_x), FUN = mean, na.rm = TRUE)$x, + median = stats::aggregate(x, list(no_na_x), FUN = stats::median, na.rm = TRUE)$x, + # labels basically like what "cut()" returns + range = levels(cut_result), + # range based on the values that are actually present in the data + { + temp <- stats::aggregate(x, list(no_na_x), FUN = range, na.rm = TRUE)$x + apply(temp, 1, function(i) paste0("(", paste(as.vector(i), collapse = "-"), ")")) + } + ) + levels(original_x) <- insight::format_value(out, ...) } else if (isTRUE(verbose)) { insight::format_warning( "Argument `labels` and levels of the recoded variable are not of the same length.", diff --git a/man/categorize.Rd b/man/categorize.Rd index 28f823dd4..ca013ce2b 100644 --- a/man/categorize.Rd +++ b/man/categorize.Rd @@ -14,6 +14,7 @@ categorize(x, ...) n_groups = NULL, range = NULL, lowest = 1, + breaks = "exclusive", labels = NULL, verbose = TRUE, ... @@ -27,6 +28,7 @@ categorize(x, ...) n_groups = NULL, range = NULL, lowest = 1, + breaks = "exclusive", labels = NULL, append = FALSE, ignore_case = FALSE, @@ -67,10 +69,19 @@ for numeric variables, the minimum of the original input is preserved. For factors, the default minimum is \code{1}. For \code{split = "equal_range"}, the default minimum is always \code{1}, unless specified otherwise in \code{lowest}.} +\item{breaks}{Character, indicating whether breaks for categorizing data are +\code{"inclusive"} (values indicate the \emph{upper} bound of the \emph{previous} group or +interval) or \code{"exclusive"} (values indicate the \emph{lower} bound of the \emph{next} +group or interval to begin). Use \code{labels = "range"} to make this behaviour +easier to see.} + \item{labels}{Character vector of value labels. If not \code{NULL}, \code{categorize()} will returns factors instead of numeric variables, with \code{labels} used -for labelling the factor levels. Can also be \code{"mean"} or \code{"median"} for a -factor with labels as the mean/median of each groups.} +for labelling the factor levels. Can also be \code{"mean"}, \code{"median"}, +\code{"range"} or \code{"observed"} for a factor with labels as the mean/median, +the requested range (even if not all values of that range are present in +the data) or observed range (range of the actual recoded values) of each +group. See 'Examples'.} \item{verbose}{Toggle warnings.} @@ -145,7 +156,7 @@ It is basically a wrapper around base R's \code{cut()}, providing a simplified and more accessible way to define the interval breaks (cut-off values). } \section{Splits and breaks (cut-off values)}{ -Breaks are in general \emph{exclusive}, this means that these values indicate +Breaks are by default \emph{exclusive}, this means that these values indicate the lower bound of the next group or interval to begin. Take a simple example, a numeric variable with values from 1 to 9. The median would be 5, thus the first interval ranges from 1-4 and is recoded into 1, while 5-9 @@ -154,6 +165,9 @@ using \code{split = "quantile"} and \code{n_groups = 3} would define breaks at 3 and 6.33 (see \code{quantile(1:9, probs = c(1/3, 2/3))}), which means that values from 1 to 3 belong to the first interval and are recoded into 1 (because the next interval starts at 3.67), 4 to 6 into 2 and 7 to 9 into 3. 
+ +The opposite behaviour can be achieved using \code{breaks = "inclusive"}, in which +case } \section{Recoding into groups with equal size or range}{ @@ -217,6 +231,13 @@ categorize(x, "equal_length", n_groups = 3, labels = c("low", "mid", "high")) x <- sample(1:10, size = 30, replace = TRUE) categorize(x, "equal_length", n_groups = 3, labels = "mean") categorize(x, "equal_length", n_groups = 3, labels = "median") + +# cut numeric into groups with the requested range as a label name +# each category has the same range, and labels indicate this range +categorize(mtcars$mpg, "equal_length", n_groups = 5, labels = "range") +# in this example, each category has the same range, but labels only refer +# to the ranges of the actual values (present in the data) inside each group +categorize(mtcars$mpg, "equal_length", n_groups = 5, labels = "observed") } \seealso{ \itemize{ diff --git a/tests/testthat/_snaps/categorize.md b/tests/testthat/_snaps/categorize.md new file mode 100644 index 000000000..d08c14c4d --- /dev/null +++ b/tests/testthat/_snaps/categorize.md @@ -0,0 +1,47 @@ +# categorize labelling ranged + + Code + categorize(mtcars$mpg, "equal_length", n_groups = 5) + Output + [1] 3 3 3 3 2 2 1 3 3 2 2 2 2 2 1 1 1 5 5 5 3 2 2 1 2 4 4 5 2 2 1 3 + +--- + + Code + categorize(mtcars$mpg, "equal_length", n_groups = 5, labels = "range") + Output + [1] [19.8,24.5) [19.8,24.5) [19.8,24.5) [19.8,24.5) [15.1,19.8) [15.1,19.8) + [7] [10.4,15.1) [19.8,24.5) [19.8,24.5) [15.1,19.8) [15.1,19.8) [15.1,19.8) + [13] [15.1,19.8) [15.1,19.8) [10.4,15.1) [10.4,15.1) [10.4,15.1) [29.2,33.9] + [19] [29.2,33.9] [29.2,33.9] [19.8,24.5) [15.1,19.8) [15.1,19.8) [10.4,15.1) + [25] [15.1,19.8) [24.5,29.2) [24.5,29.2) [29.2,33.9] [15.1,19.8) [15.1,19.8) + [31] [10.4,15.1) [19.8,24.5) + Levels: [10.4,15.1) [15.1,19.8) [19.8,24.5) [24.5,29.2) [29.2,33.9] + +--- + + Code + categorize(mtcars$mpg, "equal_length", n_groups = 5, labels = "observed") + Output + [1] (21-24.4) (21-24.4) (21-24.4) (21-24.4) (15.2-19.7) (15.2-19.7) + [7] (10.4-15) (21-24.4) (21-24.4) (15.2-19.7) (15.2-19.7) (15.2-19.7) + [13] (15.2-19.7) (15.2-19.7) (10.4-15) (10.4-15) (10.4-15) (30.4-33.9) + [19] (30.4-33.9) (30.4-33.9) (21-24.4) (15.2-19.7) (15.2-19.7) (10.4-15) + [25] (15.2-19.7) (26-27.3) (26-27.3) (30.4-33.9) (15.2-19.7) (15.2-19.7) + [31] (10.4-15) (21-24.4) + Levels: (10.4-15) (15.2-19.7) (21-24.4) (26-27.3) (30.4-33.9) + +--- + + Code + categorize(mtcars$mpg, "equal_length", n_groups = 5, labels = "observed", + breaks = "inclusive") + Output + [1] (21-24.4) (21-24.4) (21-24.4) (21-24.4) (15.2-19.7) (15.2-19.7) + [7] (10.4-15) (21-24.4) (21-24.4) (15.2-19.7) (15.2-19.7) (15.2-19.7) + [13] (15.2-19.7) (15.2-19.7) (10.4-15) (10.4-15) (10.4-15) (30.4-33.9) + [19] (30.4-33.9) (30.4-33.9) (21-24.4) (15.2-19.7) (15.2-19.7) (10.4-15) + [25] (15.2-19.7) (26-27.3) (26-27.3) (30.4-33.9) (15.2-19.7) (15.2-19.7) + [31] (10.4-15) (21-24.4) + Levels: (10.4-15) (15.2-19.7) (21-24.4) (26-27.3) (30.4-33.9) + diff --git a/tests/testthat/test-categorize.R b/tests/testthat/test-categorize.R index 0e0b5d317..9ab8eadde 100644 --- a/tests/testthat/test-categorize.R +++ b/tests/testthat/test-categorize.R @@ -1,5 +1,5 @@ set.seed(123) -d <- sample(1:10, size = 500, replace = TRUE) +d <- sample.int(10, size = 500, replace = TRUE) test_that("recode median", { expect_identical(categorize(d), ifelse(d >= median(d), 2, 1)) @@ -22,7 +22,7 @@ test_that("recode quantile", { }) set.seed(123) -d <- sample(1:100, size = 1000, replace = TRUE) +d <- sample.int(100, size = 1000, 
replace = TRUE) test_that("recode range", { expect_error(categorize(d, split = "range")) @@ -84,7 +84,7 @@ test_that("recode length", { }) set.seed(123) -x <- sample(1:10, size = 30, replace = TRUE) +x <- sample.int(10, size = 30, replace = TRUE) test_that("recode factor labels", { expect_type(categorize(x, "equal_length", n_groups = 3), "double") expect_s3_class(categorize(x, "equal_length", n_groups = 3, labels = c("low", "mid", "high")), "factor") @@ -232,3 +232,21 @@ test_that("categorize regex", { categorize(mtcars, select = "mpg") ) }) + + +# select helpers ------------------------------ +test_that("categorize labelling ranged", { + data(mtcars) + expect_snapshot(categorize(mtcars$mpg, "equal_length", n_groups = 5)) + expect_snapshot(categorize(mtcars$mpg, "equal_length", n_groups = 5, labels = "range")) + expect_snapshot(categorize(mtcars$mpg, "equal_length", n_groups = 5, labels = "observed")) +}) + +test_that("categorize breaks", { + data(mtcars) + expect_snapshot(categorize(mtcars$mpg, "equal_length", n_groups = 5, labels = "observed", breaks = "inclusive")) + expect_error( + categorize(mtcars$mpg, "equal_length", n_groups = 5, breaks = "something"), + regex = "should be one of" + ) +}) From 41aec02ed50f78865d6b9083563c9fa0dc2843d9 Mon Sep 17 00:00:00 2001 From: Daniel Date: Wed, 2 Oct 2024 18:06:36 +0200 Subject: [PATCH 05/19] typo in comment --- tests/testthat/test-categorize.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/testthat/test-categorize.R b/tests/testthat/test-categorize.R index 9ab8eadde..217797893 100644 --- a/tests/testthat/test-categorize.R +++ b/tests/testthat/test-categorize.R @@ -234,7 +234,7 @@ test_that("categorize regex", { }) -# select helpers ------------------------------ +# labelling ranges ------------------------------ test_that("categorize labelling ranged", { data(mtcars) expect_snapshot(categorize(mtcars$mpg, "equal_length", n_groups = 5)) From c4f3f29b0b8c5ef9016eca93328d46231eb62ff2 Mon Sep 17 00:00:00 2001 From: Etienne Bacher <52219252+etiennebacher@users.noreply.github.com> Date: Sun, 6 Oct 2024 12:45:13 +0200 Subject: [PATCH 06/19] Release 0.13.0 (#546) * Increment version number to 0.13.0 * fix description, remove othe cran comments * remove deprecated functions and arguments * forgot to remove one arg * styler * update news * update news * add correct snapshot test * requires insight 0.20.5 --------- Co-authored-by: etiennebacher Co-authored-by: Daniel --- DESCRIPTION | 5 +-- NAMESPACE | 4 -- NEWS.md | 16 +++++++- R/data_partition.R | 8 ---- R/data_select.R | 22 ---------- R/demean.R | 19 +-------- R/descriptives.R | 9 +--- R/extract_column_names.R | 24 +---------- R/mean_sd.R | 14 +------ R/means_by_group.R | 15 ------- R/recode_values.R | 32 --------------- R/rescale_weights.R | 9 +--- R/skewness_kurtosis.R | 64 ----------------------------- R/text_format.R | 9 ---- cran-comments.md | 7 ---- man/coef_var.Rd | 3 -- man/data_partition.Rd | 3 -- man/demean.Rd | 11 ++--- man/extract_column_names.Rd | 22 ---------- man/mean_sd.Rd | 5 +-- man/means_by_group.Rd | 13 +----- man/recode_values.Rd | 15 ------- man/rescale_weights.Rd | 4 +- man/skewness.Rd | 4 -- man/text_format.Rd | 10 ----- tests/testthat/_snaps/categorize.md | 19 ++++----- tests/testthat/test-categorize.R | 2 +- tests/testthat/test-coef_var.R | 4 -- tests/testthat/test-mean_sd.R | 5 --- 29 files changed, 40 insertions(+), 337 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 3d1a0081a..f3e7599af 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 
+1,7 @@ Type: Package Package: datawizard Title: Easy Data Wrangling and Statistical Transformations -Version: 0.12.3.4 +Version: 0.13.0 Authors@R: c( person("Indrajeet", "Patil", , "patilindrajeet.science@gmail.com", role = "aut", comment = c(ORCID = "0000-0003-1995-6531")), @@ -33,7 +33,7 @@ BugReports: https://github.com/easystats/datawizard/issues Depends: R (>= 3.6) Imports: - insight (>= 0.20.4), + insight (>= 0.20.5), stats, utils Suggests: @@ -76,4 +76,3 @@ RoxygenNote: 7.3.2 Config/testthat/edition: 3 Config/testthat/parallel: true Config/Needs/website: easystats/easystatstemplate -Remotes: easystats/insight diff --git a/NAMESPACE b/NAMESPACE index a08798db1..c435c0cc5 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -220,7 +220,6 @@ export(assign_labels) export(categorize) export(center) export(centre) -export(change_code) export(change_scale) export(coef_var) export(coerce_to_numeric) @@ -237,7 +236,6 @@ export(data_codebook) export(data_duplicated) export(data_extract) export(data_filter) -export(data_find) export(data_group) export(data_join) export(data_match) @@ -276,8 +274,6 @@ export(empty_columns) export(empty_rows) export(extract_column_names) export(find_columns) -export(format_text) -export(get_columns) export(kurtosis) export(labels_to_levels) export(mean_sd) diff --git a/NEWS.md b/NEWS.md index 8e329b0fb..f2ade5883 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,16 +1,30 @@ -# datawizard (development) +# datawizard 0.13.0 BREAKING CHANGES * `data_rename()` now errors when the `replacement` argument contains `NA` values or empty strings (#539). +* Removed deprecated functions `get_columns()`, `data_find()`, `format_text()` (#546). + +* Removed deprecated arguments `group` and `na.rm` in multiple functions. Use `by` and `remove_na` instead (#546). + +* The default value for the argument `dummy_factors` in `to_numeric()` has + changed from `TRUE` to `FALSE` (#544). + CHANGES * The `pattern` argument in `data_rename()` can also be a named vector. In this case, names are used as values for the `replacement` argument (i.e. `pattern` can be a character vector using ` = ""`). +* `categorize()` gains a new `breaks` argument, to decide whether breaks are + inclusive or exclusive (#548). + +* The `labels` argument in `categorize()` gets two new options, `"range"` and + `"observed"`, to use the range of categorized values as labels (i.e. factor + levels) (#548). + * Minor additions to `reshape_ci()` to work with forthcoming changes in the `{bayestestR}` package. diff --git a/R/data_partition.R b/R/data_partition.R index 09add9dd7..99f481e18 100644 --- a/R/data_partition.R +++ b/R/data_partition.R @@ -15,7 +15,6 @@ #' @param row_id Character string, indicating the name of the column that #' contains the row-id's. #' @param verbose Toggle messages and warnings. -#' @param group Deprecated. Use `by` instead. #' #' @return A list of data frames. The list includes one training set per given #' proportion and the remaining data as test set. List elements of training @@ -50,17 +49,10 @@ data_partition <- function(data, seed = NULL, row_id = ".row_id", verbose = TRUE, - group = NULL, ...) { # validation checks data <- .coerce_to_dataframe(data) - ## TODO: remove warning in future release - if (!is.null(group)) { - by <- group - insight::format_warning("Argument `group` is deprecated and will be removed in a future release. 
Please use `by` instead.") # nolint - } - if (sum(proportion) > 1) { insight::format_error("Sum of `proportion` cannot be higher than 1.") } diff --git a/R/data_select.R b/R/data_select.R index 0f62ba398..db91fc06b 100644 --- a/R/data_select.R +++ b/R/data_select.R @@ -38,25 +38,3 @@ data_select <- function(data, out <- .replace_attrs(out, a) out } - - -#' @rdname extract_column_names -#' @export -get_columns <- function(data, - select = NULL, - exclude = NULL, - ignore_case = FALSE, - regex = FALSE, - verbose = TRUE, - ...) { - insight::format_warning("Function `get_columns()` is deprecated and will be removed in a future release. Please use `data_select()` instead.") # nolint - data_select( - data, - select = select, - exclude = exclude, - ignore_case = ignore_case, - regex = regex, - verbose = verbose, - ... - ) -} diff --git a/R/demean.R b/R/demean.R index 94bfc255f..b5363edb6 100644 --- a/R/demean.R +++ b/R/demean.R @@ -43,7 +43,6 @@ #' attributes to indicate the within- and between-effects. This is only #' relevant when printing `model_parameters()` - in such cases, the #' within- and between-effects are printed in separated blocks. -#' @param group Deprecated. Use `by` instead. #' @inheritParams center #' #' @return @@ -285,14 +284,7 @@ demean <- function(x, suffix_demean = "_within", suffix_groupmean = "_between", add_attributes = TRUE, - verbose = TRUE, - group = NULL) { - ## TODO: remove warning in future release - if (!is.null(group)) { - by <- group - insight::format_warning("Argument `group` is deprecated and will be removed in a future release. Please use `by` instead.") # nolint - } - + verbose = TRUE) { degroup( x = x, select = select, @@ -317,14 +309,7 @@ degroup <- function(x, suffix_demean = "_within", suffix_groupmean = "_between", add_attributes = TRUE, - verbose = TRUE, - group = NULL) { - ## TODO: remove warning later - if (!is.null(group)) { - by <- group - insight::format_warning("Argument `group` is deprecated and will be removed in a future release. Please use `by` instead.") # nolint - } - + verbose = TRUE) { # ugly tibbles again... x <- .coerce_to_dataframe(x) diff --git a/R/descriptives.R b/R/descriptives.R index 097934d29..43479f697 100644 --- a/R/descriptives.R +++ b/R/descriptives.R @@ -77,7 +77,6 @@ coef_var.default <- function(x, verbose = TRUE, ...) { #' as the nearest endpoint. #' @param remove_na Logical. Should `NA` values be removed before computing (`TRUE`) #' or not (`FALSE`, default)? -#' @param na.rm Deprecated. Please use `remove_na` instead. #' @param n If `method = "unbiased"` and both `mu` and `sigma` are provided (not #' computed from `x`), what sample size to use to adjust the computed CV #' for small-sample bias? @@ -111,13 +110,7 @@ coef_var.default <- function(x, verbose = TRUE, ...) { #' @export coef_var.numeric <- function(x, mu = NULL, sigma = NULL, method = c("standard", "unbiased", "median_mad", "qcd"), - trim = 0, remove_na = FALSE, n = NULL, na.rm = FALSE, ...) { - # TODO: remove deprecated argument later - if (!missing(na.rm)) { - insight::format_warning("Argument `na.rm` is deprecated. Please use `remove_na` instead.") - remove_na <- na.rm - } - + trim = 0, remove_na = FALSE, n = NULL, ...) 
{ # TODO: Support weights if (!missing(x) && all(c(-1, 1) %in% sign(x))) { insight::format_error("Coefficient of variation only applicable for ratio scale variables.") diff --git a/R/extract_column_names.R b/R/extract_column_names.R index b89173a8c..c245f4bab 100644 --- a/R/extract_column_names.R +++ b/R/extract_column_names.R @@ -160,28 +160,6 @@ extract_column_names <- function(data, columns } - -#' @rdname extract_column_names -#' @export -data_find <- function(data, - select = NULL, - exclude = NULL, - ignore_case = FALSE, - regex = FALSE, - verbose = TRUE, - ...) { - insight::format_warning("Function `data_find()` is deprecated and will be removed in a future release. Please use `extract_column_names()` instead.") # nolint - extract_column_names( - data, - select = select, - exclude = exclude, - ignore_case = ignore_case, - regex = regex, - verbose = verbose, - ... - ) -} - #' @rdname extract_column_names #' @export -find_columns <- data_find +find_columns <- extract_column_names diff --git a/R/mean_sd.R b/R/mean_sd.R index d18473d8d..42ce9b523 100644 --- a/R/mean_sd.R +++ b/R/mean_sd.R @@ -20,23 +20,13 @@ #' median_mad(mtcars$mpg) #' #' @export -mean_sd <- function(x, times = 1L, remove_na = TRUE, named = TRUE, na.rm = TRUE, ...) { - # TODO: remove deprecated argument later - if (!missing(na.rm)) { - insight::format_warning("Argument `na.rm` is deprecated. Please use `remove_na` instead.") - remove_na <- na.rm - } +mean_sd <- function(x, times = 1L, remove_na = TRUE, named = TRUE, ...) { .centrality_dispersion(x, type = "mean", times = times, remove_na = remove_na, named = named) } #' @export #' @rdname mean_sd -median_mad <- function(x, times = 1L, remove_na = TRUE, constant = 1.4826, named = TRUE, na.rm = TRUE, ...) { - # TODO: remove deprecated argument later - if (!missing(na.rm)) { - insight::format_warning("Argument `na.rm` is deprecated. Please use `remove_na` instead.") - remove_na <- na.rm - } +median_mad <- function(x, times = 1L, remove_na = TRUE, constant = 1.4826, named = TRUE, ...) { .centrality_dispersion(x, type = "median", times = times, remove_na = remove_na, constant = constant, named = named) } diff --git a/R/means_by_group.R b/R/means_by_group.R index ad188f275..39416bb11 100644 --- a/R/means_by_group.R +++ b/R/means_by_group.R @@ -19,7 +19,6 @@ #' @param digits Optional scalar, indicating the amount of digits after decimal #' point when rounding estimates and values. #' @param ... Currently not used -#' @param group Deprecated. Use `by` instead. #' @inheritParams find_columns #' #' @return A data frame with information on mean and further summary statistics @@ -60,14 +59,7 @@ means_by_group.numeric <- function(x, ci = 0.95, weights = NULL, digits = NULL, - group = NULL, ...) { - ## TODO: remove warning in future release - if (!is.null(group)) { - by <- group - insight::format_warning("Argument `group` is deprecated and will be removed in a future release. Please use `by` instead.") # nolint - } - # validation check for arguments # "by" must be provided @@ -139,14 +131,7 @@ means_by_group.data.frame <- function(x, ignore_case = FALSE, regex = FALSE, verbose = TRUE, - group = NULL, ...) { - ## TODO: remove warning in future release - if (!is.null(group)) { - by <- group - insight::format_warning("Argument `group` is deprecated and will be removed in a future release. 
Please use `by` instead.") # nolint - } - # evaluate select/exclude, may be select-helpers select <- .select_nse(select, x, diff --git a/R/recode_values.R b/R/recode_values.R index b4570bf44..a8e8d6d3b 100644 --- a/R/recode_values.R +++ b/R/recode_values.R @@ -527,35 +527,3 @@ recode_values.data.frame <- function(x, ok } - - -## TODO Deprecate and remove alias later - -#' @rdname recode_values -#' @export -change_code <- function(x, - select = NULL, - exclude = NULL, - recode = NULL, - default = NULL, - preserve_na = TRUE, - append = FALSE, - ignore_case = FALSE, - regex = FALSE, - verbose = TRUE, - ...) { - insight::format_warning("Function `change_code()` is deprecated. Please use `recode_values()` instead.") # nolint - recode_values( - x, - select = select, - exclude = exclude, - recode = recode, - default = default, - preserve_na = preserve_na, - append = append, - ignore_case = ignore_case, - regex = regex, - verbose = verbose, - ... - ) -} diff --git a/R/rescale_weights.R b/R/rescale_weights.R index 02aab1d2e..60d405c9d 100644 --- a/R/rescale_weights.R +++ b/R/rescale_weights.R @@ -20,7 +20,6 @@ #' @param nest Logical, if `TRUE` and `by` indicates at least two #' group variables, then groups are "nested", i.e. groups are now a #' combination from each group level of the variables in `by`. -#' @param group Deprecated. Use `by` instead. #' #' @return `data`, including the new weighting variables: `pweights_a` #' and `pweights_b`, which represent the rescaled design weights to use @@ -88,13 +87,7 @@ #' ) #' } #' @export -rescale_weights <- function(data, by, probability_weights, nest = FALSE, group = NULL) { - ## TODO: remove warning in future release - if (!is.null(group)) { - by <- group - insight::format_warning("Argument `group` is deprecated and will be removed in a future release. Please use `by` instead.") # nolint - } - +rescale_weights <- function(data, by, probability_weights, nest = FALSE) { if (inherits(by, "formula")) { by <- all.vars(by) } diff --git a/R/skewness_kurtosis.R b/R/skewness_kurtosis.R index 6142c59ad..23ced0a04 100644 --- a/R/skewness_kurtosis.R +++ b/R/skewness_kurtosis.R @@ -110,15 +110,7 @@ skewness.numeric <- function(x, type = "2", iterations = NULL, verbose = TRUE, - na.rm = TRUE, ...) { - # TODO: remove deprecated argument later - if (!missing(na.rm)) { - # TODO: add deprecation warning in a later update - insight::format_warning("Argument `na.rm` is deprecated and will be removed in a future release. Please use `remove_na` instead.") # nolint - remove_na <- na.rm - } - if (remove_na) x <- x[!is.na(x)] n <- length(x) out <- (sum((x - mean(x))^3) / n) / (sum((x - mean(x))^2) / n)^1.5 @@ -177,15 +169,7 @@ skewness.matrix <- function(x, remove_na = TRUE, type = "2", iterations = NULL, - na.rm = TRUE, ...) { - # TODO: remove deprecated argument later - if (!missing(na.rm)) { - # TODO: add deprecation warning in a later update - insight::format_warning("Argument `na.rm` is deprecated and will be removed in a future release. Please use `remove_na` instead.") # nolint - remove_na <- na.rm - } - .skewness <- apply( x, 2, @@ -213,15 +197,7 @@ skewness.data.frame <- function(x, remove_na = TRUE, type = "2", iterations = NULL, - na.rm = TRUE, ...) { - # TODO: remove deprecated argument later - if (!missing(na.rm)) { - # TODO: add deprecation warning in a later update - insight::format_warning("Argument `na.rm` is deprecated and will be removed in a future release. 
Please use `remove_na` instead.") # nolint - remove_na <- na.rm - } - .skewness <- lapply(x, skewness, remove_na = remove_na, @@ -241,15 +217,7 @@ skewness.default <- function(x, remove_na = TRUE, type = "2", iterations = NULL, - na.rm = TRUE, ...) { - # TODO: remove deprecated argument later - if (!missing(na.rm)) { - # TODO: add deprecation warning in a later update - insight::format_warning("Argument `na.rm` is deprecated and will be removed in a future release. Please use `remove_na` instead.") # nolint - remove_na <- na.rm - } - skewness( .factor_to_numeric(x), remove_na = remove_na, @@ -277,15 +245,7 @@ kurtosis.numeric <- function(x, type = "2", iterations = NULL, verbose = TRUE, - na.rm = TRUE, ...) { - # TODO: remove deprecated argument later - if (!missing(na.rm)) { - # TODO: add deprecation warning in a later update - insight::format_warning("Argument `na.rm` is deprecated and will be removed in a future release. Please use `remove_na` instead.") # nolint - remove_na <- na.rm - } - if (remove_na) x <- x[!is.na(x)] n <- length(x) out <- n * sum((x - mean(x))^4) / (sum((x - mean(x))^2)^2) @@ -342,15 +302,7 @@ kurtosis.matrix <- function(x, remove_na = TRUE, type = "2", iterations = NULL, - na.rm = TRUE, ...) { - # TODO: remove deprecated argument later - if (!missing(na.rm)) { - # TODO: add deprecation warning in a later update - insight::format_warning("Argument `na.rm` is deprecated and will be removed in a future release. Please use `remove_na` instead.") # nolint - remove_na <- na.rm - } - .kurtosis <- apply( x, 2, @@ -374,15 +326,7 @@ kurtosis.data.frame <- function(x, remove_na = TRUE, type = "2", iterations = NULL, - na.rm = TRUE, ...) { - # TODO: remove deprecated argument later - if (!missing(na.rm)) { - # TODO: add deprecation warning in a later update - insight::format_warning("Argument `na.rm` is deprecated and will be removed in a future release. Please use `remove_na` instead.") # nolint - remove_na <- na.rm - } - .kurtosis <- lapply(x, kurtosis, remove_na = remove_na, @@ -400,15 +344,7 @@ kurtosis.default <- function(x, remove_na = TRUE, type = "2", iterations = NULL, - na.rm = TRUE, ...) { - # TODO: remove deprecated argument later - if (!missing(na.rm)) { - # TODO: add deprecation warning in a later update - insight::format_warning("Argument `na.rm` is deprecated and will be removed in a future release. Please use `remove_na` instead.") # nolint - remove_na <- na.rm - } - kurtosis( .factor_to_numeric(x), remove_na = remove_na, diff --git a/R/text_format.R b/R/text_format.R index afdf4f861..0fa75bcac 100644 --- a/R/text_format.R +++ b/R/text_format.R @@ -42,15 +42,6 @@ text_format <- function(text, sep = ", ", last = " and ", width = NULL, enclose text_wrap(text_concatenate(text, sep = sep, last = last, enclose = enclose), width = width) } -## TODO Deprecate and remove alias later - -#' @rdname text_format -#' @export -format_text <- function(text, sep = ", ", last = " and ", width = NULL, enclose = NULL, ...) { - insight::format_warning("Function `format_text()` is deprecated and will be removed in a future release. Please use `text_format()` instead.") # nolint - text_format(text, sep = sep, last = last, width = width, enclose = enclose, ...) 
-} - #' @rdname text_format #' @export text_fullstop <- function(text) { diff --git a/cran-comments.md b/cran-comments.md index 2c30d1287..095f22e9a 100644 --- a/cran-comments.md +++ b/cran-comments.md @@ -9,10 +9,3 @@ We checked 18 reverse dependencies, comparing R CMD check results across CRAN an * We saw 0 new problems * We failed to check 0 packages -## Other comments - -This is a patch release that should (hopefully) fix a failure occurring on macOS -when building vignettes. This only happens on macOS with R 4.3. We tried to -reproduce this locally and in CI with the same setup, but we couldn't. Hence, we -removed all vignettes (except for one "Overview"), they are now only available -on the website. diff --git a/man/coef_var.Rd b/man/coef_var.Rd index 92274ca59..2ff973838 100644 --- a/man/coef_var.Rd +++ b/man/coef_var.Rd @@ -19,7 +19,6 @@ distribution_coef_var(x, ...) trim = 0, remove_na = FALSE, n = NULL, - na.rm = FALSE, ... ) } @@ -52,8 +51,6 @@ or not (\code{FALSE}, default)?} \item{n}{If \code{method = "unbiased"} and both \code{mu} and \code{sigma} are provided (not computed from \code{x}), what sample size to use to adjust the computed CV for small-sample bias?} - -\item{na.rm}{Deprecated. Please use \code{remove_na} instead.} } \value{ The computed coefficient of variation for \code{x}. diff --git a/man/data_partition.Rd b/man/data_partition.Rd index 68ac05a19..1150b4f28 100644 --- a/man/data_partition.Rd +++ b/man/data_partition.Rd @@ -11,7 +11,6 @@ data_partition( seed = NULL, row_id = ".row_id", verbose = TRUE, - group = NULL, ... ) } @@ -33,8 +32,6 @@ contains the row-id's.} \item{verbose}{Toggle messages and warnings.} -\item{group}{Deprecated. Use \code{by} instead.} - \item{...}{Other arguments passed to or from other functions.} } \value{ diff --git a/man/demean.Rd b/man/demean.Rd index 8a9a49308..fb4db3a29 100644 --- a/man/demean.Rd +++ b/man/demean.Rd @@ -14,8 +14,7 @@ demean( suffix_demean = "_within", suffix_groupmean = "_between", add_attributes = TRUE, - verbose = TRUE, - group = NULL + verbose = TRUE ) degroup( @@ -27,8 +26,7 @@ degroup( suffix_demean = "_within", suffix_groupmean = "_between", add_attributes = TRUE, - verbose = TRUE, - group = NULL + verbose = TRUE ) detrend( @@ -40,8 +38,7 @@ detrend( suffix_demean = "_within", suffix_groupmean = "_between", add_attributes = TRUE, - verbose = TRUE, - group = NULL + verbose = TRUE ) } \arguments{ @@ -86,8 +83,6 @@ within- and between-effects are printed in separated blocks.} \item{verbose}{Toggle warnings and messages.} -\item{group}{Deprecated. Use \code{by} instead.} - \item{center}{Method for centering. \code{demean()} always performs mean-centering, while \code{degroup()} can use \code{center = "median"} or \code{center = "mode"} for median- or mode-centering, and also \code{"min"} diff --git a/man/extract_column_names.Rd b/man/extract_column_names.Rd index 6805d9569..6e658ab33 100644 --- a/man/extract_column_names.Rd +++ b/man/extract_column_names.Rd @@ -2,9 +2,7 @@ % Please edit documentation in R/data_select.R, R/extract_column_names.R \name{data_select} \alias{data_select} -\alias{get_columns} \alias{extract_column_names} -\alias{data_find} \alias{find_columns} \title{Find or get columns in a data frame based on search patterns} \usage{ @@ -18,16 +16,6 @@ data_select( ... ) -get_columns( - data, - select = NULL, - exclude = NULL, - ignore_case = FALSE, - regex = FALSE, - verbose = TRUE, - ... -) - extract_column_names( data, select = NULL, @@ -38,16 +26,6 @@ extract_column_names( ... 
) -data_find( - data, - select = NULL, - exclude = NULL, - ignore_case = FALSE, - regex = FALSE, - verbose = TRUE, - ... -) - find_columns( data, select = NULL, diff --git a/man/mean_sd.Rd b/man/mean_sd.Rd index f0ea239f8..33eeb4bc5 100644 --- a/man/mean_sd.Rd +++ b/man/mean_sd.Rd @@ -5,7 +5,7 @@ \alias{median_mad} \title{Summary Helpers} \usage{ -mean_sd(x, times = 1L, remove_na = TRUE, named = TRUE, na.rm = TRUE, ...) +mean_sd(x, times = 1L, remove_na = TRUE, named = TRUE, ...) median_mad( x, @@ -13,7 +13,6 @@ median_mad( remove_na = TRUE, constant = 1.4826, named = TRUE, - na.rm = TRUE, ... ) } @@ -29,8 +28,6 @@ or not (\code{FALSE}, default)?} \item{named}{Should the vector be named? (E.g., \code{c("-SD" = -1, Mean = 1, "+SD" = 2)}.)} -\item{na.rm}{Deprecated. Please use \code{remove_na} instead.} - \item{...}{Not used.} \item{constant}{scale factor.} diff --git a/man/means_by_group.Rd b/man/means_by_group.Rd index d7a6dfc96..ba2a7d0c8 100644 --- a/man/means_by_group.Rd +++ b/man/means_by_group.Rd @@ -8,15 +8,7 @@ \usage{ means_by_group(x, ...) -\method{means_by_group}{numeric}( - x, - by = NULL, - ci = 0.95, - weights = NULL, - digits = NULL, - group = NULL, - ... -) +\method{means_by_group}{numeric}(x, by = NULL, ci = 0.95, weights = NULL, digits = NULL, ...) \method{means_by_group}{data.frame}( x, @@ -29,7 +21,6 @@ means_by_group(x, ...) ignore_case = FALSE, regex = FALSE, verbose = TRUE, - group = NULL, ... ) } @@ -56,8 +47,6 @@ weights are used.} \item{digits}{Optional scalar, indicating the amount of digits after decimal point when rounding estimates and values.} -\item{group}{Deprecated. Use \code{by} instead.} - \item{select}{Variables that will be included when performing the required tasks. Can be either \itemize{ diff --git a/man/recode_values.Rd b/man/recode_values.Rd index 9810c0a2d..baa7afda9 100644 --- a/man/recode_values.Rd +++ b/man/recode_values.Rd @@ -4,7 +4,6 @@ \alias{recode_values} \alias{recode_values.numeric} \alias{recode_values.data.frame} -\alias{change_code} \title{Recode old values of variables into new values} \usage{ recode_values(x, ...) @@ -31,20 +30,6 @@ recode_values(x, ...) verbose = TRUE, ... ) - -change_code( - x, - select = NULL, - exclude = NULL, - recode = NULL, - default = NULL, - preserve_na = TRUE, - append = FALSE, - ignore_case = FALSE, - regex = FALSE, - verbose = TRUE, - ... -) } \arguments{ \item{x}{A data frame, numeric or character vector, or factor.} diff --git a/man/rescale_weights.Rd b/man/rescale_weights.Rd index 4a67d4100..d9651decb 100644 --- a/man/rescale_weights.Rd +++ b/man/rescale_weights.Rd @@ -4,7 +4,7 @@ \alias{rescale_weights} \title{Rescale design weights for multilevel analysis} \usage{ -rescale_weights(data, by, probability_weights, nest = FALSE, group = NULL) +rescale_weights(data, by, probability_weights, nest = FALSE) } \arguments{ \item{data}{A data frame.} @@ -21,8 +21,6 @@ sampling) weights of the survey data (level-1-weight).} \item{nest}{Logical, if \code{TRUE} and \code{by} indicates at least two group variables, then groups are "nested", i.e. groups are now a combination from each group level of the variables in \code{by}.} - -\item{group}{Deprecated. Use \code{by} instead.} } \value{ \code{data}, including the new weighting variables: \code{pweights_a} diff --git a/man/skewness.Rd b/man/skewness.Rd index a89d98067..0401e3a40 100644 --- a/man/skewness.Rd +++ b/man/skewness.Rd @@ -19,7 +19,6 @@ skewness(x, ...) type = "2", iterations = NULL, verbose = TRUE, - na.rm = TRUE, ... 
) @@ -31,7 +30,6 @@ kurtosis(x, ...) type = "2", iterations = NULL, verbose = TRUE, - na.rm = TRUE, ... ) @@ -61,8 +59,6 @@ errors. If \code{NULL} (default), parametric standard errors are computed.} \item{verbose}{Toggle warnings and messages.} -\item{na.rm}{Deprecated. Please use \code{remove_na} instead.} - \item{digits}{Number of decimal places.} \item{test}{Logical, if \code{TRUE}, tests if skewness or kurtosis is diff --git a/man/text_format.Rd b/man/text_format.Rd index 5f246731f..14d64b096 100644 --- a/man/text_format.Rd +++ b/man/text_format.Rd @@ -2,7 +2,6 @@ % Please edit documentation in R/text_format.R \name{text_format} \alias{text_format} -\alias{format_text} \alias{text_fullstop} \alias{text_lastchar} \alias{text_concatenate} @@ -20,15 +19,6 @@ text_format( ... ) -format_text( - text, - sep = ", ", - last = " and ", - width = NULL, - enclose = NULL, - ... -) - text_fullstop(text) text_lastchar(text, n = 1) diff --git a/tests/testthat/_snaps/categorize.md b/tests/testthat/_snaps/categorize.md index d08c14c4d..9ed3c1115 100644 --- a/tests/testthat/_snaps/categorize.md +++ b/tests/testthat/_snaps/categorize.md @@ -31,17 +31,16 @@ [31] (10.4-15) (21-24.4) Levels: (10.4-15) (15.2-19.7) (21-24.4) (26-27.3) (30.4-33.9) ---- +# categorize breaks Code - categorize(mtcars$mpg, "equal_length", n_groups = 5, labels = "observed", - breaks = "inclusive") + categorize(mtcars$mpg, "equal_length", n_groups = 5, labels = "range", breaks = "inclusive") Output - [1] (21-24.4) (21-24.4) (21-24.4) (21-24.4) (15.2-19.7) (15.2-19.7) - [7] (10.4-15) (21-24.4) (21-24.4) (15.2-19.7) (15.2-19.7) (15.2-19.7) - [13] (15.2-19.7) (15.2-19.7) (10.4-15) (10.4-15) (10.4-15) (30.4-33.9) - [19] (30.4-33.9) (30.4-33.9) (21-24.4) (15.2-19.7) (15.2-19.7) (10.4-15) - [25] (15.2-19.7) (26-27.3) (26-27.3) (30.4-33.9) (15.2-19.7) (15.2-19.7) - [31] (10.4-15) (21-24.4) - Levels: (10.4-15) (15.2-19.7) (21-24.4) (26-27.3) (30.4-33.9) + [1] (19.8,24.5] (19.8,24.5] (19.8,24.5] (19.8,24.5] (15.1,19.8] (15.1,19.8] + [7] [10.4,15.1] (19.8,24.5] (19.8,24.5] (15.1,19.8] (15.1,19.8] (15.1,19.8] + [13] (15.1,19.8] (15.1,19.8] [10.4,15.1] [10.4,15.1] [10.4,15.1] (29.2,33.9] + [19] (29.2,33.9] (29.2,33.9] (19.8,24.5] (15.1,19.8] (15.1,19.8] [10.4,15.1] + [25] (15.1,19.8] (24.5,29.2] (24.5,29.2] (29.2,33.9] (15.1,19.8] (15.1,19.8] + [31] [10.4,15.1] (19.8,24.5] + Levels: [10.4,15.1] (15.1,19.8] (19.8,24.5] (24.5,29.2] (29.2,33.9] diff --git a/tests/testthat/test-categorize.R b/tests/testthat/test-categorize.R index 217797893..30453d9ad 100644 --- a/tests/testthat/test-categorize.R +++ b/tests/testthat/test-categorize.R @@ -244,7 +244,7 @@ test_that("categorize labelling ranged", { test_that("categorize breaks", { data(mtcars) - expect_snapshot(categorize(mtcars$mpg, "equal_length", n_groups = 5, labels = "observed", breaks = "inclusive")) + expect_snapshot(categorize(mtcars$mpg, "equal_length", n_groups = 5, labels = "range", breaks = "inclusive")) expect_error( categorize(mtcars$mpg, "equal_length", n_groups = 5, breaks = "something"), regex = "should be one of" diff --git a/tests/testthat/test-coef_var.R b/tests/testthat/test-coef_var.R index a55eb7b96..2ae2275cd 100644 --- a/tests/testthat/test-coef_var.R +++ b/tests/testthat/test-coef_var.R @@ -29,10 +29,6 @@ test_that("coef_var: argument 'remove_na' works", { ) }) -test_that("coef_var: deprecation warning", { - expect_warning(coef_var(c(1:10, NA), na.rm = TRUE)) -}) - test_that("coef_var: method 'unbiased' needs argument 'n' when sigma and mu are provided", { expect_error( 
coef_var(1:10, method = "unbiased", mu = 10, sigma = 20), diff --git a/tests/testthat/test-mean_sd.R b/tests/testthat/test-mean_sd.R index e0af8a0f1..3e0829fb1 100644 --- a/tests/testthat/test-mean_sd.R +++ b/tests/testthat/test-mean_sd.R @@ -15,8 +15,3 @@ test_that("mean_sd", { expect_equal(unname(diff(msd2)), rep(sd(mtcars[["mpg"]]), 6), tolerance = 0.00001) expect_named(msd2, c("-3 SD", "-2 SD", "-1 SD", "Mean", "+1 SD", "+2 SD", "+3 SD")) }) - -test_that("deprecation warning for `na.rm`", { - expect_warning(mean_sd(c(-1, 0, 1, NA), na.rm = TRUE)) - expect_warning(median_mad(c(-1, 0, 1, 2, 3, NA), na.rm = TRUE)) -}) From efa7df074d99a4e2f0467a0c35daa024e22cb0c2 Mon Sep 17 00:00:00 2001 From: etiennebacher Date: Mon, 7 Oct 2024 21:51:11 +0200 Subject: [PATCH 07/19] Remove "lifecycle" badge, following https://github.com/easystats/easystats/issues/432 --- README.Rmd | 2 +- README.md | 140 ++++++++++++++++++----------------------------------- 2 files changed, 47 insertions(+), 95 deletions(-) diff --git a/README.Rmd b/README.Rmd index ec0d01df7..39b8825ad 100644 --- a/README.Rmd +++ b/README.Rmd @@ -19,7 +19,7 @@ library(datawizard) [![DOI](https://joss.theoj.org/papers/10.21105/joss.04684/status.svg)](https://doi.org/10.21105/joss.04684) [![downloads](http://cranlogs.r-pkg.org/badges/datawizard)](https://cran.r-project.org/package=datawizard) -[![total](https://cranlogs.r-pkg.org/badges/grand-total/datawizard)](https://cranlogs.r-pkg.org/) [![lifecycle](https://img.shields.io/badge/lifecycle-maturing-blue.svg)](https://lifecycle.r-lib.org/articles/stages.html) +[![total](https://cranlogs.r-pkg.org/badges/grand-total/datawizard)](https://cranlogs.r-pkg.org/) diff --git a/README.md b/README.md index 712449df9..dd046ca12 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,6 @@ [![DOI](https://joss.theoj.org/papers/10.21105/joss.04684/status.svg)](https://doi.org/10.21105/joss.04684) [![downloads](http://cranlogs.r-pkg.org/badges/datawizard)](https://cran.r-project.org/package=datawizard) [![total](https://cranlogs.r-pkg.org/badges/grand-total/datawizard)](https://cranlogs.r-pkg.org/) -[![lifecycle](https://img.shields.io/badge/lifecycle-maturing-blue.svg)](https://lifecycle.r-lib.org/articles/stages.html) @@ -50,11 +49,11 @@ It covers two aspects of data preparation: badge](https://easystats.r-universe.dev/badges/datawizard)](https://easystats.r-universe.dev) [![R-CMD-check](https://github.com/easystats/datawizard/workflows/R-CMD-check/badge.svg?branch=main)](https://github.com/easystats/datawizard/actions) -| Type | Source | Command | -|----|----|----| -| Release | CRAN | `install.packages("datawizard")` | +| Type | Source | Command | +|-------------|------------|------------------------------------------------------------------------------| +| Release | CRAN | `install.packages("datawizard")` | | Development | r-universe | `install.packages("datawizard", repos = "https://easystats.r-universe.dev")` | -| Development | GitHub | `remotes::install_github("easystats/datawizard")` | +| Development | GitHub | `remotes::install_github("easystats/datawizard")` | > **Tip** > @@ -71,9 +70,10 @@ To cite the package, run the following command: citation("datawizard") To cite package 'datawizard' in publications use: - Patil et al., (2022). datawizard: An R Package for Easy Data - Preparation and Statistical Transformations. Journal of Open Source - Software, 7(78), 4684, https://doi.org/10.21105/joss.04684 + Patil et al., (2022). 
datawizard: An R Package for Easy + Data Preparation and Statistical Transformations. Journal + of Open Source Software, 7(78), 4684, + https://doi.org/10.21105/joss.04684 A BibTeX entry for LaTeX users is @@ -136,9 +136,6 @@ columns, can be achieved using `extract_column_names()` or # find column names matching a pattern extract_column_names(iris, starts_with("Sepal")) #> [1] "Sepal.Length" "Sepal.Width" -``` - -``` r # return data columns matching a pattern data_select(iris, starts_with("Sepal")) |> head() @@ -156,10 +153,8 @@ It is also possible to extract one or more variables: ``` r # single variable data_extract(mtcars, "gear") -#> [1] 4 4 4 3 3 3 3 4 4 4 4 3 3 3 3 3 3 4 4 4 3 3 3 3 3 4 5 5 5 5 5 4 -``` - -``` r +#> [1] 4 4 4 3 3 3 3 4 4 4 4 3 3 3 3 3 3 4 4 4 3 3 3 3 3 4 5 5 5 5 5 +#> [32] 4 # more variables head(data_extract(iris, ends_with("Width"))) @@ -220,17 +215,11 @@ x #> 1 1 a 5 1 #> 2 2 b 6 2 #> 3 3 c 7 3 -``` - -``` r y #> c d e id #> 1 6 f 100 2 #> 2 7 g 101 3 #> 3 8 h 102 4 -``` - -``` r data_merge(x, y, join = "full") #> a b c id d e @@ -238,50 +227,32 @@ data_merge(x, y, join = "full") #> 1 2 b 6 2 f 100 #> 2 3 c 7 3 g 101 #> 4 NA 8 4 h 102 -``` - -``` r data_merge(x, y, join = "left") #> a b c id d e #> 3 1 a 5 1 NA #> 1 2 b 6 2 f 100 #> 2 3 c 7 3 g 101 -``` - -``` r data_merge(x, y, join = "right") #> a b c id d e #> 1 2 b 6 2 f 100 #> 2 3 c 7 3 g 101 #> 3 NA 8 4 h 102 -``` - -``` r data_merge(x, y, join = "semi", by = "c") #> a b c id #> 2 2 b 6 2 #> 3 3 c 7 3 -``` - -``` r data_merge(x, y, join = "anti", by = "c") #> a b c id #> 1 1 a 5 1 -``` - -``` r data_merge(x, y, join = "inner") #> a b c id d e #> 1 2 b 6 2 f 100 #> 2 3 c 7 3 g 101 -``` - -``` r data_merge(x, y, join = "bind") #> a b c id d e @@ -322,17 +293,28 @@ data_to_wide(long_data, values_from = "value", id_cols = "Row_ID" ) -#> Row_ID X1 X2 X3 X4 X5 -#> 1 1 -0.08281164 -1.12490028 -0.70632036 -0.7027895 0.07633326 -#> 2 2 1.93468099 -0.87430362 0.96687656 0.2998642 -0.23035595 -#> 3 3 -2.05128979 0.04386162 -0.71016648 1.1494697 0.31746484 -#> 4 4 0.27773897 -0.58397514 -0.05917365 -0.3016415 -1.59268440 -#> 5 5 -1.52596060 -0.82329858 -0.23094342 -0.5473394 -0.18194062 -#> 6 6 -0.26916362 0.11059280 0.69200045 -0.3854041 1.75614174 -#> 7 7 1.23305388 0.36472778 1.35682290 0.2763720 0.11394932 -#> 8 8 0.63360774 0.05370100 1.78872284 0.1518608 -0.29216508 -#> 9 9 0.35271746 1.36867235 0.41071582 -0.4313808 1.75409316 -#> 10 10 -0.56048248 -0.38045724 -2.18785470 -1.8705001 1.80958455 +#> Row_ID X1 X2 X3 X4 +#> 1 1 -0.08281164 -1.12490028 -0.70632036 -0.7027895 +#> 2 2 1.93468099 -0.87430362 0.96687656 0.2998642 +#> 3 3 -2.05128979 0.04386162 -0.71016648 1.1494697 +#> 4 4 0.27773897 -0.58397514 -0.05917365 -0.3016415 +#> 5 5 -1.52596060 -0.82329858 -0.23094342 -0.5473394 +#> 6 6 -0.26916362 0.11059280 0.69200045 -0.3854041 +#> 7 7 1.23305388 0.36472778 1.35682290 0.2763720 +#> 8 8 0.63360774 0.05370100 1.78872284 0.1518608 +#> 9 9 0.35271746 1.36867235 0.41071582 -0.4313808 +#> 10 10 -0.56048248 -0.38045724 -2.18785470 -1.8705001 +#> X5 +#> 1 0.07633326 +#> 2 -0.23035595 +#> 3 0.31746484 +#> 4 -1.59268440 +#> 5 -0.18194062 +#> 6 1.75614174 +#> 7 0.11394932 +#> 8 -0.29216508 +#> 9 1.75409316 +#> 10 1.80958455 ``` ### Empty rows and columns @@ -352,22 +334,13 @@ tmp #> 3 3 3 NA 3 #> 4 NA NA NA NA #> 5 5 5 NA 5 -``` - -``` r # indices of empty columns or rows empty_columns(tmp) #> c #> 3 -``` - -``` r empty_rows(tmp) #> [1] 4 -``` - -``` r # remove empty columns or rows remove_empty_columns(tmp) @@ -377,18 
+350,12 @@ remove_empty_columns(tmp) #> 3 3 3 3 #> 4 NA NA NA #> 5 5 5 5 -``` - -``` r remove_empty_rows(tmp) #> a b c d #> 1 1 1 NA 1 #> 2 2 NA NA NA #> 3 3 3 NA 3 #> 5 5 5 NA 5 -``` - -``` r # remove empty columns and rows remove_empty(tmp) @@ -409,9 +376,6 @@ table(x) #> x #> 1 2 3 4 5 6 7 8 9 10 #> 2 3 5 3 7 5 5 2 11 7 -``` - -``` r # cut into 3 groups, based on distribution (quantiles) table(categorize(x, split = "quantile", n_groups = 3)) @@ -445,26 +409,23 @@ summary(swiss) #> Mean : 41.144 Mean :19.94 #> 3rd Qu.: 93.125 3rd Qu.:21.70 #> Max. :100.000 Max. :26.60 -``` - -``` r # after summary(standardize(swiss)) -#> Fertility Agriculture Examination Education -#> Min. :-2.81327 Min. :-2.1778 Min. :-1.69084 Min. :-1.0378 -#> 1st Qu.:-0.43569 1st Qu.:-0.6499 1st Qu.:-0.56273 1st Qu.:-0.5178 -#> Median : 0.02061 Median : 0.1515 Median :-0.06134 Median :-0.3098 -#> Mean : 0.00000 Mean : 0.0000 Mean : 0.00000 Mean : 0.0000 -#> 3rd Qu.: 0.66504 3rd Qu.: 0.7481 3rd Qu.: 0.69074 3rd Qu.: 0.1062 -#> Max. : 1.78978 Max. : 1.7190 Max. : 2.57094 Max. : 4.3702 -#> Catholic Infant.Mortality -#> Min. :-0.9350 Min. :-3.13886 -#> 1st Qu.:-0.8620 1st Qu.:-0.61543 -#> Median :-0.6235 Median : 0.01972 -#> Mean : 0.0000 Mean : 0.00000 -#> 3rd Qu.: 1.2464 3rd Qu.: 0.60337 -#> Max. : 1.4113 Max. : 2.28566 +#> Fertility Agriculture Examination +#> Min. :-2.81327 Min. :-2.1778 Min. :-1.69084 +#> 1st Qu.:-0.43569 1st Qu.:-0.6499 1st Qu.:-0.56273 +#> Median : 0.02061 Median : 0.1515 Median :-0.06134 +#> Mean : 0.00000 Mean : 0.0000 Mean : 0.00000 +#> 3rd Qu.: 0.66504 3rd Qu.: 0.7481 3rd Qu.: 0.69074 +#> Max. : 1.78978 Max. : 1.7190 Max. : 2.57094 +#> Education Catholic Infant.Mortality +#> Min. :-1.0378 Min. :-0.9350 Min. :-3.13886 +#> 1st Qu.:-0.5178 1st Qu.:-0.8620 1st Qu.:-0.61543 +#> Median :-0.3098 Median :-0.6235 Median : 0.01972 +#> Mean : 0.0000 Mean : 0.0000 Mean : 0.00000 +#> 3rd Qu.: 0.1062 3rd Qu.: 1.2464 3rd Qu.: 0.60337 +#> Max. : 4.3702 Max. : 1.4113 Max. 
: 2.28566 ``` ### Winsorize @@ -486,9 +447,6 @@ anscombe #> 9 12 12 12 8 10.84 9.13 8.15 5.56 #> 10 7 7 7 8 4.82 7.26 6.42 7.91 #> 11 5 5 5 8 5.68 4.74 5.73 6.89 -``` - -``` r # after winsorize(anscombe) @@ -540,9 +498,6 @@ head(trees) #> 4 10.5 72 16.4 #> 5 10.7 81 18.8 #> 6 10.8 83 19.7 -``` - -``` r # after head(ranktransform(trees)) @@ -575,9 +530,6 @@ x #> Mazda RX4 21.0 6 160 110 #> Mazda RX4 Wag 21.0 6 160 110 #> Datsun 710 22.8 4 108 93 -``` - -``` r data_rotate(x) #> Mazda RX4 Mazda RX4 Wag Datsun 710 From db85541477a95bfd2f56055296fda66bbc61fee1 Mon Sep 17 00:00:00 2001 From: Etienne Bacher <52219252+etiennebacher@users.noreply.github.com> Date: Mon, 7 Oct 2024 23:12:55 +0200 Subject: [PATCH 08/19] Do not error when bootstrapping CIs if sample is too sparse (#550) * Do not error when bootstrapping CIs if sample is too sparse * lint, style * conditional test * typo * style again * Update NEWS.md --- DESCRIPTION | 2 +- NEWS.md | 7 +++++++ R/describe_distribution.R | 23 ++++++++++++++++----- tests/testthat/test-describe_distribution.R | 13 ++++++++++++ 4 files changed, 39 insertions(+), 6 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index f3e7599af..8dd92fb7e 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Type: Package Package: datawizard Title: Easy Data Wrangling and Statistical Transformations -Version: 0.13.0 +Version: 0.13.0.1 Authors@R: c( person("Indrajeet", "Patil", , "patilindrajeet.science@gmail.com", role = "aut", comment = c(ORCID = "0000-0003-1995-6531")), diff --git a/NEWS.md b/NEWS.md index f2ade5883..c28ecc63f 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,10 @@ +# datawizard (development) + +BUG FIXES + +* `describe_distribution()` no longer errors if the sample was too sparse to compute + CIs. Instead, it warns the user and returns `NA` (#550). + # datawizard 0.13.0 BREAKING CHANGES diff --git a/R/describe_distribution.R b/R/describe_distribution.R index 41f2a8b83..2e61c1dc3 100644 --- a/R/describe_distribution.R +++ b/R/describe_distribution.R @@ -186,11 +186,24 @@ describe_distribution.numeric <- function(x, # Confidence Intervals if (!is.null(ci)) { insight::check_if_installed("boot") - results <- boot::boot( - data = x, - statistic = .boot_distribution, - R = iterations, - centrality = centrality + results <- tryCatch( + { + boot::boot( + data = x, + statistic = .boot_distribution, + R = iterations, + centrality = centrality + ) + }, + error = function(e) { + msg <- conditionMessage(e) + if (!is.null(msg) && msg == "sample is too sparse to find TD") { + insight::format_warning( + "When bootstrapping CIs, sample was too sparse to find TD. Returning NA for CIs." 
+ ) + list(t = c(NA_real_, NA_real_)) + } + } ) out_ci <- bayestestR::ci(results$t, ci = ci, verbose = FALSE) out <- cbind(out, data.frame(CI_low = out_ci$CI_low[1], CI_high = out_ci$CI_high[1])) diff --git a/tests/testthat/test-describe_distribution.R b/tests/testthat/test-describe_distribution.R index 83d2abb33..dfa7bf617 100644 --- a/tests/testthat/test-describe_distribution.R +++ b/tests/testthat/test-describe_distribution.R @@ -286,3 +286,16 @@ test_that("describe_distribution formatting", { x <- describe_distribution(iris$Sepal.Width, quartiles = TRUE) expect_snapshot(format(x)) }) + +# other ----------------------------------- + +test_that("return NA in CI if sample is too sparse", { + skip_if_not_installed("bayestestR") + set.seed(123456) + expect_warning( + res <- describe_distribution(mtcars[mtcars$cyl == "6", ], wt, centrality = "map", ci = 0.95), # nolint + "When bootstrapping CIs, sample was too sparse to find TD" + ) + expect_identical(res$CI_low, NA) + expect_identical(res$CI_high, NA) +}) From 9dff2ae6b9d36fe43a72267cccaeb66196519dc7 Mon Sep 17 00:00:00 2001 From: Daniel Date: Thu, 10 Oct 2024 16:26:02 +0200 Subject: [PATCH 09/19] Allow `:` in select strings (#551) * Allow : in select strings * lintr * fix test * test. comment * news item --- DESCRIPTION | 2 +- NEWS.md | 6 ++++ R/extract_column_names.R | 11 ++++-- R/select_nse.R | 57 +++++++++++++++++++++++++------- man/adjust.Rd | 6 ++-- man/assign_labels.Rd | 6 ++-- man/categorize.Rd | 6 ++-- man/center.Rd | 6 ++-- man/convert_na_to.Rd | 6 ++-- man/convert_to_na.Rd | 6 ++-- man/data_codebook.Rd | 6 ++-- man/data_duplicated.Rd | 6 ++-- man/data_extract.Rd | 6 ++-- man/data_group.Rd | 6 ++-- man/data_peek.Rd | 6 ++-- man/data_relocate.Rd | 6 ++-- man/data_rename.Rd | 6 ++-- man/data_replicate.Rd | 6 ++-- man/data_separate.Rd | 6 ++-- man/data_tabulate.Rd | 6 ++-- man/data_to_long.Rd | 6 ++-- man/data_unique.Rd | 6 ++-- man/data_unite.Rd | 6 ++-- man/describe_distribution.Rd | 6 ++-- man/extract_column_names.Rd | 11 ++++-- man/labels_to_levels.Rd | 6 ++-- man/means_by_group.Rd | 6 ++-- man/normalize.Rd | 6 ++-- man/ranktransform.Rd | 6 ++-- man/recode_values.Rd | 6 ++-- man/rescale.Rd | 6 ++-- man/reverse.Rd | 6 ++-- man/row_means.Rd | 6 ++-- man/slide.Rd | 6 ++-- man/standardize.Rd | 6 ++-- man/to_factor.Rd | 6 ++-- man/to_numeric.Rd | 6 ++-- tests/testthat/test-select_nse.R | 21 ++++++++++++ 38 files changed, 217 insertions(+), 83 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 8dd92fb7e..4758f601c 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Type: Package Package: datawizard Title: Easy Data Wrangling and Statistical Transformations -Version: 0.13.0.1 +Version: 0.13.0.2 Authors@R: c( person("Indrajeet", "Patil", , "patilindrajeet.science@gmail.com", role = "aut", comment = c(ORCID = "0000-0003-1995-6531")), diff --git a/NEWS.md b/NEWS.md index c28ecc63f..388c5a822 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,11 @@ # datawizard (development) +CHANGES + +* The `select` argument, which is available in different functions to select + variables, can now also be a character vector with quoted variable names, + including a colon to indicate a range of several variables (e.g. `"cyl:gear"`). + BUG FIXES * `describe_distribution()` no longer errors if the sample was too sparse to compute diff --git a/R/extract_column_names.R b/R/extract_column_names.R index c245f4bab..a3d120d3f 100644 --- a/R/extract_column_names.R +++ b/R/extract_column_names.R @@ -9,8 +9,10 @@ #' tasks. 
Can be either #' #' - a variable specified as a literal variable name (e.g., `column_name`), -#' - a string with the variable name (e.g., `"column_name"`), or a character -#' vector of variable names (e.g., `c("col1", "col2", "col3")`), +#' - a string with the variable name (e.g., `"column_name"`), a character +#' vector of variable names (e.g., `c("col1", "col2", "col3")`), or a +#' character vector of variable names including ranges specified via `:` +#' (e.g., `c("col1:col3", "col5")`), #' - a formula with variable names (e.g., `~column_1 + column_2`), #' - a vector of positive integers, giving the positions counting from the left #' (e.g. `1` or `c(1, 3, 5)`), @@ -116,7 +118,7 @@ #' ``` #' #' @examples -#' # Find columns names by pattern +#' # Find column names by pattern #' extract_column_names(iris, starts_with("Sepal")) #' extract_column_names(iris, ends_with("Width")) #' extract_column_names(iris, regex("\\.")) @@ -129,6 +131,9 @@ #' numeric_mean_35 <- function(x) is.numeric(x) && mean(x, na.rm = TRUE) > 3.5 #' extract_column_names(iris, numeric_mean_35) #' +#' # find range of colum names by range, using character vector +#' extract_column_names(mtcars, c("cyl:hp", "wt")) +#' #' # rename returned columns for "data_select()" #' head(data_select(mtcars, c(`Miles per Gallon` = "mpg", Cylinders = "cyl"))) #' @export diff --git a/R/select_nse.R b/R/select_nse.R index 8f9eba096..5120691a9 100644 --- a/R/select_nse.R +++ b/R/select_nse.R @@ -139,6 +139,7 @@ # Possibilities: # - quoted variable name # - quoted variable name with ignore case +# - quoted variable name with colon, to indicate range # - character that should be regex-ed on variable names # - special word "all" to return all vars @@ -146,31 +147,63 @@ # use colnames because names() doesn't work for matrices columns <- colnames(data) if (isTRUE(regex)) { + # string is a regular expression grep(x, columns) } else if (length(x) == 1L && x == "all") { + # string is "all" - select all columns seq_along(data) + } else if (any(grepl(":", x, fixed = TRUE))) { + # special pattern, as string (e.g.select = c("cyl:hp", "am")). However, + # this will first go into `.eval_call()` and thus only single elements + # are passed in `x` - we have never a character *vector* here + # check for valid names + colon_vars <- unlist(strsplit(x, ":", fixed = TRUE)) + colon_match <- match(colon_vars, columns) + if (anyNA(colon_match)) { + .warn_not_found(colon_vars, columns, colon_match, verbose) + matches <- NA + } else { + start_pos <- match(colon_vars[1], columns) + end_pos <- match(colon_vars[2], columns) + if (!is.na(start_pos) && !is.na(end_pos)) { + matches <- start_pos:end_pos + } else { + matches <- NA + } + } + matches[!is.na(matches)] } else if (isTRUE(ignore_case)) { + # find columns, case insensitive matches <- match(toupper(x), toupper(columns)) matches[!is.na(matches)] } else { + # find columns, case sensitive matches <- match(x, columns) - if (anyNA(matches) && verbose) { - insight::format_warning( - paste0( - "Following variable(s) were not found: ", - toString(x[is.na(matches)]) - ), - .misspelled_string( - columns, - x[is.na(matches)], - default_message = "Possibly misspelled?" 
- ) - ) + if (anyNA(matches)) { + .warn_not_found(x, columns, matches, verbose) } matches[!is.na(matches)] } } +# small helper, to avoid duplicated code +.warn_not_found <- function(x, columns, matches, verbose = TRUE) { + if (verbose) { + insight::format_warning( + paste0( + "Following variable(s) were not found: ", + toString(x[is.na(matches)]) + ), + .misspelled_string( + columns, + x[is.na(matches)], + default_message = "Possibly misspelled?" + ) + ) + } +} + + # 3 types of symbols: # - unquoted variables # - objects that need to be evaluated, e.g data_find(iris, i) where diff --git a/man/adjust.Rd b/man/adjust.Rd index 64e50d9d3..48b321b8f 100644 --- a/man/adjust.Rd +++ b/man/adjust.Rd @@ -43,8 +43,10 @@ out). If \code{NULL} (the default), all variables will be selected.} tasks. Can be either \itemize{ \item a variable specified as a literal variable name (e.g., \code{column_name}), -\item a string with the variable name (e.g., \code{"column_name"}), or a character -vector of variable names (e.g., \code{c("col1", "col2", "col3")}), +\item a string with the variable name (e.g., \code{"column_name"}), a character +vector of variable names (e.g., \code{c("col1", "col2", "col3")}), or a +character vector of variable names including ranges specified via \code{:} +(e.g., \code{c("col1:col3", "col5")}), \item a formula with variable names (e.g., \code{~column_1 + column_2}), \item a vector of positive integers, giving the positions counting from the left (e.g. \code{1} or \code{c(1, 3, 5)}), diff --git a/man/assign_labels.Rd b/man/assign_labels.Rd index cca14cc85..e6fd24252 100644 --- a/man/assign_labels.Rd +++ b/man/assign_labels.Rd @@ -38,8 +38,10 @@ labels are omitted.} tasks. Can be either \itemize{ \item a variable specified as a literal variable name (e.g., \code{column_name}), -\item a string with the variable name (e.g., \code{"column_name"}), or a character -vector of variable names (e.g., \code{c("col1", "col2", "col3")}), +\item a string with the variable name (e.g., \code{"column_name"}), a character +vector of variable names (e.g., \code{c("col1", "col2", "col3")}), or a +character vector of variable names including ranges specified via \code{:} +(e.g., \code{c("col1:col3", "col5")}), \item a formula with variable names (e.g., \code{~column_1 + column_2}), \item a vector of positive integers, giving the positions counting from the left (e.g. \code{1} or \code{c(1, 3, 5)}), diff --git a/man/categorize.Rd b/man/categorize.Rd index ca013ce2b..dbecbf5e6 100644 --- a/man/categorize.Rd +++ b/man/categorize.Rd @@ -89,8 +89,10 @@ group. See 'Examples'.} tasks. Can be either \itemize{ \item a variable specified as a literal variable name (e.g., \code{column_name}), -\item a string with the variable name (e.g., \code{"column_name"}), or a character -vector of variable names (e.g., \code{c("col1", "col2", "col3")}), +\item a string with the variable name (e.g., \code{"column_name"}), a character +vector of variable names (e.g., \code{c("col1", "col2", "col3")}), or a +character vector of variable names including ranges specified via \code{:} +(e.g., \code{c("col1:col3", "col5")}), \item a formula with variable names (e.g., \code{~column_1 + column_2}), \item a vector of positive integers, giving the positions counting from the left (e.g. \code{1} or \code{c(1, 3, 5)}), diff --git a/man/center.Rd b/man/center.Rd index f143f64b2..4774020ab 100644 --- a/man/center.Rd +++ b/man/center.Rd @@ -72,8 +72,10 @@ against the names of the selected variables.} tasks. 
Can be either \itemize{ \item a variable specified as a literal variable name (e.g., \code{column_name}), -\item a string with the variable name (e.g., \code{"column_name"}), or a character -vector of variable names (e.g., \code{c("col1", "col2", "col3")}), +\item a string with the variable name (e.g., \code{"column_name"}), a character +vector of variable names (e.g., \code{c("col1", "col2", "col3")}), or a +character vector of variable names including ranges specified via \code{:} +(e.g., \code{c("col1:col3", "col5")}), \item a formula with variable names (e.g., \code{~column_1 + column_2}), \item a vector of positive integers, giving the positions counting from the left (e.g. \code{1} or \code{c(1, 3, 5)}), diff --git a/man/convert_na_to.Rd b/man/convert_na_to.Rd index 91121ff94..702e0eb2e 100644 --- a/man/convert_na_to.Rd +++ b/man/convert_na_to.Rd @@ -41,8 +41,10 @@ replace \code{NA}.} tasks. Can be either \itemize{ \item a variable specified as a literal variable name (e.g., \code{column_name}), -\item a string with the variable name (e.g., \code{"column_name"}), or a character -vector of variable names (e.g., \code{c("col1", "col2", "col3")}), +\item a string with the variable name (e.g., \code{"column_name"}), a character +vector of variable names (e.g., \code{c("col1", "col2", "col3")}), or a +character vector of variable names including ranges specified via \code{:} +(e.g., \code{c("col1:col3", "col5")}), \item a formula with variable names (e.g., \code{~column_1 + column_2}), \item a vector of positive integers, giving the positions counting from the left (e.g. \code{1} or \code{c(1, 3, 5)}), diff --git a/man/convert_to_na.Rd b/man/convert_to_na.Rd index 2529294b7..fe308d61e 100644 --- a/man/convert_to_na.Rd +++ b/man/convert_to_na.Rd @@ -44,8 +44,10 @@ by \code{NA}, should unused levels be dropped?} tasks. Can be either \itemize{ \item a variable specified as a literal variable name (e.g., \code{column_name}), -\item a string with the variable name (e.g., \code{"column_name"}), or a character -vector of variable names (e.g., \code{c("col1", "col2", "col3")}), +\item a string with the variable name (e.g., \code{"column_name"}), a character +vector of variable names (e.g., \code{c("col1", "col2", "col3")}), or a +character vector of variable names including ranges specified via \code{:} +(e.g., \code{c("col1:col3", "col5")}), \item a formula with variable names (e.g., \code{~column_1 + column_2}), \item a vector of positive integers, giving the positions counting from the left (e.g. \code{1} or \code{c(1, 3, 5)}), diff --git a/man/data_codebook.Rd b/man/data_codebook.Rd index 4c0f935e7..319f4e5b6 100644 --- a/man/data_codebook.Rd +++ b/man/data_codebook.Rd @@ -34,8 +34,10 @@ data_codebook( tasks. Can be either \itemize{ \item a variable specified as a literal variable name (e.g., \code{column_name}), -\item a string with the variable name (e.g., \code{"column_name"}), or a character -vector of variable names (e.g., \code{c("col1", "col2", "col3")}), +\item a string with the variable name (e.g., \code{"column_name"}), a character +vector of variable names (e.g., \code{c("col1", "col2", "col3")}), or a +character vector of variable names including ranges specified via \code{:} +(e.g., \code{c("col1:col3", "col5")}), \item a formula with variable names (e.g., \code{~column_1 + column_2}), \item a vector of positive integers, giving the positions counting from the left (e.g. 
\code{1} or \code{c(1, 3, 5)}), diff --git a/man/data_duplicated.Rd b/man/data_duplicated.Rd index 73c3e8de1..88624c8c8 100644 --- a/man/data_duplicated.Rd +++ b/man/data_duplicated.Rd @@ -20,8 +20,10 @@ data_duplicated( tasks. Can be either \itemize{ \item a variable specified as a literal variable name (e.g., \code{column_name}), -\item a string with the variable name (e.g., \code{"column_name"}), or a character -vector of variable names (e.g., \code{c("col1", "col2", "col3")}), +\item a string with the variable name (e.g., \code{"column_name"}), a character +vector of variable names (e.g., \code{c("col1", "col2", "col3")}), or a +character vector of variable names including ranges specified via \code{:} +(e.g., \code{c("col1:col3", "col5")}), \item a formula with variable names (e.g., \code{~column_1 + column_2}), \item a vector of positive integers, giving the positions counting from the left (e.g. \code{1} or \code{c(1, 3, 5)}), diff --git a/man/data_extract.Rd b/man/data_extract.Rd index a0cd4e402..0b544e710 100644 --- a/man/data_extract.Rd +++ b/man/data_extract.Rd @@ -27,8 +27,10 @@ and data frame extensions (e.g., tibbles).} tasks. Can be either \itemize{ \item a variable specified as a literal variable name (e.g., \code{column_name}), -\item a string with the variable name (e.g., \code{"column_name"}), or a character -vector of variable names (e.g., \code{c("col1", "col2", "col3")}), +\item a string with the variable name (e.g., \code{"column_name"}), a character +vector of variable names (e.g., \code{c("col1", "col2", "col3")}), or a +character vector of variable names including ranges specified via \code{:} +(e.g., \code{c("col1:col3", "col5")}), \item a formula with variable names (e.g., \code{~column_1 + column_2}), \item a vector of positive integers, giving the positions counting from the left (e.g. \code{1} or \code{c(1, 3, 5)}), diff --git a/man/data_group.Rd b/man/data_group.Rd index 56f5f314e..9cb55de5d 100644 --- a/man/data_group.Rd +++ b/man/data_group.Rd @@ -24,8 +24,10 @@ data_ungroup(data, verbose = TRUE, ...) tasks. Can be either \itemize{ \item a variable specified as a literal variable name (e.g., \code{column_name}), -\item a string with the variable name (e.g., \code{"column_name"}), or a character -vector of variable names (e.g., \code{c("col1", "col2", "col3")}), +\item a string with the variable name (e.g., \code{"column_name"}), a character +vector of variable names (e.g., \code{c("col1", "col2", "col3")}), or a +character vector of variable names including ranges specified via \code{:} +(e.g., \code{c("col1:col3", "col5")}), \item a formula with variable names (e.g., \code{~column_1 + column_2}), \item a vector of positive integers, giving the positions counting from the left (e.g. \code{1} or \code{c(1, 3, 5)}), diff --git a/man/data_peek.Rd b/man/data_peek.Rd index 4f3f88e8a..9524c70ec 100644 --- a/man/data_peek.Rd +++ b/man/data_peek.Rd @@ -27,8 +27,10 @@ data_peek(x, ...) tasks. 
Can be either \itemize{ \item a variable specified as a literal variable name (e.g., \code{column_name}), -\item a string with the variable name (e.g., \code{"column_name"}), or a character -vector of variable names (e.g., \code{c("col1", "col2", "col3")}), +\item a string with the variable name (e.g., \code{"column_name"}), a character +vector of variable names (e.g., \code{c("col1", "col2", "col3")}), or a +character vector of variable names including ranges specified via \code{:} +(e.g., \code{c("col1:col3", "col5")}), \item a formula with variable names (e.g., \code{~column_1 + column_2}), \item a vector of positive integers, giving the positions counting from the left (e.g. \code{1} or \code{c(1, 3, 5)}), diff --git a/man/data_relocate.Rd b/man/data_relocate.Rd index 30e4dbbfe..9949b5d27 100644 --- a/man/data_relocate.Rd +++ b/man/data_relocate.Rd @@ -44,8 +44,10 @@ data_remove( tasks. Can be either \itemize{ \item a variable specified as a literal variable name (e.g., \code{column_name}), -\item a string with the variable name (e.g., \code{"column_name"}), or a character -vector of variable names (e.g., \code{c("col1", "col2", "col3")}), +\item a string with the variable name (e.g., \code{"column_name"}), a character +vector of variable names (e.g., \code{c("col1", "col2", "col3")}), or a +character vector of variable names including ranges specified via \code{:} +(e.g., \code{c("col1:col3", "col5")}), \item a formula with variable names (e.g., \code{~column_1 + column_2}), \item a vector of positive integers, giving the positions counting from the left (e.g. \code{1} or \code{c(1, 3, 5)}), diff --git a/man/data_rename.Rd b/man/data_rename.Rd index a45095805..2ff779c21 100644 --- a/man/data_rename.Rd +++ b/man/data_rename.Rd @@ -56,8 +56,10 @@ will be ignored then).} tasks. Can be either \itemize{ \item a variable specified as a literal variable name (e.g., \code{column_name}), -\item a string with the variable name (e.g., \code{"column_name"}), or a character -vector of variable names (e.g., \code{c("col1", "col2", "col3")}), +\item a string with the variable name (e.g., \code{"column_name"}), a character +vector of variable names (e.g., \code{c("col1", "col2", "col3")}), or a +character vector of variable names including ranges specified via \code{:} +(e.g., \code{c("col1:col3", "col5")}), \item a formula with variable names (e.g., \code{~column_1 + column_2}), \item a vector of positive integers, giving the positions counting from the left (e.g. \code{1} or \code{c(1, 3, 5)}), diff --git a/man/data_replicate.Rd b/man/data_replicate.Rd index 35448155d..5a427d570 100644 --- a/man/data_replicate.Rd +++ b/man/data_replicate.Rd @@ -27,8 +27,10 @@ column. Note that the variable indicated by \code{expand} must be an integer vec tasks. Can be either \itemize{ \item a variable specified as a literal variable name (e.g., \code{column_name}), -\item a string with the variable name (e.g., \code{"column_name"}), or a character -vector of variable names (e.g., \code{c("col1", "col2", "col3")}), +\item a string with the variable name (e.g., \code{"column_name"}), a character +vector of variable names (e.g., \code{c("col1", "col2", "col3")}), or a +character vector of variable names including ranges specified via \code{:} +(e.g., \code{c("col1:col3", "col5")}), \item a formula with variable names (e.g., \code{~column_1 + column_2}), \item a vector of positive integers, giving the positions counting from the left (e.g. 
\code{1} or \code{c(1, 3, 5)}), diff --git a/man/data_separate.Rd b/man/data_separate.Rd index 37528d46e..7c951f81c 100644 --- a/man/data_separate.Rd +++ b/man/data_separate.Rd @@ -30,8 +30,10 @@ data_separate( tasks. Can be either \itemize{ \item a variable specified as a literal variable name (e.g., \code{column_name}), -\item a string with the variable name (e.g., \code{"column_name"}), or a character -vector of variable names (e.g., \code{c("col1", "col2", "col3")}), +\item a string with the variable name (e.g., \code{"column_name"}), a character +vector of variable names (e.g., \code{c("col1", "col2", "col3")}), or a +character vector of variable names including ranges specified via \code{:} +(e.g., \code{c("col1:col3", "col5")}), \item a formula with variable names (e.g., \code{~column_1 + column_2}), \item a vector of positive integers, giving the positions counting from the left (e.g. \code{1} or \code{c(1, 3, 5)}), diff --git a/man/data_tabulate.Rd b/man/data_tabulate.Rd index 2feadf3a9..032c0b989 100644 --- a/man/data_tabulate.Rd +++ b/man/data_tabulate.Rd @@ -79,8 +79,10 @@ for printing.} tasks. Can be either \itemize{ \item a variable specified as a literal variable name (e.g., \code{column_name}), -\item a string with the variable name (e.g., \code{"column_name"}), or a character -vector of variable names (e.g., \code{c("col1", "col2", "col3")}), +\item a string with the variable name (e.g., \code{"column_name"}), a character +vector of variable names (e.g., \code{c("col1", "col2", "col3")}), or a +character vector of variable names including ranges specified via \code{:} +(e.g., \code{c("col1:col3", "col5")}), \item a formula with variable names (e.g., \code{~column_1 + column_2}), \item a vector of positive integers, giving the positions counting from the left (e.g. \code{1} or \code{c(1, 3, 5)}), diff --git a/man/data_to_long.Rd b/man/data_to_long.Rd index 741725d25..73b54219b 100644 --- a/man/data_to_long.Rd +++ b/man/data_to_long.Rd @@ -45,8 +45,10 @@ rows and fewer columns after the operation.} tasks. Can be either \itemize{ \item a variable specified as a literal variable name (e.g., \code{column_name}), -\item a string with the variable name (e.g., \code{"column_name"}), or a character -vector of variable names (e.g., \code{c("col1", "col2", "col3")}), +\item a string with the variable name (e.g., \code{"column_name"}), a character +vector of variable names (e.g., \code{c("col1", "col2", "col3")}), or a +character vector of variable names including ranges specified via \code{:} +(e.g., \code{c("col1:col3", "col5")}), \item a formula with variable names (e.g., \code{~column_1 + column_2}), \item a vector of positive integers, giving the positions counting from the left (e.g. \code{1} or \code{c(1, 3, 5)}), diff --git a/man/data_unique.Rd b/man/data_unique.Rd index 8a45bfc21..a0a70b92a 100644 --- a/man/data_unique.Rd +++ b/man/data_unique.Rd @@ -21,8 +21,10 @@ data_unique( tasks. 
Can be either \itemize{ \item a variable specified as a literal variable name (e.g., \code{column_name}), -\item a string with the variable name (e.g., \code{"column_name"}), or a character -vector of variable names (e.g., \code{c("col1", "col2", "col3")}), +\item a string with the variable name (e.g., \code{"column_name"}), a character +vector of variable names (e.g., \code{c("col1", "col2", "col3")}), or a +character vector of variable names including ranges specified via \code{:} +(e.g., \code{c("col1:col3", "col5")}), \item a formula with variable names (e.g., \code{~column_1 + column_2}), \item a vector of positive integers, giving the positions counting from the left (e.g. \code{1} or \code{c(1, 3, 5)}), diff --git a/man/data_unite.Rd b/man/data_unite.Rd index ba7710a8a..369fd33d8 100644 --- a/man/data_unite.Rd +++ b/man/data_unite.Rd @@ -27,8 +27,10 @@ data_unite( tasks. Can be either \itemize{ \item a variable specified as a literal variable name (e.g., \code{column_name}), -\item a string with the variable name (e.g., \code{"column_name"}), or a character -vector of variable names (e.g., \code{c("col1", "col2", "col3")}), +\item a string with the variable name (e.g., \code{"column_name"}), a character +vector of variable names (e.g., \code{c("col1", "col2", "col3")}), or a +character vector of variable names including ranges specified via \code{:} +(e.g., \code{c("col1:col3", "col5")}), \item a formula with variable names (e.g., \code{~column_1 + column_2}), \item a vector of positive integers, giving the positions counting from the left (e.g. \code{1} or \code{c(1, 3, 5)}), diff --git a/man/describe_distribution.Rd b/man/describe_distribution.Rd index 369bd9ef6..80b69e115 100644 --- a/man/describe_distribution.Rd +++ b/man/describe_distribution.Rd @@ -86,8 +86,10 @@ vector before the mean is computed.} tasks. Can be either \itemize{ \item a variable specified as a literal variable name (e.g., \code{column_name}), -\item a string with the variable name (e.g., \code{"column_name"}), or a character -vector of variable names (e.g., \code{c("col1", "col2", "col3")}), +\item a string with the variable name (e.g., \code{"column_name"}), a character +vector of variable names (e.g., \code{c("col1", "col2", "col3")}), or a +character vector of variable names including ranges specified via \code{:} +(e.g., \code{c("col1:col3", "col5")}), \item a formula with variable names (e.g., \code{~column_1 + column_2}), \item a vector of positive integers, giving the positions counting from the left (e.g. \code{1} or \code{c(1, 3, 5)}), diff --git a/man/extract_column_names.Rd b/man/extract_column_names.Rd index 6e658ab33..3ea5da7dc 100644 --- a/man/extract_column_names.Rd +++ b/man/extract_column_names.Rd @@ -43,8 +43,10 @@ find_columns( tasks. Can be either \itemize{ \item a variable specified as a literal variable name (e.g., \code{column_name}), -\item a string with the variable name (e.g., \code{"column_name"}), or a character -vector of variable names (e.g., \code{c("col1", "col2", "col3")}), +\item a string with the variable name (e.g., \code{"column_name"}), a character +vector of variable names (e.g., \code{c("col1", "col2", "col3")}), or a +character vector of variable names including ranges specified via \code{:} +(e.g., \code{c("col1:col3", "col5")}), \item a formula with variable names (e.g., \code{~column_1 + column_2}), \item a vector of positive integers, giving the positions counting from the left (e.g. 
\code{1} or \code{c(1, 3, 5)}), @@ -152,7 +154,7 @@ outer(iris, starts_with("Sep")) }\if{html}{\out{}} } \examples{ -# Find columns names by pattern +# Find column names by pattern extract_column_names(iris, starts_with("Sepal")) extract_column_names(iris, ends_with("Width")) extract_column_names(iris, regex("\\\\.")) @@ -165,6 +167,9 @@ extract_column_names(iris, starts_with("Sepal"), exclude = contains("Width")) numeric_mean_35 <- function(x) is.numeric(x) && mean(x, na.rm = TRUE) > 3.5 extract_column_names(iris, numeric_mean_35) +# find range of colum names by range, using character vector +extract_column_names(mtcars, c("cyl:hp", "wt")) + # rename returned columns for "data_select()" head(data_select(mtcars, c(`Miles per Gallon` = "mpg", Cylinders = "cyl"))) } diff --git a/man/labels_to_levels.Rd b/man/labels_to_levels.Rd index 8024eb2d3..163eb0eaa 100644 --- a/man/labels_to_levels.Rd +++ b/man/labels_to_levels.Rd @@ -33,8 +33,10 @@ allowed.} tasks. Can be either \itemize{ \item a variable specified as a literal variable name (e.g., \code{column_name}), -\item a string with the variable name (e.g., \code{"column_name"}), or a character -vector of variable names (e.g., \code{c("col1", "col2", "col3")}), +\item a string with the variable name (e.g., \code{"column_name"}), a character +vector of variable names (e.g., \code{c("col1", "col2", "col3")}), or a +character vector of variable names including ranges specified via \code{:} +(e.g., \code{c("col1:col3", "col5")}), \item a formula with variable names (e.g., \code{~column_1 + column_2}), \item a vector of positive integers, giving the positions counting from the left (e.g. \code{1} or \code{c(1, 3, 5)}), diff --git a/man/means_by_group.Rd b/man/means_by_group.Rd index ba2a7d0c8..6c06ac3b1 100644 --- a/man/means_by_group.Rd +++ b/man/means_by_group.Rd @@ -51,8 +51,10 @@ point when rounding estimates and values.} tasks. Can be either \itemize{ \item a variable specified as a literal variable name (e.g., \code{column_name}), -\item a string with the variable name (e.g., \code{"column_name"}), or a character -vector of variable names (e.g., \code{c("col1", "col2", "col3")}), +\item a string with the variable name (e.g., \code{"column_name"}), a character +vector of variable names (e.g., \code{c("col1", "col2", "col3")}), or a +character vector of variable names including ranges specified via \code{:} +(e.g., \code{c("col1:col3", "col5")}), \item a formula with variable names (e.g., \code{~column_1 + column_2}), \item a vector of positive integers, giving the positions counting from the left (e.g. \code{1} or \code{c(1, 3, 5)}), diff --git a/man/normalize.Rd b/man/normalize.Rd index 4a9a61a68..c325e98fe 100644 --- a/man/normalize.Rd +++ b/man/normalize.Rd @@ -71,8 +71,10 @@ the normalized vectors are rescaled to a range from \code{0 + include_bounds} to tasks. Can be either \itemize{ \item a variable specified as a literal variable name (e.g., \code{column_name}), -\item a string with the variable name (e.g., \code{"column_name"}), or a character -vector of variable names (e.g., \code{c("col1", "col2", "col3")}), +\item a string with the variable name (e.g., \code{"column_name"}), a character +vector of variable names (e.g., \code{c("col1", "col2", "col3")}), or a +character vector of variable names including ranges specified via \code{:} +(e.g., \code{c("col1:col3", "col5")}), \item a formula with variable names (e.g., \code{~column_1 + column_2}), \item a vector of positive integers, giving the positions counting from the left (e.g. 
\code{1} or \code{c(1, 3, 5)}), diff --git a/man/ranktransform.Rd b/man/ranktransform.Rd index c23105735..7046db2b5 100644 --- a/man/ranktransform.Rd +++ b/man/ranktransform.Rd @@ -39,8 +39,10 @@ details.} tasks. Can be either \itemize{ \item a variable specified as a literal variable name (e.g., \code{column_name}), -\item a string with the variable name (e.g., \code{"column_name"}), or a character -vector of variable names (e.g., \code{c("col1", "col2", "col3")}), +\item a string with the variable name (e.g., \code{"column_name"}), a character +vector of variable names (e.g., \code{c("col1", "col2", "col3")}), or a +character vector of variable names including ranges specified via \code{:} +(e.g., \code{c("col1:col3", "col5")}), \item a formula with variable names (e.g., \code{~column_1 + column_2}), \item a vector of positive integers, giving the positions counting from the left (e.g. \code{1} or \code{c(1, 3, 5)}), diff --git a/man/recode_values.Rd b/man/recode_values.Rd index baa7afda9..dece902f7 100644 --- a/man/recode_values.Rd +++ b/man/recode_values.Rd @@ -60,8 +60,10 @@ default value.} tasks. Can be either \itemize{ \item a variable specified as a literal variable name (e.g., \code{column_name}), -\item a string with the variable name (e.g., \code{"column_name"}), or a character -vector of variable names (e.g., \code{c("col1", "col2", "col3")}), +\item a string with the variable name (e.g., \code{"column_name"}), a character +vector of variable names (e.g., \code{c("col1", "col2", "col3")}), or a +character vector of variable names including ranges specified via \code{:} +(e.g., \code{c("col1:col3", "col5")}), \item a formula with variable names (e.g., \code{~column_1 + column_2}), \item a vector of positive integers, giving the positions counting from the left (e.g. \code{1} or \code{c(1, 3, 5)}), diff --git a/man/rescale.Rd b/man/rescale.Rd index 016a6f841..490964777 100644 --- a/man/rescale.Rd +++ b/man/rescale.Rd @@ -67,8 +67,10 @@ the input vector (\code{range(x)}).} tasks. Can be either \itemize{ \item a variable specified as a literal variable name (e.g., \code{column_name}), -\item a string with the variable name (e.g., \code{"column_name"}), or a character -vector of variable names (e.g., \code{c("col1", "col2", "col3")}), +\item a string with the variable name (e.g., \code{"column_name"}), a character +vector of variable names (e.g., \code{c("col1", "col2", "col3")}), or a +character vector of variable names including ranges specified via \code{:} +(e.g., \code{c("col1:col3", "col5")}), \item a formula with variable names (e.g., \code{~column_1 + column_2}), \item a vector of positive integers, giving the positions counting from the left (e.g. \code{1} or \code{c(1, 3, 5)}), diff --git a/man/reverse.Rd b/man/reverse.Rd index 6304dffc6..5767908ff 100644 --- a/man/reverse.Rd +++ b/man/reverse.Rd @@ -45,8 +45,10 @@ usually only makes sense when factor levels are numeric, not characters.} tasks. 
Can be either \itemize{ \item a variable specified as a literal variable name (e.g., \code{column_name}), -\item a string with the variable name (e.g., \code{"column_name"}), or a character -vector of variable names (e.g., \code{c("col1", "col2", "col3")}), +\item a string with the variable name (e.g., \code{"column_name"}), a character +vector of variable names (e.g., \code{c("col1", "col2", "col3")}), or a +character vector of variable names including ranges specified via \code{:} +(e.g., \code{c("col1:col3", "col5")}), \item a formula with variable names (e.g., \code{~column_1 + column_2}), \item a vector of positive integers, giving the positions counting from the left (e.g. \code{1} or \code{c(1, 3, 5)}), diff --git a/man/row_means.Rd b/man/row_means.Rd index c347fc6f1..4d61e8cf2 100644 --- a/man/row_means.Rd +++ b/man/row_means.Rd @@ -23,8 +23,10 @@ row_means( tasks. Can be either \itemize{ \item a variable specified as a literal variable name (e.g., \code{column_name}), -\item a string with the variable name (e.g., \code{"column_name"}), or a character -vector of variable names (e.g., \code{c("col1", "col2", "col3")}), +\item a string with the variable name (e.g., \code{"column_name"}), a character +vector of variable names (e.g., \code{c("col1", "col2", "col3")}), or a +character vector of variable names including ranges specified via \code{:} +(e.g., \code{c("col1:col3", "col5")}), \item a formula with variable names (e.g., \code{~column_1 + column_2}), \item a vector of positive integers, giving the positions counting from the left (e.g. \code{1} or \code{c(1, 3, 5)}), diff --git a/man/slide.Rd b/man/slide.Rd index ccc6bd7e9..c26943116 100644 --- a/man/slide.Rd +++ b/man/slide.Rd @@ -34,8 +34,10 @@ factors or character vectors to numeric values.} tasks. Can be either \itemize{ \item a variable specified as a literal variable name (e.g., \code{column_name}), -\item a string with the variable name (e.g., \code{"column_name"}), or a character -vector of variable names (e.g., \code{c("col1", "col2", "col3")}), +\item a string with the variable name (e.g., \code{"column_name"}), a character +vector of variable names (e.g., \code{c("col1", "col2", "col3")}), or a +character vector of variable names including ranges specified via \code{:} +(e.g., \code{c("col1:col3", "col5")}), \item a formula with variable names (e.g., \code{~column_1 + column_2}), \item a vector of positive integers, giving the positions counting from the left (e.g. \code{1} or \code{c(1, 3, 5)}), diff --git a/man/standardize.Rd b/man/standardize.Rd index 4041f2dc0..fcc8c6ae7 100644 --- a/man/standardize.Rd +++ b/man/standardize.Rd @@ -145,8 +145,10 @@ vectors as well.} tasks. Can be either \itemize{ \item a variable specified as a literal variable name (e.g., \code{column_name}), -\item a string with the variable name (e.g., \code{"column_name"}), or a character -vector of variable names (e.g., \code{c("col1", "col2", "col3")}), +\item a string with the variable name (e.g., \code{"column_name"}), a character +vector of variable names (e.g., \code{c("col1", "col2", "col3")}), or a +character vector of variable names including ranges specified via \code{:} +(e.g., \code{c("col1:col3", "col5")}), \item a formula with variable names (e.g., \code{~column_1 + column_2}), \item a vector of positive integers, giving the positions counting from the left (e.g. 
\code{1} or \code{c(1, 3, 5)}), diff --git a/man/to_factor.Rd b/man/to_factor.Rd index e035769ec..d544bdaae 100644 --- a/man/to_factor.Rd +++ b/man/to_factor.Rd @@ -36,8 +36,10 @@ the values of \code{x} (i.e. as if using \code{as.factor()}).} tasks. Can be either \itemize{ \item a variable specified as a literal variable name (e.g., \code{column_name}), -\item a string with the variable name (e.g., \code{"column_name"}), or a character -vector of variable names (e.g., \code{c("col1", "col2", "col3")}), +\item a string with the variable name (e.g., \code{"column_name"}), a character +vector of variable names (e.g., \code{c("col1", "col2", "col3")}), or a +character vector of variable names including ranges specified via \code{:} +(e.g., \code{c("col1:col3", "col5")}), \item a formula with variable names (e.g., \code{~column_1 + column_2}), \item a vector of positive integers, giving the positions counting from the left (e.g. \code{1} or \code{c(1, 3, 5)}), diff --git a/man/to_numeric.Rd b/man/to_numeric.Rd index 634906e4a..39f04c3a9 100644 --- a/man/to_numeric.Rd +++ b/man/to_numeric.Rd @@ -30,8 +30,10 @@ to_numeric(x, ...) tasks. Can be either \itemize{ \item a variable specified as a literal variable name (e.g., \code{column_name}), -\item a string with the variable name (e.g., \code{"column_name"}), or a character -vector of variable names (e.g., \code{c("col1", "col2", "col3")}), +\item a string with the variable name (e.g., \code{"column_name"}), a character +vector of variable names (e.g., \code{c("col1", "col2", "col3")}), or a +character vector of variable names including ranges specified via \code{:} +(e.g., \code{c("col1:col3", "col5")}), \item a formula with variable names (e.g., \code{~column_1 + column_2}), \item a vector of positive integers, giving the positions counting from the left (e.g. \code{1} or \code{c(1, 3, 5)}), diff --git a/tests/testthat/test-select_nse.R b/tests/testthat/test-select_nse.R index c0195ad94..fb0f6aefb 100644 --- a/tests/testthat/test-select_nse.R +++ b/tests/testthat/test-select_nse.R @@ -138,3 +138,24 @@ test_that(".select_nse: works with function and namespace", { out <- fun(insight::find_predictors(model, effects = "fixed", flatten = TRUE)) expect_identical(out, iris["Petal.Width"]) }) + +test_that(".select_nse: allow character vector with :", { + data(mtcars) + out <- data_select(mtcars, c("cyl:hp", "wt", "vs:gear")) + expect_named(out, c("cyl", "disp", "hp", "wt", "vs", "am", "gear")) + out <- data_select(mtcars, c("cyl:hp", "wta", "vs:gear")) + expect_named(out, c("cyl", "disp", "hp", "vs", "am", "gear")) + out <- data_select(mtcars, c("hp:cyl", "wta", "vs:gear")) + expect_named(out, c("hp", "disp", "cyl", "vs", "am", "gear")) + out <- data_select(mtcars, c("cyl:hq", "wt", "vs:gear")) + expect_named(out, c("wt", "vs", "am", "gear")) + + expect_warning( + center(mtcars, c("cyl:hp", "wta", "vs:gear"), verbose = TRUE), + regex = "Did you mean \"wt\"" + ) + expect_warning( + center(mtcars, c("cyl:hq", "wt", "vs:gear"), verbose = TRUE), + regex = "Did you mean one of \"hp\"" + ) +}) From 8ab24547120f07c4267b716dc9aedaccff8eb782 Mon Sep 17 00:00:00 2001 From: Daniel Date: Thu, 10 Oct 2024 21:57:27 +0200 Subject: [PATCH 10/19] Rename `drop_na` into `remove_na` in `data_match()` (#556) * Rename drop_na into remove_na? 
Fixes #554 * news, desc * typo --- DESCRIPTION | 2 +- NEWS.md | 5 +++++ R/data_group.R | 2 +- R/data_match.R | 19 ++++++++++++++++--- man/data_match.Rd | 14 ++++++++++++-- tests/testthat/test-data_match.R | 4 ++-- 6 files changed, 37 insertions(+), 9 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 4758f601c..20764a09e 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Type: Package Package: datawizard Title: Easy Data Wrangling and Statistical Transformations -Version: 0.13.0.2 +Version: 0.13.0.5 Authors@R: c( person("Indrajeet", "Patil", , "patilindrajeet.science@gmail.com", role = "aut", comment = c(ORCID = "0000-0003-1995-6531")), diff --git a/NEWS.md b/NEWS.md index 388c5a822..08e3527b3 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,10 @@ # datawizard (development) +BREAKING CHANGES + +* Argument `drop_na` in `data_match()` is deprecated now. Please use `remove_na` + instead. + CHANGES * The `select` argument, which is available in different functions to select diff --git a/R/data_group.R b/R/data_group.R index 00a7adf84..538c875c2 100644 --- a/R/data_group.R +++ b/R/data_group.R @@ -51,7 +51,7 @@ data_group <- function(data, to = my_grid[i, , drop = FALSE], match = "and", return_indices = TRUE, - drop_na = FALSE + remove_na = FALSE )) }) my_grid[[".rows"]] <- .rows diff --git a/R/data_match.R b/R/data_match.R index c03b3f222..6b522a0b8 100644 --- a/R/data_match.R +++ b/R/data_match.R @@ -15,7 +15,7 @@ #' @param return_indices Logical, if `FALSE`, return the vector of rows that #' can be used to filter the original data frame. If `FALSE` (default), #' returns directly the filtered data frame instead of the row indices. -#' @param drop_na Logical, if `TRUE`, missing values (`NA`s) are removed before +#' @param remove_na Logical, if `TRUE`, missing values (`NA`s) are removed before #' filtering the data. This is the default behaviour, however, sometimes when #' row indices are requested (i.e. `return_indices=TRUE`), it might be useful #' to preserve `NA` values, so returned row indices match the row indices of @@ -26,6 +26,7 @@ #' character vector (e.g. `c("x > 4", "y == 2")`) or a variable that contains #' the string representation of a logical expression. These might be useful #' when used in packages to avoid defining undefined global variables. +#' @param drop_na Deprecated, please use `remove_na` instead. #' #' @return A filtered data frame, or the row indices that match the specified #' configuration. @@ -100,12 +101,24 @@ #' data_filter(mtcars, fl) #' @inherit data_rename seealso #' @export -data_match <- function(x, to, match = "and", return_indices = FALSE, drop_na = TRUE, ...) { +data_match <- function(x, + to, + match = "and", + return_indices = FALSE, + remove_na = TRUE, + drop_na, + ...) { if (!is.data.frame(to)) { to <- as.data.frame(to) } original_x <- x + ## TODO: remove deprecated argument later + if (!missing(drop_na)) { + insight::format_warning("Argument `drop_na` is deprecated. 
Please use `remove_na` instead.") + remove_na <- drop_na + } + # evaluate match <- match.arg(tolower(match), c("and", "&", "&&", "or", "|", "||", "!", "not")) match <- switch(match, @@ -133,7 +146,7 @@ data_match <- function(x, to, match = "and", return_indices = FALSE, drop_na = T idx <- vector("numeric", length = 0L) } else { # remove missings before matching - if (isTRUE(drop_na)) { + if (isTRUE(remove_na)) { x <- x[stats::complete.cases(x), , drop = FALSE] } idx <- seq_len(nrow(x)) diff --git a/man/data_match.Rd b/man/data_match.Rd index a57c34768..a209170ab 100644 --- a/man/data_match.Rd +++ b/man/data_match.Rd @@ -5,7 +5,15 @@ \alias{data_filter} \title{Return filtered or sliced data frame, or row indices} \usage{ -data_match(x, to, match = "and", return_indices = FALSE, drop_na = TRUE, ...) +data_match( + x, + to, + match = "and", + return_indices = FALSE, + remove_na = TRUE, + drop_na, + ... +) data_filter(x, ...) } @@ -24,12 +32,14 @@ or \code{"not"} (or \code{"!"}).} can be used to filter the original data frame. If \code{FALSE} (default), returns directly the filtered data frame instead of the row indices.} -\item{drop_na}{Logical, if \code{TRUE}, missing values (\code{NA}s) are removed before +\item{remove_na}{Logical, if \code{TRUE}, missing values (\code{NA}s) are removed before filtering the data. This is the default behaviour, however, sometimes when row indices are requested (i.e. \code{return_indices=TRUE}), it might be useful to preserve \code{NA} values, so returned row indices match the row indices of the original data frame.} +\item{drop_na}{Deprecated, please use \code{remove_na} instead.} + \item{...}{A sequence of logical expressions indicating which rows to keep, or a numeric vector indicating the row indices of rows to keep. Can also be a string representation of a logical expression (e.g. 
\code{"x > 4"}), a diff --git a/tests/testthat/test-data_match.R b/tests/testthat/test-data_match.R index 75991b4b2..1a40f39fd 100644 --- a/tests/testthat/test-data_match.R +++ b/tests/testthat/test-data_match.R @@ -52,7 +52,7 @@ test_that("data_match works with missing data", { data.frame(c172code = 1, e16sex = 2), match = "not", return_indices = TRUE, - drop_na = FALSE + remove_na = FALSE )) expect_identical(x1, 41L) x1 <- length(data_match( @@ -60,7 +60,7 @@ test_that("data_match works with missing data", { data.frame(c172code = 1, e16sex = 2), match = "not", return_indices = TRUE, - drop_na = TRUE + remove_na = TRUE )) expect_identical(x1, 36L) }) From 5ce207b7169a20fdb4168fd83224d836592e6cca Mon Sep 17 00:00:00 2001 From: Daniel Date: Fri, 11 Oct 2024 11:43:00 +0200 Subject: [PATCH 11/19] Add `row_count()` to count specific values row-wise (#553) * Draft `row_count()` * docs, type safe comparisons * lintr * apply suggestions * add test * fix test * rename arg * switch TRUE and FALSE * update docs * resolve comment * comments * typo --- DESCRIPTION | 2 +- NAMESPACE | 1 + NEWS.md | 2 + R/row_count.R | 124 ++++++++++++++++++++++++++++++ man/row_count.Rd | 132 ++++++++++++++++++++++++++++++++ pkgdown/_pkgdown.yaml | 1 + tests/testthat/test-row_count.R | 57 ++++++++++++++ 7 files changed, 318 insertions(+), 1 deletion(-) create mode 100644 R/row_count.R create mode 100644 man/row_count.Rd create mode 100644 tests/testthat/test-row_count.R diff --git a/DESCRIPTION b/DESCRIPTION index 20764a09e..8fa9eee94 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Type: Package Package: datawizard Title: Easy Data Wrangling and Statistical Transformations -Version: 0.13.0.5 +Version: 0.13.0.6 Authors@R: c( person("Indrajeet", "Patil", , "patilindrajeet.science@gmail.com", role = "aut", comment = c(ORCID = "0000-0003-1995-6531")), diff --git a/NAMESPACE b/NAMESPACE index c435c0cc5..1775af562 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -296,6 +296,7 @@ export(reshape_longer) export(reshape_wider) export(reverse) export(reverse_scale) +export(row_count) export(row_means) export(row_to_colnames) export(rowid_as_column) diff --git a/NEWS.md b/NEWS.md index 08e3527b3..da3296536 100644 --- a/NEWS.md +++ b/NEWS.md @@ -11,6 +11,8 @@ CHANGES variables, can now also be a character vector with quoted variable names, including a colon to indicate a range of several variables (e.g. `"cyl:gear"`). +* New function `row_count()`, to count specific values row-wise. + BUG FIXES * `describe_distribution()` no longer errors if the sample was too sparse to compute diff --git a/R/row_count.R b/R/row_count.R new file mode 100644 index 000000000..02b1c16dc --- /dev/null +++ b/R/row_count.R @@ -0,0 +1,124 @@ +#' @title Count specific values row-wise +#' @name row_count +#' @description `row_count()` mimics base R's `rowSums()`, with sums for a +#' specific value indicated by `count`. Hence, it is similar to +#' `rowSums(x == count, na.rm = TRUE)`, but offers some more options, including +#' strict comparisons. Comparisons using `==` coerce values to atomic vectors, +#' thus both `2 == 2` and `"2" == 2` are `TRUE`. In `row_count()`, it is also +#' possible to make "type safe" comparisons using the `allow_coercion` argument, +#' where `"2" == 2` is not true. +#' +#' @param data A data frame with at least two columns, where number of specific +#' values are counted row-wise. +#' @param count The value for which the row sum should be computed. 
May be a +#' numeric value, a character string (for factors or character vectors), `NA` or +#' `Inf`. +#' @param allow_coercion Logical. If `FALSE`, `count` matches only values of same +#' class (i.e. when `count = 2`, the value `"2"` is not counted and vice versa). +#' By default, when `allow_coercion = TRUE`, `count = 2` also matches `"2"`. In +#' order to count factor levels in the data, use `count = factor("level")`. See +#' 'Examples'. +#' +#' @inheritParams extract_column_names +#' @inheritParams row_means +#' +#' @return A vector with row-wise counts of values specified in `count`. +#' +#' @examples +#' dat <- data.frame( +#' c1 = c(1, 2, NA, 4), +#' c2 = c(NA, 2, NA, 5), +#' c3 = c(NA, 4, NA, NA), +#' c4 = c(2, 3, 7, 8) +#' ) +#' +#' # count all 4s per row +#' row_count(dat, count = 4) +#' # count all missing values per row +#' row_count(dat, count = NA) +#' +#' dat <- data.frame( +#' c1 = c("1", "2", NA, "3"), +#' c2 = c(NA, "2", NA, "3"), +#' c3 = c(NA, 4, NA, NA), +#' c4 = c(2, 3, 7, Inf) +#' ) +#' # count all 2s and "2"s per row +#' row_count(dat, count = 2) +#' # only count 2s, but not "2"s +#' row_count(dat, count = 2, allow_coercion = FALSE) +#' +#' dat <- data.frame( +#' c1 = factor(c("1", "2", NA, "3")), +#' c2 = c("2", "1", NA, "3"), +#' c3 = c(NA, 4, NA, NA), +#' c4 = c(2, 3, 7, Inf) +#' ) +#' # find only character "2"s +#' row_count(dat, count = "2", allow_coercion = FALSE) +#' # find only factor level "2"s +#' row_count(dat, count = factor("2"), allow_coercion = FALSE) +#' +#' @export +row_count <- function(data, + select = NULL, + exclude = NULL, + count = NULL, + allow_coercion = TRUE, + ignore_case = FALSE, + regex = FALSE, + verbose = TRUE) { + # evaluate arguments + select <- .select_nse(select, + data, + exclude, + ignore_case = ignore_case, + regex = regex, + verbose = verbose + ) + + if (is.null(count)) { + insight::format_error("`count` must be a valid value (including `NA` or `Inf`), but not `NULL`.") + } + + if (is.null(select) || length(select) == 0) { + insight::format_error("No columns selected.") + } + + data <- .coerce_to_dataframe(data[select]) + + # check if we have a data framme with at least two columns + if (nrow(data) < 1) { + insight::format_error("`data` must be a data frame with at least one row.") + } + + # check if we have a data framme with at least two columns + if (ncol(data) < 2) { + insight::format_error("`data` must be a data frame with at least two numeric columns.") + } + # special case: count missing + if (is.na(count)) { + rowSums(is.na(data)) + } else { + # comparisons in R using == coerce values into a atomic vector, i.e. + # 2 == "2" is TRUE. If `allow_coercion = FALSE`, we only want 2 == 2 or + # "2" == "2" (i.e. we want exact types to be compared only) + if (isFALSE(allow_coercion)) { + # we need the "type" of the count-value - we use class() instead of typeof(), + # because the latter sometimes returns unsuitable classes/types. compare + # typeof(as.Date("2020-01-01")), which returns "double". + count_type <- class(count)[1] + valid_columns <- vapply(data, inherits, TRUE, what = count_type) + # check if any columns left? + if (!any(valid_columns)) { + insight::format_error("No column has same type as the value provided in `count`. Set `allow_coercion = TRUE` or specify a valid value for `count`.") # nolint + } + data <- data[valid_columns] + } + # coerce - we have only valid columns anyway, and we need to coerce factors + # to vectors, else comparison with `==` errors. 
+ count <- as.vector(count) + # finally, count + rowSums(data == count, na.rm = TRUE) + } +} diff --git a/man/row_count.Rd b/man/row_count.Rd new file mode 100644 index 000000000..7bf54fe5f --- /dev/null +++ b/man/row_count.Rd @@ -0,0 +1,132 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/row_count.R +\name{row_count} +\alias{row_count} +\title{Count specific values row-wise} +\usage{ +row_count( + data, + select = NULL, + exclude = NULL, + count = NULL, + allow_coercion = TRUE, + ignore_case = FALSE, + regex = FALSE, + verbose = TRUE +) +} +\arguments{ +\item{data}{A data frame with at least two columns, where number of specific +values are counted row-wise.} + +\item{select}{Variables that will be included when performing the required +tasks. Can be either +\itemize{ +\item a variable specified as a literal variable name (e.g., \code{column_name}), +\item a string with the variable name (e.g., \code{"column_name"}), a character +vector of variable names (e.g., \code{c("col1", "col2", "col3")}), or a +character vector of variable names including ranges specified via \code{:} +(e.g., \code{c("col1:col3", "col5")}), +\item a formula with variable names (e.g., \code{~column_1 + column_2}), +\item a vector of positive integers, giving the positions counting from the left +(e.g. \code{1} or \code{c(1, 3, 5)}), +\item a vector of negative integers, giving the positions counting from the +right (e.g., \code{-1} or \code{-1:-3}), +\item one of the following select-helpers: \code{starts_with()}, \code{ends_with()}, +\code{contains()}, a range using \code{:} or \code{regex("")}. \code{starts_with()}, +\code{ends_with()}, and \code{contains()} accept several patterns, e.g +\code{starts_with("Sep", "Petal")}. +\item or a function testing for logical conditions, e.g. \code{is.numeric()} (or +\code{is.numeric}), or any user-defined function that selects the variables +for which the function returns \code{TRUE} (like: \code{foo <- function(x) mean(x) > 3}), +\item ranges specified via literal variable names, select-helpers (except +\code{regex()}) and (user-defined) functions can be negated, i.e. return +non-matching elements, when prefixed with a \code{-}, e.g. \code{-ends_with("")}, +\code{-is.numeric} or \code{-(Sepal.Width:Petal.Length)}. \strong{Note:} Negation means +that matches are \emph{excluded}, and thus, the \code{exclude} argument can be +used alternatively. For instance, \code{select=-ends_with("Length")} (with +\code{-}) is equivalent to \code{exclude=ends_with("Length")} (no \code{-}). In case +negation should not work as expected, use the \code{exclude} argument instead. +} + +If \code{NULL}, selects all columns. Patterns that found no matches are silently +ignored, e.g. \code{extract_column_names(iris, select = c("Species", "Test"))} +will just return \code{"Species"}.} + +\item{exclude}{See \code{select}, however, column names matched by the pattern +from \code{exclude} will be excluded instead of selected. If \code{NULL} (the default), +excludes no columns.} + +\item{count}{The value for which the row sum should be computed. May be a +numeric value, a character string (for factors or character vectors), \code{NA} or +\code{Inf}.} + +\item{allow_coercion}{Logical. If \code{FALSE}, \code{count} matches only values of same +class (i.e. when \code{count = 2}, the value \code{"2"} is not counted and vice versa). +By default, when \code{allow_coercion = TRUE}, \code{count = 2} also matches \code{"2"}. 
In +order to count factor levels in the data, use \code{count = factor("level")}. See +'Examples'.} + +\item{ignore_case}{Logical, if \code{TRUE} and when one of the select-helpers or +a regular expression is used in \code{select}, ignores lower/upper case in the +search pattern when matching against variable names.} + +\item{regex}{Logical, if \code{TRUE}, the search pattern from \code{select} will be +treated as regular expression. When \code{regex = TRUE}, select \emph{must} be a +character string (or a variable containing a character string) and is not +allowed to be one of the supported select-helpers or a character vector +of length > 1. \code{regex = TRUE} is comparable to using one of the two +select-helpers, \code{select = contains("")} or \code{select = regex("")}, however, +since the select-helpers may not work when called from inside other +functions (see 'Details'), this argument may be used as workaround.} + +\item{verbose}{Toggle warnings.} +} +\value{ +A vector with row-wise counts of values specified in \code{count}. +} +\description{ +\code{row_count()} mimics base R's \code{rowSums()}, with sums for a +specific value indicated by \code{count}. Hence, it is similar to +\code{rowSums(x == count, na.rm = TRUE)}, but offers some more options, including +strict comparisons. Comparisons using \code{==} coerce values to atomic vectors, +thus both \code{2 == 2} and \code{"2" == 2} are \code{TRUE}. In \code{row_count()}, it is also +possible to make "type safe" comparisons using the \code{allow_coercion} argument, +where \code{"2" == 2} is not true. +} +\examples{ +dat <- data.frame( + c1 = c(1, 2, NA, 4), + c2 = c(NA, 2, NA, 5), + c3 = c(NA, 4, NA, NA), + c4 = c(2, 3, 7, 8) +) + +# count all 4s per row +row_count(dat, count = 4) +# count all missing values per row +row_count(dat, count = NA) + +dat <- data.frame( + c1 = c("1", "2", NA, "3"), + c2 = c(NA, "2", NA, "3"), + c3 = c(NA, 4, NA, NA), + c4 = c(2, 3, 7, Inf) +) +# count all 2s and "2"s per row +row_count(dat, count = 2) +# only count 2s, but not "2"s +row_count(dat, count = 2, allow_coercion = FALSE) + +dat <- data.frame( + c1 = factor(c("1", "2", NA, "3")), + c2 = c("2", "1", NA, "3"), + c3 = c(NA, 4, NA, NA), + c4 = c(2, 3, 7, Inf) +) +# find only character "2"s +row_count(dat, count = "2", allow_coercion = FALSE) +# find only factor level "2"s +row_count(dat, count = factor("2"), allow_coercion = FALSE) + +} diff --git a/pkgdown/_pkgdown.yaml b/pkgdown/_pkgdown.yaml index 6e6feb5b2..31ec901d0 100644 --- a/pkgdown/_pkgdown.yaml +++ b/pkgdown/_pkgdown.yaml @@ -71,6 +71,7 @@ reference: - kurtosis - smoothness - skewness + - row_count - row_means - weighted_mean - mean_sd diff --git a/tests/testthat/test-row_count.R b/tests/testthat/test-row_count.R new file mode 100644 index 000000000..0c7d67691 --- /dev/null +++ b/tests/testthat/test-row_count.R @@ -0,0 +1,57 @@ +test_that("row_count", { + d_mn <- data.frame( + c1 = c(1, 2, NA, 4), + c2 = c(NA, 2, NA, 5), + c3 = c(NA, 4, NA, NA), + c4 = c(2, 3, 7, 8) + ) + expect_identical(row_count(d_mn, count = 2), c(1, 2, 0, 0)) + expect_identical(row_count(d_mn, count = NA), c(2, 0, 3, 1)) + d_mn <- data.frame( + c1 = c("a", "b", NA, "c"), + c2 = c(NA, "b", NA, "d"), + c3 = c(NA, 4, NA, NA), + c4 = c(2, 3, 7, Inf), + stringsAsFactors = FALSE + ) + expect_identical(row_count(d_mn, count = "b"), c(0, 2, 0, 0)) + expect_identical(row_count(d_mn, count = Inf), c(0, 0, 0, 1)) +}) + +test_that("row_count, errors or messages", { + data(iris) + expect_error(expect_warning(row_count(iris, 
select = "abc")), regex = "must be a valid") + expect_error(expect_warning(row_count(iris, select = "abc", count = 3)), regex = "No columns") + expect_error(row_count(iris[1], count = 3), regex = "with at least") + expect_error(row_count(iris[-seq_len(nrow(iris)), , drop = FALSE], count = 2), regex = "one row") +}) + +test_that("row_count, allow_coercion match", { + d_mn <- data.frame( + c1 = c("1", "2", NA, "3"), + c2 = c(NA, "2", NA, "3"), + c3 = c(NA, 4, NA, NA), + c4 = c(2, 3, 7, Inf), + stringsAsFactors = FALSE + ) + expect_identical(row_count(d_mn, count = 2, allow_coercion = TRUE), c(1, 2, 0, 0)) + expect_identical(row_count(d_mn, count = 2, allow_coercion = FALSE), c(1, 0, 0, 0)) + expect_identical(row_count(d_mn, count = "2", allow_coercion = FALSE), c(0, 2, 0, 0)) + expect_identical(row_count(d_mn, count = factor("2"), allow_coercion = TRUE), c(1, 2, 0, 0)) + expect_error(row_count(d_mn, count = factor("2"), allow_coercion = FALSE), regex = "No column has") + + # mix character / factor + d_mn <- data.frame( + c1 = factor(c("1", "2", NA, "3")), + c2 = c("2", "1", NA, "3"), + c3 = c(NA, 4, NA, NA), + c4 = c(2, 3, 7, Inf), + stringsAsFactors = FALSE + ) + expect_identical(row_count(d_mn, count = 2, allow_coercion = TRUE), c(2, 1, 0, 0)) + expect_identical(row_count(d_mn, count = 2, allow_coercion = FALSE), c(1, 0, 0, 0)) + expect_identical(row_count(d_mn, count = "2", allow_coercion = FALSE), c(1, 0, 0, 0)) + expect_identical(row_count(d_mn, count = "2", allow_coercion = TRUE), c(2, 1, 0, 0)) + expect_identical(row_count(d_mn, count = factor("2"), allow_coercion = FALSE), c(0, 1, 0, 0)) + expect_identical(row_count(d_mn, count = factor("2"), allow_coercion = TRUE), c(2, 1, 0, 0)) +}) From 213b9d521eaa636b99a15b7a1962e96a5c9b939b Mon Sep 17 00:00:00 2001 From: Daniel Date: Fri, 11 Oct 2024 11:50:50 +0200 Subject: [PATCH 12/19] Add `row_sums()` (#552) * Draft `row_sums()` as complement to `row_means()` * version --- DESCRIPTION | 2 +- NAMESPACE | 1 + NEWS.md | 3 + R/row_means.R | 142 ++++++++++++++++++++++---------- man/row_means.Rd | 55 +++++++++---- tests/testthat/test-row_means.R | 11 ++- 6 files changed, 151 insertions(+), 63 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 8fa9eee94..841bb2573 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Type: Package Package: datawizard Title: Easy Data Wrangling and Statistical Transformations -Version: 0.13.0.6 +Version: 0.13.0.7 Authors@R: c( person("Indrajeet", "Patil", , "patilindrajeet.science@gmail.com", role = "aut", comment = c(ORCID = "0000-0003-1995-6531")), diff --git a/NAMESPACE b/NAMESPACE index 1775af562..7e97817b9 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -298,6 +298,7 @@ export(reverse) export(reverse_scale) export(row_count) export(row_means) +export(row_sums) export(row_to_colnames) export(rowid_as_column) export(rownames_as_column) diff --git a/NEWS.md b/NEWS.md index da3296536..679a71a9c 100644 --- a/NEWS.md +++ b/NEWS.md @@ -11,6 +11,9 @@ CHANGES variables, can now also be a character vector with quoted variable names, including a colon to indicate a range of several variables (e.g. `"cyl:gear"`). +* New function `row_sums()`, to calculate row sums (optionally with minimum + amount of valid values), as complement to `row_means()`. + * New function `row_count()`, to count specific values row-wise. 
BUG FIXES diff --git a/R/row_means.R b/R/row_means.R index 4d2876c6a..729c800be 100644 --- a/R/row_means.R +++ b/R/row_means.R @@ -1,15 +1,16 @@ -#' @title Row means (optionally with minimum amount of valid values) +#' @title Row means or sums (optionally with minimum amount of valid values) #' @name row_means -#' @description This function is similar to the SPSS `MEAN.n` function and computes -#' row means from a data frame or matrix if at least `min_valid` values of a row are -#' valid (and not `NA`). +#' @description This function is similar to the SPSS `MEAN.n` or `SUM.n` +#' function and computes row means or row sums from a data frame or matrix if at +#' least `min_valid` values of a row are valid (and not `NA`). #' -#' @param data A data frame with at least two columns, where row means are applied. +#' @param data A data frame with at least two columns, where row means or row +#' sums are applied. #' @param min_valid Optional, a numeric value of length 1. May either be #' - a numeric value that indicates the amount of valid values per row to -#' calculate the row mean; +#' calculate the row mean or row sum; #' - or a value between `0` and `1`, indicating a proportion of valid values per -#' row to calculate the row mean (see 'Details'). +#' row to calculate the row mean or row sum (see 'Details'). #' - `NULL` (default), in which all cases are considered. #' #' If a row's sum of valid values is less than `min_valid`, `NA` will be returned. @@ -17,21 +18,24 @@ #' used for rounding mean values. Negative values are allowed (see 'Details'). #' By default, `digits = NULL` and no rounding is used. #' @param remove_na Logical, if `TRUE` (default), removes missing (`NA`) values -#' before calculating row means. Only applies if `min_valuid` is not specified. +#' before calculating row means or row sums. Only applies if `min_valid` is not +#' specified. #' @param verbose Toggle warnings. #' @inheritParams extract_column_names #' -#' @return A vector with row means for those rows with at least `n` valid values. +#' @return A vector with row means (for `row_means()`) or row sums (for +#' `row_sums()`) for those rows with at least `n` valid values. #' -#' @details Rounding to a negative number of `digits` means rounding to a power of -#' ten, for example `row_means(df, 3, digits = -2)` rounds to the nearest hundred. -#' For `min_valid`, if not `NULL`, `min_valid` must be a numeric value from `0` -#' to `ncol(data)`. If a row in the data frame has at least `min_valid` -#' non-missing values, the row mean is returned. If `min_valid` is a non-integer -#' value from 0 to 1, `min_valid` is considered to indicate the proportion of -#' required non-missing values per row. E.g., if `min_valid = 0.75`, a row must -#' have at least `ncol(data) * min_valid` non-missing values for the row mean -#' to be calculated. See 'Examples'. +#' @details Rounding to a negative number of `digits` means rounding to a power +#' of ten, for example `row_means(df, 3, digits = -2)` rounds to the nearest +#' hundred. For `min_valid`, if not `NULL`, `min_valid` must be a numeric value +#' from `0` to `ncol(data)`. If a row in the data frame has at least `min_valid` +#' non-missing values, the row mean or row sum is returned. If `min_valid` is a +#' non-integer value from 0 to 1, `min_valid` is considered to indicate the +#' proportion of required non-missing values per row. E.g., if +#' `min_valid = 0.75`, a row must have at least `ncol(data) * min_valid` +#' non-missing values for the row mean or row sum to be calculated. 
See +#' 'Examples'. #' #' @examples #' dat <- data.frame( @@ -49,6 +53,7 @@ #' #' # needs at least 4 non-missing values per row #' row_means(dat, min_valid = 4) # 1 valid return value +#' row_sums(dat, min_valid = 4) # 1 valid return value #' #' # needs at least 3 non-missing values per row #' row_means(dat, min_valid = 3) # 2 valid return values @@ -61,6 +66,7 @@ #' #' # needs at least 50% of non-missing values per row #' row_means(dat, min_valid = 0.5) # 3 valid return values +#' row_sums(dat, min_valid = 0.5) #' #' # needs at least 75% of non-missing values per row #' row_means(dat, min_valid = 0.75) # 2 valid return values @@ -84,34 +90,52 @@ row_means <- function(data, verbose = verbose ) - if (is.null(select) || length(select) == 0) { - insight::format_error("No columns selected.") - } + # prepare data, sanity checks + data <- .prepare_row_data(data, select, min_valid, verbose) - data <- .coerce_to_dataframe(data[select]) + # calculate row means + .row_sums_or_means(data, min_valid, digits, remove_na, fun = "mean") +} - # n must be a numeric, non-missing value - if (!is.null(min_valid) && (all(is.na(min_valid)) || !is.numeric(min_valid) || length(min_valid) > 1)) { - insight::format_error("`min_valid` must be a numeric value of length 1.") - } - # make sure we only have numeric values - numeric_columns <- vapply(data, is.numeric, TRUE) - if (!all(numeric_columns)) { - if (verbose) { - insight::format_alert("Only numeric columns are considered for calculation.") - } - data <- data[numeric_columns] - } +#' @rdname row_means +#' @export +row_sums <- function(data, + select = NULL, + exclude = NULL, + min_valid = NULL, + digits = NULL, + ignore_case = FALSE, + regex = FALSE, + remove_na = FALSE, + verbose = TRUE) { + # evaluate arguments + select <- .select_nse(select, + data, + exclude, + ignore_case = ignore_case, + regex = regex, + verbose = verbose + ) + + # prepare data, sanity checks + data <- .prepare_row_data(data, select, min_valid, verbose) + + # calculate row sums + .row_sums_or_means(data, min_valid, digits, remove_na, fun = "sum") +} - # check if we have a data framme with at least two columns - if (ncol(data) < 2) { - insight::format_error("`data` must be a data frame with at least two numeric columns.") - } - # proceed here if min_valid is not NULL +# helper ------------------------ + +# calculate row means or sums +.row_sums_or_means <- function(data, min_valid, digits, remove_na, fun) { if (is.null(min_valid)) { - out <- rowMeans(data, na.rm = remove_na) + # calculate row means or sums for complete data + out <- switch(fun, + mean = rowMeans(data, na.rm = remove_na), + rowSums(data, na.rm = remove_na) + ) } else { # is 'min_valid' indicating a proportion? 
decimals <- min_valid %% 1 @@ -124,9 +148,12 @@ row_means <- function(data, insight::format_error("`min_valid` must be smaller or equal to number of columns in data frame.") } - # row means + # row means or sums to_na <- rowSums(is.na(data)) > ncol(data) - min_valid - out <- rowMeans(data, na.rm = TRUE) + out <- switch(fun, + mean = rowMeans(data, na.rm = TRUE), + rowSums(data, na.rm = TRUE) + ) out[to_na] <- NA } @@ -137,3 +164,34 @@ row_means <- function(data, out } + + +# check that data is in shape for row means or row sums +.prepare_row_data <- function(data, select, min_valid, verbose) { + if (is.null(select) || length(select) == 0) { + insight::format_error("No columns selected.") + } + + data <- .coerce_to_dataframe(data[select]) + + # n must be a numeric, non-missing value + if (!is.null(min_valid) && (all(is.na(min_valid)) || !is.numeric(min_valid) || length(min_valid) > 1)) { + insight::format_error("`min_valid` must be a numeric value of length 1.") + } + + # make sure we only have numeric values + numeric_columns <- vapply(data, is.numeric, TRUE) + if (!all(numeric_columns)) { + if (verbose) { + insight::format_alert("Only numeric columns are considered for calculation.") + } + data <- data[numeric_columns] + } + + # check if we have a data framme with at least two columns + if (ncol(data) < 2) { + insight::format_error("`data` must be a data frame with at least two numeric columns.") + } + + data +} diff --git a/man/row_means.Rd b/man/row_means.Rd index 4d61e8cf2..43d85b5b0 100644 --- a/man/row_means.Rd +++ b/man/row_means.Rd @@ -2,7 +2,8 @@ % Please edit documentation in R/row_means.R \name{row_means} \alias{row_means} -\title{Row means (optionally with minimum amount of valid values)} +\alias{row_sums} +\title{Row means or sums (optionally with minimum amount of valid values)} \usage{ row_means( data, @@ -15,9 +16,22 @@ row_means( remove_na = FALSE, verbose = TRUE ) + +row_sums( + data, + select = NULL, + exclude = NULL, + min_valid = NULL, + digits = NULL, + ignore_case = FALSE, + regex = FALSE, + remove_na = FALSE, + verbose = TRUE +) } \arguments{ -\item{data}{A data frame with at least two columns, where row means are applied.} +\item{data}{A data frame with at least two columns, where row means or row +sums are applied.} \item{select}{Variables that will be included when performing the required tasks. Can be either @@ -60,9 +74,9 @@ excludes no columns.} \item{min_valid}{Optional, a numeric value of length 1. May either be \itemize{ \item a numeric value that indicates the amount of valid values per row to -calculate the row mean; +calculate the row mean or row sum; \item or a value between \code{0} and \code{1}, indicating a proportion of valid values per -row to calculate the row mean (see 'Details'). +row to calculate the row mean or row sum (see 'Details'). \item \code{NULL} (default), in which all cases are considered. } @@ -86,28 +100,31 @@ since the select-helpers may not work when called from inside other functions (see 'Details'), this argument may be used as workaround.} \item{remove_na}{Logical, if \code{TRUE} (default), removes missing (\code{NA}) values -before calculating row means. Only applies if \code{min_valuid} is not specified.} +before calculating row means or row sums. Only applies if \code{min_valid} is not +specified.} \item{verbose}{Toggle warnings.} } \value{ -A vector with row means for those rows with at least \code{n} valid values. 
+A vector with row means (for \code{row_means()}) or row sums (for +\code{row_sums()}) for those rows with at least \code{n} valid values. } \description{ -This function is similar to the SPSS \code{MEAN.n} function and computes -row means from a data frame or matrix if at least \code{min_valid} values of a row are -valid (and not \code{NA}). +This function is similar to the SPSS \code{MEAN.n} or \code{SUM.n} +function and computes row means or row sums from a data frame or matrix if at +least \code{min_valid} values of a row are valid (and not \code{NA}). } \details{ -Rounding to a negative number of \code{digits} means rounding to a power of -ten, for example \code{row_means(df, 3, digits = -2)} rounds to the nearest hundred. -For \code{min_valid}, if not \code{NULL}, \code{min_valid} must be a numeric value from \code{0} -to \code{ncol(data)}. If a row in the data frame has at least \code{min_valid} -non-missing values, the row mean is returned. If \code{min_valid} is a non-integer -value from 0 to 1, \code{min_valid} is considered to indicate the proportion of -required non-missing values per row. E.g., if \code{min_valid = 0.75}, a row must -have at least \code{ncol(data) * min_valid} non-missing values for the row mean -to be calculated. See 'Examples'. +Rounding to a negative number of \code{digits} means rounding to a power +of ten, for example \code{row_means(df, 3, digits = -2)} rounds to the nearest +hundred. For \code{min_valid}, if not \code{NULL}, \code{min_valid} must be a numeric value +from \code{0} to \code{ncol(data)}. If a row in the data frame has at least \code{min_valid} +non-missing values, the row mean or row sum is returned. If \code{min_valid} is a +non-integer value from 0 to 1, \code{min_valid} is considered to indicate the +proportion of required non-missing values per row. E.g., if +\code{min_valid = 0.75}, a row must have at least \code{ncol(data) * min_valid} +non-missing values for the row mean or row sum to be calculated. See +'Examples'. 
} \examples{ dat <- data.frame( @@ -125,6 +142,7 @@ row_means(dat, remove_na = TRUE) # needs at least 4 non-missing values per row row_means(dat, min_valid = 4) # 1 valid return value +row_sums(dat, min_valid = 4) # 1 valid return value # needs at least 3 non-missing values per row row_means(dat, min_valid = 3) # 2 valid return values @@ -137,6 +155,7 @@ row_means(dat, select = c("c1", "c3"), min_valid = 1) # needs at least 50\% of non-missing values per row row_means(dat, min_valid = 0.5) # 3 valid return values +row_sums(dat, min_valid = 0.5) # needs at least 75\% of non-missing values per row row_means(dat, min_valid = 0.75) # 2 valid return values diff --git a/tests/testthat/test-row_means.R b/tests/testthat/test-row_means.R index 8d0504c69..4db0d7039 100644 --- a/tests/testthat/test-row_means.R +++ b/tests/testthat/test-row_means.R @@ -1,4 +1,4 @@ -test_that("row_means", { +test_that("row_means/sums", { d_mn <- data.frame( c1 = c(1, 2, NA, 4), c2 = c(NA, 2, NA, 5), @@ -14,14 +14,21 @@ test_that("row_means", { expect_equal(row_means(d_mn, min_valid = 2, digits = 1), c(1.5, 2.8, NA, 5.7), tolerance = 1e-1) expect_message(row_means(iris), regex = "Only numeric") expect_equal(row_means(iris, verbose = FALSE), rowMeans(iris[, 1:4]), tolerance = 1e-3, ignore_attr = TRUE) + expect_equal(row_sums(d_mn, min_valid = 4), c(NA, 11, NA, NA), tolerance = 1e-3) + expect_equal(row_sums(d_mn, min_valid = 3), c(NA, 11, NA, 17), tolerance = 1e-3) + expect_message(row_sums(iris), regex = "Only numeric") }) -test_that("row_means, errors or messages", { +test_that("row_means/sums, errors or messages", { data(iris) expect_error(expect_warning(row_means(iris, select = "abc")), regex = "No columns") + expect_error(expect_warning(row_sums(iris, select = "abc")), regex = "No columns") expect_error(row_means(iris[1], min_valid = 1), regex = "two numeric") expect_error(row_means(iris, min_valid = 1:4), regex = "numeric value") expect_error(row_means(iris, min_valid = "a"), regex = "numeric value") expect_message(row_means(iris[1:3, ], min_valid = 3), regex = "Only numeric") expect_silent(row_means(iris[1:3, ], min_valid = 3, verbose = FALSE)) + expect_error(row_sums(iris[1], min_valid = 1), regex = "two numeric") + expect_message(row_sums(iris[1:3, ], min_valid = 3), regex = "Only numeric") + expect_silent(row_sums(iris[1:3, ], min_valid = 3, verbose = FALSE)) }) From 3f46e31cae70f43588b2cd8bcf0beaf0b5a54d0c Mon Sep 17 00:00:00 2001 From: Daniel Date: Sat, 12 Oct 2024 09:05:22 +0200 Subject: [PATCH 13/19] set `trust = TRUE` also for RData files (#557) * set `trust = TRUE` also for RData files * fix test * lintr --- DESCRIPTION | 2 +- NEWS.md | 3 +++ R/data_read.R | 4 ++-- tests/testthat/test-data_read.R | 13 +++++++++++++ 4 files changed, 19 insertions(+), 3 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 841bb2573..cc9810def 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Type: Package Package: datawizard Title: Easy Data Wrangling and Statistical Transformations -Version: 0.13.0.7 +Version: 0.13.0.8 Authors@R: c( person("Indrajeet", "Patil", , "patilindrajeet.science@gmail.com", role = "aut", comment = c(ORCID = "0000-0003-1995-6531")), diff --git a/NEWS.md b/NEWS.md index 679a71a9c..b5cdf84c0 100644 --- a/NEWS.md +++ b/NEWS.md @@ -16,6 +16,9 @@ CHANGES * New function `row_count()`, to count specific values row-wise. +* `data_read()` no longer shows warning about forthcoming breaking changes + in upstream packages when reading `.RData` files. 
+ BUG FIXES * `describe_distribution()` no longer errors if the sample was too sparse to compute diff --git a/R/data_read.R b/R/data_read.R index 5137a7735..1306a3f32 100644 --- a/R/data_read.R +++ b/R/data_read.R @@ -161,7 +161,7 @@ data_read <- function(path, # user may decide whether we automatically detect variable type or not if (isTRUE(convert_factors)) { if (verbose) { - msg <- "Variables where all values have associated labels are now converted into factors. If this is not intended, use `convert_factors = FALSE`." + msg <- "Variables where all values have associated labels are now converted into factors. If this is not intended, use `convert_factors = FALSE`." # nolint insight::format_alert(msg) } x[] <- lapply(x, function(i) { @@ -296,7 +296,7 @@ data_read <- function(path, # set up arguments. for RDS, we set trust = TRUE, to avoid warnings rio_args <- list(file = path) # check if we have RDS, and if so, add trust = TRUE - if (file_type == "rds") { + if (file_type %in% c("rds", "rdata")) { rio_args$trust <- TRUE } out <- do.call(rio::import, c(rio_args, list(...))) diff --git a/tests/testthat/test-data_read.R b/tests/testthat/test-data_read.R index fd4884deb..ac316c706 100644 --- a/tests/testthat/test-data_read.R +++ b/tests/testthat/test-data_read.R @@ -154,6 +154,19 @@ test_that("data_read - RDS file, matrix, coercible", { }) + +# RData ----------------------------------- + +test_that("data_read - no warning for RData", { + withr::with_tempfile("temp_file", fileext = ".RData", code = { + data(mtcars) + save(mtcars, file = temp_file) + expect_silent(data_read(temp_file, verbose = FALSE)) + }) +}) + + + # SPSS file ----------------------------------- test_that("data_read - SPSS file", { From 003e2b80daee0398c6301f10ce3a353ac1efd7ca Mon Sep 17 00:00:00 2001 From: Daniel Date: Sat, 19 Oct 2024 16:27:16 +0200 Subject: [PATCH 14/19] `data_read()` preserves class for rds files (#558) * `data_read()` preserves class for rds files * desc. news * fix * add test * typo * Update NEWS.md --------- Co-authored-by: Etienne Bacher <52219252+etiennebacher@users.noreply.github.com> --- DESCRIPTION | 2 +- NEWS.md | 3 +++ R/data_read.R | 41 +++++++++++++++++---------------- man/data_read.Rd | 32 +++++++++++++------------ tests/testthat/test-data_read.R | 27 ++++++++++++++++++++-- 5 files changed, 67 insertions(+), 38 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index cc9810def..f68a1e2eb 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Type: Package Package: datawizard Title: Easy Data Wrangling and Statistical Transformations -Version: 0.13.0.8 +Version: 0.13.0.9 Authors@R: c( person("Indrajeet", "Patil", , "patilindrajeet.science@gmail.com", role = "aut", comment = c(ORCID = "0000-0003-1995-6531")), diff --git a/NEWS.md b/NEWS.md index b5cdf84c0..8fc8a29ca 100644 --- a/NEWS.md +++ b/NEWS.md @@ -24,6 +24,9 @@ BUG FIXES * `describe_distribution()` no longer errors if the sample was too sparse to compute CIs. Instead, it warns the user and returns `NA` (#550). +* `data_read()` preserves variable types when importing files from `rds` or + `rdata` format (#558). + # datawizard 0.13.0 BREAKING CHANGES diff --git a/R/data_read.R b/R/data_read.R index 1306a3f32..b24a5bdc2 100644 --- a/R/data_read.R +++ b/R/data_read.R @@ -15,15 +15,16 @@ #' for SAS data files. #' @param encoding The character encoding used for the file. Usually not needed. 
#' @param convert_factors If `TRUE` (default), numeric variables, where all -#' values have a value label, are assumed to be categorical and converted -#' into factors. If `FALSE`, no variable types are guessed and no conversion -#' of numeric variables into factors will be performed. See also section -#' 'Differences to other packages'. For `data_write()`, this argument only -#' applies to the text (e.g. `.txt` or `.csv`) or spreadsheet file formats (like -#' `.xlsx`). Converting to factors might be useful for these formats because -#' labelled numeric variables are then converted into factors and exported as -#' character columns - else, value labels would be lost and only numeric values -#' are written to the file. +#' values have a value label, are assumed to be categorical and converted into +#' factors. If `FALSE`, no variable types are guessed and no conversion of +#' numeric variables into factors will be performed. For `data_read()`, this +#' argument only applies to file types with *labelled data*, e.g. files from +#' SPSS, SAS or Stata. See also section 'Differences to other packages'. For +#' `data_write()`, this argument only applies to the text (e.g. `.txt` or +#' `.csv`) or spreadsheet file formats (like `.xlsx`). Converting to factors +#' might be useful for these formats because labelled numeric variables are then +#' converted into factors and exported as character columns - else, value labels +#' would be lost and only numeric values are written to the file. #' @param verbose Toggle warnings and messages. #' @param ... Arguments passed to the related `read_*()` or `write_*()` functions. #' @@ -65,12 +66,13 @@ #' @section Differences to other packages that read foreign data formats: #' `data_read()` is most comparable to `rio::import()`. For data files from #' SPSS, SAS or Stata, which support labelled data, variables are converted into -#' their most appropriate type. The major difference to `rio::import()` is that -#' `data_read()` automatically converts fully labelled numeric variables into -#' factors, where imported value labels will be set as factor levels. If a -#' numeric variable has _no_ value labels or less value labels than values, it -#' is not converted to factor. In this case, value labels are preserved as -#' `"labels"` attribute. Character vectors are preserved. Use +#' their most appropriate type. The major difference to `rio::import()` is for +#' data files from SPSS, SAS, or Stata, i.e. file types that support +#' *labelled data*. `data_read()` automatically converts fully labelled numeric +#' variables into factors, where imported value labels will be set as factor +#' levels. If a numeric variable has _no_ value labels or less value labels than +#' values, it is not converted to factor. In this case, value labels are +#' preserved as `"labels"` attribute. Character vectors are preserved. Use #' `convert_factors = FALSE` to remove the automatic conversion of numeric #' variables to factors. #' @@ -105,7 +107,7 @@ data_read <- function(path, por = .read_spss(path, encoding, convert_factors, verbose, ...), dta = .read_stata(path, encoding, convert_factors, verbose, ...), sas7bdat = .read_sas(path, path_catalog, encoding, convert_factors, verbose, ...), - .read_unknown(path, file_type, convert_factors, verbose, ...) + .read_unknown(path, file_type, verbose, ...) 
) # tell user about empty columns @@ -188,7 +190,7 @@ data_read <- function(path, value_labels <- NULL attr(i, "converted_to_factor") <- TRUE } else { - # else, fall back to numeric + # else, fall back to numeric or factor i <- as.numeric(i) } @@ -288,7 +290,7 @@ data_read <- function(path, } -.read_unknown <- function(path, file_type, convert_factors, verbose, ...) { +.read_unknown <- function(path, file_type, verbose, ...) { insight::check_if_installed("rio", reason = paste0("to read files of type '", file_type, "'")) if (verbose) { insight::format_alert("Reading data...") @@ -317,6 +319,5 @@ data_read <- function(path, } out <- tmp } - - .post_process_imported_data(out, convert_factors, verbose) + out } diff --git a/man/data_read.Rd b/man/data_read.Rd index 1ae3cea8a..d7d26255b 100644 --- a/man/data_read.Rd +++ b/man/data_read.Rd @@ -33,15 +33,16 @@ for SAS data files.} \item{encoding}{The character encoding used for the file. Usually not needed.} \item{convert_factors}{If \code{TRUE} (default), numeric variables, where all -values have a value label, are assumed to be categorical and converted -into factors. If \code{FALSE}, no variable types are guessed and no conversion -of numeric variables into factors will be performed. See also section -'Differences to other packages'. For \code{data_write()}, this argument only -applies to the text (e.g. \code{.txt} or \code{.csv}) or spreadsheet file formats (like -\code{.xlsx}). Converting to factors might be useful for these formats because -labelled numeric variables are then converted into factors and exported as -character columns - else, value labels would be lost and only numeric values -are written to the file.} +values have a value label, are assumed to be categorical and converted into +factors. If \code{FALSE}, no variable types are guessed and no conversion of +numeric variables into factors will be performed. For \code{data_read()}, this +argument only applies to file types with \emph{labelled data}, e.g. files from +SPSS, SAS or Stata. See also section 'Differences to other packages'. For +\code{data_write()}, this argument only applies to the text (e.g. \code{.txt} or +\code{.csv}) or spreadsheet file formats (like \code{.xlsx}). Converting to factors +might be useful for these formats because labelled numeric variables are then +converted into factors and exported as character columns - else, value labels +would be lost and only numeric values are written to the file.} \item{verbose}{Toggle warnings and messages.} @@ -118,12 +119,13 @@ versions, use \code{compress = "none"}, for example \code{data_read()} is most comparable to \code{rio::import()}. For data files from SPSS, SAS or Stata, which support labelled data, variables are converted into -their most appropriate type. The major difference to \code{rio::import()} is that -\code{data_read()} automatically converts fully labelled numeric variables into -factors, where imported value labels will be set as factor levels. If a -numeric variable has \emph{no} value labels or less value labels than values, it -is not converted to factor. In this case, value labels are preserved as -\code{"labels"} attribute. Character vectors are preserved. Use +their most appropriate type. The major difference to \code{rio::import()} is for +data files from SPSS, SAS, or Stata, i.e. file types that support +\emph{labelled data}. \code{data_read()} automatically converts fully labelled numeric +variables into factors, where imported value labels will be set as factor +levels. 
If a numeric variable has \emph{no} value labels or less value labels than +values, it is not converted to factor. In this case, value labels are +preserved as \code{"labels"} attribute. Character vectors are preserved. Use \code{convert_factors = FALSE} to remove the automatic conversion of numeric variables to factors. } diff --git a/tests/testthat/test-data_read.R b/tests/testthat/test-data_read.R index ac316c706..15f1161d3 100644 --- a/tests/testthat/test-data_read.R +++ b/tests/testthat/test-data_read.R @@ -141,12 +141,12 @@ test_that("data_read - RDS file, matrix, coercible", { httr::stop_for_status(request) writeBin(httr::content(request, type = "raw"), temp_file) - expect_message(expect_message(expect_message({ + expect_message({ d <- data_read( temp_file, verbose = TRUE ) - })), regex = "0 out of 5") + }) expect_s3_class(d, "data.frame") expect_identical(dim(d), c(2L, 5L)) @@ -155,6 +155,29 @@ test_that("data_read - RDS file, matrix, coercible", { +# RDS file, preserve class /types ----------------------------------- + +test_that("data_read - RDS file, preserve class", { + withr::with_tempfile("temp_file", fileext = ".rds", code = { + request <- httr::GET("https://raw.github.com/easystats/circus/main/data/hiv.rds") + httr::stop_for_status(request) + writeBin(httr::content(request, type = "raw"), temp_file) + + d <- data_read(temp_file) + expect_s3_class(d, "data.frame") + expect_identical( + sapply(d, class), + c( + village = "integer", outcome = "integer", distance = "numeric", + amount = "numeric", incentive = "integer", age = "integer", + hiv2004 = "integer", agecat = "factor" + ) + ) + }) +}) + + + # RData ----------------------------------- test_that("data_read - no warning for RData", { From 5703d853911d5d1d000a4cc20788166e3bd98aae Mon Sep 17 00:00:00 2001 From: "Mattan S. Ben-Shachar" <35330040+mattansb@users.noreply.github.com> Date: Sun, 3 Nov 2024 10:53:12 +0200 Subject: [PATCH 15/19] Use dev insight test with changes introduced in https://github.com/easystats/insight/pull/952 --- DESCRIPTION | 1 + 1 file changed, 1 insertion(+) diff --git a/DESCRIPTION b/DESCRIPTION index f68a1e2eb..8127a2e79 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -76,3 +76,4 @@ RoxygenNote: 7.3.2 Config/testthat/edition: 3 Config/testthat/parallel: true Config/Needs/website: easystats/easystatstemplate +Remotes: easystats/insight From b723df21b3644975bc6e46c44115d8c7b2826f57 Mon Sep 17 00:00:00 2001 From: Daniel Date: Mon, 4 Nov 2024 22:59:39 +0100 Subject: [PATCH 16/19] Fix snapshot tests (#560) * Fix snapshot tests Fixes #559 * update * pass ... to export_table * update tests * pass ... 
* update tests * docs * Trigger CI * lintr --- DESCRIPTION | 2 +- R/data_codebook.R | 6 +- R/data_tabulate.R | 9 +- R/data_xtabulate.R | 3 +- R/describe_distribution.R | 2 +- man/data_codebook.Rd | 3 +- man/data_tabulate.Rd | 3 +- tests/testthat/_snaps/data_codebook.md | 264 ++++++++++++++++++++++++- tests/testthat/test-data_codebook.R | 32 +-- 9 files changed, 299 insertions(+), 25 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 8127a2e79..a680cfd02 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Type: Package Package: datawizard Title: Easy Data Wrangling and Statistical Transformations -Version: 0.13.0.9 +Version: 0.13.0.11 Authors@R: c( person("Indrajeet", "Patil", , "patilindrajeet.science@gmail.com", role = "aut", comment = c(ORCID = "0000-0003-1995-6531")), diff --git a/R/data_codebook.R b/R/data_codebook.R index 71e2e5828..5dffbf9c2 100644 --- a/R/data_codebook.R +++ b/R/data_codebook.R @@ -33,7 +33,8 @@ #' #' @note There are methods to `print()` the data frame in a nicer output, as #' well methods for printing in markdown or HTML format (`print_md()` and -#' `print_html()`). +#' `print_html()`). The `print()` method for text outputs passes arguments in +#' `...` to [`insight::export_table()`]. #' #' @examples #' data(iris) @@ -369,7 +370,8 @@ print.data_codebook <- function(x, ...) { title = caption, empty_line = "-", cross = "+", - align = .get_codebook_align(x) + align = .get_codebook_align(x), + ... ) ) } diff --git a/R/data_tabulate.R b/R/data_tabulate.R index e94fc5d55..621fabbb1 100644 --- a/R/data_tabulate.R +++ b/R/data_tabulate.R @@ -48,7 +48,8 @@ #' @note #' There are `print_html()` and `print_md()` methods available for printing #' frequency or crosstables in HTML and markdown format, e.g. -#' `print_html(data_tabulate(x))`. +#' `print_html(data_tabulate(x))`. The `print()` method for text outputs passes +#' arguments in `...` to [`insight::export_table()`]. #' #' @return A data frame, or a list of data frames, with one frequency table #' as data frame per variable. @@ -522,7 +523,8 @@ print.datawizard_table <- function(x, big_mark = NULL, ...) { cat(insight::export_table( format(x, big_mark = big_mark, ...), cross = "+", - missing = "" + missing = "", + ... )) invisible(x) } @@ -621,7 +623,8 @@ print.datawizard_tables <- function(x, big_mark = NULL, ...) { out, missing = "", cross = "+", - empty_line = "-" + empty_line = "-", + ... )) } } diff --git a/R/data_xtabulate.R b/R/data_xtabulate.R index 08be1eeca..c9595eccf 100644 --- a/R/data_xtabulate.R +++ b/R/data_xtabulate.R @@ -198,7 +198,8 @@ print.datawizard_crosstab <- function(x, big_mark = NULL, ...) { cross = "+", missing = "", caption = caption, - empty_line = "-" + empty_line = "-", + ... )) invisible(x) } diff --git a/R/describe_distribution.R b/R/describe_distribution.R index 2e61c1dc3..64f6e29c1 100644 --- a/R/describe_distribution.R +++ b/R/describe_distribution.R @@ -513,7 +513,7 @@ print.parameters_distribution <- function(x, digits = 2, ...) { ci_brackets = TRUE, ... ) - cat(insight::export_table(formatted_table, format = "text", digits = digits)) + cat(insight::export_table(formatted_table, format = "text", digits = digits, ...)) invisible(x) } diff --git a/man/data_codebook.Rd b/man/data_codebook.Rd index 319f4e5b6..d5a542be4 100644 --- a/man/data_codebook.Rd +++ b/man/data_codebook.Rd @@ -127,7 +127,8 @@ labels, values or value range, frequencies, amount of missing values). 
\note{ There are methods to \code{print()} the data frame in a nicer output, as well methods for printing in markdown or HTML format (\code{print_md()} and -\code{print_html()}). +\code{print_html()}). The \code{print()} method for text outputs passes arguments in +\code{...} to \code{\link[insight:export_table]{insight::export_table()}}. } \examples{ data(iris) diff --git a/man/data_tabulate.Rd b/man/data_tabulate.Rd index 032c0b989..b28a26ede 100644 --- a/man/data_tabulate.Rd +++ b/man/data_tabulate.Rd @@ -167,7 +167,8 @@ frequency tables as data frame. See 'Examples'. \note{ There are \code{print_html()} and \code{print_md()} methods available for printing frequency or crosstables in HTML and markdown format, e.g. -\code{print_html(data_tabulate(x))}. +\code{print_html(data_tabulate(x))}. The \code{print()} method for text outputs passes +arguments in \code{...} to \code{\link[insight:export_table]{insight::export_table()}}. } \section{Crosstables}{ diff --git a/tests/testthat/_snaps/data_codebook.md b/tests/testthat/_snaps/data_codebook.md index c390ba890..8f9b9e7b5 100644 --- a/tests/testthat/_snaps/data_codebook.md +++ b/tests/testthat/_snaps/data_codebook.md @@ -139,7 +139,7 @@ # data_codebook efc Code - data_codebook(efc) + print(data_codebook(efc), table_width = Inf) Output efc (100 rows and 5 variables, 5 shown) @@ -162,10 +162,94 @@ 5 | neg_c_7 | Negative impact with 7 items | numeric | 3 (3.0%) | [7, 28] | | 97 --------------------------------------------------------------------------------------------------------------------------------------------- +--- + + Code + print(data_codebook(efc), table_width = "auto", remove_duplicates = FALSE) + Output + efc (100 rows and 5 variables, 5 shown) + + ID | Name | Label | Type + ---+----------+------------------------------------------+------------ + 1 | c12hour | average number of hours of care per week | numeric + ---+----------+------------------------------------------+------------ + 2 | e16sex | elder's gender | numeric + | | | + ---+----------+------------------------------------------+------------ + 3 | e42dep | elder's dependency | categorical + | | | + | | | + | | | + ---+----------+------------------------------------------+------------ + 4 | c172code | carer's level of education | numeric + | | | + | | | + ---+----------+------------------------------------------+------------ + 5 | neg_c_7 | Negative impact with 7 items | numeric + ---------------------------------------------------------------------- + + ID | Missings | Values | Value Labels | N + ---+------------+----------+---------------------------------+----------- + 1 | 2 (2.0%) | [5, 168] | | 98 + ---+------------+----------+---------------------------------+----------- + 2 | 0 (0.0%) | 1 | male | 46 (46.0%) + | | 2 | female | 54 (54.0%) + ---+------------+----------+---------------------------------+----------- + 3 | 3 (3.0%) | 1 | independent | 2 ( 2.1%) + | | 2 | slightly dependent | 4 ( 4.1%) + | | 3 | moderately dependent | 28 (28.9%) + | | 4 | severely dependent | 63 (64.9%) + ---+------------+----------+---------------------------------+----------- + 4 | 10 (10.0%) | 1 | low level of education | 8 ( 8.9%) + | | 2 | intermediate level of education | 66 (73.3%) + | | 3 | high level of education | 16 (17.8%) + ---+------------+----------+---------------------------------+----------- + 5 | 3 (3.0%) | [7, 28] | | 97 + ------------------------------------------------------------------------- + +--- + + Code + print(data_codebook(efc), table_width = "auto", 
remove_duplicates = TRUE) + Output + efc (100 rows and 5 variables, 5 shown) + + ID | Name | Label | Type + ---+----------+------------------------------------------+------------ + 1 | c12hour | average number of hours of care per week | numeric + ---+----------+------------------------------------------+------------ + 2 | e16sex | elder's gender | numeric + ---+----------+------------------------------------------+------------ + 3 | e42dep | elder's dependency | categorical + ---+----------+------------------------------------------+------------ + 4 | c172code | carer's level of education | numeric + ---+----------+------------------------------------------+------------ + 5 | neg_c_7 | Negative impact with 7 items | numeric + ---------------------------------------------------------------------- + + ID | Missings | Values | Value Labels | N + ---+------------+----------+---------------------------------+----------- + 1 | 2 (2.0%) | [5, 168] | | 98 + ---+------------+----------+---------------------------------+----------- + 2 | 0 (0.0%) | 1 | male | 46 (46.0%) + | | 2 | female | 54 (54.0%) + ---+------------+----------+---------------------------------+----------- + 3 | 3 (3.0%) | 1 | independent | 2 ( 2.1%) + | | 2 | slightly dependent | 4 ( 4.1%) + | | 3 | moderately dependent | 28 (28.9%) + | | 4 | severely dependent | 63 (64.9%) + ---+------------+----------+---------------------------------+----------- + 4 | 10 (10.0%) | 1 | low level of education | 8 ( 8.9%) + | | 2 | intermediate level of education | 66 (73.3%) + | | 3 | high level of education | 16 (17.8%) + ---+------------+----------+---------------------------------+----------- + 5 | 3 (3.0%) | [7, 28] | | 97 + ------------------------------------------------------------------------- + # data_codebook efc, variable_label_width Code - data_codebook(efc, variable_label_width = 30) + print(out, table_width = Inf) Output efc (100 rows and 5 variables, 5 shown) @@ -189,10 +273,97 @@ 5 | neg_c_7 | Negative impact with 7 items | numeric | 3 (3.0%) | [7, 28] | | 97 --------------------------------------------------------------------------------------------------------------------------------- +--- + + Code + print(out, table_width = "auto", remove_duplicates = FALSE) + Output + efc (100 rows and 5 variables, 5 shown) + + ID | Name | Label | Type | Missings + ---+----------+------------------------------+-------------+----------- + 1 | c12hour | average number of hours of | numeric | 2 (2.0%) + | | care per week | | + ---+----------+------------------------------+-------------+----------- + 2 | e16sex | elder's gender | numeric | 0 (0.0%) + | | | | + ---+----------+------------------------------+-------------+----------- + 3 | e42dep | elder's dependency | categorical | 3 (3.0%) + | | | | + | | | | + | | | | + ---+----------+------------------------------+-------------+----------- + 4 | c172code | carer's level of education | numeric | 10 (10.0%) + | | | | + | | | | + ---+----------+------------------------------+-------------+----------- + 5 | neg_c_7 | Negative impact with 7 items | numeric | 3 (3.0%) + ----------------------------------------------------------------------- + + ID | Values | Value Labels | N + ---+----------+---------------------------------+----------- + 1 | [5, 168] | | 98 + | | | + ---+----------+---------------------------------+----------- + 2 | 1 | male | 46 (46.0%) + | 2 | female | 54 (54.0%) + ---+----------+---------------------------------+----------- + 3 | 1 | independent | 2 ( 2.1%) + | 2 | slightly 
dependent | 4 ( 4.1%) + | 3 | moderately dependent | 28 (28.9%) + | 4 | severely dependent | 63 (64.9%) + ---+----------+---------------------------------+----------- + 4 | 1 | low level of education | 8 ( 8.9%) + | 2 | intermediate level of education | 66 (73.3%) + | 3 | high level of education | 16 (17.8%) + ---+----------+---------------------------------+----------- + 5 | [7, 28] | | 97 + ------------------------------------------------------------ + +--- + + Code + print(out, table_width = "auto", remove_duplicates = TRUE) + Output + efc (100 rows and 5 variables, 5 shown) + + ID | Name | Label | Type | Missings + ---+----------+------------------------------+-------------+----------- + 1 | c12hour | average number of hours of | numeric | 2 (2.0%) + | | care per week | | + ---+----------+------------------------------+-------------+----------- + 2 | e16sex | elder's gender | numeric | 0 (0.0%) + ---+----------+------------------------------+-------------+----------- + 3 | e42dep | elder's dependency | categorical | 3 (3.0%) + ---+----------+------------------------------+-------------+----------- + 4 | c172code | carer's level of education | numeric | 10 (10.0%) + ---+----------+------------------------------+-------------+----------- + 5 | neg_c_7 | Negative impact with 7 items | numeric | 3 (3.0%) + ----------------------------------------------------------------------- + + ID | Values | Value Labels | N + ---+----------+---------------------------------+----------- + 1 | [5, 168] | | 98 + ---+----------+---------------------------------+----------- + 2 | 1 | male | 46 (46.0%) + | 2 | female | 54 (54.0%) + ---+----------+---------------------------------+----------- + 3 | 1 | independent | 2 ( 2.1%) + | 2 | slightly dependent | 4 ( 4.1%) + | 3 | moderately dependent | 28 (28.9%) + | 4 | severely dependent | 63 (64.9%) + ---+----------+---------------------------------+----------- + 4 | 1 | low level of education | 8 ( 8.9%) + | 2 | intermediate level of education | 66 (73.3%) + | 3 | high level of education | 16 (17.8%) + ---+----------+---------------------------------+----------- + 5 | [7, 28] | | 97 + ------------------------------------------------------------ + # data_codebook efc, value_label_width Code - data_codebook(efc, variable_label_width = 30, value_label_width = 15) + print(out, table_width = Inf) Output efc (100 rows and 5 variables, 5 shown) @@ -216,6 +387,93 @@ 5 | neg_c_7 | Negative impact with 7 items | numeric | 3 (3.0%) | [7, 28] | | 97 ------------------------------------------------------------------------------------------------------------------ +--- + + Code + print(out, table_width = "auto", remove_duplicates = FALSE) + Output + efc (100 rows and 5 variables, 5 shown) + + ID | Name | Label | Type | Missings + ---+----------+------------------------------+-------------+----------- + 1 | c12hour | average number of hours of | numeric | 2 (2.0%) + | | care per week | | + ---+----------+------------------------------+-------------+----------- + 2 | e16sex | elder's gender | numeric | 0 (0.0%) + | | | | + ---+----------+------------------------------+-------------+----------- + 3 | e42dep | elder's dependency | categorical | 3 (3.0%) + | | | | + | | | | + | | | | + ---+----------+------------------------------+-------------+----------- + 4 | c172code | carer's level of education | numeric | 10 (10.0%) + | | | | + | | | | + ---+----------+------------------------------+-------------+----------- + 5 | neg_c_7 | Negative impact with 7 items | numeric | 3 (3.0%) + 
----------------------------------------------------------------------- + + ID | Values | Value Labels | N + ---+----------+------------------+----------- + 1 | [5, 168] | | 98 + | | | + ---+----------+------------------+----------- + 2 | 1 | male | 46 (46.0%) + | 2 | female | 54 (54.0%) + ---+----------+------------------+----------- + 3 | 1 | independent | 2 ( 2.1%) + | 2 | slightly... | 4 ( 4.1%) + | 3 | moderately... | 28 (28.9%) + | 4 | severely... | 63 (64.9%) + ---+----------+------------------+----------- + 4 | 1 | low level of... | 8 ( 8.9%) + | 2 | intermediate... | 66 (73.3%) + | 3 | high level of... | 16 (17.8%) + ---+----------+------------------+----------- + 5 | [7, 28] | | 97 + --------------------------------------------- + +--- + + Code + print(out, table_width = "auto", remove_duplicates = TRUE) + Output + efc (100 rows and 5 variables, 5 shown) + + ID | Name | Label | Type | Missings + ---+----------+------------------------------+-------------+----------- + 1 | c12hour | average number of hours of | numeric | 2 (2.0%) + | | care per week | | + ---+----------+------------------------------+-------------+----------- + 2 | e16sex | elder's gender | numeric | 0 (0.0%) + ---+----------+------------------------------+-------------+----------- + 3 | e42dep | elder's dependency | categorical | 3 (3.0%) + ---+----------+------------------------------+-------------+----------- + 4 | c172code | carer's level of education | numeric | 10 (10.0%) + ---+----------+------------------------------+-------------+----------- + 5 | neg_c_7 | Negative impact with 7 items | numeric | 3 (3.0%) + ----------------------------------------------------------------------- + + ID | Values | Value Labels | N + ---+----------+------------------+----------- + 1 | [5, 168] | | 98 + ---+----------+------------------+----------- + 2 | 1 | male | 46 (46.0%) + | 2 | female | 54 (54.0%) + ---+----------+------------------+----------- + 3 | 1 | independent | 2 ( 2.1%) + | 2 | slightly... | 4 ( 4.1%) + | 3 | moderately... | 28 (28.9%) + | 4 | severely... | 63 (64.9%) + ---+----------+------------------+----------- + 4 | 1 | low level of... | 8 ( 8.9%) + | 2 | intermediate... | 66 (73.3%) + | 3 | high level of... 
| 16 (17.8%) + ---+----------+------------------+----------- + 5 | [7, 28] | | 97 + --------------------------------------------- + # data_codebook truncated data Code diff --git a/tests/testthat/test-data_codebook.R b/tests/testthat/test-data_codebook.R index 26a67ccf6..06e9bd2f9 100644 --- a/tests/testthat/test-data_codebook.R +++ b/tests/testthat/test-data_codebook.R @@ -19,7 +19,7 @@ test_that("data_codebook NaN and Inf", { set.seed(123) d <- data.frame( - x = c(sample(1:15, 100, TRUE), Inf, Inf) + x = c(sample.int(15, 100, TRUE), Inf, Inf) ) expect_snapshot(data_codebook(d)) expect_snapshot(data_codebook(d, range_at = 100)) @@ -38,24 +38,32 @@ test_that("data_codebook iris, select, ID", { test_that("data_codebook efc", { - expect_snapshot(data_codebook(efc)) + expect_snapshot(print(data_codebook(efc), table_width = Inf)) + expect_snapshot(print(data_codebook(efc), table_width = "auto", remove_duplicates = FALSE)) + expect_snapshot(print(data_codebook(efc), table_width = "auto", remove_duplicates = TRUE)) }) test_that("data_codebook efc, variable_label_width", { - expect_snapshot(data_codebook(efc, variable_label_width = 30)) + out <- data_codebook(efc, variable_label_width = 30) + expect_snapshot(print(out, table_width = Inf)) + expect_snapshot(print(out, table_width = "auto", remove_duplicates = FALSE)) + expect_snapshot(print(out, table_width = "auto", remove_duplicates = TRUE)) }) test_that("data_codebook efc, value_label_width", { - expect_snapshot(data_codebook(efc, variable_label_width = 30, value_label_width = 15)) + out <- data_codebook(efc, variable_label_width = 30, value_label_width = 15) + expect_snapshot(print(out, table_width = Inf)) + expect_snapshot(print(out, table_width = "auto", remove_duplicates = FALSE)) + expect_snapshot(print(out, table_width = "auto", remove_duplicates = TRUE)) }) test_that("data_codebook truncated data", { set.seed(123) d <- data.frame( - a = sample(1:15, 100, TRUE), + a = sample.int(15, 100, TRUE), b = sample(letters[1:18], 100, TRUE), stringsAsFactors = FALSE ) @@ -66,7 +74,7 @@ test_that("data_codebook truncated data", { test_that("data_codebook mixed numeric lengths", { set.seed(123) d <- data.frame( - a = sample(1:4, 100, TRUE), + a = sample.int(4, 100, TRUE), b = sample(5:15, 100, TRUE), stringsAsFactors = FALSE ) @@ -76,7 +84,7 @@ test_that("data_codebook mixed numeric lengths", { test_that("data_codebook mixed range_at", { set.seed(123) d <- data.frame( - a = sample(1:4, 100, TRUE), + a = sample.int(4, 100, TRUE), b = sample(5:15, 100, TRUE), stringsAsFactors = FALSE ) @@ -87,7 +95,7 @@ test_that("data_codebook mixed range_at", { test_that("data_codebook logicals", { set.seed(123) d <- data.frame( - a = sample(1:15, 100, TRUE), + a = sample.int(15, 100, TRUE), b = sample(letters[1:3], 100, TRUE), c = sample(c(TRUE, FALSE), 100, TRUE), stringsAsFactors = FALSE @@ -99,14 +107,14 @@ test_that("data_codebook logicals", { test_that("data_codebook labelled data exceptions", { set.seed(123) - f1 <- sample(1:5, 100, TRUE) + f1 <- sample.int(5, 100, TRUE) f1[f1 == 4] <- NA attr(f1, "labels") <- setNames(1:5, c("One", "Two", "Three", "Four", "Five")) - f2 <- sample(1:5, 100, TRUE) + f2 <- sample.int(5, 100, TRUE) attr(f2, "labels") <- setNames(c(1:3, 5), c("One", "Two", "Three", "Five")) - f3 <- sample(1:5, 100, TRUE) + f3 <- sample.int(5, 100, TRUE) attr(f3, "labels") <- setNames(1:5, c("One", "Two", "Three", "Four", "Five")) d <- data.frame(f1, f2, f3) @@ -143,7 +151,7 @@ test_that("data_codebook works with numbers < 1", { 
test_that("data_codebook, big marks", { set.seed(123) f1 <- factor(sample(c("c", "b", "a"), 1e6, TRUE)) - f2 <- factor(sample(1:3, 1e6, TRUE)) + f2 <- factor(sample.int(3, 1e6, TRUE)) d <- data.frame(f1, f2) expect_snapshot(data_codebook(d)) }) From c0ce69275e0bc81b864d50b333acb916a9f708b7 Mon Sep 17 00:00:00 2001 From: Daniel Date: Fri, 8 Nov 2024 22:34:19 +0100 Subject: [PATCH 17/19] Trigger CI --- DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index a680cfd02..ba821b0ba 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Type: Package Package: datawizard Title: Easy Data Wrangling and Statistical Transformations -Version: 0.13.0.11 +Version: 0.13.0.12 Authors@R: c( person("Indrajeet", "Patil", , "patilindrajeet.science@gmail.com", role = "aut", comment = c(ORCID = "0000-0003-1995-6531")), From 9baa22be43af54053d5586323b901951faa4a9a5 Mon Sep 17 00:00:00 2001 From: Daniel Date: Thu, 21 Nov 2024 10:28:04 +0100 Subject: [PATCH 18/19] Allow `n()` in `data_modify()` (#535) * Allow `n()` in `data_modify()` * lintr, styler * Update NEWS.md Co-authored-by: Etienne Bacher <52219252+etiennebacher@users.noreply.github.com> * Update R/data_modify.R Co-authored-by: Etienne Bacher <52219252+etiennebacher@users.noreply.github.com> * comments * fix test * update rd * modify error msg * error on invalid function * move news item --------- Co-authored-by: Etienne Bacher <52219252+etiennebacher@users.noreply.github.com> --- NEWS.md | 3 ++ R/data_modify.R | 21 +++++++++++-- man/data_modify.Rd | 6 +++- tests/testthat/test-data_modify.R | 52 +++++++++++++++++++++++++++++++ 4 files changed, 79 insertions(+), 3 deletions(-) diff --git a/NEWS.md b/NEWS.md index 8fc8a29ca..663efa310 100644 --- a/NEWS.md +++ b/NEWS.md @@ -19,6 +19,9 @@ CHANGES * `data_read()` no longer shows warning about forthcoming breaking changes in upstream packages when reading `.RData` files. +* `data_modify()` now recognizes `n()`, for example to create an index for data groups + with `1:n()` (#535). + BUG FIXES * `describe_distribution()` no longer errors if the sample was too sparse to compute diff --git a/R/data_modify.R b/R/data_modify.R index e7744c1f5..3e30b8f68 100644 --- a/R/data_modify.R +++ b/R/data_modify.R @@ -22,6 +22,9 @@ #' character vector is provided, you may not add further elements to `...`. #' - Using `NULL` as right-hand side removes a variable from the data frame. #' Example: `Petal.Width = NULL`. +#' - For data frames (including grouped ones), the function `n()` can be used to count the +#' number of observations and thereby, for instance, create index values by +#' using `id = 1:n()` or `id = 3:(n()+2)` and similar. #' #' Note that newly created variables can be used in subsequent expressions, #' including `.at` or `.if`. See also 'Examples'. @@ -92,7 +95,8 @@ #' grouped_efc, #' c12hour_c = center(c12hour), #' c12hour_z = c12hour_c / sd(c12hour, na.rm = TRUE), -#' c12hour_z2 = standardize(c12hour) +#' c12hour_z2 = standardize(c12hour), +#' id = 1:n() #' ) #' head(new_efc) #' @@ -145,6 +149,11 @@ data_modify.default <- function(data, ...) { data_modify.data.frame <- function(data, ..., .if = NULL, .at = NULL, .modify = NULL) { dots <- eval(substitute(alist(...))) + # error for data frames with no rows... + if (nrow(data) == 0) { + insight::format_error("`data` is an empty data frame. 
`data_modify()` only works for data frames with at least one row.") # nolint + } + # check if we have dots, or only at/modify ---- if (length(dots)) { @@ -201,6 +210,10 @@ data_modify.grouped_df <- function(data, ..., .if = NULL, .at = NULL, .modify = # the data.frame method later... dots <- match.call(expand.dots = FALSE)[["..."]] + # error for data frames with no rows... + if (nrow(data) == 0) { + insight::format_error("`data` is an empty data frame. `data_modify()` only works for data frames with at least one row.") # nolint + } grps <- attr(data, "groups", exact = TRUE) grps <- grps[[".rows"]] @@ -352,8 +365,12 @@ data_modify.grouped_df <- function(data, ..., .if = NULL, .at = NULL, .modify = # finally, we can evaluate expression and get values for new variables symbol_string <- insight::safe_deparse(symbol) if (!is.null(symbol_string) && all(symbol_string == "n()")) { - # "special" functions + # "special" functions - using "n()" just returns number of rows new_variable <- nrow(data) + } else if (!is.null(symbol_string) && length(symbol_string) == 1 && grepl("\\bn\\(\\)", symbol_string)) { + # "special" functions, like "1:n()" or similar - but not "1:fun()" + symbol_string <- str2lang(gsub("n()", "nrow(data)", symbol_string, fixed = TRUE)) + new_variable <- try(with(data, eval(symbol_string)), silent = TRUE) } else { # default evaluation of expression new_variable <- try(with(data, eval(symbol)), silent = TRUE) diff --git a/man/data_modify.Rd b/man/data_modify.Rd index 042962e03..28533ecea 100644 --- a/man/data_modify.Rd +++ b/man/data_modify.Rd @@ -30,6 +30,9 @@ type of expression cannot be mixed with other expressions, i.e. if a character vector is provided, you may not add further elements to \code{...}. \item Using \code{NULL} as right-hand side removes a variable from the data frame. Example: \code{Petal.Width = NULL}. +\item For data frames (including grouped ones), the function \code{n()} can be used to count the +number of observations and thereby, for instance, create index values by +using \code{id = 1:n()} or \code{id = 3:(n()+2)} and similar. 
} Note that newly created variables can be used in subsequent expressions, @@ -109,7 +112,8 @@ new_efc <- data_modify( grouped_efc, c12hour_c = center(c12hour), c12hour_z = c12hour_c / sd(c12hour, na.rm = TRUE), - c12hour_z2 = standardize(c12hour) + c12hour_z2 = standardize(c12hour), + id = 1:n() ) head(new_efc) diff --git a/tests/testthat/test-data_modify.R b/tests/testthat/test-data_modify.R index 9bb0a92d6..a7a153c43 100644 --- a/tests/testthat/test-data_modify.R +++ b/tests/testthat/test-data_modify.R @@ -353,6 +353,16 @@ test_that("data_modify errors for non df", { }) +test_that("data_modify errors for empty data frames", { + data(mtcars) + x <- mtcars[1, ] + expect_error( + data_modify(x[-1, ], new_var = 5), + regex = "empty data frame" + ) +}) + + test_that("data_modify errors for non df", { data(efc) a <- "center(c22hour)" # <---------------- error in variable name @@ -492,6 +502,20 @@ test_that("data_modify works with functions that return character vectors", { }) +test_that("data_modify 1:n() and similar works in (grouped) data frames", { + data(mtcars) + out <- data_modify(mtcars, Trials = 1:n()) # nolint + expect_identical(out$Trials, 1:32) + x <- data_group(mtcars, "gear") + out <- data_modify(x, Trials = 1:n()) # nolint + expect_identical(out$Trials[out$gear == 3], 1:15) + expect_identical(out$Trials[out$gear == 4], 1:12) + out <- data_modify(x, Trials = 3:(n() + 2)) + expect_identical(out$Trials[out$gear == 3], 3:17) + expect_identical(out$Trials[out$gear == 4], 3:14) +}) + + test_that("data_modify .if/.at arguments", { data(iris) d <- iris[1:5, ] @@ -550,3 +574,31 @@ test_that("data_modify .if/.at arguments", { out <- data_modify(d, new_length = Petal.Length * 2, .if = is.numeric, .modify = round) expect_equal(out$new_length, c(3, 3, 3, 3, 3), ignore_attr = TRUE) }) + + +skip_if_not_installed("withr") + +withr::with_environment( + new.env(), + test_that("data_modify 1:n() and similar works in (grouped) data frames inside function calls", { + data(mtcars) + x <- data_group(mtcars, "gear") + + foo <- function(d) { + out <- data_modify(d, Trials = 1:n()) # nolint + out$Trials + } + expect_identical( + foo(x), + c( + 1L, 2L, 3L, 1L, 2L, 3L, 4L, 4L, 5L, 6L, 7L, 5L, 6L, 7L, 8L, + 9L, 10L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 11L, 1L, 2L, 3L, + 4L, 5L, 12L + ) + ) + }) +) + +test_that("data_modify errors on non-defined function", { + expect_error(data_modify(iris, Species = foo())) +}) From 2741cdc5c86dd2e1f47e45d5bf9db8d5c9db1c91 Mon Sep 17 00:00:00 2001 From: Daniel Date: Thu, 21 Nov 2024 19:55:19 +0100 Subject: [PATCH 19/19] Make `standardize()` error messages clearer (#562) * Warn user for invalid formula * add tests * fix tests --- DESCRIPTION | 2 +- R/standardize.models.R | 8 ++++++ tests/testthat/test-standardize_models.R | 32 ++++++++++++++++++++---- 3 files changed, 36 insertions(+), 6 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index ba821b0ba..2325c062d 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Type: Package Package: datawizard Title: Easy Data Wrangling and Statistical Transformations -Version: 0.13.0.12 +Version: 0.13.0.13 Authors@R: c( person("Indrajeet", "Patil", , "patilindrajeet.science@gmail.com", role = "aut", comment = c(ORCID = "0000-0003-1995-6531")), diff --git a/R/standardize.models.R b/R/standardize.models.R index a92ffe243..cf6062c78 100644 --- a/R/standardize.models.R +++ b/R/standardize.models.R @@ -78,6 +78,14 @@ standardize.default <- function(x, return(x) } + # check model formula. 
Some notations don't work when standardizing data + insight::formula_ok( + x, + action = "error", + prefix_msg = "Model cannot be standardized.", + verbose = verbose + ) + data_std <- NULL # needed to avoid note .standardize_models(x, robust = robust, two_sd = two_sd, diff --git a/tests/testthat/test-standardize_models.R b/tests/testthat/test-standardize_models.R index 706a4e6e7..d61caf450 100644 --- a/tests/testthat/test-standardize_models.R +++ b/tests/testthat/test-standardize_models.R @@ -31,6 +31,29 @@ test_that("standardize | errors", { }) +test_that("standardize | problematic formulas", { + data(mtcars) + m <- lm(mpg ~ hp, data = mtcars) + expect_equal( + coef(standardise(m)), + c(`(Intercept)` = -3.14935717633686e-17, hp = -0.776168371826586), + tolerance = 1e-4 + ) + + colnames(mtcars)[1] <- "1_mpg" + m <- lm(`1_mpg` ~ hp, data = mtcars) + expect_error(standardise(m), regex = "Looks like") + + # works interactive only + # data(mtcars) + # m <- lm(mtcars$mpg ~ mtcars$hp) + # expect_error(standardise(m), regex = "model formulas") + + m <- lm(mtcars[, 1] ~ hp, data = mtcars) + expect_error(standardise(m), regex = "indexed data") +}) + + # Transformations --------------------------------------------------------- test_that("transformations", { skip_if_not_installed("effectsize") @@ -206,15 +229,14 @@ test_that("standardize non-Gaussian response", { # variables evaluated in the environment $$$ ------------------------------ test_that("variables evaluated in the environment", { m <- lm(mtcars$mpg ~ mtcars$cyl + am, data = mtcars) - w <- capture_warnings(standardize(m)) - expect_true(any(grepl("mtcars$mpg", w, fixed = TRUE))) + w <- capture_error(standardize(m)) + expect_true(any(grepl("Using `$`", w, fixed = TRUE))) ## Note: # No idea why this is suddenly not giving a warning on older R versions. m <- lm(mtcars$mpg ~ mtcars$cyl + mtcars$am, data = mtcars) - warns <- capture_warnings(standardize(m)) - expect_true(any(grepl("mtcars$mpg", warns, fixed = TRUE))) - expect_true(any(grepl("No variables", warns, fixed = TRUE))) + w <- capture_error(standardize(m)) + expect_true(any(grepl("Using `$`", w, fixed = TRUE))) })
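A minimal usage sketch of the `n()` support introduced in PATCH 18/19, assuming a datawizard build that already contains these changes; the data, grouping variable, and expected index values are taken from the patch's own documentation and tests:

    library(datawizard)
    data(mtcars)

    # group the data, then build a running index within each group;
    # data_modify() resolves n() to the number of rows in each group
    grouped <- data_group(mtcars, "gear")
    indexed <- data_modify(grouped, id = 1:n())

    # matches the expectations in the new tests:
    indexed$id[indexed$gear == 3]  # 1:15
    indexed$id[indexed$gear == 4]  # 1:12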