Merge branch 'main' into docs_smoothness

easystats · Nov 24, 2024 · 1ea2c3d · 1ea2c3d
2 parents 442689d + 2741cdc
commit 1ea2c3d
Show file tree

Hide file tree

Showing 93 changed files with 1,505 additions and 703 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,22 +1,22 @@
 Type: Package
 Package: datawizard
 Title: Easy Data Wrangling and Statistical Transformations
-Version: 0.12.3.4
+Version: 0.13.0.13
 Authors@R: c(
     person("Indrajeet", "Patil", , "[email protected]", role = "aut",
-           comment = c(ORCID = "0000-0003-1995-6531", Twitter = "@patilindrajeets")),
+           comment = c(ORCID = "0000-0003-1995-6531")),
     person("Etienne", "Bacher", , "[email protected]", role = c("aut", "cre"),
            comment = c(ORCID = "0000-0002-9271-5075")),
     person("Dominique", "Makowski", , "[email protected]", role = "aut",
-           comment = c(ORCID = "0000-0001-5375-9967", Twitter = "@Dom_Makowski")),
+           comment = c(ORCID = "0000-0001-5375-9967")),
     person("Daniel", "Lüdecke", , "[email protected]", role = "aut",
-           comment = c(ORCID = "0000-0002-8895-3206", Twitter = "@strengejacke")),
+           comment = c(ORCID = "0000-0002-8895-3206")),
     person("Mattan S.", "Ben-Shachar", , "[email protected]", role = "aut",
            comment = c(ORCID = "0000-0002-4287-4801")),
     person("Brenton M.", "Wiernik", , "[email protected]", role = "aut",
-           comment = c(ORCID = "0000-0001-9560-6336", Twitter = "@bmwiernik")),
+           comment = c(ORCID = "0000-0001-9560-6336")),
     person("Rémi", "Thériault", , "[email protected]", role = "ctb",
-           comment = c(ORCID = "0000-0003-4315-6788", Twitter = "@rempsyc")),
+           comment = c(ORCID = "0000-0003-4315-6788")),
     person("Thomas J.", "Faulkenberry", , "[email protected]", role = "rev"),
     person("Robert", "Garrett", , "[email protected]", role = "rev")
   )
@@ -33,7 +33,7 @@ BugReports: https://github.com/easystats/datawizard/issues
 Depends:
     R (>= 3.6)
 Imports:
-    insight (>= 0.20.3),
+    insight (>= 0.20.5),
     stats,
     utils
 Suggests:

diff --git a/NAMESPACE b/NAMESPACE
@@ -220,7 +220,6 @@ export(assign_labels)
 export(categorize)
 export(center)
 export(centre)
-export(change_code)
 export(change_scale)
 export(coef_var)
 export(coerce_to_numeric)
@@ -237,7 +236,6 @@ export(data_codebook)
 export(data_duplicated)
 export(data_extract)
 export(data_filter)
-export(data_find)
 export(data_group)
 export(data_join)
 export(data_match)
@@ -276,8 +274,6 @@ export(empty_columns)
 export(empty_rows)
 export(extract_column_names)
 export(find_columns)
-export(format_text)
-export(get_columns)
 export(kurtosis)
 export(labels_to_levels)
 export(mean_sd)
@@ -300,7 +296,9 @@ export(reshape_longer)
 export(reshape_wider)
 export(reverse)
 export(reverse_scale)
+export(row_count)
 export(row_means)
+export(row_sums)
 export(row_to_colnames)
 export(rowid_as_column)
 export(rownames_as_column)

diff --git a/NEWS.md b/NEWS.md
@@ -2,15 +2,61 @@
 
 BREAKING CHANGES
 
+* Argument `drop_na` in `data_match()` is deprecated now. Please use `remove_na`
+  instead.
+
+CHANGES
+
+* The `select` argument, which is available in different functions to select
+  variables, can now also be a character vector with quoted variable names,
+  including a colon to indicate a range of several variables (e.g. `"cyl:gear"`).
+
+* New function `row_sums()`, to calculate row sums (optionally with minimum
+  amount of valid values), as complement to `row_means()`.
+
+* New function `row_count()`, to count specific values row-wise.
+
+* `data_read()` no longer shows warning about forthcoming breaking changes
+  in upstream packages when reading `.RData` files.
+
+* `data_modify()` now recognizes `n()`, for example to create an index for data groups
+  with `1:n()` (#535).
+
+BUG FIXES
+
+* `describe_distribution()` no longer errors if the sample was too sparse to compute
+  CIs. Instead, it warns the user and returns `NA` (#550).
+
+* `data_read()` preserves variable types when importing files from `rds` or
+  `rdata` format (#558).
+
+# datawizard 0.13.0
+
+BREAKING CHANGES
+
 * `data_rename()` now errors when the `replacement` argument contains `NA` values
   or empty strings (#539).
 
+* Removed deprecated functions `get_columns()`, `data_find()`, `format_text()` (#546).
+
+* Removed deprecated arguments `group` and `na.rm` in multiple functions. Use `by` and `remove_na` instead (#546).
+
+* The default value for the argument `dummy_factors` in `to_numeric()` has
+  changed from `TRUE` to `FALSE` (#544).
+
 CHANGES
 
 * The `pattern` argument in `data_rename()` can also be a named vector. In this
   case, names are used as values for the `replacement` argument (i.e. `pattern`
   can be a character vector using `<new name> = "<old name>"`).
 
+* `categorize()` gains a new `breaks` argument, to decide whether breaks are
+  inclusive or exclusive (#548).
+
+* The `labels` argument in `categorize()` gets two new options, `"range"` and
+  `"observed"`, to use the range of categorized values as labels (i.e. factor
+  levels) (#548).
+
 * Minor additions to `reshape_ci()` to work with forthcoming changes in the
   `{bayestestR}` package.
 

diff --git a/R/categorize.R b/R/categorize.R
@@ -31,10 +31,18 @@
 #'   for numeric variables, the minimum of the original input is preserved. For
 #'   factors, the default minimum is `1`. For `split = "equal_range"`, the
 #'   default minimum is always `1`, unless specified otherwise in `lowest`.
+#' @param breaks Character, indicating whether breaks for categorizing data are
+#'   `"inclusive"` (values indicate the _upper_ bound of the _previous_ group or
+#'   interval) or `"exclusive"` (values indicate the _lower_ bound of the _next_
+#'   group or interval to begin). Use `labels = "range"` to make this behaviour
+#'   easier to see.
 #' @param labels Character vector of value labels. If not `NULL`, `categorize()`
 #'   will return factors instead of numeric variables, with `labels` used
-#'   for labelling the factor levels. Can also be `"mean"` or `"median"` for a
-#'   factor with labels as the mean/median of each groups.
+#'   for labelling the factor levels. Can also be `"mean"`, `"median"`,
+#'   `"range"` or `"observed"` for a factor with labels as the mean/median,
+#'   the requested range (even if not all values of that range are present in
+#'   the data) or observed range (range of the actual recoded values) of each
+#'   group. See 'Examples'.
 #' @param append Logical or string. If `TRUE`, recoded or converted variables
 #'   get new column names and are appended (column bind) to `x`, thus returning
 #'   both the original and the recoded variables. The new columns get a suffix,
@@ -53,7 +61,7 @@
 #'
 #' # Splits and breaks (cut-off values)
 #'
-#' Breaks are in general _exclusive_, this means that these values indicate
+#' Breaks are by default _exclusive_, this means that these values indicate
 #' the lower bound of the next group or interval to begin. Take a simple
 #' example, a numeric variable with values from 1 to 9. The median would be 5,
 #' thus the first interval ranges from 1-4 and is recoded into 1, while 5-9
@@ -63,6 +71,9 @@
 #' from 1 to 3 belong to the first interval and are recoded into 1 (because
 #' the next interval starts at 3.67), 4 to 6 into 2 and 7 to 9 into 3.
 #'
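+#' The opposite behaviour can be achieved using `breaks = "inclusive"`, in which
+#' case the break values indicate the _upper_ bound of the _previous_ group or
+#' interval. In the example above with values from 1 to 9 and a median split,
+#' the first interval would then range from 1-5 and the second from 6-9.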
+#'
 #' # Recoding into groups with equal size or range
 #'
 #' `split = "equal_length"` and `split = "equal_range"` try to divide the
@@ -119,6 +130,13 @@
 #' x <- sample(1:10, size = 30, replace = TRUE)
 #' categorize(x, "equal_length", n_groups = 3, labels = "mean")
 #' categorize(x, "equal_length", n_groups = 3, labels = "median")
+#'
+#' # cut numeric into groups with the requested range as a label name
+#' # each category has the same range, and labels indicate this range
+#' categorize(mtcars$mpg, "equal_length", n_groups = 5, labels = "range")
+#' # in this example, each category has the same range, but labels only refer
+#' # to the ranges of the actual values (present in the data) inside each group
+#' categorize(mtcars$mpg, "equal_length", n_groups = 5, labels = "observed")
 #' @export
 categorize <- function(x, ...) {
   UseMethod("categorize")
@@ -142,6 +160,7 @@ categorize.numeric <- function(x,
                                n_groups = NULL,
                                range = NULL,
                                lowest = 1,
+                               breaks = "exclusive",
                                labels = NULL,
                                verbose = TRUE,
                                ...) {
@@ -152,6 +171,9 @@ categorize.numeric <- function(x,
   if (identical(split, "equal_length")) split <- "length"
   if (identical(split, "equal_range")) split <- "range"
 
+  # check for valid values
+  breaks <- match.arg(breaks, c("exclusive", "inclusive"))
+
   # save
   original_x <- x
 
@@ -169,9 +191,9 @@ categorize.numeric <- function(x,
   }
 
   if (is.numeric(split)) {
-    breaks <- split
+    category_splits <- split
   } else {
-    breaks <- switch(split,
+    category_splits <- switch(split,
       median = stats::median(x),
       mean = mean(x),
       length = n_groups,
@@ -182,15 +204,18 @@ categorize.numeric <- function(x,
   }
 
   # complete ranges, including minimum and maximum
-  if (!identical(split, "length")) breaks <- unique(c(min(x), breaks, max(x)))
+  if (!identical(split, "length")) {
+    category_splits <- unique(c(min(x), category_splits, max(x)))
+  }
 
   # recode into groups
   out <- droplevels(cut(
     x,
-    breaks = breaks,
+    breaks = category_splits,
     include.lowest = TRUE,
-    right = FALSE
+    right = identical(breaks, "inclusive")
   ))
+  cut_result <- out
   levels(out) <- 1:nlevels(out)
 
   # fix lowest value, add back into original vector
@@ -201,7 +226,7 @@ categorize.numeric <- function(x,
   original_x[!is.na(original_x)] <- out
 
   # turn into factor?
-  .original_x_to_factor(original_x, x, labels, out, verbose, ...)
+  .original_x_to_factor(original_x, x, cut_result, labels, out, verbose, ...)
 }
 
 
@@ -223,6 +248,7 @@ categorize.data.frame <- function(x,
                                   n_groups = NULL,
                                   range = NULL,
                                   lowest = 1,
+                                  breaks = "exclusive",
                                   labels = NULL,
                                   append = FALSE,
                                   ignore_case = FALSE,
@@ -260,6 +286,7 @@ categorize.data.frame <- function(x,
     n_groups = n_groups,
     range = range,
     lowest = lowest,
+    breaks = breaks,
     labels = labels,
     verbose = verbose,
     ...
@@ -276,6 +303,7 @@ categorize.grouped_df <- function(x,
                                   n_groups = NULL,
                                   range = NULL,
                                   lowest = 1,
+                                  breaks = "exclusive",
                                   labels = NULL,
                                   append = FALSE,
                                   ignore_case = FALSE,
@@ -319,6 +347,7 @@ categorize.grouped_df <- function(x,
       n_groups = n_groups,
       range = range,
       lowest = lowest,
+      breaks = breaks,
       labels = labels,
       select = select,
       exclude = exclude,
@@ -375,20 +404,26 @@ categorize.grouped_df <- function(x,
 }
 
 
-.original_x_to_factor <- function(original_x, x, labels, out, verbose, ...) {
+.original_x_to_factor <- function(original_x, x, cut_result, labels, out, verbose, ...) {
   if (!is.null(labels)) {
     if (length(labels) == length(unique(out))) {
       original_x <- as.factor(original_x)
       levels(original_x) <- labels
-    } else if (length(labels) == 1 && labels %in% c("mean", "median")) {
+    } else if (length(labels) == 1 && labels %in% c("mean", "median", "range", "observed")) {
       original_x <- as.factor(original_x)
       no_na_x <- original_x[!is.na(original_x)]
-      if (labels == "mean") {
-        labels <- stats::aggregate(x, list(no_na_x), FUN = mean, na.rm = TRUE)$x
-      } else {
-        labels <- stats::aggregate(x, list(no_na_x), FUN = stats::median, na.rm = TRUE)$x
-      }
-      levels(original_x) <- insight::format_value(labels, ...)
+      out <- switch(labels,
+        mean = stats::aggregate(x, list(no_na_x), FUN = mean, na.rm = TRUE)$x,
+        median = stats::aggregate(x, list(no_na_x), FUN = stats::median, na.rm = TRUE)$x,
+        # labels basically like what "cut()" returns
+        range = levels(cut_result),
+        # range based on the values that are actually present in the data
+        {
+          temp <- stats::aggregate(x, list(no_na_x), FUN = range, na.rm = TRUE)$x
+          apply(temp, 1, function(i) paste0("(", paste(as.vector(i), collapse = "-"), ")"))
+        }
+      )
+      levels(original_x) <- insight::format_value(out, ...)
     } else if (isTRUE(verbose)) {
       insight::format_warning(
         "Argument `labels` and levels of the recoded variable are not of the same length.",

diff --git a/R/data_codebook.R b/R/data_codebook.R
@@ -33,7 +33,8 @@
 #'
 #' @note There are methods to `print()` the data frame in a nicer output, as
 #' well as methods for printing in markdown or HTML format (`print_md()` and
-#' `print_html()`).
+#' `print_html()`). The `print()` method for text outputs passes arguments in
+#' `...` to [`insight::export_table()`].
 #'
 #' @examples
 #' data(iris)
@@ -369,7 +370,8 @@ print.data_codebook <- function(x, ...) {
       title = caption,
       empty_line = "-",
       cross = "+",
-      align = .get_codebook_align(x)
+      align = .get_codebook_align(x),
+      ...
     )
   )
 }

diff --git a/R/data_group.R b/R/data_group.R
@@ -51,7 +51,7 @@ data_group <- function(data,
       to = my_grid[i, , drop = FALSE],
       match = "and",
       return_indices = TRUE,
-      drop_na = FALSE
+      remove_na = FALSE
     ))
   })
   my_grid[[".rows"]] <- .rows

diff --git a/R/data_match.R b/R/data_match.R
@@ -15,7 +15,7 @@
 #' @param return_indices Logical, if `TRUE`, return the vector of rows that
 #'   can be used to filter the original data frame. If `FALSE` (default),
 #'   returns directly the filtered data frame instead of the row indices.
-#' @param drop_na Logical, if `TRUE`, missing values (`NA`s) are removed before
+#' @param remove_na Logical, if `TRUE`, missing values (`NA`s) are removed before
 #'   filtering the data. This is the default behaviour, however, sometimes when
 #'   row indices are requested (i.e. `return_indices=TRUE`), it might be useful
 #'   to preserve `NA` values, so returned row indices match the row indices of
@@ -26,6 +26,7 @@
 #'   character vector (e.g. `c("x > 4", "y == 2")`) or a variable that contains
 #'   the string representation of a logical expression. These might be useful
 #'   when used in packages to avoid defining undefined global variables.
+#' @param drop_na Deprecated, please use `remove_na` instead.
 #'
 #' @return A filtered data frame, or the row indices that match the specified
 #' configuration.
@@ -100,12 +101,24 @@
 #' data_filter(mtcars, fl)
 #' @inherit data_rename seealso
 #' @export
-data_match <- function(x, to, match = "and", return_indices = FALSE, drop_na = TRUE, ...) {
+data_match <- function(x,
+                       to,
+                       match = "and",
+                       return_indices = FALSE,
+                       remove_na = TRUE,
+                       drop_na,
+                       ...) {
   if (!is.data.frame(to)) {
     to <- as.data.frame(to)
   }
   original_x <- x
 
+  ## TODO: remove deprecated argument later
+  if (!missing(drop_na)) {
+    insight::format_warning("Argument `drop_na` is deprecated. Please use `remove_na` instead.")
+    remove_na <- drop_na
+  }
+
   # evaluate
   match <- match.arg(tolower(match), c("and", "&", "&&", "or", "|", "||", "!", "not"))
   match <- switch(match,
@@ -133,7 +146,7 @@ data_match <- function(x, to, match = "and", return_indices = FALSE, drop_na = T
     idx <- vector("numeric", length = 0L)
   } else {
     # remove missings before matching
-    if (isTRUE(drop_na)) {
+    if (isTRUE(remove_na)) {
       x <- x[stats::complete.cases(x), , drop = FALSE]
     }
     idx <- seq_len(nrow(x))