diff --git a/.Rbuildignore b/.Rbuildignore index d245bf8c3..65a0a7f67 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -48,3 +48,5 @@ references.bib ^CRAN-SUBMISSION$ docs ^.dev$ +^vignettes/s. +^vignettes/t. diff --git a/DESCRIPTION b/DESCRIPTION index 08e9c4c08..807894879 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,30 +1,30 @@ Type: Package Package: datawizard Title: Easy Data Wrangling and Statistical Transformations -Version: 0.11.0.2 +Version: 0.13.0.13 Authors@R: c( person("Indrajeet", "Patil", , "patilindrajeet.science@gmail.com", role = "aut", - comment = c(ORCID = "0000-0003-1995-6531", Twitter = "@patilindrajeets")), + comment = c(ORCID = "0000-0003-1995-6531")), person("Etienne", "Bacher", , "etienne.bacher@protonmail.com", role = c("aut", "cre"), comment = c(ORCID = "0000-0002-9271-5075")), person("Dominique", "Makowski", , "dom.makowski@gmail.com", role = "aut", - comment = c(ORCID = "0000-0001-5375-9967", Twitter = "@Dom_Makowski")), + comment = c(ORCID = "0000-0001-5375-9967")), person("Daniel", "Lüdecke", , "d.luedecke@uke.de", role = "aut", - comment = c(ORCID = "0000-0002-8895-3206", Twitter = "@strengejacke")), + comment = c(ORCID = "0000-0002-8895-3206")), person("Mattan S.", "Ben-Shachar", , "matanshm@post.bgu.ac.il", role = "aut", comment = c(ORCID = "0000-0002-4287-4801")), person("Brenton M.", "Wiernik", , "brenton@wiernik.org", role = "aut", - comment = c(ORCID = "0000-0001-9560-6336", Twitter = "@bmwiernik")), + comment = c(ORCID = "0000-0001-9560-6336")), person("Rémi", "Thériault", , "remi.theriault@mail.mcgill.ca", role = "ctb", - comment = c(ORCID = "0000-0003-4315-6788", Twitter = "@rempsyc")), + comment = c(ORCID = "0000-0003-4315-6788")), person("Thomas J.", "Faulkenberry", , "faulkenberry@tarleton.edu", role = "rev"), person("Robert", "Garrett", , "rcg4@illinois.edu", role = "rev") ) Maintainer: Etienne Bacher -Description: A lightweight package to assist in key steps involved in any data - analysis workflow: (1) wrangling 
the raw data to get it in the needed form, - (2) applying preprocessing steps and statistical transformations, and - (3) compute statistical summaries of data properties and distributions. +Description: A lightweight package to assist in key steps involved in any data + analysis workflow: (1) wrangling the raw data to get it in the needed form, + (2) applying preprocessing steps and statistical transformations, and + (3) compute statistical summaries of data properties and distributions. It is also the data wrangling backend for packages in 'easystats' ecosystem. References: Patil et al. (2022) . License: MIT + file LICENSE @@ -33,10 +33,10 @@ BugReports: https://github.com/easystats/datawizard/issues Depends: R (>= 4.0) Imports: - insight (>= 0.20.0), + insight (>= 0.20.5), stats, utils -Suggests: +Suggests: bayestestR, boot, brms, @@ -49,7 +49,6 @@ Suggests: ggplot2 (>= 3.5.0), gt, haven, - htmltools, httr, knitr, lme4, @@ -68,12 +67,13 @@ Suggests: tibble, tidyr, withr -VignetteBuilder: +VignetteBuilder: knitr Encoding: UTF-8 Language: en-US Roxygen: list(markdown = TRUE) -RoxygenNote: 7.3.1 +RoxygenNote: 7.3.2 Config/testthat/edition: 3 Config/testthat/parallel: true Config/Needs/website: easystats/easystatstemplate +Remotes: easystats/insight diff --git a/NAMESPACE b/NAMESPACE index 5926d19ab..7e97817b9 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,5 +1,7 @@ # Generated by roxygen2: do not edit by hand +S3method(as.data.frame,datawizard_crosstabs) +S3method(as.data.frame,datawizard_tables) S3method(as.double,parameters_kurtosis) S3method(as.double,parameters_skewness) S3method(as.double,parameters_smoothness) @@ -69,9 +71,9 @@ S3method(describe_distribution,grouped_df) S3method(describe_distribution,list) S3method(describe_distribution,numeric) S3method(format,data_codebook) +S3method(format,datawizard_crosstab) +S3method(format,datawizard_table) S3method(format,dw_data_peek) -S3method(format,dw_data_tabulate) -S3method(format,dw_data_xtabulate) 
S3method(format,dw_groupmeans) S3method(format,parameters_distribution) S3method(kurtosis,data.frame) @@ -93,12 +95,12 @@ S3method(normalize,numeric) S3method(plot,visualisation_recipe) S3method(print,data_codebook) S3method(print,data_seek) +S3method(print,datawizard_crosstab) +S3method(print,datawizard_crosstabs) +S3method(print,datawizard_table) +S3method(print,datawizard_tables) S3method(print,dw_data_peek) S3method(print,dw_data_summary) -S3method(print,dw_data_tabulate) -S3method(print,dw_data_tabulates) -S3method(print,dw_data_xtabulate) -S3method(print,dw_data_xtabulates) S3method(print,dw_groupmeans) S3method(print,dw_groupmeans_list) S3method(print,dw_transformer) @@ -107,16 +109,16 @@ S3method(print,parameters_kurtosis) S3method(print,parameters_skewness) S3method(print,visualisation_recipe) S3method(print_html,data_codebook) +S3method(print_html,datawizard_crosstab) +S3method(print_html,datawizard_crosstabs) +S3method(print_html,datawizard_table) +S3method(print_html,datawizard_tables) S3method(print_html,dw_data_peek) -S3method(print_html,dw_data_tabulate) -S3method(print_html,dw_data_tabulates) -S3method(print_html,dw_data_xtabulate) -S3method(print_html,dw_data_xtabulates) S3method(print_md,data_codebook) +S3method(print_md,datawizard_crosstab) +S3method(print_md,datawizard_table) +S3method(print_md,datawizard_tables) S3method(print_md,dw_data_peek) -S3method(print_md,dw_data_tabulate) -S3method(print_md,dw_data_tabulates) -S3method(print_md,dw_data_xtabulate) S3method(ranktransform,data.frame) S3method(ranktransform,factor) S3method(ranktransform,grouped_df) @@ -218,7 +220,6 @@ export(assign_labels) export(categorize) export(center) export(centre) -export(change_code) export(change_scale) export(coef_var) export(coerce_to_numeric) @@ -235,7 +236,6 @@ export(data_codebook) export(data_duplicated) export(data_extract) export(data_filter) -export(data_find) export(data_group) export(data_join) export(data_match) @@ -274,8 +274,6 @@ 
export(empty_columns) export(empty_rows) export(extract_column_names) export(find_columns) -export(format_text) -export(get_columns) export(kurtosis) export(labels_to_levels) export(mean_sd) @@ -298,7 +296,9 @@ export(reshape_longer) export(reshape_wider) export(reverse) export(reverse_scale) +export(row_count) export(row_means) +export(row_sums) export(row_to_colnames) export(rowid_as_column) export(rownames_as_column) diff --git a/NEWS.md b/NEWS.md index 1e49e91a4..663efa310 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,10 +1,115 @@ -# datawizard 0.11.0.1 +# datawizard (development) -## Changes +BREAKING CHANGES + +* Argument `drop_na` in `data_match()` is deprecated now. Please use `remove_na` + instead. + +CHANGES + +* The `select` argument, which is available in different functions to select + variables, can now also be a character vector with quoted variable names, + including a colon to indicate a range of several variables (e.g. `"cyl:gear"`). + +* New function `row_sums()`, to calculate row sums (optionally with minimum + amount of valid values), as complement to `row_means()`. + +* New function `row_count()`, to count specific values row-wise. + +* `data_read()` no longer shows warning about forthcoming breaking changes + in upstream packages when reading `.RData` files. + +* `data_modify()` now recognizes `n()`, for example to create an index for data groups + with `1:n()` (#535). + +BUG FIXES + +* `describe_distribution()` no longer errors if the sample was too sparse to compute + CIs. Instead, it warns the user and returns `NA` (#550). + +* `data_read()` preserves variable types when importing files from `rds` or + `rdata` format (#558). + +# datawizard 0.13.0 + +BREAKING CHANGES + +* `data_rename()` now errors when the `replacement` argument contains `NA` values + or empty strings (#539). + +* Removed deprecated functions `get_columns()`, `data_find()`, `format_text()` (#546). + +* Removed deprecated arguments `group` and `na.rm` in multiple functions. 
Use `by` and `remove_na` instead (#546). + +* The default value for the argument `dummy_factors` in `to_numeric()` has + changed from `TRUE` to `FALSE` (#544). + +CHANGES + +* The `pattern` argument in `data_rename()` can also be a named vector. In this + case, names are used as values for the `replacement` argument (i.e. `pattern` + can be a character vector using ` = ""`). + +* `categorize()` gains a new `breaks` argument, to decide whether breaks are + inclusive or exclusive (#548). + +* The `labels` argument in `categorize()` gets two new options, `"range"` and + `"observed"`, to use the range of categorized values as labels (i.e. factor + levels) (#548). + +* Minor additions to `reshape_ci()` to work with forthcoming changes in the + `{bayestestR}` package. + +# datawizard 0.12.3 + +CHANGES + +* `demean()` (and `degroup()`) now also work for nested designs, if argument + `nested = TRUE` and `by` specifies more than one variable (#533). + +* Vignettes are no longer provided in the package, they are now only available + on the website. There is only one "Overview" vignette available in the package, + it contains links to the other vignettes on the website. This is because there + are CRAN errors occurring when building vignettes on macOS and we couldn't + determine the cause after multiple patch releases (#534). + +# datawizard 0.12.2 + +* Remove `htmltools` from `Suggests` in an attempt of fixing an error in CRAN + checks due to failures to build a vignette (#528). + +# datawizard 0.12.1 + +This is a patch release to fix one error on CRAN checks occurring because of a +missing package namespace in one of the vignettes. + +# datawizard 0.12.0 + +BREAKING CHANGES + +* The argument `include_na` in `data_tabulate()` and `data_summary()` has been + renamed into `remove_na`. Consequently, to mimic former behaviour, `FALSE` and + `TRUE` need to be switched (i.e. `remove_na = TRUE` is equivalent to the former + `include_na = FALSE`). 
+ +* Class names for objects returned by `data_tabulate()` have been changed to + `datawizard_table` and `datawizard_crosstable` (resp. the plural forms, + `*_tables`), to provide a clearer and more consistent naming scheme. + +CHANGES * `data_select()` can directly rename selected variables when a named vector is provided in `select`, e.g. `data_select(mtcars, c(new1 = "mpg", new2 = "cyl"))`. +* `data_tabulate()` gains an `as.data.frame()` method, to return the frequency + table as a data frame. The structure of the returned object is a nested data + frame, where the first column contains name of the variable for which + frequencies were calculated, and the second column contains the frequency table. + +* `demean()` (and `degroup()`) now also work for cross-classified designs, or + more generally, for data with multiple grouping or cluster variables (i.e. + `by` can now specify more than one variable). + # datawizard 0.11.0 BREAKING CHANGES @@ -43,8 +148,8 @@ BREAKING CHANGES * The following arguments were deprecated in 0.5.0 and are now removed: - * in `data_to_wide()`: `colnames_from`, `rows_from`, `sep` - * in `data_to_long()`: `colnames_to` + * in `data_to_wide()`: `colnames_from`, `rows_from`, `sep` + * in `data_to_long()`: `colnames_to` * in `data_partition()`: `training_proportion` NEW FUNCTIONS @@ -63,7 +168,7 @@ CHANGES argument, to compute weighted frequency tables. `include_na` allows to include or omit missing values from the table. Furthermore, a `by` argument was added, to compute crosstables (#479, #481). - + # datawizard 0.9.1 CHANGES @@ -114,7 +219,7 @@ CHANGES * `unnormalize()` and `unstandardize()` now work with grouped data (#415). -* `unnormalize()` now errors instead of emitting a warning if it doesn't have the +* `unnormalize()` now errors instead of emitting a warning if it doesn't have the necessary info (#415). BUG FIXES @@ -137,7 +242,7 @@ BUG FIXES * Fixed issue in `data_filter()` where functions containing a `=` (e.g. 
when naming arguments, like `grepl(pattern, x = a)`) were mistakenly seen as - faulty syntax. + faulty syntax. * Fixed issue in `empty_column()` for strings with invalid multibyte strings. For such data frames or files, `empty_column()` or `data_read()` no longer @@ -174,14 +279,14 @@ CHANGES NEW FUNCTIONS -* `rowid_as_column()` to complement `rownames_as_column()` (and to mimic - `tibble::rowid_to_column()`). Note that its behavior is different from +* `rowid_as_column()` to complement `rownames_as_column()` (and to mimic + `tibble::rowid_to_column()`). Note that its behavior is different from `tibble::rowid_to_column()` for grouped data. See the Details section in the docs. * `data_unite()`, to merge values of multiple variables into one new variable. -* `data_separate()`, as counterpart to `data_unite()`, to separate a single +* `data_separate()`, as counterpart to `data_unite()`, to separate a single variable into multiple new variables. * `data_modify()`, to create new variables, or modify or remove existing @@ -204,7 +309,7 @@ BUG FIXES * `center()` and `standardize()` did not work for grouped data frames (of class `grouped_df`) when `force = TRUE`. - + * The `data.frame` method of `describe_distribution()` returns `NULL` instead of an error if no valid variable were passed (for example a factor variable with `include_factors = FALSE`) (#421). @@ -232,12 +337,12 @@ BUG FIXES # datawizard 0.7.0 -BREAKING CHANGES +BREAKING CHANGES * In selection patterns, expressions like `-var1:var3` to exclude all variables between `var1` and `var3` are no longer accepted. The correct expression is `-(var1:var3)`. This is for 2 reasons: - + * to be consistent with the behavior for numerics (`-1:2` is not accepted but `-(1:2)` is); * to be consistent with `dplyr::select()`, which throws a warning and only @@ -249,8 +354,8 @@ NEW FUNCTIONS or more variables into a new variable. 
* `mean_sd()` and `median_mad()` for summarizing vectors to their mean (or - median) and a range of one SD (or MAD) above and below. - + median) and a range of one SD (or MAD) above and below. + * `data_write()` as counterpart to `data_read()`, to write data frames into CSV, SPSS, SAS, Stata files and many other file types. One advantage over existing functions to write data in other packages is that labelled (numeric) @@ -266,8 +371,8 @@ MINOR CHANGES * `data_rename()` gets a `verbose` argument. * `winsorize()` now errors if the threshold is incorrect (previously, it provided - a warning and returned the unchanged data). The argument `verbose` is now - useless but is kept for backward compatibility. The documentation now contains + a warning and returned the unchanged data). The argument `verbose` is now + useless but is kept for backward compatibility. The documentation now contains details about the valid values for `threshold` (#357). * In all functions that have arguments `select` and/or `exclude`, there is now one warning per misspelled variable. The previous behavior was to have only one @@ -288,7 +393,7 @@ BUG FIXES * Fix unexpected warning in `convert_na_to()` when `select` is a list (#352). * Fixed issue with correct labelling of numeric variables with more than nine unique values and associated value labels. - + # datawizard 0.6.5 @@ -320,7 +425,7 @@ NEW FUNCTIONS * `data_codebook()`: to generate codebooks of data frames. * New functions to deal with duplicates: `data_duplicated()` (keep all duplicates, - including the first occurrence) and `data_unique()` (returns the data, excluding + including the first occurrence) and `data_unique()` (returns the data, excluding all duplicates except one instance of each, based on the selected method). MINOR CHANGES @@ -330,15 +435,15 @@ MINOR CHANGES * The `include_bounds` argument in `normalize()` can now also be a numeric value, defining the limit to the upper and lower bound (i.e. the distance to 1 and 0). 
- -* `data_filter()` now works with grouped data. + +* `data_filter()` now works with grouped data. BUG FIXES * `data_read()` no longer prints message for empty columns when the data actually had no empty columns. - - * `data_to_wide()` now drops columns that are not in `id_cols` (if specified), + + * `data_to_wide()` now drops columns that are not in `id_cols` (if specified), `names_from`, or `values_from`. This is the behaviour observed in `tidyr::pivot_wider()`. # datawizard 0.6.3 @@ -770,4 +875,3 @@ NEW FUNCTIONS # datawizard 0.1.0 * First release. - diff --git a/R/categorize.R b/R/categorize.R index a6562ab68..9f8dd7505 100644 --- a/R/categorize.R +++ b/R/categorize.R @@ -31,10 +31,18 @@ #' for numeric variables, the minimum of the original input is preserved. For #' factors, the default minimum is `1`. For `split = "equal_range"`, the #' default minimum is always `1`, unless specified otherwise in `lowest`. +#' @param breaks Character, indicating whether breaks for categorizing data are +#' `"inclusive"` (values indicate the _upper_ bound of the _previous_ group or +#' interval) or `"exclusive"` (values indicate the _lower_ bound of the _next_ +#' group or interval to begin). Use `labels = "range"` to make this behaviour +#' easier to see. #' @param labels Character vector of value labels. If not `NULL`, `categorize()` #' will returns factors instead of numeric variables, with `labels` used -#' for labelling the factor levels. Can also be `"mean"` or `"median"` for a -#' factor with labels as the mean/median of each groups. +#' for labelling the factor levels. Can also be `"mean"`, `"median"`, +#' `"range"` or `"observed"` for a factor with labels as the mean/median, +#' the requested range (even if not all values of that range are present in +#' the data) or observed range (range of the actual recoded values) of each +#' group. See 'Examples'. #' @param append Logical or string. 
If `TRUE`, recoded or converted variables #' get new column names and are appended (column bind) to `x`, thus returning #' both the original and the recoded variables. The new columns get a suffix, @@ -53,7 +61,7 @@ #' #' # Splits and breaks (cut-off values) #' -#' Breaks are in general _exclusive_, this means that these values indicate +#' Breaks are by default _exclusive_, this means that these values indicate #' the lower bound of the next group or interval to begin. Take a simple #' example, a numeric variable with values from 1 to 9. The median would be 5, #' thus the first interval ranges from 1-4 and is recoded into 1, while 5-9 @@ -63,6 +71,9 @@ #' from 1 to 3 belong to the first interval and are recoded into 1 (because #' the next interval starts at 3.67), 4 to 6 into 2 and 7 to 9 into 3. #' +#' The opposite behaviour can be achieved using `breaks = "inclusive"`, in which +#' case +#' #' # Recoding into groups with equal size or range #' #' `split = "equal_length"` and `split = "equal_range"` try to divide the @@ -119,6 +130,13 @@ #' x <- sample(1:10, size = 30, replace = TRUE) #' categorize(x, "equal_length", n_groups = 3, labels = "mean") #' categorize(x, "equal_length", n_groups = 3, labels = "median") +#' +#' # cut numeric into groups with the requested range as a label name +#' # each category has the same range, and labels indicate this range +#' categorize(mtcars$mpg, "equal_length", n_groups = 5, labels = "range") +#' # in this example, each category has the same range, but labels only refer +#' # to the ranges of the actual values (present in the data) inside each group +#' categorize(mtcars$mpg, "equal_length", n_groups = 5, labels = "observed") #' @export categorize <- function(x, ...) { UseMethod("categorize") @@ -142,6 +160,7 @@ categorize.numeric <- function(x, n_groups = NULL, range = NULL, lowest = 1, + breaks = "exclusive", labels = NULL, verbose = TRUE, ...) 
{ @@ -152,6 +171,9 @@ categorize.numeric <- function(x, if (identical(split, "equal_length")) split <- "length" if (identical(split, "equal_range")) split <- "range" + # check for valid values + breaks <- match.arg(breaks, c("exclusive", "inclusive")) + # save original_x <- x @@ -169,9 +191,9 @@ categorize.numeric <- function(x, } if (is.numeric(split)) { - breaks <- split + category_splits <- split } else { - breaks <- switch(split, + category_splits <- switch(split, median = stats::median(x), mean = mean(x), length = n_groups, @@ -182,15 +204,18 @@ categorize.numeric <- function(x, } # complete ranges, including minimum and maximum - if (!identical(split, "length")) breaks <- unique(c(min(x), breaks, max(x))) + if (!identical(split, "length")) { + category_splits <- unique(c(min(x), category_splits, max(x))) + } # recode into groups out <- droplevels(cut( x, - breaks = breaks, + breaks = category_splits, include.lowest = TRUE, - right = FALSE + right = identical(breaks, "inclusive") )) + cut_result <- out levels(out) <- 1:nlevels(out) # fix lowest value, add back into original vector @@ -201,7 +226,7 @@ categorize.numeric <- function(x, original_x[!is.na(original_x)] <- out # turn into factor? - .original_x_to_factor(original_x, x, labels, out, verbose, ...) + .original_x_to_factor(original_x, x, cut_result, labels, out, verbose, ...) } @@ -223,6 +248,7 @@ categorize.data.frame <- function(x, n_groups = NULL, range = NULL, lowest = 1, + breaks = "exclusive", labels = NULL, append = FALSE, ignore_case = FALSE, @@ -260,6 +286,7 @@ categorize.data.frame <- function(x, n_groups = n_groups, range = range, lowest = lowest, + breaks = breaks, labels = labels, verbose = verbose, ... 
@@ -276,6 +303,7 @@ categorize.grouped_df <- function(x, n_groups = NULL, range = NULL, lowest = 1, + breaks = "exclusive", labels = NULL, append = FALSE, ignore_case = FALSE, @@ -319,6 +347,7 @@ categorize.grouped_df <- function(x, n_groups = n_groups, range = range, lowest = lowest, + breaks = breaks, labels = labels, select = select, exclude = exclude, @@ -375,20 +404,26 @@ categorize.grouped_df <- function(x, } -.original_x_to_factor <- function(original_x, x, labels, out, verbose, ...) { +.original_x_to_factor <- function(original_x, x, cut_result, labels, out, verbose, ...) { if (!is.null(labels)) { if (length(labels) == length(unique(out))) { original_x <- as.factor(original_x) levels(original_x) <- labels - } else if (length(labels) == 1 && labels %in% c("mean", "median")) { + } else if (length(labels) == 1 && labels %in% c("mean", "median", "range", "observed")) { original_x <- as.factor(original_x) no_na_x <- original_x[!is.na(original_x)] - if (labels == "mean") { - labels <- stats::aggregate(x, list(no_na_x), FUN = mean, na.rm = TRUE)$x - } else { - labels <- stats::aggregate(x, list(no_na_x), FUN = stats::median, na.rm = TRUE)$x - } - levels(original_x) <- insight::format_value(labels, ...) + out <- switch(labels, + mean = stats::aggregate(x, list(no_na_x), FUN = mean, na.rm = TRUE)$x, + median = stats::aggregate(x, list(no_na_x), FUN = stats::median, na.rm = TRUE)$x, + # labels basically like what "cut()" returns + range = levels(cut_result), + # range based on the values that are actually present in the data + { + temp <- stats::aggregate(x, list(no_na_x), FUN = range, na.rm = TRUE)$x + apply(temp, 1, function(i) paste0("(", paste(as.vector(i), collapse = "-"), ")")) + } + ) + levels(original_x) <- insight::format_value(out, ...) 
} else if (isTRUE(verbose)) { insight::format_warning( "Argument `labels` and levels of the recoded variable are not of the same length.", diff --git a/R/data_codebook.R b/R/data_codebook.R index 71e2e5828..5dffbf9c2 100644 --- a/R/data_codebook.R +++ b/R/data_codebook.R @@ -33,7 +33,8 @@ #' #' @note There are methods to `print()` the data frame in a nicer output, as #' well methods for printing in markdown or HTML format (`print_md()` and -#' `print_html()`). +#' `print_html()`). The `print()` method for text outputs passes arguments in +#' `...` to [`insight::export_table()`]. #' #' @examples #' data(iris) @@ -369,7 +370,8 @@ print.data_codebook <- function(x, ...) { title = caption, empty_line = "-", cross = "+", - align = .get_codebook_align(x) + align = .get_codebook_align(x), + ... ) ) } diff --git a/R/data_group.R b/R/data_group.R index 00a7adf84..538c875c2 100644 --- a/R/data_group.R +++ b/R/data_group.R @@ -51,7 +51,7 @@ data_group <- function(data, to = my_grid[i, , drop = FALSE], match = "and", return_indices = TRUE, - drop_na = FALSE + remove_na = FALSE )) }) my_grid[[".rows"]] <- .rows diff --git a/R/data_match.R b/R/data_match.R index c03b3f222..6b522a0b8 100644 --- a/R/data_match.R +++ b/R/data_match.R @@ -15,7 +15,7 @@ #' @param return_indices Logical, if `FALSE`, return the vector of rows that #' can be used to filter the original data frame. If `FALSE` (default), #' returns directly the filtered data frame instead of the row indices. -#' @param drop_na Logical, if `TRUE`, missing values (`NA`s) are removed before +#' @param remove_na Logical, if `TRUE`, missing values (`NA`s) are removed before #' filtering the data. This is the default behaviour, however, sometimes when #' row indices are requested (i.e. `return_indices=TRUE`), it might be useful #' to preserve `NA` values, so returned row indices match the row indices of @@ -26,6 +26,7 @@ #' character vector (e.g. 
`c("x > 4", "y == 2")`) or a variable that contains #' the string representation of a logical expression. These might be useful #' when used in packages to avoid defining undefined global variables. +#' @param drop_na Deprecated, please use `remove_na` instead. #' #' @return A filtered data frame, or the row indices that match the specified #' configuration. @@ -100,12 +101,24 @@ #' data_filter(mtcars, fl) #' @inherit data_rename seealso #' @export -data_match <- function(x, to, match = "and", return_indices = FALSE, drop_na = TRUE, ...) { +data_match <- function(x, + to, + match = "and", + return_indices = FALSE, + remove_na = TRUE, + drop_na, + ...) { if (!is.data.frame(to)) { to <- as.data.frame(to) } original_x <- x + ## TODO: remove deprecated argument later + if (!missing(drop_na)) { + insight::format_warning("Argument `drop_na` is deprecated. Please use `remove_na` instead.") + remove_na <- drop_na + } + # evaluate match <- match.arg(tolower(match), c("and", "&", "&&", "or", "|", "||", "!", "not")) match <- switch(match, @@ -133,7 +146,7 @@ data_match <- function(x, to, match = "and", return_indices = FALSE, drop_na = T idx <- vector("numeric", length = 0L) } else { # remove missings before matching - if (isTRUE(drop_na)) { + if (isTRUE(remove_na)) { x <- x[stats::complete.cases(x), , drop = FALSE] } idx <- seq_len(nrow(x)) diff --git a/R/data_modify.R b/R/data_modify.R index e7744c1f5..3e30b8f68 100644 --- a/R/data_modify.R +++ b/R/data_modify.R @@ -22,6 +22,9 @@ #' character vector is provided, you may not add further elements to `...`. #' - Using `NULL` as right-hand side removes a variable from the data frame. #' Example: `Petal.Width = NULL`. +#' - For data frames (including grouped ones), the function `n()` can be used to count the +#' number of observations and thereby, for instance, create index values by +#' using `id = 1:n()` or `id = 3:(n()+2)` and similar. 
#' #' Note that newly created variables can be used in subsequent expressions, #' including `.at` or `.if`. See also 'Examples'. @@ -92,7 +95,8 @@ #' grouped_efc, #' c12hour_c = center(c12hour), #' c12hour_z = c12hour_c / sd(c12hour, na.rm = TRUE), -#' c12hour_z2 = standardize(c12hour) +#' c12hour_z2 = standardize(c12hour), +#' id = 1:n() #' ) #' head(new_efc) #' @@ -145,6 +149,11 @@ data_modify.default <- function(data, ...) { data_modify.data.frame <- function(data, ..., .if = NULL, .at = NULL, .modify = NULL) { dots <- eval(substitute(alist(...))) + # error for data frames with no rows... + if (nrow(data) == 0) { + insight::format_error("`data` is an empty data frame. `data_modify()` only works for data frames with at least one row.") # nolint + } + # check if we have dots, or only at/modify ---- if (length(dots)) { @@ -201,6 +210,10 @@ data_modify.grouped_df <- function(data, ..., .if = NULL, .at = NULL, .modify = # the data.frame method later... dots <- match.call(expand.dots = FALSE)[["..."]] + # error for data frames with no rows... + if (nrow(data) == 0) { + insight::format_error("`data` is an empty data frame. 
`data_modify()` only works for data frames with at least one row.") # nolint + } grps <- attr(data, "groups", exact = TRUE) grps <- grps[[".rows"]] @@ -352,8 +365,12 @@ data_modify.grouped_df <- function(data, ..., .if = NULL, .at = NULL, .modify = # finally, we can evaluate expression and get values for new variables symbol_string <- insight::safe_deparse(symbol) if (!is.null(symbol_string) && all(symbol_string == "n()")) { - # "special" functions + # "special" functions - using "n()" just returns number of rows new_variable <- nrow(data) + } else if (!is.null(symbol_string) && length(symbol_string) == 1 && grepl("\\bn\\(\\)", symbol_string)) { + # "special" functions, like "1:n()" or similar - but not "1:fun()" + symbol_string <- str2lang(gsub("n()", "nrow(data)", symbol_string, fixed = TRUE)) + new_variable <- try(with(data, eval(symbol_string)), silent = TRUE) } else { # default evaluation of expression new_variable <- try(with(data, eval(symbol)), silent = TRUE) diff --git a/R/data_partition.R b/R/data_partition.R index 09add9dd7..99f481e18 100644 --- a/R/data_partition.R +++ b/R/data_partition.R @@ -15,7 +15,6 @@ #' @param row_id Character string, indicating the name of the column that #' contains the row-id's. #' @param verbose Toggle messages and warnings. -#' @param group Deprecated. Use `by` instead. #' #' @return A list of data frames. The list includes one training set per given #' proportion and the remaining data as test set. List elements of training @@ -50,17 +49,10 @@ data_partition <- function(data, seed = NULL, row_id = ".row_id", verbose = TRUE, - group = NULL, ...) { # validation checks data <- .coerce_to_dataframe(data) - ## TODO: remove warning in future release - if (!is.null(group)) { - by <- group - insight::format_warning("Argument `group` is deprecated and will be removed in a future release. 
Please use `by` instead.") # nolint - } - if (sum(proportion) > 1) { insight::format_error("Sum of `proportion` cannot be higher than 1.") } diff --git a/R/data_read.R b/R/data_read.R index 5137a7735..b24a5bdc2 100644 --- a/R/data_read.R +++ b/R/data_read.R @@ -15,15 +15,16 @@ #' for SAS data files. #' @param encoding The character encoding used for the file. Usually not needed. #' @param convert_factors If `TRUE` (default), numeric variables, where all -#' values have a value label, are assumed to be categorical and converted -#' into factors. If `FALSE`, no variable types are guessed and no conversion -#' of numeric variables into factors will be performed. See also section -#' 'Differences to other packages'. For `data_write()`, this argument only -#' applies to the text (e.g. `.txt` or `.csv`) or spreadsheet file formats (like -#' `.xlsx`). Converting to factors might be useful for these formats because -#' labelled numeric variables are then converted into factors and exported as -#' character columns - else, value labels would be lost and only numeric values -#' are written to the file. +#' values have a value label, are assumed to be categorical and converted into +#' factors. If `FALSE`, no variable types are guessed and no conversion of +#' numeric variables into factors will be performed. For `data_read()`, this +#' argument only applies to file types with *labelled data*, e.g. files from +#' SPSS, SAS or Stata. See also section 'Differences to other packages'. For +#' `data_write()`, this argument only applies to the text (e.g. `.txt` or +#' `.csv`) or spreadsheet file formats (like `.xlsx`). Converting to factors +#' might be useful for these formats because labelled numeric variables are then +#' converted into factors and exported as character columns - else, value labels +#' would be lost and only numeric values are written to the file. #' @param verbose Toggle warnings and messages. #' @param ... 
Arguments passed to the related `read_*()` or `write_*()` functions. #' @@ -65,12 +66,13 @@ #' @section Differences to other packages that read foreign data formats: #' `data_read()` is most comparable to `rio::import()`. For data files from #' SPSS, SAS or Stata, which support labelled data, variables are converted into -#' their most appropriate type. The major difference to `rio::import()` is that -#' `data_read()` automatically converts fully labelled numeric variables into -#' factors, where imported value labels will be set as factor levels. If a -#' numeric variable has _no_ value labels or less value labels than values, it -#' is not converted to factor. In this case, value labels are preserved as -#' `"labels"` attribute. Character vectors are preserved. Use +#' their most appropriate type. The major difference to `rio::import()` is for +#' data files from SPSS, SAS, or Stata, i.e. file types that support +#' *labelled data*. `data_read()` automatically converts fully labelled numeric +#' variables into factors, where imported value labels will be set as factor +#' levels. If a numeric variable has _no_ value labels or less value labels than +#' values, it is not converted to factor. In this case, value labels are +#' preserved as `"labels"` attribute. Character vectors are preserved. Use #' `convert_factors = FALSE` to remove the automatic conversion of numeric #' variables to factors. #' @@ -105,7 +107,7 @@ data_read <- function(path, por = .read_spss(path, encoding, convert_factors, verbose, ...), dta = .read_stata(path, encoding, convert_factors, verbose, ...), sas7bdat = .read_sas(path, path_catalog, encoding, convert_factors, verbose, ...), - .read_unknown(path, file_type, convert_factors, verbose, ...) + .read_unknown(path, file_type, verbose, ...) 
) # tell user about empty columns @@ -161,7 +163,7 @@ data_read <- function(path, # user may decide whether we automatically detect variable type or not if (isTRUE(convert_factors)) { if (verbose) { - msg <- "Variables where all values have associated labels are now converted into factors. If this is not intended, use `convert_factors = FALSE`." + msg <- "Variables where all values have associated labels are now converted into factors. If this is not intended, use `convert_factors = FALSE`." # nolint insight::format_alert(msg) } x[] <- lapply(x, function(i) { @@ -188,7 +190,7 @@ data_read <- function(path, value_labels <- NULL attr(i, "converted_to_factor") <- TRUE } else { - # else, fall back to numeric + # else, fall back to numeric or factor i <- as.numeric(i) } @@ -288,7 +290,7 @@ data_read <- function(path, } -.read_unknown <- function(path, file_type, convert_factors, verbose, ...) { +.read_unknown <- function(path, file_type, verbose, ...) { insight::check_if_installed("rio", reason = paste0("to read files of type '", file_type, "'")) if (verbose) { insight::format_alert("Reading data...") @@ -296,7 +298,7 @@ data_read <- function(path, # set up arguments. for RDS, we set trust = TRUE, to avoid warnings rio_args <- list(file = path) # check if we have RDS, and if so, add trust = TRUE - if (file_type == "rds") { + if (file_type %in% c("rds", "rdata")) { rio_args$trust <- TRUE } out <- do.call(rio::import, c(rio_args, list(...))) @@ -317,6 +319,5 @@ data_read <- function(path, } out <- tmp } - - .post_process_imported_data(out, convert_factors, verbose) + out } diff --git a/R/data_rename.R b/R/data_rename.R index b8f213c7f..18f45657b 100644 --- a/R/data_rename.R +++ b/R/data_rename.R @@ -13,11 +13,15 @@ #' @param pattern Character vector. For `data_rename()`, indicates columns that #' should be selected for renaming. Can be `NULL` (in which case all columns #' are selected). 
For `data_addprefix()` or `data_addsuffix()`, a character -#' string, which will be added as prefix or suffix to the column names. +#' string, which will be added as prefix or suffix to the column names. For +#' `data_rename()`, `pattern` can also be a named vector. In this case, names +#' are used as values for the `replacement` argument (i.e. `pattern` can be a +#' character vector using `<new name> = "<old name>"` and argument `replacement` +#' will be ignored then). #' @param replacement Character vector. Indicates the new name of the columns #' selected in `pattern`. Can be `NULL` (in which case column are numbered #' in sequential order). If not `NULL`, `pattern` and `replacement` must be -#' of the same length. +#' of the same length. If `pattern` is a named vector, `replacement` is ignored. #' @param rows Vector of row names. #' @param safe Do not throw error if for instance the variable to be #' renamed/removed doesn't exist. @@ -33,12 +37,14 @@ #' head(data_rename(iris, "FakeCol", "length")) # This doesn't #' head(data_rename(iris, c("Sepal.Length", "Sepal.Width"), c("length", "width"))) #' +#' # use named vector to rename +#' head(data_rename(iris, c(length = "Sepal.Length", width = "Sepal.Width"))) +#' #' # Reset names #' head(data_rename(iris, NULL)) #' #' # Change all #' head(data_rename(iris, replacement = paste0("Var", 1:5))) -#' #' @seealso #' - Functions to rename stuff: [data_rename()], [data_rename_rows()], [data_addprefix()], [data_addsuffix()] #' - Functions to reorder or remove columns: [data_reorder()], [data_relocate()], [data_remove()] @@ -66,11 +72,44 @@ data_rename <- function(data, insight::format_error("Argument `pattern` must be of type character.") } + + # check if `pattern` has names, and if so, use as "replacement" + if (!is.null(names(pattern))) { + replacement <- names(pattern) + } + # name columns 1, 2, 3 etc. 
if no replacement if (is.null(replacement)) { replacement <- paste0(seq_along(pattern)) } + # coerce to character + replacement <- as.character(replacement) + + # check if `replacement` has no empty strings and no NA values + invalid_replacement <- is.na(replacement) | !nzchar(replacement) + if (any(invalid_replacement)) { + if (is.null(names(pattern))) { + # when user did not match `pattern` with `replacement` + msg <- c( + "`replacement` is not allowed to have `NA` or empty strings.", + sprintf( + "Following values in `pattern` have no match in `replacement`: %s", + toString(pattern[invalid_replacement]) + ) + ) + } else { + # when user did not name all elements of `pattern` + msg <- c( + "Either name all elements of `pattern` or use `replacement`.", + sprintf( + "Following values in `pattern` were not named: %s", + toString(pattern[invalid_replacement]) + ) + ) + } + insight::format_error(msg) + } + # if duplicated names in replacement, append ".2", ".3", etc. to duplicates # ex: c("foo", "foo") -> c("foo", "foo.2") if (anyDuplicated(replacement) > 0L) { diff --git a/R/data_select.R b/R/data_select.R index 0f62ba398..db91fc06b 100644 --- a/R/data_select.R +++ b/R/data_select.R @@ -38,25 +38,3 @@ data_select <- function(data, out <- .replace_attrs(out, a) out } - - -#' @rdname extract_column_names -#' @export -get_columns <- function(data, - select = NULL, - exclude = NULL, - ignore_case = FALSE, - regex = FALSE, - verbose = TRUE, - ...) { - insight::format_warning("Function `get_columns()` is deprecated and will be removed in a future release. Please use `data_select()` instead.") # nolint - data_select( - data, - select = select, - exclude = exclude, - ignore_case = ignore_case, - regex = regex, - verbose = verbose, - ... - ) -} diff --git a/R/data_summary.R b/R/data_summary.R index 8d15f8483..7662d0c94 100644 --- a/R/data_summary.R +++ b/R/data_summary.R @@ -8,9 +8,9 @@ #' @param by Optional character string, indicating the name of a variable in `x`. 
#' If supplied, the data will be split by this variable and summary statistics #' will be computed for each group. -#' @param include_na Logical. If `TRUE`, missing values are included as a level -#' in the grouping variable. If `FALSE`, missing values are omitted from the -#' grouping variable. +#' @param remove_na Logical. If `TRUE`, missing values are omitted from the +#' grouping variable. If `FALSE` (default), missing values are included as a +#' level in the grouping variable. #' @param ... One or more named expressions that define the new variable name #' and the function to compute the summary statistic. Example: #' `mean_sepal_width = mean(Sepal.Width)`. The expression can also be provided @@ -57,8 +57,8 @@ data_summary <- function(x, ...) { #' @export -data_summary.matrix <- function(x, ..., by = NULL, include_na = TRUE) { - data_summary(as.data.frame(x), ..., by = by, include_na = include_na) +data_summary.matrix <- function(x, ..., by = NULL, remove_na = FALSE) { + data_summary(as.data.frame(x), ..., by = by, remove_na = remove_na) } @@ -70,7 +70,7 @@ data_summary.default <- function(x, ...) { #' @rdname data_summary #' @export -data_summary.data.frame <- function(x, ..., by = NULL, include_na = TRUE) { +data_summary.data.frame <- function(x, ..., by = NULL, remove_na = FALSE) { dots <- eval(substitute(alist(...))) # do we have any expression at all? 
@@ -103,10 +103,10 @@ data_summary.data.frame <- function(x, ..., by = NULL, include_na = TRUE) { } # split data, add NA levels, if requested l <- lapply(x[by], function(i) { - if (include_na && anyNA(i)) { - addNA(i) - } else { + if (remove_na || !anyNA(i)) { i + } else { + addNA(i) } }) split_data <- split(x, l, drop = TRUE) @@ -137,7 +137,7 @@ data_summary.data.frame <- function(x, ..., by = NULL, include_na = TRUE) { #' @export -data_summary.grouped_df <- function(x, ..., by = NULL, include_na = TRUE) { +data_summary.grouped_df <- function(x, ..., by = NULL, remove_na = FALSE) { # extract group variables grps <- attr(x, "groups", exact = TRUE) group_variables <- data_remove(grps, ".rows") @@ -148,7 +148,7 @@ data_summary.grouped_df <- function(x, ..., by = NULL, include_na = TRUE) { # remove information specific to grouped df's attr(x, "groups") <- NULL class(x) <- "data.frame" - data_summary(x, ..., by = by, include_na = include_na) + data_summary(x, ..., by = by, remove_na = remove_na) } diff --git a/R/data_tabulate.R b/R/data_tabulate.R index 6a26a39c9..621fabbb1 100644 --- a/R/data_tabulate.R +++ b/R/data_tabulate.R @@ -15,7 +15,7 @@ #' factor levels are dropped from the frequency table. #' @param name Optional character string, which includes the name that is used #' for printing. -#' @param include_na Logical, if `TRUE`, missing values are included in the +#' @param remove_na Logical, if `FALSE`, missing values are included in the #' frequency or crosstable, else missing values are omitted. #' @param collapse Logical, if `TRUE` collapses multiple tables into one larger #' table for printing. This affects only printing, not the returned object. @@ -28,12 +28,19 @@ #' @param ... not used. #' @inheritParams extract_column_names #' +#' @details +#' There is an `as.data.frame()` method, to return the frequency tables as a +#' data frame. 
The structure of the returned object is a nested data frame, +#' where the first column contains name of the variable for which frequencies +#' were calculated, and the second column is a list column that contains the +#' frequency tables as data frame. See 'Examples'. +#' #' @section Crosstables: #' If `by` is supplied, a crosstable is created. The crosstable includes `<NA>` #' (missing) values by default. The first column indicates values of `x`, the #' first row indicates values of `by` (including missing values). The last row #' and column contain the total frequencies for each row and column, respectively. -#' Setting `include_na = FALSE` will omit missing values from the crosstable. +#' Setting `remove_na = TRUE` will omit missing values from the crosstable. #' Setting `proportions` to `"row"` or `"column"` will add row or column #' percentages. Setting `proportions` to `"full"` will add relative frequencies #' for the full table. @@ -41,7 +48,8 @@ #' @note #' There are `print_html()` and `print_md()` methods available for printing #' frequency or crosstables in HTML and markdown format, e.g. -#' `print_html(data_tabulate(x))`. +#' `print_html(data_tabulate(x))`. The `print()` method for text outputs passes +#' arguments in `...` to [`insight::export_table()`]. #' #' @return A data frame, or a list of data frames, with one frequency table #' as data frame per variable. 
@@ -55,7 +63,7 @@ #' data_tabulate(efc$c172code) #' #' # drop missing values -#' data_tabulate(efc$c172code, include_na = FALSE) +#' data_tabulate(efc$c172code, remove_na = TRUE) #' #' # data frame #' data_tabulate(efc, c("e42dep", "c172code")) @@ -102,12 +110,18 @@ #' efc$c172code, #' by = efc$e16sex, #' proportions = "column", -#' include_na = FALSE +#' remove_na = TRUE #' ) #' #' # round percentages #' out <- data_tabulate(efc, "c172code", by = "e16sex", proportions = "column") #' print(out, digits = 0) +#' +#' # coerce to data frames +#' result <- data_tabulate(efc, "c172code", by = "e16sex") +#' as.data.frame(result) +#' as.data.frame(result)$table +#' as.data.frame(result, add_total = TRUE)$table #' @export data_tabulate <- function(x, ...) { UseMethod("data_tabulate") @@ -120,7 +134,7 @@ data_tabulate.default <- function(x, by = NULL, drop_levels = FALSE, weights = NULL, - include_na = TRUE, + remove_na = FALSE, proportions = NULL, name = NULL, verbose = TRUE, @@ -150,7 +164,7 @@ data_tabulate.default <- function(x, x, by = by, weights = weights, - include_na = include_na, + remove_na = remove_na, proportions = proportions, obj_name = obj_name, group_variable = group_variable @@ -159,30 +173,34 @@ data_tabulate.default <- function(x, # frequency table if (is.null(weights)) { - if (include_na) { - freq_table <- tryCatch(table(addNA(x)), error = function(e) NULL) - } else { + if (remove_na) { + # we have a `.default` and a `.data.frame` method for `data_tabulate()`. + # since this is the default, `x` can be an object which cannot be used + # with `table()`, that's why we add `tryCatch()` here. Below we give an + # informative error message for non-supported objects. 
freq_table <- tryCatch(table(x), error = function(e) NULL) + } else { + freq_table <- tryCatch(table(addNA(x)), error = function(e) NULL) } - } else if (include_na) { - # weighted frequency table, including NA + } else if (remove_na) { + # weighted frequency table, excluding NA freq_table <- tryCatch( stats::xtabs( weights ~ x, - data = data.frame(weights = weights, x = addNA(x)), - na.action = stats::na.pass, - addNA = TRUE + data = data.frame(weights = weights, x = x), + na.action = stats::na.omit, + addNA = FALSE ), error = function(e) NULL ) } else { - # weighted frequency table, excluding NA + # weighted frequency table, including NA freq_table <- tryCatch( stats::xtabs( weights ~ x, - data = data.frame(weights = weights, x = x), - na.action = stats::na.omit, - addNA = FALSE + data = data.frame(weights = weights, x = addNA(x)), + na.action = stats::na.pass, + addNA = TRUE ), error = function(e) NULL ) @@ -205,12 +223,12 @@ data_tabulate.default <- function(x, out$`Raw %` <- 100 * out$N / sum(out$N) # if we have missing values, we add a row with NA - if (include_na) { - out$`Valid %` <- c(100 * out$N[-nrow(out)] / sum(out$N[-nrow(out)]), NA) - valid_n <- sum(out$N[-length(out$N)], na.rm = TRUE) - } else { + if (remove_na) { out$`Valid %` <- 100 * out$N / sum(out$N) valid_n <- sum(out$N, na.rm = TRUE) + } else { + out$`Valid %` <- c(100 * out$N[-nrow(out)] / sum(out$N[-nrow(out)]), NA) + valid_n <- sum(out$N[-length(out$N)], na.rm = TRUE) } out$`Cumulative %` <- cumsum(out$`Valid %`) @@ -242,7 +260,7 @@ data_tabulate.default <- function(x, attr(out, "total_n") <- sum(out$N, na.rm = TRUE) attr(out, "valid_n") <- valid_n - class(out) <- c("dw_data_tabulate", "data.frame") + class(out) <- c("datawizard_table", "data.frame") out } @@ -258,7 +276,7 @@ data_tabulate.data.frame <- function(x, by = NULL, drop_levels = FALSE, weights = NULL, - include_na = TRUE, + remove_na = FALSE, proportions = NULL, collapse = FALSE, verbose = TRUE, @@ -284,7 +302,7 @@ 
data_tabulate.data.frame <- function(x, proportions = proportions, drop_levels = drop_levels, weights = weights, - include_na = include_na, + remove_na = remove_na, name = i, verbose = verbose, ... @@ -292,9 +310,9 @@ data_tabulate.data.frame <- function(x, }) if (is.null(by)) { - class(out) <- c("dw_data_tabulates", "list") + class(out) <- c("datawizard_tables", "list") } else { - class(out) <- c("dw_data_xtabulates", "list") + class(out) <- c("datawizard_crosstabs", "list") } attr(out, "collapse") <- isTRUE(collapse) attr(out, "is_weighted") <- !is.null(weights) @@ -313,7 +331,7 @@ data_tabulate.grouped_df <- function(x, proportions = NULL, drop_levels = FALSE, weights = NULL, - include_na = TRUE, + remove_na = FALSE, collapse = FALSE, verbose = TRUE, ...) { @@ -349,7 +367,7 @@ data_tabulate.grouped_df <- function(x, verbose = verbose, drop_levels = drop_levels, weights = weights, - include_na = include_na, + remove_na = remove_na, by = by, proportions = proportions, group_variable = group_variable, @@ -357,9 +375,9 @@ data_tabulate.grouped_df <- function(x, )) } if (is.null(by)) { - class(out) <- c("dw_data_tabulates", "list") + class(out) <- c("datawizard_tables", "list") } else { - class(out) <- c("dw_data_xtabulates", "list") + class(out) <- c("datawizard_crosstabs", "list") } attr(out, "collapse") <- isTRUE(collapse) attr(out, "is_weighted") <- !is.null(weights) @@ -380,8 +398,64 @@ insight::print_html insight::print_md +#' @rdname data_tabulate +#' @param add_total For crosstables (i.e. when `by` is not `NULL`), a row and +#' column with the total N values are added to the data frame. `add_total` has +#' no effect in `as.data.frame()` for simple frequency tables. 
+#' @inheritParams base::as.data.frame +#' @export +as.data.frame.datawizard_tables <- function(x, + row.names = NULL, + optional = FALSE, + ..., + stringsAsFactors = FALSE, + add_total = FALSE) { + # extract variables of frequencies + selected_vars <- unlist(lapply(x, function(i) attributes(i)$varname)) + # coerce to data frame, remove rownames + data_frames <- lapply(x, function(i) { + # the `format()` methods for objects returned by `data_tabulate()` call + # `as.data.frame()` - we have to pay attention to avoid infinite iterations + # here. At the moment, this is no problem, as objects we have at this stage + # are of class "datawizard_table" or "datawizard_crosstab", while this + # `as.data.frame()` method is only called for "datawizard_tables" (the plural) + # form). Else, we would need to modify the class attribute here, + # e.g. class(i) <- "data.frame" + if (add_total) { + # to add the total column and row, we simply can call `format()` + out <- as.data.frame(format(i)) + for (cols in 2:ncol(out)) { + # since "format()" returns a character matrix, we want to convert + # the columns to numeric. We have to exclude the first column, as the + # first column is character, due to the added "Total" value. + out[[cols]] <- as.numeric(out[[cols]]) + } + # after formatting, we have a "separator" row for nicer printing. + # this should also be removed + out <- remove_empty_rows(out) + } else { + out <- as.data.frame(i) + } + rownames(out) <- NULL + out + }) + # create nested data frame + result <- data.frame( + var = selected_vars, + table = I(data_frames), + stringsAsFactors = stringsAsFactors + ) + # consider additional arguments + rownames(result) <- row.names + result +} + +#' @export +as.data.frame.datawizard_crosstabs <- as.data.frame.datawizard_tables + + #' @export -format.dw_data_tabulate <- function(x, format = "text", big_mark = NULL, ...) { +format.datawizard_table <- function(x, format = "text", big_mark = NULL, ...) 
{ # convert to character manually, else, for large numbers, # format_table() returns scientific notation x <- as.data.frame(x) @@ -414,7 +488,7 @@ format.dw_data_tabulate <- function(x, format = "text", big_mark = NULL, ...) { #' @export -print.dw_data_tabulate <- function(x, big_mark = NULL, ...) { +print.datawizard_table <- function(x, big_mark = NULL, ...) { a <- attributes(x) # "table" header with variable label/name, and type @@ -449,14 +523,15 @@ print.dw_data_tabulate <- function(x, big_mark = NULL, ...) { cat(insight::export_table( format(x, big_mark = big_mark, ...), cross = "+", - missing = "" + missing = "", + ... )) invisible(x) } #' @export -print_html.dw_data_tabulate <- function(x, big_mark = NULL, ...) { +print_html.datawizard_table <- function(x, big_mark = NULL, ...) { a <- attributes(x) # "table" header with variable label/name, and type @@ -486,7 +561,7 @@ print_html.dw_data_tabulate <- function(x, big_mark = NULL, ...) { #' @export -print_md.dw_data_tabulate <- function(x, big_mark = NULL, ...) { +print_md.datawizard_table <- function(x, big_mark = NULL, ...) { a <- attributes(x) # "table" header with variable label/name, and type @@ -516,7 +591,7 @@ print_md.dw_data_tabulate <- function(x, big_mark = NULL, ...) { #' @export -print.dw_data_tabulates <- function(x, big_mark = NULL, ...) { +print.datawizard_tables <- function(x, big_mark = NULL, ...) { # check if we have weights is_weighted <- isTRUE(attributes(x)$is_weighted) @@ -548,14 +623,15 @@ print.dw_data_tabulates <- function(x, big_mark = NULL, ...) { out, missing = "", cross = "+", - empty_line = "-" + empty_line = "-", + ... )) } } #' @export -print_html.dw_data_tabulates <- function(x, big_mark = NULL, ...) { +print_html.datawizard_tables <- function(x, big_mark = NULL, ...) { # check if we have weights is_weighted <- isTRUE(attributes(x)$is_weighted) @@ -584,7 +660,7 @@ print_html.dw_data_tabulates <- function(x, big_mark = NULL, ...) 
{ #' @export -print_md.dw_data_tabulates <- function(x, big_mark = NULL, ...) { +print_md.datawizard_tables <- function(x, big_mark = NULL, ...) { # check if we have weights is_weighted <- isTRUE(attributes(x)$is_weighted) diff --git a/R/data_xtabulate.R b/R/data_xtabulate.R index 3cb25d62b..c9595eccf 100644 --- a/R/data_xtabulate.R +++ b/R/data_xtabulate.R @@ -3,7 +3,7 @@ .crosstable <- function(x, by, weights = NULL, - include_na = TRUE, + remove_na = FALSE, proportions = NULL, obj_name = NULL, group_variable = NULL) { @@ -12,30 +12,34 @@ } # frequency table if (is.null(weights)) { - if (include_na) { - x_table <- tryCatch(table(addNA(x), addNA(by)), error = function(e) NULL) - } else { + # we have a `.default` and a `.data.frame` method for `data_tabulate()`. + # since this is the default, `x` can be an object which cannot be used + # with `table()`, that's why we add `tryCatch()` here. Below we give an + # informative error message for non-supported objects. + if (remove_na) { x_table <- tryCatch(table(x, by), error = function(e) NULL) + } else { + x_table <- tryCatch(table(addNA(x), addNA(by)), error = function(e) NULL) } - } else if (include_na) { - # weighted frequency table, including NA + } else if (remove_na) { + # weighted frequency table, excluding NA x_table <- tryCatch( stats::xtabs( weights ~ x + by, - data = data.frame(weights = weights, x = addNA(x), by = addNA(by)), - na.action = stats::na.pass, - addNA = TRUE + data = data.frame(weights = weights, x = x, by = by), + na.action = stats::na.omit, + addNA = FALSE ), error = function(e) NULL ) } else { - # weighted frequency table, excluding NA + # weighted frequency table, including NA x_table <- tryCatch( stats::xtabs( weights ~ x + by, - data = data.frame(weights = weights, x = x, by = by), - na.action = stats::na.omit, - addNA = FALSE + data = data.frame(weights = weights, x = addNA(x), by = addNA(by)), + na.action = stats::na.pass, + addNA = TRUE ), error = function(e) NULL ) @@ -74,8 +78,9 @@ 
attr(out, "total_n") <- total_n attr(out, "weights") <- weights attr(out, "proportions") <- proportions + attr(out, "varname") <- obj_name - class(out) <- c("dw_data_xtabulate", "data.frame") + class(out) <- c("datawizard_crosstab", "data.frame") out } @@ -85,7 +90,7 @@ #' @export -format.dw_data_xtabulate <- function(x, format = "text", digits = 1, big_mark = NULL, ...) { +format.datawizard_crosstab <- function(x, format = "text", digits = 1, big_mark = NULL, ...) { # convert to character manually, else, for large numbers, # format_table() returns scientific notation x <- as.data.frame(x) @@ -178,7 +183,7 @@ format.dw_data_xtabulate <- function(x, format = "text", digits = 1, big_mark = #' @export -print.dw_data_xtabulate <- function(x, big_mark = NULL, ...) { +print.datawizard_crosstab <- function(x, big_mark = NULL, ...) { # grouped data? if yes, add information on grouping factor if (is.null(x[["Group"]])) { caption <- NULL @@ -193,14 +198,15 @@ print.dw_data_xtabulate <- function(x, big_mark = NULL, ...) { cross = "+", missing = "", caption = caption, - empty_line = "-" + empty_line = "-", + ... )) invisible(x) } #' @export -print_md.dw_data_xtabulate <- function(x, big_mark = NULL, ...) { +print_md.datawizard_crosstab <- function(x, big_mark = NULL, ...) { # grouped data? if yes, add information on grouping factor if (is.null(x[["Group"]])) { caption <- NULL @@ -222,7 +228,7 @@ print_md.dw_data_xtabulate <- function(x, big_mark = NULL, ...) { #' @export -print_html.dw_data_xtabulate <- function(x, big_mark = NULL, ...) { +print_html.datawizard_crosstab <- function(x, big_mark = NULL, ...) { # grouped data? if yes, add information on grouping factor if (!is.null(x[["Group"]])) { x$groups <- paste0("Grouped by ", x[["Group"]][1]) @@ -240,7 +246,7 @@ print_html.dw_data_xtabulate <- function(x, big_mark = NULL, ...) { #' @export -print.dw_data_xtabulates <- function(x, big_mark = NULL, ...) { +print.datawizard_crosstabs <- function(x, big_mark = NULL, ...) 
{ for (i in seq_along(x)) { print(x[[i]], big_mark = big_mark, ...) cat("\n") @@ -250,7 +256,7 @@ print.dw_data_xtabulates <- function(x, big_mark = NULL, ...) { #' @export -print_html.dw_data_xtabulates <- function(x, big_mark = NULL, ...) { +print_html.datawizard_crosstabs <- function(x, big_mark = NULL, ...) { if (length(x) == 1) { print_html(x[[1]], big_mark = big_mark, ...) } else { diff --git a/R/demean.R b/R/demean.R index bbf7d2dfc..b5363edb6 100644 --- a/R/demean.R +++ b/R/demean.R @@ -12,7 +12,25 @@ #' @param select Character vector (or formula) with names of variables to select #' that should be group- and de-meaned. #' @param by Character vector (or formula) with the name of the variable that -#' indicates the group- or cluster-ID. +#' indicates the group- or cluster-ID. For cross-classified or nested designs, +#' `by` can also identify two or more variables as group- or cluster-IDs. If +#' the data is nested and should be treated as such, set `nested = TRUE`. Else, +#' if `by` defines two or more variables and `nested = FALSE`, a cross-classified +#' design is assumed. Note that `demean()` and `degroup()` can't handle a mix +#' of nested and cross-classified designs in one model. +#' +#' For nested designs, `by` can be: +#' - a character vector with the name of the variable that indicates the +#' levels, ordered from *highest* level to *lowest* (e.g. +#' `by = c("L4", "L3", "L2")`. +#' - a character vector with variable names in the format `by = "L4/L3/L2"`, +#' where the levels are separated by `/`. +#' +#' See also section _De-meaning for cross-classified designs_ and +#' _De-meaning for nested designs_ below. +#' @param nested Logical, if `TRUE`, the data is treated as nested. If `FALSE`, +#' the data is treated as cross-classified. Only applies if `by` contains more +#' than one variable. #' @param center Method for centering. 
`demean()` always performs #' mean-centering, while `degroup()` can use `center = "median"` or #' `center = "mode"` for median- or mode-centering, and also `"min"` @@ -25,174 +43,213 @@ #' attributes to indicate the within- and between-effects. This is only #' relevant when printing `model_parameters()` - in such cases, the #' within- and between-effects are printed in separated blocks. -#' @param group Deprecated. Use `by` instead. #' @inheritParams center #' #' @return #' A data frame with the group-/de-meaned variables, which get the suffix #' `"_between"` (for the group-meaned variable) and `"_within"` (for the -#' de-meaned variable) by default. +#' de-meaned variable) by default. For cross-classified or nested designs, +#' the name pattern of the group-meaned variables is the name of the centered +#' variable followed by the name of the variable that indicates the related +#' grouping level, e.g. `predictor_L3_between` and `predictor_L2_between`. #' #' @seealso If grand-mean centering (instead of centering within-clusters) -#' is required, see [center()]. See [`performance::check_heterogeneity_bias()`] +#' is required, see [`center()`]. See [`performance::check_heterogeneity_bias()`] #' to check for heterogeneity bias. #' -#' @details -#' -#' \subsection{Heterogeneity Bias}{ -#' Mixed models include different levels of sources of variability, i.e. -#' error terms at each level. When macro-indicators (or level-2 predictors, -#' or higher-level units, or more general: *group-level predictors that -#' **vary** within and across groups*) are included as fixed effects (i.e. 
-#' treated as covariate at level-1), the variance that is left unaccounted for -#' this covariate will be absorbed into the error terms of level-1 and level-2 -#' (\cite{Bafumi and Gelman 2006; Gelman and Hill 2007, Chapter 12.6.}): -#' \dQuote{Such covariates contain two parts: one that is specific to the -#' higher-level entity that does not vary between occasions, and one that -#' represents the difference between occasions, within higher-level entities} -#' (\cite{Bell et al. 2015}). Hence, the error terms will be correlated with -#' the covariate, which violates one of the assumptions of mixed models -#' (iid, independent and identically distributed error terms). This bias is -#' also called the *heterogeneity bias* (\cite{Bell et al. 2015}). To -#' resolve this problem, level-2 predictors used as (level-1) covariates should -#' be separated into their "within" and "between" effects by "de-meaning" and -#' "group-meaning": After demeaning time-varying predictors, \dQuote{at the -#' higher level, the mean term is no longer constrained by Level 1 effects, -#' so it is free to account for all the higher-level variance associated -#' with that variable} (\cite{Bell et al. 2015}). -#' } -#' -#' \subsection{Panel data and correlating fixed and group effects}{ -#' `demean()` is intended to create group- and de-meaned variables -#' for panel regression models (fixed effects models), or for complex -#' random-effect-within-between models (see \cite{Bell et al. 2015, 2018}), -#' where group-effects (random effects) and fixed effects correlate (see -#' \cite{Bafumi and Gelman 2006}). This can happen, for instance, when -#' analyzing panel data, which can lead to *Heterogeneity Bias*. To -#' control for correlating predictors and group effects, it is recommended -#' to include the group-meaned and de-meaned version of *time-varying covariates* -#' (and group-meaned version of *time-invariant covariates* that are on -#' a higher level, e.g. 
level-2 predictors) in the model. By this, one can -#' fit complex multilevel models for panel data, including time-varying -#' predictors, time-invariant predictors and random effects. -#' } -#' -#' \subsection{Why mixed models are preferred over fixed effects models}{ -#' A mixed models approach can model the causes of endogeneity explicitly -#' by including the (separated) within- and between-effects of time-varying -#' fixed effects and including time-constant fixed effects. Furthermore, -#' mixed models also include random effects, thus a mixed models approach -#' is superior to classic fixed-effects models, which lack information of -#' variation in the group-effects or between-subject effects. Furthermore, -#' fixed effects regression cannot include random slopes, which means that -#' fixed effects regressions are neglecting \dQuote{cross-cluster differences -#' in the effects of lower-level controls (which) reduces the precision of -#' estimated context effects, resulting in unnecessarily wide confidence -#' intervals and low statistical power} (\cite{Heisig et al. 2017}). -#' } -#' -#' \subsection{Terminology}{ -#' The group-meaned variable is simply the mean of an independent variable -#' within each group (or id-level or cluster) represented by `by`. -#' It represents the cluster-mean of an independent variable. The regression -#' coefficient of a group-meaned variable is the *between-subject-effect*. -#' The de-meaned variable is then the centered version of the group-meaned -#' variable. De-meaning is sometimes also called person-mean centering or -#' centering within clusters. The regression coefficient of a de-meaned -#' variable represents the *within-subject-effect*. -#' } -#' -#' \subsection{De-meaning with continuous predictors}{ -#' For continuous time-varying predictors, the recommendation is to include -#' both their de-meaned and group-meaned versions as fixed effects, but not -#' the raw (untransformed) time-varying predictors themselves. 
The de-meaned -#' predictor should also be included as random effect (random slope). In -#' regression models, the coefficient of the de-meaned predictors indicates -#' the within-subject effect, while the coefficient of the group-meaned -#' predictor indicates the between-subject effect. -#' } -#' -#' \subsection{De-meaning with binary predictors}{ -#' For binary time-varying predictors, there are two recommendations. First -#' is to include the raw (untransformed) binary predictor as fixed effect -#' only and the *de-meaned* variable as random effect (random slope). -#' The alternative would be to add the de-meaned version(s) of binary -#' time-varying covariates as additional fixed effect as well (instead of -#' adding it as random slope). Centering time-varying binary variables to -#' obtain within-effects (level 1) isn't necessary. They have a sensible -#' interpretation when left in the typical 0/1 format (\cite{Hoffmann 2015, -#' chapter 8-2.I}). `demean()` will thus coerce categorical time-varying -#' predictors to numeric to compute the de- and group-meaned versions for -#' these variables, where the raw (untransformed) binary predictor and the -#' de-meaned version should be added to the model. -#' } -#' -#' \subsection{De-meaning of factors with more than 2 levels}{ -#' Factors with more than two levels are demeaned in two ways: first, these -#' are also converted to numeric and de-meaned; second, dummy variables -#' are created (binary, with 0/1 coding for each level) and these binary -#' dummy-variables are de-meaned in the same way (as described above). -#' Packages like \pkg{panelr} internally convert factors to dummies before -#' demeaning, so this behaviour can be mimicked here. -#' } -#' -#' \subsection{De-meaning interaction terms}{ There are multiple ways to deal -#' with interaction terms of within- and between-effects. A classical approach -#' is to simply use the product term of the de-meaned variables (i.e. 
-#' introducing the de-meaned variables as interaction term in the model -#' formula, e.g. `y ~ x_within * time_within`). This approach, however, -#' might be subject to bias (see \cite{Giesselmann & Schmidt-Catran 2020}). -#' \cr \cr -#' Another option is to first calculate the product term and then apply the -#' de-meaning to it. This approach produces an estimator \dQuote{that reflects -#' unit-level differences of interacted variables whose moderators vary -#' within units}, which is desirable if *no* within interaction of -#' two time-dependent variables is required. \cr \cr -#' A third option, when the interaction should result in a genuine within -#' estimator, is to "double de-mean" the interaction terms -#' (\cite{Giesselmann & Schmidt-Catran 2018}), however, this is currently -#' not supported by `demean()`. If this is required, the `wmb()` -#' function from the \pkg{panelr} package should be used. \cr \cr -#' To de-mean interaction terms for within-between models, simply specify -#' the term as interaction for the `select`-argument, e.g. -#' `select = "a*b"` (see 'Examples'). -#' } -#' -#' \subsection{Analysing panel data with mixed models using lme4}{ -#' A description of how to translate the -#' formulas described in *Bell et al. 2018* into R using `lmer()` -#' from \pkg{lme4} can be found in -#' [this vignette](https://easystats.github.io/parameters/articles/demean.html). -#' } +#' @section Heterogeneity Bias: +#' +#' Mixed models include different levels of sources of variability, i.e. +#' error terms at each level. When macro-indicators (or level-2 predictors, +#' or higher-level units, or more general: *group-level predictors that +#' **vary** within and across groups*) are included as fixed effects (i.e. 
+#' treated as covariate at level-1), the variance that is left unaccounted for +#' this covariate will be absorbed into the error terms of level-1 and level-2 +#' (_Bafumi and Gelman 2006; Gelman and Hill 2007, Chapter 12.6._): +#' "Such covariates contain two parts: one that is specific to the higher-level +#' entity that does not vary between occasions, and one that represents the +#' difference between occasions, within higher-level entities" (_Bell et al. 2015_). +#' Hence, the error terms will be correlated with the covariate, which violates +#' one of the assumptions of mixed models (iid, independent and identically +#' distributed error terms). This bias is also called the *heterogeneity bias* +#' (_Bell et al. 2015_). To resolve this problem, level-2 predictors used as +#' (level-1) covariates should be separated into their "within" and "between" +#' effects by "de-meaning" and "group-meaning": After demeaning time-varying +#' predictors, "at the higher level, the mean term is no longer constrained by +#' Level 1 effects, so it is free to account for all the higher-level variance +#' associated with that variable" (_Bell et al. 2015_). +#' +#' @section Panel data and correlating fixed and group effects: +#' +#' `demean()` is intended to create group- and de-meaned variables for panel +#' regression models (fixed effects models), or for complex +#' random-effect-within-between models (see _Bell et al. 2015, 2018_), where +#' group-effects (random effects) and fixed effects correlate (see +#' _Bafumi and Gelman 2006_). This can happen, for instance, when analyzing +#' panel data, which can lead to *Heterogeneity Bias*. To control for correlating +#' predictors and group effects, it is recommended to include the group-meaned +#' and de-meaned version of *time-varying covariates* (and group-meaned version +#' of *time-invariant covariates* that are on a higher level, e.g. level-2 +#' predictors) in the model. 
By this, one can fit complex multilevel models for +#' panel data, including time-varying predictors, time-invariant predictors and +#' random effects. +#' +#' @section Why mixed models are preferred over fixed effects models: +#' +#' A mixed models approach can model the causes of endogeneity explicitly +#' by including the (separated) within- and between-effects of time-varying +#' fixed effects and including time-constant fixed effects. Furthermore, +#' mixed models also include random effects, thus a mixed models approach +#' is superior to classic fixed-effects models, which lack information of +#' variation in the group-effects or between-subject effects. Furthermore, +#' fixed effects regression cannot include random slopes, which means that +#' fixed effects regressions are neglecting "cross-cluster differences in the +#' effects of lower-level controls (which) reduces the precision of estimated +#' context effects, resulting in unnecessarily wide confidence intervals and +#' low statistical power" (_Heisig et al. 2017_). +#' +#' @section Terminology: +#' +#' The group-meaned variable is simply the mean of an independent variable +#' within each group (or id-level or cluster) represented by `by`. It represents +#' the cluster-mean of an independent variable. The regression coefficient of a +#' group-meaned variable is the *between-subject-effect*. The de-meaned variable +#' is then the centered version of the group-meaned variable. De-meaning is +#' sometimes also called person-mean centering or centering within clusters. +#' The regression coefficient of a de-meaned variable represents the +#' *within-subject-effect*. +#' +#' @section De-meaning with continuous predictors: +#' +#' For continuous time-varying predictors, the recommendation is to include +#' both their de-meaned and group-meaned versions as fixed effects, but not +#' the raw (untransformed) time-varying predictors themselves. 
The de-meaned +#' predictor should also be included as random effect (random slope). In +#' regression models, the coefficient of the de-meaned predictors indicates +#' the within-subject effect, while the coefficient of the group-meaned +#' predictor indicates the between-subject effect. +#' +#' @section De-meaning with binary predictors: +#' +#' For binary time-varying predictors, there are two recommendations. First +#' is to include the raw (untransformed) binary predictor as fixed effect +#' only and the *de-meaned* variable as random effect (random slope). +#' The alternative would be to add the de-meaned version(s) of binary +#' time-varying covariates as additional fixed effect as well (instead of +#' adding it as random slope). Centering time-varying binary variables to +#' obtain within-effects (level 1) isn't necessary. They have a sensible +#' interpretation when left in the typical 0/1 format (_Hoffmann 2015, +#' chapter 8-2.I_). `demean()` will thus coerce categorical time-varying +#' predictors to numeric to compute the de- and group-meaned versions for +#' these variables, where the raw (untransformed) binary predictor and the +#' de-meaned version should be added to the model. +#' +#' @section De-meaning of factors with more than 2 levels: +#' +#' Factors with more than two levels are demeaned in two ways: first, these +#' are also converted to numeric and de-meaned; second, dummy variables +#' are created (binary, with 0/1 coding for each level) and these binary +#' dummy-variables are de-meaned in the same way (as described above). +#' Packages like **panelr** internally convert factors to dummies before +#' demeaning, so this behaviour can be mimicked here. +#' +#' @section De-meaning interaction terms: +#' +#' There are multiple ways to deal with interaction terms of within- and +#' between-effects. +#' +#' - A classical approach is to simply use the product term of the de-meaned +#' variables (i.e. 
introducing the de-meaned variables as interaction term +#' in the model formula, e.g. `y ~ x_within * time_within`). This approach, +#' however, might be subject to bias (see _Giesselmann & Schmidt-Catran 2020_). +#' +#' - Another option is to first calculate the product term and then apply the +#' de-meaning to it. This approach produces an estimator "that reflects +#' unit-level differences of interacted variables whose moderators vary +#' within units", which is desirable if *no* within interaction of +#' two time-dependent variables is required. This is what `demean()` does +#' internally when `select` contains interaction terms. +#' +#' - A third option, when the interaction should result in a genuine within +#' estimator, is to "double de-mean" the interaction terms +#' (_Giesselmann & Schmidt-Catran 2018_), however, this is currently +#' not supported by `demean()`. If this is required, the `wbm()` +#' function from the **panelr** package should be used. +#' +#' To de-mean interaction terms for within-between models, simply specify +#' the term as interaction for the `select`-argument, e.g. `select = "a*b"` +#' (see 'Examples'). +#' +#' @section De-meaning for cross-classified designs: +#' +#' `demean()` can handle cross-classified designs, where the data has two or +#' more groups at the higher (i.e. second) level. In such cases, the +#' `by`-argument can identify two or more variables that represent the +#' cross-classified group- or cluster-IDs. The de-meaned variables for +#' cross-classified designs are simply subtracting all group means from each +#' individual value, i.e. _fully cluster-mean-centering_ (see _Guo et al. 2024_ +#' for details). Note that de-meaning for cross-classified designs is *not* +#' equivalent to de-meaning of nested data structures from models with three or +#' more levels. Set `nested = TRUE` to explicitly assume a nested design. 
For +#' cross-classified designs, de-meaning is supposed to work for models like +#' `y ~ x + (1|level3) + (1|level2)`, but *not* for models like +#' `y ~ x + (1|level3/level2)`. Note that `demean()` and `degroup()` can't +#' handle a mix of nested and cross-classified designs in one model. +#' +#' @section De-meaning for nested designs: +#' +#' _Brincks et al. (2017)_ have suggested an algorithm to center variables for +#' nested designs, which is implemented in `demean()`. For nested designs, set +#' `nested = TRUE` *and* specify the variables that indicate the different +#' levels in descending order in the `by` argument. E.g., +#' `by = c("level4", "level3", "level2")` assumes a model like +#' `y ~ x + (1|level4/level3/level2)`. An alternative notation for the +#' `by`-argument would be `by = "level4/level3/level2"`, similar to the +#' formula notation. +#' +#' @section Analysing panel data with mixed models using lme4: +#' +#' A description of how to translate the formulas described in *Bell et al. 2018* +#' into R using `lmer()` from **lme4** can be found in +#' [this vignette](https://easystats.github.io/parameters/articles/demean.html). #' #' @references #' #' - Bafumi J, Gelman A. 2006. Fitting Multilevel Models When Predictors -#' and Group Effects Correlate. In. Philadelphia, PA: Annual meeting of the -#' American Political Science Association. +#' and Group Effects Correlate. In. Philadelphia, PA: Annual meeting of the +#' American Political Science Association. #' #' - Bell A, Fairbrother M, Jones K. 2019. Fixed and Random Effects -#' Models: Making an Informed Choice. Quality & Quantity (53); 1051-1074 +#' Models: Making an Informed Choice. Quality & Quantity (53); 1051-1074 #' #' - Bell A, Jones K. 2015. Explaining Fixed Effects: Random Effects -#' Modeling of Time-Series Cross-Sectional and Panel Data. Political Science -#' Research and Methods, 3(1), 133–153. +#' Modeling of Time-Series Cross-Sectional and Panel Data. 
Political Science +#' Research and Methods, 3(1), 133–153. +#' +#' - Brincks, A. M., Enders, C. K., Llabre, M. M., Bulotsky-Shearer, R. J., +#' Prado, G., and Feaster, D. J. (2017). Centering Predictor Variables in +#' Three-Level Contextual Models. Multivariate Behavioral Research, 52(2), +#' 149–163. https://doi.org/10.1080/00273171.2016.1256753 #' #' - Gelman A, Hill J. 2007. Data Analysis Using Regression and -#' Multilevel/Hierarchical Models. Analytical Methods for Social Research. -#' Cambridge, New York: Cambridge University Press +#' Multilevel/Hierarchical Models. Analytical Methods for Social Research. +#' Cambridge, New York: Cambridge University Press #' #' - Giesselmann M, Schmidt-Catran, AW. 2020. Interactions in fixed -#' effects regression models. Sociological Methods & Research, 1–28. -#' https://doi.org/10.1177/0049124120914934 +#' effects regression models. Sociological Methods & Research, 1–28. +#' https://doi.org/10.1177/0049124120914934 +#' +#' - Guo Y, Dhaliwal J, Rights JD. 2024. Disaggregating level-specific effects +#' in cross-classified multilevel models. Behavior Research Methods, 56(4), +#' 3023–3057. #' #' - Heisig JP, Schaeffer M, Giesecke J. 2017. The Costs of Simplicity: -#' Why Multilevel Models May Benefit from Accounting for Cross-Cluster -#' Differences in the Effects of Controls. American Sociological Review 82 -#' (4): 796–827. +#' Why Multilevel Models May Benefit from Accounting for Cross-Cluster +#' Differences in the Effects of Controls. American Sociological Review 82 +#' (4): 796–827. #' #' - Hoffman L. 2015. Longitudinal analysis: modeling within-person -#' fluctuation and change. New York: Routledge +#' fluctuation and change. 
New York: Routledge #' #' @examples #' @@ -223,21 +280,16 @@ demean <- function(x, select, by, + nested = FALSE, suffix_demean = "_within", suffix_groupmean = "_between", add_attributes = TRUE, - verbose = TRUE, - group = NULL) { - ## TODO: remove warning in future release - if (!is.null(group)) { - by <- group - insight::format_warning("Argument `group` is deprecated and will be removed in a future release. Please use `by` instead.") # nolint - } - + verbose = TRUE) { degroup( x = x, select = select, by = by, + nested = nested, center = "mean", suffix_demean = suffix_demean, suffix_groupmean = suffix_groupmean, @@ -247,47 +299,48 @@ demean <- function(x, } - - - - #' @rdname demean #' @export degroup <- function(x, select, by, + nested = FALSE, center = "mean", suffix_demean = "_within", suffix_groupmean = "_between", add_attributes = TRUE, - verbose = TRUE, - group = NULL) { - ## TODO: remove warning later - if (!is.null(group)) { - by <- group - insight::format_warning("Argument `group` is deprecated and will be removed in a future release. Please use `by` instead.") # nolint - } - + verbose = TRUE) { # ugly tibbles again... x <- .coerce_to_dataframe(x) center <- match.arg(tolower(center), choices = c("mean", "median", "mode", "min", "max")) if (inherits(select, "formula")) { - # formula to character, remove "~", split at "+" + # formula to character, remove "~", split at "+". 
We don't use `all.vars()` + # here because we want to keep the interaction terms as they are select <- trimws(unlist( strsplit(gsub("~", "", insight::safe_deparse(select), fixed = TRUE), "+", fixed = TRUE), use.names = FALSE )) } + # handle different "by" options if (inherits(by, "formula")) { by <- all.vars(by) } + # we also allow lme4-syntax here: if by = "L4/L3/L2", we assume a nested design + if (length(by) == 1 && grepl("/", by, fixed = TRUE)) { + by <- insight::trim_ws(unlist(strsplit(by, "/", fixed = TRUE), use.names = FALSE)) + nested <- TRUE + } + + # identify interaction terms interactions_no <- select[!grepl("(\\*|\\:)", select)] interactions_yes <- select[grepl("(\\*|\\:)", select)] + # if we have interaction terms that should be de-meaned, calculate the product + # of the terms first, then demean the product if (length(interactions_yes)) { interaction_terms <- lapply(strsplit(interactions_yes, "*", fixed = TRUE), trimws) product <- lapply(interaction_terms, function(i) do.call(`*`, x[, i])) @@ -296,20 +349,22 @@ degroup <- function(x, select <- c(interactions_no, colnames(new_dat)) } - not_found <- setdiff(select, colnames(x)) - - if (length(not_found) && isTRUE(verbose)) { - insight::format_alert( - sprintf( - "%i variables were not found in the dataset: %s\n", - length(not_found), - toString(not_found) - ) + # check if all variables are present + not_found <- setdiff(c(select, by), colnames(x)) + + if (length(not_found)) { + insight::format_error( + paste0( + "Variable", + ifelse(length(not_found) > 1, "s ", " "), + text_concatenate(not_found, enclose = "\""), + ifelse(length(not_found) > 1, " were", " was"), + " not found in the dataset." + ), + .misspelled_string(colnames(x), not_found, "Possibly misspelled or not yet defined?") ) } - select <- intersect(colnames(x), select) - # get data to demean... 
dat <- x[, c(select, by)] @@ -366,37 +421,92 @@ degroup <- function(x, max = function(.gm) max(.gm, na.rm = TRUE), function(.gm) mean(.gm, na.rm = TRUE) ) - x_gm_list <- lapply(select, function(i) { - stats::ave(dat[[i]], dat[[by]], FUN = gm_fun) - }) - names(x_gm_list) <- select - # create de-meaned variables by subtracting the group mean from each individual value + # we allow disaggregating level-specific effects for cross-classified multilevel + # models (see Guo et al. 2024). Two levels should work as proposed by the authors, + # more levels also already work, but need to check the formula from the paper + # and validate results - x_dm_list <- lapply(select, function(i) dat[[i]] - x_gm_list[[i]]) - names(x_dm_list) <- select + if (length(by) == 1) { + # simple case: one level + group_means_list <- lapply(select, function(i) { + stats::ave(dat[[i]], dat[[by]], FUN = gm_fun) + }) + names(group_means_list) <- select + # create de-meaned variables by subtracting the group mean from each individual value + person_means_list <- lapply(select, function(i) dat[[i]] - group_means_list[[i]]) + } else if (nested) { + # nested design: by > 1, nested is explicitly set to TRUE + # We want: + # L3_between = xbar(k) + # L2_between = xbar(j,k) - xbar(k) + # L1_within = x(ijk) - xbar(jk) + # , where + # x(ijk) is the individual value / variable that is measured on level 1 + # xbar(k) <- ave(x_ijk, L3, FUN = mean), the group mean of the variable at highest level + # xbar(jk) <- ave(x_ijk, L3, L2, FUN = mean), the group mean of the variable at second level + group_means_list <- lapply(select, function(i) { + out <- lapply(seq_along(by), function(k) { + dat$higher_levels <- do.call(paste, c(dat[by[1:k]], list(sep = "_"))) + stats::ave(dat[[i]], dat$higher_levels, FUN = gm_fun) + }) + # subtract mean of higher level from lower level + for (j in 2:length(by)) { + out[[j]] <- out[[j]] - out[[j - 1]] + } + names(out) <- paste0(select, "_", by) + out + }) + # create de-meaned variables 
by subtracting the group mean from each individual value + person_means_list <- lapply( + # seq_along(select), + # function(i) dat[[select[i]]] - group_means_list[[i]][[length(by)]] + select, + function(i) { + dat$higher_levels <- do.call(paste, c(dat[by], list(sep = "_"))) + dat[[i]] - stats::ave(dat[[i]], dat$higher_levels, FUN = gm_fun) + } + ) + } else { + # cross-classified design: by > 1 + group_means_list <- lapply(by, function(j) { + out <- lapply(select, function(i) { + stats::ave(dat[[i]], dat[[j]], FUN = gm_fun) + }) + names(out) <- paste0(select, "_", j) + out + }) + # de-meaned variables for cross-classified design is simply subtracting + # all group means from each individual value + person_means_list <- lapply(seq_along(select), function(i) { + sum_group_means <- do.call(`+`, lapply(group_means_list, function(j) j[[i]])) + dat[[select[i]]] - sum_group_means + }) + } + # preserve names + names(person_means_list) <- select # convert to data frame and add suffix to column names - x_gm <- as.data.frame(x_gm_list) - x_dm <- as.data.frame(x_dm_list) + group_means <- as.data.frame(group_means_list) + person_means <- as.data.frame(person_means_list) - colnames(x_dm) <- sprintf("%s%s", colnames(x_dm), suffix_demean) - colnames(x_gm) <- sprintf("%s%s", colnames(x_gm), suffix_groupmean) + colnames(person_means) <- sprintf("%s%s", colnames(person_means), suffix_demean) + colnames(group_means) <- sprintf("%s%s", colnames(group_means), suffix_groupmean) if (isTRUE(add_attributes)) { - x_dm[] <- lapply(x_dm, function(i) { + person_means[] <- lapply(person_means, function(i) { attr(i, "within-effect") <- TRUE i }) - x_gm[] <- lapply(x_gm, function(i) { + group_means[] <- lapply(group_means, function(i) { attr(i, "between-effect") <- TRUE i }) } - cbind(x_gm, x_dm) + cbind(group_means, person_means) } diff --git a/R/describe_distribution.R b/R/describe_distribution.R index 41f2a8b83..64f6e29c1 100644 --- a/R/describe_distribution.R +++ b/R/describe_distribution.R @@ 
-186,11 +186,24 @@ describe_distribution.numeric <- function(x, # Confidence Intervals if (!is.null(ci)) { insight::check_if_installed("boot") - results <- boot::boot( - data = x, - statistic = .boot_distribution, - R = iterations, - centrality = centrality + results <- tryCatch( + { + boot::boot( + data = x, + statistic = .boot_distribution, + R = iterations, + centrality = centrality + ) + }, + error = function(e) { + msg <- conditionMessage(e) + if (!is.null(msg) && msg == "sample is too sparse to find TD") { + insight::format_warning( + "When bootstrapping CIs, sample was too sparse to find TD. Returning NA for CIs." + ) + list(t = c(NA_real_, NA_real_)) + } + } ) out_ci <- bayestestR::ci(results$t, ci = ci, verbose = FALSE) out <- cbind(out, data.frame(CI_low = out_ci$CI_low[1], CI_high = out_ci$CI_high[1])) @@ -500,7 +513,7 @@ print.parameters_distribution <- function(x, digits = 2, ...) { ci_brackets = TRUE, ... ) - cat(insight::export_table(formatted_table, format = "text", digits = digits)) + cat(insight::export_table(formatted_table, format = "text", digits = digits, ...)) invisible(x) } diff --git a/R/descriptives.R b/R/descriptives.R index 097934d29..43479f697 100644 --- a/R/descriptives.R +++ b/R/descriptives.R @@ -77,7 +77,6 @@ coef_var.default <- function(x, verbose = TRUE, ...) { #' as the nearest endpoint. #' @param remove_na Logical. Should `NA` values be removed before computing (`TRUE`) #' or not (`FALSE`, default)? -#' @param na.rm Deprecated. Please use `remove_na` instead. #' @param n If `method = "unbiased"` and both `mu` and `sigma` are provided (not #' computed from `x`), what sample size to use to adjust the computed CV #' for small-sample bias? @@ -111,13 +110,7 @@ coef_var.default <- function(x, verbose = TRUE, ...) { #' @export coef_var.numeric <- function(x, mu = NULL, sigma = NULL, method = c("standard", "unbiased", "median_mad", "qcd"), - trim = 0, remove_na = FALSE, n = NULL, na.rm = FALSE, ...) 
{ - # TODO: remove deprecated argument later - if (!missing(na.rm)) { - insight::format_warning("Argument `na.rm` is deprecated. Please use `remove_na` instead.") - remove_na <- na.rm - } - + trim = 0, remove_na = FALSE, n = NULL, ...) { # TODO: Support weights if (!missing(x) && all(c(-1, 1) %in% sign(x))) { insight::format_error("Coefficient of variation only applicable for ratio scale variables.") diff --git a/R/extract_column_names.R b/R/extract_column_names.R index b89173a8c..a3d120d3f 100644 --- a/R/extract_column_names.R +++ b/R/extract_column_names.R @@ -9,8 +9,10 @@ #' tasks. Can be either #' #' - a variable specified as a literal variable name (e.g., `column_name`), -#' - a string with the variable name (e.g., `"column_name"`), or a character -#' vector of variable names (e.g., `c("col1", "col2", "col3")`), +#' - a string with the variable name (e.g., `"column_name"`), a character +#' vector of variable names (e.g., `c("col1", "col2", "col3")`), or a +#' character vector of variable names including ranges specified via `:` +#' (e.g., `c("col1:col3", "col5")`), #' - a formula with variable names (e.g., `~column_1 + column_2`), #' - a vector of positive integers, giving the positions counting from the left #' (e.g. 
`1` or `c(1, 3, 5)`), @@ -116,7 +118,7 @@ #' ``` #' #' @examples -#' # Find columns names by pattern +#' # Find column names by pattern #' extract_column_names(iris, starts_with("Sepal")) #' extract_column_names(iris, ends_with("Width")) #' extract_column_names(iris, regex("\\.")) @@ -129,6 +131,9 @@ #' numeric_mean_35 <- function(x) is.numeric(x) && mean(x, na.rm = TRUE) > 3.5 #' extract_column_names(iris, numeric_mean_35) #' +#' # find range of column names by range, using character vector +#' extract_column_names(mtcars, c("cyl:hp", "wt")) +#' +#' # rename returned columns for "data_select()" #' head(data_select(mtcars, c(`Miles per Gallon` = "mpg", Cylinders = "cyl"))) #' @export @@ -160,28 +165,6 @@ extract_column_names <- function(data, columns } - -#' @rdname extract_column_names -#' @export -data_find <- function(data, - select = NULL, - exclude = NULL, - ignore_case = FALSE, - regex = FALSE, - verbose = TRUE, - ...) { - insight::format_warning("Function `data_find()` is deprecated and will be removed in a future release. Please use `extract_column_names()` instead.") # nolint - extract_column_names( - data, - select = select, - exclude = exclude, - ignore_case = ignore_case, - regex = regex, - verbose = verbose, - ... - ) -} - #' @rdname extract_column_names #' @export -find_columns <- data_find +find_columns <- extract_column_names diff --git a/R/mean_sd.R b/R/mean_sd.R index d18473d8d..42ce9b523 100644 --- a/R/mean_sd.R +++ b/R/mean_sd.R @@ -20,23 +20,13 @@ #' median_mad(mtcars$mpg) #' #' @export -mean_sd <- function(x, times = 1L, remove_na = TRUE, named = TRUE, na.rm = TRUE, ...) { - # TODO: remove deprecated argument later - if (!missing(na.rm)) { - insight::format_warning("Argument `na.rm` is deprecated. Please use `remove_na` instead.") - remove_na <- na.rm - } +mean_sd <- function(x, times = 1L, remove_na = TRUE, named = TRUE, ...) 
{ .centrality_dispersion(x, type = "mean", times = times, remove_na = remove_na, named = named) } #' @export #' @rdname mean_sd -median_mad <- function(x, times = 1L, remove_na = TRUE, constant = 1.4826, named = TRUE, na.rm = TRUE, ...) { - # TODO: remove deprecated argument later - if (!missing(na.rm)) { - insight::format_warning("Argument `na.rm` is deprecated. Please use `remove_na` instead.") - remove_na <- na.rm - } +median_mad <- function(x, times = 1L, remove_na = TRUE, constant = 1.4826, named = TRUE, ...) { .centrality_dispersion(x, type = "median", times = times, remove_na = remove_na, constant = constant, named = named) } diff --git a/R/means_by_group.R b/R/means_by_group.R index ad188f275..39416bb11 100644 --- a/R/means_by_group.R +++ b/R/means_by_group.R @@ -19,7 +19,6 @@ #' @param digits Optional scalar, indicating the amount of digits after decimal #' point when rounding estimates and values. #' @param ... Currently not used -#' @param group Deprecated. Use `by` instead. #' @inheritParams find_columns #' #' @return A data frame with information on mean and further summary statistics @@ -60,14 +59,7 @@ means_by_group.numeric <- function(x, ci = 0.95, weights = NULL, digits = NULL, - group = NULL, ...) { - ## TODO: remove warning in future release - if (!is.null(group)) { - by <- group - insight::format_warning("Argument `group` is deprecated and will be removed in a future release. Please use `by` instead.") # nolint - } - # validation check for arguments # "by" must be provided @@ -139,14 +131,7 @@ means_by_group.data.frame <- function(x, ignore_case = FALSE, regex = FALSE, verbose = TRUE, - group = NULL, ...) { - ## TODO: remove warning in future release - if (!is.null(group)) { - by <- group - insight::format_warning("Argument `group` is deprecated and will be removed in a future release. 
Please use `by` instead.") # nolint - } - # evaluate select/exclude, may be select-helpers select <- .select_nse(select, x, diff --git a/R/recode_values.R b/R/recode_values.R index b4570bf44..a8e8d6d3b 100644 --- a/R/recode_values.R +++ b/R/recode_values.R @@ -527,35 +527,3 @@ recode_values.data.frame <- function(x, ok } - - -## TODO Deprecate and remove alias later - -#' @rdname recode_values -#' @export -change_code <- function(x, - select = NULL, - exclude = NULL, - recode = NULL, - default = NULL, - preserve_na = TRUE, - append = FALSE, - ignore_case = FALSE, - regex = FALSE, - verbose = TRUE, - ...) { - insight::format_warning("Function `change_code()` is deprecated. Please use `recode_values()` instead.") # nolint - recode_values( - x, - select = select, - exclude = exclude, - recode = recode, - default = default, - preserve_na = preserve_na, - append = append, - ignore_case = ignore_case, - regex = regex, - verbose = verbose, - ... - ) -} diff --git a/R/rescale_weights.R b/R/rescale_weights.R index 02aab1d2e..60d405c9d 100644 --- a/R/rescale_weights.R +++ b/R/rescale_weights.R @@ -20,7 +20,6 @@ #' @param nest Logical, if `TRUE` and `by` indicates at least two #' group variables, then groups are "nested", i.e. groups are now a #' combination from each group level of the variables in `by`. -#' @param group Deprecated. Use `by` instead. #' #' @return `data`, including the new weighting variables: `pweights_a` #' and `pweights_b`, which represent the rescaled design weights to use @@ -88,13 +87,7 @@ #' ) #' } #' @export -rescale_weights <- function(data, by, probability_weights, nest = FALSE, group = NULL) { - ## TODO: remove warning in future release - if (!is.null(group)) { - by <- group - insight::format_warning("Argument `group` is deprecated and will be removed in a future release. 
Please use `by` instead.") # nolint - } - +rescale_weights <- function(data, by, probability_weights, nest = FALSE) { if (inherits(by, "formula")) { by <- all.vars(by) } diff --git a/R/reshape_ci.R b/R/reshape_ci.R index 99a670a2d..dcfc729a8 100644 --- a/R/reshape_ci.R +++ b/R/reshape_ci.R @@ -43,15 +43,20 @@ reshape_ci <- function(x, ci_type = "CI") { # Reshape if (length(unique(x$CI)) > 1) { if ("Parameter" %in% names(x)) { + idvar <- "Parameter" remove_parameter <- FALSE - } else { + } else if (is.null(attr(x, "idvars"))) { + idvar <- "Parameter" x$Parameter <- NA remove_parameter <- TRUE + } else { + idvar <- attr(x, "idvars") + remove_parameter <- FALSE } x <- stats::reshape( x, - idvar = "Parameter", + idvar = idvar, timevar = "CI", direction = "wide", v.names = c(ci_low, ci_high), diff --git a/R/row_count.R b/R/row_count.R new file mode 100644 index 000000000..02b1c16dc --- /dev/null +++ b/R/row_count.R @@ -0,0 +1,124 @@ +#' @title Count specific values row-wise +#' @name row_count +#' @description `row_count()` mimics base R's `rowSums()`, with sums for a +#' specific value indicated by `count`. Hence, it is similar to +#' `rowSums(x == count, na.rm = TRUE)`, but offers some more options, including +#' strict comparisons. Comparisons using `==` coerce values to atomic vectors, +#' thus both `2 == 2` and `"2" == 2` are `TRUE`. In `row_count()`, it is also +#' possible to make "type safe" comparisons using the `allow_coercion` argument, +#' where `"2" == 2` is not true. +#' +#' @param data A data frame with at least two columns, where number of specific +#' values are counted row-wise. +#' @param count The value for which the row sum should be computed. May be a +#' numeric value, a character string (for factors or character vectors), `NA` or +#' `Inf`. +#' @param allow_coercion Logical. If `FALSE`, `count` matches only values of same +#' class (i.e. when `count = 2`, the value `"2"` is not counted and vice versa). 
+#' By default, when `allow_coercion = TRUE`, `count = 2` also matches `"2"`. In +#' order to count factor levels in the data, use `count = factor("level")`. See +#' 'Examples'. +#' +#' @inheritParams extract_column_names +#' @inheritParams row_means +#' +#' @return A vector with row-wise counts of values specified in `count`. +#' +#' @examples +#' dat <- data.frame( +#' c1 = c(1, 2, NA, 4), +#' c2 = c(NA, 2, NA, 5), +#' c3 = c(NA, 4, NA, NA), +#' c4 = c(2, 3, 7, 8) +#' ) +#' +#' # count all 4s per row +#' row_count(dat, count = 4) +#' # count all missing values per row +#' row_count(dat, count = NA) +#' +#' dat <- data.frame( +#' c1 = c("1", "2", NA, "3"), +#' c2 = c(NA, "2", NA, "3"), +#' c3 = c(NA, 4, NA, NA), +#' c4 = c(2, 3, 7, Inf) +#' ) +#' # count all 2s and "2"s per row +#' row_count(dat, count = 2) +#' # only count 2s, but not "2"s +#' row_count(dat, count = 2, allow_coercion = FALSE) +#' +#' dat <- data.frame( +#' c1 = factor(c("1", "2", NA, "3")), +#' c2 = c("2", "1", NA, "3"), +#' c3 = c(NA, 4, NA, NA), +#' c4 = c(2, 3, 7, Inf) +#' ) +#' # find only character "2"s +#' row_count(dat, count = "2", allow_coercion = FALSE) +#' # find only factor level "2"s +#' row_count(dat, count = factor("2"), allow_coercion = FALSE) +#' +#' @export +row_count <- function(data, + select = NULL, + exclude = NULL, + count = NULL, + allow_coercion = TRUE, + ignore_case = FALSE, + regex = FALSE, + verbose = TRUE) { + # evaluate arguments + select <- .select_nse(select, + data, + exclude, + ignore_case = ignore_case, + regex = regex, + verbose = verbose + ) + + if (is.null(count)) { + insight::format_error("`count` must be a valid value (including `NA` or `Inf`), but not `NULL`.") + } + + if (is.null(select) || length(select) == 0) { + insight::format_error("No columns selected.") + } + + data <- .coerce_to_dataframe(data[select]) + + # check if we have a data frame with at least one row + if (nrow(data) < 1) { + insight::format_error("`data` must be a data frame with at 
least one row.") + } + + # check if we have a data framme with at least two columns + if (ncol(data) < 2) { + insight::format_error("`data` must be a data frame with at least two numeric columns.") + } + # special case: count missing + if (is.na(count)) { + rowSums(is.na(data)) + } else { + # comparisons in R using == coerce values into a atomic vector, i.e. + # 2 == "2" is TRUE. If `allow_coercion = FALSE`, we only want 2 == 2 or + # "2" == "2" (i.e. we want exact types to be compared only) + if (isFALSE(allow_coercion)) { + # we need the "type" of the count-value - we use class() instead of typeof(), + # because the latter sometimes returns unsuitable classes/types. compare + # typeof(as.Date("2020-01-01")), which returns "double". + count_type <- class(count)[1] + valid_columns <- vapply(data, inherits, TRUE, what = count_type) + # check if any columns left? + if (!any(valid_columns)) { + insight::format_error("No column has same type as the value provided in `count`. Set `allow_coercion = TRUE` or specify a valid value for `count`.") # nolint + } + data <- data[valid_columns] + } + # coerce - we have only valid columns anyway, and we need to coerce factors + # to vectors, else comparison with `==` errors. + count <- as.vector(count) + # finally, count + rowSums(data == count, na.rm = TRUE) + } +} diff --git a/R/row_means.R b/R/row_means.R index 4d2876c6a..729c800be 100644 --- a/R/row_means.R +++ b/R/row_means.R @@ -1,15 +1,16 @@ -#' @title Row means (optionally with minimum amount of valid values) +#' @title Row means or sums (optionally with minimum amount of valid values) #' @name row_means -#' @description This function is similar to the SPSS `MEAN.n` function and computes -#' row means from a data frame or matrix if at least `min_valid` values of a row are -#' valid (and not `NA`). 
+#' @description This function is similar to the SPSS `MEAN.n` or `SUM.n` +#' function and computes row means or row sums from a data frame or matrix if at +#' least `min_valid` values of a row are valid (and not `NA`). #' -#' @param data A data frame with at least two columns, where row means are applied. +#' @param data A data frame with at least two columns, where row means or row +#' sums are applied. #' @param min_valid Optional, a numeric value of length 1. May either be #' - a numeric value that indicates the amount of valid values per row to -#' calculate the row mean; +#' calculate the row mean or row sum; #' - or a value between `0` and `1`, indicating a proportion of valid values per -#' row to calculate the row mean (see 'Details'). +#' row to calculate the row mean or row sum (see 'Details'). #' - `NULL` (default), in which all cases are considered. #' #' If a row's sum of valid values is less than `min_valid`, `NA` will be returned. @@ -17,21 +18,24 @@ #' used for rounding mean values. Negative values are allowed (see 'Details'). #' By default, `digits = NULL` and no rounding is used. #' @param remove_na Logical, if `TRUE` (default), removes missing (`NA`) values -#' before calculating row means. Only applies if `min_valuid` is not specified. +#' before calculating row means or row sums. Only applies if `min_valid` is not +#' specified. #' @param verbose Toggle warnings. #' @inheritParams extract_column_names #' -#' @return A vector with row means for those rows with at least `n` valid values. +#' @return A vector with row means (for `row_means()`) or row sums (for +#' `row_sums()`) for those rows with at least `n` valid values. #' -#' @details Rounding to a negative number of `digits` means rounding to a power of -#' ten, for example `row_means(df, 3, digits = -2)` rounds to the nearest hundred. -#' For `min_valid`, if not `NULL`, `min_valid` must be a numeric value from `0` -#' to `ncol(data)`. 
If a row in the data frame has at least `min_valid` -#' non-missing values, the row mean is returned. If `min_valid` is a non-integer -#' value from 0 to 1, `min_valid` is considered to indicate the proportion of -#' required non-missing values per row. E.g., if `min_valid = 0.75`, a row must -#' have at least `ncol(data) * min_valid` non-missing values for the row mean -#' to be calculated. See 'Examples'. +#' @details Rounding to a negative number of `digits` means rounding to a power +#' of ten, for example `row_means(df, 3, digits = -2)` rounds to the nearest +#' hundred. For `min_valid`, if not `NULL`, `min_valid` must be a numeric value +#' from `0` to `ncol(data)`. If a row in the data frame has at least `min_valid` +#' non-missing values, the row mean or row sum is returned. If `min_valid` is a +#' non-integer value from 0 to 1, `min_valid` is considered to indicate the +#' proportion of required non-missing values per row. E.g., if +#' `min_valid = 0.75`, a row must have at least `ncol(data) * min_valid` +#' non-missing values for the row mean or row sum to be calculated. See +#' 'Examples'. 
#' #' @examples #' dat <- data.frame( @@ -49,6 +53,7 @@ #' #' # needs at least 4 non-missing values per row #' row_means(dat, min_valid = 4) # 1 valid return value +#' row_sums(dat, min_valid = 4) # 1 valid return value #' #' # needs at least 3 non-missing values per row #' row_means(dat, min_valid = 3) # 2 valid return values @@ -61,6 +66,7 @@ #' #' # needs at least 50% of non-missing values per row #' row_means(dat, min_valid = 0.5) # 3 valid return values +#' row_sums(dat, min_valid = 0.5) #' #' # needs at least 75% of non-missing values per row #' row_means(dat, min_valid = 0.75) # 2 valid return values @@ -84,34 +90,52 @@ row_means <- function(data, verbose = verbose ) - if (is.null(select) || length(select) == 0) { - insight::format_error("No columns selected.") - } + # prepare data, sanity checks + data <- .prepare_row_data(data, select, min_valid, verbose) - data <- .coerce_to_dataframe(data[select]) + # calculate row means + .row_sums_or_means(data, min_valid, digits, remove_na, fun = "mean") +} - # n must be a numeric, non-missing value - if (!is.null(min_valid) && (all(is.na(min_valid)) || !is.numeric(min_valid) || length(min_valid) > 1)) { - insight::format_error("`min_valid` must be a numeric value of length 1.") - } - # make sure we only have numeric values - numeric_columns <- vapply(data, is.numeric, TRUE) - if (!all(numeric_columns)) { - if (verbose) { - insight::format_alert("Only numeric columns are considered for calculation.") - } - data <- data[numeric_columns] - } +#' @rdname row_means +#' @export +row_sums <- function(data, + select = NULL, + exclude = NULL, + min_valid = NULL, + digits = NULL, + ignore_case = FALSE, + regex = FALSE, + remove_na = FALSE, + verbose = TRUE) { + # evaluate arguments + select <- .select_nse(select, + data, + exclude, + ignore_case = ignore_case, + regex = regex, + verbose = verbose + ) + + # prepare data, sanity checks + data <- .prepare_row_data(data, select, min_valid, verbose) + + # calculate row sums + 
.row_sums_or_means(data, min_valid, digits, remove_na, fun = "sum") +} - # check if we have a data framme with at least two columns - if (ncol(data) < 2) { - insight::format_error("`data` must be a data frame with at least two numeric columns.") - } - # proceed here if min_valid is not NULL +# helper ------------------------ + +# calculate row means or sums +.row_sums_or_means <- function(data, min_valid, digits, remove_na, fun) { if (is.null(min_valid)) { - out <- rowMeans(data, na.rm = remove_na) + # calculate row means or sums for complete data + out <- switch(fun, + mean = rowMeans(data, na.rm = remove_na), + rowSums(data, na.rm = remove_na) + ) } else { # is 'min_valid' indicating a proportion? decimals <- min_valid %% 1 @@ -124,9 +148,12 @@ row_means <- function(data, insight::format_error("`min_valid` must be smaller or equal to number of columns in data frame.") } - # row means + # row means or sums to_na <- rowSums(is.na(data)) > ncol(data) - min_valid - out <- rowMeans(data, na.rm = TRUE) + out <- switch(fun, + mean = rowMeans(data, na.rm = TRUE), + rowSums(data, na.rm = TRUE) + ) out[to_na] <- NA } @@ -137,3 +164,34 @@ row_means <- function(data, out } + + +# check that data is in shape for row means or row sums +.prepare_row_data <- function(data, select, min_valid, verbose) { + if (is.null(select) || length(select) == 0) { + insight::format_error("No columns selected.") + } + + data <- .coerce_to_dataframe(data[select]) + + # n must be a numeric, non-missing value + if (!is.null(min_valid) && (all(is.na(min_valid)) || !is.numeric(min_valid) || length(min_valid) > 1)) { + insight::format_error("`min_valid` must be a numeric value of length 1.") + } + + # make sure we only have numeric values + numeric_columns <- vapply(data, is.numeric, TRUE) + if (!all(numeric_columns)) { + if (verbose) { + insight::format_alert("Only numeric columns are considered for calculation.") + } + data <- data[numeric_columns] + } + + # check if we have a data framme with at 
least two columns + if (ncol(data) < 2) { + insight::format_error("`data` must be a data frame with at least two numeric columns.") + } + + data +} diff --git a/R/select_nse.R b/R/select_nse.R index 8f9eba096..5120691a9 100644 --- a/R/select_nse.R +++ b/R/select_nse.R @@ -139,6 +139,7 @@ # Possibilities: # - quoted variable name # - quoted variable name with ignore case +# - quoted variable name with colon, to indicate range # - character that should be regex-ed on variable names # - special word "all" to return all vars @@ -146,31 +147,63 @@ # use colnames because names() doesn't work for matrices columns <- colnames(data) if (isTRUE(regex)) { + # string is a regular expression grep(x, columns) } else if (length(x) == 1L && x == "all") { + # string is "all" - select all columns seq_along(data) + } else if (any(grepl(":", x, fixed = TRUE))) { + # special pattern, as string (e.g.select = c("cyl:hp", "am")). However, + # this will first go into `.eval_call()` and thus only single elements + # are passed in `x` - we have never a character *vector* here + # check for valid names + colon_vars <- unlist(strsplit(x, ":", fixed = TRUE)) + colon_match <- match(colon_vars, columns) + if (anyNA(colon_match)) { + .warn_not_found(colon_vars, columns, colon_match, verbose) + matches <- NA + } else { + start_pos <- match(colon_vars[1], columns) + end_pos <- match(colon_vars[2], columns) + if (!is.na(start_pos) && !is.na(end_pos)) { + matches <- start_pos:end_pos + } else { + matches <- NA + } + } + matches[!is.na(matches)] } else if (isTRUE(ignore_case)) { + # find columns, case insensitive matches <- match(toupper(x), toupper(columns)) matches[!is.na(matches)] } else { + # find columns, case sensitive matches <- match(x, columns) - if (anyNA(matches) && verbose) { - insight::format_warning( - paste0( - "Following variable(s) were not found: ", - toString(x[is.na(matches)]) - ), - .misspelled_string( - columns, - x[is.na(matches)], - default_message = "Possibly misspelled?" 
- ) - ) + if (anyNA(matches)) { + .warn_not_found(x, columns, matches, verbose) } matches[!is.na(matches)] } } +# small helper, to avoid duplicated code +.warn_not_found <- function(x, columns, matches, verbose = TRUE) { + if (verbose) { + insight::format_warning( + paste0( + "Following variable(s) were not found: ", + toString(x[is.na(matches)]) + ), + .misspelled_string( + columns, + x[is.na(matches)], + default_message = "Possibly misspelled?" + ) + ) + } +} + + # 3 types of symbols: # - unquoted variables # - objects that need to be evaluated, e.g data_find(iris, i) where diff --git a/R/skewness_kurtosis.R b/R/skewness_kurtosis.R index 6142c59ad..23ced0a04 100644 --- a/R/skewness_kurtosis.R +++ b/R/skewness_kurtosis.R @@ -110,15 +110,7 @@ skewness.numeric <- function(x, type = "2", iterations = NULL, verbose = TRUE, - na.rm = TRUE, ...) { - # TODO: remove deprecated argument later - if (!missing(na.rm)) { - # TODO: add deprecation warning in a later update - insight::format_warning("Argument `na.rm` is deprecated and will be removed in a future release. Please use `remove_na` instead.") # nolint - remove_na <- na.rm - } - if (remove_na) x <- x[!is.na(x)] n <- length(x) out <- (sum((x - mean(x))^3) / n) / (sum((x - mean(x))^2) / n)^1.5 @@ -177,15 +169,7 @@ skewness.matrix <- function(x, remove_na = TRUE, type = "2", iterations = NULL, - na.rm = TRUE, ...) { - # TODO: remove deprecated argument later - if (!missing(na.rm)) { - # TODO: add deprecation warning in a later update - insight::format_warning("Argument `na.rm` is deprecated and will be removed in a future release. Please use `remove_na` instead.") # nolint - remove_na <- na.rm - } - .skewness <- apply( x, 2, @@ -213,15 +197,7 @@ skewness.data.frame <- function(x, remove_na = TRUE, type = "2", iterations = NULL, - na.rm = TRUE, ...) 
{ - # TODO: remove deprecated argument later - if (!missing(na.rm)) { - # TODO: add deprecation warning in a later update - insight::format_warning("Argument `na.rm` is deprecated and will be removed in a future release. Please use `remove_na` instead.") # nolint - remove_na <- na.rm - } - .skewness <- lapply(x, skewness, remove_na = remove_na, @@ -241,15 +217,7 @@ skewness.default <- function(x, remove_na = TRUE, type = "2", iterations = NULL, - na.rm = TRUE, ...) { - # TODO: remove deprecated argument later - if (!missing(na.rm)) { - # TODO: add deprecation warning in a later update - insight::format_warning("Argument `na.rm` is deprecated and will be removed in a future release. Please use `remove_na` instead.") # nolint - remove_na <- na.rm - } - skewness( .factor_to_numeric(x), remove_na = remove_na, @@ -277,15 +245,7 @@ kurtosis.numeric <- function(x, type = "2", iterations = NULL, verbose = TRUE, - na.rm = TRUE, ...) { - # TODO: remove deprecated argument later - if (!missing(na.rm)) { - # TODO: add deprecation warning in a later update - insight::format_warning("Argument `na.rm` is deprecated and will be removed in a future release. Please use `remove_na` instead.") # nolint - remove_na <- na.rm - } - if (remove_na) x <- x[!is.na(x)] n <- length(x) out <- n * sum((x - mean(x))^4) / (sum((x - mean(x))^2)^2) @@ -342,15 +302,7 @@ kurtosis.matrix <- function(x, remove_na = TRUE, type = "2", iterations = NULL, - na.rm = TRUE, ...) { - # TODO: remove deprecated argument later - if (!missing(na.rm)) { - # TODO: add deprecation warning in a later update - insight::format_warning("Argument `na.rm` is deprecated and will be removed in a future release. Please use `remove_na` instead.") # nolint - remove_na <- na.rm - } - .kurtosis <- apply( x, 2, @@ -374,15 +326,7 @@ kurtosis.data.frame <- function(x, remove_na = TRUE, type = "2", iterations = NULL, - na.rm = TRUE, ...) 
{ - # TODO: remove deprecated argument later - if (!missing(na.rm)) { - # TODO: add deprecation warning in a later update - insight::format_warning("Argument `na.rm` is deprecated and will be removed in a future release. Please use `remove_na` instead.") # nolint - remove_na <- na.rm - } - .kurtosis <- lapply(x, kurtosis, remove_na = remove_na, @@ -400,15 +344,7 @@ kurtosis.default <- function(x, remove_na = TRUE, type = "2", iterations = NULL, - na.rm = TRUE, ...) { - # TODO: remove deprecated argument later - if (!missing(na.rm)) { - # TODO: add deprecation warning in a later update - insight::format_warning("Argument `na.rm` is deprecated and will be removed in a future release. Please use `remove_na` instead.") # nolint - remove_na <- na.rm - } - kurtosis( .factor_to_numeric(x), remove_na = remove_na, diff --git a/R/standardize.models.R b/R/standardize.models.R index 6f5a1dfa8..cf6062c78 100644 --- a/R/standardize.models.R +++ b/R/standardize.models.R @@ -78,6 +78,14 @@ standardize.default <- function(x, return(x) } + # check model formula. Some notations don't work when standardizing data + insight::formula_ok( + x, + action = "error", + prefix_msg = "Model cannot be standardized.", + verbose = verbose + ) + data_std <- NULL # needed to avoid note .standardize_models(x, robust = robust, two_sd = two_sd, @@ -197,7 +205,7 @@ standardize.default <- function(x, ## ---- STANDARDIZE! 
---- - w <- insight::get_weights(x, na_rm = TRUE) + w <- insight::get_weights(x, remove_na = TRUE) data_std <- standardize(data[do_standardize], robust = robust, diff --git a/R/text_format.R b/R/text_format.R index afdf4f861..0fa75bcac 100644 --- a/R/text_format.R +++ b/R/text_format.R @@ -42,15 +42,6 @@ text_format <- function(text, sep = ", ", last = " and ", width = NULL, enclose text_wrap(text_concatenate(text, sep = sep, last = last, enclose = enclose), width = width) } -## TODO Deprecate and remove alias later - -#' @rdname text_format -#' @export -format_text <- function(text, sep = ", ", last = " and ", width = NULL, enclose = NULL, ...) { - insight::format_warning("Function `format_text()` is deprecated and will be removed in a future release. Please use `text_format()` instead.") # nolint - text_format(text, sep = sep, last = last, width = width, enclose = enclose, ...) -} - #' @rdname text_format #' @export text_fullstop <- function(text) { diff --git a/R/to_numeric.R b/R/to_numeric.R index e38e12e80..3e75bccbd 100644 --- a/R/to_numeric.R +++ b/R/to_numeric.R @@ -17,11 +17,11 @@ #' @inheritParams extract_column_names #' @inheritParams categorize #' -#' @note By default, `to_numeric()` converts factors into "binary" dummies, i.e. +#' @note When factors should be converted into multiple "binary" dummies, i.e. #' each factor level is converted into a separate column filled with a binary -#' 0-1 value. If only one column is required, use `dummy_factors = FALSE`. If -#' you want to preserve the original factor levels (in case these represent -#' numeric values), use `preserve_levels = TRUE`. +#' 0-1 value, set `dummy_factors = TRUE`. If you want to preserve the original +#' factor levels (in case these represent numeric values), use +#' `preserve_levels = TRUE`. 
#' #' @section Selection of variables - `select` argument: #' For most functions that have a `select` argument the complete input data @@ -34,12 +34,12 @@ #' #' @examples #' to_numeric(head(ToothGrowth)) -#' to_numeric(head(ToothGrowth), dummy_factors = FALSE) +#' to_numeric(head(ToothGrowth), dummy_factors = TRUE) #' #' # factors #' x <- as.factor(mtcars$gear) -#' to_numeric(x, dummy_factors = FALSE) -#' to_numeric(x, dummy_factors = FALSE, preserve_levels = TRUE) +#' to_numeric(x) +#' to_numeric(x, preserve_levels = TRUE) #' # same as: #' coerce_to_numeric(x) #' @@ -69,7 +69,7 @@ to_numeric.default <- function(x, verbose = TRUE, ...) { to_numeric.data.frame <- function(x, select = NULL, exclude = NULL, - dummy_factors = TRUE, + dummy_factors = FALSE, preserve_levels = FALSE, lowest = NULL, append = FALSE, @@ -191,7 +191,7 @@ to_numeric.POSIXlt <- to_numeric.Date #' @export to_numeric.factor <- function(x, - dummy_factors = TRUE, + dummy_factors = FALSE, preserve_levels = FALSE, lowest = NULL, verbose = TRUE, diff --git a/README.Rmd b/README.Rmd index ec0d01df7..39b8825ad 100644 --- a/README.Rmd +++ b/README.Rmd @@ -19,7 +19,7 @@ library(datawizard) [![DOI](https://joss.theoj.org/papers/10.21105/joss.04684/status.svg)](https://doi.org/10.21105/joss.04684) [![downloads](http://cranlogs.r-pkg.org/badges/datawizard)](https://cran.r-project.org/package=datawizard) -[![total](https://cranlogs.r-pkg.org/badges/grand-total/datawizard)](https://cranlogs.r-pkg.org/) [![lifecycle](https://img.shields.io/badge/lifecycle-maturing-blue.svg)](https://lifecycle.r-lib.org/articles/stages.html) +[![total](https://cranlogs.r-pkg.org/badges/grand-total/datawizard)](https://cranlogs.r-pkg.org/) diff --git a/README.md b/README.md index 712449df9..dd046ca12 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,6 @@ [![DOI](https://joss.theoj.org/papers/10.21105/joss.04684/status.svg)](https://doi.org/10.21105/joss.04684) 
[![downloads](http://cranlogs.r-pkg.org/badges/datawizard)](https://cran.r-project.org/package=datawizard) [![total](https://cranlogs.r-pkg.org/badges/grand-total/datawizard)](https://cranlogs.r-pkg.org/) -[![lifecycle](https://img.shields.io/badge/lifecycle-maturing-blue.svg)](https://lifecycle.r-lib.org/articles/stages.html) @@ -50,11 +49,11 @@ It covers two aspects of data preparation: badge](https://easystats.r-universe.dev/badges/datawizard)](https://easystats.r-universe.dev) [![R-CMD-check](https://github.com/easystats/datawizard/workflows/R-CMD-check/badge.svg?branch=main)](https://github.com/easystats/datawizard/actions) -| Type | Source | Command | -|----|----|----| -| Release | CRAN | `install.packages("datawizard")` | +| Type | Source | Command | +|-------------|------------|------------------------------------------------------------------------------| +| Release | CRAN | `install.packages("datawizard")` | | Development | r-universe | `install.packages("datawizard", repos = "https://easystats.r-universe.dev")` | -| Development | GitHub | `remotes::install_github("easystats/datawizard")` | +| Development | GitHub | `remotes::install_github("easystats/datawizard")` | > **Tip** > @@ -71,9 +70,10 @@ To cite the package, run the following command: citation("datawizard") To cite package 'datawizard' in publications use: - Patil et al., (2022). datawizard: An R Package for Easy Data - Preparation and Statistical Transformations. Journal of Open Source - Software, 7(78), 4684, https://doi.org/10.21105/joss.04684 + Patil et al., (2022). datawizard: An R Package for Easy + Data Preparation and Statistical Transformations. 
Journal + of Open Source Software, 7(78), 4684, + https://doi.org/10.21105/joss.04684 A BibTeX entry for LaTeX users is @@ -136,9 +136,6 @@ columns, can be achieved using `extract_column_names()` or # find column names matching a pattern extract_column_names(iris, starts_with("Sepal")) #> [1] "Sepal.Length" "Sepal.Width" -``` - -``` r # return data columns matching a pattern data_select(iris, starts_with("Sepal")) |> head() @@ -156,10 +153,8 @@ It is also possible to extract one or more variables: ``` r # single variable data_extract(mtcars, "gear") -#> [1] 4 4 4 3 3 3 3 4 4 4 4 3 3 3 3 3 3 4 4 4 3 3 3 3 3 4 5 5 5 5 5 4 -``` - -``` r +#> [1] 4 4 4 3 3 3 3 4 4 4 4 3 3 3 3 3 3 4 4 4 3 3 3 3 3 4 5 5 5 5 5 +#> [32] 4 # more variables head(data_extract(iris, ends_with("Width"))) @@ -220,17 +215,11 @@ x #> 1 1 a 5 1 #> 2 2 b 6 2 #> 3 3 c 7 3 -``` - -``` r y #> c d e id #> 1 6 f 100 2 #> 2 7 g 101 3 #> 3 8 h 102 4 -``` - -``` r data_merge(x, y, join = "full") #> a b c id d e @@ -238,50 +227,32 @@ data_merge(x, y, join = "full") #> 1 2 b 6 2 f 100 #> 2 3 c 7 3 g 101 #> 4 NA 8 4 h 102 -``` - -``` r data_merge(x, y, join = "left") #> a b c id d e #> 3 1 a 5 1 NA #> 1 2 b 6 2 f 100 #> 2 3 c 7 3 g 101 -``` - -``` r data_merge(x, y, join = "right") #> a b c id d e #> 1 2 b 6 2 f 100 #> 2 3 c 7 3 g 101 #> 3 NA 8 4 h 102 -``` - -``` r data_merge(x, y, join = "semi", by = "c") #> a b c id #> 2 2 b 6 2 #> 3 3 c 7 3 -``` - -``` r data_merge(x, y, join = "anti", by = "c") #> a b c id #> 1 1 a 5 1 -``` - -``` r data_merge(x, y, join = "inner") #> a b c id d e #> 1 2 b 6 2 f 100 #> 2 3 c 7 3 g 101 -``` - -``` r data_merge(x, y, join = "bind") #> a b c id d e @@ -322,17 +293,28 @@ data_to_wide(long_data, values_from = "value", id_cols = "Row_ID" ) -#> Row_ID X1 X2 X3 X4 X5 -#> 1 1 -0.08281164 -1.12490028 -0.70632036 -0.7027895 0.07633326 -#> 2 2 1.93468099 -0.87430362 0.96687656 0.2998642 -0.23035595 -#> 3 3 -2.05128979 0.04386162 -0.71016648 1.1494697 0.31746484 -#> 4 4 0.27773897 
-0.58397514 -0.05917365 -0.3016415 -1.59268440 -#> 5 5 -1.52596060 -0.82329858 -0.23094342 -0.5473394 -0.18194062 -#> 6 6 -0.26916362 0.11059280 0.69200045 -0.3854041 1.75614174 -#> 7 7 1.23305388 0.36472778 1.35682290 0.2763720 0.11394932 -#> 8 8 0.63360774 0.05370100 1.78872284 0.1518608 -0.29216508 -#> 9 9 0.35271746 1.36867235 0.41071582 -0.4313808 1.75409316 -#> 10 10 -0.56048248 -0.38045724 -2.18785470 -1.8705001 1.80958455 +#> Row_ID X1 X2 X3 X4 +#> 1 1 -0.08281164 -1.12490028 -0.70632036 -0.7027895 +#> 2 2 1.93468099 -0.87430362 0.96687656 0.2998642 +#> 3 3 -2.05128979 0.04386162 -0.71016648 1.1494697 +#> 4 4 0.27773897 -0.58397514 -0.05917365 -0.3016415 +#> 5 5 -1.52596060 -0.82329858 -0.23094342 -0.5473394 +#> 6 6 -0.26916362 0.11059280 0.69200045 -0.3854041 +#> 7 7 1.23305388 0.36472778 1.35682290 0.2763720 +#> 8 8 0.63360774 0.05370100 1.78872284 0.1518608 +#> 9 9 0.35271746 1.36867235 0.41071582 -0.4313808 +#> 10 10 -0.56048248 -0.38045724 -2.18785470 -1.8705001 +#> X5 +#> 1 0.07633326 +#> 2 -0.23035595 +#> 3 0.31746484 +#> 4 -1.59268440 +#> 5 -0.18194062 +#> 6 1.75614174 +#> 7 0.11394932 +#> 8 -0.29216508 +#> 9 1.75409316 +#> 10 1.80958455 ``` ### Empty rows and columns @@ -352,22 +334,13 @@ tmp #> 3 3 3 NA 3 #> 4 NA NA NA NA #> 5 5 5 NA 5 -``` - -``` r # indices of empty columns or rows empty_columns(tmp) #> c #> 3 -``` - -``` r empty_rows(tmp) #> [1] 4 -``` - -``` r # remove empty columns or rows remove_empty_columns(tmp) @@ -377,18 +350,12 @@ remove_empty_columns(tmp) #> 3 3 3 3 #> 4 NA NA NA #> 5 5 5 5 -``` - -``` r remove_empty_rows(tmp) #> a b c d #> 1 1 1 NA 1 #> 2 2 NA NA NA #> 3 3 3 NA 3 #> 5 5 5 NA 5 -``` - -``` r # remove empty columns and rows remove_empty(tmp) @@ -409,9 +376,6 @@ table(x) #> x #> 1 2 3 4 5 6 7 8 9 10 #> 2 3 5 3 7 5 5 2 11 7 -``` - -``` r # cut into 3 groups, based on distribution (quantiles) table(categorize(x, split = "quantile", n_groups = 3)) @@ -445,26 +409,23 @@ summary(swiss) #> Mean : 41.144 Mean :19.94 #> 3rd Qu.: 
93.125 3rd Qu.:21.70 #> Max. :100.000 Max. :26.60 -``` - -``` r # after summary(standardize(swiss)) -#> Fertility Agriculture Examination Education -#> Min. :-2.81327 Min. :-2.1778 Min. :-1.69084 Min. :-1.0378 -#> 1st Qu.:-0.43569 1st Qu.:-0.6499 1st Qu.:-0.56273 1st Qu.:-0.5178 -#> Median : 0.02061 Median : 0.1515 Median :-0.06134 Median :-0.3098 -#> Mean : 0.00000 Mean : 0.0000 Mean : 0.00000 Mean : 0.0000 -#> 3rd Qu.: 0.66504 3rd Qu.: 0.7481 3rd Qu.: 0.69074 3rd Qu.: 0.1062 -#> Max. : 1.78978 Max. : 1.7190 Max. : 2.57094 Max. : 4.3702 -#> Catholic Infant.Mortality -#> Min. :-0.9350 Min. :-3.13886 -#> 1st Qu.:-0.8620 1st Qu.:-0.61543 -#> Median :-0.6235 Median : 0.01972 -#> Mean : 0.0000 Mean : 0.00000 -#> 3rd Qu.: 1.2464 3rd Qu.: 0.60337 -#> Max. : 1.4113 Max. : 2.28566 +#> Fertility Agriculture Examination +#> Min. :-2.81327 Min. :-2.1778 Min. :-1.69084 +#> 1st Qu.:-0.43569 1st Qu.:-0.6499 1st Qu.:-0.56273 +#> Median : 0.02061 Median : 0.1515 Median :-0.06134 +#> Mean : 0.00000 Mean : 0.0000 Mean : 0.00000 +#> 3rd Qu.: 0.66504 3rd Qu.: 0.7481 3rd Qu.: 0.69074 +#> Max. : 1.78978 Max. : 1.7190 Max. : 2.57094 +#> Education Catholic Infant.Mortality +#> Min. :-1.0378 Min. :-0.9350 Min. :-3.13886 +#> 1st Qu.:-0.5178 1st Qu.:-0.8620 1st Qu.:-0.61543 +#> Median :-0.3098 Median :-0.6235 Median : 0.01972 +#> Mean : 0.0000 Mean : 0.0000 Mean : 0.00000 +#> 3rd Qu.: 0.1062 3rd Qu.: 1.2464 3rd Qu.: 0.60337 +#> Max. : 4.3702 Max. : 1.4113 Max. 
: 2.28566 ``` ### Winsorize @@ -486,9 +447,6 @@ anscombe #> 9 12 12 12 8 10.84 9.13 8.15 5.56 #> 10 7 7 7 8 4.82 7.26 6.42 7.91 #> 11 5 5 5 8 5.68 4.74 5.73 6.89 -``` - -``` r # after winsorize(anscombe) @@ -540,9 +498,6 @@ head(trees) #> 4 10.5 72 16.4 #> 5 10.7 81 18.8 #> 6 10.8 83 19.7 -``` - -``` r # after head(ranktransform(trees)) @@ -575,9 +530,6 @@ x #> Mazda RX4 21.0 6 160 110 #> Mazda RX4 Wag 21.0 6 160 110 #> Datsun 710 22.8 4 108 93 -``` - -``` r data_rotate(x) #> Mazda RX4 Mazda RX4 Wag Datsun 710 diff --git a/cran-comments.md b/cran-comments.md index 58de89d2a..095f22e9a 100644 --- a/cran-comments.md +++ b/cran-comments.md @@ -4,7 +4,8 @@ ## revdepcheck results -We checked 17 reverse dependencies, comparing R CMD check results across CRAN and dev versions of this package. +We checked 18 reverse dependencies, comparing R CMD check results across CRAN and dev versions of this package. * We saw 0 new problems * We failed to check 0 packages + diff --git a/inst/WORDLIST b/inst/WORDLIST index a3dd80b42..eda7dc71c 100644 --- a/inst/WORDLIST +++ b/inst/WORDLIST @@ -2,24 +2,31 @@ Analysing Asparouhov BMC Bafumi +Brincks +Bulotsky CMD Carle Catran Crosstables +Dhaliwal +Disaggregating DOI De Dom EFC +Enders EUROFAMCARE Fairbrother GLMM Gelman Giesecke Giesselmann +Guo Heisig Herrington Hoffmann Joanes +Llabre Lumley MADs Mattan @@ -79,6 +86,7 @@ midhinge modelbased modelling nd +panelr partialization patilindrajeets platykurtic diff --git a/man/adjust.Rd b/man/adjust.Rd index 64e50d9d3..48b321b8f 100644 --- a/man/adjust.Rd +++ b/man/adjust.Rd @@ -43,8 +43,10 @@ out). If \code{NULL} (the default), all variables will be selected.} tasks. 
Can be either \itemize{ \item a variable specified as a literal variable name (e.g., \code{column_name}), -\item a string with the variable name (e.g., \code{"column_name"}), or a character -vector of variable names (e.g., \code{c("col1", "col2", "col3")}), +\item a string with the variable name (e.g., \code{"column_name"}), a character +vector of variable names (e.g., \code{c("col1", "col2", "col3")}), or a +character vector of variable names including ranges specified via \code{:} +(e.g., \code{c("col1:col3", "col5")}), \item a formula with variable names (e.g., \code{~column_1 + column_2}), \item a vector of positive integers, giving the positions counting from the left (e.g. \code{1} or \code{c(1, 3, 5)}), diff --git a/man/assign_labels.Rd b/man/assign_labels.Rd index cca14cc85..e6fd24252 100644 --- a/man/assign_labels.Rd +++ b/man/assign_labels.Rd @@ -38,8 +38,10 @@ labels are omitted.} tasks. Can be either \itemize{ \item a variable specified as a literal variable name (e.g., \code{column_name}), -\item a string with the variable name (e.g., \code{"column_name"}), or a character -vector of variable names (e.g., \code{c("col1", "col2", "col3")}), +\item a string with the variable name (e.g., \code{"column_name"}), a character +vector of variable names (e.g., \code{c("col1", "col2", "col3")}), or a +character vector of variable names including ranges specified via \code{:} +(e.g., \code{c("col1:col3", "col5")}), \item a formula with variable names (e.g., \code{~column_1 + column_2}), \item a vector of positive integers, giving the positions counting from the left (e.g. \code{1} or \code{c(1, 3, 5)}), diff --git a/man/categorize.Rd b/man/categorize.Rd index 28f823dd4..dbecbf5e6 100644 --- a/man/categorize.Rd +++ b/man/categorize.Rd @@ -14,6 +14,7 @@ categorize(x, ...) n_groups = NULL, range = NULL, lowest = 1, + breaks = "exclusive", labels = NULL, verbose = TRUE, ... @@ -27,6 +28,7 @@ categorize(x, ...) 
n_groups = NULL, range = NULL, lowest = 1, + breaks = "exclusive", labels = NULL, append = FALSE, ignore_case = FALSE, @@ -67,10 +69,19 @@ for numeric variables, the minimum of the original input is preserved. For factors, the default minimum is \code{1}. For \code{split = "equal_range"}, the default minimum is always \code{1}, unless specified otherwise in \code{lowest}.} +\item{breaks}{Character, indicating whether breaks for categorizing data are +\code{"inclusive"} (values indicate the \emph{upper} bound of the \emph{previous} group or +interval) or \code{"exclusive"} (values indicate the \emph{lower} bound of the \emph{next} +group or interval to begin). Use \code{labels = "range"} to make this behaviour +easier to see.} + \item{labels}{Character vector of value labels. If not \code{NULL}, \code{categorize()} will returns factors instead of numeric variables, with \code{labels} used -for labelling the factor levels. Can also be \code{"mean"} or \code{"median"} for a -factor with labels as the mean/median of each groups.} +for labelling the factor levels. Can also be \code{"mean"}, \code{"median"}, +\code{"range"} or \code{"observed"} for a factor with labels as the mean/median, +the requested range (even if not all values of that range are present in +the data) or observed range (range of the actual recoded values) of each +group. See 'Examples'.} \item{verbose}{Toggle warnings.} @@ -78,8 +89,10 @@ factor with labels as the mean/median of each groups.} tasks. 
Can be either \itemize{ \item a variable specified as a literal variable name (e.g., \code{column_name}), -\item a string with the variable name (e.g., \code{"column_name"}), or a character -vector of variable names (e.g., \code{c("col1", "col2", "col3")}), +\item a string with the variable name (e.g., \code{"column_name"}), a character +vector of variable names (e.g., \code{c("col1", "col2", "col3")}), or a +character vector of variable names including ranges specified via \code{:} +(e.g., \code{c("col1:col3", "col5")}), \item a formula with variable names (e.g., \code{~column_1 + column_2}), \item a vector of positive integers, giving the positions counting from the left (e.g. \code{1} or \code{c(1, 3, 5)}), @@ -145,7 +158,7 @@ It is basically a wrapper around base R's \code{cut()}, providing a simplified and more accessible way to define the interval breaks (cut-off values). } \section{Splits and breaks (cut-off values)}{ -Breaks are in general \emph{exclusive}, this means that these values indicate +Breaks are by default \emph{exclusive}, this means that these values indicate the lower bound of the next group or interval to begin. Take a simple example, a numeric variable with values from 1 to 9. The median would be 5, thus the first interval ranges from 1-4 and is recoded into 1, while 5-9 @@ -154,6 +167,9 @@ using \code{split = "quantile"} and \code{n_groups = 3} would define breaks at 3 and 6.33 (see \code{quantile(1:9, probs = c(1/3, 2/3))}), which means that values from 1 to 3 belong to the first interval and are recoded into 1 (because the next interval starts at 3.67), 4 to 6 into 2 and 7 to 9 into 3. 
+ +The opposite behaviour can be achieved using \code{breaks = "inclusive"}, in which +case values indicate the \emph{upper} bound of the \emph{previous} group or +interval to end. } \section{Recoding into groups with equal size or range}{ @@ -217,6 +233,13 @@ categorize(x, "equal_length", n_groups = 3, labels = c("low", "mid", "high")) x <- sample(1:10, size = 30, replace = TRUE) categorize(x, "equal_length", n_groups = 3, labels = "mean") categorize(x, "equal_length", n_groups = 3, labels = "median") + +# cut numeric into groups with the requested range as a label name +# each category has the same range, and labels indicate this range +categorize(mtcars$mpg, "equal_length", n_groups = 5, labels = "range") +# in this example, each category has the same range, but labels only refer +# to the ranges of the actual values (present in the data) inside each group +categorize(mtcars$mpg, "equal_length", n_groups = 5, labels = "observed") } \seealso{ \itemize{ diff --git a/man/center.Rd b/man/center.Rd index f143f64b2..4774020ab 100644 --- a/man/center.Rd +++ b/man/center.Rd @@ -72,8 +72,10 @@ against the names of the selected variables.} tasks. Can be either \itemize{ \item a variable specified as a literal variable name (e.g., \code{column_name}), -\item a string with the variable name (e.g., \code{"column_name"}), or a character -vector of variable names (e.g., \code{c("col1", "col2", "col3")}), +\item a string with the variable name (e.g., \code{"column_name"}), a character +vector of variable names (e.g., \code{c("col1", "col2", "col3")}), or a +character vector of variable names including ranges specified via \code{:} +(e.g., \code{c("col1:col3", "col5")}), \item a formula with variable names (e.g., \code{~column_1 + column_2}), \item a vector of positive integers, giving the positions counting from the left (e.g. \code{1} or \code{c(1, 3, 5)}), diff --git a/man/coef_var.Rd b/man/coef_var.Rd index 0f0965076..2ff973838 100644 --- a/man/coef_var.Rd +++ b/man/coef_var.Rd @@ -19,7 +19,6 @@ distribution_coef_var(x, ...)
trim = 0, remove_na = FALSE, n = NULL, - na.rm = FALSE, ... ) } @@ -52,8 +51,6 @@ or not (\code{FALSE}, default)?} \item{n}{If \code{method = "unbiased"} and both \code{mu} and \code{sigma} are provided (not computed from \code{x}), what sample size to use to adjust the computed CV for small-sample bias?} - -\item{na.rm}{Deprecated. Please use \code{remove_na} instead.} } \value{ The computed coefficient of variation for \code{x}. @@ -79,14 +76,10 @@ This means that CV is \strong{NOT} invariant to shifting, but it is to scaling: \if{html}{\out{
}}\preformatted{sandwiches <- c(0, 4, 15, 0, 0, 5, 2, 7) coef_var(sandwiches) #> [1] 1.239094 -}\if{html}{\out{
}} -\if{html}{\out{
}}\preformatted{ coef_var(sandwiches / 2) # same #> [1] 1.239094 -}\if{html}{\out{
}} -\if{html}{\out{
}}\preformatted{ coef_var(sandwiches + 4) # different! 0 is no longer meaningful! #> [1] 0.6290784 }\if{html}{\out{
}} diff --git a/man/convert_na_to.Rd b/man/convert_na_to.Rd index 91121ff94..702e0eb2e 100644 --- a/man/convert_na_to.Rd +++ b/man/convert_na_to.Rd @@ -41,8 +41,10 @@ replace \code{NA}.} tasks. Can be either \itemize{ \item a variable specified as a literal variable name (e.g., \code{column_name}), -\item a string with the variable name (e.g., \code{"column_name"}), or a character -vector of variable names (e.g., \code{c("col1", "col2", "col3")}), +\item a string with the variable name (e.g., \code{"column_name"}), a character +vector of variable names (e.g., \code{c("col1", "col2", "col3")}), or a +character vector of variable names including ranges specified via \code{:} +(e.g., \code{c("col1:col3", "col5")}), \item a formula with variable names (e.g., \code{~column_1 + column_2}), \item a vector of positive integers, giving the positions counting from the left (e.g. \code{1} or \code{c(1, 3, 5)}), diff --git a/man/convert_to_na.Rd b/man/convert_to_na.Rd index 2529294b7..fe308d61e 100644 --- a/man/convert_to_na.Rd +++ b/man/convert_to_na.Rd @@ -44,8 +44,10 @@ by \code{NA}, should unused levels be dropped?} tasks. Can be either \itemize{ \item a variable specified as a literal variable name (e.g., \code{column_name}), -\item a string with the variable name (e.g., \code{"column_name"}), or a character -vector of variable names (e.g., \code{c("col1", "col2", "col3")}), +\item a string with the variable name (e.g., \code{"column_name"}), a character +vector of variable names (e.g., \code{c("col1", "col2", "col3")}), or a +character vector of variable names including ranges specified via \code{:} +(e.g., \code{c("col1:col3", "col5")}), \item a formula with variable names (e.g., \code{~column_1 + column_2}), \item a vector of positive integers, giving the positions counting from the left (e.g. 
\code{1} or \code{c(1, 3, 5)}), diff --git a/man/data_codebook.Rd b/man/data_codebook.Rd index 4c0f935e7..d5a542be4 100644 --- a/man/data_codebook.Rd +++ b/man/data_codebook.Rd @@ -34,8 +34,10 @@ data_codebook( tasks. Can be either \itemize{ \item a variable specified as a literal variable name (e.g., \code{column_name}), -\item a string with the variable name (e.g., \code{"column_name"}), or a character -vector of variable names (e.g., \code{c("col1", "col2", "col3")}), +\item a string with the variable name (e.g., \code{"column_name"}), a character +vector of variable names (e.g., \code{c("col1", "col2", "col3")}), or a +character vector of variable names including ranges specified via \code{:} +(e.g., \code{c("col1:col3", "col5")}), \item a formula with variable names (e.g., \code{~column_1 + column_2}), \item a vector of positive integers, giving the positions counting from the left (e.g. \code{1} or \code{c(1, 3, 5)}), @@ -125,7 +127,8 @@ labels, values or value range, frequencies, amount of missing values). \note{ There are methods to \code{print()} the data frame in a nicer output, as well methods for printing in markdown or HTML format (\code{print_md()} and -\code{print_html()}). +\code{print_html()}). The \code{print()} method for text outputs passes arguments in +\code{...} to \code{\link[insight:export_table]{insight::export_table()}}. } \examples{ data(iris) diff --git a/man/data_duplicated.Rd b/man/data_duplicated.Rd index 73c3e8de1..88624c8c8 100644 --- a/man/data_duplicated.Rd +++ b/man/data_duplicated.Rd @@ -20,8 +20,10 @@ data_duplicated( tasks. 
Can be either \itemize{ \item a variable specified as a literal variable name (e.g., \code{column_name}), -\item a string with the variable name (e.g., \code{"column_name"}), or a character -vector of variable names (e.g., \code{c("col1", "col2", "col3")}), +\item a string with the variable name (e.g., \code{"column_name"}), a character +vector of variable names (e.g., \code{c("col1", "col2", "col3")}), or a +character vector of variable names including ranges specified via \code{:} +(e.g., \code{c("col1:col3", "col5")}), \item a formula with variable names (e.g., \code{~column_1 + column_2}), \item a vector of positive integers, giving the positions counting from the left (e.g. \code{1} or \code{c(1, 3, 5)}), diff --git a/man/data_extract.Rd b/man/data_extract.Rd index a0cd4e402..0b544e710 100644 --- a/man/data_extract.Rd +++ b/man/data_extract.Rd @@ -27,8 +27,10 @@ and data frame extensions (e.g., tibbles).} tasks. Can be either \itemize{ \item a variable specified as a literal variable name (e.g., \code{column_name}), -\item a string with the variable name (e.g., \code{"column_name"}), or a character -vector of variable names (e.g., \code{c("col1", "col2", "col3")}), +\item a string with the variable name (e.g., \code{"column_name"}), a character +vector of variable names (e.g., \code{c("col1", "col2", "col3")}), or a +character vector of variable names including ranges specified via \code{:} +(e.g., \code{c("col1:col3", "col5")}), \item a formula with variable names (e.g., \code{~column_1 + column_2}), \item a vector of positive integers, giving the positions counting from the left (e.g. \code{1} or \code{c(1, 3, 5)}), diff --git a/man/data_group.Rd b/man/data_group.Rd index 56f5f314e..9cb55de5d 100644 --- a/man/data_group.Rd +++ b/man/data_group.Rd @@ -24,8 +24,10 @@ data_ungroup(data, verbose = TRUE, ...) tasks. 
Can be either \itemize{ \item a variable specified as a literal variable name (e.g., \code{column_name}), -\item a string with the variable name (e.g., \code{"column_name"}), or a character -vector of variable names (e.g., \code{c("col1", "col2", "col3")}), +\item a string with the variable name (e.g., \code{"column_name"}), a character +vector of variable names (e.g., \code{c("col1", "col2", "col3")}), or a +character vector of variable names including ranges specified via \code{:} +(e.g., \code{c("col1:col3", "col5")}), \item a formula with variable names (e.g., \code{~column_1 + column_2}), \item a vector of positive integers, giving the positions counting from the left (e.g. \code{1} or \code{c(1, 3, 5)}), diff --git a/man/data_match.Rd b/man/data_match.Rd index a57c34768..a209170ab 100644 --- a/man/data_match.Rd +++ b/man/data_match.Rd @@ -5,7 +5,15 @@ \alias{data_filter} \title{Return filtered or sliced data frame, or row indices} \usage{ -data_match(x, to, match = "and", return_indices = FALSE, drop_na = TRUE, ...) +data_match( + x, + to, + match = "and", + return_indices = FALSE, + remove_na = TRUE, + drop_na, + ... +) data_filter(x, ...) } @@ -24,12 +32,14 @@ or \code{"not"} (or \code{"!"}).} can be used to filter the original data frame. If \code{FALSE} (default), returns directly the filtered data frame instead of the row indices.} -\item{drop_na}{Logical, if \code{TRUE}, missing values (\code{NA}s) are removed before +\item{remove_na}{Logical, if \code{TRUE}, missing values (\code{NA}s) are removed before filtering the data. This is the default behaviour, however, sometimes when row indices are requested (i.e. 
\code{return_indices=TRUE}), it might be useful to preserve \code{NA} values, so returned row indices match the row indices of the original data frame.} +\item{drop_na}{Deprecated, please use \code{remove_na} instead.} + \item{...}{A sequence of logical expressions indicating which rows to keep, or a numeric vector indicating the row indices of rows to keep. Can also be a string representation of a logical expression (e.g. \code{"x > 4"}), a diff --git a/man/data_modify.Rd b/man/data_modify.Rd index 042962e03..28533ecea 100644 --- a/man/data_modify.Rd +++ b/man/data_modify.Rd @@ -30,6 +30,9 @@ type of expression cannot be mixed with other expressions, i.e. if a character vector is provided, you may not add further elements to \code{...}. \item Using \code{NULL} as right-hand side removes a variable from the data frame. Example: \code{Petal.Width = NULL}. +\item For data frames (including grouped ones), the function \code{n()} can be used to count the +number of observations and thereby, for instance, create index values by +using \code{id = 1:n()} or \code{id = 3:(n()+2)} and similar. } Note that newly created variables can be used in subsequent expressions, @@ -109,7 +112,8 @@ new_efc <- data_modify( grouped_efc, c12hour_c = center(c12hour), c12hour_z = c12hour_c / sd(c12hour, na.rm = TRUE), - c12hour_z2 = standardize(c12hour) + c12hour_z2 = standardize(c12hour), + id = 1:n() ) head(new_efc) diff --git a/man/data_partition.Rd b/man/data_partition.Rd index 68ac05a19..1150b4f28 100644 --- a/man/data_partition.Rd +++ b/man/data_partition.Rd @@ -11,7 +11,6 @@ data_partition( seed = NULL, row_id = ".row_id", verbose = TRUE, - group = NULL, ... ) } @@ -33,8 +32,6 @@ contains the row-id's.} \item{verbose}{Toggle messages and warnings.} -\item{group}{Deprecated. 
Use \code{by} instead.} - \item{...}{Other arguments passed to or from other functions.} } \value{ diff --git a/man/data_peek.Rd b/man/data_peek.Rd index 4f3f88e8a..9524c70ec 100644 --- a/man/data_peek.Rd +++ b/man/data_peek.Rd @@ -27,8 +27,10 @@ data_peek(x, ...) tasks. Can be either \itemize{ \item a variable specified as a literal variable name (e.g., \code{column_name}), -\item a string with the variable name (e.g., \code{"column_name"}), or a character -vector of variable names (e.g., \code{c("col1", "col2", "col3")}), +\item a string with the variable name (e.g., \code{"column_name"}), a character +vector of variable names (e.g., \code{c("col1", "col2", "col3")}), or a +character vector of variable names including ranges specified via \code{:} +(e.g., \code{c("col1:col3", "col5")}), \item a formula with variable names (e.g., \code{~column_1 + column_2}), \item a vector of positive integers, giving the positions counting from the left (e.g. \code{1} or \code{c(1, 3, 5)}), diff --git a/man/data_read.Rd b/man/data_read.Rd index 1ae3cea8a..d7d26255b 100644 --- a/man/data_read.Rd +++ b/man/data_read.Rd @@ -33,15 +33,16 @@ for SAS data files.} \item{encoding}{The character encoding used for the file. Usually not needed.} \item{convert_factors}{If \code{TRUE} (default), numeric variables, where all -values have a value label, are assumed to be categorical and converted -into factors. If \code{FALSE}, no variable types are guessed and no conversion -of numeric variables into factors will be performed. See also section -'Differences to other packages'. For \code{data_write()}, this argument only -applies to the text (e.g. \code{.txt} or \code{.csv}) or spreadsheet file formats (like -\code{.xlsx}). 
Converting to factors might be useful for these formats because -labelled numeric variables are then converted into factors and exported as -character columns - else, value labels would be lost and only numeric values -are written to the file.} +values have a value label, are assumed to be categorical and converted into +factors. If \code{FALSE}, no variable types are guessed and no conversion of +numeric variables into factors will be performed. For \code{data_read()}, this +argument only applies to file types with \emph{labelled data}, e.g. files from +SPSS, SAS or Stata. See also section 'Differences to other packages'. For +\code{data_write()}, this argument only applies to the text (e.g. \code{.txt} or +\code{.csv}) or spreadsheet file formats (like \code{.xlsx}). Converting to factors +might be useful for these formats because labelled numeric variables are then +converted into factors and exported as character columns - else, value labels +would be lost and only numeric values are written to the file.} \item{verbose}{Toggle warnings and messages.} @@ -118,12 +119,13 @@ versions, use \code{compress = "none"}, for example \code{data_read()} is most comparable to \code{rio::import()}. For data files from SPSS, SAS or Stata, which support labelled data, variables are converted into -their most appropriate type. The major difference to \code{rio::import()} is that -\code{data_read()} automatically converts fully labelled numeric variables into -factors, where imported value labels will be set as factor levels. If a -numeric variable has \emph{no} value labels or less value labels than values, it -is not converted to factor. In this case, value labels are preserved as -\code{"labels"} attribute. Character vectors are preserved. Use +their most appropriate type. The major difference to \code{rio::import()} is for +data files from SPSS, SAS, or Stata, i.e. file types that support +\emph{labelled data}. 
\code{data_read()} automatically converts fully labelled numeric +variables into factors, where imported value labels will be set as factor +levels. If a numeric variable has \emph{no} value labels or less value labels than +values, it is not converted to factor. In this case, value labels are +preserved as \code{"labels"} attribute. Character vectors are preserved. Use \code{convert_factors = FALSE} to remove the automatic conversion of numeric variables to factors. } diff --git a/man/data_relocate.Rd b/man/data_relocate.Rd index 30e4dbbfe..9949b5d27 100644 --- a/man/data_relocate.Rd +++ b/man/data_relocate.Rd @@ -44,8 +44,10 @@ data_remove( tasks. Can be either \itemize{ \item a variable specified as a literal variable name (e.g., \code{column_name}), -\item a string with the variable name (e.g., \code{"column_name"}), or a character -vector of variable names (e.g., \code{c("col1", "col2", "col3")}), +\item a string with the variable name (e.g., \code{"column_name"}), a character +vector of variable names (e.g., \code{c("col1", "col2", "col3")}), or a +character vector of variable names including ranges specified via \code{:} +(e.g., \code{c("col1:col3", "col5")}), \item a formula with variable names (e.g., \code{~column_1 + column_2}), \item a vector of positive integers, giving the positions counting from the left (e.g. \code{1} or \code{c(1, 3, 5)}), diff --git a/man/data_rename.Rd b/man/data_rename.Rd index f1f4de938..2ff779c21 100644 --- a/man/data_rename.Rd +++ b/man/data_rename.Rd @@ -46,14 +46,20 @@ data_rename_rows(data, rows = NULL) \item{pattern}{Character vector. For \code{data_rename()}, indicates columns that should be selected for renaming. Can be \code{NULL} (in which case all columns are selected). For \code{data_addprefix()} or \code{data_addsuffix()}, a character -string, which will be added as prefix or suffix to the column names.} +string, which will be added as prefix or suffix to the column names. 
For +\code{data_rename()}, \code{pattern} can also be a named vector. In this case, names +are used as values for the \code{replacement} argument (i.e. \code{pattern} can be a +character vector using \verb{<new name> = "<old name>"} and argument \code{replacement} +will be ignored then).} \item{select}{Variables that will be included when performing the required tasks. Can be either \itemize{ \item a variable specified as a literal variable name (e.g., \code{column_name}), -\item a string with the variable name (e.g., \code{"column_name"}), or a character -vector of variable names (e.g., \code{c("col1", "col2", "col3")}), +\item a string with the variable name (e.g., \code{"column_name"}), a character +vector of variable names (e.g., \code{c("col1", "col2", "col3")}), or a +character vector of variable names including ranges specified via \code{:} +(e.g., \code{c("col1:col3", "col5")}), \item a formula with variable names (e.g., \code{~column_1 + column_2}), \item a vector of positive integers, giving the positions counting from the left (e.g. \code{1} or \code{c(1, 3, 5)}), @@ -104,7 +110,7 @@ functions (see 'Details'), this argument may be used as workaround.} \item{replacement}{Character vector. Indicates the new name of the columns selected in \code{pattern}. Can be \code{NULL} (in which case column are numbered in sequential order). If not \code{NULL}, \code{pattern} and \code{replacement} must be -of the same length.
If \code{pattern} is a named vector, \code{replacement} is ignored.} \item{safe}{Do not throw error if for instance the variable to be renamed/removed doesn't exist.} @@ -134,12 +140,14 @@ head(data_rename(iris, "Sepal.Length", "length")) head(data_rename(iris, "FakeCol", "length")) # This doesn't head(data_rename(iris, c("Sepal.Length", "Sepal.Width"), c("length", "width"))) +# use named vector to rename +head(data_rename(iris, c(length = "Sepal.Length", width = "Sepal.Width"))) + # Reset names head(data_rename(iris, NULL)) # Change all head(data_rename(iris, replacement = paste0("Var", 1:5))) - } \seealso{ \itemize{ diff --git a/man/data_replicate.Rd b/man/data_replicate.Rd index 35448155d..5a427d570 100644 --- a/man/data_replicate.Rd +++ b/man/data_replicate.Rd @@ -27,8 +27,10 @@ column. Note that the variable indicated by \code{expand} must be an integer vec tasks. Can be either \itemize{ \item a variable specified as a literal variable name (e.g., \code{column_name}), -\item a string with the variable name (e.g., \code{"column_name"}), or a character -vector of variable names (e.g., \code{c("col1", "col2", "col3")}), +\item a string with the variable name (e.g., \code{"column_name"}), a character +vector of variable names (e.g., \code{c("col1", "col2", "col3")}), or a +character vector of variable names including ranges specified via \code{:} +(e.g., \code{c("col1:col3", "col5")}), \item a formula with variable names (e.g., \code{~column_1 + column_2}), \item a vector of positive integers, giving the positions counting from the left (e.g. \code{1} or \code{c(1, 3, 5)}), diff --git a/man/data_separate.Rd b/man/data_separate.Rd index 37528d46e..7c951f81c 100644 --- a/man/data_separate.Rd +++ b/man/data_separate.Rd @@ -30,8 +30,10 @@ data_separate( tasks. 
Can be either \itemize{ \item a variable specified as a literal variable name (e.g., \code{column_name}), -\item a string with the variable name (e.g., \code{"column_name"}), or a character -vector of variable names (e.g., \code{c("col1", "col2", "col3")}), +\item a string with the variable name (e.g., \code{"column_name"}), a character +vector of variable names (e.g., \code{c("col1", "col2", "col3")}), or a +character vector of variable names including ranges specified via \code{:} +(e.g., \code{c("col1:col3", "col5")}), \item a formula with variable names (e.g., \code{~column_1 + column_2}), \item a vector of positive integers, giving the positions counting from the left (e.g. \code{1} or \code{c(1, 3, 5)}), diff --git a/man/data_summary.Rd b/man/data_summary.Rd index ccbf4c524..24cfa1a9f 100644 --- a/man/data_summary.Rd +++ b/man/data_summary.Rd @@ -7,7 +7,7 @@ \usage{ data_summary(x, ...) -\method{data_summary}{data.frame}(x, ..., by = NULL, include_na = TRUE) +\method{data_summary}{data.frame}(x, ..., by = NULL, remove_na = FALSE) } \arguments{ \item{x}{A (grouped) data frame.} @@ -22,9 +22,9 @@ summary function \code{n()} can be used to count the number of observations.} If supplied, the data will be split by this variable and summary statistics will be computed for each group.} -\item{include_na}{Logical. If \code{TRUE}, missing values are included as a level -in the grouping variable. If \code{FALSE}, missing values are omitted from the -grouping variable.} +\item{remove_na}{Logical. If \code{TRUE}, missing values are omitted from the +grouping variable. If \code{FALSE} (default), missing values are included as a +level in the grouping variable.} } \value{ A data frame with the requested summary statistics. 
diff --git a/man/data_tabulate.Rd b/man/data_tabulate.Rd index b744c1f1b..b28a26ede 100644 --- a/man/data_tabulate.Rd +++ b/man/data_tabulate.Rd @@ -4,6 +4,7 @@ \alias{data_tabulate} \alias{data_tabulate.default} \alias{data_tabulate.data.frame} +\alias{as.data.frame.datawizard_tables} \title{Create frequency and crosstables of variables} \usage{ data_tabulate(x, ...) @@ -13,7 +14,7 @@ data_tabulate(x, ...) by = NULL, drop_levels = FALSE, weights = NULL, - include_na = TRUE, + remove_na = FALSE, proportions = NULL, name = NULL, verbose = TRUE, @@ -29,12 +30,21 @@ data_tabulate(x, ...) by = NULL, drop_levels = FALSE, weights = NULL, - include_na = TRUE, + remove_na = FALSE, proportions = NULL, collapse = FALSE, verbose = TRUE, ... ) + +\method{as.data.frame}{datawizard_tables}( + x, + row.names = NULL, + optional = FALSE, + ..., + stringsAsFactors = FALSE, + add_total = FALSE +) } \arguments{ \item{x}{A (grouped) data frame, a vector or factor.} @@ -52,7 +62,7 @@ factor levels are dropped from the frequency table.} \item{weights}{Optional numeric vector of weights. Must be of the same length as \code{x}. If \code{weights} is supplied, weighted frequencies are calculated.} -\item{include_na}{Logical, if \code{TRUE}, missing values are included in the +\item{remove_na}{Logical, if \code{FALSE}, missing values are included in the frequency or crosstable, else missing values are omitted.} \item{proportions}{Optional character string, indicating the type of @@ -69,8 +79,10 @@ for printing.} tasks. 
Can be either \itemize{ \item a variable specified as a literal variable name (e.g., \code{column_name}), -\item a string with the variable name (e.g., \code{"column_name"}), or a character -vector of variable names (e.g., \code{c("col1", "col2", "col3")}), +\item a string with the variable name (e.g., \code{"column_name"}), a character +vector of variable names (e.g., \code{c("col1", "col2", "col3")}), or a +character vector of variable names including ranges specified via \code{:} +(e.g., \code{c("col1:col3", "col5")}), \item a formula with variable names (e.g., \code{~column_1 + column_2}), \item a vector of positive integers, giving the positions counting from the left (e.g. \code{1} or \code{c(1, 3, 5)}), @@ -116,6 +128,24 @@ functions (see 'Details'), this argument may be used as workaround.} \item{collapse}{Logical, if \code{TRUE} collapses multiple tables into one larger table for printing. This affects only printing, not the returned object.} + +\item{row.names}{\code{NULL} or a character vector giving the row + names for the data frame. Missing values are not allowed.} + +\item{optional}{logical. If \code{TRUE}, setting row names and + converting column names (to syntactic names: see + \code{\link[base]{make.names}}) is optional. Note that all of \R's + \pkg{base} package \code{as.data.frame()} methods use + \code{optional} only for column names treatment, basically with the + meaning of \code{\link[base]{data.frame}(*, check.names = !optional)}. + See also the \code{make.names} argument of the \code{matrix} method.} + +\item{stringsAsFactors}{logical: should the character vector be converted + to a factor?} + +\item{add_total}{For crosstables (i.e. when \code{by} is not \code{NULL}), a row and +column with the total N values are added to the data frame. 
\code{add_total} has +no effect in \code{as.data.frame()} for simple frequency tables.} } \value{ A data frame, or a list of data frames, with one frequency table @@ -127,10 +157,18 @@ including the number of levels/values as well as the distribution of raw, valid and cumulative percentages. For crosstables, row, column and cell percentages can be calculated. } +\details{ +There is an \code{as.data.frame()} method, to return the frequency tables as a +data frame. The structure of the returned object is a nested data frame, +where the first column contains name of the variable for which frequencies +were calculated, and the second column is a list column that contains the +frequency tables as data frame. See 'Examples'. +} \note{ There are \code{print_html()} and \code{print_md()} methods available for printing frequency or crosstables in HTML and markdown format, e.g. -\code{print_html(data_tabulate(x))}. +\code{print_html(data_tabulate(x))}. The \code{print()} method for text outputs passes +arguments in \code{...} to \code{\link[insight:export_table]{insight::export_table()}}. } \section{Crosstables}{ @@ -138,7 +176,7 @@ If \code{by} is supplied, a crosstable is created. The crosstable includes \verb{<NA>} (missing) values by default. The first column indicates values of \code{x}, the first row indicates values of \code{by} (including missing values). The last row and column contain the total frequencies for each row and column, respectively. -Setting \code{include_na = FALSE} will omit missing values from the crosstable. +Setting \code{remove_na = TRUE} will omit missing values from the crosstable. Setting \code{proportions} to \code{"row"} or \code{"column"} will add row or column percentages. Setting \code{proportions} to \code{"full"} will add relative frequencies for the full table.
@@ -154,7 +192,7 @@ data(efc) data_tabulate(efc$c172code) # drop missing values -data_tabulate(efc$c172code, include_na = FALSE) +data_tabulate(efc$c172code, remove_na = TRUE) # data frame data_tabulate(efc, c("e42dep", "c172code")) @@ -201,11 +239,17 @@ data_tabulate( efc$c172code, by = efc$e16sex, proportions = "column", - include_na = FALSE + remove_na = TRUE ) # round percentages out <- data_tabulate(efc, "c172code", by = "e16sex", proportions = "column") print(out, digits = 0) + +# coerce to data frames +result <- data_tabulate(efc, "c172code", by = "e16sex") +as.data.frame(result) +as.data.frame(result)$table +as.data.frame(result, add_total = TRUE)$table \dontshow{\}) # examplesIf} } diff --git a/man/data_to_long.Rd b/man/data_to_long.Rd index 741725d25..73b54219b 100644 --- a/man/data_to_long.Rd +++ b/man/data_to_long.Rd @@ -45,8 +45,10 @@ rows and fewer columns after the operation.} tasks. Can be either \itemize{ \item a variable specified as a literal variable name (e.g., \code{column_name}), -\item a string with the variable name (e.g., \code{"column_name"}), or a character -vector of variable names (e.g., \code{c("col1", "col2", "col3")}), +\item a string with the variable name (e.g., \code{"column_name"}), a character +vector of variable names (e.g., \code{c("col1", "col2", "col3")}), or a +character vector of variable names including ranges specified via \code{:} +(e.g., \code{c("col1:col3", "col5")}), \item a formula with variable names (e.g., \code{~column_1 + column_2}), \item a vector of positive integers, giving the positions counting from the left (e.g. \code{1} or \code{c(1, 3, 5)}), diff --git a/man/data_unique.Rd b/man/data_unique.Rd index 8a45bfc21..a0a70b92a 100644 --- a/man/data_unique.Rd +++ b/man/data_unique.Rd @@ -21,8 +21,10 @@ data_unique( tasks. 
Can be either \itemize{ \item a variable specified as a literal variable name (e.g., \code{column_name}), -\item a string with the variable name (e.g., \code{"column_name"}), or a character -vector of variable names (e.g., \code{c("col1", "col2", "col3")}), +\item a string with the variable name (e.g., \code{"column_name"}), a character +vector of variable names (e.g., \code{c("col1", "col2", "col3")}), or a +character vector of variable names including ranges specified via \code{:} +(e.g., \code{c("col1:col3", "col5")}), \item a formula with variable names (e.g., \code{~column_1 + column_2}), \item a vector of positive integers, giving the positions counting from the left (e.g. \code{1} or \code{c(1, 3, 5)}), diff --git a/man/data_unite.Rd b/man/data_unite.Rd index ba7710a8a..369fd33d8 100644 --- a/man/data_unite.Rd +++ b/man/data_unite.Rd @@ -27,8 +27,10 @@ data_unite( tasks. Can be either \itemize{ \item a variable specified as a literal variable name (e.g., \code{column_name}), -\item a string with the variable name (e.g., \code{"column_name"}), or a character -vector of variable names (e.g., \code{c("col1", "col2", "col3")}), +\item a string with the variable name (e.g., \code{"column_name"}), a character +vector of variable names (e.g., \code{c("col1", "col2", "col3")}), or a +character vector of variable names including ranges specified via \code{:} +(e.g., \code{c("col1:col3", "col5")}), \item a formula with variable names (e.g., \code{~column_1 + column_2}), \item a vector of positive integers, giving the positions counting from the left (e.g. 
\code{1} or \code{c(1, 3, 5)}), diff --git a/man/datawizard-package.Rd b/man/datawizard-package.Rd index db38bc334..d389df6ac 100644 --- a/man/datawizard-package.Rd +++ b/man/datawizard-package.Rd @@ -33,16 +33,16 @@ Useful links: Authors: \itemize{ - \item Indrajeet Patil \email{patilindrajeet.science@gmail.com} (\href{https://orcid.org/0000-0003-1995-6531}{ORCID}) (@patilindrajeets) - \item Dominique Makowski \email{dom.makowski@gmail.com} (\href{https://orcid.org/0000-0001-5375-9967}{ORCID}) (@Dom_Makowski) - \item Daniel Lüdecke \email{d.luedecke@uke.de} (\href{https://orcid.org/0000-0002-8895-3206}{ORCID}) (@strengejacke) + \item Indrajeet Patil \email{patilindrajeet.science@gmail.com} (\href{https://orcid.org/0000-0003-1995-6531}{ORCID}) + \item Dominique Makowski \email{dom.makowski@gmail.com} (\href{https://orcid.org/0000-0001-5375-9967}{ORCID}) + \item Daniel Lüdecke \email{d.luedecke@uke.de} (\href{https://orcid.org/0000-0002-8895-3206}{ORCID}) \item Mattan S. Ben-Shachar \email{matanshm@post.bgu.ac.il} (\href{https://orcid.org/0000-0002-4287-4801}{ORCID}) - \item Brenton M. Wiernik \email{brenton@wiernik.org} (\href{https://orcid.org/0000-0001-9560-6336}{ORCID}) (@bmwiernik) + \item Brenton M. Wiernik \email{brenton@wiernik.org} (\href{https://orcid.org/0000-0001-9560-6336}{ORCID}) } Other contributors: \itemize{ - \item Rémi Thériault \email{remi.theriault@mail.mcgill.ca} (\href{https://orcid.org/0000-0003-4315-6788}{ORCID}) (@rempsyc) [contributor] + \item Rémi Thériault \email{remi.theriault@mail.mcgill.ca} (\href{https://orcid.org/0000-0003-4315-6788}{ORCID}) [contributor] \item Thomas J. 
Faulkenberry \email{faulkenberry@tarleton.edu} [reviewer] \item Robert Garrett \email{rcg4@illinois.edu} [reviewer] } diff --git a/man/demean.Rd b/man/demean.Rd index d03a1010b..fb4db3a29 100644 --- a/man/demean.Rd +++ b/man/demean.Rd @@ -10,35 +10,35 @@ demean( x, select, by, + nested = FALSE, suffix_demean = "_within", suffix_groupmean = "_between", add_attributes = TRUE, - verbose = TRUE, - group = NULL + verbose = TRUE ) degroup( x, select, by, + nested = FALSE, center = "mean", suffix_demean = "_within", suffix_groupmean = "_between", add_attributes = TRUE, - verbose = TRUE, - group = NULL + verbose = TRUE ) detrend( x, select, by, + nested = FALSE, center = "mean", suffix_demean = "_within", suffix_groupmean = "_between", add_attributes = TRUE, - verbose = TRUE, - group = NULL + verbose = TRUE ) } \arguments{ @@ -48,7 +48,28 @@ detrend( that should be group- and de-meaned.} \item{by}{Character vector (or formula) with the name of the variable that -indicates the group- or cluster-ID.} +indicates the group- or cluster-ID. For cross-classified or nested designs, +\code{by} can also identify two or more variables as group- or cluster-IDs. If +the data is nested and should be treated as such, set \code{nested = TRUE}. Else, +if \code{by} defines two or more variables and \code{nested = FALSE}, a cross-classified +design is assumed. Note that \code{demean()} and \code{degroup()} can't handle a mix +of nested and cross-classified designs in one model. + +For nested designs, \code{by} can be: +\itemize{ +\item a character vector with the name of the variable that indicates the +levels, ordered from \emph{highest} level to \emph{lowest} (e.g. +\code{by = c("L4", "L3", "L2")}). +\item a character vector with variable names in the format \code{by = "L4/L3/L2"}, +where the levels are separated by \code{/}. 
+} + +See also section \emph{De-meaning for cross-classified designs} and +\emph{De-meaning for nested designs} below.} + +\item{nested}{Logical, if \code{TRUE}, the data is treated as nested. If \code{FALSE}, +the data is treated as cross-classified. Only applies if \code{by} contains more +than one variable.} \item{suffix_demean, suffix_groupmean}{String value, will be appended to the names of the group-meaned and de-meaned variables of \code{x}. By default, @@ -62,8 +83,6 @@ within- and between-effects are printed in separated blocks.} \item{verbose}{Toggle warnings and messages.} -\item{group}{Deprecated. Use \code{by} instead.} - \item{center}{Method for centering. \code{demean()} always performs mean-centering, while \code{degroup()} can use \code{center = "median"} or \code{center = "mode"} for median- or mode-centering, and also \code{"min"} @@ -72,7 +91,10 @@ or \code{"max"}.} \value{ A data frame with the group-/de-meaned variables, which get the suffix \code{"_between"} (for the group-meaned variable) and \code{"_within"} (for the -de-meaned variable) by default. +de-meaned variable) by default. For cross-classified or nested designs, +the name pattern of the group-meaned variables is the name of the centered +variable followed by the name of the variable that indicates the related +grouping level, e.g. \code{predictor_L3_between} and \code{predictor_L2_between}. } \description{ \code{demean()} computes group- and de-meaned versions of a variable that can be @@ -81,46 +103,50 @@ used in regression analysis to model the between- and within-subject effect. \code{demean()} always uses mean-centering, \code{degroup()} can also use the mode or median for centering. } -\details{ -\subsection{Heterogeneity Bias}{ +\section{Heterogeneity Bias}{ + + Mixed models include different levels of sources of variability, i.e. error terms at each level. 
When macro-indicators (or level-2 predictors, or higher-level units, or more general: \emph{group-level predictors that \strong{vary} within and across groups}) are included as fixed effects (i.e. treated as covariate at level-1), the variance that is left unaccounted for this covariate will be absorbed into the error terms of level-1 and level-2 -(\cite{Bafumi and Gelman 2006; Gelman and Hill 2007, Chapter 12.6.}): -\dQuote{Such covariates contain two parts: one that is specific to the -higher-level entity that does not vary between occasions, and one that -represents the difference between occasions, within higher-level entities} -(\cite{Bell et al. 2015}). Hence, the error terms will be correlated with -the covariate, which violates one of the assumptions of mixed models -(iid, independent and identically distributed error terms). This bias is -also called the \emph{heterogeneity bias} (\cite{Bell et al. 2015}). To -resolve this problem, level-2 predictors used as (level-1) covariates should -be separated into their "within" and "between" effects by "de-meaning" and -"group-meaning": After demeaning time-varying predictors, \dQuote{at the -higher level, the mean term is no longer constrained by Level 1 effects, -so it is free to account for all the higher-level variance associated -with that variable} (\cite{Bell et al. 2015}). +(\emph{Bafumi and Gelman 2006; Gelman and Hill 2007, Chapter 12.6.}): +"Such covariates contain two parts: one that is specific to the higher-level +entity that does not vary between occasions, and one that represents the +difference between occasions, within higher-level entities" (\emph{Bell et al. 2015}). +Hence, the error terms will be correlated with the covariate, which violates +one of the assumptions of mixed models (iid, independent and identically +distributed error terms). This bias is also called the \emph{heterogeneity bias} +(\emph{Bell et al. 2015}). 
To resolve this problem, level-2 predictors used as +(level-1) covariates should be separated into their "within" and "between" +effects by "de-meaning" and "group-meaning": After demeaning time-varying +predictors, "at the higher level, the mean term is no longer constrained by +Level 1 effects, so it is free to account for all the higher-level variance +associated with that variable" (\emph{Bell et al. 2015}). } -\subsection{Panel data and correlating fixed and group effects}{ -\code{demean()} is intended to create group- and de-meaned variables -for panel regression models (fixed effects models), or for complex -random-effect-within-between models (see \cite{Bell et al. 2015, 2018}), -where group-effects (random effects) and fixed effects correlate (see -\cite{Bafumi and Gelman 2006}). This can happen, for instance, when -analyzing panel data, which can lead to \emph{Heterogeneity Bias}. To -control for correlating predictors and group effects, it is recommended -to include the group-meaned and de-meaned version of \emph{time-varying covariates} -(and group-meaned version of \emph{time-invariant covariates} that are on -a higher level, e.g. level-2 predictors) in the model. By this, one can -fit complex multilevel models for panel data, including time-varying -predictors, time-invariant predictors and random effects. +\section{Panel data and correlating fixed and group effects}{ + + +\code{demean()} is intended to create group- and de-meaned variables for panel +regression models (fixed effects models), or for complex +random-effect-within-between models (see \emph{Bell et al. 2015, 2018}), where +group-effects (random effects) and fixed effects correlate (see +\emph{Bafumi and Gelman 2006}). This can happen, for instance, when analyzing +panel data, which can lead to \emph{Heterogeneity Bias}. 
To control for correlating +predictors and group effects, it is recommended to include the group-meaned +and de-meaned version of \emph{time-varying covariates} (and group-meaned version +of \emph{time-invariant covariates} that are on a higher level, e.g. level-2 +predictors) in the model. By this, one can fit complex multilevel models for +panel data, including time-varying predictors, time-invariant predictors and +random effects. } -\subsection{Why mixed models are preferred over fixed effects models}{ +\section{Why mixed models are preferred over fixed effects models}{ + + A mixed models approach can model the causes of endogeneity explicitly by including the (separated) within- and between-effects of time-varying fixed effects and including time-constant fixed effects. Furthermore, @@ -128,24 +154,28 @@ mixed models also include random effects, thus a mixed models approach is superior to classic fixed-effects models, which lack information of variation in the group-effects or between-subject effects. Furthermore, fixed effects regression cannot include random slopes, which means that -fixed effects regressions are neglecting \dQuote{cross-cluster differences -in the effects of lower-level controls (which) reduces the precision of -estimated context effects, resulting in unnecessarily wide confidence -intervals and low statistical power} (\cite{Heisig et al. 2017}). +fixed effects regressions are neglecting "cross-cluster differences in the +effects of lower-level controls (which) reduces the precision of estimated +context effects, resulting in unnecessarily wide confidence intervals and +low statistical power" (\emph{Heisig et al. 2017}). } -\subsection{Terminology}{ +\section{Terminology}{ + + The group-meaned variable is simply the mean of an independent variable -within each group (or id-level or cluster) represented by \code{by}. -It represents the cluster-mean of an independent variable. 
The regression -coefficient of a group-meaned variable is the \emph{between-subject-effect}. -The de-meaned variable is then the centered version of the group-meaned -variable. De-meaning is sometimes also called person-mean centering or -centering within clusters. The regression coefficient of a de-meaned -variable represents the \emph{within-subject-effect}. +within each group (or id-level or cluster) represented by \code{by}. It represents +the cluster-mean of an independent variable. The regression coefficient of a +group-meaned variable is the \emph{between-subject-effect}. The de-meaned variable +is then the centered version of the group-meaned variable. De-meaning is +sometimes also called person-mean centering or centering within clusters. +The regression coefficient of a de-meaned variable represents the +\emph{within-subject-effect}. } -\subsection{De-meaning with continuous predictors}{ +\section{De-meaning with continuous predictors}{ + + For continuous time-varying predictors, the recommendation is to include both their de-meaned and group-meaned versions as fixed effects, but not the raw (untransformed) time-varying predictors themselves. The de-meaned @@ -155,7 +185,9 @@ the within-subject effect, while the coefficient of the group-meaned predictor indicates the between-subject effect. } -\subsection{De-meaning with binary predictors}{ +\section{De-meaning with binary predictors}{ + + For binary time-varying predictors, there are two recommendations. First is to include the raw (untransformed) binary predictor as fixed effect only and the \emph{de-meaned} variable as random effect (random slope). @@ -163,51 +195,91 @@ The alternative would be to add the de-meaned version(s) of binary time-varying covariates as additional fixed effect as well (instead of adding it as random slope). Centering time-varying binary variables to obtain within-effects (level 1) isn't necessary. 
They have a sensible -interpretation when left in the typical 0/1 format (\cite{Hoffmann 2015, +interpretation when left in the typical 0/1 format (\emph{Hoffmann 2015, chapter 8-2.I}). \code{demean()} will thus coerce categorical time-varying predictors to numeric to compute the de- and group-meaned versions for these variables, where the raw (untransformed) binary predictor and the de-meaned version should be added to the model. } -\subsection{De-meaning of factors with more than 2 levels}{ +\section{De-meaning of factors with more than 2 levels}{ + + Factors with more than two levels are demeaned in two ways: first, these are also converted to numeric and de-meaned; second, dummy variables are created (binary, with 0/1 coding for each level) and these binary dummy-variables are de-meaned in the same way (as described above). -Packages like \pkg{panelr} internally convert factors to dummies before +Packages like \strong{panelr} internally convert factors to dummies before demeaning, so this behaviour can be mimicked here. } -\subsection{De-meaning interaction terms}{ There are multiple ways to deal -with interaction terms of within- and between-effects. A classical approach -is to simply use the product term of the de-meaned variables (i.e. -introducing the de-meaned variables as interaction term in the model -formula, e.g. \code{y ~ x_within * time_within}). This approach, however, -might be subject to bias (see \cite{Giesselmann & Schmidt-Catran 2020}). -\cr \cr -Another option is to first calculate the product term and then apply the -de-meaning to it. This approach produces an estimator \dQuote{that reflects +\section{De-meaning interaction terms}{ + + +There are multiple ways to deal with interaction terms of within- and +between-effects. +\itemize{ +\item A classical approach is to simply use the product term of the de-meaned +variables (i.e. introducing the de-meaned variables as interaction term +in the model formula, e.g. 
\code{y ~ x_within * time_within}). This approach, +however, might be subject to bias (see \emph{Giesselmann & Schmidt-Catran 2020}). +\item Another option is to first calculate the product term and then apply the +de-meaning to it. This approach produces an estimator "that reflects unit-level differences of interacted variables whose moderators vary -within units}, which is desirable if \emph{no} within interaction of -two time-dependent variables is required. \cr \cr -A third option, when the interaction should result in a genuine within +within units", which is desirable if \emph{no} within interaction of +two time-dependent variables is required. This is what \code{demean()} does +internally when \code{select} contains interaction terms. +\item A third option, when the interaction should result in a genuine within estimator, is to "double de-mean" the interaction terms -(\cite{Giesselmann & Schmidt-Catran 2018}), however, this is currently +(\emph{Giesselmann & Schmidt-Catran 2018}), however, this is currently not supported by \code{demean()}. If this is required, the \code{wmb()} -function from the \pkg{panelr} package should be used. \cr \cr +function from the \strong{panelr} package should be used. +} + To de-mean interaction terms for within-between models, simply specify -the term as interaction for the \code{select}-argument, e.g. -\code{select = "a*b"} (see 'Examples'). +the term as interaction for the \code{select}-argument, e.g. \code{select = "a*b"} +(see 'Examples'). } -\subsection{Analysing panel data with mixed models using lme4}{ -A description of how to translate the -formulas described in \emph{Bell et al. 2018} into R using \code{lmer()} -from \pkg{lme4} can be found in -\href{https://easystats.github.io/parameters/articles/demean.html}{this vignette}. +\section{De-meaning for cross-classified designs}{ + + +\code{demean()} can handle cross-classified designs, where the data has two or +more groups at the higher (i.e. second) level. 
In such cases, the +\code{by}-argument can identify two or more variables that represent the +cross-classified group- or cluster-IDs. The de-meaned variables for +cross-classified designs are created by simply subtracting all group means from each +individual value, i.e. \emph{fully cluster-mean-centering} (see \emph{Guo et al. 2024} +for details). Note that de-meaning for cross-classified designs is \emph{not} +equivalent to de-meaning of nested data structures from models with three or +more levels. Set \code{nested = TRUE} to explicitly assume a nested design. For +cross-classified designs, de-meaning is supposed to work for models like +\code{y ~ x + (1|level3) + (1|level2)}, but \emph{not} for models like +\code{y ~ x + (1|level3/level2)}. Note that \code{demean()} and \code{degroup()} can't +handle a mix of nested and cross-classified designs in one model. } + +\section{De-meaning for nested designs}{ + + +\emph{Brincks et al. (2017)} have suggested an algorithm to center variables for +nested designs, which is implemented in \code{demean()}. For nested designs, set +\code{nested = TRUE} \emph{and} specify the variables that indicate the different +levels in descending order in the \code{by} argument. E.g., +\verb{by = c("level4", "level3", "level2")} assumes a model like +\code{y ~ x + (1|level4/level3/level2)}. An alternative notation for the +\code{by}-argument would be \code{by = "level4/level3/level2"}, similar to the +formula notation. } + +\section{Analysing panel data with mixed models using lme4}{ + + +A description of how to translate the formulas described in \emph{Bell et al. 2018} +into R using \code{lmer()} from \strong{lme4} can be found in +\href{https://easystats.github.io/parameters/articles/demean.html}{this vignette}. +} + \examples{ data(iris) @@ -244,12 +316,19 @@ Models: Making an Informed Choice. Quality & Quantity (53); 1051-1074 \item Bell A, Jones K. 2015. 
Explaining Fixed Effects: Random Effects Modeling of Time-Series Cross-Sectional and Panel Data. Political Science Research and Methods, 3(1), 133–153. +\item Brincks, A. M., Enders, C. K., Llabre, M. M., Bulotsky-Shearer, R. J., +Prado, G., and Feaster, D. J. (2017). Centering Predictor Variables in +Three-Level Contextual Models. Multivariate Behavioral Research, 52(2), +149–163. https://doi.org/10.1080/00273171.2016.1256753 \item Gelman A, Hill J. 2007. Data Analysis Using Regression and Multilevel/Hierarchical Models. Analytical Methods for Social Research. Cambridge, New York: Cambridge University Press \item Giesselmann M, Schmidt-Catran, AW. 2020. Interactions in fixed effects regression models. Sociological Methods & Research, 1–28. https://doi.org/10.1177/0049124120914934 +\item Guo Y, Dhaliwal J, Rights JD. 2024. Disaggregating level-specific effects +in cross-classified multilevel models. Behavior Research Methods, 56(4), +3023–3057. \item Heisig JP, Schaeffer M, Giesecke J. 2017. The Costs of Simplicity: Why Multilevel Models May Benefit from Accounting for Cross-Cluster Differences in the Effects of Controls. American Sociological Review 82 diff --git a/man/describe_distribution.Rd b/man/describe_distribution.Rd index 369bd9ef6..80b69e115 100644 --- a/man/describe_distribution.Rd +++ b/man/describe_distribution.Rd @@ -86,8 +86,10 @@ vector before the mean is computed.} tasks. 
Can be either \itemize{ \item a variable specified as a literal variable name (e.g., \code{column_name}), -\item a string with the variable name (e.g., \code{"column_name"}), or a character -vector of variable names (e.g., \code{c("col1", "col2", "col3")}), +\item a string with the variable name (e.g., \code{"column_name"}), a character +vector of variable names (e.g., \code{c("col1", "col2", "col3")}), or a +character vector of variable names including ranges specified via \code{:} +(e.g., \code{c("col1:col3", "col5")}), \item a formula with variable names (e.g., \code{~column_1 + column_2}), \item a vector of positive integers, giving the positions counting from the left (e.g. \code{1} or \code{c(1, 3, 5)}), diff --git a/man/extract_column_names.Rd b/man/extract_column_names.Rd index 6805d9569..3ea5da7dc 100644 --- a/man/extract_column_names.Rd +++ b/man/extract_column_names.Rd @@ -2,9 +2,7 @@ % Please edit documentation in R/data_select.R, R/extract_column_names.R \name{data_select} \alias{data_select} -\alias{get_columns} \alias{extract_column_names} -\alias{data_find} \alias{find_columns} \title{Find or get columns in a data frame based on search patterns} \usage{ @@ -18,16 +16,6 @@ data_select( ... ) -get_columns( - data, - select = NULL, - exclude = NULL, - ignore_case = FALSE, - regex = FALSE, - verbose = TRUE, - ... -) - extract_column_names( data, select = NULL, @@ -38,16 +26,6 @@ extract_column_names( ... ) -data_find( - data, - select = NULL, - exclude = NULL, - ignore_case = FALSE, - regex = FALSE, - verbose = TRUE, - ... -) - find_columns( data, select = NULL, @@ -65,8 +43,10 @@ find_columns( tasks. 
Can be either \itemize{ \item a variable specified as a literal variable name (e.g., \code{column_name}), -\item a string with the variable name (e.g., \code{"column_name"}), or a character -vector of variable names (e.g., \code{c("col1", "col2", "col3")}), +\item a string with the variable name (e.g., \code{"column_name"}), a character +vector of variable names (e.g., \code{c("col1", "col2", "col3")}), or a +character vector of variable names including ranges specified via \code{:} +(e.g., \code{c("col1:col3", "col5")}), \item a formula with variable names (e.g., \code{~column_1 + column_2}), \item a vector of positive integers, giving the positions counting from the left (e.g. \code{1} or \code{c(1, 3, 5)}), @@ -174,7 +154,7 @@ outer(iris, starts_with("Sep")) }\if{html}{\out{}} } \examples{ -# Find columns names by pattern +# Find column names by pattern extract_column_names(iris, starts_with("Sepal")) extract_column_names(iris, ends_with("Width")) extract_column_names(iris, regex("\\\\.")) @@ -187,6 +167,9 @@ extract_column_names(iris, starts_with("Sepal"), exclude = contains("Width")) numeric_mean_35 <- function(x) is.numeric(x) && mean(x, na.rm = TRUE) > 3.5 extract_column_names(iris, numeric_mean_35) +# find a range of column names, using a character vector +extract_column_names(mtcars, c("cyl:hp", "wt")) + # rename returned columns for "data_select()" head(data_select(mtcars, c(`Miles per Gallon` = "mpg", Cylinders = "cyl"))) } diff --git a/man/labels_to_levels.Rd b/man/labels_to_levels.Rd index 8024eb2d3..163eb0eaa 100644 --- a/man/labels_to_levels.Rd +++ b/man/labels_to_levels.Rd @@ -33,8 +33,10 @@ allowed.} tasks. 
Can be either \itemize{ \item a variable specified as a literal variable name (e.g., \code{column_name}), -\item a string with the variable name (e.g., \code{"column_name"}), or a character -vector of variable names (e.g., \code{c("col1", "col2", "col3")}), +\item a string with the variable name (e.g., \code{"column_name"}), a character +vector of variable names (e.g., \code{c("col1", "col2", "col3")}), or a +character vector of variable names including ranges specified via \code{:} +(e.g., \code{c("col1:col3", "col5")}), \item a formula with variable names (e.g., \code{~column_1 + column_2}), \item a vector of positive integers, giving the positions counting from the left (e.g. \code{1} or \code{c(1, 3, 5)}), diff --git a/man/mean_sd.Rd b/man/mean_sd.Rd index f0ea239f8..33eeb4bc5 100644 --- a/man/mean_sd.Rd +++ b/man/mean_sd.Rd @@ -5,7 +5,7 @@ \alias{median_mad} \title{Summary Helpers} \usage{ -mean_sd(x, times = 1L, remove_na = TRUE, named = TRUE, na.rm = TRUE, ...) +mean_sd(x, times = 1L, remove_na = TRUE, named = TRUE, ...) median_mad( x, @@ -13,7 +13,6 @@ median_mad( remove_na = TRUE, constant = 1.4826, named = TRUE, - na.rm = TRUE, ... ) } @@ -29,8 +28,6 @@ or not (\code{FALSE}, default)?} \item{named}{Should the vector be named? (E.g., \code{c("-SD" = -1, Mean = 1, "+SD" = 2)}.)} -\item{na.rm}{Deprecated. Please use \code{remove_na} instead.} - \item{...}{Not used.} \item{constant}{scale factor.} diff --git a/man/means_by_group.Rd b/man/means_by_group.Rd index d7a6dfc96..6c06ac3b1 100644 --- a/man/means_by_group.Rd +++ b/man/means_by_group.Rd @@ -8,15 +8,7 @@ \usage{ means_by_group(x, ...) -\method{means_by_group}{numeric}( - x, - by = NULL, - ci = 0.95, - weights = NULL, - digits = NULL, - group = NULL, - ... -) +\method{means_by_group}{numeric}(x, by = NULL, ci = 0.95, weights = NULL, digits = NULL, ...) \method{means_by_group}{data.frame}( x, @@ -29,7 +21,6 @@ means_by_group(x, ...) ignore_case = FALSE, regex = FALSE, verbose = TRUE, - group = NULL, ... 
) } @@ -56,14 +47,14 @@ weights are used.} \item{digits}{Optional scalar, indicating the amount of digits after decimal point when rounding estimates and values.} -\item{group}{Deprecated. Use \code{by} instead.} - \item{select}{Variables that will be included when performing the required tasks. Can be either \itemize{ \item a variable specified as a literal variable name (e.g., \code{column_name}), -\item a string with the variable name (e.g., \code{"column_name"}), or a character -vector of variable names (e.g., \code{c("col1", "col2", "col3")}), +\item a string with the variable name (e.g., \code{"column_name"}), a character +vector of variable names (e.g., \code{c("col1", "col2", "col3")}), or a +character vector of variable names including ranges specified via \code{:} +(e.g., \code{c("col1:col3", "col5")}), \item a formula with variable names (e.g., \code{~column_1 + column_2}), \item a vector of positive integers, giving the positions counting from the left (e.g. \code{1} or \code{c(1, 3, 5)}), diff --git a/man/normalize.Rd b/man/normalize.Rd index 4a9a61a68..c325e98fe 100644 --- a/man/normalize.Rd +++ b/man/normalize.Rd @@ -71,8 +71,10 @@ the normalized vectors are rescaled to a range from \code{0 + include_bounds} to tasks. Can be either \itemize{ \item a variable specified as a literal variable name (e.g., \code{column_name}), -\item a string with the variable name (e.g., \code{"column_name"}), or a character -vector of variable names (e.g., \code{c("col1", "col2", "col3")}), +\item a string with the variable name (e.g., \code{"column_name"}), a character +vector of variable names (e.g., \code{c("col1", "col2", "col3")}), or a +character vector of variable names including ranges specified via \code{:} +(e.g., \code{c("col1:col3", "col5")}), \item a formula with variable names (e.g., \code{~column_1 + column_2}), \item a vector of positive integers, giving the positions counting from the left (e.g. 
\code{1} or \code{c(1, 3, 5)}), diff --git a/man/ranktransform.Rd b/man/ranktransform.Rd index c23105735..7046db2b5 100644 --- a/man/ranktransform.Rd +++ b/man/ranktransform.Rd @@ -39,8 +39,10 @@ details.} tasks. Can be either \itemize{ \item a variable specified as a literal variable name (e.g., \code{column_name}), -\item a string with the variable name (e.g., \code{"column_name"}), or a character -vector of variable names (e.g., \code{c("col1", "col2", "col3")}), +\item a string with the variable name (e.g., \code{"column_name"}), a character +vector of variable names (e.g., \code{c("col1", "col2", "col3")}), or a +character vector of variable names including ranges specified via \code{:} +(e.g., \code{c("col1:col3", "col5")}), \item a formula with variable names (e.g., \code{~column_1 + column_2}), \item a vector of positive integers, giving the positions counting from the left (e.g. \code{1} or \code{c(1, 3, 5)}), diff --git a/man/recode_values.Rd b/man/recode_values.Rd index 9810c0a2d..dece902f7 100644 --- a/man/recode_values.Rd +++ b/man/recode_values.Rd @@ -4,7 +4,6 @@ \alias{recode_values} \alias{recode_values.numeric} \alias{recode_values.data.frame} -\alias{change_code} \title{Recode old values of variables into new values} \usage{ recode_values(x, ...) @@ -31,20 +30,6 @@ recode_values(x, ...) verbose = TRUE, ... ) - -change_code( - x, - select = NULL, - exclude = NULL, - recode = NULL, - default = NULL, - preserve_na = TRUE, - append = FALSE, - ignore_case = FALSE, - regex = FALSE, - verbose = TRUE, - ... -) } \arguments{ \item{x}{A data frame, numeric or character vector, or factor.} @@ -75,8 +60,10 @@ default value.} tasks. 
Can be either \itemize{ \item a variable specified as a literal variable name (e.g., \code{column_name}), -\item a string with the variable name (e.g., \code{"column_name"}), or a character -vector of variable names (e.g., \code{c("col1", "col2", "col3")}), +\item a string with the variable name (e.g., \code{"column_name"}), a character +vector of variable names (e.g., \code{c("col1", "col2", "col3")}), or a +character vector of variable names including ranges specified via \code{:} +(e.g., \code{c("col1:col3", "col5")}), \item a formula with variable names (e.g., \code{~column_1 + column_2}), \item a vector of positive integers, giving the positions counting from the left (e.g. \code{1} or \code{c(1, 3, 5)}), diff --git a/man/rescale.Rd b/man/rescale.Rd index 016a6f841..490964777 100644 --- a/man/rescale.Rd +++ b/man/rescale.Rd @@ -67,8 +67,10 @@ the input vector (\code{range(x)}).} tasks. Can be either \itemize{ \item a variable specified as a literal variable name (e.g., \code{column_name}), -\item a string with the variable name (e.g., \code{"column_name"}), or a character -vector of variable names (e.g., \code{c("col1", "col2", "col3")}), +\item a string with the variable name (e.g., \code{"column_name"}), a character +vector of variable names (e.g., \code{c("col1", "col2", "col3")}), or a +character vector of variable names including ranges specified via \code{:} +(e.g., \code{c("col1:col3", "col5")}), \item a formula with variable names (e.g., \code{~column_1 + column_2}), \item a vector of positive integers, giving the positions counting from the left (e.g. 
\code{1} or \code{c(1, 3, 5)}), diff --git a/man/rescale_weights.Rd b/man/rescale_weights.Rd index 4a67d4100..d9651decb 100644 --- a/man/rescale_weights.Rd +++ b/man/rescale_weights.Rd @@ -4,7 +4,7 @@ \alias{rescale_weights} \title{Rescale design weights for multilevel analysis} \usage{ -rescale_weights(data, by, probability_weights, nest = FALSE, group = NULL) +rescale_weights(data, by, probability_weights, nest = FALSE) } \arguments{ \item{data}{A data frame.} @@ -21,8 +21,6 @@ sampling) weights of the survey data (level-1-weight).} \item{nest}{Logical, if \code{TRUE} and \code{by} indicates at least two group variables, then groups are "nested", i.e. groups are now a combination from each group level of the variables in \code{by}.} - -\item{group}{Deprecated. Use \code{by} instead.} } \value{ \code{data}, including the new weighting variables: \code{pweights_a} diff --git a/man/reverse.Rd b/man/reverse.Rd index 6304dffc6..5767908ff 100644 --- a/man/reverse.Rd +++ b/man/reverse.Rd @@ -45,8 +45,10 @@ usually only makes sense when factor levels are numeric, not characters.} tasks. Can be either \itemize{ \item a variable specified as a literal variable name (e.g., \code{column_name}), -\item a string with the variable name (e.g., \code{"column_name"}), or a character -vector of variable names (e.g., \code{c("col1", "col2", "col3")}), +\item a string with the variable name (e.g., \code{"column_name"}), a character +vector of variable names (e.g., \code{c("col1", "col2", "col3")}), or a +character vector of variable names including ranges specified via \code{:} +(e.g., \code{c("col1:col3", "col5")}), \item a formula with variable names (e.g., \code{~column_1 + column_2}), \item a vector of positive integers, giving the positions counting from the left (e.g. 
\code{1} or \code{c(1, 3, 5)}), diff --git a/man/row_count.Rd b/man/row_count.Rd new file mode 100644 index 000000000..7bf54fe5f --- /dev/null +++ b/man/row_count.Rd @@ -0,0 +1,132 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/row_count.R +\name{row_count} +\alias{row_count} +\title{Count specific values row-wise} +\usage{ +row_count( + data, + select = NULL, + exclude = NULL, + count = NULL, + allow_coercion = TRUE, + ignore_case = FALSE, + regex = FALSE, + verbose = TRUE +) +} +\arguments{ +\item{data}{A data frame with at least two columns, where number of specific +values are counted row-wise.} + +\item{select}{Variables that will be included when performing the required +tasks. Can be either +\itemize{ +\item a variable specified as a literal variable name (e.g., \code{column_name}), +\item a string with the variable name (e.g., \code{"column_name"}), a character +vector of variable names (e.g., \code{c("col1", "col2", "col3")}), or a +character vector of variable names including ranges specified via \code{:} +(e.g., \code{c("col1:col3", "col5")}), +\item a formula with variable names (e.g., \code{~column_1 + column_2}), +\item a vector of positive integers, giving the positions counting from the left +(e.g. \code{1} or \code{c(1, 3, 5)}), +\item a vector of negative integers, giving the positions counting from the +right (e.g., \code{-1} or \code{-1:-3}), +\item one of the following select-helpers: \code{starts_with()}, \code{ends_with()}, +\code{contains()}, a range using \code{:} or \code{regex("")}. \code{starts_with()}, +\code{ends_with()}, and \code{contains()} accept several patterns, e.g +\code{starts_with("Sep", "Petal")}. +\item or a function testing for logical conditions, e.g. 
\code{is.numeric()} (or +\code{is.numeric}), or any user-defined function that selects the variables +for which the function returns \code{TRUE} (like: \code{foo <- function(x) mean(x) > 3}), +\item ranges specified via literal variable names, select-helpers (except +\code{regex()}) and (user-defined) functions can be negated, i.e. return +non-matching elements, when prefixed with a \code{-}, e.g. \code{-ends_with("")}, +\code{-is.numeric} or \code{-(Sepal.Width:Petal.Length)}. \strong{Note:} Negation means +that matches are \emph{excluded}, and thus, the \code{exclude} argument can be +used alternatively. For instance, \code{select=-ends_with("Length")} (with +\code{-}) is equivalent to \code{exclude=ends_with("Length")} (no \code{-}). In case +negation should not work as expected, use the \code{exclude} argument instead. +} + +If \code{NULL}, selects all columns. Patterns that found no matches are silently +ignored, e.g. \code{extract_column_names(iris, select = c("Species", "Test"))} +will just return \code{"Species"}.} + +\item{exclude}{See \code{select}, however, column names matched by the pattern +from \code{exclude} will be excluded instead of selected. If \code{NULL} (the default), +excludes no columns.} + +\item{count}{The value for which the row sum should be computed. May be a +numeric value, a character string (for factors or character vectors), \code{NA} or +\code{Inf}.} + +\item{allow_coercion}{Logical. If \code{FALSE}, \code{count} matches only values of same +class (i.e. when \code{count = 2}, the value \code{"2"} is not counted and vice versa). +By default, when \code{allow_coercion = TRUE}, \code{count = 2} also matches \code{"2"}. In +order to count factor levels in the data, use \code{count = factor("level")}. 
See +'Examples'.} + +\item{ignore_case}{Logical, if \code{TRUE} and when one of the select-helpers or +a regular expression is used in \code{select}, ignores lower/upper case in the +search pattern when matching against variable names.} + +\item{regex}{Logical, if \code{TRUE}, the search pattern from \code{select} will be +treated as regular expression. When \code{regex = TRUE}, select \emph{must} be a +character string (or a variable containing a character string) and is not +allowed to be one of the supported select-helpers or a character vector +of length > 1. \code{regex = TRUE} is comparable to using one of the two +select-helpers, \code{select = contains("")} or \code{select = regex("")}, however, +since the select-helpers may not work when called from inside other +functions (see 'Details'), this argument may be used as workaround.} + +\item{verbose}{Toggle warnings.} +} +\value{ +A vector with row-wise counts of values specified in \code{count}. +} +\description{ +\code{row_count()} mimics base R's \code{rowSums()}, with sums for a +specific value indicated by \code{count}. Hence, it is similar to +\code{rowSums(x == count, na.rm = TRUE)}, but offers some more options, including +strict comparisons. Comparisons using \code{==} coerce values to atomic vectors, +thus both \code{2 == 2} and \code{"2" == 2} are \code{TRUE}. In \code{row_count()}, it is also +possible to make "type safe" comparisons using the \code{allow_coercion} argument, +where \code{"2" == 2} is not true. 
+} +\examples{ +dat <- data.frame( + c1 = c(1, 2, NA, 4), + c2 = c(NA, 2, NA, 5), + c3 = c(NA, 4, NA, NA), + c4 = c(2, 3, 7, 8) +) + +# count all 4s per row +row_count(dat, count = 4) +# count all missing values per row +row_count(dat, count = NA) + +dat <- data.frame( + c1 = c("1", "2", NA, "3"), + c2 = c(NA, "2", NA, "3"), + c3 = c(NA, 4, NA, NA), + c4 = c(2, 3, 7, Inf) +) +# count all 2s and "2"s per row +row_count(dat, count = 2) +# only count 2s, but not "2"s +row_count(dat, count = 2, allow_coercion = FALSE) + +dat <- data.frame( + c1 = factor(c("1", "2", NA, "3")), + c2 = c("2", "1", NA, "3"), + c3 = c(NA, 4, NA, NA), + c4 = c(2, 3, 7, Inf) +) +# find only character "2"s +row_count(dat, count = "2", allow_coercion = FALSE) +# find only factor level "2"s +row_count(dat, count = factor("2"), allow_coercion = FALSE) + +} diff --git a/man/row_means.Rd b/man/row_means.Rd index c347fc6f1..43d85b5b0 100644 --- a/man/row_means.Rd +++ b/man/row_means.Rd @@ -2,7 +2,8 @@ % Please edit documentation in R/row_means.R \name{row_means} \alias{row_means} -\title{Row means (optionally with minimum amount of valid values)} +\alias{row_sums} +\title{Row means or sums (optionally with minimum amount of valid values)} \usage{ row_means( data, @@ -15,16 +16,31 @@ row_means( remove_na = FALSE, verbose = TRUE ) + +row_sums( + data, + select = NULL, + exclude = NULL, + min_valid = NULL, + digits = NULL, + ignore_case = FALSE, + regex = FALSE, + remove_na = FALSE, + verbose = TRUE +) } \arguments{ -\item{data}{A data frame with at least two columns, where row means are applied.} +\item{data}{A data frame with at least two columns, where row means or row +sums are applied.} \item{select}{Variables that will be included when performing the required tasks. 
Can be either \itemize{ \item a variable specified as a literal variable name (e.g., \code{column_name}), -\item a string with the variable name (e.g., \code{"column_name"}), or a character -vector of variable names (e.g., \code{c("col1", "col2", "col3")}), +\item a string with the variable name (e.g., \code{"column_name"}), a character +vector of variable names (e.g., \code{c("col1", "col2", "col3")}), or a +character vector of variable names including ranges specified via \code{:} +(e.g., \code{c("col1:col3", "col5")}), \item a formula with variable names (e.g., \code{~column_1 + column_2}), \item a vector of positive integers, giving the positions counting from the left (e.g. \code{1} or \code{c(1, 3, 5)}), @@ -58,9 +74,9 @@ excludes no columns.} \item{min_valid}{Optional, a numeric value of length 1. May either be \itemize{ \item a numeric value that indicates the amount of valid values per row to -calculate the row mean; +calculate the row mean or row sum; \item or a value between \code{0} and \code{1}, indicating a proportion of valid values per -row to calculate the row mean (see 'Details'). +row to calculate the row mean or row sum (see 'Details'). \item \code{NULL} (default), in which all cases are considered. } @@ -84,28 +100,31 @@ since the select-helpers may not work when called from inside other functions (see 'Details'), this argument may be used as workaround.} \item{remove_na}{Logical, if \code{TRUE} (default), removes missing (\code{NA}) values -before calculating row means. Only applies if \code{min_valuid} is not specified.} +before calculating row means or row sums. Only applies if \code{min_valid} is not +specified.} \item{verbose}{Toggle warnings.} } \value{ -A vector with row means for those rows with at least \code{n} valid values. +A vector with row means (for \code{row_means()}) or row sums (for +\code{row_sums()}) for those rows with at least \code{n} valid values. 
} \description{ -This function is similar to the SPSS \code{MEAN.n} function and computes -row means from a data frame or matrix if at least \code{min_valid} values of a row are -valid (and not \code{NA}). +This function is similar to the SPSS \code{MEAN.n} or \code{SUM.n} +function and computes row means or row sums from a data frame or matrix if at +least \code{min_valid} values of a row are valid (and not \code{NA}). } \details{ -Rounding to a negative number of \code{digits} means rounding to a power of -ten, for example \code{row_means(df, 3, digits = -2)} rounds to the nearest hundred. -For \code{min_valid}, if not \code{NULL}, \code{min_valid} must be a numeric value from \code{0} -to \code{ncol(data)}. If a row in the data frame has at least \code{min_valid} -non-missing values, the row mean is returned. If \code{min_valid} is a non-integer -value from 0 to 1, \code{min_valid} is considered to indicate the proportion of -required non-missing values per row. E.g., if \code{min_valid = 0.75}, a row must -have at least \code{ncol(data) * min_valid} non-missing values for the row mean -to be calculated. See 'Examples'. +Rounding to a negative number of \code{digits} means rounding to a power +of ten, for example \code{row_means(df, 3, digits = -2)} rounds to the nearest +hundred. For \code{min_valid}, if not \code{NULL}, \code{min_valid} must be a numeric value +from \code{0} to \code{ncol(data)}. If a row in the data frame has at least \code{min_valid} +non-missing values, the row mean or row sum is returned. If \code{min_valid} is a +non-integer value from 0 to 1, \code{min_valid} is considered to indicate the +proportion of required non-missing values per row. E.g., if +\code{min_valid = 0.75}, a row must have at least \code{ncol(data) * min_valid} +non-missing values for the row mean or row sum to be calculated. See +'Examples'. 
} \examples{ dat <- data.frame( @@ -123,6 +142,7 @@ row_means(dat, remove_na = TRUE) # needs at least 4 non-missing values per row row_means(dat, min_valid = 4) # 1 valid return value +row_sums(dat, min_valid = 4) # 1 valid return value # needs at least 3 non-missing values per row row_means(dat, min_valid = 3) # 2 valid return values @@ -135,6 +155,7 @@ row_means(dat, select = c("c1", "c3"), min_valid = 1) # needs at least 50\% of non-missing values per row row_means(dat, min_valid = 0.5) # 3 valid return values +row_sums(dat, min_valid = 0.5) # needs at least 75\% of non-missing values per row row_means(dat, min_valid = 0.75) # 2 valid return values diff --git a/man/skewness.Rd b/man/skewness.Rd index a89d98067..0401e3a40 100644 --- a/man/skewness.Rd +++ b/man/skewness.Rd @@ -19,7 +19,6 @@ skewness(x, ...) type = "2", iterations = NULL, verbose = TRUE, - na.rm = TRUE, ... ) @@ -31,7 +30,6 @@ kurtosis(x, ...) type = "2", iterations = NULL, verbose = TRUE, - na.rm = TRUE, ... ) @@ -61,8 +59,6 @@ errors. If \code{NULL} (default), parametric standard errors are computed.} \item{verbose}{Toggle warnings and messages.} -\item{na.rm}{Deprecated. Please use \code{remove_na} instead.} - \item{digits}{Number of decimal places.} \item{test}{Logical, if \code{TRUE}, tests if skewness or kurtosis is diff --git a/man/slide.Rd b/man/slide.Rd index ccc6bd7e9..c26943116 100644 --- a/man/slide.Rd +++ b/man/slide.Rd @@ -34,8 +34,10 @@ factors or character vectors to numeric values.} tasks. 
Can be either \itemize{ \item a variable specified as a literal variable name (e.g., \code{column_name}), -\item a string with the variable name (e.g., \code{"column_name"}), or a character -vector of variable names (e.g., \code{c("col1", "col2", "col3")}), +\item a string with the variable name (e.g., \code{"column_name"}), a character +vector of variable names (e.g., \code{c("col1", "col2", "col3")}), or a +character vector of variable names including ranges specified via \code{:} +(e.g., \code{c("col1:col3", "col5")}), \item a formula with variable names (e.g., \code{~column_1 + column_2}), \item a vector of positive integers, giving the positions counting from the left (e.g. \code{1} or \code{c(1, 3, 5)}), diff --git a/man/standardize.Rd b/man/standardize.Rd index 4041f2dc0..fcc8c6ae7 100644 --- a/man/standardize.Rd +++ b/man/standardize.Rd @@ -145,8 +145,10 @@ vectors as well.} tasks. Can be either \itemize{ \item a variable specified as a literal variable name (e.g., \code{column_name}), -\item a string with the variable name (e.g., \code{"column_name"}), or a character -vector of variable names (e.g., \code{c("col1", "col2", "col3")}), +\item a string with the variable name (e.g., \code{"column_name"}), a character +vector of variable names (e.g., \code{c("col1", "col2", "col3")}), or a +character vector of variable names including ranges specified via \code{:} +(e.g., \code{c("col1:col3", "col5")}), \item a formula with variable names (e.g., \code{~column_1 + column_2}), \item a vector of positive integers, giving the positions counting from the left (e.g. \code{1} or \code{c(1, 3, 5)}), diff --git a/man/text_format.Rd b/man/text_format.Rd index 87f045193..14d64b096 100644 --- a/man/text_format.Rd +++ b/man/text_format.Rd @@ -2,7 +2,6 @@ % Please edit documentation in R/text_format.R \name{text_format} \alias{text_format} -\alias{format_text} \alias{text_fullstop} \alias{text_lastchar} \alias{text_concatenate} @@ -20,15 +19,6 @@ text_format( ... 
) -format_text( - text, - sep = ", ", - last = " and ", - width = NULL, - enclose = NULL, - ... -) - text_fullstop(text) text_lastchar(text, n = 1) @@ -63,7 +53,11 @@ text elements will not be enclosed.} \item{pattern}{Character vector. For \code{data_rename()}, indicates columns that should be selected for renaming. Can be \code{NULL} (in which case all columns are selected). For \code{data_addprefix()} or \code{data_addsuffix()}, a character -string, which will be added as prefix or suffix to the column names.} +string, which will be added as prefix or suffix to the column names. For +\code{data_rename()}, \code{pattern} can also be a named vector. In this case, names +are used as values for the \code{replacement} argument (i.e. \code{pattern} can be a +character vector using \verb{ = ""} and argument \code{replacement} +will be ignored then).} } \value{ A character string. diff --git a/man/to_factor.Rd b/man/to_factor.Rd index e035769ec..d544bdaae 100644 --- a/man/to_factor.Rd +++ b/man/to_factor.Rd @@ -36,8 +36,10 @@ the values of \code{x} (i.e. as if using \code{as.factor()}).} tasks. Can be either \itemize{ \item a variable specified as a literal variable name (e.g., \code{column_name}), -\item a string with the variable name (e.g., \code{"column_name"}), or a character -vector of variable names (e.g., \code{c("col1", "col2", "col3")}), +\item a string with the variable name (e.g., \code{"column_name"}), a character +vector of variable names (e.g., \code{c("col1", "col2", "col3")}), or a +character vector of variable names including ranges specified via \code{:} +(e.g., \code{c("col1:col3", "col5")}), \item a formula with variable names (e.g., \code{~column_1 + column_2}), \item a vector of positive integers, giving the positions counting from the left (e.g. \code{1} or \code{c(1, 3, 5)}), diff --git a/man/to_numeric.Rd b/man/to_numeric.Rd index 7478c9579..39f04c3a9 100644 --- a/man/to_numeric.Rd +++ b/man/to_numeric.Rd @@ -11,7 +11,7 @@ to_numeric(x, ...) 
x, select = NULL, exclude = NULL, - dummy_factors = TRUE, + dummy_factors = FALSE, preserve_levels = FALSE, lowest = NULL, append = FALSE, @@ -30,8 +30,10 @@ to_numeric(x, ...) tasks. Can be either \itemize{ \item a variable specified as a literal variable name (e.g., \code{column_name}), -\item a string with the variable name (e.g., \code{"column_name"}), or a character -vector of variable names (e.g., \code{c("col1", "col2", "col3")}), +\item a string with the variable name (e.g., \code{"column_name"}), a character +vector of variable names (e.g., \code{c("col1", "col2", "col3")}), or a +character vector of variable names including ranges specified via \code{:} +(e.g., \code{c("col1:col3", "col5")}), \item a formula with variable names (e.g., \code{~column_1 + column_2}), \item a vector of positive integers, giving the positions counting from the left (e.g. \code{1} or \code{c(1, 3, 5)}), @@ -107,11 +109,11 @@ either numeric levels or dummy variables. The "counterpart" to convert variables into factors is \code{to_factor()}. } \note{ -By default, \code{to_numeric()} converts factors into "binary" dummies, i.e. +When factors should be converted into multiple "binary" dummies, i.e. each factor level is converted into a separate column filled with a binary -0-1 value. If only one column is required, use \code{dummy_factors = FALSE}. If -you want to preserve the original factor levels (in case these represent -numeric values), use \code{preserve_levels = TRUE}. +0-1 value, set \code{dummy_factors = TRUE}. If you want to preserve the original +factor levels (in case these represent numeric values), use +\code{preserve_levels = TRUE}. } \section{Selection of variables - \code{select} argument}{ @@ -126,12 +128,12 @@ to also include the original variables in the returned data frame. 
\examples{ to_numeric(head(ToothGrowth)) -to_numeric(head(ToothGrowth), dummy_factors = FALSE) +to_numeric(head(ToothGrowth), dummy_factors = TRUE) # factors x <- as.factor(mtcars$gear) -to_numeric(x, dummy_factors = FALSE) -to_numeric(x, dummy_factors = FALSE, preserve_levels = TRUE) +to_numeric(x) +to_numeric(x, preserve_levels = TRUE) # same as: coerce_to_numeric(x) diff --git a/pkgdown/_pkgdown.yaml b/pkgdown/_pkgdown.yaml index d52994e16..31ec901d0 100644 --- a/pkgdown/_pkgdown.yaml +++ b/pkgdown/_pkgdown.yaml @@ -71,6 +71,7 @@ reference: - kurtosis - smoothness - skewness + - row_count - row_means - weighted_mean - mean_sd @@ -125,6 +126,11 @@ reference: - nhanes_sample articles: + - title: Overview of vignettes + navbar: ~ + contents: + - overview_of_vignettes + - title: Data Preparation desc: | Articles explaining utility of 'datawizard' for data wrangling diff --git a/tests/testthat/_snaps/categorize.md b/tests/testthat/_snaps/categorize.md new file mode 100644 index 000000000..9ed3c1115 --- /dev/null +++ b/tests/testthat/_snaps/categorize.md @@ -0,0 +1,46 @@ +# categorize labelling ranged + + Code + categorize(mtcars$mpg, "equal_length", n_groups = 5) + Output + [1] 3 3 3 3 2 2 1 3 3 2 2 2 2 2 1 1 1 5 5 5 3 2 2 1 2 4 4 5 2 2 1 3 + +--- + + Code + categorize(mtcars$mpg, "equal_length", n_groups = 5, labels = "range") + Output + [1] [19.8,24.5) [19.8,24.5) [19.8,24.5) [19.8,24.5) [15.1,19.8) [15.1,19.8) + [7] [10.4,15.1) [19.8,24.5) [19.8,24.5) [15.1,19.8) [15.1,19.8) [15.1,19.8) + [13] [15.1,19.8) [15.1,19.8) [10.4,15.1) [10.4,15.1) [10.4,15.1) [29.2,33.9] + [19] [29.2,33.9] [29.2,33.9] [19.8,24.5) [15.1,19.8) [15.1,19.8) [10.4,15.1) + [25] [15.1,19.8) [24.5,29.2) [24.5,29.2) [29.2,33.9] [15.1,19.8) [15.1,19.8) + [31] [10.4,15.1) [19.8,24.5) + Levels: [10.4,15.1) [15.1,19.8) [19.8,24.5) [24.5,29.2) [29.2,33.9] + +--- + + Code + categorize(mtcars$mpg, "equal_length", n_groups = 5, labels = "observed") + Output + [1] (21-24.4) (21-24.4) (21-24.4) (21-24.4) 
(15.2-19.7) (15.2-19.7) + [7] (10.4-15) (21-24.4) (21-24.4) (15.2-19.7) (15.2-19.7) (15.2-19.7) + [13] (15.2-19.7) (15.2-19.7) (10.4-15) (10.4-15) (10.4-15) (30.4-33.9) + [19] (30.4-33.9) (30.4-33.9) (21-24.4) (15.2-19.7) (15.2-19.7) (10.4-15) + [25] (15.2-19.7) (26-27.3) (26-27.3) (30.4-33.9) (15.2-19.7) (15.2-19.7) + [31] (10.4-15) (21-24.4) + Levels: (10.4-15) (15.2-19.7) (21-24.4) (26-27.3) (30.4-33.9) + +# categorize breaks + + Code + categorize(mtcars$mpg, "equal_length", n_groups = 5, labels = "range", breaks = "inclusive") + Output + [1] (19.8,24.5] (19.8,24.5] (19.8,24.5] (19.8,24.5] (15.1,19.8] (15.1,19.8] + [7] [10.4,15.1] (19.8,24.5] (19.8,24.5] (15.1,19.8] (15.1,19.8] (15.1,19.8] + [13] (15.1,19.8] (15.1,19.8] [10.4,15.1] [10.4,15.1] [10.4,15.1] (29.2,33.9] + [19] (29.2,33.9] (29.2,33.9] (19.8,24.5] (15.1,19.8] (15.1,19.8] [10.4,15.1] + [25] (15.1,19.8] (24.5,29.2] (24.5,29.2] (29.2,33.9] (15.1,19.8] (15.1,19.8] + [31] [10.4,15.1] (19.8,24.5] + Levels: [10.4,15.1] (15.1,19.8] (19.8,24.5] (24.5,29.2] (29.2,33.9] + diff --git a/tests/testthat/_snaps/data_codebook.md b/tests/testthat/_snaps/data_codebook.md index c390ba890..8f9b9e7b5 100644 --- a/tests/testthat/_snaps/data_codebook.md +++ b/tests/testthat/_snaps/data_codebook.md @@ -139,7 +139,7 @@ # data_codebook efc Code - data_codebook(efc) + print(data_codebook(efc), table_width = Inf) Output efc (100 rows and 5 variables, 5 shown) @@ -162,10 +162,94 @@ 5 | neg_c_7 | Negative impact with 7 items | numeric | 3 (3.0%) | [7, 28] | | 97 --------------------------------------------------------------------------------------------------------------------------------------------- +--- + + Code + print(data_codebook(efc), table_width = "auto", remove_duplicates = FALSE) + Output + efc (100 rows and 5 variables, 5 shown) + + ID | Name | Label | Type + ---+----------+------------------------------------------+------------ + 1 | c12hour | average number of hours of care per week | numeric + 
---+----------+------------------------------------------+------------ + 2 | e16sex | elder's gender | numeric + | | | + ---+----------+------------------------------------------+------------ + 3 | e42dep | elder's dependency | categorical + | | | + | | | + | | | + ---+----------+------------------------------------------+------------ + 4 | c172code | carer's level of education | numeric + | | | + | | | + ---+----------+------------------------------------------+------------ + 5 | neg_c_7 | Negative impact with 7 items | numeric + ---------------------------------------------------------------------- + + ID | Missings | Values | Value Labels | N + ---+------------+----------+---------------------------------+----------- + 1 | 2 (2.0%) | [5, 168] | | 98 + ---+------------+----------+---------------------------------+----------- + 2 | 0 (0.0%) | 1 | male | 46 (46.0%) + | | 2 | female | 54 (54.0%) + ---+------------+----------+---------------------------------+----------- + 3 | 3 (3.0%) | 1 | independent | 2 ( 2.1%) + | | 2 | slightly dependent | 4 ( 4.1%) + | | 3 | moderately dependent | 28 (28.9%) + | | 4 | severely dependent | 63 (64.9%) + ---+------------+----------+---------------------------------+----------- + 4 | 10 (10.0%) | 1 | low level of education | 8 ( 8.9%) + | | 2 | intermediate level of education | 66 (73.3%) + | | 3 | high level of education | 16 (17.8%) + ---+------------+----------+---------------------------------+----------- + 5 | 3 (3.0%) | [7, 28] | | 97 + ------------------------------------------------------------------------- + +--- + + Code + print(data_codebook(efc), table_width = "auto", remove_duplicates = TRUE) + Output + efc (100 rows and 5 variables, 5 shown) + + ID | Name | Label | Type + ---+----------+------------------------------------------+------------ + 1 | c12hour | average number of hours of care per week | numeric + ---+----------+------------------------------------------+------------ + 2 | e16sex | elder's gender | 
numeric + ---+----------+------------------------------------------+------------ + 3 | e42dep | elder's dependency | categorical + ---+----------+------------------------------------------+------------ + 4 | c172code | carer's level of education | numeric + ---+----------+------------------------------------------+------------ + 5 | neg_c_7 | Negative impact with 7 items | numeric + ---------------------------------------------------------------------- + + ID | Missings | Values | Value Labels | N + ---+------------+----------+---------------------------------+----------- + 1 | 2 (2.0%) | [5, 168] | | 98 + ---+------------+----------+---------------------------------+----------- + 2 | 0 (0.0%) | 1 | male | 46 (46.0%) + | | 2 | female | 54 (54.0%) + ---+------------+----------+---------------------------------+----------- + 3 | 3 (3.0%) | 1 | independent | 2 ( 2.1%) + | | 2 | slightly dependent | 4 ( 4.1%) + | | 3 | moderately dependent | 28 (28.9%) + | | 4 | severely dependent | 63 (64.9%) + ---+------------+----------+---------------------------------+----------- + 4 | 10 (10.0%) | 1 | low level of education | 8 ( 8.9%) + | | 2 | intermediate level of education | 66 (73.3%) + | | 3 | high level of education | 16 (17.8%) + ---+------------+----------+---------------------------------+----------- + 5 | 3 (3.0%) | [7, 28] | | 97 + ------------------------------------------------------------------------- + # data_codebook efc, variable_label_width Code - data_codebook(efc, variable_label_width = 30) + print(out, table_width = Inf) Output efc (100 rows and 5 variables, 5 shown) @@ -189,10 +273,97 @@ 5 | neg_c_7 | Negative impact with 7 items | numeric | 3 (3.0%) | [7, 28] | | 97 --------------------------------------------------------------------------------------------------------------------------------- +--- + + Code + print(out, table_width = "auto", remove_duplicates = FALSE) + Output + efc (100 rows and 5 variables, 5 shown) + + ID | Name | Label | Type | 
Missings + ---+----------+------------------------------+-------------+----------- + 1 | c12hour | average number of hours of | numeric | 2 (2.0%) + | | care per week | | + ---+----------+------------------------------+-------------+----------- + 2 | e16sex | elder's gender | numeric | 0 (0.0%) + | | | | + ---+----------+------------------------------+-------------+----------- + 3 | e42dep | elder's dependency | categorical | 3 (3.0%) + | | | | + | | | | + | | | | + ---+----------+------------------------------+-------------+----------- + 4 | c172code | carer's level of education | numeric | 10 (10.0%) + | | | | + | | | | + ---+----------+------------------------------+-------------+----------- + 5 | neg_c_7 | Negative impact with 7 items | numeric | 3 (3.0%) + ----------------------------------------------------------------------- + + ID | Values | Value Labels | N + ---+----------+---------------------------------+----------- + 1 | [5, 168] | | 98 + | | | + ---+----------+---------------------------------+----------- + 2 | 1 | male | 46 (46.0%) + | 2 | female | 54 (54.0%) + ---+----------+---------------------------------+----------- + 3 | 1 | independent | 2 ( 2.1%) + | 2 | slightly dependent | 4 ( 4.1%) + | 3 | moderately dependent | 28 (28.9%) + | 4 | severely dependent | 63 (64.9%) + ---+----------+---------------------------------+----------- + 4 | 1 | low level of education | 8 ( 8.9%) + | 2 | intermediate level of education | 66 (73.3%) + | 3 | high level of education | 16 (17.8%) + ---+----------+---------------------------------+----------- + 5 | [7, 28] | | 97 + ------------------------------------------------------------ + +--- + + Code + print(out, table_width = "auto", remove_duplicates = TRUE) + Output + efc (100 rows and 5 variables, 5 shown) + + ID | Name | Label | Type | Missings + ---+----------+------------------------------+-------------+----------- + 1 | c12hour | average number of hours of | numeric | 2 (2.0%) + | | care per week | | + 
---+----------+------------------------------+-------------+----------- + 2 | e16sex | elder's gender | numeric | 0 (0.0%) + ---+----------+------------------------------+-------------+----------- + 3 | e42dep | elder's dependency | categorical | 3 (3.0%) + ---+----------+------------------------------+-------------+----------- + 4 | c172code | carer's level of education | numeric | 10 (10.0%) + ---+----------+------------------------------+-------------+----------- + 5 | neg_c_7 | Negative impact with 7 items | numeric | 3 (3.0%) + ----------------------------------------------------------------------- + + ID | Values | Value Labels | N + ---+----------+---------------------------------+----------- + 1 | [5, 168] | | 98 + ---+----------+---------------------------------+----------- + 2 | 1 | male | 46 (46.0%) + | 2 | female | 54 (54.0%) + ---+----------+---------------------------------+----------- + 3 | 1 | independent | 2 ( 2.1%) + | 2 | slightly dependent | 4 ( 4.1%) + | 3 | moderately dependent | 28 (28.9%) + | 4 | severely dependent | 63 (64.9%) + ---+----------+---------------------------------+----------- + 4 | 1 | low level of education | 8 ( 8.9%) + | 2 | intermediate level of education | 66 (73.3%) + | 3 | high level of education | 16 (17.8%) + ---+----------+---------------------------------+----------- + 5 | [7, 28] | | 97 + ------------------------------------------------------------ + # data_codebook efc, value_label_width Code - data_codebook(efc, variable_label_width = 30, value_label_width = 15) + print(out, table_width = Inf) Output efc (100 rows and 5 variables, 5 shown) @@ -216,6 +387,93 @@ 5 | neg_c_7 | Negative impact with 7 items | numeric | 3 (3.0%) | [7, 28] | | 97 ------------------------------------------------------------------------------------------------------------------ +--- + + Code + print(out, table_width = "auto", remove_duplicates = FALSE) + Output + efc (100 rows and 5 variables, 5 shown) + + ID | Name | Label | Type | 
Missings + ---+----------+------------------------------+-------------+----------- + 1 | c12hour | average number of hours of | numeric | 2 (2.0%) + | | care per week | | + ---+----------+------------------------------+-------------+----------- + 2 | e16sex | elder's gender | numeric | 0 (0.0%) + | | | | + ---+----------+------------------------------+-------------+----------- + 3 | e42dep | elder's dependency | categorical | 3 (3.0%) + | | | | + | | | | + | | | | + ---+----------+------------------------------+-------------+----------- + 4 | c172code | carer's level of education | numeric | 10 (10.0%) + | | | | + | | | | + ---+----------+------------------------------+-------------+----------- + 5 | neg_c_7 | Negative impact with 7 items | numeric | 3 (3.0%) + ----------------------------------------------------------------------- + + ID | Values | Value Labels | N + ---+----------+------------------+----------- + 1 | [5, 168] | | 98 + | | | + ---+----------+------------------+----------- + 2 | 1 | male | 46 (46.0%) + | 2 | female | 54 (54.0%) + ---+----------+------------------+----------- + 3 | 1 | independent | 2 ( 2.1%) + | 2 | slightly... | 4 ( 4.1%) + | 3 | moderately... | 28 (28.9%) + | 4 | severely... | 63 (64.9%) + ---+----------+------------------+----------- + 4 | 1 | low level of... | 8 ( 8.9%) + | 2 | intermediate... | 66 (73.3%) + | 3 | high level of... 
| 16 (17.8%) + ---+----------+------------------+----------- + 5 | [7, 28] | | 97 + --------------------------------------------- + +--- + + Code + print(out, table_width = "auto", remove_duplicates = TRUE) + Output + efc (100 rows and 5 variables, 5 shown) + + ID | Name | Label | Type | Missings + ---+----------+------------------------------+-------------+----------- + 1 | c12hour | average number of hours of | numeric | 2 (2.0%) + | | care per week | | + ---+----------+------------------------------+-------------+----------- + 2 | e16sex | elder's gender | numeric | 0 (0.0%) + ---+----------+------------------------------+-------------+----------- + 3 | e42dep | elder's dependency | categorical | 3 (3.0%) + ---+----------+------------------------------+-------------+----------- + 4 | c172code | carer's level of education | numeric | 10 (10.0%) + ---+----------+------------------------------+-------------+----------- + 5 | neg_c_7 | Negative impact with 7 items | numeric | 3 (3.0%) + ----------------------------------------------------------------------- + + ID | Values | Value Labels | N + ---+----------+------------------+----------- + 1 | [5, 168] | | 98 + ---+----------+------------------+----------- + 2 | 1 | male | 46 (46.0%) + | 2 | female | 54 (54.0%) + ---+----------+------------------+----------- + 3 | 1 | independent | 2 ( 2.1%) + | 2 | slightly... | 4 ( 4.1%) + | 3 | moderately... | 28 (28.9%) + | 4 | severely... | 63 (64.9%) + ---+----------+------------------+----------- + 4 | 1 | low level of... | 8 ( 8.9%) + | 2 | intermediate... | 66 (73.3%) + | 3 | high level of... 
| 16 (17.8%) + ---+----------+------------------+----------- + 5 | [7, 28] | | 97 + --------------------------------------------- + # data_codebook truncated data Code diff --git a/tests/testthat/_snaps/data_tabulate.md b/tests/testthat/_snaps/data_tabulate.md index 59a20dc01..ffde63088 100644 --- a/tests/testthat/_snaps/data_tabulate.md +++ b/tests/testthat/_snaps/data_tabulate.md @@ -259,7 +259,7 @@ Code print(data_tabulate(efc$c172code, by = efc$e16sex, proportions = "full", - include_na = FALSE)) + remove_na = TRUE)) Output efc$c172code | male | female | Total -------------+------------+------------+------ @@ -288,7 +288,7 @@ Code print(data_tabulate(efc$c172code, by = efc$e16sex, proportions = "full", - include_na = FALSE, weights = efc$weights)) + remove_na = TRUE, weights = efc$weights)) Output efc$c172code | male | female | Total -------------+------------+------------+------ @@ -317,7 +317,7 @@ Code print(data_tabulate(efc, "c172code", by = efc$e16sex, proportions = "row", - include_na = FALSE)) + remove_na = TRUE)) Output c172code | male | female | Total ---------+------------+------------+------ @@ -348,7 +348,7 @@ Code print(data_tabulate(efc, "c172code", by = efc$e16sex, proportions = "row", - include_na = FALSE, weights = efc$weights)) + remove_na = TRUE, weights = efc$weights)) Output c172code | male | female | Total ---------+------------+------------+------ @@ -378,7 +378,7 @@ Code print(data_tabulate(efc, "c172code", by = "e16sex", proportions = "column", - include_na = FALSE)) + remove_na = TRUE)) Output c172code | male | female | Total ---------+------------+------------+------ @@ -409,7 +409,7 @@ Code print(data_tabulate(efc, "c172code", by = "e16sex", proportions = "column", - include_na = FALSE, weights = "weights")) + remove_na = TRUE, weights = "weights")) Output c172code | male | female | Total ---------+------------+------------+------ @@ -497,7 +497,7 @@ Code print_md(data_tabulate(efc$c172code, by = efc$e16sex, proportions = "full", - 
include_na = FALSE)) + remove_na = TRUE)) Output [1] "|efc$c172code | male| female| Total|" [2] "|:------------|----------:|----------:|-----:|" @@ -534,7 +534,7 @@ Code print_md(data_tabulate(efc$c172code, by = efc$e16sex, proportions = "full", - include_na = FALSE, weights = efc$weights)) + remove_na = TRUE, weights = efc$weights)) Output [1] "|efc$c172code | male| female| Total|" [2] "|:------------|----------:|----------:|-----:|" diff --git a/tests/testthat/_snaps/data_to_numeric.md b/tests/testthat/_snaps/data_to_numeric.md index 42cb00b67..e963890a5 100644 --- a/tests/testthat/_snaps/data_to_numeric.md +++ b/tests/testthat/_snaps/data_to_numeric.md @@ -1,7 +1,7 @@ # convert data frame to numeric Code - to_numeric(head(ToothGrowth)) + to_numeric(head(ToothGrowth), dummy_factors = TRUE) Output len supp.OJ supp.VC dose 1 4.2 0 1 0.5 @@ -27,7 +27,7 @@ # convert factor to numeric Code - to_numeric(f) + to_numeric(f, dummy_factors = TRUE) Output a c i s t 1 0 0 0 1 0 diff --git a/tests/testthat/_snaps/demean.md b/tests/testthat/_snaps/demean.md index 7f12d263d..a1c2da4a3 100644 --- a/tests/testthat/_snaps/demean.md +++ b/tests/testthat/_snaps/demean.md @@ -23,13 +23,13 @@ Code head(x) Output - Sepal.Length_between Species_between binary_between Species_setosa_between - 1 5.925000 0.850000 0.375 0.4250000 - 2 5.925000 0.850000 0.375 0.4250000 - 3 5.925000 0.850000 0.375 0.4250000 - 4 5.862222 1.133333 0.400 0.2888889 - 5 5.925000 0.850000 0.375 0.4250000 - 6 5.862222 1.133333 0.400 0.2888889 + Sepal.Length_between binary_between Species_between Species_setosa_between + 1 5.925000 0.375 0.850000 0.4250000 + 2 5.925000 0.375 0.850000 0.4250000 + 3 5.925000 0.375 0.850000 0.4250000 + 4 5.862222 0.400 1.133333 0.2888889 + 5 5.925000 0.375 0.850000 0.4250000 + 6 5.862222 0.400 1.133333 0.2888889 Species_versicolor_between Species_virginica_between Sepal.Length_within 1 0.3000000 0.2750000 -0.8250000 2 0.3000000 0.2750000 -1.0250000 @@ -37,13 +37,13 @@ 4 0.2888889 
0.4222222 -1.2622222 5 0.3000000 0.2750000 -0.9250000 6 0.2888889 0.4222222 -0.4622222 - Species_within binary_within Species_setosa_within Species_versicolor_within - 1 -0.850000 -0.375 0.5750000 -0.3000000 - 2 -0.850000 0.625 0.5750000 -0.3000000 - 3 -0.850000 -0.375 0.5750000 -0.3000000 - 4 -1.133333 0.600 0.7111111 -0.2888889 - 5 -0.850000 0.625 0.5750000 -0.3000000 - 6 -1.133333 -0.400 0.7111111 -0.2888889 + binary_within Species_within Species_setosa_within Species_versicolor_within + 1 -0.375 -0.850000 0.5750000 -0.3000000 + 2 0.625 -0.850000 0.5750000 -0.3000000 + 3 -0.375 -0.850000 0.5750000 -0.3000000 + 4 0.600 -1.133333 0.7111111 -0.2888889 + 5 0.625 -0.850000 0.5750000 -0.3000000 + 6 -0.400 -1.133333 0.7111111 -0.2888889 Species_virginica_within 1 -0.2750000 2 -0.2750000 diff --git a/tests/testthat/test-categorize.R b/tests/testthat/test-categorize.R index 0e0b5d317..30453d9ad 100644 --- a/tests/testthat/test-categorize.R +++ b/tests/testthat/test-categorize.R @@ -1,5 +1,5 @@ set.seed(123) -d <- sample(1:10, size = 500, replace = TRUE) +d <- sample.int(10, size = 500, replace = TRUE) test_that("recode median", { expect_identical(categorize(d), ifelse(d >= median(d), 2, 1)) @@ -22,7 +22,7 @@ test_that("recode quantile", { }) set.seed(123) -d <- sample(1:100, size = 1000, replace = TRUE) +d <- sample.int(100, size = 1000, replace = TRUE) test_that("recode range", { expect_error(categorize(d, split = "range")) @@ -84,7 +84,7 @@ test_that("recode length", { }) set.seed(123) -x <- sample(1:10, size = 30, replace = TRUE) +x <- sample.int(10, size = 30, replace = TRUE) test_that("recode factor labels", { expect_type(categorize(x, "equal_length", n_groups = 3), "double") expect_s3_class(categorize(x, "equal_length", n_groups = 3, labels = c("low", "mid", "high")), "factor") @@ -232,3 +232,21 @@ test_that("categorize regex", { categorize(mtcars, select = "mpg") ) }) + + +# labelling ranges ------------------------------ +test_that("categorize labelling ranged", 
{ + data(mtcars) + expect_snapshot(categorize(mtcars$mpg, "equal_length", n_groups = 5)) + expect_snapshot(categorize(mtcars$mpg, "equal_length", n_groups = 5, labels = "range")) + expect_snapshot(categorize(mtcars$mpg, "equal_length", n_groups = 5, labels = "observed")) +}) + +test_that("categorize breaks", { + data(mtcars) + expect_snapshot(categorize(mtcars$mpg, "equal_length", n_groups = 5, labels = "range", breaks = "inclusive")) + expect_error( + categorize(mtcars$mpg, "equal_length", n_groups = 5, breaks = "something"), + regex = "should be one of" + ) +}) diff --git a/tests/testthat/test-center.R b/tests/testthat/test-center.R index 7bff1ebc9..e7e347848 100644 --- a/tests/testthat/test-center.R +++ b/tests/testthat/test-center.R @@ -169,8 +169,7 @@ test_that("center, factors (grouped data)", { poorman::ungroup() %>% poorman::pull(Species) - manual <- iris %>% - poorman::pull(Species) + manual <- poorman::pull(iris, Species) expect_identical(datawizard, manual) }) diff --git a/tests/testthat/test-coef_var.R b/tests/testthat/test-coef_var.R index a55eb7b96..2ae2275cd 100644 --- a/tests/testthat/test-coef_var.R +++ b/tests/testthat/test-coef_var.R @@ -29,10 +29,6 @@ test_that("coef_var: argument 'remove_na' works", { ) }) -test_that("coef_var: deprecation warning", { - expect_warning(coef_var(c(1:10, NA), na.rm = TRUE)) -}) - test_that("coef_var: method 'unbiased' needs argument 'n' when sigma and mu are provided", { expect_error( coef_var(1:10, method = "unbiased", mu = 10, sigma = 20), diff --git a/tests/testthat/test-data_codebook.R b/tests/testthat/test-data_codebook.R index 26a67ccf6..06e9bd2f9 100644 --- a/tests/testthat/test-data_codebook.R +++ b/tests/testthat/test-data_codebook.R @@ -19,7 +19,7 @@ test_that("data_codebook NaN and Inf", { set.seed(123) d <- data.frame( - x = c(sample(1:15, 100, TRUE), Inf, Inf) + x = c(sample.int(15, 100, TRUE), Inf, Inf) ) expect_snapshot(data_codebook(d)) expect_snapshot(data_codebook(d, range_at = 100)) @@ -38,24 
+38,32 @@ test_that("data_codebook iris, select, ID", { test_that("data_codebook efc", { - expect_snapshot(data_codebook(efc)) + expect_snapshot(print(data_codebook(efc), table_width = Inf)) + expect_snapshot(print(data_codebook(efc), table_width = "auto", remove_duplicates = FALSE)) + expect_snapshot(print(data_codebook(efc), table_width = "auto", remove_duplicates = TRUE)) }) test_that("data_codebook efc, variable_label_width", { - expect_snapshot(data_codebook(efc, variable_label_width = 30)) + out <- data_codebook(efc, variable_label_width = 30) + expect_snapshot(print(out, table_width = Inf)) + expect_snapshot(print(out, table_width = "auto", remove_duplicates = FALSE)) + expect_snapshot(print(out, table_width = "auto", remove_duplicates = TRUE)) }) test_that("data_codebook efc, value_label_width", { - expect_snapshot(data_codebook(efc, variable_label_width = 30, value_label_width = 15)) + out <- data_codebook(efc, variable_label_width = 30, value_label_width = 15) + expect_snapshot(print(out, table_width = Inf)) + expect_snapshot(print(out, table_width = "auto", remove_duplicates = FALSE)) + expect_snapshot(print(out, table_width = "auto", remove_duplicates = TRUE)) }) test_that("data_codebook truncated data", { set.seed(123) d <- data.frame( - a = sample(1:15, 100, TRUE), + a = sample.int(15, 100, TRUE), b = sample(letters[1:18], 100, TRUE), stringsAsFactors = FALSE ) @@ -66,7 +74,7 @@ test_that("data_codebook truncated data", { test_that("data_codebook mixed numeric lengths", { set.seed(123) d <- data.frame( - a = sample(1:4, 100, TRUE), + a = sample.int(4, 100, TRUE), b = sample(5:15, 100, TRUE), stringsAsFactors = FALSE ) @@ -76,7 +84,7 @@ test_that("data_codebook mixed numeric lengths", { test_that("data_codebook mixed range_at", { set.seed(123) d <- data.frame( - a = sample(1:4, 100, TRUE), + a = sample.int(4, 100, TRUE), b = sample(5:15, 100, TRUE), stringsAsFactors = FALSE ) @@ -87,7 +95,7 @@ test_that("data_codebook mixed range_at", { 
test_that("data_codebook logicals", { set.seed(123) d <- data.frame( - a = sample(1:15, 100, TRUE), + a = sample.int(15, 100, TRUE), b = sample(letters[1:3], 100, TRUE), c = sample(c(TRUE, FALSE), 100, TRUE), stringsAsFactors = FALSE @@ -99,14 +107,14 @@ test_that("data_codebook logicals", { test_that("data_codebook labelled data exceptions", { set.seed(123) - f1 <- sample(1:5, 100, TRUE) + f1 <- sample.int(5, 100, TRUE) f1[f1 == 4] <- NA attr(f1, "labels") <- setNames(1:5, c("One", "Two", "Three", "Four", "Five")) - f2 <- sample(1:5, 100, TRUE) + f2 <- sample.int(5, 100, TRUE) attr(f2, "labels") <- setNames(c(1:3, 5), c("One", "Two", "Three", "Five")) - f3 <- sample(1:5, 100, TRUE) + f3 <- sample.int(5, 100, TRUE) attr(f3, "labels") <- setNames(1:5, c("One", "Two", "Three", "Four", "Five")) d <- data.frame(f1, f2, f3) @@ -143,7 +151,7 @@ test_that("data_codebook works with numbers < 1", { test_that("data_codebook, big marks", { set.seed(123) f1 <- factor(sample(c("c", "b", "a"), 1e6, TRUE)) - f2 <- factor(sample(1:3, 1e6, TRUE)) + f2 <- factor(sample.int(3, 1e6, TRUE)) d <- data.frame(f1, f2) expect_snapshot(data_codebook(d)) }) diff --git a/tests/testthat/test-data_match.R b/tests/testthat/test-data_match.R index 75991b4b2..1a40f39fd 100644 --- a/tests/testthat/test-data_match.R +++ b/tests/testthat/test-data_match.R @@ -52,7 +52,7 @@ test_that("data_match works with missing data", { data.frame(c172code = 1, e16sex = 2), match = "not", return_indices = TRUE, - drop_na = FALSE + remove_na = FALSE )) expect_identical(x1, 41L) x1 <- length(data_match( @@ -60,7 +60,7 @@ test_that("data_match works with missing data", { data.frame(c172code = 1, e16sex = 2), match = "not", return_indices = TRUE, - drop_na = TRUE + remove_na = TRUE )) expect_identical(x1, 36L) }) diff --git a/tests/testthat/test-data_modify.R b/tests/testthat/test-data_modify.R index 9bb0a92d6..a7a153c43 100644 --- a/tests/testthat/test-data_modify.R +++ b/tests/testthat/test-data_modify.R @@ -353,6 
+353,16 @@ test_that("data_modify errors for non df", { }) +test_that("data_modify errors for empty data frames", { + data(mtcars) + x <- mtcars[1, ] + expect_error( + data_modify(x[-1, ], new_var = 5), + regex = "empty data frame" + ) +}) + + test_that("data_modify errors for non df", { data(efc) a <- "center(c22hour)" # <---------------- error in variable name @@ -492,6 +502,20 @@ test_that("data_modify works with functions that return character vectors", { }) +test_that("data_modify 1:n() and similar works in (grouped) data frames", { + data(mtcars) + out <- data_modify(mtcars, Trials = 1:n()) # nolint + expect_identical(out$Trials, 1:32) + x <- data_group(mtcars, "gear") + out <- data_modify(x, Trials = 1:n()) # nolint + expect_identical(out$Trials[out$gear == 3], 1:15) + expect_identical(out$Trials[out$gear == 4], 1:12) + out <- data_modify(x, Trials = 3:(n() + 2)) + expect_identical(out$Trials[out$gear == 3], 3:17) + expect_identical(out$Trials[out$gear == 4], 3:14) +}) + + test_that("data_modify .if/.at arguments", { data(iris) d <- iris[1:5, ] @@ -550,3 +574,31 @@ test_that("data_modify .if/.at arguments", { out <- data_modify(d, new_length = Petal.Length * 2, .if = is.numeric, .modify = round) expect_equal(out$new_length, c(3, 3, 3, 3, 3), ignore_attr = TRUE) }) + + +skip_if_not_installed("withr") + +withr::with_environment( + new.env(), + test_that("data_modify 1:n() and similar works in (grouped) data frames inside function calls", { + data(mtcars) + x <- data_group(mtcars, "gear") + + foo <- function(d) { + out <- data_modify(d, Trials = 1:n()) # nolint + out$Trials + } + expect_identical( + foo(x), + c( + 1L, 2L, 3L, 1L, 2L, 3L, 4L, 4L, 5L, 6L, 7L, 5L, 6L, 7L, 8L, + 9L, 10L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 11L, 1L, 2L, 3L, + 4L, 5L, 12L + ) + ) + }) +) + +test_that("data_modify errors on non-defined function", { + expect_error(data_modify(iris, Species = foo())) +}) diff --git a/tests/testthat/test-data_read.R b/tests/testthat/test-data_read.R 
index fd4884deb..15f1161d3 100644 --- a/tests/testthat/test-data_read.R +++ b/tests/testthat/test-data_read.R @@ -141,12 +141,12 @@ test_that("data_read - RDS file, matrix, coercible", { httr::stop_for_status(request) writeBin(httr::content(request, type = "raw"), temp_file) - expect_message(expect_message(expect_message({ + expect_message({ d <- data_read( temp_file, verbose = TRUE ) - })), regex = "0 out of 5") + }) expect_s3_class(d, "data.frame") expect_identical(dim(d), c(2L, 5L)) @@ -154,6 +154,42 @@ test_that("data_read - RDS file, matrix, coercible", { }) + +# RDS file, preserve class /types ----------------------------------- + +test_that("data_read - RDS file, preserve class", { + withr::with_tempfile("temp_file", fileext = ".rds", code = { + request <- httr::GET("https://raw.github.com/easystats/circus/main/data/hiv.rds") + httr::stop_for_status(request) + writeBin(httr::content(request, type = "raw"), temp_file) + + d <- data_read(temp_file) + expect_s3_class(d, "data.frame") + expect_identical( + sapply(d, class), + c( + village = "integer", outcome = "integer", distance = "numeric", + amount = "numeric", incentive = "integer", age = "integer", + hiv2004 = "integer", agecat = "factor" + ) + ) + }) +}) + + + +# RData ----------------------------------- + +test_that("data_read - no warning for RData", { + withr::with_tempfile("temp_file", fileext = ".RData", code = { + data(mtcars) + save(mtcars, file = temp_file) + expect_silent(data_read(temp_file, verbose = FALSE)) + }) +}) + + + # SPSS file ----------------------------------- test_that("data_read - SPSS file", { diff --git a/tests/testthat/test-data_rename.R b/tests/testthat/test-data_rename.R index a8d003b59..e01c42f8b 100644 --- a/tests/testthat/test-data_rename.R +++ b/tests/testthat/test-data_rename.R @@ -14,6 +14,10 @@ test_that("data_rename works with one or several replacements", { ), c("length", "width", "Petal.Length", "Petal.Width", "Species") ) + expect_named( + data_rename(test, c(length 
= "Sepal.Length", width = "Sepal.Width")), + c("length", "width", "Petal.Length", "Petal.Width", "Species") + ) }) test_that("data_rename returns a data frame", { @@ -24,11 +28,26 @@ test_that("data_rename returns a data frame", { test_that("data_rename: pattern must be of type character", { expect_error( data_rename(test, pattern = 1), - regexp = "Argument `pattern` must be of type character." + regexp = "Argument `pattern` must be of type character" ) expect_error( data_rename(test, pattern = TRUE), - regexp = "Argument `pattern` must be of type character." + regexp = "Argument `pattern` must be of type character" + ) +}) + +test_that("data_rename: replacement not allowed to have NA or empty strings", { + expect_error( + data_rename(test, pattern = c(test = "Species", "Sepal.Length")), + regexp = "Either name all elements of `pattern`" + ) + expect_error( + data_rename( + test, + pattern = c("Species", "Sepal.Length"), + replacement = c("foo", NA_character_) + ), + regexp = "`replacement` is not allowed" ) }) @@ -42,7 +61,9 @@ test_that("data_rename uses indices when no replacement", { test_that("data_rename works when too many names in 'replacement'", { expect_message( - x <- data_rename(test, replacement = paste0("foo", 1:6)), + { + x <- data_rename(test, replacement = paste0("foo", 1:6)) + }, "There are more names in" ) expect_identical(dim(test), dim(x)) @@ -51,7 +72,9 @@ test_that("data_rename works when too many names in 'replacement'", { test_that("data_rename works when not enough names in 'replacement'", { expect_message( - x <- data_rename(test, replacement = paste0("foo", 1:2)), + { + x <- data_rename(test, replacement = paste0("foo", 1:2)) + }, "There are more names in" ) expect_identical(dim(test), dim(x)) diff --git a/tests/testthat/test-data_summary.R b/tests/testthat/test-data_summary.R index 746d4c51a..c60b142d2 100644 --- a/tests/testthat/test-data_summary.R +++ b/tests/testthat/test-data_summary.R @@ -175,7 +175,7 @@ test_that("data_summary, 
with NA", { data(efc, package = "datawizard") out <- data_summary(efc, MW = mean(c12hour, na.rm = TRUE), by = "c172code") expect_snapshot(print(out)) - out <- data_summary(efc, MW = mean(c12hour, na.rm = TRUE), by = "c172code", include_na = FALSE) + out <- data_summary(efc, MW = mean(c12hour, na.rm = TRUE), by = "c172code", remove_na = TRUE) expect_snapshot(print(out)) # sorting for multiple groups out <- data_summary(efc, MW = mean(c12hour, na.rm = TRUE), by = c("e42dep", "c172code")) diff --git a/tests/testthat/test-data_tabulate.R b/tests/testthat/test-data_tabulate.R index 39f5d44c6..9848d42b9 100644 --- a/tests/testthat/test-data_tabulate.R +++ b/tests/testthat/test-data_tabulate.R @@ -81,7 +81,7 @@ test_that("data_tabulate data.frame", { "Variable", "Value", "N", "Raw %", "Valid %", "Cumulative %" ), - class = c("dw_data_tabulate", "data.frame"), + class = c("datawizard_table", "data.frame"), row.names = 1:3, type = "numeric", varname = "e16sex", @@ -99,7 +99,7 @@ test_that("data_tabulate data.frame", { "Variable", "Value", "N", "Raw %", "Valid %", "Cumulative %" ), - class = c("dw_data_tabulate", "data.frame"), + class = c("datawizard_table", "data.frame"), row.names = 1:4, type = "numeric", varname = "c172code", @@ -139,7 +139,7 @@ test_that("data_tabulate print", { attributes(out), list( names = c("Variable", "Value", "N", "Raw %", "Valid %", "Cumulative %"), - class = c("dw_data_tabulate", "data.frame"), + class = c("datawizard_table", "data.frame"), row.names = 1:4, type = "integer", varname = "Large Number", @@ -197,7 +197,7 @@ test_that("data_tabulate grouped data.frame", { "Valid %", "Cumulative %" ), - class = c("dw_data_tabulate", "data.frame"), + class = c("datawizard_table", "data.frame"), row.names = 1:4, type = "numeric", varname = "c172code", @@ -268,6 +268,7 @@ test_that("data_tabulate drop levels", { # select helpers ------------------------------ + test_that("data_tabulate regex", { data(mtcars) expect_identical( @@ -286,16 +287,17 @@ 
test_that("data_tabulate exclude/include missing values", { efc$e16sex[sample.int(nrow(efc), 5)] <- NA out <- data_tabulate(efc$c172code) expect_identical(out$N, c(8L, 66L, 16L, 10L)) - out <- data_tabulate(efc$c172code, include_na = FALSE) + out <- data_tabulate(efc$c172code, remove_na = TRUE) expect_identical(out$N, c(8L, 66L, 16L)) out <- data_tabulate(efc$c172code, weights = efc$weights) expect_identical(out$N, c(10, 67, 15, 13)) - out <- data_tabulate(efc$c172code, include_na = FALSE, weights = efc$weights) + out <- data_tabulate(efc$c172code, remove_na = TRUE, weights = efc$weights) expect_identical(out$N, c(10, 67, 15)) }) # cross tables ------------------------------ + test_that("data_tabulate, cross tables", { data(efc, package = "datawizard") set.seed(123) @@ -303,17 +305,17 @@ test_that("data_tabulate, cross tables", { efc$e16sex[sample.int(nrow(efc), 5)] <- NA expect_snapshot(print(data_tabulate(efc$c172code, by = efc$e16sex, proportions = "full"))) - expect_snapshot(print(data_tabulate(efc$c172code, by = efc$e16sex, proportions = "full", include_na = FALSE))) + expect_snapshot(print(data_tabulate(efc$c172code, by = efc$e16sex, proportions = "full", remove_na = TRUE))) expect_snapshot(print(data_tabulate(efc$c172code, by = efc$e16sex, proportions = "full", weights = efc$weights))) - expect_snapshot(print(data_tabulate(efc$c172code, by = efc$e16sex, proportions = "full", include_na = FALSE, weights = efc$weights))) # nolint + expect_snapshot(print(data_tabulate(efc$c172code, by = efc$e16sex, proportions = "full", remove_na = TRUE, weights = efc$weights))) # nolint expect_snapshot(print(data_tabulate(efc, "c172code", by = efc$e16sex, proportions = "row"))) - expect_snapshot(print(data_tabulate(efc, "c172code", by = efc$e16sex, proportions = "row", include_na = FALSE))) + expect_snapshot(print(data_tabulate(efc, "c172code", by = efc$e16sex, proportions = "row", remove_na = TRUE))) expect_snapshot(print(data_tabulate(efc, "c172code", by = efc$e16sex, 
proportions = "row", weights = efc$weights))) - expect_snapshot(print(data_tabulate(efc, "c172code", by = efc$e16sex, proportions = "row", include_na = FALSE, weights = efc$weights))) # nolint + expect_snapshot(print(data_tabulate(efc, "c172code", by = efc$e16sex, proportions = "row", remove_na = TRUE, weights = efc$weights))) # nolint expect_snapshot(print(data_tabulate(efc, "c172code", by = "e16sex", proportions = "column"))) - expect_snapshot(print(data_tabulate(efc, "c172code", by = "e16sex", proportions = "column", include_na = FALSE))) + expect_snapshot(print(data_tabulate(efc, "c172code", by = "e16sex", proportions = "column", remove_na = TRUE))) expect_snapshot(print(data_tabulate(efc, "c172code", by = "e16sex", proportions = "column", weights = "weights"))) - expect_snapshot(print(data_tabulate(efc, "c172code", by = "e16sex", proportions = "column", include_na = FALSE, weights = "weights"))) # nolint + expect_snapshot(print(data_tabulate(efc, "c172code", by = "e16sex", proportions = "column", remove_na = TRUE, weights = "weights"))) # nolint }) test_that("data_tabulate, cross tables, HTML", { @@ -324,11 +326,11 @@ test_that("data_tabulate, cross tables, HTML", { efc$e16sex[sample.int(nrow(efc), 5)] <- NA expect_s3_class(print_html(data_tabulate(efc$c172code, by = efc$e16sex, proportions = "full")), "gt_tbl") - expect_s3_class(print_html(data_tabulate(efc$c172code, by = efc$e16sex, proportions = "full", include_na = FALSE)), "gt_tbl") # nolint + expect_s3_class(print_html(data_tabulate(efc$c172code, by = efc$e16sex, proportions = "full", remove_na = TRUE)), "gt_tbl") # nolint expect_s3_class(print_html(data_tabulate(efc$c172code, by = efc$e16sex, proportions = "full", weights = efc$weights)), "gt_tbl") # nolint - expect_s3_class(print_html(data_tabulate(efc$c172code, by = efc$e16sex, proportions = "full", include_na = FALSE, weights = efc$weights)), "gt_tbl") # nolint + expect_s3_class(print_html(data_tabulate(efc$c172code, by = efc$e16sex, proportions = 
"full", remove_na = TRUE, weights = efc$weights)), "gt_tbl") # nolint expect_s3_class(print_html(data_tabulate(efc, "c172code", by = efc$e16sex, proportions = "row")), "gt_tbl") - expect_s3_class(print_html(data_tabulate(efc, "c172code", by = efc$e16sex, proportions = "row", include_na = FALSE, weights = efc$weights)), "gt_tbl") # nolint + expect_s3_class(print_html(data_tabulate(efc, "c172code", by = efc$e16sex, proportions = "row", remove_na = TRUE, weights = efc$weights)), "gt_tbl") # nolint }) test_that("data_tabulate, cross tables, grouped df", { @@ -375,23 +377,24 @@ test_that("data_tabulate, cross tables, markdown", { efc$e16sex[sample.int(nrow(efc), 5)] <- NA expect_snapshot(print_md(data_tabulate(efc$c172code, by = efc$e16sex, proportions = "full"))) - expect_snapshot(print_md(data_tabulate(efc$c172code, by = efc$e16sex, proportions = "full", include_na = FALSE))) + expect_snapshot(print_md(data_tabulate(efc$c172code, by = efc$e16sex, proportions = "full", remove_na = TRUE))) expect_snapshot(print_md(data_tabulate(efc$c172code, by = efc$e16sex, proportions = "full", weights = efc$weights))) - expect_snapshot(print_md(data_tabulate(efc$c172code, by = efc$e16sex, proportions = "full", include_na = FALSE, weights = efc$weights))) # nolint + expect_snapshot(print_md(data_tabulate(efc$c172code, by = efc$e16sex, proportions = "full", remove_na = TRUE, weights = efc$weights))) # nolint }) + # validate against table ------------------------- test_that("data_tabulate, validate against table", { data(mtcars) # frequency table out1 <- as.data.frame(table(mtcars$cyl)) - out2 <- data_tabulate(mtcars$cyl, include_na = FALSE) + out2 <- data_tabulate(mtcars$cyl, remove_na = TRUE) expect_identical(out1$Freq, out2$N) # crosstable out1 <- data_arrange(as.data.frame(table(mtcars$cyl, mtcars$gear)), c("Var1", "Var2")) out2 <- data_rename(data_to_long( - as.data.frame(data_tabulate(mtcars$cyl, by = mtcars$gear, include_na = FALSE)), 2:4, + 
as.data.frame(data_tabulate(mtcars$cyl, by = mtcars$gear, remove_na = TRUE)), 2:4, names_to = "Var2", values_to = "Freq" ), "mtcars$cyl", "Var1") out1[[2]] <- as.character(out1[[2]]) @@ -405,3 +408,68 @@ test_that("data_tabulate, correct 0% for proportions", { expect_identical(format(out[[1]])[[4]], c("0 (0%)", "0 (0%)", "0 (0%)", "0 (0%)", "", "0")) expect_snapshot(print(out[[1]])) }) + + +# coercing to data frame ------------------------- + +test_that("data_tabulate, as.data.frame, frequency tables", { + data(mtcars) + # frequency table + x <- data_tabulate(mtcars$cyl) + out <- as.data.frame(x) + expect_named(out, c("Variable", "Value", "N", "Raw %", "Valid %", "Cumulative %")) + expect_identical(out$Variable, c("mtcars$cyl", "mtcars$cyl", "mtcars$cyl", "mtcars$cyl")) + expect_false(any(vapply(out[2:ncol(out)], is.character, logical(1)))) + # frequency tables + x <- data_tabulate(mtcars, select = c("cyl", "am")) + out <- as.data.frame(x) + expect_named(out, c("var", "table")) + expect_equal(vapply(out, class, character(1)), c("character", "AsIs"), ignore_attr = TRUE) + expect_length(out$table, 2L) + expect_named(out$table[[1]], c("Variable", "Value", "N", "Raw %", "Valid %", "Cumulative %")) + expect_identical(out$table[[1]]$Variable, c("cyl", "cyl", "cyl", "cyl")) + expect_false(any(vapply(out$table[[1]][2:ncol(out$table[[1]])], is.character, logical(1)))) +}) + + +test_that("data_tabulate, as.data.frame, cross tables", { + data(mtcars) + # cross table + x <- data_tabulate(mtcars, "cyl", by = "am") + out <- as.data.frame(x) + expect_named(out, c("var", "table")) + expect_equal(vapply(out, class, character(1)), c("character", "AsIs"), ignore_attr = TRUE) + expect_length(out$table, 1L) + expect_named(out$table[[1]], c("cyl", "0", "1", "NA")) + expect_identical(nrow(out$table[[1]]), 4L) + # cross tables + x <- data_tabulate(mtcars, c("cyl", "vs"), by = "am") + out <- as.data.frame(x) + expect_named(out, c("var", "table")) + expect_equal(vapply(out, class, 
character(1)), c("character", "AsIs"), ignore_attr = TRUE) + expect_length(out$table, 2L) + expect_named(out$table[[1]], c("cyl", "0", "1", "NA")) + expect_identical(nrow(out$table[[1]]), 4L) +}) + + +test_that("data_tabulate, as.data.frame, cross tables with total N", { + # cross table, with total + x <- data_tabulate(mtcars, "cyl", by = "am") + out <- as.data.frame(x, add_total = TRUE) + expect_named(out, c("var", "table")) + expect_equal(vapply(out, class, character(1)), c("character", "AsIs"), ignore_attr = TRUE) + expect_length(out$table, 1L) + expect_named(out$table[[1]], c("cyl", "0", "1", "", "Total")) + expect_identical(nrow(out$table[[1]]), 5L) + expect_identical(out$table[[1]]$cyl, c("4", "6", "8", NA, "Total")) + # cross tables, with total + x <- data_tabulate(mtcars, c("cyl", "vs"), by = "am") + out <- as.data.frame(x, add_total = TRUE) + expect_named(out, c("var", "table")) + expect_equal(vapply(out, class, character(1)), c("character", "AsIs"), ignore_attr = TRUE) + expect_length(out$table, 2L) + expect_named(out$table[[1]], c("cyl", "0", "1", "", "Total")) + expect_identical(nrow(out$table[[1]]), 5L) + expect_identical(out$table[[1]]$cyl, c("4", "6", "8", NA, "Total")) +}) diff --git a/tests/testthat/test-data_to_numeric.R b/tests/testthat/test-data_to_numeric.R index 464c35e8d..816591ac0 100644 --- a/tests/testthat/test-data_to_numeric.R +++ b/tests/testthat/test-data_to_numeric.R @@ -1,5 +1,5 @@ test_that("convert data frame to numeric", { - expect_snapshot(to_numeric(head(ToothGrowth))) + expect_snapshot(to_numeric(head(ToothGrowth), dummy_factors = TRUE)) expect_snapshot(to_numeric(head(ToothGrowth), dummy_factors = FALSE)) }) @@ -41,7 +41,7 @@ test_that("convert character to numeric lowest", { test_that("convert factor to numeric", { f <- factor(substring("statistics", 1:10, 1:10)) - expect_snapshot(to_numeric(f)) + expect_snapshot(to_numeric(f, dummy_factors = TRUE)) }) test_that("convert factor to numeric", { @@ -67,12 +67,12 @@ 
test_that("convert factor to numeric, dummy factors", { test_that("convert factor to numeric, append", { data(efc) expect_identical( - colnames(to_numeric(efc)), + colnames(to_numeric(efc, dummy_factors = TRUE)), c("c12hour", "e16sex", "e42dep.1", "e42dep.2", "e42dep.3", "e42dep.4", "c172code", "neg_c_7"), ignore_attr = TRUE ) expect_identical( - colnames(to_numeric(efc, append = TRUE)), + colnames(to_numeric(efc, dummy_factors = TRUE, append = TRUE)), c( "c12hour", "e16sex", "e42dep", "c172code", "neg_c_7", "e42dep_n", "e42dep_n.1", "e42dep_n.2", "e42dep_n.3", "e42dep_n.4" diff --git a/tests/testthat/test-demean.R b/tests/testthat/test-demean.R index 566bd6097..6e169f9c0 100644 --- a/tests/testthat/test-demean.R +++ b/tests/testthat/test-demean.R @@ -57,8 +57,174 @@ test_that("demean shows message if some vars don't exist", { ) set.seed(123) - expect_message( + expect_error( demean(dat, select = "foo", by = "ID"), regexp = "not found" ) }) + + +# see issue #520 +test_that("demean for cross-classified designs (by > 1)", { + skip_if_not_installed("poorman") + + data(efc, package = "datawizard") + dat <- na.omit(efc) + dat$e42dep <- factor(dat$e42dep) + dat$c172code <- factor(dat$c172code) + + x2a <- dat %>% + data_group(e42dep) %>% + data_modify( + c12hour_e42dep = mean(c12hour) + ) %>% + data_ungroup() %>% + data_group(c172code) %>% + data_modify( + c12hour_c172code = mean(c12hour) + ) %>% + data_ungroup() %>% + data_modify( + c12hour_within = c12hour - c12hour_e42dep - c12hour_c172code + ) + + out <- degroup( + dat, + select = "c12hour", + by = c("e42dep", "c172code"), + suffix_demean = "_within" + ) + + expect_equal( + out$c12hour_e42dep_between, + x2a$c12hour_e42dep, + tolerance = 1e-4, + ignore_attr = TRUE + ) + expect_equal( + out$c12hour_within, + x2a$c12hour_within, + tolerance = 1e-4, + ignore_attr = TRUE + ) + + x2a <- dat %>% + data_group(e42dep) %>% + data_modify( + c12hour_e42dep = mean(c12hour, na.rm = TRUE), + neg_c_7_e42dep = mean(neg_c_7, na.rm = 
TRUE) + ) %>% + data_ungroup() %>% + data_group(c172code) %>% + data_modify( + c12hour_c172code = mean(c12hour, na.rm = TRUE), + neg_c_7_c172code = mean(neg_c_7, na.rm = TRUE) + ) %>% + data_ungroup() %>% + data_modify( + c12hour_within = c12hour - c12hour_e42dep - c12hour_c172code, + neg_c_7_within = neg_c_7 - neg_c_7_e42dep - neg_c_7_c172code + ) + + out <- degroup( + dat, + select = c("c12hour", "neg_c_7"), + by = c("e42dep", "c172code"), + suffix_demean = "_within" + ) + + expect_equal( + out$c12hour_e42dep_between, + x2a$c12hour_e42dep, + tolerance = 1e-4, + ignore_attr = TRUE + ) + expect_equal( + out$neg_c_7_c172code_between, + x2a$neg_c_7_c172code, + tolerance = 1e-4, + ignore_attr = TRUE + ) + expect_equal( + out$neg_c_7_within, + x2a$neg_c_7_within, + tolerance = 1e-4, + ignore_attr = TRUE + ) + expect_equal( + out$c12hour_within, + x2a$c12hour_within, + tolerance = 1e-4, + ignore_attr = TRUE + ) +}) + + +test_that("demean, sanity checks", { + data(efc, package = "datawizard") + dat <- na.omit(efc) + dat$e42dep <- factor(dat$e42dep) + dat$c172code <- factor(dat$c172code) + + expect_error( + degroup( + dat, + select = c("c12hour", "neg_c_8"), + by = c("e42dep", "c172code"), + suffix_demean = "_within" + ), + regex = "Variable \"neg_c_8\" was not found" + ) + expect_error( + degroup( + dat, + select = c("c12hour", "neg_c_8"), + by = c("e42dep", "c173code"), + suffix_demean = "_within" + ), + regex = "Variables \"neg_c_8\" and \"c173code\" were not found" + ) +}) + + +test_that("demean for nested designs (by > 1), nested = TRUE", { + data(efc, package = "datawizard") + dat <- na.omit(efc) + dat$e42dep <- factor(dat$e42dep) + dat$c172code <- factor(dat$c172code) + + x_ijk <- dat$c12hour + xbar_k <- ave(x_ijk, dat$e42dep, FUN = mean) + xbar_jk <- ave(x_ijk, dat$e42dep, dat$c172code, FUN = mean) + + L3_between <- xbar_k + L2_between <- xbar_jk - xbar_k + L1_within <- x_ijk - xbar_jk + + out <- degroup( + dat, + select = "c12hour", + by = c("e42dep", 
"c172code"), + nested = TRUE, + suffix_demean = "_within" + ) + + expect_equal( + out$c12hour_within, + L1_within, + tolerance = 1e-4, + ignore_attr = TRUE + ) + expect_equal( + out$c12hour_e42dep_between, + L3_between, + tolerance = 1e-4, + ignore_attr = TRUE + ) + expect_equal( + out$c12hour_c172code_between, + L2_between, + tolerance = 1e-4, + ignore_attr = TRUE + ) +}) diff --git a/tests/testthat/test-describe_distribution.R b/tests/testthat/test-describe_distribution.R index 83d2abb33..dfa7bf617 100644 --- a/tests/testthat/test-describe_distribution.R +++ b/tests/testthat/test-describe_distribution.R @@ -286,3 +286,16 @@ test_that("describe_distribution formatting", { x <- describe_distribution(iris$Sepal.Width, quartiles = TRUE) expect_snapshot(format(x)) }) + +# other ----------------------------------- + +test_that("return NA in CI if sample is too sparse", { + skip_if_not_installed("bayestestR") + set.seed(123456) + expect_warning( + res <- describe_distribution(mtcars[mtcars$cyl == "6", ], wt, centrality = "map", ci = 0.95), # nolint + "When bootstrapping CIs, sample was too sparse to find TD" + ) + expect_identical(res$CI_low, NA) + expect_identical(res$CI_high, NA) +}) diff --git a/tests/testthat/test-mean_sd.R b/tests/testthat/test-mean_sd.R index e0af8a0f1..3e0829fb1 100644 --- a/tests/testthat/test-mean_sd.R +++ b/tests/testthat/test-mean_sd.R @@ -15,8 +15,3 @@ test_that("mean_sd", { expect_equal(unname(diff(msd2)), rep(sd(mtcars[["mpg"]]), 6), tolerance = 0.00001) expect_named(msd2, c("-3 SD", "-2 SD", "-1 SD", "Mean", "+1 SD", "+2 SD", "+3 SD")) }) - -test_that("deprecation warning for `na.rm`", { - expect_warning(mean_sd(c(-1, 0, 1, NA), na.rm = TRUE)) - expect_warning(median_mad(c(-1, 0, 1, 2, 3, NA), na.rm = TRUE)) -}) diff --git a/tests/testthat/test-row_count.R b/tests/testthat/test-row_count.R new file mode 100644 index 000000000..0c7d67691 --- /dev/null +++ b/tests/testthat/test-row_count.R @@ -0,0 +1,57 @@ +test_that("row_count", { + d_mn 
<- data.frame( + c1 = c(1, 2, NA, 4), + c2 = c(NA, 2, NA, 5), + c3 = c(NA, 4, NA, NA), + c4 = c(2, 3, 7, 8) + ) + expect_identical(row_count(d_mn, count = 2), c(1, 2, 0, 0)) + expect_identical(row_count(d_mn, count = NA), c(2, 0, 3, 1)) + d_mn <- data.frame( + c1 = c("a", "b", NA, "c"), + c2 = c(NA, "b", NA, "d"), + c3 = c(NA, 4, NA, NA), + c4 = c(2, 3, 7, Inf), + stringsAsFactors = FALSE + ) + expect_identical(row_count(d_mn, count = "b"), c(0, 2, 0, 0)) + expect_identical(row_count(d_mn, count = Inf), c(0, 0, 0, 1)) +}) + +test_that("row_count, errors or messages", { + data(iris) + expect_error(expect_warning(row_count(iris, select = "abc")), regex = "must be a valid") + expect_error(expect_warning(row_count(iris, select = "abc", count = 3)), regex = "No columns") + expect_error(row_count(iris[1], count = 3), regex = "with at least") + expect_error(row_count(iris[-seq_len(nrow(iris)), , drop = FALSE], count = 2), regex = "one row") +}) + +test_that("row_count, allow_coercion match", { + d_mn <- data.frame( + c1 = c("1", "2", NA, "3"), + c2 = c(NA, "2", NA, "3"), + c3 = c(NA, 4, NA, NA), + c4 = c(2, 3, 7, Inf), + stringsAsFactors = FALSE + ) + expect_identical(row_count(d_mn, count = 2, allow_coercion = TRUE), c(1, 2, 0, 0)) + expect_identical(row_count(d_mn, count = 2, allow_coercion = FALSE), c(1, 0, 0, 0)) + expect_identical(row_count(d_mn, count = "2", allow_coercion = FALSE), c(0, 2, 0, 0)) + expect_identical(row_count(d_mn, count = factor("2"), allow_coercion = TRUE), c(1, 2, 0, 0)) + expect_error(row_count(d_mn, count = factor("2"), allow_coercion = FALSE), regex = "No column has") + + # mix character / factor + d_mn <- data.frame( + c1 = factor(c("1", "2", NA, "3")), + c2 = c("2", "1", NA, "3"), + c3 = c(NA, 4, NA, NA), + c4 = c(2, 3, 7, Inf), + stringsAsFactors = FALSE + ) + expect_identical(row_count(d_mn, count = 2, allow_coercion = TRUE), c(2, 1, 0, 0)) + expect_identical(row_count(d_mn, count = 2, allow_coercion = FALSE), c(1, 0, 0, 0)) + 
expect_identical(row_count(d_mn, count = "2", allow_coercion = FALSE), c(1, 0, 0, 0)) + expect_identical(row_count(d_mn, count = "2", allow_coercion = TRUE), c(2, 1, 0, 0)) + expect_identical(row_count(d_mn, count = factor("2"), allow_coercion = FALSE), c(0, 1, 0, 0)) + expect_identical(row_count(d_mn, count = factor("2"), allow_coercion = TRUE), c(2, 1, 0, 0)) +}) diff --git a/tests/testthat/test-row_means.R b/tests/testthat/test-row_means.R index 8d0504c69..4db0d7039 100644 --- a/tests/testthat/test-row_means.R +++ b/tests/testthat/test-row_means.R @@ -1,4 +1,4 @@ -test_that("row_means", { +test_that("row_means/sums", { d_mn <- data.frame( c1 = c(1, 2, NA, 4), c2 = c(NA, 2, NA, 5), @@ -14,14 +14,21 @@ test_that("row_means", { expect_equal(row_means(d_mn, min_valid = 2, digits = 1), c(1.5, 2.8, NA, 5.7), tolerance = 1e-1) expect_message(row_means(iris), regex = "Only numeric") expect_equal(row_means(iris, verbose = FALSE), rowMeans(iris[, 1:4]), tolerance = 1e-3, ignore_attr = TRUE) + expect_equal(row_sums(d_mn, min_valid = 4), c(NA, 11, NA, NA), tolerance = 1e-3) + expect_equal(row_sums(d_mn, min_valid = 3), c(NA, 11, NA, 17), tolerance = 1e-3) + expect_message(row_sums(iris), regex = "Only numeric") }) -test_that("row_means, errors or messages", { +test_that("row_means/sums, errors or messages", { data(iris) expect_error(expect_warning(row_means(iris, select = "abc")), regex = "No columns") + expect_error(expect_warning(row_sums(iris, select = "abc")), regex = "No columns") expect_error(row_means(iris[1], min_valid = 1), regex = "two numeric") expect_error(row_means(iris, min_valid = 1:4), regex = "numeric value") expect_error(row_means(iris, min_valid = "a"), regex = "numeric value") expect_message(row_means(iris[1:3, ], min_valid = 3), regex = "Only numeric") expect_silent(row_means(iris[1:3, ], min_valid = 3, verbose = FALSE)) + expect_error(row_sums(iris[1], min_valid = 1), regex = "two numeric") + expect_message(row_sums(iris[1:3, ], min_valid = 3), regex = 
"Only numeric") + expect_silent(row_sums(iris[1:3, ], min_valid = 3, verbose = FALSE)) }) diff --git a/tests/testthat/test-select_nse.R b/tests/testthat/test-select_nse.R index c0195ad94..fb0f6aefb 100644 --- a/tests/testthat/test-select_nse.R +++ b/tests/testthat/test-select_nse.R @@ -138,3 +138,24 @@ test_that(".select_nse: works with function and namespace", { out <- fun(insight::find_predictors(model, effects = "fixed", flatten = TRUE)) expect_identical(out, iris["Petal.Width"]) }) + +test_that(".select_nse: allow character vector with :", { + data(mtcars) + out <- data_select(mtcars, c("cyl:hp", "wt", "vs:gear")) + expect_named(out, c("cyl", "disp", "hp", "wt", "vs", "am", "gear")) + out <- data_select(mtcars, c("cyl:hp", "wta", "vs:gear")) + expect_named(out, c("cyl", "disp", "hp", "vs", "am", "gear")) + out <- data_select(mtcars, c("hp:cyl", "wta", "vs:gear")) + expect_named(out, c("hp", "disp", "cyl", "vs", "am", "gear")) + out <- data_select(mtcars, c("cyl:hq", "wt", "vs:gear")) + expect_named(out, c("wt", "vs", "am", "gear")) + + expect_warning( + center(mtcars, c("cyl:hp", "wta", "vs:gear"), verbose = TRUE), + regex = "Did you mean \"wt\"" + ) + expect_warning( + center(mtcars, c("cyl:hq", "wt", "vs:gear"), verbose = TRUE), + regex = "Did you mean one of \"hp\"" + ) +}) diff --git a/tests/testthat/test-standardize_models.R b/tests/testthat/test-standardize_models.R index 706a4e6e7..d61caf450 100644 --- a/tests/testthat/test-standardize_models.R +++ b/tests/testthat/test-standardize_models.R @@ -31,6 +31,29 @@ test_that("standardize | errors", { }) +test_that("standardize | problematic formulas", { + data(mtcars) + m <- lm(mpg ~ hp, data = mtcars) + expect_equal( + coef(standardise(m)), + c(`(Intercept)` = -3.14935717633686e-17, hp = -0.776168371826586), + tolerance = 1e-4 + ) + + colnames(mtcars)[1] <- "1_mpg" + m <- lm(`1_mpg` ~ hp, data = mtcars) + expect_error(standardise(m), regex = "Looks like") + + # works interactive only + # data(mtcars) + # m <- 
lm(mtcars$mpg ~ mtcars$hp) + # expect_error(standardise(m), regex = "model formulas") + + m <- lm(mtcars[, 1] ~ hp, data = mtcars) + expect_error(standardise(m), regex = "indexed data") +}) + + # Transformations --------------------------------------------------------- test_that("transformations", { skip_if_not_installed("effectsize") @@ -206,15 +229,14 @@ test_that("standardize non-Gaussian response", { # variables evaluated in the environment $$$ ------------------------------ test_that("variables evaluated in the environment", { m <- lm(mtcars$mpg ~ mtcars$cyl + am, data = mtcars) - w <- capture_warnings(standardize(m)) - expect_true(any(grepl("mtcars$mpg", w, fixed = TRUE))) + w <- capture_error(standardize(m)) + expect_true(any(grepl("Using `$`", w, fixed = TRUE))) ## Note: # No idea why this is suddenly not giving a warning on older R versions. m <- lm(mtcars$mpg ~ mtcars$cyl + mtcars$am, data = mtcars) - warns <- capture_warnings(standardize(m)) - expect_true(any(grepl("mtcars$mpg", warns, fixed = TRUE))) - expect_true(any(grepl("No variables", warns, fixed = TRUE))) + w <- capture_error(standardize(m)) + expect_true(any(grepl("Using `$`", w, fixed = TRUE))) }) diff --git a/vignettes/overview_of_vignettes.Rmd b/vignettes/overview_of_vignettes.Rmd new file mode 100644 index 000000000..033234607 --- /dev/null +++ b/vignettes/overview_of_vignettes.Rmd @@ -0,0 +1,37 @@ +--- +title: "Overview of Vignettes" +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{Overview of Vignettes} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +```{r message=FALSE, warning=FALSE, include=FALSE} +library(knitr) +knitr::opts_chunk$set( + echo = TRUE, + collapse = TRUE, + warning = FALSE, + message = FALSE, + comment = "#>", + eval = TRUE +) +``` + +All package vignettes are available at [https://easystats.github.io/datawizard/](https://easystats.github.io/datawizard/). 
+ +## Function Overview + +* [Function Reference](https://easystats.github.io/datawizard/reference/index.html) + + +## Data Preparation + +* [Coming from 'tidyverse'](https://easystats.github.io/datawizard/articles/tidyverse_translation.html) +* [A quick summary of selection syntax in `{datawizard}`](https://easystats.github.io/datawizard/articles/selection_syntax.html) + + +## Statistical Transformations + +* [Data Standardization](https://easystats.github.io/datawizard/articles/standardize_data.html) diff --git a/vignettes/selection_syntax.Rmd b/vignettes/selection_syntax.Rmd index 9b501ebd5..3c0953f65 100644 --- a/vignettes/selection_syntax.Rmd +++ b/vignettes/selection_syntax.Rmd @@ -15,8 +15,7 @@ knitr::opts_chunk$set( pkgs <- c( "datawizard", - "dplyr", - "htmltools" + "dplyr" ) if (!all(vapply(pkgs, requireNamespace, quietly = TRUE, FUN.VALUE = logical(1L)))) { @@ -27,18 +26,10 @@ if (!all(vapply(pkgs, requireNamespace, quietly = TRUE, FUN.VALUE = logical(1L)) ```{r load, echo=FALSE, message=FALSE} library(datawizard) library(dplyr) -library(htmltools) set.seed(123) iris <- iris[sample(nrow(iris), 10), ] row.names(iris) <- NULL - -row <- function(...) { - div( - class = "custom_note", - ... - ) -} ``` ```{css, echo=FALSE} @@ -127,18 +118,26 @@ data_select(iris, contains("pal", "ec")) data_select(iris, regex("^Sep|ies")) ``` -```{r echo=FALSE} -row("Note: these functions are not exported by `datawizard` but are detected and -applied internally. This means that they won't be detected by autocompletion -when we write them.") -``` -```{r echo=FALSE} -row("Note #2: because these functions are not exported, they will not create -conflicts with the ones that come from the `tidyverse` and that have the same name. -So we can still use `dplyr` and its friends, it won't change anything for selection -in `datawizard` functions!") -``` + + + +
+

+ Note: these functions are not exported by `datawizard` but are detected and + applied internally. This means that they won't be detected by autocompletion + when we write them. +

+
+ +
+

+ Note #2: because these functions are not exported, they will not create + conflicts with the ones that come from the `tidyverse` and that have the same + name. Therefore, we can still use `dplyr` and its friends; it won't change + anything for selection in `datawizard` functions! +

+
# Excluding variables diff --git a/vignettes/tidyverse_translation.Rmd b/vignettes/tidyverse_translation.Rmd index b03402468..ae4b339b3 100644 --- a/vignettes/tidyverse_translation.Rmd +++ b/vignettes/tidyverse_translation.Rmd @@ -1,6 +1,6 @@ --- title: "Coming from 'tidyverse'" -output: +output: rmarkdown::html_vignette: toc: true vignette: > @@ -9,7 +9,7 @@ vignette: > %\VignetteEngine{knitr::rmarkdown} --- -```{r message=FALSE, warning=FALSE, include=FALSE, eval = TRUE} +```{r setup, message=FALSE, warning=FALSE, include=FALSE, eval = TRUE} library(knitr) options(knitr.kable.NA = "") knitr::opts_chunk$set( @@ -21,57 +21,71 @@ knitr::opts_chunk$set( pkgs <- c( "dplyr", - "datawizard", "tidyr" ) +all_deps_available <- all(vapply(pkgs, requireNamespace, quietly = TRUE, FUN.VALUE = logical(1L))) -# since we explicitely put eval = TRUE for some chunks, we can't rely on -# knitr::opts_chunk$set(eval = FALSE) at the beginning of the script. So we make -# a logical that is FALSE only if deps are not installed (cf easystats/easystats#317) -evaluate_chunk <- TRUE - -if (!all(vapply(pkgs, requireNamespace, quietly = TRUE, FUN.VALUE = logical(1L)))) { - evaluate_chunk <- FALSE +if (all_deps_available) { + library(datawizard) + library(dplyr) + library(tidyr) } + +# Since we explicitly put `eval = TRUE` for some chunks, we can't rely on +# `knitr::opts_chunk$set(eval = FALSE)` at the beginning of the script. +# Therefore, we introduce a logical that is `FALSE` only if all suggested +# dependencies are not installed (cf easystats/easystats#317) +evaluate_chunk <- all_deps_available && getRversion() >= "4.1.0" ``` This vignette can be referred to by citing the following: Patil et al., (2022). datawizard: An R Package for Easy Data Preparation and Statistical Transformations. 
*Journal of Open Source Software*, *7*(78), 4684, https://doi.org/10.21105/joss.04684 -```{css, echo=FALSE, eval = evaluate_chunk} +```{css, echo=FALSE, eval = TRUE} .datawizard, .datawizard > .sourceCode { background-color: #e6e6ff; } .tidyverse, .tidyverse > .sourceCode { background-color: #d9f2e5; } +.custom_note { + border-left: solid 5px hsl(220, 100%, 30%); + background-color: hsl(220, 100%, 95%); + padding: 5px; + margin-bottom: 10px +} ``` # Introduction -`{datawizard}` package aims to make basic data wrangling easier than +`{datawizard}` package aims to make basic data wrangling easier than with base R. The data wrangling workflow it supports is similar to the one supported by the tidyverse package combination of `{dplyr}` and `{tidyr}`. However, one of its main features is that it has a very few dependencies: `{stats}` and `{utils}` -(included in base R) and `{insight}`, which is the core package of the _easystats_ -ecosystem. This package grew organically to simultaneously satisfy the +(included in base R) and `{insight}`, which is the core package of the _easystats_ +ecosystem. This package grew organically to simultaneously satisfy the "0 non-base hard dependency" principle of _easystats_ and the data wrangling needs -of the constituent packages in this ecosystem. - -One drawback of this genesis is that not all features of the `{tidyverse}` -packages are supported since only features that were necessary for _easystats_ -ecosystem have been implemented. Some of these missing features (such as `summarize` -or the pipe operator `%>%`) are made available in other dependency-free packages, -such as [`{poorman}`](https://github.com/nathaneastwood/poorman/). It is also -important to note that `{datawizard}` was designed to avoid namespace collisions +of the constituent packages in this ecosystem. It is also +important to note that `{datawizard}` was designed to avoid namespace collisions with `{tidyverse}` packages. 
-In this article, we will see how to go through basic data wrangling steps with -`{datawizard}`. We will also compare it to the `{tidyverse}` syntax for achieving the same. +In this article, we will see how to go through basic data wrangling steps with +`{datawizard}`. We will also compare it to the `{tidyverse}` syntax for achieving the same. This way, if you decide to make the switch, you can easily find the translations here. This vignette is largely inspired from `{dplyr}`'s [Getting started vignette](https://dplyr.tidyverse.org/articles/dplyr.html). + + + +
+

+ Note: In this vignette, we use the native pipe-operator, `|>`, which was + introduced in R 4.1. Users of R version 3.6 or 4.0 should replace the native + pipe with magrittr's one (`%>%`) so that examples work. +

+
+ ```{r, eval = evaluate_chunk} library(dplyr) library(tidyr) @@ -83,23 +97,23 @@ efc <- head(efc) # Workhorses -Before we look at their *tidyverse* equivalents, we can first have a look at +Before we look at their *tidyverse* equivalents, we can first have a look at `{datawizard}`'s key functions for data wrangling: -| Function | Operation | -| :---------------- | :------------------------------------------------ | -| `data_filter()` | [to select only certain observations](#filtering) | -| `data_select()` | [to select only a few variables](#selecting) | -| `data_modify()` | [to create variables or modify existing ones](#modifying) | -| `data_arrange()` | [to sort observations](#sorting) | -| `data_extract()` | [to extract a single variable](#extracting) | -| `data_rename()` | [to rename variables](#renaming) | -| `data_relocate()` | [to reorder a data frame](#relocating) | -| `data_to_long()` | [to convert data from wide to long](#reshaping) | -| `data_to_wide()` | [to convert data from long to wide](#reshaping) | -| `data_join()` | [to join two data frames](#joining) | -| `data_unite()` | [to concatenate several columns into a single one](#uniting) | -| `data_separate()` | [to separate a single column into multiple columns](#separating) | +| Function | Operation | +| :---------------- | :--------------------------------------------------------------- | +| `data_filter()` | [to select only certain observations](#filtering) | +| `data_select()` | [to select only a few variables](#selecting) | +| `data_modify()` | [to create variables or modify existing ones](#modifying) | +| `data_arrange()` | [to sort observations](#sorting) | +| `data_extract()` | [to extract a single variable](#extracting) | +| `data_rename()` | [to rename variables](#renaming) | +| `data_relocate()` | [to reorder a data frame](#relocating) | +| `data_to_long()` | [to convert data from wide to long](#reshaping) | +| `data_to_wide()` | [to convert data from long to wide](#reshaping) | +| 
`data_join()` | [to join two data frames](#joining) | +| `data_unite()` | [to concatenate several columns into a single one](#uniting) | +| `data_separate()` | [to separate a single column into multiple columns](#separating) | Note that there are a few functions in `{datawizard}` that have no strict equivalent in `{dplyr}` or `{tidyr}` (e.g `data_rotate()`), and so we won't discuss them in @@ -113,7 +127,7 @@ Before we look at them individually, let's first have a look at the summary tabl | :---------------- | :------------------------------------------------------------------ | | `data_filter()` | `dplyr::filter()`, `dplyr::slice()` | | `data_select()` | `dplyr::select()` | -| `data_modify()` | `dplyr::mutate()` | +| `data_modify()` | `dplyr::mutate()` | | `data_arrange()` | `dplyr::arrange()` | | `data_extract()` | `dplyr::pull()` | | `data_rename()` | `dplyr::rename()` | @@ -123,8 +137,8 @@ Before we look at them individually, let's first have a look at the summary tabl | `data_join()` | `dplyr::inner_join()`, `dplyr::left_join()`, `dplyr::right_join()`, | | | `dplyr::full_join()`, `dplyr::anti_join()`, `dplyr::semi_join()` | | `data_peek()` | `dplyr::glimpse()` | -| `data_unite()` | `tidyr::unite()` | -| `data_separate()` | `tidyr::separate()` | +| `data_unite()` | `tidyr::unite()` | +| `data_separate()` | `tidyr::separate()` | ## Filtering {#filtering} @@ -136,14 +150,14 @@ Before we look at them individually, let's first have a look at the summary tabl ```{r filter, class.source = "datawizard"} # ---------- datawizard ----------- -starwars %>% +starwars |> data_filter( skin_color == "light", eye_color == "brown" ) # or -starwars %>% +starwars |> data_filter( skin_color == "light" & eye_color == "brown" @@ -155,7 +169,7 @@ starwars %>% ```{r, class.source = "tidyverse"} # ---------- tidyverse ----------- -starwars %>% +starwars |> filter( skin_color == "light", eye_color == "brown" @@ -176,9 +190,9 @@ starwars <- head(starwars) ## Selecting {#selecting} 
-`data_select()` is the equivalent of `dplyr::select()`. +`data_select()` is the equivalent of `dplyr::select()`. The main difference between these two functions is that `data_select()` uses two -arguments (`select` and `exclude`) and requires quoted column names if we want to +arguments (`select` and `exclude`) and requires quoted column names if we want to select several variables, while `dplyr::select()` accepts any unquoted column names. :::: {style="display: grid; grid-template-columns: 50% 50%; grid-column-gap: 10px;"} @@ -187,7 +201,7 @@ select several variables, while `dplyr::select()` accepts any unquoted column na ```{r select1, class.source = "datawizard"} # ---------- datawizard ----------- -starwars %>% +starwars |> data_select(select = c("hair_color", "skin_color", "eye_color")) ``` ::: @@ -196,7 +210,7 @@ starwars %>% ```{r, class.source = "tidyverse"} # ---------- tidyverse ----------- -starwars %>% +starwars |> select(hair_color, skin_color, eye_color) ``` ::: @@ -212,7 +226,7 @@ starwars %>% ```{r select2, class.source = "datawizard"} # ---------- datawizard ----------- -starwars %>% +starwars |> data_select(select = -ends_with("color")) ``` ::: @@ -221,7 +235,7 @@ starwars %>% ```{r, class.source = "tidyverse"} # ---------- tidyverse ----------- -starwars %>% +starwars |> select(-ends_with("color")) ``` ::: @@ -240,7 +254,7 @@ here and quoting them won't work. Should we comment on that? 
--> ```{r select3, class.source = "datawizard"} # ---------- datawizard ----------- -starwars %>% +starwars |> data_select(select = -(hair_color:eye_color)) ``` ::: @@ -249,7 +263,7 @@ starwars %>% ```{r, class.source = "tidyverse"} # ---------- tidyverse ----------- -starwars %>% +starwars |> select(!(hair_color:eye_color)) ``` ::: @@ -266,7 +280,7 @@ starwars %>% ```{r select4, class.source = "datawizard"} # ---------- datawizard ----------- -starwars %>% +starwars |> data_select(exclude = regex("color$")) ``` ::: @@ -275,7 +289,7 @@ starwars %>% ```{r, class.source = "tidyverse"} # ---------- tidyverse ----------- -starwars %>% +starwars |> select(-contains("color$")) ``` ::: @@ -292,7 +306,7 @@ starwars %>% ```{r select5, class.source = "datawizard"} # ---------- datawizard ----------- -starwars %>% +starwars |> data_select(select = is.numeric) ``` ::: @@ -301,7 +315,7 @@ starwars %>% ```{r, class.source = "tidyverse"} # ---------- tidyverse ----------- -starwars %>% +starwars |> select(where(is.numeric)) ``` ::: @@ -316,8 +330,8 @@ You can find a list of all the select helpers with `?data_select`. ## Modifying {#modifying} -`data_modify()` is a wrapper around `base::transform()` but has several additional -benefits: +`data_modify()` is a wrapper around `base::transform()` but has several additional +benefits: * it allows us to use newly created variables in the following expressions; * it works with grouped data; @@ -325,8 +339,8 @@ benefits: * it accepts expressions as character vectors so that it is easy to program with it -This last point is also the main difference between `data_modify()` and -`dplyr::mutate()`. +This last point is also the main difference between `data_modify()` and +`dplyr::mutate()`. 
:::: {style="display: grid; grid-template-columns: 50% 50%; grid-column-gap: 10px;"} @@ -334,7 +348,7 @@ This last point is also the main difference between `data_modify()` and ```{r modify1, class.source = "datawizard"} # ---------- datawizard ----------- -efc %>% +efc |> data_modify( c12hour_c = center(c12hour), c12hour_z = c12hour_c / sd(c12hour, na.rm = TRUE), @@ -347,7 +361,7 @@ efc %>% ```{r, class.source = "tidyverse"} # ---------- tidyverse ----------- -efc %>% +efc |> mutate( c12hour_c = center(c12hour), c12hour_z = c12hour_c / sd(c12hour, na.rm = TRUE), @@ -400,7 +414,7 @@ such as `starts_with()` in `data_arrange()`. :::{} ```{r arrange1, class.source = "datawizard"} # ---------- datawizard ----------- -starwars %>% +starwars |> data_arrange(c("hair_color", "height")) ``` ::: @@ -409,7 +423,7 @@ starwars %>% ```{r, class.source = "tidyverse"} # ---------- tidyverse ----------- -starwars %>% +starwars |> arrange(hair_color, height) ``` ::: @@ -419,14 +433,14 @@ starwars %>% ```{r arrange1, eval = evaluate_chunk, echo = FALSE} ``` -You can also sort variables in descending order by putting a `"-"` in front of +You can also sort variables in descending order by putting a `"-"` in front of their name, like below: :::: {style="display: grid; grid-template-columns: 50% 50%; grid-column-gap: 10px;"} :::{} ```{r arrange2, class.source = "datawizard"} # ---------- datawizard ----------- -starwars %>% +starwars |> data_arrange(c("-hair_color", "-height")) ``` ::: @@ -435,7 +449,7 @@ starwars %>% ```{r, class.source = "tidyverse"} # ---------- tidyverse ----------- -starwars %>% +starwars |> arrange(desc(hair_color), -height) ``` ::: @@ -448,15 +462,15 @@ starwars %>% ## Extracting {#extracting} -Although we mostly work on data frames, it is sometimes useful to extract a single -column as a vector. This can be done with `data_extract()`, which reproduces the +Although we mostly work on data frames, it is sometimes useful to extract a single +column as a vector. 
This can be done with `data_extract()`, which reproduces the behavior of `dplyr::pull()`: :::: {style="display: grid; grid-template-columns: 50% 50%; grid-column-gap: 10px;"} :::{} ```{r extract1, class.source = "datawizard"} # ---------- datawizard ----------- -starwars %>% +starwars |> data_extract(gender) ``` ::: @@ -465,7 +479,7 @@ starwars %>% ```{r, class.source = "tidyverse"} # ---------- tidyverse ----------- -starwars %>% +starwars |> pull(gender) ``` ::: @@ -479,7 +493,7 @@ We can also specify several variables in `select`. In this case, `data_extract() is equivalent to `data_select()`: ```{r eval = evaluate_chunk} -starwars %>% +starwars |> data_extract(select = contains("color")) ``` @@ -488,9 +502,9 @@ starwars %>% ## Renaming {#renaming} -`data_rename()` is the equivalent of `dplyr::rename()` but the syntax between the +`data_rename()` is the equivalent of `dplyr::rename()` but the syntax between the two is different. While `dplyr::rename()` takes new-old pairs of column -names, `data_rename()` requires a vector of column names to rename, and then +names, `data_rename()` requires a vector of column names to rename, and then a vector of new names for these columns that must be of the same length. :::: {style="display: grid; grid-template-columns: 50% 50%; grid-column-gap: 10px;"} @@ -499,7 +513,7 @@ a vector of new names for these columns that must be of the same length. ```{r rename1, class.source = "datawizard"} # ---------- datawizard ----------- -starwars %>% +starwars |> data_rename( pattern = c("sex", "hair_color"), replacement = c("Sex", "Hair Color") @@ -511,7 +525,7 @@ starwars %>% ```{r, class.source = "tidyverse"} # ---------- tidyverse ----------- -starwars %>% +starwars |> rename( Sex = sex, "Hair Color" = hair_color @@ -524,14 +538,14 @@ starwars %>% ```{r rename1, eval = evaluate_chunk, echo = FALSE} ``` -The way `data_rename()` is designed makes it easy to apply the same modifications -to a vector of column names. 
For example, we can remove underscores and use +The way `data_rename()` is designed makes it easy to apply the same modifications +to a vector of column names. For example, we can remove underscores and use TitleCase with the following code: ```{r rename2} to_rename <- names(starwars) -starwars %>% +starwars |> data_rename( pattern = to_rename, replacement = tools::toTitleCase(gsub("_", " ", to_rename, fixed = TRUE)) @@ -541,16 +555,16 @@ starwars %>% ```{r rename2, eval = evaluate_chunk, echo = FALSE} ``` -It is also possible to add a prefix or a suffix to all or a subset of variables -with `data_addprefix()` and `data_addsuffix()`. The argument `select` accepts +It is also possible to add a prefix or a suffix to all or a subset of variables +with `data_addprefix()` and `data_addsuffix()`. The argument `select` accepts all select helpers that we saw above with `data_select()`: ```{r rename3} -starwars %>% +starwars |> data_addprefix( pattern = "OLD.", select = contains("color") - ) %>% + ) |> data_addsuffix( pattern = ".NEW", select = -contains("color") @@ -566,7 +580,7 @@ Sometimes, we want to relocate one or a small subset of columns in the dataset. Rather than typing many names in `data_select()`, we can use `data_relocate()`, which is the equivalent of `dplyr::relocate()`. Just like `data_select()`, we can specify a list of variables we want to relocate with `select` and `exclude`. -Then, the arguments `before` and `after`^[Note that we use `before` and `after` +Then, the arguments `before` and `after`^[Note that we use `before` and `after` whereas `dplyr::relocate()` uses `.before` and `.after`.] 
specify where the selected columns should be relocated: @@ -576,32 +590,32 @@ be relocated: ```{r relocate1, class.source = "datawizard"} # ---------- datawizard ----------- -starwars %>% +starwars |> data_relocate(sex:homeworld, before = "height") ``` ::: - + ::: {} ```{r, class.source = "tidyverse"} # ---------- tidyverse ----------- -starwars %>% +starwars |> relocate(sex:homeworld, .before = height) ``` ::: - + :::: ```{r relocate1, eval = evaluate_chunk, echo = FALSE} ``` In addition to column names, `before` and `after` accept column indices. Finally, -one can use `before = -1` to relocate the selected columns just before the last +one can use `before = -1` to relocate the selected columns just before the last column, or `after = -1` to relocate them after the last column. ```{r eval = evaluate_chunk} # ---------- datawizard ----------- -starwars %>% +starwars |> data_relocate(sex:homeworld, after = -1) ``` @@ -611,10 +625,10 @@ starwars %>% ### Longer Reshaping data from wide to long or from long to wide format can be done with -`data_to_long()` and `data_to_wide()`. These functions were designed to match -`tidyr::pivot_longer()` and `tidyr::pivot_wider()` arguments, so that the only -thing to do is to change the function name. However, not all of -`tidyr::pivot_longer()` and `tidyr::pivot_wider()` features are available yet. +`data_to_long()` and `data_to_wide()`. These functions were designed to match +`tidyr::pivot_longer()` and `tidyr::pivot_wider()` arguments, so that the only +thing to do is to change the function name. However, not all of +`tidyr::pivot_longer()` and `tidyr::pivot_wider()` features are available yet. We will use the `relig_income` dataset, as in the [`{tidyr}` vignette](https://tidyr.tidyverse.org/articles/pivot.html). @@ -623,11 +637,11 @@ relig_income ``` -We would like to reshape this dataset to have 3 columns: religion, count, and -income. The column "religion" doesn't need to change, so we exclude it with -`-religion`. 
Then, each remaining column corresponds to an income category. -Therefore, we want to move all these column names to a single column called -"income". Finally, the values corresponding to each of these columns will be +We would like to reshape this dataset to have 3 columns: religion, count, and +income. The column "religion" doesn't need to change, so we exclude it with +`-religion`. Then, each remaining column corresponds to an income category. +Therefore, we want to move all these column names to a single column called +"income". Finally, the values corresponding to each of these columns will be reshaped to be in a single new column, called "count". :::: {style="display: grid; grid-template-columns: 50% 50%; grid-column-gap: 10px;"} @@ -636,7 +650,7 @@ reshaped to be in a single new column, called "count". ```{r pivot1, class.source = "datawizard"} # ---------- datawizard ----------- -relig_income %>% +relig_income |> data_to_long( -religion, names_to = "income", @@ -649,7 +663,7 @@ relig_income %>% ```{r, class.source = "tidyverse"} # ---------- tidyverse ----------- -relig_income %>% +relig_income |> pivot_longer( !religion, names_to = "income", @@ -676,7 +690,7 @@ billboard ```{r pivot2, class.source = "datawizard"} # ---------- datawizard ----------- -billboard %>% +billboard |> data_to_long( cols = starts_with("wk"), names_to = "week", @@ -690,7 +704,7 @@ billboard %>% ```{r, class.source = "tidyverse"} # ---------- tidyverse ----------- -billboard %>% +billboard |> pivot_longer( cols = starts_with("wk"), names_to = "week", @@ -721,7 +735,7 @@ fish_encounters ```{r pivot3, class.source = "datawizard"} # ---------- datawizard ----------- -fish_encounters %>% +fish_encounters |> data_to_wide( names_from = "station", values_from = "seen", @@ -734,7 +748,7 @@ fish_encounters %>% ```{r, class.source = "tidyverse"} # ---------- tidyverse ----------- -fish_encounters %>% +fish_encounters |> pivot_wider( names_from = station, values_from = seen, @@ -754,12 +768,12 
@@ fish_encounters %>% -In `{datawizard}`, joining datasets is done with `data_join()` (or its alias -`data_merge()`). Contrary to `{dplyr}`, this unique function takes care of all +In `{datawizard}`, joining datasets is done with `data_join()` (or its alias +`data_merge()`). Contrary to `{dplyr}`, this unique function takes care of all types of join, which are then specified inside the function with the argument `join` (by default, `join = "left"`). -Below, we show how to perform the four most common joins: full, left, right and +Below, we show how to perform the four most common joins: full, left, right and inner. We will use the datasets `band_members`and `band_instruments` provided by `{dplyr}`: :::: {style="display: grid; grid-template-columns: 50% 50%; grid-column-gap: 10px;"} @@ -789,7 +803,7 @@ band_instruments ```{r join1, class.source = "datawizard"} # ---------- datawizard ----------- -band_members %>% +band_members |> data_join(band_instruments, join = "full") ``` ::: @@ -798,7 +812,7 @@ band_members %>% ```{r, class.source = "tidyverse"} # ---------- tidyverse ----------- -band_members %>% +band_members |> full_join(band_instruments) ``` ::: @@ -818,7 +832,7 @@ band_members %>% ```{r join2, class.source = "datawizard"} # ---------- datawizard ----------- -band_members %>% +band_members |> data_join(band_instruments, join = "left") ``` ::: @@ -827,7 +841,7 @@ band_members %>% ```{r, class.source = "tidyverse"} # ---------- tidyverse ----------- -band_members %>% +band_members |> left_join(band_instruments) ``` ::: @@ -844,7 +858,7 @@ band_members %>% ```{r join3, class.source = "datawizard"} # ---------- datawizard ----------- -band_members %>% +band_members |> data_join(band_instruments, join = "right") ``` ::: @@ -853,7 +867,7 @@ band_members %>% ```{r, class.source = "tidyverse"} # ---------- tidyverse ----------- -band_members %>% +band_members |> right_join(band_instruments) ``` ::: @@ -873,7 +887,7 @@ band_members %>% ```{r join4, class.source = 
"datawizard"} # ---------- datawizard ----------- -band_members %>% +band_members |> data_join(band_instruments, join = "inner") ``` ::: @@ -882,7 +896,7 @@ band_members %>% ```{r, class.source = "tidyverse"} # ---------- tidyverse ----------- -band_members %>% +band_members |> inner_join(band_instruments) ``` ::: @@ -916,7 +930,7 @@ test ```{r unite1, class.source = "datawizard"} # ---------- datawizard ----------- -test %>% +test |> data_unite( new_column = "date", select = c("year", "month", "day"), @@ -924,12 +938,12 @@ test %>% ) ``` ::: - + ::: {} ```{r, class.source = "tidyverse"} # ---------- tidyverse ----------- -test %>% +test |> unite( col = "date", year, month, day, @@ -937,7 +951,7 @@ test %>% ) ``` ::: - + :::: ```{r unite1, eval = evaluate_chunk, echo = FALSE} @@ -949,7 +963,7 @@ test %>% ```{r unite2, class.source = "datawizard"} # ---------- datawizard ----------- -test %>% +test |> data_unite( new_column = "date", select = c("year", "month", "day"), @@ -958,12 +972,12 @@ test %>% ) ``` ::: - + ::: {} ```{r, class.source = "tidyverse"} # ---------- tidyverse ----------- -test %>% +test |> unite( col = "date", year, month, day, @@ -972,7 +986,7 @@ test %>% ) ``` ::: - + :::: ```{r unite2, eval = evaluate_chunk, echo = FALSE} @@ -999,26 +1013,26 @@ test ```{r separate1, class.source = "datawizard"} # ---------- datawizard ----------- -test %>% +test |> data_separate( select = "date_arrival", new_columns = c("Year", "Month", "Day") ) ``` ::: - + ::: {} ```{r, class.source = "tidyverse"} # ---------- tidyverse ----------- -test %>% +test |> separate( date_arrival, into = c("Year", "Month", "Day") ) ``` ::: - + :::: ```{r separate1, eval = evaluate_chunk, echo = FALSE} @@ -1028,7 +1042,7 @@ test %>% Unlike `tidyr::separate()`, you can separate multiple columns in one step with `data_separate()`. 
```{r eval = evaluate_chunk} -test %>% +test |> data_separate( new_columns = list( date_arrival = c("Arr_Year", "Arr_Month", "Arr_Day"), @@ -1040,9 +1054,9 @@ test %>% # Other useful functions -`{datawizard}` contains other functions that are not necessarily included in -`{dplyr}` or `{tidyr}` or do not directly modify the data. Some of them are -inspired from the package `janitor`. +`{datawizard}` contains other functions that are not necessarily included in +`{dplyr}` or `{tidyr}` or do not directly modify the data. Some of them are +inspired from the package `janitor`. ## Work with rownames @@ -1053,12 +1067,12 @@ We can convert a column in rownames and move rownames to a new column with mtcars <- head(mtcars) mtcars -mtcars2 <- mtcars %>% +mtcars2 <- mtcars |> rownames_as_column(var = "model") mtcars2 -mtcars2 %>% +mtcars2 |> column_as_rownames(var = "model") ``` @@ -1068,7 +1082,7 @@ mtcars2 %>% The main difference is when we use it with grouped data. While `tibble::rowid_to_column()` uses one distinct rowid for every row in the dataset, `rowid_as_column()` creates one id for every row *in each group*. Therefore, two rows in different groups -can have the same row id. +can have the same row id. This means that `rowid_as_column()` is closer to using `n()` in `mutate()`, like the following: @@ -1081,16 +1095,16 @@ test <- data.frame( ) test -test %>% - data_group(group) %>% +test |> + data_group(group) |> tibble::rowid_to_column() -test %>% - data_group(group) %>% +test |> + data_group(group) |> rowid_as_column() -test %>% - data_group(group) %>% +test |> + data_group(group) |> mutate(id = seq_len(n())) ``` @@ -1107,11 +1121,11 @@ x <- data.frame( X_2 = c(NA, "Title2", 4:6) ) x -x2 <- x %>% +x2 <- x |> row_to_colnames(row = 2) x2 -x2 %>% +x2 |> colnames_to_row() ```