From 877c5877334eb58a1a8d909cb6b5dd25d07f8a20 Mon Sep 17 00:00:00 2001 From: Daniel Date: Thu, 7 Sep 2023 16:08:18 +0200 Subject: [PATCH 1/8] Rename `rowmean_n()` to `row_means()` (#448) * Rename `rowmean_n()`? Fixes #447 * Update row_means.R * fix * fix * tests * docs * update pkgdown * fix tests * docs * Update NEWS.md Co-authored-by: Etienne Bacher <52219252+etiennebacher@users.noreply.github.com> * version bump --------- Co-authored-by: Etienne Bacher <52219252+etiennebacher@users.noreply.github.com> --- DESCRIPTION | 2 +- NAMESPACE | 2 +- NEWS.md | 6 +- R/row_means.R | 139 +++++++++++++++++++++++++++++++ R/rowmean_n.R | 101 ----------------------- _pkgdown.yaml | 2 +- man/row_means.Rd | 142 ++++++++++++++++++++++++++++++++ man/rowmean_n.Rd | 72 ---------------- tests/testthat/test-row_means.R | 27 ++++++ tests/testthat/test-rowmean_n.R | 26 ------ 10 files changed, 314 insertions(+), 205 deletions(-) create mode 100644 R/row_means.R delete mode 100644 R/rowmean_n.R create mode 100644 man/row_means.Rd delete mode 100644 man/rowmean_n.Rd create mode 100644 tests/testthat/test-row_means.R delete mode 100644 tests/testthat/test-rowmean_n.R diff --git a/DESCRIPTION b/DESCRIPTION index 401e5ad9b..e9c46fef5 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Type: Package Package: datawizard Title: Easy Data Wrangling and Statistical Transformations -Version: 0.8.0.8 +Version: 0.8.0.9 Authors@R: c( person("Indrajeet", "Patil", , "patilindrajeet.science@gmail.com", role = "aut", comment = c(ORCID = "0000-0003-1995-6531", Twitter = "@patilindrajeets")), diff --git a/NAMESPACE b/NAMESPACE index 30c064f62..a985af2f0 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -278,9 +278,9 @@ export(reshape_longer) export(reshape_wider) export(reverse) export(reverse_scale) +export(row_means) export(row_to_colnames) export(rowid_as_column) -export(rowmean_n) export(rownames_as_column) export(skewness) export(slide) diff --git a/NEWS.md b/NEWS.md index 734cccd0d..7e91322e7 100644 --- a/NEWS.md +++ b/NEWS.md @@ -2,10 +2,10 @@ NEW FUNCTIONS -* `contr.deviation()` for sum-deviation contrast coding of factors. +* `row_means()`, to compute row means, optionally only for the rows with at + least `min_valid` non-missing values. -* `rowmean_n()`, to compute row means if row contains at least `n` non-missing - values. +* `contr.deviation()` for sum-deviation contrast coding of factors. * `means_by_group()`, to compute mean values of variables, grouped by levels of specified factors. diff --git a/R/row_means.R b/R/row_means.R new file mode 100644 index 000000000..f252824b1 --- /dev/null +++ b/R/row_means.R @@ -0,0 +1,139 @@ +#' @title Row means (optionally with minimum amount of valid values) +#' @name row_means +#' @description This function is similar to the SPSS `MEAN.n` function and computes +#' row means from a data frame or matrix if at least `min_valid` values of a row are +#' valid (and not `NA`). +#' +#' @param data A data frame with at least two columns, where row means are applied. +#' @param min_valid Optional, a numeric value of length 1. May either be +#' - a numeric value that indicates the amount of valid values per row to +#' calculate the row mean; +#' - or a value between 0 and 1, indicating a proportion of valid values per +#' row to calculate the row mean (see 'Details'). +#' - `NULL` (default), in which all cases are considered. +#' +#' If a row's sum of valid values is less than `min_valid`, `NA` will be returned. +#' @param digits Numeric value indicating the number of decimal places to be +#' used for rounding mean values. Negative values are allowed (see 'Details'). +#' By default, `digits = NULL` and no rounding is used. +#' @param remove_na Logical, if `TRUE` (default), removes missing (`NA`) values +#' before calculating row means. Only applies if `min_valuid` is not specified. +#' @param verbose Toggle warnings. +#' @inheritParams find_columns +#' +#' @return A vector with row means for those rows with at least `n` valid values. +#' +#' @details Rounding to a negative number of `digits` means rounding to a power of +#' ten, for example `row_means(df, 3, digits = -2)` rounds to the nearest hundred. +#' For `min_valid`, if not `NULL`, `min_valid` must be a numeric value from `0` +#' to `ncol(data)`. If a row in the data frame has at least `min_valid` +#' non-missing values, the row mean is returned. If `min_valid` is a non-integer +#' value from 0 to 1, `min_valid` is considered to indicate the proportion of +#' required non-missing values per row. E.g., if `min_valid = 0.75`, a row must +#' have at least `ncol(data) * min_valid` non-missing values for the row mean +#' to be calculated. See 'Examples'. +#' +#' @examples +#' dat <- data.frame( +#' c1 = c(1, 2, NA, 4), +#' c2 = c(NA, 2, NA, 5), +#' c3 = c(NA, 4, NA, NA), +#' c4 = c(2, 3, 7, 8) +#' ) +#' +#' # default, all means are shown, if no NA values are present +#' row_means(dat) +#' +#' # remove all NA before computing row means +#' row_means(dat, remove_na = TRUE) +#' +#' # needs at least 4 non-missing values per row +#' row_means(dat, min_valid = 4) # 1 valid return value +#' +#' # needs at least 3 non-missing values per row +#' row_means(dat, min_valid = 3) # 2 valid return values +#' +#' # needs at least 2 non-missing values per row +#' row_means(dat, min_valid = 2) +#' +#' # needs at least 1 non-missing value per row, for two selected variables +#' row_means(dat, select = c("c1", "c3"), min_valid = 1) +#' +#' # needs at least 50% of non-missing values per row +#' row_means(dat, min_valid = 0.5) # 3 valid return values +#' +#' # needs at least 75% of non-missing values per row +#' row_means(dat, min_valid = 0.75) # 2 valid return values +#' +#' @export +row_means <- function(data, + select = NULL, + exclude = NULL, + min_valid = NULL, + digits = NULL, + ignore_case = FALSE, + regex = FALSE, + remove_na = FALSE, + verbose = TRUE) { + # evaluate arguments + select <- .select_nse(select, + data, + exclude, + ignore_case = ignore_case, + regex = regex, + verbose = verbose + ) + + if (is.null(select) || length(select) == 0) { + insight::format_error("No columns selected.") + } + + data <- .coerce_to_dataframe(data[select]) + + # n must be a numeric, non-missing value + if (!is.null(min_valid) && (all(is.na(min_valid)) || !is.numeric(min_valid) || length(min_valid) > 1)) { + insight::format_error("`min_valid` must be a numeric value of length 1.") + } + + # make sure we only have numeric values + numeric_columns <- vapply(data, is.numeric, TRUE) + if (!all(numeric_columns)) { + if (verbose) { + insight::format_alert("Only numeric columns are considered for calculation.") + } + data <- data[numeric_columns] + } + + # check if we have a data framme with at least two columns + if (ncol(data) < 2) { + insight::format_error("`data` must be a data frame with at least two numeric columns.") + } + + # proceed here if min_valid is not NULL + if (!is.null(min_valid)) { + # is 'min_valid' indicating a proportion? + decimals <- min_valid %% 1 + if (decimals != 0) { + min_valid <- round(ncol(data) * decimals) + } + + # min_valid may not be larger as df's amount of columns + if (ncol(data) < min_valid) { + insight::format_error("`min_valid` must be smaller or equal to number of columns in data frame.") + } + + # row means + to_na <- rowSums(is.na(data)) > ncol(data) - min_valid + out <- rowMeans(data, na.rm = TRUE) + out[to_na] <- NA + } else { + out <- rowMeans(data, na.rm = remove_na) + } + + # round, if requested + if (!is.null(digits) && !all(is.na(digits))) { + out <- round(out, digits = digits) + } + + out +} diff --git a/R/rowmean_n.R b/R/rowmean_n.R deleted file mode 100644 index ab47cf511..000000000 --- a/R/rowmean_n.R +++ /dev/null @@ -1,101 +0,0 @@ -#' @title Row means with minimum amount of valid values -#' @name rowmean_n -#' @description This function is similar to the SPSS `MEAN.n` function and computes -#' row means from a data frame or matrix if at least `n` values of a row are -#' valid (and not `NA`). -#' -#' @param data A data frame with at least two columns, where row means are applied. -#' @param n A numeric value of length 1. May either be -#' - a numeric value that indicates the amount of valid values per row to -#' calculate the row mean; -#' - or a value between 0 and 1, indicating a proportion of valid values per -#' row to calculate the row mean (see 'Details'). -#' -#' If a row's sum of valid values is less than `n`, `NA` will be returned. -#' @param digits Numeric value indicating the number of decimal places to be -#' used for rounding mean values. Negative values are allowed (see 'Details'). -#' By default, `digits = NULL` and no rounding is used. -#' @param verbose Toggle warnings. -#' -#' @return A vector with row means for those rows with at least `n` valid values. -#' -#' @details Rounding to a negative number of `digits` means rounding to a power of -#' ten, for example `rowmean_n(df, 3, digits = -2)` rounds to the nearest hundred. -#' For `n`, must be a numeric value from `0` to `ncol(data)`. If a row in the -#' data frame has at least `n` non-missing values, the row mean is returned. If -#' `n` is a non-integer value from 0 to 1, `n` is considered to indicate the -#' proportion of required non-missing values per row. E.g., if `n = 0.75`, a -#' row must have at least `ncol(data) * n` non-missing values for the row mean -#' to be calculated. See 'Examples'. -#' -#' @examples -#' dat <- data.frame( -#' c1 = c(1, 2, NA, 4), -#' c2 = c(NA, 2, NA, 5), -#' c3 = c(NA, 4, NA, NA), -#' c4 = c(2, 3, 7, 8) -#' ) -#' -#' # needs at least 4 non-missing values per row -#' rowmean_n(dat, 4) # 1 valid return value -#' -#' # needs at least 3 non-missing values per row -#' rowmean_n(dat, 3) # 2 valid return values -#' -#' # needs at least 2 non-missing values per row -#' rowmean_n(dat, 2) -#' -#' # needs at least 1 non-missing value per row -#' rowmean_n(dat, 1) # all means are shown -#' -#' # needs at least 50% of non-missing values per row -#' rowmean_n(dat, 0.5) # 3 valid return values -#' -#' # needs at least 75% of non-missing values per row -#' rowmean_n(dat, 0.75) # 2 valid return values -#' -#' @export -rowmean_n <- function(data, n, digits = NULL, verbose = TRUE) { - data <- .coerce_to_dataframe(data) - - # n must be a numeric, non-missing value - if (is.null(n) || all(is.na(n)) || !is.numeric(n) || length(n) > 1) { - insight::format_error("`n` must be a numeric value of length 1.") - } - - # make sure we only have numeric values - numeric_columns <- vapply(data, is.numeric, TRUE) - if (!all(numeric_columns)) { - if (verbose) { - insight::format_alert("Only numeric columns are considered for calculation.") - } - data <- data[numeric_columns] - } - - # check if we have a data framme with at least two columns - if (ncol(data) < 2) { - insight::format_error("`data` must be a data frame with at least two numeric columns.") - } - - # is 'n' indicating a proportion? - decimals <- n %% 1 - if (decimals != 0) { - n <- round(ncol(data) * decimals) - } - - # n may not be larger as df's amount of columns - if (ncol(data) < n) { - insight::format_error("`n` must be smaller or equal to number of columns in data frame.") - } - - # row means - to_na <- rowSums(is.na(data)) > ncol(data) - n - out <- rowMeans(data, na.rm = TRUE) - out[to_na] <- NA - - # round, if requested - if (!is.null(digits) && !all(is.na(digits))) { - out <- round(out, digits = digits) - } - out -} diff --git a/_pkgdown.yaml b/_pkgdown.yaml index 7e0aa5cb4..db2ebfeae 100644 --- a/_pkgdown.yaml +++ b/_pkgdown.yaml @@ -68,8 +68,8 @@ reference: - kurtosis - smoothness - skewness + - row_means - weighted_mean - - rowmean_n - mean_sd - title: Convert and Replace Data diff --git a/man/row_means.Rd b/man/row_means.Rd new file mode 100644 index 000000000..6e4a7774b --- /dev/null +++ b/man/row_means.Rd @@ -0,0 +1,142 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/row_means.R +\name{row_means} +\alias{row_means} +\title{Row means (optionally with minimum amount of valid values)} +\usage{ +row_means( + data, + select = NULL, + exclude = NULL, + min_valid = NULL, + digits = NULL, + ignore_case = FALSE, + regex = FALSE, + remove_na = FALSE, + verbose = TRUE +) +} +\arguments{ +\item{data}{A data frame with at least two columns, where row means are applied.} + +\item{select}{Variables that will be included when performing the required +tasks. Can be either +\itemize{ +\item a variable specified as a literal variable name (e.g., \code{column_name}), +\item a string with the variable name (e.g., \code{"column_name"}), or a character +vector of variable names (e.g., \code{c("col1", "col2", "col3")}), +\item a formula with variable names (e.g., \code{~column_1 + column_2}), +\item a vector of positive integers, giving the positions counting from the left +(e.g. \code{1} or \code{c(1, 3, 5)}), +\item a vector of negative integers, giving the positions counting from the +right (e.g., \code{-1} or \code{-1:-3}), +\item one of the following select-helpers: \code{starts_with()}, \code{ends_with()}, +\code{contains()}, a range using \code{:} or \code{regex("")}. \code{starts_with()}, +\code{ends_with()}, and \code{contains()} accept several patterns, e.g +\code{starts_with("Sep", "Petal")}. +\item or a function testing for logical conditions, e.g. \code{is.numeric()} (or +\code{is.numeric}), or any user-defined function that selects the variables +for which the function returns \code{TRUE} (like: \code{foo <- function(x) mean(x) > 3}), +\item ranges specified via literal variable names, select-helpers (except +\code{regex()}) and (user-defined) functions can be negated, i.e. return +non-matching elements, when prefixed with a \code{-}, e.g. \code{-ends_with("")}, +\code{-is.numeric} or \code{-(Sepal.Width:Petal.Length)}. \strong{Note:} Negation means +that matches are \emph{excluded}, and thus, the \code{exclude} argument can be +used alternatively. For instance, \code{select=-ends_with("Length")} (with +\code{-}) is equivalent to \code{exclude=ends_with("Length")} (no \code{-}). In case +negation should not work as expected, use the \code{exclude} argument instead. +} + +If \code{NULL}, selects all columns. Patterns that found no matches are silently +ignored, e.g. \code{find_columns(iris, select = c("Species", "Test"))} will just +return \code{"Species"}.} + +\item{exclude}{See \code{select}, however, column names matched by the pattern +from \code{exclude} will be excluded instead of selected. If \code{NULL} (the default), +excludes no columns.} + +\item{min_valid}{Optional, a numeric value of length 1. May either be +\itemize{ +\item a numeric value that indicates the amount of valid values per row to +calculate the row mean; +\item or a value between 0 and 1, indicating a proportion of valid values per +row to calculate the row mean (see 'Details'). +\item \code{NULL} (default), in which all cases are considered. +} + +If a row's sum of valid values is less than \code{min_valid}, \code{NA} will be returned.} + +\item{digits}{Numeric value indicating the number of decimal places to be +used for rounding mean values. Negative values are allowed (see 'Details'). +By default, \code{digits = NULL} and no rounding is used.} + +\item{ignore_case}{Logical, if \code{TRUE} and when one of the select-helpers or +a regular expression is used in \code{select}, ignores lower/upper case in the +search pattern when matching against variable names.} + +\item{regex}{Logical, if \code{TRUE}, the search pattern from \code{select} will be +treated as regular expression. When \code{regex = TRUE}, select \emph{must} be a +character string (or a variable containing a character string) and is not +allowed to be one of the supported select-helpers or a character vector +of length > 1. \code{regex = TRUE} is comparable to using one of the two +select-helpers, \code{select = contains("")} or \code{select = regex("")}, however, +since the select-helpers may not work when called from inside other +functions (see 'Details'), this argument may be used as workaround.} + +\item{remove_na}{Logical, if \code{TRUE} (default), removes missing (\code{NA}) values +before calculating row means. Only applies if \code{min_valuid} is not specified.} + +\item{verbose}{Toggle warnings.} +} +\value{ +A vector with row means for those rows with at least \code{n} valid values. +} +\description{ +This function is similar to the SPSS \code{MEAN.n} function and computes +row means from a data frame or matrix if at least \code{min_valid} values of a row are +valid (and not \code{NA}). +} +\details{ +Rounding to a negative number of \code{digits} means rounding to a power of +ten, for example \code{row_means(df, 3, digits = -2)} rounds to the nearest hundred. +For \code{min_valid}, if not \code{NULL}, \code{min_valid} must be a numeric value from \code{0} +to \code{ncol(data)}. If a row in the data frame has at least \code{min_valid} +non-missing values, the row mean is returned. If \code{min_valid} is a non-integer +value from 0 to 1, \code{min_valid} is considered to indicate the proportion of +required non-missing values per row. E.g., if \code{min_valid = 0.75}, a row must +have at least \code{ncol(data) * min_valid} non-missing values for the row mean +to be calculated. See 'Examples'. +} +\examples{ +dat <- data.frame( + c1 = c(1, 2, NA, 4), + c2 = c(NA, 2, NA, 5), + c3 = c(NA, 4, NA, NA), + c4 = c(2, 3, 7, 8) +) + +# default, all means are shown, if no NA values are present +row_means(dat) + +# remove all NA before computing row means +row_means(dat, remove_na = TRUE) + +# needs at least 4 non-missing values per row +row_means(dat, min_valid = 4) # 1 valid return value + +# needs at least 3 non-missing values per row +row_means(dat, min_valid = 3) # 2 valid return values + +# needs at least 2 non-missing values per row +row_means(dat, min_valid = 2) + +# needs at least 1 non-missing value per row, for two selected variables +row_means(dat, select = c("c1", "c3"), min_valid = 1) + +# needs at least 50\% of non-missing values per row +row_means(dat, min_valid = 0.5) # 3 valid return values + +# needs at least 75\% of non-missing values per row +row_means(dat, min_valid = 0.75) # 2 valid return values + +} diff --git a/man/rowmean_n.Rd b/man/rowmean_n.Rd deleted file mode 100644 index df340eed3..000000000 --- a/man/rowmean_n.Rd +++ /dev/null @@ -1,72 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/rowmean_n.R -\name{rowmean_n} -\alias{rowmean_n} -\title{Row means with minimum amount of valid values} -\usage{ -rowmean_n(data, n, digits = NULL, verbose = TRUE) -} -\arguments{ -\item{data}{A data frame with at least two columns, where row means are applied.} - -\item{n}{A numeric value of length 1. May either be -\itemize{ -\item a numeric value that indicates the amount of valid values per row to -calculate the row mean; -\item or a value between 0 and 1, indicating a proportion of valid values per -row to calculate the row mean (see 'Details'). -} - -If a row's sum of valid values is less than \code{n}, \code{NA} will be returned.} - -\item{digits}{Numeric value indicating the number of decimal places to be -used for rounding mean values. Negative values are allowed (see 'Details'). -By default, \code{digits = NULL} and no rounding is used.} - -\item{verbose}{Toggle warnings.} -} -\value{ -A vector with row means for those rows with at least \code{n} valid values. -} -\description{ -This function is similar to the SPSS \code{MEAN.n} function and computes -row means from a data frame or matrix if at least \code{n} values of a row are -valid (and not \code{NA}). -} -\details{ -Rounding to a negative number of \code{digits} means rounding to a power of -ten, for example \code{rowmean_n(df, 3, digits = -2)} rounds to the nearest hundred. -For \code{n}, must be a numeric value from \code{0} to \code{ncol(data)}. If a row in the -data frame has at least \code{n} non-missing values, the row mean is returned. If -\code{n} is a non-integer value from 0 to 1, \code{n} is considered to indicate the -proportion of required non-missing values per row. E.g., if \code{n = 0.75}, a -row must have at least \code{ncol(data) * n} non-missing values for the row mean -to be calculated. See 'Examples'. -} -\examples{ -dat <- data.frame( - c1 = c(1, 2, NA, 4), - c2 = c(NA, 2, NA, 5), - c3 = c(NA, 4, NA, NA), - c4 = c(2, 3, 7, 8) -) - -# needs at least 4 non-missing values per row -rowmean_n(dat, 4) # 1 valid return value - -# needs at least 3 non-missing values per row -rowmean_n(dat, 3) # 2 valid return values - -# needs at least 2 non-missing values per row -rowmean_n(dat, 2) - -# needs at least 1 non-missing value per row -rowmean_n(dat, 1) # all means are shown - -# needs at least 50\% of non-missing values per row -rowmean_n(dat, 0.5) # 3 valid return values - -# needs at least 75\% of non-missing values per row -rowmean_n(dat, 0.75) # 2 valid return values - -} diff --git a/tests/testthat/test-row_means.R b/tests/testthat/test-row_means.R new file mode 100644 index 000000000..8d0504c69 --- /dev/null +++ b/tests/testthat/test-row_means.R @@ -0,0 +1,27 @@ +test_that("row_means", { + d_mn <- data.frame( + c1 = c(1, 2, NA, 4), + c2 = c(NA, 2, NA, 5), + c3 = c(NA, 4, NA, NA), + c4 = c(2, 3, 7, 8) + ) + expect_equal(row_means(d_mn, min_valid = 4), c(NA, 2.75, NA, NA), tolerance = 1e-3) + expect_equal(row_means(d_mn, min_valid = 3), c(NA, 2.75, NA, 5.66667), tolerance = 1e-3) + expect_equal(row_means(d_mn, min_valid = 2), c(1.5, 2.75, NA, 5.66667), tolerance = 1e-3) + expect_equal(row_means(d_mn, min_valid = 1), c(1.5, 2.75, 7, 5.66667), tolerance = 1e-3) + expect_equal(row_means(d_mn, min_valid = 0.5), c(1.5, 2.75, NA, 5.66667), tolerance = 1e-3) + expect_equal(row_means(d_mn, min_valid = 0.75), c(NA, 2.75, NA, 5.66667), tolerance = 1e-3) + expect_equal(row_means(d_mn, min_valid = 2, digits = 1), c(1.5, 2.8, NA, 5.7), tolerance = 1e-1) + expect_message(row_means(iris), regex = "Only numeric") + expect_equal(row_means(iris, verbose = FALSE), rowMeans(iris[, 1:4]), tolerance = 1e-3, ignore_attr = TRUE) +}) + +test_that("row_means, errors or messages", { + data(iris) + expect_error(expect_warning(row_means(iris, select = "abc")), regex = "No columns") + expect_error(row_means(iris[1], min_valid = 1), regex = "two numeric") + expect_error(row_means(iris, min_valid = 1:4), regex = "numeric value") + expect_error(row_means(iris, min_valid = "a"), regex = "numeric value") + expect_message(row_means(iris[1:3, ], min_valid = 3), regex = "Only numeric") + expect_silent(row_means(iris[1:3, ], min_valid = 3, verbose = FALSE)) +}) diff --git a/tests/testthat/test-rowmean_n.R b/tests/testthat/test-rowmean_n.R deleted file mode 100644 index a17996ff6..000000000 --- a/tests/testthat/test-rowmean_n.R +++ /dev/null @@ -1,26 +0,0 @@ -test_that("rowmean_n", { - d_mn <- data.frame( - c1 = c(1, 2, NA, 4), - c2 = c(NA, 2, NA, 5), - c3 = c(NA, 4, NA, NA), - c4 = c(2, 3, 7, 8) - ) - expect_equal(rowmean_n(d_mn, 4), c(NA, 2.75, NA, NA), tolerance = 1e-3) - expect_equal(rowmean_n(d_mn, 3), c(NA, 2.75, NA, 5.66667), tolerance = 1e-3) - expect_equal(rowmean_n(d_mn, 2), c(1.5, 2.75, NA, 5.66667), tolerance = 1e-3) - expect_equal(rowmean_n(d_mn, 1), c(1.5, 2.75, 7, 5.66667), tolerance = 1e-3) - expect_equal(rowmean_n(d_mn, 0.5), c(1.5, 2.75, NA, 5.66667), tolerance = 1e-3) - expect_equal(rowmean_n(d_mn, 0.75), c(NA, 2.75, NA, 5.66667), tolerance = 1e-3) - expect_equal(rowmean_n(d_mn, 2, digits = 1), c(1.5, 2.8, NA, 5.7), tolerance = 1e-1) -}) - -test_that("rowmean_n, errors or messages", { - data(iris) - expect_error(rowmean_n(5, n = 1), regex = "`data` must be") - expect_error(rowmean_n(iris[1], n = 1), regex = "two numeric") - expect_error(rowmean_n(iris, n = NULL), regex = "numeric value") - expect_error(rowmean_n(iris, n = 1:4), regex = "numeric value") - expect_error(rowmean_n(iris, n = "a"), regex = "numeric value") - expect_message(rowmean_n(iris[1:3, ], n = 3), regex = "Only numeric") - expect_silent(rowmean_n(iris[1:3, ], n = 3, verbose = FALSE)) -}) From 1b3b82541538ecbf0a57039875f189ccfca88bc0 Mon Sep 17 00:00:00 2001 From: Daniel Date: Thu, 7 Sep 2023 20:20:29 +0200 Subject: [PATCH 2/8] fix issues with NA values in recodes (#455) * fix issues with NA values in recodes * add reserve_na attr, add tests * add comments * version bump * Update test-recode_into.R * scoping issue * rename objects in tests, maybe fixes random test order --- DESCRIPTION | 2 +- NEWS.md | 6 +++ R/recode_into.r | 26 ++++++++++++- man/recode_into.Rd | 14 ++++++- tests/testthat/test-recode_into.R | 64 ++++++++++++++++++++++++++++++- 5 files changed, 107 insertions(+), 5 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index e9c46fef5..76d6967bc 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Type: Package Package: datawizard Title: Easy Data Wrangling and Statistical Transformations -Version: 0.8.0.9 +Version: 0.8.0.10 Authors@R: c( person("Indrajeet", "Patil", , "patilindrajeet.science@gmail.com", role = "aut", comment = c(ORCID = "0000-0003-1995-6531", Twitter = "@patilindrajeets")), diff --git a/NEWS.md b/NEWS.md index 7e91322e7..295570ec4 100644 --- a/NEWS.md +++ b/NEWS.md @@ -15,6 +15,9 @@ CHANGES * `recode_into()` gains an `overwrite` argument to skip overwriting already recoded cases when multiple recode patterns apply to the same case. +* `recode_into()` gains an `preserve_na` argument to preserve `NA` values + when recoding. + * `data_read()` now passes the `encoding` argument to `data.table::fread()`. This allows to read files with non-ASCII characters. @@ -28,6 +31,9 @@ BUG FIXES * Fixed issue in `recode_into()` with probably wrong case number printed in the warning when several recode patterns match to one case. +* Fixed issue in `recode_into()` when original data contained `NA` values and + `NA` was not included in the recode pattern. + * Fixed issue in `data_filter()` where functions containing a `=` (e.g. when naming arguments, like `grepl(pattern, x = a)`) were mistakenly seen as faulty syntax. diff --git a/R/recode_into.r b/R/recode_into.r index 8a382fbe1..b93dbe5d9 100644 --- a/R/recode_into.r +++ b/R/recode_into.r @@ -20,6 +20,10 @@ #' recode patterns. If `FALSE`, former recoded cases will not be altered by later #' recode patterns that would apply to those cases again. A warning message is #' printed to alert such situations and to avoid unintentional recodings. +#' @param preserve_na Logical, if `TRUE` (default) and `default` is not `NA`, +#' missing values in the original variable will be set back to `NA` in the +#' recoded variable (unless overwritten by other recode patterns). If `FALSE`, +#' missing values in the original variable will be recoded to `default`. #' @param verbose Toggle warnings. #' #' @return A vector with recoded values. @@ -73,7 +77,12 @@ #' default = 0 #' ) #' @export -recode_into <- function(..., data = NULL, default = NA, overwrite = TRUE, verbose = TRUE) { +recode_into <- function(..., + data = NULL, + default = NA, + overwrite = TRUE, + preserve_na = TRUE, + verbose = TRUE) { dots <- list(...) # get length of vector, so we know the length of the output vector @@ -135,6 +144,12 @@ recode_into <- function(..., data = NULL, default = NA, overwrite = TRUE, verbos index <- with(data, eval(dots[[i]][[2]])) value <- with(data, eval(dots[[i]][[3]])) } + # remember missing values, so we can add back later + missing_index <- is.na(index) + # make sure index has no missing values. when we have missing values in + # original expression, these are considered as "no match" and set to FALSE + # we handle NA value later and thus want to remove them from "index" now + index[is.na(index)] <- FALSE # overwriting values? do more recode-patterns match the same case? if (is.na(default)) { already_exists <- !is.na(out[index]) @@ -144,7 +159,7 @@ recode_into <- function(..., data = NULL, default = NA, overwrite = TRUE, verbos # save indices of overwritten cases overwritten_cases <- which(index)[already_exists] # tell user... - if (any(already_exists) && verbose) { + if (any(already_exists, na.rm = TRUE) && verbose) { if (overwrite) { msg <- paste( "Several recode patterns apply to the same cases.", @@ -164,7 +179,14 @@ recode_into <- function(..., data = NULL, default = NA, overwrite = TRUE, verbos if (!overwrite) { index[overwritten_cases] <- FALSE } + # write new values into output vector out[index] <- value + # set back missing values + if (any(missing_index) && !is.na(default) && preserve_na) { + # but only where we still have default values + # we don't want to overwrite already recoded values with NA + out[missing_index & out == default] <- NA + } } out diff --git a/man/recode_into.Rd b/man/recode_into.Rd index 064d72f6c..b3e164131 100644 --- a/man/recode_into.Rd +++ b/man/recode_into.Rd @@ -4,7 +4,14 @@ \alias{recode_into} \title{Recode values from one or more variables into a new variable} \usage{ -recode_into(..., data = NULL, default = NA, overwrite = TRUE, verbose = TRUE) +recode_into( + ..., + data = NULL, + default = NA, + overwrite = TRUE, + preserve_na = TRUE, + verbose = TRUE +) } \arguments{ \item{...}{A sequence of two-sided formulas, where the left hand side (LHS) @@ -25,6 +32,11 @@ recode patterns. If \code{FALSE}, former recoded cases will not be altered by la recode patterns that would apply to those cases again. A warning message is printed to alert such situations and to avoid unintentional recodings.} +\item{preserve_na}{Logical, if \code{TRUE} (default) and \code{default} is not \code{NA}, +missing values in the original variable will be set back to \code{NA} in the +recoded variable (unless overwritten by other recode patterns). If \code{FALSE}, +missing values in the original variable will be recoded to \code{default}.} + \item{verbose}{Toggle warnings.} } \value{ diff --git a/tests/testthat/test-recode_into.R b/tests/testthat/test-recode_into.R index ab5c7908b..df7cf60b2 100644 --- a/tests/testthat/test-recode_into.R +++ b/tests/testthat/test-recode_into.R @@ -170,7 +170,7 @@ test_that("recode_into, check differen input length", { ) }) -test_that("recode_into, check differen input length", { +test_that("recode_into, check different input length", { x <- 1:5 y <- c(5, 2, 3, 1, 4) expect_warning( @@ -184,3 +184,65 @@ test_that("recode_into, check differen input length", { regexp = "Several recode patterns" ) }) + +test_that("recode_into, make sure recode works with missing in original variable", { + data(mtcars) + mtcars$mpg[c(3, 10, 12, 15, 16)] <- NA + mtcars$cyl[c(2, 15, 16)] <- NA + d_recode_na <<- as.data.frame(mtcars) + out1_recoded_na <- recode_into( + d_recode_na$mpg > 20 & d_recode_na$cyl == 6 ~ 1, + d_recode_na$mpg <= 20 ~ 2, + d_recode_na$cyl == 4 ~ 3, + default = 0 + ) + out2_recoded_na <- recode_into( + d_recode_na$mpg > 20 & d_recode_na$cyl == 6 ~ 1, + d_recode_na$mpg <= 20 ~ 2, + default = 0 + ) + out3_recoded_na <- recode_into( + d_recode_na$mpg > 20 & d_recode_na$cyl == 6 ~ 1, + d_recode_na$mpg <= 20 ~ 2, + d_recode_na$cyl == 4 ~ 3, + default = 0, + preserve_na = FALSE + ) + out4_recoded_na <- recode_into( + d_recode_na$mpg > 20 & d_recode_na$cyl == 6 ~ 1, + d_recode_na$mpg <= 20 ~ 2, + default = 0, + preserve_na = FALSE + ) + # one NA in mpg is overwritten by valid value from cyl, total 5 NA + expect_identical( + out1_recoded_na, + c( + 1, NA, 3, 1, 2, 2, 2, 3, 3, NA, 2, NA, 2, 2, NA, NA, 2, 3, + 3, 3, 3, 2, 2, 2, 2, 3, 3, 3, 2, 2, 2, 3 + ) + ) + # total 6 NA + expect_identical( + out2_recoded_na, + c( + 1, NA, NA, 1, 2, 2, 2, 0, 0, NA, 2, NA, 2, 2, NA, NA, 2, 0, + 0, 0, 0, 2, 2, 2, 2, 0, 0, 0, 2, 2, 2, 0 + ) + ) + # NA is preserved, set to default if not overwritten by other recodes + expect_identical( + out3_recoded_na, + c( + 1, 0, 3, 1, 2, 2, 2, 3, 3, 0, 2, 0, 2, 2, 0, 0, 2, 3, 3, 3, + 3, 2, 2, 2, 2, 3, 3, 3, 2, 2, 2, 3 + ) + ) + expect_identical( + out4_recoded_na, + c( + 1, 0, 0, 1, 2, 2, 2, 0, 0, 0, 2, 0, 2, 2, 0, 0, 2, 0, 0, 0, + 0, 2, 2, 2, 2, 0, 0, 0, 2, 2, 2, 0 + ) + ) +}) From 35c4f13d86269d44f152fb0b447d7e158855f79e Mon Sep 17 00:00:00 2001 From: Daniel Date: Thu, 7 Sep 2023 20:39:42 +0200 Subject: [PATCH 3/8] improve handling --- R/recode_into.r | 57 ++++++++++++++++++++++++++----- man/recode_into.Rd | 38 ++++++++++++++++++--- tests/testthat/test-recode_into.R | 38 ++++++++++++++------- 3 files changed, 106 insertions(+), 27 deletions(-) diff --git a/R/recode_into.r b/R/recode_into.r index b93dbe5d9..a33f03eb7 100644 --- a/R/recode_into.r +++ b/R/recode_into.r @@ -20,10 +20,13 @@ #' recode patterns. If `FALSE`, former recoded cases will not be altered by later #' recode patterns that would apply to those cases again. A warning message is #' printed to alert such situations and to avoid unintentional recodings. -#' @param preserve_na Logical, if `TRUE` (default) and `default` is not `NA`, -#' missing values in the original variable will be set back to `NA` in the -#' recoded variable (unless overwritten by other recode patterns). If `FALSE`, -#' missing values in the original variable will be recoded to `default`. +#' @param preserve_na Logical, if `TRUE` and `default` is not `NA`, missing +#' values in the original variable will be set back to `NA` in the recoded +#' variable (unless overwritten by other recode patterns). If `FALSE`, missing +#' values in the original variable will be recoded to `default`. The latter +#' behaviour prevents unintentional overwriting of missing values with `default`, +#' which means that you won't find valid values where the original data only +#' had missing values. See 'Examples'. #' @param verbose Toggle warnings. #' #' @return A vector with recoded values. @@ -76,12 +79,37 @@ #' data = d, #' default = 0 #' ) +#' +#' # handling of missing values +#' d <- data.frame( +#' x = c(1, NA, 2, NA, 3, 4), +#' y = c(1, 11, 3, NA, 5, 6) +#' ) +#' # first NA in x is overwritten by valid value from y +#' # we have no known value for second NA in x and y, +#' # thus we get one NA in the result +#' recode_into( +#' x <= 3 ~ 1, +#' y > 5 ~ 2, +#' data = d, +#' default = 0, +#' preserve_na = TRUE +#' ) +#' # first NA in x is overwritten by valid value from y +#' # default value is used for second NA +#' recode_into( +#' x <= 3 ~ 1, +#' y > 5 ~ 2, +#' data = d, +#' default = 0, +#' preserve_na = FALSE +#' ) #' @export recode_into <- function(..., data = NULL, default = NA, overwrite = TRUE, - preserve_na = TRUE, + preserve_na = FALSE, verbose = TRUE) { dots <- list(...) @@ -133,6 +161,9 @@ recode_into <- function(..., ) } + # indicator to show message when replacing NA by default + # needed to show message only once + overwrite_NA_msg <- TRUE # iterate all expressions for (i in seq_len(n_params)) { @@ -182,10 +213,18 @@ recode_into <- function(..., # write new values into output vector out[index] <- value # set back missing values - if (any(missing_index) && !is.na(default) && preserve_na) { - # but only where we still have default values - # we don't want to overwrite already recoded values with NA - out[missing_index & out == default] <- NA + if (any(missing_index) && !is.na(default)) { + if (preserve_na) { + # but only where we still have default values + # we don't want to overwrite already recoded values with NA + out[missing_index & out == default] <- NA + } else if (overwrite_NA_msg && verbose) { + # don't show msg again + overwrite_NA_msg <- FALSE + insight::format_alert( + "Missing values in original variable are overwritten by default value. If you want to preserve missing values, set `preserve_na = TRUE`." + ) + } } } diff --git a/man/recode_into.Rd b/man/recode_into.Rd index b3e164131..d8d0a337d 100644 --- a/man/recode_into.Rd +++ b/man/recode_into.Rd @@ -9,7 +9,7 @@ recode_into( data = NULL, default = NA, overwrite = TRUE, - preserve_na = TRUE, + preserve_na = FALSE, verbose = TRUE ) } @@ -32,10 +32,13 @@ recode patterns. If \code{FALSE}, former recoded cases will not be altered by la recode patterns that would apply to those cases again. A warning message is printed to alert such situations and to avoid unintentional recodings.} -\item{preserve_na}{Logical, if \code{TRUE} (default) and \code{default} is not \code{NA}, -missing values in the original variable will be set back to \code{NA} in the -recoded variable (unless overwritten by other recode patterns). If \code{FALSE}, -missing values in the original variable will be recoded to \code{default}.} +\item{preserve_na}{Logical, if \code{TRUE} and \code{default} is not \code{NA}, missing +values in the original variable will be set back to \code{NA} in the recoded +variable (unless overwritten by other recode patterns). If \code{FALSE}, missing +values in the original variable will be recoded to \code{default}. The latter +behaviour prevents unintentional overwriting of missing values with \code{default}, +which means that you won't find valid values where the original data only +had missing values. See 'Examples'.} \item{verbose}{Toggle warnings.} } @@ -95,4 +98,29 @@ recode_into( data = d, default = 0 ) + +# handling of missing values +d <- data.frame( + x = c(1, NA, 2, NA, 3, 4), + y = c(1, 11, 3, NA, 5, 6) +) +# first NA in x is overwritten by valid value from y +# we have no known value for second NA in x and y, +# thus we get one NA in the result +recode_into( + x <= 3 ~ 1, + y > 5 ~ 2, + data = d, + default = 0, + preserve_na = TRUE +) +# first NA in x is overwritten by valid value from y +# default value is used for second NA +recode_into( + x <= 3 ~ 1, + y > 5 ~ 2, + data = d, + default = 0, + preserve_na = FALSE +) } diff --git a/tests/testthat/test-recode_into.R b/tests/testthat/test-recode_into.R index df7cf60b2..90fabcd2f 100644 --- a/tests/testthat/test-recode_into.R +++ b/tests/testthat/test-recode_into.R @@ -194,25 +194,37 @@ test_that("recode_into, make sure recode works with missing in original variable d_recode_na$mpg > 20 & d_recode_na$cyl == 6 ~ 1, d_recode_na$mpg <= 20 ~ 2, d_recode_na$cyl == 4 ~ 3, - default = 0 + default = 0, + preserve_na = TRUE ) out2_recoded_na <- recode_into( d_recode_na$mpg > 20 & d_recode_na$cyl == 6 ~ 1, d_recode_na$mpg <= 20 ~ 2, - default = 0 - ) - out3_recoded_na <- recode_into( - d_recode_na$mpg > 20 & d_recode_na$cyl == 6 ~ 1, - d_recode_na$mpg <= 20 ~ 2, - d_recode_na$cyl == 4 ~ 3, default = 0, - preserve_na = FALSE + preserve_na = TRUE ) - out4_recoded_na <- recode_into( - d_recode_na$mpg > 20 & d_recode_na$cyl == 6 ~ 1, - d_recode_na$mpg <= 20 ~ 2, - default = 0, - preserve_na = FALSE + expect_message( + { + out3_recoded_na <- recode_into( + d_recode_na$mpg > 20 & d_recode_na$cyl == 6 ~ 1, + d_recode_na$mpg <= 20 ~ 2, + d_recode_na$cyl == 4 ~ 3, + default = 0, + preserve_na = FALSE + ) + }, + regex = "Missing values in original variable" + ) + expect_message( + { + out4_recoded_na <- recode_into( + d_recode_na$mpg > 20 & d_recode_na$cyl == 6 ~ 1, + d_recode_na$mpg <= 20 ~ 2, + default = 0, + preserve_na = FALSE + ) + }, + regex = "Missing values in original variable" ) # one NA in mpg is overwritten by valid value from cyl, total 5 NA expect_identical( From 2acaef27141b1371213a707a6f28f678671d0224 Mon Sep 17 00:00:00 2001 From: Daniel Date: Thu, 7 Sep 2023 21:48:31 +0200 Subject: [PATCH 4/8] docs --- R/recode_into.r | 8 ++++---- man/recode_into.Rd | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/R/recode_into.r b/R/recode_into.r index a33f03eb7..fc369a62a 100644 --- a/R/recode_into.r +++ b/R/recode_into.r @@ -23,10 +23,10 @@ #' @param preserve_na Logical, if `TRUE` and `default` is not `NA`, missing #' values in the original variable will be set back to `NA` in the recoded #' variable (unless overwritten by other recode patterns). If `FALSE`, missing -#' values in the original variable will be recoded to `default`. The latter -#' behaviour prevents unintentional overwriting of missing values with `default`, -#' which means that you won't find valid values where the original data only -#' had missing values. See 'Examples'. +#' values in the original variable will be recoded to `default`. Setting +#' `preserve_na = TRUE` prevents unintentional overwriting of missing values +#' with `default`, which means that you won't find valid values where the +#' original data only had missing values. See 'Examples'. #' @param verbose Toggle warnings. #' #' @return A vector with recoded values. diff --git a/man/recode_into.Rd b/man/recode_into.Rd index d8d0a337d..b0acc7c9f 100644 --- a/man/recode_into.Rd +++ b/man/recode_into.Rd @@ -35,10 +35,10 @@ printed to alert such situations and to avoid unintentional recodings.} \item{preserve_na}{Logical, if \code{TRUE} and \code{default} is not \code{NA}, missing values in the original variable will be set back to \code{NA} in the recoded variable (unless overwritten by other recode patterns). If \code{FALSE}, missing -values in the original variable will be recoded to \code{default}. The latter -behaviour prevents unintentional overwriting of missing values with \code{default}, -which means that you won't find valid values where the original data only -had missing values. See 'Examples'.} +values in the original variable will be recoded to \code{default}. Setting +\code{preserve_na = TRUE} prevents unintentional overwriting of missing values +with \code{default}, which means that you won't find valid values where the +original data only had missing values. See 'Examples'.} \item{verbose}{Toggle warnings.} } From fbd4430d7143e186b14952f6e3da7d2c8e55e4df Mon Sep 17 00:00:00 2001 From: Daniel Date: Fri, 8 Sep 2023 13:06:10 +0200 Subject: [PATCH 5/8] fix labels_to_levels (#456) * fix labels_to_levels * fix * lintr * lintr * add comments --- DESCRIPTION | 2 +- NEWS.md | 3 +++ R/utils_labels.R | 16 +++++++++++++++- tests/testthat/test-labels_to_levels.R | 24 ++++++++++++++++++++++-- 4 files changed, 41 insertions(+), 4 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 76d6967bc..9fed6f068 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Type: Package Package: datawizard Title: Easy Data Wrangling and Statistical Transformations -Version: 0.8.0.10 +Version: 0.8.0.11 Authors@R: c( person("Indrajeet", "Patil", , "patilindrajeet.science@gmail.com", role = "aut", comment = c(ORCID = "0000-0003-1995-6531", Twitter = "@patilindrajeets")), diff --git a/NEWS.md b/NEWS.md index 295570ec4..160c0fae2 100644 --- a/NEWS.md +++ b/NEWS.md @@ -25,6 +25,9 @@ CHANGES BUG FIXES +* Fixed issue in `labels_to_levels()` when values of labels were not in sorted + order and values were not sequentially numbered. + * Fixed issues in `data_write()` when writing labelled data into SPSS format and vectors were of different type as value labels. diff --git a/R/utils_labels.R b/R/utils_labels.R index 54b1c46fd..67b3ecc6a 100644 --- a/R/utils_labels.R +++ b/R/utils_labels.R @@ -46,7 +46,21 @@ "Not all factor levels had a matching value label. Non-matching levels were preserved." ) } - levels(x)[levels_in_labs] <- names(value_labels[labs_in_levels]) + if (length(value_labels) == length(levels_in_labs)) { + # when length of value_labels and levels_in_labs is identical, we can simply + # replace the levels with the value labels. This makes sure than levels or + # value labels, which are not sorted or not sequentially numbered, match. + # Example: + # x <- c(5, 5, 1, 3, 1, 7) + # attr(x, "labels") <- c(no = 7, yes = 1, maybe = 3, `don't know` = 5) + # to_factor(x, labels_to_levels = TRUE) + levels(x)[levels_in_labs] <- names(value_labels) + } else { + # else, we need to select only those value labels that have a matching level + # (in labs_in_levels). This is required when not all values that have labels + # appear in the data. + levels(x)[levels_in_labs] <- names(value_labels[labs_in_levels]) + } attr(x, "labels") <- NULL x diff --git a/tests/testthat/test-labels_to_levels.R b/tests/testthat/test-labels_to_levels.R index 518dac70e..55105acfe 100644 --- a/tests/testthat/test-labels_to_levels.R +++ b/tests/testthat/test-labels_to_levels.R @@ -8,13 +8,13 @@ test_that("labels_to_levels, numeric", { test_that("labels_to_levels, factor", { data(efc) x <- as.factor(efc$c172code) - attr(x, "labels") <- c("low" = 1, "mid" = 2, "high" = 3) + attr(x, "labels") <- c(low = 1, mid = 2, high = 3) x <- labels_to_levels(x) expect_identical(levels(x), c("low", "mid", "high")) expect_equal(table(x), table(efc$c172code), ignore_attr = TRUE) x <- as.ordered(efc$c172code) - attr(x, "labels") <- c("low" = 1, "mid" = 2, "high" = 3) + attr(x, "labels") <- c(low = 1, mid = 2, high = 3) x <- labels_to_levels(x) expect_identical(levels(x), c("low", "mid", "high")) expect_s3_class(x, "ordered") @@ -40,3 +40,23 @@ test_that("labels_to_levels, factor, data frame", { ) expect_identical(sum(vapply(efc, is.factor, TRUE)), 1L) }) + +test_that("labels_to_levels, factor, with random value numbers (no sequential order)", { + x <- c(5, 5, 1, 3, 1, 7) + attr(x, "labels") <- c(no = 7, yes = 1, maybe = 3, `don't know` = 5) + out <- to_factor(x, labels_to_levels = TRUE) + expect_identical(as.character(out), c("don't know", "don't know", "yes", "maybe", "yes", "no")) + expect_identical(levels(out), c("yes", "maybe", "don't know", "no")) + + x <- c(4, 4, 1, 2, 1, 3) + attr(x, "labels") <- c(a = 1, b = 2, c = 3, d = 4) + out <- to_factor(x, labels_to_levels = TRUE) + expect_identical(as.character(out), c("d", "d", "a", "b", "a", "c")) + expect_identical(levels(out), c("a", "b", "c", "d")) + + x <- c(4, 4, 1, 2, 1, 3) + attr(x, "labels") <- c(d = 1, c = 2, b = 3, a = 4) + out <- to_factor(x, labels_to_levels = TRUE) + expect_identical(as.character(out), c("a", "a", "d", "c", "d", "b")) + expect_identical(levels(out), c("d", "c", "b", "a")) +}) From bcbc115cd36979e839637ab7068fa69d1dfea655 Mon Sep 17 00:00:00 2001 From: Daniel Date: Fri, 8 Sep 2023 16:51:10 +0200 Subject: [PATCH 6/8] Need more fixing (#457) * fix labels_to_levels * fix * lintr * lintr * add comments * still not working for all edge cases * fix * namespace * fix * desc * styler --- DESCRIPTION | 2 +- R/utils_labels.R | 26 ++++----- tests/testthat/test-labels_to_levels.R | 73 ++++++++++++++++++++++++++ 3 files changed, 85 insertions(+), 16 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 9fed6f068..c062db49e 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Type: Package Package: datawizard Title: Easy Data Wrangling and Statistical Transformations -Version: 0.8.0.11 +Version: 0.8.0.12 Authors@R: c( person("Indrajeet", "Patil", , "patilindrajeet.science@gmail.com", role = "aut", comment = c(ORCID = "0000-0003-1995-6531", Twitter = "@patilindrajeets")), diff --git a/R/utils_labels.R b/R/utils_labels.R index 67b3ecc6a..a7f4fa2c3 100644 --- a/R/utils_labels.R +++ b/R/utils_labels.R @@ -46,21 +46,17 @@ "Not all factor levels had a matching value label. Non-matching levels were preserved." ) } - if (length(value_labels) == length(levels_in_labs)) { - # when length of value_labels and levels_in_labs is identical, we can simply - # replace the levels with the value labels. This makes sure than levels or - # value labels, which are not sorted or not sequentially numbered, match. - # Example: - # x <- c(5, 5, 1, 3, 1, 7) - # attr(x, "labels") <- c(no = 7, yes = 1, maybe = 3, `don't know` = 5) - # to_factor(x, labels_to_levels = TRUE) - levels(x)[levels_in_labs] <- names(value_labels) - } else { - # else, we need to select only those value labels that have a matching level - # (in labs_in_levels). This is required when not all values that have labels - # appear in the data. - levels(x)[levels_in_labs] <- names(value_labels[labs_in_levels]) - } + # we need to find out which levels have no labelled value + missing_levels <- levels(x)[!levels(x) %in% value_labels] + + # and we need to remove those value labels that don't have a matching level + value_labels <- value_labels[value_labels %in% levels(x)] + + # for levels that have no label, we just keep the original factor level + value_labels <- c(value_labels, stats::setNames(missing_levels, missing_levels)) + + # now we can add back levels + levels(x) <- names(value_labels)[order(as.numeric(value_labels))] attr(x, "labels") <- NULL x diff --git a/tests/testthat/test-labels_to_levels.R b/tests/testthat/test-labels_to_levels.R index 55105acfe..866154c8f 100644 --- a/tests/testthat/test-labels_to_levels.R +++ b/tests/testthat/test-labels_to_levels.R @@ -59,4 +59,77 @@ test_that("labels_to_levels, factor, with random value numbers (no sequential or out <- to_factor(x, labels_to_levels = TRUE) expect_identical(as.character(out), c("a", "a", "d", "c", "d", "b")) expect_identical(levels(out), c("d", "c", "b", "a")) + + x <- c(5, 5, 1, 3, 1, 7) + attr(x, "labels") <- c(no = 7, yes = 1, maybe = 3, `don't know` = 5) + out <- to_factor(x, labels_to_levels = TRUE) + expect_identical( + out, + structure( + c(3L, 3L, 1L, 2L, 1L, 4L), + levels = c("yes", "maybe", "don't know", "no"), + class = "factor" + ) + ) + expect_identical( + as.character(out), + c("don't know", "don't know", "yes", "maybe", "yes", "no") + ) + + x <- c(5, 5, 1, 3, 1, 7, 4) + attr(x, "labels") <- c(no = 7, yes = 1, maybe = 3, `don't know` = 5) + expect_message( + { + out <- to_factor(x, labels_to_levels = TRUE) + }, + regex = "Not all factor levels" + ) + expect_identical( + out, + structure( + c(4L, 4L, 1L, 2L, 1L, 5L, 3L), + levels = c("yes", "maybe", "4", "don't know", "no"), + class = "factor" + ) + ) + expect_identical( + as.character(out), + c("don't know", "don't know", "yes", "maybe", "yes", "no", "4") + ) + + x <- c(5, 5, 1, 3, 1, 7) + attr(x, "labels") <- c(no = 7, yes = 1, maybe = 4, `don't know` = 5) + expect_message({ + out <- to_factor(x, labels_to_levels = TRUE) + }) + expect_identical( + out, + structure( + c(3L, 3L, 1L, 2L, 1L, 4L), + levels = c("yes", "3", "don't know", "no"), + class = "factor" + ) + ) + expect_identical( + as.character(out), + c("don't know", "don't know", "yes", "3", "yes", "no") + ) + + x <- c(5, 5, 1, 3, 1, 7, 6) + attr(x, "labels") <- c(no = 7, yes = 1, maybe = 4, `don't know` = 5) + expect_message({ + out <- to_factor(x, labels_to_levels = TRUE) + }) + expect_identical( + out, + structure( + c(3L, 3L, 1L, 2L, 1L, 5L, 4L), + levels = c("yes", "3", "don't know", "6", "no"), + class = "factor" + ) + ) + expect_identical( + as.character(out), + c("don't know", "don't know", "yes", "3", "yes", "no", "6") + ) }) From 6e09435dc978454436faf0d33230d1d69b1ce8ee Mon Sep 17 00:00:00 2001 From: Daniel Date: Fri, 8 Sep 2023 17:15:53 +0200 Subject: [PATCH 7/8] rename 1 --- R/{data_peek.r => _data_peek.R} | 0 R/{data_write.r => _data_write.R} | 0 R/{recode_into.r => _recode_into.R} | 0 3 files changed, 0 insertions(+), 0 deletions(-) rename R/{data_peek.r => _data_peek.R} (100%) rename R/{data_write.r => _data_write.R} (100%) rename R/{recode_into.r => _recode_into.R} (100%) diff --git a/R/data_peek.r b/R/_data_peek.R similarity index 100% rename from R/data_peek.r rename to R/_data_peek.R diff --git a/R/data_write.r b/R/_data_write.R similarity index 100% rename from R/data_write.r rename to R/_data_write.R diff --git a/R/recode_into.r b/R/_recode_into.R similarity index 100% rename from R/recode_into.r rename to R/_recode_into.R From ae7df24b9dce7207e361dd7906fc241743fb90f1 Mon Sep 17 00:00:00 2001 From: Daniel Date: Fri, 8 Sep 2023 17:16:14 +0200 Subject: [PATCH 8/8] rename 2 --- R/{_data_peek.R => data_peek.R} | 0 R/{_data_write.R => data_write.R} | 0 R/{_recode_into.R => recode_into.R} | 0 3 files changed, 0 insertions(+), 0 deletions(-) rename R/{_data_peek.R => data_peek.R} (100%) rename R/{_data_write.R => data_write.R} (100%) rename R/{_recode_into.R => recode_into.R} (100%) diff --git a/R/_data_peek.R b/R/data_peek.R similarity index 100% rename from R/_data_peek.R rename to R/data_peek.R diff --git a/R/_data_write.R b/R/data_write.R similarity index 100% rename from R/_data_write.R rename to R/data_write.R diff --git a/R/_recode_into.R b/R/recode_into.R similarity index 100% rename from R/_recode_into.R rename to R/recode_into.R