From 877c5877334eb58a1a8d909cb6b5dd25d07f8a20 Mon Sep 17 00:00:00 2001
From: Daniel <mail@danielluedecke.de>
Date: Thu, 7 Sep 2023 16:08:18 +0200
Subject: [PATCH 1/8] Rename `rowmean_n()` to `row_means()` (#448)

* Rename `rowmean_n()`?
Fixes #447

* Update row_means.R

* fix

* fix

* tests

* docs

* update pkgdown

* fix tests

* docs

* Update NEWS.md

Co-authored-by: Etienne Bacher <52219252+etiennebacher@users.noreply.github.com>

* version bump

---------

Co-authored-by: Etienne Bacher <52219252+etiennebacher@users.noreply.github.com>
---
 DESCRIPTION                     |   2 +-
 NAMESPACE                       |   2 +-
 NEWS.md                         |   6 +-
 R/row_means.R                   | 139 +++++++++++++++++++++++++++++++
 R/rowmean_n.R                   | 101 -----------------------
 _pkgdown.yaml                   |   2 +-
 man/row_means.Rd                | 142 ++++++++++++++++++++++++++++++++
 man/rowmean_n.Rd                |  72 ----------------
 tests/testthat/test-row_means.R |  27 ++++++
 tests/testthat/test-rowmean_n.R |  26 ------
 10 files changed, 314 insertions(+), 205 deletions(-)
 create mode 100644 R/row_means.R
 delete mode 100644 R/rowmean_n.R
 create mode 100644 man/row_means.Rd
 delete mode 100644 man/rowmean_n.Rd
 create mode 100644 tests/testthat/test-row_means.R
 delete mode 100644 tests/testthat/test-rowmean_n.R

diff --git a/DESCRIPTION b/DESCRIPTION
index 401e5ad9b..e9c46fef5 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,7 +1,7 @@
 Type: Package
 Package: datawizard
 Title: Easy Data Wrangling and Statistical Transformations
-Version: 0.8.0.8
+Version: 0.8.0.9
 Authors@R: c(
     person("Indrajeet", "Patil", , "patilindrajeet.science@gmail.com", role = "aut",
            comment = c(ORCID = "0000-0003-1995-6531", Twitter = "@patilindrajeets")),
diff --git a/NAMESPACE b/NAMESPACE
index 30c064f62..a985af2f0 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -278,9 +278,9 @@ export(reshape_longer)
 export(reshape_wider)
 export(reverse)
 export(reverse_scale)
+export(row_means)
 export(row_to_colnames)
 export(rowid_as_column)
-export(rowmean_n)
 export(rownames_as_column)
 export(skewness)
 export(slide)
diff --git a/NEWS.md b/NEWS.md
index 734cccd0d..7e91322e7 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -2,10 +2,10 @@
 
 NEW FUNCTIONS
 
-* `contr.deviation()` for sum-deviation contrast coding of factors.
+* `row_means()`, to compute row means, optionally only for the rows with at
+  least `min_valid` non-missing values.
 
-* `rowmean_n()`, to compute row means if row contains at least `n` non-missing
-  values.
+* `contr.deviation()` for sum-deviation contrast coding of factors.
 
 * `means_by_group()`, to compute mean values of variables, grouped by levels
   of specified factors.
diff --git a/R/row_means.R b/R/row_means.R
new file mode 100644
index 000000000..f252824b1
--- /dev/null
+++ b/R/row_means.R
@@ -0,0 +1,139 @@
+#' @title Row means (optionally with minimum amount of valid values)
+#' @name row_means
+#' @description This function is similar to the SPSS `MEAN.n` function and computes
+#' row means from a data frame or matrix if at least `min_valid` values of a row are
+#' valid (and not `NA`).
+#'
+#' @param data A data frame with at least two columns, where row means are applied.
+#' @param min_valid Optional, a numeric value of length 1. May either be
+#' - a numeric value that indicates the amount of valid values per row to
+#'   calculate the row mean;
+#' - or a value between 0 and 1, indicating a proportion of valid values per
+#'   row to calculate the row mean (see 'Details').
+#' - `NULL` (default), in which all cases are considered.
+#'
+#' If a row's sum of valid values is less than `min_valid`, `NA` will be returned.
+#' @param digits Numeric value indicating the number of decimal places to be
+#' used for rounding mean values. Negative values are allowed (see 'Details').
+#' By default, `digits = NULL` and no rounding is used.
+#' @param remove_na Logical, if `TRUE` (default), removes missing (`NA`) values
+#' before calculating row means. Only applies if `min_valuid` is not specified.
+#' @param verbose Toggle warnings.
+#' @inheritParams find_columns
+#'
+#' @return A vector with row means for those rows with at least `n` valid values.
+#'
+#' @details Rounding to a negative number of `digits` means rounding to a power of
+#' ten, for example `row_means(df, 3, digits = -2)` rounds to the nearest hundred.
+#' For `min_valid`, if not `NULL`, `min_valid` must be a numeric value from `0`
+#' to `ncol(data)`. If a row in the data frame has at least `min_valid`
+#' non-missing values, the row mean is returned. If `min_valid` is a non-integer
+#' value from 0 to 1, `min_valid` is considered to indicate the proportion of
+#' required non-missing values per row. E.g., if `min_valid = 0.75`, a row must
+#' have at least `ncol(data) * min_valid` non-missing values for the row mean
+#' to be calculated. See 'Examples'.
+#'
+#' @examples
+#' dat <- data.frame(
+#'   c1 = c(1, 2, NA, 4),
+#'   c2 = c(NA, 2, NA, 5),
+#'   c3 = c(NA, 4, NA, NA),
+#'   c4 = c(2, 3, 7, 8)
+#' )
+#'
+#' # default, all means are shown, if no NA values are present
+#' row_means(dat)
+#'
+#' # remove all NA before computing row means
+#' row_means(dat, remove_na = TRUE)
+#'
+#' # needs at least 4 non-missing values per row
+#' row_means(dat, min_valid = 4) # 1 valid return value
+#'
+#' # needs at least 3 non-missing values per row
+#' row_means(dat, min_valid = 3) # 2 valid return values
+#'
+#' # needs at least 2 non-missing values per row
+#' row_means(dat, min_valid = 2)
+#'
+#' # needs at least 1 non-missing value per row, for two selected variables
+#' row_means(dat, select = c("c1", "c3"), min_valid = 1)
+#'
+#' # needs at least 50% of non-missing values per row
+#' row_means(dat, min_valid = 0.5) # 3 valid return values
+#'
+#' # needs at least 75% of non-missing values per row
+#' row_means(dat, min_valid = 0.75) # 2 valid return values
+#'
+#' @export
+row_means <- function(data,
+                      select = NULL,
+                      exclude = NULL,
+                      min_valid = NULL,
+                      digits = NULL,
+                      ignore_case = FALSE,
+                      regex = FALSE,
+                      remove_na = FALSE,
+                      verbose = TRUE) {
+  # evaluate arguments
+  select <- .select_nse(select,
+    data,
+    exclude,
+    ignore_case = ignore_case,
+    regex = regex,
+    verbose = verbose
+  )
+
+  if (is.null(select) || length(select) == 0) {
+    insight::format_error("No columns selected.")
+  }
+
+  data <- .coerce_to_dataframe(data[select])
+
+  # n must be a numeric, non-missing value
+  if (!is.null(min_valid) && (all(is.na(min_valid)) || !is.numeric(min_valid) || length(min_valid) > 1)) {
+    insight::format_error("`min_valid` must be a numeric value of length 1.")
+  }
+
+  # make sure we only have numeric values
+  numeric_columns <- vapply(data, is.numeric, TRUE)
+  if (!all(numeric_columns)) {
+    if (verbose) {
+      insight::format_alert("Only numeric columns are considered for calculation.")
+    }
+    data <- data[numeric_columns]
+  }
+
+  # check if we have a data framme with at least two columns
+  if (ncol(data) < 2) {
+    insight::format_error("`data` must be a data frame with at least two numeric columns.")
+  }
+
+  # proceed here if min_valid is not NULL
+  if (!is.null(min_valid)) {
+    # is 'min_valid' indicating a proportion?
+    decimals <- min_valid %% 1
+    if (decimals != 0) {
+      min_valid <- round(ncol(data) * decimals)
+    }
+
+    # min_valid may not be larger as df's amount of columns
+    if (ncol(data) < min_valid) {
+      insight::format_error("`min_valid` must be smaller or equal to number of columns in data frame.")
+    }
+
+    # row means
+    to_na <- rowSums(is.na(data)) > ncol(data) - min_valid
+    out <- rowMeans(data, na.rm = TRUE)
+    out[to_na] <- NA
+  } else {
+    out <- rowMeans(data, na.rm = remove_na)
+  }
+
+  # round, if requested
+  if (!is.null(digits) && !all(is.na(digits))) {
+    out <- round(out, digits = digits)
+  }
+
+  out
+}
diff --git a/R/rowmean_n.R b/R/rowmean_n.R
deleted file mode 100644
index ab47cf511..000000000
--- a/R/rowmean_n.R
+++ /dev/null
@@ -1,101 +0,0 @@
-#' @title Row means with minimum amount of valid values
-#' @name rowmean_n
-#' @description This function is similar to the SPSS `MEAN.n` function and computes
-#' row means from a data frame or matrix if at least `n` values of a row are
-#' valid (and not `NA`).
-#'
-#' @param data A data frame with at least two columns, where row means are applied.
-#' @param n A numeric value of length 1. May either be
-#' - a numeric value that indicates the amount of valid values per row to
-#'   calculate the row mean;
-#' - or a value between 0 and 1, indicating a proportion of valid values per
-#'   row to calculate the row mean (see 'Details').
-#'
-#' If a row's sum of valid values is less than `n`, `NA` will be returned.
-#' @param digits Numeric value indicating the number of decimal places to be
-#' used for rounding mean values. Negative values are allowed (see 'Details').
-#' By default, `digits = NULL` and no rounding is used.
-#' @param verbose Toggle warnings.
-#'
-#' @return A vector with row means for those rows with at least `n` valid values.
-#'
-#' @details Rounding to a negative number of `digits` means rounding to a power of
-#' ten, for example `rowmean_n(df, 3, digits = -2)` rounds to the nearest hundred.
-#' For `n`, must be a numeric value from `0` to `ncol(data)`. If a row in the
-#' data frame has at least `n` non-missing values, the row mean is returned. If
-#' `n` is a non-integer value from 0 to 1, `n` is considered to indicate the
-#' proportion of required non-missing values per row. E.g., if `n = 0.75`, a
-#' row must have at least `ncol(data) * n` non-missing values for the row mean
-#' to be calculated. See 'Examples'.
-#'
-#' @examples
-#' dat <- data.frame(
-#'   c1 = c(1, 2, NA, 4),
-#'   c2 = c(NA, 2, NA, 5),
-#'   c3 = c(NA, 4, NA, NA),
-#'   c4 = c(2, 3, 7, 8)
-#' )
-#'
-#' # needs at least 4 non-missing values per row
-#' rowmean_n(dat, 4) # 1 valid return value
-#'
-#' # needs at least 3 non-missing values per row
-#' rowmean_n(dat, 3) # 2 valid return values
-#'
-#' # needs at least 2 non-missing values per row
-#' rowmean_n(dat, 2)
-#'
-#' # needs at least 1 non-missing value per row
-#' rowmean_n(dat, 1) # all means are shown
-#'
-#' # needs at least 50% of non-missing values per row
-#' rowmean_n(dat, 0.5) # 3 valid return values
-#'
-#' # needs at least 75% of non-missing values per row
-#' rowmean_n(dat, 0.75) # 2 valid return values
-#'
-#' @export
-rowmean_n <- function(data, n, digits = NULL, verbose = TRUE) {
-  data <- .coerce_to_dataframe(data)
-
-  # n must be a numeric, non-missing value
-  if (is.null(n) || all(is.na(n)) || !is.numeric(n) || length(n) > 1) {
-    insight::format_error("`n` must be a numeric value of length 1.")
-  }
-
-  # make sure we only have numeric values
-  numeric_columns <- vapply(data, is.numeric, TRUE)
-  if (!all(numeric_columns)) {
-    if (verbose) {
-      insight::format_alert("Only numeric columns are considered for calculation.")
-    }
-    data <- data[numeric_columns]
-  }
-
-  # check if we have a data framme with at least two columns
-  if (ncol(data) < 2) {
-    insight::format_error("`data` must be a data frame with at least two numeric columns.")
-  }
-
-  # is 'n' indicating a proportion?
-  decimals <- n %% 1
-  if (decimals != 0) {
-    n <- round(ncol(data) * decimals)
-  }
-
-  # n may not be larger as df's amount of columns
-  if (ncol(data) < n) {
-    insight::format_error("`n` must be smaller or equal to number of columns in data frame.")
-  }
-
-  # row means
-  to_na <- rowSums(is.na(data)) > ncol(data) - n
-  out <- rowMeans(data, na.rm = TRUE)
-  out[to_na] <- NA
-
-  # round, if requested
-  if (!is.null(digits) && !all(is.na(digits))) {
-    out <- round(out, digits = digits)
-  }
-  out
-}
diff --git a/_pkgdown.yaml b/_pkgdown.yaml
index 7e0aa5cb4..db2ebfeae 100644
--- a/_pkgdown.yaml
+++ b/_pkgdown.yaml
@@ -68,8 +68,8 @@ reference:
       - kurtosis
       - smoothness
       - skewness
+      - row_means
       - weighted_mean
-      - rowmean_n
       - mean_sd
 
   - title: Convert and Replace Data
diff --git a/man/row_means.Rd b/man/row_means.Rd
new file mode 100644
index 000000000..6e4a7774b
--- /dev/null
+++ b/man/row_means.Rd
@@ -0,0 +1,142 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/row_means.R
+\name{row_means}
+\alias{row_means}
+\title{Row means (optionally with minimum amount of valid values)}
+\usage{
+row_means(
+  data,
+  select = NULL,
+  exclude = NULL,
+  min_valid = NULL,
+  digits = NULL,
+  ignore_case = FALSE,
+  regex = FALSE,
+  remove_na = FALSE,
+  verbose = TRUE
+)
+}
+\arguments{
+\item{data}{A data frame with at least two columns, where row means are applied.}
+
+\item{select}{Variables that will be included when performing the required
+tasks. Can be either
+\itemize{
+\item a variable specified as a literal variable name (e.g., \code{column_name}),
+\item a string with the variable name (e.g., \code{"column_name"}), or a character
+vector of variable names (e.g., \code{c("col1", "col2", "col3")}),
+\item a formula with variable names (e.g., \code{~column_1 + column_2}),
+\item a vector of positive integers, giving the positions counting from the left
+(e.g. \code{1} or \code{c(1, 3, 5)}),
+\item a vector of negative integers, giving the positions counting from the
+right (e.g., \code{-1} or \code{-1:-3}),
+\item one of the following select-helpers: \code{starts_with()}, \code{ends_with()},
+\code{contains()}, a range using \code{:} or \code{regex("")}. \code{starts_with()},
+\code{ends_with()}, and  \code{contains()} accept several patterns, e.g
+\code{starts_with("Sep", "Petal")}.
+\item or a function testing for logical conditions, e.g. \code{is.numeric()} (or
+\code{is.numeric}), or any user-defined function that selects the variables
+for which the function returns \code{TRUE} (like: \code{foo <- function(x) mean(x) > 3}),
+\item ranges specified via literal variable names, select-helpers (except
+\code{regex()}) and (user-defined) functions can be negated, i.e. return
+non-matching elements, when prefixed with a \code{-}, e.g. \code{-ends_with("")},
+\code{-is.numeric} or \code{-(Sepal.Width:Petal.Length)}. \strong{Note:} Negation means
+that matches are \emph{excluded}, and thus, the \code{exclude} argument can be
+used alternatively. For instance, \code{select=-ends_with("Length")} (with
+\code{-}) is equivalent to \code{exclude=ends_with("Length")} (no \code{-}). In case
+negation should not work as expected, use the \code{exclude} argument instead.
+}
+
+If \code{NULL}, selects all columns. Patterns that found no matches are silently
+ignored, e.g. \code{find_columns(iris, select = c("Species", "Test"))} will just
+return \code{"Species"}.}
+
+\item{exclude}{See \code{select}, however, column names matched by the pattern
+from \code{exclude} will be excluded instead of selected. If \code{NULL} (the default),
+excludes no columns.}
+
+\item{min_valid}{Optional, a numeric value of length 1. May either be
+\itemize{
+\item a numeric value that indicates the amount of valid values per row to
+calculate the row mean;
+\item or a value between 0 and 1, indicating a proportion of valid values per
+row to calculate the row mean (see 'Details').
+\item \code{NULL} (default), in which all cases are considered.
+}
+
+If a row's sum of valid values is less than \code{min_valid}, \code{NA} will be returned.}
+
+\item{digits}{Numeric value indicating the number of decimal places to be
+used for rounding mean values. Negative values are allowed (see 'Details').
+By default, \code{digits = NULL} and no rounding is used.}
+
+\item{ignore_case}{Logical, if \code{TRUE} and when one of the select-helpers or
+a regular expression is used in \code{select}, ignores lower/upper case in the
+search pattern when matching against variable names.}
+
+\item{regex}{Logical, if \code{TRUE}, the search pattern from \code{select} will be
+treated as regular expression. When \code{regex = TRUE}, select \emph{must} be a
+character string (or a variable containing a character string) and is not
+allowed to be one of the supported select-helpers or a character vector
+of length > 1. \code{regex = TRUE} is comparable to using one of the two
+select-helpers, \code{select = contains("")} or \code{select = regex("")}, however,
+since the select-helpers may not work when called from inside other
+functions (see 'Details'), this argument may be used as workaround.}
+
+\item{remove_na}{Logical, if \code{TRUE} (default), removes missing (\code{NA}) values
+before calculating row means. Only applies if \code{min_valuid} is not specified.}
+
+\item{verbose}{Toggle warnings.}
+}
+\value{
+A vector with row means for those rows with at least \code{n} valid values.
+}
+\description{
+This function is similar to the SPSS \code{MEAN.n} function and computes
+row means from a data frame or matrix if at least \code{min_valid} values of a row are
+valid (and not \code{NA}).
+}
+\details{
+Rounding to a negative number of \code{digits} means rounding to a power of
+ten, for example \code{row_means(df, 3, digits = -2)} rounds to the nearest hundred.
+For \code{min_valid}, if not \code{NULL}, \code{min_valid} must be a numeric value from \code{0}
+to \code{ncol(data)}. If a row in the data frame has at least \code{min_valid}
+non-missing values, the row mean is returned. If \code{min_valid} is a non-integer
+value from 0 to 1, \code{min_valid} is considered to indicate the proportion of
+required non-missing values per row. E.g., if \code{min_valid = 0.75}, a row must
+have at least \code{ncol(data) * min_valid} non-missing values for the row mean
+to be calculated. See 'Examples'.
+}
+\examples{
+dat <- data.frame(
+  c1 = c(1, 2, NA, 4),
+  c2 = c(NA, 2, NA, 5),
+  c3 = c(NA, 4, NA, NA),
+  c4 = c(2, 3, 7, 8)
+)
+
+# default, all means are shown, if no NA values are present
+row_means(dat)
+
+# remove all NA before computing row means
+row_means(dat, remove_na = TRUE)
+
+# needs at least 4 non-missing values per row
+row_means(dat, min_valid = 4) # 1 valid return value
+
+# needs at least 3 non-missing values per row
+row_means(dat, min_valid = 3) # 2 valid return values
+
+# needs at least 2 non-missing values per row
+row_means(dat, min_valid = 2)
+
+# needs at least 1 non-missing value per row, for two selected variables
+row_means(dat, select = c("c1", "c3"), min_valid = 1)
+
+# needs at least 50\% of non-missing values per row
+row_means(dat, min_valid = 0.5) # 3 valid return values
+
+# needs at least 75\% of non-missing values per row
+row_means(dat, min_valid = 0.75) # 2 valid return values
+
+}
diff --git a/man/rowmean_n.Rd b/man/rowmean_n.Rd
deleted file mode 100644
index df340eed3..000000000
--- a/man/rowmean_n.Rd
+++ /dev/null
@@ -1,72 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/rowmean_n.R
-\name{rowmean_n}
-\alias{rowmean_n}
-\title{Row means with minimum amount of valid values}
-\usage{
-rowmean_n(data, n, digits = NULL, verbose = TRUE)
-}
-\arguments{
-\item{data}{A data frame with at least two columns, where row means are applied.}
-
-\item{n}{A numeric value of length 1. May either be
-\itemize{
-\item a numeric value that indicates the amount of valid values per row to
-calculate the row mean;
-\item or a value between 0 and 1, indicating a proportion of valid values per
-row to calculate the row mean (see 'Details').
-}
-
-If a row's sum of valid values is less than \code{n}, \code{NA} will be returned.}
-
-\item{digits}{Numeric value indicating the number of decimal places to be
-used for rounding mean values. Negative values are allowed (see 'Details').
-By default, \code{digits = NULL} and no rounding is used.}
-
-\item{verbose}{Toggle warnings.}
-}
-\value{
-A vector with row means for those rows with at least \code{n} valid values.
-}
-\description{
-This function is similar to the SPSS \code{MEAN.n} function and computes
-row means from a data frame or matrix if at least \code{n} values of a row are
-valid (and not \code{NA}).
-}
-\details{
-Rounding to a negative number of \code{digits} means rounding to a power of
-ten, for example \code{rowmean_n(df, 3, digits = -2)} rounds to the nearest hundred.
-For \code{n}, must be a numeric value from \code{0} to \code{ncol(data)}. If a row in the
-data frame has at least \code{n} non-missing values, the row mean is returned. If
-\code{n} is a non-integer value from 0 to 1, \code{n} is considered to indicate the
-proportion of required non-missing values per row. E.g., if \code{n = 0.75}, a
-row must have at least \code{ncol(data) * n} non-missing values for the row mean
-to be calculated. See 'Examples'.
-}
-\examples{
-dat <- data.frame(
-  c1 = c(1, 2, NA, 4),
-  c2 = c(NA, 2, NA, 5),
-  c3 = c(NA, 4, NA, NA),
-  c4 = c(2, 3, 7, 8)
-)
-
-# needs at least 4 non-missing values per row
-rowmean_n(dat, 4) # 1 valid return value
-
-# needs at least 3 non-missing values per row
-rowmean_n(dat, 3) # 2 valid return values
-
-# needs at least 2 non-missing values per row
-rowmean_n(dat, 2)
-
-# needs at least 1 non-missing value per row
-rowmean_n(dat, 1) # all means are shown
-
-# needs at least 50\% of non-missing values per row
-rowmean_n(dat, 0.5) # 3 valid return values
-
-# needs at least 75\% of non-missing values per row
-rowmean_n(dat, 0.75) # 2 valid return values
-
-}
diff --git a/tests/testthat/test-row_means.R b/tests/testthat/test-row_means.R
new file mode 100644
index 000000000..8d0504c69
--- /dev/null
+++ b/tests/testthat/test-row_means.R
@@ -0,0 +1,27 @@
+test_that("row_means", {
+  d_mn <- data.frame(
+    c1 = c(1, 2, NA, 4),
+    c2 = c(NA, 2, NA, 5),
+    c3 = c(NA, 4, NA, NA),
+    c4 = c(2, 3, 7, 8)
+  )
+  expect_equal(row_means(d_mn, min_valid = 4), c(NA, 2.75, NA, NA), tolerance = 1e-3)
+  expect_equal(row_means(d_mn, min_valid = 3), c(NA, 2.75, NA, 5.66667), tolerance = 1e-3)
+  expect_equal(row_means(d_mn, min_valid = 2), c(1.5, 2.75, NA, 5.66667), tolerance = 1e-3)
+  expect_equal(row_means(d_mn, min_valid = 1), c(1.5, 2.75, 7, 5.66667), tolerance = 1e-3)
+  expect_equal(row_means(d_mn, min_valid = 0.5), c(1.5, 2.75, NA, 5.66667), tolerance = 1e-3)
+  expect_equal(row_means(d_mn, min_valid = 0.75), c(NA, 2.75, NA, 5.66667), tolerance = 1e-3)
+  expect_equal(row_means(d_mn, min_valid = 2, digits = 1), c(1.5, 2.8, NA, 5.7), tolerance = 1e-1)
+  expect_message(row_means(iris), regex = "Only numeric")
+  expect_equal(row_means(iris, verbose = FALSE), rowMeans(iris[, 1:4]), tolerance = 1e-3, ignore_attr = TRUE)
+})
+
+test_that("row_means, errors or messages", {
+  data(iris)
+  expect_error(expect_warning(row_means(iris, select = "abc")), regex = "No columns")
+  expect_error(row_means(iris[1], min_valid = 1), regex = "two numeric")
+  expect_error(row_means(iris, min_valid = 1:4), regex = "numeric value")
+  expect_error(row_means(iris, min_valid = "a"), regex = "numeric value")
+  expect_message(row_means(iris[1:3, ], min_valid = 3), regex = "Only numeric")
+  expect_silent(row_means(iris[1:3, ], min_valid = 3, verbose = FALSE))
+})
diff --git a/tests/testthat/test-rowmean_n.R b/tests/testthat/test-rowmean_n.R
deleted file mode 100644
index a17996ff6..000000000
--- a/tests/testthat/test-rowmean_n.R
+++ /dev/null
@@ -1,26 +0,0 @@
-test_that("rowmean_n", {
-  d_mn <- data.frame(
-    c1 = c(1, 2, NA, 4),
-    c2 = c(NA, 2, NA, 5),
-    c3 = c(NA, 4, NA, NA),
-    c4 = c(2, 3, 7, 8)
-  )
-  expect_equal(rowmean_n(d_mn, 4), c(NA, 2.75, NA, NA), tolerance = 1e-3)
-  expect_equal(rowmean_n(d_mn, 3), c(NA, 2.75, NA, 5.66667), tolerance = 1e-3)
-  expect_equal(rowmean_n(d_mn, 2), c(1.5, 2.75, NA, 5.66667), tolerance = 1e-3)
-  expect_equal(rowmean_n(d_mn, 1), c(1.5, 2.75, 7, 5.66667), tolerance = 1e-3)
-  expect_equal(rowmean_n(d_mn, 0.5), c(1.5, 2.75, NA, 5.66667), tolerance = 1e-3)
-  expect_equal(rowmean_n(d_mn, 0.75), c(NA, 2.75, NA, 5.66667), tolerance = 1e-3)
-  expect_equal(rowmean_n(d_mn, 2, digits = 1), c(1.5, 2.8, NA, 5.7), tolerance = 1e-1)
-})
-
-test_that("rowmean_n, errors or messages", {
-  data(iris)
-  expect_error(rowmean_n(5, n = 1), regex = "`data` must be")
-  expect_error(rowmean_n(iris[1], n = 1), regex = "two numeric")
-  expect_error(rowmean_n(iris, n = NULL), regex = "numeric value")
-  expect_error(rowmean_n(iris, n = 1:4), regex = "numeric value")
-  expect_error(rowmean_n(iris, n = "a"), regex = "numeric value")
-  expect_message(rowmean_n(iris[1:3, ], n = 3), regex = "Only numeric")
-  expect_silent(rowmean_n(iris[1:3, ], n = 3, verbose = FALSE))
-})

From 1b3b82541538ecbf0a57039875f189ccfca88bc0 Mon Sep 17 00:00:00 2001
From: Daniel <mail@danielluedecke.de>
Date: Thu, 7 Sep 2023 20:20:29 +0200
Subject: [PATCH 2/8] fix issues with NA values in recodes (#455)

* fix issues with NA values in recodes

* add reserve_na attr, add tests

* add comments

* version bump

* Update test-recode_into.R

* scoping issue

* rename objects in tests, maybe fixes random test order
---
 DESCRIPTION                       |  2 +-
 NEWS.md                           |  6 +++
 R/recode_into.r                   | 26 ++++++++++++-
 man/recode_into.Rd                | 14 ++++++-
 tests/testthat/test-recode_into.R | 64 ++++++++++++++++++++++++++++++-
 5 files changed, 107 insertions(+), 5 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index e9c46fef5..76d6967bc 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,7 +1,7 @@
 Type: Package
 Package: datawizard
 Title: Easy Data Wrangling and Statistical Transformations
-Version: 0.8.0.9
+Version: 0.8.0.10
 Authors@R: c(
     person("Indrajeet", "Patil", , "patilindrajeet.science@gmail.com", role = "aut",
            comment = c(ORCID = "0000-0003-1995-6531", Twitter = "@patilindrajeets")),
diff --git a/NEWS.md b/NEWS.md
index 7e91322e7..295570ec4 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -15,6 +15,9 @@ CHANGES
 * `recode_into()` gains an `overwrite` argument to skip overwriting already
   recoded cases when multiple recode patterns apply to the same case.
 
+* `recode_into()` gains an `preserve_na` argument to preserve `NA` values
+  when recoding.
+
 * `data_read()` now passes the `encoding` argument to `data.table::fread()`.
   This allows to read files with non-ASCII characters.
 
@@ -28,6 +31,9 @@ BUG FIXES
 * Fixed issue in `recode_into()` with probably wrong case number printed in the
   warning when several recode patterns match to one case.
 
+* Fixed issue in `recode_into()` when original data contained `NA` values and
+  `NA` was not included in the recode pattern.
+
 * Fixed issue in `data_filter()` where functions containing a `=` (e.g. when
   naming arguments, like `grepl(pattern, x = a)`) were mistakenly seen as
   faulty syntax. 
diff --git a/R/recode_into.r b/R/recode_into.r
index 8a382fbe1..b93dbe5d9 100644
--- a/R/recode_into.r
+++ b/R/recode_into.r
@@ -20,6 +20,10 @@
 #' recode patterns. If `FALSE`, former recoded cases will not be altered by later
 #' recode patterns that would apply to those cases again. A warning message is
 #' printed to alert such situations and to avoid unintentional recodings.
+#' @param preserve_na Logical, if `TRUE` (default) and `default` is not `NA`,
+#' missing values in the original variable will be set back to `NA` in the
+#' recoded variable (unless overwritten by other recode patterns). If `FALSE`,
+#' missing values in the original variable will be recoded to `default`.
 #' @param verbose Toggle warnings.
 #'
 #' @return A vector with recoded values.
@@ -73,7 +77,12 @@
 #'   default = 0
 #' )
 #' @export
-recode_into <- function(..., data = NULL, default = NA, overwrite = TRUE, verbose = TRUE) {
+recode_into <- function(...,
+                        data = NULL,
+                        default = NA,
+                        overwrite = TRUE,
+                        preserve_na = TRUE,
+                        verbose = TRUE) {
   dots <- list(...)
 
   # get length of vector, so we know the length of the output vector
@@ -135,6 +144,12 @@ recode_into <- function(..., data = NULL, default = NA, overwrite = TRUE, verbos
       index <- with(data, eval(dots[[i]][[2]]))
       value <- with(data, eval(dots[[i]][[3]]))
     }
+    # remember missing values, so we can add back later
+    missing_index <- is.na(index)
+    # make sure index has no missing values. when we have missing values in
+    # original expression, these are considered as "no match" and set to FALSE
+    # we handle NA value later and thus want to remove them from "index" now
+    index[is.na(index)] <- FALSE
     # overwriting values? do more recode-patterns match the same case?
     if (is.na(default)) {
       already_exists <- !is.na(out[index])
@@ -144,7 +159,7 @@ recode_into <- function(..., data = NULL, default = NA, overwrite = TRUE, verbos
     # save indices of overwritten cases
     overwritten_cases <- which(index)[already_exists]
     # tell user...
-    if (any(already_exists) && verbose) {
+    if (any(already_exists, na.rm = TRUE) && verbose) {
       if (overwrite) {
         msg <- paste(
           "Several recode patterns apply to the same cases.",
@@ -164,7 +179,14 @@ recode_into <- function(..., data = NULL, default = NA, overwrite = TRUE, verbos
     if (!overwrite) {
       index[overwritten_cases] <- FALSE
     }
+    # write new values into output vector
     out[index] <- value
+    # set back missing values
+    if (any(missing_index) && !is.na(default) && preserve_na) {
+      # but only where we still have default values
+      # we don't want to overwrite already recoded values with NA
+      out[missing_index & out == default] <- NA
+    }
   }
 
   out
diff --git a/man/recode_into.Rd b/man/recode_into.Rd
index 064d72f6c..b3e164131 100644
--- a/man/recode_into.Rd
+++ b/man/recode_into.Rd
@@ -4,7 +4,14 @@
 \alias{recode_into}
 \title{Recode values from one or more variables into a new variable}
 \usage{
-recode_into(..., data = NULL, default = NA, overwrite = TRUE, verbose = TRUE)
+recode_into(
+  ...,
+  data = NULL,
+  default = NA,
+  overwrite = TRUE,
+  preserve_na = TRUE,
+  verbose = TRUE
+)
 }
 \arguments{
 \item{...}{A sequence of two-sided formulas, where the left hand side (LHS)
@@ -25,6 +32,11 @@ recode patterns. If \code{FALSE}, former recoded cases will not be altered by la
 recode patterns that would apply to those cases again. A warning message is
 printed to alert such situations and to avoid unintentional recodings.}
 
+\item{preserve_na}{Logical, if \code{TRUE} (default) and \code{default} is not \code{NA},
+missing values in the original variable will be set back to \code{NA} in the
+recoded variable (unless overwritten by other recode patterns). If \code{FALSE},
+missing values in the original variable will be recoded to \code{default}.}
+
 \item{verbose}{Toggle warnings.}
 }
 \value{
diff --git a/tests/testthat/test-recode_into.R b/tests/testthat/test-recode_into.R
index ab5c7908b..df7cf60b2 100644
--- a/tests/testthat/test-recode_into.R
+++ b/tests/testthat/test-recode_into.R
@@ -170,7 +170,7 @@ test_that("recode_into, check differen input length", {
   )
 })
 
-test_that("recode_into, check differen input length", {
+test_that("recode_into, check different input length", {
   x <- 1:5
   y <- c(5, 2, 3, 1, 4)
   expect_warning(
@@ -184,3 +184,65 @@ test_that("recode_into, check differen input length", {
     regexp = "Several recode patterns"
   )
 })
+
+test_that("recode_into, make sure recode works with missing in original variable", {
+  data(mtcars)
+  mtcars$mpg[c(3, 10, 12, 15, 16)] <- NA
+  mtcars$cyl[c(2, 15, 16)] <- NA
+  d_recode_na <<- as.data.frame(mtcars)
+  out1_recoded_na <- recode_into(
+    d_recode_na$mpg > 20 & d_recode_na$cyl == 6 ~ 1,
+    d_recode_na$mpg <= 20 ~ 2,
+    d_recode_na$cyl == 4 ~ 3,
+    default = 0
+  )
+  out2_recoded_na <- recode_into(
+    d_recode_na$mpg > 20 & d_recode_na$cyl == 6 ~ 1,
+    d_recode_na$mpg <= 20 ~ 2,
+    default = 0
+  )
+  out3_recoded_na <- recode_into(
+    d_recode_na$mpg > 20 & d_recode_na$cyl == 6 ~ 1,
+    d_recode_na$mpg <= 20 ~ 2,
+    d_recode_na$cyl == 4 ~ 3,
+    default = 0,
+    preserve_na = FALSE
+  )
+  out4_recoded_na <- recode_into(
+    d_recode_na$mpg > 20 & d_recode_na$cyl == 6 ~ 1,
+    d_recode_na$mpg <= 20 ~ 2,
+    default = 0,
+    preserve_na = FALSE
+  )
+  # one NA in mpg is overwritten by valid value from cyl, total 5 NA
+  expect_identical(
+    out1_recoded_na,
+    c(
+      1, NA, 3, 1, 2, 2, 2, 3, 3, NA, 2, NA, 2, 2, NA, NA, 2, 3,
+      3, 3, 3, 2, 2, 2, 2, 3, 3, 3, 2, 2, 2, 3
+    )
+  )
+  # total 6 NA
+  expect_identical(
+    out2_recoded_na,
+    c(
+      1, NA, NA, 1, 2, 2, 2, 0, 0, NA, 2, NA, 2, 2, NA, NA, 2, 0,
+      0, 0, 0, 2, 2, 2, 2, 0, 0, 0, 2, 2, 2, 0
+    )
+  )
+  # NA is preserved, set to default if not overwritten by other recodes
+  expect_identical(
+    out3_recoded_na,
+    c(
+      1, 0, 3, 1, 2, 2, 2, 3, 3, 0, 2, 0, 2, 2, 0, 0, 2, 3, 3, 3,
+      3, 2, 2, 2, 2, 3, 3, 3, 2, 2, 2, 3
+    )
+  )
+  expect_identical(
+    out4_recoded_na,
+    c(
+      1, 0, 0, 1, 2, 2, 2, 0, 0, 0, 2, 0, 2, 2, 0, 0, 2, 0, 0, 0,
+      0, 2, 2, 2, 2, 0, 0, 0, 2, 2, 2, 0
+    )
+  )
+})

From 35c4f13d86269d44f152fb0b447d7e158855f79e Mon Sep 17 00:00:00 2001
From: Daniel <mail@danielluedecke.de>
Date: Thu, 7 Sep 2023 20:39:42 +0200
Subject: [PATCH 3/8] improve handling

---
 R/recode_into.r                   | 57 ++++++++++++++++++++++++++-----
 man/recode_into.Rd                | 38 ++++++++++++++++++---
 tests/testthat/test-recode_into.R | 38 ++++++++++++++-------
 3 files changed, 106 insertions(+), 27 deletions(-)

diff --git a/R/recode_into.r b/R/recode_into.r
index b93dbe5d9..a33f03eb7 100644
--- a/R/recode_into.r
+++ b/R/recode_into.r
@@ -20,10 +20,13 @@
 #' recode patterns. If `FALSE`, former recoded cases will not be altered by later
 #' recode patterns that would apply to those cases again. A warning message is
 #' printed to alert such situations and to avoid unintentional recodings.
-#' @param preserve_na Logical, if `TRUE` (default) and `default` is not `NA`,
-#' missing values in the original variable will be set back to `NA` in the
-#' recoded variable (unless overwritten by other recode patterns). If `FALSE`,
-#' missing values in the original variable will be recoded to `default`.
+#' @param preserve_na Logical, if `TRUE` and `default` is not `NA`, missing
+#' values in the original variable will be set back to `NA` in the recoded
+#' variable (unless overwritten by other recode patterns). If `FALSE`, missing
+#' values in the original variable will be recoded to `default`. The latter
+#' behaviour prevents unintentional overwriting of missing values with `default`,
+#' which means that you won't find valid values where the original data only
+#' had missing values. See 'Examples'.
 #' @param verbose Toggle warnings.
 #'
 #' @return A vector with recoded values.
@@ -76,12 +79,37 @@
 #'   data = d,
 #'   default = 0
 #' )
+#'
+#' # handling of missing values
+#' d <- data.frame(
+#'   x = c(1, NA, 2, NA, 3, 4),
+#'   y = c(1, 11, 3, NA, 5, 6)
+#' )
+#' # first NA in x is overwritten by valid value from y
+#' # we have no known value for second NA in x and y,
+#' # thus we get one NA in the result
+#' recode_into(
+#'   x <= 3 ~ 1,
+#'   y > 5 ~ 2,
+#'   data = d,
+#'   default = 0,
+#'   preserve_na = TRUE
+#' )
+#' # first NA in x is overwritten by valid value from y
+#' # default value is used for second NA
+#' recode_into(
+#'   x <= 3 ~ 1,
+#'   y > 5 ~ 2,
+#'   data = d,
+#'   default = 0,
+#'   preserve_na = FALSE
+#' )
 #' @export
 recode_into <- function(...,
                         data = NULL,
                         default = NA,
                         overwrite = TRUE,
-                        preserve_na = TRUE,
+                        preserve_na = FALSE,
                         verbose = TRUE) {
   dots <- list(...)
 
@@ -133,6 +161,9 @@ recode_into <- function(...,
     )
   }
 
+  # indicator to show message when replacing NA by default
+  # needed to show message only once
+  overwrite_NA_msg <- TRUE
 
   # iterate all expressions
   for (i in seq_len(n_params)) {
@@ -182,10 +213,18 @@ recode_into <- function(...,
     # write new values into output vector
     out[index] <- value
     # set back missing values
-    if (any(missing_index) && !is.na(default) && preserve_na) {
-      # but only where we still have default values
-      # we don't want to overwrite already recoded values with NA
-      out[missing_index & out == default] <- NA
+    if (any(missing_index) && !is.na(default)) {
+      if (preserve_na) {
+        # but only where we still have default values
+        # we don't want to overwrite already recoded values with NA
+        out[missing_index & out == default] <- NA
+      } else if (overwrite_NA_msg && verbose) {
+        # don't show msg again
+        overwrite_NA_msg <- FALSE
+        insight::format_alert(
+          "Missing values in original variable are overwritten by default value. If you want to preserve missing values, set `preserve_na = TRUE`."
+        )
+      }
     }
   }
 
diff --git a/man/recode_into.Rd b/man/recode_into.Rd
index b3e164131..d8d0a337d 100644
--- a/man/recode_into.Rd
+++ b/man/recode_into.Rd
@@ -9,7 +9,7 @@ recode_into(
   data = NULL,
   default = NA,
   overwrite = TRUE,
-  preserve_na = TRUE,
+  preserve_na = FALSE,
   verbose = TRUE
 )
 }
@@ -32,10 +32,13 @@ recode patterns. If \code{FALSE}, former recoded cases will not be altered by la
 recode patterns that would apply to those cases again. A warning message is
 printed to alert such situations and to avoid unintentional recodings.}
 
-\item{preserve_na}{Logical, if \code{TRUE} (default) and \code{default} is not \code{NA},
-missing values in the original variable will be set back to \code{NA} in the
-recoded variable (unless overwritten by other recode patterns). If \code{FALSE},
-missing values in the original variable will be recoded to \code{default}.}
+\item{preserve_na}{Logical, if \code{TRUE} and \code{default} is not \code{NA}, missing
+values in the original variable will be set back to \code{NA} in the recoded
+variable (unless overwritten by other recode patterns). If \code{FALSE}, missing
+values in the original variable will be recoded to \code{default}. The latter
+behaviour prevents unintentional overwriting of missing values with \code{default},
+which means that you won't find valid values where the original data only
+had missing values. See 'Examples'.}
 
 \item{verbose}{Toggle warnings.}
 }
@@ -95,4 +98,29 @@ recode_into(
   data = d,
   default = 0
 )
+
+# handling of missing values
+d <- data.frame(
+  x = c(1, NA, 2, NA, 3, 4),
+  y = c(1, 11, 3, NA, 5, 6)
+)
+# first NA in x is overwritten by valid value from y
+# we have no known value for second NA in x and y,
+# thus we get one NA in the result
+recode_into(
+  x <= 3 ~ 1,
+  y > 5 ~ 2,
+  data = d,
+  default = 0,
+  preserve_na = TRUE
+)
+# first NA in x is overwritten by valid value from y
+# default value is used for second NA
+recode_into(
+  x <= 3 ~ 1,
+  y > 5 ~ 2,
+  data = d,
+  default = 0,
+  preserve_na = FALSE
+)
 }
diff --git a/tests/testthat/test-recode_into.R b/tests/testthat/test-recode_into.R
index df7cf60b2..90fabcd2f 100644
--- a/tests/testthat/test-recode_into.R
+++ b/tests/testthat/test-recode_into.R
@@ -194,25 +194,37 @@ test_that("recode_into, make sure recode works with missing in original variable
     d_recode_na$mpg > 20 & d_recode_na$cyl == 6 ~ 1,
     d_recode_na$mpg <= 20 ~ 2,
     d_recode_na$cyl == 4 ~ 3,
-    default = 0
+    default = 0,
+    preserve_na = TRUE
   )
   out2_recoded_na <- recode_into(
     d_recode_na$mpg > 20 & d_recode_na$cyl == 6 ~ 1,
     d_recode_na$mpg <= 20 ~ 2,
-    default = 0
-  )
-  out3_recoded_na <- recode_into(
-    d_recode_na$mpg > 20 & d_recode_na$cyl == 6 ~ 1,
-    d_recode_na$mpg <= 20 ~ 2,
-    d_recode_na$cyl == 4 ~ 3,
     default = 0,
-    preserve_na = FALSE
+    preserve_na = TRUE
   )
-  out4_recoded_na <- recode_into(
-    d_recode_na$mpg > 20 & d_recode_na$cyl == 6 ~ 1,
-    d_recode_na$mpg <= 20 ~ 2,
-    default = 0,
-    preserve_na = FALSE
+  expect_message(
+    {
+      out3_recoded_na <- recode_into(
+        d_recode_na$mpg > 20 & d_recode_na$cyl == 6 ~ 1,
+        d_recode_na$mpg <= 20 ~ 2,
+        d_recode_na$cyl == 4 ~ 3,
+        default = 0,
+        preserve_na = FALSE
+      )
+    },
+    regex = "Missing values in original variable"
+  )
+  expect_message(
+    {
+      out4_recoded_na <- recode_into(
+        d_recode_na$mpg > 20 & d_recode_na$cyl == 6 ~ 1,
+        d_recode_na$mpg <= 20 ~ 2,
+        default = 0,
+        preserve_na = FALSE
+      )
+    },
+    regex = "Missing values in original variable"
   )
   # one NA in mpg is overwritten by valid value from cyl, total 5 NA
   expect_identical(

From 2acaef27141b1371213a707a6f28f678671d0224 Mon Sep 17 00:00:00 2001
From: Daniel <mail@danielluedecke.de>
Date: Thu, 7 Sep 2023 21:48:31 +0200
Subject: [PATCH 4/8] docs

---
 R/recode_into.r    | 8 ++++----
 man/recode_into.Rd | 8 ++++----
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/R/recode_into.r b/R/recode_into.r
index a33f03eb7..fc369a62a 100644
--- a/R/recode_into.r
+++ b/R/recode_into.r
@@ -23,10 +23,10 @@
 #' @param preserve_na Logical, if `TRUE` and `default` is not `NA`, missing
 #' values in the original variable will be set back to `NA` in the recoded
 #' variable (unless overwritten by other recode patterns). If `FALSE`, missing
-#' values in the original variable will be recoded to `default`. The latter
-#' behaviour prevents unintentional overwriting of missing values with `default`,
-#' which means that you won't find valid values where the original data only
-#' had missing values. See 'Examples'.
+#' values in the original variable will be recoded to `default`. Setting
+#' `preserve_na = TRUE` prevents unintentional overwriting of missing values
+#' with `default`, which means that you won't find valid values where the
+#' original data only had missing values. See 'Examples'.
 #' @param verbose Toggle warnings.
 #'
 #' @return A vector with recoded values.
diff --git a/man/recode_into.Rd b/man/recode_into.Rd
index d8d0a337d..b0acc7c9f 100644
--- a/man/recode_into.Rd
+++ b/man/recode_into.Rd
@@ -35,10 +35,10 @@ printed to alert such situations and to avoid unintentional recodings.}
 \item{preserve_na}{Logical, if \code{TRUE} and \code{default} is not \code{NA}, missing
 values in the original variable will be set back to \code{NA} in the recoded
 variable (unless overwritten by other recode patterns). If \code{FALSE}, missing
-values in the original variable will be recoded to \code{default}. The latter
-behaviour prevents unintentional overwriting of missing values with \code{default},
-which means that you won't find valid values where the original data only
-had missing values. See 'Examples'.}
+values in the original variable will be recoded to \code{default}. Setting
+\code{preserve_na = TRUE} prevents unintentional overwriting of missing values
+with \code{default}, which means that you won't find valid values where the
+original data only had missing values. See 'Examples'.}
 
 \item{verbose}{Toggle warnings.}
 }

From fbd4430d7143e186b14952f6e3da7d2c8e55e4df Mon Sep 17 00:00:00 2001
From: Daniel <mail@danielluedecke.de>
Date: Fri, 8 Sep 2023 13:06:10 +0200
Subject: [PATCH 5/8] fix labels_to_levels (#456)

* fix labels_to_levels

* fix

* lintr

* lintr

* add comments
---
 DESCRIPTION                            |  2 +-
 NEWS.md                                |  3 +++
 R/utils_labels.R                       | 16 +++++++++++++++-
 tests/testthat/test-labels_to_levels.R | 24 ++++++++++++++++++++++--
 4 files changed, 41 insertions(+), 4 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index 76d6967bc..9fed6f068 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,7 +1,7 @@
 Type: Package
 Package: datawizard
 Title: Easy Data Wrangling and Statistical Transformations
-Version: 0.8.0.10
+Version: 0.8.0.11
 Authors@R: c(
     person("Indrajeet", "Patil", , "patilindrajeet.science@gmail.com", role = "aut",
            comment = c(ORCID = "0000-0003-1995-6531", Twitter = "@patilindrajeets")),
diff --git a/NEWS.md b/NEWS.md
index 295570ec4..160c0fae2 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -25,6 +25,9 @@ CHANGES
 
 BUG FIXES
 
+* Fixed issue in `labels_to_levels()` when values of labels were not in sorted
+  order and values were not sequentially numbered.
+
 * Fixed issues in `data_write()` when writing labelled data into SPSS format
   and vectors were of different type as value labels.
 
diff --git a/R/utils_labels.R b/R/utils_labels.R
index 54b1c46fd..67b3ecc6a 100644
--- a/R/utils_labels.R
+++ b/R/utils_labels.R
@@ -46,7 +46,21 @@
       "Not all factor levels had a matching value label. Non-matching levels were preserved."
     )
   }
-  levels(x)[levels_in_labs] <- names(value_labels[labs_in_levels])
+  if (length(value_labels) == length(levels_in_labs)) {
+    # when length of value_labels and levels_in_labs is identical, we can simply
+    # replace the levels with the value labels. This makes sure than levels or
+    # value labels, which are not sorted or not sequentially numbered, match.
+    # Example:
+    # x <- c(5, 5, 1, 3, 1, 7)
+    # attr(x, "labels") <- c(no = 7, yes = 1, maybe = 3, `don't know` = 5)
+    # to_factor(x, labels_to_levels = TRUE)
+    levels(x)[levels_in_labs] <- names(value_labels)
+  } else {
+    # else, we need to select only those value labels that have a matching level
+    # (in labs_in_levels). This is required when not all values that have labels
+    # appear in the data.
+    levels(x)[levels_in_labs] <- names(value_labels[labs_in_levels])
+  }
   attr(x, "labels") <- NULL
 
   x
diff --git a/tests/testthat/test-labels_to_levels.R b/tests/testthat/test-labels_to_levels.R
index 518dac70e..55105acfe 100644
--- a/tests/testthat/test-labels_to_levels.R
+++ b/tests/testthat/test-labels_to_levels.R
@@ -8,13 +8,13 @@ test_that("labels_to_levels, numeric", {
 test_that("labels_to_levels, factor", {
   data(efc)
   x <- as.factor(efc$c172code)
-  attr(x, "labels") <- c("low" = 1, "mid" = 2, "high" = 3)
+  attr(x, "labels") <- c(low = 1, mid = 2, high = 3)
   x <- labels_to_levels(x)
   expect_identical(levels(x), c("low", "mid", "high"))
   expect_equal(table(x), table(efc$c172code), ignore_attr = TRUE)
 
   x <- as.ordered(efc$c172code)
-  attr(x, "labels") <- c("low" = 1, "mid" = 2, "high" = 3)
+  attr(x, "labels") <- c(low = 1, mid = 2, high = 3)
   x <- labels_to_levels(x)
   expect_identical(levels(x), c("low", "mid", "high"))
   expect_s3_class(x, "ordered")
@@ -40,3 +40,23 @@ test_that("labels_to_levels, factor, data frame", {
   )
   expect_identical(sum(vapply(efc, is.factor, TRUE)), 1L)
 })
+
+test_that("labels_to_levels, factor, with random value numbers (no sequential order)", {
+  x <- c(5, 5, 1, 3, 1, 7)
+  attr(x, "labels") <- c(no = 7, yes = 1, maybe = 3, `don't know` = 5)
+  out <- to_factor(x, labels_to_levels = TRUE)
+  expect_identical(as.character(out), c("don't know", "don't know", "yes", "maybe", "yes", "no"))
+  expect_identical(levels(out), c("yes", "maybe", "don't know", "no"))
+
+  x <- c(4, 4, 1, 2, 1, 3)
+  attr(x, "labels") <- c(a = 1, b = 2, c = 3, d = 4)
+  out <- to_factor(x, labels_to_levels = TRUE)
+  expect_identical(as.character(out), c("d", "d", "a", "b", "a", "c"))
+  expect_identical(levels(out), c("a", "b", "c", "d"))
+
+  x <- c(4, 4, 1, 2, 1, 3)
+  attr(x, "labels") <- c(d = 1, c = 2, b = 3, a = 4)
+  out <- to_factor(x, labels_to_levels = TRUE)
+  expect_identical(as.character(out), c("a", "a", "d", "c", "d", "b"))
+  expect_identical(levels(out), c("d", "c", "b", "a"))
+})

From bcbc115cd36979e839637ab7068fa69d1dfea655 Mon Sep 17 00:00:00 2001
From: Daniel <mail@danielluedecke.de>
Date: Fri, 8 Sep 2023 16:51:10 +0200
Subject: [PATCH 6/8] Need more fixing (#457)

* fix labels_to_levels

* fix

* lintr

* lintr

* add comments

* still not working for all edge cases

* fix

* namespace

* fix

* desc

* styler
---
 DESCRIPTION                            |  2 +-
 R/utils_labels.R                       | 26 ++++-----
 tests/testthat/test-labels_to_levels.R | 73 ++++++++++++++++++++++++++
 3 files changed, 85 insertions(+), 16 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index 9fed6f068..c062db49e 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,7 +1,7 @@
 Type: Package
 Package: datawizard
 Title: Easy Data Wrangling and Statistical Transformations
-Version: 0.8.0.11
+Version: 0.8.0.12
 Authors@R: c(
     person("Indrajeet", "Patil", , "patilindrajeet.science@gmail.com", role = "aut",
            comment = c(ORCID = "0000-0003-1995-6531", Twitter = "@patilindrajeets")),
diff --git a/R/utils_labels.R b/R/utils_labels.R
index 67b3ecc6a..a7f4fa2c3 100644
--- a/R/utils_labels.R
+++ b/R/utils_labels.R
@@ -46,21 +46,17 @@
       "Not all factor levels had a matching value label. Non-matching levels were preserved."
     )
   }
-  if (length(value_labels) == length(levels_in_labs)) {
-    # when length of value_labels and levels_in_labs is identical, we can simply
-    # replace the levels with the value labels. This makes sure than levels or
-    # value labels, which are not sorted or not sequentially numbered, match.
-    # Example:
-    # x <- c(5, 5, 1, 3, 1, 7)
-    # attr(x, "labels") <- c(no = 7, yes = 1, maybe = 3, `don't know` = 5)
-    # to_factor(x, labels_to_levels = TRUE)
-    levels(x)[levels_in_labs] <- names(value_labels)
-  } else {
-    # else, we need to select only those value labels that have a matching level
-    # (in labs_in_levels). This is required when not all values that have labels
-    # appear in the data.
-    levels(x)[levels_in_labs] <- names(value_labels[labs_in_levels])
-  }
+  # we need to find out which levels have no labelled value
+  missing_levels <- levels(x)[!levels(x) %in% value_labels]
+
+  # and we need to remove those value labels that don't have a matching level
+  value_labels <- value_labels[value_labels %in% levels(x)]
+
+  # for levels that have no label, we just keep the original factor level
+  value_labels <- c(value_labels, stats::setNames(missing_levels, missing_levels))
+
+  # now we can add back levels
+  levels(x) <- names(value_labels)[order(as.numeric(value_labels))]
   attr(x, "labels") <- NULL
 
   x
diff --git a/tests/testthat/test-labels_to_levels.R b/tests/testthat/test-labels_to_levels.R
index 55105acfe..866154c8f 100644
--- a/tests/testthat/test-labels_to_levels.R
+++ b/tests/testthat/test-labels_to_levels.R
@@ -59,4 +59,77 @@ test_that("labels_to_levels, factor, with random value numbers (no sequential or
   out <- to_factor(x, labels_to_levels = TRUE)
   expect_identical(as.character(out), c("a", "a", "d", "c", "d", "b"))
   expect_identical(levels(out), c("d", "c", "b", "a"))
+
+  x <- c(5, 5, 1, 3, 1, 7)
+  attr(x, "labels") <- c(no = 7, yes = 1, maybe = 3, `don't know` = 5)
+  out <- to_factor(x, labels_to_levels = TRUE)
+  expect_identical(
+    out,
+    structure(
+      c(3L, 3L, 1L, 2L, 1L, 4L),
+      levels = c("yes", "maybe", "don't know", "no"),
+      class = "factor"
+    )
+  )
+  expect_identical(
+    as.character(out),
+    c("don't know", "don't know", "yes", "maybe", "yes", "no")
+  )
+
+  x <- c(5, 5, 1, 3, 1, 7, 4)
+  attr(x, "labels") <- c(no = 7, yes = 1, maybe = 3, `don't know` = 5)
+  expect_message(
+    {
+      out <- to_factor(x, labels_to_levels = TRUE)
+    },
+    regex = "Not all factor levels"
+  )
+  expect_identical(
+    out,
+    structure(
+      c(4L, 4L, 1L, 2L, 1L, 5L, 3L),
+      levels = c("yes", "maybe", "4", "don't know", "no"),
+      class = "factor"
+    )
+  )
+  expect_identical(
+    as.character(out),
+    c("don't know", "don't know", "yes", "maybe", "yes", "no", "4")
+  )
+
+  x <- c(5, 5, 1, 3, 1, 7)
+  attr(x, "labels") <- c(no = 7, yes = 1, maybe = 4, `don't know` = 5)
+  expect_message({
+    out <- to_factor(x, labels_to_levels = TRUE)
+  })
+  expect_identical(
+    out,
+    structure(
+      c(3L, 3L, 1L, 2L, 1L, 4L),
+      levels = c("yes", "3", "don't know", "no"),
+      class = "factor"
+    )
+  )
+  expect_identical(
+    as.character(out),
+    c("don't know", "don't know", "yes", "3", "yes", "no")
+  )
+
+  x <- c(5, 5, 1, 3, 1, 7, 6)
+  attr(x, "labels") <- c(no = 7, yes = 1, maybe = 4, `don't know` = 5)
+  expect_message({
+    out <- to_factor(x, labels_to_levels = TRUE)
+  })
+  expect_identical(
+    out,
+    structure(
+      c(3L, 3L, 1L, 2L, 1L, 5L, 4L),
+      levels = c("yes", "3", "don't know", "6", "no"),
+      class = "factor"
+    )
+  )
+  expect_identical(
+    as.character(out),
+    c("don't know", "don't know", "yes", "3", "yes", "no", "6")
+  )
 })

From 6e09435dc978454436faf0d33230d1d69b1ce8ee Mon Sep 17 00:00:00 2001
From: Daniel <mail@danielluedecke.de>
Date: Fri, 8 Sep 2023 17:15:53 +0200
Subject: [PATCH 7/8] rename 1

---
 R/{data_peek.r => _data_peek.R}     | 0
 R/{data_write.r => _data_write.R}   | 0
 R/{recode_into.r => _recode_into.R} | 0
 3 files changed, 0 insertions(+), 0 deletions(-)
 rename R/{data_peek.r => _data_peek.R} (100%)
 rename R/{data_write.r => _data_write.R} (100%)
 rename R/{recode_into.r => _recode_into.R} (100%)

diff --git a/R/data_peek.r b/R/_data_peek.R
similarity index 100%
rename from R/data_peek.r
rename to R/_data_peek.R
diff --git a/R/data_write.r b/R/_data_write.R
similarity index 100%
rename from R/data_write.r
rename to R/_data_write.R
diff --git a/R/recode_into.r b/R/_recode_into.R
similarity index 100%
rename from R/recode_into.r
rename to R/_recode_into.R

From ae7df24b9dce7207e361dd7906fc241743fb90f1 Mon Sep 17 00:00:00 2001
From: Daniel <mail@danielluedecke.de>
Date: Fri, 8 Sep 2023 17:16:14 +0200
Subject: [PATCH 8/8] rename 2

---
 R/{_data_peek.R => data_peek.R}     | 0
 R/{_data_write.R => data_write.R}   | 0
 R/{_recode_into.R => recode_into.R} | 0
 3 files changed, 0 insertions(+), 0 deletions(-)
 rename R/{_data_peek.R => data_peek.R} (100%)
 rename R/{_data_write.R => data_write.R} (100%)
 rename R/{_recode_into.R => recode_into.R} (100%)

diff --git a/R/_data_peek.R b/R/data_peek.R
similarity index 100%
rename from R/_data_peek.R
rename to R/data_peek.R
diff --git a/R/_data_write.R b/R/data_write.R
similarity index 100%
rename from R/_data_write.R
rename to R/data_write.R
diff --git a/R/_recode_into.R b/R/recode_into.R
similarity index 100%
rename from R/_recode_into.R
rename to R/recode_into.R