Merge branch 'main' into strengejacke/issue697

easystats · Jun 5, 2024 · dda4683 · dda4683
2 parents 5fa72a2 + 7701624
commit dda4683
Show file tree

Hide file tree

Showing 68 changed files with 160 additions and 132 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Type: Package
 Package: performance
 Title: Assessment of Regression Models Performance
-Version: 0.11.0.6
+Version: 0.11.0.9
 Authors@R: 
     c(person(given = "Daniel",
              family = "Lüdecke",
@@ -124,7 +124,7 @@ Suggests:
     nonnest2,
     ordinal,
     parallel,
-    parameters (>= 0.21.4),
+    parameters (>= 0.21.6),
     patchwork,
     pscl,
     psych,
@@ -154,3 +154,4 @@ Config/Needs/website:
     r-lib/pkgdown,
     easystats/easystatstemplate
 Config/rcmdcheck/ignore-inconsequential-notes: true
+Remotes: easystats/insight, easystats/datawizard, easystats/parameters, easystats/bayestestR
diff --git a/NEWS.md b/NEWS.md
@@ -5,6 +5,10 @@
 * Aliases `posterior_predictive_check()` and `check_posterior_predictions()` for
   `check_predictions()` are deprecated.
 
+* Arguments named `group` or `group_by` will be deprecated in a future release.
+  Please use `by` instead. This affects `check_heterogeneity_bias()` in
+  *performance*.
+
 ## General
 
 * Improved documentation and new vignettes added.
@@ -15,6 +19,9 @@
   the usual style as for other models and no longer returns plots from
   `bayesplot::pp_check()`.
 
+* Updated the trained model that is used to prediction distributions in
+  `check_distribution()`.
+
 ## Bug fixes
 
 * `check_model()` now falls back on normal Q-Q plots when a model is not supported

diff --git a/R/binned_residuals.R b/R/binned_residuals.R
@@ -86,7 +86,6 @@ binned_residuals <- function(model,
                              iterations = 1000,
                              verbose = TRUE,
                              ...) {
-  # match arguments
   ci_type <- match.arg(ci_type)
   residuals <- match.arg(residuals)
 

diff --git a/R/check_autocorrelation.R b/R/check_autocorrelation.R
@@ -29,7 +29,6 @@ check_autocorrelation <- function(x, ...) {
 #' @rdname check_autocorrelation
 #' @export
 check_autocorrelation.default <- function(x, nsim = 1000, ...) {
-  # check for valid input
   .is_model_valid(x)
 
   .residuals <- stats::residuals(x)

diff --git a/R/check_clusterstructure.R b/R/check_clusterstructure.R
@@ -11,7 +11,7 @@
 #' number of square shaped blocks along the diagonal.
 #'
 #' @param x A data frame.
-#' @param standardize Standardize the dataframe before clustering (default).
+#' @param standardize Standardize the data frame before clustering (default).
 #' @param distance Distance method used. Other methods than "euclidean"
 #'   (default) are exploratory in the context of clustering tendency. See
 #'   [stats::dist()] for list of available methods.

diff --git a/R/check_collinearity.R b/R/check_collinearity.R
@@ -145,7 +145,6 @@ multicollinearity <- check_collinearity
 #' @rdname check_collinearity
 #' @export
 check_collinearity.default <- function(x, ci = 0.95, verbose = TRUE, ...) {
-  # check for valid input
   .is_model_valid(x)
   .check_collinearity(x, component = "conditional", ci = ci, verbose = verbose)
 }

diff --git a/R/check_convergence.R b/R/check_convergence.R
@@ -76,7 +76,6 @@ check_convergence <- function(x, tolerance = 0.001, ...) {
 
 #' @export
 check_convergence.default <- function(x, tolerance = 0.001, ...) {
-  # check for valid input
   .is_model_valid(x)
   message(sprintf("`check_convergence()` does not work for models of class '%s'.", class(x)[1]))
 }

diff --git a/R/check_distribution.R b/R/check_distribution.R
@@ -34,12 +34,11 @@ NULL
 #' This function uses an internal random forest model to classify the
 #' distribution from a model-family. Currently, following distributions are
 #' trained (i.e. results of `check_distribution()` may be one of the
-#' following): `"bernoulli"`, `"beta"`, `"beta-binomial"`,
-#' `"binomial"`, `"chi"`, `"exponential"`, `"F"`,
-#' `"gamma"`, `"lognormal"`, `"normal"`, `"negative
-#' binomial"`, `"negative binomial (zero-inflated)"`, `"pareto"`,
-#' `"poisson"`, `"poisson (zero-inflated)"`, `"uniform"` and
-#' `"weibull"`.
+#' following): `"bernoulli"`, `"beta"`, `"beta-binomial"`, `"binomial"`,
+#' `"cauchy"`, `"chi"`, `"exponential"`, `"F"`, `"gamma"`, `"half-cauchy"`,
+#' `"inverse-gamma"`, `"lognormal"`, `"normal"`, `"negative binomial"`,
+#' `"negative binomial (zero-inflated)"`, `"pareto"`, `"poisson"`,
+#' `"poisson (zero-inflated)"`, `"tweedie"`, `"uniform"` and `"weibull"`.
 #' \cr \cr
 #' Note the similarity between certain distributions according to shape, skewness,
 #' etc. Thus, the predicted distribution may not be perfectly representing the
@@ -67,7 +66,6 @@ check_distribution <- function(model) {
 
 #' @export
 check_distribution.default <- function(model) {
-  # check for valid input
   .is_model_valid(model)
 
   insight::check_if_installed("randomForest")
@@ -193,23 +191,40 @@ check_distribution.numeric <- function(model) {
   # validation check, remove missings
   x <- x[!is.na(x)]
 
-  # this might fail, so we wrap in ".safe()"
-  map_est <- .safe(mean(x) - as.numeric(bayestestR::map_estimate(x, bw = "nrd0")))
+  mode <- NULL
+  # find mode for integer, or MAP for distributions
+  if (all(.is_integer(x))) {
+    mode <- datawizard::distribution_mode(x)
+  } else {
+    # this might fail, so we wrap in ".safe()"
+    mode <- tryCatch(
+      as.numeric(bayestestR::map_estimate(x, bw = "nrd0")),
+      error = function(e) NULL
+    )
+    if (is.null(mode)) {
+      mode <- tryCatch(
+        as.numeric(bayestestR::map_estimate(x, bw = "kernel")),
+        error = function(e) NULL
+      )
+    }
+  }
 
-  if (is.null(map_est)) {
-    map_est <- mean(x) - datawizard::distribution_mode(x)
+  if (is.null(mode)) {
+    mean_mode_diff <- mean(x) - datawizard::distribution_mode(x)
     msg <- "Could not accurately estimate the mode."
     if (!is.null(type)) {
       msg <- paste(msg, "Predicted distribution of the", type, "may be less accurate.")
     }
     insight::format_alert(msg)
+  } else {
+    mean_mode_diff <- .safe(mean(x) - mode)
   }
 
   data.frame(
     SD = stats::sd(x),
     MAD = stats::mad(x, constant = 1),
     Mean_Median_Distance = mean(x) - stats::median(x),
-    Mean_Mode_Distance = map_est,
+    Mean_Mode_Distance = mean_mode_diff,
     SD_MAD_Distance = stats::sd(x) - stats::mad(x, constant = 1),
     Var_Mean_Distance = stats::var(x) - mean(x),
     Range_SD = diff(range(x)) / stats::sd(x),
@@ -222,6 +237,7 @@ check_distribution.numeric <- function(model) {
     Min = min(x),
     Max = max(x),
     Proportion_Positive = sum(x >= 0) / length(x),
+    Proportion_Zero = sum(x == 0) / length(x),
     Integer = all(.is_integer(x))
   )
 }

diff --git a/R/check_factorstructure.R b/R/check_factorstructure.R
@@ -50,7 +50,7 @@
 #' exclusion them from the analysis (note that you would need to re-compute the
 #' KMO indices as they are dependent on the whole dataset).
 #'
-#' @param x A dataframe or a correlation matrix. If the latter is passed, `n`
+#' @param x A data frame or a correlation matrix. If the latter is passed, `n`
 #'   must be provided.
 #' @param n If a correlation matrix was passed, the number of observations must
 #'   be specified.

diff --git a/R/check_heterogeneity_bias.R b/R/check_heterogeneity_bias.R
@@ -8,9 +8,10 @@
 #' @param select Character vector (or formula) with names of variables to select
 #'   that should be checked. If `x` is a mixed model object, this argument
 #'   will be ignored.
-#' @param group Character vector (or formula) with the name of the variable that
+#' @param by Character vector (or formula) with the name of the variable that
 #'   indicates the group- or cluster-ID. If `x` is a model object, this
 #'   argument will be ignored.
+#' @param group Deprecated. Use `by` instead.
 #'
 #' @seealso
 #' For further details, read the vignette
@@ -25,32 +26,38 @@
 #' @examples
 #' data(iris)
 #' iris$ID <- sample(1:4, nrow(iris), replace = TRUE) # fake-ID
-#' check_heterogeneity_bias(iris, select = c("Sepal.Length", "Petal.Length"), group = "ID")
+#' check_heterogeneity_bias(iris, select = c("Sepal.Length", "Petal.Length"), by = "ID")
 #' @export
-check_heterogeneity_bias <- function(x, select = NULL, group = NULL) {
+check_heterogeneity_bias <- function(x, select = NULL, by = NULL, group = NULL) {
+  ## TODO: deprecate later
+  if (!is.null(group)) {
+    insight::format_warning("Argument `group` is deprecated and will be removed in a future release. Please use `by` instead.") # nolint
+    by <- group
+  }
   if (insight::is_model(x)) {
-    group <- insight::find_random(x, split_nested = TRUE, flatten = TRUE)
-    if (is.null(group)) {
-      insight::format_error("Model is no mixed model. Please provide a mixed model, or a data frame and arguments `select` and `group`.") # nolint
+    by <- insight::find_random(x, split_nested = TRUE, flatten = TRUE)
+    if (is.null(by)) {
+      insight::format_error("Model is no mixed model. Please provide a mixed model, or a data frame and arguments `select` and `by`.") # nolint
     }
     my_data <- insight::get_data(x, source = "mf", verbose = FALSE)
     select <- insight::find_predictors(x, effects = "fixed", component = "conditional", flatten = TRUE)
   } else {
     if (inherits(select, "formula")) {
       select <- all.vars(select)
     }
-    if (inherits(group, "formula")) {
-      group <- all.vars(group)
+    if (inherits(by, "formula")) {
+      by <- all.vars(by)
     }
     my_data <- x
   }
 
-  unique_groups <- .n_unique(my_data[[group]])
-  combinations <- expand.grid(select, group)
+  unique_groups <- .n_unique(my_data[[by]])
+  combinations <- expand.grid(select, by)
 
   result <- Map(function(predictor, id) {
     # demean predictor
-    d <- datawizard::demean(my_data, select = predictor, group = id, verbose = FALSE)
+
+    d <- datawizard::demean(my_data, select = predictor, by = id, verbose = FALSE)
 
     # get new names
     within_name <- paste0(predictor, "_within")

diff --git a/R/check_heteroscedasticity.R b/R/check_heteroscedasticity.R
@@ -46,7 +46,6 @@ check_heteroskedasticity <- check_heteroscedasticity
 
 #' @export
 check_heteroscedasticity.default <- function(x, ...) {
-  # check for valid input
   .is_model_valid(x)
 
   # only for linear models

diff --git a/R/check_itemscale.R b/R/check_itemscale.R
@@ -66,7 +66,6 @@
 #' )
 #' @export
 check_itemscale <- function(x, factor_index = NULL) {
-  # check for valid input
   if (!inherits(x, c("parameters_pca", "data.frame"))) {
     insight::format_error(
       "`x` must be an object of class `parameters_pca`, as returned by `parameters::principal_components()`, or a data frame." # nolint

diff --git a/R/check_normality.R b/R/check_normality.R
@@ -55,7 +55,6 @@ check_normality <- function(x, ...) {
 
 #' @export
 check_normality.default <- function(x, ...) {
-  # check for valid input
   .is_model_valid(x)
 
   if (!insight::model_info(x)$is_linear) {

diff --git a/R/check_outliers.R b/R/check_outliers.R
@@ -109,7 +109,7 @@
 #'  default threshold to classify outliers is 1.959 (`threshold = list("zscore" = 1.959)`),
 #'  corresponding to the 2.5% (`qnorm(0.975)`) most extreme observations
 #'  (assuming the data is normally distributed). Importantly, the Z-score
-#'  method is univariate: it is computed column by column. If a dataframe is
+#'  method is univariate: it is computed column by column. If a data frame is
 #'  passed, the Z-score is calculated for each variable separately, and the
 #'  maximum (absolute) Z-score is kept for each observations. Thus, all
 #'  observations that are extreme on at least one variable might be detected
@@ -304,14 +304,14 @@
 #' outliers_list <- check_outliers(data$mpg) # Find outliers
 #' outliers_list # Show the row index of the outliers
 #' as.numeric(outliers_list) # The object is a binary vector...
-#' filtered_data <- data[!outliers_list, ] # And can be used to filter a dataframe
+#' filtered_data <- data[!outliers_list, ] # And can be used to filter a data frame
 #' nrow(filtered_data) # New size, 28 (4 outliers removed)
 #'
 #' # Find all observations beyond +/- 2 SD
 #' check_outliers(data$mpg, method = "zscore", threshold = 2)
 #'
 #' # For dataframes ------------------------------------------------------
-#' check_outliers(data) # It works the same way on dataframes
+#' check_outliers(data) # It works the same way on data frames
 #'
 #' # You can also use multiple methods at once
 #' outliers_list <- check_outliers(data, method = c(
@@ -948,7 +948,7 @@ check_outliers.data.frame <- function(x,
   outlier_count <- lapply(outlier_count, function(x) {
     num.df <- x[!names(x) %in% c("Row", ID)]
     if (isTRUE(nrow(num.df) >= 1)) {
-      num.df <- datawizard::change_code(
+      num.df <- datawizard::recode_values(
         num.df,
         recode = list(`2` = "(Multivariate)")
       )

diff --git a/R/check_overdispersion.R b/R/check_overdispersion.R
@@ -77,7 +77,6 @@ check_overdispersion <- function(x, ...) {
 
 #' @export
 check_overdispersion.default <- function(x, ...) {
-  # check for valid input
   .is_model_valid(x)
   insight::format_error(
     paste0("`check_overdisperion()` not yet implemented for models of class `", class(x)[1], "`.")
@@ -289,7 +288,6 @@ check_overdispersion.glmmTMB <- check_overdispersion.merMod
 #' @rdname check_overdispersion
 #' @export
 check_overdispersion.performance_simres <- function(x, alternative = c("two.sided", "less", "greater"), ...) {
-  # match arguments
   alternative <- match.arg(alternative)
 
   # check for special arguments - we may pass "object_name" from other methods

diff --git a/R/check_predictions.R b/R/check_predictions.R
@@ -100,7 +100,6 @@ check_predictions.default <- function(object,
                                       type = "density",
                                       verbose = TRUE,
                                       ...) {
-  # check for valid input
   .is_model_valid(object)
 
   # retrieve model information

diff --git a/R/check_singularity.R b/R/check_singularity.R
@@ -99,8 +99,11 @@
 #'
 #' # Fixing singularity issues using priors in glmmTMB
 #' # Example taken from `vignette("priors", package = "glmmTMB")`
-#' dat <- readRDS(system.file("vignette_data", "gophertortoise.rds",
-#'                package = "glmmTMB"))
+#' dat <- readRDS(system.file(
+#'   "vignette_data",
+#'   "gophertortoise.rds",
+#'   package = "glmmTMB"
+#' ))
 #' model <- glmmTMB::glmmTMB(
 #'   shells ~ prev + offset(log(Area)) + factor(year) + (1 | Site),
 #'   family = poisson,

diff --git a/R/check_zeroinflation.R b/R/check_zeroinflation.R
@@ -120,7 +120,6 @@ check_zeroinflation.performance_simres <- function(x,
                                                    tolerance = 0.1,
                                                    alternative = c("two.sided", "less", "greater"),
                                                    ...) {
-  # match arguments
   alternative <- match.arg(alternative)
 
   # compute test results

diff --git a/R/display.R b/R/display.R
@@ -44,12 +44,3 @@ display.compare_performance <- display.performance_model
 
 #' @export
 display.check_itemscale <- display.performance_model
-
-
-
-
-# Reexports models ------------------------
-
-#' @importFrom insight display
-#' @export
-insight::display
diff --git a/R/item_discrimination.R b/R/item_discrimination.R
@@ -16,8 +16,8 @@
 #' This function calculates the item discriminations (corrected item-total
 #' correlations for each item of `x` with the remaining items) for each item
 #' of a scale. The absolute value of the item discrimination indices should be
-#' above 0.2. An index between 0.2 and 0.4 is considered as "fair", while a
-#' satisfactory index ranges from 0.4 to 0.7. Items with low discrimination
+#' above `0.2`. An index between `0.2` and `0.4` is considered as "fair", while a
+#' satisfactory index ranges from `0.4` to `0.7`. Items with low discrimination
 #' indices are often ambiguously worded and should be examined. Items with
 #' negative indices should be examined to determine why a negative value was
 #' obtained (e.g. reversed answer categories regarding positive and negative

diff --git a/R/model_performance.bayesian.R b/R/model_performance.bayesian.R
@@ -79,7 +79,7 @@ model_performance.stanreg <- function(model, metrics = "all", verbose = TRUE, ..
     metrics <- c("LOOIC", "WAIC", "R2", "RMSE")
   }
 
-  # check for valid input
+
   metrics <- toupper(.check_bad_metrics(metrics, all_metrics, verbose))
 
   algorithm <- insight::find_algorithm(model)
@@ -232,7 +232,7 @@ model_performance.BFBayesFactor <- function(model,
     metrics <- all_metrics
   }
 
-  # check for valid input
+
   metrics <- toupper(.check_bad_metrics(metrics, all_metrics, verbose))
 
   # check for valid BFBayesFactor object

diff --git a/R/model_performance.bife.R b/R/model_performance.bife.R
@@ -11,7 +11,7 @@ model_performance.bife <- function(model, metrics = "all", verbose = TRUE, ...)
     metrics <- c("AIC", "R2")
   }
 
-  # check for valid input
+
   metrics <- .check_bad_metrics(metrics, all_metrics, verbose)
 
   info <- insight::model_info(model)

diff --git a/R/model_performance.ivreg.R b/R/model_performance.ivreg.R
@@ -24,7 +24,7 @@ model_performance.ivreg <- function(model, metrics = "all", verbose = TRUE, ...)
     metrics <- c("AIC", "BIC", "R2", "R2_adj", "RMSE")
   }
 
-  # check for valid input
+
   metrics <- .check_bad_metrics(metrics, all_metrics, verbose)
 
   # the lm-method does not accept ivreg-specific metrics