From 3b08f69b584832f9fd5da2984599f47b14448d4d Mon Sep 17 00:00:00 2001 From: Daniel Date: Tue, 24 Oct 2023 19:20:54 +0200 Subject: [PATCH 01/14] fix --- R/binned_residuals.R | 29 +++++++++++++++++++++-------- 1 file changed, 21 insertions(+), 8 deletions(-) diff --git a/R/binned_residuals.R b/R/binned_residuals.R index 286323968..2614b67a6 100644 --- a/R/binned_residuals.R +++ b/R/binned_residuals.R @@ -62,7 +62,15 @@ #' } #' #' @export -binned_residuals <- function(model, term = NULL, n_bins = NULL, show_dots = NULL, ...) { +binned_residuals <- function(model, + term = NULL, + n_bins = NULL, + show_dots = NULL, + residuals = c("gaussian", "exact", "boot"), + ci = 0.95, + ...) { + + residuals <- match.arg(residuals) fv <- stats::fitted(model) mf <- insight::get_data(model, verbose = FALSE) @@ -78,7 +86,8 @@ binned_residuals <- function(model, term = NULL, n_bins = NULL, show_dots = NULL show_dots <- is.null(n) || n <= 1e5 } - y <- .recode_to_zero(insight::get_response(model, verbose = FALSE)) - fv + y0 <- .recode_to_zero(insight::get_response(model, verbose = FALSE)) + y <- y0 - fv if (is.null(n_bins)) n_bins <- round(sqrt(length(pred))) @@ -95,24 +104,28 @@ binned_residuals <- function(model, term = NULL, n_bins = NULL, show_dots = NULL n <- length(items) sdev <- stats::sd(y[items], na.rm = TRUE) - data.frame( + r <- switch(residuals, + gaussian = stats::qnorm(c((1 - ci) / 2, (1 + ci) / 2), mean = ybar, sd = sdev / sqrt(n)), + exact = stats:::binom.test(sum(y0[items]), n)$conf.int - fv, + boot = Hmisc::smean.cl.boot(y[items], conf.int = ci)[c("Lower", "Upper")] + ) + names(r) <- c("CI_low", "CI_high") + + d0 <- data.frame( xbar = xbar, ybar = ybar, n = n, x.lo = model.range[1], x.hi = model.range[2], - se = stats::qnorm(0.975) * sdev / sqrt(n), + se = stats::qnorm((1 + ci) / 2) * sdev / sqrt(n), ci_range = sdev / sqrt(n) ) + cbind(d0, rbind(r)) })) d <- do.call(rbind, d) d <- d[stats::complete.cases(d), ] - # CIs - d$CI_low <- d$ybar - stats::qnorm(0.975) * d$ci_range - d$CI_high <- d$ybar + stats::qnorm(0.975) * d$ci_range - gr <- abs(d$ybar) > abs(d$se) d$group <- "yes" d$group[gr] <- "no" From 66da88519d1783af1ab47fe58e7a8c6b0c17acce Mon Sep 17 00:00:00 2001 From: Daniel Date: Tue, 24 Oct 2023 19:46:54 +0200 Subject: [PATCH 02/14] work --- R/binned_residuals.R | 24 ++++++++++++++++++++---- man/binned_residuals.Rd | 11 ++++++++++- 2 files changed, 30 insertions(+), 5 deletions(-) diff --git a/R/binned_residuals.R b/R/binned_residuals.R index 2614b67a6..3210a228c 100644 --- a/R/binned_residuals.R +++ b/R/binned_residuals.R @@ -66,11 +66,12 @@ binned_residuals <- function(model, term = NULL, n_bins = NULL, show_dots = NULL, - residuals = c("gaussian", "exact", "boot"), ci = 0.95, + ci_type = c("gaussian", "exact", "boot"), + iterations = 1000, ...) { - residuals <- match.arg(residuals) + ci_type <- match.arg(ci_type) fv <- stats::fitted(model) mf <- insight::get_data(model, verbose = FALSE) @@ -104,10 +105,10 @@ binned_residuals <- function(model, n <- length(items) sdev <- stats::sd(y[items], na.rm = TRUE) - r <- switch(residuals, + r <- switch(ci_type, gaussian = stats::qnorm(c((1 - ci) / 2, (1 + ci) / 2), mean = ybar, sd = sdev / sqrt(n)), exact = stats:::binom.test(sum(y0[items]), n)$conf.int - fv, - boot = Hmisc::smean.cl.boot(y[items], conf.int = ci)[c("Lower", "Upper")] + boot = .boot_binned_ci(y[items], ci, iterations) ) names(r) <- c("CI_low", "CI_high") @@ -142,6 +143,21 @@ binned_residuals <- function(model, } +# utilities --------------------------- + +.boot_binned_ci <- function(x, ci = 0.95, iterations = 1000) { + x <- x[!is.na(x)] + n <- length(x) + out <- vector("numeric", iterations) + for (i in seq_len(iterations)) { + out[i] <- sum(x[sample.int(n, n, replace = TRUE)]) + } + out <- out / n + + quant <- stats::quantile(out, c((1 - ci) / 2, (1 + ci) / 2)) + c(CI_low = quant[1L], CI_high = quant[2L]) +} + # methods ----------------------------- diff --git a/man/binned_residuals.Rd b/man/binned_residuals.Rd index b8aee666e..f0362c091 100644 --- a/man/binned_residuals.Rd +++ b/man/binned_residuals.Rd @@ -4,7 +4,16 @@ \alias{binned_residuals} \title{Binned residuals for binomial logistic regression} \usage{ -binned_residuals(model, term = NULL, n_bins = NULL, show_dots = NULL, ...) +binned_residuals( + model, + term = NULL, + n_bins = NULL, + show_dots = NULL, + ci = 0.95, + ci_type = c("gaussian", "exact", "boot"), + iterations = 1000, + ... +) } \arguments{ \item{model}{A \code{glm}-object with \emph{binomial}-family.} From 038dcddf85feab176f3414249bc39d63bdc29ee4 Mon Sep 17 00:00:00 2001 From: Daniel Date: Tue, 24 Oct 2023 23:15:24 +0200 Subject: [PATCH 03/14] some fixes --- R/binned_residuals.R | 26 +++++++++++++++++++++----- man/binned_residuals.Rd | 12 ++++++++++++ 2 files changed, 33 insertions(+), 5 deletions(-) diff --git a/R/binned_residuals.R b/R/binned_residuals.R index 3210a228c..77b3bae78 100644 --- a/R/binned_residuals.R +++ b/R/binned_residuals.R @@ -11,6 +11,13 @@ #' @param n_bins Numeric, the number of bins to divide the data. If #' `n_bins = NULL`, the square root of the number of observations is #' taken. +#' @param ci Numeric, the confidence level for the error bounds. +#' @param ci_type Character, the type of error bounds to calculate. Can be +#' `"gaussian"` (default), `"exact"` or `"boot"`. +#' @param residuals Character, the type of residuals to calculate. Can be +#' `"response"` (default), `"pearson"` or `"deviance"`. +#' @param iterations Integer, the number of iterations to use for the +#' bootstrap method. Only used if `ci_type = "boot"`. #' @param show_dots Logical, if `TRUE`, will show data points in the plot. Set #' to `FALSE` for models with many observations, if generating the plot is too #' time-consuming. By default, `show_dots = NULL`. In this case `binned_residuals()` @@ -68,15 +75,18 @@ binned_residuals <- function(model, show_dots = NULL, ci = 0.95, ci_type = c("gaussian", "exact", "boot"), + residuals = c("response", "pearson", "deviance"), iterations = 1000, ...) { - + # match arguments ci_type <- match.arg(ci_type) - fv <- stats::fitted(model) + residuals <- match.arg(residuals) + + fitted_values <- stats::fitted(model) mf <- insight::get_data(model, verbose = FALSE) if (is.null(term)) { - pred <- fv + pred <- fitted_values } else { pred <- mf[[term]] } @@ -88,7 +98,13 @@ binned_residuals <- function(model, } y0 <- .recode_to_zero(insight::get_response(model, verbose = FALSE)) - y <- y0 - fv + + # calculate residuals + y <- switch(residuals, + response = y0 - fitted_values, + pearson = .safe((y0 - fitted_values) / sqrt(fitted_values * (1 - fitted_values))), + deviance = .safe(stats::residuals(model, type = "deviance")) + ) if (is.null(n_bins)) n_bins <- round(sqrt(length(pred))) @@ -107,7 +123,7 @@ binned_residuals <- function(model, r <- switch(ci_type, gaussian = stats::qnorm(c((1 - ci) / 2, (1 + ci) / 2), mean = ybar, sd = sdev / sqrt(n)), - exact = stats:::binom.test(sum(y0[items]), n)$conf.int - fv, + exact = stats:::binom.test(sum(y0[items]), n)$conf.int, boot = .boot_binned_ci(y[items], ci, iterations) ) names(r) <- c("CI_low", "CI_high") diff --git a/man/binned_residuals.Rd b/man/binned_residuals.Rd index f0362c091..b913ec53a 100644 --- a/man/binned_residuals.Rd +++ b/man/binned_residuals.Rd @@ -11,6 +11,7 @@ binned_residuals( show_dots = NULL, ci = 0.95, ci_type = c("gaussian", "exact", "boot"), + residuals = c("response", "pearson", "deviance"), iterations = 1000, ... ) @@ -33,6 +34,17 @@ time-consuming. By default, \code{show_dots = NULL}. In this case \code{binned_r tries to guess whether performance will be poor due to a very large model and thus automatically shows or hides dots.} +\item{ci}{Numeric, the confidence level for the error bounds.} + +\item{ci_type}{Character, the type of error bounds to calculate. Can be +\code{"gaussian"} (default), \code{"exact"} or \code{"boot"}.} + +\item{residuals}{Character, the type of residuals to calculate. Can be +\code{"response"} (default), \code{"pearson"} or \code{"deviance"}.} + +\item{iterations}{Integer, the number of iterations to use for the +bootstrap method. Only used if \code{ci_type = "boot"}.} + \item{...}{Currently not used.} } \value{ From aaa8eb6acbb02957a980eb939a17004b3d20714f Mon Sep 17 00:00:00 2001 From: Daniel Date: Tue, 24 Oct 2023 23:34:03 +0200 Subject: [PATCH 04/14] center CI --- R/binned_residuals.R | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/R/binned_residuals.R b/R/binned_residuals.R index 77b3bae78..7d9ec9803 100644 --- a/R/binned_residuals.R +++ b/R/binned_residuals.R @@ -123,7 +123,11 @@ binned_residuals <- function(model, r <- switch(ci_type, gaussian = stats::qnorm(c((1 - ci) / 2, (1 + ci) / 2), mean = ybar, sd = sdev / sqrt(n)), - exact = stats:::binom.test(sum(y0[items]), n)$conf.int, + exact = { + out <- stats:::binom.test(sum(y0[items]), n)$conf.int + out <- out - (min(out) - ybar) - (diff(out) / 2) + out + }, boot = .boot_binned_ci(y[items], ci, iterations) ) names(r) <- c("CI_low", "CI_high") From 7ad5946dd7755e486d15d7d85d08c1ffa02629d0 Mon Sep 17 00:00:00 2001 From: Daniel Date: Wed, 25 Oct 2023 08:40:25 +0200 Subject: [PATCH 05/14] docs, minor fixes --- R/binned_residuals.R | 24 +++++++++++++++++------- R/check_model.R | 17 +++++++++-------- man/binned_residuals.Rd | 14 ++++++++++---- man/check_model.Rd | 3 ++- 4 files changed, 38 insertions(+), 20 deletions(-) diff --git a/R/binned_residuals.R b/R/binned_residuals.R index 7d9ec9803..d7461c8c7 100644 --- a/R/binned_residuals.R +++ b/R/binned_residuals.R @@ -13,9 +13,15 @@ #' taken. #' @param ci Numeric, the confidence level for the error bounds. #' @param ci_type Character, the type of error bounds to calculate. Can be -#' `"gaussian"` (default), `"exact"` or `"boot"`. +#' `"exact"` (default), `"gaussian"` or `"boot"`. `"exact"` calculates the +#' error bounds based on the exact binomial distribution, using [`binom.test()`]. +#' `"gaussian"` uses the Gaussian approximation, while `"boot"` uses a simple +#' bootstrap method, where confidence intervals are calculated based on the +#' quantiles of the bootstrap distribution. #' @param residuals Character, the type of residuals to calculate. Can be -#' `"response"` (default), `"pearson"` or `"deviance"`. +#' `"deviance"` (default), `"pearson"` or `"response"`. It is recommended to +#' use `"response"` only for those models where other residuals are not +#' available. #' @param iterations Integer, the number of iterations to use for the #' bootstrap method. Only used if `ci_type = "boot"`. #' @param show_dots Logical, if `TRUE`, will show data points in the plot. Set @@ -74,8 +80,8 @@ binned_residuals <- function(model, n_bins = NULL, show_dots = NULL, ci = 0.95, - ci_type = c("gaussian", "exact", "boot"), - residuals = c("response", "pearson", "deviance"), + ci_type = c("exact", "gaussian", "boot"), + residuals = c("deviance", "pearson", "response"), iterations = 1000, ...) { # match arguments @@ -106,6 +112,11 @@ binned_residuals <- function(model, deviance = .safe(stats::residuals(model, type = "deviance")) ) + # make sure we really have residuals + if (is.null(y)) { + insight::format_error("Could not calculate residuals. Try using `residuals = \"response\"`.") + } + if (is.null(n_bins)) n_bins <- round(sqrt(length(pred))) breaks.index <- floor(length(pred) * (1:(n_bins - 1)) / n_bins) @@ -124,7 +135,7 @@ binned_residuals <- function(model, r <- switch(ci_type, gaussian = stats::qnorm(c((1 - ci) / 2, (1 + ci) / 2), mean = ybar, sd = sdev / sqrt(n)), exact = { - out <- stats:::binom.test(sum(y0[items]), n)$conf.int + out <- stats::binom.test(sum(y0[items]), n)$conf.int out <- out - (min(out) - ybar) - (diff(out) / 2) out }, @@ -138,8 +149,7 @@ binned_residuals <- function(model, n = n, x.lo = model.range[1], x.hi = model.range[2], - se = stats::qnorm((1 + ci) / 2) * sdev / sqrt(n), - ci_range = sdev / sqrt(n) + se = stats::qnorm((1 + ci) / 2) * sdev / sqrt(n) ) cbind(d0, rbind(r)) })) diff --git a/R/check_model.R b/R/check_model.R index bbf6c6a84..5e7567d7f 100644 --- a/R/check_model.R +++ b/R/check_model.R @@ -35,7 +35,8 @@ #' tries to guess whether performance will be poor due to a very large model #' and thus automatically shows or hides dots. #' @param verbose If `FALSE` (default), suppress most warning messages. -#' @param ... Currently not used. +#' @param ... Arguments passed down to the individual check functions, especially +#' to `check_predictions()` and `binned_residuals()`. #' @inheritParams check_predictions #' #' @return The data frame that is used for plotting. @@ -185,11 +186,11 @@ check_model.default <- function(x, ca <- tryCatch( { if (minfo$is_bayesian) { - suppressWarnings(.check_assumptions_stan(x)) + suppressWarnings(.check_assumptions_stan(x, ...)) } else if (minfo$is_linear) { - suppressWarnings(.check_assumptions_linear(x, minfo, verbose)) + suppressWarnings(.check_assumptions_linear(x, minfo, verbose, ...)) } else { - suppressWarnings(.check_assumptions_glm(x, minfo, verbose)) + suppressWarnings(.check_assumptions_glm(x, minfo, verbose, ...)) } }, error = function(e) { @@ -346,7 +347,7 @@ check_model.model_fit <- function(x, threshold <- NULL } dat$INFLUENTIAL <- .influential_obs(model, threshold = threshold) - dat$PP_CHECK <- .safe(check_predictions(model)) + dat$PP_CHECK <- .safe(check_predictions(model, ...)) dat <- insight::compact_list(dat) class(dat) <- c("check_model", "see_check_model") @@ -357,7 +358,7 @@ check_model.model_fit <- function(x, # compile plots for checks of generalized linear models ------------------------ -.check_assumptions_glm <- function(model, model_info, verbose = TRUE) { +.check_assumptions_glm <- function(model, model_info, verbose = TRUE, ...) { dat <- list() dat$VIF <- .diag_vif(model, verbose = verbose) @@ -371,9 +372,9 @@ check_model.model_fit <- function(x, threshold <- NULL } dat$INFLUENTIAL <- .influential_obs(model, threshold = threshold) - dat$PP_CHECK <- .safe(check_predictions(model)) + dat$PP_CHECK <- .safe(check_predictions(model, ...)) if (isTRUE(model_info$is_binomial)) { - dat$BINNED_RESID <- binned_residuals(model) + dat$BINNED_RESID <- binned_residuals(model, ...) } if (isTRUE(model_info$is_count)) { dat$OVERDISPERSION <- .diag_overdispersion(model) diff --git a/man/binned_residuals.Rd b/man/binned_residuals.Rd index b913ec53a..33e710f11 100644 --- a/man/binned_residuals.Rd +++ b/man/binned_residuals.Rd @@ -10,8 +10,8 @@ binned_residuals( n_bins = NULL, show_dots = NULL, ci = 0.95, - ci_type = c("gaussian", "exact", "boot"), - residuals = c("response", "pearson", "deviance"), + ci_type = c("exact", "gaussian", "boot"), + residuals = c("deviance", "pearson", "response"), iterations = 1000, ... ) @@ -37,10 +37,16 @@ and thus automatically shows or hides dots.} \item{ci}{Numeric, the confidence level for the error bounds.} \item{ci_type}{Character, the type of error bounds to calculate. Can be -\code{"gaussian"} (default), \code{"exact"} or \code{"boot"}.} +\code{"exact"} (default), \code{"gaussian"} or \code{"boot"}. \code{"exact"} calculates the +error bounds based on the exact binomial distribution, using \code{\link[=binom.test]{binom.test()}}. +\code{"gaussian"} uses the Gaussian approximation, while \code{"boot"} uses a simple +bootstrap method, where confidence intervals are calculated based on the +quantiles of the bootstrap distribution.} \item{residuals}{Character, the type of residuals to calculate. Can be -\code{"response"} (default), \code{"pearson"} or \code{"deviance"}.} +\code{"deviance"} (default), \code{"pearson"} or \code{"response"}. It is recommended to +use \code{"response"} only for those models where other residuals are not +available.} \item{iterations}{Integer, the number of iterations to use for the bootstrap method. Only used if \code{ci_type = "boot"}.} diff --git a/man/check_model.Rd b/man/check_model.Rd index 2bf82af92..d5ead9420 100644 --- a/man/check_model.Rd +++ b/man/check_model.Rd @@ -28,7 +28,8 @@ check_model(x, ...) \arguments{ \item{x}{A model object.} -\item{...}{Currently not used.} +\item{...}{Arguments passed down to the individual check functions, especially +to \code{check_predictions()} and \code{binned_residuals()}.} \item{dot_size, line_size}{Size of line and dot-geoms.} From 79d29f922a0a709e5b5bec675fb4a44b5a040470 Mon Sep 17 00:00:00 2001 From: Daniel Date: Wed, 25 Oct 2023 08:46:48 +0200 Subject: [PATCH 06/14] add tests --- tests/testthat/test-binned_residuals.R | 102 +++++++++++++++++++++++-- 1 file changed, 96 insertions(+), 6 deletions(-) diff --git a/tests/testthat/test-binned_residuals.R b/tests/testthat/test-binned_residuals.R index 4aa69e0ec..6f0155092 100644 --- a/tests/testthat/test-binned_residuals.R +++ b/tests/testthat/test-binned_residuals.R @@ -1,10 +1,10 @@ test_that("binned_residuals", { data(mtcars) model <- glm(vs ~ wt + mpg, data = mtcars, family = "binomial") - result <- binned_residuals(model) + result <- binned_residuals(model, ci_type = "gaussian", residuals = "response") expect_named( result, - c("xbar", "ybar", "n", "x.lo", "x.hi", "se", "ci_range", "CI_low", "CI_high", "group") + c("xbar", "ybar", "n", "x.lo", "x.hi", "se", "CI_low", "CI_high", "group") ) expect_equal( result$xbar, @@ -16,16 +16,21 @@ test_that("binned_residuals", { c(-0.03786, -0.09514, 0.07423, -0.07955, 0.28891, -0.13786), tolerance = 1e-4 ) + expect_equal( + result$CI_low, + c(-0.05686, -0.12331, -0.35077, -0.57683, 0.17916, -0.44147), + tolerance = 1e-4 + ) }) test_that("binned_residuals, n_bins", { data(mtcars) model <- glm(vs ~ wt + mpg, data = mtcars, family = "binomial") - result <- binned_residuals(model, n_bins = 10) + result <- binned_residuals(model, ci_type = "gaussian", residuals = "response", n_bins = 10) expect_named( result, - c("xbar", "ybar", "n", "x.lo", "x.hi", "se", "ci_range", "CI_low", "CI_high", "group") + c("xbar", "ybar", "n", "x.lo", "x.hi", "se", "CI_low", "CI_high", "group") ) expect_equal( result$xbar, @@ -49,10 +54,10 @@ test_that("binned_residuals, n_bins", { test_that("binned_residuals, terms", { data(mtcars) model <- glm(vs ~ wt + mpg, data = mtcars, family = "binomial") - result <- binned_residuals(model, term = "mpg") + result <- binned_residuals(model, ci_type = "gaussian", residuals = "response", term = "mpg") expect_named( result, - c("xbar", "ybar", "n", "x.lo", "x.hi", "se", "ci_range", "CI_low", "CI_high", "group") + c("xbar", "ybar", "n", "x.lo", "x.hi", "se", "CI_low", "CI_high", "group") ) expect_equal( result$xbar, @@ -65,3 +70,88 @@ test_that("binned_residuals, terms", { tolerance = 1e-4 ) }) + + +test_that("binned_residuals, deviance residuals, gaussian CI", { + data(mtcars) + model <- glm(vs ~ wt + mpg, data = mtcars, family = "binomial") + result <- binned_residuals(model, residuals = "deviance", ci_type = "gaussian") + expect_named( + result, + c("xbar", "ybar", "n", "x.lo", "x.hi", "se", "CI_low", "CI_high", "group") + ) + expect_equal( + result$xbar, + c(0.03786, 0.09514, 0.25911, 0.47955, 0.71109, 0.97119), + tolerance = 1e-4 + ) + expect_equal( + result$ybar, + c(-0.26905, -0.44334, 0.03763, -0.19917, 0.81563, -0.23399), + tolerance = 1e-4 + ) + expect_equal( + result$ybar, + c(-0.26905, -0.44334, 0.03763, -0.19917, 0.81563, -0.23399), + tolerance = 1e-4 + ) + expect_equal( + result$CI_low, + c(-0.33985, -0.50865, -0.98255, -1.36025, 0.61749, -1.00913), + tolerance = 1e-4 + ) +}) + + +test_that("binned_residuals, default", { + data(mtcars) + model <- glm(vs ~ wt + mpg, data = mtcars, family = "binomial") + result <- binned_residuals(model) + expect_named( + result, + c("xbar", "ybar", "n", "x.lo", "x.hi", "se", "CI_low", "CI_high", "group") + ) + expect_equal( + result$xbar, + c(0.03786, 0.09514, 0.25911, 0.47955, 0.71109, 0.97119), + tolerance = 1e-4 + ) + expect_equal( + result$ybar, + c(-0.26905, -0.44334, 0.03763, -0.19917, 0.81563, -0.23399), + tolerance = 1e-4 + ) + expect_equal( + result$CI_low, + c(-0.52997, -0.70426, -0.32935, -0.59948, 0.55472, -0.55251), + tolerance = 1e-4 + ) +}) + + +test_that("binned_residuals, bootstrapped CI", { + skip_on_cran() + data(mtcars) + model <- glm(vs ~ wt + mpg, data = mtcars, family = "binomial") + set.seed(123) + result <- binned_residuals(model, ci_type = "boot", iterations = 100) + expect_named( + result, + c("xbar", "ybar", "n", "x.lo", "x.hi", "se", "CI_low", "CI_high", "group") + ) + expect_equal( + result$xbar, + c(0.03786, 0.09514, 0.25911, 0.47955, 0.71109, 0.97119), + tolerance = 1e-4 + ) + expect_equal( + result$ybar, + c(-0.26905, -0.44334, 0.03763, -0.19917, 0.81563, -0.23399), + tolerance = 1e-4 + ) + expect_equal( + result$CI_low, + c(-0.32623, -0.50543, -0.80879, -1.15154, 0.67569, -0.65748), + tolerance = 1e-4 + ) +}) From 66a4f1b0bec61ac5bff08e2611e9195166280483 Mon Sep 17 00:00:00 2001 From: Daniel Date: Wed, 25 Oct 2023 08:49:49 +0200 Subject: [PATCH 07/14] news --- NEWS.md | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/NEWS.md b/NEWS.md index e3eedf313..2bb37d162 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,15 @@ # performance 0.10.7 +## Breaking changes + +* `binned_residuals()` gains a few new arguments to control the residuals used + for the test, as well as different options to calculate confidence intervals + (namely, `ci_type`, `residuals`, `ci` and `iterations`). The default values + to compute binned residuals have changed. Default residuals are now "deviance" + residuals (and no longer "response" residuals). Default confidence intervals + are now "exact" intervals (and no longer based on Gaussian approximation). + Use `ci_type = "gaussian"` and `residuals = "response"` to get the old defaults. + ## Changes to functions * `binned_residuals()` - like `check_model()` - gains a `show_dots` argument to From 8a358a062c03904681a8a0a9357e774e56d92c08 Mon Sep 17 00:00:00 2001 From: Daniel Date: Wed, 25 Oct 2023 12:00:16 +0200 Subject: [PATCH 08/14] fix --- R/check_model.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/check_model.R b/R/check_model.R index 5e7567d7f..80f14d2b0 100644 --- a/R/check_model.R +++ b/R/check_model.R @@ -331,7 +331,7 @@ check_model.model_fit <- function(x, # compile plots for checks of linear models ------------------------ -.check_assumptions_linear <- function(model, model_info, verbose = TRUE) { +.check_assumptions_linear <- function(model, model_info, verbose = TRUE, ...) { dat <- list() dat$VIF <- .diag_vif(model, verbose = verbose) @@ -389,7 +389,7 @@ check_model.model_fit <- function(x, # compile plots for checks of Bayesian models ------------------------ -.check_assumptions_stan <- function(model) { +.check_assumptions_stan <- function(model, ...) { if (inherits(model, "brmsfit")) { # check if brms can be loaded From 1a32f04e9c0a6315406d04246a28dfb6675ae1cd Mon Sep 17 00:00:00 2001 From: Daniel Date: Wed, 25 Oct 2023 12:01:34 +0200 Subject: [PATCH 09/14] fix --- R/check_outliers.R | 2 +- man/check_outliers.Rd | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/R/check_outliers.R b/R/check_outliers.R index f7b304c5e..5e0c55380 100644 --- a/R/check_outliers.R +++ b/R/check_outliers.R @@ -263,7 +263,7 @@ #' #' - Thériault, R., Ben-Shachar, M. S., Patil, I., Lüdecke, D., Wiernik, B. M., #' and Makowski, D. (2023). Check your outliers! An introduction to identifying -#' statistical outliers in R with easystats. https://doi.org/10.31234/osf.io/bu6nt +#' statistical outliers in R with easystats. \doi{10.31234/osf.io/bu6nt} #' #' - Rousseeuw, P. J., and Van Zomeren, B. C. (1990). Unmasking multivariate #' outliers and leverage points. Journal of the American Statistical diff --git a/man/check_outliers.Rd b/man/check_outliers.Rd index f22a51f6a..74c992b6f 100644 --- a/man/check_outliers.Rd +++ b/man/check_outliers.Rd @@ -345,7 +345,7 @@ statistical models. Journal of Open Source Software, 6(60), 3139. \doi{10.21105/joss.03139} \item Thériault, R., Ben-Shachar, M. S., Patil, I., Lüdecke, D., Wiernik, B. M., and Makowski, D. (2023). Check your outliers! An introduction to identifying -statistical outliers in R with easystats. https://doi.org/10.31234/osf.io/bu6nt +statistical outliers in R with easystats. \doi{10.31234/osf.io/bu6nt} \item Rousseeuw, P. J., and Van Zomeren, B. C. (1990). Unmasking multivariate outliers and leverage points. Journal of the American Statistical association, 85(411), 633-639. From bdc6ecb0dbbaae36252a02e511deac4af72adb69 Mon Sep 17 00:00:00 2001 From: Daniel Date: Wed, 25 Oct 2023 12:23:16 +0200 Subject: [PATCH 10/14] lintr --- R/check_outliers.R | 94 +++++++++++++++++++++++----------------------- 1 file changed, 47 insertions(+), 47 deletions(-) diff --git a/R/check_outliers.R b/R/check_outliers.R index 5e0c55380..655bf239e 100644 --- a/R/check_outliers.R +++ b/R/check_outliers.R @@ -410,16 +410,16 @@ check_outliers.default <- function(x, } # Others - if (!all(method %in% c("cook", "pareto"))) { + if (all(method %in% c("cook", "pareto"))) { + df <- data.frame(Row = seq_len(nrow(as.data.frame(data)))) + outlier_count <- list() + outlier_var <- list() + } else { out <- check_outliers(data, method, threshold) outlier_var <- attributes(out)$outlier_var outlier_count <- attributes(out)$outlier_count df <- attributes(out)$data - df <- df[!names(df) %in% "Outlier"] - } else { - df <- data.frame(Row = seq_len(nrow(as.data.frame(data)))) - outlier_count <- list() - outlier_var <- list() + df <- df[!names(df) == "Outlier"] } # Cook @@ -449,17 +449,17 @@ check_outliers.default <- function(x, outlier_count$cook <- count.table - if (!all(method %in% c("cook", "pareto"))) { + if (all(method %in% c("cook", "pareto"))) { + outlier_count$all <- count.table + } else { outlier_count$all <- datawizard::data_merge( list(outlier_count$all, count.table), join = "full", by = "Row" ) - } else { - outlier_count$all <- count.table } } else { - method <- method[!(method %in% "cook")] + method <- method[!(method == "cook")] } # Pareto @@ -1442,21 +1442,21 @@ check_outliers.metabin <- check_outliers.metagen lof <- 0.001 list( - "zscore" = zscore, - "zscore_robust" = zscore_robust, - "iqr" = iqr, - "ci" = ci, - "hdi" = hdi, - "eti" = eti, - "bci" = bci, - "cook" = cook, - "pareto" = pareto, - "mahalanobis" = mahalanobis, - "mahalanobis_robust" = mahalanobis_robust, - "mcd" = mcd, - "ics" = ics, - "optics" = optics, - "lof" = lof + zscore = zscore, + zscore_robust = zscore_robust, + iqr = iqr, + ci = ci, + hdi = hdi, + eti = eti, + bci = bci, + cook = cook, + pareto = pareto, + mahalanobis = mahalanobis, + mahalanobis_robust = mahalanobis_robust, + mcd = mcd, + ics = ics, + optics = optics, + lof = lof ) } @@ -1504,8 +1504,8 @@ check_outliers.metabin <- check_outliers.metagen out$Outlier_Zscore <- as.numeric(out$Distance_Zscore > threshold) output <- list( - "data_zscore" = out, - "threshold_zscore" = threshold + data_zscore = out, + threshold_zscore = threshold ) if (isTRUE(robust)) { @@ -1566,8 +1566,8 @@ check_outliers.metabin <- check_outliers.metagen }, numeric(1)) list( - "data_iqr" = out, - "threshold_iqr" = threshold + data_iqr = out, + threshold_iqr = threshold ) } @@ -1615,8 +1615,8 @@ check_outliers.metabin <- check_outliers.metagen out <- cbind(out.0, out) output <- list( - "data_" = out, - "threshold_" = threshold + data_ = out, + threshold_ = threshold ) names(output) <- paste0(names(output), method) output @@ -1636,8 +1636,8 @@ check_outliers.metabin <- check_outliers.metagen out$Outlier_Cook <- as.numeric(out$Distance_Cook > threshold) list( - "data_cook" = out, - "threshold_cook" = threshold + data_cook = out, + threshold_cook = threshold ) } @@ -1656,8 +1656,8 @@ check_outliers.metabin <- check_outliers.metagen out$Outlier_Pareto <- as.numeric(out$Distance_Pareto > threshold) list( - "data_pareto" = out, - "threshold_pareto" = threshold + data_pareto = out, + threshold_pareto = threshold ) } @@ -1686,8 +1686,8 @@ check_outliers.metabin <- check_outliers.metagen out$Outlier_Mahalanobis <- as.numeric(out$Distance_Mahalanobis > threshold) list( - "data_mahalanobis" = out, - "threshold_mahalanobis" = threshold + data_mahalanobis = out, + threshold_mahalanobis = threshold ) } @@ -1717,8 +1717,8 @@ check_outliers.metabin <- check_outliers.metagen ) list( - "data_mahalanobis_robust" = out, - "threshold_mahalanobis_robust" = threshold + data_mahalanobis_robust = out, + threshold_mahalanobis_robust = threshold ) } @@ -1744,8 +1744,8 @@ check_outliers.metabin <- check_outliers.metagen out$Outlier_MCD <- as.numeric(out$Distance_MCD > threshold) list( - "data_mcd" = out, - "threshold_mcd" = threshold + data_mcd = out, + threshold_mcd = threshold ) } @@ -1822,8 +1822,8 @@ check_outliers.metabin <- check_outliers.metagen # Out list( - "data_ics" = out, - "threshold_ics" = threshold + data_ics = out, + threshold_ics = threshold ) } @@ -1854,8 +1854,8 @@ check_outliers.metabin <- check_outliers.metagen } list( - "data_optics" = out, - "threshold_optics" = threshold + data_optics = out, + threshold_optics = threshold ) } @@ -1921,8 +1921,8 @@ check_outliers.metabin <- check_outliers.metagen out$Outlier_LOF <- as.numeric(out$Distance_LOF > cutoff) list( - "data_lof" = out, - "threshold_lof" = threshold + data_lof = out, + threshold_lof = threshold ) } From 1df774f4427b3019cdd72c9787a95361ffb71a06 Mon Sep 17 00:00:00 2001 From: Daniel Date: Wed, 25 Oct 2023 12:24:22 +0200 Subject: [PATCH 11/14] lintr --- R/check_model.R | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/R/check_model.R b/R/check_model.R index 80f14d2b0..58757468f 100644 --- a/R/check_model.R +++ b/R/check_model.R @@ -203,7 +203,7 @@ check_model.default <- function(x, } # try to find sensible default for "type" argument - suggest_dots <- (minfo$is_bernoulli || minfo$is_count || minfo$is_ordinal || minfo$is_categorical || minfo$is_multinomial) + suggest_dots <- (minfo$is_bernoulli || minfo$is_count || minfo$is_ordinal || minfo$is_categorical || minfo$is_multinomial) # nolint if (missing(type) && suggest_dots) { type <- "discrete_interval" } @@ -341,10 +341,10 @@ check_model.model_fit <- function(x, dat$NCV <- .diag_ncv(model, verbose = verbose) dat$HOMOGENEITY <- .diag_homogeneity(model, verbose = verbose) dat$OUTLIERS <- check_outliers(model, method = "cook") - if (!is.null(dat$OUTLIERS)) { - threshold <- attributes(dat$OUTLIERS)$threshold$cook - } else { + if (is.null(dat$OUTLIERS)) { threshold <- NULL + } else { + threshold <- attributes(dat$OUTLIERS)$threshold$cook } dat$INFLUENTIAL <- .influential_obs(model, threshold = threshold) dat$PP_CHECK <- .safe(check_predictions(model, ...)) @@ -366,10 +366,10 @@ check_model.model_fit <- function(x, dat$HOMOGENEITY <- .diag_homogeneity(model, verbose = verbose) dat$REQQ <- .diag_reqq(model, level = 0.95, model_info = model_info, verbose = verbose) dat$OUTLIERS <- check_outliers(model, method = "cook") - if (!is.null(dat$OUTLIERS)) { - threshold <- attributes(dat$OUTLIERS)$threshold$cook - } else { + if (is.null(dat$OUTLIERS)) { threshold <- NULL + } else { + threshold <- attributes(dat$OUTLIERS)$threshold$cook } dat$INFLUENTIAL <- .influential_obs(model, threshold = threshold) dat$PP_CHECK <- .safe(check_predictions(model, ...)) From 9e1ca4d88136c36123036d7d49ff3e59650e730c Mon Sep 17 00:00:00 2001 From: Daniel Date: Wed, 25 Oct 2023 12:36:58 +0200 Subject: [PATCH 12/14] add comments --- R/binned_residuals.R | 2 ++ 1 file changed, 2 insertions(+) diff --git a/R/binned_residuals.R b/R/binned_residuals.R index d7461c8c7..d8444ee7d 100644 --- a/R/binned_residuals.R +++ b/R/binned_residuals.R @@ -103,6 +103,7 @@ binned_residuals <- function(model, show_dots <- is.null(n) || n <= 1e5 } + # make sure response is 0/1 (and numeric) y0 <- .recode_to_zero(insight::get_response(model, verbose = FALSE)) # calculate residuals @@ -136,6 +137,7 @@ binned_residuals <- function(model, gaussian = stats::qnorm(c((1 - ci) / 2, (1 + ci) / 2), mean = ybar, sd = sdev / sqrt(n)), exact = { out <- stats::binom.test(sum(y0[items]), n)$conf.int + # center CIs around point estimate out <- out - (min(out) - ybar) - (diff(out) / 2) out }, From 9aa5922a9a673bd9f648c718bffbb5ff94720d3d Mon Sep 17 00:00:00 2001 From: Daniel Date: Wed, 25 Oct 2023 13:26:43 +0200 Subject: [PATCH 13/14] lintr --- R/check_outliers.R | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/R/check_outliers.R b/R/check_outliers.R index 655bf239e..2a1d185d3 100644 --- a/R/check_outliers.R +++ b/R/check_outliers.R @@ -489,17 +489,17 @@ check_outliers.default <- function(x, outlier_count$pareto <- count.table - if (!all(method %in% c("cook", "pareto"))) { + if (all(method %in% c("cook", "pareto"))) { + outlier_count$all <- count.table + } else { outlier_count$all <- datawizard::data_merge( list(outlier_count$all, count.table), join = "full", by = "Row" ) - } else { - outlier_count$all <- count.table } } else { - method <- method[!(method %in% "pareto")] + method <- method[!(method == "pareto")] } outlier_count$all <- datawizard::convert_na_to(outlier_count$all, @@ -1478,15 +1478,15 @@ check_outliers.metabin <- check_outliers.metagen x <- as.data.frame(x) # Standardize - if (!robust) { + if (robust) { d <- abs(as.data.frame(lapply( x, - function(x) (x - mean(x, na.rm = TRUE)) / stats::sd(x, na.rm = TRUE) + function(x) (x - stats::median(x, na.rm = TRUE)) / stats::mad(x, na.rm = TRUE) ))) } else { d <- abs(as.data.frame(lapply( x, - function(x) (x - stats::median(x, na.rm = TRUE)) / stats::mad(x, na.rm = TRUE) + function(x) (x - mean(x, na.rm = TRUE)) / stats::sd(x, na.rm = TRUE) ))) } @@ -1765,10 +1765,10 @@ check_outliers.metabin <- check_outliers.metagen insight::check_if_installed("ICSOutlier") # Get n cores - n_cores <- if (!requireNamespace("parallel", quietly = TRUE)) { - NULL - } else { + n_cores <- if (requireNamespace("parallel", quietly = TRUE)) { getOption("mc.cores", 1L) + } else { + NULL } # tell user about n-cores option From bf4b8fffc8f2dbd6c4b568f836bf1260bcb9262e Mon Sep 17 00:00:00 2001 From: Daniel Date: Wed, 25 Oct 2023 14:25:40 +0200 Subject: [PATCH 14/14] use better variable name --- R/binned_residuals.R | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/R/binned_residuals.R b/R/binned_residuals.R index d8444ee7d..8b5533bb0 100644 --- a/R/binned_residuals.R +++ b/R/binned_residuals.R @@ -133,7 +133,7 @@ binned_residuals <- function(model, n <- length(items) sdev <- stats::sd(y[items], na.rm = TRUE) - r <- switch(ci_type, + conf_int <- switch(ci_type, gaussian = stats::qnorm(c((1 - ci) / 2, (1 + ci) / 2), mean = ybar, sd = sdev / sqrt(n)), exact = { out <- stats::binom.test(sum(y0[items]), n)$conf.int @@ -143,7 +143,7 @@ binned_residuals <- function(model, }, boot = .boot_binned_ci(y[items], ci, iterations) ) - names(r) <- c("CI_low", "CI_high") + names(conf_int) <- c("CI_low", "CI_high") d0 <- data.frame( xbar = xbar, @@ -153,7 +153,7 @@ binned_residuals <- function(model, x.hi = model.range[2], se = stats::qnorm((1 + ci) / 2) * sdev / sqrt(n) ) - cbind(d0, rbind(r)) + cbind(d0, rbind(conf_int)) })) d <- do.call(rbind, d)