diff --git a/CRAN-SUBMISSION b/CRAN-SUBMISSION index b1f325ddd..afc762333 100644 --- a/CRAN-SUBMISSION +++ b/CRAN-SUBMISSION @@ -1,3 +1,3 @@ -Version: 0.10.8 -Date: 2023-10-29 16:13:36 UTC -SHA: c94741704caad7b8cde589e941af5d2f2826e6c2 +Version: 0.10.9 +Date: 2024-02-17 07:56:08 UTC +SHA: 051016febd197937ad083266b630c871fa9e1623 diff --git a/DESCRIPTION b/DESCRIPTION index 30285f897..c4784fa9b 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Type: Package Package: performance Title: Assessment of Regression Models Performance -Version: 0.10.8.10 +Version: 0.10.9 Authors@R: c(person(given = "Daniel", family = "Lüdecke", @@ -69,7 +69,7 @@ BugReports: https://github.com/easystats/performance/issues Depends: R (>= 3.6) Imports: - bayestestR (>= 0.13.1), + bayestestR (>= 0.13.2), insight (>= 0.19.8), datawizard (>= 0.9.1), methods, @@ -124,10 +124,11 @@ Suggests: nonnest2, ordinal, parallel, - parameters (>= 0.20.3), + parameters (>= 0.21.4), patchwork, pscl, psych, + quantreg, qqplotr (>= 0.0.6), randomForest, rempsyc, @@ -135,7 +136,7 @@ Suggests: rstanarm, rstantools, sandwich, - see (>= 0.8.1), + see (>= 0.8.2), survey, survival, testthat (>= 3.2.1), @@ -144,7 +145,7 @@ Suggests: withr (>= 3.0.0) Encoding: UTF-8 Language: en-US -RoxygenNote: 7.2.3.9000 +RoxygenNote: 7.3.1 Roxygen: list(markdown = TRUE) Config/testthat/edition: 3 Config/testthat/parallel: true @@ -153,4 +154,3 @@ Config/Needs/website: r-lib/pkgdown, easystats/easystatstemplate Config/rcmdcheck/ignore-inconsequential-notes: true -Remotes: easystats/insight diff --git a/NAMESPACE b/NAMESPACE index a35ccca91..226d58481 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -89,6 +89,9 @@ S3method(check_outliers,metagen) S3method(check_outliers,numeric) S3method(check_outliers,rma) S3method(check_outliers,rma.uni) +S3method(check_outliers,rq) +S3method(check_outliers,rqs) +S3method(check_outliers,rqss) S3method(check_overdispersion,default) S3method(check_overdispersion,fixest) S3method(check_overdispersion,fixest_multi) @@ -430,6 +433,8 @@ S3method(r2_coxsnell,survreg) S3method(r2_coxsnell,svycoxph) S3method(r2_coxsnell,truncreg) S3method(r2_efron,default) +S3method(r2_kullback,default) +S3method(r2_kullback,glm) S3method(r2_loo_posterior,BFBayesFactor) S3method(r2_loo_posterior,brmsfit) S3method(r2_loo_posterior,stanmvreg) diff --git a/NEWS.md b/NEWS.md index 6c29b63f5..83264fbe6 100644 --- a/NEWS.md +++ b/NEWS.md @@ -24,6 +24,21 @@ * Clarification in the documentation of the `estimator` argument for `performance_aic()`. +* Improved plots for overdispersion-checks for negative-binomial models from + package *glmmTMB* (affects `check_overdispersion()` and `check_model()`). + +* Improved detection rates for singularity in `check_singularity()` for models + from package *glmmTMB*. + +* For models of class `glmmTMB`, deviance residuals are now used in the + `check_model()` plot. + +* Improved (easier to understand) error messages for `check_model()`, + `check_collinearity()` and `check_outliers()` for models with non-numeric + response variables. + +* `r2_kullback()` now gives an informative error for unsupported models. + ## Bug fixes * Fixed issue in `binned_residuals()` for models with binary outcome, where @@ -32,6 +47,14 @@ * `performance_score()` should no longer fail for models where scoring rules can't be calculated. Instead, an informative message is returned. +* `check_outliers()` now properly accepts the `percentage_central` argument when + using the `"mcd"` method. 
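A minimal usage sketch of this fix (it mirrors the regression tests added further down in this patch; `mtcars` is used purely for illustration):

    library(performance)
    # `percentage_central` is forwarded via `...` to the MCD routine
    out <- check_outliers(mtcars, method = "mcd", percentage_central = 0.5, verbose = FALSE)
    sum(out)  # the new test expects 15 flagged observations for this call
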
+ +* Fixed edge cases in `check_collinearity()` and `check_outliers()` for models + with response variables of classes `Date`, `POSIXct`, `POSIXlt` or `difftime`. + +* Fixed issue with `check_model()` for models of package *quantreg*. + # performance 0.10.8 ## Changes diff --git a/R/check_collinearity.R b/R/check_collinearity.R index 2add49126..8aac2c9d6 100644 --- a/R/check_collinearity.R +++ b/R/check_collinearity.R @@ -405,6 +405,20 @@ check_collinearity.zerocount <- function(x, .check_collinearity <- function(x, component, ci = 0.95, verbose = TRUE) { v <- insight::get_varcov(x, component = component, verbose = FALSE) + + # sanity check + if (is.null(v)) { + if (isTRUE(verbose)) { + insight::format_alert( + paste( + sprintf("Could not extract the variance-covariance matrix for the %s component of the model.", component), + "Please try to run `vcov(model)`, which may help identifying the problem." + ) + ) + } + return(NULL) + } + term_assign <- .term_assignments(x, component, verbose = verbose) # any assignment found? @@ -432,10 +446,8 @@ check_collinearity.zerocount <- function(x, if (insight::has_intercept(x)) { v <- v[-1, -1] term_assign <- term_assign[-1] - } else { - if (isTRUE(verbose)) { - insight::format_alert("Model has no intercept. VIFs may not be sensible.") - } + } else if (isTRUE(verbose)) { + insight::format_alert("Model has no intercept. VIFs may not be sensible.") } f <- insight::find_formula(x) @@ -475,6 +487,11 @@ check_collinearity.zerocount <- function(x, result <- vector("numeric") na_terms <- vector("numeric") + # sanity check - models with offset(?) may contain too many term assignments + if (length(term_assign) > ncol(v)) { + term_assign <- term_assign[seq_len(ncol(v))] + } + for (term in 1:n.terms) { subs <- which(term_assign == term) if (length(subs)) { diff --git a/R/check_heterogeneity_bias.R b/R/check_heterogeneity_bias.R index b87aa9962..d9bb337f9 100644 --- a/R/check_heterogeneity_bias.R +++ b/R/check_heterogeneity_bias.R @@ -31,9 +31,9 @@ check_heterogeneity_bias <- function(x, select = NULL, group = NULL) { if (insight::is_model(x)) { group <- insight::find_random(x, split_nested = TRUE, flatten = TRUE) if (is.null(group)) { - insight::format_error("Model is no mixed model. Please provide a mixed model, or a data frame and arguments `select` and `group`.") + insight::format_error("Model is no mixed model. 
Please provide a mixed model, or a data frame and arguments `select` and `group`.") # nolint } - data <- insight::get_data(x, source = "mf", verbose = FALSE) + my_data <- insight::get_data(x, source = "mf", verbose = FALSE) select <- insight::find_predictors(x, effects = "fixed", component = "conditional", flatten = TRUE) } else { if (inherits(select, "formula")) { @@ -42,15 +42,15 @@ check_heterogeneity_bias <- function(x, select = NULL, group = NULL) { if (inherits(group, "formula")) { group <- all.vars(group) } - data <- x + my_data <- x } - unique_groups <- .n_unique(data[[group]]) + unique_groups <- .n_unique(my_data[[group]]) combinations <- expand.grid(select, group) result <- Map(function(predictor, id) { # demean predictor - d <- datawizard::demean(data, select = predictor, group = id, verbose = FALSE) + d <- datawizard::demean(my_data, select = predictor, group = id, verbose = FALSE) # get new names within_name <- paste0(predictor, "_within") diff --git a/R/check_model.R b/R/check_model.R index e261704a5..bd44558e9 100644 --- a/R/check_model.R +++ b/R/check_model.R @@ -183,23 +183,27 @@ check_model.default <- function(x, minfo <- insight::model_info(x, verbose = FALSE) - ca <- tryCatch( - { - if (minfo$is_bayesian) { - suppressWarnings(.check_assumptions_stan(x, ...)) - } else if (minfo$is_linear) { - suppressWarnings(.check_assumptions_linear(x, minfo, verbose, ...)) - } else { - suppressWarnings(.check_assumptions_glm(x, minfo, verbose, ...)) - } + assumptions_data <- tryCatch( + if (minfo$is_bayesian) { + suppressWarnings(.check_assumptions_stan(x, ...)) + } else if (minfo$is_linear) { + suppressWarnings(.check_assumptions_linear(x, minfo, verbose, ...)) + } else { + suppressWarnings(.check_assumptions_glm(x, minfo, verbose, ...)) }, error = function(e) { - NULL + e } ) - if (is.null(ca)) { - insight::format_error(paste0("`check_model()` not implemented for models of class `", class(x)[1], "` yet.")) + if (inherits(assumptions_data, c("error", "simpleError"))) { + pattern <- "(\n|\\s{2,})" + replacement <- " " + cleaned_string <- gsub(pattern, replacement, assumptions_data$message) + insight::format_error( + paste("`check_model()` returned following error:", cleaned_string), + paste0("\nIf the error message does not help identifying your problem, another reason why `check_model()` failed might be that models of class `", class(x)[1], "` are not yet supported.") # nolint + ) } # try to find sensible default for "type" argument @@ -214,21 +218,22 @@ check_model.default <- function(x, show_dots <- is.null(n) || n <= 1e5 } - attr(ca, "panel") <- panel - attr(ca, "dot_size") <- dot_size - attr(ca, "line_size") <- line_size - attr(ca, "check") <- check - attr(ca, "alpha") <- alpha - attr(ca, "dot_alpha") <- dot_alpha - attr(ca, "show_dots") <- isTRUE(show_dots) - attr(ca, "detrend") <- detrend - attr(ca, "colors") <- colors - attr(ca, "theme") <- theme - attr(ca, "model_info") <- minfo - attr(ca, "overdisp_type") <- list(...)$plot_type - attr(ca, "bandwidth") <- bandwidth - attr(ca, "type") <- type - ca + attr(assumptions_data, "panel") <- panel + attr(assumptions_data, "dot_size") <- dot_size + attr(assumptions_data, "line_size") <- line_size + attr(assumptions_data, "check") <- check + attr(assumptions_data, "alpha") <- alpha + attr(assumptions_data, "dot_alpha") <- dot_alpha + attr(assumptions_data, "show_dots") <- isTRUE(show_dots) + attr(assumptions_data, "detrend") <- detrend + attr(assumptions_data, "colors") <- colors + attr(assumptions_data, "theme") <- theme + 
attr(assumptions_data, "model_info") <- minfo + attr(assumptions_data, "overdisp_type") <- list(...)$plot_type + attr(assumptions_data, "bandwidth") <- bandwidth + attr(assumptions_data, "type") <- type + attr(assumptions_data, "model_class") <- class(x)[1] + assumptions_data } @@ -263,7 +268,7 @@ check_model.stanreg <- function(x, dot_alpha = 0.8, colors = c("#3aaf85", "#1b6ca8", "#cd201f"), theme = "see::theme_lucid", - detrend = FALSE, + detrend = TRUE, show_dots = NULL, bandwidth = "nrd", type = "density", @@ -302,7 +307,7 @@ check_model.model_fit <- function(x, dot_alpha = 0.8, colors = c("#3aaf85", "#1b6ca8", "#cd201f"), theme = "see::theme_lucid", - detrend = FALSE, + detrend = TRUE, show_dots = NULL, bandwidth = "nrd", type = "density", @@ -335,7 +340,7 @@ check_model.model_fit <- function(x, dat <- list() dat$VIF <- .diag_vif(model, verbose = verbose) - dat$QQ <- .diag_qq(model, verbose = verbose) + dat$QQ <- .diag_qq(model, model_info = model_info, verbose = verbose) dat$REQQ <- .diag_reqq(model, level = 0.95, model_info = model_info, verbose = verbose) dat$NORM <- .diag_norm(model, verbose = verbose) dat$NCV <- .diag_ncv(model, verbose = verbose) @@ -362,7 +367,7 @@ check_model.model_fit <- function(x, dat <- list() dat$VIF <- .diag_vif(model, verbose = verbose) - dat$QQ <- .diag_qq(model, verbose = verbose) + dat$QQ <- .diag_qq(model, model_info = model_info, verbose = verbose) dat$HOMOGENEITY <- .diag_homogeneity(model, verbose = verbose) dat$REQQ <- .diag_reqq(model, level = 0.95, model_info = model_info, verbose = verbose) dat$OUTLIERS <- check_outliers(model, method = "cook") diff --git a/R/check_model_diagnostics.R b/R/check_model_diagnostics.R index 6e398efa8..55797cab8 100644 --- a/R/check_model_diagnostics.R +++ b/R/check_model_diagnostics.R @@ -35,11 +35,13 @@ # prepare data for QQ plot ---------------------------------- -.diag_qq <- function(model, verbose = TRUE) { - if (inherits(model, c("lme", "lmerMod", "merMod", "glmmTMB", "gam"))) { +.diag_qq <- function(model, model_info = NULL, verbose = TRUE) { + if (inherits(model, c("lme", "lmerMod", "merMod", "gam"))) { res_ <- stats::residuals(model) } else if (inherits(model, "geeglm")) { res_ <- stats::residuals(model, type = "pearson") + } else if (inherits(model, "glmmTMB")) { + res_ <- stats::residuals(model, type = "deviance") } else if (inherits(model, "glm")) { res_ <- .safe(abs(stats::rstandard(model, type = "deviance"))) } else { @@ -61,7 +63,7 @@ return(NULL) } - if (inherits(model, "glm")) { + if (inherits(model, c("glm", "glmerMod")) || (inherits(model, "glmmTMB") && isFALSE(model_info$is_linear))) { fitted_ <- stats::qnorm((stats::ppoints(length(res_)) + 1) / 2) } else { fitted_ <- stats::fitted(model) @@ -94,29 +96,25 @@ insight::check_if_installed("lme4") tryCatch( - { - if (inherits(model, "glmmTMB")) { - var_attr <- "condVar" - re <- .collapse_cond(lme4::ranef(model, condVar = TRUE)) - } else { - var_attr <- "postVar" - re <- lme4::ranef(model, condVar = TRUE) - } + if (inherits(model, "glmmTMB")) { + var_attr <- "condVar" + re <- .collapse_cond(lme4::ranef(model, condVar = TRUE)) + } else { + var_attr <- "postVar" + re <- lme4::ranef(model, condVar = TRUE) }, error = function(e) { - return(NULL) + NULL } ) se <- tryCatch( - { - suppressWarnings(lapply(re, function(.x) { - pv <- attr(.x, var_attr, exact = TRUE) - cols <- seq_len(dim(pv)[1]) - unlist(lapply(cols, function(.y) sqrt(pv[.y, .y, ]))) - })) - }, + suppressWarnings(lapply(re, function(.x) { + pv <- attr(.x, var_attr, exact = TRUE) + cols <- 
seq_len(dim(pv)[1]) + unlist(lapply(cols, function(.y) sqrt(pv[.y, .y, ]))) + })), error = function(e) { NULL } @@ -186,15 +184,15 @@ n_params <- tryCatch(model$rank, error = function(e) insight::n_parameters(model)) infl <- stats::influence(model, do.coef = FALSE) - resid <- as.numeric(insight::get_residuals(model)) + model_resid <- as.numeric(insight::get_residuals(model)) - std_resid <- tryCatch(stats::rstandard(model, infl), error = function(e) resid) + std_resid <- tryCatch(stats::rstandard(model, infl), error = function(e) model_resid) plot_data <- data.frame( Hat = infl$hat, Cooks_Distance = stats::cooks.distance(model, infl), Fitted = insight::get_predicted(model, ci = NULL), - Residuals = resid, + Residuals = model_resid, Std_Residuals = std_resid, stringsAsFactors = FALSE ) @@ -213,12 +211,10 @@ .diag_ncv <- function(model, verbose = TRUE) { ncv <- tryCatch( - { - data.frame( - x = as.numeric(stats::fitted(model)), - y = as.numeric(stats::residuals(model)) - ) - }, + data.frame( + x = as.numeric(stats::fitted(model)), + y = as.numeric(stats::residuals(model)) + ), error = function(e) { NULL } @@ -244,24 +240,22 @@ .diag_homogeneity <- function(model, verbose = TRUE) { faminfo <- insight::model_info(model) r <- tryCatch( - { - if (inherits(model, "merMod")) { - stats::residuals(model, scaled = TRUE) - } else if (inherits(model, "gam")) { - stats::residuals(model, type = "scaled.pearson") - } else if (inherits(model, c("glmmTMB", "MixMod"))) { - sigma <- if (faminfo$is_mixed) { - sqrt(insight::get_variance_residual(model)) - } else { - .sigma_glmmTMB_nonmixed(model, faminfo) - } - stats::residuals(model) / sigma - } else if (inherits(model, "glm")) { - ## TODO: check if we can / should use deviance residuals (as for QQ plots) here as well? - stats::rstandard(model, type = "pearson") + if (inherits(model, "merMod")) { + stats::residuals(model, scaled = TRUE) + } else if (inherits(model, "gam")) { + stats::residuals(model, type = "scaled.pearson") + } else if (inherits(model, c("glmmTMB", "MixMod"))) { + residual_sigma <- if (faminfo$is_mixed) { + sqrt(insight::get_variance_residual(model)) } else { - stats::rstandard(model) + .sigma_glmmTMB_nonmixed(model, faminfo) } + stats::residuals(model) / residual_sigma + } else if (inherits(model, "glm")) { + ## TODO: check if we can / should use deviance residuals (as for QQ plots) here as well? + stats::rstandard(model, type = "pearson") + } else { + stats::rstandard(model) }, error = function(e) { NULL @@ -302,11 +296,26 @@ # data for negative binomial models if (faminfo$is_negbin && !faminfo$is_zero_inflated) { - d <- data.frame(Predicted = stats::predict(model, type = "response")) - d$Residuals <- insight::get_response(model) - as.vector(d$Predicted) - d$Res2 <- d$Residuals^2 - d$V <- d$Predicted * (1 + d$Predicted / insight::get_sigma(model)) - d$StdRes <- insight::get_residuals(model, type = "pearson") + if (inherits(model, "glmmTMB")) { + d <- data.frame(Predicted = stats::predict(model, type = "response")) + d$Residuals <- insight::get_residuals(model, type = "pearson") + d$Res2 <- d$Residuals^2 + d$StdRes <- insight::get_residuals(model, type = "pearson") + if (faminfo$family == "nbinom1") { + # for nbinom1, we can use "sigma()" + d$V <- insight::get_sigma(model)^2 * stats::family(model)$variance(d$Predicted) + } else { + # for nbinom2, "sigma()" has "inverse meaning" (see #654) + d$V <- (1 / insight::get_sigma(model)^2) * stats::family(model)$variance(d$Predicted) + } + } else { + ## FIXME: this is not correct for glm.nb models? 
+ d <- data.frame(Predicted = stats::predict(model, type = "response")) + d$Residuals <- insight::get_response(model) - as.vector(d$Predicted) + d$Res2 <- d$Residuals^2 + d$V <- d$Predicted * (1 + d$Predicted / insight::get_sigma(model)) + d$StdRes <- insight::get_residuals(model, type = "pearson") + } } # data for zero-inflated poisson models diff --git a/R/check_outliers.R b/R/check_outliers.R index 2a1d185d3..2017d4cc5 100644 --- a/R/check_outliers.R +++ b/R/check_outliers.R @@ -23,9 +23,11 @@ #' 'Details'). If a numeric value is given, it will be used as the threshold #' for any of the method run. #' @param ID Optional, to report an ID column along with the row number. +#' @param verbose Toggle warnings. #' @param ... When `method = "ics"`, further arguments in `...` are passed #' down to [ICSOutlier::ics.outlier()]. When `method = "mahalanobis"`, -#' they are passed down to [stats::mahalanobis()]. +#' they are passed down to [stats::mahalanobis()]. `percentage_central` can +#' be specified when `method = "mcd"`. #' #' @return A logical vector of the detected outliers with a nice printing #' method: a check (message) on whether outliers were detected or not. The @@ -160,6 +162,9 @@ #' the data (by default, 66\%), before computing the Mahalanobis Distance. This #' is deemed to be a more robust method of identifying and removing outliers #' than regular Mahalanobis distance. +#' This method has a `percentage_central` argument that allows specifying +#' the breakdown point (0.75, the default, is recommended by Leys et al. 2018, +#' but a commonly used alternative is 0.50). #' #' - **Invariant Coordinate Selection (ICS)**: #' The outlier are detected using ICS, which by default uses an alpha threshold @@ -308,7 +313,7 @@ #' @examplesIf require("see") && require("bigutilsr") && require("loo") && require("MASS") && require("ICSOutlier") && require("ICS") && require("dbscan") #' \donttest{ #' # You can also run all the methods -#' check_outliers(data, method = "all") +#' check_outliers(data, method = "all", verbose = FALSE) #' #' # For statistical models --------------------------------------------- #' # select only mpg and disp (continuous) @@ -344,6 +349,7 @@ check_outliers.default <- function(x, method = c("cook", "pareto"), threshold = NULL, ID = NULL, + verbose = TRUE, ...) { # Check args if (all(method == "all")) { @@ -385,16 +391,31 @@ check_outliers.default <- function(x, ) # Get data - data <- insight::get_data(x, verbose = FALSE) + my_data <- insight::get_data(x, verbose = FALSE) + + # sanity check for date, POSIXt and difftime variables + if (any(vapply(my_data, inherits, FUN.VALUE = logical(1), what = c("Date", "POSIXt", "difftime"))) && verbose) { + insight::format_alert( + paste( + "Date variables are not supported for outliers detection. These will be ignored.", + "Make sure any date variables are converted to numeric or factor {.b before} fitting the model." + ) + ) + } # Remove non-numerics - data <- datawizard::data_select(data, select = is.numeric) + my_data <- datawizard::data_select(my_data, select = is.numeric, verbose = FALSE) + + # check if any data left + if (is.null(my_data) || ncol(my_data) == 0) { + insight::format_error("No numeric variables found. 
No data to check for outliers.") + } # Thresholds if (is.null(threshold)) { - thresholds <- .check_outliers_thresholds(data) + thresholds <- .check_outliers_thresholds(my_data) } else if (is.list(threshold)) { - thresholds <- .check_outliers_thresholds(data) + thresholds <- .check_outliers_thresholds(my_data) thresholds[names(threshold)] <- threshold[names(threshold)] } else { insight::format_error( @@ -405,21 +426,21 @@ check_outliers.default <- function(x, ) } - if (!missing(ID)) { + if (!missing(ID) && verbose) { insight::format_warning(paste0("ID argument not supported for model objects of class `", class(x)[1], "`.")) } # Others if (all(method %in% c("cook", "pareto"))) { - df <- data.frame(Row = seq_len(nrow(as.data.frame(data)))) + my_df <- data.frame(Row = seq_len(nrow(as.data.frame(my_data)))) outlier_count <- list() outlier_var <- list() } else { - out <- check_outliers(data, method, threshold) + out <- check_outliers(my_data, method, threshold) outlier_var <- attributes(out)$outlier_var outlier_count <- attributes(out)$outlier_count - df <- attributes(out)$data - df <- df[!names(df) == "Outlier"] + my_df <- attributes(out)$data + my_df <- my_df[names(my_df) != "Outlier"] } # Cook @@ -429,7 +450,7 @@ check_outliers.default <- function(x, threshold = thresholds$cook )$data_cook - df <- datawizard::data_merge(list(df, data_cook), + my_df <- datawizard::data_merge(list(my_df, data_cook), join = "full", by = "Row" ) @@ -459,7 +480,7 @@ check_outliers.default <- function(x, ) } } else { - method <- method[!(method == "cook")] + method <- method[method != "cook"] } # Pareto @@ -469,7 +490,7 @@ check_outliers.default <- function(x, threshold = thresholds$pareto )$data_pareto - df <- datawizard::data_merge(list(df, data_pareto), + my_df <- datawizard::data_merge(list(my_df, data_pareto), join = "full", by = "Row" ) @@ -499,7 +520,7 @@ check_outliers.default <- function(x, ) } } else { - method <- method[!(method == "pareto")] + method <- method[method != "pareto"] } outlier_count$all <- datawizard::convert_na_to(outlier_count$all, @@ -531,21 +552,21 @@ check_outliers.default <- function(x, thresholds <- thresholds[names(thresholds) %in% method] # Composite outlier score - df$Outlier <- rowMeans(df[grepl("Outlier_", names(df), fixed = TRUE)]) - df <- df[c(names(df)[names(df) != "Outlier"], "Outlier")] + my_df$Outlier <- rowMeans(my_df[grepl("Outlier_", names(my_df), fixed = TRUE)]) + my_df <- my_df[c(names(my_df)[names(my_df) != "Outlier"], "Outlier")] # Out - outlier <- df$Outlier > 0.5 + outlier <- my_df$Outlier > 0.5 # Attributes class(outlier) <- c("check_outliers", "see_check_outliers", class(outlier)) - attr(outlier, "data") <- df + attr(outlier, "data") <- my_df attr(outlier, "threshold") <- thresholds attr(outlier, "method") <- method attr(outlier, "text_size") <- 3 attr(outlier, "influential_obs") <- .influential_obs(x) attr(outlier, "variables") <- "(Whole model)" - attr(outlier, "raw_data") <- data + attr(outlier, "raw_data") <- my_data attr(outlier, "outlier_var") <- outlier_var attr(outlier, "outlier_count") <- outlier_count @@ -798,7 +819,7 @@ check_outliers.data.frame <- function(x, ) # Remove non-numerics - data <- x + my_data <- x x <- x[, vapply(x, is.numeric, logical(1)), drop = FALSE] # Check args @@ -841,20 +862,20 @@ check_outliers.data.frame <- function(x, outlier_var <- out.meta$outlier_var # Combine outlier data - df <- out[vapply(out, is.data.frame, logical(1))] - if (length(df) > 1 && !is.null(ID)) { - df <- datawizard::data_merge(df, by = c("Row", ID)) - } else 
if (length(df) > 1) { - df <- datawizard::data_merge(df, by = "Row") + my_df <- out[vapply(out, is.data.frame, logical(1))] + if (length(my_df) > 1 && !is.null(ID)) { + my_df <- datawizard::data_merge(my_df, by = c("Row", ID)) + } else if (length(my_df) > 1) { + my_df <- datawizard::data_merge(my_df, by = "Row") } else { - df <- df[[1]] + my_df <- my_df[[1]] } # Composite outlier score - df$Outlier <- rowMeans(df[grepl("Outlier_", names(df), fixed = TRUE)]) + my_df$Outlier <- rowMeans(my_df[grepl("Outlier_", names(my_df), fixed = TRUE)]) # Out - outlier <- df$Outlier > 0.5 + outlier <- my_df$Outlier > 0.5 # Combine outlier frequency table if (length(outlier_count) > 1 && !is.null(ID)) { @@ -896,12 +917,12 @@ check_outliers.data.frame <- function(x, # Attributes class(outlier) <- c("check_outliers", "see_check_outliers", class(outlier)) - attr(outlier, "data") <- df + attr(outlier, "data") <- my_df attr(outlier, "threshold") <- thresholds attr(outlier, "method") <- method attr(outlier, "text_size") <- 3 attr(outlier, "variables") <- names(x) - attr(outlier, "raw_data") <- data + attr(outlier, "raw_data") <- my_data attr(outlier, "outlier_var") <- outlier_var attr(outlier, "outlier_count") <- outlier_count outlier @@ -914,7 +935,7 @@ check_outliers.data.frame <- function(x, outlier.list <- lapply(outlier.list, function(x) { x[x[[Outlier_method]] >= 0.5, ] }) - outlier.list <- outlier.list[lapply(outlier.list, nrow) > 0] + outlier.list <- outlier.list[vapply(outlier.list, nrow, numeric(1)) > 0] outlier.list <- lapply(outlier.list, datawizard::data_remove, Outlier_method, as_data_frame = TRUE @@ -1098,8 +1119,8 @@ check_outliers.data.frame <- function(x, out <- c(out, .check_outliers_mcd( x, threshold = thresholds$mcd, - percentage_central = 0.66, - ID.names = ID.names + ID.names = ID.names, + ... )) count.table <- datawizard::data_filter( @@ -1217,7 +1238,7 @@ check_outliers.grouped_df <- function(x, } # Initialize elements - data <- data.frame() + my_data <- data.frame() out <- NULL thresholds <- list() outlier_var <- list() @@ -1226,24 +1247,24 @@ check_outliers.grouped_df <- function(x, # Loop through groups for (i in seq_along(grps)) { rows <- grps[[i]] - subset <- check_outliers( + outliers_subset <- check_outliers( as.data.frame(x[rows, ]), method = method, threshold = threshold, ID = ID, ... 
) - data <- rbind(data, as.data.frame(subset)) - out <- c(out, subset) - thresholds[[paste0("group_", i)]] <- attributes(subset)$threshold + my_data <- rbind(my_data, as.data.frame(outliers_subset)) + out <- c(out, outliers_subset) + thresholds[[paste0("group_", i)]] <- attributes(outliers_subset)$threshold outlier_var[[i]] <- lapply( - attributes(subset)$outlier_var, lapply, function(y) { + attributes(outliers_subset)$outlier_var, lapply, function(y) { y$Row <- rows[which(seq_along(rows) %in% y$Row)] y } ) outlier_count[[i]] <- lapply( - attributes(subset)$outlier_count, function(y) { + attributes(outliers_subset)$outlier_count, function(y) { y$Row <- rows[which(seq_along(rows) %in% y$Row)] y } @@ -1264,16 +1285,16 @@ check_outliers.grouped_df <- function(x, info$groups$.rows[[x]] <- as.data.frame(info$groups$.rows[[x]]) }) - data[names(info$groups)[1]] <- do.call(rbind, groups) - data <- datawizard::data_relocate( - data, + my_data[names(info$groups)[1]] <- do.call(rbind, groups) + my_data <- datawizard::data_relocate( + my_data, select = names(info$groups)[1], after = "Row" ) - data$Row <- seq_len(nrow(data)) + my_data$Row <- seq_len(nrow(my_data)) class(out) <- c("check_outliers", "see_check_outliers", class(out)) - attr(out, "data") <- data + attr(out, "data") <- my_data attr(out, "method") <- method attr(out, "threshold") <- thresholds[[1]] attr(out, "text_size") <- 3 @@ -1434,7 +1455,7 @@ check_outliers.metabin <- check_outliers.metagen bci <- 1 - 0.001 cook <- stats::qf(0.5, ncol(x), nrow(x) - ncol(x)) pareto <- 0.7 - mahalanobis <- stats::qchisq(p = 1 - 0.001, df = ncol(x)) + mahalanobis_value <- stats::qchisq(p = 1 - 0.001, df = ncol(x)) mahalanobis_robust <- stats::qchisq(p = 1 - 0.001, df = ncol(x)) mcd <- stats::qchisq(p = 1 - 0.001, df = ncol(x)) ics <- 0.001 @@ -1451,7 +1472,7 @@ check_outliers.metabin <- check_outliers.metagen bci = bci, cook = cook, pareto = pareto, - mahalanobis = mahalanobis, + mahalanobis = mahalanobis_value, mahalanobis_robust = mahalanobis_robust, mcd = mcd, ics = ics, @@ -1726,14 +1747,31 @@ check_outliers.metabin <- check_outliers.metagen .check_outliers_mcd <- function(x, threshold = stats::qchisq(p = 1 - 0.001, df = ncol(x)), - percentage_central = 0.50, - ID.names = NULL) { + percentage_central = 0.75, + ID.names = NULL, + verbose = TRUE, + ...) { out <- data.frame(Row = seq_len(nrow(x))) if (!is.null(ID.names)) { out <- cbind(out, ID.names) } + # check whether N to p ratio is not too large, else MCD flags too many outliers + # See #672: This does seem to be a function of the N/p (N = sample size; p = + # number of parameters) ratio. When it is larger than 10, the % of outliers + # flagged is okay (in well behaved data). This makes sense: the MCD looks at + # the cov matrix of subsamples of the data - with high dimensional data, small + # samples sizes will give highly variable cov matrices, as so the "smallest" + # one will probably miss-represent the data. + + if ((nrow(x) / ncol(x)) <= 10 && isTRUE(verbose)) { + insight::format_warning( + "The sample size is too small in your data, relative to the number of variables, for MCD to be reliable.", + "You may try to increase the `percentage_central` argument (must be between 0 and 1), or choose another method." 
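      # Worked instance of this heuristic: `mtcars` has 32 rows and 11 columns, so
      # nrow(x) / ncol(x) = 32 / 11 (about 2.9) <= 10 and the warning above is raised;
      # see the corresponding test added in test-check_outliers.R in this patch.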
+ ) + } + insight::check_if_installed("MASS") # Compute @@ -1949,3 +1987,12 @@ check_outliers.lmrob <- check_outliers.glmmTMB #' @export check_outliers.glmrob <- check_outliers.glmmTMB + +#' @export +check_outliers.rq <- check_outliers.glmmTMB + +#' @export +check_outliers.rqs <- check_outliers.glmmTMB + +#' @export +check_outliers.rqss <- check_outliers.glmmTMB diff --git a/R/check_singularity.R b/R/check_singularity.R index 3128ef933..b446708bd 100644 --- a/R/check_singularity.R +++ b/R/check_singularity.R @@ -101,15 +101,24 @@ check_singularity.merMod <- function(x, tolerance = 1e-5, ...) { check_singularity.rlmerMod <- check_singularity.merMod - #' @export check_singularity.glmmTMB <- function(x, tolerance = 1e-5, ...) { insight::check_if_installed("lme4") - vc <- .collapse_cond(lme4::VarCorr(x)) - any(sapply(vc, function(.x) any(abs(diag(.x)) < tolerance))) + eigen_values <- list() + vv <- lme4::VarCorr(x) + for (component in c("cond", "zi")) { + for (i in seq_along(vv[[component]])) { + eigen_values <- c( + eigen_values, + list(eigen(vv[[component]][[i]], only.values = TRUE)$values) + ) + } + } + any(vapply(eigen_values, min, numeric(1), na.rm = TRUE) < tolerance) } + #' @export check_singularity.glmmadmb <- check_singularity.glmmTMB diff --git a/R/icc.R b/R/icc.R index 46c1a331a..47fbb9a45 100644 --- a/R/icc.R +++ b/R/icc.R @@ -532,14 +532,12 @@ print.icc_decomposed <- function(x, digits = 2, ...) { name_full = "ICC", verbose = TRUE) { vars <- tryCatch( - { - insight::get_variance(model, - name_fun = name_fun, - name_full = name_full, - tolerance = tolerance, - verbose = verbose - ) - }, + insight::get_variance(model, + name_fun = name_fun, + name_full = name_full, + tolerance = tolerance, + verbose = verbose + ), error = function(e) { if (inherits(e, c("simpleError", "error")) && verbose) { insight::print_color(e$message, "red") @@ -597,7 +595,7 @@ print.icc_decomposed <- function(x, digits = 2, ...) { # prepare arguments for "lme4::bootMer" .do_lme4_bootmer <- function(model, .boot_fun, iterations, dots) { insight::check_if_installed(c("lme4", "boot")) - args <- list( + my_args <- list( model, .boot_fun, nsim = iterations, @@ -608,25 +606,25 @@ print.icc_decomposed <- function(x, digits = 2, ...) { ) # add/overwrite dot-args if (!is.null(dots[["use.u"]])) { - args$use.u <- dots[["use.u"]] + my_args$use.u <- dots[["use.u"]] } if (!is.null(dots[["re.form"]])) { - args$re.form <- dots[["re.form"]] + my_args$re.form <- dots[["re.form"]] } if (!is.null(dots[["type"]])) { - args$type <- dots[["type"]] - if (args$type == "semiparametric") { - args$use.u <- TRUE + my_args$type <- dots[["type"]] + if (my_args$type == "semiparametric") { + my_args$use.u <- TRUE } } if (!is.null(dots[["parallel"]])) { - args$parallel <- dots[["parallel"]] + my_args$parallel <- dots[["parallel"]] } if (!is.null(dots[["ncpus"]])) { - args$ncpus <- dots[["ncpus"]] + my_args$ncpus <- dots[["ncpus"]] } # bootsrap - do.call(lme4::bootMer, args) + do.call(lme4::bootMer, args = my_args) } @@ -664,12 +662,10 @@ print.icc_decomposed <- function(x, digits = 2, ...) 
{ } model_rank <- tryCatch( - { - if (!is.null(model$rank)) { - model$rank - df_int - } else { - insight::n_parameters(model) - df_int - } + if (!is.null(model$rank)) { + model$rank - df_int + } else { + insight::n_parameters(model) - df_int }, error = function(e) insight::n_parameters(model) - df_int ) diff --git a/R/model_performance.bayesian.R b/R/model_performance.bayesian.R index fd796fb60..2ba4e5130 100644 --- a/R/model_performance.bayesian.R +++ b/R/model_performance.bayesian.R @@ -40,7 +40,7 @@ #' #' - **PCP**: percentage of correct predictions, see [performance_pcp()]. #' -#' @examplesIf require("rstanarm") && require("rstantools") && require("BayesFactor") +#' @examplesIf require("rstanarm") && require("rstantools") #' \donttest{ #' model <- suppressWarnings(rstanarm::stan_glm( #' mpg ~ wt + cyl, @@ -59,12 +59,6 @@ #' refresh = 0 #' )) #' model_performance(model) -#' -#' model <- BayesFactor::generalTestBF(carb ~ am + mpg, mtcars) -#' -#' model_performance(model) -#' model_performance(model[3]) -#' model_performance(model, average = TRUE) #' } #' @seealso [r2_bayes] #' @references Gelman, A., Goodrich, B., Gabry, J., and Vehtari, A. (2018). diff --git a/R/r2.R b/R/r2.R index dacdef614..2789b5efe 100644 --- a/R/r2.R +++ b/R/r2.R @@ -896,12 +896,10 @@ r2.DirichletRegModel <- function(model, ...) { m <- sum(w * f / sum(w)) mss <- sum(w * (f - m)^2) } + } else if (is.null(w)) { + mss <- sum(f^2) } else { - if (is.null(w)) { - mss <- sum(f^2) - } else { - mss <- sum(w * f^2) - } + mss <- sum(w * f^2) } if (is.null(w)) { rss <- sum(r^2) diff --git a/R/r2_bayes.R b/R/r2_bayes.R index 24e8269ba..ec98f754b 100644 --- a/R/r2_bayes.R +++ b/R/r2_bayes.R @@ -30,7 +30,7 @@ #' `r2_posterior()` is the actual workhorse for `r2_bayes()` and #' returns a posterior sample of Bayesian R2 values. #' -#' @examplesIf require("rstanarm") && require("rstantools") && require("BayesFactor") && require("brms") +#' @examplesIf require("rstanarm") && require("rstantools") && require("brms") #' library(performance) #' \donttest{ #' model <- suppressWarnings(rstanarm::stan_glm( @@ -53,24 +53,6 @@ #' r2_bayes(model) #' } #' -#' BFM <- BayesFactor::generalTestBF(mpg ~ qsec + gear, data = mtcars, progress = FALSE) -#' FM <- BayesFactor::lmBF(mpg ~ qsec + gear, data = mtcars) -#' -#' r2_bayes(FM) -#' r2_bayes(BFM[3]) -#' r2_bayes(BFM, average = TRUE) # across all models -#' -#' # with random effects: -#' mtcars$gear <- factor(mtcars$gear) -#' model <- BayesFactor::lmBF( -#' mpg ~ hp + cyl + gear + gear:wt, -#' mtcars, -#' progress = FALSE, -#' whichRandom = c("gear", "gear:wt") -#' ) -#' -#' r2_bayes(model) -#' #' \donttest{ #' model <- suppressWarnings(brms::brm( #' mpg ~ wt + cyl, @@ -201,28 +183,26 @@ r2_posterior.brmsfit <- function(model, verbose = TRUE, ...) 
{ }) names(br2) <- res } + } else if (mi$is_mixed) { + br2 <- list( + R2_Bayes = as.vector(rstantools::bayes_R2( + model, + re.form = NULL, + re_formula = NULL, + summary = FALSE + )), + R2_Bayes_marginal = as.vector(rstantools::bayes_R2( + model, + re.form = NA, + re_formula = NA, + summary = FALSE + )) + ) + names(br2$R2_Bayes) <- rep("Conditional R2", length(br2$R2_Bayes)) + names(br2$R2_Bayes_marginal) <- rep("Marginal R2", length(br2$R2_Bayes)) } else { - if (mi$is_mixed) { - br2 <- list( - R2_Bayes = as.vector(rstantools::bayes_R2( - model, - re.form = NULL, - re_formula = NULL, - summary = FALSE - )), - R2_Bayes_marginal = as.vector(rstantools::bayes_R2( - model, - re.form = NA, - re_formula = NA, - summary = FALSE - )) - ) - names(br2$R2_Bayes) <- rep("Conditional R2", length(br2$R2_Bayes)) - names(br2$R2_Bayes_marginal) <- rep("Marginal R2", length(br2$R2_Bayes)) - } else { - br2 <- list(R2_Bayes = as.vector(rstantools::bayes_R2(model, summary = FALSE))) - names(br2$R2_Bayes) <- rep("R2", length(br2$R2_Bayes)) - } + br2 <- list(R2_Bayes = as.vector(rstantools::bayes_R2(model, summary = FALSE))) + names(br2$R2_Bayes) <- rep("R2", length(br2$R2_Bayes)) } br2 @@ -400,7 +380,7 @@ as.data.frame.r2_bayes <- function(x, ...) { # remove sig and g cols params_theta <- params[, !grepl(pattern = "^sig2$|^g_|^g$", colnames(params))] - params_sigma <- sqrt(params[, grepl(pattern = "^sig2$", colnames(params))]) + params_sigma <- sqrt(params[, colnames(params) == "sig2"]) # Model Matrix mm <- insight::get_modelmatrix(model[1]) diff --git a/R/r2_coxsnell.R b/R/r2_coxsnell.R index d56803f5c..042e5461a 100644 --- a/R/r2_coxsnell.R +++ b/R/r2_coxsnell.R @@ -69,20 +69,20 @@ r2_coxsnell.glm <- function(model, verbose = TRUE, ...) { if (is.null(info)) { info <- suppressWarnings(insight::model_info(model, verbose = FALSE)) } + # Cox & Snell's R2 is not defined for binomial models that are not Bernoulli models if (info$is_binomial && !info$is_bernoulli && class(model)[1] == "glm") { if (verbose) { insight::format_alert("Can't calculate accurate R2 for binomial models that are not Bernoulli models.") } return(NULL) - } else { - # if no deviance, return NA - if (is.null(model$deviance)) { - return(NULL) - } - r2_coxsnell <- (1 - exp((model$deviance - model$null.deviance) / insight::n_obs(model, disaggregate = TRUE))) - names(r2_coxsnell) <- "Cox & Snell's R2" - r2_coxsnell } + # if no deviance, return NULL + if (is.null(model$deviance)) { + return(NULL) + } + r2_coxsnell <- (1 - exp((model$deviance - model$null.deviance) / insight::n_obs(model, disaggregate = TRUE))) + names(r2_coxsnell) <- "Cox & Snell's R2" + r2_coxsnell } #' @export @@ -95,22 +95,22 @@ r2_coxsnell.glmmTMB <- function(model, verbose = TRUE, ...) 
{ if (is.null(info)) { info <- suppressWarnings(insight::model_info(model, verbose = FALSE)) } + # Cox & Snell's R2 is not defined for binomial models that are not Bernoulli models if (info$is_binomial && !info$is_bernoulli) { if (verbose) { insight::format_alert("Can't calculate accurate R2 for binomial models that are not Bernoulli models.") } return(NULL) - } else { - dev <- stats::deviance(model) - # if no deviance, return NA - if (is.null(dev)) { - return(NULL) - } - null_dev <- stats::deviance(insight::null_model(model)) - r2_coxsnell <- (1 - exp((dev - null_dev) / insight::n_obs(model, disaggregate = TRUE))) - names(r2_coxsnell) <- "Cox & Snell's R2" - r2_coxsnell } + dev <- stats::deviance(model) + # if no deviance, return NULL + if (is.null(dev)) { + return(NULL) + } + null_dev <- stats::deviance(insight::null_model(model)) + r2_coxsnell <- (1 - exp((dev - null_dev) / insight::n_obs(model, disaggregate = TRUE))) + names(r2_coxsnell) <- "Cox & Snell's R2" + r2_coxsnell } diff --git a/R/r2_kl.R b/R/r2_kl.R index b5c1bb649..553e338e4 100644 --- a/R/r2_kl.R +++ b/R/r2_kl.R @@ -7,6 +7,7 @@ #' @param model A generalized linear model. #' @param adjust Logical, if `TRUE` (the default), the adjusted R2 value is #' returned. +#' @param ... Additional arguments. Currently not used. #' #' @return A named vector with the R2 value. #' @@ -19,7 +20,13 @@ #' 77: 329-342. #' #' @export -r2_kullback <- function(model, adjust = TRUE) { +r2_kullback <- function(model, ...) { + UseMethod("r2_kullback") +} + +#' @rdname r2_kullback +#' @export +r2_kullback.glm <- function(model, adjust = TRUE, ...) { if (adjust) { adj <- model$df.null / model$df.residual } else { @@ -31,3 +38,8 @@ r2_kullback <- function(model, adjust = TRUE) { names(klr2) <- "Kullback-Leibler R2" klr2 } + +#' @export +r2_kullback.default <- function(model, ...) { + insight::format_error("This function only works for objects of class `glm`.") +} diff --git a/R/r2_loo.R b/R/r2_loo.R index 040d9b572..dbcaf38b3 100644 --- a/R/r2_loo.R +++ b/R/r2_loo.R @@ -50,10 +50,10 @@ r2_loo <- function(model, robust = TRUE, ci = 0.95, verbose = TRUE, ...) { loo_r2 <- structure( class = "r2_loo", lapply(loo_r2, ifelse(robust, stats::median, mean)), - "SE" = lapply(loo_r2, ifelse(robust, stats::mad, stats::sd)), + SE = lapply(loo_r2, ifelse(robust, stats::mad, stats::sd)), # "Estimates" = lapply(r2_bayesian, bayestestR::point_estimate, centrality = "all", dispersion = TRUE), - "CI" = lapply(loo_r2, bayestestR::hdi, ci = ci), - "robust" = robust + CI = lapply(loo_r2, bayestestR::hdi, ci = ci), + robust = robust ) return(loo_r2) } @@ -84,13 +84,13 @@ r2_loo_posterior.brmsfit <- function(model, verbose = TRUE, ...) { res <- insight::find_response(model) if (mi[[1]]$is_mixed) { br2_mv <- list( - "R2_loo" = rstantools::loo_R2( + R2_loo = rstantools::loo_R2( model, re.form = NULL, re_formula = NULL, summary = FALSE ), - "R2_loo_marginal" = rstantools::loo_R2( + R2_loo_marginal = rstantools::loo_R2( model, re.form = NA, re_formula = NA, @@ -99,40 +99,38 @@ r2_loo_posterior.brmsfit <- function(model, verbose = TRUE, ...) 
{ ) br2 <- lapply(seq_along(res), function(x) { list( - "R2_loo" = unname(as.vector(br2_mv$R2_loo[, x])), - "R2_loo_marginal" = unname(as.vector(br2_mv$R2_loo_marginal[, x])) + R2_loo = unname(as.vector(br2_mv$R2_loo[, x])), + R2_loo_marginal = unname(as.vector(br2_mv$R2_loo_marginal[, x])) ) }) names(br2) <- res } else { - br2_mv <- list("R2_loo" = rstantools::loo_R2(model, summary = FALSE)) + br2_mv <- list(R2_loo = rstantools::loo_R2(model, summary = FALSE)) br2 <- lapply(seq_along(res), function(x) { - list("R2_loo" = unname(as.vector(br2_mv$R2_loo[, x]))) + list(R2_loo = unname(as.vector(br2_mv$R2_loo[, x]))) }) names(br2) <- res } + } else if (mi$is_mixed) { + br2 <- list( + R2_loo = as.vector(rstantools::loo_R2( + model, + re.form = NULL, + re_formula = NULL, + summary = FALSE + )), + R2_loo_marginal = as.vector(rstantools::loo_R2( + model, + re.form = NA, + re_formula = NA, + summary = FALSE + )) + ) + names(br2$R2_loo) <- rep("Conditional R2_adjusted", length(br2$R2_loo)) + names(br2$R2_loo_marginal) <- rep("Marginal R2_adjusted", length(br2$R2_loo)) } else { - if (mi$is_mixed) { - br2 <- list( - "R2_loo" = as.vector(rstantools::loo_R2( - model, - re.form = NULL, - re_formula = NULL, - summary = FALSE - )), - "R2_loo_marginal" = as.vector(rstantools::loo_R2( - model, - re.form = NA, - re_formula = NA, - summary = FALSE - )) - ) - names(br2$R2_loo) <- rep("Conditional R2_adjusted", length(br2$R2_loo)) - names(br2$R2_loo_marginal) <- rep("Marginal R2_adjusted", length(br2$R2_loo)) - } else { - br2 <- list("R2_loo" = as.vector(rstantools::loo_R2(model, summary = FALSE))) - names(br2$R2_loo) <- rep("R2_adjusted", length(br2$R2_loo)) - } + br2 <- list(R2_loo = as.vector(rstantools::loo_R2(model, summary = FALSE))) + names(br2$R2_loo) <- rep("R2_adjusted", length(br2$R2_loo)) } br2 diff --git a/R/test_bf.R b/R/test_bf.R index 60cb3a129..38b0e421d 100644 --- a/R/test_bf.R +++ b/R/test_bf.R @@ -9,21 +9,21 @@ test_bf <- function(...) { #' @export test_bf.default <- function(..., reference = 1, text_length = NULL) { # Attribute class to list and get names from the global environment - objects <- insight::ellipsis_info(..., only_models = TRUE) - names(objects) <- match.call(expand.dots = FALSE)$`...` + my_objects <- insight::ellipsis_info(..., only_models = TRUE) + names(my_objects) <- match.call(expand.dots = FALSE)[["..."]] # validation checks (will throw error if non-valid objects) - .test_performance_checks(objects, multiple = FALSE) + .test_performance_checks(objects = my_objects, multiple = FALSE) - if (length(objects) == 1 && isTRUE(insight::is_model(objects))) { + if (length(my_objects) == 1 && isTRUE(insight::is_model(my_objects))) { insight::format_error( - "`test_bf()` is designed to compare multiple models together. For a single model, you might want to run `bayestestR::bf_parameters()` instead." + "`test_bf()` is designed to compare multiple models together. For a single model, you might want to run `bayestestR::bf_parameters()` instead." 
# nolint ) } # If a suitable class is found, run the more specific method on it - if (inherits(objects, c("ListNestedRegressions", "ListNonNestedRegressions", "ListLavaan"))) { - test_bf(objects, reference = reference, text_length = text_length) + if (inherits(my_objects, c("ListNestedRegressions", "ListNonNestedRegressions", "ListLavaan"))) { + test_bf(my_objects, reference = reference, text_length = text_length) } else { insight::format_error("The models cannot be compared for some reason :/") } @@ -87,9 +87,9 @@ test_bf.ListModels <- function(objects, reference = 1, text_length = NULL, ...) if (all(bayesian_models)) { "yes" - } else if (!all(bayesian_models)) { - "no" - } else { + } else if (any(bayesian_models)) { "mixed" + } else { + "no" } } diff --git a/R/test_likelihoodratio.R b/R/test_likelihoodratio.R index 9784b13a9..529516487 100644 --- a/R/test_likelihoodratio.R +++ b/R/test_likelihoodratio.R @@ -22,29 +22,29 @@ test_lrt <- test_likelihoodratio #' @export test_likelihoodratio.default <- function(..., estimator = "OLS", verbose = TRUE) { # Attribute class to list - objects <- insight::ellipsis_info(..., only_models = TRUE) + my_objects <- insight::ellipsis_info(..., only_models = TRUE) # validation checks (will throw error if non-valid objects) - objects <- .test_performance_checks(objects, verbose = verbose) + my_objects <- .test_performance_checks(my_objects, verbose = verbose) # different default when mixed model or glm is included if (missing(estimator)) { - mixed_models <- sapply(objects, insight::is_mixed_model) - if (all(mixed_models) && all(sapply(objects, .is_lmer_reml)) && isTRUE(attributes(objects)$same_fixef)) { + mixed_models <- sapply(my_objects, insight::is_mixed_model) + if (all(mixed_models) && all(sapply(my_objects, .is_lmer_reml)) && isTRUE(attributes(my_objects)$same_fixef)) { estimator <- "REML" - } else if (any(mixed_models) || !all(attributes(objects)$is_linear)) { + } else if (any(mixed_models) || !all(attributes(my_objects)$is_linear)) { estimator <- "ML" } } # ensure proper object names - objects <- .check_objectnames(objects, sapply(match.call(expand.dots = FALSE)$`...`, as.character)) + my_objects <- .check_objectnames(my_objects, sapply(match.call(expand.dots = FALSE)[["..."]], as.character)) # If a suitable class is found, run the more specific method on it - if (inherits(objects, "ListNestedRegressions")) { - test_likelihoodratio(objects, estimator = estimator) - } else if (inherits(objects, "ListLavaan")) { - test_likelihoodratio_ListLavaan(..., objects = objects) # Because lavaanLRT requires the ellipsis + if (inherits(my_objects, "ListNestedRegressions")) { + test_likelihoodratio(my_objects, estimator = estimator) + } else if (inherits(my_objects, "ListLavaan")) { + test_likelihoodratio_ListLavaan(..., objects = my_objects) # Because lavaanLRT requires the ellipsis } else { insight::format_error( "The models are not nested, which is a prerequisite for `test_likelihoodratio()`.", @@ -106,7 +106,7 @@ test_likelihoodratio.ListNestedRegressions <- function(objects, estimator = "ML" same_fixef <- attributes(objects)$same_fixef # sort by df - if (!all(sort(dfs) == dfs) && !all(sort(dfs) == rev(dfs))) { + if (is.unsorted(dfs) && is.unsorted(rev(dfs))) { objects <- objects[order(dfs)] dfs <- sort(dfs, na.last = TRUE) } diff --git a/R/test_performance.R b/R/test_performance.R index b643143f7..f818b02dd 100644 --- a/R/test_performance.R +++ b/R/test_performance.R @@ -236,17 +236,17 @@ test_performance <- function(..., reference = 1, verbose = TRUE) { #' 
@export test_performance.default <- function(..., reference = 1, include_formula = FALSE, verbose = TRUE) { # Attribute class to list and get names from the global environment - objects <- insight::ellipsis_info(..., only_models = TRUE) + my_objects <- insight::ellipsis_info(..., only_models = TRUE) # validation checks (will throw error if non-valid objects) - objects <- .test_performance_checks(objects, verbose = verbose) + my_objects <- .test_performance_checks(my_objects, verbose = verbose) # ensure proper object names - objects <- .check_objectnames(objects, sapply(match.call(expand.dots = FALSE)$`...`, as.character)) + my_objects <- .check_objectnames(my_objects, sapply(match.call(expand.dots = FALSE)[["..."]], as.character)) # If a suitable class is found, run the more specific method on it - if (inherits(objects, c("ListNestedRegressions", "ListNonNestedRegressions", "ListLavaan"))) { - test_performance(objects, reference = reference, include_formula = include_formula) + if (inherits(my_objects, c("ListNestedRegressions", "ListNonNestedRegressions", "ListLavaan"))) { + test_performance(my_objects, reference = reference, include_formula = include_formula) } else { insight::format_error("The models cannot be compared for some reason :/") } @@ -421,10 +421,10 @@ test_performance.ListNonNestedRegressions <- function(objects, .test_performance_init <- function(objects, include_formula = FALSE) { - names <- insight::model_name(objects, include_formula = include_formula) + model_names <- insight::model_name(objects, include_formula = include_formula) out <- data.frame( Name = names(objects), - Model = names, + Model = model_names, stringsAsFactors = FALSE ) row.names(out) <- NULL @@ -453,7 +453,7 @@ test_performance.ListNonNestedRegressions <- function(objects, if (same_response && !inherits(objects, "ListLavaan") && isFALSE(attributes(objects)$same_response)) { insight::format_error( - "The models' dependent variables don't have the same data, which is a prerequisite to compare them. Probably the proportion of missing data differs between models." + "The models' dependent variables don't have the same data, which is a prerequisite to compare them. Probably the proportion of missing data differs between models." 
# nolint ) } diff --git a/R/test_vuong.R b/R/test_vuong.R index 449f55ddb..f0cce6d16 100644 --- a/R/test_vuong.R +++ b/R/test_vuong.R @@ -8,17 +8,17 @@ test_vuong <- function(..., verbose = TRUE) { #' @export test_vuong.default <- function(..., reference = 1, verbose = TRUE) { # Attribute class to list and get names from the global environment - objects <- insight::ellipsis_info(..., only_models = TRUE) + my_objects <- insight::ellipsis_info(..., only_models = TRUE) # validation checks (will throw error if non-valid objects) - objects <- .test_performance_checks(objects, verbose = verbose) + my_objects <- .test_performance_checks(my_objects, verbose = verbose) # ensure proper object names - objects <- .check_objectnames(objects, sapply(match.call(expand.dots = FALSE)$`...`, as.character)) + my_objects <- .check_objectnames(my_objects, sapply(match.call(expand.dots = FALSE)[["..."]], as.character)) # If a suitable class is found, run the more specific method on it - if (inherits(objects, c("ListNestedRegressions", "ListNonNestedRegressions", "ListLavaan"))) { - test_vuong(objects, reference = reference) + if (inherits(my_objects, c("ListNestedRegressions", "ListNonNestedRegressions", "ListLavaan"))) { + test_vuong(my_objects, reference = reference) } else { insight::format_error("The models cannot be compared for some reason :/") } diff --git a/R/test_wald.R b/R/test_wald.R index 0e043502a..ddfe21829 100644 --- a/R/test_wald.R +++ b/R/test_wald.R @@ -8,17 +8,17 @@ test_wald <- function(..., verbose = TRUE) { #' @export test_wald.default <- function(..., verbose = TRUE) { # Attribute class to list and get names from the global environment - objects <- insight::ellipsis_info(..., only_models = TRUE) + my_objects <- insight::ellipsis_info(..., only_models = TRUE) # validation checks (will throw error if non-valid objects) - objects <- .test_performance_checks(objects, verbose = verbose) + my_objects <- .test_performance_checks(my_objects, verbose = verbose) # ensure proper object names - objects <- .check_objectnames(objects, sapply(match.call(expand.dots = FALSE)$`...`, as.character)) + my_objects <- .check_objectnames(my_objects, sapply(match.call(expand.dots = FALSE)[["..."]], as.character)) # If a suitable class is found, run the more specific method on it - if (inherits(objects, c("ListNestedRegressions", "ListNonNestedRegressions", "ListLavaan"))) { - test_wald(objects) + if (inherits(my_objects, c("ListNestedRegressions", "ListNonNestedRegressions", "ListLavaan"))) { + test_wald(my_objects) } else { insight::format_error("The models cannot be compared for some reason :/") } @@ -37,10 +37,10 @@ test_wald.ListNestedRegressions <- function(objects, verbose = TRUE, ...) { ) } return(test_likelihoodratio(objects)) - } else { - out <- .test_wald(objects, test = "F") } + out <- .test_wald(objects, test = "F") + attr(out, "is_nested") <- TRUE class(out) <- c("test_performance", class(out)) out @@ -60,7 +60,7 @@ test_wald.ListNonNestedRegressions <- function(objects, verbose = TRUE, ...) { dfs <- sapply(objects, insight::get_df, type = "residual") # sort by df - if (!all(sort(dfs) == dfs) && !all(sort(dfs) == rev(dfs))) { + if (is.unsorted(dfs) && is.unsorted(rev(dfs))) { objects <- objects[order(dfs)] dfs <- sort(dfs, na.last = TRUE) } @@ -78,18 +78,18 @@ test_wald.ListNonNestedRegressions <- function(objects, verbose = TRUE, ...) 
{ # Find reference-model related stuff refmodel <- order(dfs)[1] - scale <- dev[refmodel] / dfs[refmodel] + my_scale <- dev[refmodel] / dfs[refmodel] # test = "F" if (test == "F") { - f_value <- (dev_diff / dfs_diff) / scale + f_value <- (dev_diff / dfs_diff) / my_scale f_value[!is.na(f_value) & f_value < 0] <- NA # rather than p = 0 - out$`F` <- f_value + out[["F"]] <- f_value p <- stats::pf(f_value, abs(dfs_diff), dfs[refmodel], lower.tail = FALSE) # test = "LRT" } else { - chi2 <- dev_diff / scale * sign(dfs_diff) + chi2 <- dev_diff / my_scale * sign(dfs_diff) chi2[!is.na(chi2) & chi2 < 0] <- NA # rather than p = 0 out$Chi2 <- chi2 p <- stats::pchisq(chi2, abs(dfs_diff), lower.tail = FALSE) diff --git a/cran-comments.md b/cran-comments.md index 2dec3e271..fa6f2d4c2 100644 --- a/cran-comments.md +++ b/cran-comments.md @@ -1 +1 @@ -Hot fix release that fixes CRAN check errors. \ No newline at end of file +Maintainance release. \ No newline at end of file diff --git a/inst/WORDLIST b/inst/WORDLIST index 88ad8d39d..97bc3a8f8 100644 --- a/inst/WORDLIST +++ b/inst/WORDLIST @@ -299,6 +299,7 @@ poisson preprint priori pscl +quantreg quared quartile quartiles diff --git a/man/check_outliers.Rd b/man/check_outliers.Rd index 74c992b6f..22b88228c 100644 --- a/man/check_outliers.Rd +++ b/man/check_outliers.Rd @@ -14,6 +14,7 @@ check_outliers(x, ...) method = c("cook", "pareto"), threshold = NULL, ID = NULL, + verbose = TRUE, ... ) @@ -26,7 +27,8 @@ check_outliers(x, ...) \item{...}{When \code{method = "ics"}, further arguments in \code{...} are passed down to \code{\link[ICSOutlier:ics.outlier]{ICSOutlier::ics.outlier()}}. When \code{method = "mahalanobis"}, -they are passed down to \code{\link[stats:mahalanobis]{stats::mahalanobis()}}.} +they are passed down to \code{\link[stats:mahalanobis]{stats::mahalanobis()}}. \code{percentage_central} can +be specified when \code{method = "mcd"}.} \item{method}{The outlier detection method(s). Can be \code{"all"} or some of \code{"cook"}, \code{"pareto"}, \code{"zscore"}, \code{"zscore_robust"}, \code{"iqr"}, \code{"ci"}, \code{"eti"}, @@ -40,6 +42,8 @@ considered as outlier. If \code{NULL}, default values will be used (see for any of the method run.} \item{ID}{Optional, to report an ID column along with the row number.} + +\item{verbose}{Toggle warnings.} } \value{ A logical vector of the detected outliers with a nice printing @@ -189,6 +193,9 @@ calculates the mean and covariance matrix based on the most central subset of the data (by default, 66\\%), before computing the Mahalanobis Distance. This is deemed to be a more robust method of identifying and removing outliers than regular Mahalanobis distance. +This method has a \code{percentage_central} argument that allows specifying +the breakdown point (0.75, the default, is recommended by Leys et al. 2018, +but a commonly used alternative is 0.50). 
\item \strong{Invariant Coordinate Selection (ICS)}: The outlier are detected using ICS, which by default uses an alpha threshold of 0.025 (corresponding to the 2.5\\% most extreme observations) as a cut-off @@ -295,7 +302,7 @@ check_outliers(group_iris) \dontshow{if (require("see") && require("bigutilsr") && require("loo") && require("MASS") && require("ICSOutlier") && require("ICS") && require("dbscan")) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} \donttest{ # You can also run all the methods -check_outliers(data, method = "all") +check_outliers(data, method = "all", verbose = FALSE) # For statistical models --------------------------------------------- # select only mpg and disp (continuous) diff --git a/man/model_performance.stanreg.Rd b/man/model_performance.stanreg.Rd index bbd82bc53..622c5b18a 100644 --- a/man/model_performance.stanreg.Rd +++ b/man/model_performance.stanreg.Rd @@ -60,7 +60,7 @@ values mean better fit. See \code{?loo::waic}. } } \examples{ -\dontshow{if (require("rstanarm") && require("rstantools") && require("BayesFactor")) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} +\dontshow{if (require("rstanarm") && require("rstantools")) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} \donttest{ model <- suppressWarnings(rstanarm::stan_glm( mpg ~ wt + cyl, @@ -79,12 +79,6 @@ model <- suppressWarnings(rstanarm::stan_glmer( refresh = 0 )) model_performance(model) - -model <- BayesFactor::generalTestBF(carb ~ am + mpg, mtcars) - -model_performance(model) -model_performance(model[3]) -model_performance(model, average = TRUE) } \dontshow{\}) # examplesIf} } diff --git a/man/performance-package.Rd b/man/performance-package.Rd index abbdcf21f..b3f1d5a3f 100644 --- a/man/performance-package.Rd +++ b/man/performance-package.Rd @@ -3,7 +3,6 @@ \docType{package} \name{performance-package} \alias{performance-package} -\alias{_PACKAGE} \title{performance: An R Package for Assessment, Comparison and Testing of Statistical Models} \description{ diff --git a/man/r2_bayes.Rd b/man/r2_bayes.Rd index edc78150e..67200fb7d 100644 --- a/man/r2_bayes.Rd +++ b/man/r2_bayes.Rd @@ -62,7 +62,7 @@ R2 takes both the fixed and random effects into account. returns a posterior sample of Bayesian R2 values. 
} \examples{ -\dontshow{if (require("rstanarm") && require("rstantools") && require("BayesFactor") && require("brms")) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} +\dontshow{if (require("rstanarm") && require("rstantools") && require("brms")) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} library(performance) \donttest{ model <- suppressWarnings(rstanarm::stan_glm( @@ -85,24 +85,6 @@ model <- suppressWarnings(rstanarm::stan_lmer( r2_bayes(model) } -BFM <- BayesFactor::generalTestBF(mpg ~ qsec + gear, data = mtcars, progress = FALSE) -FM <- BayesFactor::lmBF(mpg ~ qsec + gear, data = mtcars) - -r2_bayes(FM) -r2_bayes(BFM[3]) -r2_bayes(BFM, average = TRUE) # across all models - -# with random effects: -mtcars$gear <- factor(mtcars$gear) -model <- BayesFactor::lmBF( - mpg ~ hp + cyl + gear + gear:wt, - mtcars, - progress = FALSE, - whichRandom = c("gear", "gear:wt") -) - -r2_bayes(model) - \donttest{ model <- suppressWarnings(brms::brm( mpg ~ wt + cyl, diff --git a/man/r2_kullback.Rd b/man/r2_kullback.Rd index d2980d18e..ae0f9dd55 100644 --- a/man/r2_kullback.Rd +++ b/man/r2_kullback.Rd @@ -2,13 +2,18 @@ % Please edit documentation in R/r2_kl.R \name{r2_kullback} \alias{r2_kullback} +\alias{r2_kullback.glm} \title{Kullback-Leibler R2} \usage{ -r2_kullback(model, adjust = TRUE) +r2_kullback(model, ...) + +\method{r2_kullback}{glm}(model, adjust = TRUE, ...) } \arguments{ \item{model}{A generalized linear model.} +\item{...}{Additional arguments. Currently not used.} + \item{adjust}{Logical, if \code{TRUE} (the default), the adjusted R2 value is returned.} } diff --git a/tests/testthat/test-check_collinearity.R b/tests/testthat/test-check_collinearity.R index 3d68b87ac..ea41af513 100644 --- a/tests/testthat/test-check_collinearity.R +++ b/tests/testthat/test-check_collinearity.R @@ -211,3 +211,10 @@ test_that("check_collinearity, hurdle/zi models w/o zi-formula", { ) expect_equal(out$VIF, c(1.05772, 1.05772, 1.06587, 1.06587), tolerance = 1e-4) }) + +test_that("check_collinearity, invalid data", { + skip_if(packageVersion("insight") < "0.19.8.2") + dd <- data.frame(y = as.difftime(0:5, units = "days")) + m1 <- lm(y ~ 1, data = dd) + expect_error(check_collinearity(m1), "Can't extract variance-covariance matrix") +}) diff --git a/tests/testthat/test-check_model.R b/tests/testthat/test-check_model.R index 6b4ce4db2..4a3d23703 100644 --- a/tests/testthat/test-check_model.R +++ b/tests/testthat/test-check_model.R @@ -36,3 +36,18 @@ test_that("`check_outliers()` works if convergence issues", { x <- check_outliers(m, verbose = FALSE) expect_s3_class(x, "check_outliers") }) + +test_that("`check_model()` for invalid models", { + skip_if(packageVersion("insight") < "0.19.8.2") + dd <- data.frame(y = as.difftime(0:5, units = "days")) + m1 <- lm(y ~ 1, data = dd) + expect_error(check_model(m1)) +}) + +test_that("`check_model()` works for quantreg", { + skip_if_not_installed("quantreg") + data(engel, package = "quantreg") + qm <- quantreg::rq(foodexp ~ income, data = engel) + x <- check_model(qm, verbose = FALSE) + expect_s3_class(x, "check_model") +}) diff --git a/tests/testthat/test-check_outliers.R b/tests/testthat/test-check_outliers.R index e464028d0..8840e26bf 100644 --- a/tests/testthat/test-check_outliers.R +++ b/tests/testthat/test-check_outliers.R @@ -87,9 +87,18 @@ test_that("mcd which", { # (not clear why method mcd needs a seed) set.seed(42) expect_identical( - tail(which(check_outliers(mtcars[1:4], method = "mcd", threshold = 45))), + 
tail(which(check_outliers(mtcars[1:4], method = "mcd", threshold = 45, verbose = FALSE))), 31L ) + expect_warning( + { + out <- check_outliers(mtcars, method = "mcd") + }, + regex = "The sample size is too small" + ) + expect_identical(sum(out), 8L) + out <- check_outliers(mtcars, method = "mcd", percentage_central = 0.5, verbose = FALSE) + expect_identical(sum(out), 15L) }) ## FIXME: Fails on CRAN/windows @@ -197,11 +206,12 @@ test_that("all methods which", { "mahalanobis", "mahalanobis_robust", "mcd", "optics", "lof" ), threshold = list( - "zscore" = 2.2, "zscore_robust" = 2.2, "iqr" = 1.2, - "ci" = 0.95, "eti" = 0.95, "hdi" = 0.90, "bci" = 0.95, - "mahalanobis" = 20, "mahalanobis_robust" = 25, "mcd" = 25, - "optics" = 14, "lof" = 0.005 - ) + zscore = 2.2, zscore_robust = 2.2, iqr = 1.2, + ci = 0.95, eti = 0.95, hdi = 0.90, bci = 0.95, + mahalanobis = 20, mahalanobis_robust = 25, mcd = 25, + optics = 14, lof = 0.005 + ), + verbose = FALSE )), as.integer(c(9, 15, 16, 19, 20, 28, 29, 31)) ) @@ -219,12 +229,13 @@ test_that("multiple methods with ID", { "mahalanobis", "mahalanobis_robust", "mcd", "optics", "lof" ), threshold = list( - "zscore" = 2.2, "zscore_robust" = 2.2, "iqr" = 1.2, - "ci" = 0.95, "eti" = 0.95, "hdi" = 0.90, "bci" = 0.95, - "mahalanobis" = 20, "mahalanobis_robust" = 25, "mcd" = 25, - "optics" = 14, "lof" = 0.005 + zscore = 2.2, zscore_robust = 2.2, iqr = 1.2, + ci = 0.95, eti = 0.95, hdi = 0.90, bci = 0.95, + mahalanobis = 20, mahalanobis_robust = 25, mcd = 25, + optics = 14, lof = 0.005 ), - ID = "car" + ID = "car", + verbose = FALSE )) expect_identical( x$outlier_var$zscore$mpg$car, @@ -318,3 +329,16 @@ test_that("cook multiple methods which", { c("setosa", "versicolor", "virginica") ) }) + + +test_that("check_outliers with invalid data", { + dd <- data.frame(y = as.difftime(0:5, units = "days")) + m1 <- lm(y ~ 1, data = dd) + expect_error( + expect_message( + check_outliers(m1), + regex = "Date variables are not supported" + ), + regex = "No numeric variables found" + ) +}) diff --git a/tests/testthat/test-check_singularity.R b/tests/testthat/test-check_singularity.R index 3edf6e8f6..dc0d56964 100644 --- a/tests/testthat/test-check_singularity.R +++ b/tests/testthat/test-check_singularity.R @@ -1,14 +1,14 @@ -test_that("check_singularity", { +test_that("check_singularity, lme4", { skip_on_cran() skip_if_not_installed("lme4") data(sleepstudy, package = "lme4") set.seed(123) - sleepstudy$mygrp <- sample(1:5, size = 180, replace = TRUE) + sleepstudy$mygrp <- sample.int(5, size = 180, replace = TRUE) sleepstudy$mysubgrp <- NA for (i in 1:5) { filter_group <- sleepstudy$mygrp == i sleepstudy$mysubgrp[filter_group] <- - sample(1:30, size = sum(filter_group), replace = TRUE) + sample.int(30, size = sum(filter_group), replace = TRUE) } model <- suppressMessages(lme4::lmer( @@ -17,3 +17,23 @@ test_that("check_singularity", { )) expect_true(check_singularity(model)) }) + + +test_that("check_singularity", { + skip_on_cran() + skip_if_not_installed("glmmTMB") + set.seed(101) + dd <- expand.grid(x = factor(1:6), f = factor(1:20), rep = 1:5) + dd$y <- glmmTMB::simulate_new(~ 1 + (x | f), + newdata = dd, + newparam = list( + beta = 0, + theta = rep(0, 21), + betad = 0 + ) + )[[1]] + expect_warning(expect_warning({ + m2 <- glmmTMB::glmmTMB(y ~ 1 + (x | f), data = dd, REML = FALSE) + })) + expect_true(check_singularity(m2)) +}) diff --git a/tests/testthat/test-model_performance.bayesian.R b/tests/testthat/test-model_performance.bayesian.R index 57ef8e0eb..a15e41a10 100644 ---
a/tests/testthat/test-model_performance.bayesian.R +++ b/tests/testthat/test-model_performance.bayesian.R @@ -112,7 +112,7 @@ test_that("model_performance.BFBayesFactor", { }) expect_null(p) - + skip_on_os("linux") mod <- BayesFactor::regressionBF(mpg ~ cyl, mtcars, progress = FALSE) modF <- lm(mpg ~ cyl, mtcars) p <- model_performance(mod) diff --git a/tests/testthat/test-r2_bayes.R b/tests/testthat/test-r2_bayes.R index 7c176b8e3..00e3a54e2 100644 --- a/tests/testthat/test-r2_bayes.R +++ b/tests/testthat/test-r2_bayes.R @@ -1,3 +1,5 @@ +skip_on_os("linux") + test_that("r2_BayesFactor", { skip_if_not_installed("BayesFactor") set.seed(1) diff --git a/tests/testthat/test-r2_kullback.R b/tests/testthat/test-r2_kullback.R index 591295cef..8ce9620b9 100644 --- a/tests/testthat/test-r2_kullback.R +++ b/tests/testthat/test-r2_kullback.R @@ -3,3 +3,10 @@ test_that("r2_kullback", { expect_equal(r2_kullback(model), c(`Kullback-Leibler R2` = 0.3834), tolerance = 1e-3) expect_equal(r2_kullback(model, adjust = FALSE), c(`Kullback-Leibler R2` = 0.4232), tolerance = 1e-3) }) + +test_that("r2_kullback errors for non-supported", { + skip_if_not_installed("pscl") + data("bioChemists", package = "pscl") + model <- pscl::zeroinfl(art ~ . | 1, data = bioChemists, dist = "negbin") + expect_error(r2_kullback(model), regex = "This function only works") +}) diff --git a/vignettes/check_model.Rmd b/vignettes/check_model.Rmd index 7e9ff0506..e657f3089 100644 --- a/vignettes/check_model.Rmd +++ b/vignettes/check_model.Rmd @@ -57,7 +57,7 @@ Most plots seen here can also be generated by their dedicated functions, e.g.: - Binned residuals: `binned_residuals()` - Check for overdispersion: `check_overdispersion()` -## Linear models: Are all assumptions for linear models met? +# Linear models: Are all assumptions for linear models met? We start with a simple example for a linear model. @@ -87,9 +87,9 @@ Now let's take a closer look for each plot. To do so, we ask `check_model()` to diagnostic_plots <- plot(check_model(m1, panel = FALSE)) ``` -### Posterior predictive checks +## Posterior predictive checks -The first plot is based on `check_predictions()`. Posterior predictive checks can be used to look for systematic discrepancies between real and simulated data. It helps to see whether the type of model (distributional family) fits well to the data (_Gelman and Hill, 2007, p. 158_). Posterior predictive checks can be used to "look for systematic discrepancies between real and simulated data" (_Gelman et al. 2014, p. 169_). +The first plot is based on `check_predictions()`. Posterior predictive checks can be used to "look for systematic discrepancies between real and simulated data" (_Gelman et al. 2014, p. 169_). It helps to see whether the type of model (distributional family) fits well to the data (_Gelman and Hill, 2007, p. 158_). ```{r eval=all(successfully_loaded[c("see", "ggplot2")])} # posterior predictive checks @@ -113,11 +113,11 @@ plot(out) As you can see, the green line in this plot deviates visibly from the blue lines. This may indicate that our linear model is not appropriate, since it does not capture the distributional nature of the response variable properly. -#### How to fix this? +### How to fix this? The best way, if there are serious concerns that the model does not fit well to the data, is to use a different type (family) of regression models. In our example, it is obvious that we should rather use a Poisson regression.
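As a minimal sketch of that advice (the outcome, predictors, and data name below are purely illustrative and not taken from this vignette), the re-fit and re-check could look like this:

```{r eval=FALSE}
# hypothetical count outcome: re-fit the model as a Poisson regression ...
m_poisson <- glm(count ~ predictor1 + predictor2, family = poisson(), data = my_data)

# ... and re-run the posterior predictive check
out <- check_predictions(m_poisson)
plot(out)
```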
-#### Plots for discrete outcomes +### Plots for discrete outcomes For discrete or integer outcomes (like in logistic or Poisson regression), density plots are not always the best choice, as they look somewhat "wiggly" around the actual values of the dependent variables. In this case, use the `type` argument of the `plot()` method to change the plot-style. Available options are `type = "discrete_dots"` (dots for observed and replicated outcomes), `type = "discrete_interval"` (dots for observed, error bars for replicated outcomes) or `type = "discrete_both"` (both dots and error bars). @@ -134,7 +134,7 @@ out <- check_predictions(m3) plot(out, type = "discrete_both") ``` -### Linearity +## Linearity This plot helps to check the assumption of a linear relationship. It shows whether predictors may have a non-linear relationship with the outcome, in which case the reference line may roughly indicate that relationship. A straight and horizontal line indicates that the model specification seems to be ok. @@ -160,7 +160,7 @@ out <- plot(check_model(m, panel = FALSE)) out[[2]] ``` -#### How to fix this? +### How to fix this? If the green reference line is not roughly flat and horizontal, but rather - like in our example - U-shaped, this may indicate that some of the predictors should probably be modeled as a quadratic term. Transforming the response variable might be another solution when linearity assumptions are not met. @@ -175,7 +175,7 @@ out[[2]] **Some caution is needed** when interpreting these plots. Although these plots are helpful to check model assumptions, they do not necessarily indicate so-called "lack of fit", e.g. missed non-linear relationships or interactions. Thus, it is always recommended to also look at [effect plots, including partial residuals](https://strengejacke.github.io/ggeffects/articles/introduction_partial_residuals.html). -### Homogeneity of variance - detecting heteroscedasticity +## Homogeneity of variance - detecting heteroscedasticity This plot helps to check the assumption of equal (or constant) variance, i.e. homoscedasticity. To meet this assumption, the variance of the residuals across different values of predictors is similar and does not notably increase or decrease. Hence, the desired pattern would be that dots spread equally above and below a roughly straight, horizontal line and show no apparent deviation. @@ -202,7 +202,7 @@ But why does the diagnostic plot used in `check_model()` look different? `check_ diagnostic_plots[[3]] ``` -#### How to fix this? +### How to fix this? There are several ways to address heteroscedasticity. @@ -212,7 +212,7 @@ There are several ways to address heteroscedasticity. 3. Transforming the response variable, for instance, taking the `log()`, may also help to avoid issues with heteroscedasticity. -### Influential observations - outliers +## Influential observations - outliers Outliers can be defined as particularly influential observations, and this plot helps to detect those outliers. Cook's distance (_Cook 1977_, _Cook & Weisberg 1982_) is used to define outliers, i.e. any point in this plot that falls outside of Cook's distance (the dashed lines) is considered an influential observation. @@ -223,11 +223,11 @@ diagnostic_plots[[4]] In our example, everything looks fine. -#### How to fix this? +### How to fix this? Dealing with outliers is not straightforward, as it is not recommended to automatically discard any observation that has been marked as "an outlier".
Rather, your _domain knowledge_ must be involved in the decision whether to keep or omit influential observations. A helpful heuristic is to distinguish between error outliers, interesting outliers, and random outliers (_Leys et al. 2019_). _Error outliers_ are likely due to human error and should be corrected before data analysis. _Interesting outliers_ are not due to technical error and may be of theoretical interest; it might thus be relevant to investigate them further even though they should be removed from the current analysis of interest. _Random outliers_ are assumed to be due to chance alone and to belong to the correct distribution and, therefore, should be retained. -### Multicollinearity +## Multicollinearity This plot checks for potential collinearity among predictors. In a nutshell, multicollinearity means that once you know the effect of one predictor, the value of knowing the other predictor is rather low. Multicollinearity might arise when a third, unobserved variable has a causal effect on each of the two predictors that are associated with the outcome. In such cases, the actual relationship that matters would be the association between the unobserved variable and the outcome. @@ -244,11 +244,11 @@ The variance inflation factor (VIF) indicates the magnitude of multicollinearity Our model clearly suffers from multicollinearity, as all predictors have high VIF values. -#### How to fix this? +### How to fix this? Usually, predictors with (very) high VIF values should be removed from the model to fix multicollinearity. Some caution is needed for interaction terms. If interaction terms are included in a model, high VIF values are expected. This portion of multicollinearity among the component terms of an interaction is also called "inessential ill-conditioning", which leads to inflated VIF values that are typically seen for models with interaction terms _(Francoeur 2013)_. In such cases, re-fit your model without interaction terms and check this model for collinearity among predictors. -### Normality of residuals +## Normality of residuals In linear regression, residuals should be normally distributed. This can be checked using so-called Q-Q plots (quantile-quantile plot) to compare the shapes of distributions. This plot shows the quantiles of the studentized residuals versus fitted values. @@ -261,7 +261,7 @@ diagnostic_plots[[6]] In our example, we see that most data points are ok, except some observations at the tails. Whether any action is needed to fix this or not can also depend on the results of the remaining diagnostic plots. If all other plots indicate no violation of assumptions, some deviation from normality, particularly at the tails, can be less critical. -#### How to fix this? +### How to fix this? Here are some remedies to fix non-normality of residuals, according to _Pek et al. 2018_. diff --git a/vignettes/check_outliers.Rmd b/vignettes/check_outliers.Rmd index d9f906d6c..0c567489a 100644 --- a/vignettes/check_outliers.Rmd +++ b/vignettes/check_outliers.Rmd @@ -132,7 +132,7 @@ One common approach for this is to compute multivariate distance metrics such as In *{performance}*'s `check_outliers()`, one can use this approach with `method = "mcd"`.^[Our default threshold for the MCD method is defined by `stats::qchisq(p = 1 - 0.001, df = ncol(x))`, which again is an approximation of the critical value for _p_ < .001 consistent with the thresholds of our other methods.]
```{r multivariate} -outliers <- check_outliers(data, method = "mcd") +outliers <- check_outliers(data, method = "mcd", verbose = FALSE) outliers ``` @@ -227,7 +227,7 @@ rempsyc::nice_scatter(data, "height", "weight") Using either the *z*-score or MCD methods, our model-consistent observation will be incorrectly flagged as an outlier or influential observation. ```{r} -outliers <- check_outliers(model, method = c("zscore_robust", "mcd")) +outliers <- check_outliers(model, method = c("zscore_robust", "mcd"), verbose = FALSE) which(outliers) ``` @@ -248,7 +248,7 @@ The *{performance}* package also offers an alternative, consensus-based approach In practice, this approach computes a composite outlier score, formed of the average of the binary (0 or 1) classification results of each method. It represents the probability that each observation is classified as an outlier by at least one method. The default decision rule classifies rows with composite outlier scores greater than or equal to 0.5 as outlier observations (i.e., that were classified as outliers by at least half of the methods). In *{performance}*'s `check_outliers()`, one can use this approach by including all desired methods in the corresponding argument. ```{r multimethod, fig.cap = "Visual depiction of outliers using several different statistical outlier detection methods."} -outliers <- check_outliers(model, method = c("zscore_robust", "mcd", "cook")) +outliers <- check_outliers(model, method = c("zscore_robust", "mcd", "cook"), verbose = FALSE) which(outliers) ``` diff --git a/vignettes/r2.Rmd b/vignettes/r2.Rmd index 843bd3412..e6ee63356 100644 --- a/vignettes/r2.Rmd +++ b/vignettes/r2.Rmd @@ -29,14 +29,11 @@ knitr::opts_chunk$set( ) options(digits = 2) -pkgs <- c( - "effectsize", "BayesFactor", "lme4", "rstanarm" -) +pkgs <- c("effectsize", "lme4", "rstanarm") successfully_loaded <- sapply(pkgs, requireNamespace, quietly = TRUE) if (all(successfully_loaded)) { library(performance) library(effectsize) - library(BayesFactor) library(lme4) library(rstanarm) } @@ -147,27 +144,6 @@ model <- stan_lmer(Petal.Length ~ Petal.Width + (1 | Species), data = iris, refr r2(model) ``` -Let's look at another regression analysis carried out with `{BayesFactor}` package. - -```{r, eval=successfully_loaded["BayesFactor"] && utils::packageVersion("BayesFactor") >= package_version("0.9.12-4.3")} -library(BayesFactor) -data(puzzles) - -m1 <- anovaBF(extra ~ group + ID, - data = sleep, - whichRandom = "ID", progress = FALSE -) - -r2(m1) - -m2 <- generalTestBF(RT ~ shape * color + ID, - data = puzzles, whichRandom = "ID", - neverExclude = "ID", progress = FALSE -) - -r2(m2) -``` - # Comparing change in R2 using Cohen's *f* Cohen's $f$ (of [ANOVA fame](https://easystats.github.io/effectsize/articles/anovaES.html)) can be used as a measure of effect size in the context of sequential multiple regression (i.e., [**nested models**](https://easystats.github.io/performance/reference/test_performance.html)).
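A minimal sketch of this idea (assuming `effectsize::cohens_f_squared()` accepts a second, nested model via its `model2` argument; the two models below are illustrative only, not taken from this vignette):

```{r eval=FALSE}
# two nested models: m_small is nested within m_large
m_small <- lm(mpg ~ wt, data = mtcars)
m_large <- lm(mpg ~ wt + hp, data = mtcars)

# effect size for the change in R2 when moving from m_small to m_large
effectsize::cohens_f_squared(m_small, model2 = m_large)
```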