diff --git a/.github/workflows/R-CMD-check-strict.yaml b/.github/workflows/R-CMD-check-strict.yaml deleted file mode 100644 index 5f84492d7..000000000 --- a/.github/workflows/R-CMD-check-strict.yaml +++ /dev/null @@ -1,15 +0,0 @@ -# Workflow derived from https://github.com/r-lib/actions/tree/v2/examples -# Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help -# -# R CMD Check will fail on a `NOTE`. -on: - push: - branches: [main, master] - pull_request: - branches: [main, master] - -name: R-CMD-check-strict - -jobs: - R-CMD-check-strict: - uses: easystats/workflows/.github/workflows/R-CMD-check-strict.yaml@main diff --git a/CRAN-SUBMISSION b/CRAN-SUBMISSION index 44e7c417a..22db08405 100644 --- a/CRAN-SUBMISSION +++ b/CRAN-SUBMISSION @@ -1,3 +1,3 @@ -Version: 0.10.5 -Date: 2023-09-11 21:16:32 UTC -SHA: c3348f5c1183042544ebdfc7dbaa9489186c71ea +Version: 0.11.0 +Date: 2024-03-22 21:30:58 UTC +SHA: 051b9bb2b7721c632ce145f85c55aa55c8eebf90 diff --git a/DESCRIPTION b/DESCRIPTION index 933df479a..43d03cf76 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Type: Package Package: performance Title: Assessment of Regression Models Performance -Version: 0.10.5.6 +Version: 0.11.0.5 Authors@R: c(person(given = "Daniel", family = "Lüdecke", @@ -33,16 +33,16 @@ Authors@R: role = c("aut", "ctb"), email = "brenton@wiernik.org", comment = c(ORCID = "0000-0001-9560-6336", Twitter = "@bmwiernik")), + person(given = "Rémi", + family = "Thériault", + role = c("aut", "ctb"), + email = "remi.theriault@mail.mcgill.ca", + comment = c(ORCID = "0000-0003-4315-6788", Twitter = "@rempsyc")), person(given = "Vincent", family = "Arel-Bundock", email = "vincent.arel-bundock@umontreal.ca", role = "ctb", comment = c(ORCID = "0000-0003-2042-7063")), - person(given = "Rémi", - family = "Thériault", - role = "ctb", - email = "remi.theriault@mail.mcgill.ca", - comment = c(ORCID = "0000-0003-4315-6788", Twitter = "@rempsyc")), person(given = "Martin", 
family = "Jullum", role = "rev"), @@ -69,10 +69,9 @@ BugReports: https://github.com/easystats/performance/issues Depends: R (>= 3.6) Imports: - bayestestR (>= 0.13.1), - insight (>= 0.19.5), - datawizard (>= 0.9.0), - methods, + bayestestR (>= 0.13.2), + insight (>= 0.19.10), + datawizard (>= 0.10.0), stats, utils Suggests: @@ -91,9 +90,12 @@ Suggests: correlation, cplm, dbscan, + DHARMa, estimatr, fixest, + flextable, forecast, + ftExtra, gamm4, ggplot2, glmmTMB, @@ -122,26 +124,28 @@ Suggests: nonnest2, ordinal, parallel, - parameters (>= 0.20.3), + parameters (>= 0.21.4), patchwork, pscl, psych, + quantreg, qqplotr (>= 0.0.6), randomForest, + rempsyc, rmarkdown, rstanarm, rstantools, sandwich, - see (>= 0.7.5), + see (>= 0.8.2), survey, survival, - testthat, + testthat (>= 3.2.1), tweedie, VGAM, - withr + withr (>= 3.0.0) Encoding: UTF-8 Language: en-US -RoxygenNote: 7.2.3.9000 +RoxygenNote: 7.3.1 Roxygen: list(markdown = TRUE) Config/testthat/edition: 3 Config/testthat/parallel: true @@ -150,4 +154,4 @@ Config/Needs/website: r-lib/pkgdown, easystats/easystatstemplate Config/rcmdcheck/ignore-inconsequential-notes: true -Remotes: easystats/see, easystats/parameters, easystats/insight +Remotes: easystats/see diff --git a/NAMESPACE b/NAMESPACE index e573d5ce0..7a20bd1dc 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -55,9 +55,11 @@ S3method(check_heteroscedasticity,default) S3method(check_homogeneity,afex_aov) S3method(check_homogeneity,default) S3method(check_homogeneity,htest) +S3method(check_model,DHARMa) S3method(check_model,brmsfit) S3method(check_model,default) S3method(check_model,model_fit) +S3method(check_model,performance_simres) S3method(check_model,stanreg) S3method(check_multimodal,data.frame) S3method(check_multimodal,numeric) @@ -70,7 +72,9 @@ S3method(check_normality,htest) S3method(check_normality,lmerModLmerTest) S3method(check_normality,merMod) S3method(check_normality,numeric) +S3method(check_normality,performance_simres) 
S3method(check_outliers,BFBayesFactor) +S3method(check_outliers,DHARMa) S3method(check_outliers,character) S3method(check_outliers,data.frame) S3method(check_outliers,default) @@ -87,8 +91,13 @@ S3method(check_outliers,meta) S3method(check_outliers,metabin) S3method(check_outliers,metagen) S3method(check_outliers,numeric) +S3method(check_outliers,performance_simres) S3method(check_outliers,rma) S3method(check_outliers,rma.uni) +S3method(check_outliers,rq) +S3method(check_outliers,rqs) +S3method(check_outliers,rqss) +S3method(check_overdispersion,DHARMa) S3method(check_overdispersion,default) S3method(check_overdispersion,fixest) S3method(check_overdispersion,fixest_multi) @@ -100,11 +109,17 @@ S3method(check_overdispersion,model_fit) S3method(check_overdispersion,negbin) S3method(check_overdispersion,negbinirr) S3method(check_overdispersion,negbinmfx) +S3method(check_overdispersion,performance_simres) S3method(check_overdispersion,poissonirr) S3method(check_overdispersion,poissonmfx) S3method(check_predictions,BFBayesFactor) +S3method(check_predictions,brmsfit) S3method(check_predictions,default) S3method(check_predictions,lme) +S3method(check_predictions,stanreg) +S3method(check_residuals,DHARMa) +S3method(check_residuals,default) +S3method(check_residuals,performance_simres) S3method(check_singularity,MixMod) S3method(check_singularity,clmm) S3method(check_singularity,cpglmm) @@ -120,6 +135,9 @@ S3method(check_sphericity,default) S3method(check_sphericity,mlm) S3method(check_symmetry,htest) S3method(check_symmetry,numeric) +S3method(check_zeroinflation,DHARMa) +S3method(check_zeroinflation,default) +S3method(check_zeroinflation,performance_simres) S3method(cronbachs_alpha,data.frame) S3method(cronbachs_alpha,matrix) S3method(cronbachs_alpha,parameters_pca) @@ -258,10 +276,12 @@ S3method(plot,check_model) S3method(plot,check_normality) S3method(plot,check_outliers) S3method(plot,check_overdisp) +S3method(plot,check_residuals) S3method(plot,check_sphericity) 
S3method(plot,compare_performance) S3method(plot,performance_pp_check) S3method(plot,performance_roc) +S3method(plot,performance_simres) S3method(plot,test_likelihoodratio) S3method(plot,test_performance) S3method(print,binned_residuals) @@ -280,7 +300,9 @@ S3method(print,check_normality_binom) S3method(print,check_outliers) S3method(print,check_outliers_metafor) S3method(print,check_outliers_metagen) +S3method(print,check_outliers_simres) S3method(print,check_overdisp) +S3method(print,check_residuals) S3method(print,check_sphericity) S3method(print,check_symmetry) S3method(print,check_zi) @@ -299,6 +321,7 @@ S3method(print,performance_pcp) S3method(print,performance_pp_check) S3method(print,performance_roc) S3method(print,performance_score) +S3method(print,performance_simres) S3method(print,r2_bayes) S3method(print,r2_generic) S3method(print,r2_loo) @@ -308,6 +331,7 @@ S3method(print,r2_nakagawa_by_group) S3method(print,r2_pseudo) S3method(print,test_likelihoodratio) S3method(print,test_performance) +S3method(print_html,check_itemscale) S3method(print_html,compare_performance) S3method(print_html,test_performance) S3method(print_md,check_itemscale) @@ -385,6 +409,7 @@ S3method(r2,rma) S3method(r2,scam) S3method(r2,selection) S3method(r2,sem) +S3method(r2,serp) S3method(r2,stanreg) S3method(r2,summary.lm) S3method(r2,survreg) @@ -407,6 +432,7 @@ S3method(r2_coxsnell,coxph) S3method(r2_coxsnell,cpglm) S3method(r2_coxsnell,crch) S3method(r2_coxsnell,glm) +S3method(r2_coxsnell,glmmTMB) S3method(r2_coxsnell,glmx) S3method(r2_coxsnell,logitmfx) S3method(r2_coxsnell,logitor) @@ -420,10 +446,13 @@ S3method(r2_coxsnell,poissonirr) S3method(r2_coxsnell,poissonmfx) S3method(r2_coxsnell,polr) S3method(r2_coxsnell,probit) +S3method(r2_coxsnell,serp) S3method(r2_coxsnell,survreg) S3method(r2_coxsnell,svycoxph) S3method(r2_coxsnell,truncreg) S3method(r2_efron,default) +S3method(r2_kullback,default) +S3method(r2_kullback,glm) S3method(r2_loo_posterior,BFBayesFactor) 
S3method(r2_loo_posterior,brmsfit) S3method(r2_loo_posterior,stanmvreg) @@ -448,6 +477,7 @@ S3method(r2_mcfadden,poissonirr) S3method(r2_mcfadden,poissonmfx) S3method(r2_mcfadden,polr) S3method(r2_mcfadden,probitmfx) +S3method(r2_mcfadden,serp) S3method(r2_mcfadden,truncreg) S3method(r2_mcfadden,vglm) S3method(r2_mckelvey,default) @@ -463,6 +493,7 @@ S3method(r2_nagelkerke,coxph) S3method(r2_nagelkerke,cpglm) S3method(r2_nagelkerke,crch) S3method(r2_nagelkerke,glm) +S3method(r2_nagelkerke,glmmTMB) S3method(r2_nagelkerke,glmx) S3method(r2_nagelkerke,logitmfx) S3method(r2_nagelkerke,logitor) @@ -476,6 +507,7 @@ S3method(r2_nagelkerke,poissonirr) S3method(r2_nagelkerke,poissonmfx) S3method(r2_nagelkerke,polr) S3method(r2_nagelkerke,probitmfx) +S3method(r2_nagelkerke,serp) S3method(r2_nagelkerke,survreg) S3method(r2_nagelkerke,svycoxph) S3method(r2_nagelkerke,truncreg) @@ -488,6 +520,7 @@ S3method(r2_tjur,nestedLogit) S3method(residuals,BFBayesFactor) S3method(residuals,check_normality_numeric) S3method(residuals,iv_robust) +S3method(residuals,performance_simres) S3method(rstudent,check_normality_numeric) S3method(test_bf,ListModels) S3method(test_bf,default) @@ -523,6 +556,7 @@ export(check_outliers) export(check_overdispersion) export(check_posterior_predictions) export(check_predictions) +export(check_residuals) export(check_singularity) export(check_sphericity) export(check_sphericity_bartlett) @@ -576,6 +610,7 @@ export(r2_tjur) export(r2_xu) export(r2_zeroinflated) export(rmse) +export(simulate_residuals) export(test_bf) export(test_likelihoodratio) export(test_lrt) diff --git a/NEWS.md b/NEWS.md index 06eee8f47..c94980743 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,7 +1,154 @@ -# performance (development version) +# performance 0.11.1 + +## Breaking + +* Aliases `posterior_predictive_check()` and `check_posterior_predictions()` for + `check_predictions()` are deprecated. ## General +* Improved documentation and new vignettes added. 
+ +* `check_model()` gets a `base_size` argument, to set the base font size for plots. + +* `check_predictions()` for `stanreg` and `brmsfit` models now returns plots in + the usual style as for other models and no longer returns plots from + `bayesplot::pp_check()`. + +## Bug fixes + +* `check_model()` now falls back on normal Q-Q plots when a model is not supported + by the DHARMa package and simulated residuals cannot be calculated. + +# performance 0.11.0 + +## New supported models + +* Rudimentary support for models of class `serp` from package *serp*. + +## New functions + +* `simulate_residuals()` and `check_residuals()`, to simulate and check residuals + from generalized linear (mixed) models. Simulating residuals is based on the + DHARMa package, and objects returned by `simulate_residuals()` inherit from + the `DHARMa` class, and thus can be used with any functions from the *DHARMa* + package. However, there are also implementations in the *performance* package, + such as `check_overdispersion()`, `check_zeroinflation()`, `check_outliers()` + or `check_model()`. + +* Plots for `check_model()` have been improved. The Q-Q plots are now based + on simulated residuals from the DHARMa package for non-Gaussian models, thus + providing more accurate and informative plots. The half-normal QQ plot for + generalized linear models can still be obtained by setting the new argument + `residual_type = "normal"`. + +* Following functions now support simulated residuals (from `simulate_residuals()`) + resp. objects returned from `DHARMa::simulateResiduals()`: + - `check_overdispersion()` + - `check_zeroinflation()` + - `check_outliers()` + - `check_model()` + +## General + +* Improved error messages for `check_model()` when QQ-plots cannot be created. + +* `check_distribution()` is more stable for possibly sparse data. + +## Bug fixes + +* Fixed issue in `check_normality()` for t-tests. 
+ +* Fixed issue in `check_itemscale()` for data frame inputs, when `factor_index` + was not a named vector. + +# performance 0.10.9 + +## Changes + +* `r2()` for models of class `glmmTMB` without random effects now returns the + correct r-squared value for non-mixed models. + +* `check_itemscale()` now also accepts data frames as input. In this case, + `factor_index` must be specified, which must be a numeric vector of same + length as number of columns in `x`, where each element is the index of the + factor to which the respective column in `x`. + +* `check_itemscale()` gets a `print_html()` method. + +* Clarification in the documentation of the `estimator` argument for + `performance_aic()`. + +* Improved plots for overdispersion-checks for negative-binomial models from + package *glmmTMB* (affects `check_overdispersion()` and `check_model()`). + +* Improved detection rates for singularity in `check_singularity()` for models + from package *glmmTMB*. + +* For model of class `glmmTMB`, deviance residuals are now used in the + `check_model()` plot. + +* Improved (better to understand) error messages for `check_model()`, + `check_collinearity()` and `check_outliers()` for models with non-numeric + response variables. + +* `r2_kullback()` now gives an informative error for non-supported models. + +## Bug fixes + +* Fixed issue in `binned_residuals()` for models with binary outcome, where + in rare occasions empty bins could occur. + +* `performance_score()` should no longer fail for models where scoring rules + can't be calculated. Instead, an informative message is returned. + +* `check_outliers()` now properly accept the `percentage_central` argument when + using the `"mcd"` method. + +* Fixed edge cases in `check_collinearity()` and `check_outliers()` for models + with response variables of classes `Date`, `POSIXct`, `POSIXlt` or `difftime`. + +* Fixed issue with `check_model()` for models of package *quantreg*. 
+ +# performance 0.10.8 + +## Changes + +* Changed behaviour of `check_predictions()` for models from binomial family, + to get comparable plots for different ways of outcome specification. Now, + if the outcome is a proportion, or defined as matrix of trials and successes, + the produced plots are the same (because the models should be the same, too). + +## Bug fixes + +* Fixed CRAN check errors. + +* Fixed issue with `binned_residuals()` for models with binomial family, where + the outcome was a proportion. + +# performance 0.10.7 + +## Breaking changes + +* `binned_residuals()` gains a few new arguments to control the residuals used + for the test, as well as different options to calculate confidence intervals + (namely, `ci_type`, `residuals`, `ci` and `iterations`). The default values + to compute binned residuals have changed. Default residuals are now "deviance" + residuals (and no longer "response" residuals). Default confidence intervals + are now "exact" intervals (and no longer based on Gaussian approximation). + Use `ci_type = "gaussian"` and `residuals = "response"` to get the old defaults. + +## Changes to functions + +* `binned_residuals()` - like `check_model()` - gains a `show_dots` argument to + show or hide data points that lie inside error bounds. This is particular + useful for models with many observations, where generating the plot would be + very slow. + +# performance 0.10.6 + +## General + * Support for `nestedLogit` models. ## Changes to functions diff --git a/R/binned_residuals.R b/R/binned_residuals.R index 355ab63c4..8c6608ebf 100644 --- a/R/binned_residuals.R +++ b/R/binned_residuals.R @@ -11,6 +11,25 @@ #' @param n_bins Numeric, the number of bins to divide the data. If #' `n_bins = NULL`, the square root of the number of observations is #' taken. +#' @param ci Numeric, the confidence level for the error bounds. +#' @param ci_type Character, the type of error bounds to calculate. 
Can be +#' `"exact"` (default), `"gaussian"` or `"boot"`. `"exact"` calculates the +#' error bounds based on the exact binomial distribution, using [`binom.test()`]. +#' `"gaussian"` uses the Gaussian approximation, while `"boot"` uses a simple +#' bootstrap method, where confidence intervals are calculated based on the +#' quantiles of the bootstrap distribution. +#' @param residuals Character, the type of residuals to calculate. Can be +#' `"deviance"` (default), `"pearson"` or `"response"`. It is recommended to +#' use `"response"` only for those models where other residuals are not +#' available. +#' @param iterations Integer, the number of iterations to use for the +#' bootstrap method. Only used if `ci_type = "boot"`. +#' @param show_dots Logical, if `TRUE`, will show data points in the plot. Set +#' to `FALSE` for models with many observations, if generating the plot is too +#' time-consuming. By default, `show_dots = NULL`. In this case `binned_residuals()` +#' tries to guess whether performance will be poor due to a very large model +#' and thus automatically shows or hides dots. +#' @param verbose Toggle warnings and messages. #' @param ... Currently not used. #' #' @return A data frame representing the data that is mapped in the accompanying @@ -57,17 +76,57 @@ #' } #' #' @export -binned_residuals <- function(model, term = NULL, n_bins = NULL, ...) { - fv <- stats::fitted(model) +binned_residuals <- function(model, + term = NULL, + n_bins = NULL, + show_dots = NULL, + ci = 0.95, + ci_type = c("exact", "gaussian", "boot"), + residuals = c("deviance", "pearson", "response"), + iterations = 1000, + verbose = TRUE, + ...) 
{ + # match arguments + ci_type <- match.arg(ci_type) + residuals <- match.arg(residuals) + + # for non-bernoulli models, `"exact"` doesn't work + if (isFALSE(insight::model_info(model)$is_bernoulli)) { + ci_type <- "gaussian" + if (verbose) { + insight::format_alert("Using `ci_type = \"gaussian\"` because model is not bernoulli.") + } + } + + fitted_values <- stats::fitted(model) mf <- insight::get_data(model, verbose = FALSE) if (is.null(term)) { - pred <- fv + pred <- fitted_values } else { pred <- mf[[term]] } - y <- .recode_to_zero(insight::get_response(model, verbose = FALSE)) - fv + # set default for show_dots, based on "model size" + if (is.null(show_dots)) { + n <- .safe(insight::n_obs(model)) + show_dots <- is.null(n) || n <= 1e5 + } + + # make sure response is 0/1 (and numeric) + y0 <- .recode_to_zero(insight::get_response(model, verbose = FALSE)) + + # calculate residuals + y <- switch(residuals, + response = y0 - fitted_values, + pearson = .safe((y0 - fitted_values) / sqrt(fitted_values * (1 - fitted_values))), + deviance = .safe(stats::residuals(model, type = "deviance")) + ) + + # make sure we really have residuals + if (is.null(y)) { + insight::format_error("Could not calculate residuals. Try using `residuals = \"response\"`.") + } if (is.null(n_bins)) n_bins <- round(sqrt(length(pred))) @@ -84,24 +143,37 @@ binned_residuals <- function(model, term = NULL, n_bins = NULL, ...) { n <- length(items) sdev <- stats::sd(y[items], na.rm = TRUE) - data.frame( + # sanity check - do we have any data in our bin? 
+ if (n == 0) { + conf_int <- stats::setNames(c(NA, NA), c("CI_low", "CI_high")) + } else { + conf_int <- switch(ci_type, + gaussian = stats::qnorm(c((1 - ci) / 2, (1 + ci) / 2), mean = ybar, sd = sdev / sqrt(n)), + exact = { + out <- stats::binom.test(sum(y0[items]), n)$conf.int + # center CIs around point estimate + out <- out - (min(out) - ybar) - (diff(out) / 2) + out + }, + boot = .boot_binned_ci(y[items], ci, iterations) + ) + names(conf_int) <- c("CI_low", "CI_high") + } + + d0 <- data.frame( xbar = xbar, ybar = ybar, n = n, x.lo = model.range[1], x.hi = model.range[2], - se = stats::qnorm(0.975) * sdev / sqrt(n), - ci_range = sdev / sqrt(n) + se = stats::qnorm((1 + ci) / 2) * sdev / sqrt(n) ) + cbind(d0, rbind(conf_int)) })) d <- do.call(rbind, d) d <- d[stats::complete.cases(d), ] - # CIs - d$CI_low <- d$ybar - stats::qnorm(0.975) * d$ci_range - d$CI_high <- d$ybar + stats::qnorm(0.975) * d$ci_range - gr <- abs(d$ybar) > abs(d$se) d$group <- "yes" d$group[gr] <- "no" @@ -112,11 +184,27 @@ binned_residuals <- function(model, term = NULL, n_bins = NULL, ...) 
{ attr(d, "resid_ok") <- resid_ok attr(d, "resp_var") <- insight::find_response(model) attr(d, "term") <- term + attr(d, "show_dots") <- show_dots d } +# utilities --------------------------- + +.boot_binned_ci <- function(x, ci = 0.95, iterations = 1000) { + x <- x[!is.na(x)] + n <- length(x) + out <- vector("numeric", iterations) + for (i in seq_len(iterations)) { + out[i] <- sum(x[sample.int(n, n, replace = TRUE)]) + } + out <- out / n + + quant <- stats::quantile(out, c((1 - ci) / 2, (1 + ci) / 2), na.rm = TRUE) + c(CI_low = quant[1L], CI_high = quant[2L]) +} + # methods ----------------------------- diff --git a/R/check_clusterstructure.R b/R/check_clusterstructure.R index 46929b615..8a80c7b8f 100644 --- a/R/check_clusterstructure.R +++ b/R/check_clusterstructure.R @@ -47,14 +47,14 @@ check_clusterstructure <- function(x, H <- .clusterstructure_hopkins(x, distance = distance) if (H < 0.5) { - text <- paste0( + res_text <- paste0( "The dataset is suitable for clustering (Hopkins' H = ", insight::format_value(H), ").\n" ) color <- "green" } else { - text <- paste0( + res_text <- paste0( "The dataset is not suitable for clustering (Hopkins' H = ", insight::format_value(H), ").\n" @@ -67,7 +67,7 @@ check_clusterstructure <- function(x, dissimilarity_matrix = .clusterstructure_dm(x, distance = distance, method = "ward.D2") ) - attr(out, "text") <- text + attr(out, "text") <- res_text attr(out, "color") <- color attr(out, "title") <- "Clustering tendency" class(out) <- c("see_check_clusterstructure", "check_clusterstructure", "easystats_check", class(out)) @@ -107,35 +107,32 @@ plot.check_clusterstructure <- function(x, ...) 
{ n <- nrow(x) - 1 - c <- apply(x, 2, min) # minimum value per column + cc <- apply(x, 2, min) # minimum value per column d <- apply(x, 2, max) p <- matrix(0, ncol = ncol(x), nrow = n) # n vectors of space for (i in seq_len(ncol(x))) { - p[, i] <- stats::runif(n, min = c[i], max = d[i]) + p[, i] <- stats::runif(n, min = cc[i], max = d[i]) } k <- round(stats::runif(n, 1, nrow(x))) - q <- as.matrix(x[k, ]) + qq <- as.matrix(x[k, ]) distp <- rep(0, nrow(x)) - # distq=rep(0,nrow(x)-1) distq <- 0 minp <- rep(0, n) minq <- rep(0, n) for (i in 1:n) { distp[1] <- stats::dist(rbind(p[i, ], x[1, ]), method = distance) - minqi <- stats::dist(rbind(q[i, ], x[1, ]), method = distance) + minqi <- stats::dist(rbind(qq[i, ], x[1, ]), method = distance) for (j in 2:nrow(x)) { distp[j] <- stats::dist(rbind(p[i, ], x[j, ]), method = distance) - error <- q[i, ] - x[j, ] + error <- qq[i, ] - x[j, ] if (sum(abs(error)) != 0) { - # distq[j]<-stats::dist(rbind(q[i,],x[j,])) - distq <- stats::dist(rbind(q[i, ], x[j, ]), method = distance) + distq <- stats::dist(rbind(qq[i, ], x[j, ]), method = distance) if (distq < minqi) { minqi <- distq } } } minp[i] <- min(distp) - # minq[i]<-apply(distq,1,min) minq[i] <- minqi } sum(minq) / (sum(minp) + sum(minq)) diff --git a/R/check_collinearity.R b/R/check_collinearity.R index fed6fbf60..14dd2fcce 100644 --- a/R/check_collinearity.R +++ b/R/check_collinearity.R @@ -31,6 +31,8 @@ #' with other terms, and tolerance values (including confidence intervals), #' where `tolerance = 1/vif`. #' +#' @seealso [`see::plot.see_check_collinearity()`] for options to customize the plot. +#' #' @section Multicollinearity: #' Multicollinearity should not be confused with a raw strong correlation #' between predictors. 
What matters is the association between one or more @@ -405,10 +407,24 @@ check_collinearity.zerocount <- function(x, .check_collinearity <- function(x, component, ci = 0.95, verbose = TRUE) { v <- insight::get_varcov(x, component = component, verbose = FALSE) - assign <- .term_assignments(x, component, verbose = verbose) + + # sanity check + if (is.null(v)) { + if (isTRUE(verbose)) { + insight::format_alert( + paste( + sprintf("Could not extract the variance-covariance matrix for the %s component of the model.", component), + "Please try to run `vcov(model)`, which may help identifying the problem." + ) + ) + } + return(NULL) + } + + term_assign <- .term_assignments(x, component, verbose = verbose) # any assignment found? - if (is.null(assign) || all(is.na(assign))) { + if (is.null(term_assign) || all(is.na(term_assign))) { if (verbose) { insight::format_alert( sprintf("Could not extract model terms for the %s component of the model.", component) @@ -420,7 +436,7 @@ check_collinearity.zerocount <- function(x, # we have rank-deficiency here. remove NA columns from assignment if (isTRUE(attributes(v)$rank_deficient) && !is.null(attributes(v)$na_columns_index)) { - assign <- assign[-attributes(v)$na_columns_index] + term_assign <- term_assign[-attributes(v)$na_columns_index] if (isTRUE(verbose)) { insight::format_alert( "Model matrix is rank deficient. VIFs may not be sensible." @@ -431,11 +447,9 @@ check_collinearity.zerocount <- function(x, # check for missing intercept if (insight::has_intercept(x)) { v <- v[-1, -1] - assign <- assign[-1] - } else { - if (isTRUE(verbose)) { - insight::format_alert("Model has no intercept. VIFs may not be sensible.") - } + term_assign <- term_assign[-1] + } else if (isTRUE(verbose)) { + insight::format_alert("Model has no intercept. 
VIFs may not be sensible.") } f <- insight::find_formula(x) @@ -449,16 +463,16 @@ check_collinearity.zerocount <- function(x, } if (inherits(x, "mixor")) { - terms <- labels(x$terms) + model_terms <- labels(x$terms) } else { - terms <- labels(stats::terms(f[[component]])) + model_terms <- labels(stats::terms(f[[component]])) } if ("instruments" %in% names(f)) { - terms <- unique(c(terms, labels(stats::terms(f[["instruments"]])))) + model_terms <- unique(c(model_terms, labels(stats::terms(f[["instruments"]])))) } - n.terms <- length(terms) + n.terms <- length(model_terms) if (n.terms < 2) { if (isTRUE(verbose)) { @@ -475,8 +489,13 @@ check_collinearity.zerocount <- function(x, result <- vector("numeric") na_terms <- vector("numeric") + # sanity check - models with offset(?) may contain too many term assignments + if (length(term_assign) > ncol(v)) { + term_assign <- term_assign[seq_len(ncol(v))] + } + for (term in 1:n.terms) { - subs <- which(assign == term) + subs <- which(term_assign == term) if (length(subs)) { result <- c( result, @@ -489,7 +508,7 @@ check_collinearity.zerocount <- function(x, # any terms to remove, due to rank deficiency? if (length(na_terms)) { - terms <- terms[-na_terms] + model_terms <- model_terms[-na_terms] } # check for interactions, VIF might be inflated... 
@@ -522,7 +541,7 @@ check_collinearity.zerocount <- function(x, out <- insight::text_remove_backticks( data.frame( - Term = terms, + Term = model_terms, VIF = result, VIF_CI_low = 1 / (1 - ci_lo), VIF_CI_high = 1 / (1 - ci_up), @@ -538,7 +557,7 @@ check_collinearity.zerocount <- function(x, attr(out, "data") <- insight::text_remove_backticks( data.frame( - Term = terms, + Term = model_terms, VIF = result, SE_factor = sqrt(result), stringsAsFactors = FALSE @@ -564,29 +583,29 @@ check_collinearity.zerocount <- function(x, tryCatch( { if (inherits(x, c("hurdle", "zeroinfl", "zerocount"))) { - assign <- switch(component, + term_assign <- switch(component, conditional = attr(insight::get_modelmatrix(x, model = "count"), "assign"), zero_inflated = attr(insight::get_modelmatrix(x, model = "zero"), "assign") ) } else if (inherits(x, "glmmTMB")) { - assign <- switch(component, + term_assign <- switch(component, conditional = attr(insight::get_modelmatrix(x), "assign"), zero_inflated = .zi_term_assignment(x, component, verbose = verbose) ) } else if (inherits(x, "MixMod")) { - assign <- switch(component, + term_assign <- switch(component, conditional = attr(insight::get_modelmatrix(x, type = "fixed"), "assign"), zero_inflated = attr(insight::get_modelmatrix(x, type = "zi_fixed"), "assign") ) } else { - assign <- attr(insight::get_modelmatrix(x), "assign") + term_assign <- attr(insight::get_modelmatrix(x), "assign") } - if (is.null(assign)) { - assign <- .find_term_assignment(x, component, verbose = verbose) + if (is.null(term_assign)) { + term_assign <- .find_term_assignment(x, component, verbose = verbose) } - assign + term_assign }, error = function(e) { .find_term_assignment(x, component, verbose = verbose) diff --git a/R/check_distribution.R b/R/check_distribution.R index e43b28036..89f48263a 100644 --- a/R/check_distribution.R +++ b/R/check_distribution.R @@ -77,8 +77,7 @@ check_distribution.default <- function(model) { } else { x <- stats::residuals(model) } - # 
x_scaled <- .normalize(x) - dat <- .extract_features(x) + dat <- .extract_features(x, "residuals") dist_residuals <- as.data.frame(t(stats::predict(classify_distribution, dat, type = "prob"))) @@ -89,7 +88,7 @@ check_distribution.default <- function(model) { dummy_factors = FALSE, preserve_levels = TRUE ) - dat <- .extract_features(x) + dat <- .extract_features(x, "response") dist_response <- as.data.frame(t(stats::predict(classify_distribution, dat, type = "prob"))) @@ -171,11 +170,11 @@ check_distribution.numeric <- function(model) { insight::check_if_installed("randomForest") dat <- .extract_features(model) - dist <- as.data.frame(t(stats::predict(classify_distribution, dat, type = "prob"))) + distance <- as.data.frame(t(stats::predict(classify_distribution, dat, type = "prob"))) out <- data.frame( - Distribution = rownames(dist), - p_Vector = dist[[1]], + Distribution = rownames(distance), + p_Vector = distance[[1]], stringsAsFactors = FALSE, row.names = NULL ) @@ -190,15 +189,27 @@ check_distribution.numeric <- function(model) { # utilities ----------------------------- -.extract_features <- function(x) { - # sanity check, remove missings +.extract_features <- function(x, type = NULL) { + # validation check, remove missings x <- x[!is.na(x)] + # this might fail, so we wrap in ".safe()" + map_est <- .safe(mean(x) - as.numeric(bayestestR::map_estimate(x, bw = "nrd0"))) + + if (is.null(map_est)) { + map_est <- mean(x) - datawizard::distribution_mode(x) + msg <- "Could not accurately estimate the mode." 
+ if (!is.null(type)) { + msg <- paste(msg, "Predicted distribution of the", type, "may be less accurate.") + } + insight::format_alert(msg) + } + data.frame( SD = stats::sd(x), MAD = stats::mad(x, constant = 1), Mean_Median_Distance = mean(x) - stats::median(x), - Mean_Mode_Distance = mean(x) - as.numeric(bayestestR::map_estimate(x, bw = "nrd0")), + Mean_Mode_Distance = map_est, SD_MAD_Distance = stats::sd(x) - stats::mad(x, constant = 1), Var_Mean_Distance = stats::var(x) - mean(x), Range_SD = diff(range(x)) / stats::sd(x), @@ -219,14 +230,8 @@ check_distribution.numeric <- function(model) { .is_integer <- function(x) { tryCatch( - expr = { - ifelse(is.infinite(x), FALSE, x %% 1 == 0) - }, - warning = function(w) { - is.integer(x) - }, - error = function(e) { - FALSE - } + ifelse(is.infinite(x), FALSE, x %% 1 == 0), + warning = function(w) is.integer(x), + error = function(e) FALSE ) } diff --git a/R/check_factorstructure.R b/R/check_factorstructure.R index 911fca434..d4b1c5c54 100644 --- a/R/check_factorstructure.R +++ b/R/check_factorstructure.R @@ -95,7 +95,7 @@ check_factorstructure <- function(x, n = NULL, ...) { kmo <- check_kmo(x, n, ...) sphericity <- check_sphericity_bartlett(x, n, ...) - text <- paste0("\n - Sphericity: ", attributes(sphericity)$text, "\n - KMO: ", attributes(kmo)$text) + res_text <- paste0("\n - Sphericity: ", attributes(sphericity)$text, "\n - KMO: ", attributes(kmo)$text) if (attributes(kmo)$color == "red" || attributes(sphericity)$color == "red") { color <- "red" @@ -105,7 +105,7 @@ check_factorstructure <- function(x, n = NULL, ...) { out <- list(KMO = kmo, sphericity = sphericity) - attr(out, "text") <- text + attr(out, "text") <- res_text attr(out, "color") <- color attr(out, "title") <- "Is the data suitable for Factor Analysis?" class(out) <- c("easystats_check", class(out)) @@ -120,7 +120,7 @@ check_factorstructure <- function(x, n = NULL, ...) 
{ #' @rdname check_factorstructure #' @export check_kmo <- function(x, n = NULL, ...) { - out <- .check_factor_structure_sanity(x, n, ...) + out <- .validate_factor_structure(x, n, ...) Q <- solve(out$r) @@ -136,18 +136,16 @@ check_kmo <- function(x, n = NULL, ...) { # TODO: add interpret_kmo in effectsize and use that here for more fine-grained interpretation if (MSA < 0.5) { - text <- - sprintf( - "The Kaiser, Meyer, Olkin (KMO) overall measure of sampling adequacy suggests that factor analysis is likely to be inappropriate (KMO = %.2f).", - MSA - ) + msg_text <- sprintf( + "The Kaiser, Meyer, Olkin (KMO) overall measure of sampling adequacy suggests that factor analysis is likely to be inappropriate (KMO = %.2f).", # nolint + MSA + ) color <- "red" } else { - text <- - sprintf( - "The Kaiser, Meyer, Olkin (KMO) overall measure of sampling adequacy suggests that data seems appropriate for factor analysis (KMO = %.2f).", - MSA - ) + msg_text <- sprintf( + "The Kaiser, Meyer, Olkin (KMO) overall measure of sampling adequacy suggests that data seems appropriate for factor analysis (KMO = %.2f).", # nolint + MSA + ) color <- "green" } @@ -160,9 +158,9 @@ check_kmo <- function(x, n = NULL, ...) { ")" )) - text <- paste0(text, " The individual KMO scores are: ", text_ind, ".") + msg_text <- paste0(msg_text, " The individual KMO scores are: ", text_ind, ".") - attr(out, "text") <- text + attr(out, "text") <- msg_text attr(out, "color") <- color attr(out, "title") <- "KMO Measure of Sampling Adequacy" class(out) <- c("easystats_check", class(out)) @@ -177,38 +175,36 @@ check_kmo <- function(x, n = NULL, ...) { #' @rdname check_factorstructure #' @export check_sphericity_bartlett <- function(x, n = NULL, ...) { - out <- .check_factor_structure_sanity(x, n, ...) + out <- .validate_factor_structure(x, n, ...) 
p <- dim(out$r)[2] detR <- det(out$r) statistic <- -log(detR) * (out$n - 1 - (2 * p + 5) / 6) - df <- p * (p - 1) / 2 - pval <- stats::pchisq(statistic, df, lower.tail = FALSE) + dof <- p * (p - 1) / 2 + pval <- stats::pchisq(statistic, df = dof, lower.tail = FALSE) - out <- list(chisq = statistic, p = pval, dof = df) + out <- list(chisq = statistic, p = pval, dof = dof) if (pval < 0.001) { - text <- - sprintf( - "Bartlett's test of sphericity suggests that there is sufficient significant correlation in the data for factor analysis (Chisq(%i) = %.2f, %s).", - df, - statistic, - insight::format_p(pval) - ) + msg_text <- sprintf( + "Bartlett's test of sphericity suggests that there is sufficient significant correlation in the data for factor analysis (Chisq(%i) = %.2f, %s).", # nolint + dof, + statistic, + insight::format_p(pval) + ) color <- "green" } else { - text <- - sprintf( - "Bartlett's test of sphericity suggests that there is not enough significant correlation in the data for factor analysis (Chisq(%i) = %.2f, %s).", - df, - statistic, - insight::format_p(pval) - ) + msg_text <- sprintf( + "Bartlett's test of sphericity suggests that there is not enough significant correlation in the data for factor analysis (Chisq(%i) = %.2f, %s).", # nolint + dof, + statistic, + insight::format_p(pval) + ) color <- "red" } - attr(out, "text") <- text + attr(out, "text") <- msg_text attr(out, "color") <- color attr(out, "title") <- "Test of Sphericity" class(out) <- c("easystats_check", class(out)) @@ -221,7 +217,7 @@ check_sphericity_bartlett <- function(x, n = NULL, ...) { # Helpers ----------------------------------------------------------------- #' @keywords internal -.check_factor_structure_sanity <- function(x, n = NULL, ...) { +.validate_factor_structure <- function(x, n = NULL, ...) { if (is.null(n)) { r <- stats::cor(x, use = "pairwise.complete.obs", ...) n <- nrow(x) @@ -229,7 +225,6 @@ check_sphericity_bartlett <- function(x, n = NULL, ...) 
{ r <- x } - # Sanity check if (nrow(r) != ncol(r)) { insight::format_error("The correlation matrix is not square.") } diff --git a/R/check_heterogeneity_bias.R b/R/check_heterogeneity_bias.R index b87aa9962..d9bb337f9 100644 --- a/R/check_heterogeneity_bias.R +++ b/R/check_heterogeneity_bias.R @@ -31,9 +31,9 @@ check_heterogeneity_bias <- function(x, select = NULL, group = NULL) { if (insight::is_model(x)) { group <- insight::find_random(x, split_nested = TRUE, flatten = TRUE) if (is.null(group)) { - insight::format_error("Model is no mixed model. Please provide a mixed model, or a data frame and arguments `select` and `group`.") + insight::format_error("Model is no mixed model. Please provide a mixed model, or a data frame and arguments `select` and `group`.") # nolint } - data <- insight::get_data(x, source = "mf", verbose = FALSE) + my_data <- insight::get_data(x, source = "mf", verbose = FALSE) select <- insight::find_predictors(x, effects = "fixed", component = "conditional", flatten = TRUE) } else { if (inherits(select, "formula")) { @@ -42,15 +42,15 @@ check_heterogeneity_bias <- function(x, select = NULL, group = NULL) { if (inherits(group, "formula")) { group <- all.vars(group) } - data <- x + my_data <- x } - unique_groups <- .n_unique(data[[group]]) + unique_groups <- .n_unique(my_data[[group]]) combinations <- expand.grid(select, group) result <- Map(function(predictor, id) { # demean predictor - d <- datawizard::demean(data, select = predictor, group = id, verbose = FALSE) + d <- datawizard::demean(my_data, select = predictor, group = id, verbose = FALSE) # get new names within_name <- paste0(predictor, "_within") diff --git a/R/check_heteroscedasticity.R b/R/check_heteroscedasticity.R index c3fb8a19b..cf8e4cb09 100644 --- a/R/check_heteroscedasticity.R +++ b/R/check_heteroscedasticity.R @@ -66,7 +66,7 @@ check_heteroscedasticity.default <- function(x, ...) 
{ .U <- (r^2) / S.sq mod <- stats::lm(.U ~ stats::fitted(x)) - SS <- stats::anova(mod)$"Sum Sq" + SS <- stats::anova(mod)$`Sum Sq` RegSS <- sum(SS) - SS[length(SS)] Chisq <- RegSS / 2 diff --git a/R/check_homogeneity.R b/R/check_homogeneity.R index c3a839ab7..db79fa106 100644 --- a/R/check_homogeneity.R +++ b/R/check_homogeneity.R @@ -117,7 +117,7 @@ print.check_homogeneity <- function(x, ...) { } else if (x < 0.05) { insight::print_color(sprintf("Warning: Variances differ between groups (%s, p = %.3f).\n", method.string, x), "red") } else { - insight::print_color(sprintf("OK: There is not clear evidence for different variances across groups (%s, p = %.3f).\n", method.string, x), "green") + insight::print_color(sprintf("OK: There is not clear evidence for different variances across groups (%s, p = %.3f).\n", method.string, x), "green") # nolint } invisible(x) } @@ -146,13 +146,13 @@ check_homogeneity.afex_aov <- function(x, method = "levene", ...) { insight::format_error("Levene test is only aplicable to ANOVAs with between-subjects factors.") } - data <- x$data$long # Use this to also get id column + long_data <- x$data$long # Use this to also get id column dv <- attr(x, "dv") id <- attr(x, "id") between <- names(attr(x, "between")) is_covar <- vapply(attr(x, "between"), is.null, logical(1)) - ag_data <- stats::aggregate(data[, dv], data[, c(between, id)], mean) + ag_data <- stats::aggregate(long_data[, dv], long_data[, c(between, id)], mean) colnames(ag_data)[length(c(between, id)) + 1] <- dv if (any(is_covar)) { diff --git a/R/check_htest.R b/R/check_htest.R index 197b2a6fe..f5d085839 100644 --- a/R/check_htest.R +++ b/R/check_htest.R @@ -1,7 +1,7 @@ #' @export check_normality.htest <- function(x, ...) { - data <- insight::get_data(x) - if (is.null(data)) { + model_data <- insight::get_data(x) + if (is.null(model_data)) { insight::format_error( "Cannot check assumptions - Unable to retrieve data from `htest` object." 
) @@ -11,31 +11,42 @@ check_normality.htest <- function(x, ...) { if (grepl("Welch", method, fixed = TRUE) || grepl("F test to compare two variances", method, fixed = TRUE)) { - m1 <- stats::lm(data[[1]] ~ 1) - m2 <- stats::lm(data[[2]] ~ 1) + # sanity check + if (!is.numeric(model_data[[2]])) { + insight::format_error( + "Discrete or character variables are not supported for this test. Please use a continuous variable for the second argument." + ) + } + m1 <- stats::lm(model_data[[1]] ~ 1) + m2 <- stats::lm(model_data[[2]] ~ 1) out <- check_normality(m1) out[2] <- check_normality(m2)[1] attr(out, "units") <- c("Group1", "Group2") } else if (grepl("Two Sample t-test", method, fixed = TRUE)) { m <- stats::lm( - formula = Value ~ factor(Name), - data = datawizard::data_to_long(data) + formula = value ~ factor(name), + data = datawizard::data_to_long(model_data) ) out <- check_normality(m) } else if (grepl("One Sample t-test", method, fixed = TRUE)) { - m <- stats::lm(data[[1]] ~ 1) + m <- stats::lm(model_data[[1]] ~ 1) out <- check_normality(m) } else if (grepl("Paired t-test", method, fixed = TRUE)) { - d <- data[[1]] - data[[2]] + if (!is.numeric(model_data[[2]])) { + insight::format_error( + "Discrete or character variables are not supported for this test. Please use a continuous variable for the second argument." + ) + } + d <- model_data[[1]] - model_data[[2]] m <- stats::lm(d ~ 1) out <- check_normality(m) } else if (grepl("One-way analysis of means (not assuming equal variances)", method, fixed = TRUE)) { - data <- split(data, data[[2]]) - outs <- lapply(data, function(d) { + model_data <- split(model_data, model_data[[2]]) + outs <- lapply(model_data, function(d) { check_normality(stats::lm(d[[1]] ~ 1)) }) @@ -43,11 +54,11 @@ check_normality.htest <- function(x, ...) 
{ attributes(out) <- attributes(outs[[1]]) attr(out, "units") <- paste0("Group", seq_along(outs)) } else if (grepl("One-way analysis of means", method, fixed = TRUE)) { - m <- stats::aov(data[[1]] ~ factor(data[[2]])) + m <- stats::aov(model_data[[1]] ~ factor(model_data[[2]])) out <- check_normality(m) } else if (grepl("Pearson's product-moment correlation", method, fixed = TRUE)) { - out <- .MVN_hz(data)[["p value"]] + out <- .MVN_hz(model_data)[["p value"]] class(out) <- c("check_normality", "see_check_normality", "numeric") attr(out, "type") <- "residuals" } else if (grepl("Pearson's Chi-squared test", method, fixed = TRUE) || @@ -73,8 +84,8 @@ check_normality.htest <- function(x, ...) { #' @export check_homogeneity.htest <- function(x, ...) { - data <- insight::get_data(x) - if (is.null(data)) { + model_data <- insight::get_data(x) + if (is.null(model_data)) { insight::format_error( "Cannot check assumptions - Unable to retrieve data from `htest` object." ) @@ -88,11 +99,14 @@ check_homogeneity.htest <- function(x, ...) { if (grepl("Two Sample t-test", method, fixed = TRUE)) { m <- stats::lm( - formula = Value ~ factor(Name), - data = datawizard::data_to_long(data) + formula = value ~ factor(name), + data = datawizard::data_to_long(model_data) ) } else if (grepl("One-way analysis of means", method, fixed = TRUE)) { - m <- stats::aov(stats::reformulate(names(data)[2], response = names(data)[1]), data = data) + m <- stats::aov( + stats::reformulate(names(model_data)[2], response = names(model_data)[1]), + data = model_data + ) } else { insight::format_error( "This `htest` is not supported (or this assumption is not required for this test)." @@ -109,8 +123,8 @@ check_homogeneity.htest <- function(x, ...) { #' @export check_symmetry.htest <- function(x, ...) 
{ - data <- insight::get_data(x) - if (is.null(data)) { + model_data <- insight::get_data(x) + if (is.null(model_data)) { insight::format_error( "Cannot check assumptions - Unable to retrieve data from `htest` object." ) @@ -118,10 +132,10 @@ check_symmetry.htest <- function(x, ...) { method <- x[["method"]] if (grepl("signed rank", method, fixed = TRUE)) { - if (ncol(data) > 1) { - out <- check_symmetry(data[[1]] - data[[2]]) + if (ncol(model_data) > 1) { + out <- check_symmetry(model_data[[1]] - model_data[[2]]) } else { - out <- check_symmetry(data[[1]]) + out <- check_symmetry(model_data[[1]]) } } else { insight::format_error( @@ -157,7 +171,7 @@ print.check_normality_binom <- function(x, ...) { "Warning: Some cells in the expected table have less than 5 observations.\n" ), "red") } - return(invisible(x)) + invisible(x) } @@ -180,7 +194,7 @@ print.check_normality_binom <- function(x, ...) { dif <- scale(data, scale = FALSE) Dj <- diag(dif %*% solve(S, tol = tol) %*% t(dif)) Y <- data %*% solve(S, tol = tol) %*% t(data) - Djk <- -2 * t(Y) + matrix(diag(t(Y))) %*% matrix(c(rep(1, n)), 1, n) + matrix(c(rep(1, n)), n, 1) %*% diag(t(Y)) + Djk <- -2 * t(Y) + matrix(diag(t(Y))) %*% matrix(rep(1, n), 1, n) + matrix(rep(1, n), n, 1) %*% diag(t(Y)) b <- 1 / (sqrt(2)) * ((2 * p + 1) / 4)^(1 / (p + 4)) * (n^(1 / (p + 4))) if (qr(S)$rank == p) { HZ <- n * (1 / (n^2) * sum(sum(exp(-(b^2) / 2 * Djk))) - 2 * ((1 + (b^2))^(-p / 2)) * (1 / n) * (sum(exp(-((b^2) / (2 * (1 + (b^2)))) * Dj))) + ((1 + (2 * (b^2)))^(-p / 2))) diff --git a/R/check_itemscale.R b/R/check_itemscale.R index 5aa704618..2dff6c5da 100644 --- a/R/check_itemscale.R +++ b/R/check_itemscale.R @@ -2,11 +2,14 @@ #' @name check_itemscale #' #' @description Compute various measures of internal consistencies -#' applied to (sub)scales, which items were extracted using -#' `parameters::principal_components()`. +#' applied to (sub)scales, which items were extracted using +#' `parameters::principal_components()`. 
#' #' @param x An object of class `parameters_pca`, as returned by -#' [`parameters::principal_components()`]. +#' [`parameters::principal_components()`], or a data frame. +#' @param factor_index If `x` is a data frame, `factor_index` must be specified. +#' It must be a numeric vector of same length as number of columns in `x`, where +#' each element is the index of the factor to which the respective column in `x` belongs. #' #' @return A list of data frames, with related measures of internal #' consistencies of each subscale. @@ -48,21 +51,58 @@ #' X <- matrix(rnorm(1600), 100, 16) #' Z <- X %*% C #' -#' pca <- principal_components(as.data.frame(Z), rotation = "varimax", n = 3) +#' pca <- parameters::principal_components( +#' as.data.frame(Z), +#' rotation = "varimax", +#' n = 3 +#' ) #' pca #' check_itemscale(pca) +#' +#' # as data frame +#' check_itemscale( +#' as.data.frame(Z), +#' factor_index = parameters::closest_component(pca) +#' ) #' @export -check_itemscale <- function(x) { - if (!inherits(x, "parameters_pca")) { +check_itemscale <- function(x, factor_index = NULL) { + # check for valid input + if (!inherits(x, c("parameters_pca", "data.frame"))) { insight::format_error( - "`x` must be an object of class `parameters_pca`, as returned by `parameters::principal_components()`." + "`x` must be an object of class `parameters_pca`, as returned by `parameters::principal_components()`, or a data frame."
# nolint ) } - insight::check_if_installed("parameters") + # if data frame, we need `factor_index` + if (inherits(x, "data.frame") && !inherits(x, "parameters_pca")) { + if (is.null(factor_index)) { + insight::format_error("If `x` is a data frame, `factor_index` must be specified.") + } + if (!is.numeric(factor_index)) { + insight::format_error("`factor_index` must be numeric.") + } + if (length(factor_index) != ncol(x)) { + insight::format_error( + "`factor_index` must be of same length as number of columns in `x`.", + "Each element of `factor_index` must be the index of the factor to which the respective column in `x` belongs to." # nolint + ) + } + } - dataset <- attributes(x)$dataset - subscales <- parameters::closest_component(x) + # factor_index must be a named vector (column names as names) + if (!is.null(factor_index) && is.null(names(factor_index)) && !inherits(x, "parameters_pca")) { + factor_index <- stats::setNames(factor_index, colnames(x)) + } + + # assign data and factor index + if (inherits(x, "parameters_pca")) { + insight::check_if_installed("parameters") + dataset <- attributes(x)$dataset + subscales <- parameters::closest_component(x) + } else { + dataset <- x + subscales <- factor_index + } out <- lapply(sort(unique(subscales)), function(.subscale) { columns <- names(subscales)[subscales == .subscale] @@ -123,3 +163,26 @@ print.check_itemscale <- function(x, digits = 2, ...) { zap_small = TRUE )) } + + +#' @export +print_html.check_itemscale <- function(x, digits = 2, ...) 
{ + x <- lapply(seq_along(x), function(i) { + out <- x[[i]] + attr(out, "table_caption") <- sprintf( + "Component %i: Mean inter-item-correlation = %.3f, Cronbach's alpha = %.3f", + i, + attributes(out)$item_intercorrelation, + attributes(out)$cronbachs_alpha + ) + out + }) + insight::export_table( + x, + caption = "Description of (Sub-)Scales", + digits = digits, + format = "html", + missing = "", + zap_small = TRUE + ) +} diff --git a/R/check_model.R b/R/check_model.R index bbf6c6a84..daa1255fc 100644 --- a/R/check_model.R +++ b/R/check_model.R @@ -9,42 +9,54 @@ #' #' @param x A model object. #' @param dot_size,line_size Size of line and dot-geoms. +#' @param base_size,title_size,axis_title_size Base font size for axis and plot titles. #' @param panel Logical, if `TRUE`, plots are arranged as panels; else, #' single plots for each diagnostic are returned. #' @param check Character vector, indicating which checks for should be performed -#' and plotted. May be one or more of `"all"`, `"vif"`, `"qq"`, `"normality"`, -#' `"linearity"`, `"ncv"`, `"homogeneity"`, `"outliers"`, `"reqq"`, `"pp_check"`, -#' `"binned_residuals"` or `"overdispersion"`, Not that not all check apply -#' to all type of models (see 'Details'). `"reqq"` is a QQ-plot for random -#' effects and only available for mixed models. `"ncv"` is an alias for -#' `"linearity"`, and checks for non-constant variance, i.e. for -#' heteroscedasticity, as well as the linear relationship. By default, all -#' possible checks are performed and plotted. +#' and plotted. May be one or more of `"all"`, `"vif"`, `"qq"`, `"normality"`, +#' `"linearity"`, `"ncv"`, `"homogeneity"`, `"outliers"`, `"reqq"`, `"pp_check"`, +#' `"binned_residuals"` or `"overdispersion"`. Note that not all check apply +#' to all type of models (see 'Details'). `"reqq"` is a QQ-plot for random +#' effects and only available for mixed models. `"ncv"` is an alias for +#' `"linearity"`, and checks for non-constant variance, i.e. 
for +#' heteroscedasticity, as well as the linear relationship. By default, all +#' possible checks are performed and plotted. #' @param alpha,dot_alpha The alpha level of the confidence bands and dot-geoms. -#' Scalar from 0 to 1. +#' Scalar from 0 to 1. #' @param colors Character vector with color codes (hex-format). Must be of -#' length 3. First color is usually used for reference lines, second color -#' for dots, and third color for outliers or extreme values. +#' length 3. First color is usually used for reference lines, second color +#' for dots, and third color for outliers or extreme values. #' @param theme String, indicating the name of the plot-theme. Must be in the -#' format `"package::theme_name"` (e.g. `"ggplot2::theme_minimal"`). +#' format `"package::theme_name"` (e.g. `"ggplot2::theme_minimal"`). #' @param detrend Logical. Should Q-Q/P-P plots be detrended? Defaults to -#' `TRUE`. +#' `TRUE` for linear models or when `residual_type = "normal"`. Defaults to +#' `FALSE` for QQ plots based on simulated residuals (i.e. when +#' `residual_type = "simulated"`). +#' @param residual_type Character, indicating the type of residuals to be used. +#' For non-Gaussian models, the default is `"simulated"`, which uses simulated +#' residuals. These are based on [`simulate_residuals()`] and thus uses the +#' **DHARMa** package to return randomized quantile residuals. For Gaussian +#' models, the default is `"normal"`, which uses the default residuals from +#' the model. Setting `residual_type = "normal"` for non-Gaussian models will +#' use a half-normal Q-Q plot of the absolute value of the standardized deviance +#' residuals. #' @param show_dots Logical, if `TRUE`, will show data points in the plot. Set -#' to `FALSE` for models with many observations, if generating the plot is too -#' time-consuming. By default, `show_dots = NULL`. 
In this case `check_model()` -#' tries to guess whether performance will be poor due to a very large model -#' and thus automatically shows or hides dots. +#' to `FALSE` for models with many observations, if generating the plot is too +#' time-consuming. By default, `show_dots = NULL`. In this case `check_model()` +#' tries to guess whether performance will be poor due to a very large model +#' and thus automatically shows or hides dots. #' @param verbose If `FALSE` (default), suppress most warning messages. -#' @param ... Currently not used. +#' @param ... Arguments passed down to the individual check functions, especially +#' to `check_predictions()` and `binned_residuals()`. #' @inheritParams check_predictions #' #' @return The data frame that is used for plotting. #' #' @note This function just prepares the data for plotting. To create the plots, -#' **see** needs to be installed. Furthermore, this function suppresses -#' all possible warnings. In case you observe suspicious plots, please refer -#' to the dedicated functions (like `check_collinearity()`, -#' `check_normality()` etc.) to get informative messages and warnings. +#' **see** needs to be installed. Furthermore, this function suppresses +#' all possible warnings. In case you observe suspicious plots, please refer +#' to the dedicated functions (like `check_collinearity()`, +#' `check_normality()` etc.) to get informative messages and warnings. #' #' @details For Bayesian models from packages **rstanarm** or **brms**, #' models will be "converted" to their frequentist counterpart, using @@ -102,10 +114,20 @@ #' normally distributed. Usually, dots should fall along the line. If there is #' some deviation (mostly at the tails), this indicates that the model doesn't #' predict the outcome well for that range that shows larger deviations from -#' the line. 
For generalized linear models, a half-normal Q-Q plot of the -#' absolute value of the standardized deviance residuals is shown, however, the -#' interpretation of the plot remains the same. See [`check_normality()`] for -#' further details. +#' the line. For generalized linear models and when `residual_type = "normal"`, +#' a half-normal Q-Q plot of the absolute value of the standardized deviance +#' residuals is shown, however, the interpretation of the plot remains the same. +#' See [`check_normality()`] for further details. Usually, for generalized linear +#' (mixed) models, a test for uniformity of residuals based on simulated residuals +#' is conducted (see next section). +#' +#' @section Uniformity of Residuals: +#' For non-Gaussian models, when `residual_type = "simulated"` (the default +#' for generalized linear (mixed) models), residuals are not expected to be +#' normally distributed. In this case, the created Q-Q plot checks the uniformity +#' of residuals. The interpretation of the plot is the same as for the normal +#' Q-Q plot. See [`simulate_residuals()`] and [`check_residuals()`] for further +#' details. #' #' @section Overdispersion: #' For count models, an *overdispersion plot* is shown. Overdispersion occurs @@ -123,12 +145,13 @@ #' inside the error bounds. See [`binned_residuals()`] for further details. #' #' @section Residuals for (Generalized) Linear Models: -#' Plots that check the normality of residuals (QQ-plot) or the homogeneity of -#' variance use standardized Pearson's residuals for generalized linear models, -#' and standardized residuals for linear models. The plots for the normality of -#' residuals (with overlayed normal curve) and for the linearity assumption use -#' the default residuals for `lm` and `glm` (which are deviance -#' residuals for `glm`). +#' Plots that check the homogeneity of variance use standardized Pearson's +#' residuals for generalized linear models, and standardized residuals for +#' linear models.
The plots for the normality of residuals (with overlayed +#' normal curve) and for the linearity assumption use the default residuals +#' for `lm` and `glm` (which are deviance residuals for `glm`). The Q-Q plots +#' use simulated residuals (see [`simulate_residuals()`]) for non-Gaussian +#' models and standardized residuals for linear models. #' #' @section Troubleshooting: #' For models with many observations, or for more complex models in general, @@ -161,18 +184,22 @@ check_model <- function(x, ...) { #' @rdname check_model #' @export check_model.default <- function(x, - dot_size = 2, - line_size = 0.8, panel = TRUE, check = "all", + detrend = TRUE, + bandwidth = "nrd", + type = "density", + residual_type = NULL, + show_dots = NULL, + dot_size = 2, + line_size = 0.8, + title_size = 12, + axis_title_size = base_size, + base_size = 10, alpha = 0.2, dot_alpha = 0.8, colors = c("#3aaf85", "#1b6ca8", "#cd201f"), theme = "see::theme_lucid", - detrend = TRUE, - show_dots = NULL, - bandwidth = "nrd", - type = "density", verbose = FALSE, ...) 
{ # check model formula @@ -182,27 +209,58 @@ check_model.default <- function(x, minfo <- insight::model_info(x, verbose = FALSE) - ca <- tryCatch( - { - if (minfo$is_bayesian) { - suppressWarnings(.check_assumptions_stan(x)) - } else if (minfo$is_linear) { - suppressWarnings(.check_assumptions_linear(x, minfo, verbose)) - } else { - suppressWarnings(.check_assumptions_glm(x, minfo, verbose)) - } + # set default for residual_type + if (is.null(residual_type)) { + residual_type <- ifelse(minfo$is_linear && !minfo$is_gam, "normal", "simulated") + } + + # catch models/families not supported by DHARMa - we need to add more + # exceptions here as they appear, but for now, `check_model()` also + # automatically falls back to normal Q-Q plot for all models not supported + # by DHARMa + if (minfo$family %in% c("quasipoisson", "quasibinomial")) { + residual_type <- "normal" + } + + # set default for detrend + if (missing(detrend)) { + detrend <- residual_type == "normal" + } + + assumptions_data <- tryCatch( + if (minfo$is_bayesian) { + suppressWarnings(.check_assumptions_stan(x, ...)) + } else if (minfo$is_linear) { + suppressWarnings(.check_assumptions_linear(x, minfo, check, residual_type, verbose, ...)) + } else { + suppressWarnings(.check_assumptions_glm(x, minfo, check, residual_type, verbose, ...)) }, error = function(e) { - NULL + e } ) - if (is.null(ca)) { - insight::format_error(paste0("`check_model()` not implemented for models of class `", class(x)[1], "` yet.")) + if (inherits(assumptions_data, c("error", "simpleError"))) { + pattern <- "(\n|\\s{2,})" + replacement <- " " + cleaned_string <- gsub(pattern, replacement, assumptions_data$message) + insight::format_error( + paste("`check_model()` returned following error:", cleaned_string), + paste0("\nIf the error message does not help identifying your problem, another reason why `check_model()` failed might be that models of class `", class(x)[1], "` are not yet supported.") # nolint + ) + } + + # did Q-Q plot 
work with simulated residuals? + if (is.null(assumptions_data$QQ) && residual_type == "simulated") { + insight::format_alert(paste0( + "Cannot simulate residuals for models of class `", + class(x)[1], + "`. Please try `check_model(..., residual_type = \"normal\")` instead." + )) } # try to find sensible default for "type" argument - suggest_dots <- (minfo$is_bernoulli || minfo$is_count || minfo$is_ordinal || minfo$is_categorical || minfo$is_multinomial) + suggest_dots <- (minfo$is_bernoulli || minfo$is_count || minfo$is_ordinal || minfo$is_categorical || minfo$is_multinomial) # nolint if (missing(type) && suggest_dots) { type <- "discrete_interval" } @@ -213,21 +271,25 @@ check_model.default <- function(x, show_dots <- is.null(n) || n <= 1e5 } - attr(ca, "panel") <- panel - attr(ca, "dot_size") <- dot_size - attr(ca, "line_size") <- line_size - attr(ca, "check") <- check - attr(ca, "alpha") <- alpha - attr(ca, "dot_alpha") <- dot_alpha - attr(ca, "show_dots") <- isTRUE(show_dots) - attr(ca, "detrend") <- detrend - attr(ca, "colors") <- colors - attr(ca, "theme") <- theme - attr(ca, "model_info") <- minfo - attr(ca, "overdisp_type") <- list(...)$plot_type - attr(ca, "bandwidth") <- bandwidth - attr(ca, "type") <- type - ca + attr(assumptions_data, "panel") <- panel + attr(assumptions_data, "dot_size") <- dot_size + attr(assumptions_data, "line_size") <- line_size + attr(assumptions_data, "base_size") <- base_size + attr(assumptions_data, "axis_title_size") <- axis_title_size + attr(assumptions_data, "title_size") <- title_size + attr(assumptions_data, "check") <- check + attr(assumptions_data, "alpha") <- alpha + attr(assumptions_data, "dot_alpha") <- dot_alpha + attr(assumptions_data, "show_dots") <- isTRUE(show_dots) + attr(assumptions_data, "detrend") <- detrend + attr(assumptions_data, "colors") <- colors + attr(assumptions_data, "theme") <- theme + attr(assumptions_data, "model_info") <- minfo + attr(assumptions_data, "overdisp_type") <- list(...)$plot_type + 
attr(assumptions_data, "bandwidth") <- bandwidth + attr(assumptions_data, "type") <- type + attr(assumptions_data, "model_class") <- class(x)[1] + assumptions_data } @@ -254,19 +316,23 @@ plot.check_model <- function(x, ...) { #' @export check_model.stanreg <- function(x, - dot_size = 2, - line_size = 0.8, panel = TRUE, check = "all", + detrend = TRUE, + bandwidth = "nrd", + type = "density", + residual_type = NULL, + show_dots = NULL, + dot_size = 2, + line_size = 0.8, + title_size = 12, + axis_title_size = base_size, + base_size = 10, alpha = 0.2, dot_alpha = 0.8, colors = c("#3aaf85", "#1b6ca8", "#cd201f"), theme = "see::theme_lucid", - detrend = FALSE, - show_dots = NULL, - bandwidth = "nrd", - type = "density", - verbose = TRUE, + verbose = FALSE, ...) { check_model(bayestestR::bayesian_as_frequentist(x), dot_size = dot_size, @@ -277,10 +343,13 @@ check_model.stanreg <- function(x, dot_alpha = dot_alpha, colors = colors, theme = theme, + base_size = base_size, + axis_title_size = axis_title_size, detrend = detrend, show_dots = show_dots, bandwidth = bandwidth, type = type, + residual_type = residual_type, verbose = verbose, ... ) @@ -293,19 +362,23 @@ check_model.brmsfit <- check_model.stanreg #' @export check_model.model_fit <- function(x, - dot_size = 2, - line_size = 0.8, panel = TRUE, check = "all", + detrend = TRUE, + bandwidth = "nrd", + type = "density", + residual_type = NULL, + show_dots = NULL, + dot_size = 2, + line_size = 0.8, + title_size = 12, + axis_title_size = base_size, + base_size = 10, alpha = 0.2, dot_alpha = 0.8, colors = c("#3aaf85", "#1b6ca8", "#cd201f"), theme = "see::theme_lucid", - detrend = FALSE, - show_dots = NULL, - bandwidth = "nrd", - type = "density", - verbose = TRUE, + verbose = FALSE, ...) 
{ check_model( x$fit, @@ -314,39 +387,122 @@ check_model.model_fit <- function(x, panel = panel, check = check, alpha = alpha, + axis_title_size = axis_title_size, dot_alpha = dot_alpha, colors = colors, theme = theme, + base_size = base_size, detrend = detrend, show_dots = show_dots, bandwidth = bandwidth, type = type, + residual_type = residual_type, verbose = verbose, ... ) } +#' @export +check_model.performance_simres <- function(x, + panel = TRUE, + check = "all", + detrend = TRUE, + bandwidth = "nrd", + type = "density", + residual_type = NULL, + show_dots = NULL, + dot_size = 2, + line_size = 0.8, + title_size = 12, + axis_title_size = base_size, + base_size = 10, + alpha = 0.2, + dot_alpha = 0.8, + colors = c("#3aaf85", "#1b6ca8", "#cd201f"), + theme = "see::theme_lucid", + verbose = FALSE, + ...) { + check_model( + x$fittedModel, + dot_size = dot_size, + line_size = line_size, + panel = panel, + check = check, + alpha = alpha, + dot_alpha = dot_alpha, + axis_title_size = axis_title_size, + colors = colors, + theme = theme, + base_size = base_size, + detrend = detrend, + show_dots = show_dots, + bandwidth = bandwidth, + type = type, + residual_type = "simulated", + verbose = verbose, + ... + ) +} + +#' @export +check_model.DHARMa <- check_model.performance_simres + + # compile plots for checks of linear models ------------------------ -.check_assumptions_linear <- function(model, model_info, verbose = TRUE) { +.check_assumptions_linear <- function(model, model_info, check = "all", residual_type = "normal", verbose = TRUE, ...) 
{ dat <- list() - dat$VIF <- .diag_vif(model, verbose = verbose) - dat$QQ <- .diag_qq(model, verbose = verbose) - dat$REQQ <- .diag_reqq(model, level = 0.95, model_info = model_info, verbose = verbose) - dat$NORM <- .diag_norm(model, verbose = verbose) - dat$NCV <- .diag_ncv(model, verbose = verbose) - dat$HOMOGENEITY <- .diag_homogeneity(model, verbose = verbose) - dat$OUTLIERS <- check_outliers(model, method = "cook") - if (!is.null(dat$OUTLIERS)) { - threshold <- attributes(dat$OUTLIERS)$threshold$cook - } else { - threshold <- NULL + # multicollinearity -------------- + if (any(c("all", "vif") %in% check)) { + dat$VIF <- .diag_vif(model, verbose = verbose) + } + + # Q-Q plot (normality/uniformity of residuals) -------------- + if (any(c("all", "qq") %in% check)) { + dat$QQ <- switch(residual_type, + simulated = .safe(simulate_residuals(model, ...)), + .diag_qq(model, model_info = model_info, verbose = verbose) + ) + } + + # Random Effects Q-Q plot (normality of BLUPs) -------------- + if (any(c("all", "reqq") %in% check)) { + dat$REQQ <- .diag_reqq(model, level = 0.95, model_info = model_info, verbose = verbose) + } + + # normal-curve plot (normality of residuals) -------------- + if (any(c("all", "normality") %in% check)) { + dat$NORM <- .diag_norm(model, verbose = verbose) + } + + # non-constant variance (heteroskedasticity, linearity) -------------- + if (any(c("all", "ncv", "linearity") %in% check)) { + dat$NCV <- .diag_ncv(model, verbose = verbose) + } + + # homogeneity of variance -------------- + if (any(c("all", "homogeneity") %in% check)) { + dat$HOMOGENEITY <- .diag_homogeneity(model, verbose = verbose) + } + + # outliers -------------- + if (any(c("all", "outliers") %in% check)) { + dat$OUTLIERS <- .safe(check_outliers(model, method = "cook")) + if (is.null(dat$OUTLIERS)) { + threshold <- NULL + } else { + threshold <- attributes(dat$OUTLIERS)$threshold$cook + } + dat$INFLUENTIAL <- .influential_obs(model, threshold = threshold) + } + + # posterior
predictive checks -------------- + if (any(c("all", "pp_check") %in% check)) { + dat$PP_CHECK <- .safe(check_predictions(model, ...)) } - dat$INFLUENTIAL <- .influential_obs(model, threshold = threshold) - dat$PP_CHECK <- .safe(check_predictions(model)) dat <- insight::compact_list(dat) class(dat) <- c("check_model", "see_check_model") @@ -357,25 +513,55 @@ check_model.model_fit <- function(x, # compile plots for checks of generalized linear models ------------------------ -.check_assumptions_glm <- function(model, model_info, verbose = TRUE) { +.check_assumptions_glm <- function(model, model_info, check = "all", residual_type = "simulated", verbose = TRUE, ...) { dat <- list() - dat$VIF <- .diag_vif(model, verbose = verbose) - dat$QQ <- .diag_qq(model, verbose = verbose) - dat$HOMOGENEITY <- .diag_homogeneity(model, verbose = verbose) - dat$REQQ <- .diag_reqq(model, level = 0.95, model_info = model_info, verbose = verbose) - dat$OUTLIERS <- check_outliers(model, method = "cook") - if (!is.null(dat$OUTLIERS)) { - threshold <- attributes(dat$OUTLIERS)$threshold$cook - } else { - threshold <- NULL + # multicollinearity -------------- + if (any(c("all", "vif") %in% check)) { + dat$VIF <- .diag_vif(model, verbose = verbose) + } + + # Q-Q plot (normality/uniformity of residuals) -------------- + if (any(c("all", "qq") %in% check)) { + dat$QQ <- switch(residual_type, + simulated = .safe(simulate_residuals(model, ...)), + .diag_qq(model, model_info = model_info, verbose = verbose) + ) + } + + # homogeneity of variance -------------- + if (any(c("all", "homogeneity") %in% check)) { + dat$HOMOGENEITY <- .diag_homogeneity(model, verbose = verbose) + } + + # Random Effects Q-Q plot (normality of BLUPs) -------------- + if (any(c("all", "reqq") %in% check)) { + dat$REQQ <- .diag_reqq(model, level = 0.95, model_info = model_info, verbose = verbose) + } + + # outliers -------------- + if (any(c("all", "outliers") %in% check)) { + dat$OUTLIERS <- .safe(check_outliers(model, 
method = "cook")) + if (is.null(dat$OUTLIERS)) { + threshold <- NULL + } else { + threshold <- attributes(dat$OUTLIERS)$threshold$cook + } + dat$INFLUENTIAL <- .influential_obs(model, threshold = threshold) + } + + # posterior predictive checks -------------- + if (any(c("all", "pp_check") %in% check)) { + dat$PP_CHECK <- .safe(check_predictions(model, ...)) } - dat$INFLUENTIAL <- .influential_obs(model, threshold = threshold) - dat$PP_CHECK <- .safe(check_predictions(model)) - if (isTRUE(model_info$is_binomial)) { - dat$BINNED_RESID <- binned_residuals(model) + + # binned residuals for bernoulli/binomial -------------- + if (isTRUE(model_info$is_binomial) && any(c("all", "binned_residuals") %in% check)) { + dat$BINNED_RESID <- .safe(binned_residuals(model, verbose = verbose, ...)) } - if (isTRUE(model_info$is_count)) { + + # misspecified dispersion and zero-inflation -------------- + if (isTRUE(model_info$is_count) && any(c("all", "overdispersion") %in% check)) { dat$OVERDISPERSION <- .diag_overdispersion(model) } @@ -388,7 +574,7 @@ check_model.model_fit <- function(x, # compile plots for checks of Bayesian models ------------------------ -.check_assumptions_stan <- function(model) { +.check_assumptions_stan <- function(model, ...) 
{ if (inherits(model, "brmsfit")) { # check if brms can be loaded diff --git a/R/check_model_diagnostics.R b/R/check_model_diagnostics.R index 46a64f9c2..431a2bc0f 100644 --- a/R/check_model_diagnostics.R +++ b/R/check_model_diagnostics.R @@ -35,11 +35,13 @@ # prepare data for QQ plot ---------------------------------- -.diag_qq <- function(model, verbose = TRUE) { - if (inherits(model, c("lme", "lmerMod", "merMod", "glmmTMB", "gam"))) { +.diag_qq <- function(model, model_info = NULL, verbose = TRUE) { + if (inherits(model, c("lme", "lmerMod", "merMod", "gam"))) { res_ <- stats::residuals(model) } else if (inherits(model, "geeglm")) { res_ <- stats::residuals(model, type = "pearson") + } else if (inherits(model, "glmmTMB")) { + res_ <- stats::residuals(model, type = "deviance") } else if (inherits(model, "glm")) { res_ <- .safe(abs(stats::rstandard(model, type = "deviance"))) } else { @@ -49,25 +51,36 @@ } } - if (is.null(res_)) { + if (is.null(res_) || all(is.na(res_))) { if (verbose) { + if (is.null(model_info$family)) { + fam <- "model" + } else { + fam <- paste0("`", model_info$family, "`") + } insight::format_alert( - sprintf( - "QQ plot could not be created. Cannot extract residuals from objects of class `%s`.", - class(model)[1] + paste( + sprintf( + "QQ plot could not be created. 
Cannot extract residuals from objects of class `%s`.", + class(model)[1] + ), + sprintf( + "Maybe the model class or the %s family does not support the computation of (deviance) residuals?", + fam + ) ) ) } return(NULL) } - if (inherits(model, "glm")) { + if (inherits(model, c("glm", "glmerMod")) || (inherits(model, "glmmTMB") && isFALSE(model_info$is_linear))) { fitted_ <- stats::qnorm((stats::ppoints(length(res_)) + 1) / 2) } else { fitted_ <- stats::fitted(model) } - # sanity check, sometimes either residuals or fitted can contain NA, see #488 + # validation check, sometimes either residuals or fitted can contain NA, see #488 if (anyNA(res_) || anyNA(fitted_)) { # drop NA and make sure both fitted and residuals match non_na <- !is.na(fitted_) & !is.na(res_) @@ -85,38 +98,34 @@ # prepare data for random effects QQ plot ---------------------------------- -.diag_reqq <- function(model, level = 0.95, model_info, verbose = TRUE) { +.diag_reqq <- function(model, level = 0.95, model_info = NULL, verbose = TRUE) { # check if we have mixed model - if (!model_info$is_mixed) { + if (is.null(model_info) || !model_info$is_mixed) { return(NULL) } insight::check_if_installed("lme4") tryCatch( - { - if (inherits(model, "glmmTMB")) { - var_attr <- "condVar" - re <- .collapse_cond(lme4::ranef(model, condVar = TRUE)) - } else { - var_attr <- "postVar" - re <- lme4::ranef(model, condVar = TRUE) - } + if (inherits(model, "glmmTMB")) { + var_attr <- "condVar" + re <- .collapse_cond(lme4::ranef(model, condVar = TRUE)) + } else { + var_attr <- "postVar" + re <- lme4::ranef(model, condVar = TRUE) }, error = function(e) { - return(NULL) + NULL } ) se <- tryCatch( - { - suppressWarnings(lapply(re, function(.x) { - pv <- attr(.x, var_attr, exact = TRUE) - cols <- seq_len(dim(pv)[1]) - unlist(lapply(cols, function(.y) sqrt(pv[.y, .y, ]))) - })) - }, + suppressWarnings(lapply(re, function(.x) { + pv <- attr(.x, var_attr, exact = TRUE) + cols <- seq_len(dim(pv)[1]) + unlist(lapply(cols, 
function(.y) sqrt(pv[.y, .y, ]))) + })), error = function(e) { NULL } @@ -156,7 +165,10 @@ r <- try(as.numeric(stats::residuals(model)), silent = TRUE) if (inherits(r, "try-error")) { - insight::format_alert(sprintf("Non-normality of residuals could not be computed. Cannot extract residuals from objects of class '%s'.", class(model)[1])) + insight::format_alert(sprintf( + "Non-normality of residuals could not be computed. Cannot extract residuals from objects of class '%s'.", + class(model)[1] + )) return(NULL) } @@ -183,15 +195,15 @@ n_params <- tryCatch(model$rank, error = function(e) insight::n_parameters(model)) infl <- stats::influence(model, do.coef = FALSE) - resid <- as.numeric(insight::get_residuals(model)) + model_resid <- as.numeric(insight::get_residuals(model)) - std_resid <- tryCatch(stats::rstandard(model, infl), error = function(e) resid) + std_resid <- tryCatch(stats::rstandard(model, infl), error = function(e) model_resid) plot_data <- data.frame( Hat = infl$hat, Cooks_Distance = stats::cooks.distance(model, infl), Fitted = insight::get_predicted(model, ci = NULL), - Residuals = resid, + Residuals = model_resid, Std_Residuals = std_resid, stringsAsFactors = FALSE ) @@ -210,12 +222,10 @@ .diag_ncv <- function(model, verbose = TRUE) { ncv <- tryCatch( - { - data.frame( - x = as.numeric(stats::fitted(model)), - y = as.numeric(stats::residuals(model)) - ) - }, + data.frame( + x = as.numeric(stats::fitted(model)), + y = as.numeric(stats::residuals(model)) + ), error = function(e) { NULL } @@ -223,7 +233,10 @@ if (is.null(ncv)) { if (verbose) { - insight::format_alert(sprintf("Non-constant error variance could not be computed. Cannot extract residuals from objects of class '%s'.", class(model)[1])) + insight::format_alert(sprintf( + "Non-constant error variance could not be computed. 
Cannot extract residuals from objects of class '%s'.", + class(model)[1] + )) } return(NULL) } @@ -238,24 +251,22 @@ .diag_homogeneity <- function(model, verbose = TRUE) { faminfo <- insight::model_info(model) r <- tryCatch( - { - if (inherits(model, "merMod")) { - stats::residuals(model, scaled = TRUE) - } else if (inherits(model, "gam")) { - stats::residuals(model, type = "scaled.pearson") - } else if (inherits(model, c("glmmTMB", "MixMod"))) { - sigma <- if (faminfo$is_mixed) { - sqrt(insight::get_variance_residual(model)) - } else { - .sigma_glmmTMB_nonmixed(model, faminfo) - } - stats::residuals(model) / sigma - } else if (inherits(model, "glm")) { - ## TODO: check if we can / should use deviance residuals (as for QQ plots) here as well? - stats::rstandard(model, type = "pearson") + if (inherits(model, "merMod")) { + stats::residuals(model, scaled = TRUE) + } else if (inherits(model, "gam")) { + stats::residuals(model, type = "scaled.pearson") + } else if (inherits(model, c("glmmTMB", "MixMod"))) { + residual_sigma <- if (faminfo$is_mixed) { + sqrt(insight::get_variance_residual(model)) } else { - stats::rstandard(model) + .sigma_glmmTMB_nonmixed(model, faminfo) } + stats::residuals(model) / residual_sigma + } else if (inherits(model, "glm")) { + ## TODO: check if we can / should use deviance residuals (as for QQ plots) here as well? + stats::rstandard(model, type = "pearson") + } else { + stats::rstandard(model) }, error = function(e) { NULL @@ -264,7 +275,10 @@ if (is.null(r)) { if (verbose) { - insight::format_alert(sprintf("Homogeneity of variance could not be computed. Cannot extract residual variance from objects of class '%s'.", class(model)[1])) + insight::format_alert(sprintf( + "Homogeneity of variance could not be computed. 
Cannot extract residual variance from objects of class '%s'.", + class(model)[1] + )) } return(NULL) } @@ -279,7 +293,81 @@ # prepare data for homogeneity of variance plot ---------------------------------- -.diag_overdispersion <- function(model) { +.new_diag_overdispersion <- function(model, ...) { + faminfo <- insight::model_info(model) + + simres <- simulate_residuals(model, ...) + predicted <- simres$fittedPredictedResponse + d <- data.frame(Predicted = predicted) + + # residuals based on simulated residuals - but we want normally distributed residuals + d$Residuals <- stats::residuals(simres, quantile_function = stats::qnorm, ...) + d$Res2 <- d$Residuals^2 + d$StdRes <- insight::get_residuals(model, type = "pearson") + + # data for poisson models + if (faminfo$is_poisson && !faminfo$is_zero_inflated) { + d$V <- predicted + } + + # data for negative binomial models + if (faminfo$is_negbin && !faminfo$is_zero_inflated) { + if (inherits(model, "glmmTMB")) { + if (faminfo$family == "nbinom1") { + # for nbinom1, we can use "sigma()" + d$V <- insight::get_sigma(model)^2 * stats::family(model)$variance(predicted) + } else { + # for nbinom2, "sigma()" has "inverse meaning" (see #654) + d$V <- (1 / insight::get_sigma(model)^2) * stats::family(model)$variance(predicted) + } + } else { + ## FIXME: this is not correct for glm.nb models? 
+ d$V <- predicted * (1 + predicted / insight::get_sigma(model)) + } + } + + # data for zero-inflated poisson models + if (faminfo$is_poisson && faminfo$is_zero_inflated) { + if (inherits(model, "glmmTMB")) { + ptype <- "zprob" + } else { + ptype <- "zero" + } + d$Prob <- stats::predict(model, type = ptype) + d$V <- predicted * (1 - d$Prob) * (1 + predicted * d$Prob) + } + + # data for zero-inflated negative binomial models + if (faminfo$is_negbin && faminfo$is_zero_inflated && !faminfo$is_dispersion) { + if (inherits(model, "glmmTMB")) { + ptype <- "zprob" + } else { + ptype <- "zero" + } + d$Prob <- stats::predict(model, type = ptype) + d$Disp <- insight::get_sigma(model) + d$V <- predicted * (1 + predicted / d$Disp) * (1 - d$Prob) * (1 + predicted * (1 + predicted / d$Disp) * d$Prob) # nolint + } + + # data for zero-inflated negative binomial models with dispersion + if (faminfo$is_negbin && faminfo$is_zero_inflated && faminfo$is_dispersion) { + # do not re-create `d` here: Predicted, Residuals, Res2 and StdRes from above must be kept + if (inherits(model, "glmmTMB")) { + ptype <- "zprob" + } else { + ptype <- "zero" + } + d$Prob <- stats::predict(model, type = ptype) + d$Disp <- stats::predict(model, type = "disp") + d$V <- predicted * (1 + predicted / d$Disp) * (1 - d$Prob) * (1 + predicted * (1 + predicted / d$Disp) * d$Prob) # nolint + } + + d +} + + + +.diag_overdispersion <- function(model, ...) 
{ faminfo <- insight::model_info(model) # data for poisson models @@ -293,11 +381,26 @@ # data for negative binomial models if (faminfo$is_negbin && !faminfo$is_zero_inflated) { - d <- data.frame(Predicted = stats::predict(model, type = "response")) - d$Residuals <- insight::get_response(model) - as.vector(d$Predicted) - d$Res2 <- d$Residuals^2 - d$V <- d$Predicted * (1 + d$Predicted / insight::get_sigma(model)) - d$StdRes <- insight::get_residuals(model, type = "pearson") + if (inherits(model, "glmmTMB")) { + d <- data.frame(Predicted = stats::predict(model, type = "response")) + d$Residuals <- insight::get_residuals(model, type = "pearson") + d$Res2 <- d$Residuals^2 + d$StdRes <- insight::get_residuals(model, type = "pearson") + if (faminfo$family == "nbinom1") { + # for nbinom1, we can use "sigma()" + d$V <- insight::get_sigma(model)^2 * stats::family(model)$variance(d$Predicted) + } else { + # for nbinom2, "sigma()" has "inverse meaning" (see #654) + d$V <- (1 / insight::get_sigma(model)^2) * stats::family(model)$variance(d$Predicted) + } + } else { + ## FIXME: this is not correct for glm.nb models? 
+ d <- data.frame(Predicted = stats::predict(model, type = "response")) + d$Residuals <- insight::get_response(model) - as.vector(d$Predicted) + d$Res2 <- d$Residuals^2 + d$V <- d$Predicted * (1 + d$Predicted / insight::get_sigma(model)) + d$StdRes <- insight::get_residuals(model, type = "pearson") + } } # data for zero-inflated poisson models @@ -327,7 +430,7 @@ } d$Prob <- stats::predict(model, type = ptype) d$Disp <- insight::get_sigma(model) - d$V <- d$Predicted * (1 + d$Predicted / d$Disp) * (1 - d$Prob) * (1 + d$Predicted * (1 + d$Predicted / d$Disp) * d$Prob) + d$V <- d$Predicted * (1 + d$Predicted / d$Disp) * (1 - d$Prob) * (1 + d$Predicted * (1 + d$Predicted / d$Disp) * d$Prob) # nolint d$StdRes <- insight::get_residuals(model, type = "pearson") } @@ -343,7 +446,7 @@ } d$Prob <- stats::predict(model, type = ptype) d$Disp <- stats::predict(model, type = "disp") - d$V <- d$Predicted * (1 + d$Predicted / d$Disp) * (1 - d$Prob) * (1 + d$Predicted * (1 + d$Predicted / d$Disp) * d$Prob) + d$V <- d$Predicted * (1 + d$Predicted / d$Disp) * (1 - d$Prob) * (1 + d$Predicted * (1 + d$Predicted / d$Disp) * d$Prob) # nolint d$StdRes <- insight::get_residuals(model, type = "pearson") } @@ -351,7 +454,6 @@ } - # helpers ---------------------------------- .sigma_glmmTMB_nonmixed <- function(model, faminfo) { diff --git a/R/check_normality.R b/R/check_normality.R index 9dc00d03f..297c50179 100644 --- a/R/check_normality.R +++ b/R/check_normality.R @@ -17,6 +17,8 @@ #' [`plot()`-method](https://easystats.github.io/see/articles/performance.html) #' implemented in the [**see**-package](https://easystats.github.io/see/). #' +#' @seealso [`see::plot.see_check_normality()`] for options to customize the plot. +#' #' @details `check_normality()` calls `stats::shapiro.test` and checks the #' standardized residuals (or studentized residuals for mixed models) for #' normal distribution. 
Note that this formal test almost always yields @@ -58,7 +60,7 @@ check_normality.default <- function(x, ...) { if (!insight::model_info(x)$is_linear) { insight::format_alert( - "Checking normality of residuals is only appropriate for linear models." + "Checking normality of residuals is only appropriate for linear models. It is recommended to use `simulate_residuals()` and `check_residuals()` to check generalized linear (mixed) models for uniformity of residuals." # nolint ) return(NULL) } @@ -87,11 +89,28 @@ check_normality.glm <- function(x, ...) { insight::format_alert( "There's no formal statistical test for normality for generalized linear model.", - "Please use `plot()` on the return value of this function: `plot(check_normality(model))`" + "Instead, please use `simulate_residuals()` and `check_residuals()` to check for uniformity of residuals." ) invisible(out) } +# simulated residuals ---------- + +#' @export +check_normality.performance_simres <- function(x, ...) { + # check for normality of residuals + res <- stats::residuals(x, quantile_function = stats::qnorm) + p.val <- .check_normality(res[!is.infinite(res) & !is.na(res)], x) + + attr(p.val, "data") <- x + attr(p.val, "object_name") <- insight::safe_deparse_symbol(substitute(x)) + attr(p.val, "effects") <- "fixed" + class(p.val) <- unique(c("check_normality", "see_check_normality", class(p.val))) + + p.val +} + + # numeric ------------------- #' @export @@ -181,7 +200,7 @@ check_normality.merMod <- function(x, effects = c("fixed", "random"), ...) { # valid model? if (!info$is_linear && effects == "fixed") { insight::format_alert( - "Checking normality of residuals is only appropriate for linear models." + "Checking normality of residuals is only appropriate for linear models. It is recommended to use `simulate_residuals()` and `check_residuals()` to check generalized linear (mixed) models for uniformity of residuals." 
# nolint ) return(NULL) } @@ -200,7 +219,7 @@ check_normality.merMod <- function(x, effects = c("fixed", "random"), ...) { } }, error = function(e) { - return(NULL) + NULL } ) @@ -217,6 +236,8 @@ check_normality.merMod <- function(x, effects = c("fixed", "random"), ...) { attr(p.val, "type") <- "random effects" attr(p.val, "re_groups") <- re_groups } + } else if (inherits(x, "glmmTMB")) { + p.val <- .check_normality(stats::residuals(x, type = "deviance"), x) } else { # check for normality of residuals p.val <- .check_normality(stats::rstudent(x), x) @@ -260,7 +281,7 @@ check_normality.BFBayesFactor <- check_normality.afex_aov # helper --------------------- .check_normality <- function(x, model, type = "residuals") { - ts <- .safe({ + ts_result <- .safe({ if (length(x) >= 5000) { suppressWarnings(stats::ks.test(x, y = "pnorm", alternative = "two.sided")) } else { @@ -268,7 +289,7 @@ check_normality.BFBayesFactor <- check_normality.afex_aov } }) - if (is.null(ts)) { + if (is.null(ts_result)) { insight::print_color( sprintf("`check_normality()` does not support models of class `%s`.\n", class(model)[1]), "red" @@ -276,7 +297,7 @@ check_normality.BFBayesFactor <- check_normality.afex_aov return(NULL) } - out <- ts$p.value + out <- ts_result$p.value attr(out, "type") <- type out diff --git a/R/check_outliers.R b/R/check_outliers.R index e98e037c0..adb4165dc 100644 --- a/R/check_outliers.R +++ b/R/check_outliers.R @@ -12,7 +12,8 @@ #' by at least half of the methods). See the **Details** section below #' for a description of the methods. #' -#' @param x A model or a data.frame object. +#' @param x A model, a data.frame, a `performance_simres` [`simulate_residuals()`] +#' or a `DHARMa` object. #' @param method The outlier detection method(s). Can be `"all"` or some of #' `"cook"`, `"pareto"`, `"zscore"`, `"zscore_robust"`, `"iqr"`, `"ci"`, `"eti"`, #' `"hdi"`, `"bci"`, `"mahalanobis"`, `"mahalanobis_robust"`, `"mcd"`, `"ics"`, @@ -23,9 +24,19 @@ #' 'Details'). 
If a numeric value is given, it will be used as the threshold #' for any of the method run. #' @param ID Optional, to report an ID column along with the row number. +#' @param type Type of method to test for outliers. Can be one of `"default"`, +#' `"binomial"` or `"bootstrap"`. Only applies when `x` is an object returned +#' by `simulate_residuals()` or of class `DHARMa`. See 'Details' in +#' `?DHARMa::testOutliers` for a detailed description of the types. +#' @param verbose Toggle warnings. #' @param ... When `method = "ics"`, further arguments in `...` are passed #' down to [ICSOutlier::ics.outlier()]. When `method = "mahalanobis"`, -#' they are passed down to [stats::mahalanobis()]. +#' they are passed down to [stats::mahalanobis()]. `percentage_central` can +#' be specified when `method = "mcd"`. For objects of class `performance_simres` +#' or `DHARMa`, further arguments are passed down to `DHARMa::testOutliers()`. +#' +#' @inheritParams check_zeroinflation +#' @inheritParams simulate_residuals #' #' @return A logical vector of the detected outliers with a nice printing #' method: a check (message) on whether outliers were detected or not. The @@ -36,6 +47,8 @@ #' #' @family functions to check model assumptions and and assess model quality #' +#' @seealso [`see::plot.see_check_outliers()`] for options to customize the plot. +#' #' @note There is also a #' [`plot()`-method](https://easystats.github.io/see/articles/performance.html) #' implemented in the @@ -160,6 +173,9 @@ #' the data (by default, 66\%), before computing the Mahalanobis Distance. This #' is deemed to be a more robust method of identifying and removing outliers #' than regular Mahalanobis distance. +#' This method has a `percentage_central` argument that allows specifying +#' the breakdown point (0.75, the default, is recommended by Leys et al. 2018, +#' but a commonly used alternative is 0.50). 
#' #' - **Invariant Coordinate Selection (ICS)**: #' The outlier are detected using ICS, which by default uses an alpha threshold @@ -195,6 +211,17 @@ #' observations located at `qnorm(1-0.025) * SD)` of the log-transformed #' LOF distance. Requires the **dbscan** package. #' +#' @section Methods for simulated residuals: +#' +#' The approach for detecting outliers based on simulated residuals differs +#' from the traditional methods and may not detect outliers as expected. +#' Literally, this approach compares observed to simulated values. However, we +#' do not know the deviation of the observed data to the model expectation, and +#' thus, the term "outlier" should be taken with a grain of salt. It refers to +#' "simulation outliers". Basically, the comparison tests whether an observed +#' data point is outside the simulated range. It is strongly recommended to read +#' the related documentation in the **DHARMa** package, e.g. `?DHARMa::testOutliers`. +#' #' @section Threshold specification: #' #' Default thresholds are currently specified as follows: @@ -261,6 +288,10 @@ #' statistical models. Journal of Open Source Software, 6(60), 3139. #' \doi{10.21105/joss.03139} #' +#' - Thériault, R., Ben-Shachar, M. S., Patil, I., Lüdecke, D., Wiernik, B. M., +#' and Makowski, D. (2023). Check your outliers! An introduction to identifying +#' statistical outliers in R with easystats. \doi{10.31234/osf.io/bu6nt} +#' #' - Rousseeuw, P. J., and Van Zomeren, B. C. (1990). Unmasking multivariate #' outliers and leverage points. Journal of the American Statistical #' association, 85(411), 633-639. 
@@ -304,7 +335,7 @@ #' @examplesIf require("see") && require("bigutilsr") && require("loo") && require("MASS") && require("ICSOutlier") && require("ICS") && require("dbscan") #' \donttest{ #' # You can also run all the methods -#' check_outliers(data, method = "all") +#' check_outliers(data, method = "all", verbose = FALSE) #' #' # For statistical models --------------------------------------------- #' # select only mpg and disp (continuous) @@ -340,6 +371,7 @@ check_outliers.default <- function(x, method = c("cook", "pareto"), threshold = NULL, ID = NULL, + verbose = TRUE, ...) { # Check args if (all(method == "all")) { @@ -381,16 +413,31 @@ check_outliers.default <- function(x, ) # Get data - data <- insight::get_data(x, verbose = FALSE) + my_data <- insight::get_data(x, verbose = FALSE) + + # sanity check for date, POSIXt and difftime variables + if (any(vapply(my_data, inherits, FUN.VALUE = logical(1), what = c("Date", "POSIXt", "difftime"))) && verbose) { + insight::format_alert( + paste( + "Date variables are not supported for outliers detection. These will be ignored.", + "Make sure any date variables are converted to numeric or factor {.b before} fitting the model." + ) + ) + } # Remove non-numerics - data <- datawizard::data_select(data, select = is.numeric) + my_data <- datawizard::data_select(my_data, select = is.numeric, verbose = FALSE) + + # check if any data left + if (is.null(my_data) || ncol(my_data) == 0) { + insight::format_error("No numeric variables found. 
No data to check for outliers.") + } # Thresholds if (is.null(threshold)) { - thresholds <- .check_outliers_thresholds(data) + thresholds <- .check_outliers_thresholds(my_data) } else if (is.list(threshold)) { - thresholds <- .check_outliers_thresholds(data) + thresholds <- .check_outliers_thresholds(my_data) thresholds[names(threshold)] <- threshold[names(threshold)] } else { insight::format_error( @@ -401,21 +448,21 @@ check_outliers.default <- function(x, ) } - if (!missing(ID)) { + if (!missing(ID) && verbose) { insight::format_warning(paste0("ID argument not supported for model objects of class `", class(x)[1], "`.")) } # Others - if (!all(method %in% c("cook", "pareto"))) { - out <- check_outliers(data, method, threshold) - outlier_var <- attributes(out)$outlier_var - outlier_count <- attributes(out)$outlier_count - df <- attributes(out)$data - df <- df[!names(df) %in% "Outlier"] - } else { - df <- data.frame(Row = seq_len(nrow(as.data.frame(data)))) + if (all(method %in% c("cook", "pareto"))) { + my_df <- data.frame(Row = seq_len(nrow(as.data.frame(my_data)))) outlier_count <- list() outlier_var <- list() + } else { + out <- check_outliers(my_data, method, threshold) + outlier_var <- attributes(out)$outlier_var + outlier_count <- attributes(out)$outlier_count + my_df <- attributes(out)$data + my_df <- my_df[names(my_df) != "Outlier"] } # Cook @@ -425,7 +472,7 @@ check_outliers.default <- function(x, threshold = thresholds$cook )$data_cook - df <- datawizard::data_merge(list(df, data_cook), + my_df <- datawizard::data_merge(list(my_df, data_cook), join = "full", by = "Row" ) @@ -445,17 +492,17 @@ check_outliers.default <- function(x, outlier_count$cook <- count.table - if (!all(method %in% c("cook", "pareto"))) { + if (all(method %in% c("cook", "pareto"))) { + outlier_count$all <- count.table + } else { outlier_count$all <- datawizard::data_merge( list(outlier_count$all, count.table), join = "full", by = "Row" ) - } else { - outlier_count$all <- count.table } 
} else { - method <- method[!(method %in% "cook")] + method <- method[method != "cook"] } # Pareto @@ -465,7 +512,7 @@ check_outliers.default <- function(x, threshold = thresholds$pareto )$data_pareto - df <- datawizard::data_merge(list(df, data_pareto), + my_df <- datawizard::data_merge(list(my_df, data_pareto), join = "full", by = "Row" ) @@ -485,17 +532,17 @@ check_outliers.default <- function(x, outlier_count$pareto <- count.table - if (!all(method %in% c("cook", "pareto"))) { + if (all(method %in% c("cook", "pareto"))) { + outlier_count$all <- count.table + } else { outlier_count$all <- datawizard::data_merge( list(outlier_count$all, count.table), join = "full", by = "Row" ) - } else { - outlier_count$all <- count.table } } else { - method <- method[!(method %in% "pareto")] + method <- method[method != "pareto"] } outlier_count$all <- datawizard::convert_na_to(outlier_count$all, @@ -527,21 +574,21 @@ check_outliers.default <- function(x, thresholds <- thresholds[names(thresholds) %in% method] # Composite outlier score - df$Outlier <- rowMeans(df[grepl("Outlier_", names(df), fixed = TRUE)]) - df <- df[c(names(df)[names(df) != "Outlier"], "Outlier")] + my_df$Outlier <- rowMeans(my_df[grepl("Outlier_", names(my_df), fixed = TRUE)]) + my_df <- my_df[c(names(my_df)[names(my_df) != "Outlier"], "Outlier")] # Out - outlier <- df$Outlier > 0.5 + outlier <- my_df$Outlier > 0.5 # Attributes class(outlier) <- c("check_outliers", "see_check_outliers", class(outlier)) - attr(outlier, "data") <- df + attr(outlier, "data") <- my_df attr(outlier, "threshold") <- thresholds attr(outlier, "method") <- method attr(outlier, "text_size") <- 3 attr(outlier, "influential_obs") <- .influential_obs(x) attr(outlier, "variables") <- "(Whole model)" - attr(outlier, "raw_data") <- data + attr(outlier, "raw_data") <- my_data attr(outlier, "outlier_var") <- outlier_var attr(outlier, "outlier_count") <- outlier_count @@ -760,6 +807,28 @@ plot.check_outliers <- function(x, ...) 
{ NextMethod() } +#' @export +print.check_outliers_simres <- function(x, digits = 2, ...) { + result <- paste0( + insight::format_value(100 * x$Expected, digits = digits, ...), + "%, ", + insight::format_ci(100 * x$CI_low, 100 * x$CI_high, digits = digits, ...) + ) + insight::print_color("# Outliers detection\n\n", "blue") + cat(sprintf(" Proportion of observed outliers: %.*f%%\n", digits, 100 * x$Coefficient)) + cat(sprintf(" Proportion of expected outliers: %s\n\n", result)) + + p_string <- paste0(" (", insight::format_p(x$p_value), ")") + + if (x$p_value < 0.05) { + message("Outliers were detected", p_string, ".") + } else { + message("No outliers were detected", p_string, ".") + } + + invisible(x) +} + # other classes ------------------------- @@ -794,7 +863,7 @@ check_outliers.data.frame <- function(x, ) # Remove non-numerics - data <- x + my_data <- x x <- x[, vapply(x, is.numeric, logical(1)), drop = FALSE] # Check args @@ -837,20 +906,20 @@ check_outliers.data.frame <- function(x, outlier_var <- out.meta$outlier_var # Combine outlier data - df <- out[vapply(out, is.data.frame, logical(1))] - if (length(df) > 1 && !is.null(ID)) { - df <- datawizard::data_merge(df, by = c("Row", ID)) - } else if (length(df) > 1) { - df <- datawizard::data_merge(df, by = "Row") + my_df <- out[vapply(out, is.data.frame, logical(1))] + if (length(my_df) > 1 && !is.null(ID)) { + my_df <- datawizard::data_merge(my_df, by = c("Row", ID)) + } else if (length(my_df) > 1) { + my_df <- datawizard::data_merge(my_df, by = "Row") } else { - df <- df[[1]] + my_df <- my_df[[1]] } # Composite outlier score - df$Outlier <- rowMeans(df[grepl("Outlier_", names(df), fixed = TRUE)]) + my_df$Outlier <- rowMeans(my_df[grepl("Outlier_", names(my_df), fixed = TRUE)]) # Out - outlier <- df$Outlier > 0.5 + outlier <- my_df$Outlier > 0.5 # Combine outlier frequency table if (length(outlier_count) > 1 && !is.null(ID)) { @@ -892,12 +961,12 @@ check_outliers.data.frame <- function(x, # Attributes 
class(outlier) <- c("check_outliers", "see_check_outliers", class(outlier)) - attr(outlier, "data") <- df + attr(outlier, "data") <- my_df attr(outlier, "threshold") <- thresholds attr(outlier, "method") <- method attr(outlier, "text_size") <- 3 attr(outlier, "variables") <- names(x) - attr(outlier, "raw_data") <- data + attr(outlier, "raw_data") <- my_data attr(outlier, "outlier_var") <- outlier_var attr(outlier, "outlier_count") <- outlier_count outlier @@ -910,7 +979,7 @@ check_outliers.data.frame <- function(x, outlier.list <- lapply(outlier.list, function(x) { x[x[[Outlier_method]] >= 0.5, ] }) - outlier.list <- outlier.list[lapply(outlier.list, nrow) > 0] + outlier.list <- outlier.list[vapply(outlier.list, nrow, numeric(1)) > 0] outlier.list <- lapply(outlier.list, datawizard::data_remove, Outlier_method, as_data_frame = TRUE @@ -1094,8 +1163,8 @@ check_outliers.data.frame <- function(x, out <- c(out, .check_outliers_mcd( x, threshold = thresholds$mcd, - percentage_central = 0.66, - ID.names = ID.names + ID.names = ID.names, + ... )) count.table <- datawizard::data_filter( @@ -1213,7 +1282,7 @@ check_outliers.grouped_df <- function(x, } # Initialize elements - data <- data.frame() + my_data <- data.frame() out <- NULL thresholds <- list() outlier_var <- list() @@ -1222,24 +1291,24 @@ check_outliers.grouped_df <- function(x, # Loop through groups for (i in seq_along(grps)) { rows <- grps[[i]] - subset <- check_outliers( + outliers_subset <- check_outliers( as.data.frame(x[rows, ]), method = method, threshold = threshold, ID = ID, ... 
) - data <- rbind(data, as.data.frame(subset)) - out <- c(out, subset) - thresholds[[paste0("group_", i)]] <- attributes(subset)$threshold + my_data <- rbind(my_data, as.data.frame(outliers_subset)) + out <- c(out, outliers_subset) + thresholds[[paste0("group_", i)]] <- attributes(outliers_subset)$threshold outlier_var[[i]] <- lapply( - attributes(subset)$outlier_var, lapply, function(y) { + attributes(outliers_subset)$outlier_var, lapply, function(y) { y$Row <- rows[which(seq_along(rows) %in% y$Row)] y } ) outlier_count[[i]] <- lapply( - attributes(subset)$outlier_count, function(y) { + attributes(outliers_subset)$outlier_count, function(y) { y$Row <- rows[which(seq_along(rows) %in% y$Row)] y } @@ -1260,16 +1329,16 @@ check_outliers.grouped_df <- function(x, info$groups$.rows[[x]] <- as.data.frame(info$groups$.rows[[x]]) }) - data[names(info$groups)[1]] <- do.call(rbind, groups) - data <- datawizard::data_relocate( - data, + my_data[names(info$groups)[1]] <- do.call(rbind, groups) + my_data <- datawizard::data_relocate( + my_data, select = names(info$groups)[1], after = "Row" ) - data$Row <- seq_len(nrow(data)) + my_data$Row <- seq_len(nrow(my_data)) class(out) <- c("check_outliers", "see_check_outliers", class(out)) - attr(out, "data") <- data + attr(out, "data") <- my_data attr(out, "method") <- method attr(out, "threshold") <- thresholds[[1]] attr(out, "text_size") <- 3 @@ -1413,6 +1482,30 @@ check_outliers.meta <- check_outliers.metagen check_outliers.metabin <- check_outliers.metagen +#' @rdname check_outliers +#' @export +check_outliers.performance_simres <- function(x, type = "default", iterations = 100, alternative = "two.sided", ...) { + type <- match.arg(type, c("default", "binomial", "bootstrap")) + alternative <- match.arg(alternative, c("two.sided", "greater", "less")) + + insight::check_if_installed("DHARMa") + result <- DHARMa::testOutliers(x, type = type, nBoot = iterations, alternative = alternative, plot = FALSE, ...) 
+ + outlier <- list( + Coefficient = as.vector(result$estimate), + Expected = as.numeric(gsub("(.*)\\(expected: (\\d.*)\\)", "\\2", names(result$estimate))), + CI_low = result$conf.int[1], + CI_high = result$conf.int[2], + p_value = result$p.value + ) + class(outlier) <- c("check_outliers_simres", class(outlier)) + outlier +} + +#' @export +check_outliers.DHARMa <- check_outliers.performance_simres + + # Thresholds -------------------------------------------------------------- @@ -1430,7 +1523,7 @@ check_outliers.metabin <- check_outliers.metagen bci <- 1 - 0.001 cook <- stats::qf(0.5, ncol(x), nrow(x) - ncol(x)) pareto <- 0.7 - mahalanobis <- stats::qchisq(p = 1 - 0.001, df = ncol(x)) + mahalanobis_value <- stats::qchisq(p = 1 - 0.001, df = ncol(x)) mahalanobis_robust <- stats::qchisq(p = 1 - 0.001, df = ncol(x)) mcd <- stats::qchisq(p = 1 - 0.001, df = ncol(x)) ics <- 0.001 @@ -1438,21 +1531,21 @@ check_outliers.metabin <- check_outliers.metagen lof <- 0.001 list( - "zscore" = zscore, - "zscore_robust" = zscore_robust, - "iqr" = iqr, - "ci" = ci, - "hdi" = hdi, - "eti" = eti, - "bci" = bci, - "cook" = cook, - "pareto" = pareto, - "mahalanobis" = mahalanobis, - "mahalanobis_robust" = mahalanobis_robust, - "mcd" = mcd, - "ics" = ics, - "optics" = optics, - "lof" = lof + zscore = zscore, + zscore_robust = zscore_robust, + iqr = iqr, + ci = ci, + hdi = hdi, + eti = eti, + bci = bci, + cook = cook, + pareto = pareto, + mahalanobis = mahalanobis_value, + mahalanobis_robust = mahalanobis_robust, + mcd = mcd, + ics = ics, + optics = optics, + lof = lof ) } @@ -1474,15 +1567,15 @@ check_outliers.metabin <- check_outliers.metagen x <- as.data.frame(x) # Standardize - if (!robust) { + if (robust) { d <- abs(as.data.frame(lapply( x, - function(x) (x - mean(x, na.rm = TRUE)) / stats::sd(x, na.rm = TRUE) + function(x) (x - stats::median(x, na.rm = TRUE)) / stats::mad(x, na.rm = TRUE) ))) } else { d <- abs(as.data.frame(lapply( x, - function(x) (x - stats::median(x, na.rm = 
TRUE)) / stats::mad(x, na.rm = TRUE) + function(x) (x - mean(x, na.rm = TRUE)) / stats::sd(x, na.rm = TRUE) ))) } @@ -1500,8 +1593,8 @@ check_outliers.metabin <- check_outliers.metagen out$Outlier_Zscore <- as.numeric(out$Distance_Zscore > threshold) output <- list( - "data_zscore" = out, - "threshold_zscore" = threshold + data_zscore = out, + threshold_zscore = threshold ) if (isTRUE(robust)) { @@ -1562,8 +1655,8 @@ check_outliers.metabin <- check_outliers.metagen }, numeric(1)) list( - "data_iqr" = out, - "threshold_iqr" = threshold + data_iqr = out, + threshold_iqr = threshold ) } @@ -1611,8 +1704,8 @@ check_outliers.metabin <- check_outliers.metagen out <- cbind(out.0, out) output <- list( - "data_" = out, - "threshold_" = threshold + data_ = out, + threshold_ = threshold ) names(output) <- paste0(names(output), method) output @@ -1632,8 +1725,8 @@ check_outliers.metabin <- check_outliers.metagen out$Outlier_Cook <- as.numeric(out$Distance_Cook > threshold) list( - "data_cook" = out, - "threshold_cook" = threshold + data_cook = out, + threshold_cook = threshold ) } @@ -1652,8 +1745,8 @@ check_outliers.metabin <- check_outliers.metagen out$Outlier_Pareto <- as.numeric(out$Distance_Pareto > threshold) list( - "data_pareto" = out, - "threshold_pareto" = threshold + data_pareto = out, + threshold_pareto = threshold ) } @@ -1682,8 +1775,8 @@ check_outliers.metabin <- check_outliers.metagen out$Outlier_Mahalanobis <- as.numeric(out$Distance_Mahalanobis > threshold) list( - "data_mahalanobis" = out, - "threshold_mahalanobis" = threshold + data_mahalanobis = out, + threshold_mahalanobis = threshold ) } @@ -1713,8 +1806,8 @@ check_outliers.metabin <- check_outliers.metagen ) list( - "data_mahalanobis_robust" = out, - "threshold_mahalanobis_robust" = threshold + data_mahalanobis_robust = out, + threshold_mahalanobis_robust = threshold ) } @@ -1722,14 +1815,31 @@ check_outliers.metabin <- check_outliers.metagen .check_outliers_mcd <- function(x, threshold = 
stats::qchisq(p = 1 - 0.001, df = ncol(x)), - percentage_central = 0.50, - ID.names = NULL) { + percentage_central = 0.75, + ID.names = NULL, + verbose = TRUE, + ...) { out <- data.frame(Row = seq_len(nrow(x))) if (!is.null(ID.names)) { out <- cbind(out, ID.names) } + # check whether N to p ratio is not too small, else MCD flags too many outliers + # See #672: This does seem to be a function of the N/p (N = sample size; p = + # number of parameters) ratio. When it is larger than 10, the % of outliers + # flagged is okay (in well behaved data). This makes sense: the MCD looks at + # the cov matrix of subsamples of the data - with high dimensional data, small + # sample sizes will give highly variable cov matrices, and so the "smallest" + # one will probably misrepresent the data. + + if ((nrow(x) / ncol(x)) <= 10 && isTRUE(verbose)) { + insight::format_warning( + "The sample size is too small in your data, relative to the number of variables, for MCD to be reliable.", + "You may try to increase the `percentage_central` argument (must be between 0 and 1), or choose another method." 
+ ) + } + insight::check_if_installed("MASS") # Compute @@ -1740,8 +1850,8 @@ check_outliers.metabin <- check_outliers.metagen out$Outlier_MCD <- as.numeric(out$Distance_MCD > threshold) list( - "data_mcd" = out, - "threshold_mcd" = threshold + data_mcd = out, + threshold_mcd = threshold ) } @@ -1761,10 +1871,10 @@ check_outliers.metabin <- check_outliers.metagen insight::check_if_installed("ICSOutlier") # Get n cores - n_cores <- if (!requireNamespace("parallel", quietly = TRUE)) { - NULL - } else { + n_cores <- if (requireNamespace("parallel", quietly = TRUE)) { getOption("mc.cores", 1L) + } else { + NULL } # tell user about n-cores option @@ -1808,7 +1918,7 @@ check_outliers.metabin <- check_outliers.metagen # Get results cutoff <- .safe(outliers@ics.dist.cutoff) - # sanity check + # validation check if (is.null(cutoff)) { insight::print_color("Could not detect cut-off for outliers.\n", "red") return(NULL) @@ -1818,8 +1928,8 @@ check_outliers.metabin <- check_outliers.metagen # Out list( - "data_ics" = out, - "threshold_ics" = threshold + data_ics = out, + threshold_ics = threshold ) } @@ -1850,8 +1960,8 @@ check_outliers.metabin <- check_outliers.metagen } list( - "data_optics" = out, - "threshold_optics" = threshold + data_optics = out, + threshold_optics = threshold ) } @@ -1917,8 +2027,8 @@ check_outliers.metabin <- check_outliers.metagen out$Outlier_LOF <- as.numeric(out$Distance_LOF > cutoff) list( - "data_lof" = out, - "threshold_lof" = threshold + data_lof = out, + threshold_lof = threshold ) } @@ -1945,3 +2055,12 @@ check_outliers.lmrob <- check_outliers.glmmTMB #' @export check_outliers.glmrob <- check_outliers.glmmTMB + +#' @export +check_outliers.rq <- check_outliers.glmmTMB + +#' @export +check_outliers.rqs <- check_outliers.glmmTMB + +#' @export +check_outliers.rqss <- check_outliers.glmmTMB diff --git a/R/check_overdispersion.R b/R/check_overdispersion.R index 5f61f18b0..fa46edfef 100644 --- a/R/check_overdispersion.R +++ 
b/R/check_overdispersion.R @@ -1,39 +1,51 @@ -#' @title Check overdispersion of GL(M)M's +#' @title Check overdispersion (and underdispersion) of GL(M)M's #' @name check_overdispersion #' #' @description `check_overdispersion()` checks generalized linear (mixed) -#' models for overdispersion. +#' models for overdispersion (and underdispersion). #' -#' @param x Fitted model of class `merMod`, `glmmTMB`, `glm`, -#' or `glm.nb` (package \pkg{MASS}). -#' @param ... Currently not used. +#' @param x Fitted model of class `merMod`, `glmmTMB`, `glm`, or `glm.nb` +#' (package **MASS**), or an object returned by `simulate_residuals()`. +#' +#' @inheritParams check_zeroinflation #' #' @return A list with results from the overdispersion test, like chi-squared #' statistics, p-value or dispersion ratio. #' #' @details Overdispersion occurs when the observed variance is higher than the -#' variance of a theoretical model. For Poisson models, variance increases -#' with the mean and, therefore, variance usually (roughly) equals the mean -#' value. If the variance is much higher, the data are "overdispersed". +#' variance of a theoretical model. For Poisson models, variance increases +#' with the mean and, therefore, variance usually (roughly) equals the mean +#' value. If the variance is much higher, the data are "overdispersed". A less +#' common case is underdispersion, where the variance is much lower than the +#' mean. #' #' @section Interpretation of the Dispersion Ratio: #' If the dispersion ratio is close to one, a Poisson model fits well to the #' data. Dispersion ratios larger than one indicate overdispersion, thus a -#' negative binomial model or similar might fit better to the data. A p-value < -#' .05 indicates overdispersion. +#' negative binomial model or similar might fit better to the data. Dispersion +#' ratios much smaller than one indicate underdispersion. A p-value < .05 +#' indicates either overdispersion or underdispersion (the first being more common). 
#' #' @section Overdispersion in Poisson Models: #' For Poisson models, the overdispersion test is based on the code from #' _Gelman and Hill (2007), page 115_. #' +#' @section Overdispersion in Negative Binomial or Zero-Inflated Models: +#' For negative binomial (mixed) models or models with a zero-inflation component, +#' the overdispersion test is based on simulated residuals (see [`simulate_residuals()`]). +#' #' @section Overdispersion in Mixed Models: #' For `merMod`- and `glmmTMB`-objects, `check_overdispersion()` #' is based on the code in the #' [GLMM FAQ](http://bbolker.github.io/mixedmodels-misc/glmmFAQ.html), #' section *How can I deal with overdispersion in GLMMs?*. Note that this #' function only returns an *approximate* estimate of an overdispersion -#' parameter, and is probably inaccurate for zero-inflated mixed models (fitted -#' with `glmmTMB`). +#' parameter. Using this approach would be inaccurate for zero-inflated or +#' negative binomial mixed models (fitted with `glmmTMB`), thus, in such cases, +#' the overdispersion test is based on [`simulate_residuals()`] (which is identical +#' to `check_overdispersion(simulate_residuals(model))`). +#' +#' @inheritSection check_zeroinflation Tests based on simulated residuals #' #' @section How to fix Overdispersion: #' Overdispersion can be fixed by either modeling the dispersion parameter, or @@ -50,19 +62,10 @@ #' multilevel/hierarchical models. Cambridge; New York: Cambridge University #' Press. 
#' -#' @examplesIf getRversion() >= "4.0.0" && require("glmmTMB", quietly = TRUE) -#' -#' library(glmmTMB) -#' data(Salamanders) +#' @examplesIf getRversion() >= "4.0.0" && require("glmmTMB") +#' data(Salamanders, package = "glmmTMB") #' m <- glm(count ~ spp + mined, family = poisson, data = Salamanders) #' check_overdispersion(m) -#' -#' m <- glmmTMB( -#' count ~ mined + spp + (1 | site), -#' family = poisson, -#' data = Salamanders -#' ) -#' check_overdispersion(m) #' @export check_overdispersion <- function(x, ...) { UseMethod("check_overdispersion") @@ -113,7 +116,9 @@ print.check_overdisp <- function(x, digits = 3, ...) { orig_x <- x x$dispersion_ratio <- sprintf("%.*f", digits, x$dispersion_ratio) - x$chisq_statistic <- sprintf("%.*f", digits, x$chisq_statistic) + if (!is.null(x$chisq_statistic)) { + x$chisq_statistic <- sprintf("%.*f", digits, x$chisq_statistic) + } x$p_value <- pval <- round(x$p_value, digits = digits) if (x$p_value < 0.001) x$p_value <- "< 0.001" @@ -125,14 +130,21 @@ print.check_overdisp <- function(x, digits = 3, ...) 
{ ) insight::print_color("# Overdispersion test\n\n", "blue") - cat(sprintf(" dispersion ratio = %s\n", format(x$dispersion_ratio, justify = "right", width = maxlen))) - cat(sprintf(" Pearson's Chi-Squared = %s\n", format(x$chisq_statistic, justify = "right", width = maxlen))) - cat(sprintf(" p-value = %s\n\n", format(x$p_value, justify = "right", width = maxlen))) + if (is.null(x$chisq_statistic)) { + cat(sprintf(" dispersion ratio = %s\n", format(x$dispersion_ratio, justify = "right", width = maxlen))) + cat(sprintf(" p-value = %s\n\n", format(x$p_value, justify = "right", width = maxlen))) + } else { + cat(sprintf(" dispersion ratio = %s\n", format(x$dispersion_ratio, justify = "right", width = maxlen))) + cat(sprintf(" Pearson's Chi-Squared = %s\n", format(x$chisq_statistic, justify = "right", width = maxlen))) + cat(sprintf(" p-value = %s\n\n", format(x$p_value, justify = "right", width = maxlen))) + } if (pval > 0.05) { message("No overdispersion detected.") - } else { + } else if (x$dispersion_ratio > 1) { message("Overdispersion detected.") + } else { + message("Underdispersion detected.") } invisible(orig_x) @@ -144,8 +156,21 @@ print.check_overdisp <- function(x, digits = 3, ...) { #' @export check_overdispersion.glm <- function(x, verbose = TRUE, ...) 
{ - # check if we have poisson + # model info info <- insight::model_info(x) + obj_name <- insight::safe_deparse_symbol(substitute(x)) + + # for certain distributions, simulated residuals are more accurate + use_simulated <- info$is_bernoulli || info$is_binomial || (!info$is_count && !info$is_binomial) || info$is_negbin + + # model classes not supported in DHARMa + not_supported <- c("fixest", "glmx") + + if (use_simulated && !inherits(x, not_supported)) { + return(check_overdispersion(simulate_residuals(x, ...), object_name = obj_name, ...)) + } + + # check if we have poisson - need this for models not supported by DHARMa if (!info$is_count && !info$is_binomial) { insight::format_error( "Overdispersion checks can only be used for models from Poisson families or binomial families with trials > 1." @@ -157,10 +182,6 @@ check_overdispersion.glm <- function(x, verbose = TRUE, ...) { insight::format_error("Overdispersion checks cannot be used for Bernoulli models.") } - if (info$is_binomial) { - return(check_overdispersion.merMod(x, verbose = verbose, ...)) - } - yhat <- stats::fitted(x) n <- stats::nobs(x) @@ -179,7 +200,7 @@ check_overdispersion.glm <- function(x, verbose = TRUE, ...) { ) class(out) <- c("check_overdisp", "see_check_overdisp") - attr(out, "object_name") <- insight::safe_deparse_symbol(substitute(x)) + attr(out, "object_name") <- obj_name out } @@ -219,39 +240,30 @@ check_overdispersion.model_fit <- check_overdispersion.poissonmfx # Overdispersion for mixed models --------------------------- #' @export -check_overdispersion.merMod <- function(x, verbose = TRUE, ...) { - # check if we have poisson or binomial +check_overdispersion.merMod <- function(x, ...) { + # for certain distributions, simulated residuals are more accurate info <- insight::model_info(x) - if (!info$is_count && !info$is_binomial) { - insight::format_error( - "Overdispersion checks can only be used for models from Poisson families or binomial families with trials > 1." 
- ) - } + obj_name <- insight::safe_deparse_symbol(substitute(x)) - # check for Bernoulli - if (info$is_bernoulli) { - insight::format_error("Overdispersion checks cannot be used for Bernoulli models.") + # for certain distributions, simulated residuals are more accurate + use_simulated <- info$family == "genpois" || info$is_zero_inflated || info$is_bernoulli || info$is_binomial || (!info$is_count && !info$is_binomial) || info$is_negbin # nolint + + if (use_simulated) { + return(check_overdispersion(simulate_residuals(x, ...), object_name = obj_name, ...)) } rdf <- stats::df.residual(x) rp <- insight::get_residuals(x, type = "pearson") + + # check if pearson residuals are available if (insight::is_empty_object(rp)) { - Pearson.chisq <- NA - prat <- NA - pval <- NA - rp <- NA - if (isTRUE(verbose)) { - insight::format_alert( - "Cannot test for overdispersion, because pearson residuals are not implemented for models with zero-inflation or variable dispersion.", - "Only the visual inspection using `plot(check_overdispersion(model))` is possible." - ) - } - } else { - Pearson.chisq <- sum(rp^2) - prat <- Pearson.chisq / rdf - pval <- stats::pchisq(Pearson.chisq, df = rdf, lower.tail = FALSE) + return(check_overdispersion(simulate_residuals(x, ...), object_name = obj_name, ...)) } + Pearson.chisq <- sum(rp^2) + prat <- Pearson.chisq / rdf + pval <- stats::pchisq(Pearson.chisq, df = rdf, lower.tail = FALSE) + out <- list( chisq_statistic = Pearson.chisq, dispersion_ratio = prat, @@ -260,7 +272,7 @@ check_overdispersion.merMod <- function(x, verbose = TRUE, ...) 
{ ) class(out) <- c("check_overdisp", "see_check_overdisp") - attr(out, "object_name") <- insight::safe_deparse_symbol(substitute(x)) + attr(out, "object_name") <- obj_name out } @@ -270,3 +282,41 @@ check_overdispersion.negbin <- check_overdispersion.merMod #' @export check_overdispersion.glmmTMB <- check_overdispersion.merMod + + +# simulated residuals ----------------------------- + +#' @rdname check_overdispersion +#' @export +check_overdispersion.performance_simres <- function(x, alternative = c("two.sided", "less", "greater"), ...) { + # match arguments + alternative <- match.arg(alternative) + + # check for special arguments - we may pass "object_name" from other methods + dots <- list(...) + if (is.null(dots$object_name)) { + obj_name <- insight::safe_deparse_symbol(substitute(x)) + } else { + obj_name <- dots$object_name + } + + # statistics function + variance <- stats::sd(x$simulatedResponse)^2 + dispersion <- function(i) stats::var(i - x$fittedPredictedResponse) / variance + + # compute test results + result <- .simres_statistics(x, statistic_fun = dispersion, alternative = alternative) + + out <- list( + dispersion_ratio = result$observed / mean(result$simulated), + p_value = result$p + ) + + class(out) <- c("check_overdisp", "see_check_overdisp") + attr(out, "object_name") <- obj_name + + out +} + +#' @export +check_overdispersion.DHARMa <- check_overdispersion.performance_simres diff --git a/R/check_predictions.R b/R/check_predictions.R index d52b8378a..682047b50 100644 --- a/R/check_predictions.R +++ b/R/check_predictions.R @@ -37,6 +37,9 @@ #' #' @return A data frame of simulated responses and the original response vector. #' +#' @seealso [`simulate_residuals()`] and [`check_residuals()`]. See also +#' [`see::print.see_performance_pp_check()`] for options to customize the plot. +#' #' @details An example how posterior predictive checks can also be used for model #' comparison is Figure 6 from _Gabry et al. 2019, Figure 6_. 
#' @@ -104,7 +107,42 @@ check_predictions.default <- function(object, minfo <- insight::model_info(object, verbose = FALSE) # try to find sensible default for "type" argument - suggest_dots <- (minfo$is_bernoulli || minfo$is_count || minfo$is_ordinal || minfo$is_categorical || minfo$is_multinomial) + suggest_dots <- (minfo$is_bernoulli || minfo$is_count || minfo$is_ordinal || minfo$is_categorical || minfo$is_multinomial) # nolint + if (missing(type) && suggest_dots) { + type <- "discrete_interval" + } + + # args + type <- match.arg(type, choices = c("density", "discrete_dots", "discrete_interval", "discrete_both")) + + pp_check.lm( + object, + iterations = iterations, + check_range = check_range, + re_formula = re_formula, + bandwidth = bandwidth, + type = type, + verbose = verbose, + model_info = minfo, + ... + ) +} + + +#' @export +check_predictions.stanreg <- function(object, + iterations = 50, + check_range = FALSE, + re_formula = NULL, + bandwidth = "nrd", + type = "density", + verbose = TRUE, + ...) 
{ + # retrieve model information + minfo <- insight::model_info(object, verbose = FALSE) + + # try to find sensible default for "type" argument + suggest_dots <- (minfo$is_bernoulli || minfo$is_count || minfo$is_ordinal || minfo$is_categorical || minfo$is_multinomial) # nolint if (missing(type) && suggest_dots) { type <- "discrete_interval" } @@ -112,27 +150,64 @@ check_predictions.default <- function(object, # args type <- match.arg(type, choices = c("density", "discrete_dots", "discrete_interval", "discrete_both")) - if (isTRUE(minfo$is_bayesian) && isFALSE(inherits(object, "BFBayesFactor"))) { - insight::check_if_installed( - "bayesplot", - "to create posterior prediction plots for Stan models" + # convert to type-argument for pp_check + pp_type <- switch(type, + density = "dens", + "bars" + ) + + insight::check_if_installed( + "bayesplot", + "to create posterior prediction plots for Stan models" + ) + + # for plotting + resp_string <- insight::find_terms(object)$response + + if (inherits(object, "brmsfit")) { + out <- as.data.frame(bayesplot::pp_check(object, type = pp_type, ndraws = iterations, ...)$data) + } else { + out <- as.data.frame(bayesplot::pp_check(object, type = pp_type, nreps = iterations, ...)$data) + } + + # bring data into shape, like we have for other models with `check_predictions()` + if (pp_type == "dens") { + d_filter <- out[!out$is_y, ] + d_filter <- datawizard::data_to_wide( + d_filter, + id_cols = "y_id", + values_from = "value", + names_from = "rep_id" ) - bayesplot::pp_check(object) + d_filter$y_id <- NULL + colnames(d_filter) <- paste0("sim_", colnames(d_filter)) + d_filter$y <- out$value[out$is_y] + out <- d_filter } else { - pp_check.lm( - object, - iterations = iterations, - check_range = check_range, - re_formula = re_formula, - bandwidth = bandwidth, - type = type, - verbose = verbose, - model_info = minfo, - ... 
+ colnames(out) <- c("x", "y", "CI_low", "Mean", "CI_high") + # to long, for plotting + out <- datawizard::data_to_long( + out, + select = c("y", "Mean"), + names_to = "Group", + values_to = "Count" ) } + + attr(out, "is_stan") <- TRUE + attr(out, "check_range") <- check_range + attr(out, "response_name") <- resp_string + attr(out, "bandwidth") <- bandwidth + attr(out, "model_info") <- minfo + attr(out, "type") <- type + class(out) <- c("performance_pp_check", "see_performance_pp_check", class(out)) + out } +#' @export +check_predictions.brmsfit <- check_predictions.stanreg + + #' @export check_predictions.BFBayesFactor <- function(object, iterations = 50, @@ -193,14 +268,14 @@ pp_check.lm <- function(object, # else, proceed as usual out <- .safe(stats::simulate(object, nsim = iterations, re.form = re_formula, ...)) - # sanity check, for mixed models, where re.form = NULL (default) might fail + # validation check, for mixed models, where re.form = NULL (default) might fail out <- .check_re_formula(out, object, iterations, re_formula, verbose, ...) # save information about model - if (!is.null(model_info)) { - minfo <- model_info - } else { + if (is.null(model_info)) { minfo <- insight::model_info(object) + } else { + minfo <- model_info } # glmmTMB returns column matrix for bernoulli @@ -215,9 +290,10 @@ pp_check.lm <- function(object, } if (is.null(out)) { - insight::format_error( - sprintf("Could not simulate responses. Maybe there is no `simulate()` for objects of class `%s`?", class(object)[1]) - ) + insight::format_error(sprintf( + "Could not simulate responses. Maybe there is no `simulate()` for objects of class `%s`?", + class(object)[1] + )) } # get response data, and response term, to check for transformations @@ -263,20 +339,21 @@ pp_check.glm <- function(object, out <- tryCatch( { matrix_sim <- stats::simulate(object, nsim = iterations, re.form = re_formula, ...) 
- as.data.frame(sapply(matrix_sim, function(i) i[, 1] / i[, 2], simplify = TRUE)) + as.data.frame(sapply(matrix_sim, function(i) i[, 1] / rowSums(i, na.rm = TRUE), simplify = TRUE)) }, error = function(e) { NULL } ) - # sanity check, for mixed models, where re.form = NULL (default) might fail + # validation check, for mixed models, where re.form = NULL (default) might fail out <- .check_re_formula(out, object, iterations, re_formula, verbose, ...) if (is.null(out)) { - insight::format_error( - sprintf("Could not simulate responses. Maybe there is no `simulate()` for objects of class `%s`?", class(object)[1]) - ) + insight::format_error(sprintf( + "Could not simulate responses. Maybe there is no `simulate()` for objects of class `%s`?", + class(object)[1] + )) } # get response data, and response term @@ -285,13 +362,13 @@ pp_check.glm <- function(object, ) resp_string <- insight::find_terms(object)$response - out$y <- response[, 1] / response[, 2] + out$y <- response[, 1] / rowSums(response, na.rm = TRUE) # safe information about model - if (!is.null(model_info)) { - minfo <- model_info - } else { + if (is.null(model_info)) { minfo <- insight::model_info(object) + } else { + minfo <- model_info } attr(out, "check_range") <- check_range @@ -342,11 +419,17 @@ pp_check.glmmTMB <- #' @rdname check_predictions #' @export -posterior_predictive_check <- check_predictions +posterior_predictive_check <- function(object, ...) { + .Deprecated("check_predictions()") + check_predictions(object, ...) +} #' @rdname check_predictions #' @export -check_posterior_predictions <- check_predictions +check_posterior_predictions <- function(object, ...) { + .Deprecated("check_predictions()") + check_predictions(object, ...) +} @@ -363,14 +446,20 @@ print.performance_pp_check <- function(x, verbose = TRUE, ...) 
{ if (is.numeric(original)) { if (min(replicated) > min(original)) { insight::print_color( - insight::format_message("Warning: Minimum value of original data is not included in the replicated data.", "Model may not capture the variation of the data."), + insight::format_message( + "Warning: Minimum value of original data is not included in the replicated data.", + "Model may not capture the variation of the data." + ), "red" ) } if (max(replicated) < max(original)) { insight::print_color( - insight::format_message("Warning: Maximum value of original data is not included in the replicated data.", "Model may not capture the variation of the data."), + insight::format_message( + "Warning: Maximum value of original data is not included in the replicated data.", + "Model may not capture the variation of the data." + ), "red" ) } @@ -444,10 +533,10 @@ plot.performance_pp_check <- function(x, ...) { .check_re_formula <- function(out, object, iterations, re_formula, verbose, ...) { - # sanity check, for mixed models, where re.form = NULL (default) might fail + # validation check, for mixed models, where re.form = NULL (default) might fail if (is.null(out) && insight::is_mixed_model(object) && !isTRUE(is.na(re_formula))) { if (verbose) { - insight::format_warning( + insight::format_alert( paste0( "Failed to compute posterior predictive checks with `re_formula=", deparse(re_formula), diff --git a/R/check_residuals.R b/R/check_residuals.R new file mode 100644 index 000000000..d467bf354 --- /dev/null +++ b/R/check_residuals.R @@ -0,0 +1,103 @@ +#' Check uniformity of simulated residuals +#' +#' `check_residuals()` checks generalized linear (mixed) models for uniformity +#' of randomized quantile residuals, which can be used to identify typical model +#' misspecification problems, such as over/underdispersion, zero-inflation, and +#' residual spatial and temporal autocorrelation. 
+#' +#' @param x An object returned by [`simulate_residuals()`] or +#' [`DHARMa::simulateResiduals()`]. +#' @param alternative A character string specifying the alternative hypothesis. +#' See [`stats::ks.test()`] for details. +#' @param ... Passed down to [`stats::ks.test()`]. +#' +#' @details Uniformity of residuals is checked using a Kolmogorov-Smirnov test. +#' There is a `plot()` method to visualize the distribution of the residuals. +#' The test for uniformity basically tests to which extent the observed values +#' deviate from the model expectations (i.e. simulated values). In this sense, +#' the `check_residuals()` function has similar goals like [`check_predictions()`]. +#' +#' @inheritSection simulate_residuals Tests based on simulated residuals +#' +#' @seealso [`simulate_residuals()`], [`check_zeroinflation()`], +#' [`check_overdispersion()`] and [`check_predictions()`]. See also +#' [`see::plot.see_performance_simres()`] for options to customize the plot. +#' +#' @return The p-value of the test statistics. +#' +#' @examplesIf require("DHARMa") +#' dat <- DHARMa::createData(sampleSize = 100, overdispersion = 0.5, family = poisson()) +#' m <- glm(observedResponse ~ Environment1, family = poisson(), data = dat) +#' res <- simulate_residuals(m) +#' check_residuals(res) +#' +#' @export +check_residuals <- function(x, ...) { + UseMethod("check_residuals") +} + +#' @rdname check_residuals +#' @export +check_residuals.default <- function(x, alternative = c("two.sided", "less", "greater"), ...) { + if (insight::is_model(x)) { + check_residuals(simulate_residuals(x, ...), alternative = alternative) + } else { + insight::format_error("`check_residuals()` only works with objects supported by `simulate_residuals()` or `DHARMa::simulateResiduals()`.") # nolint + } +} + +#' @export +check_residuals.performance_simres <- function(x, alternative = c("two.sided", "less", "greater"), ...) 
{ + alternative <- match.arg(alternative) + ts_test <- suppressWarnings( + stats::ks.test( + stats::residuals(x), + "punif", + alternative = alternative, + ... + ) + ) + + p.val <- ts_test$p.value + + attr(p.val, "data") <- x + attr(p.val, "object_name") <- insight::safe_deparse_symbol(substitute(x)) + class(p.val) <- unique(c("check_residuals", "see_check_residuals", class(p.val))) + + p.val +} + +#' @export +check_residuals.DHARMa <- check_residuals.performance_simres + + +# methods ------------------------------ + +#' @export +print.check_residuals <- function(x, ...) { + pstring <- insight::format_p(x) + + if (x < 0.05) { + insight::print_color( + sprintf( + "Warning: Non-uniformity of simulated residuals detected (%s).\n", pstring + ), + "red" + ) + } else { + insight::print_color( + sprintf( + "OK: Simulated residuals appear as uniformly distributed (%s).\n", pstring + ), + "green" + ) + } + + invisible(x) +} + +#' @export +plot.check_residuals <- function(x, ...) { + insight::check_if_installed("see", "for residual plots") + NextMethod() +} diff --git a/R/check_singularity.R b/R/check_singularity.R index 3128ef933..44d66d221 100644 --- a/R/check_singularity.R +++ b/R/check_singularity.R @@ -36,6 +36,23 @@ #' - "keep it maximal", i.e. fit the most complex model consistent with the #' experimental design, removing only terms required to allow a non-singular #' fit (_Barr et al. 2013_) +#' - since version 1.1.9, the **glmmTMB** package allows to use priors in a +#' frequentist framework, too. One recommendation is to use a Gamma prior +#' (_Chung et al. 2013_). The mean may vary from 1 to very large values +#' (like `1e8`), and the shape parameter should be set to a value of 2.5. You +#' can then `update()` your model with the specified prior. 
In **glmmTMB**, +#' the code would look like this: +#' ``` +#' # "model" is an object of class glmmTMB +#' prior <- data.frame( +#' prior = "gamma(1, 2.5)", # mean can be 1, but even 1e8 +#' class = "ranef" # for random effects +#' ) +#' model_with_priors <- update(model, priors = prior) +#' ``` +#' Large values for the mean parameter of the Gamma prior have no large impact +#' on the random effects variances in terms of a "bias". Thus, if `1` doesn't +#' fix the singular fit, you can safely try larger values. #' #' Note the different meaning between singularity and convergence: singularity #' indicates an issue with the "true" best estimate, i.e. whether the maximum @@ -48,21 +65,23 @@ #' #' @references #' - Bates D, Kliegl R, Vasishth S, Baayen H. Parsimonious Mixed Models. -#' arXiv:1506.04967, June 2015. +#' arXiv:1506.04967, June 2015. #' #' - Barr DJ, Levy R, Scheepers C, Tily HJ. Random effects structure for -#' confirmatory hypothesis testing: Keep it maximal. Journal of Memory and -#' Language, 68(3):255-278, April 2013. +#' confirmatory hypothesis testing: Keep it maximal. Journal of Memory and +#' Language, 68(3):255-278, April 2013. #' -#' - Matuschek H, Kliegl R, Vasishth S, Baayen H, Bates D. Balancing type -#' I error and power in linear mixed models. Journal of Memory and Language, -#' 94:305-315, 2017. +#' - Chung Y, Rabe-Hesketh S, Dorie V, Gelman A, and Liu J. 2013. "A Nondegenerate +#' Penalized Likelihood Estimator for Variance Parameters in Multilevel Models." +#' Psychometrika 78 (4): 685–709. \doi{10.1007/s11336-013-9328-2} +#' +#' - Matuschek H, Kliegl R, Vasishth S, Baayen H, Bates D. Balancing type I error +#' and power in linear mixed models. Journal of Memory and Language, 94:305-315, 2017. 
#' #' - lme4 Reference Manual, #' -#' @examplesIf require("lme4") -#' library(lme4) -#' data(sleepstudy) +#' @examplesIf require("lme4") && require("glmmTMB") +#' data(sleepstudy, package = "lme4") #' set.seed(123) #' sleepstudy$mygrp <- sample(1:5, size = 180, replace = TRUE) #' sleepstudy$mysubgrp <- NA @@ -72,14 +91,34 @@ #' sample(1:30, size = sum(filter_group), replace = TRUE) #' } #' -#' model <- lmer( +#' model <- lme4::lmer( #' Reaction ~ Days + (1 | mygrp / mysubgrp) + (1 | Subject), #' data = sleepstudy #' ) +#' check_singularity(model) #' +#' # Fixing singularity issues using priors in glmmTMB +#' # Example taken from `vignette("priors", package = "glmmTMB")` +#' dat <- readRDS(system.file("vignette_data", "gophertortoise.rds", +#' package = "glmmTMB")) +#' model <- glmmTMB::glmmTMB( +#' shells ~ prev + offset(log(Area)) + factor(year) + (1 | Site), +#' family = poisson, +#' data = dat +#' ) +#' # singular fit #' check_singularity(model) +#' +#' # impose Gamma prior on random effects parameters +#' prior <- data.frame( +#' prior = "gamma(1, 2.5)", # mean can be 1, but even 1e8 +#' class = "ranef" # for random effects +#' ) +#' model_with_priors <- update(model, priors = prior) +#' # no singular fit +#' check_singularity(model_with_priors) +#' #' @export - check_singularity <- function(x, tolerance = 1e-5, ...) { UseMethod("check_singularity") } @@ -101,15 +140,24 @@ check_singularity.merMod <- function(x, tolerance = 1e-5, ...) { check_singularity.rlmerMod <- check_singularity.merMod - #' @export check_singularity.glmmTMB <- function(x, tolerance = 1e-5, ...) 
{ insight::check_if_installed("lme4") - vc <- .collapse_cond(lme4::VarCorr(x)) - any(sapply(vc, function(.x) any(abs(diag(.x)) < tolerance))) + eigen_values <- list() + vv <- lme4::VarCorr(x) + for (component in c("cond", "zi")) { + for (i in seq_along(vv[[component]])) { + eigen_values <- c( + eigen_values, + list(eigen(vv[[component]][[i]], only.values = TRUE)$values) + ) + } + } + any(vapply(eigen_values, min, numeric(1), na.rm = TRUE) < tolerance) } + #' @export check_singularity.glmmadmb <- check_singularity.glmmTMB diff --git a/R/check_sphericity.R b/R/check_sphericity.R index a087a1a5f..6df73d032 100644 --- a/R/check_sphericity.R +++ b/R/check_sphericity.R @@ -67,7 +67,7 @@ check_sphericity.Anova.mlm <- function(x, ...) { p.val <- test[, 2] - # sanity check + # validation check if (is.null(p.val)) { p.val <- 1 } diff --git a/R/check_zeroinflation.R b/R/check_zeroinflation.R index f0f19b369..5f87941f3 100644 --- a/R/check_zeroinflation.R +++ b/R/check_zeroinflation.R @@ -7,9 +7,13 @@ #' @param x Fitted model of class `merMod`, `glmmTMB`, `glm`, or `glm.nb` #' (package **MASS**). #' @param tolerance The tolerance for the ratio of observed and predicted -#' zeros to considered as over- or underfitting zeros. A ratio -#' between 1 +/- `tolerance` is considered as OK, while a ratio -#' beyond or below this threshold would indicate over- or underfitting. +#' zeros to considered as over- or underfitting zeros. A ratio +#' between 1 +/- `tolerance` is considered as OK, while a ratio +#' beyond or below this threshold would indicate over- or underfitting. +#' @param alternative A character string specifying the alternative hypothesis. +#' @param ... Arguments passed down to [`simulate_residuals()`]. This only applies +#' for models with zero-inflation component, or for models of class `glmmTMB` +#' from `nbinom1` or `nbinom2` family. 
#' #' @return A list with information about the amount of predicted and observed #' zeros in the outcome, as well as the ratio between these two values. @@ -19,14 +23,53 @@ #' zero-inflation in the data. In such cases, it is recommended to use #' negative binomial or zero-inflated models. #' +#' In case of negative binomial models, models with zero-inflation component, +#' or hurdle models, the results from `check_zeroinflation()` are based on +#' [`simulate_residuals()`], i.e. `check_zeroinflation(simulate_residuals(model))` +#' is internally called if necessary. +#' +#' @section Tests based on simulated residuals: +#' For certain models, resp. model from certain families, tests are based on +#' simulated residuals (see [`simulate_residuals()`]). These are usually more +#' accurate for testing such models than the traditionally used Pearson residuals. +#' However, when simulating from more complex models, such as mixed models or +#' models with zero-inflation, there are several important considerations. +#' Arguments specified in `...` are passed to [`simulate_residuals()`], which +#' relies on [`DHARMa::simulateResiduals()`] (and therefore, arguments in `...` +#' are passed further down to _DHARMa_). The defaults in DHARMa are set on the +#' most conservative option that works for all models. However, in many cases, +#' the help advises to use different settings in particular situations or for +#' particular models. It is recommended to read the 'Details' in +#' `?DHARMa::simulateResiduals` closely to understand the implications of the +#' simulation process and which arguments should be modified to get the most +#' accurate results. 
+#' #' @family functions to check model assumptions and and assess model quality #' -#' @examplesIf require("glmmTMB") +#' @examplesIf require("glmmTMB") && require("DHARMa") #' data(Salamanders, package = "glmmTMB") #' m <- glm(count ~ spp + mined, family = poisson, data = Salamanders) #' check_zeroinflation(m) +#' +#' # for models with zero-inflation component, it's better to carry out +#' # the check for zero-inflation using simulated residuals +#' m <- glmmTMB::glmmTMB( +#' count ~ spp + mined, +#' ziformula = ~ mined + spp, +#' family = poisson, +#' data = Salamanders +#' ) +#' res <- simulate_residuals(m) +#' check_zeroinflation(res) +#' @export +check_zeroinflation <- function(x, ...) { + UseMethod("check_zeroinflation") +} + + +#' @rdname check_zeroinflation #' @export -check_zeroinflation <- function(x, tolerance = 0.05) { +check_zeroinflation.default <- function(x, tolerance = 0.05, ...) { # check if we have poisson model_info <- insight::model_info(x) if (!model_info$is_count) { @@ -41,28 +84,22 @@ check_zeroinflation <- function(x, tolerance = 0.05) { return(NULL) } - # get predictions of outcome - mu <- stats::fitted(x) + # model classes not supported in DHARMa + not_supported <- c("fixest", "glmx") - # get overdispersion parameters - if (model_info$is_negbin) { - if (methods::is(x, "glmmTMB")) { - theta <- stats::sigma(x) - } else if (methods::is(x, "glmerMod")) { - theta <- environment(x@resp$family$aic)[[".Theta"]] - } else { - theta <- x$theta + # for models with zero-inflation component or negative binomial families, + # we use simulate_residuals() + if (!inherits(x, not_supported) && (model_info$is_zero_inflated || model_info$is_negbin || model_info$family == "genpois")) { # nolint + if (missing(tolerance)) { + tolerance <- 0.1 } - } else { - theta <- NULL + return(check_zeroinflation(simulate_residuals(x, ...), tolerance = tolerance, ...)) } + # get predictions of outcome + mu <- stats::fitted(x) # get predicted zero-counts - if 
(!is.null(theta)) { - pred.zero <- round(sum(stats::dnbinom(x = 0, size = theta, mu = mu))) - } else { - pred.zero <- round(sum(stats::dpois(x = 0, lambda = mu))) - } + pred.zero <- round(sum(stats::dpois(x = 0, lambda = mu))) # proportion structure( @@ -77,6 +114,33 @@ check_zeroinflation <- function(x, tolerance = 0.05) { } +#' @rdname check_zeroinflation +#' @export +check_zeroinflation.performance_simres <- function(x, + tolerance = 0.1, + alternative = c("two.sided", "less", "greater"), + ...) { + # match arguments + alternative <- match.arg(alternative) + + # compute test results + result <- .simres_statistics(x, statistic_fun = function(i) sum(i == 0), alternative = alternative) + + structure( + class = "check_zi", + list( + predicted.zeros = round(mean(result$simulated)), + observed.zeros = result$observed, + ratio = mean(result$simulated) / result$observed, + tolerance = tolerance, + p.value = result$p + ) + ) +} + +#' @export +check_zeroinflation.DHARMa <- check_zeroinflation.performance_simres + # methods ------------------ @@ -90,12 +154,22 @@ print.check_zi <- function(x, ...) { lower <- 1 - x$tolerance upper <- 1 + x$tolerance + if (is.null(x$p.value)) { + p_string <- "" + } else { + p_string <- paste0(" (", insight::format_p(x$p.value), ")") + } + if (x$ratio < lower) { - message("Model is underfitting zeros (probable zero-inflation).") + message("Model is underfitting zeros (probable zero-inflation)", p_string, ".") } else if (x$ratio > upper) { - message("Model is overfitting zeros.") + message("Model is overfitting zeros", p_string, ".") } else { - insight::format_alert("Model seems ok, ratio of observed and predicted zeros is within the tolerance range.") + insight::format_alert(paste0( + "Model seems ok, ratio of observed and predicted zeros is within the tolerance range", + p_string, + "." 
+ )) } invisible(x) diff --git a/R/compare_performance.R b/R/compare_performance.R index 76b0b329f..2318d5e63 100644 --- a/R/compare_performance.R +++ b/R/compare_performance.R @@ -86,21 +86,21 @@ #' @export compare_performance <- function(..., metrics = "all", rank = FALSE, estimator = "ML", verbose = TRUE) { # process input - objects <- insight::ellipsis_info(..., only_models = TRUE) + model_objects <- insight::ellipsis_info(..., only_models = TRUE) # ensure proper object names - objects <- .check_objectnames(objects, sapply(match.call(expand.dots = FALSE)$`...`, as.character)) + model_objects <- .check_objectnames(model_objects, sapply(match.call(expand.dots = FALSE)[["..."]], as.character)) # drop unsupport models - supported_models <- sapply(objects, function(i) insight::is_model_supported(i) | inherits(i, "lavaan")) - object_names <- names(objects) + supported_models <- sapply(model_objects, function(i) insight::is_model_supported(i) | inherits(i, "lavaan")) + object_names <- names(model_objects) if (!all(supported_models)) { insight::format_alert( "Following objects are not supported:", datawizard::text_concatenate(object_names[!supported_models], enclose = "`") ) - objects <- objects[supported_models] + model_objects <- model_objects[supported_models] object_names <- object_names[supported_models] } @@ -110,8 +110,8 @@ compare_performance <- function(..., metrics = "all", rank = FALSE, estimator = model_name <- gsub("\"", "", insight::safe_deparse(.y), fixed = TRUE) perf_df <- data.frame(Name = model_name, Model = class(.x)[1], dat, stringsAsFactors = FALSE) attributes(perf_df) <- c(attributes(perf_df), attributes(dat)[!names(attributes(dat)) %in% c("names", "row.names", "class")]) - return(perf_df) - }, objects, object_names, SIMPLIFY = FALSE) + perf_df + }, model_objects, object_names, SIMPLIFY = FALSE) attri <- lapply(m, function(x) { attri <- attributes(x) @@ -132,7 +132,7 @@ compare_performance <- function(..., metrics = "all", rank = FALSE, estimator 
= } # check if all models were fit from same data - if (!isTRUE(attributes(objects)$same_response) && verbose) { + if (!isTRUE(attributes(model_objects)$same_response) && verbose) { insight::format_alert( "When comparing models, please note that probably not all models were fit from same data." ) @@ -188,9 +188,9 @@ compare_performance <- function(..., metrics = "all", rank = FALSE, estimator = # only for IC comparison any(grepl("(AIC|BIC)", names(dfs))) && # only when mixed models are involved, others probably don't have problems with REML fit - any(sapply(objects, insight::is_mixed_model)) && + any(sapply(model_objects, insight::is_mixed_model)) && # only if not all models have same fixed effects (else, REML is ok) - !isTRUE(attributes(objects)$same_fixef)) { + !isTRUE(attributes(model_objects)$same_fixef)) { insight::format_alert( "Information criteria (like AIC) are based on REML fits (i.e. `estimator=\"REML\"`).", "Please note that information criteria are probably not directly comparable and that it is not recommended comparing models with different fixed effects in such cases." diff --git a/R/cronbachs_alpha.R b/R/cronbachs_alpha.R index 6d4547b40..735641ccc 100644 --- a/R/cronbachs_alpha.R +++ b/R/cronbachs_alpha.R @@ -40,7 +40,7 @@ cronbachs_alpha.data.frame <- function(x, verbose = TRUE, ...) { # we need at least two columns for Cronach's Alpha if (is.null(ncol(.data)) || ncol(.data) < 2) { if (verbose) { - insight::format_warning("Too few columns in `x` to compute Cronbach's Alpha.") + insight::format_alert("Too few columns in `x` to compute Cronbach's Alpha.") } return(NULL) } @@ -68,7 +68,7 @@ cronbachs_alpha.parameters_pca <- function(x, verbose = TRUE, ...) 
{ pca_data <- attr(x, "data") if (is.null(pca_data)) { if (verbose) { - insight::format_warning("Could not find data frame that was used for the PCA.") + insight::format_alert("Could not find data frame that was used for the PCA.") } return(NULL) } diff --git a/R/helpers.R b/R/helpers.R index 3032ffa32..c231d6226 100644 --- a/R/helpers.R +++ b/R/helpers.R @@ -1,6 +1,10 @@ # small wrapper around this commonly used try-catch .safe <- function(code, on_error = NULL) { - tryCatch(code, error = function(e) on_error) + if (isTRUE(getOption("easystats_errors", FALSE)) && is.null(on_error)) { + code + } else { + tryCatch(code, error = function(e) on_error) + } } @@ -34,13 +38,10 @@ if (!is.numeric(x)) { return(x) } - # remove missings tmp <- x[!is.na(x)] - # standardize tmp <- (tmp - mean(tmp)) / stats::sd(tmp) - # and fill in values in original vector x[!is.na(x)] <- tmp @@ -65,10 +66,10 @@ .get_sigma <- function(model, verbose = TRUE) { s <- insight::get_sigma(model, ci = NULL, verbose = verbose) - if (!is.null(s)) { - as.numeric(s) - } else { + if (is.null(s)) { NULL + } else { + as.numeric(s) } } diff --git a/R/icc.R b/R/icc.R index 16821e85f..47fbb9a45 100644 --- a/R/icc.R +++ b/R/icc.R @@ -267,7 +267,7 @@ icc <- function(model, # CI for adjusted ICC icc_ci_adjusted <- as.vector(result$t[, 1]) icc_ci_adjusted <- icc_ci_adjusted[!is.na(icc_ci_adjusted)] - # sanity check + # validation check if (length(icc_ci_adjusted) > 0) { icc_ci_adjusted <- bayestestR::eti(icc_ci_adjusted, ci = ci) } else { @@ -276,7 +276,7 @@ icc <- function(model, # CI for unadjusted ICC icc_ci_unadjusted <- as.vector(result$t[, 2]) icc_ci_unadjusted <- icc_ci_unadjusted[!is.na(icc_ci_unadjusted)] - # sanity check + # validation check if (length(icc_ci_unadjusted) > 0) { icc_ci_unadjusted <- bayestestR::eti(icc_ci_unadjusted, ci = ci) } else { @@ -532,14 +532,12 @@ print.icc_decomposed <- function(x, digits = 2, ...) 
{ name_full = "ICC", verbose = TRUE) { vars <- tryCatch( - { - insight::get_variance(model, - name_fun = name_fun, - name_full = name_full, - tolerance = tolerance, - verbose = verbose - ) - }, + insight::get_variance(model, + name_fun = name_fun, + name_full = name_full, + tolerance = tolerance, + verbose = verbose + ), error = function(e) { if (inherits(e, c("simpleError", "error")) && verbose) { insight::print_color(e$message, "red") @@ -597,7 +595,7 @@ print.icc_decomposed <- function(x, digits = 2, ...) { # prepare arguments for "lme4::bootMer" .do_lme4_bootmer <- function(model, .boot_fun, iterations, dots) { insight::check_if_installed(c("lme4", "boot")) - args <- list( + my_args <- list( model, .boot_fun, nsim = iterations, @@ -608,25 +606,25 @@ print.icc_decomposed <- function(x, digits = 2, ...) { ) # add/overwrite dot-args if (!is.null(dots[["use.u"]])) { - args$use.u <- dots[["use.u"]] + my_args$use.u <- dots[["use.u"]] } if (!is.null(dots[["re.form"]])) { - args$re.form <- dots[["re.form"]] + my_args$re.form <- dots[["re.form"]] } if (!is.null(dots[["type"]])) { - args$type <- dots[["type"]] - if (args$type == "semiparametric") { - args$use.u <- TRUE + my_args$type <- dots[["type"]] + if (my_args$type == "semiparametric") { + my_args$use.u <- TRUE } } if (!is.null(dots[["parallel"]])) { - args$parallel <- dots[["parallel"]] + my_args$parallel <- dots[["parallel"]] } if (!is.null(dots[["ncpus"]])) { - args$ncpus <- dots[["ncpus"]] + my_args$ncpus <- dots[["ncpus"]] } # bootsrap - do.call(lme4::bootMer, args) + do.call(lme4::bootMer, args = my_args) } @@ -664,12 +662,10 @@ print.icc_decomposed <- function(x, digits = 2, ...) 
{ } model_rank <- tryCatch( - { - if (!is.null(model$rank)) { - model$rank - df_int - } else { - insight::n_parameters(model) - df_int - } + if (!is.null(model$rank)) { + model$rank - df_int + } else { + insight::n_parameters(model) - df_int }, error = function(e) insight::n_parameters(model) - df_int ) diff --git a/R/looic.R b/R/looic.R index 8f0a0c66e..4ded6ccd7 100644 --- a/R/looic.R +++ b/R/looic.R @@ -12,6 +12,7 @@ #' @return A list with four elements, the ELPD, LOOIC and their standard errors. #' #' @examplesIf require("rstanarm") +#' \donttest{ #' model <- suppressWarnings(rstanarm::stan_glm( #' mpg ~ wt + cyl, #' data = mtcars, @@ -20,6 +21,7 @@ #' refresh = 0 #' )) #' looic(model) +#' } #' @export looic <- function(model, verbose = TRUE) { insight::check_if_installed("loo") diff --git a/R/model_performance.bayesian.R b/R/model_performance.bayesian.R index fd796fb60..2ba4e5130 100644 --- a/R/model_performance.bayesian.R +++ b/R/model_performance.bayesian.R @@ -40,7 +40,7 @@ #' #' - **PCP**: percentage of correct predictions, see [performance_pcp()]. #' -#' @examplesIf require("rstanarm") && require("rstantools") && require("BayesFactor") +#' @examplesIf require("rstanarm") && require("rstantools") #' \donttest{ #' model <- suppressWarnings(rstanarm::stan_glm( #' mpg ~ wt + cyl, @@ -59,12 +59,6 @@ #' refresh = 0 #' )) #' model_performance(model) -#' -#' model <- BayesFactor::generalTestBF(carb ~ am + mpg, mtcars) -#' -#' model_performance(model) -#' model_performance(model[3]) -#' model_performance(model, average = TRUE) #' } #' @seealso [r2_bayes] #' @references Gelman, A., Goodrich, B., Gabry, J., and Vehtari, A. (2018). diff --git a/R/model_performance.rma.R b/R/model_performance.rma.R index 6a3fb2e93..d7fd37e09 100644 --- a/R/model_performance.rma.R +++ b/R/model_performance.rma.R @@ -1,7 +1,7 @@ #' Performance of Meta-Analysis Models #' #' Compute indices of model performance for meta-analysis model from the -#' \pkg{metafor} package. 
+#' **metafor** package. #' #' @param model A `rma` object as returned by `metafor::rma()`. #' @param metrics Can be `"all"` or a character vector of metrics to be @@ -18,7 +18,7 @@ #' #' - **AIC** Akaike's Information Criterion, see `?stats::AIC` #' -#' - **BIC** {Bayesian Information Criterion, see `?stats::BIC`} +#' - **BIC** Bayesian Information Criterion, see `?stats::BIC` #' #' - **I2**: For a random effects model, `I2` estimates (in #' percent) how much of the total variability in the effect size estimates diff --git a/R/performance_aicc.R b/R/performance_aicc.R index 171a97ca2..e222dfc82 100644 --- a/R/performance_aicc.R +++ b/R/performance_aicc.R @@ -14,7 +14,8 @@ #' @param x A model object. #' @param estimator Only for linear models. Corresponds to the different #' estimators for the standard deviation of the errors. If `estimator = "ML"` -#' (default), the scaling is done by n (the biased ML estimator), which is +#' (default, except for `performance_aic()` when the model object is of class +#' `lmerMod`), the scaling is done by `n` (the biased ML estimator), which is #' then equivalent to using `AIC(logLik())`. Setting it to `"REML"` will give #' the same results as `AIC(logLik(..., REML = TRUE))`. #' @param verbose Toggle warnings. @@ -101,11 +102,16 @@ performance_aic.default <- function(x, estimator = "ML", verbose = TRUE, ...) { # mixed models ------------------------------------ +#' @rdname performance_aicc #' @export performance_aic.lmerMod <- function(x, estimator = "REML", verbose = TRUE, ...) { REML <- identical(estimator, "REML") if (isFALSE(list(...)$REML)) REML <- FALSE + if (isFALSE(as.logical(x@devcomp$dims[["REML"]])) && isTRUE(REML) && verbose) { + insight::format_alert("Model was not fitted with REML, however, `estimator = \"REML\"`. 
Set `estimator = \"ML\"` to obtain identical results as from `AIC()`.") # nolint + } + .safe( stats::AIC(insight::get_loglikelihood(x, check_response = TRUE, REML = REML, verbose = verbose)) ) @@ -275,26 +281,17 @@ performance_aicc.rma <- function(x, ...) { tryCatch( { trans <- insight::find_transformation(x) - - if (trans == "identity") { - .weighted_sum(log(insight::get_response(x)), w = model_weights) - } else if (trans == "log") { - .weighted_sum(log(1 / insight::get_response(x)), w = model_weights) - } else if (trans == "log1p") { - .weighted_sum(log(1 / (insight::get_response(x) + 1)), w = model_weights) - } else if (trans == "log2") { - .weighted_sum(log(1 / (insight::get_response(x) * log(2))), w = model_weights) - } else if (trans == "log10") { - .weighted_sum(log(1 / (insight::get_response(x) * log(10))), w = model_weights) - } else if (trans == "exp") { - .weighted_sum(insight::get_response(x), w = model_weights) - } else if (trans == "expm1") { - .weighted_sum((insight::get_response(x) - 1), w = model_weights) - } else if (trans == "sqrt") { - .weighted_sum(log(0.5 / sqrt(insight::get_response(x))), w = model_weights) - } else { + switch(trans, + identity = .weighted_sum(log(insight::get_response(x)), w = model_weights), + log = .weighted_sum(log(1 / insight::get_response(x)), w = model_weights), + log1p = .weighted_sum(log(1 / (insight::get_response(x) + 1)), w = model_weights), + log2 = .weighted_sum(log(1 / (insight::get_response(x) * log(2))), w = model_weights), + log10 = .weighted_sum(log(1 / (insight::get_response(x) * log(10))), w = model_weights), + exp = .weighted_sum(insight::get_response(x), w = model_weights), + expm1 = .weighted_sum((insight::get_response(x) - 1), w = model_weights), + sqrt = .weighted_sum(log(0.5 / sqrt(insight::get_response(x))), w = model_weights), .ll_jacobian_adjustment(x, model_weights) - } + ) }, error = function(e) { NULL diff --git a/R/performance_pcp.R b/R/performance_pcp.R index 653c4afa5..edcf4c724 100644 --- 
a/R/performance_pcp.R +++ b/R/performance_pcp.R @@ -112,13 +112,13 @@ print.performance_pcp <- function(x, digits = 2, ...) { #' @export as.data.frame.performance_pcp <- function(x, row.names = NULL, ...) { data.frame( - "Model" = c("full", "null"), - "Estimate" = c(x$pcp_model, x$pcp_m0), - "CI_low" = c(x$model_ci_low, x$null_ci_low), - "CI_high" = c(x$model_ci_high, x$null_ci_high), - "Chisq" = c(NA, x$lrt_chisq), - "df_error" = c(NA, x$lrt_df_error), - "p" = c(NA, x$lrt_p), + Model = c("full", "null"), + Estimate = c(x$pcp_model, x$pcp_m0), + CI_low = c(x$model_ci_low, x$null_ci_low), + CI_high = c(x$model_ci_high, x$null_ci_high), + Chisq = c(NA, x$lrt_chisq), + df_error = c(NA, x$lrt_df_error), + p = c(NA, x$lrt_p), stringsAsFactors = FALSE, row.names = row.names, ... diff --git a/R/performance_score.R b/R/performance_score.R index eb5ee9b31..5ca3fdc84 100644 --- a/R/performance_score.R +++ b/R/performance_score.R @@ -65,7 +65,7 @@ performance_score <- function(model, verbose = TRUE, ...) { if (minfo$is_ordinal || minfo$is_multinomial) { if (verbose) { - insight::print_color("Can't calculate proper scoring rules for ordinal, multinomial or cumulative link models.\n", "red") + insight::format_alert("Can't calculate proper scoring rules for ordinal, multinomial or cumulative link models.") } return(list(logarithmic = NA, quadratic = NA, spherical = NA)) } @@ -74,10 +74,7 @@ performance_score <- function(model, verbose = TRUE, ...) { if (!is.null(ncol(resp)) && ncol(resp) > 1) { if (verbose) { - insight::print_color( - "Can't calculate proper scoring rules for models without integer response values.\n", - "red" - ) + insight::format_alert("Can't calculate proper scoring rules for models without integer response values.") } return(list(logarithmic = NA, quadratic = NA, spherical = NA)) } @@ -127,7 +124,14 @@ performance_score <- function(model, verbose = TRUE, ...) 
{ } else { datawizard::to_numeric(resp, dummy_factors = FALSE, preserve_levels = TRUE) } - p_y <- prob_fun(resp, mean = pr$pred, pis = pr$pred_zi, sum(resp)) + p_y <- .safe(suppressWarnings(prob_fun(resp, mean = pr$pred, pis = pr$pred_zi, sum(resp)))) + + if (is.null(p_y) || all(is.na(p_y))) { + if (verbose) { + insight::format_alert("Can't calculate proper scoring rules for this model.") + } + return(list(logarithmic = NA, quadratic = NA, spherical = NA)) + } quadrat_p <- sum(p_y^2) @@ -205,27 +209,25 @@ print.performance_score <- function(x, ...) { pred_zi <- NULL tryCatch( - { - if (inherits(model, "MixMod")) { - pred <- stats::predict(model, type = "subject_specific") - pred_zi <- if (!is.null(model$gammas)) attr(pred, "zi_probs") - } else if (inherits(model, "glmmTMB")) { - pred <- stats::predict(model, type = "response") - pred_zi <- stats::predict(model, type = "zprob") - } else if (inherits(model, c("hurdle", "zeroinfl"))) { - pred <- stats::predict(model, type = "response") - pred_zi <- stats::predict(model, type = "zero") - } else if (inherits(model, c("clm", "clm2", "clmm"))) { - pred <- stats::predict(model) - } else if (all(inherits(model, c("stanreg", "lmerMod"), which = TRUE)) > 0) { - insight::check_if_installed("rstanarm") - pred <- colMeans(rstanarm::posterior_predict(model)) - } else { - pred <- stats::predict(model, type = "response") - } + if (inherits(model, "MixMod")) { + pred <- stats::predict(model, type = "subject_specific") + pred_zi <- if (!is.null(model$gammas)) attr(pred, "zi_probs") + } else if (inherits(model, "glmmTMB")) { + pred <- stats::predict(model, type = "response") + pred_zi <- stats::predict(model, type = "zprob") + } else if (inherits(model, c("hurdle", "zeroinfl"))) { + pred <- stats::predict(model, type = "response") + pred_zi <- stats::predict(model, type = "zero") + } else if (inherits(model, c("clm", "clm2", "clmm"))) { + pred <- stats::predict(model) + } else if (all(inherits(model, c("stanreg", "lmerMod"), which = 
TRUE)) > 0) { + insight::check_if_installed("rstanarm") + pred <- colMeans(rstanarm::posterior_predict(model)) + } else { + pred <- stats::predict(model, type = "response") }, error = function(e) { - return(NULL) + NULL } ) diff --git a/R/r2.R b/R/r2.R index c844f22cd..94982b401 100644 --- a/R/r2.R +++ b/R/r2.R @@ -386,6 +386,9 @@ r2.censReg <- function(model, ...) { #' @export r2.cpglm <- r2.censReg +#' @export +r2.serp <- r2.censReg + #' @export r2.clm <- r2.censReg @@ -467,9 +470,6 @@ r2.merMod <- function(model, ci = NULL, tolerance = 1e-5, ...) { r2_nakagawa(model, ci = ci, tolerance = tolerance, ...) } -#' @export -r2.glmmTMB <- r2.merMod - #' @export r2.cpglmm <- r2.merMod @@ -491,6 +491,48 @@ r2.MixMod <- r2.merMod #' @export r2.rlmerMod <- r2.merMod +#' @export +r2.glmmTMB <- function(model, ci = NULL, tolerance = 1e-5, verbose = TRUE, ...) { + # most models are mixed models + if (insight::is_mixed_model(model)) { + return(r2_nakagawa(model, ci = ci, tolerance = tolerance, ...)) + } else { + if (!is.null(ci) && !is.na(ci)) { + return(.r2_ci(model, ci = ci, ...)) + } + # calculate r2 for non-mixed glmmTMB models here ------------------------- + info <- insight::model_info(model, verbose = FALSE) + + if (info$is_linear) { + # for linear models, use the manual calculation + out <- .safe(.r2_lm_manual(model)) + } else if (info$is_logit && info$is_bernoulli) { + # logistic regression with binary outcome + out <- list(R2_Tjur = r2_tjur(model, model_info = info, ...)) + attr(out, "model_type") <- "Logistic" + names(out$R2_Tjur) <- "Tjur's R2" + class(out) <- c("r2_pseudo", class(out)) + } else if (info$is_binomial && !info$is_bernoulli) { + # currently, non-bernoulli binomial models are not supported + if (verbose) { + insight::format_warning("Can't calculate accurate R2 for binomial models that are not Bernoulli models.") + } + out <- NULL + } else if ((info$is_poisson && !info$is_zero_inflated) || info$is_exponential) { + # Poisson-regression or Gamma uses 
Nagelkerke's R2 + out <- list(R2_Nagelkerke = r2_nagelkerke(model, ...)) + names(out$R2_Nagelkerke) <- "Nagelkerke's R2" + attr(out, "model_type") <- "Generalized Linear" + class(out) <- c("r2_pseudo", class(out)) + } else if (info$is_zero_inflated) { + # zero-inflated models use the default method + out <- r2_zeroinflated(model) + } else { + insight::format_error("`r2()` does not support models of class `glmmTMB` without random effects and this link-function.") # nolint + } + } + out +} #' @export r2.wbm <- function(model, tolerance = 1e-5, ...) { @@ -840,3 +882,43 @@ r2.DirichletRegModel <- function(model, ...) { } ci } + + +.r2_lm_manual <- function(model) { + w <- insight::get_weights(model, verbose = FALSE) + r <- stats::residuals(model) + f <- stats::fitted(model) + n <- length(r) + rdf <- .safe(stats::df.residual(model)) + df_int <- .safe(as.numeric(insight::has_intercept(model))) + + if (insight::has_intercept(model)) { + if (is.null(w)) { + mss <- sum((f - mean(f))^2) + } else { + m <- sum(w * f / sum(w)) + mss <- sum(w * (f - m)^2) + } + } else if (is.null(w)) { + mss <- sum(f^2) + } else { + mss <- sum(w * f^2) + } + if (is.null(w)) { + rss <- sum(r^2) + } else { + rss <- sum(w * r^2) + } + r_squared <- mss / (mss + rss) + if (is.null(df_int) || is.null(rdf)) { + adj_r2 <- NULL + } else { + adj_r2 <- 1 - (1 - r_squared) * ((n - df_int) / rdf) + } + out <- list(R2 = r_squared, R2_adjusted = adj_r2) + + names(out$R2) <- "R2" + names(out$R2_adjusted) <- "adjusted R2" + attr(out, "model_type") <- "Linear" + structure(class = "r2_generic", out) +} diff --git a/R/r2_bayes.R b/R/r2_bayes.R index fc007489c..ec98f754b 100644 --- a/R/r2_bayes.R +++ b/R/r2_bayes.R @@ -30,67 +30,45 @@ #' `r2_posterior()` is the actual workhorse for `r2_bayes()` and #' returns a posterior sample of Bayesian R2 values. 
#' -#' @examples +#' @examplesIf require("rstanarm") && require("rstantools") && require("brms") #' library(performance) -#' if (require("rstanarm") && require("rstantools")) { -#' model <- suppressWarnings(stan_glm( -#' mpg ~ wt + cyl, -#' data = mtcars, -#' chains = 1, -#' iter = 500, -#' refresh = 0, -#' show_messages = FALSE -#' )) -#' r2_bayes(model) -#' -#' model <- suppressWarnings(stan_lmer( -#' Petal.Length ~ Petal.Width + (1 | Species), -#' data = iris, -#' chains = 1, -#' iter = 500, -#' refresh = 0 -#' )) -#' r2_bayes(model) -#' } -#' -#' if (require("BayesFactor")) { -#' BFM <- generalTestBF(mpg ~ qsec + gear, data = mtcars, progress = FALSE) -#' FM <- lmBF(mpg ~ qsec + gear, data = mtcars) -#' -#' r2_bayes(FM) -#' r2_bayes(BFM[3]) -#' r2_bayes(BFM, average = TRUE) # across all models -#' -#' # with random effects: -#' mtcars$gear <- factor(mtcars$gear) -#' model <- lmBF( -#' mpg ~ hp + cyl + gear + gear:wt, -#' mtcars, -#' progress = FALSE, -#' whichRandom = c("gear", "gear:wt") -#' ) +#' \donttest{ +#' model <- suppressWarnings(rstanarm::stan_glm( +#' mpg ~ wt + cyl, +#' data = mtcars, +#' chains = 1, +#' iter = 500, +#' refresh = 0, +#' show_messages = FALSE +#' )) +#' r2_bayes(model) #' -#' r2_bayes(model) +#' model <- suppressWarnings(rstanarm::stan_lmer( +#' Petal.Length ~ Petal.Width + (1 | Species), +#' data = iris, +#' chains = 1, +#' iter = 500, +#' refresh = 0 +#' )) +#' r2_bayes(model) #' } #' #' \donttest{ -#' if (require("brms")) { -#' model <- suppressWarnings(brms::brm( -#' mpg ~ wt + cyl, -#' data = mtcars, -#' silent = 2, -#' refresh = 0 -#' )) -#' r2_bayes(model) +#' model <- suppressWarnings(brms::brm( +#' mpg ~ wt + cyl, +#' data = mtcars, +#' silent = 2, +#' refresh = 0 +#' )) +#' r2_bayes(model) #' -#' model <- suppressWarnings(brms::brm( -#' Petal.Length ~ Petal.Width + (1 | Species), -#' data = iris, -#' silent = 2, -#' refresh = 0 -#' )) -#' r2_bayes(model) -#' } +#' model <- suppressWarnings(brms::brm( +#' Petal.Length ~ 
Petal.Width + (1 | Species), +#' data = iris, +#' silent = 2, +#' refresh = 0 +#' )) +#' r2_bayes(model) #' } #' @references #' Gelman, A., Goodrich, B., Gabry, J., and Vehtari, A. (2018). @@ -114,7 +92,7 @@ r2_bayes <- function(model, robust = TRUE, ci = 0.95, verbose = TRUE, ...) { mean(i) } }), - "SE" = rapply(r2_bayesian, function(i) { + SE = rapply(r2_bayesian, function(i) { if (robust) { stats::mad(i) } else { @@ -122,9 +100,9 @@ r2_bayes <- function(model, robust = TRUE, ci = 0.95, verbose = TRUE, ...) { } }), # "Estimates" = rapply(r2_bayesian, bayestestR::point_estimate, centrality = "all", dispersion = TRUE), - "CI" = rapply(r2_bayesian, bayestestR::hdi, ci = ci), - "ci_method" = "HDI", - "robust" = robust + CI = rapply(r2_bayesian, bayestestR::hdi, ci = ci), + ci_method = "HDI", + robust = robust ) } else { structure( @@ -136,17 +114,17 @@ r2_bayes <- function(model, robust = TRUE, ci = 0.95, verbose = TRUE, ...) { mean(i) } }), - "SE" = lapply(r2_bayesian, function(i) { + SE = lapply(r2_bayesian, function(i) { if (robust) { stats::mad(i) } else { stats::sd(i) } }), - # "Estimates" = lapply(r2_bayesian, bayestestR::point_estimate, centrality = "all", dispersion = TRUE), - "CI" = lapply(r2_bayesian, bayestestR::hdi, ci = ci), - "ci_method" = "HDI", - "robust" = robust + # Estimates = lapply(r2_bayesian, bayestestR::point_estimate, centrality = "all", dispersion = TRUE), + CI = lapply(r2_bayesian, bayestestR::hdi, ci = ci), + ci_method = "HDI", + robust = robust ) } } @@ -178,13 +156,13 @@ r2_posterior.brmsfit <- function(model, verbose = TRUE, ...) 
{ res <- insight::find_response(model) if (mi[[1]]$is_mixed) { br2_mv <- list( - "R2_Bayes" = rstantools::bayes_R2( + R2_Bayes = rstantools::bayes_R2( model, re.form = NULL, re_formula = NULL, summary = FALSE ), - "R2_Bayes_marginal" = rstantools::bayes_R2( + R2_Bayes_marginal = rstantools::bayes_R2( model, re.form = NA, re_formula = NA, @@ -193,40 +171,38 @@ r2_posterior.brmsfit <- function(model, verbose = TRUE, ...) { ) br2 <- lapply(seq_along(res), function(x) { list( - "R2_Bayes" = unname(as.vector(br2_mv$R2_Bayes[, x])), - "R2_Bayes_marginal" = unname(as.vector(br2_mv$R2_Bayes_marginal[, x])) + R2_Bayes = unname(as.vector(br2_mv$R2_Bayes[, x])), + R2_Bayes_marginal = unname(as.vector(br2_mv$R2_Bayes_marginal[, x])) ) }) names(br2) <- res } else { - br2_mv <- list("R2_Bayes" = rstantools::bayes_R2(model, summary = FALSE)) + br2_mv <- list(R2_Bayes = rstantools::bayes_R2(model, summary = FALSE)) br2 <- lapply(seq_along(res), function(x) { - list("R2_Bayes" = unname(as.vector(br2_mv$R2_Bayes[, x]))) + list(R2_Bayes = unname(as.vector(br2_mv$R2_Bayes[, x]))) }) names(br2) <- res } + } else if (mi$is_mixed) { + br2 <- list( + R2_Bayes = as.vector(rstantools::bayes_R2( + model, + re.form = NULL, + re_formula = NULL, + summary = FALSE + )), + R2_Bayes_marginal = as.vector(rstantools::bayes_R2( + model, + re.form = NA, + re_formula = NA, + summary = FALSE + )) + ) + names(br2$R2_Bayes) <- rep("Conditional R2", length(br2$R2_Bayes)) + names(br2$R2_Bayes_marginal) <- rep("Marginal R2", length(br2$R2_Bayes)) } else { - if (mi$is_mixed) { - br2 <- list( - "R2_Bayes" = as.vector(rstantools::bayes_R2( - model, - re.form = NULL, - re_formula = NULL, - summary = FALSE - )), - "R2_Bayes_marginal" = as.vector(rstantools::bayes_R2( - model, - re.form = NA, - re_formula = NA, - summary = FALSE - )) - ) - names(br2$R2_Bayes) <- rep("Conditional R2", length(br2$R2_Bayes)) - names(br2$R2_Bayes_marginal) <- rep("Marginal R2", length(br2$R2_Bayes)) - } else { - br2 <- list("R2_Bayes" 
= as.vector(rstantools::bayes_R2(model, summary = FALSE))) - names(br2$R2_Bayes) <- rep("R2", length(br2$R2_Bayes)) - } + br2 <- list(R2_Bayes = as.vector(rstantools::bayes_R2(model, summary = FALSE))) + names(br2$R2_Bayes) <- rep("R2", length(br2$R2_Bayes)) } br2 @@ -339,10 +315,10 @@ r2_posterior.BFBayesFactor <- function(model, # Compute posterior model probabilities - if (!is.null(prior_odds)) { - prior_odds <- c(1, prior_odds) - } else { + if (is.null(prior_odds)) { prior_odds <- rep(1, nrow(BFMods)) + } else { + prior_odds <- c(1, prior_odds) } posterior_odds <- prior_odds * BFMods$BF posterior_odds <- posterior_odds[-1] / posterior_odds[1] @@ -404,7 +380,7 @@ as.data.frame.r2_bayes <- function(x, ...) { # remove sig and g cols params_theta <- params[, !grepl(pattern = "^sig2$|^g_|^g$", colnames(params))] - params_sigma <- sqrt(params[, grepl(pattern = "^sig2$", colnames(params))]) + params_sigma <- sqrt(params[, colnames(params) == "sig2"]) # Model Matrix mm <- insight::get_modelmatrix(model[1]) @@ -416,7 +392,7 @@ as.data.frame.r2_bayes <- function(x, ...) { if (utils::packageVersion("BayesFactor") < package_version("0.9.12.4.3")) { insight::format_error("R2 for BayesFactor models with random effects requires BayesFactor v0.9.12.4.3 or higher.") } - insight::format_error("Woops, you seem to have stumbled on some weird edge case. Please file an issue at {.url https://github.com/easystats/performance/issues}") + insight::format_error("Woops, you seem to have stumbled on some weird edge case. Please file an issue at {.url https://github.com/easystats/performance/issues}") # nolint } out <- list( diff --git a/R/r2_coxsnell.R b/R/r2_coxsnell.R index cdb73dea7..3db7197b7 100644 --- a/R/r2_coxsnell.R +++ b/R/r2_coxsnell.R @@ -69,26 +69,51 @@ r2_coxsnell.glm <- function(model, verbose = TRUE, ...) 
{ if (is.null(info)) { info <- suppressWarnings(insight::model_info(model, verbose = FALSE)) } + # Cox & Snell's R2 is not defined for binomial models that are not Bernoulli models if (info$is_binomial && !info$is_bernoulli && class(model)[1] == "glm") { if (verbose) { insight::format_alert("Can't calculate accurate R2 for binomial models that are not Bernoulli models.") } return(NULL) - } else { - # if no deviance, return NA - if (is.null(model$deviance)) { - return(NULL) - } - r2_coxsnell <- (1 - exp((model$deviance - model$null.deviance) / insight::n_obs(model, disaggregate = TRUE))) - names(r2_coxsnell) <- "Cox & Snell's R2" - r2_coxsnell } + # if no deviance, return NULL + if (is.null(model$deviance)) { + return(NULL) + } + r2_coxsnell <- (1 - exp((model$deviance - model$null.deviance) / insight::n_obs(model, disaggregate = TRUE))) + names(r2_coxsnell) <- "Cox & Snell's R2" + r2_coxsnell } #' @export r2_coxsnell.BBreg <- r2_coxsnell.glm +#' @export +r2_coxsnell.glmmTMB <- function(model, verbose = TRUE, ...) { + info <- list(...)$model_info + if (is.null(info)) { + info <- suppressWarnings(insight::model_info(model, verbose = FALSE)) + } + # Cox & Snell's R2 is not defined for binomial models that are not Bernoulli models + if (info$is_binomial && !info$is_bernoulli) { + if (verbose) { + insight::format_alert("Can't calculate accurate R2 for binomial models that are not Bernoulli models.") + } + return(NULL) + } + dev <- stats::deviance(model) + # if no deviance, return NULL + if (is.null(dev)) { + return(NULL) + } + null_dev <- stats::deviance(insight::null_model(model)) + r2_coxsnell <- (1 - exp((dev - null_dev) / insight::n_obs(model, disaggregate = TRUE))) + names(r2_coxsnell) <- "Cox & Snell's R2" + r2_coxsnell +} + + #' @export r2_coxsnell.nestedLogit <- function(model, ...) { n <- insight::n_obs(model, disaggregate = TRUE) @@ -218,6 +243,9 @@ r2_coxsnell.clm <- function(model, ...) 
{ #' @export r2_coxsnell.crch <- r2_coxsnell.clm +#' @export +r2_coxsnell.serp <- r2_coxsnell.clm + #' @export r2_coxsnell.cpglm <- r2_coxsnell.clm diff --git a/R/r2_kl.R b/R/r2_kl.R index b5c1bb649..553e338e4 100644 --- a/R/r2_kl.R +++ b/R/r2_kl.R @@ -7,6 +7,7 @@ #' @param model A generalized linear model. #' @param adjust Logical, if `TRUE` (the default), the adjusted R2 value is #' returned. +#' @param ... Additional arguments. Currently not used. #' #' @return A named vector with the R2 value. #' @@ -19,7 +20,13 @@ #' 77: 329-342. #' #' @export -r2_kullback <- function(model, adjust = TRUE) { +r2_kullback <- function(model, ...) { + UseMethod("r2_kullback") +} + +#' @rdname r2_kullback +#' @export +r2_kullback.glm <- function(model, adjust = TRUE, ...) { if (adjust) { adj <- model$df.null / model$df.residual } else { @@ -31,3 +38,8 @@ r2_kullback <- function(model, adjust = TRUE) { names(klr2) <- "Kullback-Leibler R2" klr2 } + +#' @export +r2_kullback.default <- function(model, ...) { + insight::format_error("This function only works for objects of class `glm`.") +} diff --git a/R/r2_loo.R b/R/r2_loo.R index 040d9b572..dbcaf38b3 100644 --- a/R/r2_loo.R +++ b/R/r2_loo.R @@ -50,10 +50,10 @@ r2_loo <- function(model, robust = TRUE, ci = 0.95, verbose = TRUE, ...) { loo_r2 <- structure( class = "r2_loo", lapply(loo_r2, ifelse(robust, stats::median, mean)), - "SE" = lapply(loo_r2, ifelse(robust, stats::mad, stats::sd)), + SE = lapply(loo_r2, ifelse(robust, stats::mad, stats::sd)), # "Estimates" = lapply(r2_bayesian, bayestestR::point_estimate, centrality = "all", dispersion = TRUE), - "CI" = lapply(loo_r2, bayestestR::hdi, ci = ci), - "robust" = robust + CI = lapply(loo_r2, bayestestR::hdi, ci = ci), + robust = robust ) return(loo_r2) } @@ -84,13 +84,13 @@ r2_loo_posterior.brmsfit <- function(model, verbose = TRUE, ...) 
{ res <- insight::find_response(model) if (mi[[1]]$is_mixed) { br2_mv <- list( - "R2_loo" = rstantools::loo_R2( + R2_loo = rstantools::loo_R2( model, re.form = NULL, re_formula = NULL, summary = FALSE ), - "R2_loo_marginal" = rstantools::loo_R2( + R2_loo_marginal = rstantools::loo_R2( model, re.form = NA, re_formula = NA, @@ -99,40 +99,38 @@ r2_loo_posterior.brmsfit <- function(model, verbose = TRUE, ...) { ) br2 <- lapply(seq_along(res), function(x) { list( - "R2_loo" = unname(as.vector(br2_mv$R2_loo[, x])), - "R2_loo_marginal" = unname(as.vector(br2_mv$R2_loo_marginal[, x])) + R2_loo = unname(as.vector(br2_mv$R2_loo[, x])), + R2_loo_marginal = unname(as.vector(br2_mv$R2_loo_marginal[, x])) ) }) names(br2) <- res } else { - br2_mv <- list("R2_loo" = rstantools::loo_R2(model, summary = FALSE)) + br2_mv <- list(R2_loo = rstantools::loo_R2(model, summary = FALSE)) br2 <- lapply(seq_along(res), function(x) { - list("R2_loo" = unname(as.vector(br2_mv$R2_loo[, x]))) + list(R2_loo = unname(as.vector(br2_mv$R2_loo[, x]))) }) names(br2) <- res } + } else if (mi$is_mixed) { + br2 <- list( + R2_loo = as.vector(rstantools::loo_R2( + model, + re.form = NULL, + re_formula = NULL, + summary = FALSE + )), + R2_loo_marginal = as.vector(rstantools::loo_R2( + model, + re.form = NA, + re_formula = NA, + summary = FALSE + )) + ) + names(br2$R2_loo) <- rep("Conditional R2_adjusted", length(br2$R2_loo)) + names(br2$R2_loo_marginal) <- rep("Marginal R2_adjusted", length(br2$R2_loo)) } else { - if (mi$is_mixed) { - br2 <- list( - "R2_loo" = as.vector(rstantools::loo_R2( - model, - re.form = NULL, - re_formula = NULL, - summary = FALSE - )), - "R2_loo_marginal" = as.vector(rstantools::loo_R2( - model, - re.form = NA, - re_formula = NA, - summary = FALSE - )) - ) - names(br2$R2_loo) <- rep("Conditional R2_adjusted", length(br2$R2_loo)) - names(br2$R2_loo_marginal) <- rep("Marginal R2_adjusted", length(br2$R2_loo)) - } else { - br2 <- list("R2_loo" = as.vector(rstantools::loo_R2(model, 
summary = FALSE))) - names(br2$R2_loo) <- rep("R2_adjusted", length(br2$R2_loo)) - } + br2 <- list(R2_loo = as.vector(rstantools::loo_R2(model, summary = FALSE))) + names(br2$R2_loo) <- rep("R2_adjusted", length(br2$R2_loo)) } br2 diff --git a/R/r2_mcfadden.R b/R/r2_mcfadden.R index 25bb70301..b04977517 100644 --- a/R/r2_mcfadden.R +++ b/R/r2_mcfadden.R @@ -77,6 +77,9 @@ r2_mcfadden.glm <- function(model, verbose = TRUE, ...) { #' @export r2_mcfadden.clm <- r2_mcfadden.glm +#' @export +r2_mcfadden.serp <- r2_mcfadden.glm + #' @export r2_mcfadden.cpglm <- r2_mcfadden.glm diff --git a/R/r2_nagelkerke.R b/R/r2_nagelkerke.R index 85bcc6e8f..9b5b40f8f 100644 --- a/R/r2_nagelkerke.R +++ b/R/r2_nagelkerke.R @@ -58,26 +58,64 @@ r2_nagelkerke.glm <- function(model, verbose = TRUE, ...) { if (is.null(info)) { info <- suppressWarnings(insight::model_info(model, verbose = FALSE)) } + if (info$is_binomial && !info$is_bernoulli && class(model)[1] == "glm") { if (verbose) { insight::format_warning("Can't calculate accurate R2 for binomial models that are not Bernoulli models.") } return(NULL) - } else { - r2cox <- r2_coxsnell(model) - if (is.na(r2cox) || is.null(r2cox)) { - return(NULL) - } - r2_nagelkerke <- r2cox / (1 - exp(-model$null.deviance / insight::n_obs(model, disaggregate = TRUE))) - names(r2_nagelkerke) <- "Nagelkerke's R2" - r2_nagelkerke } + + r2cox <- r2_coxsnell(model) + + if (is.na(r2cox) || is.null(r2cox)) { + return(NULL) + } + + r2_nagelkerke <- r2cox / (1 - exp(-model$null.deviance / insight::n_obs(model, disaggregate = TRUE))) + names(r2_nagelkerke) <- "Nagelkerke's R2" + r2_nagelkerke } #' @export r2_nagelkerke.BBreg <- r2_nagelkerke.glm +#' @export +r2_nagelkerke.glmmTMB <- function(model, verbose = TRUE, ...) 
{ + info <- list(...)$model_info + if (is.null(info)) { + info <- suppressWarnings(insight::model_info(model, verbose = FALSE)) + } + + if (info$is_binomial && !info$is_bernoulli) { + if (verbose) { + insight::format_warning("Can't calculate accurate R2 for binomial models that are not Bernoulli models.") + } + return(NULL) + } + + dev <- stats::deviance(model) + + # if no deviance, return NULL + if (is.null(dev)) { + return(NULL) + } + + null_mod <- suppressWarnings(insight::null_model(model)) + null_dev <- stats::deviance(null_mod) + r2cox <- (1 - exp((dev - null_dev) / insight::n_obs(model, disaggregate = TRUE))) + + if (is.na(r2cox) || is.null(r2cox)) { + return(NULL) + } + + r2_nagelkerke <- r2cox / (1 - exp(-null_dev / insight::n_obs(model, disaggregate = TRUE))) + names(r2_nagelkerke) <- "Nagelkerke's R2" + r2_nagelkerke +} + + #' @export r2_nagelkerke.nestedLogit <- function(model, ...) { n <- insight::n_obs(model, disaggregate = TRUE) @@ -163,6 +201,9 @@ r2_nagelkerke.clm <- function(model, ...) 
{ #' @export r2_nagelkerke.polr <- r2_nagelkerke.clm +#' @export +r2_nagelkerke.serp <- r2_nagelkerke.clm + #' @export r2_nagelkerke.cpglm <- r2_nagelkerke.clm diff --git a/R/r2_nakagawa.R b/R/r2_nakagawa.R index 9b751c843..3e72a9457 100644 --- a/R/r2_nakagawa.R +++ b/R/r2_nakagawa.R @@ -136,7 +136,7 @@ r2_nakagawa <- function(model, # CI for marginal R2 r2_ci_marginal <- as.vector(result$t[, 1]) r2_ci_marginal <- r2_ci_marginal[!is.na(r2_ci_marginal)] - # sanity check + # validation check if (length(r2_ci_marginal) > 0) { r2_ci_marginal <- bayestestR::eti(r2_ci_marginal, ci = ci) } else { @@ -146,7 +146,7 @@ r2_nakagawa <- function(model, # CI for unadjusted R2 r2_ci_conditional <- as.vector(result$t[, 2]) r2_ci_conditional <- r2_ci_conditional[!is.na(r2_ci_conditional)] - # sanity check + # validation check if (length(r2_ci_conditional) > 0) { r2_ci_conditional <- bayestestR::eti(r2_ci_conditional, ci = ci) } else { diff --git a/R/r2_zeroinflated.R b/R/r2_zeroinflated.R index b1e057bb6..23d859587 100644 --- a/R/r2_zeroinflated.R +++ b/R/r2_zeroinflated.R @@ -63,7 +63,6 @@ r2_zeroinflated <- function(model, method = c("default", "correlation")) { k <- length(insight::find_parameters(model)[["conditional"]]) y <- insight::get_response(model, verbose = FALSE) - # pred <- stats::predict(model, type = "response") var_fixed <- sum((stats::fitted(model) - mean(y))^2) var_resid <- sum(stats::residuals(model, type = "pearson")^2) diff --git a/R/simulate_residuals.R b/R/simulate_residuals.R new file mode 100644 index 000000000..7312c196f --- /dev/null +++ b/R/simulate_residuals.R @@ -0,0 +1,161 @@ +#' @title Simulate randomized quantile residuals from a model +#' @name simulate_residuals +#' +#' @description Returns simulated residuals from a model. This is useful for +#' checking the uniformity of residuals, in particular for non-Gaussian models, +#' where the residuals are not expected to be normally distributed. +#' +#' @param x A model object. 
+#' @param iterations Number of simulations to run. Must be at least 2. +#' @param ... Arguments passed on to [`DHARMa::simulateResiduals()`]. +#' @param object A `performance_simres` object, as returned by `simulate_residuals()`. +#' @param quantile_function A function to apply to the residuals. If `NULL`, the +#' residuals are returned as is. If not `NULL`, the residuals are passed to this +#' function. This is useful for returning normally distributed residuals, for +#' example: `residuals(x, quantile_function = qnorm)`. +#' @param outlier_values A vector of length 2, specifying the values to replace +#' `-Inf` and `Inf` with, respectively. Only used if `quantile_function` is not `NULL`. +#' +#' @return Simulated residuals, which can be further processed with +#' [`check_residuals()`]. The returned object is of class `DHARMa` and +#' `performance_simres`. +#' +#' @seealso [`check_residuals()`], [`check_zeroinflation()`], +#' [`check_overdispersion()`] and [`check_predictions()`]. See also +#' [`see::plot.see_performance_simres()`] for options to customize the plot. +#' +#' @details This function is a small wrapper around [`DHARMa::simulateResiduals()`]. +#' It basically only sets `plot = FALSE` and adds an additional class attribute +#' (`"performance_simres"`), which allows using the DHARMa object in the plotting +#' functions from the **see** package. See also `vignette("DHARMa")`. There is a +#' `plot()` method to visualize the distribution of the residuals. +#' +#' @section Tests based on simulated residuals: +#' For certain models, or models from certain families, tests like +#' [`check_zeroinflation()`] or [`check_overdispersion()`] are based on +#' simulated residuals. These are usually more accurate for such tests than +#' the traditionally used Pearson residuals. However, when simulating from more +#' complex models, such as mixed models or models with zero-inflation, there are +#' several important considerations. 
`simulate_residuals()` relies on +#' [`DHARMa::simulateResiduals()`], and additional arguments specified in `...` +#' are passed further down to that function. The defaults in DHARMa are set on +#' the most conservative option that works for all models. However, in many +#' cases, the help advises to use different settings in particular situations +#' or for particular models. It is recommended to read the 'Details' in +#' `?DHARMa::simulateResiduals` closely to understand the implications of the +#' simulation process and which arguments should be modified to get the most +#' accurate results. +#' +#' @references +#' +#' - Hartig, F., & Lohse, L. (2022). DHARMa: Residual Diagnostics for Hierarchical +#' (Multi-Level / Mixed) Regression Models (Version 0.4.5). Retrieved from +#' https://CRAN.R-project.org/package=DHARMa +#' +#' - Dunn, P. K., & Smyth, G. K. (1996). Randomized Quantile Residuals. Journal +#' of Computational and Graphical Statistics, 5(3), 236. \doi{10.2307/1390802} +#' +#' @examplesIf require("DHARMa") +#' m <- lm(mpg ~ wt + cyl + gear + disp, data = mtcars) +#' simulate_residuals(m) +#' +#' # extract residuals +#' head(residuals(simulate_residuals(m))) +#' +#' @export +simulate_residuals <- function(x, iterations = 250, ...) { + insight::check_if_installed("DHARMa") + # TODO (low priority): Note that DHARMa::simulateResiduals(x, ...) does its own checks for whether + # or not the model passed to it is supported, do we want to use this or do our + # own checks so we can supply our own error message? + if (iterations < 2) { + insight::format_error("`iterations` must be at least 2.") + } + # It's important to preserve this object as is, rather than prematurely + # extracting the residuals from it because the object contains important stuff + # in it that we'll want to pass onto other functions later, such as passing + # the fitted model into check_model(). + out <- DHARMa::simulateResiduals(x, n = iterations, plot = FALSE, ...) 
+ class(out) <- c("performance_simres", "see_performance_simres", class(out)) + out +} + + +# methods ------------------------------ + +#' @export +print.performance_simres <- function(x, ...) { + # TODO (low priority): We can probably just base this off of the print method + # DHARMa uses, but with an easystats style. For now we can just stick with + # DHARMa's method. + msg <- paste0( + "Simulated residuals from a model of class `", class(x$fittedModel)[1], + "` based on ", x$nSim, " simulations. Use `check_residuals()` to check", + " uniformity of residuals or `residuals()` to extract simulated residuals.", + " It is recommended to refer to `?DHARMa::simulateResiduals` and", + " `vignette(\"DHARMa\")` for more information about different settings", + " in particular situations or for particular models.\n" + ) + cat(insight::format_message(msg)) +} + +#' @export +plot.performance_simres <- function(x, ...) { + insight::check_if_installed("see", "for residual plots") + NextMethod() +} + + +# methods -------------------------- + +#' @rdname simulate_residuals +#' @export +residuals.performance_simres <- function(object, quantile_function = NULL, outlier_values = NULL, ...) { + # check for DHARMa argument names + dots <- list(...) 
+ if (!is.null(dots$quantileFunction)) { + quantile_function <- dots$quantileFunction + } + if (!is.null(dots$outlierValues)) { + outlier_values <- dots$outlierValues + } + + if (is.null(quantile_function)) { + res <- object$scaledResiduals + } else { + res <- quantile_function(object$scaledResiduals) + if (!is.null(outlier_values)) { + # check for correct length of outlier_values + if (length(outlier_values) != 2) { + insight::format_error("`outlier_values` must be a vector of length 2.") + } + res[res == -Inf] <- outlier_values[1] + res[res == Inf] <- outlier_values[2] + } + } + res +} + + +# helper functions --------------------- + +.simres_statistics <- function(x, statistic_fun, alternative = "two.sided") { + # summarize the observed and simulated residuals + if (is.null(statistic_fun)) { + # we pass the values to compute the p-value directly (for "check_outliers()"); NOTE(review): statistic_fun is NULL in this branch, so simulated is NULL too - confirm intent + observed <- x + simulated <- statistic_fun + } else { + # or apply a function to observed and simulated residuals, + # to calculate a summary statistic + observed <- statistic_fun(x$observedResponse) + simulated <- apply(x$simulatedResponse, 2, statistic_fun) + } + # p is the share of simulated statistics at or beyond the observed one (two-sided: twice the smaller tail, capped at 1) + p <- switch(alternative, + greater = mean(simulated >= observed), + less = mean(simulated <= observed), + min(min(mean(simulated <= observed), mean(simulated >= observed)) * 2, 1) + ) + list(observed = observed, simulated = simulated, p = p) +} diff --git a/R/test_bf.R b/R/test_bf.R index b50f0f652..38b0e421d 100644 --- a/R/test_bf.R +++ b/R/test_bf.R @@ -9,21 +9,21 @@ test_bf <- function(...) 
{ #' @export test_bf.default <- function(..., reference = 1, text_length = NULL) { # Attribute class to list and get names from the global environment - objects <- insight::ellipsis_info(..., only_models = TRUE) - names(objects) <- match.call(expand.dots = FALSE)$`...` + my_objects <- insight::ellipsis_info(..., only_models = TRUE) + names(my_objects) <- match.call(expand.dots = FALSE)[["..."]] - # Sanity checks (will throw error if non-valid objects) - .test_performance_checks(objects, multiple = FALSE) + # validation checks (will throw error if non-valid objects) + .test_performance_checks(objects = my_objects, multiple = FALSE) - if (length(objects) == 1 && isTRUE(insight::is_model(objects))) { + if (length(my_objects) == 1 && isTRUE(insight::is_model(my_objects))) { insight::format_error( - "`test_bf()` is designed to compare multiple models together. For a single model, you might want to run `bayestestR::bf_parameters()` instead." + "`test_bf()` is designed to compare multiple models together. For a single model, you might want to run `bayestestR::bf_parameters()` instead." # nolint ) } # If a suitable class is found, run the more specific method on it - if (inherits(objects, c("ListNestedRegressions", "ListNonNestedRegressions", "ListLavaan"))) { - test_bf(objects, reference = reference, text_length = text_length) + if (inherits(my_objects, c("ListNestedRegressions", "ListNonNestedRegressions", "ListLavaan"))) { + test_bf(my_objects, reference = reference, text_length = text_length) } else { insight::format_error("The models cannot be compared for some reason :/") } @@ -87,9 +87,9 @@ test_bf.ListModels <- function(objects, reference = 1, text_length = NULL, ...) 
if (all(bayesian_models)) { "yes" - } else if (!all(bayesian_models)) { - "no" - } else { + } else if (any(bayesian_models)) { "mixed" + } else { + "no" } } diff --git a/R/test_likelihoodratio.R b/R/test_likelihoodratio.R index 5b0a532eb..529516487 100644 --- a/R/test_likelihoodratio.R +++ b/R/test_likelihoodratio.R @@ -22,29 +22,29 @@ test_lrt <- test_likelihoodratio #' @export test_likelihoodratio.default <- function(..., estimator = "OLS", verbose = TRUE) { # Attribute class to list - objects <- insight::ellipsis_info(..., only_models = TRUE) + my_objects <- insight::ellipsis_info(..., only_models = TRUE) - # Sanity checks (will throw error if non-valid objects) - objects <- .test_performance_checks(objects, verbose = verbose) + # validation checks (will throw error if non-valid objects) + my_objects <- .test_performance_checks(my_objects, verbose = verbose) # different default when mixed model or glm is included if (missing(estimator)) { - mixed_models <- sapply(objects, insight::is_mixed_model) - if (all(mixed_models) && all(sapply(objects, .is_lmer_reml)) && isTRUE(attributes(objects)$same_fixef)) { + mixed_models <- sapply(my_objects, insight::is_mixed_model) + if (all(mixed_models) && all(sapply(my_objects, .is_lmer_reml)) && isTRUE(attributes(my_objects)$same_fixef)) { estimator <- "REML" - } else if (any(mixed_models) || !all(attributes(objects)$is_linear)) { + } else if (any(mixed_models) || !all(attributes(my_objects)$is_linear)) { estimator <- "ML" } } # ensure proper object names - objects <- .check_objectnames(objects, sapply(match.call(expand.dots = FALSE)$`...`, as.character)) + my_objects <- .check_objectnames(my_objects, sapply(match.call(expand.dots = FALSE)[["..."]], as.character)) # If a suitable class is found, run the more specific method on it - if (inherits(objects, "ListNestedRegressions")) { - test_likelihoodratio(objects, estimator = estimator) - } else if (inherits(objects, "ListLavaan")) { - test_likelihoodratio_ListLavaan(..., 
objects = objects) # Because lavaanLRT requires the ellipsis + if (inherits(my_objects, "ListNestedRegressions")) { + test_likelihoodratio(my_objects, estimator = estimator) + } else if (inherits(my_objects, "ListLavaan")) { + test_likelihoodratio_ListLavaan(..., objects = my_objects) # Because lavaanLRT requires the ellipsis } else { insight::format_error( "The models are not nested, which is a prerequisite for `test_likelihoodratio()`.", @@ -106,7 +106,7 @@ test_likelihoodratio.ListNestedRegressions <- function(objects, estimator = "ML" same_fixef <- attributes(objects)$same_fixef # sort by df - if (!all(sort(dfs) == dfs) && !all(sort(dfs) == rev(dfs))) { + if (is.unsorted(dfs) && is.unsorted(rev(dfs))) { objects <- objects[order(dfs)] dfs <- sort(dfs, na.last = TRUE) } diff --git a/R/test_performance.R b/R/test_performance.R index 4ad0c18f2..f818b02dd 100644 --- a/R/test_performance.R +++ b/R/test_performance.R @@ -236,17 +236,17 @@ test_performance <- function(..., reference = 1, verbose = TRUE) { #' @export test_performance.default <- function(..., reference = 1, include_formula = FALSE, verbose = TRUE) { # Attribute class to list and get names from the global environment - objects <- insight::ellipsis_info(..., only_models = TRUE) + my_objects <- insight::ellipsis_info(..., only_models = TRUE) - # Sanity checks (will throw error if non-valid objects) - objects <- .test_performance_checks(objects, verbose = verbose) + # validation checks (will throw error if non-valid objects) + my_objects <- .test_performance_checks(my_objects, verbose = verbose) # ensure proper object names - objects <- .check_objectnames(objects, sapply(match.call(expand.dots = FALSE)$`...`, as.character)) + my_objects <- .check_objectnames(my_objects, sapply(match.call(expand.dots = FALSE)[["..."]], as.character)) # If a suitable class is found, run the more specific method on it - if (inherits(objects, c("ListNestedRegressions", "ListNonNestedRegressions", "ListLavaan"))) { - 
test_performance(objects, reference = reference, include_formula = include_formula) + if (inherits(my_objects, c("ListNestedRegressions", "ListNonNestedRegressions", "ListLavaan"))) { + test_performance(my_objects, reference = reference, include_formula = include_formula) } else { insight::format_error("The models cannot be compared for some reason :/") } @@ -421,10 +421,10 @@ test_performance.ListNonNestedRegressions <- function(objects, .test_performance_init <- function(objects, include_formula = FALSE) { - names <- insight::model_name(objects, include_formula = include_formula) + model_names <- insight::model_name(objects, include_formula = include_formula) out <- data.frame( Name = names(objects), - Model = names, + Model = model_names, stringsAsFactors = FALSE ) row.names(out) <- NULL @@ -453,7 +453,7 @@ test_performance.ListNonNestedRegressions <- function(objects, if (same_response && !inherits(objects, "ListLavaan") && isFALSE(attributes(objects)$same_response)) { insight::format_error( - "The models' dependent variables don't have the same data, which is a prerequisite to compare them. Probably the proportion of missing data differs between models." + "The models' dependent variables don't have the same data, which is a prerequisite to compare them. Probably the proportion of missing data differs between models." 
# nolint ) } diff --git a/R/test_vuong.R b/R/test_vuong.R index 949fceab7..f0cce6d16 100644 --- a/R/test_vuong.R +++ b/R/test_vuong.R @@ -8,17 +8,17 @@ test_vuong <- function(..., verbose = TRUE) { #' @export test_vuong.default <- function(..., reference = 1, verbose = TRUE) { # Attribute class to list and get names from the global environment - objects <- insight::ellipsis_info(..., only_models = TRUE) + my_objects <- insight::ellipsis_info(..., only_models = TRUE) - # Sanity checks (will throw error if non-valid objects) - objects <- .test_performance_checks(objects, verbose = verbose) + # validation checks (will throw error if non-valid objects) + my_objects <- .test_performance_checks(my_objects, verbose = verbose) # ensure proper object names - objects <- .check_objectnames(objects, sapply(match.call(expand.dots = FALSE)$`...`, as.character)) + my_objects <- .check_objectnames(my_objects, sapply(match.call(expand.dots = FALSE)[["..."]], as.character)) # If a suitable class is found, run the more specific method on it - if (inherits(objects, c("ListNestedRegressions", "ListNonNestedRegressions", "ListLavaan"))) { - test_vuong(objects, reference = reference) + if (inherits(my_objects, c("ListNestedRegressions", "ListNonNestedRegressions", "ListLavaan"))) { + test_vuong(my_objects, reference = reference) } else { insight::format_error("The models cannot be compared for some reason :/") } diff --git a/R/test_wald.R b/R/test_wald.R index 8a5efd99c..ddfe21829 100644 --- a/R/test_wald.R +++ b/R/test_wald.R @@ -8,17 +8,17 @@ test_wald <- function(..., verbose = TRUE) { #' @export test_wald.default <- function(..., verbose = TRUE) { # Attribute class to list and get names from the global environment - objects <- insight::ellipsis_info(..., only_models = TRUE) + my_objects <- insight::ellipsis_info(..., only_models = TRUE) - # Sanity checks (will throw error if non-valid objects) - objects <- .test_performance_checks(objects, verbose = verbose) + # validation checks 
(will throw error if non-valid objects) + my_objects <- .test_performance_checks(my_objects, verbose = verbose) # ensure proper object names - objects <- .check_objectnames(objects, sapply(match.call(expand.dots = FALSE)$`...`, as.character)) + my_objects <- .check_objectnames(my_objects, sapply(match.call(expand.dots = FALSE)[["..."]], as.character)) # If a suitable class is found, run the more specific method on it - if (inherits(objects, c("ListNestedRegressions", "ListNonNestedRegressions", "ListLavaan"))) { - test_wald(objects) + if (inherits(my_objects, c("ListNestedRegressions", "ListNonNestedRegressions", "ListLavaan"))) { + test_wald(my_objects) } else { insight::format_error("The models cannot be compared for some reason :/") } @@ -37,10 +37,10 @@ test_wald.ListNestedRegressions <- function(objects, verbose = TRUE, ...) { ) } return(test_likelihoodratio(objects)) - } else { - out <- .test_wald(objects, test = "F") } + out <- .test_wald(objects, test = "F") + attr(out, "is_nested") <- TRUE class(out) <- c("test_performance", class(out)) out @@ -60,7 +60,7 @@ test_wald.ListNonNestedRegressions <- function(objects, verbose = TRUE, ...) { dfs <- sapply(objects, insight::get_df, type = "residual") # sort by df - if (!all(sort(dfs) == dfs) && !all(sort(dfs) == rev(dfs))) { + if (is.unsorted(dfs) && is.unsorted(rev(dfs))) { objects <- objects[order(dfs)] dfs <- sort(dfs, na.last = TRUE) } @@ -78,18 +78,18 @@ test_wald.ListNonNestedRegressions <- function(objects, verbose = TRUE, ...) 
{ # Find reference-model related stuff refmodel <- order(dfs)[1] - scale <- dev[refmodel] / dfs[refmodel] + my_scale <- dev[refmodel] / dfs[refmodel] # test = "F" if (test == "F") { - f_value <- (dev_diff / dfs_diff) / scale + f_value <- (dev_diff / dfs_diff) / my_scale f_value[!is.na(f_value) & f_value < 0] <- NA # rather than p = 0 - out$`F` <- f_value + out[["F"]] <- f_value p <- stats::pf(f_value, abs(dfs_diff), dfs[refmodel], lower.tail = FALSE) # test = "LRT" } else { - chi2 <- dev_diff / scale * sign(dfs_diff) + chi2 <- dev_diff / my_scale * sign(dfs_diff) chi2[!is.na(chi2) & chi2 < 0] <- NA # rather than p = 0 out$Chi2 <- chi2 p <- stats::pchisq(chi2, abs(dfs_diff), lower.tail = FALSE) diff --git a/README.md b/README.md index 3be782f33..57e7cd9f0 100644 --- a/README.md +++ b/README.md @@ -147,8 +147,8 @@ model <- stan_glmer( r2(model) #> # Bayesian R2 with Compatibility Interval #> -#> Conditional R2: 0.953 (95% CI [0.942, 0.963]) -#> Marginal R2: 0.824 (95% CI [0.721, 0.899]) +#> Conditional R2: 0.953 (95% CI [0.941, 0.963]) +#> Marginal R2: 0.823 (95% CI [0.710, 0.898]) library(lme4) model <- lmer(Reaction ~ Days + (1 + Days | Subject), data = sleepstudy) @@ -422,12 +422,12 @@ lm3 <- lm(Sepal.Length ~ Species * Sepal.Width, data = iris) lm4 <- lm(Sepal.Length ~ Species * Sepal.Width + Petal.Length + Petal.Width, data = iris) test_performance(lm1, lm2, lm3, lm4) -#> Name | Model | BF | Omega2 | p (Omega2) | LR | p (LR) -#> -------------------------------------------------------------- -#> lm1 | lm | | | | | -#> lm2 | lm | 3.45e+26 | 0.69 | < .001 | -6.25 | < .001 -#> lm3 | lm | 4.69e+07 | 0.36 | < .001 | -3.44 | < .001 -#> lm4 | lm | 7.58e+29 | 0.73 | < .001 | -7.77 | < .001 +#> Name | Model | BF | Omega2 | p (Omega2) | LR | p (LR) +#> ------------------------------------------------------------ +#> lm1 | lm | | | | | +#> lm2 | lm | > 1000 | 0.69 | < .001 | -6.25 | < .001 +#> lm3 | lm | > 1000 | 0.36 | < .001 | -3.44 | < .001 +#> lm4 | lm | > 1000 | 0.73 | 
< .001 | -7.77 | < .001 #> Each model is compared to lm1. test_bf(lm1, lm2, lm3, lm4) @@ -465,7 +465,8 @@ Please follow contributing guidelines mentioned here: ## References -
+
diff --git a/WIP/check_model_logistic.Rmd b/WIP/check_model_logistic.Rmd new file mode 100644 index 000000000..e6b7cafe8 --- /dev/null +++ b/WIP/check_model_logistic.Rmd @@ -0,0 +1,204 @@ +--- +title: "Checking model assumption - logistic regression models" +output: + rmarkdown::html_vignette: + toc: true + fig_width: 10.08 + fig_height: 6 +tags: [r, performance, r2] +vignette: > + \usepackage[utf8]{inputenc} + %\VignetteIndexEntry{Checking model assumption - logistic regression models} + %\VignetteEngine{knitr::rmarkdown} +editor_options: + chunk_output_type: console +--- + +```{r , include=FALSE} +library(knitr) +library(performance) +options(knitr.kable.NA = "") +knitr::opts_chunk$set( + comment = ">", + message = FALSE, + warning = FALSE, + out.width = "100%", + dpi = 450 +) +options(digits = 2) + +pkgs <- c("see", "ggplot2", "datawizard", "parameters") +successfully_loaded <- vapply(pkgs, requireNamespace, FUN.VALUE = logical(1L), quietly = TRUE) +can_evaluate <- all(successfully_loaded) + +if (can_evaluate) { + knitr::opts_chunk$set(eval = TRUE) + vapply(pkgs, require, FUN.VALUE = logical(1L), quietly = TRUE, character.only = TRUE) +} else { + knitr::opts_chunk$set(eval = FALSE) +} +``` + +# Make sure your model inference is accurate! + +Model diagnostics is crucial, because parameter estimation, p-values and confidence intervals depend on correct model assumptions as well as on the data. If model assumptions are violated, estimates can be statistically significant "even if the effect under study is null" (_Gelman/Greenland 2019_). + +There are several problems associated with model diagnostics. Different types of models require different checks. For instance, normally distributed residuals are assumed to apply for linear regression, but this is not an appropriate assumption for logistic regression. Furthermore, it is recommended to carry out visual inspections, i.e. 
to generate and inspect so called diagnostic plots of model assumptions - formal statistical tests are often too strict and warn of violation of the model assumptions, although everything is fine within a certain tolerance range. But how should such diagnostic plots be interpreted? And if violations have been detected, how to fix them? + +This vignette introduces the `check_model()` function of the **performance** package, shows how to use this function for logistic regression models and how the resulting diagnostic plots should be interpreted. Furthermore, recommendations are given how to address possible violations of model assumptions. + +Most plots seen here can also be generated by their dedicated functions, e.g.: + +- Posterior predictive checks: `check_predictions()` +- Homogeneity of variance: `check_heteroskedasticity()` +- Normality of residuals: `check_normality()` +- Multicollinearity: `check_collinearity()` +- Influential observations: `check_outliers()` +- Binned residuals: `binned_residuals()` +- Check for overdispersion: `check_overdispersion()` + +# Logistic regression models: Are all assumptions met? + +We start with a simple example for a logistic regression model. + +```{r} +data(Titanic) +d <- as.data.frame(Titanic) +d <- tidyr::uncount(d, Freq) +m1 <- glm(Survived ~ Class + Sex + Age, data = d, family = binomial()) +``` + +Before we go into details of the diagnostic plots, let's first look at the summary table. + +```{r eval=successfully_loaded["parameters"]} +library(parameters) +model_parameters(m1) +``` + +There is nothing suspicious so far. Now let's start with model diagnostics. We use the `check_model()` function, which provides an overview with the most important and appropriate diagnostic plots for the model under investigation. + +```{r eval=all(successfully_loaded[c("see", "ggplot2")]), fig.height=11} +library(performance) +check_model(m1) +``` + +Now let's take a closer look for each plot. 
To do so, we ask `check_model()` to return a single plot for each check, instead of arranging them in a grid. We can do so using the `panel` argument. This returns a list of *ggplot* plots. + +```{r eval=all(successfully_loaded[c("see", "ggplot2")])} +# return a list of single plots +diagnostic_plots <- plot(check_model(m1, panel = FALSE)) +``` + +## Posterior predictive checks + +The first plot is based on `check_predictions()`. Posterior predictive checks can be used to "look for systematic discrepancies between real and simulated data" (_Gelman et al. 2014, p. 169_). It helps to see whether the type of model (distributional family) fits well to the data (_Gelman and Hill, 2007, p. 158_). + +```{r eval=all(successfully_loaded[c("see", "ggplot2")])} +# posterior predictive checks +diagnostic_plots[[1]] +``` + +In case of logistic regression or count models, the plot shows by default _dots_ for the observed and simulated data, not _lines_ (as for linear models). The blue dots are simulated data based on the model, if the model were true and distributional assumptions met. The green dots represent the actual observed data of the response variable. + +This plot looks good, because the green dots are inside the range of the blue error bars, and thus we would not assume any violations of model assumptions here. + +### How to fix this? + +The best way, if there are serious concerns that the model does not fit well to the data, is to use a different type (family) of regression models. + +## Binned residuals + + +```{r eval=all(successfully_loaded[c("see", "ggplot2")])} +# linearity +diagnostic_plots[[2]] +``` + + +### How to fix this? + + +## Influential observations - outliers + +Outliers can be defined as particularly influential observations, and this plot helps detecting those outliers. Cook's distance (_Cook 1977_, _Cook & Weisberg 1982_) is used to define outliers, i.e. 
any point in this plot that falls outside of Cook's distance (the dashed lines) is considered an influential observation. + +```{r eval=all(successfully_loaded[c("see", "ggplot2")])} +# influential observations - outliers +diagnostic_plots[[4]] +``` + +In our example, everything looks fine. + +### How to fix this? + +Dealing with outliers is not straightforward, as it is not recommended to automatically discard any observation that has been marked as "an outlier". Rather, your _domain knowledge_ must be involved in the decision whether to keep or omit influential observations. A helpful heuristic is to distinguish between error outliers, interesting outliers, and random outliers (_Leys et al. 2019_). _Error outliers_ are likely due to human error and should be corrected before data analysis. _Interesting outliers_ are not due to technical error and may be of theoretical interest; it might thus be relevant to investigate them further even though they should be removed from the current analysis of interest. _Random outliers_ are assumed to be due to chance alone and to belong to the correct distribution and, therefore, should be retained. + +## Multicollinearity + +This plot checks for potential collinearity among predictors. In a nutshell, multicollinearity means that once you know the effect of one predictor, the value of knowing the other predictor is rather low. Multicollinearity might arise when a third, unobserved variable has a causal effect on each of the two predictors that are associated with the outcome. In such cases, the actual relationship that matters would be the association between the unobserved variable and the outcome. + +Multicollinearity should not be confused with a raw strong correlation between predictors. What matters is the association between one or more predictor variables, *conditional on the other variables in the model*. 
+ +If multicollinearity is a problem, the model seems to suggest that the predictors in question don't seem to be reliably associated with the outcome (low estimates, high standard errors), although these predictors actually are strongly associated with the outcome, i.e. indeed might have a strong effect (_McElreath 2020, chapter 6.1_). + +```{r eval=all(successfully_loaded[c("see", "ggplot2")])} +# multicollinearity +diagnostic_plots[[5]] +``` + +The variance inflation factor (VIF) indicates the magnitude of multicollinearity of model terms. The thresholds for low, moderate and high collinearity are VIF values less than 5, between 5 and 10 and larger than 10, respectively (_James et al. 2013_). Note that these thresholds, although commonly used, are also criticized for being too high. _Zuur et al. (2010)_ suggest using lower values, e.g. a VIF of 3 or larger may already no longer be considered as "low". + +Our model clearly suffers from multicollinearity, as all predictors have high VIF values. + +### How to fix this? + +Usually, predictors with (very) high VIF values should be removed from the model to fix multicollinearity. Some caution is needed for interaction terms. If interaction terms are included in a model, high VIF values are expected. This portion of multicollinearity among the component terms of an interaction is also called "inessential ill-conditioning", which leads to inflated VIF values that are typically seen for models with interaction terms _(Francoeur 2013)_. In such cases, re-fit your model without interaction terms and check this model for collinearity among predictors. + +## Normality of residuals + +In linear regression, residuals should be normally distributed. This can be checked using so-called Q-Q plots (quantile-quantile plot) to compare the shapes of distributions. This plot shows the quantiles of the studentized residuals versus fitted values. + +Usually, dots should fall along the green reference line. 
If there is some deviation (mostly at the tails), this indicates that the model doesn't predict the outcome well for the range that shows larger deviations from the reference line. In such cases, inferential statistics like the p-value or coverage of confidence intervals can be inaccurate. + +```{r eval=all(successfully_loaded[c("see", "ggplot2")])} +# normally distributed residuals +diagnostic_plots[[6]] +``` + +In our example, we see that most data points are ok, except some observations at the tails. Whether any action is needed to fix this or not can also depend on the results of the remaining diagnostic plots. If all other plots indicate no violation of assumptions, some deviation from normality, particularly at the tails, can be less critical. + +### How to fix this? + +Here are some remedies to fix non-normality of residuals, according to _Pek et al. 2018_. + +1. For large sample sizes, the assumption of normality can be relaxed due to the central limit theorem - no action needed. + +2. Calculating heteroscedasticity-consistent standard errors can help. See section **Homogeneity of variance** for details. + +3. Bootstrapping is another alternative to resolve issues with non-normally distributed residuals. Again, this can be easily done using the **parameters** package, e.g. `parameters::model_parameters(m1, bootstrap = TRUE)` or [`parameters::bootstrap_parameters()`](https://easystats.github.io/parameters/reference/bootstrap_parameters.html). + +# References + +Brooks ME, Kristensen K, Benthem KJ van, Magnusson A, Berg CW, Nielsen A, et al. glmmTMB Balances Speed and Flexibility Among Packages for Zero-inflated Generalized Linear Mixed Modeling. The R Journal. 2017;9: 378-400. + +Cook RD. Detection of influential observation in linear regression. Technometrics. 1977;19(1): 15-18. + +Cook RD and Weisberg S. Residuals and Influence in Regression. London: Chapman and Hall, 1982. + +Francoeur RB. 
Could Sequential Residual Centering Resolve Low Sensitivity in Moderated Regression? Simulations and Cancer Symptom Clusters. Open Journal of Statistics. 2013:03(06), 24-44. + +Gelman A, Carlin JB, Stern HS, Dunson DB, Vehtari A, and Rubin DB. Bayesian data analysis. (Third edition). CRC Press, 2014 + +Gelman A, Greenland S. Are confidence intervals better termed "uncertainty intervals"? BMJ. 2019;l5381. doi:10.1136/bmj.l5381 + +Gelman A, and Hill J. Data analysis using regression and multilevel/hierarchical models. Cambridge; New York. Cambridge University Press, 2007 + +James, G., Witten, D., Hastie, T., and Tibshirani, R. (eds.). An introduction to statistical learning: with applications in R. New York: Springer, 2013 + +Leys C, Delacre M, Mora YL, Lakens D, Ley C. How to Classify, Detect, and Manage Univariate and Multivariate Outliers, With Emphasis on Pre-Registration. International Review of Social Psychology, 2019 + +McElreath, R. Statistical rethinking: A Bayesian course with examples in R and Stan. 2nd edition. Chapman and Hall/CRC, 2020 + +Pek J, Wong O, Wong ACM. How to Address Non-normality: A Taxonomy of Approaches, Reviewed, and Illustrated. Front Psychol (2018) 9:2104. doi: 10.3389/fpsyg.2018.02104 + +Zuur AF, Ieno EN, Elphick CS. A protocol for data exploration to avoid common statistical problems: Data exploration. Methods in Ecology and Evolution (2010) 1:3-14. 
diff --git a/_pkgdown.yaml b/_pkgdown.yaml index 71f444657..f4efe80c1 100644 --- a/_pkgdown.yaml +++ b/_pkgdown.yaml @@ -13,6 +13,7 @@ reference: contents: - binned_residuals - starts_with("check_") + - simulate_residuals - title: "Check Model Performance or Quality" contents: @@ -43,3 +44,22 @@ reference: - title: "Sample Data" contents: - classify_distribution + +articles: + - title: Checking model assumptions and data properties + navbar: ~ + contents: + - check_model + - check_outliers + - simulate_residuals + + - title: Model comparison and testing + navbar: ~ + contents: + - compare + - r2 + + - title: Case Studies + navbar: ~ + contents: + - check_model_practical diff --git a/cran-comments.md b/cran-comments.md index d044c232a..fa6f2d4c2 100644 --- a/cran-comments.md +++ b/cran-comments.md @@ -1 +1 @@ -This release fixes CRAN check errors. We checked all reverse dependencies, comparing R CMD check results across CRAN and dev versions of this package and saw no new problems. \ No newline at end of file +Maintenance release. 
\ No newline at end of file diff --git a/inst/WORDLIST b/inst/WORDLIST index adf602827..ca43db445 100644 --- a/inst/WORDLIST +++ b/inst/WORDLIST @@ -9,6 +9,7 @@ Ankerst Archimbaud Arel Asq +BCI BFBayesFactor BMJ Baayen @@ -24,6 +25,7 @@ Bollen's Bortz Breunig Breusch +BRM Bryk Bundock Burnham @@ -45,10 +47,12 @@ DOI Datenerhebung Delacre Deskriptivstatistische +DHARMa Distinguishability Dom Dominicy Dordrecht +Dorie Dunson Durbin Dxy @@ -75,6 +79,8 @@ Gazen Gelman Gnanadesikan Guilford +Hartig +HDI HJ Hastie Herron @@ -110,10 +116,10 @@ Killeen Kliegl Kristensen Kullback -Lakens LOF LOGLOSS LOOIC +Lakens Laniado Leibler Lemeshow @@ -122,7 +128,9 @@ Ley Leys Lillo Liu +Lohse Lomax +MADs MSA Maddala Magee @@ -145,9 +153,11 @@ Nagelkerke Nagelkerke's Nakagawa Nakagawa's +Nondegenerate Nordhausen Normed ORCID +OSF Olkin PNFI Pek @@ -178,6 +188,7 @@ Sensivity Shachar Shinichi Skrondal +Smyth Solomons Somers Specifity @@ -193,7 +204,6 @@ Tibshirani Tily Tjur Tjur's -Trochim Tsai Tweedie VIF @@ -207,9 +217,9 @@ Visualisation Vuong Vuong's WAIC -WMK Weisberg Windmeijer +Winsorization Witten Xu YL @@ -237,6 +247,7 @@ brmsfit cauchy clusterable concurvity +datawizard dbscan der detrend @@ -245,6 +256,7 @@ discriminations doi easystats et +equidispersion explicitely favour fixest @@ -252,23 +264,21 @@ fpsyg gam geoms ggplot -github gjo glm glmmTMB glmrob grey heteroskedasticity -homoskedasticity homoscedasticity +homoskedasticity https intra intraclass -io -ize joss kmeans lavaan +lm lme lmrob lmtest @@ -277,6 +287,8 @@ metafor mfx mhurdle mis +misspecification +misspecified mlm mlogit modelfit @@ -285,18 +297,24 @@ models’ multicollinearity multimodel multiresponse +multivariable nd nonnest overfitted patilindrajeets poisson preprint +priori pscl +quantreg quared quartile quartiles rOpenSci +recoding +reimplement rempsyc +reproducibility rescaling rma rmarkdown @@ -305,9 +323,11 @@ rsif rssa rstanarm se +serp smicd sphericity strengejacke +suboptimal subscale subscales 
theoreritcal @@ -316,6 +336,10 @@ unadjusted und underfitted underfitting +underdispersion visualisation +winsorization +winsorize +winsorized xy youtube diff --git a/man/binned_residuals.Rd b/man/binned_residuals.Rd index ff6fb5784..632e2a050 100644 --- a/man/binned_residuals.Rd +++ b/man/binned_residuals.Rd @@ -4,7 +4,18 @@ \alias{binned_residuals} \title{Binned residuals for binomial logistic regression} \usage{ -binned_residuals(model, term = NULL, n_bins = NULL, ...) +binned_residuals( + model, + term = NULL, + n_bins = NULL, + show_dots = NULL, + ci = 0.95, + ci_type = c("exact", "gaussian", "boot"), + residuals = c("deviance", "pearson", "response"), + iterations = 1000, + verbose = TRUE, + ... +) } \arguments{ \item{model}{A \code{glm}-object with \emph{binomial}-family.} @@ -18,6 +29,31 @@ plotted.} \code{n_bins = NULL}, the square root of the number of observations is taken.} +\item{show_dots}{Logical, if \code{TRUE}, will show data points in the plot. Set +to \code{FALSE} for models with many observations, if generating the plot is too +time-consuming. By default, \code{show_dots = NULL}. In this case \code{binned_residuals()} +tries to guess whether performance will be poor due to a very large model +and thus automatically shows or hides dots.} + +\item{ci}{Numeric, the confidence level for the error bounds.} + +\item{ci_type}{Character, the type of error bounds to calculate. Can be +\code{"exact"} (default), \code{"gaussian"} or \code{"boot"}. \code{"exact"} calculates the +error bounds based on the exact binomial distribution, using \code{\link[=binom.test]{binom.test()}}. +\code{"gaussian"} uses the Gaussian approximation, while \code{"boot"} uses a simple +bootstrap method, where confidence intervals are calculated based on the +quantiles of the bootstrap distribution.} + +\item{residuals}{Character, the type of residuals to calculate. Can be +\code{"deviance"} (default), \code{"pearson"} or \code{"response"}. 
It is recommended to +use \code{"response"} only for those models where other residuals are not +available.} + +\item{iterations}{Integer, the number of iterations to use for the +bootstrap method. Only used if \code{ci_type = "boot"}.} + +\item{verbose}{Toggle warnings and messages.} + \item{...}{Currently not used.} } \value{ diff --git a/man/check_collinearity.Rd b/man/check_collinearity.Rd index 9b943758d..847ff110d 100644 --- a/man/check_collinearity.Rd +++ b/man/check_collinearity.Rd @@ -160,6 +160,8 @@ Evolution (2010) 1:3–14. } } \seealso{ +\code{\link[see:plot.see_check_collinearity]{see::plot.see_check_collinearity()}} for options to customize the plot. + Other functions to check model assumptions and and assess model quality: \code{\link{check_autocorrelation}()}, \code{\link{check_convergence}()}, diff --git a/man/check_itemscale.Rd b/man/check_itemscale.Rd index 7f790b1d2..a5ada3875 100644 --- a/man/check_itemscale.Rd +++ b/man/check_itemscale.Rd @@ -4,11 +4,15 @@ \alias{check_itemscale} \title{Describe Properties of Item Scales} \usage{ -check_itemscale(x) +check_itemscale(x, factor_index = NULL) } \arguments{ \item{x}{An object of class \code{parameters_pca}, as returned by -\code{\link[parameters:principal_components]{parameters::principal_components()}}.} +\code{\link[parameters:principal_components]{parameters::principal_components()}}, or a data frame.} + +\item{factor_index}{If \code{x} is a data frame, \code{factor_index} must be specified. 
+It must be a numeric vector of the same length as the number of columns in \code{x}, where +each element is the index of the factor to which the respective column in \code{x} belongs.} } \value{ A list of data frames, with related measures of internal @@ -51,9 +55,19 @@ set.seed(17) X <- matrix(rnorm(1600), 100, 16) Z <- X \%*\% C -pca <- principal_components(as.data.frame(Z), rotation = "varimax", n = 3) +pca <- parameters::principal_components( + as.data.frame(Z), + rotation = "varimax", + n = 3 +) pca check_itemscale(pca) + +# as data frame +check_itemscale( + as.data.frame(Z), + factor_index = parameters::closest_component(pca) +) \dontshow{\}) # examplesIf} } \references{ diff --git a/man/check_model.Rd b/man/check_model.Rd index 2bf82af92..bb4898df8 100644 --- a/man/check_model.Rd +++ b/man/check_model.Rd @@ -9,18 +9,22 @@ check_model(x, ...) \method{check_model}{default}( x, - dot_size = 2, - line_size = 0.8, panel = TRUE, check = "all", + detrend = TRUE, + bandwidth = "nrd", + type = "density", + residual_type = NULL, + show_dots = NULL, + dot_size = 2, + line_size = 0.8, + title_size = 12, + axis_title_size = base_size, + base_size = 10, alpha = 0.2, dot_alpha = 0.8, colors = c("#3aaf85", "#1b6ca8", "#cd201f"), theme = "see::theme_lucid", - detrend = TRUE, - show_dots = NULL, - bandwidth = "nrd", - type = "density", verbose = FALSE, ... ) @@ -28,9 +32,8 @@ check_model(x, ...) \arguments{ \item{x}{A model object.} -\item{...}{Currently not used.} - -\item{dot_size, line_size}{Size of line and dot-geoms.} +\item{...}{Arguments passed down to the individual check functions, especially +to \code{check_predictions()} and \code{binned_residuals()}.} \item{panel}{Logical, if \code{TRUE}, plots are arranged as panels; else, single plots for each diagnostic are returned.} @@ -38,31 +41,17 @@ single plots for each diagnostic are returned.} \item{check}{Character vector, indicating which checks for should be performed and plotted. 
May be one or more of \code{"all"}, \code{"vif"}, \code{"qq"}, \code{"normality"}, \code{"linearity"}, \code{"ncv"}, \code{"homogeneity"}, \code{"outliers"}, \code{"reqq"}, \code{"pp_check"}, -\code{"binned_residuals"} or \code{"overdispersion"}, Not that not all check apply +\code{"binned_residuals"} or \code{"overdispersion"}. Note that not all check apply to all type of models (see 'Details'). \code{"reqq"} is a QQ-plot for random effects and only available for mixed models. \code{"ncv"} is an alias for \code{"linearity"}, and checks for non-constant variance, i.e. for heteroscedasticity, as well as the linear relationship. By default, all possible checks are performed and plotted.} -\item{alpha, dot_alpha}{The alpha level of the confidence bands and dot-geoms. -Scalar from 0 to 1.} - -\item{colors}{Character vector with color codes (hex-format). Must be of -length 3. First color is usually used for reference lines, second color -for dots, and third color for outliers or extreme values.} - -\item{theme}{String, indicating the name of the plot-theme. Must be in the -format \code{"package::theme_name"} (e.g. \code{"ggplot2::theme_minimal"}).} - \item{detrend}{Logical. Should Q-Q/P-P plots be detrended? Defaults to -\code{TRUE}.} - -\item{show_dots}{Logical, if \code{TRUE}, will show data points in the plot. Set -to \code{FALSE} for models with many observations, if generating the plot is too -time-consuming. By default, \code{show_dots = NULL}. In this case \code{check_model()} -tries to guess whether performance will be poor due to a very large model -and thus automatically shows or hides dots.} +\code{TRUE} for linear models or when \code{residual_type = "normal"}. Defaults to +\code{FALSE} for QQ plots based on simulated residuals (i.e. when +\code{residual_type = "simulated"}).} \item{bandwidth}{A character string indicating the smoothing bandwidth to be used. 
Unlike \code{stats::density()}, which used \code{"nrd0"} as default, the @@ -75,6 +64,35 @@ to a different value.} options are appropriate for models with discrete - binary, integer or ordinal etc. - outcomes).} +\item{residual_type}{Character, indicating the type of residuals to be used. +For non-Gaussian models, the default is \code{"simulated"}, which uses simulated +residuals. These are based on \code{\link[=simulate_residuals]{simulate_residuals()}} and thus uses the +\strong{DHARMa} package to return randomized quantile residuals. For Gaussian +models, the default is \code{"normal"}, which uses the default residuals from +the model. Setting \code{residual_type = "normal"} for non-Gaussian models will +use a half-normal Q-Q plot of the absolute value of the standardized deviance +residuals.} + +\item{show_dots}{Logical, if \code{TRUE}, will show data points in the plot. Set +to \code{FALSE} for models with many observations, if generating the plot is too +time-consuming. By default, \code{show_dots = NULL}. In this case \code{check_model()} +tries to guess whether performance will be poor due to a very large model +and thus automatically shows or hides dots.} + +\item{dot_size, line_size}{Size of line and dot-geoms.} + +\item{base_size, title_size, axis_title_size}{Base font size for axis and plot titles.} + +\item{alpha, dot_alpha}{The alpha level of the confidence bands and dot-geoms. +Scalar from 0 to 1.} + +\item{colors}{Character vector with color codes (hex-format). Must be of +length 3. First color is usually used for reference lines, second color +for dots, and third color for outliers or extreme values.} + +\item{theme}{String, indicating the name of the plot-theme. Must be in the +format \code{"package::theme_name"} (e.g. 
\code{"ggplot2::theme_minimal"}).} + \item{verbose}{If \code{FALSE} (default), suppress most warning messages.} } \value{ @@ -160,10 +178,22 @@ This plot is used to determine if the residuals of the regression model are normally distributed. Usually, dots should fall along the line. If there is some deviation (mostly at the tails), this indicates that the model doesn't predict the outcome well for that range that shows larger deviations from -the line. For generalized linear models, a half-normal Q-Q plot of the -absolute value of the standardized deviance residuals is shown, however, the -interpretation of the plot remains the same. See \code{\link[=check_normality]{check_normality()}} for -further details. +the line. For generalized linear models and when \code{residual_type = "normal"}, +a half-normal Q-Q plot of the absolute value of the standardized deviance +residuals is shown, however, the interpretation of the plot remains the same. +See \code{\link[=check_normality]{check_normality()}} for further details. Usually, for generalized linear +(mixed) models, a test for uniformity of residuals based on simulated residuals +is conducted (see next section). +} + +\section{Uniformity of Residuals}{ + +For non-Gaussian models, when \code{residual_type = "simulated"} (the default +for generalized linear (mixed) models), residuals are not expected to be +normally distributed. In this case, the created Q-Q plot checks the uniformity +of residuals. The interpretation of the plot is the same as for the normal +Q-Q plot. See \code{\link[=simulate_residuals]{simulate_residuals()}} and \code{\link[=check_residuals]{check_residuals()}} for further +details. } \section{Overdispersion}{ @@ -187,12 +217,13 @@ inside the error bounds. 
See \code{\link[=binned_residuals]{binned_residuals()}} \section{Residuals for (Generalized) Linear Models}{ -Plots that check the normality of residuals (QQ-plot) or the homogeneity of -variance use standardized Pearson's residuals for generalized linear models, -and standardized residuals for linear models. The plots for the normality of -residuals (with overlayed normal curve) and for the linearity assumption use -the default residuals for \code{lm} and \code{glm} (which are deviance -residuals for \code{glm}). +Plots that check the homogeneity of variance use standardized Pearson's +residuals for generalized linear models, and standardized residuals for +linear models. The plots for the normality of residuals (with overlayed +normal curve) and for the linearity assumption use the default residuals +for \code{lm} and \code{glm} (which are deviance residuals for \code{glm}). The Q-Q plots +use simulated residuals (see \code{\link[=simulate_residuals]{simulate_residuals()}}) for non-Gaussian +models and standardized residuals for linear models. } \section{Troubleshooting}{ diff --git a/man/check_normality.Rd b/man/check_normality.Rd index 282aa6016..7388e0c0b 100644 --- a/man/check_normality.Rd +++ b/man/check_normality.Rd @@ -60,3 +60,6 @@ plot(check_normality(m), type = "pp") } \dontshow{\}) # examplesIf} } +\seealso{ +\code{\link[see:plot.see_check_normality]{see::plot.see_check_normality()}} for options to customize the plot. +} diff --git a/man/check_outliers.Rd b/man/check_outliers.Rd index c19d4ecb0..c75d5fc4a 100644 --- a/man/check_outliers.Rd +++ b/man/check_outliers.Rd @@ -5,6 +5,7 @@ \alias{check_outliers.default} \alias{check_outliers.numeric} \alias{check_outliers.data.frame} +\alias{check_outliers.performance_simres} \title{Outliers detection (check for influential observations)} \usage{ check_outliers(x, ...) @@ -14,19 +15,31 @@ check_outliers(x, ...) method = c("cook", "pareto"), threshold = NULL, ID = NULL, + verbose = TRUE, ... 
) \method{check_outliers}{numeric}(x, method = "zscore_robust", threshold = NULL, ...) \method{check_outliers}{data.frame}(x, method = "mahalanobis", threshold = NULL, ID = NULL, ...) + +\method{check_outliers}{performance_simres}( + x, + type = "default", + iterations = 100, + alternative = "two.sided", + ... +) } \arguments{ -\item{x}{A model or a data.frame object.} +\item{x}{A model, a data.frame, a \code{performance_simres} \code{\link[=simulate_residuals]{simulate_residuals()}} +or a \code{DHARMa} object.} \item{...}{When \code{method = "ics"}, further arguments in \code{...} are passed down to \code{\link[ICSOutlier:ics.outlier]{ICSOutlier::ics.outlier()}}. When \code{method = "mahalanobis"}, -they are passed down to \code{\link[stats:mahalanobis]{stats::mahalanobis()}}.} +they are passed down to \code{\link[stats:mahalanobis]{stats::mahalanobis()}}. \code{percentage_central} can +be specified when \code{method = "mcd"}. For objects of class \code{performance_simres} +or \code{DHARMa}, further arguments are passed down to \code{DHARMa::testOutliers()}.} \item{method}{The outlier detection method(s). Can be \code{"all"} or some of \code{"cook"}, \code{"pareto"}, \code{"zscore"}, \code{"zscore_robust"}, \code{"iqr"}, \code{"ci"}, \code{"eti"}, @@ -40,6 +53,17 @@ considered as outlier. If \code{NULL}, default values will be used (see for any of the method run.} \item{ID}{Optional, to report an ID column along with the row number.} + +\item{verbose}{Toggle warnings.} + +\item{type}{Type of method to test for outliers. Can be one of \code{"default"}, +\code{"binomial"} or \code{"bootstrap"}. Only applies when \code{x} is an object returned +by \code{simulate_residuals()} or of class \code{DHARMa}. 
See 'Details' in +\code{?DHARMa::testOutliers} for a detailed description of the types.} + +\item{iterations}{Number of simulations to run.} + +\item{alternative}{A character string specifying the alternative hypothesis.} } \value{ A logical vector of the detected outliers with a nice printing @@ -189,6 +213,9 @@ calculates the mean and covariance matrix based on the most central subset of the data (by default, 66\\%), before computing the Mahalanobis Distance. This is deemed to be a more robust method of identifying and removing outliers than regular Mahalanobis distance. +This method has a \code{percentage_central} argument that allows specifying +the breakdown point (0.75, the default, is recommended by Leys et al. 2018, +but a commonly used alternative is 0.50). \item \strong{Invariant Coordinate Selection (ICS)}: The outlier are detected using ICS, which by default uses an alpha threshold of 0.025 (corresponding to the 2.5\\% most extreme observations) as a cut-off @@ -223,6 +250,19 @@ LOF distance. Requires the \strong{dbscan} package. } } +\section{Methods for simulated residuals}{ + + +The approach for detecting outliers based on simulated residuals differs +from the traditional methods and may not detect outliers as expected. +Literally, this approach compares observed to simulated values. However, we +do not know the deviation of the observed data from the model expectation, and +thus, the term "outlier" should be taken with a grain of salt. It refers to +"simulation outliers". Basically, the comparison tests whether an observed +data point is outside the simulated range. It is strongly recommended to read +the related documentation in the \strong{DHARMa} package, e.g. \code{?DHARMa::testOutliers}.
+} + \section{Threshold specification}{ @@ -295,7 +335,7 @@ check_outliers(group_iris) \dontshow{if (require("see") && require("bigutilsr") && require("loo") && require("MASS") && require("ICSOutlier") && require("ICS") && require("dbscan")) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} \donttest{ # You can also run all the methods -check_outliers(data, method = "all") +check_outliers(data, method = "all", verbose = FALSE) # For statistical models --------------------------------------------- # select only mpg and disp (continuous) @@ -343,12 +383,17 @@ IEEE. (2021). performance: An R package for assessment, comparison and testing of statistical models. Journal of Open Source Software, 6(60), 3139. \doi{10.21105/joss.03139} +\item Thériault, R., Ben-Shachar, M. S., Patil, I., Lüdecke, D., Wiernik, B. M., +and Makowski, D. (2023). Check your outliers! An introduction to identifying +statistical outliers in R with easystats. \doi{10.31234/osf.io/bu6nt} \item Rousseeuw, P. J., and Van Zomeren, B. C. (1990). Unmasking multivariate outliers and leverage points. Journal of the American Statistical association, 85(411), 633-639. } } \seealso{ +\code{\link[see:plot.see_check_outliers]{see::plot.see_check_outliers()}} for options to customize the plot. + Other functions to check model assumptions and and assess model quality: \code{\link{check_autocorrelation}()}, \code{\link{check_collinearity}()}, diff --git a/man/check_overdispersion.Rd b/man/check_overdispersion.Rd index e8a68a6c9..692f7aa0b 100644 --- a/man/check_overdispersion.Rd +++ b/man/check_overdispersion.Rd @@ -2,15 +2,22 @@ % Please edit documentation in R/check_overdispersion.R \name{check_overdispersion} \alias{check_overdispersion} -\title{Check overdispersion of GL(M)M's} +\alias{check_overdispersion.performance_simres} +\title{Check overdispersion (and underdispersion) of GL(M)M's} \usage{ check_overdispersion(x, ...) 
+ +\method{check_overdispersion}{performance_simres}(x, alternative = c("two.sided", "less", "greater"), ...) } \arguments{ -\item{x}{Fitted model of class \code{merMod}, \code{glmmTMB}, \code{glm}, -or \code{glm.nb} (package \pkg{MASS}).} +\item{x}{Fitted model of class \code{merMod}, \code{glmmTMB}, \code{glm}, or \code{glm.nb} +(package \strong{MASS}), or an object returned by \code{simulate_residuals()}.} + +\item{...}{Arguments passed down to \code{\link[=simulate_residuals]{simulate_residuals()}}. This only applies +for models with zero-inflation component, or for models of class \code{glmmTMB} +from \code{nbinom1} or \code{nbinom2} family.} -\item{...}{Currently not used.} +\item{alternative}{A character string specifying the alternative hypothesis.} } \value{ A list with results from the overdispersion test, like chi-squared @@ -18,20 +25,23 @@ statistics, p-value or dispersion ratio. } \description{ \code{check_overdispersion()} checks generalized linear (mixed) -models for overdispersion. +models for overdispersion (and underdispersion). } \details{ Overdispersion occurs when the observed variance is higher than the variance of a theoretical model. For Poisson models, variance increases with the mean and, therefore, variance usually (roughly) equals the mean -value. If the variance is much higher, the data are "overdispersed". +value. If the variance is much higher, the data are "overdispersed". A less +common case is underdispersion, where the variance is much lower than the +mean. } \section{Interpretation of the Dispersion Ratio}{ If the dispersion ratio is close to one, a Poisson model fits well to the data. Dispersion ratios larger than one indicate overdispersion, thus a -negative binomial model or similar might fit better to the data. A p-value < -.05 indicates overdispersion. +negative binomial model or similar might fit better to the data. Dispersion +ratios much smaller than one indicate underdispersion. 
A p-value < .05 +indicates either overdispersion or underdispersion (the first being more common). } \section{Overdispersion in Poisson Models}{ @@ -40,6 +50,12 @@ For Poisson models, the overdispersion test is based on the code from \emph{Gelman and Hill (2007), page 115}. } +\section{Overdispersion in Negative Binomial or Zero-Inflated Models}{ + +For negative binomial (mixed) models or models with a zero-inflation component, +the overdispersion test is based on simulated residuals (see \code{\link[=simulate_residuals]{simulate_residuals()}}). +} + \section{Overdispersion in Mixed Models}{ For \code{merMod}- and \code{glmmTMB}-objects, \code{check_overdispersion()} @@ -47,8 +63,10 @@ is based on the code in the \href{http://bbolker.github.io/mixedmodels-misc/glmmFAQ.html}{GLMM FAQ}, section \emph{How can I deal with overdispersion in GLMMs?}. Note that this function only returns an \emph{approximate} estimate of an overdispersion -parameter, and is probably inaccurate for zero-inflated mixed models (fitted -with \code{glmmTMB}). +parameter. Using this approach would be inaccurate for zero-inflated or +negative binomial mixed models (fitted with \code{glmmTMB}), thus, in such cases, +the overdispersion test is based on \code{\link[=simulate_residuals]{simulate_residuals()}} (which is identical +to \code{check_overdispersion(simulate_residuals(model))}). } \section{How to fix Overdispersion}{ @@ -58,19 +76,28 @@ by choosing a different distributional family (like Quasi-Poisson, or negative binomial, see \emph{Gelman and Hill (2007), pages 115-116}). } -\examples{ -\dontshow{if (getRversion() >= "4.0.0" && require("glmmTMB", quietly = TRUE)) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} +\section{Tests based on simulated residuals}{ -library(glmmTMB) -data(Salamanders) -m <- glm(count ~ spp + mined, family = poisson, data = Salamanders) -check_overdispersion(m) +For certain models, resp.
model from certain families, tests are based on +simulated residuals (see \code{\link[=simulate_residuals]{simulate_residuals()}}). These are usually more +accurate for testing such models than the traditionally used Pearson residuals. +However, when simulating from more complex models, such as mixed models or +models with zero-inflation, there are several important considerations. +Arguments specified in \code{...} are passed to \code{\link[=simulate_residuals]{simulate_residuals()}}, which +relies on \code{\link[DHARMa:simulateResiduals]{DHARMa::simulateResiduals()}} (and therefore, arguments in \code{...} +are passed further down to \emph{DHARMa}). The defaults in DHARMa are set on the +most conservative option that works for all models. However, in many cases, +the help advises to use different settings in particular situations or for +particular models. It is recommended to read the 'Details' in +\code{?DHARMa::simulateResiduals} closely to understand the implications of the +simulation process and which arguments should be modified to get the most +accurate results. +} -m <- glmmTMB( - count ~ mined + spp + (1 | site), - family = poisson, - data = Salamanders -) +\examples{ +\dontshow{if (getRversion() >= "4.0.0" && require("glmmTMB")) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} +data(Salamanders, package = "glmmTMB") +m <- glm(count ~ spp + mined, family = poisson, data = Salamanders) check_overdispersion(m) \dontshow{\}) # examplesIf} } diff --git a/man/check_predictions.Rd b/man/check_predictions.Rd index 591c813da..148df6994 100644 --- a/man/check_predictions.Rd +++ b/man/check_predictions.Rd @@ -117,6 +117,9 @@ Cambridge University Press. } } \seealso{ +\code{\link[=simulate_residuals]{simulate_residuals()}} and \code{\link[=check_residuals]{check_residuals()}}. See also +\code{\link[see:print.see_performance_pp_check]{see::print.see_performance_pp_check()}} for options to customize the plot. 
+ Other functions to check model assumptions and and assess model quality: \code{\link{check_autocorrelation}()}, \code{\link{check_collinearity}()}, diff --git a/man/check_residuals.Rd b/man/check_residuals.Rd new file mode 100644 index 000000000..972b9783e --- /dev/null +++ b/man/check_residuals.Rd @@ -0,0 +1,67 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/check_residuals.R +\name{check_residuals} +\alias{check_residuals} +\alias{check_residuals.default} +\title{Check uniformity of simulated residuals} +\usage{ +check_residuals(x, ...) + +\method{check_residuals}{default}(x, alternative = c("two.sided", "less", "greater"), ...) +} +\arguments{ +\item{x}{An object returned by \code{\link[=simulate_residuals]{simulate_residuals()}} or +\code{\link[DHARMa:simulateResiduals]{DHARMa::simulateResiduals()}}.} + +\item{...}{Passed down to \code{\link[stats:ks.test]{stats::ks.test()}}.} + +\item{alternative}{A character string specifying the alternative hypothesis. +See \code{\link[stats:ks.test]{stats::ks.test()}} for details.} +} +\value{ +The p-value of the test statistics. +} +\description{ +\code{check_residuals()} checks generalized linear (mixed) models for uniformity +of randomized quantile residuals, which can be used to identify typical model +misspecification problems, such as over/underdispersion, zero-inflation, and +residual spatial and temporal autocorrelation. +} +\details{ +Uniformity of residuals is checked using a Kolmogorov-Smirnov test. +There is a \code{plot()} method to visualize the distribution of the residuals. +The test for uniformity basically tests to which extent the observed values +deviate from the model expectations (i.e. simulated values). In this sense, +the \code{check_residuals()} function has similar goals like \code{\link[=check_predictions]{check_predictions()}}. +} +\section{Tests based on simulated residuals}{ + +For certain models, resp. 
model from certain families, tests like +\code{\link[=check_zeroinflation]{check_zeroinflation()}} or \code{\link[=check_overdispersion]{check_overdispersion()}} are based on +simulated residuals. These are usually more accurate for such tests than +the traditionally used Pearson residuals. However, when simulating from more +complex models, such as mixed models or models with zero-inflation, there are +several important considerations. \code{simulate_residuals()} relies on +\code{\link[DHARMa:simulateResiduals]{DHARMa::simulateResiduals()}}, and additional arguments specified in \code{...} +are passed further down to that function. The defaults in DHARMa are set on +the most conservative option that works for all models. However, in many +cases, the help advises to use different settings in particular situations +or for particular models. It is recommended to read the 'Details' in +\code{?DHARMa::simulateResiduals} closely to understand the implications of the +simulation process and which arguments should be modified to get the most +accurate results. +} + +\examples{ +\dontshow{if (require("DHARMa")) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} +dat <- DHARMa::createData(sampleSize = 100, overdispersion = 0.5, family = poisson()) +m <- glm(observedResponse ~ Environment1, family = poisson(), data = dat) +res <- simulate_residuals(m) +check_residuals(res) +\dontshow{\}) # examplesIf} +} +\seealso{ +\code{\link[=simulate_residuals]{simulate_residuals()}}, \code{\link[=check_zeroinflation]{check_zeroinflation()}}, +\code{\link[=check_overdispersion]{check_overdispersion()}} and \code{\link[=check_predictions]{check_predictions()}}. See also +\code{\link[see:plot.see_performance_simres]{see::plot.see_performance_simres()}} for options to customize the plot. 
+} diff --git a/man/check_singularity.Rd b/man/check_singularity.Rd index 67f47f9df..e4cafa99a 100644 --- a/man/check_singularity.Rd +++ b/man/check_singularity.Rd @@ -47,6 +47,24 @@ predictive accuracy and overfitting/type I error (\emph{Bates et al. 2015}, \item "keep it maximal", i.e. fit the most complex model consistent with the experimental design, removing only terms required to allow a non-singular fit (\emph{Barr et al. 2013}) +\item since version 1.1.9, the \strong{glmmTMB} package allows using priors in a +frequentist framework, too. One recommendation is to use a Gamma prior +(\emph{Chung et al. 2013}). The mean may vary from 1 to very large values +(like \code{1e8}), and the shape parameter should be set to a value of 2.5. You +can then \code{update()} your model with the specified prior. In \strong{glmmTMB}, +the code would look like this: + +\if{html}{\out{
}}\preformatted{# "model" is an object of class glmmTMB +prior <- data.frame( + prior = "gamma(1, 2.5)", # mean can be 1, but even 1e8 + class = "ranef" # for random effects +) +model_with_priors <- update(model, priors = prior) +}\if{html}{\out{
}} + +Large values for the mean parameter of the Gamma prior have no large impact +on the random effects variances in terms of a "bias". Thus, if \code{1} doesn't +fix the singular fit, you can safely try larger values. } Note the different meaning between singularity and convergence: singularity @@ -57,9 +75,8 @@ question of whether we can assume that the numerical optimization has worked correctly or not. } \examples{ -\dontshow{if (require("lme4")) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} -library(lme4) -data(sleepstudy) +\dontshow{if (require("lme4") && require("glmmTMB")) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} +data(sleepstudy, package = "lme4") set.seed(123) sleepstudy$mygrp <- sample(1:5, size = 180, replace = TRUE) sleepstudy$mysubgrp <- NA @@ -69,12 +86,32 @@ for (i in 1:5) { sample(1:30, size = sum(filter_group), replace = TRUE) } -model <- lmer( +model <- lme4::lmer( Reaction ~ Days + (1 | mygrp / mysubgrp) + (1 | Subject), data = sleepstudy ) +check_singularity(model) +# Fixing singularity issues using priors in glmmTMB +# Example taken from `vignette("priors", package = "glmmTMB")` +dat <- readRDS(system.file("vignette_data", "gophertortoise.rds", + package = "glmmTMB")) +model <- glmmTMB::glmmTMB( + shells ~ prev + offset(log(Area)) + factor(year) + (1 | Site), + family = poisson, + data = dat +) +# singular fit check_singularity(model) + +# impose Gamma prior on random effects parameters +prior <- data.frame( + prior = "gamma(1, 2.5)", # mean can be 1, but even 1e8 + class = "ranef" # for random effects +) +model_with_priors <- update(model, priors = prior) +# no singular fit +check_singularity(model_with_priors) \dontshow{\}) # examplesIf} } \references{ @@ -84,9 +121,11 @@ arXiv:1506.04967, June 2015. \item Barr DJ, Levy R, Scheepers C, Tily HJ. Random effects structure for confirmatory hypothesis testing: Keep it maximal. Journal of Memory and Language, 68(3):255-278, April 2013. 
-\item Matuschek H, Kliegl R, Vasishth S, Baayen H, Bates D. Balancing type -I error and power in linear mixed models. Journal of Memory and Language, -94:305-315, 2017. +\item Chung Y, Rabe-Hesketh S, Dorie V, Gelman A, and Liu J. 2013. "A Nondegenerate +Penalized Likelihood Estimator for Variance Parameters in Multilevel Models." +Psychometrika 78 (4): 685–709. \doi{10.1007/s11336-013-9328-2} +\item Matuschek H, Kliegl R, Vasishth S, Baayen H, Bates D. Balancing type I error +and power in linear mixed models. Journal of Memory and Language, 94:305-315, 2017. \item lme4 Reference Manual, \url{https://cran.r-project.org/package=lme4} } } diff --git a/man/check_zeroinflation.Rd b/man/check_zeroinflation.Rd index db9eddd23..7a4da3945 100644 --- a/man/check_zeroinflation.Rd +++ b/man/check_zeroinflation.Rd @@ -2,18 +2,35 @@ % Please edit documentation in R/check_zeroinflation.R \name{check_zeroinflation} \alias{check_zeroinflation} +\alias{check_zeroinflation.default} +\alias{check_zeroinflation.performance_simres} \title{Check for zero-inflation in count models} \usage{ -check_zeroinflation(x, tolerance = 0.05) +check_zeroinflation(x, ...) + +\method{check_zeroinflation}{default}(x, tolerance = 0.05, ...) + +\method{check_zeroinflation}{performance_simres}( + x, + tolerance = 0.1, + alternative = c("two.sided", "less", "greater"), + ... +) } \arguments{ \item{x}{Fitted model of class \code{merMod}, \code{glmmTMB}, \code{glm}, or \code{glm.nb} (package \strong{MASS}).} +\item{...}{Arguments passed down to \code{\link[=simulate_residuals]{simulate_residuals()}}. This only applies +for models with zero-inflation component, or for models of class \code{glmmTMB} +from \code{nbinom1} or \code{nbinom2} family.} + \item{tolerance}{The tolerance for the ratio of observed and predicted zeros to considered as over- or underfitting zeros. 
A ratio between 1 +/- \code{tolerance} is considered as OK, while a ratio beyond or below this threshold would indicate over- or underfitting.} + +\item{alternative}{A character string specifying the alternative hypothesis.} } \value{ A list with information about the amount of predicted and observed @@ -28,12 +45,46 @@ If the amount of observed zeros is larger than the amount of predicted zeros, the model is underfitting zeros, which indicates a zero-inflation in the data. In such cases, it is recommended to use negative binomial or zero-inflated models. + +In case of negative binomial models, models with zero-inflation component, +or hurdle models, the results from \code{check_zeroinflation()} are based on +\code{\link[=simulate_residuals]{simulate_residuals()}}, i.e. \code{check_zeroinflation(simulate_residuals(model))} +is internally called if necessary. } +\section{Tests based on simulated residuals}{ + +For certain models, resp. model from certain families, tests are based on +simulated residuals (see \code{\link[=simulate_residuals]{simulate_residuals()}}). These are usually more +accurate for testing such models than the traditionally used Pearson residuals. +However, when simulating from more complex models, such as mixed models or +models with zero-inflation, there are several important considerations. +Arguments specified in \code{...} are passed to \code{\link[=simulate_residuals]{simulate_residuals()}}, which +relies on \code{\link[DHARMa:simulateResiduals]{DHARMa::simulateResiduals()}} (and therefore, arguments in \code{...} +are passed further down to \emph{DHARMa}). The defaults in DHARMa are set on the +most conservative option that works for all models. However, in many cases, +the help advises to use different settings in particular situations or for +particular models. 
It is recommended to read the 'Details' in +\code{?DHARMa::simulateResiduals} closely to understand the implications of the +simulation process and which arguments should be modified to get the most +accurate results. +} + \examples{ -\dontshow{if (require("glmmTMB")) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} +\dontshow{if (require("glmmTMB") && require("DHARMa")) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} data(Salamanders, package = "glmmTMB") m <- glm(count ~ spp + mined, family = poisson, data = Salamanders) check_zeroinflation(m) + +# for models with zero-inflation component, it's better to carry out +# the check for zero-inflation using simulated residuals +m <- glmmTMB::glmmTMB( + count ~ spp + mined, + ziformula = ~ mined + spp, + family = poisson, + data = Salamanders +) +res <- simulate_residuals(m) +check_zeroinflation(res) \dontshow{\}) # examplesIf} } \seealso{ diff --git a/man/compare_performance.Rd b/man/compare_performance.Rd index 30c324351..d6b17b9f1 100644 --- a/man/compare_performance.Rd +++ b/man/compare_performance.Rd @@ -25,7 +25,8 @@ overall model performance. See 'Details'.} \item{estimator}{Only for linear models. Corresponds to the different estimators for the standard deviation of the errors. If \code{estimator = "ML"} -(default), the scaling is done by n (the biased ML estimator), which is +(default, except for \code{performance_aic()} when the model object is of class +\code{lmerMod}), the scaling is done by \code{n} (the biased ML estimator), which is then equivalent to using \code{AIC(logLik())}. 
Setting it to \code{"REML"} will give the same results as \code{AIC(logLik(..., REML = TRUE))}.} diff --git a/man/figures/unnamed-chunk-14-1.png b/man/figures/unnamed-chunk-14-1.png index 4a0474675..89a6592ba 100644 Binary files a/man/figures/unnamed-chunk-14-1.png and b/man/figures/unnamed-chunk-14-1.png differ diff --git a/man/figures/unnamed-chunk-20-1.png b/man/figures/unnamed-chunk-20-1.png index 1f2f7b4cc..063ce5313 100644 Binary files a/man/figures/unnamed-chunk-20-1.png and b/man/figures/unnamed-chunk-20-1.png differ diff --git a/man/looic.Rd b/man/looic.Rd index 742ac3482..7f985f4a2 100644 --- a/man/looic.Rd +++ b/man/looic.Rd @@ -22,6 +22,7 @@ indicative of a better fit. } \examples{ \dontshow{if (require("rstanarm")) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} +\donttest{ model <- suppressWarnings(rstanarm::stan_glm( mpg ~ wt + cyl, data = mtcars, @@ -30,5 +31,6 @@ model <- suppressWarnings(rstanarm::stan_glm( refresh = 0 )) looic(model) +} \dontshow{\}) # examplesIf} } diff --git a/man/model_performance.merMod.Rd b/man/model_performance.merMod.Rd index 519f1ee0a..2f02ea179 100644 --- a/man/model_performance.merMod.Rd +++ b/man/model_performance.merMod.Rd @@ -21,7 +21,8 @@ BIC, R2, ICC and RMSE.} \item{estimator}{Only for linear models. Corresponds to the different estimators for the standard deviation of the errors. If \code{estimator = "ML"} -(default), the scaling is done by n (the biased ML estimator), which is +(default, except for \code{performance_aic()} when the model object is of class +\code{lmerMod}), the scaling is done by \code{n} (the biased ML estimator), which is then equivalent to using \code{AIC(logLik())}. 
Setting it to \code{"REML"} will give the same results as \code{AIC(logLik(..., REML = TRUE))}.} diff --git a/man/model_performance.rma.Rd b/man/model_performance.rma.Rd index 69d1923ba..f6489005b 100644 --- a/man/model_performance.rma.Rd +++ b/man/model_performance.rma.Rd @@ -20,7 +20,8 @@ computed (some of \code{c("AIC", "BIC", "I2", "H2", "TAU2", "R2", "CochransQ", " \item{estimator}{Only for linear models. Corresponds to the different estimators for the standard deviation of the errors. If \code{estimator = "ML"} -(default), the scaling is done by n (the biased ML estimator), which is +(default, except for \code{performance_aic()} when the model object is of class +\code{lmerMod}), the scaling is done by \code{n} (the biased ML estimator), which is then equivalent to using \code{AIC(logLik())}. Setting it to \code{"REML"} will give the same results as \code{AIC(logLik(..., REML = TRUE))}.} @@ -34,13 +35,13 @@ A data frame (with one row) and one column per "index" (see } \description{ Compute indices of model performance for meta-analysis model from the -\pkg{metafor} package. +\strong{metafor} package. } \details{ \subsection{Indices of fit}{ \itemize{ \item \strong{AIC} Akaike's Information Criterion, see \code{?stats::AIC} -\item \strong{BIC} {Bayesian Information Criterion, see \code{?stats::BIC}} +\item \strong{BIC} Bayesian Information Criterion, see \code{?stats::BIC} \item \strong{I2}: For a random effects model, \code{I2} estimates (in percent) how much of the total variability in the effect size estimates can be attributed to heterogeneity among the true effects. For a diff --git a/man/model_performance.stanreg.Rd b/man/model_performance.stanreg.Rd index bbd82bc53..622c5b18a 100644 --- a/man/model_performance.stanreg.Rd +++ b/man/model_performance.stanreg.Rd @@ -60,7 +60,7 @@ values mean better fit. See \code{?loo::waic}. 
} } \examples{ -\dontshow{if (require("rstanarm") && require("rstantools") && require("BayesFactor")) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} +\dontshow{if (require("rstanarm") && require("rstantools")) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} \donttest{ model <- suppressWarnings(rstanarm::stan_glm( mpg ~ wt + cyl, @@ -79,12 +79,6 @@ model <- suppressWarnings(rstanarm::stan_glmer( refresh = 0 )) model_performance(model) - -model <- BayesFactor::generalTestBF(carb ~ am + mpg, mtcars) - -model_performance(model) -model_performance(model[3]) -model_performance(model, average = TRUE) } \dontshow{\}) # examplesIf} } diff --git a/man/performance-package.Rd b/man/performance-package.Rd index 781db6a18..b3f1d5a3f 100644 --- a/man/performance-package.Rd +++ b/man/performance-package.Rd @@ -3,7 +3,6 @@ \docType{package} \name{performance-package} \alias{performance-package} -\alias{_PACKAGE} \title{performance: An R Package for Assessment, Comparison and Testing of Statistical Models} \description{ @@ -44,12 +43,12 @@ Authors: \item Indrajeet Patil \email{patilindrajeet.science@gmail.com} (\href{https://orcid.org/0000-0003-1995-6531}{ORCID}) (@patilindrajeets) [contributor] \item Philip Waggoner \email{philip.waggoner@gmail.com} (\href{https://orcid.org/0000-0002-7825-7573}{ORCID}) [contributor] \item Brenton M. 
Wiernik \email{brenton@wiernik.org} (\href{https://orcid.org/0000-0001-9560-6336}{ORCID}) (@bmwiernik) [contributor] + \item Rémi Thériault \email{remi.theriault@mail.mcgill.ca} (\href{https://orcid.org/0000-0003-4315-6788}{ORCID}) (@rempsyc) [contributor] } Other contributors: \itemize{ \item Vincent Arel-Bundock \email{vincent.arel-bundock@umontreal.ca} (\href{https://orcid.org/0000-0003-2042-7063}{ORCID}) [contributor] - \item Rémi Thériault \email{remi.theriault@mail.mcgill.ca} (\href{https://orcid.org/0000-0003-4315-6788}{ORCID}) (@rempsyc) [contributor] \item Martin Jullum [reviewer] \item gjo11 [reviewer] \item Etienne Bacher \email{etienne.bacher@protonmail.com} (\href{https://orcid.org/0000-0002-9271-5075}{ORCID}) [contributor] diff --git a/man/performance_aicc.Rd b/man/performance_aicc.Rd index be3695527..0bf2120b7 100644 --- a/man/performance_aicc.Rd +++ b/man/performance_aicc.Rd @@ -4,6 +4,7 @@ \alias{performance_aicc} \alias{performance_aic} \alias{performance_aic.default} +\alias{performance_aic.lmerMod} \title{Compute the AIC or second-order AIC} \usage{ performance_aicc(x, ...) @@ -11,6 +12,8 @@ performance_aicc(x, ...) performance_aic(x, ...) \method{performance_aic}{default}(x, estimator = "ML", verbose = TRUE, ...) + +\method{performance_aic}{lmerMod}(x, estimator = "REML", verbose = TRUE, ...) } \arguments{ \item{x}{A model object.} @@ -19,7 +22,8 @@ performance_aic(x, ...) \item{estimator}{Only for linear models. Corresponds to the different estimators for the standard deviation of the errors. If \code{estimator = "ML"} -(default), the scaling is done by n (the biased ML estimator), which is +(default, except for \code{performance_aic()} when the model object is of class +\code{lmerMod}), the scaling is done by \code{n} (the biased ML estimator), which is then equivalent to using \code{AIC(logLik())}. 
Setting it to \code{"REML"} will give the same results as \code{AIC(logLik(..., REML = TRUE))}.} diff --git a/man/r2_bayes.Rd b/man/r2_bayes.Rd index 63b082e08..67200fb7d 100644 --- a/man/r2_bayes.Rd +++ b/man/r2_bayes.Rd @@ -62,67 +62,47 @@ R2 takes both the fixed and random effects into account. returns a posterior sample of Bayesian R2 values. } \examples{ +\dontshow{if (require("rstanarm") && require("rstantools") && require("brms")) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} library(performance) -if (require("rstanarm") && require("rstantools")) { - model <- suppressWarnings(stan_glm( - mpg ~ wt + cyl, - data = mtcars, - chains = 1, - iter = 500, - refresh = 0, - show_messages = FALSE - )) - r2_bayes(model) - - model <- suppressWarnings(stan_lmer( - Petal.Length ~ Petal.Width + (1 | Species), - data = iris, - chains = 1, - iter = 500, - refresh = 0 - )) - r2_bayes(model) -} - -if (require("BayesFactor")) { - BFM <- generalTestBF(mpg ~ qsec + gear, data = mtcars, progress = FALSE) - FM <- lmBF(mpg ~ qsec + gear, data = mtcars) - - r2_bayes(FM) - r2_bayes(BFM[3]) - r2_bayes(BFM, average = TRUE) # across all models - - # with random effects: - mtcars$gear <- factor(mtcars$gear) - model <- lmBF( - mpg ~ hp + cyl + gear + gear:wt, - mtcars, - progress = FALSE, - whichRandom = c("gear", "gear:wt") - ) - - r2_bayes(model) +\donttest{ +model <- suppressWarnings(rstanarm::stan_glm( + mpg ~ wt + cyl, + data = mtcars, + chains = 1, + iter = 500, + refresh = 0, + show_messages = FALSE +)) +r2_bayes(model) + +model <- suppressWarnings(rstanarm::stan_lmer( + Petal.Length ~ Petal.Width + (1 | Species), + data = iris, + chains = 1, + iter = 500, + refresh = 0 +)) +r2_bayes(model) } \donttest{ -if (require("brms")) { - model <- suppressWarnings(brms::brm( - mpg ~ wt + cyl, - data = mtcars, - silent = 2, - refresh = 0 - )) - r2_bayes(model) - - model <- suppressWarnings(brms::brm( - Petal.Length ~ Petal.Width + (1 | Species), - data = iris, - silent 
= 2, - refresh = 0 - )) - r2_bayes(model) -} +model <- suppressWarnings(brms::brm( + mpg ~ wt + cyl, + data = mtcars, + silent = 2, + refresh = 0 +)) +r2_bayes(model) + +model <- suppressWarnings(brms::brm( + Petal.Length ~ Petal.Width + (1 | Species), + data = iris, + silent = 2, + refresh = 0 +)) +r2_bayes(model) } +\dontshow{\}) # examplesIf} } \references{ Gelman, A., Goodrich, B., Gabry, J., and Vehtari, A. (2018). diff --git a/man/r2_kullback.Rd b/man/r2_kullback.Rd index d2980d18e..ae0f9dd55 100644 --- a/man/r2_kullback.Rd +++ b/man/r2_kullback.Rd @@ -2,13 +2,18 @@ % Please edit documentation in R/r2_kl.R \name{r2_kullback} \alias{r2_kullback} +\alias{r2_kullback.glm} \title{Kullback-Leibler R2} \usage{ -r2_kullback(model, adjust = TRUE) +r2_kullback(model, ...) + +\method{r2_kullback}{glm}(model, adjust = TRUE, ...) } \arguments{ \item{model}{A generalized linear model.} +\item{...}{Additional arguments. Currently not used.} + \item{adjust}{Logical, if \code{TRUE} (the default), the adjusted R2 value is returned.} } diff --git a/man/simulate_residuals.Rd b/man/simulate_residuals.Rd new file mode 100644 index 000000000..493461cda --- /dev/null +++ b/man/simulate_residuals.Rd @@ -0,0 +1,86 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/simulate_residuals.R +\name{simulate_residuals} +\alias{simulate_residuals} +\alias{residuals.performance_simres} +\title{Simulate randomized quantile residuals from a model} +\usage{ +simulate_residuals(x, iterations = 250, ...) + +\method{residuals}{performance_simres}(object, quantile_function = NULL, outlier_values = NULL, ...) +} +\arguments{ +\item{x}{A model object.} + +\item{iterations}{Number of simulations to run.} + +\item{...}{Arguments passed on to \code{\link[DHARMa:simulateResiduals]{DHARMa::simulateResiduals()}}.} + +\item{object}{A \code{performance_simres} object, as returned by \code{simulate_residuals()}.} + +\item{quantile_function}{A function to apply to the residuals. 
If \code{NULL}, the +residuals are returned as is. If not \code{NULL}, the residuals are passed to this +function. This is useful for returning normally distributed residuals, for +example: \code{residuals(x, quantile_function = qnorm)}.} + +\item{outlier_values}{A vector of length 2, specifying the values to replace +\code{-Inf} and \code{Inf} with, respectively.} +} +\value{ +Simulated residuals, which can be further processed with +\code{\link[=check_residuals]{check_residuals()}}. The returned object is of class \code{DHARMa} and +\code{performance_simres}. +} +\description{ +Returns simulated residuals from a model. This is useful for +checking the uniformity of residuals, in particular for non-Gaussian models, +where the residuals are not expected to be normally distributed. +} +\details{ +This function is a small wrapper around \code{\link[DHARMa:simulateResiduals]{DHARMa::simulateResiduals()}}. +It basically only sets \code{plot = FALSE} and adds an additional class attribute +(\code{"performance_simres"}), which allows using the DHARMa object in own plotting +functions from the \strong{see} package. See also \code{vignette("DHARMa")}. There is a +\code{plot()} method to visualize the distribution of the residuals. +} +\section{Tests based on simulated residuals}{ + +For certain models, or models from certain families, tests like +\code{\link[=check_zeroinflation]{check_zeroinflation()}} or \code{\link[=check_overdispersion]{check_overdispersion()}} are based on +simulated residuals. These are usually more accurate for such tests than +the traditionally used Pearson residuals. However, when simulating from more +complex models, such as mixed models or models with zero-inflation, there are +several important considerations. \code{simulate_residuals()} relies on +\code{\link[DHARMa:simulateResiduals]{DHARMa::simulateResiduals()}}, and additional arguments specified in \code{...} +are passed further down to that function.
The defaults in DHARMa are set on +the most conservative option that works for all models. However, in many +cases, the help advises to use different settings in particular situations +or for particular models. It is recommended to read the 'Details' in +\code{?DHARMa::simulateResiduals} closely to understand the implications of the +simulation process and which arguments should be modified to get the most +accurate results. +} + +\examples{ +\dontshow{if (require("DHARMa")) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} +m <- lm(mpg ~ wt + cyl + gear + disp, data = mtcars) +simulate_residuals(m) + +# extract residuals +head(residuals(simulate_residuals(m))) +\dontshow{\}) # examplesIf} +} +\references{ +\itemize{ +\item Hartig, F., & Lohse, L. (2022). DHARMa: Residual Diagnostics for Hierarchical +(Multi-Level / Mixed) Regression Models (Version 0.4.5). Retrieved from +https://CRAN.R-project.org/package=DHARMa +\item Dunn, P. K., & Smyth, G. K. (1996). Randomized Quantile Residuals. Journal +of Computational and Graphical Statistics, 5(3), 236. \doi{10.2307/1390802} +} +} +\seealso{ +\code{\link[=check_residuals]{check_residuals()}}, \code{\link[=check_zeroinflation]{check_zeroinflation()}}, +\code{\link[=check_overdispersion]{check_overdispersion()}} and \code{\link[=check_predictions]{check_predictions()}}. See also +\code{\link[see:plot.see_performance_simres]{see::plot.see_performance_simres()}} for options to customize the plot. 
+} diff --git a/papers/JOSE/apa.csl b/papers/JOSE/apa.csl new file mode 100644 index 000000000..946c7fcd2 --- /dev/null +++ b/papers/JOSE/apa.csl @@ -0,0 +1,1917 @@ + + \ No newline at end of file diff --git a/papers/JOSE/arxiv.sty b/papers/JOSE/arxiv.sty new file mode 100644 index 000000000..f32d6d899 --- /dev/null +++ b/papers/JOSE/arxiv.sty @@ -0,0 +1,255 @@ +\NeedsTeXFormat{LaTeX2e} + +\ProcessOptions\relax + +% fonts +\renewcommand{\rmdefault}{ptm} +\renewcommand{\sfdefault}{phv} + +% set page geometry +\usepackage[verbose=true,letterpaper]{geometry} +\AtBeginDocument{ + \newgeometry{ + textheight=9in, + textwidth=6.5in, + top=1in, + headheight=14pt, + headsep=25pt, + footskip=30pt + } +} + +\widowpenalty=10000 +\clubpenalty=10000 +\flushbottom +\sloppy + +\usepackage{fancyhdr} +\fancyhf{} +\pagestyle{fancy} +\renewcommand{\headrulewidth}{0pt} +\fancyheadoffset{0pt} +\rhead{\scshape A preprint - \today} +\cfoot{\thepage} + + +%Handling Keywords +\def\keywordname{{\bfseries \emph Keywords}}% +\def\keywords#1{\par\addvspace\medskipamount{\rightskip=0pt plus1cm +\def\and{\ifhmode\unskip\nobreak\fi\ $\cdot$ +}\noindent\keywordname\enspace\ignorespaces#1\par}} + +% font sizes with reduced leading +\renewcommand{\normalsize}{% + \@setfontsize\normalsize\@xpt\@xipt + \abovedisplayskip 7\p@ \@plus 2\p@ \@minus 5\p@ + \abovedisplayshortskip \z@ \@plus 3\p@ + \belowdisplayskip \abovedisplayskip + \belowdisplayshortskip 4\p@ \@plus 3\p@ \@minus 3\p@ +} +\normalsize +\renewcommand{\small}{% + \@setfontsize\small\@ixpt\@xpt + \abovedisplayskip 6\p@ \@plus 1.5\p@ \@minus 4\p@ + \abovedisplayshortskip \z@ \@plus 2\p@ + \belowdisplayskip \abovedisplayskip + \belowdisplayshortskip 3\p@ \@plus 2\p@ \@minus 2\p@ +} +\renewcommand{\footnotesize}{\@setfontsize\footnotesize\@ixpt\@xpt} +\renewcommand{\scriptsize}{\@setfontsize\scriptsize\@viipt\@viiipt} +\renewcommand{\tiny}{\@setfontsize\tiny\@vipt\@viipt} +\renewcommand{\large}{\@setfontsize\large\@xiipt{14}} 
+\renewcommand{\Large}{\@setfontsize\Large\@xivpt{16}} +\renewcommand{\LARGE}{\@setfontsize\LARGE\@xviipt{20}} +\renewcommand{\huge}{\@setfontsize\huge\@xxpt{23}} +\renewcommand{\Huge}{\@setfontsize\Huge\@xxvpt{28}} + +% sections with less space +\providecommand{\section}{} +\renewcommand{\section}{% + \@startsection{section}{1}{\z@}% + {-2.0ex \@plus -0.5ex \@minus -0.2ex}% + { 1.5ex \@plus 0.3ex \@minus 0.2ex}% + {\large\bf\raggedright}% +} +\providecommand{\subsection}{} +\renewcommand{\subsection}{% + \@startsection{subsection}{2}{\z@}% + {-1.8ex \@plus -0.5ex \@minus -0.2ex}% + { 0.8ex \@plus 0.2ex}% + {\normalsize\bf\raggedright}% +} +\providecommand{\subsubsection}{} +\renewcommand{\subsubsection}{% + \@startsection{subsubsection}{3}{\z@}% + {-1.5ex \@plus -0.5ex \@minus -0.2ex}% + { 0.5ex \@plus 0.2ex}% + {\normalsize\bf\raggedright}% +} +\providecommand{\paragraph}{} +\renewcommand{\paragraph}{% + \@startsection{paragraph}{4}{\z@}% + {1.5ex \@plus 0.5ex \@minus 0.2ex}% + {-1em}% + {\normalsize\bf}% +} +\providecommand{\subparagraph}{} +\renewcommand{\subparagraph}{% + \@startsection{subparagraph}{5}{\z@}% + {1.5ex \@plus 0.5ex \@minus 0.2ex}% + {-1em}% + {\normalsize\bf}% +} +\providecommand{\subsubsubsection}{} +\renewcommand{\subsubsubsection}{% + \vskip5pt{\noindent\normalsize\rm\raggedright}% +} + +% float placement +\renewcommand{\topfraction }{0.85} +\renewcommand{\bottomfraction }{0.4} +\renewcommand{\textfraction }{0.1} +\renewcommand{\floatpagefraction}{0.7} + +\newlength{\@abovecaptionskip}\setlength{\@abovecaptionskip}{7\p@} +\newlength{\@belowcaptionskip}\setlength{\@belowcaptionskip}{\z@} + +\setlength{\abovecaptionskip}{\@abovecaptionskip} +\setlength{\belowcaptionskip}{\@belowcaptionskip} + +% swap above/belowcaptionskip lengths for tables +\renewenvironment{table} + {\setlength{\abovecaptionskip}{\@belowcaptionskip}% + \setlength{\belowcaptionskip}{\@abovecaptionskip}% + \@float{table}} + {\end@float} + +% footnote formatting 
+\setlength{\footnotesep }{6.65\p@} +\setlength{\skip\footins}{9\p@ \@plus 4\p@ \@minus 2\p@} +\renewcommand{\footnoterule}{\kern-3\p@ \hrule width 12pc \kern 2.6\p@} +\setcounter{footnote}{0} + +% paragraph formatting +\setlength{\parindent}{\z@} +\setlength{\parskip }{5.5\p@} + +% list formatting +\setlength{\topsep }{4\p@ \@plus 1\p@ \@minus 2\p@} +\setlength{\partopsep }{1\p@ \@plus 0.5\p@ \@minus 0.5\p@} +\setlength{\itemsep }{2\p@ \@plus 1\p@ \@minus 0.5\p@} +\setlength{\parsep }{2\p@ \@plus 1\p@ \@minus 0.5\p@} +\setlength{\leftmargin }{3pc} +\setlength{\leftmargini }{\leftmargin} +\setlength{\leftmarginii }{2em} +\setlength{\leftmarginiii}{1.5em} +\setlength{\leftmarginiv }{1.0em} +\setlength{\leftmarginv }{0.5em} +\def\@listi {\leftmargin\leftmargini} +\def\@listii {\leftmargin\leftmarginii + \labelwidth\leftmarginii + \advance\labelwidth-\labelsep + \topsep 2\p@ \@plus 1\p@ \@minus 0.5\p@ + \parsep 1\p@ \@plus 0.5\p@ \@minus 0.5\p@ + \itemsep \parsep} +\def\@listiii{\leftmargin\leftmarginiii + \labelwidth\leftmarginiii + \advance\labelwidth-\labelsep + \topsep 1\p@ \@plus 0.5\p@ \@minus 0.5\p@ + \parsep \z@ + \partopsep 0.5\p@ \@plus 0\p@ \@minus 0.5\p@ + \itemsep \topsep} +\def\@listiv {\leftmargin\leftmarginiv + \labelwidth\leftmarginiv + \advance\labelwidth-\labelsep} +\def\@listv {\leftmargin\leftmarginv + \labelwidth\leftmarginv + \advance\labelwidth-\labelsep} +\def\@listvi {\leftmargin\leftmarginvi + \labelwidth\leftmarginvi + \advance\labelwidth-\labelsep} + +% create title +\providecommand{\maketitle}{} +\renewcommand{\maketitle}{% + \par + \begingroup + \renewcommand{\thefootnote}{\fnsymbol{footnote}} + % for perfect author name centering + \renewcommand{\@makefnmark}{\hbox to \z@{$^{\@thefnmark}$\hss}} + % The footnote-mark was overlapping the footnote-text, + % added the following to fix this problem (MK) + \long\def\@makefntext##1{% + \parindent 1em\noindent + \hbox to 1.8em{\hss $\m@th ^{\@thefnmark}$}##1 + } + \thispagestyle{empty} + 
\@maketitle + \@thanks + %\@notice + \endgroup + \let\maketitle\relax + \let\thanks\relax +} + +% rules for title box at top of first page +\newcommand{\@toptitlebar}{ + \hrule height 2\p@ + \vskip 0.25in + \vskip -\parskip% +} +\newcommand{\@bottomtitlebar}{ + \vskip 0.29in + \vskip -\parskip + \hrule height 2\p@ + \vskip 0.09in% +} + +% create title (includes both anonymized and non-anonymized versions) +\providecommand{\@maketitle}{} +\renewcommand{\@maketitle}{% + \vbox{% + \hsize\textwidth + \linewidth\hsize + \vskip 0.1in + \@toptitlebar + \centering + {\LARGE\sc \@title\par} + \@bottomtitlebar + \textsc{A Preprint}\\ + \vskip 0.1in + \def\And{% + \end{tabular}\hfil\linebreak[0]\hfil% + \begin{tabular}[t]{c}\bf\rule{\z@}{24\p@}\ignorespaces% + } + \def\AND{% + \end{tabular}\hfil\linebreak[4]\hfil% + \begin{tabular}[t]{c}\bf\rule{\z@}{24\p@}\ignorespaces% + } + \begin{tabular}[t]{c}\bf\rule{\z@}{24\p@}\@author\end{tabular}% + \vskip 0.4in \@minus 0.1in \center{\today} \vskip 0.2in + } +} + +% add conference notice to bottom of first page +\newcommand{\ftype@noticebox}{8} +\newcommand{\@notice}{% + % give a bit of extra room back to authors on first page + \enlargethispage{2\baselineskip}% + \@float{noticebox}[b]% + \footnotesize\@noticestring% + \end@float% +} + +% abstract styling +\renewenvironment{abstract} +{ + \centerline + {\large \bfseries \scshape Abstract} + \begin{quote} +} +{ + \end{quote} +} + +\endinput diff --git a/papers/JOSE/cover_letter.Rmd b/papers/JOSE/cover_letter.Rmd new file mode 100644 index 000000000..4251c050a --- /dev/null +++ b/papers/JOSE/cover_letter.Rmd @@ -0,0 +1,29 @@ +--- +output: pdf_document +--- + +Dear Dr. Vazire, + +We are pleased to submit this paper to *Collabra: Psychology*. + +The paper, titled "Check your outliers! 
An introduction to identifying statistical outliers in R with *easystats*", provides an overview of current recommendations and best practices regarding the diagnosis and treatment of outliers, a common issue faced by researchers---and a potential source of scientific malpractice. + +It explains the key approaches, highlights recommendations, and shows how users can adopt them in their R analysis with a single function. The manuscript covers univariate, multivariate, and model-based statistical outlier detection methods, their recommended threshold, standard output, and plotting method, among other things. + +Beyond acting as a concise review of outlier treatment procedures and a practical tutorial, we also introduce a new outlier-detection method that relies on a consensus-based approach. In this sense, the paper fits well with the "Methodology and Research Practice in Psychology" section of the journal, as it essentially communicates to psychologists how to easily follow some of the best practices in the detection of statistical outliers using currently available open source and free software. This makes the manuscript relevant to data science, behavioural science, and good research and statistical practices more generally. + +As Associate Editor, we would like to suggest Jeffrey Girard, as he is familiar with the *easystats* and R ecosystems, as well as good statistical practices. Additionally, we would like to request a streamlined review, as the manuscript has been rejected within the previous 365 days from the journal *Mathematics*. Accordingly, we provide in this submission a detailed letter that includes prior reviews, the decision letter, as well as how we addressed the reviewers' comments. We have in this regard integrated most of the changes suggested by the reviewers. Note that the previous editors and reviewers have not given their permission for their comments to be openly available at *Collabra: Psychology*.
However, although an open review was requested, the reviewers did not sign their reviews. + +Our current submission is original and has neither been published elsewhere nor is it currently under consideration for publication elsewhere. All authors have contributed substantially to the software and manuscript. All authors gave final approval to the manuscript and agree to be accountable. We have no conflicts of interest to disclose. We have also read the Transparency and Openness policy of the Editorial Policies of *Collabra: Psychology*. + +Thank you for considering our submission. + +On behalf of all authors, + +Rémi Thériault + +Department of Psychology, + +Université du Québec à Montréal, + +Montréal, Québec, Canada \ No newline at end of file diff --git a/papers/JOSE/cover_letter.pdf b/papers/JOSE/cover_letter.pdf new file mode 100644 index 000000000..15650f828 Binary files /dev/null and b/papers/JOSE/cover_letter.pdf differ diff --git a/papers/JOSE/paper.Rmd b/papers/JOSE/paper.Rmd new file mode 100644 index 000000000..56a893b1c --- /dev/null +++ b/papers/JOSE/paper.Rmd @@ -0,0 +1,296 @@ +--- +title: "Check your outliers! An introduction to identifying statistical outliers in R with *easystats*" +tags: + - R + - univariate outliers + - multivariate outliers + - robust detection methods + - easystats +authors: + - name: Rémi Thériault + orcid: 0000-0003-4315-6788 + affiliation: 1 + - name: Mattan S. Ben-Shachar + orcid: 0000-0002-4287-4801 + affiliation: 2 + - name: Indrajeet Patil + orcid: 0000-0003-1995-6531 + affiliation: 3 + - name: Daniel Lüdecke + orcid: 0000-0002-8895-3206 + affiliation: 4 + - name: Brenton M.
Wiernik + orcid: 0000-0001-9560-6336 + affiliation: 5 + - name: Dominique Makowski + orcid: 0000-0001-5375-9967 + affiliation: 6 +affiliations: + - index: 1 + name: Department of Psychology, Université du Québec à Montréal, Montréal, Québec, Canada + - index: 2 + name: Independent Researcher, Ramat Gan, Israel + - index: 3 + name: Center for Humans and Machines, Max Planck Institute for Human Development, Berlin, Germany + - index: 4 + name: Institute of Medical Sociology, University Medical Center Hamburg-Eppendorf, Germany + - index: 5 + name: Independent Researcher, Tampa, FL, USA + - index: 6 + name: School of Psychology, University of Sussex, Brighton, UK +correspondence: theriault.remi@courrier.uqam.ca. +type: article +status: submit +date: 7 June 2023 +bibliography: paper.bib +simplesummary: | + The *{performance}* package from the *easystats* ecosystem makes it easy to + diagnose outliers in R according to current best practices thanks to the + `check_outliers()` function. +keywords: | + univariate outliers; multivariate outliers; robust detection methods; R; easystats +acknowledgement: | + *{performance}* is part of the collaborative + [*easystats*](https://github.com/easystats/easystats) ecosystem + [@easystatspackage]. Thus, we thank all + [members of easystats](https://github.com/orgs/easystats/people), + contributors, and users alike. +authorcontributions: | + R.T. drafted the paper; all authors contributed to both the writing of the + paper and the conception of the software. +funding: | + This research received no external funding. +conflictsofinterest: | + The authors declare no conflict of interest.
+abbreviations: + - short: SOD + long: Statistical outlier detection + - short: SEM + long: Structural equation modelling + - short: SD + long: Standard deviation + - short: MAD + long: Median absolute deviation + - short: IQR + long: Interquartile range + - short: HDI + long: Highest density interval + - short: BCI + long: Bias corrected and accelerated interval + - short: MCD + long: Minimum covariance determinant + - short: ICS + long: invariant coordinate selection + - short: OSF + long: Open Science Framework +output: + rticles::joss_article: + journal: "JOSE" +csl: apa.csl +--- + +```{r setup, include=FALSE} +knitr::opts_chunk$set( + echo = TRUE, + comment = "#>", + out.width = "100%", + dpi = 300, + warning = FALSE +) + +library(performance) +library(see) +library(datawizard) +``` + +# Summary + +Beyond the challenge of keeping up-to-date with current best practices regarding the diagnosis and treatment of outliers, an additional difficulty arises concerning the mathematical implementation of the recommended methods. Here, we provide an overview of current recommendations and best practices and demonstrate how they can easily and conveniently be implemented in the R statistical computing software, using the *{performance}* package of the *easystats* ecosystem. We cover univariate, multivariate, and model-based statistical outlier detection methods, their recommended threshold, standard output, and plotting methods. We conclude by reviewing the different theoretical types of outliers, whether to exclude or winsorize them, and the importance of transparency. + +# Statement of Need + +Real-life data often contain observations that can be considered *abnormal* when compared to the main population. The cause of it can be hard to assess and the boundaries of "abnormal", difficult to define---they may belong to a different distribution (originating from a different generative process) or simply be extreme cases, statistically rare but not impossible. 
+ +Nonetheless, the improper handling of these outliers can substantially affect statistical model estimations, biasing effect estimations and weakening the models' predictive performance. It is thus essential to address this problem in a thoughtful manner. Yet, despite the existence of established recommendations and guidelines, many researchers still do not treat outliers in a consistent manner, or do so using inappropriate strategies [@simmons2011false; @leys2013outliers]. + +One possible reason is that researchers are not aware of the existing recommendations, or do not know how to implement them using their analysis software. In this paper, we show how to follow current best practices for automatic and reproducible statistical outlier detection (SOD) using R and the *{performance}* package [@ludecke2021performance], which is part of the *easystats* ecosystem of packages that build an R framework for easy statistical modeling, visualization, and reporting [@easystatspackage]. Installation instructions can be found on [GitHub](https://github.com/easystats/performance) or its [website](https://easystats.github.io/performance/), and its list of dependencies on [CRAN](https://cran.r-project.org/package=performance). + +The instructional materials that follow are aimed at an audience of researchers who want to follow good practices, and are appropriate for advanced undergraduate students, graduate students, professors, or professionals having to deal with the nuances of outlier treatment. + +# Identifying Outliers + +Although many researchers attempt to identify outliers with measures based on the mean (e.g., _z_ scores), those methods are problematic because the mean and standard deviation themselves are not robust to the influence of outliers and those methods also assume normally distributed data (i.e., a Gaussian distribution). 
Therefore, current guidelines recommend using robust methods to identify outliers, such as those relying on the median as opposed to the mean [@leys2019outliers; @leys2013outliers; @leys2018outliers]. + +Nonetheless, which exact outlier method to use depends on many factors. In some cases, eye-gauging odd observations can be an appropriate solution, though many researchers will favour algorithmic solutions to detect potential outliers, for example, based on a continuous value expressing how much the observation stands out from the others. + +One of the factors to consider when selecting an algorithmic outlier detection method is the statistical test of interest. Identifying observations the regression model does not fit well can help find information relevant to our specific research context. This approach, known as model-based outliers detection (as outliers are extracted after the statistical model has been fit), can be contrasted with distribution-based outliers detection, which is based on the distance between an observation and the "center" of its population. Various quantification strategies of this distance exist for the latter, both univariate (involving only one variable at a time) and multivariate (involving multiple variables). + +When no method is readily available to detect model-based outliers, such as for structural equation modelling (SEM), looking for multivariate outliers may be of relevance. For simple tests (_t_ tests or correlations) that compare values of the same variable, it can be appropriate to check for univariate outliers. However, univariate methods can give false positives since _t_ tests and correlations, ultimately, are also models/multivariable statistics. They are in this sense more limited, but we show them nonetheless for educational purposes. + +Importantly, whatever approach researchers choose remains a subjective decision, whose usage (and rationale) must be transparently documented and reproducible [@leys2019outliers].
Researchers should commit (ideally in a preregistration) to an outlier treatment method before collecting the data. They should report in the paper their decisions and details of their methods, as well as any deviation from their original plan. These transparency practices can help reduce false positives due to excessive researchers' degrees of freedom (i.e., choice flexibility throughout the analysis). In the following section, we will go through each of the mentioned methods and provide examples on how to implement them with R. + +## Univariate Outliers + +Researchers frequently attempt to identify outliers using measures of deviation from the center of a variable's distribution. One of the most popular such procedures is the _z_ score transformation, which computes the distance in standard deviations (SD) from the mean. However, as mentioned earlier, this popular method is not robust. Therefore, for univariate outliers, it is recommended to use the median along with the Median Absolute Deviation (MAD), which are more robust than the interquartile range or the mean and its standard deviation [@leys2019outliers; @leys2013outliers]. + +Researchers can identify outliers based on robust (i.e., MAD-based) _z_ scores using the `check_outliers()` function of the *{performance}* package, by specifying `method = "zscore_robust"`.^[Note that `check_outliers()` only checks numeric variables.] Although @leys2013outliers suggest a default threshold of 2.5 and @leys2019outliers a threshold of 3, *{performance}* uses by default a less conservative threshold of ~3.29.^[3.29 is an approximation of the two-tailed critical value for _p_ < .001, obtained through `qnorm(p = 1 - 0.001 / 2)`. We chose this threshold for consistency with the thresholds of all our other methods.] That is, data points will be flagged as outliers if they go beyond +/- ~3.29 MAD. Users can adjust this threshold using the `threshold` argument.
+ +Below we provide example code using the `mtcars` dataset, which was extracted from the 1974 *Motor Trend* US magazine. The dataset contains fuel consumption and 10 characteristics of automobile design and performance for 32 different car models (see `?mtcars` for details). We chose this dataset because it is accessible from base R and familiar to many R users. We might want to conduct specific statistical analyses on this data set, say, _t_ tests or structural equation modelling, but first, we want to check for outliers that may influence those test results. + +Because the automobile names are stored as column names in `mtcars`, we first have to convert them to an ID column to benefit from the `check_outliers()` ID argument. Furthermore, we only really need a couple columns for this demonstration, so we choose the first four (`mpg` = Miles/(US) gallon; `cyl` = Number of cylinders; `disp` = Displacement; `hp` = Gross horsepower). Finally, because there are no outliers in this dataset, we add two artificial outliers before running our function. + +```{r z_score} +library(performance) + +# Create some artificial outliers and an ID column +data <- rbind(mtcars[1:4], 42, 55) +data <- cbind(car = row.names(data), data) + +outliers <- check_outliers(data, method = "zscore_robust", ID = "car") +outliers +``` + +What we see is that `check_outliers()` with the robust _z_ score method detected two outliers: cases 33 and 34, which were the observations we added ourselves. They were flagged for two variables specifically: `mpg` (Miles/(US) gallon) and `cyl` (Number of cylinders), and the output provides their exact _z_ score for those variables. 
+ +We describe how to deal with those cases in more detail later in the paper, but should we want to exclude these detected outliers from the main dataset, we can extract row numbers using `which()` on the output object, which can then be used for indexing: + +```{r} +which(outliers) + +data_clean <- data[-which(outliers), ] +``` + +Other univariate methods are available, such as using the interquartile range (IQR), or based on different intervals, such as the Highest Density Interval (HDI) or the Bias Corrected and Accelerated Interval (BCI). These methods are documented and described in the function's [help page](https://easystats.github.io/performance/reference/check_outliers.html). + +## Multivariate Outliers + +Univariate outliers can be useful when the focus is on a particular variable, for instance the reaction time, as extreme values might be indicative of inattention or non-task-related behavior^[ Note that they might not be the optimal way of treating reaction time outliers [@ratcliff1993methods; @van1995statistical]]. + +However, in many scenarios, variables of a data set are not independent, and an abnormal observation will impact multiple dimensions. For instance, a participant giving random answers to a questionnaire. In this case, computing the _z_ score for each of the questions might not lead to satisfactory results. Instead, one might want to look at these variables together. + +One common approach for this is to compute multivariate distance metrics such as the Mahalanobis distance. Although the Mahalanobis distance is very popular, just like the regular _z_ scores method, it is not robust and is heavily influenced by the outliers themselves. Therefore, for multivariate outliers, it is recommended to use the Minimum Covariance Determinant, a robust version of the Mahalanobis distance [MCD, @leys2018outliers; @leys2019outliers].
+ +In *{performance}*'s `check_outliers()`, one can use this approach with `method = "mcd"`.^[Our default threshold for the MCD method is defined by `stats::qchisq(p = 1 - 0.001, df = ncol(x))`, which again is an approximation of the critical value for _p_ < .001 consistent with the thresholds of our other methods.] + +```{r multivariate} +outliers <- check_outliers(data, method = "mcd") +outliers +``` + +Here, we detected 9 multivariate outliers (i.e., when looking at all variables of our dataset together). + +Other multivariate methods are available, such as another type of robust Mahalanobis distance that in this case relies on an orthogonalized Gnanadesikan-Kettenring pairwise estimator [@gnanadesikan1972robust]. These methods are documented and described in the function's [help page](https://easystats.github.io/performance/reference/check_outliers.html). + +## Model-Based Outliers + +Working with regression models creates the possibility of using model-based SOD methods. These methods rely on the concept of *leverage*, that is, how much influence a given observation can have on the model estimates. If few observations have a relatively strong leverage/influence on the model, one can suspect that the model's estimates are biased by these observations, in which case flagging them as outliers could prove helpful (see next section, "Handling Outliers"). + +In *{performance}*, two such model-based SOD methods are currently available: Cook's distance, for regular regression models, and Pareto, for Bayesian models. As such, `check_outliers()` can be applied directly on regression model objects, by simply specifying `method = "cook"` (or `method = "pareto"` for Bayesian models).^[Our default threshold for the Cook method is defined by `stats::qf(0.5, ncol(x), nrow(x) - ncol(x))`, which again is an approximation of the critical value for _p_ < .001 consistent with the thresholds of our other methods.]
+ +Currently, most lm models are supported (with the exception of `glmmTMB`, `lmrob`, and `glmrob` models), as long as they are supported by the underlying functions `stats::cooks.distance()` (or `loo::pareto_k_values()`) and `insight::get_data()` (for a full list of the 225 models currently supported by the `insight` package, see https://easystats.github.io/insight/#list-of-supported-models-by-class). Also note that although `check_outliers()` supports the pipe operators (`|>` or `%>%`), it does not support `tidymodels` at this time. We show a demo below. + +```{r model} +model <- lm(disp ~ mpg * disp, data = data) +outliers <- check_outliers(model, method = "cook") +outliers +``` + +Using the model-based outlier detection method, we identified a single outlier. + +Table 1 below summarizes which methods to use in which cases, and with what threshold. The recommended thresholds are the default thresholds. + +```{r table1_prep, echo=FALSE} +df <- data.frame( + `Statistical Test` = c( + "Supported regression model", + "Structural Equation Modeling (or other unsupported model)", + "Simple test with few variables (*t* test, correlation, etc.)"), + `Diagnosis Method` = c( + "**Model-based**: Cook (or Pareto for Bayesian models)", + "**Multivariate**: Minimum Covariance Determinant (MCD)", + "**Univariate**: robust *z* scores (MAD)"), + `Recommended Threshold` = c( + "_qf(0.5, ncol(x), nrow(x) - ncol(x))_ (or 0.7 for Pareto)", + "_qchisq(p = 1 - 0.001, df = ncol(x))_", + "_qnorm(p = 1 - 0.001 / 2)_, ~ 3.29"), + `Function Usage` = c( + '_check_outliers(model, method = "cook")_', + '_check_outliers(data, method = "mcd")_', + '_check_outliers(data, method = "zscore_robust")_'), + check.names = FALSE +) +``` + +### Table 1 + +_Summary of Statistical Outlier Detection Methods Recommendations_ + +```{r table1_print, echo=FALSE, message=FALSE, eval=FALSE} +x <- flextable::flextable(df, cwidth = 1.25) +x <- flextable::theme_apa(x) +x <- flextable::font(x, fontname = "Latin 
Modern Roman", part = "all") +x <- flextable::fontsize(x, size = 10, part = "all") +ftExtra::colformat_md(x) + +``` + +![](table1.jpg) + +All `check_outliers()` output objects possess a `plot()` method, meaning it is also possible to visualize the outliers using the generic `plot()` function on the resulting outlier object after loading the {see} package (Figure 1). + +```{r model_fig, fig.cap = "Visual depiction of outliers based on Cook's distance (leverage and standardized residuals), based on the fitted model."} +plot(outliers) +``` + +## Cook's Distance vs. MCD + +@leys2018outliers report a preference for the MCD method over Cook's distance. This is because Cook's distance removes one observation at a time and checks its corresponding influence on the model each time [@cook1977detection], and flags any observation that has a large influence. In the view of these authors, when there are several outliers, the process of removing a single outlier at a time is problematic as the model remains "contaminated" or influenced by other possible outliers in the model, rendering this method suboptimal in the presence of multiple outliers. + +However, distribution-based approaches are not a silver bullet either, and there are cases where the usage of methods agnostic to theoretical and statistical models of interest might be problematic. For example, a very tall person would be expected to also be much heavier than average, but that would still fit with the expected association between height and weight (i.e., it would be in line with a model such as `weight ~ height`). In contrast, using multivariate outlier detection methods may flag this person as being an outlier---being unusual on two variables, height and weight---even though the pattern fits perfectly with our predictions. + +Finally, unusual observations happen naturally: extreme observations are expected even when taken from a normal distribution.
While statistical models can integrate this "expectation", multivariate outlier methods might be too conservative, flagging too many observations despite belonging to the right generative process. For these reasons, we believe that model-based methods are still preferable to the MCD when using supported regression models. Additionally, if the presence of multiple outliers is a significant concern, regression methods that are more robust to outliers should be considered---like _t_ regression or quantile regression---as they render their precise identification less critical [@mcelreath2020statistical]. + +## Composite Outlier Score + +The *{performance}* package also offers an alternative, consensus-based approach that combines several methods, based on the assumption that different methods provide different angles of looking at a given problem. By applying a variety of methods, one can hope to "triangulate" the true outliers (those consistently flagged by multiple methods) and thus attempt to minimize false positives. + +In practice, this approach computes a composite outlier score, formed of the average of the binary (0 or 1) classification results of each method. It represents the probability that each observation is classified as an outlier by at least one method. The default decision rule classifies rows with composite outlier scores superior or equal to 0.5 as outlier observations (i.e., that were classified as outliers by at least half of the methods). In *{performance}*'s `check_outliers()`, one can use this approach by including all desired methods in the corresponding argument. + +```{r multimethod, fig.cap = "Visual depiction of outliers using several different statistical outlier detection methods."} +outliers <- check_outliers(model, method = c("zscore_robust", "mcd", "cook")) +which(outliers) +``` + +Outliers (counts or per variables) for individual methods can then be obtained through attributes. 
For example: + +```{r} +attributes(outliers)$outlier_var$zscore_robust +``` + +An example sentence for reporting the usage of the composite method could be: + +> Based on a composite outlier score [see the 'check_outliers()' function in the 'performance' R package, @ludecke2021performance] obtained via the joint application of multiple outliers detection algorithms [(a) median absolute deviation (MAD)-based robust _z_ scores, @leys2013outliers; (b) Mahalanobis minimum covariance determinant (MCD), @leys2019outliers; and (c) Cook's distance, @cook1977detection], we excluded two participants that were classified as outliers by at least half of the methods used. + +# Handling Outliers + +The above section demonstrated how to identify outliers using the `check_outliers()` function in the *{performance}* package. But what should we do with these outliers once identified? Although it is common to automatically discard any observation that has been marked as "an outlier" as if it might infect the rest of the data with its statistical ailment, we believe that the use of SOD methods is but one step in the get-to-know-your-data pipeline; a researcher or analyst's _domain knowledge_ must be involved in the decision of how to deal with observations marked as outliers by means of SOD. Indeed, automatic tools can help detect outliers, but they are nowhere near perfect. Although they can be useful to flag suspect data, they can have misses and false alarms, and they cannot replace human eyes and proper vigilance from the researcher. If you do end up manually inspecting your data for outliers, it can be helpful to think of outliers as belonging to different types of outliers, or categories, which can help decide what to do with a given outlier. + +## Error, Interesting, and Random Outliers + +@leys2019outliers distinguish between error outliers, interesting outliers, and random outliers. 
_Error outliers_ are likely due to human error and should be corrected before data analysis or outright removed since they are invalid observations. _Interesting outliers_ are not due to technical error and may be of theoretical interest; it might thus be relevant to investigate them further even though they should be removed from the current analysis of interest. _Random outliers_ are assumed to be due to chance alone and to belong to the correct distribution and, therefore, should be retained. + +It is recommended to _keep_ observations which are expected to be part of the distribution of interest, even if they are outliers [@leys2019outliers]. However, if it is suspected that the outliers belong to an alternative distribution, then those observations could have a large impact on the results and call into question their robustness, especially if significance is conditional on their inclusion, so should be removed. + +We should also keep in mind that there might be error outliers that are not detected by statistical tools, but should nonetheless be found and removed. For example, if we are studying the effects of X on Y among teenagers and we have one observation from a 20-year-old, this observation might not be a _statistical outlier_, but it is an outlier in the _context_ of our research, and should be discarded. We could call these observations *undetected* error outliers, in the sense that although they do not statistically stand out, they do not belong to the theoretical or empirical distribution of interest (e.g., teenagers). In this way, we should not blindly rely on statistical outlier detection methods; doing our due diligence to investigate undetected error outliers relative to our specific research question is also essential for valid inferences. + +## Winsorization + +_Removing_ outliers can in this case be a valid strategy, and ideally one would report results with and without outliers to see the extent of their impact on results. 
This approach however can reduce statistical power. Therefore, some propose a _recoding_ approach, namely, winsorization: bringing outliers back within acceptable limits [e.g., 3 MADs, @tukey1963less]. However, if possible, it is recommended to collect enough data so that even after removing outliers, there is still sufficient statistical power without having to resort to winsorization [@leys2019outliers]. + +The _easystats_ ecosystem makes it easy to incorporate this step into your workflow through the `winsorize()` function of *{datawizard}*, a lightweight R package to facilitate data wrangling and statistical transformations [@patil2022datawizard]. This procedure will bring back univariate outliers within the limits of 'acceptable' values, based either on the percentile, the _z_ score, or its robust alternative based on the MAD. + +## The Importance of Transparency + +Finally, it is a critical part of a sound outlier treatment that regardless of which SOD method used, it should be reported in a reproducible manner. Ideally, the handling of outliers should be specified *a priori* with as much detail as possible, and preregistered, to limit researchers' degrees of freedom and therefore risks of false positives [@leys2019outliers]. This is especially true given that interesting outliers and random outliers are often times hard to distinguish in practice. Thus, researchers should always prioritize transparency and report all of the following information: (a) how many outliers were identified (including percentage); (b) according to which method and criteria, (c) using which function of which R package (if applicable), and (d) how they were handled (excluded or winsorized, if the latter, using what threshold). If at all possible, (e) the corresponding code script along with the data should be shared on a public repository like the Open Science Framework (OSF), so that the exclusion criteria can be reproduced precisely. 
+ +# References \ No newline at end of file diff --git a/papers/JOSE/paper.bib b/papers/JOSE/paper.bib new file mode 100644 index 000000000..a7cdbb87d --- /dev/null +++ b/papers/JOSE/paper.bib @@ -0,0 +1,162 @@ +@article{leys2019outliers, + title = {How to Classify, Detect, and Manage Univariate and Multivariate Outliers, With Emphasis on Pre-Registration},author = {Leys, Christophe and Delacre, Marie and Mora, Youri L. and Lakens, Daniël and Ley, Christophe}, + journal = {International Review of Social Psychology}, + year = {2019}, + doi = {10.5334/irsp.289} +} + +@article{leys2013outliers, + title = {Detecting outliers: Do not use standard deviation around the mean, use absolute deviation around the median}, + author = {Christophe Leys and Christophe Ley and Olivier Klein and Philippe Bernard and Laurent Licata}, + journal = {Journal of Experimental Social Psychology}, + volume = {49}, + number = {4}, + pages = {764-766}, + year = {2013}, + doi = {10.1016/j.jesp.2013.03.013}, + url = {https://doi.org/10.1016/j.jesp.2013.03.013} +} + +@article{leys2018outliers, + title = {Detecting multivariate outliers: Use a robust variant of the Mahalanobis distance}, + journal = {Journal of Experimental Social Psychology}, + volume = {74}, + pages = {150-156}, + year = {2018}, + issn = {0022-1031}, + doi = {10.1016/j.jesp.2017.09.011}, + url = {https://www.sciencedirect.com/science/article/pii/S0022103117302123}, + author = {Christophe Leys and Olivier Klein and Yves Dominicy and Christophe Ley}, +} + +@article{simmons2011false, + author = {Joseph P. Simmons and Leif D. 
Nelson and Uri Simonsohn}, + title ={False-Positive Psychology: Undisclosed Flexibility in Data Collection and Analysis Allows Presenting Anything as Significant}, + journal = {Psychological Science}, + volume = {22}, + number = {11}, + pages = {1359-1366}, + year = {2011}, + doi = {10.1177/0956797611417632}, + URL = {https://doi.org/10.1177/0956797611417632}, +} + +@software{easystatspackage, + title = {{easystats}: Streamline Model Interpretation, Visualization, and Reporting}, + author = {Daniel Lüdecke and Dominique Makowski and Mattan S. Ben-Shachar and Indrajeet Patil and Brenton M. Wiernik and Etienne Bacher and Rémi Thériault}, + date = {2023-02-04T22:06:06Z}, + origdate = {2019-01-28T10:39:29Z}, + url = {https://easystats.github.io/easystats/} +} + +@Article{ludecke2021performance, + author = {Daniel Lüdecke and Mattan S. Ben-Shachar and Indrajeet Patil and Philip Waggoner and Dominique Makowski}, + title = {{performance}: An {R} package for assessment, comparison and testing of statistical models}, + volume = {6}, + number = {60}, + journal = {Journal of Open Source Software}, + year = {2021}, + pages = {3139}, + doi = {10.21105/joss.03139}, + url = {https://doi.org/10.21105/joss.03139} + } + +@Article{patil2022datawizard, + title = {{datawizard}: An {R} package for easy data preparation and statistical transformations}, + author = {Indrajeet Patil and Dominique Makowski and Mattan S. Ben-Shachar and Brenton M. Wiernik and Etienne Bacher and Daniel Lüdecke}, + journal = {Journal of Open Source Software}, + year = {2022}, + volume = {7}, + number = {78}, + pages = {4684}, + doi = {10.21105/joss.04684}, + } + +@article{cook1977detection, + author = {R. 
Dennis Cook}, + title = {Detection of Influential Observation in Linear Regression}, + journal = {Technometrics}, + volume = {19}, + number = {1}, + pages = {15-18}, + year = {1977}, + publisher = {Taylor & Francis}, + doi = {10.1080/00401706.1977.10489493} +} + +@book{iglewicz1993outliers, + title = {How to detect and handle outliers (Vol. 16)}, + publisher = {Asq Press}, + author = {Iglewicz, B. and Hoaglin, D. C}, + year = {1993} +} + +@article{gnanadesikan1972robust, + title = {Robust estimates, residuals, and outlier detection with multiresponse data}, + author = {Gnanadesikan, R. and Kettenring, J. R}, + doi = {10.2307/2528963}, + journal = {Biometrics}, + pages = {81-124}, + year = {1972} +} + +@article{hubert2018mcd, + author = {Hubert, Mia and Debruyne, Michiel and Rousseeuw, Peter J.}, + title = {Minimum covariance determinant and extensions}, + journal = {Wiley Interdisciplinary Reviews: Computational Statistics}, + volume = {10}, + number = {3}, + pages = {e1421}, + doi = {10.1002/wics.1421}, + url = {https://doi.org/10.1002/wics.1421}, + year = {2018} +} + +@article{tukey1963less, + title={Less vulnerable confidence and significance procedures for location based on a single sample: Trimming/Winsorization 1}, + author={Tukey, John W and McLaughlin, Donald H}, + journal={Sankhy{\=a}: The Indian Journal of Statistics, Series A}, + pages={331--352}, + year={1963}, + publisher={JSTOR} +} + +@article{van1995statistical, + title={Statistical mimicking of reaction time data: Single-process models, parameter variability, and mixtures}, + author={Van Zandt, Trisha and Ratcliff, Roger}, + journal={Psychonomic Bulletin \& Review}, + volume={2}, + number={1}, + pages={20--54}, + year={1995}, + publisher={Springer}, + doi = {10.3758/BF03214411} +} + +@article{ratcliff1993methods, + title={Methods for dealing with reaction time outliers.}, + author={Ratcliff, Roger}, + journal={Psychological bulletin}, + volume={114}, + number={3}, + pages={510}, + year={1993}, + 
publisher={American Psychological Association}, + doi = {10.1037/0033-2909.114.3.510} +} + +@book{mcelreath2020statistical, + title={Statistical rethinking: A Bayesian course with examples in {R} and Stan}, + author={McElreath, Richard}, + year={2020}, + publisher={CRC press} +} + +@Manual{rcore, + title = {{R}: A Language and Environment for Statistical Computing}, + author = {{R Core Team}}, + organization = {R Foundation for Statistical Computing}, + address = {Vienna, Austria}, + year = {2021}, + url = {https://www.R-project.org/} +} \ No newline at end of file diff --git a/papers/JOSE/paper.log b/papers/JOSE/paper.log new file mode 100644 index 000000000..6147576d4 --- /dev/null +++ b/papers/JOSE/paper.log @@ -0,0 +1,1133 @@ +This is XeTeX, Version 3.141592653-2.6-0.999995 (TeX Live 2023) (preloaded format=xelatex 2023.10.4) 5 OCT 2023 12:10 +entering extended mode + restricted \write18 enabled. + %&-line parsing enabled. +**paper.tex +(./paper.tex +LaTeX2e <2023-06-01> patch level 1 +L3 programming layer <2023-08-29> +(c:/Users/rempsyc/AppData/Roaming/TinyTeX/texmf-dist/tex/latex/base/article.cls +Document Class: article 2023/05/17 v1.4n Standard LaTeX document class +(c:/Users/rempsyc/AppData/Roaming/TinyTeX/texmf-dist/tex/latex/base/size10.clo +File: size10.clo 2023/05/17 v1.4n Standard LaTeX file (size option) +) +\c@part=\count181 +\c@section=\count182 +\c@subsection=\count183 +\c@subsubsection=\count184 +\c@paragraph=\count185 +\c@subparagraph=\count186 +\c@figure=\count187 +\c@table=\count188 +\abovecaptionskip=\skip48 +\belowcaptionskip=\skip49 +\bibindent=\dimen140 +) (c:/Users/rempsyc/AppData/Roaming/TinyTeX/texmf-dist/tex/latex/marginnote/marginnote.sty +Package: marginnote 2018/08/09 1.4b non floating margin notes for LaTeX +\c@mn@abspage=\count189 +) (c:/Users/rempsyc/AppData/Roaming/TinyTeX/texmf-dist/tex/latex/graphics/graphicx.sty +Package: graphicx 2021/09/16 v1.2d Enhanced LaTeX Graphics (DPC,SPQR) 
+(c:/Users/rempsyc/AppData/Roaming/TinyTeX/texmf-dist/tex/latex/graphics/keyval.sty +Package: keyval 2022/05/29 v1.15 key=value parser (DPC) +\KV@toks@=\toks17 +) (c:/Users/rempsyc/AppData/Roaming/TinyTeX/texmf-dist/tex/latex/graphics/graphics.sty +Package: graphics 2022/03/10 v1.4e Standard LaTeX Graphics (DPC,SPQR) +(c:/Users/rempsyc/AppData/Roaming/TinyTeX/texmf-dist/tex/latex/graphics/trig.sty +Package: trig 2021/08/11 v1.11 sin cos tan (DPC) +) (c:/Users/rempsyc/AppData/Roaming/TinyTeX/texmf-dist/tex/latex/graphics-cfg/graphics.cfg +File: graphics.cfg 2016/06/04 v1.11 sample graphics configuration +) +Package graphics Info: Driver file: xetex.def on input line 107. +(c:/Users/rempsyc/AppData/Roaming/TinyTeX/texmf-dist/tex/latex/graphics-def/xetex.def +File: xetex.def 2022/09/22 v5.0n Graphics/color driver for xetex +)) +\Gin@req@height=\dimen141 +\Gin@req@width=\dimen142 +) (c:/Users/rempsyc/AppData/Roaming/TinyTeX/texmf-dist/tex/latex/xcolor/xcolor.sty +Package: xcolor 2022/06/12 v2.14 LaTeX color extensions (UK) +(c:/Users/rempsyc/AppData/Roaming/TinyTeX/texmf-dist/tex/latex/graphics-cfg/color.cfg +File: color.cfg 2016/01/02 v1.6 sample color configuration +) +Package xcolor Info: Driver file: xetex.def on input line 227. +(c:/Users/rempsyc/AppData/Roaming/TinyTeX/texmf-dist/tex/latex/graphics/mathcolor.ltx) +Package xcolor Info: Model `cmy' substituted by `cmy0' on input line 1353. +Package xcolor Info: Model `RGB' extended on input line 1369. +Package xcolor Info: Model `HTML' substituted by `rgb' on input line 1371. +Package xcolor Info: Model `Hsb' substituted by `hsb' on input line 1372. +Package xcolor Info: Model `tHsb' substituted by `hsb' on input line 1373. +Package xcolor Info: Model `HSB' substituted by `hsb' on input line 1374. +Package xcolor Info: Model `Gray' substituted by `gray' on input line 1375. +Package xcolor Info: Model `wave' substituted by `hsb' on input line 1376. 
+) (c:/Users/rempsyc/AppData/Roaming/TinyTeX/texmf-dist/tex/latex/preprint/authblk.sty +Package: authblk 2001/02/27 1.3 (PWD) +\affilsep=\skip50 +\@affilsep=\skip51 +\c@Maxaffil=\count190 +\c@authors=\count191 +\c@affil=\count192 +) (c:/Users/rempsyc/AppData/Roaming/TinyTeX/texmf-dist/tex/latex/etoolbox/etoolbox.sty +Package: etoolbox 2020/10/05 v2.5k e-TeX tools for LaTeX (JAW) +\etb@tempcnta=\count193 +) (c:/Users/rempsyc/AppData/Roaming/TinyTeX/texmf-dist/tex/latex/titlesec/titlesec.sty +Package: titlesec 2021/07/05 v2.14 Sectioning titles +\ttl@box=\box51 +\beforetitleunit=\skip52 +\aftertitleunit=\skip53 +\ttl@plus=\dimen143 +\ttl@minus=\dimen144 +\ttl@toksa=\toks18 +\titlewidth=\dimen145 +\titlewidthlast=\dimen146 +\titlewidthfirst=\dimen147 +) (c:/Users/rempsyc/AppData/Roaming/TinyTeX/texmf-dist/tex/latex/tools/calc.sty +Package: calc 2017/05/25 v4.3 Infix arithmetic (KKT,FJ) +\calc@Acount=\count194 +\calc@Bcount=\count195 +\calc@Adimen=\dimen148 +\calc@Bdimen=\dimen149 +\calc@Askip=\skip54 +\calc@Bskip=\skip55 +LaTeX Info: Redefining \setlength on input line 80. +LaTeX Info: Redefining \addtolength on input line 81. 
+\calc@Ccount=\count196 +\calc@Cskip=\skip56 +) (c:/Users/rempsyc/AppData/Roaming/TinyTeX/texmf-dist/tex/latex/pgf/frontendlayer/tikz.sty (c:/Users/rempsyc/AppData/Roaming/TinyTeX/texmf-dist/tex/latex/pgf/basiclayer/pgf.sty (c:/Users/rempsyc/AppData/Roaming/TinyTeX/texmf-dist/tex/latex/pgf/utilities/pgfrcs.sty (c:/Users/rempsyc/AppData/Roaming/TinyTeX/texmf-dist/tex/generic/pgf/utilities/pgfutil-common.tex +\pgfutil@everybye=\toks19 +\pgfutil@tempdima=\dimen150 +\pgfutil@tempdimb=\dimen151 +) (c:/Users/rempsyc/AppData/Roaming/TinyTeX/texmf-dist/tex/generic/pgf/utilities/pgfutil-latex.def +\pgfutil@abb=\box52 +) (c:/Users/rempsyc/AppData/Roaming/TinyTeX/texmf-dist/tex/generic/pgf/utilities/pgfrcs.code.tex (c:/Users/rempsyc/AppData/Roaming/TinyTeX/texmf-dist/tex/generic/pgf/pgf.revision.tex) +Package: pgfrcs 2023-01-15 v3.1.10 (3.1.10) +)) +Package: pgf 2023-01-15 v3.1.10 (3.1.10) +(c:/Users/rempsyc/AppData/Roaming/TinyTeX/texmf-dist/tex/latex/pgf/basiclayer/pgfcore.sty (c:/Users/rempsyc/AppData/Roaming/TinyTeX/texmf-dist/tex/latex/pgf/systemlayer/pgfsys.sty (c:/Users/rempsyc/AppData/Roaming/TinyTeX/texmf-dist/tex/generic/pgf/systemlayer/pgfsys.code.tex +Package: pgfsys 2023-01-15 v3.1.10 (3.1.10) +(c:/Users/rempsyc/AppData/Roaming/TinyTeX/texmf-dist/tex/generic/pgf/utilities/pgfkeys.code.tex +\pgfkeys@pathtoks=\toks20 +\pgfkeys@temptoks=\toks21 +(c:/Users/rempsyc/AppData/Roaming/TinyTeX/texmf-dist/tex/generic/pgf/utilities/pgfkeyslibraryfiltered.code.tex +\pgfkeys@tmptoks=\toks22 +)) +\pgf@x=\dimen152 +\pgf@y=\dimen153 +\pgf@xa=\dimen154 +\pgf@ya=\dimen155 +\pgf@xb=\dimen156 +\pgf@yb=\dimen157 +\pgf@xc=\dimen158 +\pgf@yc=\dimen159 +\pgf@xd=\dimen160 +\pgf@yd=\dimen161 +\w@pgf@writea=\write3 +\r@pgf@reada=\read2 +\c@pgf@counta=\count197 +\c@pgf@countb=\count198 +\c@pgf@countc=\count199 +\c@pgf@countd=\count266 +\t@pgf@toka=\toks23 +\t@pgf@tokb=\toks24 +\t@pgf@tokc=\toks25 +\pgf@sys@id@count=\count267 
+(c:/Users/rempsyc/AppData/Roaming/TinyTeX/texmf-dist/tex/generic/pgf/systemlayer/pgf.cfg +File: pgf.cfg 2023-01-15 v3.1.10 (3.1.10) +) +Driver file for pgf: pgfsys-xetex.def +(c:/Users/rempsyc/AppData/Roaming/TinyTeX/texmf-dist/tex/generic/pgf/systemlayer/pgfsys-xetex.def +File: pgfsys-xetex.def 2023-01-15 v3.1.10 (3.1.10) +(c:/Users/rempsyc/AppData/Roaming/TinyTeX/texmf-dist/tex/generic/pgf/systemlayer/pgfsys-dvipdfmx.def +File: pgfsys-dvipdfmx.def 2023-01-15 v3.1.10 (3.1.10) +(c:/Users/rempsyc/AppData/Roaming/TinyTeX/texmf-dist/tex/generic/pgf/systemlayer/pgfsys-common-pdf.def +File: pgfsys-common-pdf.def 2023-01-15 v3.1.10 (3.1.10) +) +\pgfsys@objnum=\count268 +))) (c:/Users/rempsyc/AppData/Roaming/TinyTeX/texmf-dist/tex/generic/pgf/systemlayer/pgfsyssoftpath.code.tex +File: pgfsyssoftpath.code.tex 2023-01-15 v3.1.10 (3.1.10) +\pgfsyssoftpath@smallbuffer@items=\count269 +\pgfsyssoftpath@bigbuffer@items=\count270 +) (c:/Users/rempsyc/AppData/Roaming/TinyTeX/texmf-dist/tex/generic/pgf/systemlayer/pgfsysprotocol.code.tex +File: pgfsysprotocol.code.tex 2023-01-15 v3.1.10 (3.1.10) +)) (c:/Users/rempsyc/AppData/Roaming/TinyTeX/texmf-dist/tex/generic/pgf/basiclayer/pgfcore.code.tex +Package: pgfcore 2023-01-15 v3.1.10 (3.1.10) +(c:/Users/rempsyc/AppData/Roaming/TinyTeX/texmf-dist/tex/generic/pgf/math/pgfmath.code.tex (c:/Users/rempsyc/AppData/Roaming/TinyTeX/texmf-dist/tex/generic/pgf/math/pgfmathutil.code.tex) (c:/Users/rempsyc/AppData/Roaming/TinyTeX/texmf-dist/tex/generic/pgf/math/pgfmathparser.code.tex +\pgfmath@dimen=\dimen162 +\pgfmath@count=\count271 +\pgfmath@box=\box53 +\pgfmath@toks=\toks26 +\pgfmath@stack@operand=\toks27 +\pgfmath@stack@operation=\toks28 +) (c:/Users/rempsyc/AppData/Roaming/TinyTeX/texmf-dist/tex/generic/pgf/math/pgfmathfunctions.code.tex) (c:/Users/rempsyc/AppData/Roaming/TinyTeX/texmf-dist/tex/generic/pgf/math/pgfmathfunctions.basic.code.tex) 
(c:/Users/rempsyc/AppData/Roaming/TinyTeX/texmf-dist/tex/generic/pgf/math/pgfmathfunctions.trigonometric.code.tex) (c:/Users/rempsyc/AppData/Roaming/TinyTeX/texmf-dist/tex/generic/pgf/math/pgfmathfunctions.random.code.tex) (c:/Users/rempsyc/AppData/Roaming/TinyTeX/texmf-dist/tex/generic/pgf/math/pgfmathfunctions.comparison.code.tex) (c:/Users/rempsyc/AppData/Roaming/TinyTeX/texmf-dist/tex/generic/pgf/math/pgfmathfunctions.base.code.tex) (c:/Users/rempsyc/AppData/Roaming/TinyTeX/texmf-dist/tex/generic/pgf/math/pgfmathfunctions.round.code.tex) (c:/Users/rempsyc/AppData/Roaming/TinyTeX/texmf-dist/tex/generic/pgf/math/pgfmathfunctions.misc.code.tex) (c:/Users/rempsyc/AppData/Roaming/TinyTeX/texmf-dist/tex/generic/pgf/math/pgfmathfunctions.integerarithmetics.code.tex) (c:/Users/rempsyc/AppData/Roaming/TinyTeX/texmf-dist/tex/generic/pgf/math/pgfmathcalc.code.tex) (c:/Users/rempsyc/AppData/Roaming/TinyTeX/texmf-dist/tex/generic/pgf/math/pgfmathfloat.code.tex +\c@pgfmathroundto@lastzeros=\count272 +)) (c:/Users/rempsyc/AppData/Roaming/TinyTeX/texmf-dist/tex/generic/pgf/math/pgfint.code.tex) (c:/Users/rempsyc/AppData/Roaming/TinyTeX/texmf-dist/tex/generic/pgf/basiclayer/pgfcorepoints.code.tex +File: pgfcorepoints.code.tex 2023-01-15 v3.1.10 (3.1.10) +\pgf@picminx=\dimen163 +\pgf@picmaxx=\dimen164 +\pgf@picminy=\dimen165 +\pgf@picmaxy=\dimen166 +\pgf@pathminx=\dimen167 +\pgf@pathmaxx=\dimen168 +\pgf@pathminy=\dimen169 +\pgf@pathmaxy=\dimen170 +\pgf@xx=\dimen171 +\pgf@xy=\dimen172 +\pgf@yx=\dimen173 +\pgf@yy=\dimen174 +\pgf@zx=\dimen175 +\pgf@zy=\dimen176 +) (c:/Users/rempsyc/AppData/Roaming/TinyTeX/texmf-dist/tex/generic/pgf/basiclayer/pgfcorepathconstruct.code.tex +File: pgfcorepathconstruct.code.tex 2023-01-15 v3.1.10 (3.1.10) +\pgf@path@lastx=\dimen177 +\pgf@path@lasty=\dimen178 +) (c:/Users/rempsyc/AppData/Roaming/TinyTeX/texmf-dist/tex/generic/pgf/basiclayer/pgfcorepathusage.code.tex +File: pgfcorepathusage.code.tex 2023-01-15 v3.1.10 (3.1.10) 
+\pgf@shorten@end@additional=\dimen179 +\pgf@shorten@start@additional=\dimen180 +) (c:/Users/rempsyc/AppData/Roaming/TinyTeX/texmf-dist/tex/generic/pgf/basiclayer/pgfcorescopes.code.tex +File: pgfcorescopes.code.tex 2023-01-15 v3.1.10 (3.1.10) +\pgfpic=\box54 +\pgf@hbox=\box55 +\pgf@layerbox@main=\box56 +\pgf@picture@serial@count=\count273 +) (c:/Users/rempsyc/AppData/Roaming/TinyTeX/texmf-dist/tex/generic/pgf/basiclayer/pgfcoregraphicstate.code.tex +File: pgfcoregraphicstate.code.tex 2023-01-15 v3.1.10 (3.1.10) +\pgflinewidth=\dimen181 +) (c:/Users/rempsyc/AppData/Roaming/TinyTeX/texmf-dist/tex/generic/pgf/basiclayer/pgfcoretransformations.code.tex +File: pgfcoretransformations.code.tex 2023-01-15 v3.1.10 (3.1.10) +\pgf@pt@x=\dimen182 +\pgf@pt@y=\dimen183 +\pgf@pt@temp=\dimen184 +) (c:/Users/rempsyc/AppData/Roaming/TinyTeX/texmf-dist/tex/generic/pgf/basiclayer/pgfcorequick.code.tex +File: pgfcorequick.code.tex 2023-01-15 v3.1.10 (3.1.10) +) (c:/Users/rempsyc/AppData/Roaming/TinyTeX/texmf-dist/tex/generic/pgf/basiclayer/pgfcoreobjects.code.tex +File: pgfcoreobjects.code.tex 2023-01-15 v3.1.10 (3.1.10) +) (c:/Users/rempsyc/AppData/Roaming/TinyTeX/texmf-dist/tex/generic/pgf/basiclayer/pgfcorepathprocessing.code.tex +File: pgfcorepathprocessing.code.tex 2023-01-15 v3.1.10 (3.1.10) +) (c:/Users/rempsyc/AppData/Roaming/TinyTeX/texmf-dist/tex/generic/pgf/basiclayer/pgfcorearrows.code.tex +File: pgfcorearrows.code.tex 2023-01-15 v3.1.10 (3.1.10) +\pgfarrowsep=\dimen185 +) (c:/Users/rempsyc/AppData/Roaming/TinyTeX/texmf-dist/tex/generic/pgf/basiclayer/pgfcoreshade.code.tex +File: pgfcoreshade.code.tex 2023-01-15 v3.1.10 (3.1.10) +\pgf@max=\dimen186 +\pgf@sys@shading@range@num=\count274 +\pgf@shadingcount=\count275 +) (c:/Users/rempsyc/AppData/Roaming/TinyTeX/texmf-dist/tex/generic/pgf/basiclayer/pgfcoreimage.code.tex +File: pgfcoreimage.code.tex 2023-01-15 v3.1.10 (3.1.10) +) 
(c:/Users/rempsyc/AppData/Roaming/TinyTeX/texmf-dist/tex/generic/pgf/basiclayer/pgfcoreexternal.code.tex +File: pgfcoreexternal.code.tex 2023-01-15 v3.1.10 (3.1.10) +\pgfexternal@startupbox=\box57 +) (c:/Users/rempsyc/AppData/Roaming/TinyTeX/texmf-dist/tex/generic/pgf/basiclayer/pgfcorelayers.code.tex +File: pgfcorelayers.code.tex 2023-01-15 v3.1.10 (3.1.10) +) (c:/Users/rempsyc/AppData/Roaming/TinyTeX/texmf-dist/tex/generic/pgf/basiclayer/pgfcoretransparency.code.tex +File: pgfcoretransparency.code.tex 2023-01-15 v3.1.10 (3.1.10) +) (c:/Users/rempsyc/AppData/Roaming/TinyTeX/texmf-dist/tex/generic/pgf/basiclayer/pgfcorepatterns.code.tex +File: pgfcorepatterns.code.tex 2023-01-15 v3.1.10 (3.1.10) +) (c:/Users/rempsyc/AppData/Roaming/TinyTeX/texmf-dist/tex/generic/pgf/basiclayer/pgfcorerdf.code.tex +File: pgfcorerdf.code.tex 2023-01-15 v3.1.10 (3.1.10) +))) (c:/Users/rempsyc/AppData/Roaming/TinyTeX/texmf-dist/tex/generic/pgf/modules/pgfmoduleshapes.code.tex +File: pgfmoduleshapes.code.tex 2023-01-15 v3.1.10 (3.1.10) +\pgfnodeparttextbox=\box58 +) (c:/Users/rempsyc/AppData/Roaming/TinyTeX/texmf-dist/tex/generic/pgf/modules/pgfmoduleplot.code.tex +File: pgfmoduleplot.code.tex 2023-01-15 v3.1.10 (3.1.10) +) (c:/Users/rempsyc/AppData/Roaming/TinyTeX/texmf-dist/tex/latex/pgf/compatibility/pgfcomp-version-0-65.sty +Package: pgfcomp-version-0-65 2023-01-15 v3.1.10 (3.1.10) +\pgf@nodesepstart=\dimen187 +\pgf@nodesepend=\dimen188 +) (c:/Users/rempsyc/AppData/Roaming/TinyTeX/texmf-dist/tex/latex/pgf/compatibility/pgfcomp-version-1-18.sty +Package: pgfcomp-version-1-18 2023-01-15 v3.1.10 (3.1.10) +)) (c:/Users/rempsyc/AppData/Roaming/TinyTeX/texmf-dist/tex/latex/pgf/utilities/pgffor.sty (c:/Users/rempsyc/AppData/Roaming/TinyTeX/texmf-dist/tex/latex/pgf/utilities/pgfkeys.sty (c:/Users/rempsyc/AppData/Roaming/TinyTeX/texmf-dist/tex/generic/pgf/utilities/pgfkeys.code.tex)) (c:/Users/rempsyc/AppData/Roaming/TinyTeX/texmf-dist/tex/latex/pgf/math/pgfmath.sty 
(c:/Users/rempsyc/AppData/Roaming/TinyTeX/texmf-dist/tex/generic/pgf/math/pgfmath.code.tex)) (c:/Users/rempsyc/AppData/Roaming/TinyTeX/texmf-dist/tex/generic/pgf/utilities/pgffor.code.tex +Package: pgffor 2023-01-15 v3.1.10 (3.1.10) +\pgffor@iter=\dimen189 +\pgffor@skip=\dimen190 +\pgffor@stack=\toks29 +\pgffor@toks=\toks30 +)) (c:/Users/rempsyc/AppData/Roaming/TinyTeX/texmf-dist/tex/generic/pgf/frontendlayer/tikz/tikz.code.tex +Package: tikz 2023-01-15 v3.1.10 (3.1.10) +(c:/Users/rempsyc/AppData/Roaming/TinyTeX/texmf-dist/tex/generic/pgf/libraries/pgflibraryplothandlers.code.tex +File: pgflibraryplothandlers.code.tex 2023-01-15 v3.1.10 (3.1.10) +\pgf@plot@mark@count=\count276 +\pgfplotmarksize=\dimen191 +) +\tikz@lastx=\dimen192 +\tikz@lasty=\dimen193 +\tikz@lastxsaved=\dimen194 +\tikz@lastysaved=\dimen195 +\tikz@lastmovetox=\dimen196 +\tikz@lastmovetoy=\dimen197 +\tikzleveldistance=\dimen198 +\tikzsiblingdistance=\dimen199 +\tikz@figbox=\box59 +\tikz@figbox@bg=\box60 +\tikz@tempbox=\box61 +\tikz@tempbox@bg=\box62 +\tikztreelevel=\count277 +\tikznumberofchildren=\count278 +\tikznumberofcurrentchild=\count279 +\tikz@fig@count=\count280 +(c:/Users/rempsyc/AppData/Roaming/TinyTeX/texmf-dist/tex/generic/pgf/modules/pgfmodulematrix.code.tex +File: pgfmodulematrix.code.tex 2023-01-15 v3.1.10 (3.1.10) +\pgfmatrixcurrentrow=\count281 +\pgfmatrixcurrentcolumn=\count282 +\pgf@matrix@numberofcolumns=\count283 +) +\tikz@expandcount=\count284 +(c:/Users/rempsyc/AppData/Roaming/TinyTeX/texmf-dist/tex/generic/pgf/frontendlayer/tikz/libraries/tikzlibrarytopaths.code.tex +File: tikzlibrarytopaths.code.tex 2023-01-15 v3.1.10 (3.1.10) +))) (c:/Users/rempsyc/AppData/Roaming/TinyTeX/texmf-dist/tex/latex/hyperref/hyperref.sty +Package: hyperref 2023-07-08 v7.01b Hypertext links for LaTeX +(c:/Users/rempsyc/AppData/Roaming/TinyTeX/texmf-dist/tex/generic/ltxcmds/ltxcmds.sty +Package: ltxcmds 2020-05-10 v1.25 LaTeX kernel commands for general use (HO) +) 
(c:/Users/rempsyc/AppData/Roaming/TinyTeX/texmf-dist/tex/generic/iftex/iftex.sty +Package: iftex 2022/02/03 v1.0f TeX engine tests +) (c:/Users/rempsyc/AppData/Roaming/TinyTeX/texmf-dist/tex/generic/pdftexcmds/pdftexcmds.sty +Package: pdftexcmds 2020-06-27 v0.33 Utility functions of pdfTeX for LuaTeX (HO) +(c:/Users/rempsyc/AppData/Roaming/TinyTeX/texmf-dist/tex/generic/infwarerr/infwarerr.sty +Package: infwarerr 2019/12/03 v1.5 Providing info/warning/error messages (HO) +) +Package pdftexcmds Info: \pdf@primitive is available. +Package pdftexcmds Info: \pdf@ifprimitive is available. +Package pdftexcmds Info: \pdfdraftmode not found. +) (c:/Users/rempsyc/AppData/Roaming/TinyTeX/texmf-dist/tex/latex/kvsetkeys/kvsetkeys.sty +Package: kvsetkeys 2022-10-05 v1.19 Key value parser (HO) +) (c:/Users/rempsyc/AppData/Roaming/TinyTeX/texmf-dist/tex/generic/kvdefinekeys/kvdefinekeys.sty +Package: kvdefinekeys 2019-12-19 v1.6 Define keys (HO) +) (c:/Users/rempsyc/AppData/Roaming/TinyTeX/texmf-dist/tex/generic/pdfescape/pdfescape.sty +Package: pdfescape 2019/12/09 v1.15 Implements pdfTeX's escape features (HO) +) (c:/Users/rempsyc/AppData/Roaming/TinyTeX/texmf-dist/tex/latex/hycolor/hycolor.sty +Package: hycolor 2020-01-27 v1.10 Color options for hyperref/bookmark (HO) +) (c:/Users/rempsyc/AppData/Roaming/TinyTeX/texmf-dist/tex/latex/letltxmacro/letltxmacro.sty +Package: letltxmacro 2019/12/03 v1.6 Let assignment for LaTeX macros (HO) +) (c:/Users/rempsyc/AppData/Roaming/TinyTeX/texmf-dist/tex/latex/auxhook/auxhook.sty +Package: auxhook 2019-12-17 v1.6 Hooks for auxiliary files (HO) +) (c:/Users/rempsyc/AppData/Roaming/TinyTeX/texmf-dist/tex/latex/hyperref/nameref.sty +Package: nameref 2023-08-07 v2.53 Cross-referencing by name of section +(c:/Users/rempsyc/AppData/Roaming/TinyTeX/texmf-dist/tex/latex/refcount/refcount.sty +Package: refcount 2019/12/15 v3.6 Data extraction from label references (HO) +) 
(c:/Users/rempsyc/AppData/Roaming/TinyTeX/texmf-dist/tex/generic/gettitlestring/gettitlestring.sty +Package: gettitlestring 2019/12/15 v1.6 Cleanup title references (HO) +(c:/Users/rempsyc/AppData/Roaming/TinyTeX/texmf-dist/tex/latex/kvoptions/kvoptions.sty +Package: kvoptions 2022-06-15 v3.15 Key value format for package options (HO) +)) +\c@section@level=\count285 +) +\@linkdim=\dimen256 +\Hy@linkcounter=\count286 +\Hy@pagecounter=\count287 +(c:/Users/rempsyc/AppData/Roaming/TinyTeX/texmf-dist/tex/latex/hyperref/pd1enc.def +File: pd1enc.def 2023-07-08 v7.01b Hyperref: PDFDocEncoding definition (HO) +) (c:/Users/rempsyc/AppData/Roaming/TinyTeX/texmf-dist/tex/generic/intcalc/intcalc.sty +Package: intcalc 2019/12/15 v1.3 Expandable calculations with integers (HO) +) +\Hy@SavedSpaceFactor=\count288 +(c:/Users/rempsyc/AppData/Roaming/TinyTeX/texmf-dist/tex/latex/hyperref/puenc.def +File: puenc.def 2023-07-08 v7.01b Hyperref: PDF Unicode definition (HO) +) +Package hyperref Info: Hyper figures OFF on input line 4167. +Package hyperref Info: Link nesting OFF on input line 4172. +Package hyperref Info: Hyper index ON on input line 4175. +Package hyperref Info: Plain pages OFF on input line 4182. +Package hyperref Info: Backreferencing OFF on input line 4187. +Package hyperref Info: Implicit mode ON; LaTeX internals redefined. +Package hyperref Info: Bookmarks ON on input line 4434. +\c@Hy@tempcnt=\count289 +(c:/Users/rempsyc/AppData/Roaming/TinyTeX/texmf-dist/tex/latex/url/url.sty +\Urlmuskip=\muskip16 +Package: url 2013/09/16 ver 3.4 Verb mode for urls, etc. +) +LaTeX Info: Redefining \url on input line 4772. 
+\XeTeXLinkMargin=\dimen257 +(c:/Users/rempsyc/AppData/Roaming/TinyTeX/texmf-dist/tex/generic/bitset/bitset.sty +Package: bitset 2019/12/09 v1.3 Handle bit-vector datatype (HO) +(c:/Users/rempsyc/AppData/Roaming/TinyTeX/texmf-dist/tex/generic/bigintcalc/bigintcalc.sty +Package: bigintcalc 2019/12/15 v1.5 Expandable calculations on big integers (HO) +)) +\Fld@menulength=\count290 +\Field@Width=\dimen258 +\Fld@charsize=\dimen259 +Package hyperref Info: Hyper figures OFF on input line 6051. +Package hyperref Info: Link nesting OFF on input line 6056. +Package hyperref Info: Hyper index ON on input line 6059. +Package hyperref Info: backreferencing OFF on input line 6066. +Package hyperref Info: Link coloring OFF on input line 6071. +Package hyperref Info: Link coloring with OCG OFF on input line 6076. +Package hyperref Info: PDF/A mode OFF on input line 6081. +(c:/Users/rempsyc/AppData/Roaming/TinyTeX/texmf-dist/tex/latex/base/atbegshi-ltx.sty +Package: atbegshi-ltx 2021/01/10 v1.0c Emulation of the original atbegshi +package with kernel methods +) +\Hy@abspage=\count291 +\c@Item=\count292 +\c@Hfootnote=\count293 +) +Package hyperref Info: Driver (autodetected): hxetex. +(c:/Users/rempsyc/AppData/Roaming/TinyTeX/texmf-dist/tex/latex/hyperref/hxetex.def +File: hxetex.def 2023-07-08 v7.01b Hyperref driver for XeTeX +(c:/Users/rempsyc/AppData/Roaming/TinyTeX/texmf-dist/tex/generic/stringenc/stringenc.sty +Package: stringenc 2019/11/29 v1.12 Convert strings between diff. 
encodings (HO) +) +\pdfm@box=\box63 +\c@Hy@AnnotLevel=\count294 +\HyField@AnnotCount=\count295 +\Fld@listcount=\count296 +\c@bookmark@seq@number=\count297 +(c:/Users/rempsyc/AppData/Roaming/TinyTeX/texmf-dist/tex/latex/rerunfilecheck/rerunfilecheck.sty +Package: rerunfilecheck 2022-07-10 v1.10 Rerun checks for auxiliary files (HO) +(c:/Users/rempsyc/AppData/Roaming/TinyTeX/texmf-dist/tex/latex/base/atveryend-ltx.sty +Package: atveryend-ltx 2020/08/19 v1.0a Emulation of the original atveryend package +with kernel methods +) (c:/Users/rempsyc/AppData/Roaming/TinyTeX/texmf-dist/tex/generic/uniquecounter/uniquecounter.sty +Package: uniquecounter 2019/12/15 v1.4 Provide unlimited unique counter (HO) +) +Package uniquecounter Info: New unique counter `rerunfilecheck' on input line 285. +) +\Hy@SectionHShift=\skip57 +) +Package hyperref Info: Option `colorlinks' set `true' on input line 12. +Package hyperref Info: Option `breaklinks' set `true' on input line 12. +(c:/Users/rempsyc/AppData/Roaming/TinyTeX/texmf-dist/tex/latex/caption/caption.sty +Package: caption 2023/08/05 v3.6o Customizing captions (AR) +(c:/Users/rempsyc/AppData/Roaming/TinyTeX/texmf-dist/tex/latex/caption/caption3.sty +Package: caption3 2023/07/31 v2.4d caption3 kernel (AR) +\caption@tempdima=\dimen260 +\captionmargin=\dimen261 +\caption@leftmargin=\dimen262 +\caption@rightmargin=\dimen263 +\caption@width=\dimen264 +\caption@indent=\dimen265 +\caption@parindent=\dimen266 +\caption@hangindent=\dimen267 +Package caption Info: Standard document class detected. +) +\c@caption@flags=\count298 +\c@continuedfloat=\count299 +Package caption Info: hyperref package is loaded. 
+) (c:/Users/rempsyc/AppData/Roaming/TinyTeX/texmf-dist/tex/latex/tcolorbox/tcolorbox.sty +Package: tcolorbox 2023/09/26 version 6.1.0 text color boxes +(c:/Users/rempsyc/AppData/Roaming/TinyTeX/texmf-dist/tex/latex/tools/verbatim.sty +Package: verbatim 2022-07-02 v1.5u LaTeX2e package for verbatim enhancements +\every@verbatim=\toks31 +\verbatim@line=\toks32 +\verbatim@in@stream=\read3 +) (c:/Users/rempsyc/AppData/Roaming/TinyTeX/texmf-dist/tex/latex/environ/environ.sty +Package: environ 2014/05/04 v0.3 A new way to define environments +(c:/Users/rempsyc/AppData/Roaming/TinyTeX/texmf-dist/tex/latex/trimspaces/trimspaces.sty +Package: trimspaces 2009/09/17 v1.1 Trim spaces around a token list +) +\@envbody=\toks33 +) +\tcb@titlebox=\box64 +\tcb@upperbox=\box65 +\tcb@lowerbox=\box66 +\tcb@phantombox=\box67 +\c@tcbbreakpart=\count300 +\c@tcblayer=\count301 +\c@tcolorbox@number=\count302 +\tcb@temp=\box68 +\tcb@temp=\box69 +\tcb@temp=\box70 +\tcb@temp=\box71 +) (c:/Users/rempsyc/AppData/Roaming/TinyTeX/texmf-dist/tex/latex/amsfonts/amssymb.sty +Package: amssymb 2013/01/14 v3.01 AMS font symbols +(c:/Users/rempsyc/AppData/Roaming/TinyTeX/texmf-dist/tex/latex/amsfonts/amsfonts.sty +Package: amsfonts 2013/01/14 v3.01 Basic AMSFonts support +\@emptytoks=\toks34 +\symAMSa=\mathgroup4 +\symAMSb=\mathgroup5 +LaTeX Font Info: Redeclaring math symbol \hbar on input line 98. +LaTeX Font Info: Overwriting math alphabet `\mathfrak' in version `bold' +(Font) U/euf/m/n --> U/euf/b/n on input line 106. +)) (c:/Users/rempsyc/AppData/Roaming/TinyTeX/texmf-dist/tex/latex/amsmath/amsmath.sty +Package: amsmath 2023/05/13 v2.17o AMS math features +\@mathmargin=\skip58 +For additional information on amsmath, use the `?' option. 
+(c:/Users/rempsyc/AppData/Roaming/TinyTeX/texmf-dist/tex/latex/amsmath/amstext.sty +Package: amstext 2021/08/26 v2.01 AMS text +(c:/Users/rempsyc/AppData/Roaming/TinyTeX/texmf-dist/tex/latex/amsmath/amsgen.sty +File: amsgen.sty 1999/11/30 v2.0 generic functions +\@emptytoks=\toks35 +\ex@=\dimen268 +)) (c:/Users/rempsyc/AppData/Roaming/TinyTeX/texmf-dist/tex/latex/amsmath/amsbsy.sty +Package: amsbsy 1999/11/29 v1.2d Bold Symbols +\pmbraise@=\dimen269 +) (c:/Users/rempsyc/AppData/Roaming/TinyTeX/texmf-dist/tex/latex/amsmath/amsopn.sty +Package: amsopn 2022/04/08 v2.04 operator names +) +\inf@bad=\count303 +LaTeX Info: Redefining \frac on input line 234. +\uproot@=\count304 +\leftroot@=\count305 +LaTeX Info: Redefining \overline on input line 399. +LaTeX Info: Redefining \colon on input line 410. +\classnum@=\count306 +\DOTSCASE@=\count307 +LaTeX Info: Redefining \ldots on input line 496. +LaTeX Info: Redefining \dots on input line 499. +LaTeX Info: Redefining \cdots on input line 620. +\Mathstrutbox@=\box72 +\strutbox@=\box73 +LaTeX Info: Redefining \big on input line 722. +LaTeX Info: Redefining \Big on input line 723. +LaTeX Info: Redefining \bigg on input line 724. +LaTeX Info: Redefining \Bigg on input line 725. +\big@size=\dimen270 +LaTeX Font Info: Redeclaring font encoding OML on input line 743. +LaTeX Font Info: Redeclaring font encoding OMS on input line 744. +\macc@depth=\count308 +LaTeX Info: Redefining \bmod on input line 905. +LaTeX Info: Redefining \pmod on input line 910. +LaTeX Info: Redefining \smash on input line 940. +LaTeX Info: Redefining \relbar on input line 970. +LaTeX Info: Redefining \Relbar on input line 971. 
+\c@MaxMatrixCols=\count309 +\dotsspace@=\muskip17 +\c@parentequation=\count310 +\dspbrk@lvl=\count311 +\tag@help=\toks36 +\row@=\count312 +\column@=\count313 +\maxfields@=\count314 +\andhelp@=\toks37 +\eqnshift@=\dimen271 +\alignsep@=\dimen272 +\tagshift@=\dimen273 +\tagwidth@=\dimen274 +\totwidth@=\dimen275 +\lineht@=\dimen276 +\@envbody=\toks38 +\multlinegap=\skip59 +\multlinetaggap=\skip60 +\mathdisplay@stack=\toks39 +LaTeX Info: Redefining \[ on input line 2953. +LaTeX Info: Redefining \] on input line 2954. +) (c:/Users/rempsyc/AppData/Roaming/TinyTeX/texmf-dist/tex/generic/iftex/ifxetex.sty +Package: ifxetex 2019/10/25 v0.7 ifxetex legacy package. Use iftex instead. +) (c:/Users/rempsyc/AppData/Roaming/TinyTeX/texmf-dist/tex/generic/iftex/ifluatex.sty +Package: ifluatex 2019/10/25 v1.5 ifluatex legacy package. Use iftex instead. +) (c:/Users/rempsyc/AppData/Roaming/TinyTeX/texmf-dist/tex/latex/seqsplit/seqsplit.sty +Package: seqsplit 2006/08/07 v0.1 Splitting long sequences (DNA, RNA, proteins, etc.) +) (c:/Users/rempsyc/AppData/Roaming/TinyTeX/texmf-dist/tex/latex/base/fixltx2e.sty +Package: fixltx2e 2016/12/29 v2.1a fixes to LaTeX (obsolete) +Applying: [2015/01/01] Old fixltx2e package on input line 46. + +Package fixltx2e Warning: fixltx2e is not required with releases after 2015 +(fixltx2e) All fixes are now in the LaTeX kernel. +(fixltx2e) See the latexrelease package for details. + +Already applied: [0000/00/00] Old fixltx2e package on input line 53. 
+) (c:/Users/rempsyc/AppData/Roaming/TinyTeX/texmf-dist/tex/latex/biblatex/biblatex.sty +Package: biblatex 2023/03/05 v3.19 programmable bibliographies (PK/MW) +(c:/Users/rempsyc/AppData/Roaming/TinyTeX/texmf-dist/tex/latex/logreq/logreq.sty +Package: logreq 2010/08/04 v1.0 xml request logger +\lrq@indent=\count315 +(c:/Users/rempsyc/AppData/Roaming/TinyTeX/texmf-dist/tex/latex/logreq/logreq.def +File: logreq.def 2010/08/04 v1.0 logreq spec v1.0 +)) (c:/Users/rempsyc/AppData/Roaming/TinyTeX/texmf-dist/tex/latex/base/ifthen.sty +Package: ifthen 2022/04/13 v1.1d Standard LaTeX ifthen package (DPC) +) +\c@tabx@nest=\count316 +\c@listtotal=\count317 +\c@listcount=\count318 +\c@liststart=\count319 +\c@liststop=\count320 +\c@citecount=\count321 +\c@citetotal=\count322 +\c@multicitecount=\count323 +\c@multicitetotal=\count324 +\c@instcount=\count325 +\c@maxnames=\count326 +\c@minnames=\count327 +\c@maxitems=\count328 +\c@minitems=\count329 +\c@citecounter=\count330 +\c@maxcitecounter=\count331 +\c@savedcitecounter=\count332 +\c@uniquelist=\count333 +\c@uniquename=\count334 +\c@refsection=\count335 +\c@refsegment=\count336 +\c@maxextratitle=\count337 +\c@maxextratitleyear=\count338 +\c@maxextraname=\count339 +\c@maxextradate=\count340 +\c@maxextraalpha=\count341 +\c@abbrvpenalty=\count342 +\c@highnamepenalty=\count343 +\c@lownamepenalty=\count344 +\c@maxparens=\count345 +\c@parenlevel=\count346 +\blx@tempcnta=\count347 +\blx@tempcntb=\count348 +\blx@tempcntc=\count349 +\c@blx@maxsection=\count350 +\blx@maxsegment@0=\count351 +\blx@notetype=\count352 +\blx@parenlevel@text=\count353 +\blx@parenlevel@foot=\count354 +\blx@sectionciteorder@0=\count355 +\blx@sectionciteorderinternal@0=\count356 +\blx@entrysetcounter=\count357 +\blx@biblioinstance=\count358 +\labelnumberwidth=\skip61 +\labelalphawidth=\skip62 +\biblabelsep=\skip63 +\bibitemsep=\skip64 +\bibnamesep=\skip65 +\bibinitsep=\skip66 +\bibparsep=\skip67 +\bibhang=\skip68 +\blx@bcfin=\read4 +\blx@bcfout=\write4 
+\blx@langwohyphens=\language3 +\c@mincomprange=\count359 +\c@maxcomprange=\count360 +\c@mincompwidth=\count361 +Package biblatex Info: Trying to load biblatex default data model... +Package biblatex Info: ... file 'blx-dm.def' found. +(c:/Users/rempsyc/AppData/Roaming/TinyTeX/texmf-dist/tex/latex/biblatex/blx-dm.def +File: blx-dm.def 2023/03/05 v3.19 biblatex localization (PK/MW) +) +Package biblatex Info: Trying to load biblatex custom data model... +Package biblatex Info: ... file 'biblatex-dm.cfg' not found. +\c@afterword=\count362 +\c@savedafterword=\count363 +\c@annotator=\count364 +\c@savedannotator=\count365 +\c@author=\count366 +\c@savedauthor=\count367 +\c@bookauthor=\count368 +\c@savedbookauthor=\count369 +\c@commentator=\count370 +\c@savedcommentator=\count371 +\c@editor=\count372 +\c@savededitor=\count373 +\c@editora=\count374 +\c@savededitora=\count375 +\c@editorb=\count376 +\c@savededitorb=\count377 +\c@editorc=\count378 +\c@savededitorc=\count379 +\c@foreword=\count380 +\c@savedforeword=\count381 +\c@holder=\count382 +\c@savedholder=\count383 +\c@introduction=\count384 +\c@savedintroduction=\count385 +\c@namea=\count386 +\c@savednamea=\count387 +\c@nameb=\count388 +\c@savednameb=\count389 +\c@namec=\count390 +\c@savednamec=\count391 +\c@translator=\count392 +\c@savedtranslator=\count393 +\c@shortauthor=\count394 +\c@savedshortauthor=\count395 +\c@shorteditor=\count396 +\c@savedshorteditor=\count397 +\c@labelname=\count398 +\c@savedlabelname=\count399 +\c@institution=\count400 +\c@savedinstitution=\count401 +\c@lista=\count402 +\c@savedlista=\count403 +\c@listb=\count404 +\c@savedlistb=\count405 +\c@listc=\count406 +\c@savedlistc=\count407 +\c@listd=\count408 +\c@savedlistd=\count409 +\c@liste=\count410 +\c@savedliste=\count411 +\c@listf=\count412 +\c@savedlistf=\count413 +\c@location=\count414 +\c@savedlocation=\count415 +\c@organization=\count416 +\c@savedorganization=\count417 +\c@origlocation=\count418 +\c@savedoriglocation=\count419 
+\c@origpublisher=\count420 +\c@savedorigpublisher=\count421 +\c@publisher=\count422 +\c@savedpublisher=\count423 +\c@language=\count424 +\c@savedlanguage=\count425 +\c@origlanguage=\count426 +\c@savedoriglanguage=\count427 +\c@pageref=\count428 +\c@savedpageref=\count429 +\shorthandwidth=\skip69 +\shortjournalwidth=\skip70 +\shortserieswidth=\skip71 +\shorttitlewidth=\skip72 +\shortauthorwidth=\skip73 +\shorteditorwidth=\skip74 +\locallabelnumberwidth=\skip75 +\locallabelalphawidth=\skip76 +\localshorthandwidth=\skip77 +\localshortjournalwidth=\skip78 +\localshortserieswidth=\skip79 +\localshorttitlewidth=\skip80 +\localshortauthorwidth=\skip81 +\localshorteditorwidth=\skip82 +Package biblatex Info: Trying to load enhanced support for Unicode engines... +Package biblatex Info: ... file 'blx-unicode.def' found. +(c:/Users/rempsyc/AppData/Roaming/TinyTeX/texmf-dist/tex/latex/biblatex/blx-unicode.def) +Package biblatex Info: Trying to load compatibility code... +Package biblatex Info: ... file 'blx-compat.def' found. +(c:/Users/rempsyc/AppData/Roaming/TinyTeX/texmf-dist/tex/latex/biblatex/blx-compat.def +File: blx-compat.def 2023/03/05 v3.19 biblatex compatibility (PK/MW) +) +Package biblatex Info: Trying to load generic definitions... +Package biblatex Info: ... file 'biblatex.def' found. +(c:/Users/rempsyc/AppData/Roaming/TinyTeX/texmf-dist/tex/latex/biblatex/biblatex.def +File: biblatex.def 2023/03/05 v3.19 biblatex compatibility (PK/MW) +\c@textcitecount=\count430 +\c@textcitetotal=\count431 +\c@textcitemaxnames=\count432 +\c@biburlbigbreakpenalty=\count433 +\c@biburlbreakpenalty=\count434 +\c@biburlnumpenalty=\count435 +\c@biburlucpenalty=\count436 +\c@biburllcpenalty=\count437 +\biburlbigskip=\muskip18 +\biburlnumskip=\muskip19 +\biburlucskip=\muskip20 +\biburllcskip=\muskip21 +\c@smartand=\count438 +) +Package biblatex Info: Trying to load bibliography style 'numeric'... +Package biblatex Info: ... file 'numeric.bbx' found. 
+(c:/Users/rempsyc/AppData/Roaming/TinyTeX/texmf-dist/tex/latex/biblatex/bbx/numeric.bbx +File: numeric.bbx 2023/03/05 v3.19 biblatex bibliography style (PK/MW) +Package biblatex Info: Trying to load bibliography style 'standard'... +Package biblatex Info: ... file 'standard.bbx' found. +(c:/Users/rempsyc/AppData/Roaming/TinyTeX/texmf-dist/tex/latex/biblatex/bbx/standard.bbx +File: standard.bbx 2023/03/05 v3.19 biblatex bibliography style (PK/MW) +\c@bbx:relatedcount=\count439 +\c@bbx:relatedtotal=\count440 +)) +Package biblatex Info: Trying to load citation style 'numeric'... +Package biblatex Info: ... file 'numeric.cbx' found. +(c:/Users/rempsyc/AppData/Roaming/TinyTeX/texmf-dist/tex/latex/biblatex/cbx/numeric.cbx +File: numeric.cbx 2023/03/05 v3.19 biblatex citation style (PK/MW) +Package biblatex Info: Redefining '\cite'. +Package biblatex Info: Redefining '\parencite'. +Package biblatex Info: Redefining '\footcite'. +Package biblatex Info: Redefining '\footcitetext'. +Package biblatex Info: Redefining '\smartcite'. +Package biblatex Info: Redefining '\supercite'. +Package biblatex Info: Redefining '\textcite'. +Package biblatex Info: Redefining '\textcites'. +Package biblatex Info: Redefining '\cites'. +Package biblatex Info: Redefining '\parencites'. +Package biblatex Info: Redefining '\smartcites'. +) +Package biblatex Info: Trying to load configuration file... +Package biblatex Info: ... file 'biblatex.cfg' found. +(c:/Users/rempsyc/AppData/Roaming/TinyTeX/texmf-dist/tex/latex/biblatex/biblatex.cfg +File: biblatex.cfg +) +Package biblatex Info: XeTeX detected. +(biblatex) Assuming input encoding 'utf8'. +Package biblatex Info: Document encoding is UTF8 .... 
+(c:/Users/rempsyc/AppData/Roaming/TinyTeX/texmf-dist/tex/latex/l3kernel/expl3.sty +Package: expl3 2023-08-29 L3 programming layer (loader) +(c:/Users/rempsyc/AppData/Roaming/TinyTeX/texmf-dist/tex/latex/l3backend/l3backend-xetex.def +File: l3backend-xetex.def 2023-04-19 L3 backend support: XeTeX +\g__graphics_track_int=\count441 +\l__pdf_internal_box=\box74 +\g__pdf_backend_object_int=\count442 +\g__pdf_backend_annotation_int=\count443 +\g__pdf_backend_link_int=\count444 +)) +Package biblatex Info: ... and expl3 +(biblatex) 2023-08-29 L3 programming layer (loader) +(biblatex) is new enough (at least 2020/04/06), +(biblatex) setting 'casechanger=expl3'. +(c:/Users/rempsyc/AppData/Roaming/TinyTeX/texmf-dist/tex/latex/biblatex/blx-case-expl3.sty (c:/Users/rempsyc/AppData/Roaming/TinyTeX/texmf-dist/tex/latex/l3packages/xparse/xparse.sty +Package: xparse 2023-08-29 L3 Experimental document command parser +) +Package: blx-case-expl3 2023/03/05 v3.19 expl3 case changing code for biblatex +)) (c:/Users/rempsyc/AppData/Roaming/TinyTeX/texmf-dist/tex/latex/geometry/geometry.sty +Package: geometry 2020/01/02 v5.9 Page Geometry +(c:/Users/rempsyc/AppData/Roaming/TinyTeX/texmf-dist/tex/generic/iftex/ifvtex.sty +Package: ifvtex 2019/10/25 v1.7 ifvtex legacy package. Use iftex instead. 
+) +\Gm@cnth=\count445 +\Gm@cntv=\count446 +\c@Gm@tempcnt=\count447 +\Gm@bindingoffset=\dimen277 +\Gm@wd@mp=\dimen278 +\Gm@odd@mp=\dimen279 +\Gm@even@mp=\dimen280 +\Gm@layoutwidth=\dimen281 +\Gm@layoutheight=\dimen282 +\Gm@layouthoffset=\dimen283 +\Gm@layoutvoffset=\dimen284 +\Gm@dimlist=\toks40 +) (c:/Users/rempsyc/AppData/Roaming/TinyTeX/texmf-dist/tex/latex/fancyhdr/fancyhdr.sty +Package: fancyhdr 2022/11/09 v4.1 Extensive control of page headers and footers +\f@nch@headwidth=\skip83 +\f@nch@O@elh=\skip84 +\f@nch@O@erh=\skip85 +\f@nch@O@olh=\skip86 +\f@nch@O@orh=\skip87 +\f@nch@O@elf=\skip88 +\f@nch@O@erf=\skip89 +\f@nch@O@olf=\skip90 +\f@nch@O@orf=\skip91 +) (c:/Users/rempsyc/AppData/Roaming/TinyTeX/texmf-dist/tex/xelatex/mathspec/mathspec.sty +Package: mathspec 2016/12/22 v0.2b LaTeX Package (Mathematics font selection for XeLaTeX) +(c:/Users/rempsyc/AppData/Roaming/TinyTeX/texmf-dist/tex/latex/fontspec/fontspec.sty +Package: fontspec 2022/01/15 v2.8a Font selection for XeLaTeX and LuaLaTeX +(c:/Users/rempsyc/AppData/Roaming/TinyTeX/texmf-dist/tex/latex/fontspec/fontspec-xetex.sty +Package: fontspec-xetex 2022/01/15 v2.8a Font selection for XeLaTeX and LuaLaTeX +\l__fontspec_script_int=\count448 +\l__fontspec_language_int=\count449 +\l__fontspec_strnum_int=\count450 +\l__fontspec_tmp_int=\count451 +\l__fontspec_tmpa_int=\count452 +\l__fontspec_tmpb_int=\count453 +\l__fontspec_tmpc_int=\count454 +\l__fontspec_em_int=\count455 +\l__fontspec_emdef_int=\count456 +\l__fontspec_strong_int=\count457 +\l__fontspec_strongdef_int=\count458 +\l__fontspec_tmpa_dim=\dimen285 +\l__fontspec_tmpb_dim=\dimen286 +\l__fontspec_tmpc_dim=\dimen287 +(c:/Users/rempsyc/AppData/Roaming/TinyTeX/texmf-dist/tex/latex/base/fontenc.sty +Package: fontenc 2021/04/29 v2.0v Standard LaTeX package +) (c:/Users/rempsyc/AppData/Roaming/TinyTeX/texmf-dist/tex/latex/fontspec/fontspec.cfg))) (c:/Users/rempsyc/AppData/Roaming/TinyTeX/texmf-dist/tex/latex/xkeyval/xkeyval.sty +Package: xkeyval 
2022/06/16 v2.9 package option processing (HA) +(c:/Users/rempsyc/AppData/Roaming/TinyTeX/texmf-dist/tex/generic/xkeyval/xkeyval.tex (c:/Users/rempsyc/AppData/Roaming/TinyTeX/texmf-dist/tex/generic/xkeyval/xkvutils.tex +\XKV@toks=\toks41 +\XKV@tempa@toks=\toks42 +) +\XKV@depth=\count459 +File: xkeyval.tex 2014/12/03 v2.7a key=value parser (HA) +)) +\c@eu@=\count460 +\c@eu@i=\count461 +\c@mkern=\count462 +) (c:/Users/rempsyc/AppData/Roaming/TinyTeX/texmf-dist/tex/latex/microtype/microtype.sty +Package: microtype 2023/03/13 v3.1a Micro-typographical refinements (RS) +\MT@toks=\toks43 +\MT@tempbox=\box75 +\MT@count=\count463 +LaTeX Info: Redefining \noprotrusionifhmode on input line 1059. +LaTeX Info: Redefining \leftprotrusion on input line 1060. +\MT@prot@toks=\toks44 +LaTeX Info: Redefining \rightprotrusion on input line 1078. +LaTeX Info: Redefining \textls on input line 1368. +\MT@outer@kern=\dimen288 +LaTeX Info: Redefining \textmicrotypecontext on input line 1988. +\MT@listname@count=\count464 +(c:/Users/rempsyc/AppData/Roaming/TinyTeX/texmf-dist/tex/latex/microtype/microtype-xetex.def +File: microtype-xetex.def 2023/03/13 v3.1a Definitions specific to xetex (RS) +LaTeX Info: Redefining \lsstyle on input line 238. +) +Package microtype Info: Loading configuration file microtype.cfg. +(c:/Users/rempsyc/AppData/Roaming/TinyTeX/texmf-dist/tex/latex/microtype/microtype.cfg +File: microtype.cfg 2023/03/13 v3.1a microtype main configuration file (RS) +)) +Package hyperref Info: Option `unicode' set `true' on input line 154. +Package hyperref Info: Option `breaklinks' set `true' on input line 154. +(c:/Users/rempsyc/AppData/Roaming/TinyTeX/texmf-dist/tex/latex/grffile/grffile.sty +Package: grffile 2019/11/11 v2.1 Extended file name support for graphics (legacy) +Package grffile Info: This package is an empty stub for compatibility on input line 40. 
+) (c:/Users/rempsyc/AppData/Roaming/TinyTeX/texmf-dist/tex/latex/parskip/parskip.sty +Package: parskip 2021-03-14 v2.0h non-zero parskip adjustments +) (c:/Users/rempsyc/AppData/Roaming/TinyTeX/texmf-dist/tex/latex/fancyvrb/fancyvrb.sty +Package: fancyvrb 2023/01/19 4.5a verbatim text (tvz,hv) +\FV@CodeLineNo=\count465 +\FV@InFile=\read5 +\FV@TabBox=\box76 +\c@FancyVerbLine=\count466 +\FV@StepNumber=\count467 +\FV@OutFile=\write5 +) (c:/Users/rempsyc/AppData/Roaming/TinyTeX/texmf-dist/tex/latex/framed/framed.sty +Package: framed 2011/10/22 v 0.96: framed or shaded text with page breaks +\OuterFrameSep=\skip92 +\fb@frw=\dimen289 +\fb@frh=\dimen290 +\FrameRule=\dimen291 +\FrameSep=\dimen292 +) +\cslhangindent=\skip93 +\csllabelwidth=\skip94 +\cslentryspacingunit=\skip95 +\@quotelevel=\count468 +\@quotereset=\count469 +(./paper.aux) +\openout1 = `paper.aux'. + +LaTeX Font Info: Checking defaults for OML/cmm/m/it on input line 284. +LaTeX Font Info: ... okay on input line 284. +LaTeX Font Info: Checking defaults for OMS/cmsy/m/n on input line 284. +LaTeX Font Info: ... okay on input line 284. +LaTeX Font Info: Checking defaults for OT1/cmr/m/n on input line 284. +LaTeX Font Info: ... okay on input line 284. +LaTeX Font Info: Checking defaults for T1/cmr/m/n on input line 284. +LaTeX Font Info: ... okay on input line 284. +LaTeX Font Info: Checking defaults for TS1/cmr/m/n on input line 284. +LaTeX Font Info: Trying to load font information for TS1+cmr on input line 284. +(c:/Users/rempsyc/AppData/Roaming/TinyTeX/texmf-dist/tex/latex/base/ts1cmr.fd +File: ts1cmr.fd 2023/04/13 v2.5m Standard LaTeX font definitions +) +LaTeX Font Info: ... okay on input line 284. +LaTeX Font Info: Checking defaults for TU/lmr/m/n on input line 284. +LaTeX Font Info: ... okay on input line 284. +LaTeX Font Info: Checking defaults for OMX/cmex/m/n on input line 284. +LaTeX Font Info: ... okay on input line 284. +LaTeX Font Info: Checking defaults for U/cmr/m/n on input line 284. 
+LaTeX Font Info: ... okay on input line 284. +LaTeX Font Info: Checking defaults for PD1/pdf/m/n on input line 284. +LaTeX Font Info: ... okay on input line 284. +LaTeX Font Info: Checking defaults for PU/pdf/m/n on input line 284. +LaTeX Font Info: ... okay on input line 284. +Package hyperref Info: Link coloring ON on input line 284. +(./paper.out) (./paper.out) +\@outlinefile=\write6 +\openout6 = `paper.out'. + +Package caption Info: Begin \AtBeginDocument code. +Package caption Info: End \AtBeginDocument code. +Package biblatex Info: Trying to load language 'english'... +Package biblatex Info: ... file 'english.lbx' found. +(c:/Users/rempsyc/AppData/Roaming/TinyTeX/texmf-dist/tex/latex/biblatex/lbx/english.lbx +File: english.lbx 2023/03/05 v3.19 biblatex localization (PK/MW) +) +Package biblatex Info: XeTeX detected. +(biblatex) Assuming input encoding 'utf8'. +Package biblatex Info: Automatic encoding selection. +(biblatex) Assuming data encoding 'utf8'. +\openout4 = `paper.bcf'. + +Package biblatex Info: Trying to load bibliographic data... +Package biblatex Info: ... file 'paper.bbl' not found. +No file paper.bbl. +Package biblatex Info: Reference section=0 on input line 284. +Package biblatex Info: Reference segment=0 on input line 284. 
+*geometry* driver: auto-detecting +*geometry* detected driver: xetex +*geometry* verbose mode - [ preamble ] result: +* driver: xetex +* paper: a4paper +* layout: +* layoutoffset:(h,v)=(0.0pt,0.0pt) +* modes: includemp +* h-part:(L,W,R)=(28.45274pt, 526.376pt, 42.67912pt) +* v-part:(T,H,B)=(99.58464pt, 660.10394pt, 85.35826pt) +* \paperwidth=597.50787pt +* \paperheight=845.04684pt +* \textwidth=387.33861pt +* \textheight=660.10394pt +* \oddsidemargin=95.22015pt +* \evensidemargin=95.22015pt +* \topmargin=-60.28131pt +* \headheight=62.59596pt +* \headsep=25.0pt +* \topskip=10.0pt +* \footskip=30.0pt +* \marginparwidth=128.0374pt +* \marginparsep=11.0pt +* \columnsep=10.0pt +* \skip\footins=9.0pt plus 4.0pt minus 2.0pt +* \hoffset=0.0pt +* \voffset=0.0pt +* \mag=1000 +* \@twocolumnfalse +* \@twosidefalse +* \@mparswitchfalse +* \@reversemargintrue +* (1in=72.27pt=25.4mm, 1cm=28.453pt) + +LaTeX Info: Redefining \microtypecontext on input line 284. +Package microtype Info: Applying patch `item' on input line 284. +Package microtype Info: Applying patch `toc' on input line 284. +Package microtype Info: Applying patch `eqnum' on input line 284. +Package microtype Info: Applying patch `footnote' on input line 284. +Package microtype Info: Applying patch `verbatim' on input line 284. +Package microtype Info: Character protrusion enabled (level 2). +Package microtype Info: Using protrusion set `basicmath'. +Package microtype Info: No adjustment of tracking. +Package microtype Info: No adjustment of spacing. +Package microtype Info: No adjustment of kerning. +(c:/Users/rempsyc/AppData/Roaming/TinyTeX/texmf-dist/tex/latex/microtype/mt-LatinModernRoman.cfg +File: mt-LatinModernRoman.cfg 2021/02/21 v1.1 microtype config. file: Latin Modern Roman (RS) +) +LaTeX Font Info: Font shape `TU/lmss/m/it' in size <17.28> not available +(Font) Font shape `TU/lmss/m/sl' tried instead on input line 285. 
+Package microtype Info: Loading generic protrusion settings for font family +(microtype) `lmss' (encoding: TU). +(microtype) For optimal results, create family-specific settings. +(microtype) See the microtype manual for details. +(c:/Users/rempsyc/AppData/Roaming/TinyTeX/texmf-dist/tex/latex/microtype/mt-cmr.cfg +File: mt-cmr.cfg 2013/05/19 v2.2 microtype config. file: Computer Modern Roman (RS) +) +LaTeX Font Info: Trying to load font information for U+msa on input line 285. +(c:/Users/rempsyc/AppData/Roaming/TinyTeX/texmf-dist/tex/latex/amsfonts/umsa.fd +File: umsa.fd 2013/01/14 v3.01 AMS symbols A +) (c:/Users/rempsyc/AppData/Roaming/TinyTeX/texmf-dist/tex/latex/microtype/mt-msa.cfg +File: mt-msa.cfg 2006/02/04 v1.1 microtype config. file: AMS symbols (a) (RS) +) +LaTeX Font Info: Trying to load font information for U+msb on input line 285. +(c:/Users/rempsyc/AppData/Roaming/TinyTeX/texmf-dist/tex/latex/amsfonts/umsb.fd +File: umsb.fd 2013/01/14 v3.01 AMS symbols B +) (c:/Users/rempsyc/AppData/Roaming/TinyTeX/texmf-dist/tex/latex/microtype/mt-msb.cfg +File: mt-msb.cfg 2005/06/01 v1.0 microtype config. file: AMS symbols (b) (RS) +) + +Package hyperref Warning: Suppressing link with empty target on input line 311. + + +Package hyperref Warning: Suppressing link with empty target on input line 311. + + +Package hyperref Warning: Suppressing link with empty target on input line 311. + +File: D:/Rpackages/rticles/rmarkdown/templates/joss/resources/JOSE-logo.png Graphic file (type bmp) + + +Package fancyhdr Warning: \headheight is too small (62.59596pt): +(fancyhdr) Make it at least 64.31554pt, for example: +(fancyhdr) \setlength{\headheight}{64.31554pt}. +(fancyhdr) You might also make \topmargin smaller to compensate: +(fancyhdr) \addtolength{\topmargin}{-1.71957pt}. + +LaTeX Font Info: Font shape `TU/lmss/m/it' in size <8> not available +(Font) Font shape `TU/lmss/m/sl' tried instead on input line 376. 
+[1 + +] +File: D:/Rpackages/rticles/rmarkdown/templates/joss/resources/JOSE-logo.png Graphic file (type bmp) + + +Package fancyhdr Warning: \headheight is too small (62.59596pt): +(fancyhdr) Make it at least 64.31554pt, for example: +(fancyhdr) \setlength{\headheight}{64.31554pt}. +(fancyhdr) You might also make \topmargin smaller to compensate: +(fancyhdr) \addtolength{\topmargin}{-1.71957pt}. + +[2] +LaTeX Font Info: Font shape `TU/lmtt/bx/n' in size <10> not available +(Font) Font shape `TU/lmtt/b/n' tried instead on input line 466. + +Overfull \hbox (32.66139pt too wide) in paragraph at lines 482--482 +[]\TU/lmtt/m/n/10 #> -----------------------------------------------------------------------------[] + [] + + +Overfull \hbox (32.66139pt too wide) in paragraph at lines 491--491 +[]\TU/lmtt/m/n/10 #> -----------------------------------------------------------------------------[] + [] + +File: D:/Rpackages/rticles/rmarkdown/templates/joss/resources/JOSE-logo.png Graphic file (type bmp) + + +Package fancyhdr Warning: \headheight is too small (62.59596pt): +(fancyhdr) Make it at least 64.31554pt, for example: +(fancyhdr) \setlength{\headheight}{64.31554pt}. +(fancyhdr) You might also make \topmargin smaller to compensate: +(fancyhdr) \addtolength{\topmargin}{-1.71957pt}. + +[3] +File: D:/Rpackages/rticles/rmarkdown/templates/joss/resources/JOSE-logo.png Graphic file (type bmp) + + +Package fancyhdr Warning: \headheight is too small (62.59596pt): +(fancyhdr) Make it at least 64.31554pt, for example: +(fancyhdr) \setlength{\headheight}{64.31554pt}. +(fancyhdr) You might also make \topmargin smaller to compensate: +(fancyhdr) \addtolength{\topmargin}{-1.71957pt}. 
+ +[4] +File: table1.jpg Graphic file (type bmp) + +File: D:/Rpackages/rticles/rmarkdown/templates/joss/resources/JOSE-logo.png Graphic file (type bmp) + + +Package fancyhdr Warning: \headheight is too small (62.59596pt): +(fancyhdr) Make it at least 64.31554pt, for example: +(fancyhdr) \setlength{\headheight}{64.31554pt}. +(fancyhdr) You might also make \topmargin smaller to compensate: +(fancyhdr) \addtolength{\topmargin}{-1.71957pt}. + +[5] +File: paper_files/figure-latex/model_fig-1.pdf Graphic file (type pdf) + +File: D:/Rpackages/rticles/rmarkdown/templates/joss/resources/JOSE-logo.png Graphic file (type bmp) + + +Package fancyhdr Warning: \headheight is too small (62.59596pt): +(fancyhdr) Make it at least 64.31554pt, for example: +(fancyhdr) \setlength{\headheight}{64.31554pt}. +(fancyhdr) You might also make \topmargin smaller to compensate: +(fancyhdr) \addtolength{\topmargin}{-1.71957pt}. + +[6] +File: D:/Rpackages/rticles/rmarkdown/templates/joss/resources/JOSE-logo.png Graphic file (type bmp) + + +Package fancyhdr Warning: \headheight is too small (62.59596pt): +(fancyhdr) Make it at least 64.31554pt, for example: +(fancyhdr) \setlength{\headheight}{64.31554pt}. +(fancyhdr) You might also make \topmargin smaller to compensate: +(fancyhdr) \addtolength{\topmargin}{-1.71957pt}. + +[7] +File: D:/Rpackages/rticles/rmarkdown/templates/joss/resources/JOSE-logo.png Graphic file (type bmp) + + +Package fancyhdr Warning: \headheight is too small (62.59596pt): +(fancyhdr) Make it at least 64.31554pt, for example: +(fancyhdr) \setlength{\headheight}{64.31554pt}. +(fancyhdr) You might also make \topmargin smaller to compensate: +(fancyhdr) \addtolength{\topmargin}{-1.71957pt}. + +[8] +Underfull \hbox (badness 1584) in paragraph at lines 928--934 +[]\TU/lmr/m/n/10 Simmons, J. P., Nelson, L. D., & Simonsohn, U. (2011). 
False-positive psy- + [] + + +Underfull \hbox (badness 3049) in paragraph at lines 928--934 +\TU/lmr/m/n/10 chology: Undisclosed flexibility in data collection and analysis allows pre- + [] + + +Underfull \hbox (badness 3735) in paragraph at lines 928--934 +\TU/lmr/m/n/10 senting anything as significant. \TU/lmr/m/it/10 Psychological Science\TU/lmr/m/n/10 , \TU/lmr/m/it/10 22\TU/lmr/m/n/10 (11), 1359–1366. + [] + +File: D:/Rpackages/rticles/rmarkdown/templates/joss/resources/JOSE-logo.png Graphic file (type bmp) + + +Package fancyhdr Warning: \headheight is too small (62.59596pt): +(fancyhdr) Make it at least 64.31554pt, for example: +(fancyhdr) \setlength{\headheight}{64.31554pt}. +(fancyhdr) You might also make \topmargin smaller to compensate: +(fancyhdr) \addtolength{\topmargin}{-1.71957pt}. + +[9] (./paper.aux) + *********** +LaTeX2e <2023-06-01> patch level 1 +L3 programming layer <2023-08-29> + *********** +Package rerunfilecheck Info: File `paper.out' has not changed. +(rerunfilecheck) Checksum: 18F584A1BC96404D165BE4F0A067B822;2146. +Package logreq Info: Writing requests to 'paper.run.xml'. +\openout1 = `paper.run.xml'. + + ) +Here is how much of TeX's memory you used: + 36640 strings out of 477589 + 751637 string characters out of 5817003 + 1940416 words of memory out of 5000000 + 57291 multiletter control sequences out of 15000+600000 + 564989 words of font info for 90 fonts, out of 8000000 for 9000 + 14 hyphenation exceptions out of 8191 + 84i,12n,87p,1194b,850s stack positions out of 10000i,1000n,20000p,200000b,200000s + +Output written on paper.pdf (9 pages). diff --git a/papers/JOSE/paper.md b/papers/JOSE/paper.md new file mode 100644 index 000000000..d44309809 --- /dev/null +++ b/papers/JOSE/paper.md @@ -0,0 +1,325 @@ +--- +title: "Check your outliers! 
An introduction to identifying statistical outliers in R with *easystats*" +tags: + - R + - univariate outliers + - multivariate outliers + - robust detection methods + - easystats +authors: + - name: Rémi Thériault + orcid: 0000-0003-4315-6788 + affiliation: 1 + - name: Mattan S. Ben-Shachar + orcid: 0000-0002-4287-4801 + affiliation: 2 + - name: Indrajeet Patil + orcid: 0000-0003-1995-6531 + affiliation: 3 + - name: Daniel Lüdecke + orcid: 0000-0002-8895-3206 + affiliation: 4 + - name: Brenton M. Wiernik + orcid: 0000-0001-9560-6336 + affiliation: 5 + - name: Dominique Makowski + orcid: 0000-0001-5375-9967 + affiliation: 6 +affiliations: + - index: 1 + name: Department of Psychology, Université du Québec à Montréal, Montréal, Québec, Canada + - index: 2 + name: Independent Researcher, Ramat Gan, Israel + - index: 3 + name: Center for Humans and Machines, Max Planck Institute for Human Development, Berlin, Germany + - index: 4 + name: Institute of Medical Sociology, University Medical Center Hamburg-Eppendorf, Germany + - index: 5 + name: Independent Researcher, Tampa, FL, USA + - index: 6 + name: School of Psychology, University of Sussex, Brighton, UK +correspondence: theriault.remi@courrier.uqam.ca. +type: article +status: submit +date: 7 June 2023 +bibliography: paper.bib +simplesummary: | + The *{performance}* package from the *easystats* ecosystem makes it easy to + diagnose outliers in R and according to current best practices thanks to the + `check_outliers()` function. +keywords: | + univariate outliers; multivariate outliers; robust detection methods; R; easystats +acknowledgement: | + *{performance}* is part of the collaborative + [*easystats*](https://github.com/easystats/easystats) ecosystem + [@easystatspackage]. Thus, we thank all + [members of easystats](https://github.com/orgs/easystats/people), + contributors, and users alike. +authorcontributions: | + R.T. 
drafted the paper; all authors contributed to both the writing of the + paper and the conception of the software. +funding: | + This research received no external funding. +conflictsofinterest: | + The authors declare no conflict of interest. +abbreviations: + - short: SOD + long: Statistical outlier detection + - short: SEM + long: Structural equation modelling + - short: SD + long: Standard deviation + - short: MAD + long: Median absolute deviation + - short: IQR + long: Interquartile range + - short: HDI + long: Highest density interval + - short: BCI + long: Bias corrected and accelerated interval + - short: MCD + long: Minimum covariance determinant + - short: ICS + long: invariant coordinate selection + - short: OSF + long: Open Science Framework +output: + rticles::joss_article: + journal: "JOSE" +csl: apa.csl +--- + + + +# Summary + +Beyond the challenge of keeping up-to-date with current best practices regarding the diagnosis and treatment of outliers, an additional difficulty arises concerning the mathematical implementation of the recommended methods. Here, we provide an overview of current recommendations and best practices and demonstrate how they can easily and conveniently be implemented in the R statistical computing software, using the *{performance}* package of the *easystats* ecosystem. We cover univariate, multivariate, and model-based statistical outlier detection methods, their recommended threshold, standard output, and plotting methods. We conclude by reviewing the different theoretical types of outliers, whether to exclude or winsorize them, and the importance of transparency. + +# Statement of Need + +Real-life data often contain observations that can be considered *abnormal* when compared to the main population. 
The cause of it can be hard to assess and the boundaries of "abnormal", difficult to define---they may belong to a different distribution (originating from a different generative process) or simply be extreme cases, statistically rare but not impossible. + +Nonetheless, the improper handling of these outliers can substantially affect statistical model estimations, biasing effect estimations and weakening the models' predictive performance. It is thus essential to address this problem in a thoughtful manner. Yet, despite the existence of established recommendations and guidelines, many researchers still do not treat outliers in a consistent manner, or do so using inappropriate strategies [@simmons2011false; @leys2013outliers]. + +One possible reason is that researchers are not aware of the existing recommendations, or do not know how to implement them using their analysis software. In this paper, we show how to follow current best practices for automatic and reproducible statistical outlier detection (SOD) using R and the *{performance}* package [@ludecke2021performance], which is part of the *easystats* ecosystem of packages that build an R framework for easy statistical modeling, visualization, and reporting [@easystatspackage]. Installation instructions can be found on [GitHub](https://github.com/easystats/performance) or its [website](https://easystats.github.io/performance/), and its list of dependencies on [CRAN](https://cran.r-project.org/package=performance). + +The instructional materials that follow are aimed at an audience of researchers who want to follow good practices, and are appropriate for advanced undergraduate students, graduate students, professors, or professionals having to deal with the nuances of outlier treatment. 
+ +# Identifying Outliers + +Although many researchers attempt to identify outliers with measures based on the mean (e.g., _z_ scores), those methods are problematic because the mean and standard deviation themselves are not robust to the influence of outliers and those methods also assume normally distributed data (i.e., a Gaussian distribution). Therefore, current guidelines recommend using robust methods to identify outliers, such as those relying on the median as opposed to the mean [@leys2019outliers; @leys2013outliers; @leys2018outliers]. + +Nonetheless, which exact outlier method to use depends on many factors. In some cases, eye-gauging odd observations can be an appropriate solution, though many researchers will favour algorithmic solutions to detect potential outliers, for example, based on a continuous value expressing how much the observation stands out from the others. + +One of the factors to consider when selecting an algorithmic outlier detection method is the statistical test of interest. Identifying observations the regression model does not fit well can help find information relevant to our specific research context. This approach, known as model-based outliers detection (as outliers are extracted after the statistical model has been fit), can be contrasted with distribution-based outliers detection, which is based on the distance between an observation and the "center" of its population. Various quantification strategies of this distance exist for the latter, both univariate (involving only one variable at a time) and multivariate (involving multiple variables). + +When no method is readily available to detect model-based outliers, such as for structural equation modelling (SEM), looking for multivariate outliers may be of relevance. For simple tests (_t_ tests or correlations) that compare values of the same variable, it can be appropriate to check for univariate outliers. 
However, univariate methods can give false positives since _t_ tests and correlations, ultimately, are also models/multivariable statistics. They are in this sense more limited, but we show them nonetheless for educational purposes. + +Importantly, whatever approach researchers choose remains a subjective decision, whose usage (and rationale) must be transparently documented and reproducible [@leys2019outliers]. Researchers should commit (ideally in a preregistration) to an outlier treatment method before collecting the data. They should report in the paper their decisions and details of their methods, as well as any deviation from their original plan. These transparency practices can help reduce false positives due to excessive researchers' degrees of freedom (i.e., choice flexibility throughout the analysis). In the following section, we will go through each of the mentioned methods and provide examples on how to implement them with R. + +## Univariate Outliers + +Researchers frequently attempt to identify outliers using measures of deviation from the center of a variable's distribution. One of the most popular such procedures is the _z_ score transformation, which computes the distance in standard deviations (SD) from the mean. However, as mentioned earlier, this popular method is not robust. Therefore, for univariate outliers, it is recommended to use the median along with the Median Absolute Deviation (MAD), which are more robust than the interquartile range or the mean and its standard deviation [@leys2019outliers; @leys2013outliers]. + +Researchers can identify outliers based on robust (i.e., MAD-based) _z_ scores using the `check_outliers()` function of the *{performance}* package, by specifying `method = "zscore_robust"`.^[Note that `check_outliers()` only checks numeric variables.] 
Although @leys2013outliers suggest a default threshold of 2.5 and @leys2019outliers a threshold of 3, *{performance}* uses by default a less conservative threshold of ~3.29.^[3.29 is an approximation of the two-tailed critical value for _p_ < .001, obtained through `qnorm(p = 1 - 0.001 / 2)`. We chose this threshold for consistency with the thresholds of all our other methods.] That is, data points will be flagged as outliers if they go beyond +/- ~3.29 MAD. Users can adjust this threshold using the `threshold` argument. + +Below we provide example code using the `mtcars` dataset, which was extracted from the 1974 *Motor Trend* US magazine. The dataset contains fuel consumption and 10 characteristics of automobile design and performance for 32 different car models (see `?mtcars` for details). We chose this dataset because it is accessible from base R and familiar to many R users. We might want to conduct specific statistical analyses on this data set, say, _t_ tests or structural equation modelling, but first, we want to check for outliers that may influence those test results. + +Because the automobile names are stored as row names in `mtcars`, we first have to convert them to an ID column to benefit from the `check_outliers()` ID argument. Furthermore, we only really need a couple of columns for this demonstration, so we choose the first four (`mpg` = Miles/(US) gallon; `cyl` = Number of cylinders; `disp` = Displacement; `hp` = Gross horsepower). Finally, because there are no outliers in this dataset, we add two artificial outliers before running our function. + + +```r +library(performance) + +# Create some artificial outliers and an ID column +data <- rbind(mtcars[1:4], 42, 55) +data <- cbind(car = row.names(data), data) + +outliers <- check_outliers(data, method = "zscore_robust", ID = "car") +outliers +``` + +``` +#> 2 outliers detected: cases 33, 34. +#> - Based on the following method and threshold: zscore_robust (3.291). 
+#> - For variables: mpg, cyl, disp, hp. +#> +#> ----------------------------------------------------------------------------- +#> +#> The following observations were considered outliers for two or more +#> variables by at least one of the selected methods: +#> +#> Row car n_Zscore_robust +#> 1 33 33 2 +#> 2 34 34 2 +#> +#> ----------------------------------------------------------------------------- +#> Outliers per variable (zscore_robust): +#> +#> $mpg +#> Row car Distance_Zscore_robust +#> 33 33 33 3.709699 +#> 34 34 34 5.848328 +#> +#> $cyl +#> Row car Distance_Zscore_robust +#> 33 33 33 12.14083 +#> 34 34 34 16.52502 +``` + +What we see is that `check_outliers()` with the robust _z_ score method detected two outliers: cases 33 and 34, which were the observations we added ourselves. They were flagged for two variables specifically: `mpg` (Miles/(US) gallon) and `cyl` (Number of cylinders), and the output provides their exact _z_ score for those variables. + +We describe how to deal with those cases in more detail later in the paper, but should we want to exclude these detected outliers from the main dataset, we can extract row numbers using `which()` on the output object, which can then be used for indexing: + + +```r +which(outliers) +``` + +``` +#> [1] 33 34 +``` + +```r +data_clean <- data[-which(outliers), ] +``` + +Other univariate methods are available, such as using the interquartile range (IQR), or based on different intervals, such as the Highest Density Interval (HDI) or the Bias Corrected and Accelerated Interval (BCI). These methods are documented and described in the function's [help page](https://easystats.github.io/performance/reference/check_outliers.html). + +## Multivariate Outliers + +Univariate outliers can be useful when the focus is on a particular variable, for instance the reaction time, as extreme values might be indicative of inattention or non-task-related behavior^[ Note that they might not be the optimal way of treating reaction time outliers [@ratcliff1993methods; @van1995statistical]]. 
+ +However, in many scenarios, variables of a data set are not independent, and an abnormal observation will impact multiple dimensions. For instance, consider a participant giving random answers to a questionnaire. In this case, computing the _z_ score for each of the questions might not lead to satisfactory results. Instead, one might want to look at these variables together. + +One common approach for this is to compute multivariate distance metrics such as the Mahalanobis distance. Although the Mahalanobis distance is very popular, just like the regular _z_ scores method, it is not robust and is heavily influenced by the outliers themselves. Therefore, for multivariate outliers, it is recommended to use the Minimum Covariance Determinant, a robust version of the Mahalanobis distance [MCD, @leys2018outliers; @leys2019outliers]. + +In *{performance}*'s `check_outliers()`, one can use this approach with `method = "mcd"`.^[Our default threshold for the MCD method is defined by `stats::qchisq(p = 1 - 0.001, df = ncol(x))`, which again is an approximation of the critical value for _p_ < .001 consistent with the thresholds of our other methods.] + + +```r +outliers <- check_outliers(data, method = "mcd") +outliers +``` + +``` +#> 9 outliers detected: cases 7, 15, 16, 17, 24, 29, 31, 33, 34. +#> - Based on the following method and threshold: mcd (20). +#> - For variables: mpg, cyl, disp, hp. +``` + +Here, we detected 9 multivariate outliers (i.e., when looking at all variables of our dataset together). + +Other multivariate methods are available, such as another type of robust Mahalanobis distance that in this case relies on an orthogonalized Gnanadesikan-Kettenring pairwise estimator [@gnanadesikan1972robust]. These methods are documented and described in the function's [help page](https://easystats.github.io/performance/reference/check_outliers.html). + +## Model-Based Outliers + +Working with regression models creates the possibility of using model-based SOD methods. 
These methods rely on the concept of *leverage*, that is, how much influence a given observation can have on the model estimates. If few observations have a relatively strong leverage/influence on the model, one can suspect that the model's estimates are biased by these observations, in which case flagging them as outliers could prove helpful (see next section, "Handling Outliers"). + +In {performance}, two such model-based SOD methods are currently available: Cook's distance, for regular regression models, and Pareto, for Bayesian models. As such, `check_outliers()` can be applied directly on regression model objects, by simply specifying `method = "cook"` (or `method = "pareto"` for Bayesian models).^[Our default threshold for the Cook method is defined by `stats::qf(0.5, ncol(x), nrow(x) - ncol(x))`, which again is an approximation of the critical value for _p_ < .001 consistent with the thresholds of our other methods.] + +Currently, most lm models are supported (with the exception of `glmmTMB`, `lmrob`, and `glmrob` models), as long as they are supported by the underlying functions `stats::cooks.distance()` (or `loo::pareto_k_values()`) and `insight::get_data()` (for a full list of the 225 models currently supported by the `insight` package, see https://easystats.github.io/insight/#list-of-supported-models-by-class). Also note that although `check_outliers()` supports the pipe operators (`|>` or `%>%`), it does not support `tidymodels` at this time. We show a demo below. + + +```r +model <- lm(disp ~ mpg * disp, data = data) +outliers <- check_outliers(model, method = "cook") +outliers +``` + +``` +#> 1 outlier detected: case 34. +#> - Based on the following method and threshold: cook (0.708). +#> - For variable: (Whole model). +``` + +Using the model-based outlier detection method, we identified a single outlier. + +Table 1 below summarizes which methods to use in which cases, and with what threshold. The recommended thresholds are the default thresholds. 
+ + + +### Table 1 + +_Summary of Statistical Outlier Detection Methods Recommendations_ + + + +![](table1.jpg) + +All `check_outliers()` output objects possess a `plot()` method, meaning it is also possible to visualize the outliers using the generic `plot()` function on the resulting outlier object after loading the {see} package (Figure 1). + + +```r +plot(outliers) +``` + +\begin{figure} +\includegraphics[width=1\linewidth]{paper_files/figure-latex/model_fig-1} \caption{Visual depiction of outliers based on Cook's distance (leverage and standardized residuals), based on the fitted model.}\label{fig:model_fig} +\end{figure} + +## Cook's Distance vs. MCD + +@leys2018outliers report a preference for the MCD method over Cook's distance. This is because Cook's distance removes one observation at a time and checks its corresponding influence on the model each time [@cook1977detection], and flags any observation that has a large influence. In the view of these authors, when there are several outliers, the process of removing a single outlier at a time is problematic as the model remains "contaminated" or influenced by other possible outliers in the model, rendering this method suboptimal in the presence of multiple outliers. + +However, distribution-based approaches are not a silver bullet either, and there are cases where the usage of methods agnostic to theoretical and statistical models of interest might be problematic. For example, a very tall person would be expected to also be much heavier than average, but that would still fit with the expected association between height and weight (i.e., it would be in line with a model such as `weight ~ height`). In contrast, using multivariate outlier detection methods may flag this person as being an outlier---being unusual on two variables, height and weight---even though the pattern fits perfectly with our predictions. 
+ +Finally, unusual observations happen naturally: extreme observations are expected even when taken from a normal distribution. While statistical models can integrate this "expectation", multivariate outlier methods might be too conservative, flagging too many observations despite belonging to the right generative process. For these reasons, we believe that model-based methods are still preferable to the MCD when using supported regression models. Additionally, if the presence of multiple outliers is a significant concern, regression methods that are more robust to outliers should be considered---like _t_ regression or quantile regression---as they render their precise identification less critical [@mcelreath2020statistical]. + +## Composite Outlier Score + +The *{performance}* package also offers an alternative, consensus-based approach that combines several methods, based on the assumption that different methods provide different angles of looking at a given problem. By applying a variety of methods, one can hope to "triangulate" the true outliers (those consistently flagged by multiple methods) and thus attempt to minimize false positives. + +In practice, this approach computes a composite outlier score, formed of the average of the binary (0 or 1) classification results of each method. It represents the probability that each observation is classified as an outlier by at least one method. The default decision rule classifies rows with composite outlier scores superior or equal to 0.5 as outlier observations (i.e., that were classified as outliers by at least half of the methods). In *{performance}*'s `check_outliers()`, one can use this approach by including all desired methods in the corresponding argument. + + +```r +outliers <- check_outliers(model, method = c("zscore_robust", "mcd", "cook")) +which(outliers) +``` + +``` +#> [1] 33 34 +``` + +Outliers (counts or per variables) for individual methods can then be obtained through attributes. 
For example: + + +```r +attributes(outliers)$outlier_var$zscore_robust +``` + +``` +#> $mpg +#> Row Distance_Zscore_robust +#> 33 33 3.709699 +#> 34 34 5.848328 +``` + +An example sentence for reporting the usage of the composite method could be: + +> Based on a composite outlier score [see the 'check_outliers()' function in the 'performance' R package, @ludecke2021performance] obtained via the joint application of multiple outliers detection algorithms [(a) median absolute deviation (MAD)-based robust _z_ scores, @leys2013outliers; (b) Mahalanobis minimum covariance determinant (MCD), @leys2019outliers; and (c) Cook's distance, @cook1977detection], we excluded two participants that were classified as outliers by at least half of the methods used. + +# Handling Outliers + +The above section demonstrated how to identify outliers using the `check_outliers()` function in the *{performance}* package. But what should we do with these outliers once identified? Although it is common to automatically discard any observation that has been marked as "an outlier" as if it might infect the rest of the data with its statistical ailment, we believe that the use of SOD methods is but one step in the get-to-know-your-data pipeline; a researcher or analyst's _domain knowledge_ must be involved in the decision of how to deal with observations marked as outliers by means of SOD. Indeed, automatic tools can help detect outliers, but they are nowhere near perfect. Although they can be useful to flag suspect data, they can have misses and false alarms, and they cannot replace human eyes and proper vigilance from the researcher. If you do end up manually inspecting your data for outliers, it can be helpful to think of outliers as belonging to different types of outliers, or categories, which can help decide what to do with a given outlier. + +## Error, Interesting, and Random Outliers + +@leys2019outliers distinguish between error outliers, interesting outliers, and random outliers. 
_Error outliers_ are likely due to human error and should be corrected before data analysis or outright removed since they are invalid observations. _Interesting outliers_ are not due to technical error and may be of theoretical interest; it might thus be relevant to investigate them further even though they should be removed from the current analysis of interest. _Random outliers_ are assumed to be due to chance alone and to belong to the correct distribution and, therefore, should be retained. + +It is recommended to _keep_ observations which are expected to be part of the distribution of interest, even if they are outliers [@leys2019outliers]. However, if it is suspected that the outliers belong to an alternative distribution, then those observations could have a large impact on the results and call into question their robustness, especially if significance is conditional on their inclusion, so should be removed. + +We should also keep in mind that there might be error outliers that are not detected by statistical tools, but should nonetheless be found and removed. For example, if we are studying the effects of X on Y among teenagers and we have one observation from a 20-year-old, this observation might not be a _statistical outlier_, but it is an outlier in the _context_ of our research, and should be discarded. We could call these observations *undetected* error outliers, in the sense that although they do not statistically stand out, they do not belong to the theoretical or empirical distribution of interest (e.g., teenagers). In this way, we should not blindly rely on statistical outlier detection methods; doing our due diligence to investigate undetected error outliers relative to our specific research question is also essential for valid inferences. + +## Winsorization + +_Removing_ outliers can in this case be a valid strategy, and ideally one would report results with and without outliers to see the extent of their impact on results. 
This approach however can reduce statistical power. Therefore, some propose a _recoding_ approach, namely, winsorization: bringing outliers back within acceptable limits [e.g., 3 MADs, @tukey1963less]. However, if possible, it is recommended to collect enough data so that even after removing outliers, there is still sufficient statistical power without having to resort to winsorization [@leys2019outliers]. + +The _easystats_ ecosystem makes it easy to incorporate this step into your workflow through the `winsorize()` function of *{datawizard}*, a lightweight R package to facilitate data wrangling and statistical transformations [@patil2022datawizard]. This procedure will bring back univariate outliers within the limits of 'acceptable' values, based either on the percentile, the _z_ score, or its robust alternative based on the MAD. + +## The Importance of Transparency + +Finally, it is a critical part of a sound outlier treatment that regardless of which SOD method is used, it should be reported in a reproducible manner. Ideally, the handling of outliers should be specified *a priori* with as much detail as possible, and preregistered, to limit researchers' degrees of freedom and therefore risks of false positives [@leys2019outliers]. This is especially true given that interesting outliers and random outliers are often hard to distinguish in practice. Thus, researchers should always prioritize transparency and report all of the following information: (a) how many outliers were identified (including percentage); (b) according to which method and criteria; (c) using which function of which R package (if applicable); and (d) how they were handled (excluded or winsorized, if the latter, using what threshold). If at all possible, (e) the corresponding code script along with the data should be shared on a public repository like the Open Science Framework (OSF), so that the exclusion criteria can be reproduced precisely. 
+ +# References diff --git a/papers/JOSE/paper.pdf b/papers/JOSE/paper.pdf new file mode 100644 index 000000000..b886b69e7 Binary files /dev/null and b/papers/JOSE/paper.pdf differ diff --git a/papers/Mathematics/paper_files/figure-latex/model-1.pdf b/papers/JOSE/paper_files/figure-latex/model_fig-1.pdf similarity index 53% rename from papers/Mathematics/paper_files/figure-latex/model-1.pdf rename to papers/JOSE/paper_files/figure-latex/model_fig-1.pdf index 981b68318..26d212b03 100644 Binary files a/papers/Mathematics/paper_files/figure-latex/model-1.pdf and b/papers/JOSE/paper_files/figure-latex/model_fig-1.pdf differ diff --git a/papers/Mathematics/paper.Rmd b/papers/JOSE/paper_longform.Rmd similarity index 85% rename from papers/Mathematics/paper.Rmd rename to papers/JOSE/paper_longform.Rmd index e35b6f02b..3214bde37 100644 --- a/papers/Mathematics/paper.Rmd +++ b/papers/JOSE/paper_longform.Rmd @@ -1,42 +1,48 @@ --- title: "Check your outliers! An introduction to identifying statistical outliers in R with *easystats*" -author: +tags: + - R + - univariate outliers + - multivariate outliers + - robust detection methods + - easystats +authors: - name: Rémi Thériault - affil: 1,* orcid: 0000-0003-4315-6788 + affiliation: 1 - name: Mattan S. Ben-Shachar - affil: 2 orcid: 0000-0002-4287-4801 + affiliation: 2 - name: Indrajeet Patil - affil: 3 orcid: 0000-0003-1995-6531 + affiliation: 3 - name: Daniel Lüdecke - affil: 4 orcid: 0000-0002-8895-3206 + affiliation: 4 - name: Brenton M. 
Wiernik - affil: 5 orcid: 0000-0001-9560-6336 + affiliation: 5 - name: Dominique Makowski - affil: 6 orcid: 0000-0001-5375-9967 -affiliation: - - num: 1 - address: Department of Psychology, Université du Québec à Montréal, Montréal, Québec, Canada - - num: 2 - address: Independent Researcher - - num: 3 - address: Center for Humans and Machines, Max Planck Institute for Human Development, Berlin, Germany - - num: 4 - address: Institute of Medical Sociology, University Medical Center Hamburg-Eppendorf, Germany - - num: 5 - address: Independent Researcher, Tampa, FL, USA - - num: 6 - address: School of Psychology, University of Sussex, Brighton, UK + affiliation: 6 +affiliations: + - index: 1 + name: Department of Psychology, Université du Québec à Montréal, Montréal, Québec, Canada + - index: 2 + name: Independent Researcher + - index: 3 + name: Center for Humans and Machines, Max Planck Institute for Human Development, Berlin, Germany + - index: 4 + name: Institute of Medical Sociology, University Medical Center Hamburg-Eppendorf, Germany + - index: 5 + name: Independent Researcher, Tampa, FL, USA + - index: 6 + name: School of Psychology, University of Sussex, Brighton, UK correspondence: theriault.remi@courrier.uqam.ca. -journal: "mathematics" type: article status: submit -bibliography: mybibfile.bib +date: 7 June 2023 +bibliography: paper.bib simplesummary: | The *{performance}* package from the *easystats* ecosystem makes it easy to diagnose outliers in R and according to current best practices thanks to the @@ -45,7 +51,7 @@ abstract: | Beyond the challenge of keeping up-to-date with current best practices regarding the diagnosis and treatment of outliers, an additional difficulty arises concerning the mathematical implementation of the recommended methods. 
- In this paper, we provide an overview of current recommandations and best + In this paper, we provide an overview of current recommendations and best practices and demonstrate how they can easily and conveniently be implemented in the R statistical computing software, using the *{performance}* package of the *easystats* ecosystem. We cover univariate, multivariate, and @@ -53,7 +59,7 @@ abstract: | threshold, standard output, and plotting methods. We conclude with recommendations on the handling of outliers: the different theoretical types of outliers, whether to exclude or winsorize them, and the importance of - transparency. + transparency. keywords: | univariate outliers; multivariate outliers; robust detection methods; R; easystats acknowledgement: | @@ -90,7 +96,10 @@ abbreviations: long: invariant coordinate selection - short: OSF long: Open Science Framework -output: rticles::mdpi_article +output: + rticles::joss_article: + journal: "JOSE" +csl: apa.csl --- ```{r setup, include=FALSE} @@ -107,9 +116,9 @@ library(see) library(datawizard) ``` -# Introduction +# Statement of Need -Real-life data often contain observations that can be considered *abnormal* when compared to the main population. The cause of it---be it because they belong to a different distribution (originating from a different generative process) or simply being extreme cases, statistically rare but not impossible---can be hard to assess, and the boundaries of "abnormal" are hard to define. +Real-life data often contain observations that can be considered *abnormal* when compared to the main population. The cause of it---be it because they belong to a different distribution (originating from a different generative process) or simply being extreme cases, statistically rare but not impossible---can be hard to assess, and the boundaries of "abnormal" difficult to define. 
Nonetheless, the improper handling of these outliers can substantially affect statistical model estimations, biasing effect estimations and weakening the models' predictive performance. It is thus essential to address this problem in a thoughtful manner. Yet, despite the existence of established recommendations and guidelines, many researchers still do not treat outliers in a consistent manner, or do so using inappropriate strategies [@simmons2011false; @leys2013outliers]. @@ -155,12 +164,21 @@ data_clean <- data[-which(outliers), ] All `check_outliers()` output objects possess a `plot()` method, meaning it is also possible to visualize the outliers: -```{r univariate, fig.cap = "Visual depiction of outliers using the robust z-score method."} +```{r univariate, eval=FALSE} library(see) plot(outliers) ``` +```{r univariate_implicit, fig.cap = "Visual depiction of outliers using the robust z-score method. The distance represents an aggregate score for variables mpg, cyl, disp, and hp.", echo=FALSE} +library(see) + +plot(outliers) + + ggplot2::theme(axis.text.x = ggplot2::element_text( + angle = 45, size = 7 +)) +``` + Other univariate methods are available, such as using the interquartile range (IQR), or based on different intervals, such as the Highest Density Interval (HDI) or the Bias Corrected and Accelerated Interval (BCI). These methods are documented and described in the function's [help page](). ## Multivariate Outliers @@ -173,13 +191,22 @@ One common approach for this is to compute multivariate distance metrics such as In *{performance}*'s `check_outliers()`, one can use this approach with `method = "mcd"`.^[Our default threshold for the MCD method is defined by `stats::qchisq(p = 1 - 0.001, df = ncol(x))`, which again is an approximation of the critical value for _p_ < .001 consistent with the thresholds of our other methods.] 
-```{r multivariate, fig.cap = "Visual depiction of outliers using the Minimum Covariance Determinant (MCD) method, a robust version of the Mahalanobis distance."} +```{r multivariate} outliers <- check_outliers(data, method = "mcd") outliers +``` +```{r multivariate_plot, eval=FALSE} plot(outliers) ``` +```{r multivariate_implicit, fig.cap = "Visual depiction of outliers using the Minimum Covariance Determinant (MCD) method, a robust version of the Mahalanobis distance. The distance represents the MCD scores for variables mpg, cyl, disp, and hp.", echo=FALSE} +plot(outliers) + + ggplot2::theme(axis.text.x = ggplot2::element_text( + angle = 45, size = 7 +)) +``` + Other multivariate methods are available, such as another type of robust Mahalanobis distance that in this case relies on an orthogonalized Gnanadesikan-Kettenring pairwise estimator [@gnanadesikan1972robust]. These methods are documented and described in the function's [help page](https://easystats.github.io/performance/reference/check_outliers.html). ## Model-Based Outliers @@ -188,7 +215,7 @@ Working with regression models creates the possibility of using model-based SOD In {performance}, two such model-based SOD methods are currently available: Cook's distance, for regular regression models, and Pareto, for Bayesian models. As such, `check_outliers()` can be applied directly on regression model objects, by simply specifying `method = "cook"` (or `method = "pareto"` for Bayesian models).^[Our default threshold for the Cook method is defined by `stats::qf(0.5, ncol(x), nrow(x) - ncol(x))`, which again is an approximation of the critical value for _p_ < .001 consistent with the thresholds of our other methods.] 
-```{r model, fig.cap = "Visual depiction of outliers based on Cook's distance (leverage and standardized residuals)."} +```{r model, fig.cap = "Visual depiction of outliers based on Cook's distance (leverage and standardized residuals), based on the fitted model."} model <- lm(disp ~ mpg * disp, data = data) outliers <- check_outliers(model, method = "cook") outliers @@ -218,7 +245,7 @@ knitr::kable( caption = "Summary of Statistical Outlier Detection Methods Recommendations.", longtable = TRUE) ``` -### Cook's Distance vs. MCD +## Cook's Distance vs. MCD @leys2018outliers report a preference for the MCD method over Cook's distance. This is because Cook's distance removes one observation at a time and checks its corresponding influence on the model each time [@cook1977detection], and flags any observation that has a large influence. In the view of these authors, when there are several outliers, the process of removing a single outlier at a time is problematic as the model remains "contaminated" or influenced by other possible outliers in the model, rendering this method suboptimal in the presence of multiple outliers. @@ -242,7 +269,7 @@ which(outliers) In contrast, the model-based detection method displays the desired behaviour: it correctly flags the person who is very tall but very light, without flagging the person who is both tall and heavy. -```{r model2, fig.cap = "The leverage method (Cook's distance) correctly distinguishes the true outlier from the model-consistent extreme observation)."} +```{r model2, fig.cap = "The leverage method (Cook's distance) correctly distinguishes the true outlier from the model-consistent extreme observation, based on the fitted model."} outliers <- check_outliers(model, method = "cook") which(outliers) plot(outliers) @@ -250,9 +277,9 @@ plot(outliers) Finally, unusual observations happen naturally: extreme observations are expected even when taken from a normal distribution. 
While statistical models can integrate this "expectation", multivariate outlier methods might be too conservative, flagging too many observations despite belonging to the right generative process. For these reasons, we believe that model-based methods are still preferable to the MCD when using supported regression models. Additionally, if the presence of multiple outliers is a significant concern, regression methods that are more robust to outliers should be considered---like _t_ regression or quantile regression---as they render their precise identification less critical [@mcelreath2020statistical]. -## Multiple Methods +## Composite Outlier Score -An alternative approach that is possible is to combine several methods, based on the assumption that different methods provide different angles of looking at the problem. By applying a variety of methods, one can hope to "triangulate" the true outliers (those consistently flagged by multiple methods) and thus attempt to minimize false positives. +The *{performance}* package also offers an alternative, consensus-based approach that combines several methods, based on the assumption that different methods provide different angles of looking at a given problem. By applying a variety of methods, one can hope to "triangulate" the true outliers (those consistently flagged by multiple methods) and thus attempt to minimize false positives. In practice, this approach computes a composite outlier score, formed of the average of the binary (0 or 1) classification results of each method. It represents the probability that each observation is classified as an outlier by at least one method. The default decision rule classifies rows with composite outlier scores superior or equal to 0.5 as outlier observations (i.e., that were classified as outliers by at least half of the methods). In *{performance}*'s `check_outliers()`, one can use this approach by including all desired methods in the corresponding argument. 
@@ -269,7 +296,7 @@ attributes(outliers)$outlier_var$zscore_robust An example sentence for reporting the usage of the composite method could be: -> Based on a composite outlier score (see the 'check_outliers()' function in the 'performance' R package, [@ludecke2021performance]) obtained via the joint application of multiple outliers detection algorithms ((a) median absolute deviation (MAD)-based robust _z_ scores, [@leys2013outliers]; (b) Mahalanobis minimum covariance determinant (MCD), [@leys2019outliers]; and (c) Cook's distance, [@cook1977detection]), we excluded two participants that were classified as outliers by at least half of the methods used. +> Based on a composite outlier score [see the 'check_outliers()' function in the 'performance' R package, @ludecke2021performance] obtained via the joint application of multiple outliers detection algorithms [(a) median absolute deviation (MAD)-based robust _z_ scores, @leys2013outliers; (b) Mahalanobis minimum covariance determinant (MCD), @leys2019outliers; and (c) Cook's distance, @cook1977detection], we excluded two participants that were classified as outliers by at least half of the methods used. # Handling Outliers @@ -279,7 +306,7 @@ The above section demonstrated how to identify outliers using the `check_outlier @leys2019outliers distinguish between error outliers, interesting outliers, and random outliers. _Error outliers_ are likely due to human error and should be corrected before data analysis or outright removed since they are invalid observations. _Interesting outliers_ are not due to technical error and may be of theoretical interest; it might thus be relevant to investigate them further even though they should be removed from the current analysis of interest. _Random outliers_ are assumed to be due to chance alone and to belong to the correct distribution and, therefore, should be retained. 
-It is recommended to _keep_ observations which are expected to be part of the distribution of interest, even if they are outliers [@leys2019outliers]. However, if it is suspected that the outliers belong to an alternative distribution, then those observations could have a large impact on the results and call into question their robustness, especially if significance is conditional on their inclusion. +It is recommended to _keep_ observations which are expected to be part of the distribution of interest, even if they are outliers [@leys2019outliers]. However, if it is suspected that the outliers belong to an alternative distribution, then those observations could have a large impact on the results and call into question their robustness, especially if significance is conditional on their inclusion; such observations should therefore be removed. On the other hand, there are also outliers that cannot be detected by statistical tools, but should be found and removed. For example, if we are studying the effects of X on Y among teenagers and we have one observation from a 20-year-old, this observation might not be a _statistical outlier_, but it is an outlier in the _context_ of our research, and should be discarded to allow for valid inferences. @@ -302,9 +329,26 @@ winsorized_data[1501:1502, ] ## The Importance of Transparency -Once again, it is a critical part of a sound outlier treatment that regardless of which SOD method used, it should be reported in a reproducible manner. Ideally, the handling of outliers should be specified *a priori* with as much detail as possible, and preregistered, to limit researchers' degrees of freedom and therefore risks of false positives [@leys2019outliers]. This is especially true given that interesting outliers and random outliers are often times hard to distinguish in practice. 
Thus, researchers should always prioritize transparency and report all of the following information: (a) how many outliers were identified; (b) according to which method and criteria, (c) using which function of which R package (if applicable), and (d) how they were handled (excluded or winsorized, if the latter, using what threshold). If at all possible, (e) the corresponding code script along with the data should be shared on a public repository like the Open Science Framework (OSF), so that the exclusion criteria can be reproduced precisely. +Once again, it is a critical part of a sound outlier treatment that regardless of which SOD method used, it should be reported in a reproducible manner. Ideally, the handling of outliers should be specified *a priori* with as much detail as possible, and preregistered, to limit researchers' degrees of freedom and therefore risks of false positives [@leys2019outliers]. This is especially true given that interesting outliers and random outliers are often times hard to distinguish in practice. Thus, researchers should always prioritize transparency and report all of the following information: (a) how many outliers were identified (including percentage); (b) according to which method and criteria, (c) using which function of which R package (if applicable), and (d) how they were handled (excluded or winsorized, if the latter, using what threshold). If at all possible, (e) the corresponding code script along with the data should be shared on a public repository like the Open Science Framework (OSF), so that the exclusion criteria can be reproduced precisely. # Conclusion In this paper, we have showed how to investigate outliers using the `check_outliers()` function of the *{performance}* package while following current good practices. 
However, best practice for outlier treatment does not stop at using appropriate statistical algorithms, but entails respecting existing recommendations, such as preregistration, reproducibility, consistency, transparency, and justification. Ideally, one would additionally also report the package, function, and threshold used (linking to the full code when possible). We hope that this paper and the accompanying `check_outlier()` function of *easystats* will help researchers engage in good research practices while providing a smooth outlier detection experience. +### Contributions + +R.T. drafted the paper; all authors contributed to both the writing of the paper and the conception of the software. + +### Acknowledgements + +*{performance}* is part of the collaborative [*easystats*](https://github.com/easystats/easystats) ecosystem [@easystatspackage]. Thus, we thank all [members of easystats](https://github.com/orgs/easystats/people), contributors, and users alike. + +### Funding information + +This research received no external funding. 
+ +### Competing Interests + +The authors declare no conflict of interest. + +# References \ No newline at end of file diff --git a/papers/JOSE/table1.jpg b/papers/JOSE/table1.jpg new file mode 100644 index 000000000..5f29d676e Binary files /dev/null and b/papers/JOSE/table1.jpg differ diff --git a/papers/JOSS/paper.bib b/papers/JOSS/paper_temp.bib similarity index 100% rename from papers/JOSS/paper.bib rename to papers/JOSS/paper_temp.bib diff --git a/papers/JOSS/paper.md b/papers/JOSS/paper_temp.md similarity index 100% rename from papers/JOSS/paper.md rename to papers/JOSS/paper_temp.md diff --git a/papers/Mathematics/cover_letter.Rmd b/papers/Mathematics/cover_letter.Rmd deleted file mode 100644 index 700385be2..000000000 --- a/papers/Mathematics/cover_letter.Rmd +++ /dev/null @@ -1,27 +0,0 @@ ---- -output: pdf_document ---- - -Dear Editors, - -We are pleased to submit this paper to *Mathematics*, for the special issue "Advances in Statistical Computing". - -The paper, titled "Check your outliers! An accessible introduction to identifying statistical outliers in R with *easystats*", provides an overview of current recommendations and best practices regarding the diagnosis and treament of outliers. It demonstrates how these recommendations can be easily and conveniently implemented in the R software using the *{performance}* package of the *easystats* ecosystem. The manuscript covers univariate, multivariate, and model-based statistical outlier detection methods, their recommended threshold, standard output, and plotting method, among other things. - -In this sense, the paper fits very well with the special issue "Advances in Statistical Computing", as it essentially communicates to the wider public current advances in the statistical computing of outlier detection algorithms and their implementation in currently available open source and free software. 
This makes the manuscript relevant to data science, behavioural science, and statistical computing more generally. - -Our current submission is original and has been neither published elsewhere nor is currently under consideration for publication elsewhere. All authors have contributed substantially to the software and manuscript. All authors gave final approval to the manuscript and accept to be accountable. We have no conflicts of interest to disclose. - -We would also like to use the open peer review option. - -Thank you for considering our submission. - -Best Regards, - -Rémi Thériault - -Department of Psychology, - -Université du Québec à Montréal, - -Montréal, Québec, Canada \ No newline at end of file diff --git a/papers/Mathematics/journalnames.tex b/papers/Mathematics/journalnames.tex deleted file mode 100644 index a3305b818..000000000 --- a/papers/Mathematics/journalnames.tex +++ /dev/null @@ -1,234 +0,0 @@ -\DeclareOption{acoustics}{ \gdef\@journal{acoustics} \gdef\@journalshort{Acoustics} \gdef\@journalfull{Acoustics} \gdef\@doiabbr{acoustics} \gdef\@ISSN{2624-599X} } -\DeclareOption{actuators}{ \gdef\@journal{actuators} \gdef\@journalshort{Actuators} \gdef\@journalfull{Actuators} \gdef\@doiabbr{act} \gdef\@ISSN{2076-0825} } -\DeclareOption{addictions}{ \gdef\@journal{addictions} \gdef\@journalshort{Addictions} \gdef\@journalfull{Addictions} \gdef\@doiabbr{} \gdef\@ISSN{0006-0006} } -\DeclareOption{admsci}{ \gdef\@journal{admsci} \gdef\@journalshort{Adm. 
Sci.} \gdef\@journalfull{Administrative Sciences} \gdef\@doiabbr{admsci} \gdef\@ISSN{2076-3387} } -\DeclareOption{aerospace}{ \gdef\@journal{aerospace} \gdef\@journalshort{Aerospace} \gdef\@journalfull{Aerospace} \gdef\@doiabbr{aerospace} \gdef\@ISSN{2226-4310} } -\DeclareOption{agriculture}{ \gdef\@journal{agriculture} \gdef\@journalshort{Agriculture} \gdef\@journalfull{Agriculture} \gdef\@doiabbr{agriculture} \gdef\@ISSN{2077-0472} } -\DeclareOption{agriengineering}{ \gdef\@journal{agriengineering} \gdef\@journalshort{AgriEngineering} \gdef\@journalfull{AgriEngineering} \gdef\@doiabbr{agriengineering} \gdef\@ISSN{2624-7402} } -\DeclareOption{agronomy}{ \gdef\@journal{agronomy} \gdef\@journalshort{Agronomy} \gdef\@journalfull{Agronomy} \gdef\@doiabbr{agronomy} \gdef\@ISSN{2073-4395} } -\DeclareOption{algorithms}{ \gdef\@journal{algorithms} \gdef\@journalshort{Algorithms} \gdef\@journalfull{Algorithms} \gdef\@doiabbr{a} \gdef\@ISSN{1999-4893} } -\DeclareOption{animals}{ \gdef\@journal{animals} \gdef\@journalshort{Animals} \gdef\@journalfull{Animals} \gdef\@doiabbr{ani} \gdef\@ISSN{2076-2615} } -\DeclareOption{antibiotics}{ \gdef\@journal{antibiotics} \gdef\@journalshort{Antibiotics} \gdef\@journalfull{Antibiotics} \gdef\@doiabbr{antibiotics} \gdef\@ISSN{2079-6382} } -\DeclareOption{antibodies}{ \gdef\@journal{antibodies} \gdef\@journalshort{Antibodies} \gdef\@journalfull{Antibodies} \gdef\@doiabbr{antib} \gdef\@ISSN{2073-4468} } -\DeclareOption{antioxidants}{ \gdef\@journal{antioxidants} \gdef\@journalshort{Antioxidants} \gdef\@journalfull{Antioxidants} \gdef\@doiabbr{antiox} \gdef\@ISSN{2076-3921} } -\DeclareOption{applsci}{ \gdef\@journal{applsci} \gdef\@journalshort{Appl. 
Sci.} \gdef\@journalfull{Applied Sciences} \gdef\@doiabbr{app} \gdef\@ISSN{2076-3417} } -\DeclareOption{arts}{ \gdef\@journal{arts} \gdef\@journalshort{Arts} \gdef\@journalfull{Arts} \gdef\@doiabbr{arts} \gdef\@ISSN{2076-0752} } -\DeclareOption{asc}{ \gdef\@journal{asc} \gdef\@journalshort{Autom. Syst. Control} \gdef\@journalfull{Automatic Systems and Control} \gdef\@doiabbr{} \gdef\@ISSN{} } -\DeclareOption{asi}{ \gdef\@journal{asi} \gdef\@journalshort{Appl. Syst. Innov.} \gdef\@journalfull{Applied System Innovation} \gdef\@doiabbr{asi} \gdef\@ISSN{2571-5577} } -\DeclareOption{atmosphere}{ \gdef\@journal{atmosphere} \gdef\@journalshort{Atmosphere} \gdef\@journalfull{Atmosphere} \gdef\@doiabbr{atmos} \gdef\@ISSN{2073-4433} } -\DeclareOption{atoms}{ \gdef\@journal{atoms} \gdef\@journalshort{Atoms} \gdef\@journalfull{Atoms} \gdef\@doiabbr{atoms} \gdef\@ISSN{2218-2004} } -\DeclareOption{axioms}{ \gdef\@journal{axioms} \gdef\@journalshort{Axioms} \gdef\@journalfull{Axioms} \gdef\@doiabbr{axioms} \gdef\@ISSN{2075-1680} } -\DeclareOption{batteries}{ \gdef\@journal{batteries} \gdef\@journalshort{Batteries} \gdef\@journalfull{Batteries} \gdef\@doiabbr{batteries} \gdef\@ISSN{2313-0105} } -\DeclareOption{bdcc}{ \gdef\@journal{bdcc} \gdef\@journalshort{Big Data Cogn. Comput.} \gdef\@journalfull{Big Data and Cognitive Computing} \gdef\@doiabbr{bdcc} \gdef\@ISSN{2504-2289} } -\DeclareOption{behavsci}{ \gdef\@journal{behavsci} \gdef\@journalshort{Behav. 
Sci.} \gdef\@journalfull{Behavioral Sciences} \gdef\@doiabbr{bs} \gdef\@ISSN{2076-328X} } -\DeclareOption{beverages}{ \gdef\@journal{beverages} \gdef\@journalshort{Beverages} \gdef\@journalfull{Beverages} \gdef\@doiabbr{beverages} \gdef\@ISSN{2306-5710} } -\DeclareOption{bioengineering}{ \gdef\@journal{bioengineering} \gdef\@journalshort{Bioengineering} \gdef\@journalfull{Bioengineering} \gdef\@doiabbr{bioengineering} \gdef\@ISSN{2306-5354} } -\DeclareOption{biology}{ \gdef\@journal{biology} \gdef\@journalshort{Biology} \gdef\@journalfull{Biology} \gdef\@doiabbr{biology} \gdef\@ISSN{2079-7737} } -\DeclareOption{biomedicines}{ \gdef\@journal{biomedicines} \gdef\@journalshort{Biomedicines} \gdef\@journalfull{Biomedicines} \gdef\@doiabbr{biomedicines} \gdef\@ISSN{2227-9059} } -\DeclareOption{biomimetics}{ \gdef\@journal{biomimetics} \gdef\@journalshort{Biomimetics} \gdef\@journalfull{Biomimetics} \gdef\@doiabbr{biomimetics} \gdef\@ISSN{2313-7673} } -\DeclareOption{biomolecules}{ \gdef\@journal{biomolecules} \gdef\@journalshort{Biomolecules} \gdef\@journalfull{Biomolecules} \gdef\@doiabbr{biom} \gdef\@ISSN{2218-273X} } -\DeclareOption{biosensors}{ \gdef\@journal{biosensors} \gdef\@journalshort{Biosensors} \gdef\@journalfull{Biosensors} \gdef\@doiabbr{bios} \gdef\@ISSN{2079-6374} } -\DeclareOption{brainsci}{ \gdef\@journal{brainsci} \gdef\@journalshort{Brain Sci.} \gdef\@journalfull{Brain Sciences} \gdef\@doiabbr{brainsci} \gdef\@ISSN{2076-3425} } -\DeclareOption{buildings}{ \gdef\@journal{buildings} \gdef\@journalshort{Buildings} \gdef\@journalfull{Buildings} \gdef\@doiabbr{buildings} \gdef\@ISSN{2075-5309} } -\DeclareOption{cancers}{ \gdef\@journal{cancers} \gdef\@journalshort{Cancers} \gdef\@journalfull{Cancers} \gdef\@doiabbr{cancers} \gdef\@ISSN{2072-6694} } -\DeclareOption{carbon}{ \gdef\@journal{carbon} \gdef\@journalshort{C} \gdef\@journalfull{C} \gdef\@doiabbr{c} \gdef\@ISSN{2311-5629} } -\DeclareOption{catalysts}{ \gdef\@journal{catalysts} 
\gdef\@journalshort{Catalysts} \gdef\@journalfull{Catalysts} \gdef\@doiabbr{catal} \gdef\@ISSN{2073-4344} } -\DeclareOption{cells}{ \gdef\@journal{cells} \gdef\@journalshort{Cells} \gdef\@journalfull{Cells} \gdef\@doiabbr{cells} \gdef\@ISSN{2073-4409} } -\DeclareOption{ceramics}{ \gdef\@journal{ceramics} \gdef\@journalshort{Ceramics} \gdef\@journalfull{Ceramics} \gdef\@doiabbr{ceramics} \gdef\@ISSN{2571-6131} } -\DeclareOption{challenges}{ \gdef\@journal{challenges} \gdef\@journalshort{Challenges} \gdef\@journalfull{Challenges} \gdef\@doiabbr{challe} \gdef\@ISSN{2078-1547} } -\DeclareOption{chemengineering}{ \gdef\@journal{chemengineering} \gdef\@journalshort{ChemEngineering} \gdef\@journalfull{ChemEngineering} \gdef\@doiabbr{chemengineering} \gdef\@ISSN{2305-7084} } -\DeclareOption{chemistry}{ \gdef\@journal{chemistry} \gdef\@journalshort{Chemistry} \gdef\@journalfull{Chemistry} \gdef\@doiabbr{chemistry} \gdef\@ISSN{2624-8549} } -\DeclareOption{chemosensors}{ \gdef\@journal{chemosensors} \gdef\@journalshort{Chemosensors} \gdef\@journalfull{Chemosensors} \gdef\@doiabbr{chemosensors} \gdef\@ISSN{2227-9040} } -\DeclareOption{children}{ \gdef\@journal{children} \gdef\@journalshort{Children} \gdef\@journalfull{Children} \gdef\@doiabbr{children} \gdef\@ISSN{2227-9067} } -\DeclareOption{cleantechnol}{ \gdef\@journal{cleantechnol} \gdef\@journalshort{Clean Technol.} \gdef\@journalfull{Clean Technologies} \gdef\@doiabbr{cleantechnol} \gdef\@ISSN{2571-8797} } -\DeclareOption{climate}{ \gdef\@journal{climate} \gdef\@journalshort{Climate} \gdef\@journalfull{Climate} \gdef\@doiabbr{cli} \gdef\@ISSN{2225-1154} } -\DeclareOption{clockssleep}{ \gdef\@journal{clockssleep} \gdef\@journalshort{Clocks\&Sleep} \gdef\@journalfull{Clocks \& Sleep} \gdef\@doiabbr{clockssleep} \gdef\@ISSN{2624-5175} } -\DeclareOption{cmd}{ \gdef\@journal{cmd} \gdef\@journalshort{Corros. Mater. 
Degrad.} \gdef\@journalfull{Corrosion and Materials Degradation} \gdef\@doiabbr{cmd} \gdef\@ISSN{2624-5558} } -\DeclareOption{coatings}{ \gdef\@journal{coatings} \gdef\@journalshort{Coatings} \gdef\@journalfull{Coatings} \gdef\@doiabbr{coatings} \gdef\@ISSN{2079-6412} } -\DeclareOption{colloids}{ \gdef\@journal{colloids} \gdef\@journalshort{Colloids Interfaces} \gdef\@journalfull{Colloids Interfaces} \gdef\@doiabbr{colloids} \gdef\@ISSN{2504-5377} } -\DeclareOption{computation}{ \gdef\@journal{computation} \gdef\@journalshort{Computation} \gdef\@journalfull{Computation} \gdef\@doiabbr{computation} \gdef\@ISSN{2079-3197} } -\DeclareOption{computers}{ \gdef\@journal{computers} \gdef\@journalshort{Computers} \gdef\@journalfull{Computers} \gdef\@doiabbr{computers} \gdef\@ISSN{2073-431X} } -\DeclareOption{condensedmatter}{ \gdef\@journal{condensedmatter} \gdef\@journalshort{Condens. Matter} \gdef\@journalfull{Condensed Matter} \gdef\@doiabbr{condmat} \gdef\@ISSN{2410-3896} } -\DeclareOption{cosmetics}{ \gdef\@journal{cosmetics} \gdef\@journalshort{Cosmetics} \gdef\@journalfull{Cosmetics} \gdef\@doiabbr{cosmetics} \gdef\@ISSN{2079-9284} } -\DeclareOption{cryptography}{ \gdef\@journal{cryptography} \gdef\@journalshort{Cryptography} \gdef\@journalfull{Cryptography} \gdef\@doiabbr{cryptography} \gdef\@ISSN{2410-387X} } -\DeclareOption{crystals}{ \gdef\@journal{crystals} \gdef\@journalshort{Crystals} \gdef\@journalfull{Crystals} \gdef\@doiabbr{cryst} \gdef\@ISSN{2073-4352} } -\DeclareOption{dairy}{ \gdef\@journal{dairy} \gdef\@journalshort{Dairy} \gdef\@journalfull{Dairy} \gdef\@doiabbr{dairy} \gdef\@ISSN{2624-862X} } -\DeclareOption{data}{ \gdef\@journal{data} \gdef\@journalshort{Data} \gdef\@journalfull{Data} \gdef\@doiabbr{data} \gdef\@ISSN{2306-5729} } -\DeclareOption{dentistry}{ \gdef\@journal{dentistry} \gdef\@journalshort{Dent. 
J.} \gdef\@journalfull{Dentistry Journal} \gdef\@doiabbr{dj} \gdef\@ISSN{2304-6767} } -\DeclareOption{designs}{ \gdef\@journal{designs} \gdef\@journalshort{Designs} \gdef\@journalfull{Designs} \gdef\@doiabbr{designs} \gdef\@ISSN{2411-9660} } -\DeclareOption{diagnostics}{ \gdef\@journal{diagnostics} \gdef\@journalshort{Diagnostics} \gdef\@journalfull{Diagnostics} \gdef\@doiabbr{diagnostics} \gdef\@ISSN{2075-4418} } -\DeclareOption{diseases}{ \gdef\@journal{diseases} \gdef\@journalshort{Diseases} \gdef\@journalfull{Diseases} \gdef\@doiabbr{diseases} \gdef\@ISSN{2079-9721} } -\DeclareOption{diversity}{ \gdef\@journal{diversity} \gdef\@journalshort{Diversity} \gdef\@journalfull{Diversity} \gdef\@doiabbr{d} \gdef\@ISSN{1424-2818} } -\DeclareOption{drones}{ \gdef\@journal{drones} \gdef\@journalshort{Drones} \gdef\@journalfull{Drones} \gdef\@doiabbr{drones} \gdef\@ISSN{2504-446X} } -\DeclareOption{econometrics}{ \gdef\@journal{econometrics} \gdef\@journalshort{Econometrics} \gdef\@journalfull{Econometrics} \gdef\@doiabbr{econometrics} \gdef\@ISSN{2225-1146} } -\DeclareOption{economies}{ \gdef\@journal{economies} \gdef\@journalshort{Economies} \gdef\@journalfull{Economies} \gdef\@doiabbr{economies} \gdef\@ISSN{2227-7099} } -\DeclareOption{education}{ \gdef\@journal{education} \gdef\@journalshort{Educ. 
Sci.} \gdef\@journalfull{Education Sciences} \gdef\@doiabbr{educsci} \gdef\@ISSN{2227-7102} } -\DeclareOption{electrochem}{ \gdef\@journal{electrochem} \gdef\@journalshort{Electrochem} \gdef\@journalfull{Electrochem} \gdef\@doiabbr{electrochem} \gdef\@ISSN{} } -\DeclareOption{electronics}{ \gdef\@journal{electronics} \gdef\@journalshort{Electronics} \gdef\@journalfull{Electronics} \gdef\@doiabbr{electronics} \gdef\@ISSN{2079-9292} } -\DeclareOption{energies}{ \gdef\@journal{energies} \gdef\@journalshort{Energies} \gdef\@journalfull{Energies} \gdef\@doiabbr{en} \gdef\@ISSN{1996-1073} } -\DeclareOption{entropy}{ \gdef\@journal{entropy} \gdef\@journalshort{Entropy} \gdef\@journalfull{Entropy} \gdef\@doiabbr{e} \gdef\@ISSN{1099-4300} } -\DeclareOption{environments}{ \gdef\@journal{environments} \gdef\@journalshort{Environments} \gdef\@journalfull{Environments} \gdef\@doiabbr{environments} \gdef\@ISSN{2076-3298} } -\DeclareOption{epigenomes}{ \gdef\@journal{epigenomes} \gdef\@journalshort{Epigenomes} \gdef\@journalfull{Epigenomes} \gdef\@doiabbr{epigenomes} \gdef\@ISSN{2075-4655} } -\DeclareOption{est}{ \gdef\@journal{est} \gdef\@journalshort{Electrochem. Sci. 
Technol.} \gdef\@journalfull{Electrochemical Science and Technology} \gdef\@doiabbr{} \gdef\@ISSN{} } -\DeclareOption{fermentation}{ \gdef\@journal{fermentation} \gdef\@journalshort{Fermentation} \gdef\@journalfull{Fermentation} \gdef\@doiabbr{fermentation} \gdef\@ISSN{2311-5637} } -\DeclareOption{fibers}{ \gdef\@journal{fibers} \gdef\@journalshort{Fibers} \gdef\@journalfull{Fibers} \gdef\@doiabbr{fib} \gdef\@ISSN{2079-6439} } -\DeclareOption{fire}{ \gdef\@journal{fire} \gdef\@journalshort{Fire} \gdef\@journalfull{Fire} \gdef\@doiabbr{fire} \gdef\@ISSN{2571-6255} } -\DeclareOption{fishes}{ \gdef\@journal{fishes} \gdef\@journalshort{Fishes} \gdef\@journalfull{Fishes} \gdef\@doiabbr{fishes} \gdef\@ISSN{2410-3888} } -\DeclareOption{fluids}{ \gdef\@journal{fluids} \gdef\@journalshort{Fluids} \gdef\@journalfull{Fluids} \gdef\@doiabbr{fluids} \gdef\@ISSN{2311-5521} } -\DeclareOption{foods}{ \gdef\@journal{foods} \gdef\@journalshort{Foods} \gdef\@journalfull{Foods} \gdef\@doiabbr{foods} \gdef\@ISSN{2304-8158} } -\DeclareOption{forecasting}{ \gdef\@journal{forecasting} \gdef\@journalshort{Forecasting} \gdef\@journalfull{Forecasting} \gdef\@doiabbr{forecast} \gdef\@ISSN{2571-9394} } -\DeclareOption{forests}{ \gdef\@journal{forests} \gdef\@journalshort{Forests} \gdef\@journalfull{Forests} \gdef\@doiabbr{f} \gdef\@ISSN{1999-4907} } -\DeclareOption{fractalfract}{ \gdef\@journal{fractalfract} \gdef\@journalshort{Fractal Fract.} \gdef\@journalfull{Fractal and Fractional} \gdef\@doiabbr{fractalfract} \gdef\@ISSN{2504-3110} } -\DeclareOption{futureinternet}{ \gdef\@journal{futureinternet} \gdef\@journalshort{Future Internet} \gdef\@journalfull{Future Internet} \gdef\@doiabbr{fi} \gdef\@ISSN{1999-5903} } -\DeclareOption{futurephys}{ \gdef\@journal{futurephys} \gdef\@journalshort{Future Phys.} \gdef\@journalfull{Future Physics} \gdef\@doiabbr{futurephys} \gdef\@ISSN{2624-6503} } -\DeclareOption{galaxies}{ \gdef\@journal{galaxies} \gdef\@journalshort{Galaxies} 
\gdef\@journalfull{Galaxies} \gdef\@doiabbr{galaxies} \gdef\@ISSN{2075-4434} } -\DeclareOption{games}{ \gdef\@journal{games} \gdef\@journalshort{Games} \gdef\@journalfull{Games} \gdef\@doiabbr{g} \gdef\@ISSN{2073-4336} } -\DeclareOption{gastrointestdisord}{ \gdef\@journal{gastrointestdisord} \gdef\@journalshort{Gastrointest. Disord.} \gdef\@journalfull{Gastrointestinal Disorders} \gdef\@doiabbr{gidisord} \gdef\@ISSN{2624-5647} } -\DeclareOption{gels}{ \gdef\@journal{gels} \gdef\@journalshort{Gels} \gdef\@journalfull{Gels} \gdef\@doiabbr{gels} \gdef\@ISSN{2310-2861} } -\DeclareOption{genealogy}{ \gdef\@journal{genealogy} \gdef\@journalshort{Genealogy} \gdef\@journalfull{Genealogy} \gdef\@doiabbr{genealogy} \gdef\@ISSN{2313-5778} } -\DeclareOption{genes}{ \gdef\@journal{genes} \gdef\@journalshort{Genes} \gdef\@journalfull{Genes} \gdef\@doiabbr{genes} \gdef\@ISSN{2073-4425} } -\DeclareOption{geohazards}{ \gdef\@journal{geohazards} \gdef\@journalshort{GeoHazards} \gdef\@journalfull{GeoHazards} \gdef\@doiabbr{geohazards} \gdef\@ISSN{2624-795X} } -\DeclareOption{geosciences}{ \gdef\@journal{geosciences} \gdef\@journalshort{Geosciences} \gdef\@journalfull{Geosciences} \gdef\@doiabbr{geosciences} \gdef\@ISSN{2076-3263} } -\DeclareOption{geriatrics}{ \gdef\@journal{geriatrics} \gdef\@journalshort{Geriatrics} \gdef\@journalfull{Geriatrics} \gdef\@doiabbr{geriatrics} \gdef\@ISSN{2308-3417} } -\DeclareOption{hazardousmatters}{ \gdef\@journal{hazardousmatters} \gdef\@journalshort{Hazard. 
Matters} \gdef\@journalfull{Hazardous Matters} \gdef\@doiabbr{} \gdef\@ISSN{0014-0014} } -\DeclareOption{healthcare}{ \gdef\@journal{healthcare} \gdef\@journalshort{Healthcare} \gdef\@journalfull{Healthcare} \gdef\@doiabbr{healthcare} \gdef\@ISSN{2227-9032} } -\DeclareOption{heritage}{ \gdef\@journal{heritage} \gdef\@journalshort{Heritage} \gdef\@journalfull{Heritage} \gdef\@doiabbr{heritage} \gdef\@ISSN{2571-9408} } -\DeclareOption{highthroughput}{ \gdef\@journal{highthroughput} \gdef\@journalshort{High-Throughput} \gdef\@journalfull{High-Throughput} \gdef\@doiabbr{ht} \gdef\@ISSN{2571-5135} } -\DeclareOption{horticulturae}{ \gdef\@journal{horticulturae} \gdef\@journalshort{Horticulturae} \gdef\@journalfull{Horticulturae} \gdef\@doiabbr{horticulturae} \gdef\@ISSN{2311-7524} } -\DeclareOption{humanities}{ \gdef\@journal{humanities} \gdef\@journalshort{Humanities} \gdef\@journalfull{Humanities} \gdef\@doiabbr{h} \gdef\@ISSN{2076-0787} } -\DeclareOption{hydrology}{ \gdef\@journal{hydrology} \gdef\@journalshort{Hydrology} \gdef\@journalfull{Hydrology} \gdef\@doiabbr{hydrology} \gdef\@ISSN{2306-5338} } -\DeclareOption{ijerph}{ \gdef\@journal{ijerph} \gdef\@journalshort{Int. J. Environ. Res. Public Health} \gdef\@journalfull{International Journal of Environmental Research and Public Health} \gdef\@doiabbr{ijerph} \gdef\@ISSN{1660-4601} } -\DeclareOption{ijfs}{ \gdef\@journal{ijfs} \gdef\@journalshort{Int. J. Financial Stud.} \gdef\@journalfull{International Journal of Financial Studies} \gdef\@doiabbr{ijfs} \gdef\@ISSN{2227-7072} } -\DeclareOption{ijgi}{ \gdef\@journal{ijgi} \gdef\@journalshort{ISPRS Int. J. Geo-Inf.} \gdef\@journalfull{ISPRS International Journal of Geo-Information} \gdef\@doiabbr{ijgi} \gdef\@ISSN{2220-9964} } -\DeclareOption{ijms}{ \gdef\@journal{ijms} \gdef\@journalshort{Int. J. Mol. 
Sci.} \gdef\@journalfull{International Journal of Molecular Sciences} \gdef\@doiabbr{ijms} \gdef\@ISSN{1422-0067} } -\DeclareOption{ijtpp}{ \gdef\@journal{ijtpp} \gdef\@journalshort{Int. J. Turbomach. Propuls. Power} \gdef\@journalfull{International Journal of Turbomachinery, Propulsion and Power} \gdef\@doiabbr{ijtpp} \gdef\@ISSN{2504-186X} } -\DeclareOption{informatics}{ \gdef\@journal{informatics} \gdef\@journalshort{Informatics} \gdef\@journalfull{Informatics} \gdef\@doiabbr{informatics} \gdef\@ISSN{2227-9709} } -\DeclareOption{information}{ \gdef\@journal{information} \gdef\@journalshort{Information} \gdef\@journalfull{Information} \gdef\@doiabbr{info} \gdef\@ISSN{2078-2489} } -\DeclareOption{infrastructures}{ \gdef\@journal{infrastructures} \gdef\@journalshort{Infrastructures} \gdef\@journalfull{Infrastructures} \gdef\@doiabbr{infrastructures} \gdef\@ISSN{2412-3811} } -\DeclareOption{inorganics}{ \gdef\@journal{inorganics} \gdef\@journalshort{Inorganics} \gdef\@journalfull{Inorganics} \gdef\@doiabbr{inorganics} \gdef\@ISSN{2304-6740} } -\DeclareOption{insects}{ \gdef\@journal{insects} \gdef\@journalshort{Insects} \gdef\@journalfull{Insects} \gdef\@doiabbr{insects} \gdef\@ISSN{2075-4450} } -\DeclareOption{instruments}{ \gdef\@journal{instruments} \gdef\@journalshort{Instruments} \gdef\@journalfull{Instruments} \gdef\@doiabbr{instruments} \gdef\@ISSN{2410-390X} } -\DeclareOption{inventions}{ \gdef\@journal{inventions} \gdef\@journalshort{Inventions} \gdef\@journalfull{Inventions} \gdef\@doiabbr{inventions} \gdef\@ISSN{2411-5134} } -\DeclareOption{iot}{ \gdef\@journal{iot} \gdef\@journalshort{IoT} \gdef\@journalfull{IoT} \gdef\@doiabbr{iot} \gdef\@ISSN{2624-831X} } -\DeclareOption{j}{ \gdef\@journal{j} \gdef\@journalshort{J} \gdef\@journalfull{J} \gdef\@doiabbr{j} \gdef\@ISSN{2571-8800} } -\DeclareOption{jcdd}{ \gdef\@journal{jcdd} \gdef\@journalshort{J. Cardiovasc. Dev. 
Dis.} \gdef\@journalfull{Journal of Cardiovascular Development and Disease} \gdef\@doiabbr{jcdd} \gdef\@ISSN{2308-3425} } -\DeclareOption{jcm}{ \gdef\@journal{jcm} \gdef\@journalshort{J. Clin. Med.} \gdef\@journalfull{Journal of Clinical Medicine} \gdef\@doiabbr{jcm} \gdef\@ISSN{2077-0383} } -\DeclareOption{jcp}{ \gdef\@journal{jcp} \gdef\@journalshort{J. Cybersecur. Priv.} \gdef\@journalfull{Journal of Cybersecurity and Privacy} \gdef\@doiabbr{jcp} \gdef\@ISSN{2624-800X} } -\DeclareOption{jcs}{ \gdef\@journal{jcs} \gdef\@journalshort{J. Compos. Sci.} \gdef\@journalfull{Journal of Composites Science} \gdef\@doiabbr{jcs} \gdef\@ISSN{2504-477X} } -\DeclareOption{jdb}{ \gdef\@journal{jdb} \gdef\@journalshort{J. Dev. Biol.} \gdef\@journalfull{Journal of Developmental Biology} \gdef\@doiabbr{jdb} \gdef\@ISSN{2221-3759} } -\DeclareOption{jfb}{ \gdef\@journal{jfb} \gdef\@journalshort{J. Funct. Biomater.} \gdef\@journalfull{Journal of Functional Biomaterials} \gdef\@doiabbr{jfb} \gdef\@ISSN{2079-4983} } -\DeclareOption{jfmk}{ \gdef\@journal{jfmk} \gdef\@journalshort{J. Funct. Morphol. Kinesiol.} \gdef\@journalfull{Journal of Functional Morphology and Kinesiology} \gdef\@doiabbr{jfmk} \gdef\@ISSN{2411-5142} } -\DeclareOption{jimaging}{ \gdef\@journal{jimaging} \gdef\@journalshort{J. Imaging} \gdef\@journalfull{Journal of Imaging} \gdef\@doiabbr{jimaging} \gdef\@ISSN{2313-433X} } -\DeclareOption{jintelligence}{ \gdef\@journal{jintelligence} \gdef\@journalshort{J. Intell.} \gdef\@journalfull{Journal of Intelligence} \gdef\@doiabbr{jintelligence} \gdef\@ISSN{2079-3200} } -\DeclareOption{jlpea}{ \gdef\@journal{jlpea} \gdef\@journalshort{J. Low Power Electron. Appl.} \gdef\@journalfull{Journal of Low Power Electronics and Applications} \gdef\@doiabbr{jlpea} \gdef\@ISSN{2079-9268} } -\DeclareOption{jmmp}{ \gdef\@journal{jmmp} \gdef\@journalshort{J. Manuf. Mater. 
Process.} \gdef\@journalfull{Journal of Manufacturing and Materials Processing} \gdef\@doiabbr{jmmp} \gdef\@ISSN{2504-4494} } -\DeclareOption{jmse}{ \gdef\@journal{jmse} \gdef\@journalshort{J. Mar. Sci. Eng.} \gdef\@journalfull{Journal of Marine Science and Engineering} \gdef\@doiabbr{jmse} \gdef\@ISSN{2077-1312} } -\DeclareOption{jnt}{ \gdef\@journal{jnt} \gdef\@journalshort{J. Nanotheranostics} \gdef\@journalfull{Journal of Nanotheranostics} \gdef\@doiabbr{jnt} \gdef\@ISSN{2624-845X} } -\DeclareOption{jof}{ \gdef\@journal{jof} \gdef\@journalshort{J. Fungi} \gdef\@journalfull{Journal of Fungi} \gdef\@doiabbr{jof} \gdef\@ISSN{2309-608X} } -\DeclareOption{joitmc}{ \gdef\@journal{joitmc} \gdef\@journalshort{J. Open Innov. Technol. Mark. Complex.} \gdef\@journalfull{Journal of Open Innovation: Technology, Market, and Complexity} \gdef\@doiabbr{joitmc} \gdef\@ISSN{2199-8531} } -\DeclareOption{jpm}{ \gdef\@journal{jpm} \gdef\@journalshort{J. Pers. Med.} \gdef\@journalfull{Journal of Personalized Medicine} \gdef\@doiabbr{jpm} \gdef\@ISSN{2075-4426} } -\DeclareOption{jrfm}{ \gdef\@journal{jrfm} \gdef\@journalshort{J. Risk Financial Manag.} \gdef\@journalfull{Journal of Risk and Financial Management} \gdef\@doiabbr{jrfm} \gdef\@ISSN{1911-8074} } -\DeclareOption{jsan}{ \gdef\@journal{jsan} \gdef\@journalshort{J. Sens. 
Actuator Netw.} \gdef\@journalfull{Journal of Sensor and Actuator Networks} \gdef\@doiabbr{jsan} \gdef\@ISSN{2224-2708} } -\DeclareOption{land}{ \gdef\@journal{land} \gdef\@journalshort{Land} \gdef\@journalfull{Land} \gdef\@doiabbr{land} \gdef\@ISSN{2073-445X} } -\DeclareOption{languages}{ \gdef\@journal{languages} \gdef\@journalshort{Languages} \gdef\@journalfull{Languages} \gdef\@doiabbr{languages} \gdef\@ISSN{2226-471X} } -\DeclareOption{laws}{ \gdef\@journal{laws} \gdef\@journalshort{Laws} \gdef\@journalfull{Laws} \gdef\@doiabbr{laws} \gdef\@ISSN{2075-471X} } -\DeclareOption{life}{ \gdef\@journal{life} \gdef\@journalshort{Life} \gdef\@journalfull{Life} \gdef\@doiabbr{life} \gdef\@ISSN{2075-1729} } -\DeclareOption{literature}{ \gdef\@journal{literature} \gdef\@journalshort{Literature} \gdef\@journalfull{Literature} \gdef\@doiabbr{} \gdef\@ISSN{2410-9789} } -\DeclareOption{logistics}{ \gdef\@journal{logistics} \gdef\@journalshort{Logistics} \gdef\@journalfull{Logistics} \gdef\@doiabbr{logistics} \gdef\@ISSN{2305-6290} } -\DeclareOption{lubricants}{ \gdef\@journal{lubricants} \gdef\@journalshort{Lubricants} \gdef\@journalfull{Lubricants} \gdef\@doiabbr{lubricants} \gdef\@ISSN{2075-4442} } -\DeclareOption{machines}{ \gdef\@journal{machines} \gdef\@journalshort{Machines} \gdef\@journalfull{Machines} \gdef\@doiabbr{machines} \gdef\@ISSN{2075-1702} } -\DeclareOption{magnetochemistry}{ \gdef\@journal{magnetochemistry} \gdef\@journalshort{Magnetochemistry} \gdef\@journalfull{Magnetochemistry} \gdef\@doiabbr{magnetochemistry} \gdef\@ISSN{2312-7481} } -\DeclareOption{make}{ \gdef\@journal{make} \gdef\@journalshort{Mach. Learn. Knowl. Extr.} \gdef\@journalfull{Machine Learning and Knowledge Extraction} \gdef\@doiabbr{make} \gdef\@ISSN{2504-4990} } -\DeclareOption{marinedrugs}{ \gdef\@journal{marinedrugs} \gdef\@journalshort{Mar. 
Drugs} \gdef\@journalfull{Marine Drugs} \gdef\@doiabbr{md} \gdef\@ISSN{1660-3397} } -\DeclareOption{materials}{ \gdef\@journal{materials} \gdef\@journalshort{Materials} \gdef\@journalfull{Materials} \gdef\@doiabbr{ma} \gdef\@ISSN{1996-1944} } -\DeclareOption{mathematics}{ \gdef\@journal{mathematics} \gdef\@journalshort{Mathematics} \gdef\@journalfull{Mathematics} \gdef\@doiabbr{math} \gdef\@ISSN{2227-7390} } -\DeclareOption{mca}{ \gdef\@journal{mca} \gdef\@journalshort{Math. Comput. Appl.} \gdef\@journalfull{Mathematical and Computational Applications} \gdef\@doiabbr{mca} \gdef\@ISSN{2297-8747} } -\DeclareOption{medicina}{ \gdef\@journal{medicina} \gdef\@journalshort{Medicina} \gdef\@journalfull{Medicina} \gdef\@doiabbr{medicina} \gdef\@ISSN{1010-660X} } -\DeclareOption{medicines}{ \gdef\@journal{medicines} \gdef\@journalshort{Medicines} \gdef\@journalfull{Medicines} \gdef\@doiabbr{medicines} \gdef\@ISSN{2305-6320} } -\DeclareOption{medsci}{ \gdef\@journal{medsci} \gdef\@journalshort{Med. 
Sci.} \gdef\@journalfull{Medical Sciences} \gdef\@doiabbr{medsci} \gdef\@ISSN{2076-3271} } -\DeclareOption{membranes}{ \gdef\@journal{membranes} \gdef\@journalshort{Membranes} \gdef\@journalfull{Membranes} \gdef\@doiabbr{membranes} \gdef\@ISSN{2077-0375} } -\DeclareOption{metabolites}{ \gdef\@journal{metabolites} \gdef\@journalshort{Metabolites} \gdef\@journalfull{Metabolites} \gdef\@doiabbr{metabo} \gdef\@ISSN{2218-1989} } -\DeclareOption{metals}{ \gdef\@journal{metals} \gdef\@journalshort{Metals} \gdef\@journalfull{Metals} \gdef\@doiabbr{met} \gdef\@ISSN{2075-4701} } -\DeclareOption{microarrays}{ \gdef\@journal{microarrays} \gdef\@journalshort{Microarrays} \gdef\@journalfull{Microarrays} \gdef\@doiabbr{} \gdef\@ISSN{2076-3905} } -\DeclareOption{micromachines}{ \gdef\@journal{micromachines} \gdef\@journalshort{Micromachines} \gdef\@journalfull{Micromachines} \gdef\@doiabbr{mi} \gdef\@ISSN{2072-666X} } -\DeclareOption{microorganisms}{ \gdef\@journal{microorganisms} \gdef\@journalshort{Microorganisms} \gdef\@journalfull{Microorganisms} \gdef\@doiabbr{microorganisms} \gdef\@ISSN{2076-2607} } -\DeclareOption{minerals}{ \gdef\@journal{minerals} \gdef\@journalshort{Minerals} \gdef\@journalfull{Minerals} \gdef\@doiabbr{min} \gdef\@ISSN{2075-163X} } -\DeclareOption{modelling}{ \gdef\@journal{modelling} \gdef\@journalshort{Modelling} \gdef\@journalfull{Modelling} \gdef\@doiabbr{} \gdef\@ISSN{0012-0012} } -\DeclareOption{molbank}{ \gdef\@journal{molbank} \gdef\@journalshort{Molbank} \gdef\@journalfull{Molbank} \gdef\@doiabbr{M} \gdef\@ISSN{1422-8599} } -\DeclareOption{molecules}{ \gdef\@journal{molecules} \gdef\@journalshort{Molecules} \gdef\@journalfull{Molecules} \gdef\@doiabbr{molecules} \gdef\@ISSN{1420-3049} } -\DeclareOption{mps}{ \gdef\@journal{mps} \gdef\@journalshort{Methods Protoc.} \gdef\@journalfull{Methods and Protocols} \gdef\@doiabbr{mps} \gdef\@ISSN{2409-9279} } -\DeclareOption{mti}{ \gdef\@journal{mti} \gdef\@journalshort{Multimodal Technol. 
Interact.} \gdef\@journalfull{Multimodal Technologies and Interaction} \gdef\@doiabbr{mti} \gdef\@ISSN{2414-4088} } -\DeclareOption{nanomaterials}{ \gdef\@journal{nanomaterials} \gdef\@journalshort{Nanomaterials} \gdef\@journalfull{Nanomaterials} \gdef\@doiabbr{nano} \gdef\@ISSN{2079-4991} } -\DeclareOption{ncrna}{ \gdef\@journal{ncrna} \gdef\@journalshort{Non-coding RNA} \gdef\@journalfull{Non-coding RNA} \gdef\@doiabbr{ncrna} \gdef\@ISSN{2311-553X} } -\DeclareOption{ijns}{ \gdef\@journal{ijns} \gdef\@journalshort{Int. J. Neonatal Screen.} \gdef\@journalfull{International Journal of Neonatal Screening} \gdef\@doiabbr{ijns} \gdef\@ISSN{2409-515X} } -\DeclareOption{neuroglia}{ \gdef\@journal{neuroglia} \gdef\@journalshort{Neuroglia} \gdef\@journalfull{Neuroglia} \gdef\@doiabbr{neuroglia} \gdef\@ISSN{2571-6980} } -\DeclareOption{nitrogen}{ \gdef\@journal{nitrogen} \gdef\@journalshort{Nitrogen} \gdef\@journalfull{Nitrogen} \gdef\@doiabbr{nitrogen} \gdef\@ISSN{2504-3129} } -\DeclareOption{notspecified}{ \gdef\@journal{notspecified} \gdef\@journalshort{Journal Not Specified} \gdef\@journalfull{Journal Not Specified} \gdef\@doiabbr{} \gdef\@ISSN{} } -\DeclareOption{nutrients}{ \gdef\@journal{nutrients} \gdef\@journalshort{Nutrients} \gdef\@journalfull{Nutrients} \gdef\@doiabbr{nu} \gdef\@ISSN{2072-6643} } -\DeclareOption{ohbm}{ \gdef\@journal{ohbm} \gdef\@journalshort{J. Otorhinolaryngol. Hear. 
Balance Med.} \gdef\@journalfull{Journal of Otorhinolaryngology, Hearing and Balance Medicine} \gdef\@doiabbr{ohbm} \gdef\@ISSN{2504-463X} } -\DeclareOption{particles}{ \gdef\@journal{particles} \gdef\@journalshort{Particles} \gdef\@journalfull{Particles} \gdef\@doiabbr{particles} \gdef\@ISSN{2571-712X} } -\DeclareOption{pathogens}{ \gdef\@journal{pathogens} \gdef\@journalshort{Pathogens} \gdef\@journalfull{Pathogens} \gdef\@doiabbr{pathogens} \gdef\@ISSN{2076-0817} } -\DeclareOption{pharmaceuticals}{ \gdef\@journal{pharmaceuticals} \gdef\@journalshort{Pharmaceuticals} \gdef\@journalfull{Pharmaceuticals} \gdef\@doiabbr{ph} \gdef\@ISSN{1424-8247} } -\DeclareOption{pharmaceutics}{ \gdef\@journal{pharmaceutics} \gdef\@journalshort{Pharmaceutics} \gdef\@journalfull{Pharmaceutics} \gdef\@doiabbr{pharmaceutics} \gdef\@ISSN{1999-4923} } -\DeclareOption{pharmacy}{ \gdef\@journal{pharmacy} \gdef\@journalshort{Pharmacy} \gdef\@journalfull{Pharmacy} \gdef\@doiabbr{pharmacy} \gdef\@ISSN{2226-4787} } -\DeclareOption{philosophies}{ \gdef\@journal{philosophies} \gdef\@journalshort{Philosophies} \gdef\@journalfull{Philosophies} \gdef\@doiabbr{philosophies} \gdef\@ISSN{2409-9287} } -\DeclareOption{photonics}{ \gdef\@journal{photonics} \gdef\@journalshort{Photonics} \gdef\@journalfull{Photonics} \gdef\@doiabbr{photonics} \gdef\@ISSN{2304-6732} } -\DeclareOption{physics}{ \gdef\@journal{physics} \gdef\@journalshort{Physics} \gdef\@journalfull{Physics} \gdef\@doiabbr{physics} \gdef\@ISSN{2624-8174} } -\DeclareOption{plants}{ \gdef\@journal{plants} \gdef\@journalshort{Plants} \gdef\@journalfull{Plants} \gdef\@doiabbr{plants} \gdef\@ISSN{2223-7747} } -\DeclareOption{plasma}{ \gdef\@journal{plasma} \gdef\@journalshort{Plasma} \gdef\@journalfull{Plasma} \gdef\@doiabbr{plasma} \gdef\@ISSN{2571-6182} } -\DeclareOption{polymers}{ \gdef\@journal{polymers} \gdef\@journalshort{Polymers} \gdef\@journalfull{Polymers} \gdef\@doiabbr{polym} \gdef\@ISSN{2073-4360} } -\DeclareOption{polysaccharides}{ 
\gdef\@journal{polysaccharides} \gdef\@journalshort{Polysaccharides} \gdef\@journalfull{Polysaccharides} \gdef\@doiabbr{} \gdef\@ISSN{} } -\DeclareOption{preprints}{ \gdef\@journal{preprints} \gdef\@journalshort{Preprints} \gdef\@journalfull{Preprints} \gdef\@doiabbr{} \gdef\@ISSN{} } -\DeclareOption{proceedings}{ \gdef\@journal{proceedings} \gdef\@journalshort{Proceedings} \gdef\@journalfull{Proceedings} \gdef\@doiabbr{proceedings} \gdef\@ISSN{2504-3900} } -\DeclareOption{processes}{ \gdef\@journal{processes} \gdef\@journalshort{Processes} \gdef\@journalfull{Processes} \gdef\@doiabbr{pr} \gdef\@ISSN{2227-9717} } -\DeclareOption{proteomes}{ \gdef\@journal{proteomes} \gdef\@journalshort{Proteomes} \gdef\@journalfull{Proteomes} \gdef\@doiabbr{proteomes} \gdef\@ISSN{2227-7382} } -\DeclareOption{psych}{ \gdef\@journal{psych} \gdef\@journalshort{Psych} \gdef\@journalfull{Psych} \gdef\@doiabbr{psych} \gdef\@ISSN{2624-8611} } -\DeclareOption{publications}{ \gdef\@journal{publications} \gdef\@journalshort{Publications} \gdef\@journalfull{Publications} \gdef\@doiabbr{publications} \gdef\@ISSN{2304-6775} } -\DeclareOption{quantumrep}{ \gdef\@journal{quantumrep} \gdef\@journalshort{Quantum Rep.} \gdef\@journalfull{Quantum Reports} \gdef\@doiabbr{quantum} \gdef\@ISSN{2624-960X} } -\DeclareOption{quaternary}{ \gdef\@journal{quaternary} \gdef\@journalshort{Quaternary} \gdef\@journalfull{Quaternary} \gdef\@doiabbr{quat} \gdef\@ISSN{2571-550X} } -\DeclareOption{qubs}{ \gdef\@journal{qubs} \gdef\@journalshort{Quantum Beam Sci.} \gdef\@journalfull{Quantum Beam Science} \gdef\@doiabbr{qubs} \gdef\@ISSN{2412-382X} } -\DeclareOption{reactions}{ \gdef\@journal{reactions} \gdef\@journalshort{Reactions} \gdef\@journalfull{Reactions} \gdef\@doiabbr{reactions} \gdef\@ISSN{2624-781X} } -\DeclareOption{recycling}{ \gdef\@journal{recycling} \gdef\@journalshort{Recycling} \gdef\@journalfull{Recycling} \gdef\@doiabbr{recycling} \gdef\@ISSN{2313-4321} } -\DeclareOption{religions}{ 
\gdef\@journal{religions} \gdef\@journalshort{Religions} \gdef\@journalfull{Religions} \gdef\@doiabbr{rel} \gdef\@ISSN{2077-1444} } -\DeclareOption{remotesensing}{ \gdef\@journal{remotesensing} \gdef\@journalshort{Remote Sens.} \gdef\@journalfull{Remote Sensing} \gdef\@doiabbr{rs} \gdef\@ISSN{2072-4292} } -\DeclareOption{reports}{ \gdef\@journal{reports} \gdef\@journalshort{Reports} \gdef\@journalfull{Reports} \gdef\@doiabbr{reports} \gdef\@ISSN{2571-841X} } -\DeclareOption{resources}{ \gdef\@journal{resources} \gdef\@journalshort{Resources} \gdef\@journalfull{Resources} \gdef\@doiabbr{resources} \gdef\@ISSN{2079-9276} } -\DeclareOption{risks}{ \gdef\@journal{risks} \gdef\@journalshort{Risks} \gdef\@journalfull{Risks} \gdef\@doiabbr{risks} \gdef\@ISSN{2227-9091} } -\DeclareOption{robotics}{ \gdef\@journal{robotics} \gdef\@journalshort{Robotics} \gdef\@journalfull{Robotics} \gdef\@doiabbr{robotics} \gdef\@ISSN{2218-6581} } -\DeclareOption{safety}{ \gdef\@journal{safety} \gdef\@journalshort{Safety} \gdef\@journalfull{Safety} \gdef\@doiabbr{safety} \gdef\@ISSN{2313-576X} } -\DeclareOption{sci}{ \gdef\@journal{sci} \gdef\@journalshort{Sci} \gdef\@journalfull{Sci} \gdef\@doiabbr{sci} \gdef\@ISSN{2413-4155} } -\DeclareOption{scipharm}{ \gdef\@journal{scipharm} \gdef\@journalshort{Sci. 
Pharm.} \gdef\@journalfull{Scientia Pharmaceutica} \gdef\@doiabbr{scipharm} \gdef\@ISSN{2218-0532} } -\DeclareOption{sensors}{ \gdef\@journal{sensors} \gdef\@journalshort{Sensors} \gdef\@journalfull{Sensors} \gdef\@doiabbr{s} \gdef\@ISSN{1424-8220} } -\DeclareOption{separations}{ \gdef\@journal{separations} \gdef\@journalshort{Separations} \gdef\@journalfull{Separations} \gdef\@doiabbr{separations} \gdef\@ISSN{2297-8739} } -\DeclareOption{sexes}{ \gdef\@journal{sexes} \gdef\@journalshort{Sexes} \gdef\@journalfull{Sexes} \gdef\@doiabbr{} \gdef\@ISSN{2411-5118} } -\DeclareOption{signals}{ \gdef\@journal{signals} \gdef\@journalshort{Signals} \gdef\@journalfull{Signals} \gdef\@doiabbr{signals} \gdef\@ISSN{2624-6120} } -\DeclareOption{sinusitis}{ \gdef\@journal{sinusitis} \gdef\@journalshort{Sinusitis} \gdef\@journalfull{Sinusitis} \gdef\@doiabbr{sinusitis} \gdef\@ISSN{2309-107X} } -\DeclareOption{smartcities}{ \gdef\@journal{smartcities} \gdef\@journalshort{Smart Cities} \gdef\@journalfull{Smart Cities} \gdef\@doiabbr{smartcities} \gdef\@ISSN{2624-6511} } -\DeclareOption{sna}{ \gdef\@journal{sna} \gdef\@journalshort{Sinusitis Asthma} \gdef\@journalfull{Sinusitis and Asthma} \gdef\@doiabbr{sna} \gdef\@ISSN{2624-7003} } -\DeclareOption{societies}{ \gdef\@journal{societies} \gdef\@journalshort{Societies} \gdef\@journalfull{Societies} \gdef\@doiabbr{soc} \gdef\@ISSN{2075-4698} } -\DeclareOption{socsci}{ \gdef\@journal{socsci} \gdef\@journalshort{Soc. 
Sci.} \gdef\@journalfull{Social Sciences} \gdef\@doiabbr{socsci} \gdef\@ISSN{2076-0760} } -\DeclareOption{soilsystems}{ \gdef\@journal{soilsystems} \gdef\@journalshort{Soil Syst.} \gdef\@journalfull{Soil Systems} \gdef\@doiabbr{soilsystems} \gdef\@ISSN{2571-8789} } -\DeclareOption{sports}{ \gdef\@journal{sports} \gdef\@journalshort{Sports} \gdef\@journalfull{Sports} \gdef\@doiabbr{sports} \gdef\@ISSN{2075-4663} } -\DeclareOption{standards}{ \gdef\@journal{standards} \gdef\@journalshort{Standards} \gdef\@journalfull{Standards} \gdef\@doiabbr{} \gdef\@ISSN{2305-6703} } -\DeclareOption{stats}{ \gdef\@journal{stats} \gdef\@journalshort{Stats} \gdef\@journalfull{Stats} \gdef\@doiabbr{stats} \gdef\@ISSN{2571-905X} } -\DeclareOption{surfaces}{ \gdef\@journal{surfaces} \gdef\@journalshort{Surfaces} \gdef\@journalfull{Surfaces} \gdef\@doiabbr{surfaces} \gdef\@ISSN{2571-9637} } -\DeclareOption{surgeries}{ \gdef\@journal{surgeries} \gdef\@journalshort{Surgeries} \gdef\@journalfull{Surgeries} \gdef\@doiabbr{} \gdef\@ISSN{2017-2017} } -\DeclareOption{sustainability}{ \gdef\@journal{sustainability} \gdef\@journalshort{Sustainability} \gdef\@journalfull{Sustainability} \gdef\@doiabbr{su} \gdef\@ISSN{2071-1050} } -\DeclareOption{symmetry}{ \gdef\@journal{symmetry} \gdef\@journalshort{Symmetry} \gdef\@journalfull{Symmetry} \gdef\@doiabbr{sym} \gdef\@ISSN{2073-8994} } -\DeclareOption{systems}{ \gdef\@journal{systems} \gdef\@journalshort{Systems} \gdef\@journalfull{Systems} \gdef\@doiabbr{systems} \gdef\@ISSN{2079-8954} } -\DeclareOption{technologies}{ \gdef\@journal{technologies} \gdef\@journalshort{Technologies} \gdef\@journalfull{Technologies} \gdef\@doiabbr{technologies} \gdef\@ISSN{2227-7080} } -\DeclareOption{test}{ \gdef\@journal{test} \gdef\@journalshort{Test} \gdef\@journalfull{Test} \gdef\@doiabbr{} \gdef\@ISSN{} } -\DeclareOption{toxics}{ \gdef\@journal{toxics} \gdef\@journalshort{Toxics} \gdef\@journalfull{Toxics} \gdef\@doiabbr{toxics} \gdef\@ISSN{2305-6304} } 
-\DeclareOption{toxins}{ \gdef\@journal{toxins} \gdef\@journalshort{Toxins} \gdef\@journalfull{Toxins} \gdef\@doiabbr{toxins} \gdef\@ISSN{2072-6651} } -\DeclareOption{tropicalmed}{ \gdef\@journal{tropicalmed} \gdef\@journalshort{Trop. Med. Infect. Dis.} \gdef\@journalfull{Tropical Medicine and Infectious Disease} \gdef\@doiabbr{tropicalmed} \gdef\@ISSN{2414-6366} } -\DeclareOption{universe}{ \gdef\@journal{universe} \gdef\@journalshort{Universe} \gdef\@journalfull{Universe} \gdef\@doiabbr{universe} \gdef\@ISSN{2218-1997} } -\DeclareOption{urbansci}{ \gdef\@journal{urbansci} \gdef\@journalshort{Urban Sci.} \gdef\@journalfull{Urban Science} \gdef\@doiabbr{urbansci} \gdef\@ISSN{2413-8851} } -\DeclareOption{vaccines}{ \gdef\@journal{vaccines} \gdef\@journalshort{Vaccines} \gdef\@journalfull{Vaccines} \gdef\@doiabbr{vaccines} \gdef\@ISSN{2076-393X} } -\DeclareOption{vehicles}{ \gdef\@journal{vehicles} \gdef\@journalshort{Vehicles} \gdef\@journalfull{Vehicles} \gdef\@doiabbr{vehicles} \gdef\@ISSN{2624-8921} } -\DeclareOption{vetsci}{ \gdef\@journal{vetsci} \gdef\@journalshort{Vet. Sci.} \gdef\@journalfull{Veterinary Sciences} \gdef\@doiabbr{vetsci} \gdef\@ISSN{2306-7381} } -\DeclareOption{vibration}{ \gdef\@journal{vibration} \gdef\@journalshort{Vibration} \gdef\@journalfull{Vibration} \gdef\@doiabbr{vibration} \gdef\@ISSN{2571-631X} } -\DeclareOption{viruses}{ \gdef\@journal{viruses} \gdef\@journalshort{Viruses} \gdef\@journalfull{Viruses} \gdef\@doiabbr{v} \gdef\@ISSN{1999-4915} } -\DeclareOption{vision}{ \gdef\@journal{vision} \gdef\@journalshort{Vision} \gdef\@journalfull{Vision} \gdef\@doiabbr{vision} \gdef\@ISSN{2411-5150} } -\DeclareOption{water}{ \gdef\@journal{water} \gdef\@journalshort{Water} \gdef\@journalfull{Water} \gdef\@doiabbr{w} \gdef\@ISSN{2073-4441} } -\DeclareOption{wem}{ \gdef\@journal{wem} \gdef\@journalshort{Wildl. Ecol. 
Manag.} \gdef\@journalfull{Wildlife Ecology and Management} \gdef\@doiabbr{} \gdef\@ISSN{1234-4321} } -\DeclareOption{wevj}{ \gdef\@journal{wevj} \gdef\@journalshort{World Electric Vehicle Journal} \gdef\@journalfull{World Electric Vehicle Journal} \gdef\@doiabbr{wevj} \gdef\@ISSN{2032-6653} } \ No newline at end of file diff --git a/papers/Mathematics/logo-mdpi.pdf b/papers/Mathematics/logo-mdpi.pdf deleted file mode 100644 index 3791788e9..000000000 Binary files a/papers/Mathematics/logo-mdpi.pdf and /dev/null differ diff --git a/papers/Mathematics/logo-orcid.pdf b/papers/Mathematics/logo-orcid.pdf deleted file mode 100644 index 0c305e3ee..000000000 Binary files a/papers/Mathematics/logo-orcid.pdf and /dev/null differ diff --git a/papers/Mathematics/logo-updates.pdf b/papers/Mathematics/logo-updates.pdf deleted file mode 100644 index c79d6b5c9..000000000 Binary files a/papers/Mathematics/logo-updates.pdf and /dev/null differ diff --git a/papers/Mathematics/mdpi.bst b/papers/Mathematics/mdpi.bst deleted file mode 100644 index d259a0b0b..000000000 --- a/papers/Mathematics/mdpi.bst +++ /dev/null @@ -1,1347 +0,0 @@ -%% Bibliography style for MDPI journals - -ENTRY - { address - archiveprefix % - author - booktitle - chapter - edition - editor - eprint % - doi - howpublished - institution - journal - key - month - note - number - organization - pages - primaryclass % - publisher - school - series - title - type - volume - year - url - urldate - nationality - } - {} - { label extra.label sort.label short.list } - -INTEGERS { output.state before.all mid.sentence after.sentence after.block after.item } - -FUNCTION {init.state.consts} -{ #0 'before.all := - #1 'mid.sentence := - #2 'after.sentence := - #3 'after.block := - #4 'after.item := -} - -STRINGS { s t } - -FUNCTION {output.nonnull} -{ 's := - output.state mid.sentence = - { ", " * write$ } - { output.state after.block = - { add.period$ write$ - newline$ - "\newblock " write$ - } - { output.state before.all = - 
'write$ - { output.state after.item = - {"; " * write$} - {add.period$ " " * write$} - if$} - if$ - } - if$ - mid.sentence 'output.state := - } - if$ - s -} - -FUNCTION {output} -{ duplicate$ empty$ - 'pop$ - 'output.nonnull - if$ -} - -FUNCTION {output.check} -{ 't := - duplicate$ empty$ - { pop$ "empty " t * " in " * cite$ * warning$ } - 'output.nonnull - if$ -} - -FUNCTION {output.checkwoa} -{ 't := - duplicate$ empty$ - { pop$ } - 'output.nonnull - if$ -} - -FUNCTION {fin.entry} -{ add.period$ - write$ - newline$ -} - -FUNCTION {new.block} -{ output.state before.all = - 'skip$ - { after.block 'output.state := } - if$ -} - -FUNCTION {new.sentence} -{ output.state after.block = - 'skip$ - { output.state before.all = - 'skip$ - { after.sentence 'output.state := } - if$ - } - if$ -} - -FUNCTION {not} -{ { #0 } - { #1 } - if$ -} - -FUNCTION {and} -{ 'skip$ - { pop$ #0 } - if$ -} - -FUNCTION {or} -{ { pop$ #1 } - 'skip$ - if$ -} - -FUNCTION {new.block.checka} -{ empty$ - 'skip$ - 'new.block - if$ -} - -FUNCTION {new.block.checkb} -{ empty$ - swap$ empty$ - and - 'skip$ - 'new.block - if$ -} - -FUNCTION {new.sentence.checka} -{ empty$ - 'skip$ - 'new.sentence - if$ -} - -FUNCTION {new.sentence.checkb} -{ empty$ - swap$ empty$ - and - 'skip$ - 'new.sentence - if$ -} - -FUNCTION {field.or.null} -{ duplicate$ empty$ - { pop$ "" } - 'skip$ - if$ -} - -FUNCTION {emphasize} -{ duplicate$ empty$ - { pop$ "" } - { "{\em " swap$ * "}" * } - if$ -} - -FUNCTION {embolden} -{ duplicate$ empty$ - { pop$ "" } - { "{\bf " swap$ * "}" * } - if$ -} - -FUNCTION {website} -{ duplicate$ empty$ - { pop$ "" } - { "\url{" swap$ * "}" * } - if$ -} - -INTEGERS { nameptr namesleft numnames } - -FUNCTION {format.names} -{ 's := - #1 'nameptr := - s num.names$ 'numnames := - numnames 'namesleft := - { namesleft #0 > } - { s nameptr "{vv~}{ll}{, jj}{, f{.}}." 
format.name$ 't := - nameptr #1 > - { - nameptr #10 - #1 + = - numnames #10 - > and - { "others" 't := - #1 'namesleft := } - 'skip$ - if$ - namesleft #1 > - { "; " * t * } - { - s nameptr "{ll}" format.name$ duplicate$ "others" = - { 't := } - { pop$ } - if$ - numnames #2 > - { "" * } - 'skip$ - if$ - t "others" = - {"; " * " et~al." * } - { "; " * t * } - if$ - } - if$ - } - 't - if$ - nameptr #1 + 'nameptr := - namesleft #1 - 'namesleft := - } - while$ -} - -FUNCTION {format.key} -{ empty$ - { key field.or.null } - { "" } - if$ -} - -FUNCTION {format.authors} -{ author empty$ - { "" } - { author format.names } - if$ -} - -FUNCTION {format.editors} -{ editor empty$ - { "" } - { editor format.names - editor num.names$ #1 > - { ", Eds." * } - { ", Ed." * } - if$ - } - if$ -} - - - - -FUNCTION {format.title} -{ title empty$ - { "" } - { title} - if$ -} - -FUNCTION {format.number.patent} -{ number empty$ - { "" } - { nationality empty$ - { number} - { nationality " " * number *} - if$ - } - if$ -} - -FUNCTION {format.full.names} -{'s := - #1 'nameptr := - s num.names$ 'numnames := - numnames 'namesleft := - { namesleft #0 > } - { s nameptr - "{vv~}{ll}" format.name$ 't := - nameptr #1 > - { - namesleft #1 > - { ", " * t * } - { - numnames #2 > - { "," * } - 'skip$ - if$ - t "others" = - { " et~al." 
* } - { " and " * t * } - if$ - } - if$ - } - 't - if$ - nameptr #1 + 'nameptr := - namesleft #1 - 'namesleft := - } - while$ -} - -FUNCTION {author.editor.full} -{ author empty$ - { editor empty$ - { "" } - { editor format.full.names } - if$ - } - { author format.full.names } - if$ -} - - - -FUNCTION {author.full} -{ author empty$ - { "" } - { author format.full.names } - if$ -} - -FUNCTION {editor.full} -{ editor empty$ - { "" } - { editor format.full.names } - if$ -} - -FUNCTION {make.full.names} -{ type$ "book" = - type$ "inbook" = - or - 'author.editor.full - { type$ "proceedings" = - 'editor.full - 'author.full - if$ - } - if$ -} - -FUNCTION {output.bibitem} -{ newline$ - "\bibitem[" write$ - label write$ - ")" make.full.names duplicate$ short.list = - { pop$ } - { * } - if$ - "]{" * write$ - cite$ write$ - "}" write$ - newline$ - "" - before.all 'output.state := -} - -FUNCTION {n.dashify} -{ 't := - "" - { t empty$ not } - { t #1 #1 substring$ "-" = - { t #1 #2 substring$ "--" = not - { "--" * - t #2 global.max$ substring$ 't := - } - { { t #1 #1 substring$ "-" = } - { "-" * - t #2 global.max$ substring$ 't := - } - while$ - } - if$ - } - { t #1 #1 substring$ * - t #2 global.max$ substring$ 't := - } - if$ - } - while$ -} - - -FUNCTION {format.date} -{ year empty$ - { month empty$ - { "" } - { "there's a month but no year in " cite$ * warning$ - month - } - if$ - } - { " " year embolden * } - if$ -} - -FUNCTION {format.bdate} -{ year empty$ - { month empty$ - { "" } - { "there's a month but no year in " cite$ * warning$ - month - } - if$ - } - { " " year * } - if$ -} - -FUNCTION {format.pdate} -{ year empty$ - { month empty$ - { "" } - { "there's a month but no year in " cite$ * warning$ - month - } - if$ - } - { month empty$ - { " " year * } - { " " month * ", " * year * } - if$} - if$ -} - -FUNCTION {format.btitle} -{ title emphasize -} - -FUNCTION {tie.or.space.connect} -{ duplicate$ text.length$ #3 < - { "~" } - { " " } - if$ - swap$ * * -} - -FUNCTION 
{either.or.check} -{ empty$ - 'pop$ - { "can't use both " swap$ * " fields in " * cite$ * warning$ } - if$ -} - -FUNCTION {format.bvolume} -{ volume empty$ - { "" } - { "Vol." volume tie.or.space.connect - series empty$ - 'skip$ - { ", " * series emphasize * } - if$ - "volume and number" number either.or.check - } - if$ -} - -FUNCTION {format.number.series} -{ volume empty$ - { number empty$ - { series field.or.null } - { output.state mid.sentence = - { "number" } - { "Number" } - if$ - number tie.or.space.connect - series empty$ - { "there's a number but no series in " cite$ * warning$ } - { " in " * series * } - if$ - } - if$ - } - { "" } - if$ -} - -FUNCTION {format.edition} -{ edition empty$ - { "" } - { output.state mid.sentence = - { edition "l" change.case$ " ed." * } - { edition "t" change.case$ " ed." * } - if$ - } - if$ -} - -INTEGERS { multiresult } - -FUNCTION {multi.page.check} -{ 't := - #0 'multiresult := - { multiresult not - t empty$ not - and - } - { t #1 #1 substring$ - duplicate$ "-" = - swap$ duplicate$ "," = - swap$ "+" = - or or - { #1 'multiresult := } - { t #2 global.max$ substring$ 't := } - if$ - } - while$ - multiresult -} - -FUNCTION {format.pages} -{ pages empty$ - { "" } - { pages multi.page.check - { "pp." pages n.dashify tie.or.space.connect } - { "p." 
pages tie.or.space.connect } - if$ - } - if$ -} - -FUNCTION {format.vol.num.pages} -{ volume emphasize field.or.null - number empty$ - 'skip$ - { - volume empty$ - { "there's a number but no volume in " cite$ * warning$ } - 'skip$ - if$ - } - if$ - pages empty$ - 'skip$ - { duplicate$ empty$ - { pop$ format.pages } - { ",~" * pages n.dashify * } - if$ - } - if$ -} - -FUNCTION {format.chapter.pages} -{ chapter empty$ - 'format.pages - { type empty$ - { "chapter" } - { type "l" change.case$ } - if$ - chapter tie.or.space.connect - pages empty$ - 'skip$ - { ", " * format.pages * } - if$ - } - if$ -} - -FUNCTION {format.in.ed.booktitle} -{ booktitle empty$ - { "" } - { editor empty$ - { edition empty$ - {"In " booktitle emphasize *} - {"In " booktitle emphasize * ", " * edition * " ed." *} - if$ - } - { edition empty$ - {"In " booktitle emphasize * "; " * format.editors * } - {"In " booktitle emphasize * ", " * edition * " ed." * "; " * format.editors * } - if$ - } - if$ - } - if$ -} - -FUNCTION {format.in.ed.booktitle.proc} -{ booktitle empty$ - { "" } - { editor empty$ - { edition empty$ - {"In Proceedings of the " booktitle *} - {"In Proceedings of the " booktitle * ", " * edition * " ed." *} - if$ - } - { edition empty$ - {"In Proceedings of the " booktitle * "; " * format.editors * } - {"In Proceedings of the " booktitle * ", " * edition * " ed." 
* "; " * format.editors * } - if$ - } - if$ - } - if$ -} - -FUNCTION {format.publisher.and.address} -{ publisher empty$ - {""} - { address empty$ - {publisher } - {publisher ": " * address *} - if$ - } - if$ -} - - - -FUNCTION {empty.misc.check} -{ author empty$ title empty$ howpublished empty$ - month empty$ year empty$ note empty$ - and and and and and - { "all relevant fields are empty in " cite$ * warning$ } - 'skip$ - if$ -} - -FUNCTION {format.thesis.type} -{ type empty$ - 'skip$ - { pop$ - type "t" change.case$ - } - if$ -} - -FUNCTION {format.tr.number} -{ type empty$ - { "Technical Report" } - 'type - if$ - number empty$ - { "t" change.case$ } - { number tie.or.space.connect } - if$ -} - -FUNCTION {format.article.crossref} -{ key empty$ - { journal empty$ - { "need key or journal for " cite$ * " to crossref " * crossref * - warning$ - "" - } - { "In \emph{" journal * "}" * } - if$ - } - { "In " } - if$ - " \citet{" * crossref * "}" * -} - - - -FUNCTION {format.book.crossref} -{ volume empty$ - { "empty volume in " cite$ * "'s crossref of " * crossref * warning$ - "In " - } - { "Vol." 
volume tie.or.space.connect - " of " * - } - if$ - editor empty$ - editor field.or.null author field.or.null = - or - { key empty$ - { series empty$ - { "need editor, key, or series for " cite$ * " to crossref " * - crossref * warning$ - "" * - } - { "{\em " * series * "\/}" * } - if$ - } - { key * } - if$ - } - { "" * } - if$ - " \cite{" * crossref * "}" * -} - -FUNCTION {format.incoll.inproc.crossref} -{ editor empty$ - editor field.or.null author field.or.null = - or - { key empty$ - { booktitle empty$ - { "need editor, key, or booktitle for " cite$ * " to crossref " * - crossref * warning$ - "" - } - { "In {\em " booktitle * "\/}" * } - if$ - } - { "In " key * } - if$ - } - { "In " * } - if$ - " \cite{" * crossref * "}" * -} - -FUNCTION {format.website} -{ url empty$ - { "" } - { "" url website * - urldate empty$ - {"there is url but no urldate in " cite$ * warning$} - { ", accessed on " * urldate *} - if$ - } - if$ -} - - -%% the following function is modified from kp.bst at http://arxiv.org/hypertex/bibstyles/ -FUNCTION {format.eprint} -{eprint empty$ - { ""} - {primaryClass empty$ - {" \href{http://xxx.lanl.gov/abs/" eprint * "}" * "{{\normalfont " * "[" * eprint * "]" * "}}" *} - {archivePrefix empty$ - {" \href{http://xxx.lanl.gov/abs/" eprint * "}" * "{{\normalfont " * "[" * "arXiv:" * primaryClass * "/" * eprint * "]" * "}}" *} - {" \href{http://xxx.lanl.gov/abs/" eprint * "}" * "{{\normalfont " * "[" * archivePrefix * ":" * primaryClass * "/" * eprint * "]" * "}}" *} - if$ - } - if$ - } -if$ -} - - -%% For printing DOI numbers (it is a hyperlink but printed in black) -FUNCTION {formatfull.doi} -{ doi empty$ - { "" } - {"{\url{https://doi.org/" doi * "}}" * } - if$ -} - - - -FUNCTION {article} -{ output.bibitem - format.authors "author" output.check - author format.key output - new.block - format.title "title" output.check - new.block - crossref missing$ - { journal emphasize "journal" output.check - format.date * format.vol.num.pages "" * output - } - { 
format.article.crossref output.nonnull - format.pages output - } - if$ -format.eprint output -new.block -note output -formatfull.doi output -fin.entry -} - -FUNCTION {book} -{ output.bibitem - author empty$ - { format.editors "author and editor" output.check } - { format.authors output.nonnull - crossref missing$ - { "author and editor" editor either.or.check } - 'skip$ - if$ - } - if$ - new.block - format.btitle "title" output.check - format.edition output - after.item 'output.state := - crossref missing$ - { format.bvolume output - format.number.series output - format.publisher.and.address "publisher" output.check -%%% address output - } - { - format.book.crossref output.nonnull - } - if$ - format.bdate "year" output.check - after.item 'output.state := - format.chapter.pages output - format.eprint output - new.block - note output - formatfull.doi output - fin.entry -} - -FUNCTION {booklet} -{ output.bibitem - format.authors output - new.block - format.title "title" output.check - howpublished address new.block.checkb - howpublished output - address output - format.bdate output - format.eprint output - new.block - note output - formatfull.doi output - fin.entry -} - -FUNCTION {inbook} -{ output.bibitem - author empty$ - { format.editors "author and editor" output.check } - { format.authors output.nonnull - crossref missing$ - { "author and editor" editor either.or.check } - 'skip$ - if$ - } - if$ -%%% new.block - format.title "title" output.check - new.block - crossref missing$ - { format.in.ed.booktitle "booktitle" output.check - after.item 'output.state := - format.number.series output -%% new.sentence - format.publisher.and.address "publisher" output.check - format.bdate "year" output.check - after.item 'output.state := - format.bvolume output - format.chapter.pages "chapter and pages" output.check - - } - { format.chapter.pages "chapter and pages" output.check - new.block - format.book.crossref output.nonnull - format.bdate "year" output.check - } - if$ - 
format.eprint output - new.block - note output - formatfull.doi output - fin.entry -} - -FUNCTION {incollection} -{ output.bibitem - format.authors "author" output.check - new.block - format.title "title" output.check - new.sentence - crossref missing$ - { format.in.ed.booktitle "booktitle" output.check - after.item 'output.state := - format.number.series output -% new.sentence - format.publisher.and.address "publisher" output.check - format.bdate "year" output.check - after.item 'output.state := - format.bvolume output - format.chapter.pages output - } - { format.incoll.inproc.crossref output.nonnull - format.chapter.pages output - } - if$ - format.eprint output - new.block - note output - formatfull.doi output - fin.entry -} - -FUNCTION {inproceedings} -{ output.bibitem - format.authors "author" output.check - new.block - format.title "title" output.check - new.block - crossref missing$ - { format.in.ed.booktitle.proc "booktitle" output.check - address empty$ - { organization publisher new.sentence.checkb - organization output - publisher output - format.bdate "year" output.check - } - { after.item 'output.state := - organization output - format.publisher.and.address output.nonnull - format.bdate "year" output.check - after.item 'output.state := - } - if$ - format.number.series output - format.bvolume output - format.pages output - } - { format.incoll.inproc.crossref output.nonnull - format.pages output - } - if$ - format.eprint output - new.block - note output - formatfull.doi output - fin.entry -} - -FUNCTION {conference} { inproceedings } - -FUNCTION {manual} -{ output.bibitem - author empty$ - { organization empty$ - 'skip$ - { organization output.nonnull - address output - } - if$ - } - { format.authors output.nonnull } - if$ - new.block - format.btitle "title" output.check - author empty$ - { organization empty$ - { address new.block.checka - address output - } - 'skip$ - if$ - } - { organization address new.block.checkb - organization output - address 
output - } - if$ - format.edition output - format.bdate output - format.eprint output - new.block - note output - formatfull.doi output - fin.entry -} - -FUNCTION {mastersthesis} -{ output.bibitem - format.authors "author" output.check - new.block - format.title "title" output.check - new.block - "Master's thesis" format.thesis.type output.nonnull - school "school" output.check - address output - format.bdate "year" output.check - format.eprint output - new.block - note output - formatfull.doi output - fin.entry -} - -FUNCTION {misc} -{ output.bibitem - format.authors output - title howpublished new.block.checkb - format.title output - howpublished new.block.checka - howpublished output - format.bdate output - format.eprint output - new.block - note output - formatfull.doi output - fin.entry - empty.misc.check -} - -FUNCTION {phdthesis} -{ output.bibitem - format.authors "author" output.check - new.block - format.title "title" output.check - new.block - "PhD thesis" format.thesis.type output.nonnull - school "school" output.check - address output - format.bdate "year" output.check - format.eprint output - new.block - note output - formatfull.doi output - fin.entry -} - -FUNCTION {proceedings} -{ output.bibitem - editor empty$ - { organization output } - { format.editors output.nonnull } - if$ - new.block - format.btitle "title" output.check - format.bvolume output - format.number.series output - address empty$ - { editor empty$ - { publisher new.sentence.checka } - { organization publisher new.sentence.checkb - organization output - } - if$ - publisher output - format.bdate "year" output.check - } - { address output.nonnull - format.bdate "year" output.check - new.sentence - editor empty$ - 'skip$ - { organization output } - if$ - publisher output - } - if$ - format.eprint output - new.block - note output - formatfull.doi output - fin.entry -} - -FUNCTION {techreport} -{ output.bibitem - format.authors "author" output.check - new.block - format.title "title" 
output.check - new.block - format.tr.number output.nonnull - institution "institution" output.check - address output - format.bdate "year" output.check - format.eprint output - new.block - note output - formatfull.doi output - fin.entry -} - -FUNCTION {unpublished} -{ output.bibitem - format.authors "author" output.check - new.block - format.title "title" output.check - format.eprint output - new.block - note output - formatfull.doi output - fin.entry -} - -FUNCTION {www} -{ output.bibitem - format.authors "author" output.checkwoa - new.block - format.title "title" output.check - new.block - format.website "url" output.check - format.eprint output - new.block - note output - formatfull.doi output - fin.entry -} - -FUNCTION {patent} -{ output.bibitem - format.authors "author" output.check - new.block - format.title "title" output.check - new.block - format.number.patent "number" output.check - mid.sentence 'output.state := - format.pdate "date" output.check - format.eprint output - new.block - note output - formatfull.doi output - fin.entry -} - -READ - -FUNCTION {sortify} -{ purify$ - "l" change.case$ -} - - -INTEGERS { len } - -FUNCTION {chop.word} -{ 's := - 'len := - s #1 len substring$ = - { s len #1 + global.max$ substring$ } - 's - if$ -} - - -FUNCTION {format.lab.names} -{ 's := - s #1 "{vv~}{ll}" format.name$ - s num.names$ duplicate$ - #2 > - { pop$ " \em{et~al.}" * } - { #2 < - 'skip$ - { s #2 "{ff }{vv }{ll}{ jj}" format.name$ "others" = - { " \em{et~al.}" * } - { " and " * s #2 "{vv~}{ll}" format.name$ * } - if$ - } - if$ - } - if$ -} - - -FUNCTION {author.key.label} -{ author empty$ - { key empty$ - { cite$ #1 #3 substring$ } - 'key - if$ - } - { author format.lab.names } - if$ -} - -FUNCTION {author.editor.key.label} -{ author empty$ - { editor empty$ - { key empty$ - { cite$ #1 #3 substring$ } - 'key - if$ - } - { editor format.lab.names } - if$ - } - { author format.lab.names } - if$ -} - -FUNCTION {author.key.organization.label} -{ author empty$ - 
{ key empty$ - { organization empty$ - { cite$ #1 #3 substring$ } - { "The " #4 organization chop.word #3 text.prefix$ } - if$ - } - 'key - if$ - } - { author format.lab.names } - if$ -} - -FUNCTION {editor.key.organization.label} -{ editor empty$ - { key empty$ - { organization empty$ - { cite$ #1 #3 substring$ } - { "The " #4 organization chop.word #3 text.prefix$ } - if$ - } - 'key - if$ - } - { editor format.lab.names } - if$ -} - -FUNCTION {calc.short.authors} -{ type$ "book" = - type$ "inbook" = - or - 'author.editor.key.label - { type$ "proceedings" = - 'editor.key.organization.label - { type$ "manual" = - 'author.key.organization.label - 'author.key.label - if$ - } - if$ - } - if$ - 'short.list := -} - -FUNCTION {calc.label} -{ calc.short.authors - short.list - "(" - * - year duplicate$ empty$ - short.list key field.or.null = or - { pop$ "" } - 'skip$ - if$ - * - 'label := -} - -INTEGERS { seq.num } - -FUNCTION {init.seq} -{ #0 'seq.num :=} - -EXECUTE {init.seq} - -FUNCTION {int.to.fix} -{ "000000000" swap$ int.to.str$ * - #-1 #10 substring$ -} - - -FUNCTION {presort} -{ calc.label - label sortify - " " - * - seq.num #1 + 'seq.num := - seq.num int.to.fix - 'sort.label := - sort.label * - #1 entry.max$ substring$ - 'sort.key$ := -} - -ITERATE {presort} - - -STRINGS { longest.label last.label next.extra } - -INTEGERS { longest.label.width last.extra.num number.label } - -FUNCTION {initialize.longest.label} -{ "" 'longest.label := - #0 int.to.chr$ 'last.label := - "" 'next.extra := - #0 'longest.label.width := - #0 'last.extra.num := - #0 'number.label := -} - -FUNCTION {forward.pass} -{ last.label label = - { last.extra.num #1 + 'last.extra.num := - last.extra.num int.to.chr$ 'extra.label := - } - { "a" chr.to.int$ 'last.extra.num := - "" 'extra.label := - label 'last.label := - } - if$ - number.label #1 + 'number.label := -} - -FUNCTION {reverse.pass} -{ next.extra "b" = - { "a" 'extra.label := } - 'skip$ - if$ - extra.label 'next.extra := - extra.label - 
duplicate$ empty$ - 'skip$ - { "{\natexlab{" swap$ * "}}" * } - if$ - 'extra.label := - label extra.label * 'label := -} - -EXECUTE {initialize.longest.label} - -ITERATE {forward.pass} - -REVERSE {reverse.pass} - -FUNCTION {begin.bib} -{ "\begin{thebibliography}{999}" - write$ newline$ -} - -EXECUTE {begin.bib} - -EXECUTE {init.state.consts} - -ITERATE {call.type$} - -FUNCTION {end.bib} -{ newline$ - "\end{thebibliography}" write$ newline$ -} - -EXECUTE {end.bib} - - diff --git a/papers/Mathematics/mdpi.cls b/papers/Mathematics/mdpi.cls deleted file mode 100644 index aad304f8c..000000000 --- a/papers/Mathematics/mdpi.cls +++ /dev/null @@ -1,1176 +0,0 @@ -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -%% %% MDPI class for LaTeX files 15.2.2019 b -%% %% For any information please send an e-mail to: -%% %% latex@mdpi.com -%% %% -%% %% Initial class provided by: -%% %% Stefano Mariani -%% %% Modified by: -%% %% Dietrich Rordorf -%% %% Peter Harremoes -%% %% Zeno Schumacher -%% %% Maddalena Giulini -%% %% Andres Gartmann -%% %% Dr. Janine Daum -%% %% Versions: -%% %% v1.0 before Dr. Janine Daum -%% %% v2.0 when Dr. 
Janine Daum started (March 2013) -%% %% v3.0 after layout change (September 2015) -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% - -%% IDENTIFICATION -\NeedsTeXFormat{LaTeX2e} -\ProvidesClass{mdpi}[15/02/2019 MDPI paper class] - -%%%% Copyright and citebox - \AtEndDocument{\par \cright} - -%% PRELIMINARY DECLARATIONS -\LoadClass[10pt,a4paper]{article} -\RequirePackage[T1]{fontenc} -\RequirePackage[utf8]{inputenc} -\RequirePackage{calc} -\RequirePackage{indentfirst} -\RequirePackage{fancyhdr} -\RequirePackage{graphicx,epstopdf} -\RequirePackage{lastpage} -\RequirePackage{ifthen} -\RequirePackage{float} -\RequirePackage{amsmath} -% TODO: Currently lineno needs to be loaded after amsmath because of conflict -% https://github.com/latex-lineno/lineno/issues/5 -\RequirePackage{lineno} -\RequirePackage{setspace} -\RequirePackage{enumitem} -\RequirePackage{mathpazo} -\RequirePackage{booktabs} % For \toprule etc. in tables -\RequirePackage[largestsep]{titlesec} -\RequirePackage{etoolbox} % For \AtBeginDocument etc. -\RequirePackage{tabto} % To use tab for alignment on first page -\RequirePackage[table]{xcolor} % To provide color for soul (for english editing) and provide coloring for tables (author request) -\RequirePackage{soul} % To highlight text -\newcommand{\highlight}[1]{\colorbox{yellow}{#1}} -\RequirePackage{multirow} -\RequirePackage{microtype} % For command \textls[]{} -\RequirePackage{tikz} % For \foreach used for Orcid icon -\RequirePackage{totcount} % To enable extracting the value of the counter "page" - - -%% OPTIONS -%% To choose the journal -% All journals (website name, full name, short name, DOI abbreviation, and ISSN) are defined in an extra file. -% This is the same as for mdpi.cls. -\input{journalnames} -\DeclareOption{journal}{\ClassWarning{mdpi}{You used an invalid journal name or you have not specified the journal. The first option of the documentclass command specifies the journal. 
The word 'journal' should be replaced by one of the journal names specified in template.tex (in the comment 'Choose between the following MDPI journal').}} - -%% To choose the type of manuscript -\DeclareOption{abstract}{\gdef\@arttype{Abstract}} -\DeclareOption{addendum}{\gdef\@arttype{Addendum}} -\DeclareOption{article}{\gdef\@arttype{Article}} -\DeclareOption{benchmark}{\gdef\@arttype{Benchmark}} -\DeclareOption{book}{\gdef\@arttype{Book}} -\DeclareOption{bookreview}{\gdef\@arttype{Book Review}} -\DeclareOption{briefreport}{\gdef\@arttype{Brief Report}} -\DeclareOption{casereport}{\gdef\@arttype{Case Report}} -\DeclareOption{changes}{\gdef\@arttype{Changes}} -\DeclareOption{comment}{\gdef\@arttype{Comment}} -\DeclareOption{commentary}{\gdef\@arttype{Commentary}} -\DeclareOption{communication}{\gdef\@arttype{Communication}} -\DeclareOption{conceptpaper}{\gdef\@arttype{Concept Paper}} -\DeclareOption{conferenceproceedings}{\gdef\@arttype{Proceedings}} -\DeclareOption{correction}{\gdef\@arttype{Correction}} -\DeclareOption{conferencereport}{\gdef\@arttype{Conference Report}} -\DeclareOption{expressionofconcern}{\gdef\@arttype{Expression of Concern}} -\DeclareOption{extendedabstract}{\gdef\@arttype{Extended Abstract}} -\DeclareOption{meetingreport}{\gdef\@arttype{Meeting Report}} -\DeclareOption{creative}{\gdef\@arttype{Creative}} -\DeclareOption{datadescriptor}{\gdef\@arttype{Data Descriptor}} -\DeclareOption{discussion}{\gdef\@arttype{Discussion}} -\DeclareOption{editorial}{\gdef\@arttype{Editorial}} -\DeclareOption{essay}{\gdef\@arttype{Essay}} -\DeclareOption{erratum}{\gdef\@arttype{Erratum}} -\DeclareOption{hypothesis}{\gdef\@arttype{Hypothesis}} -\DeclareOption{interestingimages}{\gdef\@arttype{Interesting Images}} -\DeclareOption{letter}{\gdef\@arttype{Letter}} -\DeclareOption{meetingreport}{\gdef\@arttype{Meeting Report}} -\DeclareOption{newbookreceived}{\gdef\@arttype{New Book Received}} -\DeclareOption{obituary}{\gdef\@arttype{Obituary}} 
-\DeclareOption{opinion}{\gdef\@arttype{Opinion}} -\DeclareOption{projectreport}{\gdef\@arttype{Project Report}} -\DeclareOption{reply}{\gdef\@arttype{Reply}} -\DeclareOption{retraction}{\gdef\@arttype{Retraction}} -\DeclareOption{review}{\gdef\@arttype{Review}} -\DeclareOption{perspective}{\gdef\@arttype{Perspective}} -\DeclareOption{protocol}{\gdef\@arttype{Protocol}} -\DeclareOption{shortnote}{\gdef\@arttype{Short Note}} -\DeclareOption{supfile}{\gdef\@arttype{Supfile}} -\DeclareOption{technicalnote}{\gdef\@arttype{Technical Note}} -\DeclareOption{viewpoint}{\gdef\@arttype{Viewpoint}} - -%% To choose the status of the manuscript -\DeclareOption{submit}{\gdef\@status{submit}} -\DeclareOption{accept}{\gdef\@status{accept}} - -%% To choose the whether there is one or more authors -\DeclareOption{oneauthor}{\gdef\@authornum{author}} -\DeclareOption{moreauthors}{\gdef\@authornum{authors}} - -%% Add the chosen options to the class -\DeclareOption*{\PassOptionsToClass{\CurrentOption}{article}} - -%% Defaults -\ExecuteOptions{notspecified,10pt,a4paper,article,submit,oneauthor} - -%% Process options -\ProcessOptions\relax - -%% MORE DECLARATIONS -%%%% Maths environments -\RequirePackage{amsthm} -\newtheoremstyle{mdpi}% name -{12pt}% space above -{12pt}% space below -{\itshape}% body font -{}% indent amount 1 -{\bfseries}% theorem head font -{.}% punctuation after theorem head -{.5em}% space after theorem head -{}% theorem head spec (can be left empty, meaning `normal') - -\renewcommand{\qed}{\unskip\nobreak\quad\qedsymbol} %% This places the symbol right after the text instead of placing it at the end on the line. 
- -\renewenvironment{proof}[1][\proofname]{\par %% \proofname allows to have "Proof of my theorem" - \pushQED{\qed}% - \normalfont \topsep6\p@\@plus6\p@\relax - \trivlist - \item[\hskip\labelsep - \bfseries %% "Proof" is bold - #1\@addpunct{.}]\ignorespaces %% Period instead of colon -}{% - \popQED\endtrivlist\@endpefalse -} - - \theoremstyle{mdpi} - \newcounter{theorem} - \setcounter{theorem}{0} - \newtheorem{Theorem}[theorem]{Theorem} - - \newcounter{lemma} - \setcounter{lemma}{0} - \newtheorem{Lemma}[lemma]{Lemma} - - \newcounter{corollary} - \setcounter{corollary}{0} - \newtheorem{Corollary}[corollary]{Corollary} - - \newcounter{proposition} - \setcounter{proposition}{0} - \newtheorem{Proposition}[proposition]{Proposition} - - \newcounter{characterization} - \setcounter{characterization}{0} - \newtheorem{Characterization}[characterization]{Characterization} - - \newcounter{property} - \setcounter{property}{0} - \newtheorem{Property}[property]{Property} - - \newcounter{problem} - \setcounter{problem}{0} - \newtheorem{Problem}[problem]{Problem} - - \newcounter{example} - \setcounter{example}{0} - \newtheorem{Example}[example]{Example} - - \newcounter{examplesanddefinitions} - \setcounter{examplesanddefinitions}{0} - \newtheorem{ExamplesandDefinitions}[examplesanddefinitions]{Examples and Definitions} - - \newcounter{remark} - \setcounter{remark}{0} - \newtheorem{Remark}[remark]{Remark} - - \newcounter{definition} - \setcounter{definition}{0} - \newtheorem{Definition}[definition]{Definition} - - \newcounter{hypothesis} - \setcounter{hypothesis}{0} - \newtheorem{Hypothesis}[hypothesis]{Hypothesis} - - \newcounter{notation} - \setcounter{notation}{0} - \newtheorem{Notation}[notation]{Notation} - -%%%% Hyphenation -\RequirePackage[none]{hyphenat} -\sloppy - -%%%% References -\RequirePackage[sort&compress,sectionbib]{natbib} % option sectionbib is for optionally organizing references using sections (author request) - -\ifthenelse{\equal{\@journal}{admsci} -\OR 
\equal{\@journal}{arts} -\OR \equal{\@journal}{econometrics} -\OR \equal{\@journal}{economies} -\OR \equal{\@journal}{genealogy} -\OR \equal{\@journal}{humanities} -\OR \equal{\@journal}{ijfs} -\OR \equal{\@journal}{jrfm} -\OR \equal{\@journal}{languages} -\OR \equal{\@journal}{laws} -\OR \equal{\@journal}{religions} -\OR \equal{\@journal}{risks} -\OR \equal{\@journal}{socsci}}{% - \bibliographystyle{chicago2} - \bibpunct{(}{)}{;}{x}{}{}% - }{% - \bibliographystyle{mdpi} - \bibpunct{[}{]}{,}{n}{}{}% - }% - -\renewcommand\NAT@set@cites{% - \ifNAT@numbers - \ifNAT@super \let\@cite\NAT@citesuper - \def\NAT@mbox##1{\unskip\nobreak\textsuperscript{##1}}% - \let\citeyearpar=\citeyear - \let\NAT@space\relax - \def\NAT@super@kern{\kern\p@}% - \else - \let\NAT@mbox=\mbox - \let\@cite\NAT@citenum - \let\NAT@space\relax - \let\NAT@super@kern\relax - \fi - \let\@citex\NAT@citexnum - \let\@biblabel\NAT@biblabelnum - \let\@bibsetup\NAT@bibsetnum - \renewcommand\NAT@idxtxt{\NAT@name\NAT@spacechar\NAT@open\NAT@num\NAT@close}% - \def\natexlab##1{}% - \def\NAT@penalty{\penalty\@m}% - \else - \let\@cite\NAT@cite - \let\@citex\NAT@citex - \let\@biblabel\NAT@biblabel - \let\@bibsetup\NAT@bibsetup - \let\NAT@space\NAT@spacechar - \let\NAT@penalty\@empty - \renewcommand\NAT@idxtxt{\NAT@name\NAT@spacechar\NAT@open\NAT@date\NAT@close}% - \def\natexlab##1{##1}% - \fi} - -%%%%% Hyperlinks -%% Define color for citations -\definecolor{bluecite}{HTML}{0875b7} - -\ifthenelse{\equal{\@arttype}{Book}}{ - \RequirePackage[unicode=true, - bookmarksopen={true}, - pdffitwindow=true, - colorlinks=true, - linkcolor=black, - citecolor=black, - urlcolor=black, - hyperfootnotes=false, - pdfstartview={FitH}, - pdfpagemode=UseNone]{hyperref} - }{ - \RequirePackage[unicode=true, - bookmarksopen={true}, - pdffitwindow=true, - colorlinks=true, - linkcolor=bluecite, - citecolor=bluecite, - urlcolor=bluecite, - hyperfootnotes=false, - pdfstartview={FitH}, - pdfpagemode= UseNone]{hyperref} -} - -%% To have the 
possibility to change the urlcolor -\newcommand{\changeurlcolor}[1]{\hypersetup{urlcolor=#1}} - -%% Metadata -\newcommand{\org@maketitle}{}% LATEX-Check -\let\org@maketitle\maketitle -\def\maketitle{% - \hypersetup{ - pdftitle={\@Title}, - pdfsubject={\@abstract}, - pdfkeywords={\@keyword}, - pdfauthor={\@AuthorNames} - }% - \org@maketitle -} - -%%%% Footnotes -\RequirePackage[hang]{footmisc} -\setlength{\skip\footins}{1.2cm} -\setlength{\footnotemargin}{5mm} -\def\footnoterule{\kern-14\p@ -\hrule \@width 2in \kern 11.6\p@} - -%%%% URL -\RequirePackage{url} -\urlstyle{same} -\g@addto@macro{\UrlBreaks}{\UrlOrds} - -%%%% Widows & orphans -\clubpenalty=10000 -\widowpenalty=10000 -\displaywidowpenalty=10000 - -%%%% Front matter -\newcommand{\firstargument}{} -\newcommand{\Title}[1]{\gdef\@Title{#1}}% -\newcommand{\Author}[1]{\gdef\@Author{#1}}% -\def\@AuthorNames{} -\newcommand{\AuthorNames}[1]{\gdef\@AuthorNames{#1}}% -\newcommand{\firstpage}[1]{\gdef\@firstpage{#1}} -\newcommand{\doinum}[1]{\gdef\@doinum{#1}} - -% DOI number -\newcommand\twodigits[1]{% -\ifnum#1<10 -0\number#1 - \else -\number#1 -\fi -} - -\newcommand\fourdigits[1]{% -\ifnum#1<10 000\number#1 - \else - \ifnum#1<100 00\number#1 - \else - \ifnum#1<1000 0\number#1 - \else - \ifnum#1<10000 \number#1 - \else - error - \fi - \fi - \fi - \fi -} - - -\ifthenelse{\equal{\@journal}{molbank}}{ - \doinum{10.3390/\@articlenumber} - }{ - \doinum{10.3390/\@doiabbr\@pubvolume\twodigits\@issuenum\fourdigits\@articlenumber} -} - - -\newcommand{\pubvolume}[1]{\gdef\@pubvolume{#1}} -\newcommand{\pubyear}[1]{\gdef\@pubyear{#1}} -\newcommand{\copyrightyear}[1]{\gdef\@copyrightyear{#1}} -\newcommand{\address}[2][]{\renewcommand{\firstargument}{#1}\gdef\@address{#2}} -\newcommand{\corresfirstargument}{} -\def\@corres{} -\newcommand{\corres}[2][]{\renewcommand{\corresfirstargument}{#1}\gdef\@corres{#2}} -\def\@conference{} -\newcommand{\conference}[1]{\gdef\@conference{#1}}% -\def\@abstract{} 
-\renewcommand{\abstract}[1]{\gdef\@abstract{#1}} -\def\@externaleditor{} -\newcommand{\externaleditor}[1]{\gdef\@externaleditor{#1}} -\def\@LSID{} -\newcommand{\LSID}[1]{\gdef\@LSID{#1}} -\newcommand{\history}[1]{\gdef\@history{#1}} -\def\@pacs{} -\newcommand{\PACS}[1]{\gdef\@pacs{#1}} -\def\@msc{} -\newcommand{\MSC}[1]{\gdef\@msc{#1}} -\def\@jel{} -\newcommand{\JEL}[1]{\gdef\@jel{#1}} -\def\@keyword{} -\newcommand{\keyword}[1]{\gdef\@keyword{#1}} -\def\@dataset{} -\newcommand{\dataset}[1]{\gdef\@dataset{#1}} -\def\@datasetlicense{} -\newcommand{\datasetlicense}[1]{\gdef\@datasetlicense{#1}} -\def\@featuredapplication{} -\newcommand{\featuredapplication}[1]{\gdef\@featuredapplication{#1}} -\def\@keycontribution{} -\newcommand{\keycontribution}[1]{\gdef\@keycontribution{#1}} - - -\def\@issuenum{} -\newcommand{\issuenum}[1]{\gdef\@issuenum{#1}} -\def\@updates{} -\newcommand{\updates}[1]{\gdef\@updates{#1}} - -\def\@firstnote{} -\newcommand{\firstnote}[1]{\gdef\@firstnote{#1}} -\def\@secondnote{} -\newcommand{\secondnote}[1]{\gdef\@secondnote{#1}}% -\def\@thirdnote{} -\newcommand{\thirdnote}[1]{\gdef\@thirdnote{#1}}% -\def\@fourthnote{} -\newcommand{\fourthnote}[1]{\gdef\@fourthnote{#1}}% -\def\@fifthnote{} -\newcommand{\fifthnote}[1]{\gdef\@fifthnote{#1}}% -\def\@sixthnote{} -\newcommand{\sixthnote}[1]{\gdef\@sixthnote{#1}}% -\def\@seventhnote{} -\newcommand{\seventhnote}[1]{\gdef\@seventhnote{#1}}% -\def\@eighthnote{} -\newcommand{\eighthnote}[1]{\gdef\@eighthnote{#1}}% - -\def\@simplesumm{} -\newcommand{\simplesumm}[1]{\gdef\@simplesumm{#1}} -\newcommand{\articlenumber}[1]{\gdef\@articlenumber{#1}} - -\def\@externalbibliography{} -\newcommand{\externalbibliography}[1]{\gdef\@externalbibliography{#1}} - -\def\@reftitle{} -\newcommand{\reftitle}[1]{\gdef\@reftitle{#1}} - -% For transition period to change back to continuous page numbers -\def\@continuouspages{} -\newcommand{\continuouspages}[1]{\gdef\@continuouspages{#1}} - - -%% ORCID -% Make Orcid icon 
-\newcommand{\orcidicon}{\includegraphics[width=0.32cm]{logo-orcid.pdf}} - -% Define link and button for each author -\foreach \x in {A, ..., Z}{% -\expandafter\xdef\csname orcid\x\endcsname{\noexpand\href{https://orcid.org/\csname orcidauthor\x\endcsname}{\noexpand\orcidicon}} -} - -%%%% Journal name for the header -\newcommand{\journalname}{\@journalshort} - - -\regtotcounter{page} % to enable extracting the value of the counter "page" using the totcount package - -%%%% Header and footer on first page -%% The plain page style needs to be redefined because with \maketitle in the article class, LaTeX applies the the plain page style automatically to the first page. -\ifthenelse{\equal{\@journal}{preprints} % - \OR \equal{\@arttype}{Book}}{% - \fancypagestyle{plain}{% - \fancyhf{} - \ifthenelse{\equal{\@arttype}{Book}}{ - \fancyfoot[C]{\footnotesize\thepage} - }{% - } - } - }{% - \ifthenelse{\equal{\@arttype}{Supfile}}{ - \fancypagestyle{plain}{ - \fancyhf{} - \fancyhead[R]{ - \footnotesize % - S\thepage{} of S\pageref*{LastPage}% - }% - \fancyhead[L]{ - \footnotesize % - \ifthenelse{\equal{\@status}{submit}}{% - Version {\@ \today} submitted to {\em\journalname}% - }{% - {\em \journalname} % - {\bfseries \@pubyear}, % - {\em \@pubvolume}, % - \ifthenelse{\equal{\@continuouspages}{\@empty}}{% - \@firstpage --\pageref*{LastPage}% - }{% - \@articlenumber% - }% - ; doi:{\changeurlcolor{black}% - \href{http://dx.doi.org/\@doinum}% - {\@doinum}}% - }% - }% - }% - }{ - \fancypagestyle{plain}{ - \fancyhf{} - \fancyfoot[L]{ - \footnotesize% - \ifthenelse{\equal{\@status}{submit}}{% - Submitted to {\em\journalname}, % - pages \thepage \ -- \color{black}{\pageref*{LastPage}}% - }{ - {\em \journalname}\ % - {\bfseries \@pubyear}, % - {\em \@pubvolume}, % - \ifthenelse{\equal{\@continuouspages}{\@empty}}{% - \@articlenumber% - }{% - \@firstpage\ifnumcomp{\totvalue{page}-1}{=}{\@firstpage}{}{--\pageref*{LastPage}}% - }% - ; doi:{\changeurlcolor{black}% - 
\href{http://dx.doi.org/\@doinum}% - {\@doinum}}% - }% - }% - \fancyfoot[R]{ - \footnotesize% - {\changeurlcolor{black}% - \href{http://www.mdpi.com/journal/\@journal}% - {www.mdpi.com/journal/\@journal}}% - }% - \fancyhead{} - \renewcommand{\headrulewidth}{0.0pt}% - } - }% - }% - -%%%% Maketitle part 1: Logo, Arttype, Title, Author -\renewcommand{\@maketitle}{ - \begin{flushleft} - \ifthenelse{\equal{\@arttype}{Supfile}}{% - \fontsize{18}{18}\selectfont - \raggedright - \noindent\textbf{Supplementary Materials: \@Title}% - \par - \vspace{12pt} - \fontsize{10}{10}\selectfont - \noindent\boldmath\bfseries{\@Author} - }{% - \ifthenelse{\equal{\@arttype}{Book}}{}{% - \vspace*{-1.75cm} - } - {%0 - \ifthenelse{\equal{\@journal}{preprints} - \OR \equal{\@arttype}{Book}}{}{% - \ifthenelse{\equal{\@status}{submit}}{% - \hfill \href{http://www.mdpi.com}{% - \includegraphics[height=1cm]{logo-mdpi.pdf}}\vspace{0.5cm}% - }{ - \href{http://www.mdpi.com/journal/\@journal}{ - \includegraphics[height=1.2cm]{\@journal-logo.eps}}% - \hfill - \ifthenelse{\equal{\@journal}{proceedings}}{ - \href{http://www.mdpi.com/journal/\@journal}{ - \includegraphics[height=1.2cm]{logo-conference.eps} - \hfill} - }{} - \ifthenelse{\equal{\@journal}{scipharm}}{% - \href{http://www.mdpi.com}{\includegraphics[height=1cm]{logo-mdpi-scipharm.eps}}% - }{% - \href{http://www.mdpi.com}{\includegraphics[height=1cm]{logo-mdpi.pdf}}% - }% - }% - }% - \par - }%0 - {%1 - \vspace{14pt} - \fontsize{10}{10}\selectfont - \ifthenelse{\equal{\@arttype}{Book}}{}{ - \textit{\@arttype}% - }% - \par% - }%1 - {%2 - \vspace{-1pt} - \fontsize{18}{18}\selectfont - \boldmath\bfseries{\@Title} - \par - \vspace{15pt} - }%2 - {%3 - \boldmath\bfseries{\@Author} - \par - \vspace{-4pt} - }%3 - } - \end{flushleft}% - } - -% Commands for hanging indent -\newcommand{\dist}{1.7em} -\newcommand{\hang}{\hangafter=1\hangindent=\dist\noindent} - -%%%% Maketitle part 2 -\newcommand{\maketitlen}{ 
-\ifthenelse{\equal{\@arttype}{Book}}{\vspace{12pt}}{ - \begin{flushleft} - \begin{spacing}{1.35} - \leftskip0.2cm - \fontsize{9}{9}\selectfont - {% - \ifthenelse{\equal{\firstargument}{1}}{}{% - \hang}\@address - \par - }% - {% - \ifthenelse{\equal{\@authornum}{author}}{}{% - \ifthenelse{\equal{\@corres}{\@empty}}{}{% - \hang\textbf{*} \tabto{\dist} \@corres} - \par - } - }% - {% - \ifthenelse{\equal{\@conference}{\@empty}}{}{% - \hang$\dagger$ \tabto{\dist} This paper is an extended version of our paper published in\space \@conference.} - \par - }% - {% - \ifthenelse{\equal{\@firstnote}{\@empty}}{}{% - \hang\ifthenelse{\equal{\@conference}{\@empty}}{$\dagger$}{$\ddagger$} \tabto{\dist} \@firstnote} - \par - }% - {% - \ifthenelse{\equal{\@secondnote}{\@empty}}{}{% - \hang \ifthenelse{\equal{\@conference}{\@empty}}{$\ddagger$}{\S} \tabto{\dist} \@secondnote} - \par - }% - {% - \ifthenelse{\equal{\@thirdnote}{\@empty}}{}{% - \hang \ifthenelse{\equal{\@conference}{\@empty}}{\S}{$\|$} \tabto{\dist} \@thirdnote} - \par - }% - {% - \ifthenelse{\equal{\@fourthnote}{\@empty}}{}{% - \hang \ifthenelse{\equal{\@conference}{\@empty}}{$\|$}{\P} \tabto{\dist} \@fourthnote} - \par - }% - {% - \ifthenelse{\equal{\@fifthnote}{\@empty}}{}{% - \hang \ifthenelse{\equal{\@conference}{\@empty}}{\P}{**} \tabto{\dist} \@fifthnote} - \par - }% - {% - \ifthenelse{\equal{\@sixthnote}{\@empty}}{}{% - \hang \ifthenelse{\equal{\@conference}{\@empty}}{**}{$\dagger\dagger$} \tabto{\dist} \@sixthnote} - \par - }% - {% - \ifthenelse{\equal{\@seventhnote}{\@empty}}{}{% - \hang \ifthenelse{\equal{\@conference}{\@empty}}{$\dagger\dagger$}{$\ddagger\ddagger$} \tabto{\dist} \@seventhnote} - \par - }% - {% - \ifthenelse{\equal{\@eighthnote}{\@empty}}{}{% - \hang \ifthenelse{\equal{\@conference}{\@empty}}{$\ddagger\ddagger$}{***} \tabto{\dist} \@eighthnote} - \par - }% - \vspace{6pt} - \ifthenelse{\equal{\@updates}{\@empty}}{ - \ifthenelse{\equal{\@externaleditor}{\@empty}}{}{\@externaleditor} - \par - 
\ifthenelse{\equal{\@LSID}{\@empty}}{}{\@LSID} - \par - \ifthenelse{\equal{\@status}{submit}}{ - Version {\@ \today} submitted to \journalname - }{ - \mbox{\@history} - } - }{ - \parbox[tb]{.79\textwidth}{ - \ifthenelse{\equal{\@externaleditor}{\@empty}}{}{\@externaleditor} - \par - \ifthenelse{\equal{\@LSID}{\@empty}}{}{\@LSID} - \par - \ifthenelse{\equal{\@status}{submit}}{ - Version {\@ \today} submitted to \journalname - }{ - \mbox{\@history} - } - } - \parbox[b]{.19\textwidth}{ - \hfill - \ifthenelse{\equal{\@updates}{\@empty}}{ - }{ - \href{http://www.mdpi.com/\@ISSN/\@pubvolume/\@issuenum/\@articlenumber?type=check_update&version=1}{\includegraphics[height=.6cm]{logo-updates.pdf}}% - }% - }% - } - \par - \vspace{-4pt}% - \end{spacing} - \end{flushleft} -} -} - -%%%% Abstract, keywords, journal data, PACS, MSC, JEL -\newcommand{\abstractkeywords}{ -\vspace{-8pt} -{% For journal Applied Sciences: -\ifthenelse{\equal{\@featuredapplication}{\@empty}}{}{ -\begingroup -\leftskip0.2cm -\noindent\textbf{Featured Application:\space\@featuredapplication} -\vspace{12pt} -\par -\endgroup} -}% -{%10 -\begingroup -\leftskip0.2cm -\ifthenelse{\equal{\@simplesumm}{\@empty}}{}{ -\noindent\textbf{Simple Summary:\space}\@simplesumm -\vspace{12pt} -\par -} -\ifthenelse{\equal{\@abstract}{\@empty}}{}{ -\noindent\textbf{Abstract:\space}\@abstract -\vspace{12pt} -\par -} -\endgroup -}%10 -{% For journal Data: -\ifthenelse{\equal{\@dataset}{\@empty}}{}{ -\begingroup -\leftskip0.2cm -\noindent\textbf{Dataset:\space}\@dataset -\vspace{12pt} -\par -\endgroup} -}% -{%For journal Data: -\ifthenelse{\equal{\@datasetlicense}{\@empty}}{}{ -\begingroup -\leftskip0.2cm -\noindent\textbf{Dataset License:\space}\@datasetlicense -\vspace{12pt} -\par -\endgroup} -}% -{%11 -\begingroup -\leftskip0.2cm -\ifthenelse{\equal{\@keyword}{\@empty}}{}{ -\noindent\textbf{Keywords:\space}\@keyword -\vspace{12pt} -\par -} -\endgroup -}%11 -{%For journal Toxins: -\begingroup -\leftskip0.2cm 
-\ifthenelse{\equal{\@keycontribution}{\@empty}}{}{ -\noindent\textbf{Key Contribution:\space}\@keycontribution -\vspace{12pt} -\par -} -\endgroup -}%11 -{%12 -\ifthenelse{\equal{\@pacs}{\@empty}}{}{ -\begingroup -\leftskip0.2cm -\noindent\textbf{PACS:\space}\@pacs -\vspace{12pt} -\par -\endgroup} -}%12 -{%13 -\ifthenelse{\equal{\@msc}{\@empty}}{}{ -\begingroup -\leftskip0.2cm -\noindent\textbf{MSC:\space}\@msc -\vspace{12pt} -\par -\endgroup} -}%13 -{%14 -\ifthenelse{\equal{\@jel}{\@empty}}{}{ -\begingroup -\leftskip0.2cm -\noindent\textbf{JEL Classification:\space}\@jel -\vspace{12pt} -\par -\endgroup} -}%14 -\vspace{4pt} -\ifthenelse{\equal{\@arttype}{Book}}{}{\hrule} -\vspace{12pt} -} - - -%%%% Print maketitle and abstractkeywords -\ifthenelse{\equal{\@arttype}{Supfile}}{ - \AfterEndPreamble{ - \maketitle - \let\maketitle\relax - \ifthenelse{\equal{\@status}{submit}}{\linenumbers}{} - }% - }{ - \AfterEndPreamble{ - \maketitle - \let\maketitle\relax - \maketitlen - \let\maketitlen\relax - \ifthenelse{\equal{\@status}{submit}}{\linenumbers}{} - \abstractkeywords - }% - } -\AtBeginDocument{ - \DeclareSymbolFont{AMSb}{U}{msb}{m}{n} - \DeclareSymbolFontAlphabet{\mathbb}{AMSb} - } - -%%%% Font size in Tables -\AtEndPreamble{ - \def\@tablesize{} - \newcommand{\tablesize}[1]{\gdef\@tablesize{#1}} - \let\oldtabular\tabular - \renewcommand{\tabular}{\ifthenelse{\equal{\@tablesize}{\@empty}}{\small}{\@tablesize}\oldtabular} -} - -%%%% Section headings -\setcounter{secnumdepth}{4} %i.e., section numbering depth, which defaults to 3 in the article class. 
To get paragraphs numbered and counted, increase the default value of secnumdepth to 4 - -\titleformat {\section} [block] {\raggedright \fontsize{10}{10}\selectfont\bfseries} {\thesection.\space} {0pt} {} -\titlespacing {\section} {0pt} {12pt} {6pt} - -\titleformat {\subsection} [block] {\raggedright \fontsize{10}{10}\selectfont\itshape} {\thesubsection.\space} {0pt} {} -\titlespacing {\subsection} {0pt} {12pt} {6pt} - -\titleformat {\subsubsection} [block] {\raggedright \fontsize{10}{10}\selectfont} {\thesubsubsection.\space} {0pt} {} -\titlespacing {\subsubsection} {0pt} {12pt} {6pt} - -\titleformat {\paragraph} [block] {\raggedright \fontsize{10}{10}\selectfont} {} {0pt} {} -\titlespacing {\paragraph} {0pt} {12pt} {6pt} - -%%%% Special section title style for back matter -\newcommand{\supplementary}[1]{ -\par\vspace{6pt}\noindent{\fontsize{9}{9}\selectfont\textbf{Supplementary Materials:} {#1}\par}} - -\newcommand{\acknowledgments}[1]{ -\vspace{6pt}\noindent{\fontsize{9}{9}\selectfont\textbf{Acknowledgments:} {#1}\par}} - -\newcommand{\authorcontributions}[1]{% -\vspace{6pt}\noindent{\fontsize{9}{9}\selectfont\textbf{Author Contributions:} {#1}\par}} - -\newcommand{\funding}[1]{ -\vspace{6pt}\noindent{\fontsize{9}{9}\selectfont\textbf{Funding:} {#1}\par}} - -\newcommand{\conflictsofinterest}[1]{% -\vspace{6pt}\noindent{\fontsize{9}{9}\selectfont\textbf{Conflicts of Interest:} {#1}\par}} - -\newcommand{\conflictofinterest}[1]{% Backwards compatibility for book prodcution -\vspace{6pt}\noindent{\fontsize{9}{9}\selectfont\textbf{Conflicts of Interest:} {#1}\par}} - -\newcommand{\conflictofinterests}[1]{% Backwards compatibility for book prodcution -\vspace{6pt}\noindent{\fontsize{9}{9}\selectfont\textbf{Conflicts of Interest:} {#1}\par}} - -\newcommand{\sampleavailability}[1]{% -\vspace{12pt}\noindent{\fontsize{9}{9}\selectfont\textbf{Sample Availability:} {#1}\par}} - -\newcommand{\reviewreports}[1]{% 
-\vspace{12pt}\noindent{\fontsize{9}{9}\selectfont\textbf{Review Reports:} {#1}\par}} - -\newcommand{\abbreviations}[1]{% -\vspace{12pt}\noindent{\selectfont\textbf{Abbreviations}\par\vspace{6pt}\noindent {\fontsize{9}{9}\selectfont #1}\par}} - -%%%%% Defines the appendix -\def\@appendixtitles{} -\newcommand{\appendixtitles}[1]{\gdef\@appendixtitles{#1}} - -\def\@appendixsections{} -\newcommand{\appendixsections}[1]{\gdef\@appendixsections{#1}} - -\renewcommand{\appendix}{% -\setcounter{section}{0}% -\setcounter{subsection}{0}% -\setcounter{subsubsection}{0}% -% -\gdef\thesection{\@Alph\c@section}% -\gdef\thesubsection{\@Alph\c@section.\@arabic\c@subsection}% -% -\titleformat {\section} [block] {\raggedright \fontsize{10}{10}\selectfont\bfseries} {% - \ifthenelse{\equal{\@appendixtitles}{yes}}{% - \appendixname~\thesection.% - }{% - \appendixname~\thesection~% - } - } {0pt} {} -\titlespacing {\section} {0pt} {12pt} {6pt} -% -\titleformat {\subsection} [block] {\raggedright \fontsize{10}{10}\selectfont\itshape} {% - \ifthenelse{\equal{\@appendixtitles}{yes}}{% - \appendixname~\thesubsection.% - }{% - \appendixname~\thesubsection% - } - } {0pt} {} -\titlespacing {\subsection} {0pt} {12pt} {6pt} -% -\titleformat {\subsubsection} [block] {\raggedright \fontsize{10}{10}\selectfont} {% - \ifthenelse{\equal{\@appendixtitles}{yes}}{% - \appendixname~\thesubsubsection.% - }{% - \appendixname~\thesubsubsection% - } - } {0pt} {} -\titlespacing {\subsubsection} {0pt} {12pt} {6pt} -% -\gdef\theHsection{\@Alph\c@section.}% for hyperref -\gdef\theHsubsection{\@Alph\c@section.\@arabic\c@subsection}% for hyperref -\csname appendixmore\endcsname -\renewcommand{\thefigure}{A\arabic{figure}} -\setcounter{figure}{0} -\renewcommand{\thetable}{A\arabic{table}} -\setcounter{table}{0} -\renewcommand{\thescheme}{A\arabic{scheme}} -\setcounter{scheme}{0} -\renewcommand{\thechart}{A\arabic{chart}} -\setcounter{chart}{0} -\renewcommand{\theboxenv}{A\arabic{boxenv}} -\setcounter{boxenv}{0} 
-\renewcommand{\theequation}{A\arabic{equation}} -\setcounter{equation}{0} -\renewcommand{\thetheorem}{A\arabic{theorem}} -\setcounter{theorem}{0} -\renewcommand{\thelemma}{A\arabic{lemma}} -\setcounter{lemma}{0} -\renewcommand{\thecorollary}{A\arabic{corollary}} -\setcounter{corollary}{0} -\renewcommand{\theproposition}{A\arabic{proposition}} -\setcounter{proposition}{0} -\renewcommand{\thecharacterization}{A\arabic{characterization}} -\setcounter{characterization}{0} -\renewcommand{\theproperty}{A\arabic{property}} -\setcounter{property}{0} -\renewcommand{\theproblem}{A\arabic{problem}} -\setcounter{problem}{0} -\renewcommand{\theexample}{A\arabic{example}} -\setcounter{example}{0} -\renewcommand{\theexamplesanddefinitions}{A\arabic{examplesanddefinitions}} -\setcounter{examplesanddefinitions}{0} -\renewcommand{\theremark}{A\arabic{remark}} -\setcounter{remark}{0} -\renewcommand{\thedefinition}{A\arabic{definition}} -\setcounter{definition}{0} -\renewcommand{\thehypothesis}{A\arabic{hypothesis}} -\setcounter{hypothesis}{0} -\renewcommand{\thenotation}{A\arabic{notation}} -\setcounter{notation}{0} -} - -%%%% Layout -\ifthenelse{\equal{\@arttype}{Book}}{%% - \RequirePackage[left=2.05cm, - right=2.05cm, - top=2.05cm, - bottom=2.05cm, - paperwidth=170mm, - paperheight=244mm, - includehead, - includefoot]{geometry} - }{ - \RequirePackage[left=2.7cm, - right=2.7cm, - top=1.8cm, - bottom=1.5cm, - includehead, - includefoot]{geometry} - } - -\linespread{1.13} -\setlength{\parindent}{0.75cm} - -%%%% Figures and tables -\RequirePackage{newfloat} -\DeclareFloatingEnvironment[]{listing} -\DeclareFloatingEnvironment[name=Box]{boxenv} -\DeclareFloatingEnvironment[]{chart} -\DeclareFloatingEnvironment[]{scheme} - -\RequirePackage{caption} -\captionsetup[figure]{position=bottom, margin=0.75cm, labelfont={bf, small, stretch=1.17}, labelsep=period, textfont={small, stretch=1.17}, aboveskip=6pt, belowskip=-6pt, justification=justified} - -\captionsetup[scheme]{position=bottom, 
margin=0.75cm, labelfont={bf, small, stretch=1.17}, labelsep=period, textfont={small, stretch=1.17}, aboveskip=6pt, belowskip=-6pt, justification=justified} - -\captionsetup[listing]{position=top, margin=0.75cm, labelfont={bf, small, stretch=1.17}, labelsep=period, textfont={small, stretch=1.17}, aboveskip=6pt, justification=justified} - -\captionsetup[chart]{position=bottom, margin=0.75cm, labelfont={bf, small, stretch=1.17}, labelsep=period, textfont={small, stretch=1.17}, aboveskip=6pt, belowskip=-6pt, justification=justified} - -\captionsetup[table]{position=top, margin=0.75cm, labelfont={bf, small, stretch=1.17}, labelsep=period, textfont={small, stretch=1.17}, aboveskip=6pt, justification=justified} - -\captionsetup[boxenv]{position=top, margin=0.75cm, labelfont={bf, small, stretch=1.17}, labelsep=period, textfont={small, stretch=1.17}, aboveskip=6pt,justification=justified} - - - -%% For table footnotes -\newsavebox{\@justcentbox} -\newcommand{\justifyorcenter}[1]{ -\sbox \@justcentbox{#1} -\ifdim \wd \@justcentbox >\hsize #1 -\else \centerline{#1} \fi -} - -%%%% Bullet lists -\newlength{\wideitemsep} -\setlength{\wideitemsep}{.5\itemsep} -\addtolength{\wideitemsep}{-7pt} -\let\olditem\item -\renewcommand{\item}{\setlength{\itemsep}{\wideitemsep}\olditem} - -%%%% Quote environment -\patchcmd{\quote}{\rightmargin}{\leftmargin 0.75cm \rightmargin}{}{} - -%%%% Supplementary file -\ifthenelse{\equal{\@arttype}{Supfile}}{ - \renewcommand{\thefigure}{S\arabic{figure}}% - \renewcommand{\thetable}{S\arabic{table}}% - }{}% - -%% Link to supplementary material: www.mdpi.com/ISSN-number/volume-number/issue-number/article-number -\newcommand{\linksupplementary}[1]{\url{http://www.mdpi.com/\@ISSN/\@pubvolume/\@issuenum/\@articlenumber/#1}} - -%%%% Header and footer (all pages except the first) -\renewcommand\headrule{} %% set line (from fancyhdr) in header to nothing -\pagestyle{fancy} -\lhead{ - \ifthenelse{\equal{\@journal}{preprints}% - \OR \equal{\@arttype}{Book}}{% 
- }{% - \fontsize{8}{8}\selectfont% - \ifthenelse{\equal{\@status}{submit}}{% - Version {\@ \today} submitted to {\em \journalname}% - }{% - \ifthenelse{\equal{\@arttype}{Supfile}}{% - {\em \journalname} {\bfseries \@pubyear}, {\em \@pubvolume} % - %\ifthenelse{\equal{\@articlenumber}{}}{% - %\@firstpage --\pageref*{LastPage}% - %}{\@articlenumber}% - ; doi:{\changeurlcolor{black}% - \href{http://dx.doi.org/\@doinum}% - {\@doinum}}% - }{% - {\em\journalname\ }{\bfseries\@pubyear}, {\em \@pubvolume}% - \ifthenelse{\equal{\@continuouspages}{\@empty}}{% - , \@articlenumber% - }{% - }% - }% - }% - }% - } - -\rhead{% -\ifthenelse{\equal{\@arttype}{Book}}{}{% - \ifthenelse{\equal{\@arttype}{Supfile}}{% - \fontsize{8}{8}\selectfont S\thepage{} of S\pageref*{LastPage}% - }{% - \ifthenelse{\equal{\@continuouspages}{\@empty}}{% - \fontsize{8}{8}\selectfont\thepage{} of \pageref*{LastPage}% - }{% - \fontsize{8}{8}\selectfont\thepage%{} of \pageref*{LastPage}% - }% - }% - }% -} - -\cfoot{ - \ifthenelse{\equal{\@arttype}{Book}}{% - \fontsize{8}{8}\selectfont\thepage - }{% - } -} - - -%%%% Bibliography -\renewcommand\bibname{References} % Backwards compatibility for book production -\renewcommand\@biblabel[1]{#1.\hfill} -\def\thebibliography#1{ -\linespread{1.44} -\section*{\@reftitle} -\addcontentsline{toc}{section}{References} -\fontsize{9}{9}\selectfont -\list{{\arabic{enumi}}}{\def\makelabel##1{\hss{##1}} -\topsep=0\p@ -\parsep=5\p@ -\partopsep=0\p@ -\itemsep=0\p@ -\labelsep=1.5mm -\ifthenelse{\equal{\@journal}{admsci} -\OR \equal{\@journal}{arts} -\OR \equal{\@journal}{econometrics} -\OR \equal{\@journal}{economies} -\OR \equal{\@journal}{genealogy} -\OR \equal{\@journal}{humanities} -\OR \equal{\@journal}{ijfs} -\OR \equal{\@journal}{jrfm} -\OR \equal{\@journal}{languages} -\OR \equal{\@journal}{laws} -\OR \equal{\@journal}{religions} -\OR \equal{\@journal}{risks} -\OR \equal{\@journal}{socsci}}{% - \ifthenelse{\equal{\@externalbibliography}{\@empty}}{% - 
\itemindent=-7.7mm - }{% - \itemindent=-3.3mm}% - }{% - \itemindent=0\p@} -\settowidth\labelwidth{\footnotesize[#1]}% -\leftmargin\labelwidth -\advance\leftmargin\labelsep -%\advance\leftmargin -\itemindent -\usecounter{enumi}} -%\def\newblock{\ } -%\sloppy\clubpenalty4000\widowpenalty4000 -%\sfcode`\.=1000\relax -} -\let\endthebibliography=\endlist - -%%%% Copyright info -\newcommand{\cright}{% - \ifthenelse{\equal{\@arttype}{Supfile} \OR \equal{\@journal}{preprints}}{% - }{% - \vspace{12pt} - \noindent - \linespread{1.44} - \fontsize{9}{9}\selectfont - \ifthenelse{\equal{\@status}{submit}}{ - \noindent \copyright{} {\@ \the\year} by the \@authornum. % - Submitted to {\em \journalname} for % - possible open access publication % - under the terms and conditions of the Creative Commons Attribution % - \ifthenelse{\equal{\@journal}{ijtpp}}{NonCommercial NoDerivatives (CC BY-NC-ND)}{(CC BY)} % - license % - \ifthenelse{\equal{\@journal}{ijtpp}}{ - (\changeurlcolor{black}% - \href{https://creativecommons.org/licenses/by-nc-nd/4.0/.}% - {https://creativecommons.org/licenses/by-nc-nd/4.0/}).% - }{% - (\changeurlcolor{black}% - \href{http://creativecommons.org/licenses/by/4.0/.}% - {http://creativecommons.org/licenses/by/4.0/}).} - }{ - \begin{minipage}{.2\textwidth} - \hspace{-1.2mm}% - \vspace{2mm}% - \href{http://creativecommons.org/}{% - \ifthenelse{\equal{\@journal}{ijtpp}}{% - \includegraphics[width=0.94\textwidth]{logo-ccby-nc-nd.eps}% - }{% - \includegraphics[width=0.94\textwidth]{logo-ccby.pdf} - } - } - \end{minipage}% - \begin{minipage}{.79\textwidth} - \copyright \ {\@copyrightyear} by the \@authornum. % - Licensee MDPI, Basel, Switzerland. 
% - This article is an open access article % - distributed under the terms and conditions % - of the Creative Commons Attribution % - \ifthenelse{\equal{\@journal}{ijtpp}}{NonCommercial NoDerivatives (CC BY-NC-ND)}{(CC BY)} % - license % - \ifthenelse{\equal{\@journal}{ijtpp}}{ - (\changeurlcolor{black}% - \href{https://creativecommons.org/licenses/by-nc-nd/4.0/.}% - {https://creativecommons.org/licenses/by-nc-nd/4.0/}).% - }{% - (\changeurlcolor{black}% - \href{http://creativecommons.org/licenses/by/4.0/.}% - {http://creativecommons.org/licenses/by/4.0/}).} - \end{minipage} - } - } - } - - -\endinput diff --git a/papers/Mathematics/paper.pdf b/papers/Mathematics/paper.pdf deleted file mode 100644 index ad9a354e3..000000000 Binary files a/papers/Mathematics/paper.pdf and /dev/null differ diff --git a/papers/Mathematics/paper.tex b/papers/Mathematics/paper.tex deleted file mode 100644 index 79e120cd3..000000000 --- a/papers/Mathematics/paper.tex +++ /dev/null @@ -1,956 +0,0 @@ -% LaTeX support: latex@mdpi.com -% In case you need support, please attach all files that are necessary for compiling as well as the log file, and specify the details of your LaTeX setup (which operating system and LaTeX version / tools you are using). - -%================================================================= -\documentclass[mathematics,article,submit,moreauthors,pdftex]{mdpi} - -% If you would like to post an early version of this manuscript as a preprint, you may use preprint as the journal and change 'submit' to 'accept'. The document class line would be, e.g., \documentclass[preprints,article,accept,moreauthors,pdftex]{mdpi}. This is especially recommended for submission to arXiv, where line numbers should be removed before posting. For preprints.org, the editorial staff will make this change immediately prior to posting. 
- -%% Some pieces required from the pandoc template -\setlist[itemize]{leftmargin=*,labelsep=5.8mm} -\setlist[enumerate]{leftmargin=*,labelsep=4.9mm} - - -%-------------------- -% Class Options: -%-------------------- -%---------- -% journal -%---------- -% Choose between the following MDPI journals: -% acoustics, actuators, addictions, admsci, aerospace, agriculture, agriengineering, agronomy, algorithms, animals, antibiotics, antibodies, antioxidants, applsci, arts, asc, asi, atmosphere, atoms, axioms, batteries, bdcc, behavsci , beverages, bioengineering, biology, biomedicines, biomimetics, biomolecules, biosensors, brainsci , buildings, cancers, carbon , catalysts, cells, ceramics, challenges, chemengineering, chemistry, chemosensors, children, cleantechnol, climate, clockssleep, cmd, coatings, colloids, computation, computers, condensedmatter, cosmetics, cryptography, crystals, dairy, data, dentistry, designs , diagnostics, diseases, diversity, drones, econometrics, economies, education, electrochem, electronics, energies, entropy, environments, epigenomes, est, fermentation, fibers, fire, fishes, fluids, foods, forecasting, forests, fractalfract, futureinternet, futurephys, galaxies, games, gastrointestdisord, gels, genealogy, genes, geohazards, geosciences, geriatrics, hazardousmatters, healthcare, heritage, highthroughput, horticulturae, humanities, hydrology, ijerph, ijfs, ijgi, ijms, ijns, ijtpp, informatics, information, infrastructures, inorganics, insects, instruments, inventions, iot, j, jcdd, jcm, jcp, jcs, jdb, jfb, jfmk, jimaging, jintelligence, jlpea, jmmp, jmse, jnt, jof, joitmc, jpm, jrfm, jsan, land, languages, laws, life, literature, logistics, lubricants, machines, magnetochemistry, make, marinedrugs, materials, mathematics, mca, medicina, medicines, medsci, membranes, metabolites, metals, microarrays, micromachines, microorganisms, minerals, modelling, molbank, molecules, mps, mti, nanomaterials, ncrna, neuroglia, nitrogen, notspecified, 
nutrients, ohbm, particles, pathogens, pharmaceuticals, pharmaceutics, pharmacy, philosophies, photonics, physics, plants, plasma, polymers, polysaccharides, preprints , proceedings, processes, proteomes, psych, publications, quantumrep, quaternary, qubs, reactions, recycling, religions, remotesensing, reports, resources, risks, robotics, safety, sci, scipharm, sensors, separations, sexes, signals, sinusitis, smartcities, sna, societies, socsci, soilsystems, sports, standards, stats, surfaces, surgeries, sustainability, symmetry, systems, technologies, test, toxics, toxins, tropicalmed, universe, urbansci, vaccines, vehicles, vetsci, vibration, viruses, vision, water, wem, wevj - -%--------- -% article -%--------- -% The default type of manuscript is "article", but can be replaced by: -% abstract, addendum, article, benchmark, book, bookreview, briefreport, casereport, changes, comment, commentary, communication, conceptpaper, conferenceproceedings, correction, conferencereport, expressionofconcern, extendedabstract, meetingreport, creative, datadescriptor, discussion, editorial, essay, erratum, hypothesis, interestingimages, letter, meetingreport, newbookreceived, obituary, opinion, projectreport, reply, retraction, review, perspective, protocol, shortnote, supfile, technicalnote, viewpoint -% supfile = supplementary materials - -%---------- -% submit -%---------- -% The class option "submit" will be changed to "accept" by the Editorial Office when the paper is accepted. This will only make changes to the frontpage (e.g., the logo of the journal will get visible), the headings, and the copyright information. Also, line numbering will be removed. Journal info and pagination for accepted papers will also be assigned by the Editorial Office. - -%------------------ -% moreauthors -%------------------ -% If there is only one author the class option oneauthor should be used. Otherwise use the class option moreauthors. 
- -%--------- -% pdftex -%--------- -% The option pdftex is for use with pdfLaTeX. If eps figures are used, remove the option pdftex and use LaTeX and dvi2pdf. - -%================================================================= -\firstpage{1} -\makeatletter -\setcounter{page}{\@firstpage} -\makeatother -\pubvolume{xx} -\issuenum{1} -\articlenumber{5} -\pubyear{2019} -\copyrightyear{2019} -%\externaleditor{Academic Editor: name} -\history{Received: date; Accepted: date; Published: date} -\updates{yes} % If there is an update available, un-comment this line - -%% MDPI internal command: uncomment if new journal that already uses continuous page numbers -%\continuouspages{yes} - -%------------------------------------------------------------------ -% The following line should be uncommented if the LaTeX file is uploaded to arXiv.org -%\pdfoutput=1 - -%================================================================= -% Add packages and commands here. The following packages are loaded in our class file: fontenc, calc, indentfirst, fancyhdr, graphicx, lastpage, ifthen, lineno, float, amsmath, setspace, enumitem, mathpazo, booktabs, titlesec, etoolbox, amsthm, hyphenat, natbib, hyperref, footmisc, geometry, caption, url, mdframed, tabto, soul, multirow, microtype, tikz - -%================================================================= -%% Please use the following mathematics environments: Theorem, Lemma, Corollary, Proposition, Characterization, Property, Problem, Example, ExamplesandDefinitions, Hypothesis, Remark, Definition -%% For proofs, please use the proof environment (the amsthm package is loaded by the MDPI class). - -%================================================================= -% Full title of the paper (Capitalized) -\Title{Check your outliers! 
An introduction to identifying statistical -outliers in R with \emph{easystats}} - -% Authors, for the paper (add full first names) -\Author{Rémi -Thériault$^{1,*}$\href{https://orcid.org/0000-0003-4315-6788}{\orcidicon}, Mattan -S. -Ben-Shachar$^{2}$\href{https://orcid.org/0000-0002-4287-4801}{\orcidicon}, Indrajeet -Patil$^{3}$\href{https://orcid.org/0000-0003-1995-6531}{\orcidicon}, Daniel -Lüdecke$^{4}$\href{https://orcid.org/0000-0002-8895-3206}{\orcidicon}, Brenton -M. -Wiernik$^{5}$\href{https://orcid.org/0000-0001-9560-6336}{\orcidicon}, Dominique -Makowski$^{6}$\href{https://orcid.org/0000-0001-5375-9967}{\orcidicon}} - -% Authors, for metadata in PDF -\AuthorNames{Rémi Thériault, Mattan S. Ben-Shachar, Indrajeet -Patil, Daniel Lüdecke, Brenton M. Wiernik, Dominique Makowski} - -% Affiliations / Addresses (Add [1] after \address if there is only one affiliation.) -\address{% -$^{1}$ \quad Department of Psychology, Université du Québec à Montréal, -Montréal, Québec, Canada; \\ -$^{2}$ \quad Independent Researcher; \\ -$^{3}$ \quad Center for Humans and Machines, Max Planck Institute for -Human Development, Berlin, Germany; \\ -$^{4}$ \quad Institute of Medical Sociology, University Medical Center -Hamburg-Eppendorf, Germany; \\ -$^{5}$ \quad Independent Researcher, Tampa, FL, USA; \\ -$^{6}$ \quad School of Psychology, University of Sussex, Brighton, -UK; \\ -} -% Contact information of the corresponding author -\corres{Correspondence: \href{mailto:theriault.remi@courrier.uqam.ca}{\nolinkurl{theriault.remi@courrier.uqam.ca}}.} - -% Current address and/or shared authorship - - - - - - - - -% The commands \thirdnote{} till \eighthnote{} are available for further notes - -% Simple summary -\simplesumm{The \emph{\{performance\}} package from the \emph{easystats} -ecosystem makes it easy to diagnose outliers in R and according to -current best practices thanks to the \texttt{check\_outliers()} -function.} - -% Abstract (Do not insert blank lines, i.e.
\\) -\abstract{Beyond the challenge of keeping up-to-date with current best -practices regarding the diagnosis and treatment of outliers, an -additional difficulty arises concerning the mathematical implementation -of the recommended methods. In this paper, we provide an overview of -current recommendations and best practices and demonstrate how they can -easily and conveniently be implemented in the R statistical computing -software, using the \emph{\{performance\}} package of the -\emph{easystats} ecosystem. We cover univariate, multivariate, and -model-based statistical outlier detection methods, their recommended -threshold, standard output, and plotting methods. We conclude with -recommendations on the handling of outliers: the different theoretical -types of outliers, whether to exclude or winsorize them, and the -importance of transparency.} - -% Keywords -\keyword{univariate outliers; multivariate outliers; robust detection -methods; R; easystats} - -% The fields PACS, MSC, and JEL may be left empty or commented out if not applicable -%\PACS{J0101} -%\MSC{} -%\JEL{} - -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -% Only for the journal Diversity -%\LSID{\url{http://}} - -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -% Only for the journal Applied Sciences: -%\featuredapplication{Authors are encouraged to provide a concise description of the specific application or a potential application of the work. This section is not mandatory.} -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% - -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -% Only for the journal Data: -%\dataset{DOI number or link to the deposited data set in cases where the data set is published or set to be published separately. If the data set is submitted and will be published as a supplement to this paper in the journal Data, this field will be filled by the editors of the journal.
In this case, please make sure to submit the data set as a supplement when entering your manuscript into our manuscript editorial system.} - -%\datasetlicense{license under which the data set is made available (CC0, CC-BY, CC-BY-SA, CC-BY-NC, etc.)} - -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -% Only for the journal Toxins -%\keycontribution{The breakthroughs or highlights of the manuscript. Authors can write one or two sentences to describe the most important part of the paper.} - -%\setcounter{secnumdepth}{4} -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% - -% Pandoc syntax highlighting -\usepackage{color} -\usepackage{fancyvrb} -\newcommand{\VerbBar}{|} -\newcommand{\VERB}{\Verb[commandchars=\\\{\}]} -\DefineVerbatimEnvironment{Highlighting}{Verbatim}{commandchars=\\\{\}} -% Add ',fontsize=\small' for more characters per line -\usepackage{framed} -\definecolor{shadecolor}{RGB}{248,248,248} -\newenvironment{Shaded}{\begin{snugshade}}{\end{snugshade}} -\newcommand{\AlertTok}[1]{\textcolor[rgb]{0.94,0.16,0.16}{#1}} -\newcommand{\AnnotationTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textbf{\textit{#1}}}} -\newcommand{\AttributeTok}[1]{\textcolor[rgb]{0.77,0.63,0.00}{#1}} -\newcommand{\BaseNTok}[1]{\textcolor[rgb]{0.00,0.00,0.81}{#1}} -\newcommand{\BuiltInTok}[1]{#1} -\newcommand{\CharTok}[1]{\textcolor[rgb]{0.31,0.60,0.02}{#1}} -\newcommand{\CommentTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textit{#1}}} -\newcommand{\CommentVarTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textbf{\textit{#1}}}} -\newcommand{\ConstantTok}[1]{\textcolor[rgb]{0.00,0.00,0.00}{#1}} -\newcommand{\ControlFlowTok}[1]{\textcolor[rgb]{0.13,0.29,0.53}{\textbf{#1}}} -\newcommand{\DataTypeTok}[1]{\textcolor[rgb]{0.13,0.29,0.53}{#1}} -\newcommand{\DecValTok}[1]{\textcolor[rgb]{0.00,0.00,0.81}{#1}} -\newcommand{\DocumentationTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textbf{\textit{#1}}}} -\newcommand{\ErrorTok}[1]{\textcolor[rgb]{0.64,0.00,0.00}{\textbf{#1}}} -\newcommand{\ExtensionTok}[1]{#1} 
-\newcommand{\FloatTok}[1]{\textcolor[rgb]{0.00,0.00,0.81}{#1}} -\newcommand{\FunctionTok}[1]{\textcolor[rgb]{0.00,0.00,0.00}{#1}} -\newcommand{\ImportTok}[1]{#1} -\newcommand{\InformationTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textbf{\textit{#1}}}} -\newcommand{\KeywordTok}[1]{\textcolor[rgb]{0.13,0.29,0.53}{\textbf{#1}}} -\newcommand{\NormalTok}[1]{#1} -\newcommand{\OperatorTok}[1]{\textcolor[rgb]{0.81,0.36,0.00}{\textbf{#1}}} -\newcommand{\OtherTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{#1}} -\newcommand{\PreprocessorTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textit{#1}}} -\newcommand{\RegionMarkerTok}[1]{#1} -\newcommand{\SpecialCharTok}[1]{\textcolor[rgb]{0.00,0.00,0.00}{#1}} -\newcommand{\SpecialStringTok}[1]{\textcolor[rgb]{0.31,0.60,0.02}{#1}} -\newcommand{\StringTok}[1]{\textcolor[rgb]{0.31,0.60,0.02}{#1}} -\newcommand{\VariableTok}[1]{\textcolor[rgb]{0.00,0.00,0.00}{#1}} -\newcommand{\VerbatimStringTok}[1]{\textcolor[rgb]{0.31,0.60,0.02}{#1}} -\newcommand{\WarningTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textbf{\textit{#1}}}} - -% tightlist command for lists without linebreak -\providecommand{\tightlist}{% - \setlength{\itemsep}{0pt}\setlength{\parskip}{0pt}} - -% From pandoc table feature -\usepackage{longtable,booktabs,array} -\usepackage{calc} % for calculating minipage widths -% Correct order of tables after \paragraph or \subparagraph -\usepackage{etoolbox} -\makeatletter -\patchcmd\longtable{\par}{\if@noskipsec\mbox{}\fi\par}{}{} -\makeatother -% Allow footnotes in longtable head/foot -\IfFileExists{footnotehyper.sty}{\usepackage{footnotehyper}}{\usepackage{footnote}} -\makesavenoteenv{longtable} - - - -\begin{document} - - -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% - -\hypertarget{introduction}{% -\section{Introduction}\label{introduction}} - -Real-life data often contain observations that can be considered -\emph{abnormal} when compared to the main population. 
The cause of -it---be it because they belong to a different distribution (originating -from a different generative process) or simply being extreme cases, -statistically rare but not impossible---can be hard to assess, and the -boundaries of ``abnormal'' are hard to define. - -Nonetheless, the improper handling of these outliers can substantially -affect statistical model estimations, biasing effect estimations and -weakening the models' predictive performance. It is thus essential to -address this problem in a thoughtful manner. Yet, despite the existence -of established recommendations and guidelines, many researchers still do -not treat outliers in a consistent manner, or do so using inappropriate -strategies \citep{simmons2011false, leys2013outliers}. - -One possible reason is that researchers are not aware of the existing -recommendations, or do not know how to implement them using their -analysis software. In this paper, we show how to follow current best -practices for automatic and reproducible statistical outlier detection -(SOD) using R and the \emph{\{performance\}} package -\citep{ludecke2021performance}, which is part of the \emph{easystats} -ecosystem of packages that build an R framework for easy statistical -modeling, visualization, and reporting \citep{easystatspackage}. - -\hypertarget{identifying-outliers}{% -\section{Identifying Outliers}\label{identifying-outliers}} - -Although many researchers attempt to identify outliers with measures -based on the mean (e.g., \emph{z} scores), those methods are problematic -because the mean and standard deviation themselves are not robust to the -influence of outliers and they assume normally distributed data (i.e., a -Gaussian distribution). Therefore, current guidelines recommend using -robust methods to identify outliers, such as those relying on the median -as opposed to the mean -\citep{leys2019outliers, leys2013outliers, leys2018outliers}. 
- -Nonetheless, which exact outlier method to use depends on many factors. -In some cases, eye-gauging odd observations can be an appropriate -solution, though many researchers will favour algorithmic solutions to -detect potential outliers, for example, based on a continuous value -expressing how much the observation stands out from the others. - -One of the factors to consider when selecting an algorithmic outlier -detection method is the statistical test of interest. When using a -regression model, relevant information can be found by identifying -observations that do not fit well with the model. This approach, known -as model-based outliers detection (as outliers are extracted after the -statistical model has been fit), can be contrasted with -distribution-based outliers detection, which is based on the distance -between an observation and the ``center'' of its population. Various -quantification strategies of this distance exist for the latter, both -univariate (involving only one variable at a time) and multivariate -(involving multiple variables). - -When no method is readily available to detect model-based outliers, such -as for structural equation modelling (SEM), looking for multivariate -outliers may be of relevance. For simple tests (\emph{t} tests or -correlations) that compare values of the same variable, it can be -appropriate to check for univariate outliers. However, univariate -methods can give false positives since \emph{t} tests and correlations, -ultimately, are also models/multivariable statistics. They are in this -sense more limited, but we show them nonetheless for educational -purposes. - -Importantly, whatever approach researchers choose remains a subjective -decision, whose usage (and rationale) must be transparently documented -and reproducible \citep{leys2019outliers}. Researchers should commit -(ideally in a preregistration) to an outlier treatment method before -collecting the data.
They should report in the paper their decisions and -details of their methods, as well as any deviation from their original -plan. These transparency practices can help reduce false positives due -to excessive researchers' degrees of freedom (i.e., choice flexibility -throughout the analysis). In the following section, we will go through -each of the mentioned methods and provide examples on how to implement -them with R. - -\hypertarget{univariate-outliers}{% -\subsection{Univariate Outliers}\label{univariate-outliers}} - -Researchers frequently attempt to identify outliers using measures of -deviation from the center of a variable's distribution. One of the most -popular such procedures is the \emph{z} score transformation, which -computes the distance in standard deviations (SD) from the mean. However, -as mentioned earlier, this popular method is not robust. Therefore, for -univariate outliers, it is recommended to use the median along with the -Median Absolute Deviation (MAD), which are more robust than the -interquartile range or the mean and its standard deviation -\citep{leys2019outliers, leys2013outliers}. - -Researchers can identify outliers based on robust (i.e., MAD-based) -\emph{z} scores using the \texttt{check\_outliers()} function of the -\emph{\{performance\}} package, by specifying -\texttt{method\ =\ "zscore\_robust"}.\footnote{Note that - \texttt{check\_outliers()} only checks numeric variables.} Although -\citet{leys2013outliers} suggest a default threshold of 2.5 and -\citet{leys2019outliers} a threshold of 3, \emph{\{performance\}} uses -by default a less conservative threshold of -\textasciitilde3.29.\footnote{3.29 is an approximation of the two-tailed - critical value for \emph{p} \textless{} .001, obtained through - \texttt{qnorm(p\ =\ 1\ -\ 0.001\ /\ 2)}. We chose this threshold for - consistency with the thresholds of all our other methods.} That is, -data points will be flagged as outliers if they go beyond +/- -\textasciitilde3.29 MAD.
Users can adjust this threshold using the -\texttt{threshold} argument, as demonstrated below. - -\begin{Shaded} -\begin{Highlighting}[] -\FunctionTok{library}\NormalTok{(performance)} - -\CommentTok{\# Create some artificial outliers and an ID column} -\NormalTok{data }\OtherTok{\textless{}{-}} \FunctionTok{rbind}\NormalTok{(mtcars[}\DecValTok{1}\SpecialCharTok{:}\DecValTok{4}\NormalTok{], }\DecValTok{42}\NormalTok{, }\DecValTok{55}\NormalTok{)} -\NormalTok{data }\OtherTok{\textless{}{-}} \FunctionTok{cbind}\NormalTok{(}\AttributeTok{car =} \FunctionTok{row.names}\NormalTok{(data), data)} - -\NormalTok{outliers }\OtherTok{\textless{}{-}} \FunctionTok{check\_outliers}\NormalTok{(data, }\AttributeTok{method =} \StringTok{"zscore\_robust"}\NormalTok{, }\AttributeTok{ID =} \StringTok{"car"}\NormalTok{)} -\NormalTok{outliers} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -#> 2 outliers detected: cases 33, 34. -#> - Based on the following method and threshold: zscore_robust (3.09). -#> - For variables: mpg, cyl, disp, hp. 
-#> -#> ----------------------------------------------------------------------------- -#> -#> The following observations were considered outliers for two or more -#> variables by at least one of the selected methods: -#> -#> Row car n_Zscore_robust -#> 1 33 33 2 -#> 2 34 34 2 -#> -#> ----------------------------------------------------------------------------- -#> Outliers per variable (zscore_robust): -#> -#> $mpg -#> Row car Distance_Zscore_robust -#> 33 33 33 3.709699 -#> 34 34 34 5.848328 -#> -#> $cyl -#> Row car Distance_Zscore_robust -#> 33 33 33 12.14083 -#> 34 34 34 16.52502 -\end{verbatim} - -The row numbers of the detected outliers can be obtained by using -\texttt{which()} on the output object, which can be used for exclusions -for example: - -\begin{Shaded} -\begin{Highlighting}[] -\FunctionTok{which}\NormalTok{(outliers)} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -#> [1] 33 34 -\end{verbatim} - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{data\_clean }\OtherTok{\textless{}{-}}\NormalTok{ data[}\SpecialCharTok{{-}}\FunctionTok{which}\NormalTok{(outliers), ]} -\end{Highlighting} -\end{Shaded} - -All \texttt{check\_outliers()} output objects possess a \texttt{plot()} -method, meaning it is also possible to visualize the outliers: - -\begin{Shaded} -\begin{Highlighting}[] -\FunctionTok{library}\NormalTok{(see)} - -\FunctionTok{plot}\NormalTok{(outliers)} -\end{Highlighting} -\end{Shaded} - -\begin{figure} -\includegraphics[width=1\linewidth]{paper_files/figure-latex/univariate-1} \caption{Visual depiction of outliers using the robust z-score method.}\label{fig:univariate} -\end{figure} - -Other univariate methods are available, such as using the interquartile -range (IQR), or based on different intervals, such as the Highest -Density Interval (HDI) or the Bias Corrected and Accelerated Interval -(BCI). 
These methods are documented and described in the function's -\href{https://easystats.github.io/performance/reference/check_outliers.html}{help -page}. - -\hypertarget{multivariate-outliers}{% -\subsection{Multivariate Outliers}\label{multivariate-outliers}} - -Univariate outliers can be useful when the focus is on a particular -variable, for instance the reaction time, as extreme values might be -indicative of inattention or non-task-related behavior\footnote{ Note - that they might not be the optimal way of treating reaction time - outliers \citep{ratcliff1993methods, van1995statistical}}. - -However, in many scenarios, variables of a data set are not independent, -and an abnormal observation will impact multiple dimensions. For -instance, a participant giving random answers to a questionnaire. In -this case, computing the \emph{z} score for each of the questions might -not lead to satisfactory results. Instead, one might want to look at -these variables together. - -One common approach for this is to compute multivariate distance metrics -such as the Mahalanobis distance. Although the Mahalanobis distance is -very popular, just like the regular \emph{z} scores method, it is not -robust and is heavily influenced by the outliers themselves. Therefore, -for multivariate outliers, it is recommended to use the Minimum -Covariance Determinant, a robust version of the Mahalanobis distance -\citep[MCD,][]{leys2018outliers, leys2019outliers}. 
- -In \emph{\{performance\}}'s \texttt{check\_outliers()}, one can use this -approach with \texttt{method\ =\ "mcd"}.\footnote{Our default threshold - for the MCD method is defined by - \texttt{stats::qchisq(p\ =\ 1\ -\ 0.001,\ df\ =\ ncol(x))}, which - again is an approximation of the critical value for \emph{p} - \textless{} .001 consistent with the thresholds of our other methods.} - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{outliers }\OtherTok{\textless{}{-}} \FunctionTok{check\_outliers}\NormalTok{(data, }\AttributeTok{method =} \StringTok{"mcd"}\NormalTok{)} -\NormalTok{outliers} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -#> 9 outliers detected: cases 7, 15, 16, 17, 24, 29, 31, 33, 34. -#> - Based on the following method and threshold: mcd (20). -#> - For variables: mpg, cyl, disp, hp. -\end{verbatim} - -\begin{Shaded} -\begin{Highlighting}[] -\FunctionTok{plot}\NormalTok{(outliers)} -\end{Highlighting} -\end{Shaded} - -\begin{figure} -\includegraphics[width=1\linewidth]{paper_files/figure-latex/multivariate-1} \caption{Visual depiction of outliers using the Minimum Covariance Determinant (MCD) method, a robust version of the Mahalanobis distance.}\label{fig:multivariate} -\end{figure} - -Other multivariate methods are available, such as another type of robust -Mahalanobis distance that in this case relies on an orthogonalized -Gnanadesikan-Kettenring pairwise estimator -\citep{gnanadesikan1972robust}. These methods are documented and -described in the function's -\href{https://easystats.github.io/performance/reference/check_outliers.html}{help -page}. - -\hypertarget{model-based-outliers}{% -\subsection{Model-Based Outliers}\label{model-based-outliers}} - -Working with regression models creates the possibility of using -model-based SOD methods. These methods rely on the concept of -\emph{leverage}, that is, how much influence a given observation can -have on the model estimates. 
If a few observations have a relatively -strong leverage/influence on the model, one can suspect that the model's -estimates are biased by these observations, in which case flagging them -as outliers could prove helpful (see next section, ``Handling -Outliers''). - -In \emph{\{performance\}}, two such model-based SOD methods are currently -available: Cook's distance, for regular regression models, and Pareto, -for Bayesian models. As such, \texttt{check\_outliers()} can be applied -directly on regression model objects, by simply specifying -\texttt{method\ =\ "cook"} (or \texttt{method\ =\ "pareto"} for Bayesian -models).\footnote{Our default threshold for the Cook method is defined - by \texttt{stats::qf(0.5,\ ncol(x),\ nrow(x)\ -\ ncol(x))}, which - corresponds to the median of the \emph{F} distribution with - \texttt{ncol(x)} and \texttt{nrow(x)\ -\ ncol(x)} degrees of freedom - (unlike our other methods, it is not based on a critical value for - \emph{p} \textless{} .001).} - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{model }\OtherTok{\textless{}{-}} \FunctionTok{lm}\NormalTok{(disp }\SpecialCharTok{\textasciitilde{}}\NormalTok{ mpg }\SpecialCharTok{*}\NormalTok{ disp, }\AttributeTok{data =}\NormalTok{ data)} -\NormalTok{outliers }\OtherTok{\textless{}{-}} \FunctionTok{check\_outliers}\NormalTok{(model, }\AttributeTok{method =} \StringTok{"cook"}\NormalTok{)} -\NormalTok{outliers} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -#> 1 outlier detected: case 34. -#> - Based on the following method and threshold: cook (0.708). -#> - For variable: (Whole model). -\end{verbatim} - -\begin{Shaded} -\begin{Highlighting}[] -\FunctionTok{plot}\NormalTok{(outliers)} -\end{Highlighting} -\end{Shaded} - -\begin{figure} -\includegraphics[width=1\linewidth]{paper_files/figure-latex/model-1} \caption{Visual depiction of outliers based on Cook's distance (leverage and standardized residuals).}\label{fig:model} -\end{figure} - -Table 1 below summarizes which methods to use in which cases, and with -what threshold.
- -\begin{longtable}[]{@{} - >{\raggedright\arraybackslash}p{(\columnwidth - 4\tabcolsep) * \real{0.3506}} - >{\raggedright\arraybackslash}p{(\columnwidth - 4\tabcolsep) * \real{0.3161}} - >{\raggedright\arraybackslash}p{(\columnwidth - 4\tabcolsep) * \real{0.3333}}@{}} -\caption{Summary of Statistical Outlier Detection Methods -Recommendations.}\tabularnewline -\toprule() -\begin{minipage}[b]{\linewidth}\raggedright -Statistical Test -\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright -Diagnosis Method -\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright -Recommended Threshold -\end{minipage} \\ -\midrule() -\endfirsthead -\toprule() -\begin{minipage}[b]{\linewidth}\raggedright -Statistical Test -\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright -Diagnosis Method -\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright -Recommended Threshold -\end{minipage} \\ -\midrule() -\endhead -Supported regression model & \textbf{Model-based}: Cook (or Pareto for -Bayesian models) & \texttt{qf(0.5,\ ncol(x),\ nrow(x)\ -\ ncol(x))} (or -0.7 for Pareto) \\ -Structural Equation Modeling (or other unsupported model) & -\textbf{Multivariate}: Minimum Covariance Determinant (MCD) & -\texttt{qchisq(p\ =\ 1\ -\ 0.001,\ df\ =\ ncol(x))} \\ -Simple test with few variables (\emph{t} test, correlation, etc.) & -\textbf{Univariate}: robust \emph{z} scores (MAD) & -\texttt{qnorm(p\ =\ 1\ -\ 0.001\ /\ 2)}, \textasciitilde{} 3.29 \\ -\bottomrule() -\end{longtable} - -\hypertarget{cooks-distance-vs.-mcd}{% -\subsubsection{Cook's Distance vs.~MCD}\label{cooks-distance-vs.-mcd}} - -\citet{leys2018outliers} report a preference for the MCD method over -Cook's distance. This is because Cook's distance removes one observation -at a time and checks its corresponding influence on the model each time -\citep{cook1977detection}, and flags any observation that has a large -influence. 
In the view of these authors, when there are several -outliers, the process of removing a single outlier at a time is -problematic as the model remains ``contaminated'' or influenced by other -possible outliers in the model, rendering this method suboptimal in the -presence of multiple outliers. - -However, distribution-based approaches are not a silver bullet either, -and there are cases where the usage of methods agnostic to theoretical -and statistical models of interest might be problematic. For example, a -very tall person would be expected to also be much heavier than average, -but that would still fit with the expected association between height -and weight (i.e., it would be in line with a model such as -\texttt{weight\ \textasciitilde{}\ height}). In contrast, -multivariate outlier detection methods may flag this person as -being an outlier---being unusual on two variables, height and -weight---even though the pattern fits perfectly with our predictions. - -In the example below, we plot the raw data and see two possible -outliers. The first one falls along the regression line, and is -therefore ``in line'' with our hypothesis. The second one clearly -diverges from the regression line, and therefore we can conclude that -this outlier may have a disproportionate influence on our model.
- -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{data }\OtherTok{\textless{}{-}}\NormalTok{ women[}\FunctionTok{rep}\NormalTok{(}\FunctionTok{seq\_len}\NormalTok{(}\FunctionTok{nrow}\NormalTok{(women)), }\AttributeTok{each =} \DecValTok{100}\NormalTok{), ]} -\NormalTok{data }\OtherTok{\textless{}{-}} \FunctionTok{rbind}\NormalTok{(data, }\FunctionTok{c}\NormalTok{(}\DecValTok{100}\NormalTok{, }\DecValTok{258}\NormalTok{), }\FunctionTok{c}\NormalTok{(}\DecValTok{100}\NormalTok{, }\DecValTok{200}\NormalTok{))} -\NormalTok{model }\OtherTok{\textless{}{-}} \FunctionTok{lm}\NormalTok{(weight }\SpecialCharTok{\textasciitilde{}}\NormalTok{ height, data)} -\NormalTok{rempsyc}\SpecialCharTok{::}\FunctionTok{nice\_scatter}\NormalTok{(data, }\StringTok{"height"}\NormalTok{, }\StringTok{"weight"}\NormalTok{)} -\end{Highlighting} -\end{Shaded} - -\begin{figure} -\includegraphics[width=1\linewidth]{paper_files/figure-latex/scatter-1} \caption{Scatter plot of height and weight, with two extreme observations: one model-consistent (top-right) and the other, model-inconsistent (i.e., an outlier; bottom-right).}\label{fig:scatter} -\end{figure} - -Using either the \emph{z}-score or MCD methods, our model-consistent -observation will be incorrectly flagged as an outlier or influential -observation. - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{outliers }\OtherTok{\textless{}{-}} \FunctionTok{check\_outliers}\NormalTok{(model, }\AttributeTok{method =} \FunctionTok{c}\NormalTok{(}\StringTok{"zscore\_robust"}\NormalTok{, }\StringTok{"mcd"}\NormalTok{))} -\FunctionTok{which}\NormalTok{(outliers)} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -#> [1] 1501 1502 -\end{verbatim} - -In contrast, the model-based detection method displays the desired -behaviour: it correctly flags the person who is very tall but very -light, without flagging the person who is both tall and heavy. 
- -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{outliers }\OtherTok{\textless{}{-}} \FunctionTok{check\_outliers}\NormalTok{(model, }\AttributeTok{method =} \StringTok{"cook"}\NormalTok{)} -\FunctionTok{which}\NormalTok{(outliers)} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -#> [1] 1502 -\end{verbatim} - -\begin{Shaded} -\begin{Highlighting}[] -\FunctionTok{plot}\NormalTok{(outliers)} -\end{Highlighting} -\end{Shaded} - -\begin{figure} -\includegraphics[width=1\linewidth]{paper_files/figure-latex/model2-1} \caption{The leverage method (Cook's distance) correctly distinguishes the true outlier from the model-consistent extreme observation.}\label{fig:model2} -\end{figure} - -Finally, unusual observations happen naturally: extreme observations are -expected even when taken from a normal distribution. While statistical -models can integrate this ``expectation'', multivariate outlier methods -might be too conservative, flagging too many observations despite -their belonging to the right generative process. For these reasons, we believe -that model-based methods are still preferable to the MCD when using -supported regression models. Additionally, if the presence of multiple -outliers is a significant concern, regression methods that are more -robust to outliers should be considered---like \emph{t} regression or -quantile regression---as they render their precise identification less -critical \citep{mcelreath2020statistical}. - -\hypertarget{multiple-methods}{% -\subsection{Multiple Methods}\label{multiple-methods}} - -An alternative approach is to combine several methods, -based on the assumption that different methods provide different angles -of looking at the problem. By applying a variety of methods, one can -hope to ``triangulate'' the true outliers (those consistently flagged by -multiple methods) and thus attempt to minimize false positives.
- -In practice, this approach computes a composite outlier score, formed of -the average of the binary (0 or 1) classification results of each -method. It represents the probability that each observation is -classified as an outlier by at least one method. The default decision -rule classifies rows with composite outlier scores superior or equal to -0.5 as outlier observations (i.e., that were classified as outliers by -at least half of the methods). In \emph{\{performance\}}'s -\texttt{check\_outliers()}, one can use this approach by including all -desired methods in the corresponding argument. - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{outliers }\OtherTok{\textless{}{-}} \FunctionTok{check\_outliers}\NormalTok{(model, }\AttributeTok{method =} \FunctionTok{c}\NormalTok{(}\StringTok{"zscore\_robust"}\NormalTok{, }\StringTok{"mcd"}\NormalTok{, }\StringTok{"cook"}\NormalTok{))} -\FunctionTok{which}\NormalTok{(outliers)} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -#> [1] 1501 1502 -\end{verbatim} - -Outliers (counts or per variables) for individual methods can then be -obtained through attributes. 
For example: - -\begin{Shaded} -\begin{Highlighting}[] -\FunctionTok{attributes}\NormalTok{(outliers)}\SpecialCharTok{$}\NormalTok{outlier\_var}\SpecialCharTok{$}\NormalTok{zscore\_robust} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -#> $weight -#> Row Distance_Zscore_robust -#> 1501 1501 6.913530 -#> 1502 1502 3.653492 -#> -#> $height -#> Row Distance_Zscore_robust -#> 1501 1501 5.901794 -#> 1502 1502 5.901794 -\end{verbatim} - -An example sentence for reporting the usage of the composite method -could be: - -\begin{quote} -Based on a composite outlier score (see the `check\_outliers()' function -in the `performance' R package, \citep{ludecke2021performance}) obtained -via the joint application of multiple outliers detection algorithms ((a) -median absolute deviation (MAD)-based robust \emph{z} scores, -\citep{leys2013outliers}; (b) Mahalanobis minimum covariance determinant -(MCD), \citep{leys2019outliers}; and (c) Cook's distance, -\citep{cook1977detection}), we excluded two participants that were -classified as outliers by at least half of the methods used. -\end{quote} - -\hypertarget{handling-outliers}{% -\section{Handling Outliers}\label{handling-outliers}} - -The above section demonstrated how to identify outliers using the -\texttt{check\_outliers()} function in the \emph{\{performance\}} -package. But what should we do with these outliers once identified? -Although it is common to automatically discard any observation that has -been marked as ``an outlier'' as if it might infect the rest of the data -with its statistical ailment, we believe that the use of SOD methods is -but one step in the get-to-know-your-data pipeline; a researcher or -analyst's \emph{domain knowledge} must be involved in the decision of -how to deal with observations marked as outliers by means of SOD. -Indeed, automatic tools can help detect outliers, but they are nowhere -near perfect. 
Although they can be useful to flag suspect data, they can -have misses and false alarms, and they cannot replace human eyes and -proper vigilance from the researcher. If you do end up manually -inspecting your data for outliers, it can be helpful to think of -outliers as belonging to different types of outliers, or categories, -which can help decide what to do with a given outlier. - -\hypertarget{error-interesting-and-random-outliers}{% -\subsection{Error, Interesting, and Random -Outliers}\label{error-interesting-and-random-outliers}} - -\citet{leys2019outliers} distinguish between error outliers, interesting -outliers, and random outliers. \emph{Error outliers} are likely due to -human error and should be corrected before data analysis or outright -removed since they are invalid observations. \emph{Interesting outliers} -are not due to technical error and may be of theoretical interest; it -might thus be relevant to investigate them further even though they -should be removed from the current analysis of interest. \emph{Random -outliers} are assumed to be due to chance alone and to belong to the -correct distribution and, therefore, should be retained. - -It is recommended to \emph{keep} observations which are expected to be -part of the distribution of interest, even if they are outliers -\citep{leys2019outliers}. However, if it is suspected that the outliers -belong to an alternative distribution, then those observations could -have a large impact on the results and call into question their -robustness, especially if significance is conditional on their -inclusion. - -On the other hand, there are also outliers that cannot be detected by -statistical tools, but should be found and removed. 
For example, if we -are studying the effects of X on Y among teenagers and we have one -observation from a 20-year-old, this observation might not be a -\emph{statistical outlier}, but it is an outlier in the \emph{context} -of our research, and should be discarded to allow for valid inferences. - -\hypertarget{winsorization}{% -\subsection{Winsorization}\label{winsorization}} - -\emph{Removing} outliers can in this case be a valid strategy, and -ideally one would report results with and without outliers to see the -extent of their impact on results. This approach however can reduce -statistical power. Therefore, some propose a \emph{recoding} approach, -namely, winsorization: bringing outliers back within acceptable limits -\citep[e.g., 3 MADs,][]{tukey1963less}. However, if possible, it is -recommended to collect enough data so that even after removing outliers, -there is still sufficient statistical power without having to resort to -winsorization \citep{leys2019outliers}. - -The \emph{easystats} ecosystem makes it easy to incorporate this step -into your workflow through the \texttt{winsorize()} function of -\emph{\{datawizard\}}, a lightweight R package to facilitate data -wrangling and statistical transformations \citep{patil2022datawizard}. -This procedure will bring back univariate outliers within the limits of -`acceptable' values, based either on the percentile, the \emph{z} score, -or its robust alternative based on the MAD. 
- -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{data[}\DecValTok{1501}\SpecialCharTok{:}\DecValTok{1502}\NormalTok{, ] }\CommentTok{\# See outliers rows} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -#> height weight -#> 1501 100 258 -#> 1502 100 200 -\end{verbatim} - -\begin{Shaded} -\begin{Highlighting}[] -\CommentTok{\# Winsorizing using the MAD} -\FunctionTok{library}\NormalTok{(datawizard)} -\NormalTok{winsorized\_data }\OtherTok{\textless{}{-}} \FunctionTok{winsorize}\NormalTok{(data, }\AttributeTok{method =} \StringTok{"zscore"}\NormalTok{, }\AttributeTok{robust =} \ConstantTok{TRUE}\NormalTok{, }\AttributeTok{threshold =} \DecValTok{3}\NormalTok{)} - -\CommentTok{\# Values \textgreater{} +/{-} MAD have been winsorized} -\NormalTok{winsorized\_data[}\DecValTok{1501}\SpecialCharTok{:}\DecValTok{1502}\NormalTok{, ]} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -#> height weight -#> 1501 82.7912 188.3736 -#> 1502 82.7912 188.3736 -\end{verbatim} - -\hypertarget{the-importance-of-transparency}{% -\subsection{The Importance of -Transparency}\label{the-importance-of-transparency}} - -Once again, it is a critical part of a sound outlier treatment that -regardless of which SOD method used, it should be reported in a -reproducible manner. Ideally, the handling of outliers should be -specified \emph{a priori} with as much detail as possible, and -preregistered, to limit researchers' degrees of freedom and therefore -risks of false positives \citep{leys2019outliers}. This is especially -true given that interesting outliers and random outliers are often times -hard to distinguish in practice. Thus, researchers should always -prioritize transparency and report all of the following information: (a) -how many outliers were identified; (b) according to which method and -criteria, (c) using which function of which R package (if applicable), -and (d) how they were handled (excluded or winsorized, if the latter, -using what threshold). 
If at all possible, (e) the corresponding code -script along with the data should be shared on a public repository like -the Open Science Framework (OSF), so that the exclusion criteria can be -reproduced precisely. - -\hypertarget{conclusion}{% -\section{Conclusion}\label{conclusion}} - -In this paper, we have shown how to investigate outliers using the -\texttt{check\_outliers()} function of the \emph{\{performance\}} -package while following current good practices. However, best practice -for outlier treatment does not stop at using appropriate statistical -algorithms, but entails respecting existing recommendations, such as -preregistration, reproducibility, consistency, transparency, and -justification. Ideally, one would additionally report the package, -function, and threshold used (linking to the full code when possible). -We hope that this paper and the accompanying \texttt{check\_outliers()} -function of \emph{easystats} will help researchers engage in good -research practices while providing a smooth outlier detection -experience. - -% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -% %% optional -% \supplementary{The following are available online at www.mdpi.com/link, Figure S1: title, Table S1: title, Video S1: title.} -% -% % Only for the journal Methods and Protocols: -% % If you wish to submit a video article, please do so with any other supplementary material. -% % \supplementary{The following are available at www.mdpi.com/link: Figure S1: title, Table S1: title, Video S1: title. A supporting video article is available at doi: link.} - -\vspace{6pt} - -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -\acknowledgments{\emph{\{performance\}} is part of the collaborative -\href{https://github.com/easystats/easystats}{\emph{easystats}} -ecosystem \citep{easystatspackage}.
Thus, we thank all -\href{https://github.com/orgs/easystats/people}{members of easystats}, -contributors, and users alike.} - -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -\authorcontributions{R.T. drafted the paper; all authors contributed to -both the writing of the paper and the conception of the software.} - -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -\conflictsofinterest{The authors declare no conflict of interest.} - -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -%% optional -\abbreviations{The following abbreviations are used in this manuscript:\\ - -\noindent -\begin{tabular}{@{}ll} -SOD & Statistical outlier detection \\ -SEM & Structural equation modelling \\ -SD & Standard deviation \\ -MAD & Median absolute deviation \\ -IQR & Interquartile range \\ -HDI & Highest density interval \\ -BCI & Bias corrected and accelerated interval \\ -MCD & Minimum covariance determinant \\ -ICS & invariant coordinate selection \\ -OSF & Open Science Framework \\ -\end{tabular}} - - -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -% Citations and References in Supplementary files are permitted provided that they also appear in the reference list here. - -%===================================== -% References, variant A: internal bibliography -%===================================== -%\reftitle{References} -%\begin{thebibliography}{999} -% Reference 1 -%\bibitem[Author1(year)]{ref-journal} -%Author1, T. The title of the cited article. {\em Journal Abbreviation} {\bf 2008}, {\em 10}, 142--149. -% Reference 2 -%\bibitem[Author2(year)]{ref-book} -%Author2, L. The title of the cited contribution. In {\em The Book Title}; Editor1, F., Editor2, A., Eds.; Publishing House: City, Country, 2007; pp. 32--58. -%\end{thebibliography} - -% The following MDPI journals use author-date citation: Arts, Econometrics, Economies, Genealogy, Humanities, IJFS, JRFM, Laws, Religions, Risks, Social Sciences. 
For those journals, please follow the formatting guidelines on http://www.mdpi.com/authors/references -% To cite two works by the same author: \citeauthor{ref-journal-1a} (\citeyear{ref-journal-1a}, \citeyear{ref-journal-1b}). This produces: Whittaker (1967, 1975) -% To cite two works by the same author with specific pages: \citeauthor{ref-journal-3a} (\citeyear{ref-journal-3a}, p. 328; \citeyear{ref-journal-3b}, p.475). This produces: Wong (1999, p. 328; 2000, p. 475) - -%===================================== -% References, variant B: external bibliography -%===================================== -\reftitle{References} -\externalbibliography{yes} -\bibliography{mybibfile.bib} - -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -%% optional - -%% for journal Sci -%\reviewreports{\\ -%Reviewer 1 comments and authors’ response\\ -%Reviewer 2 comments and authors’ response\\ -%Reviewer 3 comments and authors’ response -%} - -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% - - -\end{document} diff --git a/papers/Mathematics/paper_files/figure-latex/model2-1.pdf b/papers/Mathematics/paper_files/figure-latex/model2-1.pdf deleted file mode 100644 index 09f4b43c0..000000000 Binary files a/papers/Mathematics/paper_files/figure-latex/model2-1.pdf and /dev/null differ diff --git a/papers/Mathematics/paper_files/figure-latex/multimethod-1.pdf b/papers/Mathematics/paper_files/figure-latex/multimethod-1.pdf deleted file mode 100644 index 899d7efff..000000000 Binary files a/papers/Mathematics/paper_files/figure-latex/multimethod-1.pdf and /dev/null differ diff --git a/papers/Mathematics/paper_files/figure-latex/multivariate-1.pdf b/papers/Mathematics/paper_files/figure-latex/multivariate-1.pdf deleted file mode 100644 index 19572182e..000000000 Binary files a/papers/Mathematics/paper_files/figure-latex/multivariate-1.pdf and /dev/null differ diff --git a/papers/Mathematics/paper_files/figure-latex/scatter-1.pdf b/papers/Mathematics/paper_files/figure-latex/scatter-1.pdf deleted file mode 
100644 index ff314a816..000000000 Binary files a/papers/Mathematics/paper_files/figure-latex/scatter-1.pdf and /dev/null differ diff --git a/papers/Mathematics/paper_files/figure-latex/univariate-1.pdf b/papers/Mathematics/paper_files/figure-latex/univariate-1.pdf deleted file mode 100644 index bd9572308..000000000 Binary files a/papers/Mathematics/paper_files/figure-latex/univariate-1.pdf and /dev/null differ diff --git a/papers/Mathematics/paper_files/figure-latex/unnamed-chunk-3-1.pdf b/papers/Mathematics/paper_files/figure-latex/unnamed-chunk-3-1.pdf deleted file mode 100644 index 23f1cf478..000000000 Binary files a/papers/Mathematics/paper_files/figure-latex/unnamed-chunk-3-1.pdf and /dev/null differ diff --git a/papers/Mathematics/paper_files/figure-latex/unnamed-chunk-4-1.pdf b/papers/Mathematics/paper_files/figure-latex/unnamed-chunk-4-1.pdf deleted file mode 100644 index 3316d1de5..000000000 Binary files a/papers/Mathematics/paper_files/figure-latex/unnamed-chunk-4-1.pdf and /dev/null differ diff --git a/papers/Mathematics/paper_files/figure-latex/unnamed-chunk-5-1.pdf b/papers/Mathematics/paper_files/figure-latex/unnamed-chunk-5-1.pdf deleted file mode 100644 index 74d6134f7..000000000 Binary files a/papers/Mathematics/paper_files/figure-latex/unnamed-chunk-5-1.pdf and /dev/null differ diff --git a/papers/Mathematics/paper_files/figure-latex/unnamed-chunk-5-2.pdf b/papers/Mathematics/paper_files/figure-latex/unnamed-chunk-5-2.pdf deleted file mode 100644 index ef8134512..000000000 Binary files a/papers/Mathematics/paper_files/figure-latex/unnamed-chunk-5-2.pdf and /dev/null differ diff --git a/tests/testthat/_snaps/check_collinearity.md b/tests/testthat/_snaps/check_collinearity.md index 3e9aa24b7..dec439e85 100644 --- a/tests/testthat/_snaps/check_collinearity.md +++ b/tests/testthat/_snaps/check_collinearity.md @@ -12,3 +12,26 @@ P 1.00 1.00 1.00 K 1.00 1.00 1.00 +# check_collinearity, hurdle/zi models w/o zi-formula + + Code + print(out) + Output + # 
Check for Multicollinearity + + * conditional component: + + Low Correlation + + Term VIF VIF 95% CI Increased SE Tolerance Tolerance 95% CI + fem 1.06 [1.02, 1.20] 1.03 0.95 [0.83, 0.98] + mar 1.06 [1.02, 1.20] 1.03 0.95 [0.83, 0.98] + + * zero inflated component: + + Low Correlation + + Term VIF VIF 95% CI Increased SE Tolerance Tolerance 95% CI + fem 1.07 [1.02, 1.20] 1.03 0.94 [0.83, 0.98] + mar 1.07 [1.02, 1.20] 1.03 0.94 [0.83, 0.98] + diff --git a/tests/testthat/_snaps/windows/check_itemscale.md b/tests/testthat/_snaps/windows/check_itemscale.md new file mode 100644 index 000000000..48e0bd482 --- /dev/null +++ b/tests/testthat/_snaps/windows/check_itemscale.md @@ -0,0 +1,24 @@ +# check_itemscale + + Code + print(out) + Output + # Description of (Sub-)ScalesComponent 1 + + Item | Missings | Mean | SD | Skewness | Difficulty | Discrimination | alpha if deleted + ----------------------------------------------------------------------------------------- + b | 0 | 5.02 | 0.79 | -0.04 | 0.84 | 0.06 | -0.55 + e | 0 | 2.12 | 0.81 | -0.22 | 0.35 | -0.09 | -0.03 + f | 0 | 2.00 | 0.82 | 0.00 | 0.33 | -0.16 | 0.17 + + Mean inter-item-correlation = -0.046 Cronbach's alpha = -0.159 + Component 2 + + Item | Missings | Mean | SD | Skewness | Difficulty | Discrimination | alpha if deleted + ----------------------------------------------------------------------------------------- + a | 0 | 5.02 | 0.83 | -0.04 | 0.84 | 0.21 | -0.18 + c | 0 | 4.74 | 0.81 | 0.51 | 0.79 | -0.04 | 0.41 + d | 0 | 2.07 | 0.79 | -0.13 | 0.34 | 0.13 | 0.04 + + Mean inter-item-correlation = 0.067 Cronbach's alpha = 0.178 + diff --git a/tests/testthat/test-binned_residuals.R b/tests/testthat/test-binned_residuals.R index 4aa69e0ec..e23e05d01 100644 --- a/tests/testthat/test-binned_residuals.R +++ b/tests/testthat/test-binned_residuals.R @@ -1,10 +1,10 @@ test_that("binned_residuals", { data(mtcars) model <- glm(vs ~ wt + mpg, data = mtcars, family = "binomial") - result <- binned_residuals(model) + 
result <- binned_residuals(model, ci_type = "gaussian", residuals = "response") expect_named( result, - c("xbar", "ybar", "n", "x.lo", "x.hi", "se", "ci_range", "CI_low", "CI_high", "group") + c("xbar", "ybar", "n", "x.lo", "x.hi", "se", "CI_low", "CI_high", "group") ) expect_equal( result$xbar, @@ -16,16 +16,25 @@ test_that("binned_residuals", { c(-0.03786, -0.09514, 0.07423, -0.07955, 0.28891, -0.13786), tolerance = 1e-4 ) + expect_equal( + result$CI_low, + c(-0.05686, -0.12331, -0.35077, -0.57683, 0.17916, -0.44147), + tolerance = 1e-4 + ) + expect_identical( + capture.output(print(result)), + "Warning: Probably bad model fit. Only about 50% of the residuals are inside the error bounds." + ) }) test_that("binned_residuals, n_bins", { data(mtcars) model <- glm(vs ~ wt + mpg, data = mtcars, family = "binomial") - result <- binned_residuals(model, n_bins = 10) + result <- binned_residuals(model, ci_type = "gaussian", residuals = "response", n_bins = 10) expect_named( result, - c("xbar", "ybar", "n", "x.lo", "x.hi", "se", "ci_range", "CI_low", "CI_high", "group") + c("xbar", "ybar", "n", "x.lo", "x.hi", "se", "CI_low", "CI_high", "group") ) expect_equal( result$xbar, @@ -49,10 +58,10 @@ test_that("binned_residuals, n_bins", { test_that("binned_residuals, terms", { data(mtcars) model <- glm(vs ~ wt + mpg, data = mtcars, family = "binomial") - result <- binned_residuals(model, term = "mpg") + result <- binned_residuals(model, ci_type = "gaussian", residuals = "response", term = "mpg") expect_named( result, - c("xbar", "ybar", "n", "x.lo", "x.hi", "se", "ci_range", "CI_low", "CI_high", "group") + c("xbar", "ybar", "n", "x.lo", "x.hi", "se", "CI_low", "CI_high", "group") ) expect_equal( result$xbar, @@ -65,3 +74,174 @@ test_that("binned_residuals, terms", { tolerance = 1e-4 ) }) + + +test_that("binned_residuals, deviance residuals, gaussian CI", { + data(mtcars) + model <- glm(vs ~ wt + mpg, data = mtcars, family = "binomial") + result <- binned_residuals(model, 
residuals = "deviance", ci_type = "gaussian") + expect_named( + result, + c("xbar", "ybar", "n", "x.lo", "x.hi", "se", "CI_low", "CI_high", "group") + ) + expect_equal( + result$xbar, + c(0.03786, 0.09514, 0.25911, 0.47955, 0.71109, 0.97119), + tolerance = 1e-4 + ) + expect_equal( + result$ybar, + c(-0.26905, -0.44334, 0.03763, -0.19917, 0.81563, -0.23399), + tolerance = 1e-4 + ) + expect_equal( + result$ybar, + c(-0.26905, -0.44334, 0.03763, -0.19917, 0.81563, -0.23399), + tolerance = 1e-4 + ) + expect_equal( + result$CI_low, + c(-0.33985, -0.50865, -0.98255, -1.36025, 0.61749, -1.00913), + tolerance = 1e-4 + ) +}) + + +test_that("binned_residuals, default", { + data(mtcars) + model <- glm(vs ~ wt + mpg, data = mtcars, family = "binomial") + result <- binned_residuals(model) + expect_named( + result, + c("xbar", "ybar", "n", "x.lo", "x.hi", "se", "CI_low", "CI_high", "group") + ) + expect_equal( + result$xbar, + c(0.03786, 0.09514, 0.25911, 0.47955, 0.71109, 0.97119), + tolerance = 1e-4 + ) + expect_equal( + result$ybar, + c(-0.26905, -0.44334, 0.03763, -0.19917, 0.81563, -0.23399), + tolerance = 1e-4 + ) + expect_equal( + result$CI_low, + c(-0.52997, -0.70426, -0.32935, -0.59948, 0.55472, -0.55251), + tolerance = 1e-4 + ) +}) + + +test_that("binned_residuals, bootstrapped CI", { + skip_on_cran() + data(mtcars) + model <- glm(vs ~ wt + mpg, data = mtcars, family = "binomial") + set.seed(123) + result <- binned_residuals(model, ci_type = "boot", iterations = 100) + expect_named( + result, + c("xbar", "ybar", "n", "x.lo", "x.hi", "se", "CI_low", "CI_high", "group") + ) + expect_equal( + result$xbar, + c(0.03786, 0.09514, 0.25911, 0.47955, 0.71109, 0.97119), + tolerance = 1e-4 + ) + expect_equal( + result$ybar, + c(-0.26905, -0.44334, 0.03763, -0.19917, 0.81563, -0.23399), + tolerance = 1e-4 + ) + expect_equal( + result$CI_low, + c(-0.32623, -0.50543, -0.80879, -1.15154, 0.67569, -0.65748), + tolerance = 1e-4 + ) +}) + +test_that("binned_residuals, msg for 
non-bernoulli", { + skip_on_cran() + skip_if(packageVersion("insight") < "0.19.7") + tot <- rep(10, 100) + suc <- rbinom(100, prob = 0.9, size = tot) + + dat <- data.frame(tot, suc) + dat$prop <- suc / tot + dat$x1 <- as.factor(sample.int(5, 100, replace = TRUE)) + + mod <- glm(prop ~ x1, + family = binomial, + data = dat, + weights = tot + ) + + expect_message(binned_residuals(mod), regex = "Using `ci_type = \"gaussian\"`") + expect_silent(binned_residuals(mod, verbose = FALSE)) +}) + +test_that("binned_residuals, empty bins", { + eel <- data.frame( + cured_bin = c( + 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, + 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, + 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, + 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, + 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, + 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0 + ), + intervention = c( + "No treatment", + "No treatment", "No treatment", "No treatment", "Intervention", + "No treatment", "Intervention", "Intervention", "No treatment", + "No treatment", "Intervention", "No treatment", "No treatment", + "Intervention", "No treatment", "No treatment", "Intervention", + "Intervention", "Intervention", "Intervention", "No treatment", + "Intervention", "Intervention", "No treatment", "Intervention", + "Intervention", "No treatment", "No treatment", "Intervention", + "Intervention", "No treatment", "No treatment", "Intervention", + "Intervention", "Intervention", "No treatment", "No treatment", + "Intervention", "No treatment", "Intervention", "No treatment", + "Intervention", "Intervention", "Intervention", "No treatment", + "No treatment", "No treatment", "Intervention", "Intervention", + "No treatment", "Intervention", "Intervention", "Intervention", + "No treatment", "No treatment", "Intervention", "Intervention", + "No treatment", "Intervention", "Intervention", "No treatment", + "No treatment", "No treatment", 
"Intervention", "Intervention", + "No treatment", "No treatment", "No treatment", "No treatment", + "No treatment", "Intervention", "No treatment", "Intervention", + "Intervention", "Intervention", "No treatment", "Intervention", + "Intervention", "No treatment", "Intervention", "No treatment", + "No treatment", "Intervention", "Intervention", "Intervention", + "Intervention", "No treatment", "Intervention", "Intervention", + "No treatment", "Intervention", "No treatment", "Intervention", + "Intervention", "Intervention", "Intervention", "No treatment", + "No treatment", "No treatment", "Intervention", "No treatment", + "No treatment", "Intervention", "No treatment", "No treatment", + "No treatment", "No treatment", "No treatment", "Intervention", + "Intervention", "No treatment", "No treatment", "Intervention" + ), duration = c( + 7L, 7L, 6L, 8L, 7L, 6L, 7L, 7L, 8L, 7L, 7L, 7L, + 5L, 9L, 6L, 7L, 8L, 7L, 7L, 9L, 7L, 9L, 8L, 7L, 6L, 8L, 7L, 6L, + 7L, 6L, 7L, 6L, 5L, 6L, 7L, 7L, 8L, 7L, 5L, 7L, 9L, 10L, 7L, + 8L, 5L, 8L, 4L, 7L, 8L, 6L, 6L, 6L, 7L, 7L, 8L, 7L, 7L, 7L, 7L, + 8L, 7L, 9L, 7L, 8L, 8L, 7L, 7L, 7L, 8L, 7L, 8L, 7L, 8L, 8L, 9L, + 7L, 10L, 5L, 7L, 8L, 9L, 5L, 10L, 8L, 7L, 6L, 5L, 6L, 7L, 7L, + 7L, 7L, 7L, 7L, 8L, 5L, 6L, 7L, 6L, 7L, 7L, 9L, 6L, 6L, 7L, 7L, + 6L, 7L, 8L, 9L, 4L, 6L, 9L + ), + stringsAsFactors = FALSE + ) + m_eel <- glm(cured_bin ~ intervention + duration, data = eel, family = binomial()) + out <- binned_residuals(m_eel) + expect_equal( + out$xbar, + c(0.27808, 0.28009, 0.28167, 0.28326, 0.48269, 0.56996, 0.57188, 0.57456), + tolerance = 1e-4 + ) + expect_equal( + out$CI_low, + c(-0.42552, -0.45162, -0.10819, -0.7339, -0.28086, -0.52599, 0.02795, -0.44023), + tolerance = 1e-4 + ) +}) diff --git a/tests/testthat/test-check_autocorrelation.R b/tests/testthat/test-check_autocorrelation.R index b389f3985..a97b2eeaa 100644 --- a/tests/testthat/test-check_autocorrelation.R +++ b/tests/testthat/test-check_autocorrelation.R @@ -3,5 +3,10 @@ 
test_that("check_autocorrelation", { m <- lm(mpg ~ wt + cyl + gear + disp, data = mtcars) set.seed(123) out <- check_autocorrelation(m) - expect_equal(as.vector(out), 0.316, ignor_attr = TRUE, tolerance = 1e-2) + expect_equal(as.vector(out), 0.316, ignore_attr = TRUE, tolerance = 1e-2) + expect_identical( + capture.output(print(out)), + "OK: Residuals appear to be independent and not autocorrelated (p = 0.316)." + ) + expect_warning(plot(out), "There is currently") }) diff --git a/tests/testthat/test-check_collinearity.R b/tests/testthat/test-check_collinearity.R index 3d68b87ac..042142073 100644 --- a/tests/testthat/test-check_collinearity.R +++ b/tests/testthat/test-check_collinearity.R @@ -24,6 +24,7 @@ test_that("check_collinearity, correct order in print", { test_that("check_collinearity", { + skip_if(getRversion() > "4.3.3") skip_if_not_installed("glmmTMB") skip_if_not(getRversion() >= "4.0.0") @@ -50,6 +51,7 @@ test_that("check_collinearity", { test_that("check_collinearity", { + skip_if(getRversion() > "4.3.3") skip_if_not_installed("glmmTMB") skip_if_not(getRversion() >= "4.0.0") @@ -202,12 +204,20 @@ test_that("check_collinearity, hurdle/zi models w/o zi-formula", { link = "logit" ) out <- check_collinearity(m) - expect_identical( - colnames(out), + expect_named( + out, c( "Term", "VIF", "VIF_CI_low", "VIF_CI_high", "SE_factor", "Tolerance", "Tolerance_CI_low", "Tolerance_CI_high", "Component" ) ) expect_equal(out$VIF, c(1.05772, 1.05772, 1.06587, 1.06587), tolerance = 1e-4) + expect_snapshot(print(out)) +}) + +test_that("check_collinearity, invalid data", { + skip_if(packageVersion("insight") < "0.19.8.2") + dd <- data.frame(y = as.difftime(0:5, units = "days")) + m1 <- lm(y ~ 1, data = dd) + expect_error(check_collinearity(m1), "Can't extract variance-covariance matrix") }) diff --git a/tests/testthat/test-check_convergence.R b/tests/testthat/test-check_convergence.R index 1663d1219..0bfadc8a6 100644 --- a/tests/testthat/test-check_convergence.R +++ 
b/tests/testthat/test-check_convergence.R @@ -26,3 +26,21 @@ test_that("check_convergence", { model <- lme4::lmer(Reaction ~ Days + (1 + Days | Subject), data = sleepstudy) expect_true(check_convergence(model)) }) + + +test_that("check_convergence, glmmTMB", { + skip_if(getRversion() > "4.3.3") + skip_if_not_installed("glmmTMB") + data(iris) + model <- suppressWarnings(glmmTMB::glmmTMB( + Sepal.Length ~ poly(Petal.Width, 4) * poly(Petal.Length, 4) + + (1 + poly(Petal.Width, 4) | Species), + data = iris + )) + expect_false(check_convergence(model)) + model <- suppressWarnings(glmmTMB::glmmTMB( + Sepal.Length ~ Petal.Width + (1 | Species), + data = iris + )) + expect_true(check_convergence(model)) +}) diff --git a/tests/testthat/test-check_heterogeneity_bias.R b/tests/testthat/test-check_heterogeneity_bias.R index 7abc6af30..2bd63856e 100644 --- a/tests/testthat/test-check_heterogeneity_bias.R +++ b/tests/testthat/test-check_heterogeneity_bias.R @@ -1,7 +1,7 @@ test_that("check_heterogeneity_bias", { data(iris) set.seed(123) - iris$ID <- sample(1:4, nrow(iris), replace = TRUE) # fake-ID + iris$ID <- sample.int(4, nrow(iris), replace = TRUE) # fake-ID out <- check_heterogeneity_bias(iris, select = c("Sepal.Length", "Petal.Length"), group = "ID") expect_equal(out, c("Sepal.Length", "Petal.Length"), ignore_attr = TRUE) expect_output(print(out), "Possible heterogeneity bias due to following predictors: Sepal\\.Length, Petal\\.Length") diff --git a/tests/testthat/test-check_heteroskedasticity.R b/tests/testthat/test-check_heteroskedasticity.R new file mode 100644 index 000000000..4d64a870b --- /dev/null +++ b/tests/testthat/test-check_heteroskedasticity.R @@ -0,0 +1,17 @@ +test_that("check_heteroskedasticity", { + data(mtcars) + m <- lm(mpg ~ wt + cyl + gear + disp, data = mtcars) + out <- check_heteroscedasticity(m) + expect_equal(as.vector(out), 0.0423, ignore_attr = TRUE, tolerance = 1e-2) + expect_identical( + capture.output(print(out)), + "Warning: Heteroscedasticity 
(non-constant error variance) detected (p = 0.042)." + ) + m <- lm(mpg ~ hp, data = mtcars) + out <- check_heteroscedasticity(m) + expect_equal(as.vector(out), 0.8271352, ignore_attr = TRUE, tolerance = 1e-2) + expect_identical( + capture.output(print(out)), + "OK: Error variance appears to be homoscedastic (p = 0.827)." + ) +}) diff --git a/tests/testthat/test-check_homogeneity.R b/tests/testthat/test-check_homogeneity.R index 40bba5479..1b3fddbd9 100644 --- a/tests/testthat/test-check_homogeneity.R +++ b/tests/testthat/test-check_homogeneity.R @@ -21,9 +21,37 @@ test_that("check_homogeneity | afex", { expect_error(check_homogeneity(aW)) msg <- capture.output(expect_message(check_homogeneity(aB, method = "bartlett"), "Only")) - msg <- capture.output(pM <- check_homogeneity(aM)) - msg <- capture.output(pB <- check_homogeneity(aB)) + msg <- capture.output({ + pM <- check_homogeneity(aM) + }) + msg <- capture.output({ + pB <- check_homogeneity(aB) + }) expect_equal(pM, 0.3496516, ignore_attr = TRUE, tolerance = 0.001) expect_equal(pB, 0.3496516, ignore_attr = TRUE, tolerance = 0.001) }) + +test_that("check_homogeneity | t-test", { + data(mtcars) + expect_error( + check_homogeneity(t.test(mtcars$mpg, mtcars$hp, var.equal = FALSE)), + regex = "Test does not assume" + ) + + out <- t.test(mtcars$mpg, mtcars$hp, var.equal = TRUE) + expect_equal( + check_homogeneity(out), + structure( + 6.18792236963585e-121, + object_name = out, + method = "Bartlett Test", + class = c( + "check_homogeneity", + "see_check_homogeneity", "numeric" + ) + ), + tolerance = 1e-3, + ignore_attr = TRUE + ) +}) diff --git a/tests/testthat/test-check_itemscale.R b/tests/testthat/test-check_itemscale.R index 119283d37..eb64b79b5 100644 --- a/tests/testthat/test-check_itemscale.R +++ b/tests/testthat/test-check_itemscale.R @@ -1,4 +1,4 @@ -test_that("check_convergence", { +test_that("check_itemscale", { skip_if_not_installed("parameters") set.seed(123) @@ -25,4 +25,49 @@ test_that("check_convergence", 
{ tolerance = 1e-4, ignore_attr = TRUE ) + expect_snapshot(print(out), variant = "windows") + comp <- parameters::closest_component(pca) + out2 <- check_itemscale(d, comp) + expect_equal( + out[[1]]$Mean, + out2[[1]]$Mean, + tolerance = 1e-4, + ignore_attr = TRUE + ) + expect_equal( + out[[1]]$Difficulty, + out2[[1]]$Difficulty, + tolerance = 1e-4, + ignore_attr = TRUE + ) + # factor_index as none-named vector + out3 <- check_itemscale(d, factor_index = c(2, 1, 2, 2, 1, 1)) + expect_equal( + out[[1]]$Mean, + out3[[1]]$Mean, + tolerance = 1e-4, + ignore_attr = TRUE + ) + expect_equal( + out[[1]]$Difficulty, + out3[[1]]$Difficulty, + tolerance = 1e-4, + ignore_attr = TRUE + ) + expect_error( + check_itemscale(d), + regex = "If `x` is a data" + ) + expect_error( + check_itemscale(d, factor_index = 1:8), + regex = "`factor_index` must be of same" + ) + expect_error( + check_itemscale(d, factor_index = factor(comp)), + regex = "`factor_index` must be numeric." + ) + expect_error( + check_itemscale(iris$Species), + regex = "`x` must be an object of class" + ) }) diff --git a/tests/testthat/test-check_model.R b/tests/testthat/test-check_model.R index 6b4ce4db2..6543e5065 100644 --- a/tests/testthat/test-check_model.R +++ b/tests/testthat/test-check_model.R @@ -36,3 +36,76 @@ test_that("`check_outliers()` works if convergence issues", { x <- check_outliers(m, verbose = FALSE) expect_s3_class(x, "check_outliers") }) + +test_that("`check_model()` for invalid models", { + skip_if(packageVersion("insight") < "0.19.8.2") + dd <- data.frame(y = as.difftime(0:5, units = "days")) + m1 <- lm(y ~ 1, data = dd) + expect_error(check_model(m1)) +}) + +test_that("`check_model()` works for quantreg", { + skip_if_not_installed("quantreg") + data(engel, package = "quantreg") + qm <- quantreg::rq(foodexp ~ income, data = engel) + x <- check_model(qm, verbose = FALSE) + expect_s3_class(x, "check_model") +}) + +test_that("`check_model()` warnings for tweedie", { + skip_if(getRversion() > 
"4.3.3") + skip_if_not_installed("glmmTMB") + skip_if_not_installed("lme4") + data(sleepstudy, package = "lme4") + set.seed(123) + d <- sleepstudy[sample.int(50), ] + m <- suppressWarnings(glmmTMB::glmmTMB(Reaction ~ Days, + data = d, + family = glmmTMB::tweedie + )) + expect_message( + expect_message( + check_model(m, iterations = 2, verbose = TRUE), + regex = "Not enough model terms" + ) + ) +}) + + +test_that("`check_model()` warnings for zero-infl", { + skip_if_not_installed("pscl") + data(bioChemists, package = "pscl") + model <- pscl::zeroinfl( + art ~ fem + mar + kid5 + ment | kid5 + phd, + data = bioChemists + ) + expect_message(expect_message(check_model(model, verbose = TRUE), regex = "Cannot simulate"), regex = "Homogeneity") +}) + + +test_that("`check_model()` no warnings for quasipoisson", { + skip_if_not_installed("datawizard") + set.seed(250419) + # Generate random x values + x <- rnorm( + n = 500, + mean = 5, + sd = 2 + ) + # Generate y values y = 5x + e + y <- 5 * x + rnorm( + n = 500, + mean = 5, + sd = 2 + ) + # Generate z as offset + z <- runif(500, min = 0, max = 6719) + mock_data <- data.frame(x, y, z) |> + # both should be whole numbers since they're counts + datawizard::data_modify(y = round(y), z = round(z)) |> + datawizard::data_filter(x >= 0, y >= 0) + # Run model + model1 <- glm(y ~ x + offset(log(z)), family = "quasipoisson", data = mock_data) + expect_message(check_model(model1, verbose = TRUE), regex = "Not enough") + expect_silent(check_model(model1)) +}) diff --git a/tests/testthat/test-check_normality.R b/tests/testthat/test-check_normality.R index 912f86d4b..44b461aea 100644 --- a/tests/testthat/test-check_normality.R +++ b/tests/testthat/test-check_normality.R @@ -19,9 +19,15 @@ test_that("check_normality | afex", { ) })) - msg <- capture.output(pM <- check_normality(aM)) - msg <- capture.output(pW <- check_normality(aW)) - msg <- capture.output(pB <- check_normality(aB)) + msg <- capture.output({ + pM <- check_normality(aM) + }) 
+ msg <- capture.output({ + pW <- check_normality(aW) + }) + msg <- capture.output({ + pB <- check_normality(aB) + }) expect_equal(pM, 0.2054236, ignore_attr = TRUE, tolerance = 0.001) expect_equal(pW, 0.5496325, ignore_attr = TRUE, tolerance = 0.001) @@ -29,6 +35,7 @@ test_that("check_normality | afex", { }) test_that("check_normality | glmmTMB", { + skip_if(getRversion() > "4.3.3") skip_if_not_installed("glmmTMB") skip_if_not(getRversion() >= "4.0.0") @@ -40,12 +47,70 @@ test_that("check_normality | glmmTMB", { ) out <- check_normality(m, effects = "random") - expect_equal(attributes(out)$re_groups, "site: (Intercept)") + expect_identical(attributes(out)$re_groups, "site: (Intercept)") expect_equal(as.vector(out), 0.698457693553405, tolerance = 1e-3) expect_message( - out <- check_normality(m, effects = "fixed"), + { + out <- check_normality(m, effects = "fixed") + }, "for linear models" ) expect_null(out) }) + + +test_that("check_normality | t-test", { + data(mtcars) + expect_error( + check_normality(t.test(mtcars$mpg, mtcars$hp, var.equal = FALSE)), + regex = "Discrete or character variables" + ) + out <- t.test(mtcars$mpg, mtcars$hp, var.equal = TRUE) + expect_equal( + check_normality(out), + structure( + 7.15789362314837e-12, + type = "residuals", + object_name = out, + effects = "fixed", + class = c( + "check_normality", + "see_check_normality", "numeric" + ) + ), + tolerance = 1e-3, + ignore_attr = TRUE + ) +}) + + +test_that("check_normality | simulated residuals", { + skip_if_not_installed("DHARMa") + m <- lm(mpg ~ wt + cyl + gear + disp, data = mtcars) + res <- simulate_residuals(m) + out <- check_normality(res) + expect_equal( + as.numeric(out), + 0.2969038, + tolerance = 1e-3, + ignore_attr = TRUE + ) + expect_identical( + capture.output(print(out)), + "OK: residuals appear as normally distributed (p = 0.297)." 
+ ) + + m <- lm(mpg ~ wt + cyl + gear + disp, data = mtcars) + out <- check_normality(m) + expect_equal( + as.numeric(out), + 0.2303071, + tolerance = 1e-3, + ignore_attr = TRUE + ) + expect_identical( + capture.output(print(out)), + "OK: residuals appear as normally distributed (p = 0.230)." + ) +}) diff --git a/tests/testthat/test-check_outliers.R b/tests/testthat/test-check_outliers.R index e464028d0..704b27feb 100644 --- a/tests/testthat/test-check_outliers.R +++ b/tests/testthat/test-check_outliers.R @@ -6,6 +6,7 @@ test_that("zscore negative threshold", { }) test_that("lof illegal threshold", { + skip_if_not_installed("dbscan") expect_error( check_outliers(mtcars$mpg, method = "lof", threshold = -1), "The `threshold` argument" @@ -84,12 +85,22 @@ test_that("mahalanobis_robust which", { }) test_that("mcd which", { + skip_if_not_installed("MASS") # (not clear why method mcd needs a seed) set.seed(42) expect_identical( - tail(which(check_outliers(mtcars[1:4], method = "mcd", threshold = 45))), + tail(which(check_outliers(mtcars[1:4], method = "mcd", threshold = 45, verbose = FALSE))), 31L ) + expect_warning( + { + out <- check_outliers(mtcars, method = "mcd") + }, + regex = "The sample size is too small" + ) + expect_identical(sum(out), 8L) + out <- check_outliers(mtcars, method = "mcd", percentage_central = 0.5, verbose = FALSE) + expect_identical(sum(out), 15L) }) ## FIXME: Fails on CRAN/windows @@ -190,6 +201,12 @@ test_that("multiple methods which", { # We exclude method ics because it is too slow test_that("all methods which", { skip_if_not_installed("bigutilsr") + skip_if_not_installed("MASS") + skip_if_not_installed("dbscan") + skip_if_not_installed("ICS") + skip_if_not_installed("ICSOutlier") + skip_if_not_installed("loo") + expect_identical( which(check_outliers(mtcars, method = c( @@ -197,11 +214,12 @@ test_that("all methods which", { "mahalanobis", "mahalanobis_robust", "mcd", "optics", "lof" ), threshold = list( - "zscore" = 2.2, "zscore_robust" = 
2.2, "iqr" = 1.2, - "ci" = 0.95, "eti" = 0.95, "hdi" = 0.90, "bci" = 0.95, - "mahalanobis" = 20, "mahalanobis_robust" = 25, "mcd" = 25, - "optics" = 14, "lof" = 0.005 - ) + zscore = 2.2, zscore_robust = 2.2, iqr = 1.2, + ci = 0.95, eti = 0.95, hdi = 0.90, bci = 0.95, + mahalanobis = 20, mahalanobis_robust = 25, mcd = 25, + optics = 14, lof = 0.005 + ), + verbose = FALSE )), as.integer(c(9, 15, 16, 19, 20, 28, 29, 31)) ) @@ -212,6 +230,12 @@ test_that("all methods which", { test_that("multiple methods with ID", { skip_if_not_installed("bigutilsr") + skip_if_not_installed("MASS") + skip_if_not_installed("dbscan") + skip_if_not_installed("ICS") + skip_if_not_installed("ICSOutlier") + skip_if_not_installed("loo") + data <- datawizard::rownames_as_column(mtcars, var = "car") x <- attributes(check_outliers(data, method = c( @@ -219,12 +243,13 @@ test_that("multiple methods with ID", { "mahalanobis", "mahalanobis_robust", "mcd", "optics", "lof" ), threshold = list( - "zscore" = 2.2, "zscore_robust" = 2.2, "iqr" = 1.2, - "ci" = 0.95, "eti" = 0.95, "hdi" = 0.90, "bci" = 0.95, - "mahalanobis" = 20, "mahalanobis_robust" = 25, "mcd" = 25, - "optics" = 14, "lof" = 0.005 + zscore = 2.2, zscore_robust = 2.2, iqr = 1.2, + ci = 0.95, eti = 0.95, hdi = 0.90, bci = 0.95, + mahalanobis = 20, mahalanobis_robust = 25, mcd = 25, + optics = 14, lof = 0.005 ), - ID = "car" + ID = "car", + verbose = FALSE )) expect_identical( x$outlier_var$zscore$mpg$car, @@ -269,6 +294,7 @@ test_that("cook multiple methods which", { test_that("pareto which", { skip_if_not_installed("dbscan") + skip_if_not_installed("loo") skip_if_not_installed("rstanarm") set.seed(123) model <- rstanarm::stan_glm(mpg ~ qsec + wt, data = mtcars, refresh = 0) @@ -282,6 +308,7 @@ test_that("pareto which", { test_that("pareto multiple methods which", { skip_if_not_installed("dbscan") + skip_if_not_installed("loo") skip_if_not_installed("rstanarm") set.seed(123) model <- rstanarm::stan_glm(mpg ~ qsec + wt, data = mtcars, 
refresh = 0) @@ -318,3 +345,53 @@ test_that("cook multiple methods which", { c("setosa", "versicolor", "virginica") ) }) + + +test_that("check_outliers with invald data", { + dd <- data.frame(y = as.difftime(0:5, units = "days")) + m1 <- lm(y ~ 1, data = dd) + expect_error( + expect_message( + check_outliers(m1), + regex = "Date variables are not supported" + ), + regex = "No numeric variables found" + ) +}) + + +test_that("check_outliers with DHARMa", { + skip_if_not_installed("DHARMa") + mt1 <- mtcars[, c(1, 3, 4)] + # create some fake outliers and attach outliers to main df + mt2 <- rbind(mt1, data.frame( + mpg = c(37, 40), disp = c(300, 400), + hp = c(110, 120) + )) + # fit model with outliers + model <- lm(disp ~ mpg + hp, data = mt2) + set.seed(123) + res <- simulate_residuals(model) + out <- check_outliers(res) + expect_equal( + out, + structure( + list( + Coefficient = 0.0294117647058824, Expected = 0.00796812749003984, + CI_low = 0.000744364234690261, CI_high = 0.153267669560318, + p_value = 0.238146844116552 + ), + class = c("check_outliers_simres", "list") + ), + ignore_attr = TRUE, + tolerance = 1e-4 + ) + expect_identical( + capture.output(print(out)), + c( + "# Outliers detection", "", " Proportion of observed outliers: 2.94%", + " Proportion of expected outliers: 0.80%, 95% CI [0.07, 15.33]", + "" + ) + ) +}) diff --git a/tests/testthat/test-check_overdispersion.R b/tests/testthat/test-check_overdispersion.R index cdd36bcd0..06cc95dd0 100644 --- a/tests/testthat/test-check_overdispersion.R +++ b/tests/testthat/test-check_overdispersion.R @@ -1,11 +1,13 @@ -test_that("check_overdispersion", { +test_that("check_overdispersion, glmmTMB-poisson", { + skip_if(getRversion() > "4.3.3") skip_if_not_installed("glmmTMB") skip_if_not(getRversion() >= "4.0.0") data(Salamanders, package = "glmmTMB") m1 <- glm(count ~ spp + mined, family = poisson, data = Salamanders) + out <- check_overdispersion(m1) expect_equal( - check_overdispersion(m1), + out, structure( 
list( chisq_statistic = 1873.71012423995, @@ -18,9 +20,37 @@ test_that("check_overdispersion", { ), tolerance = 1e-3 ) + expect_identical( + capture.output(print(out)), + c( + "# Overdispersion test", + "", + " dispersion ratio = 2.946", + " Pearson's Chi-Squared = 1873.710", + " p-value = < 0.001", + "" + ) + ) + expect_message(capture.output(print(out)), "Overdispersion detected") + + set.seed(123) + out <- check_overdispersion(simulate_residuals(m1)) + expect_equal( + out, + structure( + list( + dispersion_ratio = 3.91516791651235, + p_value = 0 + ), + class = c("check_overdisp", "see_check_overdisp") + ), + tolerance = 1e-3 + ) }) -test_that("check_overdispersion", { + +test_that("check_overdispersion, glmmTMB-poisson mixed", { + skip_if(getRversion() > "4.3.3") skip_if_not_installed("glmmTMB") skip_if_not(getRversion() >= "4.0.0") data(Salamanders, package = "glmmTMB") @@ -45,3 +75,135 @@ test_that("check_overdispersion", { tolerance = 1e-3 ) }) + + +test_that("check_overdispersion, zero-inflated and negbin", { + skip_if(getRversion() > "4.3.3") + skip_if_not_installed("glmmTMB") + skip_if_not_installed("DHARMa") + skip_if_not(getRversion() >= "4.0.0") + data(Salamanders, package = "glmmTMB") + + m1 <- glmmTMB::glmmTMB( + count ~ spp + mined, + ziformula = ~ spp + mined, + family = poisson, + data = Salamanders + ) + m2 <- glmmTMB::glmmTMB( + count ~ spp + mined, + family = poisson, + data = Salamanders + ) + m3 <- glmmTMB::glmmTMB( + count ~ spp + mined, + family = glmmTMB::nbinom1(), + data = Salamanders + ) + expect_equal( + check_overdispersion(m1), + structure( + list( + dispersion_ratio = 1.98057695890769, + p_value = 0 + ), + class = c("check_overdisp", "see_check_overdisp") + ), + tolerance = 1e-4, + ignore_attr = TRUE + ) + expect_equal( + check_overdispersion(m2), + structure( + list( + chisq_statistic = 1873.7105986433, + dispersion_ratio = 2.94608584692342, + residual_df = 636L, + p_value = 3.26556213101505e-122 + ), + class = c("check_overdisp", 
"see_check_overdisp"), + object_name = "m1" + ), + tolerance = 1e-4, + ignore_attr = TRUE + ) + expect_equal( + check_overdispersion(m1), + structure( + list( + dispersion_ratio = 1.98057695890769, + p_value = 0 + ), + class = c("check_overdisp", "see_check_overdisp") + ), + tolerance = 1e-4, + ignore_attr = TRUE + ) +}) + + +test_that("check_overdispersion, MASS::negbin", { + skip_if_not_installed("MASS") + skip_if_not_installed("DHARMa") + set.seed(3) + mu <- rpois(500, lambda = 3) + x <- rnorm(500, mu, mu * 3) + x <- ceiling(x) + x <- pmax(x, 0) + m <- MASS::glm.nb(x ~ mu) + out <- check_overdispersion(m) + expect_equal( + out, + structure( + list( + dispersion_ratio = 0.409521313173506, + p_value = 0 + ), + class = c("check_overdisp", "see_check_overdisp") + ), + ignore_attr = TRUE, + tolerance = 1e-4 + ) + expect_identical( + capture.output(print(out)), + c( + "# Overdispersion test", + "", + " dispersion ratio = 0.410", + " p-value = < 0.001", + "" + ) + ) + expect_message(capture.output(print(out)), "Underdispersion detected") + + # check that plot works + skip_if_not_installed("see") + expect_s3_class(plot(out), "ggplot") +}) + + +test_that("check_overdispersion, genpois", { + skip_if(getRversion() > "4.3.3") + skip_if_not_installed("glmmTMB") + skip_if_not_installed("DHARMa") + skip_if_not(getRversion() >= "4.0.0") + data(Salamanders, package = "glmmTMB") + + model <- glmmTMB::glmmTMB( + count ~ mined + spp + (1 | site), + family = glmmTMB::genpois(), + data = Salamanders + ) + expect_equal( + check_overdispersion(model), + structure( + list( + dispersion_ratio = 0.971975646955856, + p_value = 0.88 + ), + class = c("check_overdisp", "see_check_overdisp") + ), + tolerance = 1e-4, + ignore_attr = TRUE + ) +}) diff --git a/tests/testthat/test-check_predictions.R b/tests/testthat/test-check_predictions.R new file mode 100644 index 000000000..1f7774c8a --- /dev/null +++ b/tests/testthat/test-check_predictions.R @@ -0,0 +1,131 @@ +skip_on_os("mac") 
+skip_on_cran() + +test_that("check_predictions", { + data(mtcars) + model <- lm(mpg ~ disp, data = mtcars) + set.seed(99) + out <- check_predictions(model) + + expect_named( + out, + c( + "sim_1", "sim_2", "sim_3", "sim_4", "sim_5", "sim_6", "sim_7", + "sim_8", "sim_9", "sim_10", "sim_11", "sim_12", "sim_13", "sim_14", + "sim_15", "sim_16", "sim_17", "sim_18", "sim_19", "sim_20", "sim_21", + "sim_22", "sim_23", "sim_24", "sim_25", "sim_26", "sim_27", "sim_28", + "sim_29", "sim_30", "sim_31", "sim_32", "sim_33", "sim_34", "sim_35", + "sim_36", "sim_37", "sim_38", "sim_39", "sim_40", "sim_41", "sim_42", + "sim_43", "sim_44", "sim_45", "sim_46", "sim_47", "sim_48", "sim_49", + "sim_50", "y" + ) + ) + expect_equal( + out$sim_1, + c( + 23.70112, 24.56502, 25.43419, 20.40954, 13.58266, 20.72532, + 11.95366, 25.14559, 22.61286, 18.48403, 20.26737, 21.2291, 20.67149, + 10.07628, 0.25886, 10.64176, 10.18407, 20.68235, 28.10115, 27.55045, + 28.22301, 18.94021, 16.87727, 14.05421, 13.8378, 28.13797, 26.86451, + 23.90539, 10.68719, 28.17587, 21.65853, 26.07681 + ), + tolerance = 1e-4 + ) +}) + + +test_that("check_predictions, glmmTMB", { + skip_if(getRversion() > "4.3.3") + skip_if_not_installed("glmmTMB") + data(mtcars) + model <- glmmTMB::glmmTMB(vs ~ disp, data = mtcars, family = binomial()) + set.seed(99) + out <- check_predictions(model) + + expect_named( + out, + c( + "sim_1", "sim_2", "sim_3", "sim_4", "sim_5", "sim_6", "sim_7", + "sim_8", "sim_9", "sim_10", "sim_11", "sim_12", "sim_13", "sim_14", + "sim_15", "sim_16", "sim_17", "sim_18", "sim_19", "sim_20", "sim_21", + "sim_22", "sim_23", "sim_24", "sim_25", "sim_26", "sim_27", "sim_28", + "sim_29", "sim_30", "sim_31", "sim_32", "sim_33", "sim_34", "sim_35", + "sim_36", "sim_37", "sim_38", "sim_39", "sim_40", "sim_41", "sim_42", + "sim_43", "sim_44", "sim_45", "sim_46", "sim_47", "sim_48", "sim_49", + "sim_50", "y" + ) + ) + expect_equal( + out$sim_1, + c( + 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 
+ 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1 + ), + tolerance = 1e-4 + ) + expect_true(attributes(out)$model_info$is_bernoulli) + + model <- glmmTMB::glmmTMB(vs ~ disp + (1 | cyl), data = mtcars, family = binomial()) + set.seed(99) + out <- check_predictions(model) + + expect_equal( + out$sim_1, + c( + 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, + 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0 + ), + tolerance = 1e-4 + ) + expect_true(attributes(out)$model_info$is_bernoulli) +}) + + +test_that("check_predictions, glm, binomial", { + skip_if(packageVersion("insight") <= "0.19.6") + data(mtcars) + set.seed(1) + tot <- rep(10, 100) + suc <- rbinom(100, prob = 0.9, size = tot) + dat <- data.frame(tot, suc) + dat$prop <- suc / tot + + mod1 <- glm(cbind(suc, tot - suc) ~ 1, + family = binomial, + data = dat + ) + + mod2 <- glm(prop ~ 1, + family = binomial, + data = dat, + weights = tot + ) + + mod3 <- glm(cbind(suc, tot) ~ 1, + family = binomial, + data = dat + ) + + mod4 <- glm(am ~ 1, + family = binomial, + data = mtcars + ) + + set.seed(1) + out1 <- check_predictions(mod1) + set.seed(1) + out2 <- check_predictions(mod2) + set.seed(1) + out3 <- check_predictions(mod3) + set.seed(1) + out4 <- check_predictions(mod4) + + expect_equal(head(out1$sim_1), c(1, 0.9, 0.9, 0.8, 1, 0.8), tolerance = 1e-4) + expect_false(attributes(out1)$model_info$is_bernoulli) + expect_equal(head(out2$sim_1), c(1, 0.9, 0.9, 0.8, 1, 0.8), tolerance = 1e-4) + expect_false(attributes(out2)$model_info$is_bernoulli) + expect_equal(head(out3$sim_1), c(0.4, 0.42105, 0.47368, 0.61111, 0.4, 0.61111), tolerance = 1e-3) + expect_false(attributes(out3)$model_info$is_bernoulli) + expect_equal(head(out4$sim_1), c(0, 0, 0, 1, 0, 1), tolerance = 1e-4) + expect_true(attributes(out4)$model_info$is_bernoulli) +}) diff --git a/tests/testthat/test-check_residuals.R b/tests/testthat/test-check_residuals.R new file mode 100644 index 000000000..9be029c0c --- /dev/null +++ b/tests/testthat/test-check_residuals.R @@ -0,0 
+1,77 @@ +test_that("check_residuals and simulate_residuals", { + skip_on_cran() + skip_if_not_installed("DHARMa") + set.seed(123) + dat <- DHARMa::createData(sampleSize = 100, overdispersion = 0.5, family = poisson()) + m <- glm(observedResponse ~ Environment1, family = poisson(), data = dat) + res <- simulate_residuals(m) + expect_identical( + capture.output(print(res)), + c( + "Simulated residuals from a model of class `glm` based on 250", + " simulations. Use `check_residuals()` to check uniformity of residuals or", + " `residuals()` to extract simulated residuals. It is recommended to refer", + " to `?DHARMa::simulateResiudals` and `vignette(\"DHARMa\")` for more", + " information about different settings in particular situations or for", + " particular models." + ) + ) + # check raw residuals + expect_equal( + head(residuals(res)), + c(0.55349, 0.44012, 0.39826, 0.9825, 0.90753, 0.05809), + tolerance = 1e-4, + ignore_attr = TRUE + ) + expect_equal( + head(residuals(res, quantile_function = stats::qnorm)), + c(0.13448, -0.15068, -0.25785, 2.10826, 1.3257, -1.57097), + tolerance = 1e-4, + ignore_attr = TRUE + ) + # compare to DHARMa + res_d <- DHARMa::simulateResiduals(m, n = 250, plot = FALSE) + expect_equal( + head(residuals(res)), + head(residuals(res_d)), + tolerance = 1e-4, + ignore_attr = TRUE + ) + expect_equal( + head(residuals(res, quantile_function = stats::qnorm)), + head(residuals(res_d, quantileFunction = stats::qnorm)), + tolerance = 1e-4, + ignore_attr = TRUE + ) + # DHARMa args work in residuals.permormance_simres + expect_equal( + residuals(res, quantileFunction = stats::qnorm, outlierValues = c(-3, 3)), + residuals(res_d, quantileFunction = stats::qnorm, outlierValues = c(-3, 3)), + tolerance = 1e-4, + ignore_attr = TRUE + ) + # outlier_values works + expect_identical(sum(is.infinite(residuals(res, quantile_function = stats::qnorm))), 3L) + expect_identical(sum(is.infinite(residuals(res, quantile_function = stats::qnorm, outlier_values = 
c(-100, 100)))), 0L) # nolint + expect_error(residuals(res, quantile_function = stats::qnorm, outlier_values = 1:3), regex = "`outlier_values` must be") # nolint + + # check_residuals + out <- check_residuals(res) + expect_equal(out, 0.01884602, ignore_attr = TRUE, tolerance = 1e-4) + expect_identical( + capture.output(print(out)), + "Warning: Non-uniformity of simulated residuals detected (p = 0.019)." + ) + expect_error(simulate_residuals(m, iterations = 1), "`iterations` must be") + + skip_if_not_installed("MASS") + set.seed(3) + mu <- rpois(500, lambda = 3) + x <- rnorm(500, mu, mu * 3) + x <- ceiling(x) + x <- pmax(x, 0) + quine.nb1 <- MASS::glm.nb(x ~ mu) + set.seed(123) + result <- check_residuals(quine.nb1) + expect_equal(result, 0.000665414, tolerance = 1e-3, ignore_attr = TRUE) +}) diff --git a/tests/testthat/test-check_singularity.R b/tests/testthat/test-check_singularity.R index 3edf6e8f6..dc0d56964 100644 --- a/tests/testthat/test-check_singularity.R +++ b/tests/testthat/test-check_singularity.R @@ -1,14 +1,14 @@ -test_that("check_singularity", { +test_that("check_singularity, lme4", { skip_on_cran() skip_if_not_installed("lme4") data(sleepstudy, package = "lme4") set.seed(123) - sleepstudy$mygrp <- sample(1:5, size = 180, replace = TRUE) + sleepstudy$mygrp <- sample.int(5, size = 180, replace = TRUE) sleepstudy$mysubgrp <- NA for (i in 1:5) { filter_group <- sleepstudy$mygrp == i sleepstudy$mysubgrp[filter_group] <- - sample(1:30, size = sum(filter_group), replace = TRUE) + sample.int(30, size = sum(filter_group), replace = TRUE) } model <- suppressMessages(lme4::lmer( @@ -17,3 +17,23 @@ test_that("check_singularity", { )) expect_true(check_singularity(model)) }) + + +test_that("check_singularity", { + skip_on_cran() + skip_if_not_installed("glmmTMB") + set.seed(101) + dd <- expand.grid(x = factor(1:6), f = factor(1:20), rep = 1:5) + dd$y <- glmmTMB::simulate_new(~ 1 + (x | f), + newdata = dd, + newparam = list( + beta = 0, + theta = rep(0, 21), + 
betad = 0 + ) + )[[1]] + expect_warning(expect_warning({ + m2 <- glmmTMB::glmmTMB(y ~ 1 + (x | f), data = dd, REML = FALSE) + })) + expect_true(check_singularity(m2)) +}) diff --git a/tests/testthat/test-check_zeroinflation.R b/tests/testthat/test-check_zeroinflation.R index d2e60f065..a7ff7cbbd 100644 --- a/tests/testthat/test-check_zeroinflation.R +++ b/tests/testthat/test-check_zeroinflation.R @@ -1,4 +1,5 @@ test_that("check_zeroinflation", { + skip_if(getRversion() > "4.3.3") skip_if_not_installed("glmmTMB") set.seed(123) data(Salamanders, package = "glmmTMB") @@ -19,7 +20,59 @@ test_that("check_zeroinflation", { ) }) + +test_that("check_zeroinflation, glmmTMB with and without zero-inflation component", { + skip_if(getRversion() > "4.3.3") + skip_if_not_installed("glmmTMB") + skip_if_not_installed("DHARMa") + set.seed(123) + data(Salamanders, package = "glmmTMB") + + # no zero-inflation model + m <- glmmTMB::glmmTMB(count ~ spp + mined, family = poisson, data = Salamanders) + + expect_equal( + check_zeroinflation(m), + structure( + list( + predicted.zeros = 298, + observed.zeros = 387L, + ratio = 0.770025839793282, + tolerance = 0.05 + ), + class = "check_zi" + ), + tolerance = 1e-3 + ) + + # zero-inflation model + m <- glmmTMB::glmmTMB( + count ~ spp + mined, + ziformula = ~ spp + mined, + family = poisson, + data = Salamanders + ) + + set.seed(123) + expect_equal( + check_zeroinflation(m), + structure( + list( + predicted.zeros = 387, + observed.zeros = 387L, + ratio = 1.00093023255814, + tolerance = 0.1, + p.value = 1 + ), + class = "check_zi" + ), + tolerance = 1e-3 + ) +}) + + test_that("check_zeroinflation, glmer.nb", { + skip_on_cran() skip_if_not_installed("glmmTMB") skip_if_not_installed("lme4") set.seed(101) @@ -34,19 +87,109 @@ test_that("check_zeroinflation, glmer.nb", { mu <- 5 * (-4 + with(dd, as.integer(f1) + 4 * as.numeric(f2))) dd$y <- rnbinom(nrow(dd), mu = mu, size = 0.5) dat2 <<- dd - suppressMessages( + suppressMessages({ m <- 
lme4::glmer.nb(y ~ f1 * f2 + (1 | g), data = dat2, verbose = FALSE) - ) + }) expect_equal( check_zeroinflation(m), structure( list( - predicted.zeros = 153, observed.zeros = 155L, - ratio = 0.987096774193548, tolerance = 0.05 + predicted.zeros = 153, + observed.zeros = 155L, + ratio = 0.987329032258065, + tolerance = 0.1, + p.value = 0.944 + ), + class = "check_zi" + ), + tolerance = 1e-3 + ) +}) + + +test_that("check_zeroinflation, glmmTMB nbinom", { + skip_if(getRversion() > "4.3.3") + skip_if_not_installed("glmmTMB") + skip_if_not_installed("DHARMa") + skip_on_cran() + + set.seed(1234) + dat <- DHARMa::createData(sampleSize = 1000) + fit <- suppressWarnings(glmmTMB::glmmTMB( + observedResponse ~ Environment1 + (1 | group), + data = dat, + family = glmmTMB::nbinom1() + )) + expect_equal( + check_zeroinflation(fit), + structure( + list( + predicted.zeros = 462, + observed.zeros = 482L, + ratio = 0.95850622406639, + tolerance = 0.1, + p.value = 0.776 ), class = "check_zi" ), tolerance = 1e-3 ) }) + + +test_that("check_zeroinflation, MASS::negbin", { + skip_if_not_installed("MASS") + skip_if_not_installed("DHARMa") + set.seed(3) + mu <- rpois(500, lambda = 3) + x <- rnorm(500, mu, mu * 3) + x <- ceiling(x) + x <- pmax(x, 0) + m <- MASS::glm.nb(x ~ mu) + expect_equal( + check_zeroinflation(m), + structure( + list( + predicted.zeros = 178, + observed.zeros = 202L, + ratio = 0.879643564356436, + tolerance = 0.1, + p.value = 0.008 + ), + class = "check_zi" + ), + ignore_attr = TRUE, + tolerance = 1e-4 + ) +}) + + +test_that("check_zeroinflation, genpois", { + skip_if(getRversion() > "4.3.3") + skip_if_not_installed("glmmTMB") + skip_if_not_installed("DHARMa") + skip_if_not(getRversion() >= "4.0.0") + data(Salamanders, package = "glmmTMB") + + model <- glmmTMB::glmmTMB( + count ~ mined + spp + (1 | site), + family = glmmTMB::genpois(), + data = Salamanders + ) + expect_equal( + check_zeroinflation(model), + structure( + list( + predicted.zeros = 386, + observed.zeros = 
387L, + ratio = 0.997860465116279, + tolerance = 0.1, + p.value = 1 + ), + class = "check_zi" + ), + tolerance = 1e-4, + ignore_attr = TRUE + ) +}) diff --git a/tests/testthat/test-checks.R b/tests/testthat/test-checks.R index 46fd77dc5..b6d59af3b 100644 --- a/tests/testthat/test-checks.R +++ b/tests/testthat/test-checks.R @@ -1,12 +1,36 @@ test_that("check_factorstructure", { skip_if_not_installed("parameters") x <- check_factorstructure(mtcars) - expect_equal(x$KMO$MSA, 0.826, tolerance = 0.01) - expect_equal(x$sphericity$chisq, 408.011, tolerance = 0.01) + expect_equal(x$KMO$MSA, 0.8265536, tolerance = 0.01) + expect_equal(x$sphericity$chisq, 408.0116, tolerance = 0.01) }) -test_that("check_clusterstructure", { +test_that("check_clusterstructure, ok", { skip_if_not_installed("parameters") set.seed(333) - expect_equal(check_clusterstructure(iris[, 1:4])$H, 0.187, tolerance = 0.01) + out <- check_clusterstructure(iris[, 1:4]) + expect_equal(out$H, 0.1869618, tolerance = 0.01) + expect_identical( + capture.output(print(out)), + c( + "# Clustering tendency", + "", + "The dataset is suitable for clustering (Hopkins' H = 0.19)." + ) + ) +}) + +test_that("check_clusterstructure, bad", { + skip_if_not_installed("parameters") + set.seed(13) + out <- check_clusterstructure(mtcars[, 10:11]) + expect_equal(out$H, 0.5142575, tolerance = 0.01) + expect_identical( + capture.output(print(out)), + c( + "# Clustering tendency", + "", + "The dataset is not suitable for clustering (Hopkins' H = 0.51)." 
+ ) + ) }) diff --git a/tests/testthat/test-cronbachs_alpha.R b/tests/testthat/test-cronbachs_alpha.R index a2670e979..ecf244745 100644 --- a/tests/testthat/test-cronbachs_alpha.R +++ b/tests/testthat/test-cronbachs_alpha.R @@ -3,16 +3,16 @@ test_that("cronbachs_alpha, data frame", { x <- mtcars[, c("cyl", "gear", "carb", "hp")] expect_equal(cronbachs_alpha(x), 0.09463206, tolerance = 1e-3) }) + test_that("cronbachs_alpha", { - expect_warning(expect_null(cronbachs_alpha(mtcars[1]))) + expect_message(expect_null(cronbachs_alpha(mtcars[1])), regex = "Too few") }) - test_that("cronbachs_alpha, principal_components", { - skip_if_not_installed("parameters", minimum_version = "0.21.2.1") + skip_if_not_installed("parameters", minimum_version = "0.21.3") pca <- parameters::principal_components(mtcars[, c("cyl", "gear", "carb", "hp")], n = 2) expect_equal(cronbachs_alpha(pca, verbose = FALSE), c(PC1 = 0.1101384), tolerance = 1e-3) - expect_warning(cronbachs_alpha(pca)) + expect_message(cronbachs_alpha(pca), regex = "Too few") pca <- parameters::principal_components(mtcars[, c("cyl", "gear", "carb", "hp")], n = 1) expect_equal(cronbachs_alpha(pca, verbose = FALSE), c(PC1 = 0.09463206), tolerance = 1e-3) @@ -25,7 +25,6 @@ test_that("cronbachs_alpha, principal_components", { expect_equal(cronbachs_alpha(pca), c(PC1 = 0.4396, PC2 = -1.44331), tolerance = 1e-3) }) - test_that("cronbachs_alpha, matrix", { m <- as.matrix(mtcars[c("cyl", "gear", "carb", "hp")]) expect_equal(cronbachs_alpha(m), 0.09463206, tolerance = 1e-3) diff --git a/tests/testthat/test-glmmPQL.R b/tests/testthat/test-glmmPQL.R new file mode 100644 index 000000000..6c1713cc8 --- /dev/null +++ b/tests/testthat/test-glmmPQL.R @@ -0,0 +1,15 @@ +skip_if_not_installed("MASS") +test_that("r2", { + example_dat <- data.frame( + prop = c(0.2, 0.2, 0.5, 0.7, 0.1, 1, 1, 1, 0.1), + size = c("small", "small", "small", "large", "large", "large", "large", "small", "small"), + x = c(0.1, 0.1, 0.8, 0.7, 0.6, 0.5, 0.5, 0.1, 0.1), 
+ species = c("sp1", "sp1", "sp2", "sp2", "sp3", "sp3", "sp4", "sp4", "sp4"), + stringsAsFactors = FALSE + ) + mn <- MASS::glmmPQL(prop ~ x + size, + random = ~ 1 | species, + family = "quasibinomial", data = example_dat + ) + expect_message(performance_score(mn), regex = "Can't calculate") +}) diff --git a/tests/testthat/test-helpers.R b/tests/testthat/test-helpers.R new file mode 100644 index 000000000..d1d6a5545 --- /dev/null +++ b/tests/testthat/test-helpers.R @@ -0,0 +1,14 @@ +skip_on_cran() +skip_if_not_installed("withr") +withr::with_options( + list(easystats_erros = TRUE), + test_that(".safe works with options", { + expect_error(performance:::.safe(mean(fd)), regex = "object 'fd' not found") + expect_identical(performance:::.safe(mean(fd), 1L), 1L) + expect_identical(performance:::.safe(mean(c(1, 2, 3))), 2) + }) +) +test_that(".safe works", { + expect_null(performance:::.safe(mean(fd))) + expect_identical(performance:::.safe(mean(c(1, 2, 3))), 2) +}) diff --git a/tests/testthat/test-icc.R b/tests/testthat/test-icc.R index 7b467ac51..68624ff09 100644 --- a/tests/testthat/test-icc.R +++ b/tests/testthat/test-icc.R @@ -38,6 +38,7 @@ test_that("icc, CI", { test_that("icc", { skip_on_cran() + skip_if_not_installed("curl") skip_if_offline() skip_if_not_installed("httr") m2 <- insight::download_model("stanreg_lmerMod_1") @@ -54,6 +55,7 @@ test_that("icc", { test_that("icc", { skip_on_cran() + skip_if_not_installed("curl") skip_if_offline() skip_if_not_installed("httr") m3 <- insight::download_model("brms_mixed_1") @@ -67,6 +69,7 @@ test_that("icc", { test_that("icc", { skip_on_cran() + skip_if_not_installed("curl") skip_if_offline() skip_if_not_installed("httr") m3 <- insight::download_model("brms_mixed_1") @@ -87,12 +90,12 @@ test_that("icc", { skip_if_not_installed("lme4") data(sleepstudy, package = "lme4") set.seed(12345) - sleepstudy$grp <- sample(1:5, size = 180, replace = TRUE) + sleepstudy$grp <- sample.int(5, size = 180, replace = TRUE) sleepstudy$subgrp 
<- NA for (i in 1:5) { filter_group <- sleepstudy$grp == i sleepstudy$subgrp[filter_group] <- - sample(1:30, size = sum(filter_group), replace = TRUE) + sample.int(30, size = sum(filter_group), replace = TRUE) } model <- lme4::lmer( Reaction ~ Days + (1 | grp) + (1 | Subject), @@ -122,3 +125,50 @@ test_that("icc", { expect_equal(out$ICC_adjusted, 0.9104331, tolerance = 0.01) expect_equal(out$ICC_unadjusted, 0.3109478, tolerance = 0.01) }) + + +test_that("icc, glmmTMB 1.1.9+", { + skip_on_cran() + skip_if_not_installed("glmmTMB", minimum_version = "1.1.9") + set.seed(101) + dd <- data.frame( + z = rnorm(1000), + x1 = 1:1000, + x2 = runif(1000, 0, 10), + re = rep(1:20, each = 50) + ) + dd <- transform(dd, x3 = as.factor(ifelse( + x1 <= 500, "Low", sample(c("Middle", "High"), 1000, replace = TRUE) + ))) + dd <- transform(dd, x4 = as.factor(ifelse( + x1 > 500, "High", sample(c("Absent", "Low"), 1000, replace = TRUE) + ))) + dd <- transform(dd, z = z + re * 5) + expect_message({ + mod_TMB <- glmmTMB::glmmTMB( + z ~ x1 + x2 + x3 + x4 + (1 | re), + data = dd, + start = list(theta = 3), + control = glmmTMB::glmmTMBControl(rank_check = "adjust") + ) + }) + expect_equal( + icc(mod_TMB), + data.frame( + ICC_adjusted = 0.995480998331767, + ICC_conditional = 0.244468078371849, + ICC_unadjusted = 0.244468078371849 + ), + ignore_attr = TRUE, + tolerance = 1e-4 + ) + expect_equal( + r2(mod_TMB), + list( + R2_conditional = c(`Conditional R2` = 0.998890233308478), + R2_marginal = c(`Marginal R2` = 0.754422154936629) + ), + ignore_attr = TRUE, + tolerance = 1e-4 + ) +}) diff --git a/tests/testthat/test-logLik.R b/tests/testthat/test-logLik.R index 2691f1d3c..5da5ef923 100644 --- a/tests/testthat/test-logLik.R +++ b/tests/testthat/test-logLik.R @@ -1,3 +1,5 @@ +skip_on_cran() + test_that("logLik", { skip_if_not_installed("plm") skip_if_not_installed("withr") diff --git a/tests/testthat/test-model_performance.bayesian.R b/tests/testthat/test-model_performance.bayesian.R index 
2f933e76c..a15e41a10 100644 --- a/tests/testthat/test-model_performance.bayesian.R +++ b/tests/testthat/test-model_performance.bayesian.R @@ -1,5 +1,6 @@ test_that("model_performance.stanreg", { skip_on_cran() + skip_if_not_installed("curl") skip_if_offline() skip_if_not_installed("httr") set.seed(333) @@ -31,6 +32,7 @@ test_that("model_performance.stanreg", { test_that("model_performance.brmsfit", { skip_on_cran() + skip_if_not_installed("curl") skip_if_offline() skip_if_not_installed("httr") set.seed(333) @@ -110,7 +112,7 @@ test_that("model_performance.BFBayesFactor", { }) expect_null(p) - + skip_on_os("linux") mod <- BayesFactor::regressionBF(mpg ~ cyl, mtcars, progress = FALSE) modF <- lm(mpg ~ cyl, mtcars) p <- model_performance(mod) diff --git a/tests/testthat/test-model_performance.merMod.R b/tests/testthat/test-model_performance.merMod.R index 12ed44ede..0c70da3c3 100644 --- a/tests/testthat/test-model_performance.merMod.R +++ b/tests/testthat/test-model_performance.merMod.R @@ -1,5 +1,6 @@ test_that("model_performance.merMod", { skip_on_cran() + skip_if_not_installed("curl") skip_if_offline() skip_if_not_installed("httr") diff --git a/tests/testthat/test-performance_aic.R b/tests/testthat/test-performance_aic.R index ee255d0da..b9db63087 100644 --- a/tests/testthat/test-performance_aic.R +++ b/tests/testthat/test-performance_aic.R @@ -27,4 +27,7 @@ test_that("performance_aic lme4 default", { m1 <- lme4::lmer(Sepal.Length ~ Petal.Length + (1 | Species), data = iris) expect_equal(performance_aic(m1), AIC(m1), tolerance = 1e-2) expect_equal(performance_aic(m1, estimator = "ML"), 125.0043, tolerance = 1e-2) + m2 <- lme4::lmer(Sepal.Length ~ Petal.Length + (1 | Species), data = iris, REML = FALSE) + expect_equal(performance_aic(m2, estimator = "REML"), 128.0054, tolerance = 1e-2) + expect_message(performance_aic(m2), regex = "was not fitted") }) diff --git a/tests/testthat/test-pkg-fixest.R b/tests/testthat/test-pkg-fixest.R index dd851f360..8a187e33b 100644 
--- a/tests/testthat/test-pkg-fixest.R +++ b/tests/testthat/test-pkg-fixest.R @@ -1,21 +1,23 @@ -base <- iris -names(base) <- c("y1", "y2", "x1", "x2", "species") +base_iris <- iris +names(base_iris) <- c("y1", "y2", "x1", "x2", "species") test_that("fixest: r2", { skip_if_not_installed("fixest") - res <- fixest::feols(y1 ~ x1 + x2 + x2^2 | species, base) + fixest::setFixest_nthreads(1) + res <- fixest::feols(y1 ~ x1 + x2 + x2^2 | species, base_iris) r2_res <- performance::r2(res) - expect_equal(r2_res$R2, fixest::fitstat(res, "r2")[[1]]) - expect_equal(r2_res$R2_adjusted, fixest::fitstat(res, "ar2")[[1]]) - expect_equal(r2_res$R2_within, fixest::fitstat(res, "wr2")[[1]]) - expect_equal(r2_res$R2_within_adjusted, fixest::fitstat(res, "war2")[[1]]) + expect_equal(r2_res$R2, fixest::fitstat(res, "r2")[[1]], ignore_attr = TRUE) + expect_equal(r2_res$R2_adjusted, fixest::fitstat(res, "ar2")[[1]], ignore_attr = TRUE) + expect_equal(r2_res$R2_within, fixest::fitstat(res, "wr2")[[1]], ignore_attr = TRUE) + expect_equal(r2_res$R2_within_adjusted, fixest::fitstat(res, "war2")[[1]], ignore_attr = TRUE) }) test_that("fixest: overdispersion", { skip_if_not_installed("fixest") - res <- fixest::feols(y1 ~ x1 + x2 + x2^2 | species, base) + fixest::setFixest_nthreads(1) + res <- fixest::feols(y1 ~ x1 + x2 + x2^2 | species, base_iris) expect_error( check_overdispersion(res), "can only be used for models from Poisson" @@ -24,14 +26,16 @@ test_that("fixest: overdispersion", { test_that("fixest: outliers", { skip_if_not_installed("fixest") - res <- fixest::feols(y1 ~ x1 + x2 + x2^2 | species, base) + fixest::setFixest_nthreads(1) + res <- fixest::feols(y1 ~ x1 + x2 + x2^2 | species, base_iris) outliers_list <- suppressMessages(check_outliers(res)) expect_identical(attr(outliers_list, "outlier_count"), list()) }) test_that("fixest: model_performance", { skip_if_not_installed("fixest") - res <- fixest::feols(y1 ~ x1 + x2 + x2^2 | species, base) + fixest::setFixest_nthreads(1) + res <- 
fixest::feols(y1 ~ x1 + x2 + x2^2 | species, base_iris) perf <- model_performance(res) expect_equal(perf$AIC, 107.743, tolerance = 1e-3) expect_equal(perf$BIC, 125.807, tolerance = 1e-3) @@ -47,7 +51,8 @@ test_that("fixest: model_performance", { test_that("fixest_multi: r2", { skip_if_not_installed("fixest") - res <- fixest::feols(c(y1, y2) ~ x1 + csw(x2, x2^2) | species, base) + fixest::setFixest_nthreads(1) + res <- fixest::feols(c(y1, y2) ~ x1 + csw(x2, x2^2) | species, base_iris) r2_res <- performance::r2(res) expect_equal(unname(r2_res[[1]]$R2), 0.837, tolerance = 1e-3) @@ -55,7 +60,8 @@ test_that("fixest_multi: r2", { test_that("fixest_multi: overdispersion", { skip_if_not_installed("fixest") - res <- fixest::feols(c(y1, y2) ~ x1 + csw(x2, x2^2) | species, base) + fixest::setFixest_nthreads(1) + res <- fixest::feols(c(y1, y2) ~ x1 + csw(x2, x2^2) | species, base_iris) expect_error( check_overdispersion(res), "can only be used for models from Poisson" @@ -64,15 +70,17 @@ test_that("fixest_multi: overdispersion", { test_that("fixest_multi: outliers", { skip_if_not_installed("fixest") - res <- fixest::feols(c(y1, y2) ~ x1 + csw(x2, x2^2) | species, base) + fixest::setFixest_nthreads(1) + res <- fixest::feols(c(y1, y2) ~ x1 + csw(x2, x2^2) | species, base_iris) outliers_list <- suppressMessages(check_outliers(res)[[1]]) expect_identical(attr(outliers_list, "outlier_count"), list()) }) test_that("fixest_multi: model_performance", { skip_if_not_installed("fixest") - res <- fixest::feols(c(y1, y2) ~ x1 + csw(x2, x2^2) | species, base) - res2 <- fixest::feols(y1 ~ x1 + x2 + x2^2 | species, base) + fixest::setFixest_nthreads(1) + res <- fixest::feols(c(y1, y2) ~ x1 + csw(x2, x2^2) | species, base_iris) + res2 <- fixest::feols(y1 ~ x1 + x2 + x2^2 | species, base_iris) perf <- model_performance(res) perf2 <- model_performance(res2) expect_identical(perf[[2]], perf2) diff --git a/tests/testthat/test-r2.R b/tests/testthat/test-r2.R index e69198929..f521b3aab 100644 --- 
a/tests/testthat/test-r2.R +++ b/tests/testthat/test-r2.R @@ -39,3 +39,76 @@ test_that("r2 glm, ci", { tolerance = 1e-3 ) }) + +# glmmTMB, non-mixed -------------------------------------------------------- + +skip_if_not_installed("withr") +withr::with_environment( + new.env(), + test_that("r2 glmmTMB, no ranef", { + skip_if(getRversion() > "4.3.3") + skip_if_not_installed("glmmTMB") + data(Owls, package = "glmmTMB") + # linear --------------------------------------------------------------- + m <- glmmTMB::glmmTMB(NegPerChick ~ BroodSize + ArrivalTime, data = Owls) + out <- r2(m) + expect_equal(out$R2, 0.05597288, tolerance = 1e-3, ignore_attr = TRUE) + # validate against lm + m2 <- lm(NegPerChick ~ BroodSize + ArrivalTime, data = Owls) + out2 <- r2(m2) + expect_equal(out$R2, out2$R2, tolerance = 1e-3, ignore_attr = TRUE) + # binomial ------------------------------------------------------------- + data(mtcars) + m <- glmmTMB::glmmTMB(am ~ mpg, data = mtcars, family = binomial()) + out <- r2(m) + expect_equal(out[[1]], 0.3677326, tolerance = 1e-3, ignore_attr = TRUE) + # validate against glm + m2 <- glm(am ~ mpg, data = mtcars, family = binomial()) + out2 <- r2(m2) + expect_equal(out[[1]], out2[[1]], tolerance = 1e-3, ignore_attr = TRUE) + # poisson -------------------------------------------------------------- + d <<- data.frame( + counts = c(18, 17, 15, 20, 10, 20, 25, 13, 12), + outcome = gl(3, 1, 9), + treatment = gl(3, 3) + ) + m <- glmmTMB::glmmTMB(counts ~ outcome + treatment, family = poisson(), data = d) + out <- r2(m) + expect_equal(out[[1]], 0.6571698, tolerance = 1e-3, ignore_attr = TRUE) + # validate against glm + m2 <- glm(counts ~ outcome + treatment, family = poisson(), data = d) + out2 <- r2(m2) + expect_equal(out[[1]], out2[[1]], tolerance = 1e-3, ignore_attr = TRUE) + # zero-inflated -------------------------------------------------------------- + skip_if_not_installed("pscl") + data(bioChemists, package = "pscl") + m <- glmmTMB::glmmTMB( + art ~ 
fem + mar + kid5 + ment, + ziformula = ~ kid5 + phd, + family = poisson(), + data = bioChemists + ) + out <- r2(m) + expect_equal(out[[1]], 0.1797549, tolerance = 1e-3, ignore_attr = TRUE) + # validate against pscl::zeroinfl + m2 <- pscl::zeroinfl( + art ~ fem + mar + kid5 + ment | kid5 + phd, + data = bioChemists + ) + out2 <- r2(m2) + expect_equal(out[[1]], out2[[1]], tolerance = 1e-3, ignore_attr = TRUE) + # Gamma -------------------------------------------------------------- + clotting <<- data.frame( + u = c(5, 10, 15, 20, 30, 40, 60, 80, 100), + lot1 = c(118, 58, 42, 35, 27, 25, 21, 19, 18), + lot2 = c(69, 35, 26, 21, 18, 16, 13, 12, 12) + ) + m <- suppressWarnings(glmmTMB::glmmTMB(lot1 ~ log(u), data = clotting, family = Gamma())) + out <- r2(m) + expect_equal(out[[1]], 0.996103, tolerance = 1e-3, ignore_attr = TRUE) + # validate against glm + m2 <- glm(lot1 ~ log(u), data = clotting, family = Gamma()) + out2 <- r2(m2) + expect_equal(out[[1]], out2[[1]], tolerance = 1e-3, ignore_attr = TRUE) + }) +) diff --git a/tests/testthat/test-r2_bayes.R b/tests/testthat/test-r2_bayes.R index 7c176b8e3..00e3a54e2 100644 --- a/tests/testthat/test-r2_bayes.R +++ b/tests/testthat/test-r2_bayes.R @@ -1,3 +1,5 @@ +skip_on_os("linux") + test_that("r2_BayesFactor", { skip_if_not_installed("BayesFactor") set.seed(1) diff --git a/tests/testthat/test-r2_kullback.R b/tests/testthat/test-r2_kullback.R index 591295cef..8ce9620b9 100644 --- a/tests/testthat/test-r2_kullback.R +++ b/tests/testthat/test-r2_kullback.R @@ -3,3 +3,10 @@ test_that("r2_kullback", { expect_equal(r2_kullback(model), c(`Kullback-Leibler R2` = 0.3834), tolerance = 1e-3) expect_equal(r2_kullback(model, adjust = FALSE), c(`Kullback-Leibler R2` = 0.4232), tolerance = 1e-3) }) + +test_that("r2_kullback errors for non-supported", { + skip_if_not_installed("pscl") + data("bioChemists", package = "pscl") + model <- pscl::zeroinfl(art ~ . 
| 1, data = bioChemists, dist = "negbin") + expect_error(r2_kullback(model), regex = "This function only works") +}) diff --git a/tests/testthat/test-r2_nagelkerke.R b/tests/testthat/test-r2_nagelkerke.R index 488a46e95..7fa42c8d9 100644 --- a/tests/testthat/test-r2_nagelkerke.R +++ b/tests/testthat/test-r2_nagelkerke.R @@ -4,6 +4,8 @@ test_that("r2_nagelkerke", { expect_equal(r2(model), list(R2_Tjur = c(`Tjur's R2` = 0.477692621360749)), tolerance = 1e-3, ignore_attr = TRUE) }) +skip_if_not_installed("withr") + test_that("r2_nagelkerke", { skip_if_not_installed("MASS") withr::with_options( diff --git a/vignettes/check_model.Rmd b/vignettes/check_model.Rmd index 7e9ff0506..1bd8e6eee 100644 --- a/vignettes/check_model.Rmd +++ b/vignettes/check_model.Rmd @@ -3,8 +3,6 @@ title: "Checking model assumption - linear models" output: rmarkdown::html_vignette: toc: true - fig_width: 10.08 - fig_height: 6 tags: [r, performance, r2] vignette: > \usepackage[utf8]{inputenc} @@ -19,11 +17,15 @@ library(knitr) library(performance) options(knitr.kable.NA = "") knitr::opts_chunk$set( - comment = ">", + dpi = 300, + fig.width = 7, + fig.height = 6, + out.width = "80%", + out.height = "80%", + comment = "#>", + collapse = TRUE, message = FALSE, - warning = FALSE, - out.width = "100%", - dpi = 450 + warning = FALSE ) options(digits = 2) @@ -57,7 +59,7 @@ Most plots seen here can also be generated by their dedicated functions, e.g.: - Binned residuals: `binned_residuals()` - Check for overdispersion: `check_overdispersion()` -## Linear models: Are all assumptions for linear models met? +# Linear models: Are all assumptions for linear models met? We start with a simple example for a linear model. @@ -75,7 +77,7 @@ model_parameters(m1) There is nothing suspicious so far. Now let's start with model diagnostics. We use the `check_model()` function, which provides an overview with the most important and appropriate diagnostic plots for the model under investigation. 
-```{r eval=all(successfully_loaded[c("see", "ggplot2")]), fig.height=11} +```{r eval=all(successfully_loaded[c("see", "ggplot2")]), fig.height=12, fig.width=10, out.width="100%", out.height="100%"} library(performance) check_model(m1) ``` @@ -87,9 +89,9 @@ Now let's take a closer look for each plot. To do so, we ask `check_model()` to diagnostic_plots <- plot(check_model(m1, panel = FALSE)) ``` -### Posterior predictive checks +## Posterior predictive checks -The first plot is based on `check_predictions()`. Posterior predictive checks can be used to look for systematic discrepancies between real and simulated data. It helps to see whether the type of model (distributional family) fits well to the data (_Gelman and Hill, 2007, p. 158_). Posterior predictive checks can be used to "look for systematic discrepancies between real and simulated data" (_Gelman et al. 2014, p. 169_). +The first plot is based on `check_predictions()`. Posterior predictive checks can be used to "look for systematic discrepancies between real and simulated data" (_Gelman et al. 2014, p. 169_). It helps to see whether the type of model (distributional family) fits well to the data (_Gelman and Hill, 2007, p. 158_). ```{r eval=all(successfully_loaded[c("see", "ggplot2")])} # posterior predicive checks @@ -113,11 +115,11 @@ plot(out) As you can see, the green line in this plot deviates visibly from the blue lines. This may indicate that our linear model is not appropriate, since it does not capture the distributional nature of the response variable properly. -#### How to fix this? +### How to fix this? The best way, if there are serious concerns that the model does not fit well to the data, is to use a different type (family) of regression models. In our example, it is obvious that we should better use a Poisson regression. 
-#### Plots for discrete outcomes +### Plots for discrete outcomes For discrete or integer outcomes (like in logistic or Poisson regression), density plots are not always the best choice, as they look somewhat "wiggly" around the actual values of the dependent variables. In this case, use the `type` argument of the `plot()` method to change the plot-style. Available options are `type = "discrete_dots"` (dots for observed and replicated outcomes), `type = "discrete_interval"` (dots for observed, error bars for replicated outcomes) or `type = "discrete_both"` (both dots and error bars). @@ -134,7 +136,7 @@ out <- check_predictions(m3) plot(out, type = "discrete_both") ``` -### Linearity +## Linearity This plot helps to check the assumption of linear relationship. It shows whether predictors may have a non-linear relationship with the outcome, in which case the reference line may roughly indicate that relationship. A straight and horizontal line indicates that the model specification seems to be ok. @@ -160,7 +162,7 @@ out <- plot(check_model(m, panel = FALSE)) out[[2]] ``` -#### How to fix this? +### How to fix this? If the green reference line is not roughly flat and horizontal, but rather - like in our example - U-shaped, this may indicate that some of the predictors probably should better be modeled as quadratic term. Transforming the response variable might be another solution when linearity assumptions are not met. @@ -175,7 +177,7 @@ out[[2]] **Some caution is needed** when interpreting these plots. Although these plots are helpful to check model assumptions, they do not necessarily indicate so-called "lack of fit", e.g. missed non-linear relationships or interactions. Thus, it is always recommended to also look at [effect plots, including partial residuals](https://strengejacke.github.io/ggeffects/articles/introduction_partial_residuals.html). 
-### Homogeneity of variance - detecting heteroscedasticity +## Homogeneity of variance - detecting heteroscedasticity This plot helps to check the assumption of equal (or constant) variance, i.e. homoscedasticity. To meet this assumption, the variance of the residuals across different values of predictors is similar and does not notably increase or decrease. Hence, the desired pattern would be that dots spread equally above and below a roughly straight, horizontal line and show no apparent deviation. @@ -202,7 +204,7 @@ But why does the diagnostic plot used in `check_model()` look different? `check_ diagnostic_plots[[3]] ``` -#### How to fix this? +### How to fix this? There are several ways to address heteroscedasticity. @@ -212,7 +214,7 @@ There are several ways to address heteroscedasticity. 3. Transforming the response variable, for instance, taking the `log()`, may also help to avoid issues with heteroscedasticity. -### Influential observations - outliers +## Influential observations - outliers Outliers can be defined as particularly influential observations, and this plot helps detecting those outliers. Cook's distance (_Cook 1977_, _Cook & Weisberg 1982_) is used to define outliers, i.e. any point in this plot that falls outside of Cook's distance (the dashed lines) is considered an influential observation. @@ -223,11 +225,11 @@ diagnostic_plots[[4]] In our example, everything looks well. -#### How to fix this? +### How to fix this? Dealing with outliers is not straightforward, as it is not recommended to automatically discard any observation that has been marked as "an outlier". Rather, your _domain knowledge_ must be involved in the decision whether to keep or omit influential observation. A helpful heuristic is to distinguish between error outliers, interesting outliers, and random outliers (_Leys et al. 2019_). _Error outliers_ are likely due to human error and should be corrected before data analysis. 
_Interesting outliers_ are not due to technical error and may be of theoretical interest; it might thus be relevant to investigate them further even though they should be removed from the current analysis of interest. _Random outliers_ are assumed to be due to chance alone and to belong to the correct distribution and, therefore, should be retained. -### Multicollinearity +## Multicollinearity This plot checks for potential collinearity among predictors. In a nutshell multicollinearity means that once you know the effect of one predictor, the value of knowing the other predictor is rather low. Multicollinearity might arise when a third, unobserved variable has a causal effect on each of the two predictors that are associated with the outcome. In such cases, the actual relationship that matters would be the association between the unobserved variable and the outcome. @@ -244,11 +246,11 @@ The variance inflation factor (VIF) indicates the magnitude of multicollinearity Our model clearly suffers from multicollinearity, as all predictors have high VIF values. -#### How to fix this? +### How to fix this? Usually, predictors with (very) high VIF values should be removed from the model to fix multicollinearity. Some caution is needed for interaction terms. If interaction terms are included in a model, high VIF values are expected. This portion of multicollinearity among the component terms of an interaction is also called "inessential ill-conditioning", which leads to inflated VIF values that are typically seen for models with interaction terms _(Francoeur 2013)_. In such cases, re-fit your model without interaction terms and check this model for collinearity among predictors. -### Normality of residuals +## Normality of residuals In linear regression, residuals should be normally distributed. This can be checked using so-called Q-Q plots (quantile-quantile plot) to compare the shapes of distributions. 
This plot shows the quantiles of the studentized residuals versus fitted values. @@ -261,7 +263,7 @@ diagnostic_plots[[6]] In our example, we see that most data points are ok, except some observations at the tails. Whether any action is needed to fix this or not can also depend on the results of the remaining diagnostic plots. If all other plots indicate no violation of assumptions, some deviation of normality, particularly at the tails, can be less critical. -#### How to fix this? +### How to fix this? Here are some remedies to fix non-normality of residuals, according to _Pek et al. 2018_. diff --git a/vignettes/check_model_practical.Rmd b/vignettes/check_model_practical.Rmd new file mode 100644 index 000000000..53ca7ab58 --- /dev/null +++ b/vignettes/check_model_practical.Rmd @@ -0,0 +1,215 @@ +--- +title: "How to arrive at the best model fit" +output: + rmarkdown::html_vignette: + toc: true +tags: [r, performance] +vignette: > + \usepackage[utf8]{inputenc} + %\VignetteIndexEntry{How to arrive at the best model fit} + %\VignetteEngine{knitr::rmarkdown} +editor_options: + chunk_output_type: console +--- + +```{r , include=FALSE} +library(knitr) +knitr::opts_chunk$set( + dpi = 300, + fig.width = 7, + fig.height = 5, + out.width = "100%", + out.height = "100%", + collapse = TRUE, + comment = "#>", + warning = FALSE, + message = TRUE +) +options(knitr.kable.NA = "") +options(digits = 2) + +pkgs <- c("DHARMa", "glmmTMB", "see", "parameters") +successfully_loaded <- vapply(pkgs, requireNamespace, FUN.VALUE = logical(1L), quietly = TRUE) +can_evaluate <- all(successfully_loaded) + +if (can_evaluate) { + knitr::opts_chunk$set(eval = TRUE) + vapply(pkgs, require, FUN.VALUE = logical(1L), quietly = TRUE, character.only = TRUE) +} else { + knitr::opts_chunk$set(eval = FALSE) +} +``` + +This vignette shows how to use the *performance* package to check the fit of a model, how to detect misspecification and how to improve your model. 
The basic workflow of the *performance* package can be summarized as follows: + +- fit a regression model +- check the model fit and assess model fit indices +- if necessary, fit another model that could potentially improve the fit +- compare the model fit indices and perform statistical tests to determine which model is the best fit + +![](images/figure_workflow.png){width="75%"} + +In the following, we will demonstrate this workflow using a model with a count response variable. We will fit a Poisson regression model to the Salamanders dataset from the *glmmTMB* package. The dataset contains counts of salamanders in different sites, along with information on the number of mines and the species of salamanders. We will check the model fit and assess the model fit indices. + +Problems that may arise with count response variables are _zero inflation_ and _overdispersion_. Zero inflation occurs when there are more zeros in the data than expected under the Poisson distribution. Overdispersion occurs when the variance of the data is greater than the mean, which violates the assumption of equidispersion in the Poisson distribution. + +We will check for these problems and suggest ways to improve the model fit, i.e. if necessary, we will fit another model that could potentially improve the fit. Finally, we will compare the model fit indices and perform statistical tests to determine which model is the best fit. + +## Fit the initial model + +We start with a generalized mixed effects model, using a Poisson distribution. + +```{r} +library(performance) +model1 <- glmmTMB::glmmTMB( + count ~ mined + spp + (1 | site), + family = poisson, + data = glmmTMB::Salamanders +) +``` + +First, let us look at the summary of the model. + +```{r} +library(parameters) +model_parameters(model1) +``` + +We see a lot of statistically significant estimates here. 
No matter which [philosophy](https://easystats.github.io/parameters/reference/p_function.html) you follow, the conclusions we draw from statistical models will be inaccurate if our modeling assumptions are a poor fit for the situation. Hence, checking model fit is essential. + +In *performance*, we can conduct a comprehensive visual inspection of our model fit using `check_model()`. We won't go into details of all the plots here, but you can find more information on all created diagnostic plots in the [dedicated vignette](https://easystats.github.io/performance/articles/check_model.html). + +For now, we want to focus on the _posterior predictive checks_, _dispersion and zero-inflation_ as well as the Q-Q plot (_uniformity of residuals_). + +```{r fig.height=12, fig.width=10} +check_model(model1, dot_size = 1.2) +``` + +Note that unlike `plot()`, which is a base R function to create diagnostic plots, `check_model()` relies on *simulated residuals* for the Q-Q plot, which is more accurate for non-Gaussian models. See [this vignette](https://easystats.github.io/performance/articles/simulate_residuals.html) and the documentation of `simulate_residuals()` for further details. + +The above plot suggests that we may have issues with overdispersion and/or zero-inflation. We can check for these problems using `check_overdispersion()` and `check_zeroinflation()`, which will perform statistical tests (based on simulated residuals). These tests can be used in addition to the visual inspection. + +```{r} +check_overdispersion(model1) + +check_zeroinflation(model1) +``` + +As we can see, our model seems to suffer both from overdispersion and zero-inflation.
+ +## First attempt at improving the model fit + +We can try to improve the model fit by fitting a model with zero-inflation component: + +```{r fig.height=12, fig.width=10} +model2 <- glmmTMB::glmmTMB( + count ~ mined + spp + (1 | site), + ziformula = ~ mined + spp, + family = poisson, + data = glmmTMB::Salamanders +) +check_model(model2, dot_size = 1.2) +``` + +Looking at the above plots, the zero-inflation seems to be addressed properly (see especially _posterior predictive checks_ and _uniformity of residuals_, the Q-Q plot). However, the overdispersion still could be present. We can check for these problems using `check_overdispersion()` and `check_zeroinflation()` again. + +```{r} +check_overdispersion(model2) + +check_zeroinflation(model2) +``` + +Indeed, the overdispersion is still present. + +## Second attempt at improving the model fit + +We can try to address this issue by fitting a negative binomial model instead of using a Poisson distribution. + +```{r fig.height=12, fig.width=10} +model3 <- glmmTMB::glmmTMB( + count ~ mined + spp + (1 | site), + ziformula = ~ mined + spp, + family = glmmTMB::nbinom1, + data = glmmTMB::Salamanders +) +check_model(model3, dot_size = 1.2) +``` + +Now we see that the plot showing _misspecified dispersion and zero-inflation_ suggests that the overdispersion is better addressed than before. Let us check again: + +```{r} +check_overdispersion(model3) + +check_zeroinflation(model3) +``` + +## Comparing model fit indices + +There are different model fit indices that can be used to compare models. For our purpose, we rely on the Akaike Information Criterion (AIC), the corrected Akaike Information Criterion (AICc), the Bayesian Information Criterion (BIC), and the Proper Scoring Rules. We can compare the models using `compare_performance()` and `plot()`. 
+ +```{r} +result <- compare_performance( + model1, model2, model3, + metrics = c("AIC", "AICc", "BIC", "SCORE") +) +result + +plot(result) +``` + +The weighted AIC and BIC range from 0 to 1, indicating better model fit the closer the value is to 1. The AICc is a corrected version of the AIC for small sample sizes. The Proper Scoring Rules range from -Inf to 0, with higher values (i.e. closer to 0) indicating better model fit. + +The above results suggest that indeed our third model is the best fit. + +## Statistical tests for model comparison + +We can also perform statistical tests to determine which model is the best fit using `test_performance()` or `anova()`. `test_performance()` automatically selects an appropriate test based on the model family. You can also call the different tests, like `test_likelihoodratio()`, `test_bf()`, `test_wald()` or `test_vuong()` directly. + +```{r} +test_performance(model1, model2, model3) +``` + +We see, first, that `test_performance()` used the Bayes factor (based on BIC comparison) to compare the models. And second, that both the second and third model seem to be significantly better than the first model. + +Now we compare the second against the third model +```{r} +test_performance(model2, model3) + +test_likelihoodratio(model2, model3) +``` + +We see that both the Bayes factor and likelihood ratio test suggest that the third model is significantly better than the second model. + +What does this mean for our inference? + +```{r} +model_parameters(model3) +``` + +Obviously, although we might have found the best fitting model, coefficients for the _zero-inflation_ component of our model look rather spurious. We have *very* high coefficients here. We still might find a better distributional family for our model, and try `nbinom2` now. 
+ +```{r fig.height=12, fig.width=10} +model4 <- glmmTMB::glmmTMB( + count ~ mined + spp + (1 | site), + ziformula = ~ mined + spp, + family = glmmTMB::nbinom2, + data = glmmTMB::Salamanders +) +check_model(model4, dot_size = 1.2) + +check_overdispersion(model4) + +check_zeroinflation(model4) + +test_likelihoodratio(model3, model4) + +model_parameters(model4) +``` + +Based on these results, we might even go with `model4`. + +# Conclusion + +Statistics is hard. It is not just about fitting a model, but also about checking the model fit and improving the model. This also requires domain knowledge to consider whether all relevant predictors are included in the model (and whether all included predictors are relevant!). + +The *performance* package provides a comprehensive set of tools to help you with this task. We have demonstrated how to use these tools to check the fit of a model, detect misspecification, and improve the model. We have also shown how to compare the model fit indices and perform statistical tests to determine which model is the best fit. We hope this vignette has been helpful in guiding you through this process. 
diff --git a/vignettes/check_outliers.Rmd b/vignettes/check_outliers.Rmd new file mode 100644 index 000000000..906095f21 --- /dev/null +++ b/vignettes/check_outliers.Rmd @@ -0,0 +1,304 @@ +--- +title: "Checking outliers with *performance*" +output: + rmarkdown::html_vignette: + toc: true + fig_width: 10.08 + fig_height: 6 +bibliography: paper.bib +vignette: > + \usepackage[utf8]{inputenc} + %\VignetteIndexEntry{Checking outliers with *performance*} + %\VignetteEngine{knitr::rmarkdown} +editor_options: + chunk_output_type: console +--- + +```{r , include=FALSE} +library(knitr) +library(performance) +options(knitr.kable.NA = "") +knitr::opts_chunk$set( + comment = ">", + message = FALSE, + warning = FALSE, + out.width = "100%", + dpi = 450 +) +options(digits = 2) + +pkgs <- c( + "see", "performance", "datawizard", "rempsyc", + "ggplot2", "flextable", "ftExtra" +) +successfully_loaded <- vapply(pkgs, requireNamespace, FUN.VALUE = logical(1L), quietly = TRUE) +can_evaluate <- all(successfully_loaded) + +if (can_evaluate) { + knitr::opts_chunk$set(eval = TRUE) + vapply(pkgs, require, FUN.VALUE = logical(1L), quietly = TRUE, character.only = TRUE) +} else { + knitr::opts_chunk$set(eval = FALSE) +} +``` + + +# Reuse of this Material + +> Note: This vignette is an extended write-up of the [Behavior Research Methods paper](https://doi.org/10.3758/s13428-024-02356-w). This educational module can be freely reused for teaching purposes as long as the original BRM paper is cited. The raw code file, which can be adapted to other rmarkdown formats for teaching purposes, can be accessed [here](https://github.com/easystats/performance/blob/HEAD/vignettes/check_outliers.Rmd). To contribute to and improve this content directly, please submit a Pull Request at the *{performance}* package GitHub repository by following our usual contributing guidelines: https://easystats.github.io/performance/CONTRIBUTING.html. 
To report issues or problems with this module, or to seek support, please open an issue: https://github.com/easystats/performance/issues. + +**Reference:** + +Thériault, R., Ben-Shachar, M. S., Patil, I., Lüdecke, D., Wiernik, B. M., & Makowski, D. (2024). Check your outliers! An introduction to identifying statistical outliers in R with easystats. *Behavior Research Methods*, 1-11. https://doi.org/10.3758/s13428-024-02356-w + +# Summary + +Beyond the challenge of keeping up-to-date with current best practices regarding the diagnosis and treatment of outliers, an additional difficulty arises concerning the mathematical implementation of the recommended methods. In this vignette, we provide an overview of current recommendations and best practices and demonstrate how they can easily and conveniently be implemented in the R statistical computing software, using the *{performance}* package of the *easystats* ecosystem. We cover univariate, multivariate, and model-based statistical outlier detection methods, their recommended threshold, standard output, and plotting methods. We conclude with recommendations on the handling of outliers: the different theoretical types of outliers, whether to exclude or winsorize them, and the importance of transparency. + +# Statement of Need + +Real-life data often contain observations that can be considered *abnormal* when compared to the main population. The cause of it---be it because they belong to a different distribution (originating from a different generative process) or simply being extreme cases, statistically rare but not impossible---can be hard to assess, and the boundaries of "abnormal" difficult to define. + +Nonetheless, the improper handling of these outliers can substantially affect statistical model estimations, biasing effect estimations and weakening the models' predictive performance. It is thus essential to address this problem in a thoughtful manner.
Yet, despite the existence of established recommendations and guidelines, many researchers still do not treat outliers in a consistent manner, or do so using inappropriate strategies [@simmons2011false; @leys2013outliers]. + +One possible reason is that researchers are not aware of the existing recommendations, or do not know how to implement them using their analysis software. In this paper, we show how to follow current best practices for automatic and reproducible statistical outlier detection (SOD) using R and the *{performance}* package [@ludecke2021performance], which is part of the *easystats* ecosystem of packages that build an R framework for easy statistical modeling, visualization, and reporting [@easystatspackage]. Installation instructions can be found on [GitHub](https://github.com/easystats/performance) or its [website](https://easystats.github.io/performance/), and its list of dependencies on [CRAN](https://cran.r-project.org/package=performance). + +The instructional materials that follow are aimed at an audience of researchers who want to follow good practices, and are appropriate for advanced undergraduate students, graduate students, professors, or professionals having to deal with the nuances of outlier treatment. + +# Identifying Outliers + +Although many researchers attempt to identify outliers with measures based on the mean (e.g., _z_ scores), those methods are problematic because the mean and standard deviation themselves are not robust to the influence of outliers and those methods also assume normally distributed data (i.e., a Gaussian distribution). Therefore, current guidelines recommend using robust methods to identify outliers, such as those relying on the median as opposed to the mean [@leys2019outliers; @leys2013outliers; @leys2018outliers]. + +Nonetheless, which exact outlier method to use depends on many factors. 
In some cases, eye-gauging odd observations can be an appropriate solution, though many researchers will favour algorithmic solutions to detect potential outliers, for example, based on a continuous value expressing how much the observation stands out from the others. + +One of the factors to consider when selecting an algorithmic outlier detection method is the statistical test of interest. When using a regression model, relevant information can be found by identifying observations that do not fit well with the model. This approach, known as model-based outliers detection (as outliers are extracted after the statistical model has been fit), can be contrasted with distribution-based outliers detection, which is based on the distance between an observation and the "center" of its population. Various quantification strategies of this distance exist for the latter, both univariate (involving only one variable at a time) or multivariate (involving multiple variables). + +When no method is readily available to detect model-based outliers, such as for structural equation modelling (SEM), looking for multivariate outliers may be of relevance. For simple tests (_t_ tests or correlations) that compare values of the same variable, it can be appropriate to check for univariate outliers. However, univariate methods can give false positives since _t_ tests and correlations, ultimately, are also models/multivariable statistics. They are in this sense more limited, but we show them nonetheless for educational purposes. + +Importantly, whatever approach researchers choose remains a subjective decision, whose usage (and rationale) must be transparently documented and reproducible [@leys2019outliers]. Researchers should commit (ideally in a preregistration) to an outlier treatment method before collecting the data. They should report in the paper their decisions and details of their methods, as well as any deviation from their original plan.
These transparency practices can help reduce false positives due to excessive researchers' degrees of freedom (i.e., choice flexibility throughout the analysis). In the following section, we will go through each of the mentioned methods and provide examples on how to implement them with R. + +## Univariate Outliers + +Researchers frequently attempt to identify outliers using measures of deviation from the center of a variable's distribution. One of the most popular such procedures is the _z_ score transformation, which computes the distance in standard deviations (SD) from the mean. However, as mentioned earlier, this popular method is not robust. Therefore, for univariate outliers, it is recommended to use the median along with the Median Absolute Deviation (MAD), which are more robust than the interquartile range or the mean and its standard deviation [@leys2019outliers; @leys2013outliers]. + +Researchers can identify outliers based on robust (i.e., MAD-based) _z_ scores using the `check_outliers()` function of the *{performance}* package, by specifying `method = "zscore_robust"`.^[Note that `check_outliers()` only checks numeric variables.] Although @leys2013outliers suggest a default threshold of 2.5 and @leys2019outliers a threshold of 3, *{performance}* uses by default a less conservative threshold of ~3.29.^[3.29 is an approximation of the two-tailed critical value for _p_ < .001, obtained through `qnorm(p = 1 - 0.001 / 2)`. We chose this threshold for consistency with the thresholds of all our other methods.] That is, data points will be flagged as outliers if they go beyond +/- ~3.29 MAD. Users can adjust this threshold using the `threshold` argument. + +Below we provide example code using the `mtcars` dataset, which was extracted from the 1974 *Motor Trend* US magazine. The dataset contains fuel consumption and 10 characteristics of automobile design and performance for 32 different car models (see `?mtcars` for details).
We chose this dataset because it is accessible from base R and familiar to many R users. We might want to conduct specific statistical analyses on this data set, say, _t_ tests or structural equation modelling, but first, we want to check for outliers that may influence those test results. + +Because the automobile names are stored as row names in `mtcars`, we first have to convert them to an ID column to benefit from the `check_outliers()` ID argument. Furthermore, we only really need a couple columns for this demonstration, so we choose the first four (`mpg` = Miles/(US) gallon; `cyl` = Number of cylinders; `disp` = Displacement; `hp` = Gross horsepower). Finally, because there are no outliers in this dataset, we add two artificial outliers before running our function. + +```{r z_score} +library(performance) + +# Create some artificial outliers and an ID column +data <- rbind(mtcars[1:4], 42, 55) +data <- cbind(car = row.names(data), data) + +outliers <- check_outliers(data, method = "zscore_robust", ID = "car") +outliers +``` + +What we see is that `check_outliers()` with the robust _z_ score method detected two outliers: cases 33 and 34, which were the observations we added ourselves. They were flagged for two variables specifically: `mpg` (Miles/(US) gallon) and `cyl` (Number of cylinders), and the output provides their exact _z_ score for those variables. + +We describe how to deal with those cases in more detail later in the paper, but should we want to exclude these detected outliers from the main dataset, we can extract row numbers using `which()` on the output object, which can then be used for indexing: + +```{r} +which(outliers) + +data_clean <- data[-which(outliers), ] +``` + +All `check_outliers()` output objects possess a `plot()` method, meaning it is also possible to visualize the outliers using the generic `plot()` function on the resulting outlier object after loading the {see} package.
+ +```{r univariate, eval=FALSE} +library(see) +plot(outliers) +``` + +```{r univariate_implicit, fig.cap = "Visual depiction of outliers using the robust z-score method. The distance represents an aggregate score for variables mpg, cyl, disp, and hp.", echo=FALSE} +library(see) +plot(outliers) + + ggplot2::theme(axis.text.x = ggplot2::element_text( + angle = 45, size = 7 + )) +``` + +Other univariate methods are available, such as using the interquartile range (IQR), or based on different intervals, such as the Highest Density Interval (HDI) or the Bias Corrected and Accelerated Interval (BCI). These methods are documented and described in the function's [help page](https://easystats.github.io/performance/reference/check_outliers.html). + +## Multivariate Outliers + +Univariate outliers can be useful when the focus is on a particular variable, for instance the reaction time, as extreme values might be indicative of inattention or non-task-related behavior^[ Note that they might not be the optimal way of treating reaction time outliers [@ratcliff1993methods; @van1995statistical]]. + +However, in many scenarios, variables of a data set are not independent, and an abnormal observation will impact multiple dimensions. For instance, a participant giving random answers to a questionnaire. In this case, computing the _z_ score for each of the questions might not lead to satisfactory results. Instead, one might want to look at these variables together. + +One common approach for this is to compute multivariate distance metrics such as the Mahalanobis distance. Although the Mahalanobis distance is very popular, just like the regular _z_ scores method, it is not robust and is heavily influenced by the outliers themselves. Therefore, for multivariate outliers, it is recommended to use the Minimum Covariance Determinant, a robust version of the Mahalanobis distance [MCD, @leys2018outliers; @leys2019outliers].
+ +In *{performance}*'s `check_outliers()`, one can use this approach with `method = "mcd"`.^[Our default threshold for the MCD method is defined by `stats::qchisq(p = 1 - 0.001, df = ncol(x))`, which again is an approximation of the critical value for _p_ < .001 consistent with the thresholds of our other methods.] + +```{r multivariate} +outliers <- check_outliers(data, method = "mcd", verbose = FALSE) +outliers +``` + +Here, we detected 9 multivariate outliers (i.e., when looking at all variables of our dataset together). + +```{r multivariate_plot, eval=FALSE} +plot(outliers) +``` + +```{r multivariate_implicit, fig.cap = "Visual depiction of outliers using the Minimum Covariance Determinant (MCD) method, a robust version of the Mahalanobis distance. The distance represents the MCD scores for variables mpg, cyl, disp, and hp.", echo=FALSE} +plot(outliers) + + ggplot2::theme(axis.text.x = ggplot2::element_text( + angle = 45, size = 7 + )) +``` + +Other multivariate methods are available, such as another type of robust Mahalanobis distance that in this case relies on an orthogonalized Gnanadesikan-Kettenring pairwise estimator [@gnanadesikan1972robust]. These methods are documented and described in the function's [help page](https://easystats.github.io/performance/reference/check_outliers.html). + +## Model-Based Outliers + +Working with regression models creates the possibility of using model-based SOD methods. These methods rely on the concept of *leverage*, that is, how much influence a given observation can have on the model estimates. If few observations have a relatively strong leverage/influence on the model, one can suspect that the model's estimates are biased by these observations, in which case flagging them as outliers could prove helpful (see next section, "Handling Outliers"). + +In {performance}, two such model-based SOD methods are currently available: Cook's distance, for regular regression models, and Pareto, for Bayesian models.
As such, `check_outliers()` can be applied directly on regression model objects, by simply specifying `method = "cook"` (or `method = "pareto"` for Bayesian models).^[Our default threshold for the Cook method is defined by `stats::qf(0.5, ncol(x), nrow(x) - ncol(x))`, which again is an approximation of the critical value for _p_ < .001 consistent with the thresholds of our other methods.] + +Currently, most lm models are supported (with the exception of `glmmTMB`, `lmrob`, and `glmrob` models), as long as they are supported by the underlying functions `stats::cooks.distance()` (or `loo::pareto_k_values()`) and `insight::get_data()` (for a full list of the 225 models currently supported by the `insight` package, see https://easystats.github.io/insight/#list-of-supported-models-by-class). Also note that although `check_outliers()` supports the pipe operators (`|>` or `%>%`), it does not support `tidymodels` at this time. We show a demo below. + +```{r model, fig.cap = "Visual depiction of outliers based on Cook's distance (leverage and standardized residuals), based on the fitted model."} +model <- lm(disp ~ mpg * hp, data = data) +outliers <- check_outliers(model, method = "cook") +outliers + +plot(outliers) +``` + +Using the model-based outlier detection method, we identified two outliers. + +Table 1 below summarizes which methods to use in which cases, and with what threshold. The recommended thresholds are the default thresholds. 
+ +```{r table1_prep, echo=FALSE} +df <- data.frame( + `Statistical Test` = c( + "Supported regression model", + "Structural Equation Modeling (or other unsupported model)", + "Simple test with few variables (*t* test, correlation, etc.)" + ), + `Diagnosis Method` = c( + "**Model-based**: Cook (or Pareto for Bayesian models)", + "**Multivariate**: Minimum Covariance Determinant (MCD)", + "**Univariate**: robust *z* scores (MAD)" + ), + `Recommended Threshold` = c( + "_qf(0.5, ncol(x), nrow(x) - ncol(x))_ (or 0.7 for Pareto)", + "_qchisq(p = 1 - 0.001, df = ncol(x))_", + "_qnorm(p = 1 - 0.001 / 2)_, ~ 3.29" + ), + `Function Usage` = c( + '_check_outliers(model, method = "cook")_', + '_check_outliers(data, method = "mcd")_', + '_check_outliers(data, method = "zscore_robust")_' + ), + check.names = FALSE +) +``` + +### Table 1 + +_Summary of Statistical Outlier Detection Methods Recommendations_ + +```{r table1_print, echo=FALSE, message=FALSE} +x <- flextable::flextable(df, cwidth = 2.25) +x <- flextable::theme_apa(x) +x <- flextable::font(x, fontname = "Latin Modern Roman", part = "all") +# x <- flextable::fontsize(x, size = 10, part = "all") +ftExtra::colformat_md(x) +``` + +## Cook's Distance vs. MCD + +@leys2018outliers report a preference for the MCD method over Cook's distance. This is because Cook's distance removes one observation at a time and checks its corresponding influence on the model each time [@cook1977detection], and flags any observation that has a large influence. In the view of these authors, when there are several outliers, the process of removing a single outlier at a time is problematic as the model remains "contaminated" or influenced by other possible outliers in the model, rendering this method suboptimal in the presence of multiple outliers. + +However, distribution-based approaches are not a silver bullet either, and there are cases where the usage of methods agnostic to theoretical and statistical models of interest might be problematic. 
For example, a very tall person would be expected to also be much heavier than average, but that would still fit with the expected association between height and weight (i.e., it would be in line with a model such as `weight ~ height`). In contrast, multivariate outlier detection methods may flag this person as being an outlier---being unusual on two variables, height and weight---even though the pattern fits perfectly with our predictions. + +In the example below, we plot the raw data and see two possible outliers. The first one falls along the regression line, and is therefore "in line" with our hypothesis. The second one clearly diverges from the regression line, and therefore we can conclude that this outlier may have a disproportionate influence on our model. + +```{r scatter, fig.cap = "Scatter plot of height and weight, with two extreme observations: one model-consistent (top-right) and the other, model-inconsistent (i.e., an outlier; bottom-right)."} +data <- women[rep(seq_len(nrow(women)), each = 100), ] +data <- rbind(data, c(100, 258), c(100, 200)) +model <- lm(weight ~ height, data) +rempsyc::nice_scatter(data, "height", "weight") +``` + +Using either the *z*-score or MCD methods, our model-consistent observation will be incorrectly flagged as an outlier or influential observation. + +```{r} +outliers <- check_outliers(model, method = c("zscore_robust", "mcd"), verbose = FALSE) +which(outliers) +``` + +In contrast, the model-based detection method displays the desired behavior: it correctly flags the person who is very tall but very light, without flagging the person who is both tall and heavy. 
+ +```{r model2, fig.cap = "The leverage method (Cook's distance) correctly distinguishes the true outlier from the model-consistent extreme observation, based on the fitted model."} +outliers <- check_outliers(model, method = "cook") +which(outliers) +plot(outliers) +``` + +Finally, unusual observations happen naturally: extreme observations are expected even when taken from a normal distribution. While statistical models can integrate this "expectation", multivariate outlier methods might be too conservative, flagging too many observations despite belonging to the right generative process. For these reasons, we believe that model-based methods are still preferable to the MCD when using supported regression models. Additionally, if the presence of multiple outliers is a significant concern, regression methods that are more robust to outliers should be considered---like _t_ regression or quantile regression---as they render their precise identification less critical [@mcelreath2020statistical]. + +## Composite Outlier Score + +The *{performance}* package also offers an alternative, consensus-based approach that combines several methods, based on the assumption that different methods provide different angles of looking at a given problem. By applying a variety of methods, one can hope to "triangulate" the true outliers (those consistently flagged by multiple methods) and thus attempt to minimize false positives. + +In practice, this approach computes a composite outlier score, formed of the average of the binary (0 or 1) classification results of each method. It represents the probability that each observation is classified as an outlier by at least one method. The default decision rule classifies rows with composite outlier scores greater than or equal to 0.5 as outlier observations (i.e., that were classified as outliers by at least half of the methods). 
In *{performance}*'s `check_outliers()`, one can use this approach by including all desired methods in the corresponding argument. + +```{r multimethod, fig.cap = "Visual depiction of outliers using several different statistical outlier detection methods."} +outliers <- check_outliers(model, method = c("zscore_robust", "mcd", "cook"), verbose = FALSE) +which(outliers) +``` + +Outliers (counts or per variables) for individual methods can then be obtained through attributes. For example: + +```{r} +attributes(outliers)$outlier_var$zscore_robust +``` + +An example sentence for reporting the usage of the composite method could be: + +> Based on a composite outlier score [see the 'check_outliers()' function in the 'performance' R package, @ludecke2021performance] obtained via the joint application of multiple outliers detection algorithms [(a) median absolute deviation (MAD)-based robust _z_ scores, @leys2013outliers; (b) Mahalanobis minimum covariance determinant (MCD), @leys2019outliers; and (c) Cook's distance, @cook1977detection], we excluded two participants that were classified as outliers by at least half of the methods used. + +# Handling Outliers + +The above section demonstrated how to identify outliers using the `check_outliers()` function in the *{performance}* package. But what should we do with these outliers once identified? Although it is common to automatically discard any observation that has been marked as "an outlier" as if it might infect the rest of the data with its statistical ailment, we believe that the use of SOD methods is but one step in the get-to-know-your-data pipeline; a researcher or analyst's _domain knowledge_ must be involved in the decision of how to deal with observations marked as outliers by means of SOD. Indeed, automatic tools can help detect outliers, but they are nowhere near perfect. 
Although they can be useful to flag suspect data, they can have misses and false alarms, and they cannot replace human eyes and proper vigilance from the researcher. If you do end up manually inspecting your data for outliers, it can be helpful to think of outliers as belonging to different types of outliers, or categories, which can help decide what to do with a given outlier. + +## Error, Interesting, and Random Outliers + +@leys2019outliers distinguish between error outliers, interesting outliers, and random outliers. _Error outliers_ are likely due to human error and should be corrected before data analysis or outright removed since they are invalid observations. _Interesting outliers_ are not due to technical error and may be of theoretical interest; it might thus be relevant to investigate them further even though they should be removed from the current analysis of interest. _Random outliers_ are assumed to be due to chance alone and to belong to the correct distribution and, therefore, should be retained. + +It is recommended to _keep_ observations which are expected to be part of the distribution of interest, even if they are outliers [@leys2019outliers]. However, if it is suspected that the outliers belong to an alternative distribution, then those observations could have a large impact on the results and call into question their robustness, especially if significance is conditional on their inclusion, so should be removed. + +We should also keep in mind that there might be error outliers that are not detected by statistical tools, but should nonetheless be found and removed. For example, if we are studying the effects of X on Y among teenagers and we have one observation from a 20-year-old, this observation might not be a _statistical outlier_, but it is an outlier in the _context_ of our research, and should be discarded. 
We could call these observations *undetected* error outliers, in the sense that although they do not statistically stand out, they do not belong to the theoretical or empirical distribution of interest (e.g., teenagers). In this way, we should not blindly rely on statistical outlier detection methods; doing our due diligence to investigate undetected error outliers relative to our specific research question is also essential for valid inferences. + +## Winsorization + +_Removing_ outliers can in this case be a valid strategy, and ideally one would report results with and without outliers to see the extent of their impact on results. This approach however can reduce statistical power. Therefore, some propose a _recoding_ approach, namely, winsorization: bringing outliers back within acceptable limits [e.g., 3 MADs, @tukey1963less]. However, if possible, it is recommended to collect enough data so that even after removing outliers, there is still sufficient statistical power without having to resort to winsorization [@leys2019outliers]. + +The _easystats_ ecosystem makes it easy to incorporate this step into your workflow through the `winsorize()` function of *{datawizard}*, a lightweight R package to facilitate data wrangling and statistical transformations [@patil2022datawizard]. This procedure will bring back univariate outliers within the limits of 'acceptable' values, based either on the percentile, the _z_ score, or its robust alternative based on the MAD. + +```{r winsorization} +data[1501:1502, ] # See outlier rows +# Winsorizing using the MAD +library(datawizard) +winsorized_data <- winsorize(data, method = "zscore", robust = TRUE, threshold = 3) +# Values beyond +/- 3 MAD have been winsorized +winsorized_data[1501:1502, ] +``` + +## The Importance of Transparency + +Once again, it is a critical part of a sound outlier treatment that regardless of which SOD method is used, it should be reported in a reproducible manner. 
Ideally, the handling of outliers should be specified *a priori* with as much detail as possible, and preregistered, to limit researchers' degrees of freedom and therefore risks of false positives [@leys2019outliers]. This is especially true given that interesting outliers and random outliers are often hard to distinguish in practice. Thus, researchers should always prioritize transparency and report all of the following information: (a) how many outliers were identified (including percentage); (b) according to which method and criteria; (c) using which function of which R package (if applicable); and (d) how they were handled (excluded or winsorized; if the latter, using what threshold). If at all possible, (e) the corresponding code script along with the data should be shared on a public repository like the Open Science Framework (OSF), so that the exclusion criteria can be reproduced precisely. + +# Conclusion + +In this vignette, we have shown how to investigate outliers using the `check_outliers()` function of the *{performance}* package while following current good practices. However, best practice for outlier treatment does not stop at using appropriate statistical algorithms, but entails respecting existing recommendations, such as preregistration, reproducibility, consistency, transparency, and justification. Ideally, one would also report the package, function, and threshold used (linking to the full code when possible). We hope that this vignette and the accompanying `check_outliers()` function of *easystats* will help researchers engage in good research practices while providing a smooth outlier detection experience. 
+ +# References diff --git a/vignettes/images/figure_workflow.png b/vignettes/images/figure_workflow.png new file mode 100644 index 000000000..2d826c203 Binary files /dev/null and b/vignettes/images/figure_workflow.png differ diff --git a/papers/Mathematics/mybibfile.bib b/vignettes/paper.bib similarity index 90% rename from papers/Mathematics/mybibfile.bib rename to vignettes/paper.bib index 463fd9e2b..56c8ae7e1 100644 --- a/papers/Mathematics/mybibfile.bib +++ b/vignettes/paper.bib @@ -41,18 +41,17 @@ @article{simmons2011false URL = {https://doi.org/10.1177/0956797611417632}, } -@Article{easystatspackage, - title = {easystats: Framework for Easy Statistical Modeling, Visualization, and Reporting}, - author = {Daniel Lüdecke and Mattan S. Ben-Shachar and Indrajeet Patil and Brenton M. Wiernik and Etienne Bacher and Rémi Thériault and Dominique Makowski}, - journal = {CRAN}, - year = {2022}, - note = {R package}, - url = {https://easystats.github.io/easystats/}, - } +@software{easystatspackage, + title = {{easystats}: Streamline Model Interpretation, Visualization, and Reporting}, + author = {Daniel Lüdecke and Dominique Makowski and Mattan S. Ben-Shachar and Indrajeet Patil and Brenton M. Wiernik and Etienne Bacher and Rémi Thériault}, + date = {2023-02-04T22:06:06Z}, + origdate = {2019-01-28T10:39:29Z}, + url = {https://easystats.github.io/easystats/} +} @Article{ludecke2021performance, author = {Daniel Lüdecke and Mattan S. 
Ben-Shachar and Indrajeet Patil and Philip Waggoner and Dominique Makowski}, - title = {{performance}: An R package for assessment, comparison and testing of statistical models}, + title = {{performance}: An {R} package for assessment, comparison and testing of statistical models}, volume = {6}, number = {60}, journal = {Journal of Open Source Software}, @@ -146,7 +145,7 @@ @article{ratcliff1993methods } @book{mcelreath2020statistical, - title={Statistical rethinking: A Bayesian course with examples in R and Stan}, + title={Statistical rethinking: A Bayesian course with examples in {R} and Stan}, author={McElreath, Richard}, year={2020}, publisher={CRC press} diff --git a/vignettes/r2.Rmd b/vignettes/r2.Rmd index 843bd3412..e6ee63356 100644 --- a/vignettes/r2.Rmd +++ b/vignettes/r2.Rmd @@ -29,14 +29,11 @@ knitr::opts_chunk$set( ) options(digits = 2) -pkgs <- c( - "effectsize", "BayesFactor", "lme4", "rstanarm" -) +pkgs <- c("effectsize", "lme4", "rstanarm") successfully_loaded <- sapply(pkgs, requireNamespace, quietly = TRUE) if (all(successfully_loaded)) { library(performance) library(effectsize) - library(BayesFactor) library(lme4) library(rstanarm) } @@ -147,27 +144,6 @@ model <- stan_lmer(Petal.Length ~ Petal.Width + (1 | Species), data = iris, refr r2(model) ``` -Let's look at another regression analysis carried out with `{BayesFactor}` package. 
- -```{r, eval=successfully_loaded["BayesFactor"] && utils::packageVersion("BayesFactor") >= package_version("0.9.12-4.3")} -library(BayesFactor) -data(puzzles) - -m1 <- anovaBF(extra ~ group + ID, - data = sleep, - whichRandom = "ID", progress = FALSE -) - -r2(m1) - -m2 <- generalTestBF(RT ~ shape * color + ID, - data = puzzles, whichRandom = "ID", - neverExclude = "ID", progress = FALSE -) - -r2(m2) -``` - # Comparing change in R2 using Cohen's *f* Cohen's $f$ (of [ANOVA fame](https://easystats.github.io/effectsize/articles/anovaES.html)) can be used as a measure of effect size in the context of sequential multiple regression (i.e., [**nested models**](https://easystats.github.io/performance/reference/test_performance.html)). diff --git a/vignettes/simulate_residuals.Rmd b/vignettes/simulate_residuals.Rmd new file mode 100644 index 000000000..89f826ba6 --- /dev/null +++ b/vignettes/simulate_residuals.Rmd @@ -0,0 +1,106 @@ +--- +title: "Checking simulated residuals" +output: + rmarkdown::html_vignette: + toc: true +tags: [r, performance] +vignette: > + \usepackage[utf8]{inputenc} + %\VignetteIndexEntry{Checking simulated residuals} + %\VignetteEngine{knitr::rmarkdown} +editor_options: + chunk_output_type: console +--- + +```{r , include=FALSE} +library(knitr) +knitr::opts_chunk$set( + dpi = 300, + fig.width = 7, + fig.height = 5, + out.width = "100%", + collapse = TRUE, + comment = "#>", + warning = FALSE, + message = FALSE +) +options(knitr.kable.NA = "") +options(digits = 2) + +pkgs <- c("DHARMa", "glmmTMB", "see") +successfully_loaded <- vapply(pkgs, requireNamespace, FUN.VALUE = logical(1L), quietly = TRUE) +can_evaluate <- all(successfully_loaded) + +if (can_evaluate) { + knitr::opts_chunk$set(eval = TRUE) + vapply(pkgs, require, FUN.VALUE = logical(1L), quietly = TRUE, character.only = TRUE) +} else { + knitr::opts_chunk$set(eval = FALSE) +} +``` + +The basic workflow for simulated residual checks using `simulate_residuals()` is as follows. 
+ +First, fit a model: + +```{r} +model <- glmmTMB::glmmTMB( + count ~ mined + spp + (1 | site), + family = poisson, + data = glmmTMB::Salamanders +) +``` + +Next, simulate residuals from the model: + +```{r} +library(performance) +simulated_residuals <- simulate_residuals(model) + +simulated_residuals +``` + +The raw residuals can be extracted using `residuals()`: + +```{r} +head(residuals(simulated_residuals)) +``` + + + +Note that since this inherits the DHARMa class, all the methods implemented in DHARMa just work, including all the tests: + +```{r} +DHARMa::testUniformity(simulated_residuals, plot = FALSE) +``` + + +Finally, run specific checks on the simulated residuals: + +```{r message=TRUE} +check_residuals(simulated_residuals) +``` + +Further implemented checks are tests for overdispersion, outliers and zero-inflation. + +```{r message=TRUE} +check_overdispersion(simulated_residuals) + +check_zeroinflation(simulated_residuals) + +check_outliers(simulated_residuals) +``` + +The above three functions internally call `simulate_residuals()` for more complex models automatically, so you don't need to call `simulate_residuals()` yourself. Simulated residuals are usually more reliable than the standard residuals, especially for complex models. + +Finally, you can even perform a visual check for the entire model, either by passing the model object directly, or the object returned from `simulate_residuals()`. + +```{r fig.height=12, fig.width=10} +check_model(simulated_residuals, dot_size = 1.5) +``` + +The `check_model()` function is the main reason we don't want to prematurely extract the residuals in `simulate_residuals()`, because if we do then the simulated residual won't contain the model fit (`fittedModel` in the output below), so we won't be able to do all of the checks we would want to do using the model (e.g., posterior predictive checks). + +```{r} +str(simulated_residuals, max.level = 1) +```