Merge pull request #22 from USEPA/develop

CRAN v0.7.0
USEPA · Jul 18, 2024 · b3e1dab · b3e1dab
2 parents f636ff5 + f3c66cf
commit b3e1dab
Show file tree

Hide file tree

Showing 142 changed files with 18,097 additions and 6,506 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: spmodel
 Title: Spatial Statistical Modeling and Prediction
-Version: 0.6.0
+Version: 0.7.0
 Authors@R: c(
     person(given = "Michael",
            family = "Dumelle",
@@ -28,7 +28,7 @@ License: GPL-3
 Encoding: UTF-8
 LazyData: true
 Roxygen: list(markdown = TRUE)
-RoxygenNote: 7.3.1
+RoxygenNote: 7.3.2
 Depends: 
     R (>= 3.5.0)
 Imports: 
@@ -45,7 +45,8 @@ Suggests:
     testthat (>= 3.0.0),
     ggplot2,
     ranger,
-    statmod
+    statmod,
+    pROC
 VignetteBuilder: knitr
 Config/testthat/edition: 3
 URL: https://usepa.github.io/spmodel/

diff --git a/NAMESPACE b/NAMESPACE
@@ -8,6 +8,12 @@ S3method(AICc,spautor)
 S3method(AICc,spgautor)
 S3method(AICc,spglm)
 S3method(AICc,splm)
+S3method(AUROC,spgautor)
+S3method(AUROC,spglm)
+S3method(BIC,spautor)
+S3method(BIC,spgautor)
+S3method(BIC,spglm)
+S3method(BIC,splm)
 S3method(anova,spautor)
 S3method(anova,spgautor)
 S3method(anova,spglm)
@@ -345,6 +351,7 @@ S3method(vcov,spgautor)
 S3method(vcov,spglm)
 S3method(vcov,splm)
 export(AICc)
+export(AUROC)
 export(augment)
 export(covmatrix)
 export(dispersion_initial)
@@ -391,8 +398,10 @@ importFrom(sf,st_coordinates)
 importFrom(sf,st_crs)
 importFrom(sf,st_drop_geometry)
 importFrom(sf,st_intersects)
+importFrom(sf,st_is_longlat)
 importFrom(stats,.getXlevels)
 importFrom(stats,AIC)
+importFrom(stats,BIC)
 importFrom(stats,anova)
 importFrom(stats,coef)
 importFrom(stats,coefficients)

diff --git a/NEWS.md b/NEWS.md
@@ -1,3 +1,21 @@
+# spmodel 0.7.0
+
+## Minor Updates
+
+* Added `AUROC()`  functions to compute the area under the receiver operating characteristic (AUROC) curve for `spglm()` and `spgautor()` models when `family` is `"binomial"` and the response is binary (i.e., represents a single success or failure).
+* Added a `BIC()` function to compute the Bayesian Information Criterion (BIC) for `splm()`, `spautor()`, `spglm()`, and `spgautor()` models when `estmethod` is `"reml"` (restricted maximum likelihood; the default) or `"ml"` (maximum likelihood).
+* Added a `type` argument to `loocv()` when `cv_predict = TRUE` and using `spglm()` or `spgautor()` models so that predictions may be obtained on the link or response scale.
+* Added a warning message when `data` is an `sf` object and a geographic (i.e., degrees) coordinate system is used instead of a projected coordinate system.
+* Changed the default behavior of `local` in `predict.spmodel` so that it depends only on the observed data sample size. Now, when the observed data sample size exceeds 10,000 `local` is set to `TRUE` by default. This change was made because prediction for big data depends almost exclusively on the observed data sample size, not the number of predictions desired.
+* Minor external data updates (for package testing).
+* Minor vignette updates.
+* Minor documentation updates.
+* Minor error message updates.
+
+## Bug Fixes
+
+* Fixed a bug that prohibited proper indexing when calling `predict()` with the `local` method `"distance"` on a model object fit with a random effect or partition factor.
+
 # spmodel 0.6.0
 
 ## Minor Updates

diff --git a/R/AUROC.R b/R/AUROC.R
@@ -0,0 +1,58 @@
+#' Area Under Receiver Operating Characteristic Curve
+#'
+#' @description Compare area under the receiver operating characteristic curve (AUROC) for binary (e.g.,
+#'   logistic) models. The area under the ROC curve provides a measure of the
+#'   model's classification accuracy averaged over all possible threshold values.
+#'
+#' @param object A fitted model object from [spglm()] or [spgautor()]) where \code{family = "binomial"}
+#'   and the response values are binary, representing a single success or failure for each datum.
+#' @param ... Additional arguments to [pROC::auc()].
+#'
+#' @return The area under the receiver operating characteristic curve.
+#'
+#' @order 1
+#' @export
+#'
+#' @examples
+#' spgmod <- spglm(presence ~ elev,
+#'   family = "binomial", data = moose,
+#'   spcov_type = "exponential"
+#' )
+#' AUROC(spgmod)
+#' @references
+#' Robin, X., Turck, N., Hainard, A., Tiberti, N., Lisacek, F., Sanchez, J. C., & Müller, M.
+#' (2011). pROC: an open-source package for R and S+ to analyze and compare ROC curves.
+#' \emph{BMC bioinformatics}, 12, 1-8.
+AUROC <- function(object, ...) {
+  UseMethod("AUROC", object)
+}
+
+#' @rdname AUROC
+#' @method AUROC spglm
+#' @order 2
+#' @export
+AUROC.spglm <- function(object, ...) {
+  if (!requireNamespace("pROC", quietly = TRUE)) {
+    stop("Install the pROC package before using AUROC().", call. = FALSE)
+  } else {
+
+    if (object$family != "binomial") {
+      stop("AUROC() only available when family is \"binomial\".", call. = FALSE)
+    }
+
+    if (any(object$size != 1)) {
+      stop("AUROC() only available for binary models (i.e., models whose response indicates a single success or failure).", call. = FALSE)
+    }
+    dotlist <- list(...)
+    if (!("quiet" %in% names(dotlist))) {
+      dotlist$quiet <- TRUE
+    }
+    as.numeric(do.call(pROC::auc, c(list(response = as.vector(object$y), predictor = fitted(object)), dotlist)))
+  }
+}
+
+#' @rdname AUROC
+#' @method AUROC spgautor
+#' @order 3
+#' @export
+AUROC.spgautor <- AUROC.spglm
diff --git a/R/BIC.R b/R/BIC.R
@@ -0,0 +1,127 @@
+#' Compute BIC of fitted model objects
+#'
+#' @description Compute BIC for one or
+#' several fitted model objects for which a log-likelihood
+#' value can be obtained.
+#'
+#' @param object A fitted model object from [splm()], [spautor()], [spglm()], or [spgautor()]
+#'   where \code{estmethod} is \code{"ml"} or \code{"reml"}.
+#' @param ... Optionally more fitted model objects.
+#'
+#' @details When comparing models fit by maximum or restricted maximum
+#'   likelihood, the smaller the BIC, the better the fit. The theory of
+#'   BIC requires that the log-likelihood has been maximized, and hence,
+#'   no BIC methods exist for models where \code{estmethod} is not
+#'   \code{"ml"} or \code{"reml"}. Additionally, BIC comparisons between \code{"ml"}
+#'   and \code{"reml"} models are meaningless -- comparisons should only be made
+#'   within a set of models estimated using \code{"ml"} or a set of models estimated
+#'   using \code{"reml"}. BIC comparisons for \code{"reml"} must
+#'   use the same fixed effects. To vary the covariance parameters and
+#'   fixed effects simultaneously, use \code{"ml"}.
+#'
+#'   BIC is defined as \eqn{-2loglik + log(n)(estparams)}, where \eqn{n} is the sample size
+#'   and \eqn{estparams} is the number of estimated parameters. For \code{"ml"}, \eqn{estparams} is
+#'   the number of estimated covariance parameters plus the number of estimated
+#'   fixed effects. For \code{"reml"}, \eqn{estparams} is the number of estimated covariance
+#'   parameters.
+#'
+#' @return If just one object is provided, a numeric value with the corresponding
+#'   BIC.
+#'
+#'   If multiple objects are provided, a \code{data.frame} with rows corresponding
+#'   to the objects and columns representing the number of parameters estimated
+#'   (\code{df}) and the BIC.
+#'
+#' @name BIC.spmodel
+#' @method BIC splm
+#' @order 1
+#' @export
+#'
+#' @examples
+#' spmod <- splm(z ~ water + tarp,
+#'   data = caribou,
+#'   spcov_type = "exponential", xcoord = x, ycoord = y
+#' )
+#' BIC(spmod)
+BIC.splm <- function(object, ...) {
+
+  # set k as log of sample size
+  k <- log(object$n)
+
+  # store object and ...
+  object_list <- list(object, ...)
+
+  # see if ... has any elements
+  if (length(object_list) == 1) {
+
+    # number of estimated parameters
+    if (object$estmethod == "ml") {
+      n_est_param <- object$npar + object$p
+    } else {
+      n_est_param <- object$npar
+    }
+
+    # error if not ml or reml
+    if (!object$estmethod %in% c("ml", "reml")) {
+      stop("BIC is only defined is estmethod is \"ml\" or \"reml\".", call. = FALSE)
+    }
+    # compute BIC
+    BIC_val <- -2 * logLik(object) + k * (n_est_param)
+  } else {
+
+
+    # warning if ml and reml in same call
+    est_methods <- vapply(object_list, function(x) x$estmethod, character(1))
+    if ("ml" %in% est_methods && "reml" %in% est_methods) {
+      warning("BIC and BICc should not compare models fit with
+              \"ml\" to models fit with \"reml\"", call. = FALSE)
+    }
+
+    # warning if reml and fixed effects change
+    est_methods_reml <- which(est_methods == "reml")
+    if (length(est_methods_reml) > 1) {
+      if (any(vapply(
+        est_methods_reml,
+        function(x) !identical(formula(object_list[[x]]), formula(object_list[[1]])), logical(1)
+      ))) {
+        warning("BIC and BICc should not be used to compare models fit with \"reml\" whose fixed effect formulas differ.", call. = FALSE)
+      }
+    }
+
+    # find model names provided
+    object_list_names <- as.character(c(substitute(object), (as.list(substitute(list(...)))[-1])))
+
+    # error if any names duplicated
+    if (any(duplicated(object_list_names))) {
+      stop("Each model object must have a unique name", call. = FALSE)
+    }
+    # iterate through each model
+    object_BIC <- lapply(object_list, function(x) {
+      # warning if estmethod not ml or reml
+      if (!object$estmethod %in% c("ml", "reml")) {
+        stop("BIC is only defined is estmethod is \"ml\" or \"reml\".", call. = FALSE)
+      }
+
+      if (x$estmethod == "ml") {
+        n_est_param <- x$npar + x$p
+      } else {
+        n_est_param <- x$npar
+      }
+
+      # store degrees of freedom (parames estimated) and BIC
+      data.frame(df = n_est_param, BIC = -2 * logLik(x) + k * (n_est_param))
+    })
+    # put all BIC data frames together
+    BIC_val <- do.call("rbind", object_BIC)
+    # set rownames as model names
+    row.names(BIC_val) <- object_list_names
+  }
+  # return BIC value
+  BIC_val
+}
+
+#' @rdname BIC.spmodel
+#' @method BIC spautor
+#' @order 2
+#' @export
+BIC.spautor <- BIC.splm
diff --git a/R/BIC_glm.R b/R/BIC_glm.R
@@ -0,0 +1,11 @@
+#' @rdname BIC.spmodel
+#' @method BIC spglm
+#' @order 3
+#' @export
+BIC.spglm <- BIC.splm
+
+#' @rdname BIC.spmodel
+#' @method BIC spgautor
+#' @order 4
+#' @export
+BIC.spgautor <- BIC.spautor
diff --git a/R/get_data_object.R b/R/get_data_object.R
@@ -89,6 +89,20 @@ get_data_object_splm <- function(formula, data, spcov_initial, xcoord, ycoord, e
     stop("Coordinates must be numeric.", call. = FALSE)
   }
 
+  # check if coordinates are projected
+  if (is_sf) {
+    if (st_is_longlat(crs)) {
+      warning("Coordinates are in a geographic coordinate system. For the most accurate results, please ensure
+            coordinates are in a projected coordinate system (e.g., via sf::st_transform()).", call. = FALSE)
+    }
+  } else {
+    # possible revisit this later and add explicit warning
+    # if (any(abs(c(data[[xcoord]], data[[ycoord]])) <= 360)) {
+    #   warning("Coordinates may be in a geographic coordinate system. For the most accurate results, please ensure
+    #         coordinates are in a projected coordinate system (e.g., via sf::st_transform()).", call. = FALSE)
+    # }
+  }
+
   # subsetting by na and not na values
   ## find response variabale name
   na_index <- is.na(data[[all.vars(formula)[1]]])
@@ -206,7 +220,7 @@ get_data_object_splm <- function(formula, data, spcov_initial, xcoord, ycoord, e
   if (is.null(local)) {
     if (n > 5000) {
       local <- TRUE
-      message("Because the sample size exceeds 5000, we are setting local = TRUE to perform computationally efficient approximations. To override this behavior and compute the exact solution, rerun splm() with local = FALSE. Be aware that setting local = FALSE may result in exceedingly long computational times.")
+      message("Because the sample size exceeds 5,000, we are setting local = TRUE to perform computationally efficient approximations. To override this behavior and compute the exact solution, rerun with local = FALSE. Be aware that setting local = FALSE may result in exceedingly long computational times.")
     } else {
       local <- FALSE
     }

diff --git a/R/get_data_object_glm.R b/R/get_data_object_glm.R
@@ -90,6 +90,20 @@ get_data_object_spglm <- function(formula, family, data, spcov_initial, xcoord,
     stop("Coordinates must be numeric.", call. = FALSE)
   }
 
+  # check if coordinates are projected
+  if (is_sf) {
+    if (st_is_longlat(crs)) {
+      warning("Coordinates are in a geographic coordinate system. For the most accurate results, please ensure
+            coordinates are in a projected coordinate system (e.g., via sf::st_transform()).", call. = FALSE)
+    }
+  } else {
+    # possible revisit this later and add explicit warning
+    # if (any(abs(c(data[[xcoord]], data[[ycoord]])) <= 360)) {
+    #   warning("Coordinates may be in a geographic coordinate system. For the most accurate results, please ensure
+    #         coordinates are in a projected coordinate system (e.g., via sf::st_transform()).", call. = FALSE)
+    # }
+  }
+
   # subsetting by na and not na values
   ## find response variable name
   # na_index <- is.na(data[[all.vars(formula)[1]]])
@@ -239,7 +253,7 @@ get_data_object_spglm <- function(formula, family, data, spcov_initial, xcoord,
   if (is.null(local)) {
     if (n > 3000) {
       local <- TRUE
-      message("Because the sample size exceeds 3000, we are setting local = TRUE to perform computationally efficient approximations. To override this behavior and compute the exact solution, rerun splm() with local = FALSE. Be aware that setting local = FALSE may result in exceedingly long computational times.")
+      message("Because the sample size exceeds 3,000, we are setting local = TRUE to perform computationally efficient approximations. To override this behavior and compute the exact solution, rerun with local = FALSE. Be aware that setting local = FALSE may result in exceedingly long computational times.")
     } else {
       local <- FALSE
     }

diff --git a/R/get_local_list.R b/R/get_local_list.R
@@ -66,7 +66,13 @@ get_local_list_estimation <- function(local, data, xcoord, ycoord, n, partition_
 
   # setting var adjust
   if (!"var_adjust" %in% names_local) {
-    local$var_adjust <- "theoretical"
+    if (n <= 100000) {
+      local$var_adjust <- "theoretical"
+    } else {
+      message('var_adjust was not specified and the sample size exceeds 100,000, so the default var_adjust value is being changed from "theoretical" to "none". To override this behavior, rerun and set var_adjust in local. Be aware that setting var_adjust to "theoretical" may result in exceedingly long computational times.')
+      local$var_adjust <- "none"
+    }
+
   } # "none", "empirical", "theoretical", and "pooled"
 
   # setting partition factor

diff --git a/R/glance.R b/R/glance.R
@@ -14,20 +14,21 @@
 #'     \item \code{n} The sample size.
 #'     \item \code{p} The number of fixed effects.
 #'     \item \code{npar} The number of estimated covariance parameters.
-#'     \item \code{value} The optimized value of the fitting function
+#'     \item \code{value} The optimized value of the fitting function.
 #'     \item \code{AIC} The AIC.
 #'     \item \code{AICc} The AICc.
-#'     \item \code{logLik} The log-likelihood
+#'     \item \code{BIC} The BIC.
+#'     \item \code{logLik} The log-likelihood.
 #'     \item \code{deviance} The deviance.
-#'     \item \code{pseudo.r.squared} The pseudo r-squared
+#'     \item \code{pseudo.r.squared} The pseudo r-squared.
 #'   }
 #'
 #' @name glance.spmodel
 #' @method glance splm
 #' @order 1
 #' @export
 #'
-#' @seealso [AIC.spmodel()] [AICc()] [logLik.spmodel()] [deviance.spmodel()] [pseudoR2()] [tidy.spmodel()] [augment.spmodel()]
+#' @seealso [AIC.spmodel()] [AICc()] [BIC.spmodel()] [logLik.spmodel()] [deviance.spmodel()] [pseudoR2()] [tidy.spmodel()] [augment.spmodel()]
 #'
 #' @examples
 #' spmod <- splm(z ~ water + tarp,
@@ -44,6 +45,7 @@ glance.splm <- function(x, ...) {
     value = x$optim$value,
     AIC = ifelse(is_likbased, AIC(x), NA),
     AICc = ifelse(is_likbased, AICc(x), NA),
+    BIC = ifelse(is_likbased, BIC(x), NA),
     logLik = ifelse(is_likbased, logLik(x), NA),
     deviance = ifelse(is_likbased, deviance(x), NA),
     pseudo.r.squared = pseudoR2(x),