From a8010fbf8828efef4ac472a34fae0a47c942ed70 Mon Sep 17 00:00:00 2001 From: AngusMcLure Date: Tue, 2 Apr 2024 12:41:54 +1100 Subject: [PATCH] sample size and power for random catch sizes --- NAMESPACE | 2 + R/power.R | 238 +++++++++++++++++++++++++++++++++++++++------- man/power_pool.Rd | 84 ++++++++++++++-- 3 files changed, 283 insertions(+), 41 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 858ca24..cb2a060 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -17,4 +17,6 @@ export(pois_catch) export(pool_max_size) export(pool_target_number) export(power_pool) +export(power_pool_random) export(sample_size_pool) +export(sample_size_pool_random) diff --git a/R/power.R b/R/power.R index 22b3c3b..3497432 100644 --- a/R/power.R +++ b/R/power.R @@ -6,12 +6,21 @@ #' `sample_size_pool()` calculate the sample size required for a pooled survey #' to achieve a specified power. #' -#' @param sample_size numeric The total number of units across the whole sample. -#' Should be greater than `pool_size * pool_number` #' @param pool_size numeric The number of units per pool. Must be a numeric #' value or vector of values greater than 0. #' @param pool_number numeric The number of pools per cluster. Must be a integer #' value or a vector of integer values greater than or equal to 1. +#' @param cluster_number numeric The total number of clusters in a cluster +#' survey design (should be greater than 1) or 1 for surveys where all +#' collection happens at a single site or units are collected via simple +#' random sampling from the target population. +#' @param catch_dist An object of class `distribution` (e.g. produced by +#' `nb_catch()`) defining the distribution of the possible catch. +#' @param pool_strat function Defines a rule for how a number of units will be +#' divided into pools. Must take a single numeric argument and return a named +#' list of pool sizes and pool numbers. `pool_max_size()` and +#' `pool_target_number()` provide convenience functions for defining common +#' pooling strategies. #' @param prevalence_null,prevalence_alt numeric The proportion of units that #' carry the marker of interest (i.e. true positive). `prevalence_null` is the #' threshold to compare to and `prevalence_alt` is the design prevalence. Must @@ -46,51 +55,66 @@ #' @param link string Transformation to be applied to prevalence estimates for #' the purposes of calculating confidence intervals. Options are 'identity' #' (i.e. no transformation), 'logit' (default), 'cloglog' and 'log'. +#' @param max_iter numeric Maximum number of iterations (possible catch sizes) +#' to consider when calculating expected FI over random catch sizes. Generally +#' needs to be large enough so that nearly all catch sizes will be less than +#' `max_iter`; otherwise the algorithm will terminate early (with a warning). +#' @param rel_tol numeric Relative tolerance for determining convergence when +#' calculating expected FI over random catch sizes. Must be positive and +#' should be much smaller than 1.
#' #' #' @return The statistical power of the proposed design with regards to #' comparing prevalence to a threshold (`power_pool()`) or a list with the -#' sample size (number of sites, pools, and units) required to achieve desired -#' power (`sample_size_pool()`) +#' sample size (number of clusters, pools, and units) required to achieve +#' desired power (`sample_size_pool()`) #' @export #' #' @examples -#' power_pool(sample_size = 1000, pool_size = 10, pool_number = 2, +#' power_pool(pool_size = 10, pool_number = 2, cluster_number = 50, #' prevalence_null = 0.01, prevalence_alt = 0.02) #' #' sample_size_pool(pool_size = 10, pool_number = 2, #' prevalence_null = 0.01, prevalence_alt = 0.02) #' -#' power_pool(sample_size = 1000, pool_size = 10, pool_number = 2, +#' power_pool(pool_size = 10, pool_number = 2, cluster_number = 50, #' prevalence_null = 0.01, prevalence_alt = 0.02, #' correlation = 0.01) -#' +#' #' sample_size_pool(pool_size = 10, pool_number = 2, #' prevalence_null = 0.01, prevalence_alt = 0.02, #' correlation = 0.01) +#' +#' power_pool_random(nb_catch(20,25), pool_target_number(2), cluster_number = 50, +#' prevalence_null = 0.01, prevalence_alt = 0.02, +#' correlation = 0.01) +#' +#' sample_size_pool_random(nb_catch(20,25), pool_target_number(2), +#' prevalence_null = 0.01, prevalence_alt = 0.02, +#' correlation = 0.01) + -power_pool <- function(sample_size, pool_size, pool_number, prevalence_null, prevalence_alt, +power_pool <- function(pool_size, pool_number, cluster_number, + prevalence_null, prevalence_alt, correlation = 0, sensitivity = 1, specificity = 1, sig_level = 0.05, alternative = 'greater', - form = 'beta', link = 'logit'){ - if(sample_size < pool_size*pool_number){ - stop('The total number of units in sample (sample_size) must be greater than the pool size times pool number (pool_size * pool_number)') + form = 'logitnorm', link = 'logit'){ + if(correlation > 0 & cluster_number <= 1){ + stop('The number of clusters (cluster_number) must be (substantially) greater than 1 if there is non-zero correlation between units in a cluster') + } + + if(correlation > 0 & cluster_number <= 10){ + warning('Estimated power may be unreliable if number of clusters (cluster_number) is less than 10') + } + + if(correlation == 0 & cluster_number * pool_number <= 10){ + warning('Estimated power may be unreliable if total number of pools (cluster_number * pool_number) is less than 10') } + thetaa <- prevalence_alt theta0 <- prevalence_null - # The idea here was that the hypothesis test would be for mu rather than theta, with - # the idea that this would be equivalent to a test on theta, i.e. 
if mu0 = -  # g(theta0) thetaa - - # if(real.scale & form %in% c('logitnorm', 'cloglognorm')){ g <- function(x){ - # #calculate mu from theta and rho .var <- correlation * x * (1-x) - # mu_sigma_linknorm(x,.var, link = switch(form, logitnorm = stats::qlogis, - # cloglognorm = cloglog), invlink = switch(form, logitnorm = plogis, - # cloglognorm = cloglog_inv))[1] } gdivinv <- function(x){1} - # }else{ g <- switch(link, logit = stats::qlogis, cloglog = cloglog, @@ -102,9 +126,8 @@ power_pool <- function(sample_size, pool_size, pool_number, prevalence_null, pre cloglog = function(x){-log1p(-x) * (1-x)}, log = function(x){x}, identity = function(x){1}) - # } - fia <- sample_size/(pool_size*pool_number) * gdivinv(thetaa)^2 / + fia <- cluster_number * gdivinv(thetaa)^2 / solve(fi_pool_cluster(pool_size = pool_size, pool_number = pool_number, prevalence = thetaa, @@ -113,7 +136,7 @@ power_pool <- function(sample_size, pool_size, pool_number, prevalence_null, pre specificity = specificity, form = form))[1,1] #print(fia) - fi0 <- sample_size/(pool_size*pool_number) * gdivinv(theta0)^2 / + fi0 <- cluster_number * gdivinv(theta0)^2/ solve(fi_pool_cluster(pool_size = pool_size, pool_number = pool_number, prevalence = theta0, #should this be theta0 or thetaa? @@ -136,12 +159,87 @@ power_pool <- function(sample_size, pool_size, pool_number, prevalence_null, pre #' @rdname power_pool #' @export +power_pool_random <- function(catch_dist, pool_strat, cluster_number, + prevalence_null, prevalence_alt, + correlation = 0, sensitivity = 1, specificity = 1, + sig_level = 0.05, alternative = 'greater', + form = 'logitnorm', link = 'logit', + max_iter = 10000, rel_tol = 1e-6){ + + exp_pools <- ev(\(catch) sum(pool_strat(catch)$pool_number), + catch_dist, max_iter, rel_tol) * cluster_number + + if(correlation > 0 & cluster_number <= 1){ + stop('The number of clusters (cluster_number) must be (substantially) greater than 1 if there is non-zero correlation between units in a cluster') + }else if(cluster_number < 1){ + stop('The number of clusters (cluster_number) must be at least 1.') + } + + if(correlation > 0 & cluster_number <= 10){ + warning('Estimated power may be unreliable if number of clusters (cluster_number) is less than 10') + } + + if(correlation == 0 & exp_pools <= 10){ + warning('The expected number of pools from this survey is less than ', ceiling(exp_pools), + '. Estimated power may be unreliable if total number of pools is less than 10') + } + + thetaa <- prevalence_alt + theta0 <- prevalence_null + + g <- switch(link, + logit = stats::qlogis, + cloglog = cloglog, + log = log, + identity = function(x){x}) + + gdivinv <- switch(link, + logit = function(x){x * (1-x)}, + cloglog = function(x){-log1p(-x) * (1-x)}, + log = function(x){x}, + identity = function(x){1}) + + fia <- cluster_number * gdivinv(thetaa)^2 / + solve(fi_pool_cluster_random(catch_dist = catch_dist, + pool_strat = pool_strat, + prevalence = thetaa, + correlation = correlation, + sensitivity = sensitivity, + specificity = specificity, + form = form, + max_iter = max_iter, + rel_tol = rel_tol))[1,1] + + fi0 <- cluster_number * gdivinv(theta0)^2 / + solve(fi_pool_cluster_random(catch_dist = catch_dist, + pool_strat = pool_strat, + prevalence = theta0, #should this be theta0 or thetaa?
+ correlation = correlation, + sensitivity = sensitivity, + specificity = specificity, + form = form, + max_iter = max_iter, + rel_tol = rel_tol))[1,1] + + power <- switch(alternative, + less = stats::pnorm(((g(theta0) - g(thetaa)) - stats::qnorm(1-sig_level)/sqrt(fi0)) * sqrt(fia)), + greater = stats::pnorm(((g(thetaa) - g(theta0)) - stats::qnorm(1-sig_level)/sqrt(fi0)) * sqrt(fia)), + two.sided = stats::pnorm(((g(theta0) - g(thetaa)) - stats::qnorm(1-sig_level/2)/sqrt(fi0)) * sqrt(fia)) + + stats::pnorm(((g(thetaa) - g(theta0)) - stats::qnorm(1-sig_level/2)/sqrt(fi0)) * sqrt(fia)), + stop('invalid alternative. options are less, greater, and two.sided') + ) + power +} + +#' @rdname power_pool +#' @export + sample_size_pool <- function(pool_size, pool_number, prevalence_null, prevalence_alt, correlation = 0, sensitivity = 1, specificity = 1, power = 0.8, sig_level = 0.05, alternative = 'greater', - form = 'beta', + form = 'logitnorm', link = 'logit'){ thetaa <- prevalence_alt theta0 <- prevalence_null @@ -170,7 +268,7 @@ sample_size_pool <- function(pool_size, pool_number, log = function(x){x}, identity = function(x){1}) - unit_fia <- 1/(pool_size*pool_number) * gdivinv(thetaa)^2 / + fia <- gdivinv(thetaa)^2 / solve(fi_pool_cluster(pool_size = pool_size, pool_number = pool_number, prevalence = thetaa, @@ -178,7 +276,7 @@ sample_size_pool <- function(pool_size, pool_number, sensitivity = sensitivity, specificity = specificity, form = form))[1,1] - unit_fi0 <- 1/(pool_size*pool_number) * gdivinv(theta0)^2 / + fi0 <- gdivinv(theta0)^2 / solve(fi_pool_cluster(pool_size = pool_size, pool_number = pool_number, prevalence = theta0, @@ -188,12 +286,88 @@ sample_size_pool <- function(pool_size, pool_number, form = form))[1,1] # Note that the below is correct for either kind of one-sided test, but not for two sided tests - ssraw <- ((stats::qnorm(power)/sqrt(unit_fia) + stats::qnorm(1 - sig_level)/sqrt(unit_fi0))/(g(theta0) - g(thetaa)))^2 + total_clusters_raw <- ((stats::qnorm(power)/sqrt(fia) + stats::qnorm(1 - sig_level)/sqrt(fi0))/(g(theta0) - g(thetaa)))^2 - total_sites <- ceiling(ssraw/(pool_number * pool_size)) - total_pools <- total_sites * pool_number + total_clusters <- ceiling(total_clusters_raw) + total_pools <- total_clusters * pool_number total_units <- total_pools * pool_size - return(list(sites = total_sites, pools = total_pools, units = total_units)) + return(list(clusters = total_clusters, pools = total_pools, units = total_units)) + +} + + + +#' @rdname power_pool +#' @export +#' + +sample_size_pool_random <- function(catch_dist, pool_strat, + prevalence_null, prevalence_alt, + correlation = 0, sensitivity = 1, specificity = 1, + power = 0.8, sig_level = 0.05, + alternative = 'greater', + form = 'logitnorm', link = 'logit', + max_iter = 10000, rel_tol = 1e-6){ + thetaa <- prevalence_alt + theta0 <- prevalence_null + + if(!(alternative %in% c('less', 'greater'))){ + stop('currently only supports one-sided tests.
Valid options for alternative are less and greater') + } + + if(alternative == 'less' & theta0 < thetaa){ + stop('If alternative == "less", then prevalence_alt must be less than or equal to prevalence_null' ) + } + + if(alternative == 'greater' & theta0 > thetaa){ + stop('If alternative == "greater", then prevalence_alt must be greater than or equal to prevalence_null' ) + } + + g <- switch(link, + logit = stats::qlogis, + cloglog = cloglog, + log = log, + identity = function(x){x}) + + gdivinv <- switch(link, + logit = function(x){x * (1-x)}, + cloglog = function(x){-log1p(-x) * (1-x)}, + log = function(x){x}, + identity = function(x){1}) + + fia <- gdivinv(thetaa)^2 / + solve(fi_pool_cluster_random(catch_dist = catch_dist, + pool_strat = pool_strat, + prevalence = thetaa, + correlation = correlation, + sensitivity = sensitivity, + specificity = specificity, + form = form, + max_iter = max_iter, + rel_tol = rel_tol))[1,1] + fi0 <- gdivinv(theta0)^2 / + solve(fi_pool_cluster_random(catch_dist = catch_dist, + pool_strat = pool_strat, + prevalence = theta0, #should this be theta0 or thetaa? + correlation = correlation, + sensitivity = sensitivity, + specificity = specificity, + form = form, + max_iter = max_iter, + rel_tol = rel_tol))[1,1] + + # Note that the below is correct for either kind of one-sided test, but not for two sided tests + total_cluster_raw <- ((stats::qnorm(power)/sqrt(fia) + stats::qnorm(1 - sig_level)/sqrt(fi0))/(g(theta0) - g(thetaa)))^2 + + total_clusters <- ceiling(total_cluster_raw) + exp_total_units <- round(mean(catch_dist) * total_clusters,1) + exp_total_pools <- round(ev(\(catch) sum(pool_strat(catch)$pool_number), + catch_dist, max_iter, rel_tol) * total_clusters, 1) + + return(list(clusters = total_clusters, + expected_pools = exp_total_pools, + expected_units = exp_total_units)) + } \ No newline at end of file diff --git a/man/power_pool.Rd b/man/power_pool.Rd index a96629c..6d5db30 100644 --- a/man/power_pool.Rd +++ b/man/power_pool.Rd @@ -2,14 +2,16 @@ % Please edit documentation in R/power.R \name{power_pool} \alias{power_pool} +\alias{power_pool_random} \alias{sample_size_pool} +\alias{sample_size_pool_random} \title{Power and sample size calculations for estimating population prevalence from pooled samples} \usage{ power_pool( - sample_size, pool_size, pool_number, + cluster_number, prevalence_null, prevalence_alt, correlation = 0, @@ -17,10 +19,27 @@ power_pool( specificity = 1, sig_level = 0.05, alternative = "greater", - form = "beta", + form = "logitnorm", link = "logit" ) +power_pool_random( + catch_dist, + pool_strat, + cluster_number, + prevalence_null, + prevalence_alt, + correlation = 0, + sensitivity = 1, + specificity = 1, + sig_level = 0.05, + alternative = "greater", + form = "logitnorm", + link = "logit", + max_iter = 10000, + rel_tol = 1e-06 +) + sample_size_pool( pool_size, pool_number, @@ -32,20 +51,39 @@ sample_size_pool( power = 0.8, sig_level = 0.05, alternative = "greater", - form = "beta", + form = "logitnorm", link = "logit" ) + +sample_size_pool_random( + catch_dist, + pool_strat, + prevalence_null, + prevalence_alt, + correlation = 0, + sensitivity = 1, + specificity = 1, + power = 0.8, + sig_level = 0.05, + alternative = "greater", + form = "logitnorm", + link = "logit", + max_iter = 10000, + rel_tol = 1e-06 +) } \arguments{ -\item{sample_size}{numeric The total number of units across the whole sample. -Should be greater than `pool_size * pool_number`} - \item{pool_size}{numeric The number of units per pool.
Must be a numeric value or vector of values greater than 0.} \item{pool_number}{numeric The number of pools per cluster. Must be a integer value or a vector of integer values greater than or equal to 1.} +\item{cluster_number}{numeric The total number of clusters in a cluster +survey design (should be greater than 1) or 1 for surveys where all +collection happens at a single site or units are collected via simple +random sampling from the target population.} + \item{prevalence_null, prevalence_alt}{numeric The proportion of units that carry the marker of interest (i.e. true positive). `prevalence_null` is the threshold to compare to and `prevalence_alt` is the design prevalence. Must @@ -87,11 +125,31 @@ prevalence and correlation of units within cluster. Select one of "beta", the purposes of calculating confidence intervals. Options are 'identity' (i.e. no transformation), 'logit' (default), 'cloglog' and 'log'.} +\item{catch_dist}{An object of class `distribution` (e.g. produced by +`nb_catch()`) defining the distribution of the possible catch.} + +\item{pool_strat}{function Defines a rule for how a number of units will be +divided into pools. Must take a single numeric argument and return a named +list of pool sizes and pool numbers. `pool_max_size()` and +`pool_target_number()` provide convenience functions for defining common +pooling strategies.} + +\item{max_iter}{numeric Maximum number of iterations (possible catch sizes) +to consider when calculating expected FI over random catch sizes. Generally +needs to be large enough so that nearly all catch sizes will be less than +`max_iter`; otherwise the algorithm will terminate early (with a warning).} + +\item{rel_tol}{numeric Relative tolerance for determining convergence when +calculating expected FI over random catch sizes. Must be positive and +should be much smaller than 1.} + \item{power}{numeric The desired statistical power of the survey.} } \value{ The statistical power of the proposed design with regards to - comparing prevalence to a threshold (`power_pool()`) + comparing prevalence to a threshold (`power_pool()`) or a list with the + sample size (number of clusters, pools, and units) required to achieve + desired power (`sample_size_pool()`) } \description{ `power_pool()` calculates the statistical power of a pooled survey design to @@ -100,17 +158,25 @@ determine whether population prevalence is different from a threshold. to achieve a specified power. } \examples{ -power_pool(sample_size = 1000, pool_size = 10, pool_number = 2, +power_pool(pool_size = 10, pool_number = 2, cluster_number = 50, prevalence_null = 0.01, prevalence_alt = 0.02) sample_size_pool(pool_size = 10, pool_number = 2, prevalence_null = 0.01, prevalence_alt = 0.02) -power_pool(sample_size = 1000, pool_size = 10, pool_number = 2, +power_pool(pool_size = 10, pool_number = 2, cluster_number = 50, prevalence_null = 0.01, prevalence_alt = 0.02, correlation = 0.01) sample_size_pool(pool_size = 10, pool_number = 2, prevalence_null = 0.01, prevalence_alt = 0.02, correlation = 0.01) + +power_pool_random(nb_catch(20,25), pool_target_number(2), cluster_number = 50, + prevalence_null = 0.01, prevalence_alt = 0.02, + correlation = 0.01) + +sample_size_pool_random(nb_catch(20,25), pool_target_number(2), + prevalence_null = 0.01, prevalence_alt = 0.02, + correlation = 0.01) }
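The `pool_strat` argument introduced in this patch is documented as a function that takes a single numeric catch size and returns a named list of pool sizes and pool numbers, with `pool_max_size()` and `pool_target_number()` supplied as ready-made strategies. A minimal sketch of a custom strategy written against that documented contract follows; the element names `pool_size` and `pool_number` are assumed from the `pool_strat(catch)$pool_number` accesses in the patch, so the exact return format should be checked against `pool_max_size()` / `pool_target_number()`.

# Sketch of a custom pooling strategy: pools of at most 20 units, with any
# leftover units forming one smaller pool. The element names pool_size and
# pool_number are assumed from how pool_strat() results are used in
# power_pool_random() above.
pool_max_20 <- function(catch) {
  max_size <- 20
  sizes    <- c(max_size, catch %% max_size)            # full pools, then the remainder pool
  numbers  <- c(catch %/% max_size, (catch %% max_size) > 0)
  keep     <- numbers > 0                               # drop entries contributing zero pools
  list(pool_size = sizes[keep], pool_number = as.numeric(numbers[keep]))
}
pool_max_20(47)  # pool_size = c(20, 7), pool_number = c(2, 1)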
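`power_pool_random()` and `sample_size_pool_random()` take expectations over the catch distribution via `ev()`, which is not part of this patch, so `max_iter` and `rel_tol` can only be read from their documentation. The following is a purely illustrative, hypothetical helper showing the kind of truncated-sum expectation those two parameters describe; it is not the package's `ev()`.

# Hypothetical illustration only: sum pmf(catch) * f(catch) over catch sizes
# 0, 1, 2, ... until an extra catch size changes the running total by less
# than rel_tol (relative), or until max_iter is reached (then warn, matching
# the documented early-termination behaviour).
expect_over_catch <- function(f, pmf, max_iter = 10000, rel_tol = 1e-6) {
  total <- 0
  for (catch in 0:max_iter) {
    total_new <- total + pmf(catch) * f(catch)
    if (catch > 0 && abs(total_new - total) < rel_tol * abs(total_new)) {
      return(total_new)
    }
    total <- total_new
  }
  warning("max_iter reached before the sum converged; result may be truncated")
  total
}
expect_over_catch(identity, function(x) dpois(x, 20))  # ~20, the mean catch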
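All four functions run the same normal-approximation test on the link scale g (logit by default). Writing FI_0 and FI_a for the Fisher information evaluated at the null and design prevalence (total across clusters in the power functions, per cluster in the sample-size functions), the `alternative = 'greater'` branch of the code computes

\[ \text{power} = \Phi\!\left( \left( g(\theta_a) - g(\theta_0) - \frac{z_{1-\alpha}}{\sqrt{FI_0}} \right) \sqrt{FI_a} \right) \]

and the sample-size functions invert this one-sided relationship to obtain the raw number of clusters

\[ n_{\text{clusters}} = \left( \frac{ z_{\text{power}}/\sqrt{FI_a} + z_{1-\alpha}/\sqrt{FI_0} }{ g(\theta_0) - g(\theta_a) } \right)^{2}, \]

which is then rounded up with `ceiling()`; as the code comments note, this inversion is only valid for one-sided tests.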
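For a quick comparison of the fixed-catch and random-catch interfaces, a usage sketch mirroring the `@examples` in this patch (assuming the package exporting these functions is installed and loaded; the interpretation of `pool_target_number(2)` as "aim for 2 pools per cluster regardless of the realised catch" is inferred from its name and documentation):

# Fixed design: 2 pools of 10 units in each of 50 clusters
power_pool(pool_size = 10, pool_number = 2, cluster_number = 50,
           prevalence_null = 0.01, prevalence_alt = 0.02, correlation = 0.01)

# Random catch per cluster (negative binomial catch via nb_catch(20, 25)),
# divided into a target of 2 pools per cluster
power_pool_random(nb_catch(20, 25), pool_target_number(2), cluster_number = 50,
                  prevalence_null = 0.01, prevalence_alt = 0.02, correlation = 0.01)

# Returns the required number of clusters plus the *expected* totals, as a
# list with elements clusters, expected_pools and expected_units
sample_size_pool_random(nb_catch(20, 25), pool_target_number(2),
                        prevalence_null = 0.01, prevalence_alt = 0.02,
                        correlation = 0.01)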