From 1bdc76e75486e17db518df8a6d09566df3c71aed Mon Sep 17 00:00:00 2001
From: "Pavel N. Krivitsky" <p.krivitsky@unsw.edu.au>
Date: Tue, 2 Jan 2024 22:06:59 +1100
Subject: [PATCH] Fixed the wrong categories being selected for
 COLLAPSE_SMALLEST() and added lexicographic tie breaking with warnings for
 when categories passed to LARGEST, SMALLEST, and COLLAPSE_SMALLEST have equal
 sizes.

fixes statnet/ergm#544, statnet/ergm#545
---
 DESCRIPTION                        |  4 +-
 R/get.node.attr.R                  | 52 ++++++++++++----
 man/nodal_attributes.Rd            | 14 +++--
 tests/testthat/test-level-select.R | 97 ++++++++++++++++++++++++++++++
 4 files changed, 149 insertions(+), 18 deletions(-)
 create mode 100644 tests/testthat/test-level-select.R

diff --git a/DESCRIPTION b/DESCRIPTION
index dec1c66b3..6f92420fa 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: ergm
-Version: 4.6-7286
-Date: 2023-12-17
+Version: 4.6-7289
+Date: 2024-01-02
 Title: Fit, Simulate and Diagnose Exponential-Family Models for Networks
 Authors@R: c(
              person(c("Mark", "S."), "Handcock", role=c("aut"), email="handcock@stat.ucla.edu"),
diff --git a/R/get.node.attr.R b/R/get.node.attr.R
index 0b88ff648..bdc5e3063 100644
--- a/R/get.node.attr.R
+++ b/R/get.node.attr.R
@@ -116,7 +116,8 @@ get.node.attr <- function(nw, attrname, functionname=NULL, numeric=FALSE) {
 #' transform the attribute by collapsing the smallest `n` categories
 #' into one, naming it `into`. Note that `into` must be of the same
 #' type (numeric, character, etc.) as the vertex attribute in
-#' question.
+#' question. If there are ties for `n`th smallest category, they will
+#' be broken in lexicographic order, and a warning will be issued.
 #'
 #' The name the nodal attribute receives in the statistic can be
 #' overridden by setting a an [attr()]-style attribute `"name"`.
@@ -140,9 +141,10 @@ get.node.attr <- function(nw, attrname, functionname=NULL, numeric=FALSE) {
 #' is `LARGEST`, which will refer to the most frequent category, so,
 #' say, to set such a category as the baseline, pass
 #' `levels=-LARGEST`. In addition, `LARGEST(n)` will refer to the `n`
-#' largest categories. `SMALLEST` works analogously. Note that if there
-#' are ties in frequencies, they will be broken arbitrarily. To
-#' specify numeric or logical levels literally, wrap in [I()].}
+#' largest categories. `SMALLEST` works analogously. If there are ties
+#' in frequencies, they will be broken in lexicographic order, and a
+#' warning will be issued. To specify numeric or logical levels
+#' literally, wrap in [I()].}
 #'
 #'\item{[`NULL`]}{Retain all possible levels; usually equivalent to
 #' passing `TRUE`.}
@@ -192,6 +194,10 @@ get.node.attr <- function(nw, attrname, functionname=NULL, numeric=FALSE) {
 #' table(faux.mesa.high %v% "Grade")
 #' summary(faux.mesa.high~nodefactor((~Grade) %>% COLLAPSE_SMALLEST(2, 0),
 #'                                   levels=TRUE))
+#'
+#' # Handling of tied frequencies
+#' faux.mesa.high %v% "Plans" <- sample(rep(c("College", "Trade School", "Apprenticeship", "Undecided"), c(80,80,20,25)))
+#' summary(faux.mesa.high ~ nodefactor("Plans", levels = -LARGEST))
 #' 
 #' # Mixing between lower and upper grades:
 #' summary(faux.mesa.high~mm(~Grade>=10))
@@ -652,14 +658,38 @@ ERGM_VATTR_SPEC_NULL <- "function,formula,character,AsIs,NULL"
 #' @export
 ERGM_LEVELS_SPEC <- "function,formula,character,numeric,logical,AsIs,NULL,matrix"
 
+rank_cut <- function(x, n, tie_action = c("warning", "error"), top = FALSE){    
+  ordrank <- if(top) function(r) length(x) + 1 - r else identity
+  s1 <- ordrank(rank(x, ties.method="min")) <= n
+  s2 <- ordrank(rank(x, ties.method="max")) <= n
+
+  if(identical(s1, s2)) which(s1)
+  else{
+    tie_action <- match.arg(tie_action)
+    msg <- paste0("Levels ", paste.and(sQuote(names(x)[s1!=s2])), " are tied.")
+    switch(tie_action,
+           error = ergm_Init_abort(msg, " Specify explicitly."),
+           warning = {
+             ergm_Init_warn(msg, " Using the order given.")
+             which(ordrank(rank(x, ties.method="first")) <= n)
+           })
+  }
+}
+
+levels_cut <- function(x, n, lvls = sort(unique(x)), top = FALSE, ...){
+  f <- setNames(tabulate(match(x, lvls)), lvls)
+  sel <- rank_cut(f, n, top=top, ...)
+  if(missing(lvls)) lvls[sel] else sel
+}
+
 #' @rdname nodal_attributes
 #' @export
 LARGEST <- structure(function(l, a){
-  if(!missing(a)) which.max(tabulate(match(a, l))) # passed as levels=LARGEST
+  if(!missing(a)) levels_cut(a, 1, l, top=TRUE) # passed as levels=LARGEST
   else{ # passed as levels=LARGEST(n): return a function
     n <- l
     structure(function(l, a){
-      which(order(tabulate(match(a,l)), decreasing=TRUE)<=n)
+      levels_cut(a, n, l, top=TRUE)
     }, class = c("ergm_levels_spec_function", "function"))
   }
 }, class = c("ergm_levels_spec_function", "function"))
@@ -667,11 +697,11 @@ LARGEST <- structure(function(l, a){
 #' @rdname nodal_attributes
 #' @export
 SMALLEST <- structure(function(l, a){
-  if(!missing(a)) which.min(tabulate(match(a, l))) # passed as levels=SMALLEST
+  if(!missing(a)) levels_cut(a, 1, l) # passed as levels=SMALLEST
   else{ # passed as levels=SMALLEST(n): return a function
     n <- l
     structure(function(l, a){
-      which(order(tabulate(match(a,l)), decreasing=FALSE)<=n)
+      levels_cut(a, n, l)
     }, class = c("ergm_levels_spec_function", "function"))
   }
 }, class = c("ergm_levels_spec_function", "function"))
@@ -698,10 +728,8 @@ COLLAPSE_SMALLEST <- function(object, n, into){
   attr <- object
   function(...){
     vattr <- ergm_get_vattr(attr, ...)
-    lvls <- unique(vattr)
-    vattr.codes <- match(vattr,lvls)
-    smallest <- which(order(tabulate(vattr.codes), decreasing=FALSE)<=n)
-    vattr[vattr.codes %in% smallest] <- into
+    smallest <- levels_cut(vattr, n)
+    vattr[vattr %in% smallest] <- into
     vattr
   }
 }
diff --git a/man/nodal_attributes.Rd b/man/nodal_attributes.Rd
index c4d56ecd2..04aac3378 100644
--- a/man/nodal_attributes.Rd
+++ b/man/nodal_attributes.Rd
@@ -78,7 +78,8 @@ Any of these arguments may also be wrapped in or piped through
 transform the attribute by collapsing the smallest \code{n} categories
 into one, naming it \code{into}. Note that \code{into} must be of the same
 type (numeric, character, etc.) as the vertex attribute in
-question.
+question. If there are ties for \code{n}th smallest category, they will
+be broken in lexicographic order, and a warning will be issued.
 
 The name the nodal attribute receives in the statistic can be
 overridden by setting a an \code{\link[=attr]{attr()}}-style attribute \code{"name"}.
@@ -104,9 +105,10 @@ retain all levels. Negative values exclude. Another special value
 is \code{LARGEST}, which will refer to the most frequent category, so,
 say, to set such a category as the baseline, pass
 \code{levels=-LARGEST}. In addition, \code{LARGEST(n)} will refer to the \code{n}
-largest categories. \code{SMALLEST} works analogously. Note that if there
-are ties in frequencies, they will be broken arbitrarily. To
-specify numeric or logical levels literally, wrap in \code{\link[=I]{I()}}.}
+largest categories. \code{SMALLEST} works analogously. If there are ties
+in frequencies, they will be broken in lexicographic order, and a
+warning will be issued. To specify numeric or logical levels
+literally, wrap in \code{\link[=I]{I()}}.}
 
 \item{\code{\link{NULL}}}{Retain all possible levels; usually equivalent to
 passing \code{TRUE}.}
@@ -157,6 +159,10 @@ table(faux.mesa.high \%v\% "Grade")
 summary(faux.mesa.high~nodefactor((~Grade) \%>\% COLLAPSE_SMALLEST(2, 0),
                                   levels=TRUE))
 
+# Handling of tied frequencies
+faux.mesa.high \%v\% "Plans" <- sample(rep(c("College", "Trade School", "Apprenticeship", "Undecided"), c(80,80,20,25)))
+summary(faux.mesa.high ~ nodefactor("Plans", levels = -LARGEST))
+
 # Mixing between lower and upper grades:
 summary(faux.mesa.high~mm(~Grade>=10))
 # Mixing between grades 7 and 8 only:
diff --git a/tests/testthat/test-level-select.R b/tests/testthat/test-level-select.R
new file mode 100644
index 000000000..f9b35e96f
--- /dev/null
+++ b/tests/testthat/test-level-select.R
@@ -0,0 +1,97 @@
+o <- options(useFancyQuotes=FALSE)
+
+set.seed(123) # Need stable randomization.
+data(florentine)
+flomarriage %v% "x" <- sample(c(1,11,2,3), 16, replace=TRUE) ## 11 tests for numeric rather than alphabetical sorting.
+
+test_that("Nodal attribute level initialization and sorting", {
+  expect_equal(summary(flomarriage ~ nodefactor("x", levels=TRUE)),
+               c(nodefactor.x.1 = 3, nodefactor.x.2 = 18, nodefactor.x.3 = 3, nodefactor.x.11 = 16))
+})
+
+test_that("Selecting the smallest and largest categories", {
+  expect_equal(summary(flomarriage ~ nodefactor("x", levels=SMALLEST)),
+               c(nodefactor.x.3 = 3))
+
+  expect_equal(summary(flomarriage ~ nodefactor("x", levels=SMALLEST(2))),
+               c(nodefactor.x.1 = 3, nodefactor.x.3 = 3))
+
+  expect_equal(summary(flomarriage ~ nodefactor("x", levels=LARGEST)),
+               c(nodefactor.x.11 = 16))
+
+  expect_equal(summary(flomarriage ~ nodefactor("x", levels=LARGEST(2))),
+               c(nodefactor.x.2 = 18, nodefactor.x.11 = 16))
+})
+
+
+test_that("Selector negation", {
+  expect_equal(summary(flomarriage ~ nodefactor("x", levels=-SMALLEST)),
+               c(nodefactor.x.1 = 3, nodefactor.x.2 = 18, nodefactor.x.11 = 16))
+
+  expect_equal(summary(flomarriage ~ nodefactor("x", levels=-SMALLEST(2))),
+               c(nodefactor.x.2 = 18, nodefactor.x.11 = 16))
+
+  expect_equal(summary(flomarriage ~ nodefactor("x", levels=-LARGEST)),
+               c(nodefactor.x.1 = 3, nodefactor.x.2 = 18, nodefactor.x.3 = 3))
+
+  expect_equal(summary(flomarriage ~ nodefactor("x", levels=-LARGEST(2))),
+               c(nodefactor.x.1 = 3, nodefactor.x.3 = 3))
+})
+
+test_that("Collapsing categories", {
+  expect_equal(summary(flomarriage ~ nodefactor("x" %>% COLLAPSE_SMALLEST(2, 5), levels=TRUE)),
+               c(nodefactor.x.2 = 18, nodefactor.x.5 = 6, nodefactor.x.11 = 16)) 
+
+  expect_equal(summary(flomarriage ~ nodefactor("x" %>% COLLAPSE_SMALLEST(3, 5), levels=TRUE)),
+               c(nodefactor.x.5 = 24, nodefactor.x.11 = 16)) 
+
+  expect_equal(summary(flomarriage ~ nodefactor("x" %>% COLLAPSE_SMALLEST(2, 5), levels=SMALLEST(2))),
+               c(nodefactor.x.2 = 18, nodefactor.x.5 = 6))
+})
+
+## Tied categories
+
+set.seed(789) # Need stable randomization.
+data(florentine)
+flomarriage %v% "x" <- sample(c(1,11,2,3), 16, replace=TRUE) ## 11 tests for numeric rather than alphabetical sorting.
+
+test_that("Tied categories nodal attribute level initialization and sorting", {
+  expect_equal(summary(flomarriage ~ nodefactor("x", levels=TRUE)),
+               c(nodefactor.x.1 = 2, nodefactor.x.2 = 11, nodefactor.x.3 = 15, nodefactor.x.11 = 12))
+})
+
+test_that("Tied categories selecting the smallest and largest", {
+  expect_no_warning(expect_equal(summary(flomarriage ~ nodefactor("x", levels=SMALLEST(1))),
+                              c(nodefactor.x.1 = 2)))
+
+  expect_warning(expect_equal(summary(flomarriage ~ nodefactor("x", levels=SMALLEST(2))),
+                              c(nodefactor.x.1 = 2, nodefactor.x.2 = 11)),
+                 "In term 'nodefactor' in package 'ergm': Levels '2' and '11' are tied. Using the order given.")
+
+  expect_no_warning(expect_equal(summary(flomarriage ~ nodefactor("x", levels=SMALLEST(3))),
+                                 c(nodefactor.x.1 = 2, nodefactor.x.2 = 11, nodefactor.x.11 = 12)))
+
+  expect_equal(summary(flomarriage ~ nodefactor("x", levels=LARGEST)),
+               c(nodefactor.x.3 = 15))
+
+  expect_warning(expect_equal(summary(flomarriage ~ nodefactor("x", levels=LARGEST(2))),
+                              c(nodefactor.x.3 = 15, nodefactor.x.11 = 12)),
+                 "In term 'nodefactor' in package 'ergm': Levels '2' and '11' are tied. Using the order given.")
+})
+
+
+test_that("Collapsing categories", {
+  expect_warning(expect_equal(summary(flomarriage ~ nodefactor("x" %>% COLLAPSE_SMALLEST(2, 5), levels=TRUE)),
+                              c(nodefactor.x.3 = 15, nodefactor.x.5 = 13, nodefactor.x.11 = 12)),
+                 "In term 'nodefactor' in package 'ergm': Levels '2' and '11' are tied. Using the order given.")
+
+  expect_no_warning(expect_equal(summary(flomarriage ~ nodefactor("x" %>% COLLAPSE_SMALLEST(3, 5), levels=TRUE)),
+                                 c(nodefactor.x.3 = 15, nodefactor.x.5 = 25)))
+
+  expect_warning(expect_warning(expect_equal(summary(flomarriage ~ nodefactor("x" %>% COLLAPSE_SMALLEST(2, 5), levels=SMALLEST(2))),
+                                             c(nodefactor.x.3 = 15, nodefactor.x.11 = 12)),
+                                "In term 'nodefactor' in package 'ergm': Levels '2' and '11' are tied. Using the order given."),
+                 "In term 'nodefactor' in package 'ergm': Levels '3' and '5' are tied. Using the order given.")
+})
+
+options(o)