From 1bdc76e75486e17db518df8a6d09566df3c71aed Mon Sep 17 00:00:00 2001
From: "Pavel N. Krivitsky"
Date: Tue, 2 Jan 2024 22:06:59 +1100
Subject: [PATCH] Fixed the wrong categories being selected for
COLLAPSE_SMALLEST() and added lexicographic tie breaking with warnings for
when categories passed to LARGEST, SMALLEST, and COLLAPSE_SMALLEST have equal
sizes.
fixes statnet/ergm#544, statnet/ergm#545
---
DESCRIPTION | 4 +-
R/get.node.attr.R | 52 ++++++++++++----
man/nodal_attributes.Rd | 14 +++--
tests/testthat/test-level-select.R | 97 ++++++++++++++++++++++++++++++
4 files changed, 149 insertions(+), 18 deletions(-)
create mode 100644 tests/testthat/test-level-select.R
diff --git a/DESCRIPTION b/DESCRIPTION
index dec1c66b3..6f92420fa 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,6 +1,6 @@
Package: ergm
-Version: 4.6-7286
-Date: 2023-12-17
+Version: 4.6-7289
+Date: 2024-01-02
Title: Fit, Simulate and Diagnose Exponential-Family Models for Networks
Authors@R: c(
person(c("Mark", "S."), "Handcock", role=c("aut"), email="handcock@stat.ucla.edu"),
diff --git a/R/get.node.attr.R b/R/get.node.attr.R
index 0b88ff648..bdc5e3063 100644
--- a/R/get.node.attr.R
+++ b/R/get.node.attr.R
@@ -116,7 +116,8 @@ get.node.attr <- function(nw, attrname, functionname=NULL, numeric=FALSE) {
#' transform the attribute by collapsing the smallest `n` categories
#' into one, naming it `into`. Note that `into` must be of the same
#' type (numeric, character, etc.) as the vertex attribute in
-#' question.
+#' question. If there are ties for `n`th smallest category, they will
+#' be broken in lexicographic order, and a warning will be issued.
#'
#' The name the nodal attribute receives in the statistic can be
#' overridden by setting a an [attr()]-style attribute `"name"`.
@@ -140,9 +141,10 @@ get.node.attr <- function(nw, attrname, functionname=NULL, numeric=FALSE) {
#' is `LARGEST`, which will refer to the most frequent category, so,
#' say, to set such a category as the baseline, pass
#' `levels=-LARGEST`. In addition, `LARGEST(n)` will refer to the `n`
-#' largest categories. `SMALLEST` works analogously. Note that if there
-#' are ties in frequencies, they will be broken arbitrarily. To
-#' specify numeric or logical levels literally, wrap in [I()].}
+#' largest categories. `SMALLEST` works analogously. If there are ties
+#' in frequencies, they will be broken in lexicographic order, and a
+#' warning will be issued. To specify numeric or logical levels
+#' literally, wrap in [I()].}
#'
#'\item{[`NULL`]}{Retain all possible levels; usually equivalent to
#' passing `TRUE`.}
@@ -192,6 +194,10 @@ get.node.attr <- function(nw, attrname, functionname=NULL, numeric=FALSE) {
#' table(faux.mesa.high %v% "Grade")
#' summary(faux.mesa.high~nodefactor((~Grade) %>% COLLAPSE_SMALLEST(2, 0),
#' levels=TRUE))
+#'
+#' # Handling of tied frequencies
+#' faux.mesa.high %v% "Plans" <- sample(rep(c("College", "Trade School", "Apprenticeship", "Undecided"), c(80,80,20,25)))
+#' summary(faux.mesa.high ~ nodefactor("Plans", levels = -LARGEST))
#'
#' # Mixing between lower and upper grades:
#' summary(faux.mesa.high~mm(~Grade>=10))
@@ -652,14 +658,38 @@ ERGM_VATTR_SPEC_NULL <- "function,formula,character,AsIs,NULL"
#' @export
ERGM_LEVELS_SPEC <- "function,formula,character,numeric,logical,AsIs,NULL,matrix"
+rank_cut <- function(x, n, tie_action = c("warning", "error"), top = FALSE){
+ ordrank <- if(top) function(r) length(x) + 1 - r else identity
+ s1 <- ordrank(rank(x, ties.method="min")) <= n
+ s2 <- ordrank(rank(x, ties.method="max")) <= n
+
+ if(identical(s1, s2)) which(s1)
+ else{
+ tie_action <- match.arg(tie_action)
+ msg <- paste0("Levels ", paste.and(sQuote(names(x)[s1!=s2])), " are tied.")
+ switch(tie_action,
+ error = ergm_Init_abort(msg, " Specify explicitly."),
+ warning = {
+ ergm_Init_warn(msg, " Using the order given.")
+ which(ordrank(rank(x, ties.method="first")) <= n)
+ })
+ }
+}
+
+levels_cut <- function(x, n, lvls = sort(unique(x)), top = FALSE, ...){
+ f <- setNames(tabulate(match(x, lvls)), lvls)
+ sel <- rank_cut(f, n, top=top, ...)
+ if(missing(lvls)) lvls[sel] else sel
+}
+
#' @rdname nodal_attributes
#' @export
LARGEST <- structure(function(l, a){
- if(!missing(a)) which.max(tabulate(match(a, l))) # passed as levels=LARGEST
+ if(!missing(a)) levels_cut(a, 1, l, top=TRUE) # passed as levels=LARGEST
else{ # passed as levels=LARGEST(n): return a function
n <- l
structure(function(l, a){
- which(order(tabulate(match(a,l)), decreasing=TRUE)<=n)
+ levels_cut(a, n, l, top=TRUE)
}, class = c("ergm_levels_spec_function", "function"))
}
}, class = c("ergm_levels_spec_function", "function"))
@@ -667,11 +697,11 @@ LARGEST <- structure(function(l, a){
#' @rdname nodal_attributes
#' @export
SMALLEST <- structure(function(l, a){
- if(!missing(a)) which.min(tabulate(match(a, l))) # passed as levels=SMALLEST
+ if(!missing(a)) levels_cut(a, 1, l) # passed as levels=SMALLEST
else{ # passed as levels=SMALLEST(n): return a function
n <- l
structure(function(l, a){
- which(order(tabulate(match(a,l)), decreasing=FALSE)<=n)
+ levels_cut(a, n, l)
}, class = c("ergm_levels_spec_function", "function"))
}
}, class = c("ergm_levels_spec_function", "function"))
@@ -698,10 +728,8 @@ COLLAPSE_SMALLEST <- function(object, n, into){
attr <- object
function(...){
vattr <- ergm_get_vattr(attr, ...)
- lvls <- unique(vattr)
- vattr.codes <- match(vattr,lvls)
- smallest <- which(order(tabulate(vattr.codes), decreasing=FALSE)<=n)
- vattr[vattr.codes %in% smallest] <- into
+ smallest <- levels_cut(vattr, n)
+ vattr[vattr %in% smallest] <- into
vattr
}
}
diff --git a/man/nodal_attributes.Rd b/man/nodal_attributes.Rd
index c4d56ecd2..04aac3378 100644
--- a/man/nodal_attributes.Rd
+++ b/man/nodal_attributes.Rd
@@ -78,7 +78,8 @@ Any of these arguments may also be wrapped in or piped through
transform the attribute by collapsing the smallest \code{n} categories
into one, naming it \code{into}. Note that \code{into} must be of the same
type (numeric, character, etc.) as the vertex attribute in
-question.
+question. If there are ties for \code{n}th smallest category, they will
+be broken in lexicographic order, and a warning will be issued.
The name the nodal attribute receives in the statistic can be
overridden by setting a an \code{\link[=attr]{attr()}}-style attribute \code{"name"}.
@@ -104,9 +105,10 @@ retain all levels. Negative values exclude. Another special value
is \code{LARGEST}, which will refer to the most frequent category, so,
say, to set such a category as the baseline, pass
\code{levels=-LARGEST}. In addition, \code{LARGEST(n)} will refer to the \code{n}
-largest categories. \code{SMALLEST} works analogously. Note that if there
-are ties in frequencies, they will be broken arbitrarily. To
-specify numeric or logical levels literally, wrap in \code{\link[=I]{I()}}.}
+largest categories. \code{SMALLEST} works analogously. If there are ties
+in frequencies, they will be broken in lexicographic order, and a
+warning will be issued. To specify numeric or logical levels
+literally, wrap in \code{\link[=I]{I()}}.}
\item{\code{\link{NULL}}}{Retain all possible levels; usually equivalent to
passing \code{TRUE}.}
@@ -157,6 +159,10 @@ table(faux.mesa.high \%v\% "Grade")
summary(faux.mesa.high~nodefactor((~Grade) \%>\% COLLAPSE_SMALLEST(2, 0),
levels=TRUE))
+# Handling of tied frequencies
+faux.mesa.high \%v\% "Plans" <- sample(rep(c("College", "Trade School", "Apprenticeship", "Undecided"), c(80,80,20,25)))
+summary(faux.mesa.high ~ nodefactor("Plans", levels = -LARGEST))
+
# Mixing between lower and upper grades:
summary(faux.mesa.high~mm(~Grade>=10))
# Mixing between grades 7 and 8 only:
diff --git a/tests/testthat/test-level-select.R b/tests/testthat/test-level-select.R
new file mode 100644
index 000000000..f9b35e96f
--- /dev/null
+++ b/tests/testthat/test-level-select.R
@@ -0,0 +1,97 @@
+o <- options(useFancyQuotes=FALSE)
+
+set.seed(123) # Need stable randomization.
+data(florentine)
+flomarriage %v% "x" <- sample(c(1,11,2,3), 16, replace=TRUE) ## 11 tests for numeric rather than alphabetical sorting.
+
+test_that("Nodal attribute level initialization and sorting", {
+ expect_equal(summary(flomarriage ~ nodefactor("x", levels=TRUE)),
+ c(nodefactor.x.1 = 3, nodefactor.x.2 = 18, nodefactor.x.3 = 3, nodefactor.x.11 = 16))
+})
+
+test_that("Selecting the smallest and largest categories", {
+ expect_equal(summary(flomarriage ~ nodefactor("x", levels=SMALLEST)),
+ c(nodefactor.x.3 = 3))
+
+ expect_equal(summary(flomarriage ~ nodefactor("x", levels=SMALLEST(2))),
+ c(nodefactor.x.1 = 3, nodefactor.x.3 = 3))
+
+ expect_equal(summary(flomarriage ~ nodefactor("x", levels=LARGEST)),
+ c(nodefactor.x.11 = 16))
+
+ expect_equal(summary(flomarriage ~ nodefactor("x", levels=LARGEST(2))),
+ c(nodefactor.x.2 = 18, nodefactor.x.11 = 16))
+})
+
+
+test_that("Selector negation", {
+ expect_equal(summary(flomarriage ~ nodefactor("x", levels=-SMALLEST)),
+ c(nodefactor.x.1 = 3, nodefactor.x.2 = 18, nodefactor.x.11 = 16))
+
+ expect_equal(summary(flomarriage ~ nodefactor("x", levels=-SMALLEST(2))),
+ c(nodefactor.x.2 = 18, nodefactor.x.11 = 16))
+
+ expect_equal(summary(flomarriage ~ nodefactor("x", levels=-LARGEST)),
+ c(nodefactor.x.1 = 3, nodefactor.x.2 = 18, nodefactor.x.3 = 3))
+
+ expect_equal(summary(flomarriage ~ nodefactor("x", levels=-LARGEST(2))),
+ c(nodefactor.x.1 = 3, nodefactor.x.3 = 3))
+})
+
+test_that("Collapsing categories", {
+ expect_equal(summary(flomarriage ~ nodefactor("x" %>% COLLAPSE_SMALLEST(2, 5), levels=TRUE)),
+ c(nodefactor.x.2 = 18, nodefactor.x.5 = 6, nodefactor.x.11 = 16))
+
+ expect_equal(summary(flomarriage ~ nodefactor("x" %>% COLLAPSE_SMALLEST(3, 5), levels=TRUE)),
+ c(nodefactor.x.5 = 24, nodefactor.x.11 = 16))
+
+ expect_equal(summary(flomarriage ~ nodefactor("x" %>% COLLAPSE_SMALLEST(2, 5), levels=SMALLEST(2))),
+ c(nodefactor.x.2 = 18, nodefactor.x.5 = 6))
+})
+
+## Tied categories
+
+set.seed(789) # Need stable randomization.
+data(florentine)
+flomarriage %v% "x" <- sample(c(1,11,2,3), 16, replace=TRUE) ## 11 tests for numeric rather than alphabetical sorting.
+
+test_that("Tied categories nodal attribute level initialization and sorting", {
+ expect_equal(summary(flomarriage ~ nodefactor("x", levels=TRUE)),
+ c(nodefactor.x.1 = 2, nodefactor.x.2 = 11, nodefactor.x.3 = 15, nodefactor.x.11 = 12))
+})
+
+test_that("Tied categories selecting the smallest and largest", {
+ expect_no_warning(expect_equal(summary(flomarriage ~ nodefactor("x", levels=SMALLEST(1))),
+ c(nodefactor.x.1 = 2)))
+
+ expect_warning(expect_equal(summary(flomarriage ~ nodefactor("x", levels=SMALLEST(2))),
+ c(nodefactor.x.1 = 2, nodefactor.x.2 = 11)),
+ "In term 'nodefactor' in package 'ergm': Levels '2' and '11' are tied. Using the order given.")
+
+ expect_no_warning(expect_equal(summary(flomarriage ~ nodefactor("x", levels=SMALLEST(3))),
+ c(nodefactor.x.1 = 2, nodefactor.x.2 = 11, nodefactor.x.11 = 12)))
+
+ expect_equal(summary(flomarriage ~ nodefactor("x", levels=LARGEST)),
+ c(nodefactor.x.3 = 15))
+
+ expect_warning(expect_equal(summary(flomarriage ~ nodefactor("x", levels=LARGEST(2))),
+ c(nodefactor.x.3 = 15, nodefactor.x.11 = 12)),
+ "In term 'nodefactor' in package 'ergm': Levels '2' and '11' are tied. Using the order given.")
+})
+
+
+test_that("Collapsing categories", {
+ expect_warning(expect_equal(summary(flomarriage ~ nodefactor("x" %>% COLLAPSE_SMALLEST(2, 5), levels=TRUE)),
+ c(nodefactor.x.3 = 15, nodefactor.x.5 = 13, nodefactor.x.11 = 12)),
+ "In term 'nodefactor' in package 'ergm': Levels '2' and '11' are tied. Using the order given.")
+
+ expect_no_warning(expect_equal(summary(flomarriage ~ nodefactor("x" %>% COLLAPSE_SMALLEST(3, 5), levels=TRUE)),
+ c(nodefactor.x.3 = 15, nodefactor.x.5 = 25)))
+
+ expect_warning(expect_warning(expect_equal(summary(flomarriage ~ nodefactor("x" %>% COLLAPSE_SMALLEST(2, 5), levels=SMALLEST(2))),
+ c(nodefactor.x.3 = 15, nodefactor.x.11 = 12)),
+ "In term 'nodefactor' in package 'ergm': Levels '2' and '11' are tied. Using the order given."),
+ "In term 'nodefactor' in package 'ergm': Levels '3' and '5' are tied. Using the order given.")
+})
+
+options(o)