From 1bdc76e75486e17db518df8a6d09566df3c71aed Mon Sep 17 00:00:00 2001 From: "Pavel N. Krivitsky" Date: Tue, 2 Jan 2024 22:06:59 +1100 Subject: [PATCH] Fixed the wrong categories being selected for COLLAPSE_SMALLEST() and added lexicographic tie breaking with warnings for when categories passed to LARGEST, SMALLEST, and COLLAPSE_SMALLEST have equal sizes. fixes statnet/ergm#544, statnet/ergm#545 --- DESCRIPTION | 4 +- R/get.node.attr.R | 52 ++++++++++++---- man/nodal_attributes.Rd | 14 +++-- tests/testthat/test-level-select.R | 97 ++++++++++++++++++++++++++++++ 4 files changed, 149 insertions(+), 18 deletions(-) create mode 100644 tests/testthat/test-level-select.R diff --git a/DESCRIPTION b/DESCRIPTION index dec1c66b3..6f92420fa 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: ergm -Version: 4.6-7286 -Date: 2023-12-17 +Version: 4.6-7289 +Date: 2024-01-02 Title: Fit, Simulate and Diagnose Exponential-Family Models for Networks Authors@R: c( person(c("Mark", "S."), "Handcock", role=c("aut"), email="handcock@stat.ucla.edu"), diff --git a/R/get.node.attr.R b/R/get.node.attr.R index 0b88ff648..bdc5e3063 100644 --- a/R/get.node.attr.R +++ b/R/get.node.attr.R @@ -116,7 +116,8 @@ get.node.attr <- function(nw, attrname, functionname=NULL, numeric=FALSE) { #' transform the attribute by collapsing the smallest `n` categories #' into one, naming it `into`. Note that `into` must be of the same #' type (numeric, character, etc.) as the vertex attribute in -#' question. +#' question. If there are ties for `n`th smallest category, they will +#' be broken in lexicographic order, and a warning will be issued. #' #' The name the nodal attribute receives in the statistic can be #' overridden by setting a an [attr()]-style attribute `"name"`. @@ -140,9 +141,10 @@ get.node.attr <- function(nw, attrname, functionname=NULL, numeric=FALSE) { #' is `LARGEST`, which will refer to the most frequent category, so, #' say, to set such a category as the baseline, pass #' `levels=-LARGEST`. In addition, `LARGEST(n)` will refer to the `n` -#' largest categories. `SMALLEST` works analogously. Note that if there -#' are ties in frequencies, they will be broken arbitrarily. To -#' specify numeric or logical levels literally, wrap in [I()].} +#' largest categories. `SMALLEST` works analogously. If there are ties +#' in frequencies, they will be broken in lexicographic order, and a +#' warning will be issued. To specify numeric or logical levels +#' literally, wrap in [I()].} #' #'\item{[`NULL`]}{Retain all possible levels; usually equivalent to #' passing `TRUE`.} @@ -192,6 +194,10 @@ get.node.attr <- function(nw, attrname, functionname=NULL, numeric=FALSE) { #' table(faux.mesa.high %v% "Grade") #' summary(faux.mesa.high~nodefactor((~Grade) %>% COLLAPSE_SMALLEST(2, 0), #' levels=TRUE)) +#' +#' # Handling of tied frequencies +#' faux.mesa.high %v% "Plans" <- sample(rep(c("College", "Trade School", "Apprenticeship", "Undecided"), c(80,80,20,25))) +#' summary(faux.mesa.high ~ nodefactor("Plans", levels = -LARGEST)) #' #' # Mixing between lower and upper grades: #' summary(faux.mesa.high~mm(~Grade>=10)) @@ -652,14 +658,38 @@ ERGM_VATTR_SPEC_NULL <- "function,formula,character,AsIs,NULL" #' @export ERGM_LEVELS_SPEC <- "function,formula,character,numeric,logical,AsIs,NULL,matrix" +rank_cut <- function(x, n, tie_action = c("warning", "error"), top = FALSE){ + ordrank <- if(top) function(r) length(x) + 1 - r else identity + s1 <- ordrank(rank(x, ties.method="min")) <= n + s2 <- ordrank(rank(x, ties.method="max")) <= n + + if(identical(s1, s2)) which(s1) + else{ + tie_action <- match.arg(tie_action) + msg <- paste0("Levels ", paste.and(sQuote(names(x)[s1!=s2])), " are tied.") + switch(tie_action, + error = ergm_Init_abort(msg, " Specify explicitly."), + warning = { + ergm_Init_warn(msg, " Using the order given.") + which(ordrank(rank(x, ties.method="first")) <= n) + }) + } +} + +levels_cut <- function(x, n, lvls = sort(unique(x)), top = FALSE, ...){ + f <- setNames(tabulate(match(x, lvls)), lvls) + sel <- rank_cut(f, n, top=top, ...) + if(missing(lvls)) lvls[sel] else sel +} + #' @rdname nodal_attributes #' @export LARGEST <- structure(function(l, a){ - if(!missing(a)) which.max(tabulate(match(a, l))) # passed as levels=LARGEST + if(!missing(a)) levels_cut(a, 1, l, top=TRUE) # passed as levels=LARGEST else{ # passed as levels=LARGEST(n): return a function n <- l structure(function(l, a){ - which(order(tabulate(match(a,l)), decreasing=TRUE)<=n) + levels_cut(a, n, l, top=TRUE) }, class = c("ergm_levels_spec_function", "function")) } }, class = c("ergm_levels_spec_function", "function")) @@ -667,11 +697,11 @@ LARGEST <- structure(function(l, a){ #' @rdname nodal_attributes #' @export SMALLEST <- structure(function(l, a){ - if(!missing(a)) which.min(tabulate(match(a, l))) # passed as levels=SMALLEST + if(!missing(a)) levels_cut(a, 1, l) # passed as levels=SMALLEST else{ # passed as levels=SMALLEST(n): return a function n <- l structure(function(l, a){ - which(order(tabulate(match(a,l)), decreasing=FALSE)<=n) + levels_cut(a, n, l) }, class = c("ergm_levels_spec_function", "function")) } }, class = c("ergm_levels_spec_function", "function")) @@ -698,10 +728,8 @@ COLLAPSE_SMALLEST <- function(object, n, into){ attr <- object function(...){ vattr <- ergm_get_vattr(attr, ...) - lvls <- unique(vattr) - vattr.codes <- match(vattr,lvls) - smallest <- which(order(tabulate(vattr.codes), decreasing=FALSE)<=n) - vattr[vattr.codes %in% smallest] <- into + smallest <- levels_cut(vattr, n) + vattr[vattr %in% smallest] <- into vattr } } diff --git a/man/nodal_attributes.Rd b/man/nodal_attributes.Rd index c4d56ecd2..04aac3378 100644 --- a/man/nodal_attributes.Rd +++ b/man/nodal_attributes.Rd @@ -78,7 +78,8 @@ Any of these arguments may also be wrapped in or piped through transform the attribute by collapsing the smallest \code{n} categories into one, naming it \code{into}. Note that \code{into} must be of the same type (numeric, character, etc.) as the vertex attribute in -question. +question. If there are ties for \code{n}th smallest category, they will +be broken in lexicographic order, and a warning will be issued. The name the nodal attribute receives in the statistic can be overridden by setting a an \code{\link[=attr]{attr()}}-style attribute \code{"name"}. @@ -104,9 +105,10 @@ retain all levels. Negative values exclude. Another special value is \code{LARGEST}, which will refer to the most frequent category, so, say, to set such a category as the baseline, pass \code{levels=-LARGEST}. In addition, \code{LARGEST(n)} will refer to the \code{n} -largest categories. \code{SMALLEST} works analogously. Note that if there -are ties in frequencies, they will be broken arbitrarily. To -specify numeric or logical levels literally, wrap in \code{\link[=I]{I()}}.} +largest categories. \code{SMALLEST} works analogously. If there are ties +in frequencies, they will be broken in lexicographic order, and a +warning will be issued. To specify numeric or logical levels +literally, wrap in \code{\link[=I]{I()}}.} \item{\code{\link{NULL}}}{Retain all possible levels; usually equivalent to passing \code{TRUE}.} @@ -157,6 +159,10 @@ table(faux.mesa.high \%v\% "Grade") summary(faux.mesa.high~nodefactor((~Grade) \%>\% COLLAPSE_SMALLEST(2, 0), levels=TRUE)) +# Handling of tied frequencies +faux.mesa.high \%v\% "Plans" <- sample(rep(c("College", "Trade School", "Apprenticeship", "Undecided"), c(80,80,20,25))) +summary(faux.mesa.high ~ nodefactor("Plans", levels = -LARGEST)) + # Mixing between lower and upper grades: summary(faux.mesa.high~mm(~Grade>=10)) # Mixing between grades 7 and 8 only: diff --git a/tests/testthat/test-level-select.R b/tests/testthat/test-level-select.R new file mode 100644 index 000000000..f9b35e96f --- /dev/null +++ b/tests/testthat/test-level-select.R @@ -0,0 +1,97 @@ +o <- options(useFancyQuotes=FALSE) + +set.seed(123) # Need stable randomization. +data(florentine) +flomarriage %v% "x" <- sample(c(1,11,2,3), 16, replace=TRUE) ## 11 tests for numeric rather than alphabetical sorting. + +test_that("Nodal attribute level initialization and sorting", { + expect_equal(summary(flomarriage ~ nodefactor("x", levels=TRUE)), + c(nodefactor.x.1 = 3, nodefactor.x.2 = 18, nodefactor.x.3 = 3, nodefactor.x.11 = 16)) +}) + +test_that("Selecting the smallest and largest categories", { + expect_equal(summary(flomarriage ~ nodefactor("x", levels=SMALLEST)), + c(nodefactor.x.3 = 3)) + + expect_equal(summary(flomarriage ~ nodefactor("x", levels=SMALLEST(2))), + c(nodefactor.x.1 = 3, nodefactor.x.3 = 3)) + + expect_equal(summary(flomarriage ~ nodefactor("x", levels=LARGEST)), + c(nodefactor.x.11 = 16)) + + expect_equal(summary(flomarriage ~ nodefactor("x", levels=LARGEST(2))), + c(nodefactor.x.2 = 18, nodefactor.x.11 = 16)) +}) + + +test_that("Selector negation", { + expect_equal(summary(flomarriage ~ nodefactor("x", levels=-SMALLEST)), + c(nodefactor.x.1 = 3, nodefactor.x.2 = 18, nodefactor.x.11 = 16)) + + expect_equal(summary(flomarriage ~ nodefactor("x", levels=-SMALLEST(2))), + c(nodefactor.x.2 = 18, nodefactor.x.11 = 16)) + + expect_equal(summary(flomarriage ~ nodefactor("x", levels=-LARGEST)), + c(nodefactor.x.1 = 3, nodefactor.x.2 = 18, nodefactor.x.3 = 3)) + + expect_equal(summary(flomarriage ~ nodefactor("x", levels=-LARGEST(2))), + c(nodefactor.x.1 = 3, nodefactor.x.3 = 3)) +}) + +test_that("Collapsing categories", { + expect_equal(summary(flomarriage ~ nodefactor("x" %>% COLLAPSE_SMALLEST(2, 5), levels=TRUE)), + c(nodefactor.x.2 = 18, nodefactor.x.5 = 6, nodefactor.x.11 = 16)) + + expect_equal(summary(flomarriage ~ nodefactor("x" %>% COLLAPSE_SMALLEST(3, 5), levels=TRUE)), + c(nodefactor.x.5 = 24, nodefactor.x.11 = 16)) + + expect_equal(summary(flomarriage ~ nodefactor("x" %>% COLLAPSE_SMALLEST(2, 5), levels=SMALLEST(2))), + c(nodefactor.x.2 = 18, nodefactor.x.5 = 6)) +}) + +## Tied categories + +set.seed(789) # Need stable randomization. +data(florentine) +flomarriage %v% "x" <- sample(c(1,11,2,3), 16, replace=TRUE) ## 11 tests for numeric rather than alphabetical sorting. + +test_that("Tied categories nodal attribute level initialization and sorting", { + expect_equal(summary(flomarriage ~ nodefactor("x", levels=TRUE)), + c(nodefactor.x.1 = 2, nodefactor.x.2 = 11, nodefactor.x.3 = 15, nodefactor.x.11 = 12)) +}) + +test_that("Tied categories selecting the smallest and largest", { + expect_no_warning(expect_equal(summary(flomarriage ~ nodefactor("x", levels=SMALLEST(1))), + c(nodefactor.x.1 = 2))) + + expect_warning(expect_equal(summary(flomarriage ~ nodefactor("x", levels=SMALLEST(2))), + c(nodefactor.x.1 = 2, nodefactor.x.2 = 11)), + "In term 'nodefactor' in package 'ergm': Levels '2' and '11' are tied. Using the order given.") + + expect_no_warning(expect_equal(summary(flomarriage ~ nodefactor("x", levels=SMALLEST(3))), + c(nodefactor.x.1 = 2, nodefactor.x.2 = 11, nodefactor.x.11 = 12))) + + expect_equal(summary(flomarriage ~ nodefactor("x", levels=LARGEST)), + c(nodefactor.x.3 = 15)) + + expect_warning(expect_equal(summary(flomarriage ~ nodefactor("x", levels=LARGEST(2))), + c(nodefactor.x.3 = 15, nodefactor.x.11 = 12)), + "In term 'nodefactor' in package 'ergm': Levels '2' and '11' are tied. Using the order given.") +}) + + +test_that("Collapsing categories", { + expect_warning(expect_equal(summary(flomarriage ~ nodefactor("x" %>% COLLAPSE_SMALLEST(2, 5), levels=TRUE)), + c(nodefactor.x.3 = 15, nodefactor.x.5 = 13, nodefactor.x.11 = 12)), + "In term 'nodefactor' in package 'ergm': Levels '2' and '11' are tied. Using the order given.") + + expect_no_warning(expect_equal(summary(flomarriage ~ nodefactor("x" %>% COLLAPSE_SMALLEST(3, 5), levels=TRUE)), + c(nodefactor.x.3 = 15, nodefactor.x.5 = 25))) + + expect_warning(expect_warning(expect_equal(summary(flomarriage ~ nodefactor("x" %>% COLLAPSE_SMALLEST(2, 5), levels=SMALLEST(2))), + c(nodefactor.x.3 = 15, nodefactor.x.11 = 12)), + "In term 'nodefactor' in package 'ergm': Levels '2' and '11' are tied. Using the order given."), + "In term 'nodefactor' in package 'ergm': Levels '3' and '5' are tied. Using the order given.") +}) + +options(o)