Skip to content

Commit

Permalink
Fixed the wrong categories being selected for COLLAPSE_SMALLEST() and…
Browse files Browse the repository at this point in the history
… added lexicographic tie breaking with warnings for when categories passed to LARGEST, SMALLEST, and COLLAPSE_SMALLEST have equal sizes.

fixes #544, #545
  • Loading branch information
krivit committed Jan 2, 2024
1 parent b25558f commit 1bdc76e
Show file tree
Hide file tree
Showing 4 changed files with 149 additions and 18 deletions.
4 changes: 2 additions & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Package: ergm
Version: 4.6-7286
Date: 2023-12-17
Version: 4.6-7289
Date: 2024-01-02
Title: Fit, Simulate and Diagnose Exponential-Family Models for Networks
Authors@R: c(
person(c("Mark", "S."), "Handcock", role=c("aut"), email="[email protected]"),
Expand Down
52 changes: 40 additions & 12 deletions R/get.node.attr.R
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,8 @@ get.node.attr <- function(nw, attrname, functionname=NULL, numeric=FALSE) {
#' transform the attribute by collapsing the smallest `n` categories
#' into one, naming it `into`. Note that `into` must be of the same
#' type (numeric, character, etc.) as the vertex attribute in
#' question.
#' question. If there are ties for `n`th smallest category, they will
#' be broken in lexicographic order, and a warning will be issued.
#'
#' The name the nodal attribute receives in the statistic can be
#' overridden by setting a an [attr()]-style attribute `"name"`.
Expand All @@ -140,9 +141,10 @@ get.node.attr <- function(nw, attrname, functionname=NULL, numeric=FALSE) {
#' is `LARGEST`, which will refer to the most frequent category, so,
#' say, to set such a category as the baseline, pass
#' `levels=-LARGEST`. In addition, `LARGEST(n)` will refer to the `n`
#' largest categories. `SMALLEST` works analogously. Note that if there
#' are ties in frequencies, they will be broken arbitrarily. To
#' specify numeric or logical levels literally, wrap in [I()].}
#' largest categories. `SMALLEST` works analogously. If there are ties
#' in frequencies, they will be broken in lexicographic order, and a
#' warning will be issued. To specify numeric or logical levels
#' literally, wrap in [I()].}
#'
#'\item{[`NULL`]}{Retain all possible levels; usually equivalent to
#' passing `TRUE`.}
Expand Down Expand Up @@ -192,6 +194,10 @@ get.node.attr <- function(nw, attrname, functionname=NULL, numeric=FALSE) {
#' table(faux.mesa.high %v% "Grade")
#' summary(faux.mesa.high~nodefactor((~Grade) %>% COLLAPSE_SMALLEST(2, 0),
#' levels=TRUE))
#'
#' # Handling of tied frequencies
#' faux.mesa.high %v% "Plans" <- sample(rep(c("College", "Trade School", "Apprenticeship", "Undecided"), c(80,80,20,25)))
#' summary(faux.mesa.high ~ nodefactor("Plans", levels = -LARGEST))
#'
#' # Mixing between lower and upper grades:
#' summary(faux.mesa.high~mm(~Grade>=10))
Expand Down Expand Up @@ -652,26 +658,50 @@ ERGM_VATTR_SPEC_NULL <- "function,formula,character,AsIs,NULL"
#' @export
ERGM_LEVELS_SPEC <- "function,formula,character,numeric,logical,AsIs,NULL,matrix"

rank_cut <- function(x, n, tie_action = c("warning", "error"), top = FALSE){
ordrank <- if(top) function(r) length(x) + 1 - r else identity
s1 <- ordrank(rank(x, ties.method="min")) <= n
s2 <- ordrank(rank(x, ties.method="max")) <= n

if(identical(s1, s2)) which(s1)
else{
tie_action <- match.arg(tie_action)
msg <- paste0("Levels ", paste.and(sQuote(names(x)[s1!=s2])), " are tied.")
switch(tie_action,
error = ergm_Init_abort(msg, " Specify explicitly."),
warning = {
ergm_Init_warn(msg, " Using the order given.")
which(ordrank(rank(x, ties.method="first")) <= n)
})
}
}

levels_cut <- function(x, n, lvls = sort(unique(x)), top = FALSE, ...){
f <- setNames(tabulate(match(x, lvls)), lvls)
sel <- rank_cut(f, n, top=top, ...)
if(missing(lvls)) lvls[sel] else sel
}

#' @rdname nodal_attributes
#' @export
LARGEST <- structure(function(l, a){
if(!missing(a)) which.max(tabulate(match(a, l))) # passed as levels=LARGEST
if(!missing(a)) levels_cut(a, 1, l, top=TRUE) # passed as levels=LARGEST
else{ # passed as levels=LARGEST(n): return a function
n <- l
structure(function(l, a){
which(order(tabulate(match(a,l)), decreasing=TRUE)<=n)
levels_cut(a, n, l, top=TRUE)
}, class = c("ergm_levels_spec_function", "function"))
}
}, class = c("ergm_levels_spec_function", "function"))

#' @rdname nodal_attributes
#' @export
SMALLEST <- structure(function(l, a){
if(!missing(a)) which.min(tabulate(match(a, l))) # passed as levels=SMALLEST
if(!missing(a)) levels_cut(a, 1, l) # passed as levels=SMALLEST
else{ # passed as levels=SMALLEST(n): return a function
n <- l
structure(function(l, a){
which(order(tabulate(match(a,l)), decreasing=FALSE)<=n)
levels_cut(a, n, l)
}, class = c("ergm_levels_spec_function", "function"))
}
}, class = c("ergm_levels_spec_function", "function"))
Expand All @@ -698,10 +728,8 @@ COLLAPSE_SMALLEST <- function(object, n, into){
attr <- object
function(...){
vattr <- ergm_get_vattr(attr, ...)
lvls <- unique(vattr)
vattr.codes <- match(vattr,lvls)
smallest <- which(order(tabulate(vattr.codes), decreasing=FALSE)<=n)
vattr[vattr.codes %in% smallest] <- into
smallest <- levels_cut(vattr, n)
vattr[vattr %in% smallest] <- into
vattr
}
}
14 changes: 10 additions & 4 deletions man/nodal_attributes.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

97 changes: 97 additions & 0 deletions tests/testthat/test-level-select.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
o <- options(useFancyQuotes=FALSE)

set.seed(123) # Need stable randomization.
data(florentine)
flomarriage %v% "x" <- sample(c(1,11,2,3), 16, replace=TRUE) ## 11 tests for numeric rather than alphabetical sorting.

test_that("Nodal attribute level initialization and sorting", {
expect_equal(summary(flomarriage ~ nodefactor("x", levels=TRUE)),
c(nodefactor.x.1 = 3, nodefactor.x.2 = 18, nodefactor.x.3 = 3, nodefactor.x.11 = 16))
})

test_that("Selecting the smallest and largest categories", {
expect_equal(summary(flomarriage ~ nodefactor("x", levels=SMALLEST)),
c(nodefactor.x.3 = 3))

expect_equal(summary(flomarriage ~ nodefactor("x", levels=SMALLEST(2))),
c(nodefactor.x.1 = 3, nodefactor.x.3 = 3))

expect_equal(summary(flomarriage ~ nodefactor("x", levels=LARGEST)),
c(nodefactor.x.11 = 16))

expect_equal(summary(flomarriage ~ nodefactor("x", levels=LARGEST(2))),
c(nodefactor.x.2 = 18, nodefactor.x.11 = 16))
})


test_that("Selector negation", {
expect_equal(summary(flomarriage ~ nodefactor("x", levels=-SMALLEST)),
c(nodefactor.x.1 = 3, nodefactor.x.2 = 18, nodefactor.x.11 = 16))

expect_equal(summary(flomarriage ~ nodefactor("x", levels=-SMALLEST(2))),
c(nodefactor.x.2 = 18, nodefactor.x.11 = 16))

expect_equal(summary(flomarriage ~ nodefactor("x", levels=-LARGEST)),
c(nodefactor.x.1 = 3, nodefactor.x.2 = 18, nodefactor.x.3 = 3))

expect_equal(summary(flomarriage ~ nodefactor("x", levels=-LARGEST(2))),
c(nodefactor.x.1 = 3, nodefactor.x.3 = 3))
})

test_that("Collapsing categories", {
expect_equal(summary(flomarriage ~ nodefactor("x" %>% COLLAPSE_SMALLEST(2, 5), levels=TRUE)),
c(nodefactor.x.2 = 18, nodefactor.x.5 = 6, nodefactor.x.11 = 16))

expect_equal(summary(flomarriage ~ nodefactor("x" %>% COLLAPSE_SMALLEST(3, 5), levels=TRUE)),
c(nodefactor.x.5 = 24, nodefactor.x.11 = 16))

expect_equal(summary(flomarriage ~ nodefactor("x" %>% COLLAPSE_SMALLEST(2, 5), levels=SMALLEST(2))),
c(nodefactor.x.2 = 18, nodefactor.x.5 = 6))
})

## Tied categories

set.seed(789) # Need stable randomization.
data(florentine)
flomarriage %v% "x" <- sample(c(1,11,2,3), 16, replace=TRUE) ## 11 tests for numeric rather than alphabetical sorting.

test_that("Tied categories nodal attribute level initialization and sorting", {
expect_equal(summary(flomarriage ~ nodefactor("x", levels=TRUE)),
c(nodefactor.x.1 = 2, nodefactor.x.2 = 11, nodefactor.x.3 = 15, nodefactor.x.11 = 12))
})

test_that("Tied categories selecting the smallest and largest", {
expect_no_warning(expect_equal(summary(flomarriage ~ nodefactor("x", levels=SMALLEST(1))),
c(nodefactor.x.1 = 2)))

expect_warning(expect_equal(summary(flomarriage ~ nodefactor("x", levels=SMALLEST(2))),
c(nodefactor.x.1 = 2, nodefactor.x.2 = 11)),
"In term 'nodefactor' in package 'ergm': Levels '2' and '11' are tied. Using the order given.")

expect_no_warning(expect_equal(summary(flomarriage ~ nodefactor("x", levels=SMALLEST(3))),
c(nodefactor.x.1 = 2, nodefactor.x.2 = 11, nodefactor.x.11 = 12)))

expect_equal(summary(flomarriage ~ nodefactor("x", levels=LARGEST)),
c(nodefactor.x.3 = 15))

expect_warning(expect_equal(summary(flomarriage ~ nodefactor("x", levels=LARGEST(2))),
c(nodefactor.x.3 = 15, nodefactor.x.11 = 12)),
"In term 'nodefactor' in package 'ergm': Levels '2' and '11' are tied. Using the order given.")
})


test_that("Collapsing categories", {
expect_warning(expect_equal(summary(flomarriage ~ nodefactor("x" %>% COLLAPSE_SMALLEST(2, 5), levels=TRUE)),
c(nodefactor.x.3 = 15, nodefactor.x.5 = 13, nodefactor.x.11 = 12)),
"In term 'nodefactor' in package 'ergm': Levels '2' and '11' are tied. Using the order given.")

expect_no_warning(expect_equal(summary(flomarriage ~ nodefactor("x" %>% COLLAPSE_SMALLEST(3, 5), levels=TRUE)),
c(nodefactor.x.3 = 15, nodefactor.x.5 = 25)))

expect_warning(expect_warning(expect_equal(summary(flomarriage ~ nodefactor("x" %>% COLLAPSE_SMALLEST(2, 5), levels=SMALLEST(2))),
c(nodefactor.x.3 = 15, nodefactor.x.11 = 12)),
"In term 'nodefactor' in package 'ergm': Levels '2' and '11' are tied. Using the order given."),
"In term 'nodefactor' in package 'ergm': Levels '3' and '5' are tied. Using the order given.")
})

options(o)

0 comments on commit 1bdc76e

Please sign in to comment.