From 34a24cee27413a2ae9963b2f667fe63c06863564 Mon Sep 17 00:00:00 2001 From: wlandau Date: Thu, 12 Oct 2023 13:42:29 -0400 Subject: [PATCH] Fix #1139 --- NEWS.md | 12 +++++++++++- R/class_branch.R | 2 +- R/class_target.R | 2 +- R/tar_seed_create.R | 16 ++++++++-------- man/tar_seed_create.Rd | 16 ++++++++-------- man/tar_seed_get.Rd | 16 ++++++++-------- man/tar_seed_set.Rd | 16 ++++++++-------- 7 files changed, 45 insertions(+), 35 deletions(-) diff --git a/NEWS.md b/NEWS.md index 178d50e72..145cbecb6 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,6 +1,16 @@ # targets 1.3.2.9001 (development) -* Add function `tar_seed_set()` which sets a seed and the default RNG algorithms. +## Invalidating changes + +Because of the changes below, upgrading to this version of `targets` will unavoidably invalidate previously built targets in existing pipelines. Your pipeline code should still work, but any targets you ran before will most likely need to rerun after the upgrade. + +* Use SHA512 during the creation of target-specific pseudo-random number generator seeds (#1139). This change decreases the risk of overlapping/correlated random number generator streams. See the "RNG overlap" section of the `tar_seed_create()` help file for details and justification. + +## Other changes + +* Add a new exported function `tar_seed_create()` which creates target-specific pseudo-random number generator seeds. +* Add an "RNG overlap" section in the `tar_seed_create()` help file to justify and defend how `targets` and `tarchetypes` approach pseudo-random numbers. +* Add function `tar_seed_set()` which sets a seed and sets all the RNG algorithms to their defaults in the R installation of the user. Each target now uses the `tar_seed_set()` function to set its seed before running its R command (#1139). * Deprecate `tar_seed()` in favor of the new `tar_seed_get()` function. 
# targets 1.3.2 diff --git a/R/class_branch.R b/R/class_branch.R index f0949c1c7..fbe747ed1 100644 --- a/R/class_branch.R +++ b/R/class_branch.R @@ -10,7 +10,7 @@ branch_init <- function( command <- command_clone(command) deps <- union(command$deps, deps) command$deps <- setdiff(deps, settings$dimensions) - command$seed <- produce_seed(child) + command$seed <- tar_seed_create(child) pedigree <- pedigree_new(settings$name, child, index) settings <- settings_clone(settings) settings$name <- child diff --git a/R/class_target.R b/R/class_target.R index 115cd44ce..a594e5182 100644 --- a/R/class_target.R +++ b/R/class_target.R @@ -19,7 +19,7 @@ target_init <- function( retrieval = "main", cue = NULL ) { - seed <- produce_seed(name) + seed <- tar_seed_create(name) command <- command_init(expr, packages, library, seed, deps, string) cue <- cue %|||% cue_init() if (any(grepl("^aws_", format))) { diff --git a/R/tar_seed_create.R b/R/tar_seed_create.R index d1a3284d4..da23b152b 100644 --- a/R/tar_seed_create.R +++ b/R/tar_seed_create.R @@ -4,8 +4,8 @@ #' @description Create a seed for a target. #' @section Seeds: #' A target's random number generator seed -#' is a deterministic function of its name and the global pipeline seed. -#' Consequently, +#' is a deterministic function of its name and the global pipeline seed +#' from `tar_option_get("seed")`. Consequently, #' #' 1. Each target runs with a reproducible seed so that #' different runs of the same pipeline in the same computing @@ -24,14 +24,14 @@ #' correlated results. (For a discussion of the motivating problem, #' see the Section 6: "Random-number generation" in the `parallel` #' package vignette: `vignette(topic = "parallel", package = "parallel")`.) -#' However, this risk is extremely small in practice. -#' -#' `targets` and `tarchetypes` take the approach discussed in +#' However, this risk is extremely small in practice, as shown by #' L'Ecuyer et al. 
(2017) -#' "A single RNG with a 'random' seed for each stream" (Section 4: +#' under "A single RNG with a 'random' seed for each stream" (Section 4: #' under "How to produce parallel streams and substreams"). -#' Here, [tar_seed_create()] plays the role -#' of the upstream pseudo-random number generator (RNG) that produces +#' +#' `targets` and `tarchetypes` take the approach discussed in the +#' aforementioned section of the paper, where [tar_seed_create()] plays the +#' role of the upstream pseudo-random number generator (RNG) that produces #' seeds for the subsequent parallel streams. Specifically, #' [tar_seed_create()] acts as a counter-based RNG, #' where the output function is the SHA512 hash algorithm. diff --git a/man/tar_seed_create.Rd b/man/tar_seed_create.Rd index 807992fae..02762a8dd 100644 --- a/man/tar_seed_create.Rd +++ b/man/tar_seed_create.Rd @@ -12,8 +12,8 @@ Create a seed for a target. \section{Seeds}{ A target's random number generator seed -is a deterministic function of its name and the global pipeline seed. -Consequently, +is a deterministic function of its name and the global pipeline seed +from \code{tar_option_get("seed")}. Consequently, \if{html}{\out{
}}\preformatted{1. Each target runs with a reproducible seed so that different runs of the same pipeline in the same computing @@ -36,14 +36,14 @@ streams of different targets will overlap and produce statistically correlated results. (For a discussion of the motivating problem, see the Section 6: "Random-number generation" in the \code{parallel} package vignette: \code{vignette(topic = "parallel", package = "parallel")}.) -However, this risk is extremely small in practice. - -\code{targets} and \code{tarchetypes} take the approach discussed in +However, this risk is extremely small in practice, as shown by L'Ecuyer et al. (2017) \url{https://doi.org/10.1016/j.matcom.2016.05.005} -"A single RNG with a 'random' seed for each stream" (Section 4: +under "A single RNG with a 'random' seed for each stream" (Section 4: under "How to produce parallel streams and substreams"). -Here, \code{\link[=tar_seed_create]{tar_seed_create()}} plays the role -of the upstream pseudo-random number generator (RNG) that produces + +\code{targets} and \code{tarchetypes} take the approach discussed in the +aforementioned section of the paper, where \code{\link[=tar_seed_create]{tar_seed_create()}} plays the +role of the upstream pseudo-random number generator (RNG) that produces seeds for the subsequent parallel streams. Specifically, \code{\link[=tar_seed_create]{tar_seed_create()}} acts as a counter-based RNG, where the output function is the SHA512 hash algorithm. diff --git a/man/tar_seed_get.Rd b/man/tar_seed_get.Rd index 82e0bd36c..77e33c553 100644 --- a/man/tar_seed_get.Rd +++ b/man/tar_seed_get.Rd @@ -25,8 +25,8 @@ of the target currently running. \section{Seeds}{ A target's random number generator seed -is a deterministic function of its name and the global pipeline seed. -Consequently, +is a deterministic function of its name and the global pipeline seed +from \code{tar_option_get("seed")}. Consequently, \if{html}{\out{
}}\preformatted{1. Each target runs with a reproducible seed so that different runs of the same pipeline in the same computing @@ -49,14 +49,14 @@ streams of different targets will overlap and produce statistically correlated results. (For a discussion of the motivating problem, see the Section 6: "Random-number generation" in the \code{parallel} package vignette: \code{vignette(topic = "parallel", package = "parallel")}.) -However, this risk is extremely small in practice. - -\code{targets} and \code{tarchetypes} take the approach discussed in +However, this risk is extremely small in practice, as shown by L'Ecuyer et al. (2017) \url{https://doi.org/10.1016/j.matcom.2016.05.005} -"A single RNG with a 'random' seed for each stream" (Section 4: +under "A single RNG with a 'random' seed for each stream" (Section 4: under "How to produce parallel streams and substreams"). -Here, \code{\link[=tar_seed_create]{tar_seed_create()}} plays the role -of the upstream pseudo-random number generator (RNG) that produces + +\code{targets} and \code{tarchetypes} take the approach discussed in the +aforementioned section of the paper, where \code{\link[=tar_seed_create]{tar_seed_create()}} plays the +role of the upstream pseudo-random number generator (RNG) that produces seeds for the subsequent parallel streams. Specifically, \code{\link[=tar_seed_create]{tar_seed_create()}} acts as a counter-based RNG, where the output function is the SHA512 hash algorithm. diff --git a/man/tar_seed_set.Rd b/man/tar_seed_set.Rd index 45da02632..08fd6d3a6 100644 --- a/man/tar_seed_set.Rd +++ b/man/tar_seed_set.Rd @@ -26,8 +26,8 @@ these seeds in R. \section{Seeds}{ A target's random number generator seed -is a deterministic function of its name and the global pipeline seed. -Consequently, +is a deterministic function of its name and the global pipeline seed +from \code{tar_option_get("seed")}. Consequently, \if{html}{\out{
}}\preformatted{1. Each target runs with a reproducible seed so that different runs of the same pipeline in the same computing @@ -50,14 +50,14 @@ streams of different targets will overlap and produce statistically correlated results. (For a discussion of the motivating problem, see the Section 6: "Random-number generation" in the \code{parallel} package vignette: \code{vignette(topic = "parallel", package = "parallel")}.) -However, this risk is extremely small in practice. - -\code{targets} and \code{tarchetypes} take the approach discussed in +However, this risk is extremely small in practice, as shown by L'Ecuyer et al. (2017) \url{https://doi.org/10.1016/j.matcom.2016.05.005} -"A single RNG with a 'random' seed for each stream" (Section 4: +under "A single RNG with a 'random' seed for each stream" (Section 4: under "How to produce parallel streams and substreams"). -Here, \code{\link[=tar_seed_create]{tar_seed_create()}} plays the role -of the upstream pseudo-random number generator (RNG) that produces + +\code{targets} and \code{tarchetypes} take the approach discussed in the +aforementioned section of the paper, where \code{\link[=tar_seed_create]{tar_seed_create()}} plays the +role of the upstream pseudo-random number generator (RNG) that produces seeds for the subsequent parallel streams. Specifically, \code{\link[=tar_seed_create]{tar_seed_create()}} acts as a counter-based RNG, where the output function is the SHA512 hash algorithm.