From 63cd3596eaf32c35078ac73b7214a80df61dd8aa Mon Sep 17 00:00:00 2001
From: wlandau-lilly
Date: Fri, 10 Nov 2023 13:15:55 -0500
Subject: [PATCH] More cloud resource fields

---
 R/class_resources_aws.R                 | 37 ++++++----
 R/class_resources_gcp.R                 | 23 +++---
 R/tar_resources_aws.R                   | 49 +++++++++----
 R/tar_resources_gcp.R                   |  3 +
 man/tar_resources_aws.Rd                | 40 ++++++++---
 man/tar_resources_gcp.Rd                | 11 +++
 tests/testthat/test-tar_resources_aws.R | 93 +++++++++++++++++++++++++
 tests/testthat/test-tar_resources_gcp.R | 39 +++++++++++
 8 files changed, 249 insertions(+), 46 deletions(-)

diff --git a/R/class_resources_aws.R b/R/class_resources_aws.R
index 560a94325..d9149d625 100644
--- a/R/class_resources_aws.R
+++ b/R/class_resources_aws.R
@@ -2,24 +2,30 @@ resources_aws_init <- function(
   bucket = NULL,
   prefix = tar_path_objects_dir_cloud(),
   region = NULL,
-  part_size = 5 * (2 ^ 20),
   endpoint = NULL,
+  s3_force_path_style = NULL,
+  version = "latest",
+  part_size = 5 * (2 ^ 20),
+  page_size = 1000L,
   max_tries = NULL,
   seconds_timeout = NULL,
   close_connection = NULL,
-  s3_force_path_style = NULL,
+  verbose = TRUE,
   args = list()
 ) {
   resources_aws_new(
     bucket = bucket,
     prefix = prefix,
     region = region,
-    part_size = part_size,
     endpoint = endpoint,
+    s3_force_path_style = s3_force_path_style,
+    version = version,
+    part_size = part_size,
+    page_size = page_size,
     max_tries = max_tries,
     seconds_timeout = seconds_timeout,
     close_connection = close_connection,
-    s3_force_path_style = s3_force_path_style,
+    verbose = verbose,
     args = args
   )
 }
@@ -28,51 +34,58 @@ resources_aws_new <- function(
   bucket = NULL,
   prefix = NULL,
   region = NULL,
-  part_size = NULL,
   endpoint = NULL,
+  s3_force_path_style = NULL,
+  version = NULL,
+  part_size = NULL,
+  page_size = NULL,
   max_tries = NULL,
   seconds_timeout = NULL,
   close_connection = NULL,
-  s3_force_path_style = NULL,
+  verbose = TRUE,
   args = NULL
 ) {
   force(bucket)
   force(prefix)
   force(region)
-  force(part_size)
   force(endpoint)
+  force(s3_force_path_style)
+  force(version)
+  force(part_size)
+  force(page_size)
   force(max_tries)
   force(seconds_timeout)
   force(close_connection)
-  force(s3_force_path_style)
+  force(verbose)
   force(args)
   enclass(environment(), c("tar_resources_aws", "tar_resources"))
 }
 
 #' @export
 resources_validate.tar_resources_aws <- function(resources) {
-  for (field in c("bucket", "prefix")) {
+  for (field in c("bucket", "prefix", "version")) {
     tar_assert_scalar(resources[[field]])
     tar_assert_chr(resources[[field]])
     tar_assert_none_na(resources[[field]])
     tar_assert_nzchar(resources[[field]])
   }
-  for (field in c("region", "endpiont")) {
+  for (field in c("region", "endpoint")) {
     tar_assert_scalar(resources[[field]] %|||% "x")
     tar_assert_chr(resources[[field]] %|||% "x")
     tar_assert_none_na(resources[[field]] %|||% "x")
   }
-  for (field in c("part_size", "max_tries", "seconds_timeout")) {
+  for (field in c("part_size", "page_size", "max_tries", "seconds_timeout")) {
     tar_assert_scalar(resources[[field]] %|||% 1L)
     tar_assert_dbl(resources[[field]] %|||% 1L)
     tar_assert_none_na(resources[[field]] %|||% 1L)
     tar_assert_ge(resources[[field]] %|||% 1L, 0L)
   }
-  for (field in c("close_connection", "s3_force_path_style")) {
+  for (field in c("close_connection", "s3_force_path_style", "verbose")) {
     tar_assert_scalar(resources[[field]] %|||% TRUE)
     tar_assert_lgl(resources[[field]] %|||% TRUE)
     tar_assert_none_na(resources[[field]] %|||% TRUE)
   }
+  tar_assert_in(resources$version, c("latest", "meta"))
   resources_aws_validate_args(resources$args)
 }
 
diff --git a/R/class_resources_gcp.R b/R/class_resources_gcp.R
index 9e90ed145..8a3d19a2c 100644
--- a/R/class_resources_gcp.R
+++ b/R/class_resources_gcp.R
@@ -1,6 +1,7 @@
 resources_gcp_init <- function(
   bucket = NULL,
   prefix = tar_path_objects_dir_cloud(),
+  version = "latest",
   predefined_acl = "private",
   max_tries = 5L,
   verbose = FALSE
@@ -8,6 +9,7 @@ resources_gcp_init <- function(
   resources_gcp_new(
     bucket = bucket,
     prefix = prefix,
+    version = version,
     predefined_acl = predefined_acl,
     max_tries = max_tries,
     verbose = verbose
@@ -17,12 +19,14 @@
 resources_gcp_new <- function(
   bucket = NULL,
   prefix = NULL,
+  version = NULL,
   predefined_acl = NULL,
   max_tries = NULL,
   verbose = NULL
 ) {
   force(bucket)
   force(prefix)
+  force(version)
   force(predefined_acl)
   force(max_tries)
   force(verbose)
@@ -31,22 +35,19 @@ resources_gcp_new <- function(
 
 #' @export
 resources_validate.tar_resources_gcp <- function(resources) {
-  message <- "GCP resources require a valid bucket name."
-  tar_assert_scalar(resources$bucket, msg = message)
-  tar_assert_chr(resources$bucket, msg = message)
-  tar_assert_none_na(resources$bucket, msg = message)
-  tar_assert_nzchar(resources$bucket, msg = message)
-  tar_assert_scalar(resources$prefix)
-  tar_assert_chr(resources$prefix)
-  tar_assert_nzchar(resources$prefix)
-  tar_assert_scalar(resources$predefined_acl)
-  tar_assert_chr(resources$predefined_acl)
-  tar_assert_nzchar(resources$predefined_acl)
+  for (field in c("bucket", "prefix", "predefined_acl")) {
+    message <- paste("GCP resources require a valid", field)
+    tar_assert_scalar(resources[[field]], msg = message)
+    tar_assert_chr(resources[[field]], msg = message)
+    tar_assert_none_na(resources[[field]], msg = message)
+    tar_assert_nzchar(resources[[field]], msg = message)
+  }
   tar_assert_scalar(resources$max_tries %|||% 1L)
   tar_assert_dbl(resources$max_tries %|||% 1L)
   tar_assert_none_na(resources$max_tries %|||% 1L)
   tar_assert_ge(resources$max_tries %|||% 1L, 0L)
   tar_assert_scalar(resources$verbose)
+  tar_assert_in(resources$version, c("latest", "meta"))
   tar_assert_lgl(resources$verbose)
 }
 
diff --git a/R/tar_resources_aws.R b/R/tar_resources_aws.R
index bf8b7cb8f..4cbbee8b0 100644
--- a/R/tar_resources_aws.R
+++ b/R/tar_resources_aws.R
@@ -23,11 +23,6 @@
 #'   was added on 2023-08-24: `targets` version 1.2.2.9000.)
 #' @param region Character of length 1, AWS region containing the S3 bucket.
 #'   Set to `NULL` to use the default region.
-#' @param part_size Positive numeric of length 1, number of bytes
-#'   for each part of a multipart upload. (Except the last part,
-#'   which is the remainder.) In a multipart upload, each part
-#'   must be at least 5 MB. The default value of the `part_size`
-#'   argument is `5 * (2 ^ 20)`.
 #' @param endpoint Character of length 1, URL endpoint for S3 storage.
 #'   Defaults to the Amazon AWS endpoint if `NULL`. Example:
 #'   To use the S3 protocol with Google Cloud Storage,
@@ -45,14 +40,33 @@
 #'   object versioning turned on, `targets` may fail to record object
 #'   versions. Google Cloud Storage in particular has this
 #'   incompatibility.
+#' @param s3_force_path_style Logical of length 1, whether to use path-style
+#'   addressing for S3 requests.
+#' @param version Character of length 1: `"latest"` to read the latest
+#'   version of the target in the bucket (default), or `"meta"` to
+#'   read the version recorded in the metadata. This affects how `targets`
+#'   downloads target data and makes sure it is up to date. `"latest"`
+#'   is sufficient for most cases. Use `"meta"` if you are reverting to
+#'   a historical copy of the metadata (`_targets/meta/meta`) and wish for
+#'   `targets` to use the corresponding old copies of versioned data in a
+#'   versioned bucket. The `version` argument is only applicable if
+#'   the bucket has versioning enabled.
+#' @param part_size Positive numeric of length 1, number of bytes
+#'   for each part of a multipart upload. (Except the last part,
+#'   which is the remainder.) In a multipart upload, each part
+#'   must be at least 5 MB. The default value of the `part_size`
+#'   argument is `5 * (2 ^ 20)`.
+#' @param page_size Positive integer of length 1, number of items in each
+#'   page for paginated HTTP requests such as listing objects.
 #' @param max_tries Positive integer of length 1, maximum number of attempts
 #'   to access a network resource on AWS.
 #' @param seconds_timeout Positive numeric of length 1,
 #'   number of seconds until an HTTP connection times out.
 #' @param close_connection Logical of length 1, whether to close HTTP
 #'   connections immediately.
-#' @param s3_force_path_style Logical of length 1, whether to use path-style
-#'   addressing for S3 requests.
+#' @param verbose Logical of length 1, whether to print console messages
+#'   when running computationally expensive operations such as listing
+#'   objects in a large bucket.
 #' @param ... Named arguments to functions in `paws.storage::s3()` to manage
 #'   S3 storage. The documentation of these specific functions
 #'   is linked from `https://www.paws-r-sdk.com/docs/s3/`.
@@ -89,14 +103,17 @@ tar_resources_aws <- function(
   bucket = targets::tar_option_get("resources")$aws$bucket,
   prefix = targets::tar_option_get("resources")$aws$prefix,
   region = targets::tar_option_get("resources")$aws$region,
-  part_size = targets::tar_option_get("resources")$aws$part_size,
   endpoint = targets::tar_option_get("resources")$aws$endpoint,
-  max_tries = targets::tar_option_get("resources")$aws$max_tries,
-  seconds_timeout = targets::tar_option_get("resources")$aws$seconds_timeout,
-  close_connection = targets::tar_option_get("resources")$aws$close_connection,
   s3_force_path_style = targets::tar_option_get(
     "resources"
   )$aws$s3_force_path_style,
+  version = targets::tar_option_get("resources")$aws$version,
+  part_size = targets::tar_option_get("resources")$aws$part_size,
+  page_size = targets::tar_option_get("resources")$aws$page_size,
+  max_tries = targets::tar_option_get("resources")$aws$max_tries,
+  seconds_timeout = targets::tar_option_get("resources")$aws$seconds_timeout,
+  close_connection = targets::tar_option_get("resources")$aws$close_connection,
+  verbose = targets::tar_option_get("resources")$aws$verbose,
   ...
 ) {
   if (is.null(prefix)) {
@@ -104,7 +121,10 @@
     prefix <- path_store_default()
   }
   prefix <- prefix %|||% targets::tar_path_objects_dir_cloud()
+  version <- version %|||% "latest"
   part_size <- part_size %|||% (5 * (2 ^ 20))
+  page_size <- page_size %|||% 1000L
+  verbose <- verbose %|||% TRUE
   args <- list(...)
   default_args <- targets::tar_option_get("resources")$aws$args
   for (name in names(default_args)) {
@@ -114,12 +134,15 @@
     bucket = bucket,
     prefix = prefix,
     region = region,
-    part_size = part_size,
     endpoint = endpoint,
+    s3_force_path_style = s3_force_path_style,
+    version = version,
+    part_size = part_size,
+    page_size = page_size,
     max_tries = max_tries,
     seconds_timeout = seconds_timeout,
     close_connection = close_connection,
-    s3_force_path_style = s3_force_path_style,
+    verbose = verbose,
     args = args
   )
   resources_validate(out)
diff --git a/R/tar_resources_gcp.R b/R/tar_resources_gcp.R
index 8dce358bf..8a98fc8e6 100644
--- a/R/tar_resources_gcp.R
+++ b/R/tar_resources_gcp.R
@@ -36,6 +36,7 @@
 tar_resources_gcp <- function(
   bucket = targets::tar_option_get("resources")$gcp$bucket,
   prefix = targets::tar_option_get("resources")$gcp$prefix,
+  version = targets::tar_option_get("resources")$gcp$version,
   predefined_acl = targets::tar_option_get("resources")$gcp$predefined_acl,
   max_tries = targets::tar_option_get("resources")$gcp$max_tries,
   verbose = targets::tar_option_get("resources")$gcp$verbose
@@ -44,11 +45,13 @@
     tar_warn_prefix()
     prefix <- path_store_default()
   }
+  version <- version %|||% "latest"
   predefined_acl <- predefined_acl %|||% "private"
   verbose <- verbose %|||% FALSE
   out <- resources_gcp_init(
     bucket = bucket,
     prefix = prefix,
+    version = version,
     predefined_acl = predefined_acl,
     max_tries = max_tries,
     verbose = verbose
diff --git a/man/tar_resources_aws.Rd b/man/tar_resources_aws.Rd
index 116a4fd0f..239c5a93f 100644
--- a/man/tar_resources_aws.Rd
+++ b/man/tar_resources_aws.Rd
@@ -8,12 +8,15 @@ tar_resources_aws(
   bucket = targets::tar_option_get("resources")$aws$bucket,
   prefix = targets::tar_option_get("resources")$aws$prefix,
   region = targets::tar_option_get("resources")$aws$region,
-  part_size = targets::tar_option_get("resources")$aws$part_size,
   endpoint = targets::tar_option_get("resources")$aws$endpoint,
+  s3_force_path_style = targets::tar_option_get("resources")$aws$s3_force_path_style,
+  version = targets::tar_option_get("resources")$aws$version,
+  part_size = targets::tar_option_get("resources")$aws$part_size,
+  page_size = targets::tar_option_get("resources")$aws$page_size,
   max_tries = targets::tar_option_get("resources")$aws$max_tries,
   seconds_timeout = targets::tar_option_get("resources")$aws$seconds_timeout,
   close_connection = targets::tar_option_get("resources")$aws$close_connection,
-  s3_force_path_style = targets::tar_option_get("resources")$aws$s3_force_path_style,
+  verbose = targets::tar_option_get("resources")$aws$verbose,
   ...
 )
 }
@@ -33,12 +36,6 @@ was added on 2023-08-24: \code{targets} version 1.2.2.9000.)}
 \item{region}{Character of length 1, AWS region containing the S3 bucket.
 Set to \code{NULL} to use the default region.}
 
-\item{part_size}{Positive numeric of length 1, number of bytes
-for each part of a multipart upload. (Except the last part,
-which is the remainder.) In a multipart upload, each part
-must be at least 5 MB. The default value of the \code{part_size}
-argument is \code{5 * (2 ^ 20)}.}
-
 \item{endpoint}{Character of length 1, URL endpoint for S3 storage.
 Defaults to the Amazon AWS endpoint if \code{NULL}. Example:
 To use the S3 protocol with Google Cloud Storage,
@@ -57,6 +54,28 @@ object versioning turned on, \code{targets} may fail to record object
 versions. Google Cloud Storage in particular has this
 incompatibility.}
 
+\item{s3_force_path_style}{Logical of length 1, whether to use path-style
+addressing for S3 requests.}
+
+\item{version}{Character of length 1: \code{"latest"} to read the latest
+version of the target in the bucket (default), or \code{"meta"} to
+read the version recorded in the metadata. This affects how \code{targets}
+downloads target data and makes sure it is up to date. \code{"latest"}
+is sufficient for most cases. Use \code{"meta"} if you are reverting to
+a historical copy of the metadata (\verb{_targets/meta/meta}) and wish for
+\code{targets} to use the corresponding old copies of versioned data in a
+versioned bucket. The \code{version} argument is only applicable if
+the bucket has versioning enabled.}
+
+\item{part_size}{Positive numeric of length 1, number of bytes
+for each part of a multipart upload. (Except the last part,
+which is the remainder.) In a multipart upload, each part
+must be at least 5 MB. The default value of the \code{part_size}
+argument is \code{5 * (2 ^ 20)}.}
+
+\item{page_size}{Positive integer of length 1, number of items in each
+page for paginated HTTP requests such as listing objects.}
+
 \item{max_tries}{Positive integer of length 1, maximum number of attempts
 to access a network resource on AWS.}
 
@@ -66,8 +85,9 @@ number of seconds until an HTTP connection times out.}
 \item{close_connection}{Logical of length 1, whether to close HTTP
 connections immediately.}
 
-\item{s3_force_path_style}{Logical of length 1, whether to use path-style
-addressing for S3 requests.}
+\item{verbose}{Logical of length 1, whether to print console messages
+when running computationally expensive operations such as listing
+objects in a large bucket.}
 
 \item{...}{Named arguments to functions in \code{paws.storage::s3()} to manage
 S3 storage. The documentation of these specific functions
diff --git a/man/tar_resources_gcp.Rd b/man/tar_resources_gcp.Rd
index ad67a2992..bad55f3d7 100644
--- a/man/tar_resources_gcp.Rd
+++ b/man/tar_resources_gcp.Rd
@@ -8,6 +8,7 @@ Google Cloud Storage (GCS)}
 tar_resources_gcp(
   bucket = targets::tar_option_get("resources")$gcp$bucket,
   prefix = targets::tar_option_get("resources")$gcp$prefix,
+  version = targets::tar_option_get("resources")$gcp$version,
   predefined_acl = targets::tar_option_get("resources")$gcp$predefined_acl,
   max_tries = targets::tar_option_get("resources")$gcp$max_tries,
   verbose = targets::tar_option_get("resources")$gcp$verbose
@@ -26,6 +27,16 @@ In the future, \code{targets} will begin requiring
 explicitly user-supplied prefixes. (This last note was added
 on 2023-08-24: \code{targets} version 1.2.2.9000.)}
 
+\item{version}{Character of length 1: \code{"latest"} to read the latest
+version of the target in the bucket (default), or \code{"meta"} to
+read the version recorded in the metadata. This affects how \code{targets}
+downloads target data and makes sure it is up to date. \code{"latest"}
+is sufficient for most cases. Use \code{"meta"} if you are reverting to
+a historical copy of the metadata (\verb{_targets/meta/meta}) and wish for
+\code{targets} to use the corresponding old copies of versioned data in a
+versioned bucket. The \code{version} argument is only applicable if
+the bucket has versioning enabled.}
+
 \item{predefined_acl}{Character of length 1, user access to the object.
 See \code{?googleCloudStorageR::gcs_upload} for possible values.
 Defaults to \code{"private"}.}
diff --git a/tests/testthat/test-tar_resources_aws.R b/tests/testthat/test-tar_resources_aws.R
index f46e9f3c0..28bc8f822 100644
--- a/tests/testthat/test-tar_resources_aws.R
+++ b/tests/testthat/test-tar_resources_aws.R
@@ -195,3 +195,96 @@ tar_test("tar_resources_aws() wants a prefix", {
     class = "tar_condition_deprecate"
   )
 })
+
+tar_test("tar_resources_aws() verbose", {
+  skip_cran()
+  skip_on_os("windows")
+  skip_if_not_installed("paws.storage")
+  tar_option_set(
+    resources = tar_resources(
+      aws = tar_resources_aws(
+        prefix = "x",
+        bucket = "x"
+      )
+    )
+  )
+  out <- tar_resources_aws()
+  expect_true(out$verbose)
+  tar_option_set(
+    resources = tar_resources(
+      aws = tar_resources_aws(
+        verbose = FALSE,
+        prefix = "x",
+        bucket = "x"
+      )
+    )
+  )
+  out <- tar_resources_aws()
+  expect_false(out$verbose)
+})
+
+tar_test("tar_resources_aws() page_size", {
+  skip_cran()
+  skip_on_os("windows")
+  skip_if_not_installed("paws.storage")
+  tar_option_set(
+    resources = tar_resources(
+      aws = tar_resources_aws(
+        prefix = "x",
+        bucket = "x"
+      )
+    )
+  )
+  out <- tar_resources_aws()
+  expect_equal(out$page_size, 1000L)
+  tar_option_set(
+    resources = tar_resources(
+      aws = tar_resources_aws(
+        page_size = 3L,
+        prefix = "x",
+        bucket = "x"
+      )
+    )
+  )
+  out <- tar_resources_aws()
+  expect_equal(out$page_size, 3L)
+})
+
+tar_test("tar_resources_aws() version", {
+  skip_cran()
+  skip_on_os("windows")
+  skip_if_not_installed("paws.storage")
+  tar_option_set(
+    resources = tar_resources(
+      aws = tar_resources_aws(
+        prefix = "x",
+        bucket = "x"
+      )
+    )
+  )
+  out <- tar_resources_aws()
+  expect_equal(out$version, "latest")
+  tar_option_set(
+    resources = tar_resources(
+      aws = tar_resources_aws(
+        version = "meta",
+        prefix = "x",
+        bucket = "x"
+      )
+    )
+  )
+  out <- tar_resources_aws()
+  expect_equal(out$version, "meta")
+  expect_error(
+    tar_option_set(
+      resources = tar_resources(
+        aws = tar_resources_aws(
+          version = "nope",
+          prefix = "x",
+          bucket = "x"
+        )
+      )
+    ),
+    class = "tar_condition_validate"
+  )
+})
diff --git a/tests/testthat/test-tar_resources_gcp.R b/tests/testthat/test-tar_resources_gcp.R
index 0f8fa4110..e80984f14 100644
--- a/tests/testthat/test-tar_resources_gcp.R
+++ b/tests/testthat/test-tar_resources_gcp.R
@@ -74,3 +74,42 @@ tar_test("tar_resources_gcp() wants a prefix", {
     class = "tar_condition_deprecate"
   )
 })
+
+tar_test("tar_resources_gcp() version", {
+  skip_cran()
+  skip_on_os("windows")
+  skip_if_not_installed("googleCloudStorageR")
+  tar_option_set(
+    resources = tar_resources(
+      gcp = tar_resources_gcp(
+        prefix = "x",
+        bucket = "x"
+      )
+    )
+  )
+  out <- tar_resources_gcp()
+  expect_equal(out$version, "latest")
+  tar_option_set(
+    resources = tar_resources(
+      gcp = tar_resources_gcp(
+        version = "meta",
+        prefix = "x",
+        bucket = "x"
+      )
+    )
+  )
+  out <- tar_resources_gcp()
+  expect_equal(out$version, "meta")
+  expect_error(
+    tar_option_set(
+      resources = tar_resources(
+        gcp = tar_resources_gcp(
+          version = "nope",
+          prefix = "x",
+          bucket = "x"
+        )
+      )
+    ),
+    class = "tar_condition_validate"
+  )
+})
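Example usage of the new AWS fields (a minimal sketch, not part of the patch; the bucket name and prefix below are hypothetical placeholders):

    # _targets.R sketch: opt in to the new AWS resource fields.
    library(targets)
    tar_option_set(
      resources = tar_resources(
        aws = tar_resources_aws(
          bucket = "my-bucket",           # hypothetical bucket name
          prefix = "my-project/objects",  # hypothetical prefix
          version = "meta",      # read object versions recorded in _targets/meta/meta
          page_size = 500L,      # page size for paginated requests such as listing objects
          verbose = FALSE        # silence messages for expensive operations
        )
      )
    )

Setting `version = "meta"` only matters when the bucket has versioning enabled; with the default `version = "latest"`, behavior matches the pre-patch semantics.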
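The GCP side is symmetric, and the new tests show that any `version` other than `"latest"` or `"meta"` fails validation with a `tar_condition_validate` error. Another minimal sketch under the same hypothetical names:

    library(targets)
    tar_option_set(
      resources = tar_resources(
        gcp = tar_resources_gcp(
          bucket = "my-bucket",           # hypothetical
          prefix = "my-project/objects",  # hypothetical
          version = "latest"   # must be "latest" or "meta"; anything else errors
        )
      )
    )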