diff --git a/R/index.R b/R/index.R
index 107da8c..536b770 100644
--- a/R/index.R
+++ b/R/index.R
@@ -78,19 +78,24 @@ modify_index <- function(index, name = index, description = NULL, guest_role = N
 #' Upload documents
 #'
 #' @param index The index name to create.
-#' @param documents A data frame with columns title, text, date, and optional
-#'   other columns.
-#' @param columns An optional list with data types, e.g. list(author = "keyword").
-#' @param chunk_size Uploads are broken into chunks to prevent errors. Smaller
-#'   chunks are less error-prone, but this also makes the upload slower.
+#' @param documents A data frame with columns title, text, date, and
+#'   optional other columns.
+#' @param columns An optional list with data types, e.g. list(author =
+#'   "keyword").
+#' @param chunk_size Uploads are broken into chunks to prevent errors.
+#'   Smaller chunks are less error-prone, but this also makes the
+#'   upload slower.
+#' @param max_tries In case something goes wrong, how often should the
+#'   function retry sending the documents?
 #' @param verbose Should a progress bar be printed during upload.
-#' @param credentials The credentials to use. If not given, uses last login
-#'   information.
+#' @param credentials The credentials to use. If not given, uses last
+#'   login information.
 #' @export
 upload_documents <- function(index,
                              documents,
                              columns = NULL,
                              chunk_size = 100L,
+                             max_tries = 5L,
                              verbose = TRUE,
                              credentials = NULL) {
   req_fields <- c("title", "date", "text") # hard coded, might change later
@@ -103,14 +108,15 @@ upload_documents <- function(index,
   # chunk uploads
   rows <- seq_len(nrow(documents))
   chunks <- split(rows, ceiling(seq_along(rows) / chunk_size))
-  if (verbose & length(chunks) > 1L) pb <- progress::progress_bar$new(total = length(chunks))
+  if (verbose & length(chunks) > 1L) cli::cli_progress_bar("Uploading", total = length(chunks))
   for (r in chunks) {
-    if (verbose & length(chunks) > 1L) pb$tick()
+    if (verbose & length(chunks) > 1L) cli::cli_progress_update()
     body <- list(documents = documents[r, ])
     if (!is.null(columns)) body$columns <- lapply(columns, jsonlite::unbox)
-    request(credentials, c("index", index, "documents"), "POST", body, auto_unbox = FALSE) |>
+    request(credentials, c("index", index, "documents"), "POST", body, max_tries = max_tries, auto_unbox = FALSE) |>
       invisible()
   }
+  if (verbose & length(chunks) > 1L) cli::cli_progress_done()
 }
 
 
diff --git a/R/query.R b/R/query.R
index 5419c9e..2906b5a 100644
--- a/R/query.R
+++ b/R/query.R
@@ -99,11 +99,11 @@ query_documents <- function(index,
       max_pages_old <- max_pages
       max_pages <- 10000 %/% per_page
       cli::cli_alert_warning(
-        c("You requested more than 10 000 results {per_page} * {max_pages} ",
-          "(per_page * max_pages) = {per_page * max_pages}, which will not ",
+        c("You requested more than 10 000 results {per_page} * {max_pages_old} ",
+          "(per_page * max_pages) = {per_page * max_pages_old}, which will not ",
           "work. If you want more than 10 000 documents, you need to use the ",
-          "scroll API, e.g., by setting scroll=\"5m\". For now, you will ",
-          "only ge the first {max_pages} pages.")
+          "{.emph scroll API}, e.g., by setting {.code scroll=\"5m\"}. For now, ",
+          "you will only get the first {max_pages} pages.")
       )
     }
   }
@@ -140,8 +140,12 @@ query_documents <- function(index,
     # requesting a specific page. scroll takes precedence in the API, hence
     # when scroll != NULL, page is ignored
     if (is.null(scroll)) {
       body$page <- body$page + 1
-      if (body$page >= r$meta$page_count) break
+      # for when the user sets page = NULL
+      if (length(body$page) == 0) {
+        body$page <- 1L
+      }
+      if (isTRUE(body$page >= r$meta$page_count)) break
     } else {
       body$scroll_id <- r$meta$scroll_id
     }
diff --git a/man/upload_documents.Rd b/man/upload_documents.Rd
index 9880927..cce08b5 100644
--- a/man/upload_documents.Rd
+++ b/man/upload_documents.Rd
@@ -9,6 +9,7 @@ upload_documents(
   documents,
   columns = NULL,
   chunk_size = 100L,
+  max_tries = 5L,
   verbose = TRUE,
   credentials = NULL
 )
@@ -16,18 +17,23 @@ upload_documents(
 \arguments{
 \item{index}{The index name to create.}
 
-\item{documents}{A data frame with columns title, text, date, and optional
-other columns.}
+\item{documents}{A data frame with columns title, text, date, and
+optional other columns.}
 
-\item{columns}{An optional list with data types, e.g. list(author = "keyword").}
+\item{columns}{An optional list with data types, e.g. list(author =
+"keyword").}
 
-\item{chunk_size}{Uploads are broken into chunks to prevent errors. Smaller
-chunks are less error-prone, but this also makes the upload slower.}
+\item{chunk_size}{Uploads are broken into chunks to prevent errors.
+Smaller chunks are less error-prone, but this also makes the
+upload slower.}
+
+\item{max_tries}{In case something goes wrong, how often should the
+function retry sending the documents?}
 
 \item{verbose}{Should a progress bar be printed during upload.}
 
-\item{credentials}{The credentials to use. If not given, uses last login
-information.}
+\item{credentials}{The credentials to use. If not given, uses last
+login information.}
 }
 \description{
 Upload documents
diff --git a/tests/testthat/test-query.R b/tests/testthat/test-query.R
index f734ab1..9984872 100644
--- a/tests/testthat/test-query.R
+++ b/tests/testthat/test-query.R
@@ -29,6 +29,11 @@ test_that("query", {
     query_documents("amcat4r-test", queries = NULL, per_page = 1, page = 2, max_pages = 2)
   )))
 
+  expect_length(
+    query_documents("amcat4r-test", queries = NULL, per_page = 1, max_pages = 10)$.id,
+    10L
+  )
+
   expect_equal(
     colnames(
       query_aggregate("amcat4r-test",