diff --git a/R/table_scrap.R b/R/table_scrap.R index 2df270f..49216ab 100644 --- a/R/table_scrap.R +++ b/R/table_scrap.R @@ -8,7 +8,6 @@ #' @param choose an integer indicating which table to scrape #' @param header do you want the first line to be the leader (default to TRUE) #' @param askRobot logical. Should the function ask the robots.txt if we're allowed or not to scrape the web page ? Default is FALSE. -#' @param fill logical. Should be set to TRUE when the table has an inconsistent number of columns. #' @return a data frame object. #' @examples \donttest{ #' # Extracting premier ligue 2019/2020 top scorers @@ -30,60 +29,58 @@ table_scrap <- function(link, choose = 1, header = TRUE, - fill = FALSE, askRobot = FALSE) { -if(missing(link)) { - stop("'link' is a mandatory parameter") -} + if(missing(link)) { + stop("'link' is a mandatory parameter") + } -if(!is.character(link)) { - stop("'link' parameter must be provided as a character string") -} + if(!is.character(link)) { + stop("'link' parameter must be provided as a character string") + } -if(!is.numeric(choose)){ - stop(paste0("the 'choose' parameter must be provided as numeric not as "), - typeof(choose)) -} + if(!is.numeric(choose)){ + stop(paste0("the 'choose' parameter must be provided as numeric not as "), + typeof(choose)) + } -############################## Ask robot part ################################################### + ############################## Ask robot part ################################################### - if (askRobot) { - if (paths_allowed(link) == TRUE) { - message(green("the robot.txt doesn't prohibit scraping this web page")) + if (askRobot) { + if (paths_allowed(link) == TRUE) { + message(green("the robot.txt doesn't prohibit scraping this web page")) - } else { - message(bgRed( - "WARNING: the robot.txt doesn't allow scraping this web page" - )) - - } + } else { + message(bgRed( + "WARNING: the robot.txt doesn't allow scraping this web page" + )) } -################################################################################################# + } + ################################################################################################# -tryCatch( + tryCatch( -expr = { + expr = { -table <- link %>% - read_html() %>% - html_table(header, fill = fill) + table <- link %>% + read_html() %>% + html_table(header) -chosen_table <- table[[choose]] + chosen_table <- table[[choose]] -return(chosen_table) + return(chosen_table) - }, + }, -error = function(cond){ + error = function(cond){ if(!has_internet()){ @@ -93,18 +90,18 @@ error = function(cond){ } else if (grepl("current working directory", cond) || grepl("HTTP error 404", cond)) { - message(paste0("The URL doesn't seem to be a valid one: ", link)) + message(paste0("The URL doesn't seem to be a valid one: ", link)) - message(paste0("Here the original error message: ", cond)) + message(paste0("Here the original error message: ", cond)) - return(NA) + return(NA) } else if(grepl("subscript out of bounds", cond)) { message( - "Are you sure that your web page contains more than one HTML table ?" - ) + "Are you sure that your web page contains more than one HTML table ?" + ) message(paste0("Here the original error message: ", cond)) @@ -117,6 +114,6 @@ error = function(cond){ return(NA) } -} + } -)} + )}