Skip to content

Commit

Permalink
The `fill` argument is deprecated as of rvest 1.0.2, so it has been removed.
Browse files Browse the repository at this point in the history
  • Loading branch information
gbganalyst committed Jun 14, 2022
1 parent ce182f4 commit 6bf16e3
Showing 1 changed file with 36 additions and 39 deletions.
75 changes: 36 additions & 39 deletions R/table_scrap.R
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@
#' @param choose an integer indicating which table to scrape
#' @param header do you want the first line to be the header (defaults to TRUE)
#' @param askRobot logical. Should the function consult the robots.txt file to check whether scraping the web page is allowed? Default is FALSE.
#' @param fill logical. Should be set to TRUE when the table has an inconsistent number of columns.
#' @return a data frame object.
#' @examples \donttest{
#' # Extracting premier ligue 2019/2020 top scorers
Expand All @@ -30,60 +29,58 @@
table_scrap <- function(link,
choose = 1,
header = TRUE,
fill = FALSE,
askRobot = FALSE) {


if(missing(link)) {
stop("'link' is a mandatory parameter")
}
if(missing(link)) {
stop("'link' is a mandatory parameter")
}


if(!is.character(link)) {
stop("'link' parameter must be provided as a character string")
}
if(!is.character(link)) {
stop("'link' parameter must be provided as a character string")
}


if(!is.numeric(choose)){
stop(paste0("the 'choose' parameter must be provided as numeric not as "),
typeof(choose))
}
if(!is.numeric(choose)){
stop(paste0("the 'choose' parameter must be provided as numeric not as "),
typeof(choose))
}


############################## Ask robot part ###################################################
############################## Ask robot part ###################################################

if (askRobot) {
if (paths_allowed(link) == TRUE) {
message(green("the robot.txt doesn't prohibit scraping this web page"))
if (askRobot) {
if (paths_allowed(link) == TRUE) {
message(green("the robot.txt doesn't prohibit scraping this web page"))

} else {
message(bgRed(
"WARNING: the robot.txt doesn't allow scraping this web page"
))

}
} else {
message(bgRed(
"WARNING: the robot.txt doesn't allow scraping this web page"
))

}

#################################################################################################
}

#################################################################################################

tryCatch(
tryCatch(

expr = {
expr = {

table <- link %>%
read_html() %>%
html_table(header, fill = fill)
table <- link %>%
read_html() %>%
html_table(header)

chosen_table <- table[[choose]]
chosen_table <- table[[choose]]

return(chosen_table)
return(chosen_table)


},
},

error = function(cond){
error = function(cond){

if(!has_internet()){

Expand All @@ -93,18 +90,18 @@ error = function(cond){

} else if (grepl("current working directory", cond) || grepl("HTTP error 404", cond)) {

message(paste0("The URL doesn't seem to be a valid one: ", link))
message(paste0("The URL doesn't seem to be a valid one: ", link))

message(paste0("Here the original error message: ", cond))
message(paste0("Here the original error message: ", cond))

return(NA)
return(NA)


} else if(grepl("subscript out of bounds", cond)) {

message(
"Are you sure that your web page contains more than one HTML table ?"
)
"Are you sure that your web page contains more than one HTML table ?"
)

message(paste0("Here the original error message: ", cond))

Expand All @@ -117,6 +114,6 @@ error = function(cond){
return(NA)

}
}
}

)}
)}

0 comments on commit 6bf16e3

Please sign in to comment.