-
Notifications
You must be signed in to change notification settings - Fork 1
/
1_RSelenium_scraping.R
91 lines (59 loc) · 2.44 KB
/
1_RSelenium_scraping.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
# Load libraries ----------------------------------------------------------
library(RSelenium)
library(wdman)
library(plyr)
library(tidyr)
# Driver setup -------------------------------------------------------------
# wdman::selenium() throws an error unless the port is an integer
PORT <- 4444L

# start the Selenium server; selenium() also runs without an explicit port,
# but specifying it is preferred
server <- wdman::selenium(port = PORT)

# browser client - firefox is recommended because it crashes the least
browser <- RSelenium::remoteDriver(port = PORT, browserName = "firefox")
class(browser)

# Open browser -------------------------------------------------------------
browser$open()

# go to the Rush Hour Crush page
browser$navigate("https://metro.co.uk/rush-hour-crush/?ico=rhc_banner_home/home")
# Get to the bottom -----------------------------------------------------
## Possible way forward - keep clicking "load more" until the whole feed is
## on the page, then scrape everything in one go & assign it numbers.

# locate "load more" button
load_more <- browser$findElement("id", "metro-rush-hour-crush-load-more")

# Stop after this many consecutive clicks that add no new posts. This guards
# against the button staying on the page but no longer loading anything, which
# previously left the loop running until it was interrupted by hand.
max_stalled_clicks <- 3L

# number of post containers currently present on the page
count_posts <- function() {
  length(browser$findElements("class", "metro-rush-hour-crush"))
}

i <- 1
stalled_clicks <- 0L
n_posts <- count_posts()

repeat {
  # isElementDisplayed() returns a one-element list; a lookup error (e.g. the
  # button was removed from the page) is treated as "not visible"
  button_visible <- tryCatch(
    isTRUE(load_more$isElementDisplayed()[[1]]),
    error = function(e) FALSE
  )
  if (!button_visible) {
    message("\"Load more\" button is no longer displayed - done.")
    break
  }

  clicked <- tryCatch(
    {
      load_more$clickElement()
      TRUE
    },
    error = function(e) {
      message("Stopping - click failed: ", conditionMessage(e))
      FALSE
    }
  )
  if (!clicked) {
    break
  }

  print(paste("Finished iteration n.", i))
  # random pause so new posts can load and requests are not fired in a burst
  Sys.sleep(runif(1, 1, 4))
  i <- i + 1

  # detect a button that is still visible but has stopped loading posts
  n_posts_now <- count_posts()
  if (n_posts_now > n_posts) {
    stalled_clicks <- 0L
  } else {
    stalled_clicks <- stalled_clicks + 1L
  }
  n_posts <- n_posts_now

  if (stalled_clicks >= max_stalled_clicks) {
    message("No new posts after ", max_stalled_clicks, " clicks - done.")
    break
  }
}
# Extract everything ------------------------------------------------------
# every post sits in an element with this class
container <- browser$findElements("class", "metro-rush-hour-crush")
length(container) # 353 posts
# saveRDS(container, "container")

# getting text - toy example
# findChildElement() searches inside the given post; findElement() called on a
# webElement is the method inherited from remoteDriver and searches the whole
# page, so it would return the same first match for every post
container[[1]]$findChildElement("tag", "p")$getElementText()
container[[2]]$findChildElement("tag", "p")$getElementText()
# getting the author - toy example
container[[1]]$findChildElement("tag", "h4")$getElementText()

# gets both in one go - can be split later into text and author on "\n"
# getElementText() returns a one-element list, so [[1]] pulls out the string;
# lapply() always yields a list, whatever the number of posts
content <- lapply(container, function(post) post$getElementText()[[1]])
# Build the data frame ------------------------------------------------------
# apply data.frame() to each element of content and row-bind the results
df <- plyr::ldply(.data = content, .fun = data.frame)
colnames(df)[1] <- "text"
write.csv(x = df, file = "df.csv")

# detach plyr so it doesn't clash with dplyr later on
detach("package:plyr", unload = TRUE)

# split each string into text & author columns at the newline
split <- tidyr::separate(
  data = df,
  col = text,
  into = c("text", "author"),
  sep = "\\n"
)
write.csv(x = split, file = "RHC_dataframe.csv")
# Clean up -----------------------------------------------------------------
# close the browser session opened with browser$open(); a failure here is
# downgraded to a warning so the server below is still stopped
tryCatch(
  browser$close(),
  error = function(e) {
    warning(
      "Could not close the browser session: ", conditionMessage(e),
      call. = FALSE
    )
  }
)
# stop server after session
server$stop()