-
Notifications
You must be signed in to change notification settings - Fork 1
/
medium.R
61 lines (54 loc) · 1.71 KB
/
medium.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
# Packages used by this script: rvest (HTML download and parsing), textcat
# (language detection) and dplyr (row filtering).
requiredPackages <- c("rvest", "textcat", "dplyr")
for (pkg in requiredPackages) {
  # Install only when the package is missing, so that re-running the script
  # does not reinstall from CRAN every time.
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
  # library() stops with an error when a package cannot be loaded; require()
  # would only return FALSE and let the script fail later on.
  library(pkg, character.only = TRUE)
}

# Input table of articles; stringsAsFactors = FALSE keeps text columns as
# character vectors.
data <- read.csv("./Medium_Clean.csv", stringsAsFactors = FALSE)
# for random selection of script =======
# Draw a random sample of rows from a data frame.
#
# df: a data frame.
# n:  number of rows to draw, without replacement; sample.int() raises an
#     error when n is larger than nrow(df).
#
# Returns a data frame holding the n selected rows of df in random order.
randomRows <- function(df, n) {
  # sample.int() always samples from 1..nrow(df), and drop = FALSE keeps the
  # result a data frame even when df has a single column.
  df[sample.int(nrow(df), n), , drop = FALSE]
}
# Keep at most 1000 randomly chosen rows; min() avoids asking for more rows
# than the input table holds.
data <- randomRows(data, min(1000L, nrow(data)))
# for random selection of script ========

# Scrape the paragraph text of every sampled article.
#
# For each row of `data` the page at `url` is downloaded and the text of its
# <p> nodes is extracted. The text is kept only when the language reported
# for the first paragraph is "english"; other pages and pages that raise an
# error are recorded as NA and dropped before writing. Each chunk of
# `chunkSize` rows (and a final partial chunk) is merged back onto `data` by
# `url` and written to "Data_<start>_<end>.csv".
chunkSize <- 1000L
totalRows <- nrow(data)
contentList <- data.frame()
startIndex <- 1L
endIndex <- min(chunkSize, totalRows)
for (i in seq_len(totalRows)) {
  url <- as.vector(data[i, "url"])
  content <- tryCatch({
    webpage <- read_html(url)
    pTags <- html_nodes(webpage, "p")
    pText <- html_text(pTags)
    language <- textcat::textcat(pText)
    if (!is.na(language[1]) && language[1] == "english") {
      print(paste("Success on:", i, url))
      #logger(paste("Success on:", i, url))
      paste(pText, collapse = " ")
    } else {
      print(paste("Fail on:", i, url, "Language"))
      #logger(paste("Fail on:", i, url, "Language"))
      NA_character_
    }
  }, error = function(error_condition) {
    # conditionMessage() yields the error text as a single string; pasting
    # the condition object itself would produce one string per component.
    print(paste("Fail on:", i, url, "HTTP ERROR",
                conditionMessage(error_condition)))
    #logger(paste("Fail on:", i, url, "HTTP ERROR",
    #             conditionMessage(error_condition)))
    NA_character_
  })

  # Index rows relative to the current chunk, so that a freshly reset
  # contentList is filled from row 1 instead of being padded with NA rows.
  chunkRow <- i - startIndex + 1L
  contentList[chunkRow, "url"] <- url
  contentList[chunkRow, "content"] <- content

  if (i %% chunkSize == 0L || i == totalRows) {
    endIndex <- i
    new_data <- merge(data[startIndex:endIndex, ], contentList, by = "url")
    # Failures are stored as real NA values, so filter on is.na() with a
    # plain logical vector.
    new_data_filter_1 <- dplyr::filter(
      new_data,
      !is.na(new_data[["content"]])
    )
    print("Writing the file")
    write.csv(
      new_data_filter_1,
      file = paste0("Data", "_", startIndex, "_", endIndex, ".csv")
    )
    startIndex <- i + 1L
    contentList <- data.frame()
  }
}
# Append a message to a log file.
#
# message: value to record; a character vector is written one element per
#          line.
# file:    path of the log file. Defaults to "log.txt" in the current working
#          directory, which is the path this function always used before.
logger <- function(message, file = "log.txt") {
  # append = TRUE keeps earlier entries instead of overwriting the file.
  write(message, file = file, append = TRUE)
}