scraper.py · 134 lines (116 loc) · 5.74 KB
from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.chrome.options import Options
import time
import argparse
import csv
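# This script logs in to Facebook with a Selenium-driven Chrome browser,
# filters a group's search results by year and month, and writes the
# permalinks of the matching posts to a CSV file.
# `meses` below lists the Spanish month names as they appear in the
# Facebook date-filter UI (index 0 is the "--" placeholder).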
meses = ['--','enero','febrero','marzo','abril','mayo','junio','julio','agosto','septiembre','octubre','noviembre','diciembre']
class Collector(object):
    """Collector of recent Facebook posts.

    Note: We bypass the Facebook Graph API by driving a Selenium
    Chrome instance instead. This is against the Facebook guidelines
    and thus not allowed.
    USE THIS FOR EDUCATIONAL PURPOSES ONLY. DO NOT ACTUALLY RUN IT.
    """
    def __init__(self, pages=["oxfess"], corpus_file="posts.csv", depth=5, delay=2, month=1, year=2020):
        super(Collector, self).__init__()
        self.pages = pages
        self.depth = depth + 1
        self.delay = delay
        self.month = month
        self.year = year
        self.currentYear = "2020"
        self.dump = "old{}{}.csv".format(self.year, str(self.month).zfill(2))
        # browser instance
        options = Options()
        options.add_argument("--disable-notifications")
        self.browser = webdriver.Chrome(options=options)
        self.wait = WebDriverWait(self.browser, 10)
        #self.browser = webdriver.Firefox()
        # creating CSV header
        with open(self.dump, "w", newline='', encoding="utf-8") as save_file:
            writer = csv.writer(save_file)
            writer.writerow(["Source", "utime", "Text"])
    def strip(self, string):
        """Helper that drops hashtag words and all non-alphanumeric characters."""
        words = string.split()
        words = [word for word in words if "#" not in word]
        string = " ".join(words)
        clean = ""
        for c in string:
            if str.isalnum(c) or (c in [" ", ".", ","]):
                clean += c
        return clean
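    # Illustrative example (not in the original): strip('Hello, #world! Price: $100.')
    # drops the hashtag word and the stray symbols, returning 'Hello, Price 100.'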
    def collect_page(self, page):
        # log in to Facebook before navigating to the page
        self.browser.get('https://www.facebook.com/')
        username = self.browser.find_element_by_id("email")
        password = self.browser.find_element_by_id("pass")
        submit = self.browser.find_element_by_id("loginbutton")
        username.send_keys("")  # credentials intentionally left blank
        password.send_keys("")
        # click Login
        submit.click()
        self.browser.get('https://www.facebook.com' + page)
        #self.browser.find_elements_by_xpath("//span[contains(text(), 'Más recientes')]")[0].click()
        #time.sleep(3)
        # open the "Elige una fecha..." (choose a date) filter and pick the year
        self.browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        self.browser.find_elements_by_xpath("//div[contains(i/following-sibling::text(), 'Elige una fecha...')]")[0].click()
        self.browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        selects = self.browser.find_elements_by_xpath('((//div[@data-testid="filters_section"])[last()]/div/div)[last()]')[0]
        selects.find_element_by_link_text(self.currentYear).click()
        years = self.browser.find_elements_by_xpath('//*[@id="globalContainer"]/div[3]/div')[0]
        years.find_element_by_link_text(str(self.year)).click()
        time.sleep(3)
        # open the date filter again and pick the month
        self.browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        self.browser.find_elements_by_xpath("//div[contains(i/following-sibling::text(), 'Elige una fecha...')]")[0].click()
        self.browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        selects = self.browser.find_elements_by_xpath('((//div[@data-testid="filters_section"])[last()]/div/div)[last()]')[0]
        selects.find_element_by_link_text(meses[0]).click()
        months = self.browser.find_elements_by_xpath('//*[@id="globalContainer"]/div[4]/div')[0]
        months.find_element_by_link_text(meses[self.month]).click()
        time.sleep(3)
        # keep scrolling (at most 10 times) until the end-of-results footer
        # or the "no results found" message shows up
        i = 0
        while True:
            i += 1
            if i == 10:
                break
            self.browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            end = self.browser.find_elements_by_xpath('//*[@id="browse_end_of_results_footer"]/div/div/div')
            if len(end) == 0:
                if len(self.browser.find_elements_by_xpath("//div[contains(text(), 'No hemos encontrado ningún resultado para')]")) > 0:
                    break
                time.sleep(3)
            else:
                break
        # Scroll down depth-times and wait delay seconds to load
        # between scrolls
        for scroll in range(self.depth):
            # Scroll down to bottom
            self.browser.execute_script(
                "window.scrollTo(0, document.body.scrollHeight);")
            # Wait to load page
            time.sleep(self.delay)
        # Once the full page is loaded, we can start scraping
        with open(self.dump, "a+", newline='', encoding="utf-8") as save_file:
            writer = csv.writer(save_file)
            contentArea = self.browser.find_element_by_id("contentArea")
            permalinks = contentArea.find_elements_by_css_selector("a[href^='/groups/btcarg/permalink/']")
            for permalink in permalinks:
                writer.writerow([permalink.get_attribute("href")])
        self.browser.quit()
    def collect(self):
        for page in self.pages:
            self.collect_page(page)
for year in range(2018, 2020):
    for month in range(1, 13):
        # the collection starts in September 2018
        if year == 2018 and month < 9:
            continue
        print("{} {}".format(year, month))
        C = Collector(pages=["/groups/btcarg/search/?query=\"\"&epa=SEARCH_BOX"], depth=2, month=month, year=year)
        C.collect()
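
# argparse is imported above but never used. Below is a minimal sketch (an
# assumption, not part of the original script) of how the collection
# parameters could be exposed on the command line; the flag names and
# defaults are illustrative only.
def parse_args():
    """Hypothetical CLI wrapper; could replace the hard-coded loop above."""
    parser = argparse.ArgumentParser(description="Collect Facebook group post permalinks.")
    parser.add_argument("--page", default='/groups/btcarg/search/?query=""&epa=SEARCH_BOX',
                        help="Relative Facebook URL to scrape")
    parser.add_argument("--depth", type=int, default=2, help="Number of scroll-downs per page")
    parser.add_argument("--month", type=int, default=1, help="Month filter (1-12)")
    parser.add_argument("--year", type=int, default=2020, help="Year filter")
    return parser.parse_args()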