# news-get.py
import argparse
import random
import time
import logging
import glob
import os
import validators
import queue
from threading import Thread
from threading import Lock
from tqdm import tqdm
from urllib.parse import urlparse
import dask.dataframe as dd
import pandas as pd
import newspaper
from sumy.parsers.html import HtmlParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lsa import LsaSummarizer
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words
from selenium.webdriver import Chrome
from selenium.webdriver import ChromeOptions
# from selenium.webdriver.common.keys import Keys
# from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
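
# Overview (derived from the code and the CLI help below): this script scrapes full article
# text (or LSA summaries) from news urls, either passed directly on the command line or read
# from the 'link' column of csv files produced by rss_get.py. Severity level 1 fetches pages
# with newspaper/sumy over plain HTTP; severity >= 2 falls back to a selenium-driven Chrome
# session for pages that block simple scrapers.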


class NewsGetter:
    def __init__(self, output_dir, severity=1, **kwargs):
        self.language = kwargs.get('language', 'english')
        self.severity = severity
        self.outdir = output_dir
        self.tokenizer = Tokenizer(self.language)
        self.connection_error = []  # records the urls that could not be scraped
        self.recent_sites = []  # tracks recently accessed sites to avoid over-pinging one site
        if self.severity >= 2:
            ops = ChromeOptions()
            # ops.add_argument('headless')
            ops.add_argument('--window-size=1920,1080')  # ensure no content is hidden in smaller windows
            self.driver = Chrome(ChromeDriverManager().install(), options=ops)
    def text_from_url(self, url, sleep=(0, 0.1)):
        time.sleep(random.uniform(sleep[0], sleep[1]))
        output_string = ''
        try:
            article = newspaper.Article(url, language='en')
            article.download()
            output_string = newspaper.fulltext(article.html)
        except Exception:
            try:
                sumy_parser = HtmlParser.from_url(url, self.tokenizer)
                for paragraph in sumy_parser.document.paragraphs:
                    for sentence in paragraph.sentences:
                        output_string += sentence._text
                        output_string += '\n'
            except Exception:
                if self.severity < 2:  # if not trying selenium, record the failure and give up
                    self.connection_error.append('Cannot connect to ' + url)
                    return
        if len(output_string) < 400:
            # 400 chars is an arbitrary threshold to identify "connected successfully but rejected by a robot test"
            # if both previous attempts failed, output_string is still ''
            if self.severity < 2:
                self.connection_error.append('Cannot scrape text from ' + url)
                return  # if blocked by a robot test, do not return anything
            else:
                try:
                    self.driver.get(url)
                except Exception:  # when selenium also fails to connect
                    self.connection_error.append('Cannot connect to ' + url)
                    return
                html_source = self.driver.page_source
                output_string = newspaper.fulltext(html_source)
        return output_string
    def sum_from_url(self, url, sleep=(0, 0.1), **kwargs):
        time.sleep(random.uniform(sleep[0], sleep[1]))
        try:
            sumy_parser = HtmlParser.from_url(url, self.tokenizer)
        except Exception:
            try:
                article = newspaper.Article(url, language='en')
                article.download()
                sumy_parser = HtmlParser.from_string(article.html, article.source_url, self.tokenizer)
            except Exception:
                if self.severity < 2:
                    self.connection_error.append('Cannot connect to ' + url)
                    return
                else:
                    try:
                        self.driver.get(url)
                    except Exception:
                        self.connection_error.append('Cannot connect to ' + url)
                        return
                    sumy_parser = HtmlParser.from_string(self.driver.page_source, url, self.tokenizer)
        summarizer = LsaSummarizer(Stemmer(self.language))
        summarizer.stop_words = get_stop_words(self.language)
        try:
            summary = summarizer(sumy_parser.document, kwargs.get('sentence_count', 10))
            summarized = ''
            for sentence in summary:
                summarized += (sentence._text + " ")
            return summarized
        except Exception:
            # when the scraped text is empty, record an error message
            self.connection_error.append('Cannot scrape text from ' + url)
            return
    def texts_from_csv(self, csv_dir, summarize=False, thread_count=16):
        class Worker(Thread):
            def __init__(self, url_queue: queue.Queue, getter: NewsGetter, lock: Lock, taskbar):
                Thread.__init__(self)
                self.queue = url_queue
                self.result = pd.Series(dtype=str)
                self.news_getter = getter
                self.task_bar = taskbar
                self.lock = lock

            def run(self):
                while True:
                    (index, url) = self.queue.get()
                    if url == '':
                        # print(threading.active_count())
                        break
                    site = urlparse(url).netloc
                    site = '.'.join(site.split('.')[-2:])  # ignore the sub-domain
                    # this chunk avoids requesting the same site too frequently
                    if site in self.news_getter.recent_sites:
                        time.sleep(3)
                        # print('Sleep triggered for site ' + site)
                    self.lock.acquire()  # the lock protects the outer class's list of recently visited sites
                    self.news_getter.recent_sites.append(site)
                    # the "cushion" for visited urls is on average 2 urls per thread
                    if len(self.news_getter.recent_sites) >= 2 * thread_count:
                        del self.news_getter.recent_sites[0]
                    self.lock.release()
                    if summarize:
                        text = self.news_getter.sum_from_url(url)
                    else:
                        text = self.news_getter.text_from_url(url)
                    self.result = pd.concat([self.result,
                                             pd.Series([text], index=[index])])
                    self.queue.task_done()
                    self.task_bar.update(1)

        urls = queue.Queue()
        csv_files = glob.glob(csv_dir + "/*.csv")
        if len(csv_files) == 0:
            raise Exception("No .csv files found in directory: " + csv_dir)
        for file in csv_files:
            self.connection_error = []
            self.recent_sites = []
            df = pd.read_csv(file)
            outdir = os.path.join(self.outdir, os.path.basename(file))
            url_column = df.loc[:, 'link']  # this column name is universal across output files of feedparser
            print("Processing {0} links from {1}".format(len(url_column), file))
            queue_size = 0
            for i in url_column.index:
                urls.put((i, url_column[i]))
                queue_size += 1
            workers = []
            print("{} threads initiated".format(thread_count))
            lock = Lock()
            with tqdm(total=queue_size) as pbar:
                for i in range(thread_count):
                    urls.put(('', ''))  # stopper indicating no more urls are left
                    worker = Worker(urls, self, lock, pbar)
                    worker.start()
                    workers.append(worker)
                for worker in workers:
                    worker.join()  # wait until all workers finish
            for failed_message in self.connection_error:
                print(failed_message)
            texts = pd.Series(dtype=str)
            for worker in workers:
                texts = pd.concat([texts, worker.result])
            df['text'] = texts
            df.to_csv(outdir, mode='w+')
            print('file saved to ' + outdir)
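
# A minimal usage sketch of NewsGetter outside the CLI (paths and urls here are hypothetical):
#   getter = NewsGetter(output_dir='out', severity=1, language='english')
#   text = getter.text_from_url('https://example.com/some-article')       # full article text
#   summary = getter.sum_from_url('https://example.com/some-article',
#                                 sentence_count=5)                       # LSA summary
#   getter.texts_from_csv('rss_csv', summarize=False, thread_count=16)    # batch mode over csv files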


def main(args):
    getter = NewsGetter(severity=args.severity, output_dir=args.output_dir, language='english')
    if args.dir_to_csv is not None:
        getter.texts_from_csv(args.dir_to_csv, args.summarize, args.threads_count)
    if args.urls is not None:
        urls_output = pd.DataFrame(columns=['url', 'text'])
        for url in args.urls:
            if args.summarize:
                retrieved = [url, getter.sum_from_url(url)]
            else:
                retrieved = [url, getter.text_from_url(url)]
            urls_output = pd.concat([urls_output, pd.DataFrame([retrieved], columns=urls_output.columns)])
        output_path = os.path.join(args.output_dir, 'url_output.csv')
        urls_output.to_csv(output_path)
        print('file saved to ' + output_path)
    return


def check_args(args):
    if args.urls is None:
        if args.dir_to_csv is None:
            raise ValueError("must input at least one of valid url or .csv containing urls")
    else:
        for url in args.urls:
            if not validators.url(url):
                raise ValueError(url + ' is not a valid url')
    if args.dir_to_csv is not None:
        if not os.path.isdir(args.dir_to_csv):
            raise ValueError("dir_to_csv {} is not a valid directory".format(args.dir_to_csv))
    if args.severity is not None:
        if args.severity > 4 or args.severity < 1:
            raise ValueError("input severity is not valid")
    else:
        args.severity = 1
    if args.summarize is None:
        args.summarize = False
    if args.output_dir is not None and not os.path.isdir(args.output_dir):
        raise ValueError('Output directory (%s) is not a valid directory' % (
            os.path.abspath(args.output_dir)))
    if args.threads_count is None:
        args.threads_count = 8
    return args


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="Acquiring news articles from urls")
    parser.add_argument("-u", "--urls", nargs="+", help="urls to desired news text", required=False)
    parser.add_argument("-dir", "--dir_to_csv", help="directory to the csv files generated by rss_get.py",
                        required=False)
    parser.add_argument('-sum', "--summarize", help='whether to summarize text', action=argparse.BooleanOptionalAction)
    parser.add_argument("-s", "--severity",
                        help="level 1: get html directly; level 2: use selenium to bypass robot test; level 3: set "
                             "minimum wait time to bypass javascript; level 4 (under construction): use credentials",
                        nargs="?", const=1, type=int, required=False)
    parser.add_argument('-o', '--output_dir', type=str, required=True)
    parser.add_argument('-t', '--threads_count', nargs='?', const=8, type=int)
    args = parser.parse_args()
    args = check_args(args)
    main(args)
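
# Example invocations (directory names and urls are hypothetical, flags as defined above):
#   python news-get.py -u https://example.com/article -o output_dir
#   python news-get.py -dir rss_csv -o output_dir -s 2 -t 16 --summarize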