helper.py
import json
import urllib.error
import urllib.request
from datetime import datetime

import requests
from paralleldots import set_api_key, similarity

from AUTH import news_key, pd_key
from textanalysis import wordcounter
from scraper import scrapArticle

# The ParallelDots SDK requires the API key to be registered before any calls
set_api_key(pd_key)

def printJSON(JSON):
    '''Print a JSON-serializable object in a readable, indented format.'''
    print(json.dumps(JSON, indent=4))

def convertDatetime(datet):
    '''Parse a NewsAPI ISO-8601 timestamp into a datetime object.'''
    return datetime.strptime(datet, '%Y-%m-%dT%H:%M:%SZ')
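
# For illustration, a hypothetical timestamp round-trip:
# convertDatetime('2018-03-14T09:30:00Z') == datetime(2018, 3, 14, 9, 30, 0)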

def similarityscore(title1, title2):
    '''Return the ParallelDots semantic similarity score between two titles.'''
    return similarity(title1, title2)['normalized_score']
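
# Note: the relevance threshold of 3.5 used in extractInfo below assumes the
# legacy ParallelDots 0-5 'normalized_score' range for this endpoint.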

def isvalidLink(url):
    '''Return True if the URL can be opened, False otherwise.'''
    try:
        urllib.request.urlopen(url).close()
    except (urllib.error.URLError, ValueError):
        return False
    return True
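
# Example: isvalidLink('https://example.com') -> True for a reachable page,
# and False for a dead link or a malformed URL such as 'notaurl'.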

def swapArticles(delaydict, source, i):
    '''Swap a source's first two article records. The index argument i is
    accepted for compatibility with callers but is currently unused.'''
    delaydict[source][0], delaydict[source][1] = delaydict[source][1], delaydict[source][0]
    return delaydict
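
# Usage sketch with a hypothetical delay dict:
# d = {'cnn': ['older-entry', 'newer-entry']}
# swapArticles(d, 'cnn', 0)  # -> {'cnn': ['newer-entry', 'older-entry']}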

def getQuery(query):
    '''Query the NewsAPI /v2/everything endpoint and return the parsed JSON response.'''
    params = {
        'q': query,
        'language': 'en',
        'pageSize': 100,
        'sources': newsAPI_sources,
        'sortBy': 'relevancy',
        'apiKey': news_key,
    }
    # Letting requests build the query string also URL-encodes the search terms
    return requests.get('https://newsapi.org/v2/everything', params=params).json()
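
# The parsed response follows NewsAPI's documented shape, roughly:
# {'status': 'ok', 'totalResults': N,
#  'articles': [{'source': {'id': ..., 'name': ...}, 'author': ..., 'title': ...,
#                'url': ..., 'publishedAt': '2018-03-14T09:30:00Z', ...}, ...]}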

def extractInfo(response, query):
    '''Build a per-source record of the earliest matching articles and their publish delays.'''
    publishdelay = {}
    for article in response['articles']:
        title = article['title']
        # Keep the article only if its title is relevant enough, its link resolves,
        # and it has an author (videos typically do not)
        if similarityscore(query, title) >= 3.5 and isvalidLink(article['url']) and article['author']:
            source_id = article['source']['id']
            # Scrape the article text and determine its word count
            textfp = scrapArticle(article['url'], source_id)
            wordcount = wordcounter(textfp)
            record = {'datetime': convertDatetime(article['publishedAt']), 'title': title, 'word count': wordcount}
            # Keep up to three articles per source, with the earliest one first
            if source_id not in publishdelay:
                publishdelay[source_id] = [record]
            elif len(publishdelay[source_id]) < 3:
                if record['datetime'] < publishdelay[source_id][0]['datetime']:
                    publishdelay[source_id].insert(0, record)
                else:
                    publishdelay[source_id].append(record)
    # Time zero is the publish time of the earliest article across all sources
    time_zero = min(publishdelay[key][0]['datetime'] for key in publishdelay)
    # Delay (in minutes) of each source's earliest article relative to time zero
    for source in publishdelay:
        publishdelay[source][0]['delay time'] = (publishdelay[source][0]['datetime'] - time_zero).total_seconds() / 60.0
    return publishdelay
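
# The returned dict maps source ids to a list of article records, earliest first,
# e.g. (values illustrative): {'cnn': [{'datetime': ..., 'title': ...,
#                                       'word count': 812, 'delay time': 14.0}, ...]}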

def addEvent(query):
    '''Fetch articles for a query and return the per-source publish-delay records.'''
    response = getQuery(query)
    return extractInfo(response, query)

# NewsAPI source identifiers to search (comma-separated, no spaces)
newsAPI_sources = 'associated-press,cnn,the-new-york-times,the-washington-post'

if __name__ == '__main__':
    # Ad-hoc checks kept from development; the addEvent call is left commented
    # out because it spends NewsAPI and ParallelDots quota
    # print(addEvent('stephen hawking dies'))
    url = 'http://thehill.com/homenews/house/382176-ryan-responsible-nations-cant-tolerate-chemical-attack-in-syria'
    print(isvalidLink(url))