This repository has been archived by the owner on Sep 26, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 14
/
collector.py
128 lines (91 loc) · 2.86 KB
/
collector.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
from sys import argv, exit
from tweepy.streaming import StreamListener
from tweepy import OAuthHandler
from tweepy import Stream
import json
import csv
import re
# ---------------------------------------------------------------------------
# Command-line configuration, evaluated once at import time.
#
# Usage: collector.py <positive|negative> <campaign> <keyword[,keyword...]>
# ---------------------------------------------------------------------------
if len(argv) != 4:
    # Fail with a usage hint instead of an opaque tuple-unpacking ValueError.
    exit("usage: %s <positive|negative> <campaign> <keyword[,keyword...]>"
         % (argv[0] if argv else "collector.py"))
script, sentiment_label, campain, keywords = argv

# When True, print_debug() reports why each tweet was skipped or kept.
DEBUG = False

# The keywords arrive as a single comma-separated argument.
keywords = keywords.split(",")

# Emoticon that must appear in a tweet for it to count as `sentiment_label`.
_EMOTICONS = {"positive": ":)", "negative": ":("}
if sentiment_label not in _EMOTICONS:
    # Previously `emoticon` was simply left undefined for any other label,
    # which surfaced much later as a NameError.
    exit("sentiment label must be 'positive' or 'negative', got %r"
         % (sentiment_label,))
emoticon = _EMOTICONS[sentiment_label]

# Matches any keyword in lower-cased tweet text.  Keywords are literal
# phrases (they are also sent verbatim as the stream's track terms), so they
# are escaped: a keyword such as "c++" would otherwise be an invalid pattern
# and "." would match any character.
regex = re.compile('|'.join(re.escape(k.lower()) for k in keywords))
linenum_re = re.compile(r'([A-Z][A-Z]\d+)')
# A tweet whose text starts with "RT" followed by whitespace is a retweet.
retweets_re = re.compile(r'^RT\s')


def enc(x):
    """Encode *x* as latin-1, silently dropping characters it cannot represent."""
    return x.encode('latin1', errors='ignore')
def print_debug(er, msg):
    """Print a two-line diagnostic when the module-level DEBUG flag is set.

    er  -- short description of what happened (printed after "Error:").
    msg -- associated text, e.g. the tweet body (printed between "< >").

    Does nothing when DEBUG is false.
    """
    if not DEBUG:
        return
    # Single-argument print(...) calls behave the same as a print statement
    # on Python 2 and as the print function on Python 3.  The double space
    # reproduces what the former `print "Error: ", er` statement emitted.
    # Wrapping the value in a 1-tuple keeps %-formatting safe even when the
    # value itself is a tuple.
    print("Error:  %s" % (er,))
    print("Message: < %s >" % (msg,))
class EmoticonListener(StreamListener):
    """Stream listener that stores tweets matching the configured campaign.

    A tweet is kept only if it has the fields `id`, `user` and `text`, is
    marked as English, mentions one of the keywords, contains the sentiment
    emoticon and is not a retweet.  Kept tweets are written as CSV rows
    through the module-level `writer` and echoed to stdout.

    The listener relies on these module-level names: `regex`, `retweets_re`,
    `emoticon`, `sentiment_label`, `enc`, `print_debug` and `writer`.
    """

    def __init__(self):
        # Run the base-class initializer so the listener is fully set up
        # before adding our own state.
        super(EmoticonListener, self).__init__()
        # Number of tweets written so far.
        self.count = 0

    def on_data(self, data):
        """Handle one raw JSON message from the stream.

        data -- JSON document describing a tweet.

        Returns True in every case so the stream keeps running.  Once
        `self.count` exceeds 500 the process is terminated via exit(0)
        (which raises SystemExit) instead of returning.
        """
        if self.count > 500:
            exit(0)

        # json.loads decodes UTF-8 by default; the explicit `encoding`
        # keyword was redundant and is no longer accepted by Python 3.9+.
        tweet = json.loads(data)

        # Guard clauses: skip messages that lack a field we need.
        if 'id' not in tweet:
            print_debug("No Id, skip the tweet", "")
            return True
        if 'user' not in tweet:
            print_debug("No user, skip the tweet", "")
            return True
        if 'text' not in tweet:
            print_debug("not text, skip the tweet", "")
            return True
        # A missing `lang` is treated the same as a non-English tweet.
        if tweet.get('lang') != "en":
            print_debug("Not english", "")
            return True

        tweet_id = tweet['id']
        user_id = enc(tweet['user']['name'])
        text = enc(tweet['text'])

        # Keyword matching is case-insensitive: `regex` is built from
        # lower-cased keywords and applied to the lower-cased text.
        if not regex.search(text.lower()):
            # This skip is reported unconditionally, not only in DEBUG mode.
            print("No Keyword in text, skip the tweet")
            print("< %s >" % (text,))
            return True
        if emoticon not in text:
            print_debug("No Emoticon, skip the tweet", text)
            return True
        if retweets_re.search(text):
            print_debug("Is a retweet, skip", text)
            return True

        print_debug("This is good", text)
        writer.writerow({
            "TweetID": tweet_id,
            "User_ID": user_id,
            "Text": text,
            "Sentiment": sentiment_label,
        })
        self.count += 1

        # Echo the stored tweet.  Single-argument print(...) calls produce
        # the same output as the former print statements on Python 2 and
        # also parse on Python 3.
        print("-" * 10 + "Count %s" % (self.count,))
        print("ID: %s" % (tweet_id,))
        print("USER: %s" % (user_id,))
        print("TEXT: %s" % (text,))
        print("Sentiment %s" % (sentiment_label,))
        # Explicit, to match every other exit path of this method.
        return True

    def on_error(self, status):
        """Report an error status received from the stream."""
        print('status: %s' % status)
#end EmoticonListener
if __name__ == '__main__':
    # Script entry point: authenticate, open the output CSV and stream tweets
    # until the listener terminates the process.

    # Read the API credentials; the `with` block closes the file again
    # (the handle was previously left open).
    with open('config.json', 'r') as config_file:
        config = json.load(config_file)

    auth = OAuthHandler(config['consumer_key'], config['consumer_secret'])
    auth.set_access_token(config['access_token'], config['access_token_secret'])

    # One track phrase per keyword: "<keyword> <emoticon>".  Built once and
    # used both for the banner and for the stream filter.
    track_terms = [k + " " + emoticon for k in keywords]

    # Single-argument print(...) calls emit the same text as the former
    # print statements on Python 2 and also parse on Python 3.
    print("Start collecting tweets")
    print("Campaign name:  %s" % (campain,))
    print("Sentiment:  %s" % (sentiment_label,))
    print("Keywords:  %s" % (track_terms,))
    print('Authenticated')

    with open(campain + "-" + sentiment_label + ".csv", "w") as f:
        # NOTE: `writer` must remain a module-level name because
        # EmoticonListener.on_data looks it up as a global.
        writer = csv.DictWriter(
            f, fieldnames=["TweetID", "User_ID", "Text", "Sentiment"])
        writer.writeheader()

        # The former `s = SimpleListener()` line was removed: no such name is
        # defined or imported, so it raised NameError before streaming began.
        listener = EmoticonListener()
        stream = Stream(auth, listener)
        print("Stream created")
        stream.filter(track=track_terms, languages=['en'])
    print('End')