'''
classify_tweets_inferred.py -- local machine test, autumn 2016.
'''
import pandas as pd
import random
import numpy as np
import os, sys
# directory to save a pickled model to, used later:
dir_model = os.path.expanduser('~/Downloads/')
CURR_PLATFORM = sys.platform
# Mac vs Win vs Linux
if CURR_PLATFORM == 'darwin':
TWITDIR = os.path.expanduser('~/Dropbox/data-notes-mac-to-chrome/data-incubator/Project_submission/supporting_files_code_queries_logs_Etc/demoapptwitter')
SCRAPEDIR = os.path.expanduser('~/Dropbox/data-notes-mac-to-chrome/data-incubator/Project_submission/supporting_files_code_queries_logs_Etc/scrape')
elif CURR_PLATFORM != 'linux':
    # Windows: raw strings so the backslashes are not treated as escapes
    TWITDIR = r'U:\Documents\Project\demoapptwitter'
    SCRAPEDIR = r'U:\Documents\Project\scrape'
    dir_model = 'C:/Users/johnbarker/Downloads/'
else:
TWITDIR = '/home/luke/programming/'
SCRAPEDIR = '/home/luke/programming/scraping'
sys.path.insert(0, TWITDIR)
sys.path.insert(0, SCRAPEDIR)
# get some handy functions
import jlpb
import jlpb_classify
if __name__ == "__main__":
    '''
    Train Doc2Vec models, then use their document vectors to train a
    classifier (currently Logistic Regression) that separates tweets
    relevant to flooding from irrelevant ones.
    '''
import time
start = time.perf_counter() # keep track of processing time
# Set up some key variables for the process of model training:
# Use a random seed for reproducibility
seed = 40
    # More epochs take a LOT longer but should give better
    # classifier performance.
    epochs = 16
vocab_rows = 50000 # how many unlabelled tweets to use for building vocab in D2Vec
vocab_frac = 1 # when using a sample of a huge file of unlabelled tweets
vecs = 160
test_num = 450 # 450
## LOAD DATA SETS OF TWEETS TO PANDAS DFs FROM CSV==========================
##
# Make Pandas dataframes from labelled data and combine to
# a single dataframe that we then split into test and training set:
#
df = pd.read_csv('pos_frm500.csv')
df = df[[u'label',u'text']]
ndf = pd.read_csv('neg_frm500.csv')
ndf = ndf[[u'label',u'text']]
## combine these two together:
zdf = pd.concat([df, ndf], axis=0)
    # load a further 4002 labelled rows:
    xdf = pd.read_csv('tokens_buffer10k4002set.csv')
    xdf = xdf[[u'label',u'text']]
    # combine the 500 and the 4002 labelled rows:
    tdf = pd.concat([zdf, xdf], axis=0)
    # split each text string into a list of words,
    # as gensim's Doc2Vec needs tokenised documents:
    tdf.loc[:,'text'] = tdf.loc[:,'text'].map(jlpb_classify.split)
print('tdf',tdf.shape)
tdf[tdf == 'negative'] = 0
tdf[tdf == 'positive'] = 1
print('head:5', tdf.head(5))
print('tail:5', tdf.tail(5))
# Randomise the order of the Labelled set rows
# (using a seed for reproducibility)
tdf = tdf.sample(frac=1, random_state=np.random.RandomState(seed)).\
reset_index(drop=True)
print('Head labelled set', tdf.head(5))
print('Tail labelled set', tdf.tail(5))
print('Dims tdf',tdf.shape)
# Load in our unlabelled data set of tweets to build the d2v vocabulary.
print('loading unlabelled vocab tweets into dataframe...')
udf = pd.read_csv('unlab420k.csv') #unlabelled.csv has 50k
udf = udf[[u'text']]
print('completed loading unlabelled.')
# udf = udf.sample(frac=vocab_frac, random_state=np.random.RandomState(seed)).\
# reset_index(drop=True)
# udf = udf.iloc[:5] # debug
# uncomment to use with 23rd data only:
# udf = udf.iloc[:vocab_rows]
print(udf.size , 'rows')
    # clean up characters in these unlabelled tweets and tokenise them into words:
udf.loc[:,'text'] = udf.loc[:,'text'].map(jlpb_classify.split)
total_num_unlabelled = udf.size
# Gensim Doc2Vec for high-dim vectors in model(s) for each tweet:
from gensim.models.doc2vec import TaggedDocument, Doc2Vec
    # tdf.size counts cells across both columns, so halve it for the row count:
    total_num = int(tdf.size/2)
    print('tweets data dims: ', total_num)
# split for the needed test and training data
# maintain approx. a 9:1 ratio of training:test,
# as we have relatively little labelled data.
print('Test set size', test_num)
training_num = total_num - test_num
print('Training set size', training_num)
documents = [TaggedDocument(list(tdf.loc[i,'text']),[i]) for i in range(0, total_num)]
documents_unlabelled = [TaggedDocument(list(udf.loc[i,'text']), \
[i]) for i in range(0, total_num_unlabelled)]
documents_all = documents_unlabelled
doc2vec_train_id = list(range(0, total_num_unlabelled))
#random.shuffle(doc2vec_train_id)
# training documents for Doc2Vec
training_doc = [documents_all[id] for id in doc2vec_train_id]
print('num training docs: ', len(training_doc))
# get all class labels
class_labels = tdf.loc[:,'label']
    # use all available CPU cores for the Doc2Vec training workers:
import multiprocessing
cores = multiprocessing.cpu_count()
    # build fresh Doc2Vec models if the flag below is True
    # (otherwise load previously saved models from disk)
most_recent = dir_model + 'Mac_d2v_tol0001_win10_420kseed40_160se4_ep16_minc3'
# save current labelled dataframe to a CSV
tdf.to_csv(most_recent + 'LOG.csv')
model_DM, model_DBOW = (None, None)
    # change the line below to True to build fresh models:
if False:
# Parameters can be adjusted to try to get better accuracy from classifier.
model_DM = Doc2Vec(size=vecs, window=10, min_count=3, sample=1e-4,\
negative=5, workers=cores, dm=1, dm_concat=1 )
model_DBOW = Doc2Vec(size=vecs, window=10, min_count=3, sample=1e-4,\
negative=5, workers=cores, dm=0)
# construct the vocabs for our models
model_DM.build_vocab(training_doc)
model_DBOW.build_vocab(training_doc)
        for it in range(0, epochs):
            # show progress, as this takes a long time:
            if (it % 2) == 0:
                print('epoch ' + str(it) + ' of ' + str(epochs))
            # reshuffle the documents each epoch:
            random.shuffle(doc2vec_train_id)
            training_doc = [documents_all[id] for id in doc2vec_train_id]
            # train both models for another epoch:
            model_DM.train(training_doc)
            model_DBOW.train(training_doc)
        # Save the trained models. NB save before init_sims(replace=True),
        # which frees memory but discards state needed for further training:
        fout = 'DM.d2v'
        model_DM.save(most_recent + fout)
        model_DM.init_sims(replace=True)
        fout = 'DBOW.d2v'
        model_DBOW.save(most_recent + fout)
        model_DBOW.init_sims(replace=True)
else:
# Load Doc2Vec model from disk:
fout = 'DM.d2v'
model_DM = Doc2Vec.load(most_recent + fout)
fout = 'DBOW.d2v'
model_DBOW = Doc2Vec.load(most_recent + fout)
    # Sanity-check the two trained Doc2Vec variants with similarity queries.
    # NB DBOW is more similar to the skip-gram mode recommended for
    # Word2Vec by the original paper's authors.
print('nonmatch', model_DM.doesnt_match("delay government flooding lightning".split()))
print('nonmatch', model_DM.doesnt_match("euref voteout remain lightning".split()))
print('euref sim by word', model_DM.similar_by_word('euref'))
print('flood ', model_DM.similar_by_word('flood'))
print('flooding ', model_DM.similar_by_word('flooding'))
print('weather', model_DM.most_similar('weather'))
print('rain', model_DM.most_similar('rain'))
print('lightning', model_DM.most_similar('lightning'))
print('thunder', model_DM.most_similar('thunder'))
print('thunderstorm', model_DM.most_similar('thunderstorm'))
print('ukstorm', model_DM.most_similar('ukstorm'))
print('trains', model_DM.most_similar('trains'))
print('delays', model_DM.most_similar('delays'))
print('ligh thun similarity', model_DM.similarity('lightning', 'thunder'))
    '''
    Use Logistic Regression to train a classifier on the Doc2Vec vectors
    and the labelled data, then output confusion-matrix plots showing the
    model's accuracy on the test data.
    Credit: https://www.zybuluo.com/HaomingJiang/note/462804
    NB methodology adapted from the Doc2Vec tutorial at the URL above
    to scaffold this classification.
    '''
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
import statsmodels.api as sm
random.seed(100) # 100 , 1212
new_index = random.sample(range(0, total_num), total_num)
# set the IDs for the test set:
testID = new_index[-test_num:]
# set the IDs for the training set
trainID = new_index[:-test_num]
train_targets, train_regressors = zip(*[(class_labels[id], \
list(model_DM.docvecs[id]) + list(model_DBOW.docvecs[id])) for id in trainID])
    # add a constant column so that we fit the intercept of our linear model,
    # i.e. the log-odds when all features are zero; omitting it would bias the model.
train_regressors = sm.add_constant(train_regressors)
# set params for model: tol=0.0001,
model_logreg = LogisticRegression(C=0.5, tol=0.0001, penalty='l2', n_jobs=-1)
model_logreg.fit(train_regressors, train_targets)
## Prepare the test data for testing the model:
accuracies = []
test_regressors = [list(model_DM.docvecs[id]) + list(model_DBOW.docvecs[id]) for id in testID]
# add a constant term so that we fit the intercept of our linear model.
test_regressors = sm.add_constant(test_regressors)
test_predictions = model_logreg.predict(test_regressors)
accuracy = 0
    # Loop through the test predictions, tallying the accuracy.
    # Also print each correct positive and every incorrect prediction.
for i in range(0, test_num):
if test_predictions[i] == tdf.loc[testID[i],u'label']:
if(test_predictions[i] == 1):
jlpb.uprint('Correct: id', str(i) + ', tdf_row:' + str(testID[i]) + ', ', \
str(tdf.loc[testID[i], u'label']),\
tdf.loc[testID[i], u'text'])
accuracy = accuracy + 1
else:
jlpb.uprint('WRONG:'+ str(i) + ', tdf_row:' + str(testID[i]) +', should be', \
str(tdf.loc[testID[i], u'label']), tdf.loc[testID[i], u'text'])
# calculate the final accuracy:
accuracies = accuracies + [1.0 * accuracy / test_num]
## Show user time needed for this classifier:
total = int((time.perf_counter() - start) / 60)
print("Process took %s minutes" % total)
    ###
    # Inferred-vector sanity checks of the model:
    ###
    doc_id = np.random.randint(model_DM.docvecs.count)  # pick a random doc; re-run for more examples
print('for doc %d...' % doc_id)
print(udf.loc[doc_id, 'text'])
inferred_docvec = model_DM.infer_vector(udf.loc[doc_id, 'text'])
inferdocsim = model_DM.docvecs.most_similar([inferred_docvec], topn=5)
print('DM:\n %s' % (inferdocsim))
for doc in inferdocsim:
print('doc: ')
print(udf.loc[doc[0], 'text'])
extra_docs = 'flood warning bishops waltham and botley on the river hamble june'.split()
extravec = [model_DM.infer_vector(extra_docs)]
extramostsim = model_DM.docvecs.most_similar(extravec)
print (extramostsim)
for doc in extramostsim:
print('doc: ', doc[0])
print(udf.loc[doc[0], 'text'])
extra_docs = 'crazy weather pound lane onto a is flooding water is expelling from the drainage a has excessive water'.split()
extravec = [model_DM.infer_vector(extra_docs)]
extramostsim = model_DM.docvecs.most_similar(extravec)
print (extramostsim)
for doc in extramostsim:
print('doc: ', doc[0])
print(udf.loc[doc[0], 'text'])
    # NB early exit left in during development; remove it to run the extra
    # prediction and the evaluation below.
    exit('aborting...')
    extra = [list(model_DM.infer_vector(extra_docs)) + list(model_DBOW.infer_vector(extra_docs))]
    print(extra)
    # add a constant term so that we fit the intercept of our linear model
    # (has_constant='add' forces the column: a single row looks constant otherwise):
    extra = sm.add_constant(extra, has_constant='add')
    print(model_logreg.predict(extra))
## OUTPUT Evaluation of Accuracy=================================================
# Accuracy rates and so on:
#
# Produce some confusion matrices and plot them:
#
    # cast the labels to ints for the confusion matrix
    # (the dataframe stores them as objects after the label replacement):
    cast = tdf.loc[testID, u'label']
    cast = (cast.values).astype(np.int8)  # now a numpy.ndarray
    # NB sklearn's confusion_matrix expects (y_true, y_pred):
    confusion_mtx = confusion_matrix(cast, test_predictions)
print('test conf matrix: ', confusion_mtx)
jlpb_classify.show_confusion_matrix(confusion_mtx)
train_predictions = model_logreg.predict(train_regressors)
accuracy = 0
for i in range(0,len(train_targets)):
if train_predictions[i] == train_targets[i]:
accuracy = accuracy + 1
accuracies = accuracies + [1.0 * accuracy / len(train_targets)]
    confusion_mtx = confusion_matrix(train_targets, train_predictions)
print('training conf matrix: ', confusion_mtx)
jlpb_classify.show_confusion_matrix(confusion_mtx)
    ## Todo: Further evaluations
    ## Show F1 score (a sketch follows the note below)
'''
Because precision and recall both provide valuable
information about the quality of a classifier, you often
want to combine them into a single general-purpose score.
The F1 score is defined as the harmonic mean of recall and precision:
F1 = (2 x recall x precision) / (recall + precision)
The F1 score thus tends to favor classifiers that are strong in
both precision and recall, rather than classifiers that
emphasize one at the cost of the other.
'''
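    # A minimal sketch of the F1 Todo above, using sklearn's metrics on the
    # test-set predictions computed earlier (assumes the positive class is 1):
    from sklearn.metrics import f1_score, precision_score, recall_score
    preds = np.asarray(test_predictions).astype(np.int8)  # match cast's int dtype
    print('precision:', precision_score(cast, preds))
    print('recall:', recall_score(cast, preds))
    print('F1:', f1_score(cast, preds))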
## Todo:
## predict unseen and unlabelled tweets
##
##
## make token lists first
#tokens = "ACTUAL SENTENCE".split() # should be same tokenization as training
# then make document vectors
#dv = model.infer_vector(tokens) # note: may want to use many more steps than default
# try the similar tweets:
#sims = model.docvecs.most_similar(positive=[dv])
# then model_logreg.predict() /
    # model_logreg.predict_proba() to get the class probabilities
    # (a runnable sketch follows below)
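    # A minimal runnable sketch of the Todo above. The example tweet text is
    # hypothetical; tokenisation must match training (via jlpb_classify.split):
    tokens = jlpb_classify.split('flood warning issued for the river tonight')
    dv = list(model_DM.infer_vector(tokens)) + list(model_DBOW.infer_vector(tokens))
    # prepend the intercept column by hand (sm.add_constant can skip a
    # single-row input, since every column of one row looks constant):
    features = np.array([[1.0] + dv])
    print('class:', model_logreg.predict(features))
    print('probabilities:', model_logreg.predict_proba(features))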