#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
"""
###############################################################################
### packages required to run code.
###############################################################################
import re, string
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
nltk.download('stopwords')
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.manifold import MDS
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, accuracy_score, confusion_matrix
from sklearn.model_selection import (cross_val_score, StratifiedKFold,
                                     train_test_split, KFold)
from sklearn.tree import DecisionTreeClassifier
import matplotlib.pyplot as plt
import pandas as pd
import os
from gensim.models import Word2Vec, LdaMulticore, TfidfModel
from gensim import corpora
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import numpy as np
# functionality to turn stemming on or off
# note: NGRAM_LENGTH is defined here for convenience, but the vectorizers
# below hard-code ngram_range=(1, 3)
STEMMING = False
NGRAM_LENGTH = 2
###############################################################################
### Function to process documents
###############################################################################
def clean_doc(doc):
    # split document into individual words
    tokens = doc.split()
    re_punc = re.compile('[%s]' % re.escape(string.punctuation))
    # remove punctuation from each word
    tokens = [re_punc.sub('', w) for w in tokens]
    # remove remaining tokens that are not alphabetic
    tokens = [word for word in tokens if word.isalpha()]
    # filter out short tokens
    tokens = [word for word in tokens if len(word) > 4]
    # lowercase all words
    tokens = [word.lower() for word in tokens]
    # filter out stop words
    stop_words = set(stopwords.words('english'))
    tokens = [w for w in tokens if w not in stop_words]
    # word stemming
    if STEMMING:
        ps = PorterStemmer()
        tokens = [ps.stem(word) for word in tokens]
    return tokens
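# an illustrative call (exact output depends on the NLTK stop word list):
# clean_doc('The Quick, Brown Foxes jumped over too-lazy dogs!')
# -> ['quick', 'brown', 'foxes', 'jumped', 'toolazy']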
###############################################################################
# Function for label encoding
###############################################################################
def One_Hot(variable):
    LE = LabelEncoder()
    LE.fit(variable)
    Label1 = LE.transform(variable)
    OHE = OneHotEncoder()
    labels = OHE.fit_transform(Label1.reshape(-1, 1)).toarray()
    return labels, LE, OHE
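# a minimal illustrative call: three category labels -> one-hot rows
# labels, le, ohe = One_Hot(['red', 'green', 'red'])
# labels -> array([[0., 1.], [1., 0.], [0., 1.]])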
###############################################################################
### Processing text into lists
###############################################################################
# set working directory to where the class corpus is saved
os.chdir('')
# read the corpus csv into python
# data = pd.read_csv('Corpus.csv')   # full class corpus (alternative input)
data = pd.read_csv('Corpus_Gowani.csv')
# create empty list to store text document titles
titles = []
# for loop which appends the DSI title to the titles list
for i in range(len(data)):
    temp_text = data['DSI_Title'].iloc[i]
    titles.append(temp_text)
#create empty list to store text documents
text_body = []
# for loop which appends the text to the text_body list
for i in range(len(data)):
    temp_text = data['Text'].iloc[i]
    text_body.append(temp_text)
# Note: text_body is the unprocessed list of documents read directly from the csv.
# empty list to store processed documents
processed_text = []
# for loop to process the text into the processed_text list
for i in text_body:
    text = clean_doc(i)
    processed_text.append(text)
# Note: processed_text is the PROCESSED list of documents from the csv; each
# document is now a list of tokens rather than a single string.
# stitch the individual words back together to re-form each body of text
final_processed_text = [' '.join(tokens) for tokens in processed_text]
# Note: we stitched the processed text back together so the TFIDF vectorizer
# can work on whole-document strings.
# This section produces four lists, three of which are used downstream:
# (1) text_body - unused, (2) processed_text (used in Word2Vec/Doc2Vec),
# (3) final_processed_text (used in TFIDF), (4) titles (used as the index
#     of the TFIDF matrix)
###############################################################################
### Sklearn TFIDF
###############################################################################
# note: ngram_range allows multi-word terms (n-grams) in the TFIDF matrix
# call the TFIDF vectorizer
Tfidf = TfidfVectorizer(ngram_range=(1, 3))
# fit the vectorizer on the final processed documents; the vectorizer requires
# the stitched-back-together document strings
TFIDF_matrix = Tfidf.fit_transform(final_processed_text)
# create a dataframe from the TFIDF matrix
matrix = pd.DataFrame(TFIDF_matrix.toarray(),
                      columns=Tfidf.get_feature_names_out(), index=titles)
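# quick sanity check on the fitted vocabulary; with ngram_range=(1, 3) the
# columns mix unigrams, bigrams, and trigrams stored as space-joined strings
print('TFIDF matrix shape:', matrix.shape)
print('sample terms:', matrix.columns[:10].tolist())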
# create a dataframe from the count vectorizer (CV) matrix
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(ngram_range=(1, 3))
cv_matrix = cv.fit_transform(final_processed_text)
cv_matrix = pd.DataFrame(cv_matrix.toarray(),
                         columns=cv.get_feature_names_out(), index=titles)
matrix_t = matrix.transpose()
#### Corpus ###
matrix_corp_t = matrix.transpose()
cv_matrix_corp_t = cv_matrix.transpose()
# document frequency (df) and total term frequency (tf) across the corpus
cv_matrix_corp_t['df_corpus'] = (cv_matrix_corp_t != 0).sum(axis=1)
# subtract df_corpus because the row sum now includes that helper column
cv_matrix_corp_t['tf_corpus'] = cv_matrix_corp_t.sum(axis=1) - cv_matrix_corp_t['df_corpus']
# drop the per-document columns, keeping only the corpus summary columns
idx = np.r_[0:123]
print(idx)
cv_matrix_corp_t.drop(cv_matrix_corp_t.columns[idx], axis=1, inplace=True)
### Done ###
### Gowani ###
matrix_gowani_t = matrix.transpose()
cv_matrix_gowani_t = cv_matrix.transpose()
# Sum all columns (not applicable for gowani corpus; only needed for entire corpus)
# cv_matrix_gowani_t['df_corpus']= (cv_matrix_gowani_t != 0).sum(axis=1)
# cv_matrix_gowani_t['tf_corpus']= cv_matrix_gowani_t.sum(axis=1) - cv_matrix_gowani_t['df_corpus']
# Merge cv_matrix and matrix
merged_gowani = matrix_gowani_t.merge(cv_matrix_gowani_t, left_index=True, right_index=True, how="left")
### Done ###
# Merge cv_matrix_corp_t and merged_gowani
merged_gowani = merged_gowani.merge(cv_matrix_corp_t, left_index=True, right_index=True, how="left")
# rename merged_gowani columns and reorder
list(merged_gowani.columns.values)
merged_gowani.rename(columns={
    'AAG_Doc1_Trump Limits AI Exports To China.docx_x': 'Doc1_tfidf',
    'AAG_Doc2_Trump administration not interested in AI regulation.docx_x': 'Doc2_tfidf',
    'AAG_Doc1_Trump Limits AI Exports To China.docx_y': 'Doc1_tf',
    'AAG_Doc2_Trump administration not interested in AI regulation.docx_y': 'Doc2_tf',
    'df_corpus': 'Corpus_df',
    'tf_corpus': 'Corpus_tf'
}, inplace=True)
list(merged_gowani.columns.values)
merged_gowani = merged_gowani[['Doc1_tf', 'Doc1_tfidf', 'Doc2_tf',
                               'Doc2_tfidf', 'Corpus_tf', 'Corpus_df']]
# Selected Terms
merged_gowani_selected = pd.DataFrame(merged_gowani.loc[['technology', 'artificial intelligence', 'regulations', 'companies'],])
merged_gowani_selected
# Select n largest terms
merged_gowani_selected_largest = merged_gowani.nlargest(10, 'Doc2_tfidf')
merged_gowani_selected_largest.round(3)
### Chart Creation for Selected Terms ###
# copy the index into a terms column
merged_gowani_selected2 = merged_gowani_selected.copy()
merged_gowani_selected2['terms'] = merged_gowani_selected2.index
merged_gowani_selected2 = merged_gowani_selected2[['Doc1_tf', 'Doc2_tf', 'Corpus_df', 'terms']]
# summary statistics for the Gowani TFIDF matrix
matrix_gowani_t_rounded = matrix_gowani_t.describe().round(3)
matrix_gowani_t_rounded = matrix_gowani_t_rounded.applymap('{:4}'.format)
matrix_gowani_t_rounded.rename(columns={
    'AAG_Doc2_Trump administration not interested in AI regulation.docx': 'Doc2'
}, inplace=True)
# plot selected terms
merged_gowani_selected2.plot(kind='barh')
### Done ###
### Classification and Clustering ###
# Top 10 terms in each doc (ranked by Doc2's TFIDF)
doc1_col = 'AAG_Doc1_Trump Limits AI Exports To China.docx'
doc2_col = 'AAG_Doc2_Trump administration not interested in AI regulation.docx'
doc1_doc2_matrix_tfidf = matrix_corp_t[[doc1_col, doc2_col]].nlargest(10, doc2_col).round(4)
#### Doc 1 ####
# top 10 terms for Doc1 and the remaining corpus
matrix_corp_t_selected_doc1 = matrix_corp_t.loc[['surveillance', 'technology', 'china',
                                                 'restrictions', 'industry', 'export',
                                                 'giants', 'technologies', 'companies',
                                                 'exports']].round(4)
# transpose to get docs on the rows
matrix_corp_t_selected_doc1 = matrix_corp_t_selected_doc1.transpose()
# for Doc1, sum the values of its top 10 terms across all docs in the corpus
matrix_corp_t_selected_doc1['Sum_Top_10'] = matrix_corp_t_selected_doc1.sum(axis=1)
# show the largest Sum_Top_10 values
matrix_corp_t_selected_doc1 = matrix_corp_t_selected_doc1.nlargest(10, 'Sum_Top_10')
#### Doc 1 ####
#### Doc 2 ####
# top 10 terms for Doc2 and the remaining corpus, selected directly from
# Doc2's TFIDF column
doc2_top_terms = matrix_corp_t[doc2_col].nlargest(10).index
matrix_corp_t_selected_doc2 = matrix_corp_t.loc[doc2_top_terms].round(4)
# transpose to get docs on the rows
matrix_corp_t_selected_doc2 = matrix_corp_t_selected_doc2.transpose()
# for Doc2, sum the values of its top 10 terms across all docs in the corpus
matrix_corp_t_selected_doc2['Sum_Top_10'] = matrix_corp_t_selected_doc2.sum(axis=1)
# show the largest Sum_Top_10 values
matrix_corp_t_selected_doc2 = matrix_corp_t_selected_doc2.nlargest(10, 'Sum_Top_10')
#### Doc 2 ####
###############################################################################
### Explore TFIDF Values
###############################################################################
average_TFIDF = {}
for term in matrix.columns:
    average_TFIDF[term] = np.mean(matrix[term])
average_TFIDF_DF = pd.DataFrame(average_TFIDF, index=[0]).transpose()
average_TFIDF_DF.columns = ['TFIDF']
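# peek at the ten terms with the highest average TFIDF across the corpus
print(average_TFIDF_DF.nlargest(10, 'TFIDF'))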
# Gowani selected terms with average TFIDF from the corpus
merged_gowani_selected_avg = pd.DataFrame(merged_gowani.loc[['technology', 'artificial intelligence', 'regulations', 'companies'],])
merged_gowani_selected_avg
# merge gowani_selected_avg and average_TFIDF
merged_gowani_avgTFIDF = merged_gowani_selected_avg.merge(average_TFIDF_DF, left_index=True, right_index=True, how="left")
# rename the merged column (it comes in from average_TFIDF_DF as 'TFIDF')
merged_gowani_avgTFIDF.rename(columns={'TFIDF': 'Avg_TFIDF'}, inplace=True)
list(merged_gowani_avgTFIDF.columns.values)
# look at the merged terms
merged_gowani_avgTFIDF
merged_gowani_avgTFIDF.round(3)
merged_gowani_avgTFIDF.applymap('{:2}'.format)
# calculate the Q1 and Q3 range
Q1 = np.percentile(average_TFIDF_DF, 25)
Q3 = np.percentile(average_TFIDF_DF, 75)
IQR = Q3 - Q1
outlier = Q3 + (1.5 * IQR)
# words whose average TFIDF exceeds Q3 + 1.5*IQR
outlier_list = average_TFIDF_DF[average_TFIDF_DF['TFIDF'] >= outlier]
#can export matrix to csv and explore further if necessary
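# a minimal sketch of that export (file names are illustrative):
# matrix.to_csv('TFIDF_matrix.csv')
# average_TFIDF_DF.to_csv('average_TFIDF.csv')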
###############################################################################
### Doc2Vec
###############################################################################
# Doc2Vec expects tokenized documents, so tag the token lists in processed_text
documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(processed_text)]
model = Doc2Vec(documents, vector_size=100, window=2, min_count=1, workers=4)
# infer a vector for each document and collect the vectors in a dataframe
doc2vec_df = pd.DataFrame()
for i in range(len(processed_text)):
    vector = pd.DataFrame(model.infer_vector(processed_text[i])).transpose()
    doc2vec_df = pd.concat([doc2vec_df, vector], axis=0)
doc2vec_df = doc2vec_df.reset_index()
doc_titles = {'title': titles}
t = pd.DataFrame(doc_titles)
doc2vec_df = pd.concat([doc2vec_df, t], axis=1)
doc2vec_df = doc2vec_df.drop('index', axis=1)
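# a minimal sketch of querying the trained model for similar documents;
# tag 0 is the first document in the corpus (gensim 4.x Doc2Vec API)
print(model.dv.most_similar(0, topn=3))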
###############################################################################
### Gensim Word2vec
###############################################################################
# Note: there are opportunities to use the word2vec matrix to determine which
# words are similar; similar words can be used to create equivalence classes.
# k-means is not used here to group individual words from the Word2Vec output.
# word2vec (gensim 4.x uses vector_size rather than size)
model_w2v = Word2Vec(processed_text, vector_size=100, window=5, min_count=1, workers=4)
# join all processed DSI words into a single list
processed_text_w2v = []
for doc in processed_text:
    for word in doc:
        processed_text_w2v.append(word)
# obtain all the unique words from the DSI documents
w2v_words = list(set(processed_text_w2v))
# can also use get_feature_names_out() from TFIDF to get the list of words
# w2v_words = Tfidf.get_feature_names_out()
# empty dictionary to store words with their vectors
w2v_vectors = {}
# for loop to obtain the weight vector for each word
for word in w2v_words:
    w2v_vectors[word] = model_w2v.wv[word]
# create a final dataframe to view the word vectors
w2v_df = pd.DataFrame(w2v_vectors).transpose()
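# a minimal sketch of the similar-word idea noted above; 'technology' is an
# assumed in-vocabulary probe word
if 'technology' in model_w2v.wv:
    print(model_w2v.wv.most_similar('technology', topn=5))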
# the following section applies the k-means algorithm to the TFIDF matrix
###############################################################################
### K Means Clustering - TFIDF
###############################################################################
k = 8
km = KMeans(n_clusters=k, random_state=89)
km.fit(TFIDF_matrix)
clusters = km.labels_.tolist()
terms = Tfidf.get_feature_names_out()
Dictionary = {'Doc Name': titles, 'Cluster': clusters, 'Text': final_processed_text}
frame = pd.DataFrame(Dictionary, columns=['Cluster', 'Doc Name', 'Text'])
print("Top terms per cluster:")
#sort each cluster's terms by their weight in the cluster centroid
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
terms_dict=[]
#save the terms for each cluster and document to dictionaries. To be used later
#for plotting output.
#dictionary to store terms and titles
cluster_terms={}
cluster_title={}
for i in range(k):
    print("Cluster %d:" % i)
    temp_terms = []
    temp_titles = []
    for ind in order_centroids[i, :10]:
        print(' %s' % terms[ind])
        terms_dict.append(terms[ind])
        temp_terms.append(terms[ind])
    cluster_terms[i] = temp_terms
    print("Cluster %d titles:" % i, end='')
    temp = frame[frame['Cluster'] == i]
    for title in temp['Doc Name']:
        print(' %s,' % title, end='')
        temp_titles.append(title)
    cluster_title[i] = temp_titles
    print()  # newline after the title list
###############################################################################
### Plotting
###############################################################################
# use two components as we're plotting points in a two-dimensional plane
# "precomputed" because we provide a distance matrix
# we also specify `random_state` so the plot is reproducible
mds = MDS(n_components=2, dissimilarity="precomputed", random_state=1)
dist = 1 - cosine_similarity(TFIDF_matrix)
pos = mds.fit_transform(dist)  # shape (n_samples, n_components)
xs, ys = pos[:, 0], pos[:, 1]
#set up colors per clusters using a dict. number of colors must correspond to K
cluster_colors = {0: 'black', 1: 'grey', 2: 'blue', 3: 'rosybrown', 4: 'firebrick', 5:'red', 6:'darksalmon', 7:'sienna'}
#set up cluster names using a dict.
cluster_dict=cluster_title
#create data frame that has the result of the MDS plus the cluster numbers and titles
df = pd.DataFrame(dict(x=xs, y=ys, label=clusters, title=range(0,len(clusters))))
#group by cluster
groups = df.groupby('label')
fig, ax = plt.subplots(figsize=(12, 12)) # set size
ax.margins(0.05) # Optional, just adds 5% padding to the autoscaling
#iterate through groups to layer the plot
# note: the cluster_dict and cluster_colors dicts use the 'name' lookup to
# return the appropriate label/color for each cluster
for name, group in groups:
    ax.plot(group.x, group.y, marker='o', linestyle='', ms=12,
            label=cluster_dict[name], color=cluster_colors[name],
            mec='none')
ax.set_aspect('auto')
ax.tick_params(axis='x',          # changes apply to the x-axis
               which='both',      # both major and minor ticks are affected
               bottom=False,      # ticks along the bottom edge are off
               top=False,         # ticks along the top edge are off
               labelbottom=True)
ax.tick_params(axis='y',          # changes apply to the y-axis
               which='both',      # both major and minor ticks are affected
               left=False,        # ticks along the left edge are off
               right=False,       # ticks along the right edge are off
               labelleft=True)
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))  # legend outside the axes
#The following section of code is to run the k-means algorithm on the doc2vec outputs.
#note the differences in document clusters compared to the TFIDF matrix.
###############################################################################
### K Means Clustering Doc2Vec
###############################################################################
doc2vec_k_means = doc2vec_df.drop('title', axis=1)
k = 8
km = KMeans(n_clusters=k, random_state=89)
km.fit(doc2vec_k_means)
clusters_d2v = km.labels_.tolist()
Dictionary = {'Doc Name': titles, 'Cluster': clusters_d2v, 'Text': final_processed_text}
frame = pd.DataFrame(Dictionary, columns=['Cluster', 'Doc Name', 'Text'])
#dictionary to store clusters and respective titles
cluster_title={}
# note: doc2vec clusters will not have individual words because the vector
# representation is based on the entire document, not individual words. As a
# result, there are no per-word outputs for each cluster.
for i in range(k):
    temp = frame[frame['Cluster'] == i]
    temp_title_list = []
    for title in temp['Doc Name']:
        temp_title_list.append(title)
    cluster_title[i] = temp_title_list
###############################################################################
### Plotting Doc2vec
###############################################################################
# use two components as we're plotting points in a two-dimensional plane
# "precomputed" because we provide a distance matrix
# we also specify `random_state` so the plot is reproducible
mds = MDS(n_components=2, dissimilarity="precomputed", random_state=1)
dist = 1 - cosine_similarity(doc2vec_k_means)
pos = mds.fit_transform(dist)  # shape (n_samples, n_components)
xs, ys = pos[:, 0], pos[:, 1]
#set up colors per clusters using a dict. number of colors must correspond to K
cluster_colors = {0: 'black', 1: 'grey', 2: 'blue', 3: 'rosybrown', 4: 'firebrick', 5:'red', 6:'darksalmon', 7:'sienna'}
#set up cluster names using a dict.
cluster_dict=cluster_title
#create data frame that has the result of the MDS plus the cluster numbers and titles
df = pd.DataFrame(dict(x=xs, y=ys, label=clusters_d2v, title=range(0, len(clusters_d2v))))
#group by cluster
groups = df.groupby('label')
fig, ax = plt.subplots(figsize=(12, 12)) # set size
ax.margins(0.05) # Optional, just adds 5% padding to the autoscaling
#iterate through groups to layer the plot
# note: the cluster_dict and cluster_colors dicts use the 'name' lookup to
# return the appropriate label/color for each cluster
for name, group in groups:
    ax.plot(group.x, group.y, marker='o', linestyle='', ms=12,
            label=cluster_dict[name], color=cluster_colors[name],
            mec='none')
ax.set_aspect('auto')
ax.tick_params(axis='x',          # changes apply to the x-axis
               which='both',      # both major and minor ticks are affected
               bottom=False,      # ticks along the bottom edge are off
               top=False,         # ticks along the top edge are off
               labelbottom=True)
ax.tick_params(axis='y',          # changes apply to the y-axis
               which='both',      # both major and minor ticks are affected
               left=False,        # ticks along the left edge are off
               right=False,       # ticks along the right edge are off
               labelleft=True)
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))  # legend outside the axes
# The following section creates a model to predict the cluster labels from
# the TFIDF matrix and the doc2vec vectors. Note the difference in model
# performance between the two vectorization methods.
###############################################################################
### Classification using a Random Forest model
###############################################################################
model_RF = RandomForestClassifier()
# TFIDF
Y = clusters
X = TFIDF_matrix
# cross validation
cv_score = cross_val_score(model_RF, X, Y, cv=5)
# mean CV score
np.mean(cv_score)
# Doc2Vec
Y = clusters_d2v
X = doc2vec_k_means
# cross validation
cv_score = cross_val_score(model_RF, X, Y, cv=5)
# mean CV score
np.mean(cv_score)
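# an illustrative side-by-side comparison of the two vectorizations
# (re-runs the cross validation above):
for label, (X_, Y_) in [('TFIDF', (TFIDF_matrix, clusters)),
                        ('Doc2Vec', (doc2vec_k_means, clusters_d2v))]:
    print(label, 'mean CV accuracy:', np.mean(cross_val_score(model_RF, X_, Y_, cv=5)))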
# The following section is example code to create equivalence classes (ECs)
# within the corpus. A dictionary will need to be created for every EC, and
# each EC will need to be applied to the corpus. Below is an example of how
# the function works.
###############################################################################
### EC clean up code
###############################################################################
def create_ec(dictionary, corpus):
    for key, values in dictionary.items():
        for value in values:
            corpus = corpus.replace(value, key)
    return corpus
corpus = 'i like swiss. i like cheddar. i like provolone.'
cheese_dic = {'cheese': ['swiss', 'cheddar', 'provolone']}
corpus_new = create_ec(cheese_dic, corpus)
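# every cheese name is replaced by its equivalence-class key:
print(corpus_new)  # -> 'i like cheese. i like cheese. i like cheese.'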
###############################################################################
### LDA Code
###############################################################################
# LDA using bag of words
dictionary = corpora.Dictionary(processed_text)
corpus = [dictionary.doc2bow(doc) for doc in processed_text]
ldamodel = LdaMulticore(corpus, num_topics=3, id2word=dictionary, passes=2, workers=2)
for idx, topic in ldamodel.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))
# LDA using TFIDF
tfidf = TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]
ldamodel = LdaMulticore(corpus_tfidf, num_topics=3, id2word=dictionary, passes=2, workers=2)
for idx, topic in ldamodel.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))
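# a minimal sketch of scoring a single document against the fitted topics;
# get_document_topics returns (topic_id, probability) pairs
print(ldamodel.get_document_topics(corpus_tfidf[0]))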
###############################################################################
### Testing
###############################################################################
frame.to_csv('frame.csv')
fig = plt.figure(figsize=(8,6))
df.groupby('label').label.count().plot.bar(ylim=0)
plt.show()