-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathtsneviz.py
104 lines (84 loc) · 3.16 KB
/
tsneviz.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
'''
tSNE visualising Word2Vec words - utility functions.
Credit to github user aneesha, author of this snippet:
https://gist.github.com/aneesha/da9216fb8d84245f7af6edaa14f4efa9#file-display_closestwords_tsnescatterplot-ipynb
Sub-region plotting -
Credit to github user MiguelSteph, author of this snippet:
http://migsena.com/build-and-visualize-word2vec-model-on-amazon-reviews/
'''
# from gensim.models.doc2vec import Doc2Vec
import gensim
import numpy as np
import pandas as pd
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import seaborn as sns
def plot_region(x_bounds, y_bounds, points):
    '''
    Scatter-plot the words of a tSNE reduction that fall inside a rectangle.

    x_bounds, y_bounds: indexable pairs; element 0 is the lower limit and
        element 1 the upper limit, both inclusive.
    points: dataframe with the columns "x", "y" and "word".
    '''
    within_x = (x_bounds[0] <= points.x) & (points.x <= x_bounds[1])
    within_y = (y_bounds[0] <= points.y) & (points.y <= y_bounds[1])
    region = points[within_x & within_y]

    axes = region.plot.scatter("x", "y", s=35, figsize=(10, 8))
    # Label each marker with its word, nudged slightly up and to the right
    # so the text does not sit on top of the marker.
    for _, row in region.iterrows():
        axes.text(row.x + 0.005, row.y + 0.005, row.word, fontsize=11)
def display_closestwords_tsnescatterplot(model, word, vec_size):
    '''
    Plot a word and its most similar words in a 2-D tSNE projection.

    model: a gensim Word2Vec-style model (exposing `.wv`) or a KeyedVectors
        object; it must provide `similar_by_word(word)` and `[word]` lookup.
    word: the query term; it is plotted together with its neighbours.
    vec_size: expected dimensionality of each word vector.

    Raises ValueError if the vectors are not `vec_size`-dimensional or if
    there are fewer than two words to project.
    Displays the figure via plt.show() and returns nothing.
    '''
    # gensim >= 4 no longer supports lookup / similarity queries on the model
    # object itself; they live on the KeyedVectors in `model.wv`. Falling back
    # to `model` keeps bare KeyedVectors and older models working.
    vectors = getattr(model, 'wv', model)

    # The query word comes first, followed by its closest words.
    close_words = vectors.similar_by_word(word)
    word_labels = [word] + [entry[0] for entry in close_words]

    # Build the matrix in one go instead of re-copying it with np.append for
    # every neighbour.
    arr = np.array([vectors[label] for label in word_labels], dtype='f')
    if arr.ndim != 2 or arr.shape[1] != vec_size:
        raise ValueError(
            'expected word vectors of size %d, got array of shape %s'
            % (vec_size, arr.shape)
        )

    n_samples = len(word_labels)
    if n_samples < 2:
        raise ValueError('need at least two words to compute a tSNE projection')

    # find tsne coords for 2 dimensions
    # Recent scikit-learn versions require perplexity < n_samples; the default
    # of 30 is too large for the handful of neighbours plotted here.
    perplexity = min(30.0, float(n_samples - 1))
    tsne = TSNE(n_components=2, random_state=0, perplexity=perplexity)
    # Process-wide NumPy print setting kept from the original implementation;
    # it does not influence the plot.
    np.set_printoptions(suppress=True)
    Y = tsne.fit_transform(arr)

    x_coords = Y[:, 0]
    y_coords = Y[:, 1]

    # display scatter plot
    plt.scatter(x_coords, y_coords)
    for label, x, y in zip(word_labels, x_coords, y_coords):
        plt.annotate(label, xy=(x, y), xytext=(0, 0), textcoords='offset points')
    plt.title(
        'Most similar word vectors in the generated embeddings for term: "'
        + word + '"'
    )

    # Pad the limits outwards on both sides so the extreme points stay inside
    # the axes (the lower limits were previously shifted inwards).
    margin = 0.00005
    plt.xlim(x_coords.min() - margin, x_coords.max() + margin)
    plt.ylim(y_coords.min() - margin, y_coords.max() + margin)
    plt.xlabel("tSNE dimension 1")
    plt.ylabel("tSNE dimension 2")
    plt.show()
# Example (disabled): build a DataFrame of the words and their 2-D tSNE coordinates, then show a zoomed-in region:
# count = 26756
# word_vectors_matrix = np.ndarray(shape=(count, 200), dtype='float64')
# word_list = []
# i = 0
# for word in model.vocab:
# word_vectors_matrix[i] = model[word]
# word_list.append(word)
# i = i+1
# if i == count:
# break
# word_vectors_matrix_2d = tsne.fit_transform(word_vectors_matrix)
# points = pd.DataFrame(
# [
# (word, coords[0], coords[1])
# for word, coords in [
# (word, word_vectors_matrix_2d[word_list.index(word)])
# for word in word_list
# ]
# ],
# columns=["word", "x", "y"]
# )
# print (points.head(10))
# print (points.tail(10))
# plot_region(x_bounds=(-160.0, -140.0), y_bounds=(-160.0, -140.0), points=points)