-
Notifications
You must be signed in to change notification settings - Fork 0
/
kmeans.py
102 lines (79 loc) · 3.34 KB
/
kmeans.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
import random
import word2vec
import pickle
class Center:
    """A k-means cluster center.

    Instances are used as dict keys in the clusters dictionary, so the
    default identity-based __eq__/__hash__ are intentionally kept.
    """
    def __init__(self, embedding):
        # embedding: the center's vector (initially a doc's embedding,
        # later the mean of its member docs' embeddings)
        self.embedding = embedding

    def __repr__(self):
        # debugging aid only; does not affect hashing or equality
        return f"{type(self).__name__}(embedding={self.embedding!r})"
def initialize_dict(collection_50k, clusters_dict, center_ids):
    """Seed the clusters dictionary: one Center per chosen doc id.

    Each new Center starts with an empty member list. Mutates and
    returns clusters_dict.
    """
    # note: renamed loop variable to avoid shadowing the builtin `id`
    for center_id in center_ids:
        seed = Center(embedding=collection_50k[center_id].embeddings)
        clusters_dict[seed] = []
    return clusters_dict
def termination_condition(a1, a2):
    """Convergence test: True when every component of every new center
    (a1) lies within +/-10% of the matching old center (a2).

    a1, a2: equal-length lists of Center objects whose .embedding
    supports .shape[0] and indexing (numpy-like -- TODO confirm).

    BUG FIX: the original compared against old*1.1 on BOTH sides
    ("> old*1.1 or < old*1.1"), so any component not exactly equal to
    old*1.1 prevented termination; the lower bound must be old*0.9.

    NOTE(review): a multiplicative tolerance misbehaves for zero or
    negative components; an absolute or norm-based tolerance would be
    more robust -- confirm embedding value ranges before tightening.
    """
    for new, old in zip(a1, a2):
        for i in range(new.embedding.shape[0]):
            if (new.embedding[i] > old.embedding[i] * 1.1
                    or new.embedding[i] < old.embedding[i] * 0.9):
                # one component moved more than 10% -> not converged
                return False
    return True
def initialize_kmeans(collection_50k, k):
    """Run k-means over doc embeddings; pickle and return the clustering.

    collection_50k: sequence of docs, each exposing an `.embeddings`
        vector (assumed numpy-like so sum()/division work -- TODO confirm).
    k: number of clusters.
    Returns {Center: [doc, ...]} and writes it to 'kmeans_model_500it.obj'.
    """
    clusters_dict = {}
    # randomly choose "k" docs as the initial centers
    center_ids = random.sample(range(len(collection_50k)), k)
    clusters_dict = initialize_dict(collection_50k, clusters_dict, center_ids)
    print('dict initialized')
    MAX_ITERATION = 500
    for it in range(MAX_ITERATION):
        if it % 10 == 0:
            print('iteration ', it)
        # BUG FIX: dict.fromkeys(keys, []) shared ONE list across every
        # center, putting each doc in every cluster; build fresh lists.
        tmp_dict = {cnt: [] for cnt in clusters_dict}
        # assign each doc to the center with the highest cosine similarity
        for doc in collection_50k:
            highest_score = -1
            closest_cnt = None
            for cnt in tmp_dict:
                score = word2vec.cos_similarity_emb(doc.embeddings, cnt.embedding)
                if score > highest_score:
                    highest_score = score
                    closest_cnt = cnt
            # explicit guard replaces the original bare except/pass
            if closest_cnt is not None:
                tmp_dict[closest_cnt].append(doc)
        # BUG FIX: centers are mutated in place below, and tmp_dict reuses
        # the very same Center objects as clusters_dict, so the original
        # termination check compared each center with itself. Snapshot the
        # old embedding references before updating.
        previous = [Center(embedding=cnt.embedding) for cnt in tmp_dict]
        # recompute each center as the mean of its members; keep the old
        # center for an empty cluster (original raised ZeroDivisionError)
        for cnt, members in tmp_dict.items():
            if members:
                embeddings = [doc.embeddings for doc in members]
                cnt.embedding = sum(embeddings) / len(embeddings)
        # BUG FIX: adopt the new assignment BEFORE the convergence break,
        # otherwise the final iteration's clustering was discarded
        clusters_dict = tmp_dict
        if termination_condition(list(tmp_dict.keys()), previous):
            break
    print('finish iteration')
    # save model
    with open('kmeans_model_500it.obj', 'wb') as kmeans_file:
        pickle.dump(clusters_dict, kmeans_file)
    print('file saved')
    # return the dictionary: keys are centers, values are docs in that cluster
    return clusters_dict
def search_kmeans(query_embedding, clusters_dict, b=3, z=5):
    """Return the top-z docs closest to the query among the b nearest clusters.

    query_embedding: the query vector.
    clusters_dict: {Center: [doc, ...]} as built by initialize_kmeans.
    b: number of closest clusters to scan.
    z: number of top docs to return (was a hard-coded 5; now a
       backward-compatible parameter).
    Returns {doc: cosine_similarity} for the z best docs, best first.
    """
    # compare query vector with every cluster center (cosine similarity)
    cnt_scores = {cnt: word2vec.cos_similarity_emb(query_embedding, cnt.embedding)
                  for cnt in clusters_dict}
    # BUG FIX: the original iterated indices 1..b, skipping the single
    # closest cluster and indexing out of range when b == len(clusters);
    # take the first b entries instead
    best_centers = sorted(cnt_scores, key=cnt_scores.get, reverse=True)[:b]
    # compare query vector with every doc in the b closest clusters
    similarities = {}
    for cnt in best_centers:
        for doc in clusters_dict[cnt]:
            similarities[doc] = word2vec.cos_similarity_emb(query_embedding, doc.embeddings)
    # return the top "z" docs, highest similarity first
    top_docs = sorted(similarities, key=similarities.get, reverse=True)[:z]
    return {doc: similarities[doc] for doc in top_docs}