# -*- coding: utf-8 -*-
'''
Author : Sunanda Bansal ([email protected])
Year : 2021
'''
# Importing Libraries
import warnings
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering as AHC


def cluster_word_vectors(
        word_vectors,
        clustering_algorithm,
        n_clusters=1500,
        distance_threshold=None,
        linkage="ward",
        random_state=0,
        **clustering_kwargs
    ):
    '''
    Clusters word vectors.

    Parameters
    ----------
    word_vectors : pandas.Series
        Pandas Series with words as the index and vectors (numpy arrays) as the
        corresponding values.
    clustering_algorithm : str
        Specifies which clustering algorithm to use. Accepts "kmeans" for K-Means
        or "ahc" for Agglomerative Hierarchical Clustering (AHC).
    n_clusters : int, optional
        The number of clusters to cluster the word vectors into. The default is 1500.
    distance_threshold : float, optional
        For Agglomerative Hierarchical Clustering (AHC), clusters with a linkage
        distance smaller than this value will be merged. If a value is given for
        this parameter, then `n_clusters` must be set to None. The default is None.
    linkage : str, optional
        The linkage criterion for Agglomerative Hierarchical Clustering (AHC).
        The default is "ward".
    random_state : int, optional
        Seed for the centroid initialization of K-Means. The default is 0.
    **clustering_kwargs
        Additional keyword arguments passed to the clustering class selected by
        `clustering_algorithm`. Refer to the scikit-learn documentation of KMeans
        and AgglomerativeClustering for the accepted arguments.

    Returns
    -------
    labels : list
        A list of cluster labels, one for each word vector.
    '''
    if clustering_algorithm == "kmeans":
        clustering_model = KMeans(
            n_clusters=n_clusters,
            random_state=random_state,
            **clustering_kwargs
        )
    elif clustering_algorithm == "ahc":
        clustering_model = AHC(
            n_clusters=n_clusters,
            distance_threshold=distance_threshold,
            linkage=linkage,
            # The full tree is needed when clusters are cut by distance threshold
            compute_full_tree=True if n_clusters is None else False,
            **clustering_kwargs
        )
    else:
        raise ValueError('Argument clustering_algorithm accepts "kmeans" or "ahc" only.')

    clustering_model = clustering_model.fit(word_vectors)
    labels = clustering_model.labels_

    if len(set(labels)) == 1:
        warnings.warn("Based on the parameters provided, only 1 word cluster was found.")

    return labels
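

# A minimal usage sketch for cluster_word_vectors (illustrative only; the toy
# 2-dimensional vectors and the choice of 2 clusters below are assumptions,
# not values used by this repository):
#
#     >>> import numpy as np
#     >>> toy_vectors = np.array([[1.0, 0.0], [0.9, 0.1], [0.0, 1.0], [0.1, 0.9]])
#     >>> cluster_word_vectors(toy_vectors, "kmeans", n_clusters=2)
#
# Each of the 4 toy word vectors receives one of 2 cluster labels,
# e.g. array([0, 0, 1, 1]).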


def get_document_vectors(
        tokenized_texts,
        words,
        cluster_labels,
        weight_function="cfidf",
        training_tokenized_texts=None,
        normalize=True
    ):
    '''
    Computes the document vectors given word clusters and a weight function.

    CF-IDF
    ======
    Quantities required for the CF-IDF cluster weight computation:

        cf(i,j) = count of every occurrence of any term of cluster i that is
                  present in document j
        cdf(i)  = cluster document frequency of cluster i
                = number of documents in which any term from cluster i appears

    Parameters
    ----------
    tokenized_texts : list of lists of str
        A list of texts, where each text is itself a list of tokens.
    words : list
        The words that were clustered.
    cluster_labels : list
        For each word, this list gives the label of the cluster it belongs to.
    weight_function : str, optional
        The weighting scheme to be used. The available options are "cfidf" and
        "tfidf_sum". The default is "cfidf".
    training_tokenized_texts : list of lists of str, optional
        The training texts used to calculate the document frequencies. If None,
        the document frequencies are calculated from `tokenized_texts`.
        The default is None.
    normalize : bool, optional
        If True, the document vectors are length normalized (L2).
        The default is True.

    Returns
    -------
    wcde_doc_vectors : list of lists
        A list of document vectors, one for each document in `tokenized_texts`.
    '''
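    # Worked illustration of the CF-IDF quantities defined above (toy numbers,
    # assumed for illustration): with N = 4 training documents and a cluster i
    # whose member words appear in 2 of them, cdf(i) = 2, so the inverse weight
    # of cluster i is log(4/2 + 0.01) ~= 0.70. If members of cluster i occur
    # 3 times in document j, then cf(i,j) = 3 and the CF-IDF weight of cluster
    # i in document j is 3 * log(4/2 + 0.01) ~= 2.09.
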
    # If training texts are not provided, compute the document frequencies
    # from the texts being vectorized
    if training_tokenized_texts is None:
        training_tokenized_texts = tokenized_texts

    # Total number of texts (used in tfidf and cfidf weight calculations)
    N = len(training_tokenized_texts)

    vector_size = len(set(cluster_labels))

    # Dictionaries that record the document IDs each term or word cluster appears in
    df = {word: [] for word in words}
    cdf = {cl: [] for cl in range(vector_size)}

    # Maps each word to the label of the cluster it belongs to
    word_clusters = {word: cluster_label for word, cluster_label in zip(words, cluster_labels)}

    # Calculates term-document-frequency and cluster-document-frequency
    # from the [training] data
    for idx, tokenized_text in enumerate(training_tokenized_texts):
        for token in set(tokenized_text):
            # If the token is part of the word clusters (dict membership test;
            # equivalent to `token in words` but faster)
            if token in word_clusters:
                # Record the document ID if it isn't already in the list of documents
                if idx not in df[token]:
                    df[token].append(idx)
                if idx not in cdf[word_clusters[token]]:
                    cdf[word_clusters[token]].append(idx)

    # Map the tokens and word clusters to their corresponding document frequencies
    df = {token: len(df[token]) for token in df}
    cdf = {cluster_label: len(cdf[cluster_label]) for cluster_label in cdf}

    # Calculate inverse document frequencies in preparation for the weighting schemes
    if weight_function == "cfidf":
        # CF-IDF: inverse cluster document frequency of every cluster
        cdf_vector = np.array([cdf[cl] for cl in range(vector_size)])
        icdf_vector = np.log(N / cdf_vector + 0.01)
    elif weight_function == "tfidf_sum":
        # TF-IDF SUM: inverse document frequency of every word
        idf = {token: np.log(N / df[token]) for token in df}
    wcde_doc_vectors = []  # WcDe document vector corresponding to each document
    cf_all = []            # Cluster frequency vectors corresponding to each document

    # Process each document and generate its document vector
    for idx, tokenized_text in enumerate(tokenized_texts):
        # Default vector - the origin
        cluster_weights = [0] * vector_size

        if len(tokenized_text) > 0:
            if weight_function == "cfidf":
                # Get the cluster frequencies -
                #   cf(i,j) = count of every occurrence of any term of
                #             cluster i that is present in document j
                cf_vector = [0] * vector_size
                for token in tokenized_text:
                    if token in word_clusters:
                        cf_vector[word_clusters[token]] += 1
                cf_vector = np.array(cf_vector)
                cf_all.append(cf_vector)

                # Calculate the CF-IDF weight of all the clusters in the document
                cluster_weights = cf_vector * icdf_vector
            elif weight_function == "tfidf_sum":
                # Calculate the weight of each cluster in the document as the
                # sum of the TF-IDF values of its member words
                for token in set(tokenized_text):
                    if token in word_clusters:
                        cluster_weights[word_clusters[token]] += tokenized_text.count(token) * idf[token]

        cluster_weights = np.asarray(cluster_weights)

        # Length (L2) normalize, guarding against the all-zero vector of a
        # document that contains no clustered words
        if normalize:
            norm = np.sqrt(np.sum(np.square(cluster_weights)))
            if norm > 0:
                cluster_weights = cluster_weights / norm

        wcde_doc_vectors.append(list(cluster_weights))

    return wcde_doc_vectors
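

# A minimal end-to-end sketch of the WcDe pipeline (illustrative only; the toy
# corpus, word vectors, and parameter choices below are assumptions, not values
# from this repository):
if __name__ == "__main__":
    toy_words = ["cat", "dog", "car", "bus"]
    toy_word_vectors = np.array([[1.0, 0.0], [0.9, 0.1], [0.0, 1.0], [0.1, 0.9]])
    toy_texts = [["cat", "dog", "cat"], ["car", "bus"], ["dog", "bus"]]

    # Cluster the word vectors, then build CF-IDF weighted document vectors
    toy_labels = cluster_word_vectors(toy_word_vectors, "kmeans", n_clusters=2)
    toy_doc_vectors = get_document_vectors(
        toy_texts, toy_words, toy_labels, weight_function="cfidf"
    )
    print(toy_doc_vectors)  # one L2-normalized 2-dimensional vector per document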