-
Notifications
You must be signed in to change notification settings - Fork 0
/
linkage.py
305 lines (227 loc) · 12 KB
/
linkage.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
import pandas as pd
import numpy as np
def _perform_checks_(current_no_of_clusters, final_no_of_clusters):
'''
Perform checks on the number of clusters.
Parameters:
current_no_of_clusters (int): The current number of clusters.
final_no_of_clusters (int): The final number of clusters.
Returns:
bool: True if the checks are passed, False otherwise.
'''
#number or clusters
if (current_no_of_clusters < final_no_of_clusters):
print("Error: The current number of clusters is less than the final number of clusters.")
print("Returning the data as it is.")
return False
elif (current_no_of_clusters == final_no_of_clusters):
print("Error: The current number of clusters is equal to the final number of clusters.")
print("No further clustering required. Returning the data as it is.")
return False
else:
pass
if (final_no_of_clusters < 1):
print("Error: The final number of clusters is less than 1.")
print("Returning the data as it is.")
return False
return True
def single_linkage(data, final_clusters):
'''
Single linkage clustering algorithm.
The function will combine the nearest data clusters based on the minimum euclidian distance between the clusters,
calculated as the distance between the closest points.
The function will return the updated clustered data.
Parameters:
data (pandas dataframe): The data to be clustered. Last column is the cluster labels.
final_clusters (int): The number of clusters to be formed.
Returns:
data (pandas dataframe): The updated clustered data.
'''
#CHECK: if the last column does not contain int values as cluster labels, then return the data as it is and raise an error
if (data.iloc[:, -1].dtype != 'int64'):
print("Error: The last column does not contain int values as cluster labels.")
print("Returning the data as it is.")
return data
#CHECK: if the cluster labels are not in the range that starts from 1 then return the data as it is and raise an error
if (np.min(data.iloc[:, -1]) < 1):
print("Error: The cluster labels must start from 1.")
print("Returning the data as it is.")
return data
# Calculate the current number of clusters
clusters = np.unique(data.iloc[:, -1])
n_clusters = len(clusters)
#CHECK: For the validity of the number of clusters
if not(_perform_checks_(n_clusters, final_clusters)):
return data
"""Combine the two clusters having the closest nearest points (in terms of Euclidian distance).
Continue combining clusters until the number of clusters is equal to the final number of clusters"""
while (n_clusters > final_clusters):
# Find the closest points between the clusters
min_dist = np.inf #initialize the minimum distance to infinity
for i in range(n_clusters):
for j in range(i+1, n_clusters):
# Find the minimum distance between the clusters
dist = np.min(np.linalg.norm(data[data.iloc[:, -1] == clusters[i]].iloc[:, :-1].values[:, np.newaxis] - data[data.iloc[:, -1] == clusters[j]].iloc[:, :-1].values, axis=2))
if (dist < min_dist):
min_dist = dist
cluster1 = clusters[i]
cluster2 = clusters[j]
# Combine the two clusters having the closest nearest points
data.iloc[data.iloc[:, -1] == cluster2, -1] = cluster1
# Remove the second cluster from the list of clusters
clusters = np.unique(data.iloc[:, -1])
n_clusters = len(clusters)
#Renaming the clusters from 1 to n_clusters
for i in range(n_clusters):
data.iloc[data.iloc[:, -1] == clusters[i], -1] = i+1
return data
def complete_linkage(data, final_clusters):
'''
Complete linkage clustering algorithm.
The function will combine the nearest data clusters based on the maximum euclidian distance between the clusters,
calculated as the distance between the farthest points.
The function will return the updated clustered data.
Parameters:
data (pandas dataframe): The data to be clustered. Last column is the cluster labels.
final_clusters (int): The number of clusters to be formed.
Returns:
data (pandas dataframe): The updated clustered data.
'''
#CHECK: if the last column does not contain int values as cluster labels, then return the data as it is and raise an error
if (data.iloc[:, -1].dtype != 'int64'):
print("Error: The last column does not contain int values as cluster labels.")
print("Returning the data as it is.")
return data
#CHECK: if the cluster labels are not in the range that starts from 1 then return the data as it is and raise an error
if (np.min(data.iloc[:, -1]) < 1):
print("Error: The cluster labels must start from 1.")
print("Returning the data as it is.")
return data
# Calculate the current number of clusters
clusters = np.unique(data.iloc[:, -1])
n_clusters = len(clusters)
#CHECK: For the validity of the number of clusters
if not(_perform_checks_(n_clusters, final_clusters)):
return data
"""Combine the two clusters having the farthest nearest points (in terms of Euclidian distance).
Continue combining clusters until the number of clusters is equal to the final number of clusters"""
while (n_clusters > final_clusters):
# Find the farthest points between the clusters
max_dist = -np.inf #initialize the maximum distance to -infinity
for i in range(n_clusters):
for j in range(i+1, n_clusters):
# Find the maximum distance between the clusters
dist = np.max(np.linalg.norm(data[data.iloc[:, -1] == clusters[i]].iloc[:, :-1].values[:, np.newaxis] - data[data.iloc[:, -1] == clusters[j]].iloc[:, :-1].values, axis=2))
if (dist > max_dist):
max_dist = dist
cluster1 = clusters[i]
cluster2 = clusters[j]
# Combine the two clusters having the farthest nearest points
data.iloc[data.iloc[:, -1] == cluster2, -1] = cluster1
# Remove the second cluster from the list of clusters
clusters = np.unique(data.iloc[:, -1])
n_clusters = len(clusters)
#Renaming the clusters from 1 to n_clusters
for i in range(n_clusters):
data.iloc[data.iloc[:, -1] == clusters[i], -1] = i+1
return data
def average_linkage(data, final_clusters):
'''
Average linkage clustering algorithm.
The function will combine the nearest data clusters based on the average euclidian distance between the clusters,
calculated as the average distance between all the points.
The function will return the updated clustered data.
Parameters:
data (pandas dataframe): The data to be clustered. Last column is the cluster labels.
final_clusters (int): The number of clusters to be formed.
Returns:
data (pandas dataframe): The updated clustered data.
'''
#CHECK: if the last column does not contain int values as cluster labels, then return the data as it is and raise an error
if (data.iloc[:, -1].dtype != 'int64'):
print("Error: The last column does not contain int values as cluster labels.")
print("Returning the data as it is.")
return data
#CHECK: if the cluster labels are not in the range that starts from 1 then return the data as it is and raise an error
if (np.min(data.iloc[:, -1]) < 1):
print("Error: The cluster labels must start from 1.")
print("Returning the data as it is.")
return data
# Calculate the current number of clusters
clusters = np.unique(data.iloc[:, -1])
n_clusters = len(clusters)
#CHECK: For the validity of the number of clusters
if not(_perform_checks_(n_clusters, final_clusters)):
return data
"""Combine the two clusters having the smallest average distance between the points.
Continue combining clusters until the number of clusters is equal to the final number of clusters"""
while (n_clusters > final_clusters):
# Find the average distance between the points of the clusters
min_dist = np.inf
for i in range(n_clusters):
for j in range(i+1, n_clusters):
# Find the average distance between the points of the clusters
dist = np.mean(np.linalg.norm(data[data.iloc[:, -1] == clusters[i]].iloc[:, :-1].values[:, np.newaxis] - data[data.iloc[:, -1] == clusters[j]].iloc[:, :-1].values, axis=2))
if (dist < min_dist):
min_dist = dist
cluster1 = clusters[i]
cluster2 = clusters[j]
# Combine the two clusters having the smallest average distance between the points
data.iloc[data.iloc[:, -1] == cluster2, -1] = cluster1
# Remove the second cluster from the list of clusters
clusters = np.unique(data.iloc[:, -1])
n_clusters = len(clusters)
#Renaming the clusters from 1 to n_clusters
for i in range(n_clusters):
data.iloc[data.iloc[:, -1] == clusters[i], -1] = i+1
return data
def centroid_linkage(data, final_clusters):
'''
Centroid linkage clustering algorithm.
The function will combine the nearest data clusters based on the average euclidian distance between the clusters,
calculated as the distance between the centroids of the clusters.
The function will return the updated clustered data.
Parameters:
data (pandas dataframe): The data to be clustered. Last column is the cluster labels.
final_clusters (int): The number of clusters to be formed.
Returns:
data (pandas dataframe): The updated clustered data.
'''
#CHECK: if the last column does not contain int values as cluster labels, then return the data as it is and raise an error
if (data.iloc[:, -1].dtype != 'int64'):
print("Error: The last column does not contain int values as cluster labels.")
print("Returning the data as it is.")
return data
#CHECK: if the cluster labels are not in the range that starts from 1 then return the data as it is and raise an error
if (np.min(data.iloc[:, -1]) < 1):
print("Error: The cluster labels must start from 1.")
print("Returning the data as it is.")
return data
# Calculate the current number of clusters
clusters = np.unique(data.iloc[:, -1])
n_clusters = len(clusters)
#CHECK: For the validity of the number of clusters
if not(_perform_checks_(n_clusters, final_clusters)):
return data
"""Combine the two clusters having the closest nearest points (in terms of Euclidian distance).
Continue combining clusters until the number of clusters is equal to the final number of clusters"""
while (n_clusters > final_clusters):
# Find the closest points between the clusters
min_dist = np.inf #initialize the minimum distance to infinity
for i in range(n_clusters):
for j in range(i+1, n_clusters):
# Find the distance between the centroids of the clusters
dist = np.linalg.norm(np.mean(data[data.iloc[:, -1] == clusters[i]].iloc[:, :-1].values, axis=0) - np.mean(data[data.iloc[:, -1] == clusters[j]].iloc[:, :-1].values, axis=0))
if (dist < min_dist):
min_dist = dist
cluster1 = clusters[i]
cluster2 = clusters[j]
# Combine the two clusters having
data.iloc[data.iloc[:, -1] == cluster2, -1] = cluster1
# Remove the second cluster from the list of clusters
clusters = np.unique(data.iloc[:, -1])
n_clusters = len(clusters)
#Renaming the clusters from 1 to n_clusters
for i in range(n_clusters):
data.iloc[data.iloc[:, -1] == clusters[i], -1] = i+1
return data