-
Notifications
You must be signed in to change notification settings - Fork 0
/
kmean.py
146 lines (125 loc) · 5.63 KB
/
kmean.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
import numpy as np
def wkmean(k, data, weights=None, metric='Euclidian',
           MAX_ITERATIONS=1000, method='k++'):
    """
    Weighted k-means clustering (Lloyd iterations).

    Parameters
    ----------
    k : int
        Number of clusters.
    data : array_like, shape (n_points, n_dims)
        Data points to cluster.
    weights : array_like, shape (n_points,), optional
        Per-point weights. If omitted, all points get equal weight.
        Weights are normalized to sum to 1 in both cases, so the
        returned populations are always relative.
    metric : str or callable
        'Euclidian' selects the squared Euclidean metric; otherwise a
        callable mapping an (n_points, n_dims) array of differences to
        an (n_points,) array of distances.
    MAX_ITERATIONS : int
        Upper bound on update iterations.
    method : str
        Centroid initialization method:
        'k++' or 'k++_pdf' : kmeans++ with PDF-proportional sampling
        'k++_max'          : kmeans++ choosing the farthest point
        'uniform'          : uniform random init in [-0.5, 0.5]^dim

    Returns
    -------
    centroids : ndarray, shape (k, n_dims)
        Final centroid positions.
    populations : ndarray, shape (k,)
        Relative weighted population of each cluster (sums to 1).
    error : float
        Weighted sum of distances of points to their assigned centroids.

    Raises
    ------
    NotImplementedError
        If `method` is not one of the supported names.
    """
    def euclidian_metric(coords):
        # Squared Euclidean norm of each row.
        return (coords**2).sum(axis=1)

    def kmeans_plus_plus_initialization(X, k, pdf_method=True):
        """kmeans++ seeding: pick the first center uniformly at random;
        pick each subsequent center according to the squared distance of
        every point to its nearest already-chosen center (sampled from
        that distribution, or argmax when pdf_method is False)."""
        def distance_to_centroids(points, centers):
            # (n_points, n_centers) matrix of squared distances.
            return np.sum((np.array(centers) - points[:, None, :])**2,
                          axis=2)
        X = np.array(X)
        # First center: uniform random data point.
        centers = [X[np.random.choice(X.shape[0])].tolist()]
        for _ in range(k - 1):
            distance = distance_to_centroids(X, np.array(centers))
            # Distance of each point to its nearest chosen center.
            # (Previously the first extra center was always PDF-sampled
            # even for the 'max' strategy; this unifies both paths.)
            dist_min = np.min(distance, axis=1)
            if pdf_method:
                pdf = dist_min / np.sum(dist_min)
                centroid_new = X[np.random.choice(X.shape[0], p=pdf)]
            else:
                centroid_new = X[np.argmax(dist_min)]
            centers.append(centroid_new.tolist())
        return np.array(centers)

    def random_coord_initialization(points, k):
        # Uniform random centroids in [-0.5, 0.5]^dim.
        # (Previously read points[1] for the dimension, which crashed on
        # single-point data; use the shape instead.)
        dim = points.shape[1]
        return np.random.rand(k, dim) - 0.5

    def should_stop(old_centroids, centroids, iterations):
        # Stop at the iteration cap or when centroids no longer move.
        if iterations > MAX_ITERATIONS:
            return True
        if old_centroids is None:
            return False  # first iteration: nothing to compare against
        return np.array_equal(old_centroids, centroids)

    def assign_centroids(points, centroids):
        # Index of the closest centroid for every data point.
        distances = np.array([metric(points - centroids[ii])
                              for ii in range(k)])
        return distances.argmin(axis=0)

    def update_centroids(points, weights, labels):
        # Weighted membership matrix: row jj holds each point's weight
        # if the point belongs to cluster jj, else 0.
        neighborhood = np.array([labels == jj for jj in range(k)]) * weights
        # Unnormalized weighted coordinate sums per cluster.
        centroids = neighborhood.dot(points)
        populations = np.zeros(k)
        error = 0.0
        for jj in range(k):
            s = neighborhood[jj].sum()
            if s != 0:
                centroids[jj] = centroids[jj] / s
            # NOTE: an empty cluster keeps a zero-vector centroid.
            distances = metric(points - centroids[jj])
            error += distances.dot(neighborhood[jj])
            populations[jj] = s
        return np.array(centroids), populations, error

    data = np.array(data)
    if weights is None:
        # Equal weights, normalized so populations are relative (the
        # user-supplied branch already normalizes; keep both consistent).
        weights = np.full(len(data), 1.0 / len(data))
    else:
        weights = np.array(weights)
        weights = weights / sum(weights)
    if metric == 'Euclidian':
        metric = euclidian_metric
    if method == 'uniform':
        initialize_centroids = random_coord_initialization
    elif method == 'k++_max':
        def initialize_centroids(points, k):
            return kmeans_plus_plus_initialization(points, k,
                                                   pdf_method=False)
    elif method in ('k++', 'k++_pdf'):
        # 'k++' is the documented default; previously only 'k++_pdf'
        # was recognized, so the default call raised NotImplementedError.
        def initialize_centroids(points, k):
            return kmeans_plus_plus_initialization(points, k,
                                                   pdf_method=True)
    else:
        raise NotImplementedError()
    # Initialize clusters and book-keeping.
    centroids = initialize_centroids(data, k)
    populations = np.zeros(k)
    error = float('inf')
    iterations = 0
    old_centroids = None
    # Lloyd iterations: assign points, then update centroids, until the
    # centroids stop moving or the iteration cap is reached.
    while not should_stop(old_centroids, centroids, iterations):
        old_centroids = centroids
        iterations += 1
        labels = assign_centroids(data, centroids)
        centroids, populations, error = update_centroids(data,
                                                         weights, labels)
    return np.array(centroids), np.array(populations), error