-
Notifications
You must be signed in to change notification settings - Fork 0
/
eigencuts.py
292 lines (261 loc) · 9.69 KB
/
eigencuts.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
"""
Copyright 2013 University of Pittsburgh
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""
import argparse
import numpy as np
import scipy.misc
import scipy.linalg as la
import scipy.sparse as sparse
import scipy.sparse.csgraph as csgraph
import scipy.sparse.linalg as sla
import matplotlib.pyplot as plot
import matplotlib.cm as cm
import affinity
def sensitivities(u, l, d, b, A):
"""
Computes half-life sensitivities for the given eigenvector/eigenvalue pair.
Parameters
----------
u : array, shape (N,)
Eigenvector.
l : float
Associated eigenvalue.
d : array, shape (N,)
Diagonal matrix.
b : float
Beta0.
A : array, shape (N, N)
Affinity matrix.
Returns
-------
S : array, shape (N, N)
Matrix of sensitivities.
"""
S = np.zeros(shape = A.shape)
for i in range(0, np.size(u)):
for j in range(0, np.size(u)):
if i == j: continue
if A[i, j] == 0.0: continue
# There is a connection here in the graph.
prefix = np.log(2) / (l * np.log(l) * np.log((l ** b) / 2.0))
firstterm = - ((u[i] / np.sqrt(d[i])) - (u[j] / np.sqrt(d[j]))) ** 2
secondterm = ((u[i] ** 2) / d[i]) + ((u[j] ** 2) / d[j])
S[i, j] = prefix * (firstterm + ((1 - l) * secondterm))
return S
def suppression(S, t, d):
"""
Performs non-maximal suppression. Sensitivities are suppressed (set to 0)
if there is a sensitivity in the same row or column that is more negative.
Parameters
----------
S : array, shape (N, N)
Matrix of sensitivities.
t : float
Tau.
d : float
Delta.
Returns
-------
Ss : array, shape (N, N)
Contains at most N non-suppressed sensitivities.
"""
Ss = np.copy(S)
for i in range(0, np.size(S, axis = 0)):
for j in range(0, np.size(S, axis = 1)):
if np.size(np.where(S[i, :] < S[i, j])) > 0 or np.size(np.where(S[:, j] < S[i, j])) > 0:
Ss[i, j] = (t / d)
return Ss
def update(A, S, t, d):
"""
Parameters
----------
A : array or sparse matrix, shape (N, N)
Original affinity matrix.
S : array, shape (N, N)
Sensitivity matrix.
t : float
Tau.
d : float
Delta.
Returns
-------
Ac : array, shape (N, N)
Trimmed affinity matrix.
n : integer
Number of cuts made to the underlying graph.
"""
Ac = A.copy()
n = 0
for i in range(0, np.size(A, axis = 0)):
for j in range(i, np.size(A, axis = 1)):
if i == j: continue
if S[i, j] >= (t / d): continue
# Cut the affinity matrix (symmetrically), and move the weight
# into the diagonal.
Ac[i, i] += Ac[i, j]
Ac[i, j] = 0
Ac[j, j] += Ac[j, i]
Ac[j, i] = 0
n += 2
return [Ac, n]
def eigencuts(A, k = 5, b = 40.0, e = 0.25, t = -0.2, verbose = True):
"""
Runs the Eigencuts algorithm on the affinity matrix.
Parameters
----------
A : array-like or sparse, shape (N, N)
Pairwise affinity data, representing the graph stucture.
k : integer
Eigenrank of the system, or number of top eigenvectors/eigenvalues to keep.
b : float
Beta_0.
e : float
Epsilon.
t : float
Tau.
verbose : boolean
If True, prints out a few debug statements.
Returns
-------
A : array-like or sparse, shape (N, N)
Processed affinity matrix.
n : integer
Number of iterations performed.
"""
# The diagonal matrix never changes, so let's just compute it now.
# Note: since A is symmetric, summing across the rows vs columns should
# give the same result (since a CSC matrix is optimized for columns).
D = None
if sparse.isspmatrix(A):
D = np.array(A.sum(axis = 0))[0, :]
else:
D = np.sum(A, axis = 1)
Ac = A.copy()
delta = np.median(D)
# Now, start the loop.
numCuts = 1
iterNum = 0
while (numCuts > 0):
# Compute the normalized graph Laplacian.
# Viciously ripped the following four lines from scikit-learn.
# Because this is bloody awesome.
values = None
vectors = None
if sparse.isspmatrix(Ac):
n_nodes = Ac.shape[0]
if not Ac.format == 'coo':
L = Ac.tocoo()
else:
L = Ac.copy()
w = np.sqrt(np.asarray(L.sum(axis = 0)).squeeze())
L.data /= w[L.row]
L.data /= w[L.col]
# Perform the SVD.
out = sla.svds(L, k = k)
vectors = out[0]
values = out[1]
else:
L = Ac.copy()
d = np.sqrt(D)
L /= d
L /= d[:, np.newaxis]
# Perform the SVD.
U, s, Vh = la.svd(L)
values = s[:k]
vectors = U[:, :k]
# For each eigenvector with a half-life beyond the threshold,
# compute sensitivities.
numCuts = 0
for i in range(0, np.size(vectors, axis = 1)):
v = vectors[:, i]
l = values[i]
if l == 1.0: continue # would result in divide-by-zero error
betak = -np.log(2) / np.log(l)
if betak <= (e * b): continue
Sk = sensitivities(v, l, D, b, Ac)
# Comment out the following line to ignore non-maximal suppression.
Sk = suppression(Sk, t, delta)
# Cut edges.
Ac, n = update(Ac, Sk, t, delta)
numCuts += n
iterNum += 1
numEdges = np.size(np.nonzero(Ac)[0]) - Ac.shape[0]
if verbose:
print '%s: %s cuts made, %s edges left.' % (iterNum, numCuts, numEdges)
return [Ac, numCuts]
if __name__ == "__main__":
parser = argparse.ArgumentParser(description = 'Eigencuts Spectral Clustering', \
epilog = 'lol pygencuts', add_help = 'How to use', \
prog = 'python eigencuts.py <options>')
parser.add_argument('-i', '--input', required = True,
help = 'Path to input.')
parser.add_argument('--type', choices = ['img', 'txt', 'aff'], required = True,
help = 'Specifies the type of input: comma-separated instances, or a PNG image.')
# Optional arguments.
parser.add_argument('--mahout_data', default = None,
help = 'Required if type == "aff". Specifies the path to the original data.')
parser.add_argument('-n', '--numdims', type = int, default = 150,
help = "Required if type is 'aff'. Number of points in the dataset.")
parser.add_argument('--eigenrank', type = int, default = 10,
help = 'Number of eigenvectors to use.')
parser.add_argument('-b', '--halflife', type = float, default = 40.0,
help = 'Value for beta0, the half-life threshold. [DEFAULT: 40.0]')
parser.add_argument('-e', '--epsilon', type = float, default = 0.25,
help = 'Multiplier for beta0. [DEFAULT: 0.25]')
parser.add_argument('-t', '--tau', type = float, default = -0.2,
help = 'Clamping thresholding on sensitivities. [DEFAULT: -0.2]')
parser.add_argument('-d', '--distance', type = float, default = 2.0,
help = 'For "txt" data, the neighborhood distance threshold for computing affinities. [DEFAULT: 2.0]')
parser.add_argument('-s', '--sigma', type = float, default = 1.0,
help = 'For "txt" data, the standard deviation used to compute affinities. [DEFAULT: 1.0]')
args = vars(parser.parse_args())
# Create the affinity matrix. This is the only step that is dependent
# on the format of the input.
A = None
image = None
data = None
if args['type'] == 'txt':
# Comma-delimited list, one instance per line.
data = np.loadtxt(args['input'], delimiter = ",")
A = affinity.cartesian_affinities(data, args['distance'], args['sigma'])
elif args['type'] == 'img':
# Image.
image = scipy.misc.imread(args['input'], flatten = True)
A = affinity.image_affinities(image)
else:
# Mahout-style affinity data.
data = np.loadtxt(args['mahout_data'], delimiter = ",")
A = affinity.mahout_affinities(args['input'], args['numdims'])
# Run the algorithm.
Ac, n = eigencuts(A, args['eigenrank'], args['halflife'], args['epsilon'], args['tau'])
# Perform connected component analysis on A.
Abin = None
if sparse.isspmatrix(Ac):
Abin = sparse.csc_matrix(Ac).sign().todense()
else:
Abin = np.sign(Ac)
numConn, connMap = csgraph.connected_components(Abin, directed = False)
print 'Found %s clusters.' % numConn
if args['type'] == 'img':
plot.figure(0)
plot.imshow(np.reshape(connMap, newshape = image.shape), interpolation = 'nearest')
plot.figure(1)
plot.imshow(image, interpolation = 'nearest', cmap = cm.gray)
plot.show()
else:
plot.figure(0)
colormap = cm.get_cmap("jet", numConn)
colorvals = colormap(np.arange(numConn))
colors = [colorvals[connMap[i]] for i in range(0, np.size(connMap))]
for i in range(0, data.shape[0]):
plot.plot(data[i, 0], data[i, 1], marker = 'o', c = colors[i])
plot.show()