#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Functions for bandwidth selection.
"""
import warnings

import numpy as np
import scipy
from scipy import fftpack
from scipy.optimize import brentq

from KDEpy.binning import linear_binning
from KDEpy.utils import autogrid

# Choose the largest available float on the system
try:
    FLOAT = scipy.float128
except AttributeError:
    FLOAT = np.float64


def _fixed_point(t, N, I_sq, a2):
    r"""
    Compute the fixed point as described in the paper by Botev et al.

    .. math::

        t = \xi \gamma^{5}(t)

    Parameters
    ----------
    t : float
        Initial guess.
    N : int
        Number of data points.
    I_sq : array-like
        The numbers [1, 4, 9, 16, ...]
    a2 : array-like
        The DCT of the original data, divided by 2 and squared.

    Examples
    --------
    >>> # From the matlab code
    >>> ans = _fixed_point(0.01, 50, np.arange(1, 51), np.arange(1, 51))
    >>> assert np.allclose(ans, 0.0099076220293967618515)
    >>> # another
    >>> ans = _fixed_point(0.07, 25, np.arange(1, 11), np.arange(1, 11))
    >>> assert np.allclose(ans, 0.068342291525717486795)

    References
    ----------
    - Implementation by Daniel B. Smith, PhD, found at
      https://github.com/Daniel-B-Smith/KDE-for-SciPy/blob/master/kde.py
    """
    # Converting to the largest available float is important,
    # as the powers might overflow if this is not done
    I_sq = np.asarray(I_sq, dtype=FLOAT)
    a2 = np.asarray(a2, dtype=FLOAT)

    # ell = 7 corresponds to the 5 steps recommended in the paper
    ell = 7

    # Fast evaluation of |f^l|^2 using the DCT, see Plancherel theorem
    f = 0.5 * np.pi ** (2 * ell) * np.sum(np.power(I_sq, ell) * a2 * np.exp(-I_sq * np.pi**2 * t))

    # The norm of a function should never be negative, so bail out if it is
    if f <= 0:
        return -1

    for s in reversed(range(2, ell)):
        # This could also be formulated using the double factorial n!!,
        # but this is faster and requires one less import

        # Step one: estimate t_s from |f^(s+1)|^2
        odd_numbers_prod = np.prod(np.arange(1, 2 * s + 1, 2, dtype=FLOAT))
        K0 = odd_numbers_prod / np.sqrt(2 * np.pi)
        const = (1 + (1 / 2) ** (s + 1 / 2)) / 3
        time = np.power((2 * const * K0 / (N * f)), (2.0 / (3.0 + 2.0 * s)))

        # Step two: estimate |f^s| from t_s
        f = 0.5 * np.pi ** (2 * s) * np.sum(np.power(I_sq, s) * a2 * np.exp(-I_sq * np.pi**2 * time))

    # This is the minimizer of the AMISE
    t_opt = np.power(2 * N * np.sqrt(np.pi) * f, -2.0 / 5)

    # Return the difference between the original t and the optimal value
    return t - t_opt


def _root(function, N, args):
    """
    Root finding algorithm. Based on the MATLAB implementation by Botev et al.

    >>> # From the matlab code
    >>> ints = np.arange(1, 51)
    >>> ans = _root(_fixed_point, N=50, args=(50, ints, ints))
    >>> np.allclose(ans, 9.237610787616029e-05)
    True
    """
    # From the implementation by Botev, the original paper author
    # Rule of thumb for obtaining a feasible solution
    N = max(min(1050, N), 50)
    tol = 10e-12 + 0.01 * (N - 50) / 1000

    # While a solution is not found, increase the tolerance and try again
    found = 0
    while found == 0:
        try:
            # Other viable solvers include: [brentq, brenth, ridder, bisect]
            x, res = brentq(function, 0, tol, args=args, full_output=True, disp=False)
            found = 1 if res.converged else 0
        except ValueError:
            x = 0
            found = 0
        if x <= 0:
            found = 0
            # Double the tolerance so the same bracket is not retried forever
            tol *= 2.0

        # If the tolerance grows too large, give up and return a fixed guess
        if tol >= 1:
            # raise ValueError("Root finding did not converge. Need more data.")
            x = 0.01
            return x

    if not x > 0:
        raise ValueError("Root finding failed to find positive solution.")
    return x
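

# Not part of the original module: a minimal sketch of the alternative hinted
# at above, namely minimizing |_fixed_point| directly when bracketing keeps
# failing. The bounds (0, 0.1) are an illustrative assumption, not a value
# from Botev et al., and nothing below calls this helper.
def _root_by_minimization(function, args, bounds=(0.0, 0.1)):
    from scipy.optimize import minimize_scalar

    # A root of `function` is a global minimum of its absolute value
    res = minimize_scalar(lambda t: abs(function(t, *args)), bounds=bounds, method="bounded")
    return res.x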


def improved_sheather_jones(data, weights=None):
    """
    The Improved Sheather-Jones (ISJ) algorithm from the paper by Botev et al.
    This algorithm computes the optimal bandwidth for a Gaussian kernel,
    and works very well for bimodal data (unlike other rules). The
    disadvantage of this algorithm is longer computation time, and the fact
    that this implementation does not always converge if very few data
    points are supplied.

    Understanding this algorithm is difficult, see:
    https://books.google.no/books?id=Trj9HQ7G8TUC&pg=PA328&lpg=PA328&dq=
    sheather+jones+why+use+dct&source=bl&ots=1ETdKd_6EF&sig=jZk4R515GB1xsn-
    VZVnjr-JfjSI&hl=en&sa=X&ved=2ahUKEwi1_czNncTcAhVGhqYKHaPiBtcQ6AEwA3oEC
    AcQAQ#v=onepage&q=sheather%20jones%20why%20use%20dct&f=false

    Parameters
    ----------
    data : array-like
        The data points. Data must have shape (obs, 1).
    weights : array-like, optional
        One weight per data point. Must have shape (obs,). If None is
        passed, uniform weights are used.
    """
    obs, dims = data.shape
    if not dims == 1:
        raise ValueError("ISJ is only available for 1D data.")

    n = 2**10

    # weights <= 0 still affect calculations unless we remove them
    if weights is not None:
        data = data[weights > 0]
        weights = weights[weights > 0]

    # Create an equidistant grid. Larger boundary values decrease the
    # chance of overflow later on.
    xmesh = autogrid(data, boundary_abs=6, num_points=n, boundary_rel=0.5)
    data = data.ravel()
    xmesh = xmesh.ravel()

    # The range of the data and the number of unique data points
    R = np.max(data) - np.min(data)
    # dx = R / (n - 1)
    N = len(np.unique(data))

    # Use linear binning to bin the data on an equidistant grid, this is a
    # prerequisite for using the FFT (evenly spaced samples)
    initial_data = linear_binning(data.reshape(-1, 1), xmesh, weights)
    assert np.allclose(initial_data.sum(), 1)

    # Compute the type 2 Discrete Cosine Transform (DCT) of the data
    a = fftpack.dct(initial_data)

    # Compute the bandwidth
    # The definition of a2 used here and in `_fixed_point` corresponds to
    # the one cited in this issue:
    # https://github.com/tommyod/KDEpy/issues/95
    I_sq = np.power(np.arange(1, n, dtype=FLOAT), 2)
    a2 = a[1:] ** 2

    # Solve for the optimal (in the AMISE sense) t
    t_star = _root(_fixed_point, N, args=(N, I_sq, a2))

    # The remainder of the algorithm computes the actual density
    # estimate, but this function is only used to compute the
    # bandwidth, since the bandwidth may be used for other kernels
    # apart from the Gaussian kernel

    # Smooth the initial data using the computed optimal t
    # Multiplication in frequency domain is convolution
    # integers = np.arange(n, dtype=float)
    # a_t = a * np.exp(-integers**2 * np.pi ** 2 * t_star / 2)

    # Dividing by 2 is done because of the implementation of fftpack.idct
    # density = fftpack.idct(a_t) / (2 * R)

    # Due to overflow, some values might be smaller than zero, correct it
    # density[density < 0] = 0.

    bandwidth = np.sqrt(t_star) * R
    return bandwidth
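

# Not part of the original module: a minimal usage sketch. The seed, sample
# sizes and component locations below are illustrative assumptions.
def _example_isj_bimodal():
    rng = np.random.default_rng(42)
    # Bimodal data, where ISJ tends to do well and the normal-reference
    # rules (Scott, Silverman) tend to oversmooth
    samples = np.concatenate((rng.normal(-4, 1, 500), rng.normal(4, 1, 500)))
    return improved_sheather_jones(samples.reshape(-1, 1))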


def scotts_rule(data, weights=None):
    """
    Scott's rule.

    Scott (1992, page 152)
    Scott, D.W. (1992) Multivariate Density Estimation. Theory, Practice and
    Visualization. New York: Wiley.

    Examples
    --------
    >>> data = np.arange(9).reshape(-1, 1)
    >>> ans = scotts_rule(data)
    >>> assert np.allclose(ans, 1.76474568962182)
    """
    if not len(data.shape) == 2:
        raise ValueError("Data must be of shape (obs, dims).")
    if weights is not None:
        warnings.warn("Scott's rule currently ignores all weights")

    obs, dims = data.shape
    if not dims == 1:
        raise ValueError("Scott's rule is only available for 1D data.")

    # Use the smaller of the standard deviation and the normalized IQR
    # as a robust estimate of the scale
    sigma = np.std(data, ddof=1)
    # scipy.stats.norm.ppf(.75) - scipy.stats.norm.ppf(.25) -> 1.3489795003921634
    IQR = (np.percentile(data, q=75) - np.percentile(data, q=25)) / 1.3489795003921634
    sigma = min(sigma, IQR)
    return sigma * np.power(obs, -1.0 / (dims + 4))


def silvermans_rule(data, weights=None):
    """
    Returns optimal smoothing (standard deviation) if the data is close to
    normal.

    TODO: Extend to multidimensional:
    https://docs.scipy.org/doc/scipy-0.13.0/reference/generated/scipy.
    stats.gaussian_kde.html#r216

    Examples
    --------
    >>> data = np.arange(9).reshape(-1, 1)
    >>> ans = silvermans_rule(data)
    >>> assert np.allclose(ans, 1.8692607078355594)
    """
    if not len(data.shape) == 2:
        raise ValueError("Data must be of shape (obs, dims).")

    obs, dims = data.shape
    if not dims == 1:
        raise ValueError("Silverman's rule is only available for 1D data.")
    if weights is not None:
        warnings.warn("Silverman's rule currently ignores all weights")

    if obs == 1:
        return 1
    if obs < 1:
        raise ValueError("Data must be of length > 0.")

    sigma = np.std(data, ddof=1)
    # scipy.stats.norm.ppf(.75) - scipy.stats.norm.ppf(.25) -> 1.3489795003921634
    IQR = (np.percentile(data, q=75) - np.percentile(data, q=25)) / 1.3489795003921634
    sigma = min(sigma, IQR)

    # The logic below is not part of Silverman's rule, but if the data is
    # (nearly) constant it is nicer to return a fallback value, with a
    # warning, than to raise an error.
    if sigma > 0:
        return sigma * (obs * 3 / 4.0) ** (-1 / 5)
    else:
        # scipy.stats.norm.ppf(.99) - scipy.stats.norm.ppf(.01) = 4.6526957480816815
        IQR = (np.percentile(data, q=99) - np.percentile(data, q=1)) / 4.6526957480816815
        if IQR > 0:
            bw = IQR * (obs * 3 / 4.0) ** (-1 / 5)
            warnings.warn("Silverman's rule failed. Too many identical values. Setting bw = {}".format(bw))
            return bw

        # Here, all values are basically constant
        warnings.warn("Silverman's rule failed. Too many identical values. Setting bw = 1.0")
        return 1.0


_bw_methods = {
    "silverman": silvermans_rule,
    "scott": scotts_rule,
    "ISJ": improved_sheather_jones,
}
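

if __name__ == "__main__":
    # Quick comparison of the three rules on the same data set; the sample
    # below is an illustrative assumption, not part of the original module.
    rng = np.random.default_rng(0)
    sample = rng.normal(size=(200, 1))
    for name, method in _bw_methods.items():
        print("{:>9}: {:.4f}".format(name, method(sample)))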