
Commit 44ef1ef

rhettinger authored and SonicField committed
gh-115532: Add kde_random() to the statistics module (python#118210)
1 parent 1493daa commit 44ef1ef

File tree

4 files changed, +207 −63 lines changed


Doc/library/statistics.rst

+25 −59
@@ -77,6 +77,7 @@ or sample.
 :func:`geometric_mean` Geometric mean of data.
 :func:`harmonic_mean` Harmonic mean of data.
 :func:`kde` Estimate the probability density distribution of the data.
+:func:`kde_random` Random sampling from the PDF generated by kde().
 :func:`median` Median (middle value) of data.
 :func:`median_low` Low median of data.
 :func:`median_high` High median of data.
@@ -311,6 +312,30 @@ However, for reading convenience, most of the examples show sorted sequences.
    .. versionadded:: 3.13
 
 
+.. function:: kde_random(data, h, kernel='normal', *, seed=None)
+
+   Return a function that makes a random selection from the estimated
+   probability density function produced by ``kde(data, h, kernel)``.
+
+   Providing a *seed* allows reproducible selections. In the future, the
+   values may change slightly as more accurate kernel inverse CDF estimates
+   are implemented. The seed may be an integer, float, str, or bytes.
+
+   A :exc:`StatisticsError` will be raised if the *data* sequence is empty.
+
+   Continuing the example for :func:`kde`, we can use
+   :func:`kde_random` to generate new random selections from an
+   estimated probability density function:
+
+   >>> data = [-2.1, -1.3, -0.4, 1.9, 5.1, 6.2]
+   >>> rand = kde_random(data, h=1.5, seed=8675309)
+   >>> new_selections = [rand() for i in range(10)]
+   >>> [round(x, 1) for x in new_selections]
+   [0.7, 6.2, 1.2, 6.9, 7.0, 1.8, 2.5, -0.5, -1.8, 5.6]
+
+   .. versionadded:: 3.13
+
+
 .. function:: median(data)
 
    Return the median (middle value) of numeric data, using the common "mean of
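The reproducibility behavior described in the new docstring is easy to exercise. Here is a minimal sketch, assuming Python 3.13 where statistics.kde_random is available (the variable names are illustrative only):

    from statistics import kde_random

    data = [-2.1, -1.3, -0.4, 1.9, 5.1, 6.2]

    # Two selectors built with the same seed make identical selections,
    # so a simulation can be replayed exactly.
    rand_a = kde_random(data, h=1.5, seed=8675309)
    rand_b = kde_random(data, h=1.5, seed=8675309)
    assert [rand_a() for _ in range(5)] == [rand_b() for _ in range(5)]

    # Omitting *seed* gives an independently seeded selector each time.
    rand_c = kde_random(data, h=1.5)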
@@ -1148,65 +1173,6 @@ The final prediction goes to the largest posterior. This is known as the
    'female'
 
 
-Sampling from kernel density estimation
-***************************************
-
-The :func:`kde()` function creates a continuous probability density
-function from discrete samples. Some applications need a way to make
-random selections from that distribution.
-
-The technique is to pick a sample from a bandwidth scaled kernel
-function and recenter the result around a randomly chosen point from
-the input data. This can be done with any kernel that has a known or
-accurately approximated inverse cumulative distribution function.
-
-.. testcode::
-
-   from random import choice, random, seed
-   from math import sqrt, log, pi, tan, asin, cos, acos
-   from statistics import NormalDist
-
-   kernel_invcdfs = {
-       'normal': NormalDist().inv_cdf,
-       'logistic': lambda p: log(p / (1 - p)),
-       'sigmoid': lambda p: log(tan(p * pi/2)),
-       'rectangular': lambda p: 2*p - 1,
-       'triangular': lambda p: sqrt(2*p) - 1 if p < 0.5 else 1 - sqrt(2 - 2*p),
-       'parabolic': lambda p: 2 * cos((acos(2*p-1) + pi) / 3),
-       'cosine': lambda p: 2*asin(2*p - 1)/pi,
-   }
-
-   def kde_random(data, h, kernel='normal'):
-       'Return a function that samples from kde() smoothed data.'
-       kernel_invcdf = kernel_invcdfs[kernel]
-       def rand():
-           return h * kernel_invcdf(random()) + choice(data)
-       return rand
-
-For example:
-
-.. doctest::
-
-   >>> discrete_samples = [-2.1, -1.3, -0.4, 1.9, 5.1, 6.2]
-   >>> rand = kde_random(discrete_samples, h=1.5)
-   >>> seed(8675309)
-   >>> selections = [rand() for i in range(10)]
-   >>> [round(x, 1) for x in selections]
-   [4.7, 7.4, 1.2, 7.8, 6.9, -1.3, 5.8, 0.2, -1.4, 5.7]
-
-.. testcode::
-   :hide:
-
-   from statistics import kde
-   from math import isclose
-
-   # Verify that cdf / invcdf will round trip
-   xarr = [i/100 for i in range(-100, 101)]
-   for kernel, invcdf in kernel_invcdfs.items():
-       cdf = kde([0.0], h=1.0, kernel=kernel, cumulative=True)
-       for x in xarr:
-           assert isclose(invcdf(cdf(x)), x, abs_tol=1E-9)
-
 ..
    # This modelines must appear within the last ten lines of the file.
    kate: indent-width 3; remove-trailing-space on; replace-tabs on; encoding utf-8;

Doc/whatsnew/3.13.rst

+2 −1
@@ -745,7 +745,8 @@ statistics
 
 * Add :func:`statistics.kde` for kernel density estimation.
   This makes it possible to estimate a continuous probability density function
-  from a fixed number of discrete samples.
+  from a fixed number of discrete samples. Also added :func:`statistics.kde_random`
+  for sampling from the estimated probability density function.
   (Contributed by Raymond Hettinger in :gh:`115863`.)
 
 .. _whatsnew313-subprocess:
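A rough sketch of how the two additions described in this entry work together, assuming Python 3.13 (the variable names are placeholders, not verified output):

    from statistics import kde, kde_random

    data = [-2.1, -1.3, -0.4, 1.9, 5.1, 6.2]

    # Estimate a continuous probability density function from the samples ...
    f_hat = kde(data, h=1.5)
    density_at_zero = f_hat(0.0)

    # ... then draw new points from that same estimated distribution.
    rand = kde_random(data, h=1.5, seed=8675309)
    new_points = [rand() for _ in range(1_000)]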

Lib/statistics.py

+100 −3
@@ -113,6 +113,7 @@
     'geometric_mean',
     'harmonic_mean',
     'kde',
+    'kde_random',
     'linear_regression',
     'mean',
     'median',
@@ -138,12 +139,13 @@
 from itertools import count, groupby, repeat
 from bisect import bisect_left, bisect_right
 from math import hypot, sqrt, fabs, exp, erf, tau, log, fsum, sumprod
-from math import isfinite, isinf, pi, cos, sin, cosh, atan
+from math import isfinite, isinf, pi, cos, sin, tan, cosh, asin, atan, acos
 from functools import reduce
 from operator import itemgetter
 from collections import Counter, namedtuple, defaultdict
 
 _SQRT2 = sqrt(2.0)
+_random = random
 
 # === Exceptions ===
 
@@ -978,11 +980,9 @@ def pdf(x):
             return sum(K((x - x_i) / h) for x_i in data) / (n * h)
 
         def cdf(x):
-
             n = len(data)
             return sum(W((x - x_i) / h) for x_i in data) / n
 
-
     else:
 
         sample = sorted(data)
@@ -1078,6 +1078,7 @@ def quantiles(data, *, n=4, method='exclusive'):
         if ld == 1:
             return data * (n - 1)
         raise StatisticsError('must have at least one data point')
+
     if method == 'inclusive':
         m = ld - 1
         result = []
@@ -1086,6 +1087,7 @@ def quantiles(data, *, n=4, method='exclusive'):
             interpolated = (data[j] * (n - delta) + data[j + 1] * delta) / n
             result.append(interpolated)
         return result
+
     if method == 'exclusive':
         m = ld + 1
         result = []
@@ -1096,6 +1098,7 @@ def quantiles(data, *, n=4, method='exclusive'):
             interpolated = (data[j - 1] * (n - delta) + data[j] * delta) / n
             result.append(interpolated)
         return result
+
     raise ValueError(f'Unknown method: {method!r}')
 
 
@@ -1709,3 +1712,97 @@ def __getstate__(self):
 
     def __setstate__(self, state):
         self._mu, self._sigma = state
+
+
+## kde_random() ##############################################################
+
+def _newton_raphson(f_inv_estimate, f, f_prime, tolerance=1e-12):
+    def f_inv(y):
+        "Return x such that f(x) ≈ y within the specified tolerance."
+        x = f_inv_estimate(y)
+        while abs(diff := f(x) - y) > tolerance:
+            x -= diff / f_prime(x)
+        return x
+    return f_inv
+
+def _quartic_invcdf_estimate(p):
+    sign, p = (1.0, p) if p <= 1/2 else (-1.0, 1.0 - p)
+    x = (2.0 * p) ** 0.4258865685331 - 1.0
+    if p >= 0.004 < 0.499:
+        x += 0.026818732 * sin(7.101753784 * p + 2.73230839482953)
+    return x * sign
+
+_quartic_invcdf = _newton_raphson(
+    f_inv_estimate = _quartic_invcdf_estimate,
+    f = lambda t: 3/16 * t**5 - 5/8 * t**3 + 15/16 * t + 1/2,
+    f_prime = lambda t: 15/16 * (1.0 - t * t) ** 2)
+
+def _triweight_invcdf_estimate(p):
+    sign, p = (1.0, p) if p <= 1/2 else (-1.0, 1.0 - p)
+    x = (2.0 * p) ** 0.3400218741872791 - 1.0
+    return x * sign
+
+_triweight_invcdf = _newton_raphson(
+    f_inv_estimate = _triweight_invcdf_estimate,
+    f = lambda t: 35/32 * (-1/7*t**7 + 3/5*t**5 - t**3 + t) + 1/2,
+    f_prime = lambda t: 35/32 * (1.0 - t * t) ** 3)
+
+_kernel_invcdfs = {
+    'normal': NormalDist().inv_cdf,
+    'logistic': lambda p: log(p / (1 - p)),
+    'sigmoid': lambda p: log(tan(p * pi/2)),
+    'rectangular': lambda p: 2*p - 1,
+    'parabolic': lambda p: 2 * cos((acos(2*p-1) + pi) / 3),
+    'quartic': _quartic_invcdf,
+    'triweight': _triweight_invcdf,
+    'triangular': lambda p: sqrt(2*p) - 1 if p < 1/2 else 1 - sqrt(2 - 2*p),
+    'cosine': lambda p: 2 * asin(2*p - 1) / pi,
+}
+_kernel_invcdfs['gauss'] = _kernel_invcdfs['normal']
+_kernel_invcdfs['uniform'] = _kernel_invcdfs['rectangular']
+_kernel_invcdfs['epanechnikov'] = _kernel_invcdfs['parabolic']
+_kernel_invcdfs['biweight'] = _kernel_invcdfs['quartic']
+
+def kde_random(data, h, kernel='normal', *, seed=None):
+    """Return a function that makes a random selection from the estimated
+    probability density function created by kde(data, h, kernel).
+
+    Providing a *seed* allows reproducible selections within a single
+    thread. The seed may be an integer, float, str, or bytes.
+
+    A StatisticsError will be raised if the *data* sequence is empty.
+
+    Example:
+
+    >>> data = [-2.1, -1.3, -0.4, 1.9, 5.1, 6.2]
+    >>> rand = kde_random(data, h=1.5, seed=8675309)
+    >>> new_selections = [rand() for i in range(10)]
+    >>> [round(x, 1) for x in new_selections]
+    [0.7, 6.2, 1.2, 6.9, 7.0, 1.8, 2.5, -0.5, -1.8, 5.6]
+
+    """
+    n = len(data)
+    if not n:
+        raise StatisticsError('Empty data sequence')
+
+    if not isinstance(data[0], (int, float)):
+        raise TypeError('Data sequence must contain ints or floats')
+
+    if h <= 0.0:
+        raise StatisticsError(f'Bandwidth h must be positive, not {h=!r}')
+
+    try:
+        kernel_invcdf = _kernel_invcdfs[kernel]
+    except KeyError:
+        raise StatisticsError(f'Unknown kernel name: {kernel!r}')
+
+    prng = _random.Random(seed)
+    random = prng.random
+    choice = prng.choice
+
+    def rand():
+        return choice(data) + h * kernel_invcdf(random())
+
+    rand.__doc__ = f'Random KDE selection with {h=!r} and {kernel=!r}'
+
+    return rand
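In the new module code, _newton_raphson turns a cheap inverse-CDF guess into a high-accuracy inverse by iterating Newton's method until f(x) is within the requested tolerance of the target. A minimal sketch of the same pattern applied to a simpler function (the cubic and its crude starting guess here are hypothetical, chosen only for illustration):

    # Newton's method refinement of a rough inverse estimate, mirroring the
    # pattern used above for the quartic and triweight kernel inverse CDFs.
    def newton_raphson(f_inv_estimate, f, f_prime, tolerance=1e-12):
        def f_inv(y):
            x = f_inv_estimate(y)
            while abs(diff := f(x) - y) > tolerance:
                x -= diff / f_prime(x)    # standard Newton step
            return x
        return f_inv

    # Hypothetical example: invert f(x) = x**3 + x starting from a crude guess.
    inverse = newton_raphson(
        f_inv_estimate=lambda y: y / 2,          # rough starting point
        f=lambda x: x**3 + x,
        f_prime=lambda x: 3 * x**2 + 1)

    assert abs(inverse(10.0) - 2.0) < 1e-9       # since 2**3 + 2 == 10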

Lib/test/test_statistics.py

+80 −0
@@ -2426,6 +2426,86 @@ def integrate(func, low, high, steps=10_000):
         self.assertEqual(f_hat(-1.0), 1/2)
         self.assertEqual(f_hat(1.0), 1/2)
 
+    def test_kde_kernel_invcdfs(self):
+        kernel_invcdfs = statistics._kernel_invcdfs
+        kde = statistics.kde
+
+        # Verify that cdf / invcdf will round trip
+        xarr = [i/100 for i in range(-100, 101)]
+        for kernel, invcdf in kernel_invcdfs.items():
+            with self.subTest(kernel=kernel):
+                cdf = kde([0.0], h=1.0, kernel=kernel, cumulative=True)
+                for x in xarr:
+                    self.assertAlmostEqual(invcdf(cdf(x)), x, places=5)
+
+    def test_kde_random(self):
+        kde_random = statistics.kde_random
+        StatisticsError = statistics.StatisticsError
+        kernels = ['normal', 'gauss', 'logistic', 'sigmoid', 'rectangular',
+                   'uniform', 'triangular', 'parabolic', 'epanechnikov',
+                   'quartic', 'biweight', 'triweight', 'cosine']
+        sample = [-2.1, -1.3, -0.4, 1.9, 5.1, 6.2]
+
+        # Smoke test
+
+        for kernel in kernels:
+            with self.subTest(kernel=kernel):
+                rand = kde_random(sample, h=1.5, kernel=kernel)
+                selections = [rand() for i in range(10)]
+
+        # Check error cases
+
+        with self.assertRaises(StatisticsError):
+            kde_random([], h=1.0)                       # Empty dataset
+        with self.assertRaises(TypeError):
+            kde_random(['abc', 'def'], 1.5)             # Non-numeric data
+        with self.assertRaises(TypeError):
+            kde_random(iter(sample), 1.5)               # Data is not a sequence
+        with self.assertRaises(StatisticsError):
+            kde_random(sample, h=0.0)                   # Zero bandwidth
+        with self.assertRaises(StatisticsError):
+            kde_random(sample, h=0.0)                   # Negative bandwidth
+        with self.assertRaises(TypeError):
+            kde_random(sample, h='str')                 # Wrong bandwidth type
+        with self.assertRaises(StatisticsError):
+            kde_random(sample, h=1.0, kernel='bogus')   # Invalid kernel
+
+        # Test name and docstring of the generated function
+
+        h = 1.5
+        kernel = 'cosine'
+        prng = kde_random(sample, h, kernel)
+        self.assertEqual(prng.__name__, 'rand')
+        self.assertIn(kernel, prng.__doc__)
+        self.assertIn(repr(h), prng.__doc__)
+
+        # Approximate distribution test: Compare a random sample to the expected distribution
+
+        data = [-2.1, -1.3, -0.4, 1.9, 5.1, 6.2, 7.8, 14.3, 15.1, 15.3, 15.8, 17.0]
+        n = 1_000_000
+        h = 1.75
+        dx = 0.1
+
+        def p_expected(x):
+            return F_hat(x + dx) - F_hat(x - dx)
+
+        def p_observed(x):
+            # P(x-dx <= X < x+dx) / (2*dx)
+            i = bisect.bisect_left(big_sample, x - dx)
+            j = bisect.bisect_right(big_sample, x + dx)
+            return (j - i) / len(big_sample)
+
+        for kernel in kernels:
+            with self.subTest(kernel=kernel):
+
+                F_hat = statistics.kde(data, h, kernel, cumulative=True)
+                rand = kde_random(data, h, kernel, seed=8675309**2)
+                big_sample = sorted([rand() for i in range(n)])
+
+                for x in range(-40, 190):
+                    x /= 10
+                    self.assertTrue(math.isclose(p_observed(x), p_expected(x), abs_tol=0.001))
+
 
 class TestQuantiles(unittest.TestCase):
 
