-
Notifications
You must be signed in to change notification settings - Fork 0
/
KDE_subtraction.py
82 lines (64 loc) · 2.84 KB
/
KDE_subtraction.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
from sklearn.neighbors import KernelDensity
from scipy.stats import gaussian_kde
from statsmodels.nonparametric.kde import KDEUnivariate
from statsmodels.nonparametric.kernel_density import KDEMultivariate
import numpy as np
import matplotlib.pyplot as plt
import csv
from scipy.stats.distributions import norm
def kde_scipy(x, x_grid, bandwidth=0.2, **kwargs):
"""Kernel Density Estimation with Scipy"""
# Note that scipy weights its bandwidth by the covariance of the
# input data. To make the results comparable to the other methods,
# we divide the bandwidth by the sample standard deviation here.
kde = gaussian_kde(x, bw_method=bandwidth / x.std(ddof=1), **kwargs)
return kde.evaluate(x_grid)
def kde_statsmodels_u(x, x_grid, bandwidth=0.2, **kwargs):
"""Univariate Kernel Density Estimation with Statsmodels"""
kde = KDEUnivariate(x)
kde.fit(bw=bandwidth, **kwargs)
return kde.evaluate(x_grid)
def kde_statsmodels_m(x, x_grid, bandwidth=0.2, **kwargs):
"""Multivariate Kernel Density Estimation with Statsmodels"""
kde = KDEMultivariate(x, bw=bandwidth * np.ones_like(x),
var_type='c', **kwargs)
return kde.pdf(x_grid)
def kde_sklearn(x, x_grid, bandwidth=0.2, **kwargs):
"""Kernel Density Estimation with Scikit-learn"""
kde_skl = KernelDensity(bandwidth=bandwidth, **kwargs)
kde_skl.fit(x[:, np.newaxis])
# score_samples() returns the log-likelihood of the samples
log_pdf = kde_skl.score_samples(x_grid[:, np.newaxis])
return np.exp(log_pdf)
kde_funcs = [kde_statsmodels_u, kde_statsmodels_m, kde_scipy, kde_sklearn]
kde_funcnames = ['Statsmodels-U', 'Statsmodels-M', 'Scipy', 'Scikit-learn']
print "Package Versions:"
import sklearn; print " scikit-learn:", sklearn.__version__
import scipy; print " scipy:", scipy.__version__
import statsmodels; print " statsmodels:", statsmodels.__version__
# The grid we'll use for plotting
x_grid = np.linspace(-4.5, 3.5, 1000)
#Import data from csv file
def getdata ():
with open('BF11Ages1s.csv', 'rb') as csvfile:
reader = csv.reader(csvfile)
ages = [map(float, row) for row in reader]
return ages
print getdata()
# Draw points from a bimodal distribution in 1D
np.random.seed(0)
x = np.concatenate([norm(-1, 1.).rvs(400),
norm(1, 0.3).rvs(100)])
pdf_true = (0.8 * norm(-1, 1).pdf(x_grid) +
0.2 * norm(1, 0.3).pdf(x_grid))
# Plot the three kernel density estimates
fig, ax = plt.subplots(1, 4, sharey=True,
figsize=(13, 3))
fig.subplots_adjust(wspace=0)
for i in range(4):
pdf = kde_funcs[i](getdata(), x_grid, bandwidth=0.2)
ax[i].plot(x_grid, pdf, color='blue', alpha=0.5, lw=3)
ax[i].fill(x_grid, pdf_true, ec='gray', fc='gray', alpha=0.4)
ax[i].set_title(kde_funcnames[i])
ax[i].set_xlim(-4.5, 3.5)
fig.savefig('foo.pdf')