Skip to content

refactor kde-related functions and small fixes #2191

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
May 18, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion pymc3/plots/__init__.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from .autocorrplot import autocorrplot
from .compareplot import compareplot
from .forestplot import forestplot
from .kdeplot import kdeplot, kde2plot
from .kdeplot import kdeplot
from .posteriorplot import plot_posterior, plot_posterior_predictive_glm
from .traceplot import traceplot
from .energyplot import energyplot
36 changes: 9 additions & 27 deletions pymc3/plots/artists.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
import numpy as np
from scipy.stats import kde, mode
from scipy.stats import mode

from pymc3.stats import hpd
from .utils import fast_kde
from .kdeplot import fast_kde, kdeplot


def _histplot_bins(column, bins=100):
Expand All @@ -16,7 +16,8 @@ def histplot_op(ax, data, alpha=.35):
"""Add a histogram for each column of the data to the provided axes."""
hs = []
for column in data.T:
hs.append(ax.hist(column, bins=_histplot_bins(column), alpha=alpha, align='left'))
hs.append(ax.hist(column, bins=_histplot_bins(
column), alpha=alpha, align='left'))
ax.set_xlim(np.min(data) - 0.5, np.max(data) + 0.5)
return hs

Expand All @@ -32,7 +33,8 @@ def kdeplot_op(ax, data, prior=None, prior_alpha=1, prior_style='--'):
x = np.linspace(l, u, len(density))
if prior is not None:
p = prior.logp(x).eval()
pls.append(ax.plot(x, np.exp(p), alpha=prior_alpha, ls=prior_style))
pls.append(ax.plot(x, np.exp(p),
alpha=prior_alpha, ls=prior_style))

ls.append(ax.plot(x, density))
except ValueError:
Expand All @@ -46,26 +48,7 @@ def kdeplot_op(ax, data, prior=None, prior_alpha=1, prior_style='--'):
return ls, pls


def kde2plot_op(ax, x, y, grid=200, **kwargs):
xmin = x.min()
xmax = x.max()
ymin = y.min()
ymax = y.max()
extent = kwargs.pop('extent', [])
if len(extent) != 4:
extent = [xmin, xmax, ymin, ymax]

grid = grid * 1j
X, Y = np.mgrid[xmin:xmax:grid, ymin:ymax:grid]
positions = np.vstack([X.ravel(), Y.ravel()])
values = np.vstack([x, y])
kernel = kde.gaussian_kde(values)
Z = np.reshape(kernel(positions).T, X.shape)

ax.imshow(np.rot90(Z), extent=extent, **kwargs)


def plot_posterior_op(trace_values, figsize, ax, kde_plot, point_estimate, round_to,
def plot_posterior_op(trace_values, ax, kde_plot, point_estimate, round_to,
alpha_level, ref_val, rope, text_size=16, **kwargs):
"""Artist to draw posterior."""
def format_as_percent(x, round_to=0):
Expand Down Expand Up @@ -139,9 +122,8 @@ def set_key_if_doesnt_exist(d, key, value):
d[key] = value

if kde_plot:
density, l, u = fast_kde(trace_values)
x = np.linspace(l, u, len(density))
ax.plot(x, density, figsize=figsize, **kwargs)
kdeplot(trace_values, alpha=0.35, ax=ax, **kwargs)

else:
set_key_if_doesnt_exist(kwargs, 'bins', 30)
set_key_if_doesnt_exist(kwargs, 'edgecolor', 'w')
Expand Down
36 changes: 18 additions & 18 deletions pymc3/plots/energyplot.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
import matplotlib.pyplot as plt
import numpy as np

from .utils import fast_kde
from .kdeplot import kdeplot

def energyplot(trace, kind='kde', figsize=None, ax=None, legend=True, lw=0, alpha=0.5, frame=True, **kwargs):

def energyplot(trace, kind='kde', figsize=None, ax=None, legend=True, lw=0,
alpha=0.35, frame=True, **kwargs):
"""Plot energy transition distribution and marginal energy distribution in order
to diagnose poor exploration by HMC algorithms.

Expand All @@ -25,49 +27,47 @@ def energyplot(trace, kind='kde', figsize=None, ax=None, legend=True, lw=0, alph
Alpha value for plot line. Defaults to 0.35.
frame : bool
Flag for plotting frame around figure.

Returns
-------

ax : matplotlib axes
"""

try:
energy = trace['energy']
except KeyError:
print('There is no energy information in the passed trace.')
return ax
series_dict = {'Marginal energy distribution': energy - energy.mean(),
'Energy transition distribution': np.diff(energy)}
series = [('Marginal energy distribution', energy - energy.mean()),
('Energy transition distribution', np.diff(energy))]

if figsize is None:
figsize = (8, 6)

if ax is None:
_, ax = plt.subplots(figsize=figsize)

if kind == 'kde':
for series in series_dict:
density, l, u = fast_kde(series_dict[series])
x = np.linspace(l, u, len(density))
ax.plot(x, density, label=series, **kwargs)
ax.fill_between(x, density, alpha=alpha)

for label, value in series:
kdeplot(value, label=label, alpha=alpha, shade=True, ax=ax,
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

do we want to propagate the return values?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for the comment. Not sure if I am really understanding you. I think this is what we always do, isn't it?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yeah, sorry, this is fine (ax and fig get updated in place here)

**kwargs)

elif kind == 'hist':
for series in series_dict:
ax.hist(series_dict[series], lw=lw, alpha=alpha, label=series, **kwargs)
for label, value in series:
ax.hist(value, lw=lw, alpha=alpha, label=label, **kwargs)

else:
raise ValueError('Plot type {} not recognized.'.format(kind))

ax.set_xticks([])
ax.set_yticks([])

if not frame:
for spine in ax.spines.values():
spine.set_visible(False)

if legend:
ax.legend()

return ax
75 changes: 65 additions & 10 deletions pymc3/plots/kdeplot.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,72 @@
import matplotlib.pyplot as plt
import numpy as np
from scipy.signal import gaussian, convolve

from .artists import kdeplot_op, kde2plot_op


def kdeplot(data, ax=None):
def kdeplot(trace_values, label=None, alpha=0.35, shade=False, ax=None,
**kwargs):
if ax is None:
_, ax = plt.subplots(1, 1, squeeze=True)
kdeplot_op(ax, data)
_, ax = plt.subplots()
density, l, u = fast_kde(trace_values)
x = np.linspace(l, u, len(density))
ax.plot(x, density, label=label, **kwargs)
if shade:
ax.fill_between(x, density, alpha=alpha, **kwargs)
return ax


def kde2plot(x, y, grid=200, ax=None, **kwargs):
if ax is None:
_, ax = plt.subplots(1, 1, squeeze=True)
kde2plot_op(ax, x, y, grid, **kwargs)
return ax
def fast_kde(x):
"""
A fft-based Gaussian kernel density estimate (KDE) for computing
the KDE on a regular grid.
The code was adapted from https://github.com/mfouesneau/faststats

Parameters
----------

x : Numpy array or list

Returns
-------

grid: A gridded 1D KDE of the input points (x).
xmin: minimum value of x
xmax: maximum value of x

"""
x = x[~np.isnan(x)]
x = x[~np.isinf(x)]
n = len(x)
nx = 200

# add small jitter in case input values are the same
x += np.random.uniform(-1E-12, 1E-12, size=n)
xmin, xmax = np.min(x), np.max(x)

# compute histogram
bins = np.linspace(xmin, xmax, nx)
xyi = np.digitize(x, bins)
dx = (xmax - xmin) / (nx - 1)
grid = np.histogram(x, bins=nx)[0]

# Scaling factor for bandwidth
scotts_factor = n ** (-0.2)
# Determine the bandwidth using Scott's rule
std_x = np.std(xyi)
kern_nx = int(np.round(scotts_factor * 2 * np.pi * std_x))

# Evaluate the gaussian function on the kernel grid
kernel = np.reshape(gaussian(kern_nx, scotts_factor * std_x), kern_nx)

# Compute the KDE
# use symmetric padding to correct for data boundaries in the kde
npad = np.min((nx, 2 * kern_nx))

grid = np.concatenate([grid[npad: 0: -1], grid, grid[nx: nx - npad: -1]])
grid = convolve(grid, kernel, mode='same')[npad: npad + nx]

norm_factor = n * dx * (2 * np.pi * std_x ** 2 * scotts_factor ** 2) ** 0.5

grid = grid / norm_factor

return grid, xmin, xmax
6 changes: 3 additions & 3 deletions pymc3/plots/posteriorplot.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,8 +79,8 @@ def get_trace_dict(tr, varnames):
if figsize is None:
figsize = (6, 2)
if ax is None:
fig, ax = plt.subplots()
plot_posterior_op(transform(trace), figsize=figsize, ax=ax, kde_plot=kde_plot,
fig, ax = plt.subplots(figsize=figsize)
plot_posterior_op(transform(trace), ax=ax, kde_plot=kde_plot,
point_estimate=point_estimate, round_to=round_to,
alpha_level=alpha_level, ref_val=ref_val, rope=rope, text_size=text_size, **kwargs)
else:
Expand All @@ -94,7 +94,7 @@ def get_trace_dict(tr, varnames):

for a, v in zip(np.atleast_1d(ax), trace_dict):
tr_values = transform(trace_dict[v])
plot_posterior_op(tr_values, figsize=figsize, ax=a, kde_plot=kde_plot,
plot_posterior_op(tr_values, ax=a, kde_plot=kde_plot,
point_estimate=point_estimate, round_to=round_to,
alpha_level=alpha_level, ref_val=ref_val, rope=rope, text_size=text_size, **kwargs)
a.set_title(v)
Expand Down
58 changes: 0 additions & 58 deletions pymc3/plots/utils.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
import matplotlib.pyplot as plt
import numpy as np
from scipy.signal import gaussian, convolve
# plotting utilities can all be in this namespace
from ..util import get_default_varnames # pylint: disable=unused-import

Expand Down Expand Up @@ -39,60 +38,3 @@ def make_2d(a):
newshape = np.product(a.shape[1:]).astype(int)
a = a.reshape((n, newshape), order='F')
return a


def fast_kde(x):
"""
A fft-based Gaussian kernel density estimate (KDE) for computing
the KDE on a regular grid.
The code was adapted from https://github.com/mfouesneau/faststats

Parameters
----------

x : Numpy array or list

Returns
-------

grid: A gridded 1D KDE of the input points (x).
xmin: minimum value of x
xmax: maximum value of x

"""
x = x[~np.isnan(x)]
x = x[~np.isinf(x)]
n = len(x)
nx = 200

# add small jitter in case input values are the same
x += np.random.uniform(-1E-12, 1E-12, size=n)
xmin, xmax = np.min(x), np.max(x)

# compute histogram
bins = np.linspace(xmin, xmax, nx)
xyi = np.digitize(x, bins)
dx = (xmax - xmin) / (nx - 1)
grid = np.histogram(x, bins=nx)[0]

# Scaling factor for bandwidth
scotts_factor = n ** (-0.2)
# Determine the bandwidth using Scott's rule
std_x = np.std(xyi)
kern_nx = int(np.round(scotts_factor * 2 * np.pi * std_x))

# Evaluate the gaussian function on the kernel grid
kernel = np.reshape(gaussian(kern_nx, scotts_factor * std_x), kern_nx)

# Compute the KDE
# use symmetric padding to correct for data boundaries in the kde
npad = np.min((nx, 2 * kern_nx))

grid = np.concatenate([grid[npad: 0: -1], grid, grid[nx: nx - npad: -1]])
grid = convolve(grid, kernel, mode='same')[npad: npad + nx]

norm_factor = n * dx * (2 * np.pi * std_x ** 2 * scotts_factor ** 2) ** 0.5

grid = grid / norm_factor

return grid, xmin, xmax