run multiple chains

aloctavodia · aloctavodia · commit 308bd058f442 · 2020-06-24T08:13:52.000-03:00
diff --git a/pymc3/smc/sample_smc.py b/pymc3/smc/sample_smc.py
@@ -14,7 +14,19 @@
 
 import time
 import logging
+import warnings
+from collections.abc import Iterable
+import multiprocessing as mp
+import numpy as np
+
 from .smc import SMC
+from ..model import modelcontext
+from ..backends.base import MultiTrace
+from ..parallel_sampling import _cpu_count
+
+EXPERIMENTAL_WARNING = (
+    "Warning: SMC-ABC is an experimental step method and not yet recommended for use in PyMC3!"
+)
 
 
 def sample_smc(
@@ -30,6 +42,9 @@ def sample_smc(
     sum_stat="identity",
     model=None,
     random_seed=-1,
+    parallel=True,
+    chains=None,
+    cores=None,
 ):
     r"""
     Sequential Monte Carlo based sampling
@@ -69,6 +84,16 @@ def sample_smc(
     model: Model (optional if in ``with`` context)).
     random_seed: int
         random seed
+    parallel: bool
+        If True (default), run the chains in a ``multiprocessing`` pool of ``cores`` worker
+        processes. If False, run the chains sequentially in the current process.
+    cores : int
+        The number of chains to run in parallel. If ``None``, set to the number of CPUs in the
+        system.
+    chains : int
+        The number of chains to sample. Running independent chains is important for some
+        convergence statistics. If ``None`` (default), then set to either ``cores`` or 2, whichever
+        is larger.
 
     Notes
     -----
@@ -115,6 +140,89 @@ def sample_smc(
         %282007%29133:7%28816%29>`__
     """
 
+    _log = logging.getLogger("pymc3")
+    _log.info("Initializing SMC sampler...")
+
+    if cores is None:
+        cores = _cpu_count()
+
+    if chains is None:
+        chains = max(2, cores)
+
+    _log.info(f"Multiprocess sampling ({chains} chains in {cores} jobs)")
+
+    if random_seed == -1:
+        random_seed = None
+    if chains == 1 and isinstance(random_seed, int):
+        random_seed = [random_seed]
+    if random_seed is None or isinstance(random_seed, int):
+        if random_seed is not None:
+            np.random.seed(random_seed)
+        random_seed = [np.random.randint(2 ** 30) for _ in range(chains)]
+    if not isinstance(random_seed, Iterable):
+        raise TypeError("Invalid value for `random_seed`. Must be tuple, list or int")
+
+    if kernel.lower() == "abc":
+        warnings.warn(EXPERIMENTAL_WARNING)
+        if len(modelcontext(model).observed_RVs) != 1:
+            warnings.warn("SMC-ABC only works properly with models with one observed variable")
+
+    params = (
+        draws,
+        kernel,
+        n_steps,
+        start,
+        tune_steps,
+        p_acc_rate,
+        threshold,
+        epsilon,
+        dist_func,
+        sum_stat,
+        model,
+    )
+
+    t1 = time.time()
+    if parallel:
+        loggers = [_log] + [None] * (chains - 1)
+        pool = mp.Pool(cores)
+        results = pool.starmap(
+            sample_smc_int, [(*params, random_seed[i], i, loggers[i]) for i in range(chains)]
+        )
+
+        pool.close()
+        pool.join()
+    else:
+        results = []
+        for i in range(chains):
+            results.append((sample_smc_int(*params, random_seed[i], i, _log)))
+
+    traces, log_marginal_likelihoods = zip(*results)
+    trace = MultiTrace(traces)
+    trace.report._n_draws = draws
+    trace.report._n_tune = 0
+    trace.report._t_sampling = time.time() - t1
+    trace.report.log_marginal_likelihood = np.array(log_marginal_likelihoods)
+
+    return trace
+
+
+def sample_smc_int(
+    draws=2000,
+    kernel="metropolis",
+    n_steps=25,
+    start=None,
+    tune_steps=True,
+    p_acc_rate=0.99,
+    threshold=0.5,
+    epsilon=1.0,
+    dist_func="gaussian_kernel",
+    sum_stat="identity",
+    model=None,
+    random_seed=-1,
+    chain=0,
+    _log=None,
+):
+
     smc = SMC(
         draws=draws,
         kernel=kernel,
@@ -128,33 +236,21 @@ def sample_smc(
         sum_stat=sum_stat,
         model=model,
         random_seed=random_seed,
+        chain=chain,
     )
-
-    t1 = time.time()
-    _log = logging.getLogger("pymc3")
-    _log.info("Sample initial stage: ...")
     stage = 0
     smc.initialize_population()
     smc.setup_kernel()
     smc.initialize_logp()
 
     while smc.beta < 1:
         smc.update_weights_beta()
-        _log.info(
-            "Stage: {:3d} Beta: {:.3f} Steps: {:3d} Acce: {:.3f}".format(
-                stage, smc.beta, smc.n_steps, smc.acc_rate
-            )
-        )
+        if _log is not None:
+            _log.info(f"Stage: {stage:3d} Beta: {smc.beta:.3f}")
         smc.update_proposal()
         smc.resample()
-        for _ in range(2):
-            smc.mutate()
-            smc.tune()
+        smc.mutate()
+        smc.tune()
         stage += 1
 
-    trace = smc.posterior_to_trace()
-    trace.report._n_draws = smc.draws
-    trace.report._n_tune = 0
-    trace.report._t_sampling = time.time() - t1
-    trace.report.ess = smc.ess
-    return trace
+    return smc.posterior_to_trace()
diff --git a/pymc3/smc/smc.py b/pymc3/smc/smc.py
@@ -16,7 +16,6 @@
 
 import numpy as np
 from scipy.special import logsumexp
-import warnings
 from theano import function as theano_function
 from arviz import psislw
 
@@ -25,12 +24,6 @@
 from ..theanof import floatX, inputvars, make_shared_replacements, join_nonshared_inputs
 from ..sampling import sample_prior_predictive
 from ..backends.ndarray import NDArray
-from ..backends.base import MultiTrace
-
-EXPERIMENTAL_WARNING = (
-    "Warning: SMC-ABC methods are experimental step methods and not yet"
-    " recommended for use in PyMC3!"
-)
 
 
 class SMC:
@@ -48,6 +41,7 @@ def __init__(
         sum_stat="Identity",
         model=None,
         random_seed=-1,
+        chain=0,
     ):
 
         self.draws = draws
@@ -62,6 +56,7 @@ def __init__(
         self.sum_stat = sum_stat
         self.model = model
         self.random_seed = random_seed
+        self.chain = chain
 
         self.model = modelcontext(model)
 
@@ -73,11 +68,11 @@ def __init__(
         self.proposed = draws * n_steps
         self.acc_rate = 1
         self.acc_per_chain = np.ones(self.draws)
-        self.model.log_marginal_likelihood = 0
         self.variables = inputvars(self.model.vars)
         self.dimension = sum(v.dsize for v in self.variables)
         self.scalings = np.ones(self.draws) * 2.38 / (self.dimension) ** 0.5
         self.weights = np.ones(self.draws) / self.draws
+        self.log_marginal_likelihood = 0
 
     def initialize_population(self):
         """
@@ -113,9 +108,6 @@ def setup_kernel(self):
         self.prior_logp_func = logp_forw([self.model.varlogpt], self.variables, shared)
 
         if self.kernel.lower() == "abc":
-            warnings.warn(EXPERIMENTAL_WARNING)
-            if len(self.model.observed_RVs) != 1:
-                warnings.warn("SMC-ABC only works properly with models with one observed variable")
             simulator = self.model.observed_RVs[0]
             self.likelihood_logp_func = PseudoLikelihood(
                 self.epsilon,
@@ -165,9 +157,8 @@ def update_weights_beta(self):
             new_beta = 1
             log_weights_un = (new_beta - old_beta) * self.likelihood_logp
             log_weights = log_weights_un - logsumexp(log_weights_un)
-            self.ess = np.exp(-logsumexp(log_weights * 2))
 
-        self.model.log_marginal_likelihood += logsumexp(log_weights_un) - np.log(self.draws)
+        self.log_marginal_likelihood += logsumexp(log_weights_un) - np.log(self.draws)
         self.beta = new_beta
         self.weights = np.exp(log_weights)
 
@@ -178,6 +169,7 @@ def resample(self):
         resampling_indexes = np.random.choice(
             np.arange(self.draws), size=self.draws, p=self.weights
         )
+
         self.posterior = self.posterior[resampling_indexes]
         self.prior_logp = self.prior_logp[resampling_indexes]
         self.likelihood_logp = self.likelihood_logp[resampling_indexes]
@@ -239,6 +231,29 @@ def mutate(self):
         self.acc_per_chain = np.mean(ac_, axis=0)
         self.acc_rate = np.mean(ac_)
 
+    def posterior_to_trace_bk(self):
+        """
+        Save results into a PyMC3 ``MultiTrace`` with 10 chains of ``len(posterior) // 10`` draws
+        """
+        lenght_pos = len(self.posterior)
+        varnames = [v.name for v in self.variables]
+        straces = []
+        with self.model:
+            chain_lenght = int(lenght_pos / 10)
+            for chain in range(10):
+                strace = NDArray(self.model)
+                strace.setup(chain_lenght, chain)
+                for i in range(chain_lenght):
+                    value = []
+                    size = 0
+                    for var in varnames:
+                        shape, new_size = self.var_info[var]
+                        value.append(self.posterior[i][size : size + new_size].reshape(shape))
+                        size += new_size
+                    strace.record({k: v for k, v in zip(varnames, value)})
+                straces.append(strace)
+        return MultiTrace(straces)
+
     def posterior_to_trace(self):
         """
         Save results into a PyMC3 trace
@@ -248,16 +263,16 @@ def posterior_to_trace(self):
 
         with self.model:
             strace = NDArray(self.model)
-            strace.setup(lenght_pos, 0)
+            strace.setup(lenght_pos, self.chain)
         for i in range(lenght_pos):
             value = []
             size = 0
             for var in varnames:
                 shape, new_size = self.var_info[var]
                 value.append(self.posterior[i][size : size + new_size].reshape(shape))
                 size += new_size
-            strace.record({k: v for k, v in zip(varnames, value)})
-        return MultiTrace([strace])
+            strace.record(point={k: v for k, v in zip(varnames, value)})
+        return strace, self.log_marginal_likelihood
 
 
 def logp_forw(out_vars, vars, shared):
diff --git a/pymc3/tests/test_smc.py b/pymc3/tests/test_smc.py
@@ -79,9 +79,9 @@ def test_ml(self):
                 a = pm.Beta("a", alpha, beta)
                 y = pm.Bernoulli("y", a, observed=data)
                 trace = pm.sample_smc(2000)
-                marginals.append(model.marginal_log_likelihood)
+                marginals.append(trace.report.log_marginal_likelihood)
         # compare to the analytical result
-        assert abs(np.exp(marginals[1] - marginals[0]) - 4.0) <= 1
+        assert abs(np.exp(np.mean(marginals[1]) - np.mean(marginals[0])) - 4.0) <= 1
 
     def test_start(self):
         with pm.Model() as model:
@@ -110,7 +110,9 @@ def normal_sim(a, b):
 
     def test_one_gaussian(self):
         with self.SMABC_test:
-            trace = pm.sample_smc(draws=1000, kernel="ABC", sum_stat="sorted", epsilon=1)
+            trace = pm.sample_smc(
+                draws=1000, kernel="ABC", sum_stat="sorted", epsilon=1, parallel=False
+            )
 
         np.testing.assert_almost_equal(self.data.mean(), trace["a"].mean(), decimal=2)
         np.testing.assert_almost_equal(self.data.std(), trace["b"].mean(), decimal=1)