
Commit b81a9f7

ferrine authored and twiecki committed
Variational Inference uses floatX (#2221)
* Variational Inference uses floatX
* fix test that hardcoded float32
* pylint fix
* use floatX inside function
* change scope
* remove redundant floatX on python floats following @twiecki's suggestion
1 parent 73a21ae commit b81a9f7

12 files changed: +82, -54 lines

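
Background for the change, sketched below: under theano.config.floatX = 'float32', any value entering the graph with an explicit float64 dtype (a numpy array, a generator batch, pandas data) upcasts downstream tensors to float64, which defeats float32/GPU runs. Plain Python float literals are autocast to floatX by Theano, which is why the commit also removes redundant floatX calls on them. A minimal, hypothetical illustration (not part of the commit; assumes floatX = 'float32'):

    import numpy as np
    import theano
    import theano.tensor as tt

    x = tt.vector('x')       # dtype follows theano.config.floatX
    w = np.random.rand(3)    # numpy defaults to float64
    print((x * 2.).dtype)    # 'float32': Python floats are autocast
    print((x * w).dtype)     # 'float64': the array's explicit dtype wins
    print((x * np.asarray(w, dtype=theano.config.floatX)).dtype)  # 'float32'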

pymc3/data.py (2 additions, 2 deletions)

@@ -61,7 +61,7 @@ def make_variable(self, gop, name=None):
     def __init__(self, generator):
         if not pm.vartypes.isgenerator(generator):
             raise TypeError('Object should be generator like')
-        self.test_value = copy(next(generator))
+        self.test_value = pm.smartfloatX(copy(next(generator)))
         # make pickling potentially possible
         self._yielded_test_value = False
         self.gen = generator
@@ -75,7 +75,7 @@ def __next__(self):
             self._yielded_test_value = True
             return self.test_value
         else:
-            return copy(next(self.gen))
+            return pm.smartfloatX(copy(next(self.gen)))

     # python2 generator
     next = __next__
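
The effect of the data.py change: batches are cast as they are drawn, so a float64-yielding generator can no longer upcast a float32 graph. A rough usage sketch (hypothetical, assuming floatX = 'float32'):

    import numpy as np
    import pymc3 as pm

    def batches():
        while True:
            yield np.random.rand(10)   # numpy yields float64 by default

    gop = pm.generator(batches())      # the test value and every draw now
    print(gop.dtype)                   # pass through pm.smartfloatX -> 'float32'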

pymc3/distributions/dist_math.py (29 additions, 25 deletions)

@@ -14,7 +14,8 @@
 from ..math import logdet as _logdet
 from pymc3.theanof import floatX

-c = - 0.5 * np.log(2 * np.pi)
+f = floatX
+c = - .5 * np.log(2. * np.pi)


 def bound(logp, *conditions, **kwargs):
@@ -81,34 +82,34 @@ def std_cdf(x):
     """
     Calculates the standard normal cumulative distribution function.
     """
-    return 0.5 + 0.5 * tt.erf(x / tt.sqrt(2.))
+    return .5 + .5 * tt.erf(x / tt.sqrt(2.))


 def i0(x):
     """
     Calculates the 0 order modified Bessel function of the first kind""
     """
-    return tt.switch(tt.lt(x, 5), 1 + x**2 / 4 + x**4 / 64 + x**6 / 2304 + x**8 / 147456
-                     + x**10 / 14745600 + x**12 / 2123366400,
-                     np.e**x / (2 * np.pi * x)**0.5 * (1 + 1 / (8 * x) + 9 / (128 * x**2) + 225 / (3072 * x**3)
-                     + 11025 / (98304 * x**4)))
+    return tt.switch(tt.lt(x, 5), 1. + x**2 / 4. + x**4 / 64. + x**6 / 2304. + x**8 / 147456.
+                     + x**10 / 14745600. + x**12 / 2123366400.,
+                     np.e**x / (2. * np.pi * x)**0.5 * (1. + 1. / (8. * x) + 9. / (128. * x**2) + 225. / (3072 * x**3)
+                     + 11025. / (98304. * x**4)))


 def i1(x):
     """
     Calculates the 1 order modified Bessel function of the first kind""
     """
-    return tt.switch(tt.lt(x, 5), x / 2 + x**3 / 16 + x**5 / 384 + x**7 / 18432 +
-                     x**9 / 1474560 + x**11 / 176947200 + x**13 / 29727129600,
-                     np.e**x / (2 * np.pi * x)**0.5 * (1 - 3 / (8 * x) + 15 / (128 * x**2) + 315 / (3072 * x**3)
-                     + 14175 / (98304 * x**4)))
+    return tt.switch(tt.lt(x, 5), x / 2. + x**3 / 16. + x**5 / 384. + x**7 / 18432. +
+                     x**9 / 1474560. + x**11 / 176947200. + x**13 / 29727129600.,
+                     np.e**x / (2. * np.pi * x)**0.5 * (1. - 3. / (8. * x) + 15. / (128. * x**2) + 315. / (3072. * x**3)
+                     + 14175. / (98304. * x**4)))


 def sd2rho(sd):
     """
     `sd -> rho` theano converter
     :math:`mu + sd*e = mu + log(1+exp(rho))*e`"""
-    return tt.log(tt.exp(sd) - 1)
+    return tt.log(tt.exp(sd) - 1.)


 def rho2sd(rho):
@@ -122,13 +123,15 @@ def log_normal(x, mean, **kwargs):
     """
     Calculate logarithm of normal distribution at point `x`
     with given `mean` and `std`
+
     Parameters
     ----------
     x : Tensor
         point of evaluation
     mean : Tensor
         mean of normal distribution
     kwargs : one of parameters `{sd, tau, w, rho}`
+
     Notes
     -----
     There are four variants for density parametrization.
@@ -143,7 +146,7 @@ def log_normal(x, mean, **kwargs):
     w = kwargs.get('w')
     rho = kwargs.get('rho')
     tau = kwargs.get('tau')
-    eps = kwargs.get('eps', 0.0)
+    eps = kwargs.get('eps', 0.)
     check = sum(map(lambda a: a is not None, [sd, w, rho, tau]))
     if check > 1:
         raise ValueError('more than one required kwarg is passed')
@@ -157,14 +160,15 @@ def log_normal(x, mean, **kwargs):
         std = rho2sd(rho)
     else:
         std = tau**(-1)
-    std += eps
-    return c - tt.log(tt.abs_(std)) - (x - mean) ** 2 / (2 * std ** 2)
+    std += f(eps)
+    return f(c) - tt.log(tt.abs_(std)) - (x - mean) ** 2 / (2. * std ** 2)


 def log_normal_mv(x, mean, gpu_compat=False, **kwargs):
     """
     Calculate logarithm of normal distribution at point `x`
     with given `mean` and `sigma` matrix
+
     Parameters
     ----------
     x : Tensor
@@ -173,8 +177,8 @@ def log_normal_mv(x, mean, gpu_compat=False, **kwargs):
         mean of normal distribution
     kwargs : one of parameters `{cov, tau, chol}`

-    Flags
-    ----------
+    Other Parameters
+    ----------------
     gpu_compat : False, because LogDet is not GPU compatible yet.
         If this is set as true, the GPU compatible (but numerically unstable) log(det) is used.

@@ -212,10 +216,10 @@ def logdet(m):
     T = tt.nlinalg.matrix_inverse(S)
     log_det = -logdet(S)
     delta = x - mean
-    k = S.shape[0]
-    result = k * tt.log(2 * np.pi) - log_det
+    k = f(S.shape[0])
+    result = k * tt.log(2. * np.pi) - log_det
     result += delta.dot(T).dot(delta)
-    return -1 / 2. * result
+    return -.5 * result


 def MvNormalLogp():
@@ -240,25 +244,25 @@ def MvNormalLogp():
     cholesky = Cholesky(nofail=True, lower=True)

     n, k = delta.shape
-
+    n, k = f(n), f(k)
     chol_cov = cholesky(cov)
     diag = tt.nlinalg.diag(chol_cov)
     ok = tt.all(diag > 0)

     chol_cov = tt.switch(ok, chol_cov, tt.fill(chol_cov, 1))
     delta_trans = solve_lower(chol_cov, delta.T).T

-    result = n * k * tt.log(2 * np.pi)
-    result += 2.0 * n * tt.sum(tt.log(diag))
-    result += (delta_trans ** 2).sum()
-    result = -0.5 * result
+    result = n * k * tt.log(f(2) * np.pi)
+    result += f(2) * n * tt.sum(tt.log(diag))
+    result += (delta_trans ** f(2)).sum()
+    result = f(-.5) * result
     logp = tt.switch(ok, result, -np.inf)

     def dlogp(inputs, gradients):
         g_logp, = gradients
         cov, delta = inputs

-        g_logp.tag.test_value = floatX(np.array(1.))
+        g_logp.tag.test_value = floatX(1.)
         n, k = delta.shape

         chol_cov = cholesky(cov)
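
Two distinct fixes are folded into dist_math.py. First, symbolic shapes such as delta.shape are int64, and multiplying int64 by float32 upcasts to float64, hence n, k = f(n), f(k). Second, module-level constants such as c = -.5 * np.log(2. * np.pi) evaluate to numpy float64 scalars, hence f(c) at the point of use. A quick check of the shape case (hypothetical; .astype is a rough stand-in for pymc3's floatX, assumes floatX = 'float32'):

    import theano.tensor as tt

    delta = tt.matrix('delta')            # float32 under floatX='float32'
    n, k = delta.shape                    # symbolic int64 scalars
    print((n * tt.log(delta)).dtype)      # 'float64': int64 upcasts float32
    print((n.astype('float32') * tt.log(delta)).dtype)  # 'float32'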

pymc3/math.py (4 additions, 4 deletions)

@@ -33,11 +33,11 @@ def logsumexp(x, axis=None):


 def invlogit(x, eps=sys.float_info.epsilon):
-    return (1 - 2 * eps) / (1 + tt.exp(-x)) + eps
+    return (1. - 2. * eps) / (1. + tt.exp(-x)) + eps


 def logit(p):
-    return tt.log(p / (1 - p))
+    return tt.log(p / (floatX(1) - p))


 def flatten_list(tensors):
@@ -82,11 +82,11 @@ def __str__(self):


 def probit(p):
-    return -sqrt(2) * erfcinv(2 * p)
+    return -sqrt(2.) * erfcinv(2. * p)


 def invprobit(x):
-    return 0.5 * erfc(-x / sqrt(2))
+    return .5 * erfc(-x / sqrt(2.))


 def expand_packed_triangular(n, packed, lower=True, diagonal_only=False):
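
In math.py, floatX(1) pins the constant in logit to the configured dtype instead of relying on autocast rules, and the literal cleanups (2 -> 2.) keep the remaining scalars floating-point. A round trip should stay in floatX, e.g. (hypothetical sketch):

    import numpy as np
    import theano
    import theano.tensor as tt
    from pymc3.math import logit, invlogit

    p = tt.vector('p')
    expr = invlogit(logit(p))
    print(expr.dtype)          # expected to remain theano.config.floatX
    f = theano.function([p], expr)
    print(f(np.array([.1, .5, .9], dtype=theano.config.floatX)))  # ~ [.1 .5 .9]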

pymc3/model.py (10 additions, 9 deletions)

@@ -198,8 +198,8 @@ def scaling(self):
             denom = self.logp_elemwiset.shape[0]
         else:
             denom = 1
-        coef = tt.as_tensor(total_size) / denom
-        return coef
+        coef = pm.floatX(tt.as_tensor(total_size)) / pm.floatX(denom)
+        return pm.floatX(coef)


 class InitContextMeta(type):
@@ -840,19 +840,20 @@ def init_value(self):
 def pandas_to_array(data):
     if hasattr(data, 'values'):  # pandas
         if data.isnull().any().any():  # missing values
-            return np.ma.MaskedArray(data.values, data.isnull().values)
+            ret = np.ma.MaskedArray(data.values, data.isnull().values)
         else:
-            return data.values
+            ret = data.values
     elif hasattr(data, 'mask'):
-        return data
+        ret = data
     elif isinstance(data, theano.gof.graph.Variable):
-        return data
+        ret = data
     elif sps.issparse(data):
-        return data
+        ret = data
     elif isgenerator(data):
-        return generator(data)
+        ret = generator(data)
     else:
-        return np.asarray(data)
+        ret = np.asarray(data)
+    return pm.smartfloatX(ret)


 def as_tensor(data, name, model, distribution):
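
pandas_to_array previously returned from each branch directly; routing every branch through ret lets the cast happen exactly once at the end, and smartfloatX means integer data keeps its dtype. Roughly (a hypothetical session, assuming floatX = 'float32'):

    import numpy as np
    import pandas as pd
    from pymc3.model import pandas_to_array

    df = pd.DataFrame({'y': [1., np.nan, 3.]})   # float64, one missing value
    arr = pandas_to_array(df)
    print(type(arr).__name__, arr.dtype)         # MaskedArray float32

    idx = pandas_to_array(np.array([0, 1, 2]))   # integer data
    print(idx.dtype)                             # int64: left untouched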

pymc3/tests/conftest.py (9 additions, 0 deletions)

@@ -7,3 +7,12 @@ def theano_config():
     config = theano.configparser.change_flags(compute_test_value='raise')
     with config:
         yield
+
+
+@pytest.fixture(scope='function')
+def strict_float32():
+    config = theano.configparser.change_flags(
+        warn_float64='raise',
+        floatX='float32')
+    with config:
+        yield
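
The new fixture turns any accidental float64 tensor into a hard failure for the duration of a test. It is opted into by name, e.g. (a hypothetical test, not part of the commit):

    import numpy as np
    import pytest
    import theano

    @pytest.mark.usefixtures('strict_float32')
    def test_graph_stays_float32():
        x = theano.shared(np.zeros(3, dtype='float32'))
        y = x * 2.                  # Python float autocasts: still float32
        assert y.dtype == 'float32'
        # x * np.random.rand(3) would raise here, because
        # warn_float64='raise' errors on any float64 tensor creation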

pymc3/tests/test_theanof.py (5 additions, 5 deletions)

@@ -11,14 +11,14 @@
 def integers():
     i = 0
     while True:
-        yield np.float32(i)
+        yield floatX(i)
         i += 1


 def integers_ndim(ndim):
     i = 0
     while True:
-        yield np.ones((2,) * ndim) * i
+        yield floatX(np.ones((2,) * ndim) * i)
         i += 1


@@ -47,15 +47,15 @@ def test_ndim(self):
     def test_cloning_available(self):
         gop = generator(integers())
         res = gop ** 2
-        shared = theano.shared(np.float32(10))
+        shared = theano.shared(floatX(10))
         res1 = theano.clone(res, {gop: shared})
         f = theano.function([], res1)
         assert f() == np.float32(100)

     def test_default_value(self):
         def gen():
             for i in range(2):
-                yield np.ones((10, 10)) * i
+                yield floatX(np.ones((10, 10)) * i)

         gop = generator(gen(), np.ones((10, 10)) * 10)
         f = theano.function([], gop)
@@ -68,7 +68,7 @@ def gen():
     def test_set_gen_and_exc(self):
         def gen():
             for i in range(2):
-                yield np.ones((10, 10)) * i
+                yield floatX(np.ones((10, 10)) * i)

         gop = generator(gen())
         f = theano.function([], gop)

pymc3/tests/test_variational_inference.py (2 additions, 1 deletion)

@@ -66,6 +66,7 @@ def _test_aevb(self):


 class TestApproximates:
+    @pytest.mark.usefixtures('strict_float32')
     class Base(SeededTest):
         inference = None
         NITER = 12000
@@ -202,7 +203,7 @@ def test_optimizer_minibatch_with_callback(self):
         def create_minibatch(data):
             while True:
                 data = np.roll(data, 100, axis=0)
-                yield data[:100]
+                yield pm.floatX(data[:100])

         minibatches = create_minibatch(data)
         with Model():

pymc3/theanof.py (10 additions, 0 deletions)

@@ -18,6 +18,7 @@
     'inputvars',
     'cont_inputs',
     'floatX',
+    'smartfloatX',
     'jacobian',
     'CallableTensor',
     'join_nonshared_inputs',
@@ -67,6 +68,15 @@ def floatX(X):
         # Scalar passed
         return np.asarray(X, dtype=theano.config.floatX)

+
+def smartfloatX(x):
+    """
+    Convert non int types to floatX
+    """
+    if str(x.dtype).startswith('float'):
+        x = floatX(x)
+    return x
+
 """
 Theano derivative functions
 """

pymc3/variational/approximations.py (4 additions, 2 deletions)

@@ -317,8 +317,10 @@ def randidx(self, size=None):
         else:
             size = tuple(np.atleast_1d(size))
         return (self._rng
-                .uniform(size=size, low=0.0, high=self.histogram.shape[0] - 1e-16)
-                .astype('int64'))
+                .uniform(size=size,
+                         low=pm.floatX(0),
+                         high=pm.floatX(self.histogram.shape[0]) - pm.floatX(1e-16))
+                .astype('int32'))

     def random_global(self, size=None, no_rand=False):
         theano_condition_is_here = isinstance(no_rand, tt.Variable)
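
randidx samples indices by drawing uniform reals in [0, N) and truncating; building the bounds with pm.floatX keeps the RNG stream out of float64, and the small subtraction is meant to keep the high end strictly below N before truncation. The numpy analogue of the idea (illustrative only):

    import numpy as np

    rng = np.random.RandomState(0)
    N = 10
    u = rng.uniform(size=5, low=np.float32(0),
                    high=np.float32(N) - np.float32(1e-16))
    idx = u.astype('int32')        # truncation gives valid indices 0..N-1
    print(idx)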

pymc3/variational/operators.py (3 additions, 2 deletions)

@@ -2,6 +2,7 @@
 from pymc3.variational.opvi import Operator, ObjectiveFunction, _warn_not_used
 from pymc3.variational.stein import Stein
 from pymc3.variational import updates
+import pymc3 as pm

 __all__ = [
     'KL',
@@ -59,7 +60,7 @@ def __call__(self, z, **kwargs):
             params = self.obj_params + kwargs['more_obj_params']
         else:
             params = self.test_params + kwargs['more_tf_params']
-        grad *= -1
+        grad *= pm.floatX(-1)
         grad = theano.clone(grad, {op.input_matrix: z})
         grad = tt.grad(None, params, known_grads={z: grad})
         grad = updates.total_norm_constraint(grad, 10)
@@ -103,7 +104,7 @@ def __init__(self, approx):
     def apply(self, f):
         # f: kernel function for KSD f(histogram) -> (k(x,.), \nabla_x k(x,.))
         stein = Stein(self.approx, f, self.input_matrix)
-        return -1 * stein.grad
+        return pm.floatX(-1) * stein.grad


 class AKSD(KSD):
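
Here the bare scalar -1 becomes an explicitly typed constant, so the sign flip is a same-dtype multiply rather than a reliance on Theano's scalar upcast rules. What pm.floatX(-1) boils down to, roughly (hypothetical, floatX = 'float32'):

    import numpy as np
    import theano.tensor as tt

    grad = tt.vector('grad')                   # floatX, e.g. 'float32'
    neg_one = np.asarray(-1, dtype='float32')  # ~ pm.floatX(-1)
    print((grad * neg_one).dtype)              # 'float32' by construction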

pymc3/variational/opvi.py (1 addition, 1 deletion)

@@ -581,7 +581,7 @@ def normalizing_constant(self):
         # if not scale_cost_to_minibatch: t=1
         t = tt.switch(self.scale_cost_to_minibatch, t,
                       tt.constant(1, dtype=t.dtype))
-        return t
+        return pm.floatX(t)

     def _setup(self, **kwargs):
         pass
