Add HyperGeometric distribution to discrete.py; Add tests

Harivallabha · Harivallabha · commit ddf561753cfd · 2020-12-03T18:19:39.000+05:30
diff --git a/pymc3/distributions/__init__.py b/pymc3/distributions/__init__.py
@@ -63,6 +63,7 @@
 from .discrete import ZeroInflatedBinomial
 from .discrete import DiscreteUniform
 from .discrete import Geometric
+from .discrete import HyperGeometric
 from .discrete import Categorical
 from .discrete import OrderedLogistic
 
diff --git a/pymc3/distributions/discrete.py b/pymc3/distributions/discrete.py
@@ -38,6 +38,7 @@
     "ZeroInflatedNegativeBinomial",
     "DiscreteUniform",
     "Geometric",
+    "HyperGeometric",
     "Categorical",
     "OrderedLogistic",
 ]
@@ -809,6 +810,115 @@ def logp(self, value):
         return bound(tt.log(p) + logpow(1 - p, value - 1), 0 <= p, p <= 1, value >= 1)
 
 
+class HyperGeometric(Discrete):
+    R"""
+    Discrete hypergeometric distribution.
+
+    The probability of :math:`x` successes in a sequence of :math:`n` bernoulli
+    trials taken without replacement from a population of :math:`N` objects,
+    containing :math:`k` good (or successful or Type I) objects.
+    The pmf of this distribution is
+
+    .. math:: f(x \mid N, n, k) = \frac{\binom{k}{x}\binom{N-k}{n-x}}{\binom{N}{n}}
+
+    .. plot::
+
+        import matplotlib.pyplot as plt
+        import numpy as np
+        import scipy.stats as st
+        plt.style.use('seaborn-darkgrid')
+        x = np.arange(1, 15)
+        N = 50
+        k = 10
+        for n in [20, 25]:
+            pmf = st.hypergeom.pmf(x, N, k, n)
+            plt.plot(x, pmf, '-o', label='N = {}, k = {}, n = {}'.format(N, k, n))
+        plt.xlabel('x', fontsize=12)
+        plt.ylabel('f(x)', fontsize=12)
+        plt.legend(loc=1)
+        plt.show()
+
+    ========  ============================================================
+    Support   :math:`x \in \left[\max(0, n - N + k), \min(k, n)\right]`
+    Mean      :math:`\dfrac{nk}{N}`
+    Variance  :math:`\dfrac{(N-n)nk(N-k)}{(N-1)N^2}`
+    ========  ============================================================
+
+    Parameters
+    ----------
+    N : integer
+        Total size of the population
+    k : integer
+        Number of successful individuals in the population
+    n : integer
+        Number of samples drawn from the population
+    """
+
+    def __init__(self, N, k, n, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.N = intX(N)
+        self.k = intX(k)
+        self.n = intX(n)
+        # Mode of the hypergeometric pmf: floor((n + 1)(k + 1) / (N + 2)).
+        self.mode = intX(tt.floor((n + 1) * (k + 1) / (N + 2)))
+
+    def random(self, point=None, size=None):
+        r"""
+        Draw random values from HyperGeometric distribution.
+
+        Parameters
+        ----------
+        point : dict, optional
+            Dict of variable values on which random values are to be
+            conditioned (uses default point if not specified).
+        size : int, optional
+            Desired size of random sample (returns one sample if not
+            specified).
+
+        Returns
+        -------
+        array
+        """
+        N, k, n = draw_values([self.N, self.k, self.n], point=point, size=size)
+        return generate_samples(self._random, N, k, n, dist_shape=self.shape, size=size)
+
+    def _random(self, N, k, n, size=None):
+        r"""
+        Translate ``(N, k, n)`` into numpy's ``(ngood, nbad, nsample)`` and sample.
+
+        ``np.random.hypergeometric`` is parameterised by the number of good and
+        bad objects rather than by the population size, so ``nbad = N - k``.
+        NOTE(review): assumes ``generate_samples`` forwards ``size`` as a
+        keyword argument -- confirm against its implementation.
+        """
+        return np.random.hypergeometric(ngood=k, nbad=N - k, nsample=n, size=size)
+
+    def logp(self, value):
+        r"""
+        Calculate log-probability of HyperGeometric distribution at specified value.
+
+        Parameters
+        ----------
+        value : numeric
+            Value(s) for which log-probability is calculated. If the log probabilities for multiple
+            values are desired the values must be provided in a numpy array or theano tensor
+
+        Returns
+        -------
+        TensorVariable
+        """
+        N = self.N
+        k = self.k
+        n = self.n
+        tot, good = N, k
+        bad = tot - good
+        # Each log-binomial is written with betaln via
+        # log C(a, b) = betaln(a + 1, 1) - betaln(a - b + 1, b + 1),
+        # giving log C(good, value) + log C(bad, n - value) - log C(tot, n).
+        result = (
+            betaln(good + 1, 1)
+            + betaln(bad + 1, 1)
+            + betaln(tot - n + 1, n + 1)
+            - betaln(value + 1, good - value + 1)
+            - betaln(n - value + 1, bad - n + value + 1)
+            - betaln(tot + 1, 1)
+        )
+        # Support is max(0, n - N + k) <= value <= min(k, n).
+        lower = tt.switch(tt.gt(n - N + k, 0), n - N + k, 0)
+        upper = tt.switch(tt.lt(k, n), k, n)
+        # bound() needs every condition to hold, so the guard must be
+        # "value IS an integer" (the previous condition was its negation).
+        is_integer = tt.eq(value, tt.floor(value))
+        return bound(result, lower <= value, value <= upper, is_integer)
+
+
 class DiscreteUniform(Discrete):
     R"""
     Discrete uniform distribution.
diff --git a/pymc3/tests/test_distributions.py b/pymc3/tests/test_distributions.py
@@ -75,6 +75,7 @@
     Rice,
     Kumaraswamy,
     Moyal,
+    HyperGeometric,
 )
 
 from ..distributions import continuous
@@ -790,6 +791,14 @@ def test_geometric(self):
             Geometric, Nat, {"p": Unit}, lambda value, p: np.log(sp.geom.pmf(value, p))
         )
 
+    def test_hypergeometric(self):
+        # scipy orders hypergeom's arguments as (M, n, N), i.e.
+        # (population size, number of successes, number of draws).
+        def scipy_logpmf(value, N, n, k):
+            return sp.hypergeom.logpmf(value, N, k, n)
+
+        param_domains = {"N": NatSmall, "n": NatSmall, "k": NatSmall}
+        self.pymc3_matches_scipy(HyperGeometric, Nat, param_domains, scipy_logpmf)
+
     def test_negative_binomial(self):
         def test_fun(value, mu, alpha):
             return sp.nbinom.logpmf(value, alpha, 1 - mu / (mu + alpha))
diff --git a/pymc3/tests/test_distributions_random.py b/pymc3/tests/test_distributions_random.py
@@ -506,6 +506,10 @@ class TestGeometric(BaseTestCases.BaseTestCase):
     distribution = pm.Geometric
     params = {"p": 0.5}
 
+class TestHyperGeometric(BaseTestCases.BaseTestCase):
+    # Population of 50 holding 10 successes, with 25 draws.
+    params = dict(N=50, n=25, k=10)
+    distribution = pm.HyperGeometric
+
 
 class TestMoyal(BaseTestCases.BaseTestCase):
     distribution = pm.Moyal
@@ -739,6 +743,9 @@ def ref_rand(size, alpha, mu):
     def test_geometric(self):
         pymc3_random_discrete(pm.Geometric, {"p": Unit}, size=500, fails=50, ref_rand=nr.geometric)
 
+    def test_hypergeometric(self):
+        def ref_rand(size, N, k, n):
+            # The reference sampler receives the distribution's parameter
+            # names as keywords; scipy orders them as (M, n, N) =
+            # (population size, number of successes, number of draws).
+            return st.hypergeom.rvs(M=N, n=k, N=n, size=size)
+
+        # NOTE(review): the Nat domains admit combinations such as k > N or
+        # n > N, which are not valid hypergeometric parameters -- consider
+        # restricting the domains so that k <= N and n <= N always hold.
+        pymc3_random_discrete(
+            pm.HyperGeometric,
+            {"N": Nat, "n": Nat, "k": Nat},
+            size=500,
+            fails=50,
+            ref_rand=ref_rand,
+        )
+
     def test_discrete_uniform(self):
         def ref_rand(size, lower, upper):
             return st.randint.rvs(lower, upper + 1, size=size)