pandas-dev · hayd · Jun 1, 2014 · Jun 1, 2014 · hayd · May 31, 2014
diff --git a/pandas/compat/scipy.py b/pandas/compat/scipy.py
diff --git a/pandas/stats/misc.py b/pandas/stats/misc.py
@@ -42,6 +42,94 @@ def correl_ts(frame1, frame2):
 def correl_xs(frame1, frame2):
     return correl_ts(frame1.T, frame2.T)
 
+def percentileofscore(a, score, kind='rank'):
+    """Return the percentile rank of a score relative to a list of scores.
+
+    A `percentileofscore` of, for example, 80% means that 80% of the
+    scores in `a` are below the given score. In the case of gaps or
+    ties, the exact definition depends on the optional keyword, `kind`.
+
+    Parameters
+    ----------
+    a : array like
+        Array of scores to which `score` is compared.
+    score : int or float
+        Score that is compared to the elements in `a`.
+    kind : {'rank', 'weak', 'strict', 'mean'}, optional
+        This optional parameter specifies the interpretation of the
+        resulting score:
+
+        - "rank": Average percentage ranking of score.  In case of
+                  multiple matches, average the percentage rankings of
+                  all matching scores.
+        - "weak": Percentage of values less than or equal to the provided
+                  score, i.e. the definition of a cumulative distribution
+                  function.
+        - "strict": Similar to "weak", except that only values that are
+                    strictly less than the given score are counted.
+        - "mean": The average of the "weak" and "strict" scores, often used
+                  in testing.  See http://en.wikipedia.org/wiki/Percentile_rank
+
+    Returns
+    -------
+    pcos : float
+        Percentile-position of score (0-100) in `a`; NaN if `a` is empty.
+
+    Raises
+    ------
+    ValueError
+        If `kind` is not one of the four supported values.
+
+    Examples
+    --------
+    Three-quarters of the given values lie below a given score:
+
+    >>> percentileofscore([1, 2, 3, 4], 3)
+    75.0
+
+    With multiple matches, note how the scores of the two matches, 0.6
+    and 0.8 respectively, are averaged:
+
+    >>> percentileofscore([1, 2, 3, 3, 4], 3)
+    70.0
+
+    Only 2/5 values are strictly less than 3, but 4/5 values are less than
+    or equal to 3; "mean" is the average of those two scores:
+
+    >>> percentileofscore([1, 2, 3, 3, 4], 3, kind='strict')
+    40.0
+    >>> percentileofscore([1, 2, 3, 3, 4], 3, kind='weak')
+    80.0
+    >>> percentileofscore([1, 2, 3, 3, 4], 3, kind='mean')
+    60.0
+    """
+    values = np.asarray(a)
+    n = len(values)
+    if kind not in ('rank', 'strict', 'weak', 'mean'):
+        raise ValueError("kind can only be 'rank', 'strict', 'weak' or 'mean'")
+    if n == 0:
+        # No scores to compare against: the percentile is undefined.
+        return np.nan
+
+    if kind == 'rank':
+        # Ranks are 1-based when `score` occurs in `a`; otherwise it is
+        # inserted and the 0-based positions count the values below it.
+        first_rank = 1.0
+        if not np.any(values == score):
+            values = np.append(values, score)
+            first_rank = 0.0
+        ranks = np.arange(len(values)) + first_rank
+        # Use the mask itself; a list-wrapped mask is rejected by new NumPy.
+        matches = np.sort(values) == score
+        return float(np.mean(ranks[matches]) / n * 100.0)
+
+    below = np.count_nonzero(values < score)
+    at_or_below = np.count_nonzero(values <= score)
+    if kind == 'strict':
+        return below / float(n) * 100
+    if kind == 'weak':
+        return at_or_below / float(n) * 100
+    return (below + at_or_below) * 50 / float(n)
 
 def percentileRank(frame, column=None, kind='mean'):
     """
@@ -76,7 +164,6 @@ def percentileRank(frame, column=None, kind='mean'):
     -------
     TimeSeries or DataFrame, depending on input
     """
-    from pandas.compat.scipy import percentileofscore
     fun = lambda xs, score: percentileofscore(remove_na(xs),
                                               score, kind=kind)
 

diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py
@@ -11142,7 +11142,8 @@ def test_cumprod(self):
         df.cumprod(1)
 
     def test_rank(self):
-        from pandas.compat.scipy import rankdata
+        _skip_if_no_scipy()
+        from scipy.stats import rankdata
 
         self.frame['A'][::2] = np.nan
         self.frame['B'][::3] = np.nan
@@ -11235,7 +11236,8 @@ def test_rank2(self):
 
 
     def test_rank_na_option(self):
-        from pandas.compat.scipy import rankdata
+        _skip_if_no_scipy()
+        from scipy.stats import rankdata
 
         self.frame['A'][::2] = np.nan
         self.frame['B'][::3] = np.nan

diff --git a/pandas/tests/test_series.py b/pandas/tests/test_series.py
@@ -4063,7 +4063,8 @@ def test_nsmallest_nlargest(self):
         assert_series_equal(s.nsmallest(), s.iloc[[2, 3, 0, 4]])
 
     def test_rank(self):
-        from pandas.compat.scipy import rankdata
+        _skip_if_no_scipy()
+        from scipy.stats import rankdata
 
         self.ts[::2] = np.nan
         self.ts[:10][::3] = 4.

diff --git a/pandas/tests/test_tseries.py b/pandas/tests/test_tseries.py
@@ -9,6 +9,12 @@
 import pandas.algos as algos
 from datetime import datetime
 
+def _skip_if_no_scipy():  # test guard: call before importing from scipy
+    try:
+        import scipy.stats  # imported only to probe that it is available
+    except ImportError:  # NOTE(review): needs a module-level nose import
+        raise nose.SkipTest("scipy not installed")
+
 class TestTseriesUtil(tm.TestCase):
     _multiprocess_can_split_ = True
 
@@ -335,7 +341,8 @@ def test_convert_objects_complex_number():
 
 
 def test_rank():
-    from pandas.compat.scipy import rankdata
+    _skip_if_no_scipy()
+    from scipy.stats import rankdata
 
     def _check(arr):
         mask = ~np.isfinite(arr)