Skip to content

ENH: Improve numerical stability for window functions skew and kurt #37557

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 20 commits into from
Nov 9, 2020
Merged
Show file tree
Hide file tree
Changes from 13 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.2.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -228,6 +228,7 @@ Other enhancements
- :class:`Rolling` now supports the ``closed`` argument for fixed windows (:issue:`34315`)
- :class:`DatetimeIndex` and :class:`Series` with ``datetime64`` or ``datetime64tz`` dtypes now support ``std`` (:issue:`37436`)
- :class:`Window` now supports all Scipy window types in ``win_type`` with flexible keyword argument support (:issue:`34556`)
- Improve numerical stability for :meth:`Rolling.skew()`, :meth:`Rolling.kurt()`, :meth:`Expanding.skew()` and :meth:`Expanding.kurt()` through implementation of Kahan summation (:issue:`6929`)

.. _whatsnew_120.api_breaking.python:

Expand Down
175 changes: 136 additions & 39 deletions pandas/_libs/window/aggregations.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -459,40 +459,69 @@ cdef inline float64_t calc_skew(int64_t minp, int64_t nobs,

cdef inline void add_skew(float64_t val, int64_t *nobs,
float64_t *x, float64_t *xx,
float64_t *xxx) nogil:
float64_t *xxx,
float64_t *compensation_x,
float64_t *compensation_xx,
float64_t *compensation_xxx) nogil:
""" add a value from the skew calc """
cdef:
float64_t y, t

# Not NaN
if notnan(val):
nobs[0] = nobs[0] + 1

# seriously don't ask me why this is faster
x[0] = x[0] + val
xx[0] = xx[0] + val * val
xxx[0] = xxx[0] + val * val * val
y = val - compensation_x[0]
t = x[0] + y
compensation_x[0] = t - x[0] - y
x[0] = t
y = val * val - compensation_xx[0]
t = xx[0] + y
compensation_xx[0] = t - xx[0] - y
xx[0] = t
y = val * val * val - compensation_xxx[0]
t = xxx[0] + y
compensation_xxx[0] = t - xxx[0] - y
xxx[0] = t


cdef inline void remove_skew(float64_t val, int64_t *nobs,
float64_t *x, float64_t *xx,
float64_t *xxx) nogil:
float64_t *xxx,
float64_t *compensation_x,
float64_t *compensation_xx,
float64_t *compensation_xxx) nogil:
""" remove a value from the skew calc """
cdef:
float64_t y, t

# Not NaN
if notnan(val):
nobs[0] = nobs[0] - 1

# seriously don't ask me why this is faster
x[0] = x[0] - val
xx[0] = xx[0] - val * val
xxx[0] = xxx[0] - val * val * val
y = - val - compensation_x[0]
t = x[0] + y
compensation_x[0] = t - x[0] - y
x[0] = t
y = - val * val - compensation_xx[0]
t = xx[0] + y
compensation_xx[0] = t - xx[0] - y
xx[0] = t
y = - val * val * val - compensation_xxx[0]
t = xxx[0] + y
compensation_xxx[0] = t - xxx[0] - y
xxx[0] = t


def roll_skew(ndarray[float64_t] values, ndarray[int64_t] start,
ndarray[int64_t] end, int64_t minp):
cdef:
float64_t val, prev
float64_t val, prev, min_val, mean_val, sum_val = 0
float64_t compensation_xxx_add = 0, compensation_xxx_remove = 0
float64_t compensation_xx_add = 0, compensation_xx_remove = 0
float64_t compensation_x_add = 0, compensation_x_remove = 0
float64_t x = 0, xx = 0, xxx = 0
int64_t nobs = 0, i, j, N = len(values)
int64_t nobs = 0, i, j, N = len(values), nobs_mean = 0
int64_t s, e
ndarray[float64_t] output
bint is_monotonic_increasing_bounds
Expand All @@ -502,9 +531,20 @@ def roll_skew(ndarray[float64_t] values, ndarray[int64_t] start,
start, end
)
output = np.empty(N, dtype=float)
min_val = np.nanmin(values)

with nogil:
for i in range(0, N):
val = values[i]
if notnan(val):
nobs_mean += 1
sum_val += val
mean_val = sum_val / nobs_mean
# Other cases would lead to imprecision for smallest values
if min_val - mean_val > -1e5:
values = values - round(mean_val)

with nogil:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

you can leave the original nogil right? e.g. no reason to do it twice?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The subtraction of the mean does not work with nogil; that is the reason I did it there again.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

umm why? this is array - scalar right?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thought so too, but got an error. Will try again

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmmm, getting the following error without gil:

Error compiling Cython file:
------------------------------------------------------------
...
                nobs_mean += 1
                sum_val += val
        mean_val = sum_val / nobs_mean
        # Other cases would lead to imprecision for smallest values
        if min_val - mean_val > -1e5:
            values = values - round(mean_val)
                                   ^
------------------------------------------------------------

pandas/_libs/window/aggregations.pyx:545:36: Converting to Python object not allowed without gil
Traceback (most recent call last):
  File "setup.py", line 790, in <module>
    setup_package()
  File "setup.py", line 760, in setup_package
    ext_modules=maybe_cythonize(extensions, compiler_directives=directives),
  File "setup.py", line 539, in maybe_cythonize
    return cythonize(extensions, *args, **kwargs)
  File "/home/developer/anaconda3/envs/omi_reports/pandas-dev/lib/python3.8/site-packages/Cython/Build/Dependencies.py", line 1102, in cythonize
    cythonize_one(*args)
  File "/home/developer/anaconda3/envs/omi_reports/pandas-dev/lib/python3.8/site-packages/Cython/Build/Dependencies.py", line 1225, in cythonize_one
    raise CompileError(None, pyx_file)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hm no, mean_value is a float. This does not work without round either. Maybe the broadcasting is the problem here?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yeah maybe have to use np.broadcast_to

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Found a solution for the round issue. Used a loop for the subtraction, because np.broadcast_to did not work either. No matter this, I think the loop is the most intuitive solution

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

from libc.math cimport round?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That works, thanks very much.

for i in range(0, N):

s = start[i]
Expand All @@ -516,22 +556,24 @@ def roll_skew(ndarray[float64_t] values, ndarray[int64_t] start,

for j in range(s, e):
val = values[j]
add_skew(val, &nobs, &x, &xx, &xxx)
add_skew(val, &nobs, &x, &xx, &xxx, &compensation_x_add,
&compensation_xx_add, &compensation_xxx_add)

else:

# After the first window, observations can both be added
# and removed
# calculate deletes
for j in range(start[i - 1], s):
val = values[j]
remove_skew(val, &nobs, &x, &xx, &xxx, &compensation_x_remove,
&compensation_xx_remove, &compensation_xxx_remove)

# calculate adds
for j in range(end[i - 1], e):
val = values[j]
add_skew(val, &nobs, &x, &xx, &xxx)

# calculate deletes
for j in range(start[i - 1], s):
val = values[j]
remove_skew(val, &nobs, &x, &xx, &xxx)
add_skew(val, &nobs, &x, &xx, &xxx, &compensation_x_add,
&compensation_xx_add, &compensation_xxx_add)

output[i] = calc_skew(minp, nobs, x, xx, xxx)

Expand Down Expand Up @@ -586,42 +628,80 @@ cdef inline float64_t calc_kurt(int64_t minp, int64_t nobs,

cdef inline void add_kurt(float64_t val, int64_t *nobs,
float64_t *x, float64_t *xx,
float64_t *xxx, float64_t *xxxx) nogil:
float64_t *xxx, float64_t *xxxx,
float64_t *compensation_x,
float64_t *compensation_xx,
float64_t *compensation_xxx,
float64_t *compensation_xxxx) nogil:
""" add a value from the kurotic calc """
cdef:
float64_t y, t

# Not NaN
if notnan(val):
nobs[0] = nobs[0] + 1

# seriously don't ask me why this is faster
x[0] = x[0] + val
xx[0] = xx[0] + val * val
xxx[0] = xxx[0] + val * val * val
xxxx[0] = xxxx[0] + val * val * val * val
y = val - compensation_x[0]
t = x[0] + y
compensation_x[0] = t - x[0] - y
x[0] = t
y = val * val - compensation_xx[0]
t = xx[0] + y
compensation_xx[0] = t - xx[0] - y
xx[0] = t
y = val * val * val - compensation_xxx[0]
t = xxx[0] + y
compensation_xxx[0] = t - xxx[0] - y
xxx[0] = t
y = val * val * val * val - compensation_xxxx[0]
t = xxxx[0] + y
compensation_xxxx[0] = t - xxxx[0] - y
xxxx[0] = t


cdef inline void remove_kurt(float64_t val, int64_t *nobs,
float64_t *x, float64_t *xx,
float64_t *xxx, float64_t *xxxx) nogil:
float64_t *xxx, float64_t *xxxx,
float64_t *compensation_x,
float64_t *compensation_xx,
float64_t *compensation_xxx,
float64_t *compensation_xxxx) nogil:
""" remove a value from the kurotic calc """
cdef:
float64_t y, t

# Not NaN
if notnan(val):
nobs[0] = nobs[0] - 1

# seriously don't ask me why this is faster
x[0] = x[0] - val
xx[0] = xx[0] - val * val
xxx[0] = xxx[0] - val * val * val
xxxx[0] = xxxx[0] - val * val * val * val
y = - val - compensation_x[0]
t = x[0] + y
compensation_x[0] = t - x[0] - y
x[0] = t
y = - val * val - compensation_xx[0]
t = xx[0] + y
compensation_xx[0] = t - xx[0] - y
xx[0] = t
y = - val * val * val - compensation_xxx[0]
t = xxx[0] + y
compensation_xxx[0] = t - xxx[0] - y
xxx[0] = t
y = - val * val * val * val - compensation_xxxx[0]
t = xxxx[0] + y
compensation_xxxx[0] = t - xxxx[0] - y
xxxx[0] = t


def roll_kurt(ndarray[float64_t] values, ndarray[int64_t] start,
ndarray[int64_t] end, int64_t minp):
cdef:
float64_t val, prev
float64_t val, prev, mean_val, min_val, sum_val = 0
float64_t compensation_xxxx_add = 0, compensation_xxxx_remove = 0
float64_t compensation_xxx_remove = 0, compensation_xxx_add = 0
float64_t compensation_xx_remove = 0, compensation_xx_add = 0
float64_t compensation_x_remove = 0, compensation_x_add = 0
float64_t x = 0, xx = 0, xxx = 0, xxxx = 0
int64_t nobs = 0, i, j, s, e, N = len(values)
int64_t nobs = 0, i, j, s, e, N = len(values), nobs_mean = 0
ndarray[float64_t] output
bint is_monotonic_increasing_bounds

Expand All @@ -630,6 +710,18 @@ def roll_kurt(ndarray[float64_t] values, ndarray[int64_t] start,
start, end
)
output = np.empty(N, dtype=float)
min_val = np.nanmin(values)

with nogil:
for i in range(0, N):
val = values[i]
if notnan(val):
nobs_mean += 1
sum_val += val
mean_val = sum_val / nobs_mean
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

umm, why are you adding this?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The issue mentioned above describes a use case where the series is shifted by 5000. In this case, if we do not subtract the mean, the decimal places will be so small that they will be ignored, leading to the numerical imprecision described in the issue.

Kahan summation only solves issues when the range of the values between the windows is really big. I could post a more precise and explicit example, if that helps.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

i think i understand the issue, but am unclear why you are not always just subtracting the mean rather than conditionally.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Aaah sorry. Misunderstood you.

In the case of something like [1e12, 0.023, 0.04565, 0.343545, 0.343434], if we subtract the mean from the complete series, we lose precision for the second window starting at 0.023, because its absolute value gets really big too.

# Other cases would lead to imprecision for smallest values
if min_val - mean_val > -1e4:
values = values - round(mean_val)

with nogil:

Expand All @@ -643,20 +735,25 @@ def roll_kurt(ndarray[float64_t] values, ndarray[int64_t] start,
if i == 0 or not is_monotonic_increasing_bounds:

for j in range(s, e):
add_kurt(values[j], &nobs, &x, &xx, &xxx, &xxxx)
add_kurt(values[j], &nobs, &x, &xx, &xxx, &xxxx,
&compensation_x_add, &compensation_xx_add,
&compensation_xxx_add, &compensation_xxxx_add)

else:

# After the first window, observations can both be added
# and removed
# calculate deletes
for j in range(start[i - 1], s):
remove_kurt(values[j], &nobs, &x, &xx, &xxx, &xxxx,
&compensation_x_remove, &compensation_xx_remove,
&compensation_xxx_remove, &compensation_xxxx_remove)

# calculate adds
for j in range(end[i - 1], e):
add_kurt(values[j], &nobs, &x, &xx, &xxx, &xxxx)

# calculate deletes
for j in range(start[i - 1], s):
remove_kurt(values[j], &nobs, &x, &xx, &xxx, &xxxx)
add_kurt(values[j], &nobs, &x, &xx, &xxx, &xxxx,
&compensation_x_add, &compensation_xx_add,
&compensation_xxx_add, &compensation_xxxx_add)

output[i] = calc_kurt(minp, nobs, x, xx, xxx, xxxx)

Expand Down
10 changes: 10 additions & 0 deletions pandas/tests/window/test_expanding.py
Original file line number Diff line number Diff line change
Expand Up @@ -255,3 +255,13 @@ def test_expanding_sem(constructor):
result = pd.Series(result[0].values)
expected = pd.Series([np.nan] + [0.707107] * 2)
tm.assert_series_equal(result, expected)


@pytest.mark.parametrize("method", ["skew", "kurt"])
def test_expanding_skew_kurt_numerical_stability(method):
    # GH: 6929
    # skew and kurt are invariant under a constant shift of the data, so
    # adding a large offset must not change the result; this exercises the
    # Kahan-summation/mean-subtraction fix in the window aggregations.
    # Seed the RNG so that any failure is reproducible.
    np.random.seed(42)
    s = pd.Series(np.random.rand(10))
    expected = getattr(s.expanding(3), method)()
    s = s + 5000
    result = getattr(s.expanding(3), method)()
    tm.assert_series_equal(result, expected)
25 changes: 25 additions & 0 deletions pandas/tests/window/test_rolling.py
Original file line number Diff line number Diff line change
Expand Up @@ -1087,3 +1087,28 @@ def test_rolling_corr_timedelta_index(index, window):
result = x.rolling(window).corr(y)
expected = Series([np.nan, np.nan, 1, 1, 1], index=index)
tm.assert_almost_equal(result, expected)


@pytest.mark.parametrize("method", ["skew", "kurt"])
def test_rolling_skew_kurt_numerical_stability(method):
    # GH: 6929
    # skew and kurt are invariant under a constant shift of the data, so
    # adding a large offset must not change the result; this exercises the
    # Kahan-summation/mean-subtraction fix in the window aggregations.
    # Seed the RNG so that any failure is reproducible.
    np.random.seed(42)
    s = Series(np.random.rand(10))
    expected = getattr(s.rolling(3), method)()
    s = s + 50000
    result = getattr(s.rolling(3), method)()
    tm.assert_series_equal(result, expected)


@pytest.mark.parametrize(
    ("method", "values"),
    [
        ("skew", [2.0, 0.854563, 0.0, 1.999984]),
        ("kurt", [4.0, -1.289256, -1.2, 3.999946]),
    ],
)
def test_rolling_skew_kurt_large_value_range(method, values):
    # GH: 37557
    # A huge leading value followed by small ones stresses the numerical
    # stability of the rolling skew/kurt sums across the value range.
    data = Series([3000000, 1, 1, 2, 3, 4, 999])
    expected = Series([np.nan] * 3 + values)
    result = getattr(data.rolling(4), method)()
    tm.assert_series_equal(result, expected)