Skip to content

ENH: Improve numerical stability for window functions skew and kurt #37557

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 20 commits into from
Nov 9, 2020
Merged
Show file tree
Hide file tree
Changes from 13 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.2.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -228,6 +228,7 @@ Other enhancements
- :class:`Rolling` now supports the ``closed`` argument for fixed windows (:issue:`34315`)
- :class:`DatetimeIndex` and :class:`Series` with ``datetime64`` or ``datetime64tz`` dtypes now support ``std`` (:issue:`37436`)
- :class:`Window` now supports all Scipy window types in ``win_type`` with flexible keyword argument support (:issue:`34556`)
- Improve numerical stability for :meth:`Rolling.skew()`, :meth:`Rolling.kurt()`, :meth:`Expanding.skew()` and :meth:`Expanding.kurt()` through implementation of Kahan summation (:issue:`6929`)

.. _whatsnew_120.api_breaking.python:

Expand Down
175 changes: 136 additions & 39 deletions pandas/_libs/window/aggregations.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -459,40 +459,69 @@ cdef inline float64_t calc_skew(int64_t minp, int64_t nobs,

cdef inline void add_skew(float64_t val, int64_t *nobs,
float64_t *x, float64_t *xx,
float64_t *xxx) nogil:
float64_t *xxx,
float64_t *compensation_x,
float64_t *compensation_xx,
float64_t *compensation_xxx) nogil:
""" add a value from the skew calc """
cdef:
float64_t y, t

# Not NaN
if notnan(val):
nobs[0] = nobs[0] + 1

# seriously don't ask me why this is faster
x[0] = x[0] + val
xx[0] = xx[0] + val * val
xxx[0] = xxx[0] + val * val * val
y = val - compensation_x[0]
t = x[0] + y
compensation_x[0] = t - x[0] - y
x[0] = t
y = val * val - compensation_xx[0]
t = xx[0] + y
compensation_xx[0] = t - xx[0] - y
xx[0] = t
y = val * val * val - compensation_xxx[0]
t = xxx[0] + y
compensation_xxx[0] = t - xxx[0] - y
xxx[0] = t


cdef inline void remove_skew(float64_t val, int64_t *nobs,
float64_t *x, float64_t *xx,
float64_t *xxx) nogil:
float64_t *xxx,
float64_t *compensation_x,
float64_t *compensation_xx,
float64_t *compensation_xxx) nogil:
""" remove a value from the skew calc """
cdef:
float64_t y, t

# Not NaN
if notnan(val):
nobs[0] = nobs[0] - 1

# seriously don't ask me why this is faster
x[0] = x[0] - val
xx[0] = xx[0] - val * val
xxx[0] = xxx[0] - val * val * val
y = - val - compensation_x[0]
t = x[0] + y
compensation_x[0] = t - x[0] - y
x[0] = t
y = - val * val - compensation_xx[0]
t = xx[0] + y
compensation_xx[0] = t - xx[0] - y
xx[0] = t
y = - val * val * val - compensation_xxx[0]
t = xxx[0] + y
compensation_xxx[0] = t - xxx[0] - y
xxx[0] = t


def roll_skew(ndarray[float64_t] values, ndarray[int64_t] start,
ndarray[int64_t] end, int64_t minp):
cdef:
float64_t val, prev
float64_t val, prev, min_val, mean_val, sum_val = 0
float64_t compensation_xxx_add = 0, compensation_xxx_remove = 0
float64_t compensation_xx_add = 0, compensation_xx_remove = 0
float64_t compensation_x_add = 0, compensation_x_remove = 0
float64_t x = 0, xx = 0, xxx = 0
int64_t nobs = 0, i, j, N = len(values)
int64_t nobs = 0, i, j, N = len(values), nobs_mean = 0
int64_t s, e
ndarray[float64_t] output
bint is_monotonic_increasing_bounds
Expand All @@ -502,9 +531,20 @@ def roll_skew(ndarray[float64_t] values, ndarray[int64_t] start,
start, end
)
output = np.empty(N, dtype=float)
min_val = np.nanmin(values)

with nogil:
for i in range(0, N):
val = values[i]
if notnan(val):
nobs_mean += 1
sum_val += val
mean_val = sum_val / nobs_mean
# Other cases would lead to imprecision for smallest values
if min_val - mean_val > -1e5:
values = values - round(mean_val)

with nogil:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

you can leave the original nogil right? e.g. no reason to do it twice?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The subtraction of the mean does not work with nogil; that is the reason I did it there again.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

umm why? this is array - scalar right?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thought so too, but got an error. Will try again

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmmm, getting the following error without gil:

Error compiling Cython file:
------------------------------------------------------------
...
                nobs_mean += 1
                sum_val += val
        mean_val = sum_val / nobs_mean
        # Other cases would lead to imprecision for smallest values
        if min_val - mean_val > -1e5:
            values = values - round(mean_val)
                                   ^
------------------------------------------------------------

pandas/_libs/window/aggregations.pyx:545:36: Converting to Python object not allowed without gil
Traceback (most recent call last):
  File "setup.py", line 790, in <module>
    setup_package()
  File "setup.py", line 760, in setup_package
    ext_modules=maybe_cythonize(extensions, compiler_directives=directives),
  File "setup.py", line 539, in maybe_cythonize
    return cythonize(extensions, *args, **kwargs)
  File "/home/developer/anaconda3/envs/omi_reports/pandas-dev/lib/python3.8/site-packages/Cython/Build/Dependencies.py", line 1102, in cythonize
    cythonize_one(*args)
  File "/home/developer/anaconda3/envs/omi_reports/pandas-dev/lib/python3.8/site-packages/Cython/Build/Dependencies.py", line 1225, in cythonize_one
    raise CompileError(None, pyx_file)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hm no, mean_value is a float. This does not work without round either. Maybe the broadcasting is the problem here?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yeah maybe have to use np.broadcast_to

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Found a solution for the round issue. Used a loop for the subtraction, because np.broadcast_to did not work either. No matter this, I think the loop is the most intuitive solution

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

from libc.math cimport round?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That works, thanks very much.

for i in range(0, N):

s = start[i]
Expand All @@ -516,22 +556,24 @@ def roll_skew(ndarray[float64_t] values, ndarray[int64_t] start,

for j in range(s, e):
val = values[j]
add_skew(val, &nobs, &x, &xx, &xxx)
add_skew(val, &nobs, &x, &xx, &xxx, &compensation_x_add,
&compensation_xx_add, &compensation_xxx_add)

else:

# After the first window, observations can both be added
# and removed
# calculate deletes
for j in range(start[i - 1], s):
val = values[j]
remove_skew(val, &nobs, &x, &xx, &xxx, &compensation_x_remove,
&compensation_xx_remove, &compensation_xxx_remove)

# calculate adds
for j in range(end[i - 1], e):
val = values[j]
add_skew(val, &nobs, &x, &xx, &xxx)

# calculate deletes
for j in range(start[i - 1], s):
val = values[j]
remove_skew(val, &nobs, &x, &xx, &xxx)
add_skew(val, &nobs, &x, &xx, &xxx, &compensation_x_add,
&compensation_xx_add, &compensation_xxx_add)

output[i] = calc_skew(minp, nobs, x, xx, xxx)

Expand Down Expand Up @@ -586,42 +628,80 @@ cdef inline float64_t calc_kurt(int64_t minp, int64_t nobs,

cdef inline void add_kurt(float64_t val, int64_t *nobs,
float64_t *x, float64_t *xx,
float64_t *xxx, float64_t *xxxx) nogil:
float64_t *xxx, float64_t *xxxx,
float64_t *compensation_x,
float64_t *compensation_xx,
float64_t *compensation_xxx,
float64_t *compensation_xxxx) nogil:
""" add a value from the kurotic calc """
cdef:
float64_t y, t

# Not NaN
if notnan(val):
nobs[0] = nobs[0] + 1

# seriously don't ask me why this is faster
x[0] = x[0] + val
xx[0] = xx[0] + val * val
xxx[0] = xxx[0] + val * val * val
xxxx[0] = xxxx[0] + val * val * val * val
y = val - compensation_x[0]
t = x[0] + y
compensation_x[0] = t - x[0] - y
x[0] = t
y = val * val - compensation_xx[0]
t = xx[0] + y
compensation_xx[0] = t - xx[0] - y
xx[0] = t
y = val * val * val - compensation_xxx[0]
t = xxx[0] + y
compensation_xxx[0] = t - xxx[0] - y
xxx[0] = t
y = val * val * val * val - compensation_xxxx[0]
t = xxxx[0] + y
compensation_xxxx[0] = t - xxxx[0] - y
xxxx[0] = t


cdef inline void remove_kurt(float64_t val, int64_t *nobs,
float64_t *x, float64_t *xx,
float64_t *xxx, float64_t *xxxx) nogil:
float64_t *xxx, float64_t *xxxx,
float64_t *compensation_x,
float64_t *compensation_xx,
float64_t *compensation_xxx,
float64_t *compensation_xxxx) nogil:
""" remove a value from the kurotic calc """
cdef:
float64_t y, t

# Not NaN
if notnan(val):
nobs[0] = nobs[0] - 1

# seriously don't ask me why this is faster
x[0] = x[0] - val
xx[0] = xx[0] - val * val
xxx[0] = xxx[0] - val * val * val
xxxx[0] = xxxx[0] - val * val * val * val
y = - val - compensation_x[0]
t = x[0] + y
compensation_x[0] = t - x[0] - y
x[0] = t
y = - val * val - compensation_xx[0]
t = xx[0] + y
compensation_xx[0] = t - xx[0] - y
xx[0] = t
y = - val * val * val - compensation_xxx[0]
t = xxx[0] + y
compensation_xxx[0] = t - xxx[0] - y
xxx[0] = t
y = - val * val * val * val - compensation_xxxx[0]
t = xxxx[0] + y
compensation_xxxx[0] = t - xxxx[0] - y
xxxx[0] = t


def roll_kurt(ndarray[float64_t] values, ndarray[int64_t] start,
ndarray[int64_t] end, int64_t minp):
cdef:
float64_t val, prev
float64_t val, prev, mean_val, min_val, sum_val = 0
float64_t compensation_xxxx_add = 0, compensation_xxxx_remove = 0
float64_t compensation_xxx_remove = 0, compensation_xxx_add = 0
float64_t compensation_xx_remove = 0, compensation_xx_add = 0
float64_t compensation_x_remove = 0, compensation_x_add = 0
float64_t x = 0, xx = 0, xxx = 0, xxxx = 0
int64_t nobs = 0, i, j, s, e, N = len(values)
int64_t nobs = 0, i, j, s, e, N = len(values), nobs_mean = 0
ndarray[float64_t] output
bint is_monotonic_increasing_bounds

Expand All @@ -630,6 +710,18 @@ def roll_kurt(ndarray[float64_t] values, ndarray[int64_t] start,
start, end
)
output = np.empty(N, dtype=float)
min_val = np.nanmin(values)

with nogil:
for i in range(0, N):
val = values[i]
if notnan(val):
nobs_mean += 1
sum_val += val
mean_val = sum_val / nobs_mean
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

umm, why are you adding this?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The issue mentioned above describes a use case where the series is shifted by 5000. In this case, if we do not subtract the mean, the decimal places will be so small that they will be ignored, leading to the numerical imprecision described in the issue.

Kahan summation only solves issues when the range of the values between the windows is really big. I could post a more precise and explicit example, if that helps.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

i think i understand the issue, but am unclear why you are not always just subtracting the mean rather than conditionally.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Aaah sorry. Misunderstood you.

In the case of something like [1e12, 0.023, 0.04565, 0.343545, 0.343434], if we subtract the mean from the complete series, we lose precision for the second window starting at 0.023, because its absolute value gets really big too.

# Other cases would lead to imprecision for smallest values
if min_val - mean_val > -1e4:
values = values - round(mean_val)

with nogil:

Expand All @@ -643,20 +735,25 @@ def roll_kurt(ndarray[float64_t] values, ndarray[int64_t] start,
if i == 0 or not is_monotonic_increasing_bounds:

for j in range(s, e):
add_kurt(values[j], &nobs, &x, &xx, &xxx, &xxxx)
add_kurt(values[j], &nobs, &x, &xx, &xxx, &xxxx,
&compensation_x_add, &compensation_xx_add,
&compensation_xxx_add, &compensation_xxxx_add)

else:

# After the first window, observations can both be added
# and removed
# calculate deletes
for j in range(start[i - 1], s):
remove_kurt(values[j], &nobs, &x, &xx, &xxx, &xxxx,
&compensation_x_remove, &compensation_xx_remove,
&compensation_xxx_remove, &compensation_xxxx_remove)

# calculate adds
for j in range(end[i - 1], e):
add_kurt(values[j], &nobs, &x, &xx, &xxx, &xxxx)

# calculate deletes
for j in range(start[i - 1], s):
remove_kurt(values[j], &nobs, &x, &xx, &xxx, &xxxx)
add_kurt(values[j], &nobs, &x, &xx, &xxx, &xxxx,
&compensation_x_add, &compensation_xx_add,
&compensation_xxx_add, &compensation_xxxx_add)

output[i] = calc_kurt(minp, nobs, x, xx, xxx, xxxx)

Expand Down
10 changes: 10 additions & 0 deletions pandas/tests/window/test_expanding.py
Original file line number Diff line number Diff line change
Expand Up @@ -255,3 +255,13 @@ def test_expanding_sem(constructor):
result = pd.Series(result[0].values)
expected = pd.Series([np.nan] + [0.707107] * 2)
tm.assert_series_equal(result, expected)


@pytest.mark.parametrize("method", ["skew", "kurt"])
def test_expanding_skew_kurt_numerical_stability(method):
    # GH: 6929
    # skew and kurt are invariant under a constant shift of the data, so
    # adding a large offset must not change the result; this exercises the
    # Kahan-summation/mean-subtraction fix in the window aggregations.
    # Seed the RNG so that any failure is reproducible.
    np.random.seed(42)
    s = pd.Series(np.random.rand(10))
    expected = getattr(s.expanding(3), method)()
    s = s + 5000
    result = getattr(s.expanding(3), method)()
    tm.assert_series_equal(result, expected)
25 changes: 25 additions & 0 deletions pandas/tests/window/test_rolling.py
Original file line number Diff line number Diff line change
Expand Up @@ -1087,3 +1087,28 @@ def test_rolling_corr_timedelta_index(index, window):
result = x.rolling(window).corr(y)
expected = Series([np.nan, np.nan, 1, 1, 1], index=index)
tm.assert_almost_equal(result, expected)


@pytest.mark.parametrize("method", ["skew", "kurt"])
def test_rolling_skew_kurt_numerical_stability(method):
    # GH: 6929
    # skew and kurt are invariant under a constant shift of the data, so
    # adding a large offset must not change the result; this exercises the
    # Kahan-summation/mean-subtraction fix in the window aggregations.
    # Seed the RNG so that any failure is reproducible.
    np.random.seed(42)
    s = Series(np.random.rand(10))
    expected = getattr(s.rolling(3), method)()
    s = s + 50000
    result = getattr(s.rolling(3), method)()
    tm.assert_series_equal(result, expected)


@pytest.mark.parametrize(
    ("method", "values"),
    [
        ("skew", [2.0, 0.854563, 0.0, 1.999984]),
        ("kurt", [4.0, -1.289256, -1.2, 3.999946]),
    ],
)
def test_rolling_skew_kurt_large_value_range(method, values):
    # GH: 37557
    # A huge leading value followed by small ones stresses the numerical
    # stability of the rolling skew/kurt sums across the value range.
    data = Series([3000000, 1, 1, 2, 3, 4, 999])
    expected = Series([np.nan] * 3 + values)
    result = getattr(data.rolling(4), method)()
    tm.assert_series_equal(result, expected)