-
-
Notifications
You must be signed in to change notification settings - Fork 18.6k
ENH: Improve numerical stability for window functions skew and kurt #37557
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 13 commits
d4b1a9b
c634539
bf88a63
e57f97c
a71f645
a1874d8
288ac22
eb35925
480ba26
c58be86
fa460b1
ce0a9a2
e643a45
5998b7b
7109729
bfedac0
b03d5c1
852f3ca
4688648
50ea50a
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -459,40 +459,69 @@ cdef inline float64_t calc_skew(int64_t minp, int64_t nobs, | |
|
||
cdef inline void add_skew(float64_t val, int64_t *nobs, | ||
float64_t *x, float64_t *xx, | ||
float64_t *xxx) nogil: | ||
float64_t *xxx, | ||
float64_t *compensation_x, | ||
float64_t *compensation_xx, | ||
float64_t *compensation_xxx) nogil: | ||
""" add a value from the skew calc """ | ||
cdef: | ||
float64_t y, t | ||
|
||
# Not NaN | ||
if notnan(val): | ||
nobs[0] = nobs[0] + 1 | ||
|
||
# seriously don't ask me why this is faster | ||
x[0] = x[0] + val | ||
xx[0] = xx[0] + val * val | ||
xxx[0] = xxx[0] + val * val * val | ||
y = val - compensation_x[0] | ||
t = x[0] + y | ||
compensation_x[0] = t - x[0] - y | ||
x[0] = t | ||
y = val * val - compensation_xx[0] | ||
t = xx[0] + y | ||
compensation_xx[0] = t - xx[0] - y | ||
xx[0] = t | ||
y = val * val * val - compensation_xxx[0] | ||
t = xxx[0] + y | ||
compensation_xxx[0] = t - xxx[0] - y | ||
xxx[0] = t | ||
|
||
|
||
cdef inline void remove_skew(float64_t val, int64_t *nobs, | ||
float64_t *x, float64_t *xx, | ||
float64_t *xxx) nogil: | ||
float64_t *xxx, | ||
float64_t *compensation_x, | ||
float64_t *compensation_xx, | ||
float64_t *compensation_xxx) nogil: | ||
""" remove a value from the skew calc """ | ||
cdef: | ||
float64_t y, t | ||
|
||
# Not NaN | ||
if notnan(val): | ||
nobs[0] = nobs[0] - 1 | ||
|
||
# seriously don't ask me why this is faster | ||
x[0] = x[0] - val | ||
xx[0] = xx[0] - val * val | ||
xxx[0] = xxx[0] - val * val * val | ||
y = - val - compensation_x[0] | ||
t = x[0] + y | ||
compensation_x[0] = t - x[0] - y | ||
x[0] = t | ||
y = - val * val - compensation_xx[0] | ||
t = xx[0] + y | ||
compensation_xx[0] = t - xx[0] - y | ||
xx[0] = t | ||
y = - val * val * val - compensation_xxx[0] | ||
t = xxx[0] + y | ||
compensation_xxx[0] = t - xxx[0] - y | ||
xxx[0] = t | ||
|
||
|
||
def roll_skew(ndarray[float64_t] values, ndarray[int64_t] start, | ||
ndarray[int64_t] end, int64_t minp): | ||
cdef: | ||
float64_t val, prev | ||
float64_t val, prev, min_val, mean_val, sum_val = 0 | ||
float64_t compensation_xxx_add = 0, compensation_xxx_remove = 0 | ||
float64_t compensation_xx_add = 0, compensation_xx_remove = 0 | ||
float64_t compensation_x_add = 0, compensation_x_remove = 0 | ||
float64_t x = 0, xx = 0, xxx = 0 | ||
int64_t nobs = 0, i, j, N = len(values) | ||
int64_t nobs = 0, i, j, N = len(values), nobs_mean = 0 | ||
int64_t s, e | ||
ndarray[float64_t] output | ||
bint is_monotonic_increasing_bounds | ||
|
@@ -502,9 +531,20 @@ def roll_skew(ndarray[float64_t] values, ndarray[int64_t] start, | |
start, end | ||
) | ||
output = np.empty(N, dtype=float) | ||
min_val = np.nanmin(values) | ||
|
||
with nogil: | ||
for i in range(0, N): | ||
val = values[i] | ||
if notnan(val): | ||
nobs_mean += 1 | ||
sum_val += val | ||
mean_val = sum_val / nobs_mean | ||
# Other cases would lead to imprecision for smallest values | ||
if min_val - mean_val > -1e5: | ||
values = values - round(mean_val) | ||
|
||
with nogil: | ||
for i in range(0, N): | ||
|
||
s = start[i] | ||
|
@@ -516,22 +556,24 @@ def roll_skew(ndarray[float64_t] values, ndarray[int64_t] start, | |
|
||
for j in range(s, e): | ||
val = values[j] | ||
add_skew(val, &nobs, &x, &xx, &xxx) | ||
add_skew(val, &nobs, &x, &xx, &xxx, &compensation_x_add, | ||
&compensation_xx_add, &compensation_xxx_add) | ||
|
||
else: | ||
|
||
# After the first window, observations can both be added | ||
# and removed | ||
# calculate deletes | ||
for j in range(start[i - 1], s): | ||
val = values[j] | ||
remove_skew(val, &nobs, &x, &xx, &xxx, &compensation_x_remove, | ||
&compensation_xx_remove, &compensation_xxx_remove) | ||
|
||
# calculate adds | ||
for j in range(end[i - 1], e): | ||
val = values[j] | ||
add_skew(val, &nobs, &x, &xx, &xxx) | ||
|
||
# calculate deletes | ||
for j in range(start[i - 1], s): | ||
val = values[j] | ||
remove_skew(val, &nobs, &x, &xx, &xxx) | ||
add_skew(val, &nobs, &x, &xx, &xxx, &compensation_x_add, | ||
&compensation_xx_add, &compensation_xxx_add) | ||
|
||
output[i] = calc_skew(minp, nobs, x, xx, xxx) | ||
|
||
|
@@ -586,42 +628,80 @@ cdef inline float64_t calc_kurt(int64_t minp, int64_t nobs, | |
|
||
cdef inline void add_kurt(float64_t val, int64_t *nobs, | ||
float64_t *x, float64_t *xx, | ||
float64_t *xxx, float64_t *xxxx) nogil: | ||
float64_t *xxx, float64_t *xxxx, | ||
float64_t *compensation_x, | ||
float64_t *compensation_xx, | ||
float64_t *compensation_xxx, | ||
float64_t *compensation_xxxx) nogil: | ||
""" add a value from the kurotic calc """ | ||
cdef: | ||
float64_t y, t | ||
|
||
# Not NaN | ||
if notnan(val): | ||
nobs[0] = nobs[0] + 1 | ||
|
||
# seriously don't ask me why this is faster | ||
x[0] = x[0] + val | ||
xx[0] = xx[0] + val * val | ||
xxx[0] = xxx[0] + val * val * val | ||
xxxx[0] = xxxx[0] + val * val * val * val | ||
y = val - compensation_x[0] | ||
t = x[0] + y | ||
compensation_x[0] = t - x[0] - y | ||
x[0] = t | ||
y = val * val - compensation_xx[0] | ||
t = xx[0] + y | ||
compensation_xx[0] = t - xx[0] - y | ||
xx[0] = t | ||
y = val * val * val - compensation_xxx[0] | ||
t = xxx[0] + y | ||
compensation_xxx[0] = t - xxx[0] - y | ||
xxx[0] = t | ||
y = val * val * val * val - compensation_xxxx[0] | ||
t = xxxx[0] + y | ||
compensation_xxxx[0] = t - xxxx[0] - y | ||
xxxx[0] = t | ||
|
||
|
||
cdef inline void remove_kurt(float64_t val, int64_t *nobs, | ||
float64_t *x, float64_t *xx, | ||
float64_t *xxx, float64_t *xxxx) nogil: | ||
float64_t *xxx, float64_t *xxxx, | ||
float64_t *compensation_x, | ||
float64_t *compensation_xx, | ||
float64_t *compensation_xxx, | ||
float64_t *compensation_xxxx) nogil: | ||
""" remove a value from the kurotic calc """ | ||
cdef: | ||
float64_t y, t | ||
|
||
# Not NaN | ||
if notnan(val): | ||
nobs[0] = nobs[0] - 1 | ||
|
||
# seriously don't ask me why this is faster | ||
x[0] = x[0] - val | ||
xx[0] = xx[0] - val * val | ||
xxx[0] = xxx[0] - val * val * val | ||
xxxx[0] = xxxx[0] - val * val * val * val | ||
y = - val - compensation_x[0] | ||
t = x[0] + y | ||
compensation_x[0] = t - x[0] - y | ||
x[0] = t | ||
y = - val * val - compensation_xx[0] | ||
t = xx[0] + y | ||
compensation_xx[0] = t - xx[0] - y | ||
xx[0] = t | ||
y = - val * val * val - compensation_xxx[0] | ||
t = xxx[0] + y | ||
compensation_xxx[0] = t - xxx[0] - y | ||
xxx[0] = t | ||
y = - val * val * val * val - compensation_xxxx[0] | ||
t = xxxx[0] + y | ||
compensation_xxxx[0] = t - xxxx[0] - y | ||
xxxx[0] = t | ||
|
||
|
||
def roll_kurt(ndarray[float64_t] values, ndarray[int64_t] start, | ||
ndarray[int64_t] end, int64_t minp): | ||
cdef: | ||
float64_t val, prev | ||
float64_t val, prev, mean_val, min_val, sum_val = 0 | ||
float64_t compensation_xxxx_add = 0, compensation_xxxx_remove = 0 | ||
float64_t compensation_xxx_remove = 0, compensation_xxx_add = 0 | ||
float64_t compensation_xx_remove = 0, compensation_xx_add = 0 | ||
float64_t compensation_x_remove = 0, compensation_x_add = 0 | ||
float64_t x = 0, xx = 0, xxx = 0, xxxx = 0 | ||
int64_t nobs = 0, i, j, s, e, N = len(values) | ||
int64_t nobs = 0, i, j, s, e, N = len(values), nobs_mean = 0 | ||
ndarray[float64_t] output | ||
bint is_monotonic_increasing_bounds | ||
|
||
|
@@ -630,6 +710,18 @@ def roll_kurt(ndarray[float64_t] values, ndarray[int64_t] start, | |
start, end | ||
) | ||
output = np.empty(N, dtype=float) | ||
min_val = np.nanmin(values) | ||
|
||
with nogil: | ||
for i in range(0, N): | ||
val = values[i] | ||
if notnan(val): | ||
nobs_mean += 1 | ||
sum_val += val | ||
mean_val = sum_val / nobs_mean | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. umm, why are you adding this? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The issues mentioned above describes a use case. where the series is shifted by 5000. In this case, if we do not subtract the mean, The decimal places will be so small, that they will be ignored. Leading to the numerical imprecision described in the issue. Kahan summation only solves issues, when the range of the values between the windows is really big. Could post a more precise and explicit example, if this helps There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. i think i understand the issue, but am unclear why you are not always just subtracting the mean rather than conditionally. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Aaah sorry. Misunderstood you. In case of something like |
||
# Other cases would lead to imprecision for smallest values | ||
if min_val - mean_val > -1e4: | ||
values = values - round(mean_val) | ||
|
||
with nogil: | ||
|
||
|
@@ -643,20 +735,25 @@ def roll_kurt(ndarray[float64_t] values, ndarray[int64_t] start, | |
if i == 0 or not is_monotonic_increasing_bounds: | ||
|
||
for j in range(s, e): | ||
add_kurt(values[j], &nobs, &x, &xx, &xxx, &xxxx) | ||
add_kurt(values[j], &nobs, &x, &xx, &xxx, &xxxx, | ||
&compensation_x_add, &compensation_xx_add, | ||
&compensation_xxx_add, &compensation_xxxx_add) | ||
|
||
else: | ||
|
||
# After the first window, observations can both be added | ||
# and removed | ||
# calculate deletes | ||
for j in range(start[i - 1], s): | ||
remove_kurt(values[j], &nobs, &x, &xx, &xxx, &xxxx, | ||
&compensation_x_remove, &compensation_xx_remove, | ||
&compensation_xxx_remove, &compensation_xxxx_remove) | ||
|
||
# calculate adds | ||
for j in range(end[i - 1], e): | ||
add_kurt(values[j], &nobs, &x, &xx, &xxx, &xxxx) | ||
|
||
# calculate deletes | ||
for j in range(start[i - 1], s): | ||
remove_kurt(values[j], &nobs, &x, &xx, &xxx, &xxxx) | ||
add_kurt(values[j], &nobs, &x, &xx, &xxx, &xxxx, | ||
&compensation_x_add, &compensation_xx_add, | ||
&compensation_xxx_add, &compensation_xxxx_add) | ||
|
||
output[i] = calc_kurt(minp, nobs, x, xx, xxx, xxxx) | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
you can leave the original nogil right? e.g. no reason to do it twice?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The subtraction of the mean does not wirk with nogil, that is the reason did it there again
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
umm why? this is array - scalar right?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thought so too, but got an error. Will try again
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Hmmm, getting the following error without gil:
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Hm no, mean_value is a float. This does not work without round either. Maybe the broadcasting is the problem here?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
yeah maybe have to use
np.broadcast_to
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Found a solution for the round issue. Used a loop for the subtraction, because
np.broadcast_to
did not work either. No matter this, I think the loop is the most intuitive solutionThere was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
from libc.math cimport round
?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
That works, thanks very much.