@@ -2406,6 +2406,13 @@ math_fmod_impl(PyObject *module, double x, double y)
2406
2406
/*
Given an *n* length *vec* of values and a value *max*, compute:

    sqrt(sum((x * scale) ** 2 for x in vec)) / scale

where *scale* is the reciprocal of the first power of two
greater than *max*, or compute:

    max * sqrt(sum((x / max) ** 2 for x in vec))

The value of the *max* variable must be non-negative and
@@ -2425,19 +2432,25 @@ The *csum* variable tracks the cumulative sum and *frac* tracks
the cumulative fractional errors at each step. Since this
variant assumes that |csum| >= |x| at each step, we establish
the precondition by starting the accumulation from 1.0 which
represents the largest possible value of (x * scale)**2 or (x / max)**2.

After the loop is finished, the initial 1.0 is subtracted out
for a net zero effect on the final sum. Since *csum* will be
greater than 1.0, the subtraction of 1.0 will not cause
fractional digits to be dropped from *csum*.

To get the full benefit from compensated summation, the
largest addend should be in the range: 0.5 <= x <= 1.0.
Accordingly, scaling or division by *max* should not be skipped
even if not otherwise needed to prevent overflow or loss of precision.

*/
2436
2448
2437
2449
static inline double
2438
2450
vector_norm (Py_ssize_t n , double * vec , double max , int found_nan )
2439
2451
{
2440
- double x , csum = 1.0 , oldcsum , frac = 0.0 ;
2452
+ double x , csum = 1.0 , oldcsum , frac = 0.0 , scale ;
2453
+ int max_e ;
2441
2454
Py_ssize_t i ;
2442
2455
2443
2456
if (Py_IS_INFINITY (max )) {
@@ -2449,14 +2462,36 @@ vector_norm(Py_ssize_t n, double *vec, double max, int found_nan)
2449
2462
if (max == 0.0 || n <= 1 ) {
2450
2463
return max ;
2451
2464
}
2465
+ frexp (max , & max_e );
2466
+ if (max_e >= -1023 ) {
2467
+ scale = ldexp (1.0 , - max_e );
2468
+ assert (max * scale >= 0.5 );
2469
+ assert (max * scale < 1.0 );
2470
+ for (i = 0 ; i < n ; i ++ ) {
2471
+ x = vec [i ];
2472
+ assert (Py_IS_FINITE (x ) && fabs (x ) <= max );
2473
+ x *= scale ;
2474
+ x = x * x ;
2475
+ assert (x <= 1.0 );
2476
+ assert (csum >= x );
2477
+ oldcsum = csum ;
2478
+ csum += x ;
2479
+ frac += (oldcsum - csum ) + x ;
2480
+ }
2481
+ return sqrt (csum - 1.0 + frac ) / scale ;
2482
+ }
2483
+ /* When max_e < -1023, ldexp(1.0, -max_e) overflows.
2484
+ So instead of multiplying by a scale, we just divide by *max*.
2485
+ */
2452
2486
for (i = 0 ; i < n ; i ++ ) {
2453
2487
x = vec [i ];
2454
2488
assert (Py_IS_FINITE (x ) && fabs (x ) <= max );
2455
2489
x /= max ;
2456
2490
x = x * x ;
2491
+ assert (x <= 1.0 );
2492
+ assert (csum >= x );
2457
2493
oldcsum = csum ;
2458
2494
csum += x ;
2459
- assert (csum >= x );
2460
2495
frac += (oldcsum - csum ) + x ;
2461
2496
}
2462
2497
return max * sqrt (csum - 1.0 + frac );
0 commit comments