@@ -3462,7 +3462,7 @@ Python code to generate the values:
3462
3462
reduced_fac_odd_part = fac_odd_part % (2**64)
3463
3463
print(f"{reduced_fac_odd_part:#018x}u")
3464
3464
*/
3465
- static uint64_t reduced_factorial_odd_part [] = {
3465
+ static const uint64_t reduced_factorial_odd_part [] = {
3466
3466
0x0000000000000001u , 0x0000000000000001u , 0x0000000000000001u , 0x0000000000000003u ,
3467
3467
0x0000000000000003u , 0x000000000000000fu , 0x000000000000002du , 0x000000000000013bu ,
3468
3468
0x000000000000013bu , 0x0000000000000b13u , 0x000000000000375fu , 0x0000000000026115u ,
@@ -3494,7 +3494,7 @@ Python code to generate the values:
3494
3494
inverted_fac_odd_part = pow(fac_odd_part, -1, 2**64)
3495
3495
print(f"{inverted_fac_odd_part:#018x}u")
3496
3496
*/
3497
- static uint64_t inverted_factorial_odd_part [] = {
3497
+ static const uint64_t inverted_factorial_odd_part [] = {
3498
3498
0x0000000000000001u , 0x0000000000000001u , 0x0000000000000001u , 0xaaaaaaaaaaaaaaabu ,
3499
3499
0xaaaaaaaaaaaaaaabu , 0xeeeeeeeeeeeeeeefu , 0x4fa4fa4fa4fa4fa5u , 0x2ff2ff2ff2ff2ff3u ,
3500
3500
0x2ff2ff2ff2ff2ff3u , 0x938cc70553e3771bu , 0xb71c27cddd93e49fu , 0xb38e3229fcdee63du ,
@@ -3514,6 +3514,25 @@ static uint64_t inverted_factorial_odd_part[] = {
3514
3514
0x547fb1b8ab9d0ba3u , 0x8f15a826498852e3u , 0x32e1a03f38880283u , 0x3de4cce63283f0c1u ,
3515
3515
};
3516
3516
3517
+ /* exponent of the largest power of 2 dividing factorial(n), for n in range(68)
3518
+
3519
+ Python code to generate the values:
3520
+
3521
+ import math
3522
+
3523
+ for n in range(68):
3524
+ fac = math.factorial(n)
3525
+ fac_trailing_zeros = (fac & -fac).bit_length() - 1
3526
+ print(fac_trailing_zeros)
3527
+ */
3528
+
3529
+ static const uint8_t factorial_trailing_zeros [] = { /* entry n = exponent of 2 in factorial(n), n in 0..67; largest entry is 64, so uint8_t suffices */
3530
+ 0 , 0 , 1 , 1 , 3 , 3 , 4 , 4 , 7 , 7 , 8 , 8 , 10 , 10 , 11 , 11 , // n = 0-15
3531
+ 15 , 15 , 16 , 16 , 18 , 18 , 19 , 19 , 22 , 22 , 23 , 23 , 25 , 25 , 26 , 26 , // n = 16-31
3532
+ 31 , 31 , 32 , 32 , 34 , 34 , 35 , 35 , 38 , 38 , 39 , 39 , 41 , 41 , 42 , 42 , // n = 32-47
3533
+ 46 , 46 , 47 , 47 , 49 , 49 , 50 , 50 , 53 , 53 , 54 , 54 , 56 , 56 , 57 , 57 , // n = 48-63
3534
+ 63 , 63 , 64 , 64 , // n = 64-67
3535
+ };
3517
3536
3518
3537
/*[clinic input]
3519
3538
math.comb
@@ -3588,15 +3607,14 @@ math_comb_impl(PyObject *module, PyObject *n, PyObject *k)
3588
3607
where 2**shift is the largest power of two dividing comb(n, k)
3589
3608
and comb_odd_part is comb(n, k) >> shift. comb_odd_part can be
3590
3609
calculated efficiently via arithmetic modulo 2**64, using three
3591
- lookups and two uint64_t multiplications, while the necessary
3592
- shift can be computed via Kummer's theorem: it's the number of
3593
- carries when adding k to n - k in binary, which in turn is the
3594
- number of set bits of n ^ k ^ (n - k).
3610
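+ lookups and two uint64_t multiplications, while the necessary shift is the exponent of 2 in n! minus the exponents of 2 in k! and (n-k)!, each read from factorial_trailing_zeros.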
3595
3611
*/
3596
3612
uint64_t comb_odd_part = reduced_factorial_odd_part [ni ]
3597
3613
* inverted_factorial_odd_part [ki ]
3598
3614
* inverted_factorial_odd_part [ni - ki ];
3599
- int shift = _Py_popcount32 ((uint32_t )(ni ^ ki ^ (ni - ki )));
3615
+ int shift = factorial_trailing_zeros [ni ]
3616
+ - factorial_trailing_zeros [ki ]
3617
+ - factorial_trailing_zeros [ni - ki ];
3600
3618
result = PyLong_FromUnsignedLongLong (comb_odd_part << shift );
3601
3619
goto done ;
3602
3620
}
0 commit comments