@@ -1617,25 +1617,41 @@ v_rshift(digit *z, digit *a, Py_ssize_t m, int d)
1617
1617
in pout, and returning the remainder. pin and pout point at the LSD.
1618
1618
It's OK for pin == pout on entry, which saves oodles of mallocs/frees in
1619
1619
_PyLong_Format, but that should be done with great care since ints are
1620
- immutable. */
1620
+ immutable.
1621
1621
1622
+ This version of the code can be 20% faster than the pre-2022 version
1623
+ on today's compilers on architectures like amd64. It evolved from Mark
1624
+ Dickinson observing that a 128:64 divide instruction was always being
1625
+ generated by the compiler despite us working with 30-bit digit values.
1626
+ See the thread for full context:
1627
+
1628
+ https://mail.python.org/archives/list/python-dev@python.org/thread/ZICIMX5VFCX4IOFH5NUPVHCUJCQ4Q7QM/#NEUNFZU3TQU4CPTYZNF3WCN7DOJBBTK5
1629
+
1630
+ If you ever want to change this code, pay attention to performance using
1631
+ different compilers, optimization levels, and cpu architectures. Beware of
1632
+ PGO/FDO builds doing value specialization such as a fast path for //10. :)
1633
+
1634
+ Verify that 17 isn't specialized and this works as a quick test:
1635
+ python -m timeit -s 'x = 10**1000; r=x//10; assert r == 10**999, r' 'x//17'
1636
+ */
1622
1637
static digit
1623
1638
inplace_divrem1 (digit * pout , digit * pin , Py_ssize_t size , digit n )
1624
1639
{
1625
- twodigits rem = 0 ;
1640
+ digit remainder = 0 ;
1626
1641
1627
1642
assert (n > 0 && n <= PyLong_MASK );
1628
- pin += size ;
1629
- pout += size ;
1630
1643
while (-- size >= 0 ) {
1631
- digit hi ;
1632
- rem = (rem << PyLong_SHIFT ) | * -- pin ;
1633
- * -- pout = hi = (digit )(rem / n );
1634
- rem -= (twodigits )hi * n ;
1635
- }
1636
- return (digit )rem ;
1644
+ twodigits dividend ;
1645
+ dividend = ((twodigits )remainder << PyLong_SHIFT ) | pin [size ];
1646
+ digit quotient ;
1647
+ quotient = (digit )(dividend / n );
1648
+ remainder = dividend % n ;
1649
+ pout [size ] = quotient ;
1650
+ }
1651
+ return remainder ;
1637
1652
}
1638
1653
1654
+
1639
1655
/* Divide an integer by a digit, returning both the quotient
1640
1656
(as function result) and the remainder (through *prem).
1641
1657
The sign of a is ignored; n should not be zero. */
0 commit comments