@@ -1718,20 +1718,49 @@ completes the proof sketch.
1718
1718
1719
1719
*/
1720
1720
/*
    The _approximate_isqrt_tab table provides approximate square roots for
    16-bit integers. For any n in the range 2**14 <= n < 2**16, the value

        a = _approximate_isqrt_tab[(n >> 8) - 64]

    is an approximate square root of n, satisfying (a - 1)**2 < n < (a + 1)**2.

    The table was computed in Python using the expression:

        [min(round(sqrt(256*n + 128)), 255) for n in range(64, 256)]

    The min(..., 255) clamp keeps every entry representable as a uint8_t.
*/

static const uint8_t _approximate_isqrt_tab[192] = {
    128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139,
    140, 141, 142, 143, 144, 144, 145, 146, 147, 148, 149, 150,
    151, 151, 152, 153, 154, 155, 156, 156, 157, 158, 159, 160,
    160, 161, 162, 163, 164, 164, 165, 166, 167, 167, 168, 169,
    170, 170, 171, 172, 173, 173, 174, 175, 176, 176, 177, 178,
    179, 179, 180, 181, 181, 182, 183, 183, 184, 185, 186, 186,
    187, 188, 188, 189, 190, 190, 191, 192, 192, 193, 194, 194,
    195, 196, 196, 197, 198, 198, 199, 200, 200, 201, 201, 202,
    203, 203, 204, 205, 205, 206, 206, 207, 208, 208, 209, 210,
    210, 211, 211, 212, 213, 213, 214, 214, 215, 216, 216, 217,
    217, 218, 219, 219, 220, 220, 221, 221, 222, 223, 223, 224,
    224, 225, 225, 226, 227, 227, 228, 228, 229, 229, 230, 230,
    231, 232, 232, 233, 233, 234, 234, 235, 235, 236, 237, 237,
    238, 238, 239, 239, 240, 240, 241, 241, 242, 242, 243, 243,
    244, 244, 245, 246, 246, 247, 247, 248, 248, 249, 249, 250,
    250, 251, 251, 252, 252, 253, 253, 254, 254, 255, 255, 255,
};

/* Approximate square root of a large 64-bit integer.

   Given `n` satisfying `2**62 <= n < 2**64`, return `a`
   satisfying `(a - 1)**2 < n < (a + 1)**2`.

   The precondition is not checked. For `n < 2**62` the table index
   `(n >> 56) - 64` wraps around (it is computed in unsigned 64-bit
   arithmetic) and the lookup reads far outside the table, so callers
   must normalize `n` first. */

static inline uint32_t
_approximate_isqrt(uint64_t n)
{
    /* Starting estimate from the top byte of n; with the precondition
       above, n >> 56 lies in [64, 256), i.e. table index [0, 192). */
    uint32_t u = _approximate_isqrt_tab[(n >> 56) - 64];

    /* First refinement step. n >> 41 is below 2**23, so it is narrowed to
       32 bits *before* dividing and the division is a 32-bit one. */
    u = (u << 7) + (uint32_t)(n >> 41) / u;

    /* Second refinement step. n >> 17 can need up to 47 bits, so this
       division is carried out in 64 bits and only the quotient is
       narrowed. */
    return (u << 15) + (uint32_t)((n >> 17) / u);
}
1736
1765
1737
1766
/*[clinic input]
@@ -1749,7 +1778,8 @@ math_isqrt(PyObject *module, PyObject *n)
1749
1778
{
1750
1779
int a_too_large , c_bit_length ;
1751
1780
size_t c , d ;
1752
- uint64_t m , u ;
1781
+ uint64_t m ;
1782
+ uint32_t u ;
1753
1783
PyObject * a = NULL , * b ;
1754
1784
1755
1785
n = _PyNumber_Index (n );
@@ -1776,18 +1806,17 @@ math_isqrt(PyObject *module, PyObject *n)
1776
1806
c = (c - 1U ) / 2U ;
1777
1807
1778
1808
/* Fast path: if c <= 31 then n < 2**64 and we can compute directly with a
1779
- fast, almost branch-free algorithm. In the final correction, we use `u*u
1780
- - 1 >= m` instead of the simpler `u*u > m` in order to get the correct
1781
- result in the corner case where `u=2**32`. */
1809
+ fast, almost branch-free algorithm. */
1782
1810
if (c <= 31U ) {
1811
+ int shift = 31 - (int )c ;
1783
1812
m = (uint64_t )PyLong_AsUnsignedLongLong (n );
1784
1813
Py_DECREF (n );
1785
1814
if (m == (uint64_t )(-1 ) && PyErr_Occurred ()) {
1786
1815
return NULL ;
1787
1816
}
1788
- u = _approximate_isqrt (m << ( 62U - 2U * c )) >> ( 31U - c ) ;
1789
- u -= u * u - 1U >= m ;
1790
- return PyLong_FromUnsignedLongLong (( unsigned long long ) u );
1817
+ u = _approximate_isqrt (m << 2 * shift ) >> shift ;
1818
+ u -= ( uint64_t ) u * u > m ;
1819
+ return PyLong_FromUnsignedLong ( u );
1791
1820
}
1792
1821
1793
1822
/* Slow path: n >= 2**64. We perform the first five iterations in C integer
@@ -1811,7 +1840,7 @@ math_isqrt(PyObject *module, PyObject *n)
1811
1840
goto error ;
1812
1841
}
1813
1842
u = _approximate_isqrt (m ) >> (31U - d );
1814
- a = PyLong_FromUnsignedLongLong (( unsigned long long ) u );
1843
+ a = PyLong_FromUnsignedLong ( u );
1815
1844
if (a == NULL ) {
1816
1845
goto error ;
1817
1846
}
0 commit comments