Skip to content

Commit c6b292c

Browse files
authored
bpo-29882: Add _Py_popcount32() function (GH-20518)
* Rename pycore_byteswap.h to pycore_bitutils.h. * Move popcount_digit() to pycore_bitutils.h as _Py_popcount32(). * _Py_popcount32() uses GCC and clang builtin function if available. * Add unit tests to _Py_popcount32().
1 parent 301f0d4 commit c6b292c

File tree

11 files changed

+108
-39
lines changed

11 files changed

+108
-39
lines changed

Include/internal/pycore_byteswap.h renamed to Include/internal/pycore_bitutils.h

Lines changed: 50 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
1-
/* Bytes swap functions, reverse order of bytes:
1+
/* Bit and bytes utilities.
2+
3+
Bytes swap functions, reverse order of bytes:
24
35
- _Py_bswap16(uint16_t)
46
- _Py_bswap32(uint32_t)
@@ -82,6 +84,53 @@ _Py_bswap64(uint64_t word)
8284
}
8385

8486

87+
// Population count: count the number of 1's in 'x'
88+
// (number of bits set to 1), also known as the hamming weight.
89+
//
90+
// Implementation note. CPUID is not used, to test if x86 POPCNT instruction
91+
// can be used, to keep the implementation simple. For example, Visual Studio
92+
// __popcnt() is not used for this reason. The clang and GCC builtin functions can
93+
// use the x86 POPCNT instruction if the target architecture has SSE4a or
94+
// newer.
95+
static inline int
96+
_Py_popcount32(uint32_t x)
97+
{
98+
#if (defined(__clang__) || defined(__GNUC__))
99+
100+
#if SIZEOF_INT >= 4
101+
Py_BUILD_ASSERT(sizeof(x) <= sizeof(unsigned int));
102+
return __builtin_popcount(x);
103+
#else
104+
// The C standard guarantees that unsigned long will always be big enough
105+
// to hold a uint32_t value without losing information.
106+
Py_BUILD_ASSERT(sizeof(x) <= sizeof(unsigned long));
107+
return __builtin_popcountl(x);
108+
#endif
109+
110+
#else
111+
// 32-bit SWAR (SIMD Within A Register) popcount
112+
113+
// Binary: 0 1 0 1 ...
114+
const uint32_t M1 = 0x55555555;
115+
// Binary: 00 11 00 11 ...
116+
const uint32_t M2 = 0x33333333;
117+
// Binary: 0000 1111 0000 1111 ...
118+
const uint32_t M4 = 0x0F0F0F0F;
119+
// 256**3 + 256**2 + 256**1 + 256**0
120+
const uint32_t SUM = 0x01010101;
121+
122+
// Put count of each 2 bits into those 2 bits
123+
x = x - ((x >> 1) & M1);
124+
// Put count of each 4 bits into those 4 bits
125+
x = (x & M2) + ((x >> 2) & M2);
126+
// Put count of each 8 bits into those 8 bits
127+
x = (x + (x >> 4)) & M4;
128+
// Sum of the 4 byte counts
129+
return (uint32_t)((uint64_t)x * (uint64_t)SUM) >> 24;
130+
#endif
131+
}
132+
133+
85134
#ifdef __cplusplus
86135
}
87136
#endif

Makefile.pre.in

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1121,7 +1121,7 @@ PYTHON_HEADERS= \
11211121
$(srcdir)/Include/internal/pycore_abstract.h \
11221122
$(srcdir)/Include/internal/pycore_accu.h \
11231123
$(srcdir)/Include/internal/pycore_atomic.h \
1124-
$(srcdir)/Include/internal/pycore_byteswap.h \
1124+
$(srcdir)/Include/internal/pycore_bitutils.h \
11251125
$(srcdir)/Include/internal/pycore_bytes_methods.h \
11261126
$(srcdir)/Include/internal/pycore_call.h \
11271127
$(srcdir)/Include/internal/pycore_ceval.h \

Modules/_ctypes/cfield.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
#include "Python.h"
2-
#include "pycore_byteswap.h" // _Py_bswap32()
2+
#include "pycore_bitutils.h" // _Py_bswap32()
33

44
#include <ffi.h>
55
#ifdef MS_WIN32

Modules/_testinternalcapi.c

Lines changed: 41 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
#define PY_SSIZE_T_CLEAN
1313

1414
#include "Python.h"
15-
#include "pycore_byteswap.h" // _Py_bswap32()
15+
#include "pycore_bitutils.h" // _Py_bswap32()
1616
#include "pycore_initconfig.h" // _Py_GetConfigsAsDict()
1717
#include "pycore_hashtable.h" // _Py_hashtable_new()
1818
#include "pycore_gc.h" // PyGC_Head
@@ -63,6 +63,45 @@ test_bswap(PyObject *self, PyObject *Py_UNUSED(args))
6363
}
6464

6565

66+
static int
67+
check_popcount(uint32_t x, int expected)
68+
{
69+
// Use volatile to prevent the compiler from optimizing out the whole test
70+
volatile uint32_t u = x;
71+
int bits = _Py_popcount32(u);
72+
if (bits != expected) {
73+
PyErr_Format(PyExc_AssertionError,
74+
"_Py_popcount32(%lu) returns %i, expected %i",
75+
(unsigned long)x, bits, expected);
76+
return -1;
77+
}
78+
return 0;
79+
}
80+
81+
82+
static PyObject*
83+
test_popcount(PyObject *self, PyObject *Py_UNUSED(args))
84+
{
85+
#define CHECK(X, RESULT) \
86+
do { \
87+
if (check_popcount(X, RESULT) < 0) { \
88+
return NULL; \
89+
} \
90+
} while (0)
91+
92+
CHECK(0, 0);
93+
CHECK(1, 1);
94+
CHECK(0x08080808, 4);
95+
CHECK(0x10101010, 4);
96+
CHECK(0x10204080, 4);
97+
CHECK(0xDEADCAFE, 22);
98+
CHECK(0xFFFFFFFF, 32);
99+
Py_RETURN_NONE;
100+
101+
#undef CHECK
102+
}
103+
104+
66105
#define TO_PTR(ch) ((void*)(uintptr_t)ch)
67106
#define FROM_PTR(ptr) ((uintptr_t)ptr)
68107
#define VALUE(key) (1 + ((int)(key) - 'a'))
@@ -157,6 +196,7 @@ static PyMethodDef TestMethods[] = {
157196
{"get_configs", get_configs, METH_NOARGS},
158197
{"get_recursion_depth", get_recursion_depth, METH_NOARGS},
159198
{"test_bswap", test_bswap, METH_NOARGS},
199+
{"test_popcount", test_popcount, METH_NOARGS},
160200
{"test_hashtable", test_hashtable, METH_NOARGS},
161201
{NULL, NULL} /* sentinel */
162202
};

Modules/sha256module.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
/* SHA objects */
1818

1919
#include "Python.h"
20-
#include "pycore_byteswap.h" // _Py_bswap32()
20+
#include "pycore_bitutils.h" // _Py_bswap32()
2121
#include "structmember.h" // PyMemberDef
2222
#include "hashlib.h"
2323
#include "pystrhex.h"

Modules/sha512module.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
/* SHA objects */
1818

1919
#include "Python.h"
20-
#include "pycore_byteswap.h" // _Py_bswap32()
20+
#include "pycore_bitutils.h" // _Py_bswap32()
2121
#include "structmember.h" // PyMemberDef
2222
#include "hashlib.h"
2323
#include "pystrhex.h"

Objects/longobject.c

Lines changed: 7 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,9 @@
33
/* XXX The functional organization of this file is terrible */
44

55
#include "Python.h"
6-
#include "pycore_interp.h" // _PY_NSMALLPOSINTS
7-
#include "pycore_pystate.h" // _Py_IsMainInterpreter()
6+
#include "pycore_bitutils.h" // _Py_popcount32()
7+
#include "pycore_interp.h" // _PY_NSMALLPOSINTS
8+
#include "pycore_pystate.h" // _Py_IsMainInterpreter()
89
#include "longintrepr.h"
910

1011
#include <float.h>
@@ -5307,12 +5308,10 @@ int_bit_length_impl(PyObject *self)
53075308
static int
53085309
popcount_digit(digit d)
53095310
{
5310-
/* 32bit SWAR popcount. */
5311-
uint32_t u = d;
5312-
u -= (u >> 1) & 0x55555555U;
5313-
u = (u & 0x33333333U) + ((u >> 2) & 0x33333333U);
5314-
u = (u + (u >> 4)) & 0x0f0f0f0fU;
5315-
return (uint32_t)(u * 0x01010101U) >> 24;
5311+
// digit can be larger than uint32_t, but only PyLong_SHIFT bits
5312+
// of it will ever be used.
5313+
Py_BUILD_ASSERT(PyLong_SHIFT <= 32);
5314+
return _Py_popcount32((uint32_t)d);
53165315
}
53175316

53185317
/*[clinic input]

Objects/stringlib/codecs.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
# error "codecs.h is specific to Unicode"
55
#endif
66

7-
#include "pycore_byteswap.h" // _Py_bswap32()
7+
#include "pycore_bitutils.h" // _Py_bswap32()
88

99
/* Mask to quickly check whether a C 'long' contains a
1010
non-ASCII, UTF8-encoded char. */

PCbuild/pythoncore.vcxproj

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -170,7 +170,7 @@
170170
<ClInclude Include="..\Include\internal\pycore_accu.h" />
171171
<ClInclude Include="..\Include\internal\pycore_atomic.h" />
172172
<ClInclude Include="..\Include\internal\pycore_bytes_methods.h" />
173-
<ClInclude Include="..\Include\internal\pycore_byteswap.h" />
173+
<ClInclude Include="..\Include\internal\pycore_bitutils.h" />
174174
<ClInclude Include="..\Include\internal\pycore_call.h" />
175175
<ClInclude Include="..\Include\internal\pycore_ceval.h" />
176176
<ClInclude Include="..\Include\internal\pycore_code.h" />

PCbuild/pythoncore.vcxproj.filters

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -201,7 +201,7 @@
201201
<ClInclude Include="..\Include\internal\pycore_atomic.h">
202202
<Filter>Include</Filter>
203203
</ClInclude>
204-
<ClInclude Include="..\Include\internal\pycore_byteswap.h">
204+
<ClInclude Include="..\Include\internal\pycore_bitutils.h">
205205
<Filter>Include</Filter>
206206
</ClInclude>
207207
<ClInclude Include="..\Include\internal\pycore_bytes_methods.h">

Python/hamt.c

Lines changed: 3 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
#include "Python.h"
22

3+
#include "pycore_bitutils.h" // _Py_popcount32
34
#include "pycore_hamt.h"
45
#include "pycore_object.h" // _PyObject_GC_TRACK()
56
#include <stddef.h> // offsetof()
@@ -433,30 +434,10 @@ hamt_bitpos(int32_t hash, uint32_t shift)
433434
return (uint32_t)1 << hamt_mask(hash, shift);
434435
}
435436

436-
static inline uint32_t
437-
hamt_bitcount(uint32_t i)
438-
{
439-
/* We could use native popcount instruction but that would
440-
require to either add configure flags to enable SSE4.2
441-
support or to detect it dynamically. Otherwise, we have
442-
a risk of CPython not working properly on older hardware.
443-
444-
In practice, there's no observable difference in
445-
performance between using a popcount instruction or the
446-
following fallback code.
447-
448-
The algorithm is copied from:
449-
https://graphics.stanford.edu/~seander/bithacks.html
450-
*/
451-
i = i - ((i >> 1) & 0x55555555);
452-
i = (i & 0x33333333) + ((i >> 2) & 0x33333333);
453-
return (((i + (i >> 4)) & 0xF0F0F0F) * 0x1010101) >> 24;
454-
}
455-
456437
static inline uint32_t
457438
hamt_bitindex(uint32_t bitmap, uint32_t bit)
458439
{
459-
return hamt_bitcount(bitmap & (bit - 1));
440+
return (uint32_t)_Py_popcount32(bitmap & (bit - 1));
460441
}
461442

462443

@@ -820,7 +801,7 @@ hamt_node_bitmap_assoc(PyHamtNode_Bitmap *self,
820801
else {
821802
/* There was no key before with the same (shift,hash). */
822803

823-
uint32_t n = hamt_bitcount(self->b_bitmap);
804+
uint32_t n = (uint32_t)_Py_popcount32(self->b_bitmap);
824805

825806
if (n >= 16) {
826807
/* When we have a situation where we want to store more

0 commit comments

Comments
 (0)