Skip to content

Commit 08ce01c

Browse files
committed
optimize find_first_nonascii
1 parent c47d574 commit 08ce01c

File tree

1 file changed

+84
-38
lines changed

1 file changed

+84
-38
lines changed

Objects/unicodeobject.c

Lines changed: 84 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -5090,52 +5090,98 @@ ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
50905090
return p - start;
50915091
}
50925092

5093-
static Py_ssize_t
5094-
find_first_nonascii(const char *start, const char *end)
5095-
{
5096-
const char *p = start;
5097-
5098-
while (p < end) {
5099-
/* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
5100-
for an explanation. */
5101-
if (_Py_IS_ALIGNED(p, ALIGNOF_SIZE_T)) {
5102-
const char *e = end - SIZEOF_SIZE_T;
5103-
while (p <= e) {
5104-
size_t value = (*(const size_t *)p) & ASCII_CHAR_MASK;
5105-
if (value) {
5106-
#if PY_LITTLE_ENDIAN && (defined(__clang__) || defined(__GNUC__))
5107-
#if SIZEOF_SIZE_T == 4
5108-
// __builtin_ctz(0x8000) == 15.
5109-
// (15-7) / 8 == 1.
5110-
// p+1 is first non-ASCII char.
5111-
return p - start + (__builtin_ctz(value) - 7) / 8;
5112-
#else
5113-
return p - start + (__builtin_ctzll(value) - 7) / 8;
5114-
#endif
5115-
#elif PY_LITTLE_ENDIAN && defined(_MSC_VER)
5116-
unsigned long bitpos;
5093+
#if (defined(__clang__) || defined(__GNUC__))
5094+
#define HAS_CTZ 1
5095+
static inline unsigned int ctz(size_t v) {
5096+
return __builtin_ctzll((unsigned long long)v);
5097+
}
5098+
#elif defined(_MSC_VER)
5099+
#define HAS_CTZ 1
5100+
static inline unsigned int ctz(size_t v) {
5101+
unsigned long pos;
51175102
#if SIZEOF_SIZE_T == 4
5118-
_BitScanForward(&bitpos, value);
5103+
_BitScanForward(&pos, v);
51195104
#else
5120-
_BitScanForward64(&bitpos, value);
5105+
_BitScanForward64(&pos, v);
5106+
#endif /* SIZEOF_SIZE_T */
5107+
return pos;
5108+
}
51215109
#endif
5122-
return p - start + (bitpos - 7) / 8;
5110+
5111+
static Py_ssize_t
5112+
find_first_nonascii(const unsigned char *start, const unsigned char *end)
5113+
{
5114+
const unsigned char *p = start;
5115+
5116+
if (end - start > SIZEOF_SIZE_T + ALIGNOF_SIZE_T) {
5117+
while (!_Py_IS_ALIGNED(p, ALIGNOF_SIZE_T)) {
5118+
if ((unsigned char)*p & 0x80) {
5119+
return p - start;
5120+
}
5121+
p++;
5122+
}
5123+
const unsigned char *e = end - SIZEOF_SIZE_T;
5124+
while (p <= e) {
5125+
size_t value = (*(const size_t *)p) & ASCII_CHAR_MASK;
5126+
if (value) {
5127+
#if PY_LITTLE_ENDIAN && HAS_CTZ
5128+
return p - start + (ctz(value) - 7) / 8;
51235129
#else
5124-
// big endian and minor compilers are difficult to test.
5125-
// fallback to per byte check.
5126-
break;
5130+
// big endian and minor compilers are difficult to test.
5131+
// fallback to per byte check.
5132+
break;
51275133
#endif
5128-
}
5129-
p += SIZEOF_SIZE_T;
51305134
}
5131-
if (p == end)
5135+
p += SIZEOF_SIZE_T;
5136+
}
5137+
}
5138+
#if HAS_CTZ
5139+
// This part looks a bit tricky, but decoding short ASCII is super important.
5140+
// Since we assemble the size_t from p byte by byte with explicit shifts, this part works fine on big endian too.
5141+
while (p < end) {
5142+
size_t u = (size_t)(p[0]);
5143+
switch (end - p) {
5144+
default:
5145+
#if SIZEOF_SIZE_T == 8
5146+
u |= (size_t)(p[7]) << 56ull;
5147+
// fall through
5148+
case 7:
5149+
u |= (size_t)(p[6]) << 48ull;
5150+
// fall through
5151+
case 6:
5152+
u |= (size_t)(p[5]) << 40ull;
5153+
// fall through
5154+
case 5:
5155+
u |= (size_t)(p[4]) << 32ull;
5156+
// fall through
5157+
case 4:
5158+
#endif
5159+
u |= (size_t)(p[3]) << 24;
5160+
// fall through
5161+
case 3:
5162+
u |= (size_t)(p[2]) << 16;
5163+
// fall through
5164+
case 2:
5165+
u |= (size_t)(p[1]) << 8;
5166+
break;
5167+
case 1:
51325168
break;
51335169
}
5134-
if ((unsigned char)*p & 0x80)
5170+
if (u & ASCII_CHAR_MASK) {
5171+
return p - start + (ctz(u & ASCII_CHAR_MASK) - 7) / 8;
5172+
}
5173+
p += SIZEOF_SIZE_T;
5174+
}
5175+
return end - start;
5176+
#else
5177+
while (p < end) {
5178+
if ((unsigned char)*p & 0x80) {
51355179
break;
5136-
++p;
5180+
}
5181+
p++;
51375182
}
51385183
return p - start;
5184+
#endif
51395185
}
51405186

51415187
static inline int scalar_utf8_start_char(unsigned int ch)
@@ -5153,7 +5199,7 @@ static Py_ssize_t utf8_count_codepoints(const unsigned char *s, const unsigned c
51535199
{
51545200
Py_ssize_t len = 0;
51555201

5156-
if (end - s > SIZEOF_SIZE_T * 2) {
5202+
if (end - s > SIZEOF_SIZE_T + ALIGNOF_SIZE_T) {
51575203
while (!_Py_IS_ALIGNED(s, ALIGNOF_SIZE_T)) {
51585204
len += scalar_utf8_start_char(*s++);
51595205
}
@@ -5337,7 +5383,7 @@ unicode_decode_utf8(const char *s, Py_ssize_t size,
53375383
const char *starts = s;
53385384
const char *end = s + size;
53395385

5340-
Py_ssize_t pos = find_first_nonascii(starts, end);
5386+
Py_ssize_t pos = find_first_nonascii((const unsigned char*)starts, (const unsigned char*)end);
53415387
if (pos == size) { // fast path: ASCII string.
53425388
PyObject *u = ascii_new(size);
53435389
if (u == NULL) {
@@ -5355,7 +5401,7 @@ unicode_decode_utf8(const char *s, Py_ssize_t size,
53555401
int maxchr = 127;
53565402
Py_ssize_t maxsize = size;
53575403

5358-
unsigned char ch = (unsigned char)s[pos];
5404+
unsigned char ch = (unsigned char)(s[pos]);
53595405
// error handler other than strict may remove/replace the invalid byte.
53605406
// consumed != NULL allows 1~3 remaining bytes.
53615407
// 0x80 <= ch < 0xc2 is an invalid start byte that causes UnicodeDecodeError.

0 commit comments

Comments
 (0)