Skip to content

Commit 9b47c2b

Browse files
committed
utf8_count_codepoints
1 parent 5a71387 commit 9b47c2b

File tree

1 file changed

+66
-8
lines changed

1 file changed

+66
-8
lines changed

Objects/unicodeobject.c

Lines changed: 66 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -4978,12 +4978,17 @@ PyUnicode_DecodeUTF8(const char *s,
49784978
#include "stringlib/codecs.h"
49794979
#include "stringlib/undef.h"
49804980

4981+
#if (SIZEOF_SIZE_T == 8)
49814982
/* Mask to quickly check whether a C 'size_t' contains a
49824983
non-ASCII, UTF8-encoded char. */
4983-
#if (SIZEOF_SIZE_T == 8)
49844984
# define ASCII_CHAR_MASK 0x8080808080808080ULL
4985+
// Used to count code points in a UTF-8 string.
4986+
# define VECTOR_0101 0x0101010101010101ULL
4987+
# define VECTOR_00FF 0x00ff00ff00ff00ffULL
49854988
#elif (SIZEOF_SIZE_T == 4)
49864989
# define ASCII_CHAR_MASK 0x80808080U
4990+
# define VECTOR_0101 0x01010101U
4991+
# define VECTOR_00FF 0x00ff00ffU
49874992
#else
49884993
# error C 'size_t' size should be either 4 or 8!
49894994
#endif
@@ -5056,11 +5061,13 @@ find_first_nonascii(const char *start, const char *end)
50565061
while (p <= e) {
50575062
size_t value = (*(const size_t *)p) & ASCII_CHAR_MASK;
50585063
if (value) {
5059-
// Optimization only for major platforms we have CI.
50605064
#if PY_LITTLE_ENDIAN && (defined(__clang__) || defined(__GNUC__))
5061-
#if SIZEOF_SIZE_T == SIZEOF_LONG
5065+
#if SIZEOF_SIZE_T == 4
5066+
// __builtin_ctzl(0x8000) == 15.
5067+
// (15-7) / 8 == 1.
5068+
// p+1 is first non-ASCII char.
50625069
return p - start + (__builtin_ctzl(value)-7) / 8;
5063-
#elif SIZEOF_SIZE_T == SIZEOF_LONG_LONG
5070+
#else
50645071
return p - start + (__builtin_ctzll(value)-7) / 8;
50655072
#endif
50665073
#elif PY_LITTLE_ENDIAN && defined(_MSC_VER)
@@ -5071,8 +5078,11 @@ find_first_nonascii(const char *start, const char *end)
50715078
_BitScanForward64(&bitpos, value);
50725079
#endif
50735080
return p - start + (bitpos-7) / 8;
5074-
#endif
5081+
#else
5082+
// big endian and minor compilers are difficult to test.
5083+
// fallback to per byte check.
50755084
break;
5085+
#endif
50765086
}
50775087
p += SIZEOF_SIZE_T;
50785088
}
@@ -5086,6 +5096,52 @@ find_first_nonascii(const char *start, const char *end)
50865096
return p - start;
50875097
}
50885098

5099+
/* Return 1 if `ch` is the first byte of a UTF-8 sequence, 0 otherwise.
 *
 * Only bits 7 and 6 of `ch` are examined: 0xxxxxxx (ASCII) and 11xxxxxx
 * (multi-byte lead) count as start bytes, while 10xxxxxx (continuation)
 * does not.
 */
static inline int scalar_utf8_start_char(unsigned int ch)
{
    // A byte is a continuation byte exactly when its top two bits are "10".
    return (ch & 0xC0u) != 0x80u;
}
5104+
5105+
/* Word-at-a-time variant of the UTF-8 start-byte test.
 *
 * For every byte lane of `v`, the lowest bit of the corresponding lane in
 * the result is 1 when that byte has the form 0xxxxxxx or 11xxxxxx, and 0
 * when it has the form 10xxxxxx.  All other result bits are cleared by
 * VECTOR_0101.
 */
static inline size_t vector_utf8_start_chars(size_t v)
{
    // Bit 0 of each lane <- inverted bit 7 of the same lane.
    const size_t top_bit_clear = ~v >> 7;
    // Bit 0 of each lane <- bit 6 of the same lane.
    const size_t second_bit_set = v >> 6;

    // Bits that leaked in from neighbouring lanes are masked away here.
    return VECTOR_0101 & (second_bit_set | top_bit_clear);
}
5109+
5110+
/* Count the UTF-8 start bytes in the buffer s[0..size).
 *
 * A start byte is any byte of the form 0xxxxxxx or 11xxxxxx; continuation
 * bytes (10xxxxxx) are not counted.  The input is not validated, so the
 * result equals the number of code points only when the buffer holds
 * well-formed UTF-8.
 *
 * Buffers longer than two machine words are processed one size_t at a time
 * after an alignment prologue; the remaining tail is handled byte by byte.
 */
static Py_ssize_t utf8_count_codepoints(const unsigned char *s, Py_ssize_t size)
{
    Py_ssize_t len = 0;
    const unsigned char *end = s + size;

    if (end - s > SIZEOF_SIZE_T*2) {
        // Consume bytes one at a time until `s` is aligned for size_t loads.
        while (!_Py_IS_ALIGNED(s, ALIGNOF_SIZE_T)) {
            len += scalar_utf8_start_char(*s++);
        }

        while (s + SIZEOF_SIZE_T <= end) {
            // Each byte lane of the accumulator grows by at most 1 per word,
            // so a batch is limited to 255 words to keep every lane from
            // overflowing into its neighbour.
            const unsigned char *e = end;
            if (e - s > SIZEOF_SIZE_T * 255) {
                e = s + SIZEOF_SIZE_T * 255;
            }

            // Unsigned on purpose: the top lane may reach 255, which would
            // set the sign bit of a signed accumulator and make the shifts
            // below implementation-defined.
            size_t vstart = 0;
            while (s + SIZEOF_SIZE_T <= e) {
                size_t v = *(const size_t *)s;
                vstart += vector_utf8_start_chars(v);
                s += SIZEOF_SIZE_T;
            }

            // Horizontal sum: fold 8-bit lanes into 16-bit lanes, then fold
            // the 16-bit lanes down into the lowest one.
            vstart = (vstart & VECTOR_00FF) + ((vstart >> 8) & VECTOR_00FF);
            vstart += vstart >> 16;
#if SIZEOF_SIZE_T == 8
            vstart += vstart >> 32;
#endif
            // The batch total is at most 255 * SIZEOF_SIZE_T (<= 2040), which
            // fits in 11 bits; higher bits hold partial sums and are dropped.
            len += (Py_ssize_t)(vstart & 0x7ff);
        }
    }

    // Tail bytes (or the whole buffer when it was too short for the fast path).
    while (s < end) {
        len += scalar_utf8_start_char(*s++);
    }
    return len;
}
50895145

50905146
static int
50915147
unicode_decode_utf8_impl(_PyUnicodeWriter *writer,
@@ -5234,8 +5290,7 @@ unicode_decode_utf8(const char *s, Py_ssize_t size,
52345290
const char *end = s + size;
52355291

52365292
Py_ssize_t pos = find_first_nonascii(starts, end);
5237-
if (pos == size) {
5238-
// fast path: ASCII
5293+
if (pos == size) { // fast path: ASCII string.
52395294
PyObject *u = PyUnicode_New(size, 127);
52405295
if (u == NULL) {
52415296
return NULL;
@@ -5248,8 +5303,11 @@ unicode_decode_utf8(const char *s, Py_ssize_t size,
52485303
}
52495304

52505305
int maxchr = 127;
5306+
Py_ssize_t maxsize = size;
5307+
52515308
unsigned char ch = (unsigned char)s[pos];
52525309
if (error_handler == _Py_ERROR_STRICT && ch >= 0xc2) {
5310+
maxsize = utf8_count_codepoints((const unsigned char *)s, size);
52535311
if (ch < 0xc4) { // latin1
52545312
maxchr = 255;
52555313
}
@@ -5260,7 +5318,7 @@ unicode_decode_utf8(const char *s, Py_ssize_t size,
52605318
maxchr = 0x10ffff;
52615319
}
52625320
}
5263-
PyObject *u = PyUnicode_New(size, maxchr);
5321+
PyObject *u = PyUnicode_New(maxsize, maxchr);
52645322
if (!u) {
52655323
return NULL;
52665324
}

0 commit comments

Comments
 (0)