Skip to content

Commit 5344340

Browse files
committed
utf8_count
1 parent 5a71387 commit 5344340

File tree

1 file changed

+66
-4
lines changed

1 file changed

+66
-4
lines changed

Objects/unicodeobject.c

Lines changed: 66 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4978,12 +4978,17 @@ PyUnicode_DecodeUTF8(const char *s,
49784978
#include "stringlib/codecs.h"
49794979
#include "stringlib/undef.h"
49804980

4981+
#if (SIZEOF_SIZE_T == 8)
49814982
/* Mask to quickly check whether a C 'size_t' contains a
49824983
non-ASCII, UTF8-encoded char. */
4983-
#if (SIZEOF_SIZE_T == 8)
49844984
# define ASCII_CHAR_MASK 0x8080808080808080ULL
4985+
// Used to count code points in a UTF-8 string.
4986+
# define VECTOR_0101 0x0101010101010101ULL
4987+
# define VECTOR_00FF 0x00ff00ff00ff00ffULL
49854988
#elif (SIZEOF_SIZE_T == 4)
49864989
# define ASCII_CHAR_MASK 0x80808080U
4990+
# define VECTOR_0101 0x01010101U
4991+
# define VECTOR_00FF 0x00ff00ffU
49874992
#else
49884993
# error C 'size_t' size should be either 4 or 8!
49894994
#endif
@@ -5087,6 +5092,61 @@ find_first_nonascii(const char *start, const char *end)
50875092
}
50885093

50895094

5095+
/* Byte-lane helper constants, derived from the width of size_t so that no
   per-platform literal (and no #if ladder) is needed:

     vector_01   -- 0x01 in every byte                     (0x0101...01)
     vector_00ff -- 0xff in the low byte of each 16-bit lane (0x00ff...00ff)

   Both have internal linkage: un-prefixed names like these must not be
   exported from this translation unit.

   NOTE(review): the helpers in this file use the VECTOR_0101 / VECTOR_00FF
   macros rather than these objects -- confirm whether they are still needed. */
static const size_t vector_01 = (size_t)-1 / 0xff;
static const size_t vector_00ff = (size_t)-1 / 0x0101;
5102+
5103+
/* Return 1 if the byte `ch` starts a UTF-8 sequence and 0 if it is a
   continuation byte.  Only bits 6 and 7 of `ch` are inspected. */
static inline int
scalar_utf8_start_char(unsigned int ch)
{
    // Start bytes look like 0xxxxxxx or 11xxxxxx; 10xxxxxx is a continuation.
    unsigned int bit7_clear = ~ch >> 7;   // bit 0 is set when bit 7 of ch is 0
    unsigned int bit6_set = ch >> 6;      // bit 0 is set when bit 6 of ch is 1
    return (int)((bit7_clear | bit6_set) & 1u);
}
5108+
5109+
static inline size_t vector_utf8_start_chars(size_t v)
5110+
{
5111+
return ((~v>>7) | (v>>6)) & VECTOR_0101;
5112+
}
5113+
5114+
/* Count the bytes in s[0..size) that are not UTF-8 continuation bytes
   (10xxxxxx), i.e. the number of bytes that start a sequence.

   The data is not validated here; the result is simply the number of
   0xxxxxxx / 11xxxxxx bytes in the buffer.

   Buffers longer than 2 * SIZEOF_SIZE_T bytes are processed one size_t
   word at a time once `s` is aligned; the remainder is handled bytewise. */
static Py_ssize_t
utf8_count(const unsigned char *s, Py_ssize_t size)
{
    Py_ssize_t len = 0;
    const unsigned char *end = s + size;

    if (end - s > SIZEOF_SIZE_T*2) {
        // Bytewise prologue until `s` is aligned for size_t loads.
        while (!_Py_IS_ALIGNED(s, ALIGNOF_SIZE_T)) {
            len += scalar_utf8_start_char(*s++);
        }

        while (s + SIZEOF_SIZE_T <= end) {
            // Each byte lane of `vstart` grows by at most 1 per word, so a
            // batch is capped at 255 words to keep the 8-bit lanes from
            // overflowing into their neighbours.
            const unsigned char *e = end;
            if (e - s > SIZEOF_SIZE_T * 255) {
                e = s + SIZEOF_SIZE_T * 255;
            }

            // Unsigned on purpose: the top lane may reach 0xff, and a signed
            // accumulator would make the conversion and the right shifts
            // below implementation-defined.
            size_t vstart = 0;
            while (s + SIZEOF_SIZE_T <= e) {
                size_t v = *(const size_t *)s;
                vstart += vector_utf8_start_chars(v);
                s += SIZEOF_SIZE_T;
            }

            // Horizontal sum: fold 8-bit lanes into 16-bit lanes, then fold
            // those into the lowest lane.
            vstart = (vstart & VECTOR_00FF) + ((vstart >> 8) & VECTOR_00FF);
            vstart += vstart >> 16;
#if SIZEOF_SIZE_T == 8
            vstart += vstart >> 32;
#endif
            // The batch total is at most 8 * 255 = 2040, which fits in 11 bits.
            len += (Py_ssize_t)(vstart & 0x7ff);
        }
    }

    // Bytewise tail (or the whole buffer when it is short).
    while (s < end) {
        len += scalar_utf8_start_char(*s++);
    }
    return len;
}
5149+
50905150
static int
50915151
unicode_decode_utf8_impl(_PyUnicodeWriter *writer,
50925152
const char *starts, const char *s, const char *end,
@@ -5234,8 +5294,7 @@ unicode_decode_utf8(const char *s, Py_ssize_t size,
52345294
const char *end = s + size;
52355295

52365296
Py_ssize_t pos = find_first_nonascii(starts, end);
5237-
if (pos == size) {
5238-
// fast path: ASCII
5297+
if (pos == size) { // fast path: ASCII string.
52395298
PyObject *u = PyUnicode_New(size, 127);
52405299
if (u == NULL) {
52415300
return NULL;
@@ -5248,8 +5307,11 @@ unicode_decode_utf8(const char *s, Py_ssize_t size,
52485307
}
52495308

52505309
int maxchr = 127;
5310+
Py_ssize_t maxsize = size;
5311+
52515312
unsigned char ch = (unsigned char)s[pos];
52525313
if (error_handler == _Py_ERROR_STRICT && ch >= 0xc2) {
5314+
maxsize = utf8_count((const unsigned char *)s, size);
52535315
if (ch < 0xc4) { // latin1
52545316
maxchr = 255;
52555317
}
@@ -5260,7 +5322,7 @@ unicode_decode_utf8(const char *s, Py_ssize_t size,
52605322
maxchr = 0x10ffff;
52615323
}
52625324
}
5263-
PyObject *u = PyUnicode_New(size, maxchr);
5325+
PyObject *u = PyUnicode_New(maxsize, maxchr);
52645326
if (!u) {
52655327
return NULL;
52665328
}

0 commit comments

Comments
 (0)