@@ -4978,12 +4978,17 @@ PyUnicode_DecodeUTF8(const char *s,
4978
4978
#include "stringlib/codecs.h"
4979
4979
#include "stringlib/undef.h"
4980
4980
4981
+ #if (SIZEOF_SIZE_T == 8 )
4981
4982
/* Mask to quickly check whether a C 'size_t' contains a
4982
4983
non-ASCII, UTF8-encoded char. */
4983
- #if (SIZEOF_SIZE_T == 8 )
4984
4984
# define ASCII_CHAR_MASK 0x8080808080808080ULL
4985
+ // Masks used to count code points in a UTF-8 string one size_t word at a time.
4986
+ # define VECTOR_0101 0x0101010101010101ULL
4987
+ # define VECTOR_00FF 0x00ff00ff00ff00ffULL
4985
4988
#elif (SIZEOF_SIZE_T == 4 )
4986
4989
# define ASCII_CHAR_MASK 0x80808080U
4990
+ # define VECTOR_0101 0x01010101U
4991
+ # define VECTOR_00FF 0x00ff00ffU
4987
4992
#else
4988
4993
# error C 'size_t' size should be either 4 or 8!
4989
4994
#endif
@@ -5056,11 +5061,13 @@ find_first_nonascii(const char *start, const char *end)
5056
5061
while (p <= e ) {
5057
5062
size_t value = (* (const size_t * )p ) & ASCII_CHAR_MASK ;
5058
5063
if (value ) {
5059
- // Optimization only for major platforms we have CI.
5060
5064
#if PY_LITTLE_ENDIAN && (defined(__clang__ ) || defined(__GNUC__ ))
5061
- #if SIZEOF_SIZE_T == SIZEOF_LONG
5065
+ #if SIZEOF_SIZE_T == 4
5066
+ // __builtin_ctzl(0x8000) == 15.
5067
+ // (15-7) / 8 == 1.
5068
+ // p+1 is first non-ASCII char.
5062
5069
return p - start + (__builtin_ctzl (value )- 7 ) / 8 ;
5063
- #elif SIZEOF_SIZE_T == SIZEOF_LONG_LONG
5070
+ #else
5064
5071
return p - start + (__builtin_ctzll (value )- 7 ) / 8 ;
5065
5072
#endif
5066
5073
#elif PY_LITTLE_ENDIAN && defined(_MSC_VER )
@@ -5071,8 +5078,11 @@ find_first_nonascii(const char *start, const char *end)
5071
5078
_BitScanForward64 (& bitpos , value );
5072
5079
#endif
5073
5080
return p - start + (bitpos - 7 ) / 8 ;
5074
- #endif
5081
+ #else
5082
+ // big endian and minor compilers are difficult to test.
5083
+ // fallback to per byte check.
5075
5084
break ;
5085
+ #endif
5076
5086
}
5077
5087
p += SIZEOF_SIZE_T ;
5078
5088
}
@@ -5086,6 +5096,52 @@ find_first_nonascii(const char *start, const char *end)
5086
5096
return p - start ;
5087
5097
}
5088
5098
5099
/* Return 1 if `ch` is not a UTF-8 continuation byte, else 0.
 *
 * Only bits 7 and 6 of `ch` are examined: 0xxxxxxx and 11xxxxxx yield 1,
 * 10xxxxxx yields 0.
 */
static inline int
scalar_utf8_start_char(unsigned int ch)
{
    // Keep the two top bits of the byte; a continuation byte is exactly "10".
    const unsigned int top_bits = ch & 0xC0u;
    return top_bits != 0x80u;
}
5104
+
5105
+ static inline size_t vector_utf8_start_chars (size_t v )
5106
+ {
5107
+ return ((~v >>7 ) | (v >>6 )) & VECTOR_0101 ;
5108
+ }
5109
+
5110
+ static Py_ssize_t utf8_count_codepoints (const unsigned char * s , Py_ssize_t size )
5111
+ {
5112
+ Py_ssize_t len = 0 ;
5113
+ const unsigned char * end = s + size ;
5114
+
5115
+ if (end - s > SIZEOF_SIZE_T * 2 ) {
5116
+ while (!_Py_IS_ALIGNED (s , ALIGNOF_SIZE_T )) {
5117
+ len += scalar_utf8_start_char (* s ++ );
5118
+ }
5119
+
5120
+ while (s + SIZEOF_SIZE_T <= end ) {
5121
+ const unsigned char * e = end ;
5122
+ if (e - s > SIZEOF_SIZE_T * 255 ) {
5123
+ e = s + SIZEOF_SIZE_T * 255 ;
5124
+ }
5125
+ Py_ssize_t vstart = 0 ;
5126
+ while (s + SIZEOF_SIZE_T <= e ) {
5127
+ size_t v = * (size_t * )s ;
5128
+ size_t vs = vector_utf8_start_chars (v );
5129
+ vstart += vs ;
5130
+ s += SIZEOF_SIZE_T ;
5131
+ }
5132
+ vstart = (vstart & VECTOR_00FF ) + ((vstart >> 8 ) & VECTOR_00FF );
5133
+ vstart += vstart >> 16 ;
5134
+ #if SIZEOF_SIZE_T == 8
5135
+ vstart += vstart >> 32 ;
5136
+ #endif
5137
+ len += vstart & 0x7ff ;
5138
+ }
5139
+ }
5140
+ while (s < end ) {
5141
+ len += scalar_utf8_start_char (* s ++ );
5142
+ }
5143
+ return len ;
5144
+ }
5089
5145
5090
5146
static int
5091
5147
unicode_decode_utf8_impl (_PyUnicodeWriter * writer ,
@@ -5234,8 +5290,7 @@ unicode_decode_utf8(const char *s, Py_ssize_t size,
5234
5290
const char * end = s + size ;
5235
5291
5236
5292
Py_ssize_t pos = find_first_nonascii (starts , end );
5237
- if (pos == size ) {
5238
- // fast path: ASCII
5293
+ if (pos == size ) { // fast path: ASCII string.
5239
5294
PyObject * u = PyUnicode_New (size , 127 );
5240
5295
if (u == NULL ) {
5241
5296
return NULL ;
@@ -5248,8 +5303,11 @@ unicode_decode_utf8(const char *s, Py_ssize_t size,
5248
5303
}
5249
5304
5250
5305
int maxchr = 127 ;
5306
+ Py_ssize_t maxsize = size ;
5307
+
5251
5308
unsigned char ch = (unsigned char )s [pos ];
5252
5309
if (error_handler == _Py_ERROR_STRICT && ch >= 0xc2 ) {
5310
+ maxsize = utf8_count_codepoints ((const unsigned char * )s , size );
5253
5311
if (ch < 0xc4 ) { // latin1
5254
5312
maxchr = 255 ;
5255
5313
}
@@ -5260,7 +5318,7 @@ unicode_decode_utf8(const char *s, Py_ssize_t size,
5260
5318
maxchr = 0x10ffff ;
5261
5319
}
5262
5320
}
5263
- PyObject * u = PyUnicode_New (size , maxchr );
5321
+ PyObject * u = PyUnicode_New (maxsize , maxchr );
5264
5322
if (!u ) {
5265
5323
return NULL ;
5266
5324
}
0 commit comments