@@ -4978,12 +4978,17 @@ PyUnicode_DecodeUTF8(const char *s,
4978
4978
#include "stringlib/codecs.h"
4979
4979
#include "stringlib/undef.h"
4980
4980
4981
/* Bit patterns replicated across every byte of a C 'size_t'.

   ASCII_CHAR_MASK has the high bit of each byte set; it is the mask used
   to quickly check whether a C 'size_t' contains a non-ASCII,
   UTF8-encoded char.

   VECTOR_0101 (low bit of every byte) and VECTOR_00FF (every other byte)
   are used to count codepoints in a UTF-8 string. */
#if SIZEOF_SIZE_T == 8
#  define ASCII_CHAR_MASK 0x8080808080808080ULL
#  define VECTOR_0101     0x0101010101010101ULL
#  define VECTOR_00FF     0x00ff00ff00ff00ffULL
#elif SIZEOF_SIZE_T == 4
#  define ASCII_CHAR_MASK 0x80808080U
#  define VECTOR_0101     0x01010101U
#  define VECTOR_00FF     0x00ff00ffU
#else
#  error C 'size_t' size should be either 4 or 8!
#endif
@@ -5087,6 +5092,61 @@ find_first_nonascii(const char *start, const char *end)
5087
5092
}
5088
5093
5089
5094
5095
+ #if SIZEOF_SIZE_T == 4
5096
+ const size_t vector_01 = 0x01010101 ;
5097
+ const size_t vector_00ff = 0x00ff00ff ;
5098
+ #else
5099
+ const size_t vector_01 = 0x0101010101010101 ;
5100
+ const size_t vector_00ff = 0x00ff00ff00ff00ff ;
5101
+ #endif
5102
+
5103
/* Return 1 if 'ch' can begin a UTF-8 sequence, 0 otherwise.

   Only bits 7 and 6 of 'ch' are examined: the pattern 10xxxxxx yields 0,
   while 0xxxxxxx and 11xxxxxx both yield 1. */
static inline int
scalar_utf8_start_char(unsigned int ch)
{
    const unsigned int top_two_bits = ch & 0xC0u;

    return top_two_bits != 0x80u;
}
5108
+
5109
+ static inline size_t vector_utf8_start_chars (size_t v )
5110
+ {
5111
+ return ((~v >>7 ) | (v >>6 )) & VECTOR_0101 ;
5112
+ }
5113
+
5114
+ static Py_ssize_t utf8_count (const unsigned char * s , Py_ssize_t size )
5115
+ {
5116
+ Py_ssize_t len = 0 ;
5117
+ const unsigned char * end = s + size ;
5118
+
5119
+ if (end - s > SIZEOF_SIZE_T * 2 ) {
5120
+ while (!_Py_IS_ALIGNED (s , ALIGNOF_SIZE_T )) {
5121
+ len += scalar_utf8_start_char (* s ++ );
5122
+ }
5123
+
5124
+ while (s + SIZEOF_SIZE_T <= end ) {
5125
+ const unsigned char * e = end ;
5126
+ if (e - s > SIZEOF_SIZE_T * 255 ) {
5127
+ e = s + SIZEOF_SIZE_T * 255 ;
5128
+ }
5129
+ Py_ssize_t vstart = 0 ;
5130
+ while (s + SIZEOF_SIZE_T <= e ) {
5131
+ size_t v = * (size_t * )s ;
5132
+ size_t vs = vector_utf8_start_chars (v );
5133
+ vstart += vs ;
5134
+ s += SIZEOF_SIZE_T ;
5135
+ }
5136
+ vstart = (vstart & VECTOR_00FF ) + ((vstart >> 8 ) & VECTOR_00FF );
5137
+ vstart += vstart >> 16 ;
5138
+ #if SIZEOF_SIZE_T == 8
5139
+ vstart += vstart >> 32 ;
5140
+ #endif
5141
+ len += vstart & 0x7ff ;
5142
+ }
5143
+ }
5144
+ while (s < end ) {
5145
+ len += scalar_utf8_start_char (* s ++ );
5146
+ }
5147
+ return len ;
5148
+ }
5149
+
5090
5150
static int
5091
5151
unicode_decode_utf8_impl (_PyUnicodeWriter * writer ,
5092
5152
const char * starts , const char * s , const char * end ,
@@ -5234,8 +5294,7 @@ unicode_decode_utf8(const char *s, Py_ssize_t size,
5234
5294
const char * end = s + size ;
5235
5295
5236
5296
Py_ssize_t pos = find_first_nonascii (starts , end );
5237
- if (pos == size ) {
5238
- // fast path: ASCII
5297
+ if (pos == size ) { // fast path: ASCII string.
5239
5298
PyObject * u = PyUnicode_New (size , 127 );
5240
5299
if (u == NULL ) {
5241
5300
return NULL ;
@@ -5248,8 +5307,11 @@ unicode_decode_utf8(const char *s, Py_ssize_t size,
5248
5307
}
5249
5308
5250
5309
int maxchr = 127 ;
5310
+ Py_ssize_t maxsize = size ;
5311
+
5251
5312
unsigned char ch = (unsigned char )s [pos ];
5252
5313
if (error_handler == _Py_ERROR_STRICT && ch >= 0xc2 ) {
5314
+ maxsize = utf8_count ((const unsigned char * )s , size );
5253
5315
if (ch < 0xc4 ) { // latin1
5254
5316
maxchr = 255 ;
5255
5317
}
@@ -5260,7 +5322,7 @@ unicode_decode_utf8(const char *s, Py_ssize_t size,
5260
5322
maxchr = 0x10ffff ;
5261
5323
}
5262
5324
}
5263
- PyObject * u = PyUnicode_New (size , maxchr );
5325
+ PyObject * u = PyUnicode_New (maxsize , maxchr );
5264
5326
if (!u ) {
5265
5327
return NULL ;
5266
5328
}
0 commit comments