@@ -5061,6 +5061,14 @@ load_unaligned(const unsigned char *p, size_t size)
5061
5061
}
5062
5062
#endif
5063
5063
5064
+ /*
5065
+ * Find the first non-ASCII character in a byte sequence.
5066
+ *
5067
+ * This function scans a range of bytes from `start` to `end` and returns the
5068
+ * index of the first byte that is not an ASCII character (i.e., has the most
5069
+ * significant bit set). If all characters in the range are ASCII, it returns
5070
+ * `end - start`.
5071
+ */
5064
5072
static Py_ssize_t
5065
5073
find_first_nonascii (const unsigned char * start , const unsigned char * end )
5066
5074
{
@@ -5122,18 +5130,23 @@ find_first_nonascii(const unsigned char *start, const unsigned char *end)
5122
5130
#endif
5123
5131
}
5124
5132
5125
/*
 * Tell whether a single UTF-8 byte begins a code point.
 *
 * `ch` is the byte value to classify.  Only bits 6 and 7 are examined:
 *   - bit 7 clear (0xxxxxxx)       -> 1
 *   - bits 7 and 6 set (11xxxxxx)  -> 1
 *   - bit 7 set, bit 6 clear (10xxxxxx, a continuation byte) -> 0
 *
 * Returns 1 or 0.
 */
static inline int
scalar_utf8_start_char(unsigned int ch)
{
    // (~ch >> 7) has bit 0 set when bit 7 of ch is clear;
    // (ch >> 6) has bit 0 set when bit 6 of ch is set.
    return ((~ch >> 7) | (ch >> 6)) & 1;
}
5130
5139
5131
- static inline size_t vector_utf8_start_chars (size_t v )
5140
+ static inline size_t
5141
+ vector_utf8_start_chars (size_t v )
5132
5142
{
5133
5143
return ((~v >> 7 ) | (v >> 6 )) & VECTOR_0101 ;
5134
5144
}
5135
5145
5136
- static Py_ssize_t utf8_count_codepoints (const unsigned char * s , const unsigned char * end )
5146
+
5147
+ // Count the number of UTF-8 code points in a given byte sequence.
5148
+ static Py_ssize_t
5149
+ utf8_count_codepoints (const unsigned char * s , const unsigned char * end )
5137
5150
{
5138
5151
Py_ssize_t len = 0 ;
5139
5152
@@ -5377,6 +5390,11 @@ unicode_decode_utf8(const char *s, Py_ssize_t size,
5377
5390
// otherwise: check the input and decide the maxchr and maxsize to reduce
5378
5391
// reallocation and copy.
5379
5392
if (error_handler == _Py_ERROR_STRICT && !consumed && ch >= 0xc2 ) {
5393
+ // We only calculate the number of codepoints and don't determine the exact maxchr.
5394
+ // This is because writing fast and portable SIMD code to find maxchr is difficult.
5395
+ // If reallocation occurs for a larger maxchar, knowing the exact number of codepoints
5396
+ // means that it is no longer necessary to allocate several times the required amount
5397
+ // of memory.
5380
5398
maxsize = utf8_count_codepoints ((const unsigned char * )s , (const unsigned char * )end );
5381
5399
if (ch < 0xc4 ) { // latin1
5382
5400
maxchr = 0xff ;
0 commit comments