Skip to content

Commit 96c7b19

Browse files
committed
add some comments
1 parent 092c189 commit 96c7b19

File tree

1 file changed

+21
-3
lines changed

1 file changed

+21
-3
lines changed

Objects/unicodeobject.c

Lines changed: 21 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5061,6 +5061,14 @@ load_unaligned(const unsigned char *p, size_t size)
50615061
}
50625062
#endif
50635063

5064+
/*
5065+
* Find the first non-ASCII character in a byte sequence.
5066+
*
5067+
* This function scans a range of bytes from `start` to `end` and returns the
5068+
* index of the first byte that is not an ASCII character (i.e., has the most
5069+
* significant bit set). If all characters in the range are ASCII, it returns
5070+
* `end - start`.
5071+
*/
50645072
static Py_ssize_t
50655073
find_first_nonascii(const unsigned char *start, const unsigned char *end)
50665074
{
@@ -5122,18 +5130,23 @@ find_first_nonascii(const unsigned char *start, const unsigned char *end)
51225130
#endif
51235131
}
51245132

5125-
static inline int scalar_utf8_start_char(unsigned int ch)
5133+
/*
 * Tell whether `ch` can be the first byte of a UTF-8 encoded character.
 *
 * Only bits 6 and 7 of `ch` are inspected.  The result is 0 when they read
 * "10" (the 10xxxxxx continuation-byte pattern) and 1 otherwise, i.e. for
 * 0xxxxxxx and 11xxxxxx.
 */
static inline int
scalar_utf8_start_char(unsigned int ch)
{
    const unsigned int top_two_bits = ch & 0xC0u;

    // Anything that is not a continuation byte counts as a start byte.
    return top_two_bits != 0x80u;
}
51305139

5131-
static inline size_t vector_utf8_start_chars(size_t v)
5140+
static inline size_t
5141+
vector_utf8_start_chars(size_t v)
51325142
{
51335143
return ((~v >> 7) | (v >> 6)) & VECTOR_0101;
51345144
}
51355145

5136-
static Py_ssize_t utf8_count_codepoints(const unsigned char *s, const unsigned char *end)
5146+
5147+
// Count the number of UTF-8 code points in a given byte sequence.
5148+
static Py_ssize_t
5149+
utf8_count_codepoints(const unsigned char *s, const unsigned char *end)
51375150
{
51385151
Py_ssize_t len = 0;
51395152

@@ -5377,6 +5390,11 @@ unicode_decode_utf8(const char *s, Py_ssize_t size,
53775390
// otherwise: check the input and decide the maxchr and maxsize to reduce
53785391
// reallocation and copy.
53795392
if (error_handler == _Py_ERROR_STRICT && !consumed && ch >= 0xc2) {
5393+
// We only calculate the number of codepoints and don't determine the exact maxchr.
5394+
// This is because writing fast and portable SIMD code to find maxchr is difficult.
5395+
// If reallocation occurs for a larger maxchr, knowing the exact number of codepoints
5396+
// means that it is no longer necessary to allocate several times the required amount
5397+
// of memory.
53805398
maxsize = utf8_count_codepoints((const unsigned char *)s, (const unsigned char *)end);
53815399
if (ch < 0xc4) { // latin1
53825400
maxchr = 0xff;

0 commit comments

Comments
 (0)