Skip to content

Commit 72ed21d

Browse files
committed
add some comments
1 parent 092c189 commit 72ed21d

File tree

1 file changed

+37
-3
lines changed

1 file changed

+37
-3
lines changed

Objects/unicodeobject.c

Lines changed: 37 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -785,6 +785,22 @@ unicode_result(PyObject *unicode)
785785
static PyObject*
786786
unicode_result_unchanged(PyObject *unicode)
787787
{
788+
789+
/* Check whether a Unicode string is a palindrome.
 *
 * Compares the value read at each index i in the first half of `unicode`
 * with the value read at the mirrored index length - i - 1.
 * Returns 1 if every pair matches (including lengths 0 and 1, for which the
 * loop body never runs) and 0 at the first mismatch.
 *
 * NOTE(review): in this diff the definition sits between the opening brace
 * of unicode_result_unchanged() and that function's first statement; nested
 * function definitions are not standard C -- confirm the intended placement.
 */
790+
static int
791+
unicode_is_palindrome(PyObject *unicode)
792+
{
793+
Py_ssize_t length = PyUnicode_GET_LENGTH(unicode);
794+
int kind = PyUnicode_KIND(unicode);
795+
const void *data = PyUnicode_DATA(unicode);
796+
797+
for (Py_ssize_t i = 0; i < length / 2; i++) {  /* the middle element of an odd-length string is never visited */
798+
if (PyUnicode_READ(kind, data, i) != PyUnicode_READ(kind, data, length - i - 1)) {
799+
return 0;  /* first mismatching pair: not a palindrome */
800+
}
801+
}
802+
return 1;  /* all mirrored pairs matched */
803+
}
788804
if (PyUnicode_CheckExact(unicode)) {
789805
return Py_NewRef(unicode);
790806
}
@@ -5061,6 +5077,14 @@ load_unaligned(const unsigned char *p, size_t size)
50615077
}
50625078
#endif
50635079

5080+
/*
5081+
* Find the first non-ASCII byte in a byte sequence.
5082+
*
5083+
* This function scans the bytes in the half-open range [`start`, `end`) and
5084+
* returns the offset from `start` of the first byte that is not ASCII (i.e.,
5085+
* has its most significant bit set). If every byte in the range is ASCII, it
5086+
* returns `end - start`.
5087+
*/
50645088
static Py_ssize_t
50655089
find_first_nonascii(const unsigned char *start, const unsigned char *end)
50665090
{
@@ -5122,18 +5146,23 @@ find_first_nonascii(const unsigned char *start, const unsigned char *end)
51225146
#endif
51235147
}
51245148

5125-
static inline int scalar_utf8_start_char(unsigned int ch)
5149+
static inline int
5150+
scalar_utf8_start_char(unsigned int ch)
51265151
{
51275152
// A byte of the form 0xxxxxxx or 11xxxxxx is the first byte of a UTF-8 sequence.
// The result is (bit 7 of ch clear) OR (bit 6 of ch set), so it is 0 only when
// bits 7..6 are "10" and 1 otherwise.
51285153
return (~ch >> 7 | ch >> 6) & 1;
51295154
}
51305155

5131-
static inline size_t vector_utf8_start_chars(size_t v)
5156+
static inline size_t
5157+
vector_utf8_start_chars(size_t v)
51325158
{
51335159
// Applies the "bit 7 clear OR bit 6 set" test to every byte of v at once:
// each shift moves the tested bit of a byte down to that byte's bit 0, and the
// result is then masked with VECTOR_0101.
// NOTE(review): VECTOR_0101 is defined outside this view -- presumably it has
// only the lowest bit of every byte set; confirm.
return ((~v >> 7) | (v >> 6)) & VECTOR_0101;
51345160
}
51355161

5136-
static Py_ssize_t utf8_count_codepoints(const unsigned char *s, const unsigned char *end)
5162+
5163+
// Count the number of code points encoded in the UTF-8 byte sequence from `s` to `end`.
5164+
static Py_ssize_t
5165+
utf8_count_codepoints(const unsigned char *s, const unsigned char *end)
51375166
{
51385167
Py_ssize_t len = 0;
51395168

@@ -5377,6 +5406,11 @@ unicode_decode_utf8(const char *s, Py_ssize_t size,
53775406
// otherwise: check the input and decide the maxchr and maxsize to reduce
53785407
// reallocation and copy.
53795408
if (error_handler == _Py_ERROR_STRICT && !consumed && ch >= 0xc2) {
5409+
// We only calculate the number of code points here and don't determine the exact maxchr,
5410+
// because writing fast and portable SIMD code to find maxchr is difficult.
5411+
// If a reallocation is later needed for a larger maxchr, knowing the exact number of code points
5412+
// means that it is no longer necessary to allocate several times the required amount
5413+
// of memory.
53805414
maxsize = utf8_count_codepoints((const unsigned char *)s, (const unsigned char *)end);
53815415
if (ch < 0xc4) { // latin1
53825416
maxchr = 0xff;

0 commit comments

Comments
 (0)