Skip to content

Commit 84a2b8d

Browse files
committed
gh-111089: Add cache to PyUnicode_AsUTF8() for embedded NUL
Add PyASCIIObject.state.embed_null member to Python str objects. It is used as a cache by PyUnicode_AsUTF8() to only check once if a string contains a null character. Strings created by PyUnicode_FromString() initialize *embed_null* to 0 since such strings cannot contain a null character. Global static strings now also initialize the *embed_null* member. The chr(0) singleton ("\0" string) is the only static string which contains a null character.
1 parent 102685c commit 84a2b8d

File tree

7 files changed

+85
-10
lines changed

7 files changed

+85
-10
lines changed

Include/cpython/unicodeobject.h

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -142,9 +142,16 @@ typedef struct {
142142
unsigned int ascii:1;
143143
/* The object is statically allocated. */
144144
unsigned int statically_allocated:1;
145+
// Does the string embed null characters? Possible values:
146+
// 0: No
147+
// 1: Yes
148+
// 2: Unknown, the string must be scanned
149+
// 3: Invalid state (must not be used)
150+
// Cache used by PyUnicode_AsUTF8() to avoid calling strlen().
151+
unsigned int embed_null:2;
145152
/* Padding to ensure that PyUnicode_DATA() is always aligned to
146153
4 bytes (see issue #19537 on m68k). */
147-
unsigned int :24;
154+
unsigned int :22;
148155
} state;
149156
} PyASCIIObject;
150157

Include/internal/pycore_runtime_init.h

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -215,7 +215,7 @@ extern PyTypeObject _PyExc_MemoryError;
215215
_PyBytes_SIMPLE_INIT((CH), 1) \
216216
}
217217

218-
#define _PyUnicode_ASCII_BASE_INIT(LITERAL, ASCII) \
218+
#define _PyUnicode_ASCII_BASE_INIT(LITERAL, ASCII, EMBED_NUL) \
219219
{ \
220220
.ob_base = _PyObject_HEAD_INIT(&PyUnicode_Type), \
221221
.length = sizeof(LITERAL) - 1, \
@@ -225,11 +225,17 @@ extern PyTypeObject _PyExc_MemoryError;
225225
.compact = 1, \
226226
.ascii = (ASCII), \
227227
.statically_allocated = 1, \
228+
.embed_null = (EMBED_NUL), \
228229
}, \
229230
}
230231
#define _PyASCIIObject_INIT(LITERAL) \
231232
{ \
232-
._ascii = _PyUnicode_ASCII_BASE_INIT((LITERAL), 1), \
233+
._ascii = _PyUnicode_ASCII_BASE_INIT((LITERAL), 1, 0), \
234+
._data = (LITERAL) \
235+
}
236+
#define _PyASCIIObject_INIT_embed_null(LITERAL) \
237+
{ \
238+
._ascii = _PyUnicode_ASCII_BASE_INIT((LITERAL), 1, 1), \
233239
._data = (LITERAL) \
234240
}
235241
#define INIT_STR(NAME, LITERAL) \
@@ -239,7 +245,7 @@ extern PyTypeObject _PyExc_MemoryError;
239245
#define _PyUnicode_LATIN1_INIT(LITERAL, UTF8) \
240246
{ \
241247
._latin1 = { \
242-
._base = _PyUnicode_ASCII_BASE_INIT((LITERAL), 0), \
248+
._base = _PyUnicode_ASCII_BASE_INIT((LITERAL), 0, 0), \
243249
.utf8 = (UTF8), \
244250
.utf8_length = sizeof(UTF8) - 1, \
245251
}, \

Include/internal/pycore_runtime_init_generated.h

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
Add ``PyASCIIObject.state.embed_null`` member to Python str objects. It is
2+
used as a cache by :c:func:`PyUnicode_AsUTF8` to only check once if a string
3+
contains a null character. Strings created by :c:func:`PyUnicode_FromString`
4+
initialize *embed_null* to 0 since such strings cannot contain a null character.
5+
Patch by Victor Stinner.

Modules/_testcapi/unicode.c

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -301,7 +301,12 @@ unicode_fromstring(PyObject *self, PyObject *arg)
301301
if (!PyArg_Parse(arg, "z#", &s, &size)) {
302302
return NULL;
303303
}
304-
return PyUnicode_FromString(s);
304+
PyObject *unicode = PyUnicode_FromString(s);
305+
if (unicode == NULL) {
306+
return NULL;
307+
}
308+
assert(((PyASCIIObject*)unicode)->state.embed_null == 0);
309+
return unicode;
305310
}
306311

307312
/* Test PyUnicode_FromKindAndData() */

Objects/unicodeobject.c

Lines changed: 44 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -205,6 +205,10 @@ unicode_decode_utf8(const char *s, Py_ssize_t size,
205205
static inline int unicode_is_finalizing(void);
206206
static int unicode_is_singleton(PyObject *unicode);
207207
#endif
208+
static inline Py_ssize_t
209+
findchar(const void *s, int kind,
210+
Py_ssize_t size, Py_UCS4 ch,
211+
int direction);
208212

209213

210214
// Return a reference to the immortal empty string singleton.
@@ -623,6 +627,15 @@ _PyUnicode_CheckConsistency(PyObject *op, int check_content)
623627
}
624628
CHECK(PyUnicode_READ(kind, data, ascii->length) == 0);
625629
}
630+
631+
if (_PyUnicode_STATE(ascii).embed_null != 2) {
632+
Py_ssize_t pos = findchar(PyUnicode_DATA(ascii),
633+
PyUnicode_KIND(ascii),
634+
PyUnicode_GET_LENGTH(ascii),
635+
0, 1);
636+
assert(_PyUnicode_STATE(ascii).embed_null == (pos >= 0));
637+
}
638+
626639
return 1;
627640

628641
#undef CHECK
@@ -1253,6 +1266,7 @@ PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
12531266
_PyUnicode_STATE(unicode).compact = 1;
12541267
_PyUnicode_STATE(unicode).ascii = is_ascii;
12551268
_PyUnicode_STATE(unicode).statically_allocated = 0;
1269+
_PyUnicode_STATE(unicode).embed_null = 2;
12561270
if (is_ascii) {
12571271
((char*)data)[size] = 0;
12581272
}
@@ -1890,7 +1904,16 @@ PyUnicode_FromString(const char *u)
18901904
PyErr_SetString(PyExc_OverflowError, "input too long");
18911905
return NULL;
18921906
}
1893-
return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
1907+
PyObject *unicode;
1908+
unicode = PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
1909+
if (unicode != NULL) {
1910+
// PyUnicode_DecodeUTF8Stateful(u, strlen(u)) cannot create NUL
1911+
// characters: the UTF-8 decoder with the strict error handler only
1912+
// creates a NUL character if the input string contains a NUL byte
1913+
// which cannot be the case here.
1914+
_PyUnicode_STATE(unicode).embed_null = 0;
1915+
}
1916+
return unicode;
18941917
}
18951918

18961919

@@ -1932,6 +1955,7 @@ _PyUnicode_FromId(_Py_Identifier *id)
19321955
if (!obj) {
19331956
return NULL;
19341957
}
1958+
_PyUnicode_STATE(obj).embed_null = 0;
19351959
PyUnicode_InternInPlace(&obj);
19361960

19371961
if (index >= ids->size) {
@@ -3846,10 +3870,27 @@ PyUnicode_AsUTF8(PyObject *unicode)
38463870
{
38473871
Py_ssize_t size;
38483872
const char *utf8 = PyUnicode_AsUTF8AndSize(unicode, &size);
3849-
if (utf8 != NULL && strlen(utf8) != (size_t)size) {
3850-
PyErr_SetString(PyExc_ValueError, "embedded null character");
3873+
if (utf8 == NULL) {
38513874
return NULL;
38523875
}
3876+
3877+
// Cache to avoid calling O(n) strlen() operation at every
3878+
// PyUnicode_AsUTF8() call on the same object.
3879+
if (_PyUnicode_STATE(unicode).embed_null == 2) {
3880+
if (strlen(utf8) != (size_t)size) {
3881+
_PyUnicode_STATE(unicode).embed_null = 1;
3882+
}
3883+
else {
3884+
_PyUnicode_STATE(unicode).embed_null = 0;
3885+
}
3886+
}
3887+
3888+
if (_PyUnicode_STATE(unicode).embed_null == 1) {
3889+
PyErr_SetString(PyExc_ValueError,
3890+
"embedded null character");
3891+
return NULL;
3892+
}
3893+
38533894
return utf8;
38543895
}
38553896

Tools/build/generate_global_objects.py

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -232,6 +232,14 @@ def open_for_changes(filename, orig):
232232
def generate_global_strings(identifiers, strings):
233233
filename = os.path.join(INTERNAL, 'pycore_global_strings.h')
234234

235+
# NUL characters are not supported; see _PyASCIIObject_INIT_embed_null().
236+
for identifier in identifiers:
237+
if "\0" in identifier:
238+
raise Exception(f"an identifier contains a null character: {identifier!r}")
239+
for string in strings:
240+
if "\0" in string:
241+
raise Exception(f"a string contains a null character: {string!r}")
242+
235243
# Read the non-generated part of the file.
236244
with open(filename) as infile:
237245
orig = infile.read()
@@ -321,7 +329,10 @@ def generate_runtime_init(identifiers, strings):
321329
printer.write('')
322330
with printer.block('#define _Py_str_ascii_INIT', continuation=True):
323331
for i in range(128):
324-
printer.write(f'_PyASCIIObject_INIT("\\x{i:02x}"),')
332+
if i == 0:
333+
printer.write(f'_PyASCIIObject_INIT_embed_null("\\x{i:02x}"),')
334+
else:
335+
printer.write(f'_PyASCIIObject_INIT("\\x{i:02x}"),')
325336
immortal_objects.append(f'(PyObject *)&_Py_SINGLETON(strings).ascii[{i}]')
326337
printer.write('')
327338
with printer.block('#define _Py_str_latin1_INIT', continuation=True):

0 commit comments

Comments
 (0)