Skip to content

Commit 02a4d57

Browse files
authored
bpo-39087: Optimize PyUnicode_AsUTF8AndSize() (pythonGH-18327)
Avoid using a temporary bytes object.
1 parent 0c6e3aa commit 02a4d57

File tree

3 files changed

+92
-43
lines changed

3 files changed

+92
-43
lines changed
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
Optimize :c:func:`PyUnicode_AsUTF8` and :c:func:`PyUnicode_AsUTF8AndSize`
2+
slightly when they need to create the internal UTF-8 cache.

Objects/stringlib/codecs.h

Lines changed: 17 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -256,8 +256,9 @@ STRINGLIB(utf8_decode)(const char **inptr, const char *end,
256256
/* UTF-8 encoder specialized for a Unicode kind to avoid the slow
257257
PyUnicode_READ() macro. Delete some parts of the code depending on the kind:
258258
UCS-1 strings don't need to handle surrogates for example. */
259-
Py_LOCAL_INLINE(PyObject *)
260-
STRINGLIB(utf8_encoder)(PyObject *unicode,
259+
Py_LOCAL_INLINE(char *)
260+
STRINGLIB(utf8_encoder)(_PyBytesWriter *writer,
261+
PyObject *unicode,
261262
STRINGLIB_CHAR *data,
262263
Py_ssize_t size,
263264
_Py_error_handler error_handler,
@@ -277,17 +278,16 @@ STRINGLIB(utf8_encoder)(PyObject *unicode,
277278
#else /* STRINGLIB_SIZEOF_CHAR == 4 */
278279
const Py_ssize_t max_char_size = 4;
279280
#endif
280-
_PyBytesWriter writer;
281281

282282
assert(size >= 0);
283-
_PyBytesWriter_Init(&writer);
284-
285283
if (size > PY_SSIZE_T_MAX / max_char_size) {
286284
/* integer overflow */
287-
return PyErr_NoMemory();
285+
PyErr_NoMemory();
286+
return NULL;
288287
}
289288

290-
p = _PyBytesWriter_Alloc(&writer, size * max_char_size);
289+
_PyBytesWriter_Init(writer);
290+
p = _PyBytesWriter_Alloc(writer, size * max_char_size);
291291
if (p == NULL)
292292
return NULL;
293293

@@ -323,7 +323,7 @@ STRINGLIB(utf8_encoder)(PyObject *unicode,
323323
endpos++;
324324

325325
/* Only overallocate the buffer if it's not the last write */
326-
writer.overallocate = (endpos < size);
326+
writer->overallocate = (endpos < size);
327327

328328
switch (error_handler)
329329
{
@@ -347,8 +347,8 @@ STRINGLIB(utf8_encoder)(PyObject *unicode,
347347

348348
case _Py_ERROR_BACKSLASHREPLACE:
349349
/* subtract preallocated bytes */
350-
writer.min_size -= max_char_size * (endpos - startpos);
351-
p = backslashreplace(&writer, p,
350+
writer->min_size -= max_char_size * (endpos - startpos);
351+
p = backslashreplace(writer, p,
352352
unicode, startpos, endpos);
353353
if (p == NULL)
354354
goto error;
@@ -357,8 +357,8 @@ STRINGLIB(utf8_encoder)(PyObject *unicode,
357357

358358
case _Py_ERROR_XMLCHARREFREPLACE:
359359
/* subtract preallocated bytes */
360-
writer.min_size -= max_char_size * (endpos - startpos);
361-
p = xmlcharrefreplace(&writer, p,
360+
writer->min_size -= max_char_size * (endpos - startpos);
361+
p = xmlcharrefreplace(writer, p,
362362
unicode, startpos, endpos);
363363
if (p == NULL)
364364
goto error;
@@ -387,10 +387,10 @@ STRINGLIB(utf8_encoder)(PyObject *unicode,
387387
goto error;
388388

389389
/* subtract preallocated bytes */
390-
writer.min_size -= max_char_size * (newpos - startpos);
390+
writer->min_size -= max_char_size * (newpos - startpos);
391391

392392
if (PyBytes_Check(rep)) {
393-
p = _PyBytesWriter_WriteBytes(&writer, p,
393+
p = _PyBytesWriter_WriteBytes(writer, p,
394394
PyBytes_AS_STRING(rep),
395395
PyBytes_GET_SIZE(rep));
396396
}
@@ -406,7 +406,7 @@ STRINGLIB(utf8_encoder)(PyObject *unicode,
406406
goto error;
407407
}
408408

409-
p = _PyBytesWriter_WriteBytes(&writer, p,
409+
p = _PyBytesWriter_WriteBytes(writer, p,
410410
PyUnicode_DATA(rep),
411411
PyUnicode_GET_LENGTH(rep));
412412
}
@@ -420,7 +420,7 @@ STRINGLIB(utf8_encoder)(PyObject *unicode,
420420

421421
/* If overallocation was disabled, ensure that it was the last
422422
write. Otherwise, we missed an optimization */
423-
assert(writer.overallocate || i == size);
423+
assert(writer->overallocate || i == size);
424424
}
425425
else
426426
#if STRINGLIB_SIZEOF_CHAR > 2
@@ -449,14 +449,13 @@ STRINGLIB(utf8_encoder)(PyObject *unicode,
449449
Py_XDECREF(error_handler_obj);
450450
Py_XDECREF(exc);
451451
#endif
452-
return _PyBytesWriter_Finish(&writer, p);
452+
return p;
453453

454454
#if STRINGLIB_SIZEOF_CHAR > 1
455455
error:
456456
Py_XDECREF(rep);
457457
Py_XDECREF(error_handler_obj);
458458
Py_XDECREF(exc);
459-
_PyBytesWriter_Dealloc(&writer);
460459
return NULL;
461460
#endif
462461
}

Objects/unicodeobject.c

Lines changed: 73 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -3991,11 +3991,11 @@ PyUnicode_FSDecoder(PyObject* arg, void* addr)
39913991
}
39923992

39933993

3994+
static int unicode_fill_utf8(PyObject *unicode);
3995+
39943996
const char *
39953997
PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
39963998
{
3997-
PyObject *bytes;
3998-
39993999
if (!PyUnicode_Check(unicode)) {
40004000
PyErr_BadArgument();
40014001
return NULL;
@@ -4004,21 +4004,9 @@ PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
40044004
return NULL;
40054005

40064006
if (PyUnicode_UTF8(unicode) == NULL) {
4007-
assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
4008-
bytes = _PyUnicode_AsUTF8String(unicode, NULL);
4009-
if (bytes == NULL)
4010-
return NULL;
4011-
_PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
4012-
if (_PyUnicode_UTF8(unicode) == NULL) {
4013-
PyErr_NoMemory();
4014-
Py_DECREF(bytes);
4007+
if (unicode_fill_utf8(unicode) == -1) {
40154008
return NULL;
40164009
}
4017-
_PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
4018-
memcpy(_PyUnicode_UTF8(unicode),
4019-
PyBytes_AS_STRING(bytes),
4020-
_PyUnicode_UTF8_LENGTH(unicode) + 1);
4021-
Py_DECREF(bytes);
40224010
}
40234011

40244012
if (psize)
@@ -5381,10 +5369,6 @@ static PyObject *
53815369
unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
53825370
const char *errors)
53835371
{
5384-
enum PyUnicode_Kind kind;
5385-
void *data;
5386-
Py_ssize_t size;
5387-
53885372
if (!PyUnicode_Check(unicode)) {
53895373
PyErr_BadArgument();
53905374
return NULL;
@@ -5397,22 +5381,86 @@ unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
53975381
return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
53985382
PyUnicode_UTF8_LENGTH(unicode));
53995383

5400-
kind = PyUnicode_KIND(unicode);
5401-
data = PyUnicode_DATA(unicode);
5402-
size = PyUnicode_GET_LENGTH(unicode);
5384+
enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
5385+
void *data = PyUnicode_DATA(unicode);
5386+
Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
5387+
5388+
_PyBytesWriter writer;
5389+
char *end;
54035390

54045391
switch (kind) {
54055392
default:
54065393
Py_UNREACHABLE();
54075394
case PyUnicode_1BYTE_KIND:
54085395
/* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
54095396
assert(!PyUnicode_IS_ASCII(unicode));
5410-
return ucs1lib_utf8_encoder(unicode, data, size, error_handler, errors);
5397+
end = ucs1lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
5398+
break;
5399+
case PyUnicode_2BYTE_KIND:
5400+
end = ucs2lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
5401+
break;
5402+
case PyUnicode_4BYTE_KIND:
5403+
end = ucs4lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
5404+
break;
5405+
}
5406+
5407+
if (end == NULL) {
5408+
_PyBytesWriter_Dealloc(&writer);
5409+
return NULL;
5410+
}
5411+
return _PyBytesWriter_Finish(&writer, end);
5412+
}
5413+
5414+
static int
5415+
unicode_fill_utf8(PyObject *unicode)
5416+
{
5417+
/* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5418+
assert(!PyUnicode_IS_ASCII(unicode));
5419+
5420+
enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
5421+
void *data = PyUnicode_DATA(unicode);
5422+
Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
5423+
5424+
_PyBytesWriter writer;
5425+
char *end;
5426+
5427+
switch (kind) {
5428+
default:
5429+
Py_UNREACHABLE();
5430+
case PyUnicode_1BYTE_KIND:
5431+
end = ucs1lib_utf8_encoder(&writer, unicode, data, size,
5432+
_Py_ERROR_STRICT, NULL);
5433+
break;
54115434
case PyUnicode_2BYTE_KIND:
5412-
return ucs2lib_utf8_encoder(unicode, data, size, error_handler, errors);
5435+
end = ucs2lib_utf8_encoder(&writer, unicode, data, size,
5436+
_Py_ERROR_STRICT, NULL);
5437+
break;
54135438
case PyUnicode_4BYTE_KIND:
5414-
return ucs4lib_utf8_encoder(unicode, data, size, error_handler, errors);
5439+
end = ucs4lib_utf8_encoder(&writer, unicode, data, size,
5440+
_Py_ERROR_STRICT, NULL);
5441+
break;
5442+
}
5443+
if (end == NULL) {
5444+
_PyBytesWriter_Dealloc(&writer);
5445+
return -1;
5446+
}
5447+
5448+
char *start = writer.use_small_buffer ? writer.small_buffer :
5449+
PyBytes_AS_STRING(writer.buffer);
5450+
Py_ssize_t len = end - start;
5451+
5452+
char *cache = PyObject_MALLOC(len + 1);
5453+
if (cache == NULL) {
5454+
_PyBytesWriter_Dealloc(&writer);
5455+
PyErr_NoMemory();
5456+
return -1;
54155457
}
5458+
_PyUnicode_UTF8(unicode) = cache;
5459+
_PyUnicode_UTF8_LENGTH(unicode) = len;
5460+
memcpy(cache, start, len);
5461+
cache[len] = '\0';
5462+
_PyBytesWriter_Dealloc(&writer);
5463+
return 0;
54165464
}
54175465

54185466
PyObject *

0 commit comments

Comments
 (0)