Skip to content

Commit 68b7a9c

Browse files
committed
bpo-17659: Optimize PyUnicode_AsUTF8AndSize().
Avoid using a temporary bytes object.
1 parent 1f44e77 commit 68b7a9c

File tree

2 files changed

+90
-43
lines changed

2 files changed

+90
-43
lines changed

Objects/stringlib/codecs.h

Lines changed: 18 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -256,8 +256,9 @@ STRINGLIB(utf8_decode)(const char **inptr, const char *end,
256256
/* UTF-8 encoder specialized for a Unicode kind to avoid the slow
257257
PyUnicode_READ() macro. Delete some parts of the code depending on the kind:
258258
UCS-1 strings don't need to handle surrogates for example. */
259-
Py_LOCAL_INLINE(PyObject *)
260-
STRINGLIB(utf8_encoder)(PyObject *unicode,
259+
Py_LOCAL_INLINE(char *)
260+
STRINGLIB(utf8_encoder)(_PyBytesWriter *writer,
261+
PyObject *unicode,
261262
STRINGLIB_CHAR *data,
262263
Py_ssize_t size,
263264
_Py_error_handler error_handler,
@@ -277,17 +278,16 @@ STRINGLIB(utf8_encoder)(PyObject *unicode,
277278
#else /* STRINGLIB_SIZEOF_CHAR == 4 */
278279
const Py_ssize_t max_char_size = 4;
279280
#endif
280-
_PyBytesWriter writer;
281281

282282
assert(size >= 0);
283-
_PyBytesWriter_Init(&writer);
284-
285283
if (size > PY_SSIZE_T_MAX / max_char_size) {
286284
/* integer overflow */
287-
return PyErr_NoMemory();
285+
PyErr_NoMemory();
286+
return NULL;
288287
}
289288

290-
p = _PyBytesWriter_Alloc(&writer, size * max_char_size);
289+
_PyBytesWriter_Init(writer);
290+
p = _PyBytesWriter_Alloc(writer, size * max_char_size);
291291
if (p == NULL)
292292
return NULL;
293293

@@ -323,7 +323,7 @@ STRINGLIB(utf8_encoder)(PyObject *unicode,
323323
endpos++;
324324

325325
/* Only overallocate the buffer if it's not the last write */
326-
writer.overallocate = (endpos < size);
326+
writer->overallocate = (endpos < size);
327327

328328
switch (error_handler)
329329
{
@@ -347,8 +347,8 @@ STRINGLIB(utf8_encoder)(PyObject *unicode,
347347

348348
case _Py_ERROR_BACKSLASHREPLACE:
349349
/* subtract preallocated bytes */
350-
writer.min_size -= max_char_size * (endpos - startpos);
351-
p = backslashreplace(&writer, p,
350+
writer->min_size -= max_char_size * (endpos - startpos);
351+
p = backslashreplace(writer, p,
352352
unicode, startpos, endpos);
353353
if (p == NULL)
354354
goto error;
@@ -357,8 +357,8 @@ STRINGLIB(utf8_encoder)(PyObject *unicode,
357357

358358
case _Py_ERROR_XMLCHARREFREPLACE:
359359
/* subtract preallocated bytes */
360-
writer.min_size -= max_char_size * (endpos - startpos);
361-
p = xmlcharrefreplace(&writer, p,
360+
writer->min_size -= max_char_size * (endpos - startpos);
361+
p = xmlcharrefreplace(writer, p,
362362
unicode, startpos, endpos);
363363
if (p == NULL)
364364
goto error;
@@ -387,10 +387,10 @@ STRINGLIB(utf8_encoder)(PyObject *unicode,
387387
goto error;
388388

389389
/* subtract preallocated bytes */
390-
writer.min_size -= max_char_size * (newpos - startpos);
390+
writer->min_size -= max_char_size * (newpos - startpos);
391391

392392
if (PyBytes_Check(rep)) {
393-
p = _PyBytesWriter_WriteBytes(&writer, p,
393+
p = _PyBytesWriter_WriteBytes(writer, p,
394394
PyBytes_AS_STRING(rep),
395395
PyBytes_GET_SIZE(rep));
396396
}
@@ -406,7 +406,7 @@ STRINGLIB(utf8_encoder)(PyObject *unicode,
406406
goto error;
407407
}
408408

409-
p = _PyBytesWriter_WriteBytes(&writer, p,
409+
p = _PyBytesWriter_WriteBytes(writer, p,
410410
PyUnicode_DATA(rep),
411411
PyUnicode_GET_LENGTH(rep));
412412
}
@@ -420,7 +420,7 @@ STRINGLIB(utf8_encoder)(PyObject *unicode,
420420

421421
/* If overallocation was disabled, ensure that it was the last
422422
write. Otherwise, we missed an optimization */
423-
assert(writer.overallocate || i == size);
423+
assert(writer->overallocate || i == size);
424424
}
425425
else
426426
#if STRINGLIB_SIZEOF_CHAR > 2
@@ -449,14 +449,14 @@ STRINGLIB(utf8_encoder)(PyObject *unicode,
449449
Py_XDECREF(error_handler_obj);
450450
Py_XDECREF(exc);
451451
#endif
452-
return _PyBytesWriter_Finish(&writer, p);
452+
return p;
453453

454454
#if STRINGLIB_SIZEOF_CHAR > 1
455455
error:
456456
Py_XDECREF(rep);
457457
Py_XDECREF(error_handler_obj);
458458
Py_XDECREF(exc);
459-
_PyBytesWriter_Dealloc(&writer);
459+
_PyBytesWriter_Dealloc(writer);
460460
return NULL;
461461
#endif
462462
}

Objects/unicodeobject.c

Lines changed: 72 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -3996,11 +3996,11 @@ PyUnicode_FSDecoder(PyObject* arg, void* addr)
39963996
}
39973997

39983998

3999+
static int unicode_fill_utf8(PyObject *unicode);
4000+
39994001
const char *
40004002
PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
40014003
{
4002-
PyObject *bytes;
4003-
40044004
if (!PyUnicode_Check(unicode)) {
40054005
PyErr_BadArgument();
40064006
return NULL;
@@ -4009,21 +4009,9 @@ PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
40094009
return NULL;
40104010

40114011
if (PyUnicode_UTF8(unicode) == NULL) {
4012-
assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
4013-
bytes = _PyUnicode_AsUTF8String(unicode, NULL);
4014-
if (bytes == NULL)
4015-
return NULL;
4016-
_PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
4017-
if (_PyUnicode_UTF8(unicode) == NULL) {
4018-
PyErr_NoMemory();
4019-
Py_DECREF(bytes);
4012+
if (unicode_fill_utf8(unicode) == -1) {
40204013
return NULL;
40214014
}
4022-
_PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
4023-
memcpy(_PyUnicode_UTF8(unicode),
4024-
PyBytes_AS_STRING(bytes),
4025-
_PyUnicode_UTF8_LENGTH(unicode) + 1);
4026-
Py_DECREF(bytes);
40274015
}
40284016

40294017
if (psize)
@@ -5386,10 +5374,6 @@ static PyObject *
53865374
unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
53875375
const char *errors)
53885376
{
5389-
enum PyUnicode_Kind kind;
5390-
void *data;
5391-
Py_ssize_t size;
5392-
53935377
if (!PyUnicode_Check(unicode)) {
53945378
PyErr_BadArgument();
53955379
return NULL;
@@ -5402,22 +5386,85 @@ unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
54025386
return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
54035387
PyUnicode_UTF8_LENGTH(unicode));
54045388

5405-
kind = PyUnicode_KIND(unicode);
5406-
data = PyUnicode_DATA(unicode);
5407-
size = PyUnicode_GET_LENGTH(unicode);
5389+
enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
5390+
void *data = PyUnicode_DATA(unicode);
5391+
Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
5392+
5393+
_PyBytesWriter writer;
5394+
char *end;
54085395

54095396
switch (kind) {
54105397
default:
54115398
Py_UNREACHABLE();
54125399
case PyUnicode_1BYTE_KIND:
54135400
/* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
54145401
assert(!PyUnicode_IS_ASCII(unicode));
5415-
return ucs1lib_utf8_encoder(unicode, data, size, error_handler, errors);
5402+
end = ucs1lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
5403+
break;
5404+
case PyUnicode_2BYTE_KIND:
5405+
end = ucs2lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
5406+
break;
5407+
case PyUnicode_4BYTE_KIND:
5408+
end = ucs4lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
5409+
break;
5410+
}
5411+
5412+
if (end == NULL) {
5413+
return NULL;
5414+
}
5415+
return _PyBytesWriter_Finish(&writer, end);
5416+
}
5417+
5418+
static int
5419+
unicode_fill_utf8(PyObject *unicode)
5420+
{
5421+
/* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5422+
assert(!PyUnicode_IS_ASCII(unicode));
5423+
5424+
enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
5425+
void *data = PyUnicode_DATA(unicode);
5426+
Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
5427+
5428+
_PyBytesWriter writer;
5429+
char *end;
5430+
5431+
switch (kind) {
5432+
default:
5433+
Py_UNREACHABLE();
5434+
case PyUnicode_1BYTE_KIND:
5435+
end = ucs1lib_utf8_encoder(&writer, unicode, data, size,
5436+
_Py_ERROR_STRICT, NULL);
5437+
break;
54165438
case PyUnicode_2BYTE_KIND:
5417-
return ucs2lib_utf8_encoder(unicode, data, size, error_handler, errors);
5439+
end = ucs2lib_utf8_encoder(&writer, unicode, data, size,
5440+
_Py_ERROR_STRICT, NULL);
5441+
break;
54185442
case PyUnicode_4BYTE_KIND:
5419-
return ucs4lib_utf8_encoder(unicode, data, size, error_handler, errors);
5443+
end = ucs4lib_utf8_encoder(&writer, unicode, data, size,
5444+
_Py_ERROR_STRICT, NULL);
5445+
break;
5446+
}
5447+
if (end == NULL) {
5448+
// _PyBytesWriter_Dealloc is called in encoder
5449+
return -1;
5450+
}
5451+
5452+
char *start = writer.use_small_buffer ? writer.small_buffer :
5453+
PyBytes_AS_STRING(writer.buffer);
5454+
Py_ssize_t len = end - start;
5455+
5456+
char *cache = PyObject_MALLOC(len + 1);
5457+
if (cache == NULL) {
5458+
_PyBytesWriter_Dealloc(&writer);
5459+
PyErr_NoMemory();
5460+
return -1;
54205461
}
5462+
_PyUnicode_UTF8(unicode) = cache;
5463+
_PyUnicode_UTF8_LENGTH(unicode) = len;
5464+
memcpy(cache, start, len);
5465+
cache[len] = '\0';
5466+
_PyBytesWriter_Dealloc(&writer);
5467+
return 0;
54215468
}
54225469

54235470
PyObject *

0 commit comments

Comments
 (0)