Skip to content

bpo-39087: Optimize PyUnicode_AsUTF8AndSize(). #18327

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Feb 27, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
Optimize :c:func:`PyUnicode_AsUTF8` and :c:func:`PyUnicode_AsUTF8AndSize`
slightly when they need to create internal UTF-8 cache.
35 changes: 17 additions & 18 deletions Objects/stringlib/codecs.h
Original file line number Diff line number Diff line change
Expand Up @@ -256,8 +256,9 @@ STRINGLIB(utf8_decode)(const char **inptr, const char *end,
/* UTF-8 encoder specialized for a Unicode kind to avoid the slow
PyUnicode_READ() macro. Delete some parts of the code depending on the kind:
UCS-1 strings don't need to handle surrogates for example. */
Py_LOCAL_INLINE(PyObject *)
STRINGLIB(utf8_encoder)(PyObject *unicode,
Py_LOCAL_INLINE(char *)
STRINGLIB(utf8_encoder)(_PyBytesWriter *writer,
PyObject *unicode,
STRINGLIB_CHAR *data,
Py_ssize_t size,
_Py_error_handler error_handler,
Expand All @@ -277,17 +278,16 @@ STRINGLIB(utf8_encoder)(PyObject *unicode,
#else /* STRINGLIB_SIZEOF_CHAR == 4 */
const Py_ssize_t max_char_size = 4;
#endif
_PyBytesWriter writer;

assert(size >= 0);
_PyBytesWriter_Init(&writer);

if (size > PY_SSIZE_T_MAX / max_char_size) {
/* integer overflow */
return PyErr_NoMemory();
PyErr_NoMemory();
return NULL;
}

p = _PyBytesWriter_Alloc(&writer, size * max_char_size);
_PyBytesWriter_Init(writer);
p = _PyBytesWriter_Alloc(writer, size * max_char_size);
if (p == NULL)
return NULL;

Expand Down Expand Up @@ -323,7 +323,7 @@ STRINGLIB(utf8_encoder)(PyObject *unicode,
endpos++;

/* Only overallocate the buffer if it's not the last write */
writer.overallocate = (endpos < size);
writer->overallocate = (endpos < size);

switch (error_handler)
{
Expand All @@ -347,8 +347,8 @@ STRINGLIB(utf8_encoder)(PyObject *unicode,

case _Py_ERROR_BACKSLASHREPLACE:
/* subtract preallocated bytes */
writer.min_size -= max_char_size * (endpos - startpos);
p = backslashreplace(&writer, p,
writer->min_size -= max_char_size * (endpos - startpos);
p = backslashreplace(writer, p,
unicode, startpos, endpos);
if (p == NULL)
goto error;
Expand All @@ -357,8 +357,8 @@ STRINGLIB(utf8_encoder)(PyObject *unicode,

case _Py_ERROR_XMLCHARREFREPLACE:
/* subtract preallocated bytes */
writer.min_size -= max_char_size * (endpos - startpos);
p = xmlcharrefreplace(&writer, p,
writer->min_size -= max_char_size * (endpos - startpos);
p = xmlcharrefreplace(writer, p,
unicode, startpos, endpos);
if (p == NULL)
goto error;
Expand Down Expand Up @@ -387,10 +387,10 @@ STRINGLIB(utf8_encoder)(PyObject *unicode,
goto error;

/* subtract preallocated bytes */
writer.min_size -= max_char_size * (newpos - startpos);
writer->min_size -= max_char_size * (newpos - startpos);

if (PyBytes_Check(rep)) {
p = _PyBytesWriter_WriteBytes(&writer, p,
p = _PyBytesWriter_WriteBytes(writer, p,
PyBytes_AS_STRING(rep),
PyBytes_GET_SIZE(rep));
}
Expand All @@ -406,7 +406,7 @@ STRINGLIB(utf8_encoder)(PyObject *unicode,
goto error;
}

p = _PyBytesWriter_WriteBytes(&writer, p,
p = _PyBytesWriter_WriteBytes(writer, p,
PyUnicode_DATA(rep),
PyUnicode_GET_LENGTH(rep));
}
Expand All @@ -420,7 +420,7 @@ STRINGLIB(utf8_encoder)(PyObject *unicode,

/* If overallocation was disabled, ensure that it was the last
write. Otherwise, we missed an optimization */
assert(writer.overallocate || i == size);
assert(writer->overallocate || i == size);
}
else
#if STRINGLIB_SIZEOF_CHAR > 2
Expand Down Expand Up @@ -449,14 +449,13 @@ STRINGLIB(utf8_encoder)(PyObject *unicode,
Py_XDECREF(error_handler_obj);
Py_XDECREF(exc);
#endif
return _PyBytesWriter_Finish(&writer, p);
return p;

#if STRINGLIB_SIZEOF_CHAR > 1
error:
Py_XDECREF(rep);
Py_XDECREF(error_handler_obj);
Py_XDECREF(exc);
_PyBytesWriter_Dealloc(&writer);
return NULL;
#endif
}
Expand Down
98 changes: 73 additions & 25 deletions Objects/unicodeobject.c
Original file line number Diff line number Diff line change
Expand Up @@ -3996,11 +3996,11 @@ PyUnicode_FSDecoder(PyObject* arg, void* addr)
}


static int unicode_fill_utf8(PyObject *unicode);

const char *
PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
{
PyObject *bytes;

if (!PyUnicode_Check(unicode)) {
PyErr_BadArgument();
return NULL;
Expand All @@ -4009,21 +4009,9 @@ PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
return NULL;

if (PyUnicode_UTF8(unicode) == NULL) {
assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
bytes = _PyUnicode_AsUTF8String(unicode, NULL);
if (bytes == NULL)
return NULL;
_PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
if (_PyUnicode_UTF8(unicode) == NULL) {
PyErr_NoMemory();
Py_DECREF(bytes);
if (unicode_fill_utf8(unicode) == -1) {
return NULL;
}
_PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
memcpy(_PyUnicode_UTF8(unicode),
PyBytes_AS_STRING(bytes),
_PyUnicode_UTF8_LENGTH(unicode) + 1);
Py_DECREF(bytes);
}

if (psize)
Expand Down Expand Up @@ -5386,10 +5374,6 @@ static PyObject *
unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
const char *errors)
{
enum PyUnicode_Kind kind;
void *data;
Py_ssize_t size;

if (!PyUnicode_Check(unicode)) {
PyErr_BadArgument();
return NULL;
Expand All @@ -5402,22 +5386,86 @@ unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
PyUnicode_UTF8_LENGTH(unicode));

kind = PyUnicode_KIND(unicode);
data = PyUnicode_DATA(unicode);
size = PyUnicode_GET_LENGTH(unicode);
enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
void *data = PyUnicode_DATA(unicode);
Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);

_PyBytesWriter writer;
char *end;

switch (kind) {
default:
Py_UNREACHABLE();
case PyUnicode_1BYTE_KIND:
/* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
assert(!PyUnicode_IS_ASCII(unicode));
return ucs1lib_utf8_encoder(unicode, data, size, error_handler, errors);
end = ucs1lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
break;
case PyUnicode_2BYTE_KIND:
end = ucs2lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
break;
case PyUnicode_4BYTE_KIND:
end = ucs4lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
break;
}

if (end == NULL) {
_PyBytesWriter_Dealloc(&writer);
return NULL;
}
return _PyBytesWriter_Finish(&writer, end);
}

static int
unicode_fill_utf8(PyObject *unicode)
{
/* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
assert(!PyUnicode_IS_ASCII(unicode));

enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
void *data = PyUnicode_DATA(unicode);
Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);

_PyBytesWriter writer;
char *end;

switch (kind) {
default:
Py_UNREACHABLE();
case PyUnicode_1BYTE_KIND:
end = ucs1lib_utf8_encoder(&writer, unicode, data, size,
_Py_ERROR_STRICT, NULL);
break;
case PyUnicode_2BYTE_KIND:
return ucs2lib_utf8_encoder(unicode, data, size, error_handler, errors);
end = ucs2lib_utf8_encoder(&writer, unicode, data, size,
_Py_ERROR_STRICT, NULL);
break;
case PyUnicode_4BYTE_KIND:
return ucs4lib_utf8_encoder(unicode, data, size, error_handler, errors);
end = ucs4lib_utf8_encoder(&writer, unicode, data, size,
_Py_ERROR_STRICT, NULL);
break;
}
if (end == NULL) {
_PyBytesWriter_Dealloc(&writer);
return -1;
}

char *start = writer.use_small_buffer ? writer.small_buffer :
PyBytes_AS_STRING(writer.buffer);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You can use _PyBytesWriter_AsString() function here.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It is static function: we can not call it from here.

Py_ssize_t len = end - start;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Note: _PyBytesWriter_GetSize() exists, but using it would be slower and your code is correct ;-)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It is static function too.


char *cache = PyObject_MALLOC(len + 1);
if (cache == NULL) {
_PyBytesWriter_Dealloc(&writer);
PyErr_NoMemory();
return -1;
}
_PyUnicode_UTF8(unicode) = cache;
_PyUnicode_UTF8_LENGTH(unicode) = len;
memcpy(cache, start, len);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I read somewhere that memcpy(dst, src, 0) is an undefined behavior. Maybe:

Suggested change
memcpy(cache, start, len);
if (len != 0) { memcpy(cache, start, len); }

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe, memcpy(dst, src, 0) is undefined when dst is invalid address. When dest is valid address, it is safe.
https://stackoverflow.com/questions/29844298/is-it-legal-to-call-memcpy-with-zero-length-on-a-pointer-just-past-the-end-of-an?rq=1

cache[len] = '\0';
_PyBytesWriter_Dealloc(&writer);
return 0;
}

PyObject *
Expand Down