Skip to content

Commit 9b422fc

Browse files
authored
gh-119396: Optimize PyUnicode_FromFormat() UTF-8 decoder (#119398)
Add unicode_decode_utf8_writer() to write characters directly into a _PyUnicodeWriter, avoiding the creation of a temporary string. Optimize PyUnicode_FromFormat() by using the new unicode_decode_utf8_writer(). Rename unicode_fromformat_write_cstr() to unicode_fromformat_write_utf8(). Microbenchmark on the following code: return PyUnicode_FromFormat( "%s %s %s %s %s.", "format", "multiple", "utf8", "short", "strings"); Result: 620 ns +- 8 ns -> 382 ns +- 2 ns: 1.62x faster.
1 parent 14b063c commit 9b422fc

File tree

1 file changed

+141
-62
lines changed

1 file changed

+141
-62
lines changed

Objects/unicodeobject.c

Lines changed: 141 additions & 62 deletions
Original file line numberDiff line numberDiff line change
@@ -202,6 +202,11 @@ static PyObject *
202202
unicode_decode_utf8(const char *s, Py_ssize_t size,
203203
_Py_error_handler error_handler, const char *errors,
204204
Py_ssize_t *consumed);
205+
static int
206+
unicode_decode_utf8_writer(_PyUnicodeWriter *writer,
207+
const char *s, Py_ssize_t size,
208+
_Py_error_handler error_handler, const char *errors,
209+
Py_ssize_t *consumed);
205210
#ifdef Py_DEBUG
206211
static inline int unicode_is_finalizing(void);
207212
static int unicode_is_singleton(PyObject *unicode);
@@ -2377,14 +2382,11 @@ unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
23772382
}
23782383

23792384
static int
2380-
unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
2385+
unicode_fromformat_write_utf8(_PyUnicodeWriter *writer, const char *str,
23812386
Py_ssize_t width, Py_ssize_t precision, int flags)
23822387
{
23832388
/* UTF-8 */
23842389
Py_ssize_t length;
2385-
PyObject *unicode;
2386-
int res;
2387-
23882390
if (precision == -1) {
23892391
length = strlen(str);
23902392
}
@@ -2394,11 +2396,19 @@ unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
23942396
length++;
23952397
}
23962398
}
2397-
unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2399+
2400+
if (width < 0) {
2401+
return unicode_decode_utf8_writer(writer, str, length,
2402+
_Py_ERROR_REPLACE, "replace", NULL);
2403+
}
2404+
2405+
PyObject *unicode = PyUnicode_DecodeUTF8Stateful(str, length,
2406+
"replace", NULL);
23982407
if (unicode == NULL)
23992408
return -1;
24002409

2401-
res = unicode_fromformat_write_str(writer, unicode, width, -1, flags);
2410+
int res = unicode_fromformat_write_str(writer, unicode,
2411+
width, -1, flags);
24022412
Py_DECREF(unicode);
24032413
return res;
24042414
}
@@ -2700,7 +2710,7 @@ unicode_fromformat_arg(_PyUnicodeWriter *writer,
27002710
else {
27012711
/* UTF-8 */
27022712
const char *s = va_arg(*vargs, const char*);
2703-
if (unicode_fromformat_write_cstr(writer, s, width, precision, flags) < 0)
2713+
if (unicode_fromformat_write_utf8(writer, s, width, precision, flags) < 0)
27042714
return NULL;
27052715
}
27062716
break;
@@ -2739,7 +2749,7 @@ unicode_fromformat_arg(_PyUnicodeWriter *writer,
27392749
}
27402750
else {
27412751
assert(str != NULL);
2742-
if (unicode_fromformat_write_cstr(writer, str, width, precision, flags) < 0)
2752+
if (unicode_fromformat_write_utf8(writer, str, width, precision, flags) < 0)
27432753
return NULL;
27442754
}
27452755
break;
@@ -4737,65 +4747,33 @@ ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
47374747
return p - start;
47384748
}
47394749

4740-
static PyObject *
4741-
unicode_decode_utf8(const char *s, Py_ssize_t size,
4742-
_Py_error_handler error_handler, const char *errors,
4743-
Py_ssize_t *consumed)
4744-
{
4745-
if (size == 0) {
4746-
if (consumed)
4747-
*consumed = 0;
4748-
_Py_RETURN_UNICODE_EMPTY();
4749-
}
4750-
4751-
/* ASCII is equivalent to the first 128 ordinals in Unicode. */
4752-
if (size == 1 && (unsigned char)s[0] < 128) {
4753-
if (consumed) {
4754-
*consumed = 1;
4755-
}
4756-
return get_latin1_char((unsigned char)s[0]);
4757-
}
4758-
4759-
const char *starts = s;
4760-
const char *end = s + size;
4761-
4762-
// fast path: try ASCII string.
4763-
PyObject *u = PyUnicode_New(size, 127);
4764-
if (u == NULL) {
4765-
return NULL;
4766-
}
4767-
s += ascii_decode(s, end, PyUnicode_1BYTE_DATA(u));
4768-
if (s == end) {
4769-
if (consumed) {
4770-
*consumed = size;
4771-
}
4772-
return u;
4773-
}
4774-
4775-
// Use _PyUnicodeWriter after fast path is failed.
4776-
_PyUnicodeWriter writer;
4777-
_PyUnicodeWriter_InitWithBuffer(&writer, u);
4778-
writer.pos = s - starts;
47794750

4751+
static int
4752+
unicode_decode_utf8_impl(_PyUnicodeWriter *writer,
4753+
const char *starts, const char *s, const char *end,
4754+
_Py_error_handler error_handler,
4755+
const char *errors,
4756+
Py_ssize_t *consumed)
4757+
{
47804758
Py_ssize_t startinpos, endinpos;
47814759
const char *errmsg = "";
47824760
PyObject *error_handler_obj = NULL;
47834761
PyObject *exc = NULL;
47844762

47854763
while (s < end) {
47864764
Py_UCS4 ch;
4787-
int kind = writer.kind;
4765+
int kind = writer->kind;
47884766

47894767
if (kind == PyUnicode_1BYTE_KIND) {
4790-
if (PyUnicode_IS_ASCII(writer.buffer))
4791-
ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
4768+
if (PyUnicode_IS_ASCII(writer->buffer))
4769+
ch = asciilib_utf8_decode(&s, end, writer->data, &writer->pos);
47924770
else
4793-
ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
4771+
ch = ucs1lib_utf8_decode(&s, end, writer->data, &writer->pos);
47944772
} else if (kind == PyUnicode_2BYTE_KIND) {
4795-
ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
4773+
ch = ucs2lib_utf8_decode(&s, end, writer->data, &writer->pos);
47964774
} else {
47974775
assert(kind == PyUnicode_4BYTE_KIND);
4798-
ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
4776+
ch = ucs4lib_utf8_decode(&s, end, writer->data, &writer->pos);
47994777
}
48004778

48014779
switch (ch) {
@@ -4826,7 +4804,9 @@ unicode_decode_utf8(const char *s, Py_ssize_t size,
48264804
endinpos = startinpos + ch - 1;
48274805
break;
48284806
default:
4829-
if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
4807+
// ch doesn't fit into kind, so change the buffer kind to write
4808+
// the character
4809+
if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
48304810
goto onError;
48314811
continue;
48324812
}
@@ -4840,7 +4820,7 @@ unicode_decode_utf8(const char *s, Py_ssize_t size,
48404820
break;
48414821

48424822
case _Py_ERROR_REPLACE:
4843-
if (_PyUnicodeWriter_WriteCharInline(&writer, 0xfffd) < 0)
4823+
if (_PyUnicodeWriter_WriteCharInline(writer, 0xfffd) < 0)
48444824
goto onError;
48454825
s += (endinpos - startinpos);
48464826
break;
@@ -4849,13 +4829,13 @@ unicode_decode_utf8(const char *s, Py_ssize_t size,
48494829
{
48504830
Py_ssize_t i;
48514831

4852-
if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
4832+
if (_PyUnicodeWriter_PrepareKind(writer, PyUnicode_2BYTE_KIND) < 0)
48534833
goto onError;
48544834
for (i=startinpos; i<endinpos; i++) {
48554835
ch = (Py_UCS4)(unsigned char)(starts[i]);
4856-
PyUnicode_WRITE(writer.kind, writer.data, writer.pos,
4836+
PyUnicode_WRITE(writer->kind, writer->data, writer->pos,
48574837
ch + 0xdc00);
4858-
writer.pos++;
4838+
writer->pos++;
48594839
}
48604840
s += (endinpos - startinpos);
48614841
break;
@@ -4866,8 +4846,13 @@ unicode_decode_utf8(const char *s, Py_ssize_t size,
48664846
errors, &error_handler_obj,
48674847
"utf-8", errmsg,
48684848
&starts, &end, &startinpos, &endinpos, &exc, &s,
4869-
&writer))
4849+
writer)) {
48704850
goto onError;
4851+
}
4852+
4853+
if (_PyUnicodeWriter_Prepare(writer, end - s, 127) < 0) {
4854+
return -1;
4855+
}
48714856
}
48724857
}
48734858

@@ -4877,13 +4862,107 @@ unicode_decode_utf8(const char *s, Py_ssize_t size,
48774862

48784863
Py_XDECREF(error_handler_obj);
48794864
Py_XDECREF(exc);
4880-
return _PyUnicodeWriter_Finish(&writer);
4865+
return 0;
48814866

48824867
onError:
48834868
Py_XDECREF(error_handler_obj);
48844869
Py_XDECREF(exc);
4885-
_PyUnicodeWriter_Dealloc(&writer);
4886-
return NULL;
4870+
return -1;
4871+
}
4872+
4873+
4874+
static PyObject *
4875+
unicode_decode_utf8(const char *s, Py_ssize_t size,
4876+
_Py_error_handler error_handler, const char *errors,
4877+
Py_ssize_t *consumed)
4878+
{
4879+
if (size == 0) {
4880+
if (consumed) {
4881+
*consumed = 0;
4882+
}
4883+
_Py_RETURN_UNICODE_EMPTY();
4884+
}
4885+
4886+
/* ASCII is equivalent to the first 128 ordinals in Unicode. */
4887+
if (size == 1 && (unsigned char)s[0] < 128) {
4888+
if (consumed) {
4889+
*consumed = 1;
4890+
}
4891+
return get_latin1_char((unsigned char)s[0]);
4892+
}
4893+
4894+
// fast path: try ASCII string.
4895+
const char *starts = s;
4896+
const char *end = s + size;
4897+
PyObject *u = PyUnicode_New(size, 127);
4898+
if (u == NULL) {
4899+
return NULL;
4900+
}
4901+
Py_ssize_t decoded = ascii_decode(s, end, PyUnicode_1BYTE_DATA(u));
4902+
if (decoded == size) {
4903+
if (consumed) {
4904+
*consumed = size;
4905+
}
4906+
return u;
4907+
}
4908+
s += decoded;
4909+
size -= decoded;
4910+
4911+
// Use _PyUnicodeWriter after fast path is failed.
4912+
_PyUnicodeWriter writer;
4913+
_PyUnicodeWriter_InitWithBuffer(&writer, u);
4914+
writer.pos = decoded;
4915+
4916+
if (unicode_decode_utf8_impl(&writer, starts, s, end,
4917+
error_handler, errors,
4918+
consumed) < 0) {
4919+
_PyUnicodeWriter_Dealloc(&writer);
4920+
return NULL;
4921+
}
4922+
return _PyUnicodeWriter_Finish(&writer);
4923+
}
4924+
4925+
4926+
static int
4927+
unicode_decode_utf8_writer(_PyUnicodeWriter *writer,
4928+
const char *s, Py_ssize_t size,
4929+
_Py_error_handler error_handler, const char *errors,
4930+
Py_ssize_t *consumed)
4931+
{
4932+
if (size == 0) {
4933+
if (consumed) {
4934+
*consumed = 0;
4935+
}
4936+
return 0;
4937+
}
4938+
4939+
// fast path: try ASCII string.
4940+
if (_PyUnicodeWriter_Prepare(writer, size, 127) < 0) {
4941+
return -1;
4942+
}
4943+
4944+
const char *starts = s;
4945+
const char *end = s + size;
4946+
Py_ssize_t decoded = 0;
4947+
Py_UCS1 *dest = (Py_UCS1*)writer->data + writer->pos * writer->kind;
4948+
if (writer->kind == PyUnicode_1BYTE_KIND
4949+
&& _Py_IS_ALIGNED(dest, ALIGNOF_SIZE_T))
4950+
{
4951+
decoded = ascii_decode(s, end, dest);
4952+
writer->pos += decoded;
4953+
4954+
if (decoded == size) {
4955+
if (consumed) {
4956+
*consumed = size;
4957+
}
4958+
return 0;
4959+
}
4960+
s += decoded;
4961+
size -= decoded;
4962+
}
4963+
4964+
return unicode_decode_utf8_impl(writer, starts, s, end,
4965+
error_handler, errors, consumed);
48874966
}
48884967

48894968

0 commit comments

Comments
 (0)