Skip to content

gh-119396: Optimize unicode_repr() #119617

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
May 28, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Makefile.pre.in
Original file line number Diff line number Diff line change
Expand Up @@ -1841,6 +1841,7 @@ UNICODE_DEPS = \
$(srcdir)/Objects/stringlib/localeutil.h \
$(srcdir)/Objects/stringlib/partition.h \
$(srcdir)/Objects/stringlib/replace.h \
$(srcdir)/Objects/stringlib/repr.h \
$(srcdir)/Objects/stringlib/split.h \
$(srcdir)/Objects/stringlib/ucs1lib.h \
$(srcdir)/Objects/stringlib/ucs2lib.h \
Expand Down
95 changes: 95 additions & 0 deletions Objects/stringlib/repr.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
/* stringlib: repr() implementation */

#ifndef STRINGLIB_FASTSEARCH_H
#error must include "stringlib/fastsearch.h" before including this module
#endif


/* Write the repr() form of 'unicode' into the buffer starting at 'odata'.

   The output begins and ends with 'quote'.  Between the quotes every input
   character is either copied verbatim or replaced by a backslash escape:

     - 'quote' and the backslash itself: a backslash, then the character;
     - TAB, LF and CR: the two-character escapes using 't', 'n' and 'r';
     - any other character below space, and DEL (0x7F): 'x' + 2 hex digits;
     - characters above 0x7F for which Py_UNICODE_ISPRINTABLE() is false:
       'x' + 2, 'u' + 4 or 'U' + 8 hex digits, chosen by magnitude;
     - everything else: copied unchanged.

   The last character written is the closing quote.  No bounds checking is
   done here: 'odata' must already have room for the whole result. */
static void
STRINGLIB(repr)(PyObject *unicode, Py_UCS4 quote,
                STRINGLIB_CHAR *odata)
{
    const Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
    const int kind = PyUnicode_KIND(unicode);
    const void *src = PyUnicode_DATA(unicode);
    STRINGLIB_CHAR *out = odata;

    /* Opening quote */
    *out++ = quote;

    for (Py_ssize_t pos = 0; pos < len; pos++) {
        const Py_UCS4 ch = PyUnicode_READ(kind, src, pos);

        /* The active quote character and the backslash escape themselves.
           This test must come first: the quote is otherwise an ordinary
           printable ASCII character. */
        if (ch == quote || ch == '\\') {
            out[0] = '\\';
            out[1] = ch;
            out += 2;
            continue;
        }

        /* TAB, LF and CR have dedicated one-letter escapes. */
        Py_UCS4 letter = 0;
        switch (ch) {
        case '\t':
            letter = 't';
            break;
        case '\n':
            letter = 'n';
            break;
        case '\r':
            letter = 'r';
            break;
        default:
            break;
        }
        if (letter != 0) {
            out[0] = '\\';
            out[1] = letter;
            out += 2;
            continue;
        }

        /* Decide whether the character can be emitted unchanged.
           Up to 0x7F this is a plain range test (space .. 0x7E); above
           that, the Unicode database is consulted. */
        int verbatim;
        if (ch > 0x7F) {
            verbatim = Py_UNICODE_ISPRINTABLE(ch);
        }
        else {
            verbatim = (ch >= ' ' && ch != 0x7F);
        }
        if (verbatim) {
            *out++ = ch;
            continue;
        }

        /* Numeric escape: pick the shortest form that can hold the code
           point -- 2 hex digits after 'x', 4 after 'u', 8 after 'U'. */
        char marker;
        int ndigits;
        if (ch <= 0xff) {
            marker = 'x';
            ndigits = 2;
        }
        else if (ch <= 0xffff) {
            marker = 'u';
            ndigits = 4;
        }
        else {
            marker = 'U';
            ndigits = 8;
        }

        *out++ = '\\';
        *out++ = marker;
        /* Emit the nibbles from most significant to least significant. */
        for (int shift = 4 * (ndigits - 1); shift >= 0; shift -= 4) {
            *out++ = Py_hexdigits[(ch >> shift) & 0xF];
        }
    }

    /* Closing quote */
    *out = quote;
}
136 changes: 34 additions & 102 deletions Objects/unicodeobject.c
Original file line number Diff line number Diff line change
Expand Up @@ -899,6 +899,7 @@ ensure_unicode(PyObject *obj)
#include "stringlib/count.h"
#include "stringlib/find.h"
#include "stringlib/replace.h"
#include "stringlib/repr.h"
#include "stringlib/find_max_char.h"
#include "stringlib/undef.h"

Expand All @@ -909,6 +910,7 @@ ensure_unicode(PyObject *obj)
#include "stringlib/count.h"
#include "stringlib/find.h"
#include "stringlib/replace.h"
#include "stringlib/repr.h"
#include "stringlib/find_max_char.h"
#include "stringlib/undef.h"

Expand All @@ -919,6 +921,7 @@ ensure_unicode(PyObject *obj)
#include "stringlib/count.h"
#include "stringlib/find.h"
#include "stringlib/replace.h"
#include "stringlib/repr.h"
#include "stringlib/find_max_char.h"
#include "stringlib/undef.h"

Expand Down Expand Up @@ -12336,24 +12339,17 @@ unicode_removesuffix_impl(PyObject *self, PyObject *suffix)
static PyObject *
unicode_repr(PyObject *unicode)
{
PyObject *repr;
Py_ssize_t isize;
Py_ssize_t osize, squote, dquote, i, o;
Py_UCS4 max, quote;
int ikind, okind, unchanged;
const void *idata;
void *odata;

isize = PyUnicode_GET_LENGTH(unicode);
idata = PyUnicode_DATA(unicode);
Py_ssize_t isize = PyUnicode_GET_LENGTH(unicode);
const void *idata = PyUnicode_DATA(unicode);

/* Compute length of output, quote characters, and
maximum character */
osize = 0;
max = 127;
squote = dquote = 0;
ikind = PyUnicode_KIND(unicode);
for (i = 0; i < isize; i++) {
Py_ssize_t osize = 0;
Py_UCS4 maxch = 127;
Py_ssize_t squote = 0;
Py_ssize_t dquote = 0;
int ikind = PyUnicode_KIND(unicode);
for (Py_ssize_t i = 0; i < isize; i++) {
Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Py_ssize_t incr = 1;
switch (ch) {
Expand All @@ -12369,7 +12365,7 @@ unicode_repr(PyObject *unicode)
else if (ch < 0x7f)
;
else if (Py_UNICODE_ISPRINTABLE(ch))
max = ch > max ? ch : max;
maxch = (ch > maxch) ? ch : maxch;
else if (ch < 0x100)
incr = 4; /* \xHH */
else if (ch < 0x10000)
Expand All @@ -12385,10 +12381,10 @@ unicode_repr(PyObject *unicode)
osize += incr;
}

quote = '\'';
unchanged = (osize == isize);
Py_UCS4 quote = '\'';
int changed = (osize != isize);
if (squote) {
unchanged = 0;
changed = 1;
if (dquote)
/* Both squote and dquote present. Use squote,
and escape them */
Expand All @@ -12398,99 +12394,35 @@ unicode_repr(PyObject *unicode)
}
osize += 2; /* quotes */

repr = PyUnicode_New(osize, max);
PyObject *repr = PyUnicode_New(osize, maxch);
if (repr == NULL)
return NULL;
okind = PyUnicode_KIND(repr);
odata = PyUnicode_DATA(repr);
int okind = PyUnicode_KIND(repr);
void *odata = PyUnicode_DATA(repr);

if (!changed) {
PyUnicode_WRITE(okind, odata, 0, quote);

PyUnicode_WRITE(okind, odata, 0, quote);
PyUnicode_WRITE(okind, odata, osize-1, quote);
if (unchanged) {
_PyUnicode_FastCopyCharacters(repr, 1,
unicode, 0,
isize);

PyUnicode_WRITE(okind, odata, osize-1, quote);
}
else {
for (i = 0, o = 1; i < isize; i++) {
Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);

/* Escape quotes and backslashes */
if ((ch == quote) || (ch == '\\')) {
PyUnicode_WRITE(okind, odata, o++, '\\');
PyUnicode_WRITE(okind, odata, o++, ch);
continue;
}

/* Map special whitespace to '\t', \n', '\r' */
if (ch == '\t') {
PyUnicode_WRITE(okind, odata, o++, '\\');
PyUnicode_WRITE(okind, odata, o++, 't');
}
else if (ch == '\n') {
PyUnicode_WRITE(okind, odata, o++, '\\');
PyUnicode_WRITE(okind, odata, o++, 'n');
}
else if (ch == '\r') {
PyUnicode_WRITE(okind, odata, o++, '\\');
PyUnicode_WRITE(okind, odata, o++, 'r');
}

/* Map non-printable US ASCII to '\xhh' */
else if (ch < ' ' || ch == 0x7F) {
PyUnicode_WRITE(okind, odata, o++, '\\');
PyUnicode_WRITE(okind, odata, o++, 'x');
PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
}

/* Copy ASCII characters as-is */
else if (ch < 0x7F) {
PyUnicode_WRITE(okind, odata, o++, ch);
}

/* Non-ASCII characters */
else {
/* Map Unicode whitespace and control characters
(categories Z* and C* except ASCII space)
*/
if (!Py_UNICODE_ISPRINTABLE(ch)) {
PyUnicode_WRITE(okind, odata, o++, '\\');
/* Map 8-bit characters to '\xhh' */
if (ch <= 0xff) {
PyUnicode_WRITE(okind, odata, o++, 'x');
PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
}
/* Map 16-bit characters to '\uxxxx' */
else if (ch <= 0xffff) {
PyUnicode_WRITE(okind, odata, o++, 'u');
PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
}
/* Map 21-bit characters to '\U00xxxxxx' */
else {
PyUnicode_WRITE(okind, odata, o++, 'U');
PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
}
}
/* Copy characters as-is */
else {
PyUnicode_WRITE(okind, odata, o++, ch);
}
}
switch (okind) {
case PyUnicode_1BYTE_KIND:
ucs1lib_repr(unicode, quote, odata);
break;
case PyUnicode_2BYTE_KIND:
ucs2lib_repr(unicode, quote, odata);
break;
default:
assert(okind == PyUnicode_4BYTE_KIND);
ucs4lib_repr(unicode, quote, odata);
}
}
/* Closing quote already added at the beginning */

assert(_PyUnicode_CheckConsistency(repr, 1));
return repr;
}
Expand Down
1 change: 1 addition & 0 deletions Tools/c-analyzer/cpython/_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -167,6 +167,7 @@ def clean_lines(text):
Objects/stringlib/find.h Objects/stringlib/fastsearch.h
Objects/stringlib/partition.h Objects/stringlib/fastsearch.h
Objects/stringlib/replace.h Objects/stringlib/fastsearch.h
Objects/stringlib/repr.h Objects/stringlib/fastsearch.h
Objects/stringlib/split.h Objects/stringlib/fastsearch.h

# @end=tsv@
Expand Down
Loading