Skip to content

Commit a0bd9e9

Browse files
bpo-28307: Convert simple C-style formatting with literal format into f-string. (GH-5012)
C-style formatting with a literal format containing only the format codes %s, %r and %a (with optional width, precision and alignment) will be converted to an equivalent f-string expression. This can speed up formatting by more than 2 times by eliminating both the runtime parsing of the format string and the creation of a temporary tuple.
1 parent 6692dc1 commit a0bd9e9

File tree

4 files changed

+2390
-2165
lines changed

4 files changed

+2390
-2165
lines changed

Doc/whatsnew/3.11.rst

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -90,7 +90,10 @@ Improved Modules
9090
Optimizations
9191
=============
9292

93-
93+
* Compiler now optimizes simple C-style formatting with literal format
94+
containing only format codes ``%s``, ``%r`` and ``%a`` and makes it as
95+
fast as the corresponding f-string expression.
96+
(Contributed by Serhiy Storchaka in :issue:`28307`.)
9497

9598

9699
Build and C API Changes
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
Compiler now optimizes simple C-style formatting with literal format
2+
containing only format codes %s, %r and %a by converting them to f-string
3+
expressions.

Python/ast_opt.c

Lines changed: 221 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
#include "pycore_ast.h" // _PyAST_GetDocString()
44
#include "pycore_compile.h" // _PyASTOptimizeState
55
#include "pycore_pystate.h" // _PyThreadState_GET()
6+
#include "pycore_format.h" // F_LJUST
67

78

89
static int
@@ -224,17 +225,235 @@ safe_mod(PyObject *v, PyObject *w)
224225
return PyNumber_Remainder(v, w);
225226
}
226227

228+
229+
static expr_ty
230+
parse_literal(PyObject *fmt, Py_ssize_t *ppos, PyArena *arena)
231+
{
232+
const void *data = PyUnicode_DATA(fmt);
233+
int kind = PyUnicode_KIND(fmt);
234+
Py_ssize_t size = PyUnicode_GET_LENGTH(fmt);
235+
Py_ssize_t start, pos;
236+
int has_percents = 0;
237+
start = pos = *ppos;
238+
while (pos < size) {
239+
if (PyUnicode_READ(kind, data, pos) != '%') {
240+
pos++;
241+
}
242+
else if (pos+1 < size && PyUnicode_READ(kind, data, pos+1) == '%') {
243+
has_percents = 1;
244+
pos += 2;
245+
}
246+
else {
247+
break;
248+
}
249+
}
250+
*ppos = pos;
251+
if (pos == start) {
252+
return NULL;
253+
}
254+
PyObject *str = PyUnicode_Substring(fmt, start, pos);
255+
/* str = str.replace('%%', '%') */
256+
if (str && has_percents) {
257+
_Py_static_string(PyId_double_percent, "%%");
258+
_Py_static_string(PyId_percent, "%");
259+
PyObject *double_percent = _PyUnicode_FromId(&PyId_double_percent);
260+
PyObject *percent = _PyUnicode_FromId(&PyId_percent);
261+
if (!double_percent || !percent) {
262+
Py_DECREF(str);
263+
return NULL;
264+
}
265+
Py_SETREF(str, PyUnicode_Replace(str, double_percent, percent, -1));
266+
}
267+
if (!str) {
268+
return NULL;
269+
}
270+
271+
if (_PyArena_AddPyObject(arena, str) < 0) {
272+
Py_DECREF(str);
273+
return NULL;
274+
}
275+
return _PyAST_Constant(str, NULL, -1, -1, -1, -1, arena);
276+
}
277+
278+
#define MAXDIGITS 3
279+
280+
static int
281+
simple_format_arg_parse(PyObject *fmt, Py_ssize_t *ppos,
282+
int *spec, int *flags, int *width, int *prec)
283+
{
284+
Py_ssize_t pos = *ppos, len = PyUnicode_GET_LENGTH(fmt);
285+
Py_UCS4 ch;
286+
287+
#define NEXTC do { \
288+
if (pos >= len) { \
289+
return 0; \
290+
} \
291+
ch = PyUnicode_READ_CHAR(fmt, pos); \
292+
pos++; \
293+
} while (0)
294+
295+
*flags = 0;
296+
while (1) {
297+
NEXTC;
298+
switch (ch) {
299+
case '-': *flags |= F_LJUST; continue;
300+
case '+': *flags |= F_SIGN; continue;
301+
case ' ': *flags |= F_BLANK; continue;
302+
case '#': *flags |= F_ALT; continue;
303+
case '0': *flags |= F_ZERO; continue;
304+
}
305+
break;
306+
}
307+
if ('0' <= ch && ch <= '9') {
308+
*width = 0;
309+
int digits = 0;
310+
while ('0' <= ch && ch <= '9') {
311+
*width = *width * 10 + (ch - '0');
312+
NEXTC;
313+
if (++digits >= MAXDIGITS) {
314+
return 0;
315+
}
316+
}
317+
}
318+
319+
if (ch == '.') {
320+
NEXTC;
321+
if ('0' <= ch && ch <= '9') {
322+
*prec = 0;
323+
int digits = 0;
324+
while ('0' <= ch && ch <= '9') {
325+
*prec = *prec * 10 + (ch - '0');
326+
NEXTC;
327+
if (++digits >= MAXDIGITS) {
328+
return 0;
329+
}
330+
}
331+
}
332+
}
333+
*spec = ch;
334+
*ppos = pos;
335+
return 1;
336+
337+
#undef NEXTC
338+
}
339+
340+
static expr_ty
341+
parse_format(PyObject *fmt, Py_ssize_t *ppos, expr_ty arg, PyArena *arena)
342+
{
343+
int spec, flags, width = -1, prec = -1;
344+
if (!simple_format_arg_parse(fmt, ppos, &spec, &flags, &width, &prec)) {
345+
// Unsupported format.
346+
return NULL;
347+
}
348+
if (spec == 's' || spec == 'r' || spec == 'a') {
349+
char buf[1 + MAXDIGITS + 1 + MAXDIGITS + 1], *p = buf;
350+
if (!(flags & F_LJUST) && width > 0) {
351+
*p++ = '>';
352+
}
353+
if (width >= 0) {
354+
p += snprintf(p, MAXDIGITS + 1, "%d", width);
355+
}
356+
if (prec >= 0) {
357+
p += snprintf(p, MAXDIGITS + 2, ".%d", prec);
358+
}
359+
expr_ty format_spec = NULL;
360+
if (p != buf) {
361+
PyObject *str = PyUnicode_FromString(buf);
362+
if (str == NULL) {
363+
return NULL;
364+
}
365+
if (_PyArena_AddPyObject(arena, str) < 0) {
366+
Py_DECREF(str);
367+
return NULL;
368+
}
369+
format_spec = _PyAST_Constant(str, NULL, -1, -1, -1, -1, arena);
370+
if (format_spec == NULL) {
371+
return NULL;
372+
}
373+
}
374+
return _PyAST_FormattedValue(arg, spec, format_spec,
375+
arg->lineno, arg->col_offset,
376+
arg->end_lineno, arg->end_col_offset,
377+
arena);
378+
}
379+
// Unsupported format.
380+
return NULL;
381+
}
382+
383+
static int
384+
optimize_format(expr_ty node, PyObject *fmt, asdl_expr_seq *elts, PyArena *arena)
385+
{
386+
Py_ssize_t pos = 0;
387+
Py_ssize_t cnt = 0;
388+
asdl_expr_seq *seq = _Py_asdl_expr_seq_new(asdl_seq_LEN(elts) * 2 + 1, arena);
389+
if (!seq) {
390+
return 0;
391+
}
392+
seq->size = 0;
393+
394+
while (1) {
395+
expr_ty lit = parse_literal(fmt, &pos, arena);
396+
if (lit) {
397+
asdl_seq_SET(seq, seq->size++, lit);
398+
}
399+
else if (PyErr_Occurred()) {
400+
return 0;
401+
}
402+
403+
if (pos >= PyUnicode_GET_LENGTH(fmt)) {
404+
break;
405+
}
406+
if (cnt >= asdl_seq_LEN(elts)) {
407+
// More format units than items.
408+
return 1;
409+
}
410+
assert(PyUnicode_READ_CHAR(fmt, pos) == '%');
411+
pos++;
412+
expr_ty expr = parse_format(fmt, &pos, asdl_seq_GET(elts, cnt), arena);
413+
cnt++;
414+
if (!expr) {
415+
return !PyErr_Occurred();
416+
}
417+
asdl_seq_SET(seq, seq->size++, expr);
418+
}
419+
if (cnt < asdl_seq_LEN(elts)) {
420+
// More items than format units.
421+
return 1;
422+
}
423+
expr_ty res = _PyAST_JoinedStr(seq,
424+
node->lineno, node->col_offset,
425+
node->end_lineno, node->end_col_offset,
426+
arena);
427+
if (!res) {
428+
return 0;
429+
}
430+
COPY_NODE(node, res);
431+
// PySys_FormatStderr("format = %R\n", fmt);
432+
return 1;
433+
}
434+
227435
static int
228436
fold_binop(expr_ty node, PyArena *arena, _PyASTOptimizeState *state)
229437
{
230438
expr_ty lhs, rhs;
231439
lhs = node->v.BinOp.left;
232440
rhs = node->v.BinOp.right;
233-
if (lhs->kind != Constant_kind || rhs->kind != Constant_kind) {
441+
if (lhs->kind != Constant_kind) {
234442
return 1;
235443
}
236-
237444
PyObject *lv = lhs->v.Constant.value;
445+
446+
if (node->v.BinOp.op == Mod &&
447+
rhs->kind == Tuple_kind &&
448+
PyUnicode_Check(lv))
449+
{
450+
return optimize_format(node, lv, rhs->v.Tuple.elts, arena);
451+
}
452+
453+
if (rhs->kind != Constant_kind) {
454+
return 1;
455+
}
456+
238457
PyObject *rv = rhs->v.Constant.value;
239458
PyObject *newval = NULL;
240459

0 commit comments

Comments
 (0)