Skip to content

Commit 001eb52

Browse files
authored
bpo-44187: Quickening infrastructure (GH-26264)
* Add co_firstinstr field to code object. * Implement barebones quickening. * Use non-quickened bytecode when tracing. * Add NEWS item * Add new file to Windows build. * Don't specialize instructions with EXTENDED_ARG.
1 parent 89e50ab commit 001eb52

File tree

12 files changed

+416
-12
lines changed

12 files changed

+416
-12
lines changed

Include/cpython/code.h

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,9 +7,11 @@ typedef uint16_t _Py_CODEUNIT;
77
#ifdef WORDS_BIGENDIAN
88
# define _Py_OPCODE(word) ((word) >> 8)
99
# define _Py_OPARG(word) ((word) & 255)
10+
# define _Py_MAKECODEUNIT(opcode, oparg) (((opcode)<<8)|(oparg))
1011
#else
1112
# define _Py_OPCODE(word) ((word) & 255)
1213
# define _Py_OPARG(word) ((word) >> 8)
14+
# define _Py_MAKECODEUNIT(opcode, oparg) ((opcode)|((oparg)<<8))
1315
#endif
1416

1517
typedef struct _PyOpcache _PyOpcache;
@@ -43,24 +45,27 @@ struct PyCodeObject {
4345
/* These fields are set with provided values on new code objects. */
4446

4547
// The hottest fields (in the eval loop) are grouped here at the top.
46-
PyObject *co_code; /* instruction opcodes */
4748
PyObject *co_consts; /* list (constants used) */
4849
PyObject *co_names; /* list of strings (names used) */
50+
_Py_CODEUNIT *co_firstinstr; /* Pointer to first instruction, used for quickening */
51+
PyObject *co_exceptiontable; /* Byte string encoding exception handling table */
4952
int co_flags; /* CO_..., see below */
53+
int co_warmup; /* Warmup counter for quickening */
54+
5055
// The rest are not so impactful on performance.
5156
int co_argcount; /* #arguments, except *args */
5257
int co_posonlyargcount; /* #positional only arguments */
5358
int co_kwonlyargcount; /* #keyword only arguments */
5459
int co_stacksize; /* #entries needed for evaluation stack */
5560
int co_firstlineno; /* first source line number */
61+
PyObject *co_code; /* instruction opcodes */
5662
PyObject *co_varnames; /* tuple of strings (local variable names) */
5763
PyObject *co_cellvars; /* tuple of strings (cell variable names) */
5864
PyObject *co_freevars; /* tuple of strings (free variable names) */
5965
PyObject *co_filename; /* unicode (where it was loaded from) */
6066
PyObject *co_name; /* unicode (name, for reference) */
6167
PyObject *co_linetable; /* string (encoding addr<->lineno mapping) See
6268
Objects/lnotab_notes.txt for details. */
63-
PyObject *co_exceptiontable; /* Byte string encoding exception handling table */
6469

6570
/* These fields are set with computed values on new code objects. */
6671

@@ -78,6 +83,10 @@ struct PyCodeObject {
7883
Type is a void* to keep the format private in codeobject.c to force
7984
people to go through the proper APIs. */
8085
void *co_extra;
86+
/* Quickened instructions and cache, or NULL
87+
This should be treated as opaque by all code except the specializer and
88+
interpreter. */
89+
union _cache_or_instruction *co_quickened;
8190

8291
/* Per opcodes just-in-time cache
8392
*

Include/internal/pycore_code.h

Lines changed: 124 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
extern "C" {
55
#endif
66

7+
/* Legacy Opcache */
78

89
typedef struct {
910
PyObject *ptr; /* Cached pointer (borrowed reference) */
@@ -26,6 +27,129 @@ struct _PyOpcache {
2627
};
2728

2829

30+
/* PEP 659
31+
* Specialization and quickening structs and helper functions
32+
*/
33+
34+
typedef struct {
35+
int32_t cache_count;
36+
int32_t _; /* Force 8 byte size */
37+
} _PyEntryZero;
38+
39+
typedef struct {
40+
uint8_t original_oparg;
41+
uint8_t counter;
42+
uint16_t index;
43+
} _PyAdaptiveEntry;
44+
45+
/* Add specialized versions of entries to this union.
46+
*
47+
* Do not break the invariant: sizeof(SpecializedCacheEntry) == 8
48+
* Preserving this invariant is necessary because:
49+
- If any one form uses more space, then all must and on 64 bit machines
50+
this is likely to double the memory consumption of caches
51+
- The function for calculating the offset of caches assumes a 4:1
52+
cache:instruction size ratio. Changing that would need careful
53+
analysis to choose a new function.
54+
*/
55+
typedef union {
56+
_PyEntryZero zero;
57+
_PyAdaptiveEntry adaptive;
58+
} SpecializedCacheEntry;
59+
60+
#define INSTRUCTIONS_PER_ENTRY (sizeof(SpecializedCacheEntry)/sizeof(_Py_CODEUNIT))
61+
62+
/* Maximum size of code to quicken, in code units. */
63+
#define MAX_SIZE_TO_QUICKEN 5000
64+
65+
typedef union _cache_or_instruction {
66+
_Py_CODEUNIT code[1];
67+
SpecializedCacheEntry entry;
68+
} SpecializedCacheOrInstruction;
69+
70+
/* Get pointer to the nth cache entry, from the first instruction and n.
71+
* Cache entries are indexed backwards, with [count-1] first in memory, and [0] last.
72+
* The zeroth entry immediately precedes the instructions.
73+
*/
74+
static inline SpecializedCacheEntry *
75+
_GetSpecializedCacheEntry(_Py_CODEUNIT *first_instr, Py_ssize_t n)
76+
{
77+
SpecializedCacheOrInstruction *last_cache_plus_one = (SpecializedCacheOrInstruction *)first_instr;
78+
assert(&last_cache_plus_one->code[0] == first_instr);
79+
return &last_cache_plus_one[-1-n].entry;
80+
}
81+
82+
/* Following two functions form a pair.
83+
*
84+
* oparg_from_offset_and_nexti() is used to compute the oparg
85+
* when quickening, so that offset_from_oparg_and_nexti()
86+
* can be used at runtime to compute the offset.
87+
*
88+
* The relationship between the three values is currently
89+
* offset == (nexti>>1) + oparg
90+
* This relation is chosen based on the following observations:
91+
* 1. typically 1 in 4 instructions need a cache
92+
* 2. instructions that need a cache typically use 2 entries
93+
* These observations imply: offset ≈ index/2
94+
* We use the oparg to fine tune the relation to avoid wasting space
95+
* and allow consecutive instructions to use caches.
96+
*
97+
* If the number of cache entries < number of instructions/2 we will waste
98+
* some small amount of space.
99+
* If the number of cache entries > (number of instructions/2) + 255, then
100+
* some instructions will not be able to use a cache.
101+
* In practice, we expect some small amount of wasted space in shorter functions
102+
* and only functions exceeding 1000 lines not to have enough cache space.
103+
*
104+
*/
105+
static inline int
106+
oparg_from_offset_and_nexti(int offset, int nexti)
107+
{
108+
return offset-(nexti>>1);
109+
}
110+
111+
static inline int
112+
offset_from_oparg_and_nexti(int oparg, int nexti)
113+
{
114+
return (nexti>>1)+oparg;
115+
}
116+
117+
/* Get pointer to the cache entry associated with an instruction.
118+
* nexti is the index of the instruction plus one.
119+
* nexti is used as it corresponds to the instruction pointer in the interpreter.
120+
* This doesn't check that an entry has been allocated for that instruction. */
121+
static inline SpecializedCacheEntry *
122+
_GetSpecializedCacheEntryForInstruction(_Py_CODEUNIT *first_instr, int nexti, int oparg)
123+
{
124+
return _GetSpecializedCacheEntry(
125+
first_instr,
126+
offset_from_oparg_and_nexti(oparg, nexti)
127+
);
128+
}
129+
130+
#define QUICKENING_WARMUP_DELAY 8
131+
132+
/* We want to compare to zero for efficiency, so we offset values accordingly */
133+
#define QUICKENING_INITIAL_WARMUP_VALUE (-QUICKENING_WARMUP_DELAY)
134+
#define QUICKENING_WARMUP_COLDEST 1
135+
136+
static inline void
137+
PyCodeObject_IncrementWarmup(PyCodeObject * co)
138+
{
139+
co->co_warmup++;
140+
}
141+
142+
/* Used by the interpreter to determine when a code object should be quickened */
143+
static inline int
144+
PyCodeObject_IsWarmedUp(PyCodeObject * co)
145+
{
146+
return (co->co_warmup == 0);
147+
}
148+
149+
int _Py_Quicken(PyCodeObject *code);
150+
151+
extern Py_ssize_t _Py_QuickenedCount;
152+
29153
struct _PyCodeConstructor {
30154
/* metadata */
31155
PyObject *filename;

Lib/test/libregrtest/refleak.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -73,9 +73,10 @@ def get_pooled_int(value):
7373
alloc_deltas = [0] * repcount
7474
fd_deltas = [0] * repcount
7575
getallocatedblocks = sys.getallocatedblocks
76+
getallocatedblocks = sys.getallocatedblocks
7677
gettotalrefcount = sys.gettotalrefcount
78+
_getquickenedcount = sys._getquickenedcount
7779
fd_count = os_helper.fd_count
78-
7980
# initialize variables to make pyflakes quiet
8081
rc_before = alloc_before = fd_before = 0
8182

@@ -92,7 +93,7 @@ def get_pooled_int(value):
9293

9394
# dash_R_cleanup() ends with collecting cyclic trash:
9495
# read memory statistics immediately after.
95-
alloc_after = getallocatedblocks()
96+
alloc_after = getallocatedblocks() - _getquickenedcount()
9697
rc_after = gettotalrefcount()
9798
fd_after = fd_count()
9899

Makefile.pre.in

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -378,6 +378,7 @@ PYTHON_OBJS= \
378378
Python/pythonrun.o \
379379
Python/pytime.o \
380380
Python/bootstrap_hash.o \
381+
Python/specialize.o \
381382
Python/structmember.o \
382383
Python/symtable.o \
383384
Python/sysmodule.o \
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
Implement quickening in the interpreter. This offers no advantages as
2+
yet, but is an enabler of future optimizations. See PEP 659 for full
3+
explanation.

Objects/codeobject.c

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -211,6 +211,7 @@ init_code(PyCodeObject *co, struct _PyCodeConstructor *con)
211211

212212
Py_INCREF(con->code);
213213
co->co_code = con->code;
214+
co->co_firstinstr = (_Py_CODEUNIT *)PyBytes_AS_STRING(con->code);
214215
co->co_firstlineno = con->firstlineno;
215216
Py_INCREF(con->linetable);
216217
co->co_linetable = con->linetable;
@@ -250,6 +251,8 @@ init_code(PyCodeObject *co, struct _PyCodeConstructor *con)
250251
co->co_opcache = NULL;
251252
co->co_opcache_flag = 0;
252253
co->co_opcache_size = 0;
254+
co->co_warmup = QUICKENING_INITIAL_WARMUP_VALUE;
255+
co->co_quickened = NULL;
253256
}
254257

255258
/* The caller is responsible for ensuring that the given data is valid. */
@@ -376,7 +379,8 @@ PyCode_NewWithPosOnlyArgs(int argcount, int posonlyargcount, int kwonlyargcount,
376379
if (_PyCode_Validate(&con) < 0) {
377380
return NULL;
378381
}
379-
382+
assert(PyBytes_GET_SIZE(code) % sizeof(_Py_CODEUNIT) == 0);
383+
assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(code), sizeof(_Py_CODEUNIT)));
380384
if (nlocals != PyTuple_GET_SIZE(varnames)) {
381385
PyErr_SetString(PyExc_ValueError,
382386
"code: co_nlocals != len(co_varnames)");
@@ -1039,6 +1043,10 @@ code_dealloc(PyCodeObject *co)
10391043
PyMem_Free(co->co_cell2arg);
10401044
if (co->co_weakreflist != NULL)
10411045
PyObject_ClearWeakRefs((PyObject*)co);
1046+
if (co->co_quickened) {
1047+
PyMem_Free(co->co_quickened);
1048+
_Py_QuickenedCount--;
1049+
}
10421050
PyObject_Free(co);
10431051
}
10441052

PCbuild/pythoncore.vcxproj

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -487,6 +487,7 @@
487487
<ClCompile Include="..\Python\dtoa.c" />
488488
<ClCompile Include="..\Python\Python-ast.c" />
489489
<ClCompile Include="..\Python\pythonrun.c" />
490+
<ClCompile Include="..\Python\specialize.c" />
490491
<ClCompile Include="..\Python\suggestions.c" />
491492
<ClCompile Include="..\Python\structmember.c" />
492493
<ClCompile Include="..\Python\symtable.c" />

PCbuild/pythoncore.vcxproj.filters

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1103,6 +1103,9 @@
11031103
<ClCompile Include="..\Python\pythonrun.c">
11041104
<Filter>Python</Filter>
11051105
</ClCompile>
1106+
<ClCompile Include="..\Python\specialize.c">
1107+
<Filter>Python</Filter>
1108+
</ClCompile>
11061109
<ClCompile Include="..\Python\structmember.c">
11071110
<Filter>Python</Filter>
11081111
</ClCompile>

Python/ceval.c

Lines changed: 22 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1343,6 +1343,14 @@ eval_frame_handle_pending(PyThreadState *tstate)
13431343
#define JUMPTO(x) (next_instr = first_instr + (x))
13441344
#define JUMPBY(x) (next_instr += (x))
13451345

1346+
/* Get opcode and oparg from original instructions, not quickened form. */
1347+
#define TRACING_NEXTOPARG() do { \
1348+
_Py_CODEUNIT word = ((_Py_CODEUNIT *)PyBytes_AS_STRING(co->co_code))[INSTR_OFFSET()]; \
1349+
opcode = _Py_OPCODE(word); \
1350+
oparg = _Py_OPARG(word); \
1351+
next_instr++; \
1352+
} while (0)
1353+
13461354
/* OpCode prediction macros
13471355
Some opcodes tend to come in pairs thus making it possible to
13481356
predict the second code when the first is run. For example,
@@ -1644,15 +1652,23 @@ _PyEval_EvalFrameDefault(PyThreadState *tstate, PyFrameObject *f, int throwflag)
16441652
if (PyDTrace_FUNCTION_ENTRY_ENABLED())
16451653
dtrace_function_entry(f);
16461654

1655+
/* Increment the warmup counter and quicken if warm enough
1656+
* _Py_Quicken is idempotent so we don't worry about overflow */
1657+
if (!PyCodeObject_IsWarmedUp(co)) {
1658+
PyCodeObject_IncrementWarmup(co);
1659+
if (PyCodeObject_IsWarmedUp(co)) {
1660+
if (_Py_Quicken(co)) {
1661+
goto exit_eval_frame;
1662+
}
1663+
}
1664+
}
1665+
1666+
16471667
names = co->co_names;
16481668
consts = co->co_consts;
16491669
fastlocals = f->f_localsptr;
1670+
first_instr = co->co_firstinstr;
16501671
freevars = f->f_localsptr + co->co_nlocals;
1651-
assert(PyBytes_Check(co->co_code));
1652-
assert(PyBytes_GET_SIZE(co->co_code) <= INT_MAX);
1653-
assert(PyBytes_GET_SIZE(co->co_code) % sizeof(_Py_CODEUNIT) == 0);
1654-
assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(co->co_code), sizeof(_Py_CODEUNIT)));
1655-
first_instr = (_Py_CODEUNIT *) PyBytes_AS_STRING(co->co_code);
16561672
/*
16571673
f->f_lasti refers to the index of the last instruction,
16581674
unless it's -1 in which case next_instr should be first_instr.
@@ -1757,7 +1773,7 @@ _PyEval_EvalFrameDefault(PyThreadState *tstate, PyFrameObject *f, int throwflag)
17571773

17581774
tracing_dispatch:
17591775
f->f_lasti = INSTR_OFFSET();
1760-
NEXTOPARG();
1776+
TRACING_NEXTOPARG();
17611777

17621778
if (PyDTrace_LINE_ENABLED())
17631779
maybe_dtrace_line(f, &trace_info);

Python/clinic/sysmodule.c.h

Lines changed: 28 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)