Skip to content

Commit b47cdae

Browse files
ammaraskar, isidentical, and pablogsal
committed
bpo-43950: Make compiler output more source offsets.
This PR is part of PEP 657 and augments the compiler to emit ending line numbers as well as starting and ending columns from the AST into compiled code objects. This allows bytecodes to be correlated to the exact source code ranges that generated them.

This information is made available through the following public APIs:

* The `co_positions` method on code objects.
* The C APIs `PyCode_Addr2EndLine`, `PyCode_Addr2Offset` and `PyCode_Addr2EndOffset`.

Co-authored-by: Batuhan Taskaya <[email protected]>
Co-authored-by: Pablo Galindo <[email protected]>
1 parent 0d7f61d commit b47cdae

File tree

15 files changed

+7701
-5203
lines changed

15 files changed

+7701
-5203
lines changed

Include/cpython/code.h

Lines changed: 18 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -75,8 +75,13 @@ struct PyCodeObject {
7575
PyObject *co_localspluskinds; /* Bytes mapping to local kinds (one byte per variable) */
7676
PyObject *co_filename; /* unicode (where it was loaded from) */
7777
PyObject *co_name; /* unicode (name, for reference) */
78-
PyObject *co_linetable; /* string (encoding addr<->lineno mapping) See
78+
PyObject *co_linetable; /* bytes (encoding addr<->lineno mapping) See
7979
Objects/lnotab_notes.txt for details. */
80+
PyObject *co_endlinetable; /* bytes object that holds end lineno for
81+
instructions separated across different
82+
lines */
83+
PyObject *co_columntable; /* bytes object that holds start/end column
84+
offset each instruction */
8085

8186
/* These fields are set with computed values on new code objects. */
8287

@@ -149,12 +154,14 @@ PyAPI_DATA(PyTypeObject) PyCode_Type;
149154
PyAPI_FUNC(PyCodeObject *) PyCode_New(
150155
int, int, int, int, int, PyObject *, PyObject *,
151156
PyObject *, PyObject *, PyObject *, PyObject *,
152-
PyObject *, PyObject *, int, PyObject *, PyObject *);
157+
PyObject *, PyObject *, int, PyObject *, PyObject *,
158+
PyObject *, PyObject *);
153159

154160
PyAPI_FUNC(PyCodeObject *) PyCode_NewWithPosOnlyArgs(
155161
int, int, int, int, int, int, PyObject *, PyObject *,
156162
PyObject *, PyObject *, PyObject *, PyObject *,
157-
PyObject *, PyObject *, int, PyObject *, PyObject *);
163+
PyObject *, PyObject *, int, PyObject *, PyObject *,
164+
PyObject *, PyObject *);
158165
/* same as struct above */
159166

160167
/* Creates a new empty code object with the specified source location. */
@@ -165,6 +172,12 @@ PyCode_NewEmpty(const char *filename, const char *funcname, int firstlineno);
165172
in this code object. If you just need the line number of a frame,
166173
use PyFrame_GetLineNumber() instead. */
167174
PyAPI_FUNC(int) PyCode_Addr2Line(PyCodeObject *, int);
175+
/* Return the ending source code line number from a bytecode index. */
176+
PyAPI_FUNC(int) PyCode_Addr2EndLine(PyCodeObject*, int);
177+
/* Return the starting source code column offset from a bytecode index. */
178+
PyAPI_FUNC(int) PyCode_Addr2Offset(PyCodeObject*, int);
179+
/* Return the ending source code column offset from a bytecode index. */
180+
PyAPI_FUNC(int) PyCode_Addr2EndOffset(PyCodeObject*, int);
168181

169182
/* for internal use only */
170183
struct _opaque {
@@ -203,8 +216,9 @@ PyAPI_FUNC(int) _PyCode_GetExtra(PyObject *code, Py_ssize_t index,
203216
PyAPI_FUNC(int) _PyCode_SetExtra(PyObject *code, Py_ssize_t index,
204217
void *extra);
205218

206-
/** API for initializing the line number table. */
219+
/** API for initializing the line number tables. */
207220
int _PyCode_InitAddressRange(PyCodeObject* co, PyCodeAddressRange *bounds);
221+
int _PyCode_InitEndAddressRange(PyCodeObject* co, PyCodeAddressRange* bounds);
208222

209223
/** Out of process API for initializing the line number table. */
210224
void PyLineTable_InitAddressRange(const char *linetable, Py_ssize_t length, int firstlineno, PyCodeAddressRange *range);

Include/internal/pycore_code.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -218,6 +218,8 @@ struct _PyCodeConstructor {
218218
PyObject *code;
219219
int firstlineno;
220220
PyObject *linetable;
221+
PyObject *endlinetable;
222+
PyObject *columntable;
221223

222224
/* used by the code */
223225
PyObject *consts;

Lib/ctypes/test/test_values.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -80,9 +80,9 @@ class struct_frozen(Structure):
8080
continue
8181
items.append((entry.name.decode("ascii"), entry.size))
8282

83-
expected = [("__hello__", 133),
84-
("__phello__", -133),
85-
("__phello__.spam", 133),
83+
expected = [("__hello__", 159),
84+
("__phello__", -159),
85+
("__phello__.spam", 159),
8686
]
8787
self.assertEqual(items, expected, "PyImport_FrozenModules example "
8888
"in Doc/library/ctypes.rst may be out of date")

Lib/importlib/_bootstrap_external.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -361,6 +361,7 @@ def _write_atomic(path, data, mode=0o666):
361361
# Python 3.11a1 3456 (interleave cell args bpo-43693)
362362
# Python 3.11a1 3457 (Change localsplus to a bytes object bpo-43693)
363363
# Python 3.11a1 3458 (imported objects now don't use LOAD_METHOD/CALL_METHOD)
364+
# Python 3.11a1 3459 (PEP 657: add end line numbers and column offsets for instructions)
364365

365366
#
366367
# MAGIC must change whenever the bytecode emitted by the compiler may no
@@ -370,7 +371,7 @@ def _write_atomic(path, data, mode=0o666):
370371
# Whenever MAGIC_NUMBER is changed, the ranges in the magic_values array
371372
# in PC/launcher.c must also be updated.
372373

373-
MAGIC_NUMBER = (3458).to_bytes(2, 'little') + b'\r\n'
374+
MAGIC_NUMBER = (3459).to_bytes(2, 'little') + b'\r\n'
374375
_RAW_MAGIC_NUMBER = int.from_bytes(MAGIC_NUMBER, 'little') # For import.c
375376

376377
_PYCACHE = '__pycache__'

Lib/test/test_code.py

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -223,6 +223,8 @@ def func(): pass
223223
co.co_name,
224224
co.co_firstlineno,
225225
co.co_lnotab,
226+
co.co_endlinetable,
227+
co.co_columntable,
226228
co.co_exceptiontable,
227229
co.co_freevars,
228230
co.co_cellvars)
@@ -257,6 +259,8 @@ def func2():
257259
("co_filename", "newfilename"),
258260
("co_name", "newname"),
259261
("co_linetable", code2.co_linetable),
262+
("co_endlinetable", code2.co_endlinetable),
263+
("co_columntable", code2.co_columntable),
260264
):
261265
with self.subTest(attr=attr, value=value):
262266
new_code = code.replace(**{attr: value})
@@ -293,6 +297,8 @@ def func():
293297
co.co_name,
294298
co.co_firstlineno,
295299
co.co_lnotab,
300+
co.co_endlinetable,
301+
co.co_columntable,
296302
co.co_exceptiontable,
297303
co.co_freevars,
298304
co.co_cellvars,
@@ -309,6 +315,34 @@ def func():
309315
new_code = code = func.__code__.replace(co_linetable=b'')
310316
self.assertEqual(list(new_code.co_lines()), [])
311317

318+
# co_positions behavior when info is missing.
319+
320+
def test_co_positions_empty_linetable(self):
321+
def func():
322+
x = 1
323+
new_code = func.__code__.replace(co_linetable=b'')
324+
for line, end_line, column, end_column in new_code.co_positions():
325+
self.assertIsNone(line)
326+
self.assertEqual(end_line, new_code.co_firstlineno + 1)
327+
328+
def test_co_positions_empty_endlinetable(self):
329+
def func():
330+
x = 1
331+
new_code = func.__code__.replace(co_endlinetable=b'')
332+
for line, end_line, column, end_column in new_code.co_positions():
333+
self.assertEqual(line, new_code.co_firstlineno + 1)
334+
self.assertIsNone(end_line)
335+
336+
def test_co_positions_empty_columntable(self):
337+
def func():
338+
x = 1
339+
new_code = func.__code__.replace(co_columntable=b'')
340+
for line, end_line, column, end_column in new_code.co_positions():
341+
self.assertEqual(line, new_code.co_firstlineno + 1)
342+
self.assertEqual(end_line, new_code.co_firstlineno + 1)
343+
self.assertIsNone(column)
344+
self.assertIsNone(end_column)
345+
312346

313347
def isinterned(s):
314348
return s is sys.intern(('_' + s + '_')[1:-1])

Lib/test/test_compile.py

Lines changed: 113 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import os
44
import unittest
55
import sys
6+
import ast
67
import _ast
78
import tempfile
89
import types
@@ -985,6 +986,118 @@ def if_else_break():
985986
self.assertNotEqual(instr.arg, (line + 1)*INSTR_SIZE)
986987

987988

989+
class TestSourcePositions(unittest.TestCase):
990+
# Ensure that compiled code snippets have correct line and column numbers
991+
# in `co_positions()`.
992+
993+
def check_positions_against_ast(self, snippet):
994+
# Basic check that makes sure each line and column is at least present
995+
# in one of the AST nodes of the source code.
996+
code = compile(snippet, 'test_compile.py', 'exec')
997+
ast_tree = compile(snippet, 'test_compile.py', 'exec', _ast.PyCF_ONLY_AST)
998+
self.assertTrue(type(ast_tree) == _ast.Module)
999+
1000+
# Use an AST visitor that notes all the offsets.
1001+
lines, end_lines, columns, end_columns = set(), set(), set(), set()
1002+
class SourceOffsetVisitor(ast.NodeVisitor):
1003+
def generic_visit(self, node):
1004+
super().generic_visit(node)
1005+
if not isinstance(node, ast.expr) and not isinstance(node, ast.stmt):
1006+
return
1007+
lines.add(node.lineno)
1008+
end_lines.add(node.end_lineno)
1009+
columns.add(node.col_offset + 1)
1010+
end_columns.add(node.end_col_offset + 1)
1011+
1012+
SourceOffsetVisitor().visit(ast_tree)
1013+
1014+
# Check against the positions in the code object.
1015+
for (line, end_line, col, end_col) in code.co_positions():
1016+
# If the offset is not None (indicating missing data), ensure that
1017+
# it was part of one of the AST nodes.
1018+
if line is not None:
1019+
self.assertIn(line, lines)
1020+
if end_line is not None:
1021+
self.assertIn(end_line, end_lines)
1022+
if col is not None:
1023+
self.assertIn(col, columns)
1024+
if end_col is not None:
1025+
self.assertIn(end_col, end_columns)
1026+
1027+
return code, ast_tree
1028+
1029+
def assertOpCodeSourcePositionIs(self, code, opcode,
1030+
line, end_line, column, end_column):
1031+
1032+
for instr, position in zip(dis.Bytecode(code), code.co_positions()):
1033+
if instr.opname == opcode:
1034+
self.assertEqual(position[0], line)
1035+
self.assertEqual(position[1], end_line)
1036+
self.assertEqual(position[2], column)
1037+
self.assertEqual(position[3], end_column)
1038+
return
1039+
1040+
self.fail(f"Opcode {opcode} not found in code")
1041+
1042+
def test_simple_assignment(self):
1043+
snippet = "x = 1"
1044+
self.check_positions_against_ast(snippet)
1045+
1046+
def test_compiles_to_extended_op_arg(self):
1047+
# Make sure we still have valid positions when the code compiles to an
1048+
# EXTENDED_ARG by performing a loop which needs a JUMP_ABSOLUTE after
1049+
# a bunch of opcodes.
1050+
snippet = "x = x\n" * 10_000
1051+
snippet += "while x != 0:\n"
1052+
snippet += " x -= 1\n"
1053+
snippet += "while x != 0:\n"
1054+
snippet += " x += 1\n"
1055+
1056+
compiled_code, _ = self.check_positions_against_ast(snippet)
1057+
1058+
self.assertOpCodeSourcePositionIs(compiled_code, 'INPLACE_SUBTRACT',
1059+
line=10_000 + 2, end_line=10_000 + 2,
1060+
column=3, end_column=9)
1061+
self.assertOpCodeSourcePositionIs(compiled_code, 'INPLACE_ADD',
1062+
line=10_000 + 4, end_line=10_000 + 4,
1063+
column=3, end_column=10)
1064+
1065+
def test_multiline_expression(self):
1066+
snippet = """\
1067+
f(
1068+
1, 2, 3, 4
1069+
)
1070+
"""
1071+
compiled_code, _ = self.check_positions_against_ast(snippet)
1072+
self.assertOpCodeSourcePositionIs(compiled_code, 'CALL_FUNCTION',
1073+
line=1, end_line=3, column=1, end_column=2)
1074+
1075+
def test_very_long_line_end_offset(self):
1076+
# Make sure we get None for when the column offset is too large to
1077+
# store in a byte.
1078+
long_string = "a" * 1000
1079+
snippet = f"g('{long_string}')"
1080+
1081+
compiled_code, _ = self.check_positions_against_ast(snippet)
1082+
self.assertOpCodeSourcePositionIs(compiled_code, 'CALL_FUNCTION',
1083+
line=1, end_line=1, column=None, end_column=None)
1084+
1085+
def test_complex_single_line_expression(self):
1086+
snippet = "a - b @ (c * x['key'] + 23)"
1087+
1088+
compiled_code, _ = self.check_positions_against_ast(snippet)
1089+
self.assertOpCodeSourcePositionIs(compiled_code, 'BINARY_SUBSCR',
1090+
line=1, end_line=1, column=14, end_column=22)
1091+
self.assertOpCodeSourcePositionIs(compiled_code, 'BINARY_MULTIPLY',
1092+
line=1, end_line=1, column=10, end_column=22)
1093+
self.assertOpCodeSourcePositionIs(compiled_code, 'BINARY_ADD',
1094+
line=1, end_line=1, column=10, end_column=27)
1095+
self.assertOpCodeSourcePositionIs(compiled_code, 'BINARY_MATRIX_MULTIPLY',
1096+
line=1, end_line=1, column=5, end_column=28)
1097+
self.assertOpCodeSourcePositionIs(compiled_code, 'BINARY_SUBTRACT',
1098+
line=1, end_line=1, column=1, end_column=28)
1099+
1100+
9881101
class TestExpressionStackSize(unittest.TestCase):
9891102
# These tests check that the computed stack size for a code object
9901103
# stays within reasonable bounds (see issue #21523 for an example

0 commit comments

Comments
 (0)