Skip to content

bpo-30299: Display the bytecode when compiling a regex in debug mode. #1491

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
May 14, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
148 changes: 147 additions & 1 deletion Lib/sre_compile.py
Original file line number Diff line number Diff line change
Expand Up @@ -595,6 +595,150 @@ def _code(p, flags):

return code

def _hex_code(code):
    """Return *code* formatted as a bracketed list of zero-padded hex words.

    Each item is rendered with a ``0x`` prefix and zero-padded to
    ``_sre.CODESIZE * 2`` hex digits (the ``+ 2`` in the field width
    accounts for the ``0x`` prefix itself).  Items are joined with
    ``', '``; an empty *code* yields ``'[]'``.
    """
    width = _sre.CODESIZE*2 + 2
    return '[%s]' % ', '.join('%#0*x' % (width, x) for x in code)

def dis(code):
    """Print a human-readable disassembly of compiled pattern *code*.

    *code* is the flat sequence of integers produced by the pattern
    compiler.  Output goes to standard output via ``print``; nothing is
    returned.

    Each instruction line starts with its offset in *code*, followed by
    ``:`` if that offset has already been recorded as a jump target and
    ``.`` otherwise.  Targets are recorded at the moment the jumping
    instruction is printed, so only targets that lie ahead of the
    instruction referring to them are marked.  Instructions nested inside
    another instruction's operand range are indented two columns per
    nesting level.

    Raises ValueError if an opcode is encountered that this function does
    not know how to decode.
    """
    import sys

    labels = set()      # offsets referenced by a "(to N)" annotation so far
    level = 0           # current nesting depth, shared by all dis_() frames
    offset_width = len(str(len(code) - 1))

    def dis_(start, end):
        # Disassemble code[start:end].  ``start`` is rebound to the offset
        # of the instruction currently being decoded so that print_() can
        # label the line with it.
        nonlocal level

        def print_(*args, to=None):
            # Print one instruction line; ``to`` (if given) is a jump
            # target that is appended as "(to N)" and remembered.
            if to is not None:
                labels.add(to)
                args += ('(to %d)' % (to,),)
            # Two columns per nesting level, matching the 2*level used by
            # print_2() so continuation lines line up with instructions.
            print('%*d%s ' % (offset_width, start,
                              ':' if start in labels else '.'),
                  end='  '*(level-1))
            print(*args)

        def print_2(*args):
            # Print a continuation line (no offset column).
            print(end=' '*(offset_width + 2*level))
            print(*args)

        level += 1
        i = start
        while i < end:
            start = i
            op = code[i]
            i += 1
            op = OPCODES[op]
            if op in (SUCCESS, FAILURE, ANY, ANY_ALL,
                      MAX_UNTIL, MIN_UNTIL, NEGATE):
                # No operands.
                print_(op)
            elif op in (LITERAL, NOT_LITERAL,
                        LITERAL_IGNORE, NOT_LITERAL_IGNORE,
                        LITERAL_LOC_IGNORE, NOT_LITERAL_LOC_IGNORE):
                arg = code[i]
                i += 1
                print_(op, '%#02x (%r)' % (arg, chr(arg)))
            elif op is AT:
                arg = code[i]
                i += 1
                arg = str(ATCODES[arg])
                assert arg[:3] == 'AT_'
                print_(op, arg[3:])
            elif op is CATEGORY:
                arg = code[i]
                i += 1
                arg = str(CHCODES[arg])
                assert arg[:9] == 'CATEGORY_'
                print_(op, arg[9:])
            elif op in (IN, IN_IGNORE, IN_LOC_IGNORE):
                # The skip word is relative to its own offset; the set
                # items that follow are disassembled one level deeper.
                skip = code[i]
                print_(op, skip, to=i+skip)
                dis_(i+1, i+skip)
                i += skip
            elif op in (RANGE, RANGE_IGNORE):
                lo, hi = code[i: i+2]
                i += 2
                print_(op, '%#02x %#02x (%r-%r)' % (lo, hi, chr(lo), chr(hi)))
            elif op is CHARSET:
                print_(op, _hex_code(code[i: i + 256//_CODEBITS]))
                i += 256//_CODEBITS
            elif op is BIGCHARSET:
                # Operand layout: block count, a 256-byte mapping packed
                # into code words, then ``arg`` bitmap blocks.
                arg = code[i]
                i += 1
                mapping = list(b''.join(
                    x.to_bytes(_sre.CODESIZE, sys.byteorder)
                    for x in code[i: i + 256//_sre.CODESIZE]))
                print_(op, arg, mapping)
                i += 256//_sre.CODESIZE
                level += 1
                for _ in range(arg):
                    print_2(_hex_code(code[i: i + 256//_CODEBITS]))
                    i += 256//_CODEBITS
                level -= 1
            elif op in (MARK, GROUPREF, GROUPREF_IGNORE):
                arg = code[i]
                i += 1
                print_(op, arg)
            elif op is JUMP:
                skip = code[i]
                print_(op, skip, to=i+skip)
                i += 1
            elif op is BRANCH:
                # A chain of alternatives: each starts with a skip word
                # pointing at the next one; a zero skip ends the chain.
                skip = code[i]
                print_(op, skip, to=i+skip)
                while skip:
                    dis_(i+1, i+skip)
                    i += skip
                    start = i
                    skip = code[i]
                    if skip:
                        print_('branch', skip, to=i+skip)
                    else:
                        print_(FAILURE)
                i += 1
            elif op in (REPEAT, REPEAT_ONE, MIN_REPEAT_ONE):
                skip, min_, max_ = code[i: i+3]
                if max_ == MAXREPEAT:
                    max_ = 'MAXREPEAT'
                print_(op, skip, min_, max_, to=i+skip)
                dis_(i+3, i+skip)
                i += skip
            elif op is GROUPREF_EXISTS:
                arg, skip = code[i: i+2]
                print_(op, arg, skip, to=i+skip)
                i += 2
            elif op in (ASSERT, ASSERT_NOT):
                skip, arg = code[i: i+2]
                print_(op, skip, arg, to=i+skip)
                dis_(i+2, i+skip)
                i += skip
            elif op is INFO:
                skip, flags, min_, max_ = code[i: i+4]
                if max_ == MAXREPEAT:
                    max_ = 'MAXREPEAT'
                print_(op, skip, bin(flags), min_, max_, to=i+skip)
                # From here on ``start`` is reused as a cursor into the
                # INFO payload; it is reset at the top of the next
                # iteration and print_() is not called again before then.
                start = i+4
                if flags & SRE_INFO_PREFIX:
                    prefix_len, prefix_skip = code[i+4: i+6]
                    print_2(' prefix_skip', prefix_skip)
                    start = i + 6
                    prefix = code[start: start+prefix_len]
                    print_2(' prefix',
                            '[%s]' % ', '.join('%#02x' % x for x in prefix),
                            '(%r)' % ''.join(map(chr, prefix)))
                    start += prefix_len
                    print_2(' overlap', code[start: start+prefix_len])
                    start += prefix_len
                if flags & SRE_INFO_CHARSET:
                    level += 1
                    print_2('in')
                    dis_(start, i+skip)
                    level -= 1
                i += skip
            else:
                raise ValueError(op)

        level -= 1

    dis_(0, len(code))


def compile(p, flags=0):
# internal: convert pattern list to internal format

Expand All @@ -606,7 +750,9 @@ def compile(p, flags=0):

code = _code(p, flags)

# print(code)
if flags & SRE_FLAG_DEBUG:
print()
dis(code)

# map in either direction
groupindex = p.pattern.groupdict
Expand Down
27 changes: 27 additions & 0 deletions Lib/test/test_re.py
Original file line number Diff line number Diff line change
Expand Up @@ -1688,10 +1688,12 @@ def test_bug_2537(self):
self.assertEqual(m.group(1), "")
self.assertEqual(m.group(2), "y")

@cpython_only
def test_debug_flag(self):
pat = r'(\.)(?:[ch]|py)(?(1)$|: )'
with captured_stdout() as out:
re.compile(pat, re.DEBUG)
self.maxDiff = None
dump = '''\
SUBPATTERN 1 0 0
LITERAL 46
Expand All @@ -1707,6 +1709,31 @@ def test_debug_flag(self):
ELSE
LITERAL 58
LITERAL 32

0. INFO 8 0b1 2 5 (to 9)
prefix_skip 0
prefix [0x2e] ('.')
overlap [0]
9: MARK 0
11. LITERAL 0x2e ('.')
13. MARK 1
15. BRANCH 10 (to 26)
17. IN 6 (to 24)
19. LITERAL 0x63 ('c')
21. LITERAL 0x68 ('h')
23. FAILURE
24: JUMP 9 (to 34)
26: branch 7 (to 33)
27. LITERAL 0x70 ('p')
29. LITERAL 0x79 ('y')
31. JUMP 2 (to 34)
33: FAILURE
34: GROUPREF_EXISTS 0 6 (to 41)
37. AT END
39. JUMP 5 (to 45)
41: LITERAL 0x3a (':')
43. LITERAL 0x20 (' ')
45: SUCCESS
'''
self.assertEqual(out.getvalue(), dump)
# Debug output is output again even a second time (bypassing
Expand Down
3 changes: 3 additions & 0 deletions Misc/NEWS
Original file line number Diff line number Diff line change
Expand Up @@ -323,6 +323,9 @@ Extension Modules
Library
-------

- bpo-30299: Compiling a regular expression in debug mode on CPython now
displays the compiled bytecode in human-readable form.

- bpo-30048: Fixed ``Task.cancel()`` can be ignored when the task is
running coroutine and the coroutine returned without any more ``await``.

Expand Down