Skip to content

Commit 4ab6abf

Browse files
bpo-30299: Display the bytecode when compiling a regex in debug mode. (#1491)
`re.compile(..., re.DEBUG)` now displays the compiled bytecode in human-readable form.
1 parent 821a9d1 commit 4ab6abf

File tree

3 files changed

+177
-1
lines changed

3 files changed

+177
-1
lines changed

Lib/sre_compile.py

Lines changed: 147 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -595,6 +595,150 @@ def _code(p, flags):
595595

596596
return code
597597

598+
def _hex_code(code):
    """Return the words of *code* as a bracketed, comma-separated hex list.

    Every word is zero-padded to a fixed width derived from
    ``_sre.CODESIZE`` (two hex digits per byte plus the ``0x`` prefix).
    """
    # Field width: 2 hex digits per byte of a code word, plus 2 for '0x'.
    field_width = _sre.CODESIZE*2 + 2
    words = ['%#0*x' % (field_width, word) for word in code]
    return '[' + ', '.join(words) + ']'
600+
601+
def dis(code):
    """Print a human-readable disassembly of compiled regex bytecode.

    *code* is the flat sequence of integer code words (as returned by
    ``_code()``).  One line is written to standard output per instruction:
    the instruction's offset in *code*, a marker, the opcode and its
    operands.  The marker is ':' when an instruction printed earlier
    referenced this offset as a jump target, and '.' otherwise.
    Instructions that carry a skip count also show the absolute target as
    ``(to N)``.  Nested sub-programs are indented two columns per level.

    Returns None.  Raises ValueError if an opcode is met for which no
    operand layout is known here.
    """
    import sys

    labels = set()  # offsets referenced as jump targets by lines printed so far
    level = 0       # current nesting depth; controls indentation
    offset_width = len(str(len(code) - 1))

    def dis_(start, end):
        # Disassemble code[start:end] one nesting level deeper than the caller.

        def print_(*args, to=None):
            # Print one instruction line for the instruction at 'start'
            # (the enclosing variable, rebound for each instruction).
            if to is not None:
                labels.add(to)
                args += ('(to %d)' % (to,),)
            # Indent by two columns per nesting level so that instruction
            # lines line up with the continuation lines from print_2(),
            # which are positioned with 2*level.
            print('%*d%s ' % (offset_width, start, ':' if start in labels else '.'),
                  end='  '*(level-1))
            print(*args)

        def print_2(*args):
            # Print a continuation line that carries no offset column.
            print(end=' '*(offset_width + 2*level))
            print(*args)

        nonlocal level
        level += 1
        i = start
        while i < end:
            start = i
            op = code[i]
            i += 1
            op = OPCODES[op]
            if op in (SUCCESS, FAILURE, ANY, ANY_ALL,
                      MAX_UNTIL, MIN_UNTIL, NEGATE):
                # No operands.
                print_(op)
            elif op in (LITERAL, NOT_LITERAL,
                        LITERAL_IGNORE, NOT_LITERAL_IGNORE,
                        LITERAL_LOC_IGNORE, NOT_LITERAL_LOC_IGNORE):
                # One operand: a character code.
                arg = code[i]
                i += 1
                print_(op, '%#02x (%r)' % (arg, chr(arg)))
            elif op is AT:
                arg = code[i]
                i += 1
                arg = str(ATCODES[arg])
                assert arg[:3] == 'AT_'
                print_(op, arg[3:])
            elif op is CATEGORY:
                arg = code[i]
                i += 1
                arg = str(CHCODES[arg])
                assert arg[:9] == 'CATEGORY_'
                print_(op, arg[9:])
            elif op in (IN, IN_IGNORE, IN_LOC_IGNORE):
                # Skip count followed by a nested block ending at the target.
                skip = code[i]
                print_(op, skip, to=i+skip)
                dis_(i+1, i+skip)
                i += skip
            elif op in (RANGE, RANGE_IGNORE):
                lo, hi = code[i: i+2]
                i += 2
                print_(op, '%#02x %#02x (%r-%r)' % (lo, hi, chr(lo), chr(hi)))
            elif op is CHARSET:
                # A 256-bit bitmap stored in 256//_CODEBITS code words.
                print_(op, _hex_code(code[i: i + 256//_CODEBITS]))
                i += 256//_CODEBITS
            elif op is BIGCHARSET:
                # Block count, then a 256-byte mapping packed into code
                # words, then that many 256-bit bitmap blocks.
                arg = code[i]
                i += 1
                mapping = list(b''.join(x.to_bytes(_sre.CODESIZE, sys.byteorder)
                                        for x in code[i: i + 256//_sre.CODESIZE]))
                print_(op, arg, mapping)
                i += 256//_sre.CODESIZE
                level += 1
                for _ in range(arg):
                    print_2(_hex_code(code[i: i + 256//_CODEBITS]))
                    i += 256//_CODEBITS
                level -= 1
            elif op in (MARK, GROUPREF, GROUPREF_IGNORE):
                arg = code[i]
                i += 1
                print_(op, arg)
            elif op is JUMP:
                skip = code[i]
                print_(op, skip, to=i+skip)
                i += 1
            elif op is BRANCH:
                # A chain of alternatives: each starts with a skip count
                # leading to the next one; a zero skip ends the chain.
                skip = code[i]
                print_(op, skip, to=i+skip)
                while skip:
                    dis_(i+1, i+skip)
                    i += skip
                    start = i
                    skip = code[i]
                    if skip:
                        print_('branch', skip, to=i+skip)
                    else:
                        print_(FAILURE)
                i += 1
            elif op in (REPEAT, REPEAT_ONE, MIN_REPEAT_ONE):
                # 'min_'/'max_' rather than 'min'/'max' so the builtins
                # are not shadowed.
                skip, min_, max_ = code[i: i+3]
                if max_ == MAXREPEAT:
                    max_ = 'MAXREPEAT'
                print_(op, skip, min_, max_, to=i+skip)
                dis_(i+3, i+skip)
                i += skip
            elif op is GROUPREF_EXISTS:
                arg, skip = code[i: i+2]
                print_(op, arg, skip, to=i+skip)
                i += 2
            elif op in (ASSERT, ASSERT_NOT):
                skip, arg = code[i: i+2]
                print_(op, skip, arg, to=i+skip)
                dis_(i+2, i+skip)
                i += skip
            elif op is INFO:
                skip, flags, min_, max_ = code[i: i+4]
                if max_ == MAXREPEAT:
                    max_ = 'MAXREPEAT'
                print_(op, skip, bin(flags), min_, max_, to=i+skip)
                # 'start' now tracks where the optional INFO payload begins.
                start = i+4
                if flags & SRE_INFO_PREFIX:
                    prefix_len, prefix_skip = code[i+4: i+6]
                    print_2(' prefix_skip', prefix_skip)
                    start = i + 6
                    prefix = code[start: start+prefix_len]
                    print_2(' prefix',
                            '[%s]' % ', '.join('%#02x' % x for x in prefix),
                            '(%r)' % ''.join(map(chr, prefix)))
                    start += prefix_len
                    # The overlap table has the same length as the prefix.
                    print_2(' overlap', code[start: start+prefix_len])
                    start += prefix_len
                if flags & SRE_INFO_CHARSET:
                    level += 1
                    print_2('in')
                    dis_(start, i+skip)
                    level -= 1
                i += skip
            else:
                raise ValueError(op)

        level -= 1

    dis_(0, len(code))
740+
741+
598742
def compile(p, flags=0):
599743
# internal: convert pattern list to internal format
600744

@@ -606,7 +750,9 @@ def compile(p, flags=0):
606750

607751
code = _code(p, flags)
608752

609-
# print(code)
753+
if flags & SRE_FLAG_DEBUG:
754+
print()
755+
dis(code)
610756

611757
# map in either direction
612758
groupindex = p.pattern.groupdict

Lib/test/test_re.py

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1688,10 +1688,12 @@ def test_bug_2537(self):
16881688
self.assertEqual(m.group(1), "")
16891689
self.assertEqual(m.group(2), "y")
16901690

1691+
@cpython_only
16911692
def test_debug_flag(self):
16921693
pat = r'(\.)(?:[ch]|py)(?(1)$|: )'
16931694
with captured_stdout() as out:
16941695
re.compile(pat, re.DEBUG)
1696+
self.maxDiff = None
16951697
dump = '''\
16961698
SUBPATTERN 1 0 0
16971699
LITERAL 46
@@ -1707,6 +1709,31 @@ def test_debug_flag(self):
17071709
ELSE
17081710
LITERAL 58
17091711
LITERAL 32
1712+
1713+
0. INFO 8 0b1 2 5 (to 9)
1714+
prefix_skip 0
1715+
prefix [0x2e] ('.')
1716+
overlap [0]
1717+
9: MARK 0
1718+
11. LITERAL 0x2e ('.')
1719+
13. MARK 1
1720+
15. BRANCH 10 (to 26)
1721+
17. IN 6 (to 24)
1722+
19. LITERAL 0x63 ('c')
1723+
21. LITERAL 0x68 ('h')
1724+
23. FAILURE
1725+
24: JUMP 9 (to 34)
1726+
26: branch 7 (to 33)
1727+
27. LITERAL 0x70 ('p')
1728+
29. LITERAL 0x79 ('y')
1729+
31. JUMP 2 (to 34)
1730+
33: FAILURE
1731+
34: GROUPREF_EXISTS 0 6 (to 41)
1732+
37. AT END
1733+
39. JUMP 5 (to 45)
1734+
41: LITERAL 0x3a (':')
1735+
43. LITERAL 0x20 (' ')
1736+
45: SUCCESS
17101737
'''
17111738
self.assertEqual(out.getvalue(), dump)
17121739
# Debug output is output again even a second time (bypassing

Misc/NEWS

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -323,6 +323,9 @@ Extension Modules
323323
Library
324324
-------
325325

326+
- bpo-30299: Compiling a regular expression in debug mode on CPython now displays
327+
the compiled bytecode in human-readable form.
328+
326329
- bpo-30048: Fixed ``Task.cancel()`` can be ignored when the task is
327330
running coroutine and the coroutine returned without any more ``await``.
328331

0 commit comments

Comments
 (0)