Skip to content

Commit 821a9d1

Browse files
bpo-30340: Enhanced regular expressions optimization. (#1542)
This increased the performance of matching some patterns up to 25 times.
1 parent cbddf58 commit 821a9d1

File tree

4 files changed

+95
-54
lines changed

4 files changed

+95
-54
lines changed

Lib/sre_compile.py

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
_REPEATING_CODES = {REPEAT, MIN_REPEAT, MAX_REPEAT}
2121
_SUCCESS_CODES = {SUCCESS, FAILURE}
2222
_ASSERT_CODES = {ASSERT, ASSERT_NOT}
23+
_UNIT_CODES = _LITERAL_CODES | {ANY, IN}
2324

2425
# Sets of lowercase characters which have the same uppercase.
2526
_equivalences = (
@@ -125,7 +126,7 @@ def _compile(code, pattern, flags):
125126
elif op in REPEATING_CODES:
126127
if flags & SRE_FLAG_TEMPLATE:
127128
raise error("internal: unsupported template operator %r" % (op,))
128-
elif _simple(av) and op is not REPEAT:
129+
if _simple(av[2]):
129130
if op is MAX_REPEAT:
130131
emit(REPEAT_ONE)
131132
else:
@@ -404,10 +405,14 @@ def _bytes_to_codes(b):
404405
assert len(a) * a.itemsize == len(b)
405406
return a.tolist()
406407

407-
def _simple(av):
408-
# check if av is a "simple" operator
409-
lo, hi = av[2].getwidth()
410-
return lo == hi == 1 and av[2][0][0] != SUBPATTERN
408+
def _simple(p):
    # A subpattern is "simple" when it consists of exactly one item that
    # is either a unit code (see _UNIT_CODES) or a SUBPATTERN whose first
    # argument (the group) is None and whose last argument (the nested
    # pattern) is itself simple.
    if len(p) == 1:
        op, av = p[0]
        if op is not SUBPATTERN:
            return op in _UNIT_CODES
        # Only look through groups that do not capture.
        return av[0] is None and _simple(av[-1])
    return False
411416

412417
def _generate_overlap_table(prefix):
413418
"""

Lib/sre_parse.py

Lines changed: 70 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -114,6 +114,7 @@ def __init__(self, pattern, data=None):
114114
data = []
115115
self.data = data
116116
self.width = None
117+
117118
def dump(self, level=0):
118119
nl = True
119120
seqtypes = (tuple, list)
@@ -404,6 +405,15 @@ def _escape(source, escape, state):
404405
pass
405406
raise source.error("bad escape %s" % escape, len(escape))
406407

408+
def _uniq(items):
409+
if len(set(items)) == len(items):
410+
return items
411+
newitems = []
412+
for item in items:
413+
if item not in newitems:
414+
newitems.append(item)
415+
return newitems
416+
407417
def _parse_sub(source, state, verbose, nested=True):
408418
# parse an alternation: a|b|c
409419

@@ -420,7 +430,6 @@ def _parse_sub(source, state, verbose, nested=True):
420430
return items[0]
421431

422432
subpattern = SubPattern(state)
423-
subpatternappend = subpattern.append
424433

425434
# check if all items share a common prefix
426435
while True:
@@ -437,35 +446,31 @@ def _parse_sub(source, state, verbose, nested=True):
437446
# move it out of the branch
438447
for item in items:
439448
del item[0]
440-
subpatternappend(prefix)
449+
subpattern.append(prefix)
441450
continue # check next one
442451
break
443452

444453
# check if the branch can be replaced by a character set
454+
set = []
445455
for item in items:
446-
if len(item) != 1 or item[0][0] is not LITERAL:
456+
if len(item) != 1:
457+
break
458+
op, av = item[0]
459+
if op is LITERAL:
460+
set.append((op, av))
461+
elif op is IN and av[0][0] is not NEGATE:
462+
set.extend(av)
463+
else:
447464
break
448465
else:
449466
# we can store this as a character set instead of a
450467
# branch (the compiler may optimize this even more)
451-
subpatternappend((IN, [item[0] for item in items]))
468+
subpattern.append((IN, _uniq(set)))
452469
return subpattern
453470

454471
subpattern.append((BRANCH, (None, items)))
455472
return subpattern
456473

457-
def _parse_sub_cond(source, state, condgroup, verbose):
458-
item_yes = _parse(source, state, verbose)
459-
if source.match("|"):
460-
item_no = _parse(source, state, verbose)
461-
if source.next == "|":
462-
raise source.error("conditional backref with more than two branches")
463-
else:
464-
item_no = None
465-
subpattern = SubPattern(state)
466-
subpattern.append((GROUPREF_EXISTS, (condgroup, item_yes, item_no)))
467-
return subpattern
468-
469474
def _parse(source, state, verbose, first=False):
470475
# parse a simple pattern
471476
subpattern = SubPattern(state)
@@ -511,16 +516,14 @@ def _parse(source, state, verbose, first=False):
511516
setappend = set.append
512517
## if sourcematch(":"):
513518
## pass # handle character classes
514-
if sourcematch("^"):
515-
setappend((NEGATE, None))
519+
negate = sourcematch("^")
516520
# check remaining characters
517-
start = set[:]
518521
while True:
519522
this = sourceget()
520523
if this is None:
521524
raise source.error("unterminated character set",
522525
source.tell() - here)
523-
if this == "]" and set != start:
526+
if this == "]" and set:
524527
break
525528
elif this[0] == "\\":
526529
code1 = _class_escape(source, this)
@@ -556,13 +559,19 @@ def _parse(source, state, verbose, first=False):
556559
code1 = code1[1][0]
557560
setappend(code1)
558561

562+
set = _uniq(set)
559563
# XXX: <fl> should move set optimization to compiler!
560-
if _len(set)==1 and set[0][0] is LITERAL:
561-
subpatternappend(set[0]) # optimization
562-
elif _len(set)==2 and set[0][0] is NEGATE and set[1][0] is LITERAL:
563-
subpatternappend((NOT_LITERAL, set[1][1])) # optimization
564+
if _len(set) == 1 and set[0][0] is LITERAL:
565+
# optimization
566+
if negate:
567+
subpatternappend((NOT_LITERAL, set[0][1]))
568+
else:
569+
subpatternappend(set[0])
564570
else:
565-
# XXX: <fl> should add charmap optimization here
571+
if negate:
572+
set.insert(0, (NEGATE, None))
573+
# charmap optimization can't be added here because
574+
# global flags still are not known
566575
subpatternappend((IN, set))
567576

568577
elif this in REPEAT_CHARS:
@@ -579,6 +588,7 @@ def _parse(source, state, verbose, first=False):
579588
if source.next == "}":
580589
subpatternappend((LITERAL, _ord(this)))
581590
continue
591+
582592
min, max = 0, MAXREPEAT
583593
lo = hi = ""
584594
while source.next in DIGITS:
@@ -592,6 +602,7 @@ def _parse(source, state, verbose, first=False):
592602
subpatternappend((LITERAL, _ord(this)))
593603
source.seek(here)
594604
continue
605+
595606
if lo:
596607
min = int(lo)
597608
if min >= MAXREPEAT:
@@ -610,12 +621,16 @@ def _parse(source, state, verbose, first=False):
610621
item = subpattern[-1:]
611622
else:
612623
item = None
613-
if not item or (_len(item) == 1 and item[0][0] is AT):
624+
if not item or item[0][0] is AT:
614625
raise source.error("nothing to repeat",
615626
source.tell() - here + len(this))
616627
if item[0][0] in _REPEATCODES:
617628
raise source.error("multiple repeat",
618629
source.tell() - here + len(this))
630+
if item[0][0] is SUBPATTERN:
631+
group, add_flags, del_flags, p = item[0][1]
632+
if group is None and not add_flags and not del_flags:
633+
item = p
619634
if sourcematch("?"):
620635
subpattern[-1] = (MIN_REPEAT, (min, max, item))
621636
else:
@@ -628,7 +643,6 @@ def _parse(source, state, verbose, first=False):
628643
start = source.tell() - 1
629644
group = True
630645
name = None
631-
condgroup = None
632646
add_flags = 0
633647
del_flags = 0
634648
if sourcematch("?"):
@@ -660,6 +674,7 @@ def _parse(source, state, verbose, first=False):
660674
state.checklookbehindgroup(gid, source)
661675
subpatternappend((GROUPREF, gid))
662676
continue
677+
663678
else:
664679
char = sourceget()
665680
if char is None:
@@ -678,6 +693,7 @@ def _parse(source, state, verbose, first=False):
678693
if sourceget() == ")":
679694
break
680695
continue
696+
681697
elif char in "=!<":
682698
# lookahead assertions
683699
dir = 1
@@ -704,10 +720,10 @@ def _parse(source, state, verbose, first=False):
704720
else:
705721
subpatternappend((ASSERT_NOT, (dir, p)))
706722
continue
723+
707724
elif char == "(":
708725
# conditional backreference group
709726
condname = source.getuntil(")")
710-
group = None
711727
if condname.isidentifier():
712728
condgroup = state.groupdict.get(condname)
713729
if condgroup is None:
@@ -728,6 +744,19 @@ def _parse(source, state, verbose, first=False):
728744
msg = "invalid group reference %d" % condgroup
729745
raise source.error(msg, len(condname) + 1)
730746
state.checklookbehindgroup(condgroup, source)
747+
item_yes = _parse(source, state, verbose)
748+
if source.match("|"):
749+
item_no = _parse(source, state, verbose)
750+
if source.next == "|":
751+
raise source.error("conditional backref with more than two branches")
752+
else:
753+
item_no = None
754+
if not source.match(")"):
755+
raise source.error("missing ), unterminated subpattern",
756+
source.tell() - start)
757+
subpatternappend((GROUPREF_EXISTS, (condgroup, item_yes, item_no)))
758+
continue
759+
731760
elif char in FLAGS or char == "-":
732761
# flags
733762
flags = _parse_flags(source, state, char)
@@ -744,6 +773,7 @@ def _parse(source, state, verbose, first=False):
744773
if (state.flags & SRE_FLAG_VERBOSE) and not verbose:
745774
raise Verbose
746775
continue
776+
747777
add_flags, del_flags = flags
748778
group = None
749779
else:
@@ -756,12 +786,9 @@ def _parse(source, state, verbose, first=False):
756786
group = state.opengroup(name)
757787
except error as err:
758788
raise source.error(err.msg, len(name) + 1) from None
759-
if condgroup:
760-
p = _parse_sub_cond(source, state, condgroup, verbose)
761-
else:
762-
sub_verbose = ((verbose or (add_flags & SRE_FLAG_VERBOSE)) and
763-
not (del_flags & SRE_FLAG_VERBOSE))
764-
p = _parse_sub(source, state, sub_verbose)
789+
sub_verbose = ((verbose or (add_flags & SRE_FLAG_VERBOSE)) and
790+
not (del_flags & SRE_FLAG_VERBOSE))
791+
p = _parse_sub(source, state, sub_verbose)
765792
if not source.match(")"):
766793
raise source.error("missing ), unterminated subpattern",
767794
source.tell() - start)
@@ -773,11 +800,19 @@ def _parse(source, state, verbose, first=False):
773800
subpatternappend((AT, AT_BEGINNING))
774801

775802
elif this == "$":
776-
subpattern.append((AT, AT_END))
803+
subpatternappend((AT, AT_END))
777804

778805
else:
779806
raise AssertionError("unsupported special character %r" % (char,))
780807

808+
# unpack non-capturing groups
809+
for i in range(len(subpattern))[::-1]:
810+
op, av = subpattern[i]
811+
if op is SUBPATTERN:
812+
group, add_flags, del_flags, p = av
813+
if group is None and not add_flags and not del_flags:
814+
subpattern[i: i+1] = p
815+
781816
return subpattern
782817

783818
def _parse_flags(source, state, char):

Lib/test/test_re.py

Lines changed: 12 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1695,20 +1695,18 @@ def test_debug_flag(self):
16951695
dump = '''\
16961696
SUBPATTERN 1 0 0
16971697
LITERAL 46
1698-
SUBPATTERN None 0 0
1699-
BRANCH
1700-
IN
1701-
LITERAL 99
1702-
LITERAL 104
1703-
OR
1704-
LITERAL 112
1705-
LITERAL 121
1706-
SUBPATTERN None 0 0
1707-
GROUPREF_EXISTS 1
1708-
AT AT_END
1709-
ELSE
1710-
LITERAL 58
1711-
LITERAL 32
1698+
BRANCH
1699+
IN
1700+
LITERAL 99
1701+
LITERAL 104
1702+
OR
1703+
LITERAL 112
1704+
LITERAL 121
1705+
GROUPREF_EXISTS 1
1706+
AT AT_END
1707+
ELSE
1708+
LITERAL 58
1709+
LITERAL 32
17121710
'''
17131711
self.assertEqual(out.getvalue(), dump)
17141712
# Debug output is output again even a second time (bypassing

Misc/NEWS

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -326,6 +326,9 @@ Library
326326
- bpo-30048: Fixed ``Task.cancel()`` can be ignored when the task is
327327
running coroutine and the coroutine returned without any more ``await``.
328328

329+
- bpo-30340: Enhanced regular expressions optimization. This increased
330+
the performance of matching some patterns up to 25 times.
331+
329332
- bpo-30298: Weaken the condition of deprecation warnings for inline modifiers.
330333
Now allowed several subsequential inline modifiers at the start of the
331334
pattern (e.g. ``'(?i)(?s)...'``). In verbose mode whitespaces and comments

0 commit comments

Comments
 (0)