Skip to content

Commit 1c26f1c

Browse files
[3.11] gh-109747: Improve errors for unsupported look-behind patterns (GH-109859) (GH-110860)
Now re.error is raised instead of OverflowError or RuntimeError when the width of a look-behind pattern is too large. The limit is increased to 2**32-1 (was 2**31-1). (cherry picked from commit e2b3d83) Co-authored-by: Serhiy Storchaka <[email protected]>
1 parent 035f9e0 commit 1c26f1c

File tree

6 files changed

+46
-13
lines changed

6 files changed

+46
-13
lines changed

Lib/re/_compiler.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -149,6 +149,8 @@ def _compile(code, pattern, flags):
149149
emit(0) # look ahead
150150
else:
151151
lo, hi = av[1].getwidth()
152+
if lo > MAXCODE:
153+
raise error("looks too much behind")
152154
if lo != hi:
153155
raise error("look-behind requires fixed-width pattern")
154156
emit(lo) # look behind
@@ -549,7 +551,7 @@ def _compile_info(code, pattern, flags):
549551
else:
550552
emit(MAXCODE)
551553
prefix = prefix[:MAXCODE]
552-
emit(min(hi, MAXCODE))
554+
emit(hi)
553555
# add literal prefix
554556
if prefix:
555557
emit(len(prefix)) # length

Lib/re/_parser.py

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,10 @@
6868
TYPE_FLAGS = SRE_FLAG_ASCII | SRE_FLAG_LOCALE | SRE_FLAG_UNICODE
6969
GLOBAL_FLAGS = SRE_FLAG_DEBUG | SRE_FLAG_TEMPLATE
7070

71+
# Maximal value returned by SubPattern.getwidth().
72+
# Must be larger than MAXREPEAT, MAXCODE and sys.maxsize.
73+
MAXWIDTH = 1 << 64
74+
7175
class State:
7276
# keeps track of state for parsing
7377
def __init__(self):
@@ -178,7 +182,7 @@ def getwidth(self):
178182
lo = hi = 0
179183
for op, av in self.data:
180184
if op is BRANCH:
181-
i = MAXREPEAT - 1
185+
i = MAXWIDTH
182186
j = 0
183187
for av in av[1]:
184188
l, h = av.getwidth()
@@ -197,7 +201,10 @@ def getwidth(self):
197201
elif op in _REPEATCODES:
198202
i, j = av[2].getwidth()
199203
lo = lo + i * av[0]
200-
hi = hi + j * av[1]
204+
if av[1] == MAXREPEAT and j:
205+
hi = MAXWIDTH
206+
else:
207+
hi = hi + j * av[1]
201208
elif op in _UNITCODES:
202209
lo = lo + 1
203210
hi = hi + 1
@@ -217,7 +224,7 @@ def getwidth(self):
217224
hi = hi + j
218225
elif op is SUCCESS:
219226
break
220-
self.width = min(lo, MAXREPEAT - 1), min(hi, MAXREPEAT)
227+
self.width = min(lo, MAXWIDTH), min(hi, MAXWIDTH)
221228
return self.width
222229

223230
class Tokenizer:

Lib/test/test_re.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1830,6 +1830,29 @@ def test_repeat_minmax_overflow(self):
18301830
self.assertRaises(OverflowError, re.compile, r".{%d,}?" % 2**128)
18311831
self.assertRaises(OverflowError, re.compile, r".{%d,%d}" % (2**129, 2**128))
18321832

1833+
def test_look_behind_overflow(self):
1834+
string = "x" * 2_500_000
1835+
p1 = r"(?<=((.{%d}){%d}){%d})"
1836+
p2 = r"(?<!((.{%d}){%d}){%d})"
1837+
# Test that the templates are valid and that a look-behind of width 2**21
1838+
# (larger than sys.maxunicode) is supported.
1839+
self.assertEqual(re.search(p1 % (2**7, 2**7, 2**7), string).span(),
1840+
(2**21, 2**21))
1841+
self.assertEqual(re.search(p2 % (2**7, 2**7, 2**7), string).span(),
1842+
(0, 0))
1843+
# Test that 2**22 is accepted as a repetition number and look-behind
1844+
# width.
1845+
re.compile(p1 % (2**22, 1, 1))
1846+
re.compile(p1 % (1, 2**22, 1))
1847+
re.compile(p1 % (1, 1, 2**22))
1848+
re.compile(p2 % (2**22, 1, 1))
1849+
re.compile(p2 % (1, 2**22, 1))
1850+
re.compile(p2 % (1, 1, 2**22))
1851+
# But 2**66 is too large for look-behind width.
1852+
errmsg = "looks too much behind"
1853+
self.assertRaisesRegex(re.error, errmsg, re.compile, p1 % (2**22, 2**22, 2**22))
1854+
self.assertRaisesRegex(re.error, errmsg, re.compile, p2 % (2**22, 2**22, 2**22))
1855+
18331856
def test_backref_group_name_in_exception(self):
18341857
# Issue 17341: Poor error message when compiling invalid regex
18351858
self.checkPatternError('(?P=<foo>)',
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
Improve errors for unsupported look-behind patterns. Now re.error is raised
2+
instead of OverflowError or RuntimeError when the width of a look-behind
3+
pattern is too large.

Modules/_sre/sre.c

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1939,8 +1939,6 @@ _validate_inner(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)
19391939
GET_SKIP;
19401940
GET_ARG; /* 0 for lookahead, width for lookbehind */
19411941
code--; /* Back up over arg to simplify math below */
1942-
if (arg & 0x80000000)
1943-
FAIL; /* Width too large */
19441942
/* Stop 1 before the end; we check the SUCCESS below */
19451943
if (_validate_inner(code+1, code+skip-2, groups))
19461944
FAIL;

Modules/_sre/sre_lib.h

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -589,8 +589,8 @@ SRE(match)(SRE_STATE* state, const SRE_CODE* pattern, int toplevel)
589589
/* optimization info block */
590590
/* <INFO> <1=skip> <2=flags> <3=min> ... */
591591
if (pattern[3] && (uintptr_t)(end - ptr) < pattern[3]) {
592-
TRACE(("reject (got %zd chars, need %zd)\n",
593-
end - ptr, (Py_ssize_t) pattern[3]));
592+
TRACE(("reject (got %tu chars, need %zu)\n",
593+
end - ptr, (size_t) pattern[3]));
594594
RETURN_FAILURE;
595595
}
596596
pattern += pattern[1] + 1;
@@ -1507,7 +1507,7 @@ SRE(match)(SRE_STATE* state, const SRE_CODE* pattern, int toplevel)
15071507
/* <ASSERT> <skip> <back> <pattern> */
15081508
TRACE(("|%p|%p|ASSERT %d\n", pattern,
15091509
ptr, pattern[1]));
1510-
if (ptr - (SRE_CHAR *)state->beginning < (Py_ssize_t)pattern[1])
1510+
if ((uintptr_t)(ptr - (SRE_CHAR *)state->beginning) < pattern[1])
15111511
RETURN_FAILURE;
15121512
state->ptr = ptr - pattern[1];
15131513
DO_JUMP0(JUMP_ASSERT, jump_assert, pattern+2);
@@ -1520,7 +1520,7 @@ SRE(match)(SRE_STATE* state, const SRE_CODE* pattern, int toplevel)
15201520
/* <ASSERT_NOT> <skip> <back> <pattern> */
15211521
TRACE(("|%p|%p|ASSERT_NOT %d\n", pattern,
15221522
ptr, pattern[1]));
1523-
if (ptr - (SRE_CHAR *)state->beginning >= (Py_ssize_t)pattern[1]) {
1523+
if ((uintptr_t)(ptr - (SRE_CHAR *)state->beginning) >= pattern[1]) {
15241524
state->ptr = ptr - pattern[1];
15251525
LASTMARK_SAVE();
15261526
if (state->repeat)
@@ -1655,9 +1655,9 @@ SRE(search)(SRE_STATE* state, SRE_CODE* pattern)
16551655

16561656
flags = pattern[2];
16571657

1658-
if (pattern[3] && end - ptr < (Py_ssize_t)pattern[3]) {
1659-
TRACE(("reject (got %u chars, need %u)\n",
1660-
(unsigned int)(end - ptr), pattern[3]));
1658+
if (pattern[3] && (uintptr_t)(end - ptr) < pattern[3]) {
1659+
TRACE(("reject (got %tu chars, need %zu)\n",
1660+
end - ptr, (size_t) pattern[3]));
16611661
return 0;
16621662
}
16631663
if (pattern[3] > 1) {

0 commit comments

Comments
 (0)