Skip to content

bpo-35859: re module, fix three bugs about capturing groups #11756

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 5 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
100 changes: 100 additions & 0 deletions Lib/test/test_re.py
Original file line number Diff line number Diff line change
Expand Up @@ -2100,6 +2100,106 @@ def test_bug_34294(self):
{'tag': 'foo', 'text': None},
{'tag': 'foo', 'text': None}])

def test_bug_35859(self):
# three bugs about capturing groups, these bugs exist since Python 2

# =====================================================
# macro MARK_PUSH(lastmark) bug, reported in issue35859
# =====================================================
self.assertEqual(re.match(r'(ab|a)*?b', 'ab').groups(), ('a',))
self.assertEqual(re.match(r'(ab|a)+?b', 'ab').groups(), ('a',))
self.assertEqual(re.match(r'(ab|a){0,2}?b', 'ab').groups(), ('a',))
self.assertEqual(re.match(r'(.b|a)*?b', 'ab').groups(), ('a',))

# =======================================================
# JUMP_MIN_UNTIL_3 should LASTMARK_SAVE() and MARK_PUSH()
# =======================================================

# 1, triggered by SRE_OP_REPEAT_ONE, b? in this pattern
self.assertEqual(re.match(r'(ab?)*?b', 'ab').groups(), ('a',))

# 2, triggered by SRE_OP_MIN_REPEAT_ONE, a*? in this pattern
s = 'axxzaz'
p = r'(?:a*?(xx)??z)*'
self.assertEqual(re.match(p, s).groups(), ('xx',))

# 3, triggered by SRE_OP_MIN_UNTIL (JUMP_MIN_UNTIL_2)
# (?:a|bc)*? in this pattern
s = 'axxzbcz'
p = r'(?:(?:a|bc)*?(xx)??z)*'
self.assertEqual(re.match(p, s).groups(), ('xx',))
# test-case provided by issue9134
s = 'xtcxyzxc'
p = r'((x|yz)+?(t)??c)*'
self.assertEqual(re.match(p, s).groups(), ('xyzxc', 'x', 't'))

# ======================================
# JUMP_ASSERT_NOT should LASTMARK_SAVE()
# ======================================
# reported in issue725149
# negative assertion
self.assertEqual(re.match(r'(?!(..)c)', 'ab').groups(), (None,))
# negative assertion in a repeat
self.assertEqual(re.match(r'(?:(?!(ab)c).)*', 'ab').groups(), (None,))
self.assertEqual(re.match(r'((?!(bc)d)(.))*', 'abc').groups(),
('c', None, 'c'))

# =============================================================
# below asserts didn't fail before fix, just prevent regression
# =============================================================

# 1, why JUMP_MIN_REPEAT_ONE should LASTMARK_SAVE()
# .?? in this pattern
m = re.match(r'.??(?=(a)?)b', 'ab')
self.assertEqual(m.span(), (0, 2))
self.assertEqual(m.groups(), (None,))
# put in a repeat
m = re.match(r'(?:.??(?=(a)?)b)*', 'abab')
self.assertEqual(m.span(), (0, 4))
self.assertEqual(m.groups(), (None,))

# 2, why JUMP_MIN_UNTIL_2 should LASTMARK_SAVE()
# (?:..)?? in this pattern
m = re.match(r'(?:..)??(?=(aa)?)bb', 'aabb')
self.assertEqual(m.span(), (0, 4))
self.assertEqual(m.groups(), (None,))
# put in a repeat
m = re.match(r'(?:(?:..)??(?=(aa)?)bb)*', 'aabbaabb')
self.assertEqual(m.span(), (0, 8))
self.assertEqual(m.groups(), (None,))

# 3, why JUMP_REPEAT_ONE_1 should LASTMARK_SAVE()
# .* in this pattern, tail starts with a literal.
self.assertEqual(re.match(r'.*x(?=(b)?)a', 'xaxb').groups(), (None,))

# 4, why JUMP_REPEAT_ONE_2 should LASTMARK_SAVE()
# .* in this pattern, tail is general case
self.assertEqual(re.match(r'.*(?=(b)?)a', 'ab').groups(), (None,))

# 5, demonstrate that JUMP_MAX_UNTIL_3 doesn't need LASTMARK_SAVE()
# this pattern is similar to 4
self.assertEqual(re.match(r'(.)*(?=(b)?)a', 'ab').groups(),
(None, None))
self.assertEqual(re.match(r'(.){0}(?=(b)?)a', 'ab').groups(),
(None, None))

# 6, positive assertion in a repeat
# strictly speaking, this is a bug, the correct result should be
# (None,), but it's very hard to fix with the current fundamental
# implementation of sre.
# PHP 7.3.2, Java 11.0.2, Ruby 2.6.1, and the third-party module
# regex 2019.2.21, return ('a',) as well.
# Perl 5.26.1, Node.js 10.15.1, return the correct result (None,)
# Go 1.12, Rust 1.32.0, don't support lookaround yet.
self.assertEqual(re.match(r'(?:(?=(a)?).)*', 'ab').groups(), ('a',))

# 7, negative assertion
# PHP 7.3.2, Ruby 2.6.1, Node.js 10.15.1, regex 2019.2.21 return
# (None,)
# Java 11.0.2, Perl 5.26.1, return ('b',)
# Go 1.12, Rust 1.32.0, don't support lookaround yet.
self.assertEqual(re.match(r'a*(?!(b))', 'ab').groups(), (None,))


class PatternReprTests(unittest.TestCase):
def check(self, pattern, expected):
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
re module, fix wrong capturing groups in rare cases. The span of capturing
group may be lost when backtracking. Patch by Ma Lin.
20 changes: 16 additions & 4 deletions Modules/sre_lib.h
Original file line number Diff line number Diff line change
Expand Up @@ -478,20 +478,20 @@ do { \
DATA_STACK_LOOKUP_AT(state,t,p,pos)

#define MARK_PUSH(lastmark) \
do if (lastmark > 0) { \
do if (lastmark >= 0) { \
i = lastmark; /* ctx->lastmark may change if reallocated */ \
DATA_STACK_PUSH(state, state->mark, (i+1)*sizeof(void*)); \
} while (0)
#define MARK_POP(lastmark) \
do if (lastmark > 0) { \
do if (lastmark >= 0) { \
DATA_STACK_POP(state, state->mark, (lastmark+1)*sizeof(void*), 1); \
} while (0)
#define MARK_POP_KEEP(lastmark) \
do if (lastmark > 0) { \
do if (lastmark >= 0) { \
DATA_STACK_POP(state, state->mark, (lastmark+1)*sizeof(void*), 0); \
} while (0)
#define MARK_POP_DISCARD(lastmark) \
do if (lastmark > 0) { \
do if (lastmark >= 0) { \
DATA_STACK_POP_DISCARD(state, (lastmark+1)*sizeof(void*)); \
} while (0)

Expand Down Expand Up @@ -1126,16 +1126,20 @@ SRE(match)(SRE_STATE* state, SRE_CODE* pattern, int toplevel)
RETURN_FAILURE;

ctx->u.rep->count = ctx->count;
MARK_PUSH(ctx->lastmark); /* already LASTMARK_SAVE() above */
/* zero-width match protection */
DATA_PUSH(&ctx->u.rep->last_ptr);
ctx->u.rep->last_ptr = state->ptr;
DO_JUMP(JUMP_MIN_UNTIL_3,jump_min_until_3,
ctx->u.rep->pattern+3);
DATA_POP(&ctx->u.rep->last_ptr);
if (ret) {
MARK_POP_DISCARD(ctx->lastmark);
RETURN_ON_ERROR(ret);
RETURN_SUCCESS;
}
MARK_POP(ctx->lastmark);
LASTMARK_RESTORE();
ctx->u.rep->count = ctx->count-1;
state->ptr = ctx->ptr;
RETURN_FAILURE;
Expand Down Expand Up @@ -1285,11 +1289,19 @@ SRE(match)(SRE_STATE* state, SRE_CODE* pattern, int toplevel)
ctx->ptr, ctx->pattern[1]));
if (ctx->ptr - (SRE_CHAR *)state->beginning >= (Py_ssize_t)ctx->pattern[1]) {
state->ptr = ctx->ptr - ctx->pattern[1];
LASTMARK_SAVE();
if (state->repeat)
MARK_PUSH(ctx->lastmark);
DO_JUMP0(JUMP_ASSERT_NOT, jump_assert_not, ctx->pattern+2);
if (ret) {
if (state->repeat)
MARK_POP_DISCARD(ctx->lastmark);
RETURN_ON_ERROR(ret);
RETURN_FAILURE;
}
if (state->repeat)
MARK_POP(ctx->lastmark);
LASTMARK_RESTORE();
}
ctx->pattern += ctx->pattern[0];
break;
Expand Down