Skip to content

[2.7] bpo-35859: re module, fix four bugs about capturing groups (GH-11756) #12134

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 5 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
109 changes: 109 additions & 0 deletions Lib/test/test_re.py
Original file line number Diff line number Diff line change
Expand Up @@ -1194,6 +1194,115 @@ def check_en_US_utf8(self):
self.assertIsNone(re.match(b'(?Li)\xc5', b'\xe5'))
self.assertIsNone(re.match(b'(?Li)\xe5', b'\xc5'))

def test_bug_35859(self):
# four bugs about capturing groups

# ============================================
# need to reset capturing groups in SRE_SEARCH
# ============================================
# found in issue34294
s = "a\tx"
p = r"\b(?=(\t)|(x))x"
self.assertEqual(re.search(p, s).groups(), (None, 'x'))

# =============================
# macro MARK_PUSH(lastmark) bug
# =============================
# reported in issue35859
self.assertEqual(re.match(r'(ab|a)*?b', 'ab').groups(), ('a',))
self.assertEqual(re.match(r'(ab|a)+?b', 'ab').groups(), ('a',))
self.assertEqual(re.match(r'(ab|a){0,2}?b', 'ab').groups(), ('a',))
self.assertEqual(re.match(r'(.b|a)*?b', 'ab').groups(), ('a',))

# =======================================================
# JUMP_MIN_UNTIL_3 should LASTMARK_SAVE() and MARK_PUSH()
# =======================================================

# 1, triggered by SRE_OP_REPEAT_ONE, b? in this pattern
self.assertEqual(re.match(r'(ab?)*?b', 'ab').groups(), ('a',))

# 2, triggered by SRE_OP_MIN_REPEAT_ONE, a*? in this pattern
s = 'axxzaz'
p = r'(?:a*?(xx)??z)*'
self.assertEqual(re.match(p, s).groups(), ('xx',))

# 3, triggered by SRE_OP_MIN_UNTIL (JUMP_MIN_UNTIL_2)
# (?:a|bc)*? in this pattern
s = 'axxzbcz'
p = r'(?:(?:a|bc)*?(xx)??z)*'
self.assertEqual(re.match(p, s).groups(), ('xx',))
# test-case provided by issue9134
s = 'xtcxyzxc'
p = r'((x|yz)+?(t)??c)*'
self.assertEqual(re.match(p, s).groups(), ('xyzxc', 'x', 't'))

# ======================================================
# JUMP_ASSERT_NOT should LASTMARK_SAVE() and MARK_PUSH()
# ======================================================
# reported in issue725149
# negative assertion
self.assertEqual(re.match(r'(?!(..)c)', 'ab').groups(), (None,))
# negative assertion in a repeat
self.assertEqual(re.match(r'(?:(?!(ab)c).)*', 'ab').groups(), (None,))
self.assertEqual(re.match(r'((?!(bc)d)(.))*', 'abc').groups(),
('c', None, 'c'))

# =============================================================
# below asserts didn't fail before fix, just prevent regression
# =============================================================

# 1, why JUMP_MIN_REPEAT_ONE should LASTMARK_SAVE()
# .?? in this pattern
m = re.match(r'.??(?=(a)?)b', 'ab')
self.assertEqual(m.span(), (0, 2))
self.assertEqual(m.groups(), (None,))
# put in a repeat
m = re.match(r'(?:.??(?=(a)?)b)*', 'abab')
self.assertEqual(m.span(), (0, 4))
self.assertEqual(m.groups(), (None,))

# 2, why JUMP_MIN_UNTIL_2 should LASTMARK_SAVE()
# (?:..)?? in this pattern
m = re.match(r'(?:..)??(?=(aa)?)bb', 'aabb')
self.assertEqual(m.span(), (0, 4))
self.assertEqual(m.groups(), (None,))
# put in a repeat
m = re.match(r'(?:(?:..)??(?=(aa)?)bb)*', 'aabbaabb')
self.assertEqual(m.span(), (0, 8))
self.assertEqual(m.groups(), (None,))

# 3, why JUMP_REPEAT_ONE_1 should LASTMARK_SAVE()
# .* in this pattern, tail starts with a literal.
self.assertEqual(re.match(r'.*x(?=(b)?)a', 'xaxb').groups(), (None,))

# 4, why JUMP_REPEAT_ONE_2 should LASTMARK_SAVE()
# .* in this pattern, tail is general case
self.assertEqual(re.match(r'.*(?=(b)?)a', 'ab').groups(), (None,))

# 5, demonstrate that JUMP_MAX_UNTIL_3 doesn't need LASTMARK_SAVE()
# this pattern is similar to 4
self.assertEqual(re.match(r'(.)*(?=(b)?)a', 'ab').groups(),
(None, None))
self.assertEqual(re.match(r'(.){0}(?=(b)?)a', 'ab').groups(),
(None, None))

# 6, positive assertion in a repeat
# strictly speaking, this is a bug, the correct result should be
# (None,), but it's very hard to fix with the current fundamental
# implementation of sre.
# PHP 7.3.2, Java 11.0.2, Ruby 2.6.1, and the third-party module
# regex 2019.2.21, return ('a',) as well.
# Perl 5.26.1, Node.js 10.15.1, return the correct result (None,)
# Go 1.12, Rust 1.32.0, don't support lookaround yet.
self.assertEqual(re.match(r'(?:(?=(a)?).)*', 'ab').groups(), ('a',))

# 7, negative assertion
# PHP 7.3.2, Ruby 2.6.1, Node.js 10.15.1, regex 2019.2.21 return
# (None,)
# Java 11.0.2, Perl 5.26.1, return ('b',)
# Go 1.12, Rust 1.32.0, don't support lookaround yet.
self.assertEqual(re.match(r'a*(?!(b))', 'ab').groups(), (None,))


def run_re_tests():
from test.re_tests import tests, SUCCEED, FAIL, SYNTAX_ERROR
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
re module, fix four bugs about capturing groups.
28 changes: 24 additions & 4 deletions Modules/_sre.c
Original file line number Diff line number Diff line change
Expand Up @@ -749,20 +749,20 @@ do { \
DATA_STACK_LOOKUP_AT(state,t,p,pos)

#define MARK_PUSH(lastmark) \
do if (lastmark > 0) { \
do if (lastmark >= 0) { \
i = lastmark; /* ctx->lastmark may change if reallocated */ \
DATA_STACK_PUSH(state, state->mark, (i+1)*sizeof(void*)); \
} while (0)
#define MARK_POP(lastmark) \
do if (lastmark > 0) { \
do if (lastmark >= 0) { \
DATA_STACK_POP(state, state->mark, (lastmark+1)*sizeof(void*), 1); \
} while (0)
#define MARK_POP_KEEP(lastmark) \
do if (lastmark > 0) { \
do if (lastmark >= 0) { \
DATA_STACK_POP(state, state->mark, (lastmark+1)*sizeof(void*), 0); \
} while (0)
#define MARK_POP_DISCARD(lastmark) \
do if (lastmark > 0) { \
do if (lastmark >= 0) { \
DATA_STACK_POP_DISCARD(state, (lastmark+1)*sizeof(void*)); \
} while (0)

Expand Down Expand Up @@ -1314,16 +1314,20 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
RETURN_FAILURE;

ctx->u.rep->count = ctx->count;
MARK_PUSH(ctx->lastmark); /* already LASTMARK_SAVE() above */
/* zero-width match protection */
DATA_PUSH(&ctx->u.rep->last_ptr);
ctx->u.rep->last_ptr = state->ptr;
DO_JUMP(JUMP_MIN_UNTIL_3,jump_min_until_3,
ctx->u.rep->pattern+3);
DATA_POP(&ctx->u.rep->last_ptr);
if (ret) {
MARK_POP_DISCARD(ctx->lastmark);
RETURN_ON_ERROR(ret);
RETURN_SUCCESS;
}
MARK_POP(ctx->lastmark);
LASTMARK_RESTORE();
ctx->u.rep->count = ctx->count-1;
state->ptr = ctx->ptr;
RETURN_FAILURE;
Expand Down Expand Up @@ -1419,11 +1423,19 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
ctx->ptr, ctx->pattern[1]));
if (ctx->ptr - (SRE_CHAR *)state->beginning >= (Py_ssize_t)ctx->pattern[1]) {
state->ptr = ctx->ptr - ctx->pattern[1];
LASTMARK_SAVE();
if (state->repeat)
MARK_PUSH(ctx->lastmark);
DO_JUMP(JUMP_ASSERT_NOT, jump_assert_not, ctx->pattern+2);
if (ret) {
if (state->repeat)
MARK_POP_DISCARD(ctx->lastmark);
RETURN_ON_ERROR(ret);
RETURN_FAILURE;
}
if (state->repeat)
MARK_POP(ctx->lastmark);
LASTMARK_RESTORE();
}
ctx->pattern += ctx->pattern[0];
break;
Expand Down Expand Up @@ -1497,6 +1509,10 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
return ret; /* should never get here */
}

/* need to reset capturing groups between two SRE(match) callings in loops */
#define RESET_CAPTURE_GROUP() \
do { state->lastmark = state->lastindex = -1; } while (0)

LOCAL(Py_ssize_t)
SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern)
{
Expand Down Expand Up @@ -1575,6 +1591,7 @@ SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern)
status = SRE_MATCH(state, pattern + 2*prefix_skip);
if (status != 0)
return status;
RESET_CAPTURE_GROUP();
/* close but no cigar -- try again */
i = overlap[i];
}
Expand Down Expand Up @@ -1605,6 +1622,7 @@ SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern)
status = SRE_MATCH(state, pattern + 2);
if (status != 0)
break;
RESET_CAPTURE_GROUP();
}
} else if (charset) {
/* pattern starts with a character from a known set */
Expand All @@ -1620,6 +1638,7 @@ SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern)
status = SRE_MATCH(state, pattern);
if (status != 0)
break;
RESET_CAPTURE_GROUP();
ptr++;
}
} else {
Expand All @@ -1631,6 +1650,7 @@ SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern)
status = SRE_MATCH(state, pattern);
if (status != 0 || ptr >= end)
break;
RESET_CAPTURE_GROUP();
ptr++;
}
}
Expand Down