Skip to content

gh-111259: Optimize complementary character sets in RE #120742

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 27 additions & 13 deletions Lib/re/_compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,8 @@
POSSESSIVE_REPEAT: (POSSESSIVE_REPEAT, SUCCESS, POSSESSIVE_REPEAT_ONE),
}

_CHARSET_ALL = [(NEGATE, None)]

def _combine_flags(flags, add_flags, del_flags,
TYPE_FLAGS=_parser.TYPE_FLAGS):
if add_flags & TYPE_FLAGS:
Expand Down Expand Up @@ -84,17 +86,22 @@ def _compile(code, pattern, flags):
code[skip] = _len(code) - skip
elif op is IN:
charset, hascased = _optimize_charset(av, iscased, tolower, fixes)
if flags & SRE_FLAG_IGNORECASE and flags & SRE_FLAG_LOCALE:
emit(IN_LOC_IGNORE)
elif not hascased:
emit(IN)
elif not fixes: # ascii
emit(IN_IGNORE)
if not charset:
emit(FAILURE)
elif charset == _CHARSET_ALL:
emit(ANY_ALL)
else:
emit(IN_UNI_IGNORE)
skip = _len(code); emit(0)
_compile_charset(charset, flags, code)
code[skip] = _len(code) - skip
if flags & SRE_FLAG_IGNORECASE and flags & SRE_FLAG_LOCALE:
emit(IN_LOC_IGNORE)
elif not hascased:
emit(IN)
elif not fixes: # ascii
emit(IN_IGNORE)
else:
emit(IN_UNI_IGNORE)
skip = _len(code); emit(0)
_compile_charset(charset, flags, code)
code[skip] = _len(code) - skip
elif op is ANY:
if flags & SRE_FLAG_DOTALL:
emit(ANY_ALL)
Expand Down Expand Up @@ -277,6 +284,10 @@ def _optimize_charset(charset, iscased=None, fixup=None, fixes=None):
charmap[i] = 1
elif op is NEGATE:
out.append((op, av))
elif op is CATEGORY and tail and (CATEGORY, CH_NEGATE[av]) in tail:
# Optimize [\s\S] etc.
out = [] if out else _CHARSET_ALL
return out, False
else:
tail.append((op, av))
except IndexError:
Expand Down Expand Up @@ -519,13 +530,18 @@ def _compile_info(code, pattern, flags):
# look for a literal prefix
prefix = []
prefix_skip = 0
charset = [] # not used
charset = None # not used
if not (flags & SRE_FLAG_IGNORECASE and flags & SRE_FLAG_LOCALE):
# look for literal prefix
prefix, prefix_skip, got_all = _get_literal_prefix(pattern, flags)
# if no prefix, look for charset prefix
if not prefix:
charset = _get_charset_prefix(pattern, flags)
if charset:
charset, hascased = _optimize_charset(charset)
assert not hascased
if charset == _CHARSET_ALL:
charset = None
## if prefix:
## print("*** PREFIX", prefix, prefix_skip)
## if charset:
Expand Down Expand Up @@ -560,8 +576,6 @@ def _compile_info(code, pattern, flags):
# generate overlap table
code.extend(_generate_overlap_table(prefix))
elif charset:
charset, hascased = _optimize_charset(charset)
assert not hascased
_compile_charset(charset, flags, code)
code[skip] = len(code) - skip

Expand Down
2 changes: 2 additions & 0 deletions Lib/re/_constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -206,6 +206,8 @@ def _makecodes(*names):
CATEGORY_NOT_LINEBREAK: CATEGORY_UNI_NOT_LINEBREAK
}

CH_NEGATE = dict(zip(CHCODES[::2] + CHCODES[1::2], CHCODES[1::2] + CHCODES[::2]))

# flags
SRE_FLAG_IGNORECASE = 2 # case insensitive
SRE_FLAG_LOCALE = 4 # honour system locale
Expand Down
18 changes: 18 additions & 0 deletions Lib/test/test_re.py
Original file line number Diff line number Diff line change
Expand Up @@ -2473,6 +2473,24 @@ def test_regression_gh94675(self):
def test_fail(self):
self.assertEqual(re.search(r'12(?!)|3', '123')[0], '3')

def test_character_set_any(self):
# The union of complementary character sets matches any character
# and is equivalent to "(?s:.)".
s = '1x\n'
for p in r'[\s\S]', r'[\d\D]', r'[\w\W]', r'[\S\s]', r'\s|\S':
with self.subTest(pattern=p):
self.assertEqual(re.findall(p, s), list(s))
self.assertEqual(re.fullmatch('(?:' + p + ')+', s).group(), s)

def test_character_set_none(self):
# Negation of the union of complementary character sets does not match
# any character.
s = '1x\n'
for p in r'[^\s\S]', r'[^\d\D]', r'[^\w\W]', r'[^\S\s]':
with self.subTest(pattern=p):
self.assertIsNone(re.search(p, s))
self.assertIsNone(re.search('(?s:.)' + p, s))


def get_debug_out(pat):
with captured_stdout() as out:
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
:mod:`re` now handles patterns like ``"[\s\S]"`` or ``"\s|\S"``, which match
any character, as efficiently as a dot with the ``DOTALL`` modifier
(``"(?s:.)"``).
Loading