Skip to content

bpo-29995: re.escape() now escapes only special characters. #1007

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 6 commits into from
Apr 13, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 7 additions & 3 deletions Doc/library/re.rst
Original file line number Diff line number Diff line change
Expand Up @@ -786,7 +786,7 @@ form.

.. function:: escape(pattern)

Escape all the characters in *pattern* except ASCII letters, numbers and ``'_'``.
Escape special characters in *pattern*.
This is useful if you want to match an arbitrary literal string that may
have regular expression metacharacters in it. For example::

Expand All @@ -795,15 +795,19 @@ form.

>>> legal_chars = string.ascii_lowercase + string.digits + "!#$%&'*+-.^_`|~:"
>>> print('[%s]+' % re.escape(legal_chars))
[abcdefghijklmnopqrstuvwxyz0123456789\!\#\$\%\&\'\*\+\-\.\^_\`\|\~\:]+
[abcdefghijklmnopqrstuvwxyz0123456789!\#\$%&'\*\+\-\.\^_`\|~:]+

>>> operators = ['+', '-', '*', '/', '**']
>>> print('|'.join(map(re.escape, sorted(operators, reverse=True))))
\/|\-|\+|\*\*|\*
/|\-|\+|\*\*|\*

.. versionchanged:: 3.3
The ``'_'`` character is no longer escaped.

.. versionchanged:: 3.7
Only characters that can have special meaning in a regular expression
are escaped.


.. function:: purge()

Expand Down
2 changes: 1 addition & 1 deletion Doc/tools/susp-ignored.csv
Original file line number Diff line number Diff line change
Expand Up @@ -303,7 +303,7 @@ whatsnew/3.2,,:gz,">>> with tarfile.open(name='myarchive.tar.gz', mode='w:gz') a
whatsnew/3.2,,:location,zope9-location = ${zope9:location}
whatsnew/3.2,,:prefix,zope-conf = ${custom:prefix}/etc/zope.conf
library/re,,`,!#$%&'*+-.^_`|~:
library/re,,`,\!\#\$\%\&\'\*\+\-\.\^_\`\|\~\:
library/re,,`,!\#\$%&'\*\+\-\.\^_`\|~:
library/tarfile,,:xz,'x:xz'
library/xml.etree.elementtree,,:sometag,prefix:sometag
library/xml.etree.elementtree,,:fictional,"<actors xmlns:fictional=""http://characters.example.com"""
Expand Down
4 changes: 2 additions & 2 deletions Lib/idlelib/idle_test/test_replace.py
Original file line number Diff line number Diff line change
Expand Up @@ -221,8 +221,8 @@ def test_replace_regex(self):
self.assertIn('Invalid Replace Expression', showerror.message)

# test access method
self.engine.setcookedpat("\'")
equal(pv.get(), "\\'")
self.engine.setcookedpat("?")
equal(pv.get(), "\\?")

def test_replace_backwards(self):
equal = self.assertEqual
Expand Down
36 changes: 9 additions & 27 deletions Lib/re.py
Original file line number Diff line number Diff line change
Expand Up @@ -241,39 +241,21 @@ def template(pattern, flags=0):
"Compile a template pattern, returning a pattern object"
return _compile(pattern, flags|T)

_alphanum_str = frozenset(
"_abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ01234567890")
_alphanum_bytes = frozenset(
b"_abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ01234567890")
# SPECIAL_CHARS
# closing ')', '}' and ']'
# '-' (a range in character set)
# '#' (comment) and WHITESPACE (ignored) in verbose mode
Copy link

@bz2 bz2 Apr 11, 2017

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For my own reference, re.VERBOSE only looks at ASCII whitespace:

>>> re.compile("a \f\n\r\v\t\u3000", re.DEBUG|re.VERBOSE)
LITERAL 97
LITERAL 12288
re.compile('a \x0c\n\r\x0b\t\u3000', re.VERBOSE|re.DEBUG)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

SPECIAL_CHARS and WHITESPACE are constants in the sre_parse module. WHITESPACE contains only ASCII whitespace.

_special_chars_map = {i: '\\' + chr(i) for i in b'()[]{}?*+-|^$\\.# \t\n\r\v\f'}

def escape(pattern):
"""
Escape all the characters in pattern except ASCII letters, numbers and '_'.
Escape special characters in a string.
"""
if isinstance(pattern, str):
alphanum = _alphanum_str
s = list(pattern)
for i, c in enumerate(pattern):
if c not in alphanum:
if c == "\000":
s[i] = "\\000"
else:
s[i] = "\\" + c
return "".join(s)
return pattern.translate(_special_chars_map)
else:
alphanum = _alphanum_bytes
s = []
esc = ord(b"\\")
for c in pattern:
if c in alphanum:
s.append(c)
else:
if c == 0:
s.extend(b"\\000")
else:
s.append(esc)
s.append(c)
return bytes(s)
pattern = str(pattern, 'latin1')
return pattern.translate(_special_chars_map).encode('latin1')

# --------------------------------------------------------------------
# internals
Expand Down
37 changes: 19 additions & 18 deletions Lib/test/test_re.py
Original file line number Diff line number Diff line change
Expand Up @@ -904,7 +904,7 @@ def test_search_coverage(self):
self.assertEqual(re.search(r"a\s", "a ").group(0), "a ")

def assertMatch(self, pattern, text, match=None, span=None,
matcher=re.match):
matcher=re.fullmatch):
if match is None and span is None:
# the pattern matches the whole text
match = text
Expand All @@ -917,45 +917,46 @@ def assertMatch(self, pattern, text, match=None, span=None,
self.assertEqual(m.group(), match)
self.assertEqual(m.span(), span)

LITERAL_CHARS = string.ascii_letters + string.digits + '!"%&\',/:;<=>@_`~'

def test_re_escape(self):
alnum_chars = string.ascii_letters + string.digits + '_'
p = ''.join(chr(i) for i in range(256))
for c in p:
if c in alnum_chars:
self.assertEqual(re.escape(c), c)
elif c == '\x00':
self.assertEqual(re.escape(c), '\\000')
else:
self.assertEqual(re.escape(c), '\\' + c)
self.assertMatch(re.escape(c), c)
self.assertMatch('[' + re.escape(c) + ']', c)
self.assertMatch('(?x)' + re.escape(c), c)
self.assertMatch(re.escape(p), p)
for c in '-.]{}':
self.assertEqual(re.escape(c)[:1], '\\')
literal_chars = self.LITERAL_CHARS
self.assertEqual(re.escape(literal_chars), literal_chars)

def test_re_escape_byte(self):
alnum_chars = (string.ascii_letters + string.digits + '_').encode('ascii')
def test_re_escape_bytes(self):
p = bytes(range(256))
for i in p:
b = bytes([i])
if b in alnum_chars:
self.assertEqual(re.escape(b), b)
elif i == 0:
self.assertEqual(re.escape(b), b'\\000')
else:
self.assertEqual(re.escape(b), b'\\' + b)
self.assertMatch(re.escape(b), b)
self.assertMatch(b'[' + re.escape(b) + b']', b)
self.assertMatch(b'(?x)' + re.escape(b), b)
self.assertMatch(re.escape(p), p)
for i in b'-.]{}':
b = bytes([i])
self.assertEqual(re.escape(b)[:1], b'\\')
literal_chars = self.LITERAL_CHARS.encode('ascii')
self.assertEqual(re.escape(literal_chars), literal_chars)

def test_re_escape_non_ascii(self):
s = 'xxx\u2620\u2620\u2620xxx'
s_escaped = re.escape(s)
self.assertEqual(s_escaped, 'xxx\\\u2620\\\u2620\\\u2620xxx')
self.assertEqual(s_escaped, s)
self.assertMatch(s_escaped, s)
self.assertMatch('.%s+.' % re.escape('\u2620'), s,
'x\u2620\u2620\u2620x', (2, 7), re.search)

def test_re_escape_non_ascii_bytes(self):
b = 'y\u2620y\u2620y'.encode('utf-8')
b_escaped = re.escape(b)
self.assertEqual(b_escaped, b'y\\\xe2\\\x98\\\xa0y\\\xe2\\\x98\\\xa0y')
self.assertEqual(b_escaped, b)
self.assertMatch(b_escaped, b)
res = re.findall(re.escape('\u2620'.encode('utf-8')), b)
self.assertEqual(len(res), 2)
Expand Down
2 changes: 2 additions & 0 deletions Misc/NEWS
Original file line number Diff line number Diff line change
Expand Up @@ -320,6 +320,8 @@ Library
- bpo-29998: Pickling and copying ImportError now preserves name and path
attributes.

- bpo-29995: re.escape() now escapes only regex special characters.

- bpo-29962: Add math.remainder operation, implementing remainder
as specified in IEEE 754.

Expand Down