Skip to content

bpo-30688: Support \N{name} escapes in re patterns. #5588

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 8 commits into from
Feb 9, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 6 additions & 3 deletions Doc/library/re.rst
Original file line number Diff line number Diff line change
Expand Up @@ -468,13 +468,13 @@ Most of the standard escapes supported by Python string literals are also
accepted by the regular expression parser::

\a \b \f \n
\r \t \u \U
\v \x \\
\N \r \t \u
\U \v \x \\

(Note that ``\b`` is used to represent word boundaries, and means "backspace"
only inside character classes.)

``'\u'`` and ``'\U'`` escape sequences are only recognized in Unicode
``'\u'``, ``'\U'``, and ``'\N'`` escape sequences are only recognized in Unicode
patterns. In bytes patterns they are errors.

Octal escapes are included in a limited form. If the first digit is a 0, or if
Expand All @@ -488,6 +488,9 @@ three digits in length.
.. versionchanged:: 3.6
Unknown escapes consisting of ``'\'`` and an ASCII letter are now errors.

.. versionchanged:: 3.8
The ``'\N{name}'`` escape sequence has been added. As in string literals,
it expands to the named Unicode character (e.g. ``'\N{EM DASH}'``).

.. seealso::

Expand Down
2 changes: 2 additions & 0 deletions Doc/whatsnew/3.8.rst
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,8 @@ New Features
Other Language Changes
======================

* Added support for ``\N{name}`` escapes in :mod:`regular expressions <re>`.
(Contributed by Jonathan Eunice and Serhiy Storchaka in :issue:`30688`.)


New Modules
Expand Down
37 changes: 30 additions & 7 deletions Lib/sre_parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
# XXX: show string offset and offending character for all errors

from sre_constants import *
import unicodedata

SPECIAL_CHARS = ".\\[{()*+?^$|"
REPEAT_CHARS = "*+?{"
Expand Down Expand Up @@ -264,19 +265,19 @@ def getwhile(self, n, charset):
result += c
self.__next()
return result
def getuntil(self, terminator):
def getuntil(self, terminator, name):
result = ''
while True:
c = self.next
self.__next()
if c is None:
if not result:
raise self.error("missing group name")
raise self.error("missing " + name)
raise self.error("missing %s, unterminated name" % terminator,
len(result))
if c == terminator:
if not result:
raise self.error("missing group name", 1)
raise self.error("missing " + name, 1)
break
result += c
return result
Expand Down Expand Up @@ -322,6 +323,17 @@ def _class_escape(source, escape):
c = int(escape[2:], 16)
chr(c) # raise ValueError for invalid code
return LITERAL, c
elif c == "N" and source.istext:
# named unicode escape e.g. \N{EM DASH}
if not source.match('{'):
raise source.error("missing {")
charname = source.getuntil('}', 'character name')
try:
c = ord(unicodedata.lookup(charname))
except KeyError:
raise source.error("undefined character name %r" % charname,
len(charname) + len(r'\N{}'))
return LITERAL, c
elif c in OCTDIGITS:
# octal escape (up to three digits)
escape += source.getwhile(2, OCTDIGITS)
Expand Down Expand Up @@ -370,6 +382,17 @@ def _escape(source, escape, state):
c = int(escape[2:], 16)
chr(c) # raise ValueError for invalid code
return LITERAL, c
elif c == "N" and source.istext:
# named unicode escape e.g. \N{EM DASH}
if not source.match('{'):
raise source.error("missing {")
charname = source.getuntil('}', 'character name')
try:
c = ord(unicodedata.lookup(charname))
except KeyError:
raise source.error("undefined character name %r" % charname,
len(charname) + len(r'\N{}'))
return LITERAL, c
elif c == "0":
# octal escape
escape += source.getwhile(2, OCTDIGITS)
Expand Down Expand Up @@ -679,13 +702,13 @@ def _parse(source, state, verbose, nested, first=False):
# python extensions
if sourcematch("<"):
# named group: skip forward to end of name
name = source.getuntil(">")
name = source.getuntil(">", "group name")
if not name.isidentifier():
msg = "bad character in group name %r" % name
raise source.error(msg, len(name) + 1)
elif sourcematch("="):
# named backreference
name = source.getuntil(")")
name = source.getuntil(")", "group name")
if not name.isidentifier():
msg = "bad character in group name %r" % name
raise source.error(msg, len(name) + 1)
Expand Down Expand Up @@ -748,7 +771,7 @@ def _parse(source, state, verbose, nested, first=False):

elif char == "(":
# conditional backreference group
condname = source.getuntil(")")
condname = source.getuntil(")", "group name")
if condname.isidentifier():
condgroup = state.groupdict.get(condname)
if condgroup is None:
Expand Down Expand Up @@ -977,7 +1000,7 @@ def addgroup(index, pos):
name = ""
if not s.match("<"):
raise s.error("missing <")
name = s.getuntil(">")
name = s.getuntil(">", "group name")
if name.isidentifier():
try:
index = groupindex[name]
Expand Down
36 changes: 36 additions & 0 deletions Lib/test/test_re.py
Original file line number Diff line number Diff line change
Expand Up @@ -694,6 +694,42 @@ def test_other_escapes(self):
with self.subTest(c):
self.assertRaises(re.error, re.compile, '[\\%c]' % c)

def test_named_unicode_escapes(self):
    # Each entry is (pattern, subject, should_match).  The patterns use
    # \N{name} both as a standalone atom and as the endpoints of a
    # character-class range.
    less_to_greater = r'[\N{LESS-THAN SIGN}-\N{GREATER-THAN SIGN}]'
    match_cases = [
        (r'\N{LESS-THAN SIGN}', '<', True),
        (r'\N{less-than sign}', '<', True),
        (r'\N{LESS-THAN SIGN}', '>', False),
        (r'\N{SNAKE}', '\U0001f40d', True),
        (r'\N{ARABIC LIGATURE UIGHUR KIRGHIZ YEH WITH '
         r'HAMZA ABOVE WITH ALEF MAKSURA ISOLATED FORM}', '\ufbf9', True),
        (less_to_greater, '=', True),
        (less_to_greater, ';', False),
    ]
    for pattern, subject, should_match in match_cases:
        found = re.match(pattern, subject)
        if should_match:
            self.assertTrue(found)
        else:
            self.assertIsNone(found)

    # Each entry is (pattern, expected error message, expected position),
    # passed straight through to checkPatternError.  The last two entries
    # are bytes patterns, where \N is reported as a bad escape.
    error_cases = [
        (r'\N', 'missing {', 2),
        (r'[\N]', 'missing {', 3),
        (r'\N{', 'missing character name', 3),
        (r'[\N{', 'missing character name', 4),
        (r'\N{}', 'missing character name', 3),
        (r'[\N{}]', 'missing character name', 4),
        (r'\NSNAKE}', 'missing {', 2),
        (r'[\NSNAKE}]', 'missing {', 3),
        (r'\N{SNAKE', 'missing }, unterminated name', 3),
        (r'[\N{SNAKE]', 'missing }, unterminated name', 4),
        (r'[\N{SNAKE]}', "undefined character name 'SNAKE]'", 1),
        (r'\N{SPAM}', "undefined character name 'SPAM'", 0),
        (r'[\N{SPAM}]', "undefined character name 'SPAM'", 1),
        (br'\N{LESS-THAN SIGN}', r'bad escape \N', 0),
        (br'[\N{LESS-THAN SIGN}]', r'bad escape \N', 1),
    ]
    for pattern, message, position in error_cases:
        self.checkPatternError(pattern, message, position)

def test_string_boundaries(self):
# See http://bugs.python.org/issue10713
self.assertEqual(re.search(r"\b(abc)\b", "abc").group(1),
Expand Down
1 change: 1 addition & 0 deletions Misc/ACKS
Original file line number Diff line number Diff line change
Expand Up @@ -440,6 +440,7 @@ Andy Eskilsson
André Espaze
Stefan Esser
Nicolas Estibals
Jonathan Eunice
Carey Evans
Stephen D Evans
Tim Everett
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
Added support for ``\N{name}`` escapes in regular expressions. Based on a
patch by Jonathan Eunice.