-
-
Notifications
You must be signed in to change notification settings - Fork 32.3k
bpo-30688: support \N{name} escapes in re patterns #2261
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
5f72f7a
7fb2983
4db797b
1113472
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -13,6 +13,7 @@ | |
# XXX: show string offset and offending character for all errors | ||
|
||
from sre_constants import * | ||
from ast import literal_eval | ||
|
||
SPECIAL_CHARS = ".\\[{()*+?^$|" | ||
REPEAT_CHARS = "*+?{" | ||
|
@@ -25,6 +26,11 @@ | |
|
||
WHITESPACE = frozenset(" \t\n\r\v\f") | ||
|
||
UNICODE_NAME = ASCIILETTERS | DIGITS | frozenset(' -') | ||
CLOSING_BRACE = frozenset("}") | ||
OPENING_BRACE = frozenset("{") | ||
|
||
|
||
_REPEATCODES = frozenset({MIN_REPEAT, MAX_REPEAT}) | ||
_UNITCODES = frozenset({ANY, RANGE, IN, LITERAL, NOT_LITERAL, CATEGORY}) | ||
|
||
|
@@ -322,6 +328,17 @@ def _class_escape(source, escape): | |
c = int(escape[2:], 16) | ||
chr(c) # raise ValueError for invalid code | ||
return LITERAL, c | ||
elif c == "N" and source.istext: | ||
# named unicode escape e.g. \N{EM DASH} | ||
escape += source.getwhile(1, OPENING_BRACE) | ||
escape += source.getwhile(100, UNICODE_NAME) | ||
escape += source.getwhile(1, CLOSING_BRACE) | ||
try: | ||
c = ord(literal_eval('"%s"' % escape)) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I prefer using |
||
except SyntaxError: | ||
charname = escape[2:].strip('{}') | ||
raise source.error("unknown Unicode character name %s" % charname, len(escape)) | ||
return LITERAL, c | ||
elif c in OCTDIGITS: | ||
# octal escape (up to three digits) | ||
escape += source.getwhile(2, OCTDIGITS) | ||
|
@@ -370,6 +387,17 @@ def _escape(source, escape, state): | |
c = int(escape[2:], 16) | ||
chr(c) # raise ValueError for invalid code | ||
return LITERAL, c | ||
elif c == "N" and source.istext: | ||
# named unicode escape e.g. \N{EM DASH} | ||
escape += source.getwhile(1, OPENING_BRACE) | ||
escape += source.getwhile(100, UNICODE_NAME) | ||
escape += source.getwhile(1, CLOSING_BRACE) | ||
try: | ||
c = ord(literal_eval('"%s"' % escape)) | ||
except SyntaxError: | ||
charname = escape[2:].strip('{}') | ||
raise source.error("unknown Unicode character name %s" % charname, len(escape)) | ||
return LITERAL, c | ||
elif c == "0": | ||
# octal escape | ||
escape += source.getwhile(2, OCTDIGITS) | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -700,6 +700,39 @@ def test_other_escapes(self): | |
with self.subTest(c): | ||
self.assertRaises(re.error, re.compile, '[\\%c]' % c) | ||
|
||
def test_named_unicode_escapes(self): | ||
# test individual Unicode named escapes | ||
suites = [ | ||
[ # basic matches | ||
['\u2014', r'\u2014', '\N{EM DASH}', | ||
r'\N{EM DASH}'], # pattern | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Isn't the last case enough? |
||
['\u2014', '\N{EM DASH}', '—', '—and more'], # matches | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. It is hard to see the differences between the various dashes in a terminal. Use just |
||
['\u2015', '\N{EN DASH}'] # no match | ||
], | ||
[ # character set matches | ||
['[\u2014-\u2020]', r'[\u2014-\u2020]', | ||
'[\N{EM DASH}-\N{DAGGER}]', r'[\N{EM DASH}-\N{DAGGER}]', | ||
'[\u2014-\N{DAGGER}]', '[\N{EM DASH}-\u2020]',], # pattern | ||
['\u2014', '\N{EM DASH}', '—', '—and more', '\u2020', | ||
'\N{DAGGER}', '†', '\u2017', '\N{DOUBLE LOW LINE}'], | ||
['\u2011', '\N{EN DASH}', '\u2013', 'xyz', '\u2021'] | ||
], | ||
] | ||
|
||
for patterns, match_yes, match_no in suites: | ||
for pat in patterns: | ||
for target in match_yes: | ||
self.assertTrue(re.match(pat, target)) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Use […]. Actually, I think that the loops are not needed. It is enough to test just one case per pattern. |
||
for target in match_no: | ||
self.assertIsNone(re.match(pat, target)) | ||
|
||
# test errors in \N{name} handling - only valid names should pass | ||
badly_formed = [r'\N{BUBBA DASH}', r'\N{EM DASH', | ||
r'\NEM DASH}', r'\NOGGIN'] | ||
for bad in badly_formed: | ||
with self.assertRaises(re.error): | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Use |
||
re.compile(bad) | ||
|
||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Also add tests for |
||
def test_string_boundaries(self): | ||
# See http://bugs.python.org/issue10713 | ||
self.assertEqual(re.search(r"\b(abc)\b", "abc").group(1), | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
You could use a modified `getuntil()`. Just add one more parameter for specifying what is missing in error messages.