|
13 | 13 | # XXX: show string offset and offending character for all errors
|
14 | 14 |
|
15 | 15 | from sre_constants import *
|
16 |
| -from ast import literal_eval |
| 16 | +import unicodedata |
17 | 17 |
|
18 | 18 | SPECIAL_CHARS = ".\\[{()*+?^$|"
|
19 | 19 | REPEAT_CHARS = "*+?{"
|
|
26 | 26 |
|
27 | 27 | WHITESPACE = frozenset(" \t\n\r\v\f")
|
28 | 28 |
|
29 |
| -UNICODE_NAME = ASCIILETTERS | DIGITS | frozenset(' -') |
30 |
| -CLOSING_BRACE = frozenset("}") |
31 |
| -OPENING_BRACE = frozenset("{") |
32 |
| - |
33 | 29 |
|
34 | 30 | _REPEATCODES = frozenset({MIN_REPEAT, MAX_REPEAT})
|
35 | 31 | _UNITCODES = frozenset({ANY, RANGE, IN, LITERAL, NOT_LITERAL, CATEGORY})
|
@@ -270,19 +266,19 @@ def getwhile(self, n, charset):
|
270 | 266 | result += c
|
271 | 267 | self.__next()
|
272 | 268 | return result
|
273 |
| - def getuntil(self, terminator): |
| 269 | + def getuntil(self, terminator, name): |
274 | 270 | result = ''
|
275 | 271 | while True:
|
276 | 272 | c = self.next
|
277 | 273 | self.__next()
|
278 | 274 | if c is None:
|
279 | 275 | if not result:
|
280 |
| - raise self.error("missing group name") |
| 276 | + raise self.error("missing " + name) |
281 | 277 | raise self.error("missing %s, unterminated name" % terminator,
|
282 | 278 | len(result))
|
283 | 279 | if c == terminator:
|
284 | 280 | if not result:
|
285 |
| - raise self.error("missing group name", 1) |
| 281 | + raise self.error("missing " + name, 1) |
286 | 282 | break
|
287 | 283 | result += c
|
288 | 284 | return result
|
@@ -330,14 +326,14 @@ def _class_escape(source, escape):
|
330 | 326 | return LITERAL, c
|
331 | 327 | elif c == "N" and source.istext:
|
332 | 328 | # named unicode escape e.g. \N{EM DASH}
|
333 |
| - escape += source.getwhile(1, OPENING_BRACE) |
334 |
| - escape += source.getwhile(100, UNICODE_NAME) |
335 |
| - escape += source.getwhile(1, CLOSING_BRACE) |
| 329 | + if not source.match('{'): |
| 330 | + raise source.error("missing {") |
| 331 | + charname = source.getuntil('}', 'character name') |
336 | 332 | try:
|
337 |
| - c = ord(literal_eval('"%s"' % escape)) |
338 |
| - except SyntaxError: |
339 |
| - charname = escape[2:].strip('{}') |
340 |
| - raise source.error("unknown Unicode character name %s" % charname, len(escape)) |
| 333 | + c = ord(unicodedata.lookup(charname)) |
| 334 | + except KeyError: |
| 335 | + raise source.error("undefined character name %r" % charname, |
| 336 | + len(charname) + len(r'\N{}')) |
341 | 337 | return LITERAL, c
|
342 | 338 | elif c in OCTDIGITS:
|
343 | 339 | # octal escape (up to three digits)
|
@@ -389,14 +385,14 @@ def _escape(source, escape, state):
|
389 | 385 | return LITERAL, c
|
390 | 386 | elif c == "N" and source.istext:
|
391 | 387 | # named unicode escape e.g. \N{EM DASH}
|
392 |
| - escape += source.getwhile(1, OPENING_BRACE) |
393 |
| - escape += source.getwhile(100, UNICODE_NAME) |
394 |
| - escape += source.getwhile(1, CLOSING_BRACE) |
| 388 | + if not source.match('{'): |
| 389 | + raise source.error("missing {") |
| 390 | + charname = source.getuntil('}', 'character name') |
395 | 391 | try:
|
396 |
| - c = ord(literal_eval('"%s"' % escape)) |
397 |
| - except SyntaxError: |
398 |
| - charname = escape[2:].strip('{}') |
399 |
| - raise source.error("unknown Unicode character name %s" % charname, len(escape)) |
| 392 | + c = ord(unicodedata.lookup(charname)) |
| 393 | + except KeyError: |
| 394 | + raise source.error("undefined character name %r" % charname, |
| 395 | + len(charname) + len(r'\N{}')) |
400 | 396 | return LITERAL, c
|
401 | 397 | elif c == "0":
|
402 | 398 | # octal escape
|
@@ -707,13 +703,13 @@ def _parse(source, state, verbose, nested, first=False):
|
707 | 703 | # python extensions
|
708 | 704 | if sourcematch("<"):
|
709 | 705 | # named group: skip forward to end of name
|
710 |
| - name = source.getuntil(">") |
| 706 | + name = source.getuntil(">", "group name") |
711 | 707 | if not name.isidentifier():
|
712 | 708 | msg = "bad character in group name %r" % name
|
713 | 709 | raise source.error(msg, len(name) + 1)
|
714 | 710 | elif sourcematch("="):
|
715 | 711 | # named backreference
|
716 |
| - name = source.getuntil(")") |
| 712 | + name = source.getuntil(")", "group name") |
717 | 713 | if not name.isidentifier():
|
718 | 714 | msg = "bad character in group name %r" % name
|
719 | 715 | raise source.error(msg, len(name) + 1)
|
@@ -776,7 +772,7 @@ def _parse(source, state, verbose, nested, first=False):
|
776 | 772 |
|
777 | 773 | elif char == "(":
|
778 | 774 | # conditional backreference group
|
779 |
| - condname = source.getuntil(")") |
| 775 | + condname = source.getuntil(")", "group name") |
780 | 776 | if condname.isidentifier():
|
781 | 777 | condgroup = state.groupdict.get(condname)
|
782 | 778 | if condgroup is None:
|
@@ -1005,7 +1001,7 @@ def addgroup(index, pos):
|
1005 | 1001 | name = ""
|
1006 | 1002 | if not s.match("<"):
|
1007 | 1003 | raise s.error("missing <")
|
1008 |
| - name = s.getuntil(">") |
| 1004 | + name = s.getuntil(">", "group name") |
1009 | 1005 | if name.isidentifier():
|
1010 | 1006 | try:
|
1011 | 1007 | index = groupindex[name]
|
|
0 commit comments