Skip to content

Commit 4738340

Browse files
committed
Implement PEP 3131. Add isidentifier to str.
1 parent 32c4ac0 commit 4738340

File tree

11 files changed

+152
-5
lines changed

11 files changed

+152
-5
lines changed

Doc/lib/libstdtypes.tex

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -653,6 +653,11 @@ \subsection{String Methods \label{string-methods}}
653653
For 8-bit strings, this method is locale-dependent.
654654
\end{methoddesc}
655655

656+
\begin{methoddesc}[str]{isidentifier}{}
657+
Return True if S is a valid identifier according
658+
to the language definition.
659+
\end{methoddesc}
660+
656661
\begin{methoddesc}[str]{islower}{}
657662
Return true if all cased characters in the string are lowercase and
658663
there is at least one cased character, false otherwise.

Include/errcode.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ extern "C" {
2929
#define E_EOFS 23 /* EOF in triple-quoted string */
3030
#define E_EOLS 24 /* EOL in single-quoted string */
3131
#define E_LINECONT 25 /* Unexpected characters after a line continuation */
32+
#define E_IDENTIFIER 26 /* Invalid characters in identifier */
3233

3334
#ifdef __cplusplus
3435
}

Include/unicodeobject.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -182,6 +182,7 @@ typedef PY_UNICODE_TYPE Py_UNICODE;
182182
# define PyUnicode_GetDefaultEncoding PyUnicodeUCS2_GetDefaultEncoding
183183
# define PyUnicode_GetMax PyUnicodeUCS2_GetMax
184184
# define PyUnicode_GetSize PyUnicodeUCS2_GetSize
185+
# define PyUnicode_IsIdentifier PyUnicodeUCS2_IsIdentifier
185186
# define PyUnicode_Join PyUnicodeUCS2_Join
186187
# define PyUnicode_Partition PyUnicodeUCS2_Partition
187188
# define PyUnicode_RPartition PyUnicodeUCS2_RPartition
@@ -268,6 +269,7 @@ typedef PY_UNICODE_TYPE Py_UNICODE;
268269
# define PyUnicode_GetDefaultEncoding PyUnicodeUCS4_GetDefaultEncoding
269270
# define PyUnicode_GetMax PyUnicodeUCS4_GetMax
270271
# define PyUnicode_GetSize PyUnicodeUCS4_GetSize
272+
# define PyUnicode_IsIdentifier PyUnicodeUCS4_IsIdentifier
271273
# define PyUnicode_Join PyUnicodeUCS4_Join
272274
# define PyUnicode_Partition PyUnicodeUCS4_Partition
273275
# define PyUnicode_RPartition PyUnicodeUCS4_RPartition
@@ -1250,6 +1252,10 @@ PyAPI_FUNC(int) PyUnicode_Contains(
12501252
PyObject *element /* Element string */
12511253
);
12521254

1255+
/* Checks whether argument is a valid identifier. */
1256+
1257+
PyAPI_FUNC(int) PyUnicode_IsIdentifier(PyObject *s);
1258+
12531259
/* Externally visible for str.strip(unicode) */
12541260
PyAPI_FUNC(PyObject *) _PyUnicode_XStrip(
12551261
PyUnicodeObject *self,

Lib/test/badsyntax_3131.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
# -*- coding: utf-8 -*-
2+
= 2

Lib/test/test_pep3131.py

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
# -*- coding: utf-8 -*-
2+
import unittest
3+
from test import test_support
4+
5+
class PEP3131Test(unittest.TestCase):
6+
7+
def test_valid(self):
8+
class T:
9+
ä = 1
10+
µ = 2 # this is a compatibility character
11+
= 3
12+
self.assertEquals(getattr(T, "\xe4"), 1)
13+
self.assertEquals(getattr(T, "\u03bc"), 2)
14+
self.assertEquals(getattr(T, '\u87d2'), 3)
15+
16+
def test_invalid(self):
17+
try:
18+
from test import badsyntax_3131
19+
except SyntaxError as s:
20+
self.assertEquals(str(s),
21+
"invalid character in identifier (badsyntax_3131.py, line 2)")
22+
else:
23+
self.fail("expected exception didn't occur")
24+
25+
def test_main():
    """Run the PEP 3131 test case through test_support's runner."""
    cases = (PEP3131Test,)
    test_support.run_unittest(*cases)


# Allow running this test file directly as a script.
if "__main__" == __name__:
    test_main()

Lib/test/test_unicode.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -313,6 +313,19 @@ def test_isnumeric(self):
313313

314314
self.assertRaises(TypeError, "abc".isnumeric, 42)
315315

316+
def test_isidentifier(self):
317+
self.assertTrue("a".isidentifier())
318+
self.assertTrue("Z".isidentifier())
319+
self.assertTrue("_".isidentifier())
320+
self.assertTrue("b0".isidentifier())
321+
self.assertTrue("bc".isidentifier())
322+
self.assertTrue("b_".isidentifier())
323+
self.assertTrue("µ".isidentifier())
324+
325+
self.assertFalse(" ".isidentifier())
326+
self.assertFalse("[".isidentifier())
327+
self.assertFalse("©".isidentifier())
328+
316329
def test_contains(self):
317330
# Testing Unicode contains method
318331
self.assert_('a' in 'abdb')

Misc/NEWS

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,8 @@ TO DO
2626
Core and Builtins
2727
-----------------
2828

29+
- PEP 3131: Support non-ASCII identifiers.
30+
2931
- PEP 3120: Change default encoding to UTF-8.
3032

3133
- PEP 3123: Use proper C inheritance for PyObject.

Objects/unicodeobject.c

Lines changed: 44 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -227,7 +227,8 @@ int unicode_resize(register PyUnicodeObject *unicode,
227227
}
228228

229229
/* We allocate one more byte to make sure the string is
230-
Ux0000 terminated -- XXX is this needed ?
230+
Ux0000 terminated; some code (e.g. new_identifier)
231+
relies on that.
231232
232233
XXX This allocator could further be enhanced by assuring that the
233234
free list never reduces its size below 1.
@@ -6679,6 +6680,47 @@ unicode_isnumeric(PyUnicodeObject *self)
66796680
return PyBool_FromLong(1);
66806681
}
66816682

6683+
int
6684+
PyUnicode_IsIdentifier(PyObject *self)
6685+
{
6686+
register const Py_UNICODE *p = PyUnicode_AS_UNICODE((PyUnicodeObject*)self);
6687+
register const Py_UNICODE *e;
6688+
6689+
/* Special case for empty strings */
6690+
if (PyUnicode_GET_SIZE(self) == 0)
6691+
return 0;
6692+
6693+
/* PEP 3131 says that the first character must be in
6694+
XID_Start and subsequent characters in XID_Continue,
6695+
and for the ASCII range, the 2.x rules apply (i.e
6696+
start with letters and underscore, continue with
6697+
letters, digits, underscore). However, given the current
6698+
definition of XID_Start and XID_Continue, it is sufficient
6699+
to check just for these, except that _ must be allowed
6700+
as starting an identifier. */
6701+
if (!_PyUnicode_IsXidStart(*p) && *p != 0x5F /* LOW LINE */)
6702+
return 0;
6703+
6704+
e = p + PyUnicode_GET_SIZE(self);
6705+
for (p++; p < e; p++) {
6706+
if (!_PyUnicode_IsXidContinue(*p))
6707+
return 0;
6708+
}
6709+
return 1;
6710+
}
6711+
6712+
PyDoc_STRVAR(isidentifier__doc__,
6713+
"S.isidentifier() -> bool\n\
6714+
\n\
6715+
Return True if S is a valid identifier according\n\
6716+
to the language definition.");
6717+
6718+
/* Implementation of the isidentifier() method: converts the 0/1 result
   of PyUnicode_IsIdentifier() into a bool object. */
static PyObject*
unicode_isidentifier(PyObject *self)
{
    int valid = PyUnicode_IsIdentifier(self);

    return PyBool_FromLong(valid);
}
6723+
66826724
PyDoc_STRVAR(join__doc__,
66836725
"S.join(sequence) -> unicode\n\
66846726
\n\
@@ -7714,6 +7756,7 @@ static PyMethodDef unicode_methods[] = {
77147756
{"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
77157757
{"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
77167758
{"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
7759+
{"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
77177760
{"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
77187761
#if 0
77197762
{"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},

Parser/tokenizer.c

Lines changed: 26 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -21,13 +21,15 @@
2121
/* True if `c` may begin an identifier token: an ASCII letter, an
   underscore, or any value >= 128.  The argument is parenthesized so
   that expressions such as `x & 0x7f` are classified correctly; it is
   evaluated more than once, so it must have no side effects. */
#define is_potential_identifier_start(c) (\
              ((c) >= 'a' && (c) <= 'z')\
               || ((c) >= 'A' && (c) <= 'Z')\
               || (c) == '_'\
               || ((c) >= 128))
2526

2627
/* True if `c` may continue an identifier token: an ASCII letter or
   digit, an underscore, or any value >= 128.  The argument is
   parenthesized so that expressions such as `x & 0x7f` are classified
   correctly; it is evaluated more than once, so it must have no side
   effects. */
#define is_potential_identifier_char(c) (\
              ((c) >= 'a' && (c) <= 'z')\
               || ((c) >= 'A' && (c) <= 'Z')\
               || ((c) >= '0' && (c) <= '9')\
               || (c) == '_'\
               || ((c) >= 128))
3133

3234
extern char *PyOS_Readline(FILE *, FILE *, char *);
3335
/* Return malloc'ed string including trailing \n;
@@ -1070,14 +1072,27 @@ indenterror(struct tok_state *tok)
10701072
return 0;
10711073
}
10721074

1075+
#ifdef PGEN
1076+
#define verify_identifier(s,e) 1
1077+
#else
1078+
/* Verify that the identifier follows PEP 3131. */
1079+
static int
1080+
verify_identifier(char *start, char *end)
1081+
{
1082+
PyObject *s = PyUnicode_DecodeUTF8(start, end-start, NULL);
1083+
int result = PyUnicode_IsIdentifier(s);
1084+
Py_DECREF(s);
1085+
return result;
1086+
}
1087+
#endif
10731088

10741089
/* Get next token, after space stripping etc. */
10751090

10761091
static int
10771092
tok_get(register struct tok_state *tok, char **p_start, char **p_end)
10781093
{
10791094
register int c;
1080-
int blankline;
1095+
int blankline, nonascii;
10811096

10821097
*p_start = *p_end = NULL;
10831098
nextline:
@@ -1195,6 +1210,7 @@ tok_get(register struct tok_state *tok, char **p_start, char **p_end)
11951210
}
11961211

11971212
/* Identifier (most frequent token!) */
1213+
nonascii = 0;
11981214
if (is_potential_identifier_start(c)) {
11991215
/* Process r"", u"" and ur"" */
12001216
switch (c) {
@@ -1214,9 +1230,16 @@ tok_get(register struct tok_state *tok, char **p_start, char **p_end)
12141230
break;
12151231
}
12161232
while (is_potential_identifier_char(c)) {
1233+
if (c >= 128)
1234+
nonascii = 1;
12171235
c = tok_nextc(tok);
12181236
}
12191237
tok_backup(tok, c);
1238+
if (nonascii &&
1239+
!verify_identifier(tok->start, tok->cur)) {
1240+
tok->done = E_IDENTIFIER;
1241+
return ERRORTOKEN;
1242+
}
12201243
*p_start = tok->start;
12211244
*p_end = tok->cur;
12221245
return NAME;

Python/ast.c

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,8 +47,27 @@ static PyObject *parsestrplus(struct compiling *, const node *n,
4747
#define COMP_SETCOMP 2
4848

4949
static identifier
50-
new_identifier(const char* n, PyArena *arena) {
50+
new_identifier(const char* n, PyArena *arena)
51+
{
5152
PyObject* id = PyUnicode_DecodeUTF8(n, strlen(n), NULL);
53+
Py_UNICODE *u = PyUnicode_AS_UNICODE(id);
54+
/* Check whether there are non-ASCII characters in the
55+
identifier; if so, normalize to NFKC. */
56+
for (; *u; u++) {
57+
if (*u >= 128) {
58+
PyObject *m = PyImport_ImportModule("unicodedata");
59+
PyObject *id2;
60+
if (!m)
61+
return NULL;
62+
id2 = PyObject_CallMethod(m, "normalize", "sO", "NFKC", id);
63+
Py_DECREF(m);
64+
if (!id2)
65+
return NULL;
66+
Py_DECREF(id);
67+
id = id2;
68+
break;
69+
}
70+
}
5271
PyUnicode_InternInPlace(&id);
5372
PyArena_AddPyObject(arena, id);
5473
return id;

Python/pythonrun.c

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1530,6 +1530,10 @@ err_input(perrdetail *err)
15301530
case E_LINECONT:
15311531
msg = "unexpected character after line continuation character";
15321532
break;
1533+
1534+
case E_IDENTIFIER:
1535+
msg = "invalid character in identifier";
1536+
break;
15331537
default:
15341538
fprintf(stderr, "error=%d\n", err->error);
15351539
msg = "unknown parsing error";

0 commit comments

Comments
 (0)