
Commit deee43b

Add tests for the C tokenizer and expose it as a private module

1 parent e41912c

File tree

9 files changed: +1141 −5 lines

Lib/test/test_tokenize.py

Lines changed: 887 additions & 2 deletions
Large diffs are not rendered by default.

Lib/tokenize.py

Lines changed: 9 additions & 0 deletions
@@ -56,6 +56,7 @@ def exact_type(self):
         else:
             return self.type
 
+
 def group(*choices): return '(' + '|'.join(choices) + ')'
 def any(*choices): return group(*choices) + '*'
 def maybe(*choices): return group(*choices) + '?'
@@ -680,5 +681,13 @@ def error(message, filename=None, location=None):
         perror("unexpected error: %s" % err)
         raise
 
+def _generate_tokens_from_c_tokenizer(source):
+    """Tokenize a source reading Python code as unicode strings using the internal C tokenizer"""
+    import _tokenize as c_tokenizer
+    for info in c_tokenizer.TokenizerIter(source):
+        tok, type, lineno, end_lineno, col_off, end_col_off, line = info
+        yield TokenInfo(type, tok, (lineno, col_off), (end_lineno, end_col_off), line)
+
+
 if __name__ == "__main__":
     main()
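
The new private helper takes the source as a string (unlike tokenize.generate_tokens, which takes a readline callable) and yields TokenInfo tuples built from the C tokenizer's output. A minimal usage sketch, not part of this commit, assuming an interpreter built with the _tokenize extension module:

    import tokenize

    # Tokenize a small snippet via the private C-tokenizer-backed helper
    # and print each TokenInfo it yields.
    source = "x = 1 + 2\n"
    for tok in tokenize._generate_tokens_from_c_tokenizer(source):
        print(tok)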

Makefile.pre.in

Lines changed: 1 addition & 0 deletions
@@ -339,6 +339,7 @@ PARSER_HEADERS= \
 PYTHON_OBJS= \
 		Python/_warnings.o \
 		Python/Python-ast.o \
+		Python/Python-tokenize.o \
 		Python/asdl.o \
 		Python/ast.o \
 		Python/ast_opt.o \

Modules/config.c.in

Lines changed: 4 additions & 0 deletions
@@ -28,6 +28,7 @@ extern PyObject* PyMarshal_Init(void);
 extern PyObject* PyInit__imp(void);
 extern PyObject* PyInit_gc(void);
 extern PyObject* PyInit__ast(void);
+extern PyObject* PyInit__tokenize(void);
 extern PyObject* _PyWarnings_Init(void);
 extern PyObject* PyInit__string(void);
 
@@ -44,6 +45,9 @@ struct _inittab _PyImport_Inittab[] = {
     /* This lives in Python/Python-ast.c */
     {"_ast", PyInit__ast},
 
+    /* This lives in Python/Python-tokenize.c */
+    {"_tokenize", PyInit__tokenize},
+
     /* These entries are here for sys.builtin_module_names */
     {"builtins", NULL},
     {"sys", NULL},

PC/config.c

Lines changed: 2 additions & 3 deletions
@@ -72,9 +72,8 @@ extern PyObject* _PyWarnings_Init(void);
 extern PyObject* PyInit__string(void);
 extern PyObject* PyInit__stat(void);
 extern PyObject* PyInit__opcode(void);
-
 extern PyObject* PyInit__contextvars(void);
-
+extern PyObject* PyInit__tokenize(void);
 
 /* tools/freeze/makeconfig.py marker for additional "extern" */
 /* -- ADDMODULE MARKER 1 -- */
@@ -83,7 +82,6 @@ extern PyObject* PyMarshal_Init(void);
 extern PyObject* PyInit__imp(void);
 
 struct _inittab _PyImport_Inittab[] = {
-
     {"_abc", PyInit__abc},
     {"array", PyInit_array},
     {"_ast", PyInit__ast},
@@ -105,6 +103,7 @@ struct _inittab _PyImport_Inittab[] = {
     {"_blake2", PyInit__blake2},
     {"time", PyInit_time},
     {"_thread", PyInit__thread},
+    {"_tokenize", PyInit__tokenize},
     {"_typing", PyInit__typing},
     {"_statistics", PyInit__statistics},
 #ifdef WIN32

PCbuild/pythoncore.vcxproj

Lines changed: 1 addition & 0 deletions
@@ -488,6 +488,7 @@
     <ClCompile Include="..\Python\pystrtod.c" />
     <ClCompile Include="..\Python\dtoa.c" />
     <ClCompile Include="..\Python\Python-ast.c" />
+    <ClCompile Include="..\Python\Python-tokenize.c" />
     <ClCompile Include="..\Python\pythonrun.c" />
     <ClCompile Include="..\Python\specialize.c" />
     <ClCompile Include="..\Python\suggestions.c" />

Python/Python-tokenize.c

Lines changed: 195 additions & 0 deletions
@@ -0,0 +1,195 @@ (new file)
#include "Python.h"
#include "../Parser/tokenizer.h"

static struct PyModuleDef _tokenizemodule;

typedef struct {
    PyTypeObject *TokenizerIter;
} tokenize_state;

static tokenize_state *
get_tokenize_state(PyObject *module)
{
    return (tokenize_state *)PyModule_GetState(module);
}

#define _tokenize_get_state_by_type(type) \
    get_tokenize_state(_PyType_GetModuleByDef(type, &_tokenizemodule))

#include "clinic/Python-tokenize.c.h"

/*[clinic input]
module _tokenizer
class _tokenizer.tokenizeriter "tokenizeriterobject *" "_tokenize_get_state_by_type(type)->TokenizerIter"
[clinic start generated code]*/
/*[clinic end generated code: output=da39a3ee5e6b4b0d input=96d98ee2fef7a8bc]*/

typedef struct {
    PyObject_HEAD
    struct tok_state *tok;
} tokenizeriterobject;

/*[clinic input]
@classmethod
_tokenizer.tokenizeriter.__new__ as tokenizeriter_new

    source: str
[clinic start generated code]*/

static PyObject *
tokenizeriter_new_impl(PyTypeObject *type, const char *source)
/*[clinic end generated code: output=7fd9f46cf9263cbb input=4384b368407375c6]*/
{
    tokenizeriterobject *self = (tokenizeriterobject *)type->tp_alloc(type, 0);
    if (self == NULL) {
        return NULL;
    }
    PyObject *filename = PyUnicode_FromString("<string>");
    if (filename == NULL) {
        return NULL;
    }
    self->tok = PyTokenizer_FromUTF8(source, 1);
    if (self->tok == NULL) {
        return NULL;
    }
    self->tok->filename = filename;
    return (PyObject *)self;
}

static PyObject *
tokenizeriter_next(tokenizeriterobject *it)
{
    const char *start;
    const char *end;
    int type = PyTokenizer_Get(it->tok, &start, &end);
    if (type == ERRORTOKEN && PyErr_Occurred()) {
        return NULL;
    }
    if (type == ERRORTOKEN || type == ENDMARKER) {
        PyErr_SetString(PyExc_StopIteration, "EOF");
        return NULL;
    }
    /* Fall back to an empty string when the tokenizer gives no bounds. */
    PyObject *str = NULL;
    if (start == NULL || end == NULL) {
        str = PyUnicode_FromString("");
    }
    else {
        str = PyUnicode_FromStringAndSize(start, end - start);
    }
    if (str == NULL) {
        return NULL;
    }

    Py_ssize_t size = it->tok->inp - it->tok->buf;
    PyObject *line = PyUnicode_DecodeUTF8(it->tok->buf, size, "replace");
    if (line == NULL) {
        Py_DECREF(str);
        return NULL;
    }
    const char *line_start = type == STRING ? it->tok->multi_line_start : it->tok->line_start;
    int lineno = type == STRING ? it->tok->first_lineno : it->tok->lineno;
    int end_lineno = it->tok->lineno;
    int col_offset = -1;
    int end_col_offset = -1;
    if (start != NULL && start >= line_start) {
        col_offset = (int)(start - line_start);
    }
    if (end != NULL && end >= it->tok->line_start) {
        end_col_offset = (int)(end - it->tok->line_start);
    }

    return Py_BuildValue("(NiiiiiN)", str, type, lineno, end_lineno, col_offset, end_col_offset, line);
}

static void
tokenizeriter_dealloc(tokenizeriterobject *it)
{
    PyTypeObject *tp = Py_TYPE(it);
    PyTokenizer_Free(it->tok);
    tp->tp_free(it);
}

static PyType_Slot tokenizeriter_slots[] = {
    {Py_tp_new, tokenizeriter_new},
    {Py_tp_dealloc, tokenizeriter_dealloc},
    {Py_tp_getattro, PyObject_GenericGetAttr},
    {Py_tp_iter, PyObject_SelfIter},
    {Py_tp_iternext, tokenizeriter_next},
    {0, NULL},
};

static PyType_Spec tokenizeriter_spec = {
    .name = "_tokenize.TokenizerIter",
    .basicsize = sizeof(tokenizeriterobject),
    .flags = (Py_TPFLAGS_DEFAULT | Py_TPFLAGS_IMMUTABLETYPE),
    .slots = tokenizeriter_slots,
};

static int
tokenizemodule_exec(PyObject *m)
{
    tokenize_state *state = get_tokenize_state(m);
    if (state == NULL) {
        return -1;
    }

    state->TokenizerIter = (PyTypeObject *)PyType_FromModuleAndSpec(m, &tokenizeriter_spec, NULL);
    if (state->TokenizerIter == NULL) {
        return -1;
    }
    if (PyModule_AddType(m, state->TokenizerIter) < 0) {
        return -1;
    }

    return 0;
}

static PyMethodDef tokenize_methods[] = {
    {NULL, NULL, 0, NULL} /* Sentinel */
};

static PyModuleDef_Slot tokenizemodule_slots[] = {
    {Py_mod_exec, tokenizemodule_exec},
    {0, NULL}
};

static int
tokenizemodule_traverse(PyObject *m, visitproc visit, void *arg)
{
    tokenize_state *state = get_tokenize_state(m);
    Py_VISIT(state->TokenizerIter);
    return 0;
}

static int
tokenizemodule_clear(PyObject *m)
{
    tokenize_state *state = get_tokenize_state(m);
    Py_CLEAR(state->TokenizerIter);
    return 0;
}

static void
tokenizemodule_free(void *m)
{
    tokenizemodule_clear((PyObject *)m);
}

static struct PyModuleDef _tokenizemodule = {
    PyModuleDef_HEAD_INIT,
    .m_name = "_tokenize",
    .m_size = sizeof(tokenize_state),
    .m_slots = tokenizemodule_slots,
    .m_methods = tokenize_methods,
    .m_traverse = tokenizemodule_traverse,
    .m_clear = tokenizemodule_clear,
    .m_free = tokenizemodule_free,
};

PyMODINIT_FUNC
PyInit__tokenize(void)
{
    return PyModuleDef_Init(&_tokenizemodule);
}
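
TokenizerIter yields plain 7-tuples in the order given to Py_BuildValue above: (string, type, lineno, end_lineno, col_offset, end_col_offset, line). A short sketch of driving the iterator directly, assuming an interpreter that includes the _tokenize builtin added by this commit:

    import _tokenize

    # Each item mirrors the Py_BuildValue("(NiiiiiN)", ...) call in
    # tokenizeriter_next.
    for tok, type_, lineno, end_lineno, col_off, end_col_off, line in _tokenize.TokenizerIter("a + b\n"):
        print(type_, repr(tok), (lineno, col_off), (end_lineno, end_col_off))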

Python/clinic/Python-tokenize.c.h

Lines changed: 41 additions & 0 deletions
Some generated files are not rendered by default.

Python/stdlib_module_names.h

Lines changed: 1 addition & 0 deletions
@@ -80,6 +80,7 @@ static const char* _Py_stdlib_module_names[] = {
 "_thread",
 "_threading_local",
 "_tkinter",
+"_tokenize",
 "_tracemalloc",
 "_typing",
 "_uuid",
