Skip to content

Commit a24676b

Browse files
authored
Add tests for the C tokenizer and expose it as a private module (GH-27924)
1 parent 9ed5231 commit a24676b

File tree

9 files changed

+1114
-5
lines changed

9 files changed

+1114
-5
lines changed

Lib/test/test_tokenize.py

Lines changed: 861 additions & 2 deletions
Large diffs are not rendered by default.

Lib/tokenize.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -680,5 +680,13 @@ def error(message, filename=None, location=None):
680680
perror("unexpected error: %s" % err)
681681
raise
682682

683+
def _generate_tokens_from_c_tokenizer(source):
    """Tokenize a source reading Python code as unicode strings using the internal C tokenizer"""
    # Imported lazily so that plain `import tokenize` never pays for (or
    # requires) the private C extension module.
    import _tokenize as c_tokenizer
    # Each item yielded by TokenizerIter is a 7-tuple; unpack it directly in
    # the loop target and re-shape it into the public TokenInfo layout.
    for tok, type, lineno, end_lineno, col_off, end_col_off, line in c_tokenizer.TokenizerIter(source):
        yield TokenInfo(type, tok, (lineno, col_off), (end_lineno, end_col_off), line)
689+
690+
683691
if __name__ == "__main__":
684692
main()

Makefile.pre.in

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -339,6 +339,7 @@ PARSER_HEADERS= \
339339
PYTHON_OBJS= \
340340
Python/_warnings.o \
341341
Python/Python-ast.o \
342+
Python/Python-tokenize.o \
342343
Python/asdl.o \
343344
Python/ast.o \
344345
Python/ast_opt.o \

Modules/config.c.in

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ extern PyObject* PyMarshal_Init(void);
2828
extern PyObject* PyInit__imp(void);
2929
extern PyObject* PyInit_gc(void);
3030
extern PyObject* PyInit__ast(void);
31+
extern PyObject* PyInit__tokenize(void);
3132
extern PyObject* _PyWarnings_Init(void);
3233
extern PyObject* PyInit__string(void);
3334

@@ -44,6 +45,9 @@ struct _inittab _PyImport_Inittab[] = {
4445
/* This lives in Python/Python-ast.c */
4546
{"_ast", PyInit__ast},
4647

48+
/* This lives in Python/Python-tokenize.c */
49+
{"_tokenize", PyInit__tokenize},
50+
4751
/* These entries are here for sys.builtin_module_names */
4852
{"builtins", NULL},
4953
{"sys", NULL},

PC/config.c

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -72,9 +72,8 @@ extern PyObject* _PyWarnings_Init(void);
7272
extern PyObject* PyInit__string(void);
7373
extern PyObject* PyInit__stat(void);
7474
extern PyObject* PyInit__opcode(void);
75-
7675
extern PyObject* PyInit__contextvars(void);
77-
76+
extern PyObject* PyInit__tokenize(void);
7877

7978
/* tools/freeze/makeconfig.py marker for additional "extern" */
8079
/* -- ADDMODULE MARKER 1 -- */
@@ -83,7 +82,6 @@ extern PyObject* PyMarshal_Init(void);
8382
extern PyObject* PyInit__imp(void);
8483

8584
struct _inittab _PyImport_Inittab[] = {
86-
8785
{"_abc", PyInit__abc},
8886
{"array", PyInit_array},
8987
{"_ast", PyInit__ast},
@@ -105,6 +103,7 @@ struct _inittab _PyImport_Inittab[] = {
105103
{"_blake2", PyInit__blake2},
106104
{"time", PyInit_time},
107105
{"_thread", PyInit__thread},
106+
{"_tokenize", PyInit__tokenize},
108107
{"_typing", PyInit__typing},
109108
{"_statistics", PyInit__statistics},
110109
#ifdef WIN32

PCbuild/pythoncore.vcxproj

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -488,6 +488,7 @@
488488
<ClCompile Include="..\Python\pystrtod.c" />
489489
<ClCompile Include="..\Python\dtoa.c" />
490490
<ClCompile Include="..\Python\Python-ast.c" />
491+
<ClCompile Include="..\Python\Python-tokenize.c" />
491492
<ClCompile Include="..\Python\pythonrun.c" />
492493
<ClCompile Include="..\Python\specialize.c" />
493494
<ClCompile Include="..\Python\suggestions.c" />

Python/Python-tokenize.c

Lines changed: 195 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,195 @@
1+
#include "Python.h"
2+
#include "../Parser/tokenizer.h"
3+
4+
static struct PyModuleDef _tokenizemodule;
5+
6+
/* Per-module state for the _tokenize extension (multi-phase init / PEP 489):
   holds the one heap type the module exposes. */
typedef struct {
    PyTypeObject* TokenizerIter;  /* _tokenize.TokenizerIter heap type; owned */
} tokenize_state;
9+
10+
/* Return the tokenize_state embedded in `module` (NULL if `module` has no
   state, mirroring PyModule_GetState). */
static tokenize_state*
get_tokenize_state(PyObject* module)
{
    return (tokenize_state*)PyModule_GetState(module);
}
15+
16+
#define _tokenize_get_state_by_type(type) \
17+
get_tokenize_state(_PyType_GetModuleByDef(type, &_tokenizemodule))
18+
19+
#include "clinic/Python-tokenize.c.h"
20+
21+
/*[clinic input]
22+
module _tokenizer
23+
class _tokenizer.tokenizeriter "tokenizeriterobject *" "_tokenize_get_state_by_type(type)->TokenizerIter"
24+
[clinic start generated code]*/
25+
/*[clinic end generated code: output=da39a3ee5e6b4b0d input=96d98ee2fef7a8bc]*/
26+
27+
/* Instance layout for _tokenize.TokenizerIter: a thin iterator wrapper
   around the internal C tokenizer's state. */
typedef struct {
    PyObject_HEAD
    struct tok_state* tok;  /* owned; released in tokenizeriter_dealloc */
} tokenizeriterobject;
31+
32+
/*[clinic input]
33+
@classmethod
34+
_tokenizer.tokenizeriter.__new__ as tokenizeriter_new
35+
36+
source: str
37+
[clinic start generated code]*/
38+
39+
static PyObject *
40+
tokenizeriter_new_impl(PyTypeObject *type, const char *source)
41+
/*[clinic end generated code: output=7fd9f46cf9263cbb input=4384b368407375c6]*/
42+
{
43+
tokenizeriterobject* self = (tokenizeriterobject*)type->tp_alloc(type, 0);
44+
if (self == NULL) {
45+
return NULL;
46+
}
47+
PyObject* filename = PyUnicode_FromString("<string>");
48+
if (filename == NULL) {
49+
return NULL;
50+
}
51+
self->tok = PyTokenizer_FromUTF8(source, 1);
52+
if (self->tok == NULL) {
53+
return NULL;
54+
}
55+
self->tok->filename = filename;
56+
return (PyObject*)self;
57+
}
58+
59+
/* tp_iternext: fetch the next token from the C tokenizer and return it as a
   7-tuple (string, type, lineno, end_lineno, col_offset, end_col_offset,
   line).  Returns NULL with StopIteration at end of input, or NULL with the
   tokenizer's error set on failure. */
static PyObject*
tokenizeriter_next(tokenizeriterobject* it)
{
    const char* start;
    const char* end;
    int type = PyTokenizer_Get(it->tok, &start, &end);
    /* ERRORTOKEN with a live exception: propagate the tokenizer's error. */
    if (type == ERRORTOKEN && PyErr_Occurred()) {
        return NULL;
    }
    /* ERRORTOKEN without an exception, or ENDMARKER: end of iteration. */
    if (type == ERRORTOKEN || type == ENDMARKER) {
        PyErr_SetString(PyExc_StopIteration, "EOF");
        return NULL;
    }
    /* Token text; some tokens (e.g. synthetic ones) carry no span, in which
       case an empty string is produced. */
    PyObject* str = NULL;
    if (start == NULL || end == NULL) {
        str = PyUnicode_FromString("");
    } else {
        str = PyUnicode_FromStringAndSize(start, end - start);
    }
    if (str == NULL) {
        return NULL;
    }

    /* The full current buffer contents [buf, inp) serve as the `line` field;
       decoded with "replace" so malformed UTF-8 cannot fail here. */
    Py_ssize_t size = it->tok->inp - it->tok->buf;
    PyObject* line = PyUnicode_DecodeUTF8(it->tok->buf, size, "replace");
    if (line == NULL) {
        Py_DECREF(str);
        return NULL;
    }
    /* STRING tokens may span multiple lines, so their start position comes
       from where the (possibly multi-line) string began. */
    const char* line_start = type == STRING ? it->tok->multi_line_start : it->tok->line_start;
    int lineno = type == STRING ? it->tok->first_lineno : it->tok->lineno;
    int end_lineno = it->tok->lineno;
    /* Column offsets are byte offsets into the line; -1 when the pointer
       falls outside the current line (or the token has no span). */
    int col_offset = -1;
    int end_col_offset = -1;
    if (start != NULL && start >= line_start) {
        col_offset = (int)(start - line_start);
    }
    if (end != NULL && end >= it->tok->line_start) {
        end_col_offset = (int)(end - it->tok->line_start);
    }

    /* "N" steals the references to str and line on success.  NOTE(review):
       if Py_BuildValue itself fails, already-consumed "N" arguments are not
       released — a known Py_BuildValue caveat; acceptable only because
       failure here is an out-of-memory situation. */
    return Py_BuildValue("(NiiiiiN)", str, type, lineno, end_lineno, col_offset, end_col_offset, line);
}
102+
103+
static void
104+
tokenizeriter_dealloc(tokenizeriterobject* it)
105+
{
106+
PyTypeObject* tp = Py_TYPE(it);
107+
PyTokenizer_Free(it->tok);
108+
tp->tp_free(it);
109+
Py_DECREF(tp);
110+
}
111+
112+
/* Slot table for the TokenizerIter heap type: it is its own iterator
   (__iter__ returns self) and produces one token tuple per __next__. */
static PyType_Slot tokenizeriter_slots[] = {
    {Py_tp_new, tokenizeriter_new},
    {Py_tp_dealloc, tokenizeriter_dealloc},
    {Py_tp_getattro, PyObject_GenericGetAttr},
    {Py_tp_iter, PyObject_SelfIter},
    {Py_tp_iternext, tokenizeriter_next},
    {0, NULL},
};
120+
121+
/* Spec for _tokenize.TokenizerIter; instantiated per module with
   PyType_FromModuleAndSpec in tokenizemodule_exec.  The type is immutable
   from Python code (Py_TPFLAGS_IMMUTABLETYPE). */
static PyType_Spec tokenizeriter_spec = {
    .name = "_tokenize.TokenizerIter",
    .basicsize = sizeof(tokenizeriterobject),
    .flags = (Py_TPFLAGS_DEFAULT | Py_TPFLAGS_IMMUTABLETYPE),
    .slots = tokenizeriter_slots,
};
127+
128+
129+
static int
130+
tokenizemodule_exec(PyObject* m)
131+
{
132+
tokenize_state* state = get_tokenize_state(m);
133+
if (state == NULL) {
134+
return -1;
135+
}
136+
137+
state->TokenizerIter = (PyTypeObject *)PyType_FromModuleAndSpec(
138+
m, &tokenizeriter_spec, NULL);
139+
if (state->TokenizerIter == NULL) {
140+
return -1;
141+
}
142+
if (PyModule_AddType(m, state->TokenizerIter) < 0) {
143+
return -1;
144+
}
145+
146+
return 0;
147+
}
148+
149+
/* No module-level functions: the module's only public surface is the
   TokenizerIter type added in tokenizemodule_exec. */
static PyMethodDef tokenize_methods[] = {
    {NULL, NULL, 0, NULL} /* Sentinel */
};
152+
153+
/* Multi-phase initialization slots: run tokenizemodule_exec after the
   module object is created. */
static PyModuleDef_Slot tokenizemodule_slots[] = {
    {Py_mod_exec, tokenizemodule_exec},
    {0, NULL}
};
157+
158+
/* GC traverse: report the module's owned reference to the heap type so the
   cycle collector can see it. */
static int
tokenizemodule_traverse(PyObject *m, visitproc visit, void *arg)
{
    tokenize_state *state = get_tokenize_state(m);
    Py_VISIT(state->TokenizerIter);
    return 0;
}
165+
166+
/* GC clear: drop the module's reference to the heap type (sets the state
   field to NULL via Py_CLEAR). */
static int
tokenizemodule_clear(PyObject *m)
{
    tokenize_state *state = get_tokenize_state(m);
    Py_CLEAR(state->TokenizerIter);
    return 0;
}
173+
174+
/* m_free hook: final teardown simply reuses the clear logic, which releases
   everything the module state owns. */
static void
tokenizemodule_free(void *m)
{
    tokenizemodule_clear((PyObject *)m);
}
179+
180+
/* Module definition for the private _tokenize extension.  m_size > 0 plus
   m_slots opts into multi-phase init with per-module state, making the
   module safe for subinterpreters. */
static struct PyModuleDef _tokenizemodule = {
    PyModuleDef_HEAD_INIT,
    .m_name = "_tokenize",
    .m_size = sizeof(tokenize_state),   /* per-module tokenize_state */
    .m_slots = tokenizemodule_slots,
    .m_methods = tokenize_methods,
    .m_traverse = tokenizemodule_traverse,
    .m_clear = tokenizemodule_clear,
    .m_free = tokenizemodule_free,
};
190+
191+
/* Module entry point: with multi-phase init, just hand back the definition;
   the interpreter drives creation via the m_slots table. */
PyMODINIT_FUNC
PyInit__tokenize(void)
{
    return PyModuleDef_Init(&_tokenizemodule);
}

Python/clinic/Python-tokenize.c.h

Lines changed: 41 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Python/stdlib_module_names.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,7 @@ static const char* _Py_stdlib_module_names[] = {
8080
"_thread",
8181
"_threading_local",
8282
"_tkinter",
83+
"_tokenize",
8384
"_tracemalloc",
8485
"_typing",
8586
"_uuid",

0 commit comments

Comments
 (0)