
Add tests for the C tokenizer and expose it as a private module #27924


Merged · 5 commits · Aug 24, 2021
863 changes: 861 additions & 2 deletions Lib/test/test_tokenize.py

Large diffs are not rendered by default.

8 changes: 8 additions & 0 deletions Lib/tokenize.py
@@ -680,5 +680,13 @@ def error(message, filename=None, location=None):
        perror("unexpected error: %s" % err)
        raise

def _generate_tokens_from_c_tokenizer(source):
    """Tokenize a source reading Python code as unicode strings using the internal C tokenizer"""
    import _tokenize as c_tokenizer
    for info in c_tokenizer.TokenizerIter(source):
        tok, type, lineno, end_lineno, col_off, end_col_off, line = info
        yield TokenInfo(type, tok, (lineno, col_off), (end_lineno, end_col_off), line)


if __name__ == "__main__":
    main()
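
The helper above is private API, but its use is straightforward. A minimal sketch, assuming a CPython build that includes the new _tokenize extension module (the input string is illustrative):

# Sketch: drive the private C-tokenizer helper added above.
# Assumes a build of CPython that ships the _tokenize module.
from tokenize import _generate_tokens_from_c_tokenizer

source = "x = 1 + 2\n"  # hypothetical input
for token in _generate_tokens_from_c_tokenizer(source):
    # Each item is a tokenize.TokenInfo: (type, string, start, end, line)
    print(token)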
1 change: 1 addition & 0 deletions Makefile.pre.in
@@ -339,6 +339,7 @@ PARSER_HEADERS= \

PYTHON_OBJS= \
		Python/_warnings.o \
		Python/Python-ast.o \
		Python/Python-tokenize.o \
		Python/asdl.o \
		Python/ast.o \
		Python/ast_opt.o \
4 changes: 4 additions & 0 deletions Modules/config.c.in
@@ -28,6 +28,7 @@ extern PyObject* PyMarshal_Init(void);
extern PyObject* PyInit__imp(void);
extern PyObject* PyInit_gc(void);
extern PyObject* PyInit__ast(void);
extern PyObject* PyInit__tokenize(void);
extern PyObject* _PyWarnings_Init(void);
extern PyObject* PyInit__string(void);

@@ -44,6 +45,9 @@ struct _inittab _PyImport_Inittab[] = {
    /* This lives in Python/Python-ast.c */
    {"_ast", PyInit__ast},

    /* This lives in Python/Python-tokenize.c */
    {"_tokenize", PyInit__tokenize},

    /* These entries are here for sys.builtin_module_names */
    {"builtins", NULL},
    {"sys", NULL},
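
Because the module is registered in _PyImport_Inittab, it is compiled into the interpreter as a builtin rather than shipped as a separate extension. A quick sanity check on a build with this change (a sketch, not part of the PR):

# On a build that includes this change, _tokenize is a builtin module.
import sys
print("_tokenize" in sys.builtin_module_names)  # expected: True

import _tokenize
print(_tokenize.TokenizerIter)  # the type registered by tokenizemodule_exec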
5 changes: 2 additions & 3 deletions PC/config.c
@@ -72,9 +72,8 @@ extern PyObject* _PyWarnings_Init(void);
extern PyObject* PyInit__string(void);
extern PyObject* PyInit__stat(void);
extern PyObject* PyInit__opcode(void);

extern PyObject* PyInit__contextvars(void);

extern PyObject* PyInit__tokenize(void);

/* tools/freeze/makeconfig.py marker for additional "extern" */
/* -- ADDMODULE MARKER 1 -- */
@@ -83,7 +82,6 @@ extern PyObject* PyMarshal_Init(void);
extern PyObject* PyInit__imp(void);

struct _inittab _PyImport_Inittab[] = {
    {"_abc", PyInit__abc},
    {"array", PyInit_array},
    {"_ast", PyInit__ast},

@@ -105,6 +103,7 @@ struct _inittab _PyImport_Inittab[] = {
    {"_blake2", PyInit__blake2},
    {"time", PyInit_time},
    {"_thread", PyInit__thread},
    {"_tokenize", PyInit__tokenize},
    {"_typing", PyInit__typing},
    {"_statistics", PyInit__statistics},
#ifdef WIN32
1 change: 1 addition & 0 deletions PCbuild/pythoncore.vcxproj
@@ -488,6 +488,7 @@
    <ClCompile Include="..\Python\pystrtod.c" />
    <ClCompile Include="..\Python\dtoa.c" />
    <ClCompile Include="..\Python\Python-ast.c" />
    <ClCompile Include="..\Python\Python-tokenize.c" />
    <ClCompile Include="..\Python\pythonrun.c" />
    <ClCompile Include="..\Python\specialize.c" />
    <ClCompile Include="..\Python\suggestions.c" />
195 changes: 195 additions & 0 deletions Python/Python-tokenize.c
@@ -0,0 +1,195 @@
#include "Python.h"
#include "../Parser/tokenizer.h"

static struct PyModuleDef _tokenizemodule;

typedef struct {
    PyTypeObject* TokenizerIter;
} tokenize_state;

static tokenize_state*
get_tokenize_state(PyObject* module)
{
    return (tokenize_state*)PyModule_GetState(module);
}

#define _tokenize_get_state_by_type(type) \
    get_tokenize_state(_PyType_GetModuleByDef(type, &_tokenizemodule))

#include "clinic/Python-tokenize.c.h"

/*[clinic input]
module _tokenizer
class _tokenizer.tokenizeriter "tokenizeriterobject *" "_tokenize_get_state_by_type(type)->TokenizerIter"
[clinic start generated code]*/
/*[clinic end generated code: output=da39a3ee5e6b4b0d input=96d98ee2fef7a8bc]*/
Comment on lines +24 to +25

Contributor:
Is this expected?

Member Author:
Yeah, check other modules that have classes created with Argument Clinic. Maybe I am missing what you mean though 😅


typedef struct {
    PyObject_HEAD
    struct tok_state* tok;
} tokenizeriterobject;

/*[clinic input]
@classmethod
_tokenizer.tokenizeriter.__new__ as tokenizeriter_new

    source: str
[clinic start generated code]*/

static PyObject *
tokenizeriter_new_impl(PyTypeObject *type, const char *source)
/*[clinic end generated code: output=7fd9f46cf9263cbb input=4384b368407375c6]*/
{
    tokenizeriterobject* self = (tokenizeriterobject*)type->tp_alloc(type, 0);
    if (self == NULL) {
        return NULL;
    }
    PyObject* filename = PyUnicode_FromString("<string>");
Contributor:
This returns a new reference, don't you have to decref it in line 53?

Member Author (@pablogsal, Aug 24, 2021):
No, the tokenizer free function does it.

Ah, I misread the comment. Yeah, we are missing a decref in the error path.

Member Author:
Addressed in #27935

    if (filename == NULL) {
        return NULL;
    }
    self->tok = PyTokenizer_FromUTF8(source, 1);
    if (self->tok == NULL) {
        return NULL;
    }
    self->tok->filename = filename;
    return (PyObject*)self;
}

static PyObject*
tokenizeriter_next(tokenizeriterobject* it)
{
    const char* start;
    const char* end;
    int type = PyTokenizer_Get(it->tok, &start, &end);
    if (type == ERRORTOKEN && PyErr_Occurred()) {
        return NULL;
    }
    if (type == ERRORTOKEN || type == ENDMARKER) {
        PyErr_SetString(PyExc_StopIteration, "EOF");
        return NULL;
    }
    PyObject* str = NULL;
    if (start == NULL || end == NULL) {
        str = PyUnicode_FromString("");
    } else {
        str = PyUnicode_FromStringAndSize(start, end - start);
    }
    if (str == NULL) {
        return NULL;
    }

    Py_ssize_t size = it->tok->inp - it->tok->buf;
    PyObject* line = PyUnicode_DecodeUTF8(it->tok->buf, size, "replace");
    if (line == NULL) {
        Py_DECREF(str);
        return NULL;
    }
    const char* line_start = type == STRING ? it->tok->multi_line_start : it->tok->line_start;
    int lineno = type == STRING ? it->tok->first_lineno : it->tok->lineno;
    int end_lineno = it->tok->lineno;
    int col_offset = -1;
    int end_col_offset = -1;
    if (start != NULL && start >= line_start) {
        col_offset = (int)(start - line_start);
    }
    if (end != NULL && end >= it->tok->line_start) {
        end_col_offset = (int)(end - it->tok->line_start);
    }

    return Py_BuildValue("(NiiiiiN)", str, type, lineno, end_lineno, col_offset, end_col_offset, line);
}

static void
tokenizeriter_dealloc(tokenizeriterobject* it)
{
    PyTypeObject* tp = Py_TYPE(it);
    PyTokenizer_Free(it->tok);
    tp->tp_free(it);
    Py_DECREF(tp);
}

static PyType_Slot tokenizeriter_slots[] = {
        {Py_tp_new, tokenizeriter_new},
        {Py_tp_dealloc, tokenizeriter_dealloc},
        {Py_tp_getattro, PyObject_GenericGetAttr},
        {Py_tp_iter, PyObject_SelfIter},
        {Py_tp_iternext, tokenizeriter_next},
        {0, NULL},
Contributor:
Why are some of the initializers 8-char indented while others are 4-char indented?

Member Author:
Hmmm, I think it's because I messed up the format. All should be 4-char. Will correct this in another PR unless you manage to do it before.

Member Author:
Addressed in #27935

};

static PyType_Spec tokenizeriter_spec = {
    .name = "_tokenize.TokenizerIter",
    .basicsize = sizeof(tokenizeriterobject),
    .flags = (Py_TPFLAGS_DEFAULT | Py_TPFLAGS_IMMUTABLETYPE),
    .slots = tokenizeriter_slots,
};


static int
tokenizemodule_exec(PyObject* m)
{
    tokenize_state* state = get_tokenize_state(m);
    if (state == NULL) {
        return -1;
    }

    state->TokenizerIter = (PyTypeObject *)PyType_FromModuleAndSpec(
        m, &tokenizeriter_spec, NULL);
    if (state->TokenizerIter == NULL) {
        return -1;
    }
    if (PyModule_AddType(m, state->TokenizerIter) < 0) {
        return -1;
    }

    return 0;
}

static PyMethodDef tokenize_methods[] = {
    {NULL, NULL, 0, NULL} /* Sentinel */
};

static PyModuleDef_Slot tokenizemodule_slots[] = {
    {Py_mod_exec, tokenizemodule_exec},
Contributor:
Example of a 4-indented initializer.

    {0, NULL}
};

static int
tokenizemodule_traverse(PyObject *m, visitproc visit, void *arg)
{
    tokenize_state *state = get_tokenize_state(m);
    Py_VISIT(state->TokenizerIter);
    return 0;
}

static int
tokenizemodule_clear(PyObject *m)
{
    tokenize_state *state = get_tokenize_state(m);
    Py_CLEAR(state->TokenizerIter);
    return 0;
}

static void
tokenizemodule_free(void *m)
{
    tokenizemodule_clear((PyObject *)m);
}

static struct PyModuleDef _tokenizemodule = {
    PyModuleDef_HEAD_INIT,
    .m_name = "_tokenize",
    .m_size = sizeof(tokenize_state),
    .m_slots = tokenizemodule_slots,
    .m_methods = tokenize_methods,
    .m_traverse = tokenizemodule_traverse,
    .m_clear = tokenizemodule_clear,
    .m_free = tokenizemodule_free,
};

PyMODINIT_FUNC
PyInit__tokenize(void)
{
    return PyModuleDef_Init(&_tokenizemodule);
}
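
Per the Py_BuildValue format "(NiiiiiN)" in tokenizeriter_next, each iteration yields a 7-tuple ordered (string, type, lineno, end_lineno, col_offset, end_col_offset, line); _generate_tokens_from_c_tokenizer in Lib/tokenize.py then reorders these fields into a TokenInfo. A rough sketch of driving the iterator directly (private module; the input string is illustrative):

# Sketch: iterate the C tokenizer directly via the private module.
# Tuple order follows Py_BuildValue("(NiiiiiN)", ...) above.
import _tokenize

for info in _tokenize.TokenizerIter("a + b\n"):
    tok, tok_type, lineno, end_lineno, col_off, end_col_off, line = info
    print(tok_type, repr(tok), (lineno, col_off), (end_lineno, end_col_off))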
41 changes: 41 additions & 0 deletions Python/clinic/Python-tokenize.c.h

Some generated files are not rendered by default.

1 change: 1 addition & 0 deletions Python/stdlib_module_names.h
@@ -80,6 +80,7 @@ static const char* _Py_stdlib_module_names[] = {
"_thread",
"_threading_local",
"_tkinter",
"_tokenize",
"_tracemalloc",
"_typing",
"_uuid",
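
This generated list backs sys.stdlib_module_names (available since Python 3.10), so the new private module is also discoverable there; a one-line check (a sketch, assuming a build with this change):

# _Py_stdlib_module_names backs sys.stdlib_module_names (3.10+).
import sys
print("_tokenize" in sys.stdlib_module_names)  # expected: True on this build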