
Add tests for the C tokenizer and expose it as a private module #27924


Merged · 5 commits · Aug 24, 2021
863 changes: 861 additions & 2 deletions Lib/test/test_tokenize.py

Large diffs are not rendered by default.

8 changes: 8 additions & 0 deletions Lib/tokenize.py
@@ -680,5 +680,13 @@ def error(message, filename=None, location=None):
        perror("unexpected error: %s" % err)
        raise

def _generate_tokens_from_c_tokenizer(source):
    """Tokenize a source reading Python code as unicode strings using the internal C tokenizer"""
    import _tokenize as c_tokenizer
    for info in c_tokenizer.TokenizerIter(source):
        tok, type, lineno, end_lineno, col_off, end_col_off, line = info
        yield TokenInfo(type, tok, (lineno, col_off), (end_lineno, end_col_off), line)


if __name__ == "__main__":
    main()
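
The helper above is private API, but its use is straightforward. A minimal sketch, assuming a CPython build that includes the new _tokenize extension module (the input string is illustrative):

# Sketch: drive the private C-tokenizer helper added above.
# Assumes a build of CPython that ships the _tokenize module.
from tokenize import _generate_tokens_from_c_tokenizer

source = "x = 1 + 2\n"  # hypothetical input
for token in _generate_tokens_from_c_tokenizer(source):
    # Each item is a tokenize.TokenInfo: (type, string, start, end, line)
    print(token)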
1 change: 1 addition & 0 deletions Makefile.pre.in
@@ -339,6 +339,7 @@ PARSER_HEADERS= \

PYTHON_OBJS= \
		Python/_warnings.o \
		Python/Python-ast.o \
		Python/Python-tokenize.o \
		Python/asdl.o \
		Python/ast.o \
		Python/ast_opt.o \
4 changes: 4 additions & 0 deletions Modules/config.c.in
@@ -28,6 +28,7 @@ extern PyObject* PyMarshal_Init(void);
extern PyObject* PyInit__imp(void);
extern PyObject* PyInit_gc(void);
extern PyObject* PyInit__ast(void);
extern PyObject* PyInit__tokenize(void);
extern PyObject* _PyWarnings_Init(void);
extern PyObject* PyInit__string(void);

@@ -44,6 +45,9 @@ struct _inittab _PyImport_Inittab[] = {
    /* This lives in Python/Python-ast.c */
    {"_ast", PyInit__ast},

    /* This lives in Python/Python-tokenize.c */
    {"_tokenize", PyInit__tokenize},

    /* These entries are here for sys.builtin_module_names */
    {"builtins", NULL},
    {"sys", NULL},
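
Because the module is registered in _PyImport_Inittab, it is compiled into the interpreter as a builtin rather than shipped as a separate extension. A quick sanity check on a build with this change (a sketch, not part of the PR):

# On a build that includes this change, _tokenize is a builtin module.
import sys
print("_tokenize" in sys.builtin_module_names)  # expected: True

import _tokenize
print(_tokenize.TokenizerIter)  # the type registered by tokenizemodule_exec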
5 changes: 2 additions & 3 deletions PC/config.c
@@ -72,9 +72,8 @@ extern PyObject* _PyWarnings_Init(void);
extern PyObject* PyInit__string(void);
extern PyObject* PyInit__stat(void);
extern PyObject* PyInit__opcode(void);

extern PyObject* PyInit__contextvars(void);

extern PyObject* PyInit__tokenize(void);

/* tools/freeze/makeconfig.py marker for additional "extern" */
/* -- ADDMODULE MARKER 1 -- */
@@ -83,7 +82,6 @@ extern PyObject* PyMarshal_Init(void);
extern PyObject* PyInit__imp(void);

struct _inittab _PyImport_Inittab[] = {
    {"_abc", PyInit__abc},
    {"array", PyInit_array},
    {"_ast", PyInit__ast},

@@ -105,6 +103,7 @@ struct _inittab _PyImport_Inittab[] = {
    {"_blake2", PyInit__blake2},
    {"time", PyInit_time},
    {"_thread", PyInit__thread},
    {"_tokenize", PyInit__tokenize},
    {"_typing", PyInit__typing},
    {"_statistics", PyInit__statistics},
#ifdef WIN32
1 change: 1 addition & 0 deletions PCbuild/pythoncore.vcxproj
@@ -488,6 +488,7 @@
    <ClCompile Include="..\Python\pystrtod.c" />
    <ClCompile Include="..\Python\dtoa.c" />
    <ClCompile Include="..\Python\Python-ast.c" />
    <ClCompile Include="..\Python\Python-tokenize.c" />
    <ClCompile Include="..\Python\pythonrun.c" />
    <ClCompile Include="..\Python\specialize.c" />
    <ClCompile Include="..\Python\suggestions.c" />
195 changes: 195 additions & 0 deletions Python/Python-tokenize.c
@@ -0,0 +1,195 @@
#include "Python.h"
#include "../Parser/tokenizer.h"

static struct PyModuleDef _tokenizemodule;

typedef struct {
    PyTypeObject* TokenizerIter;
} tokenize_state;

static tokenize_state*
get_tokenize_state(PyObject* module)
{
    return (tokenize_state*)PyModule_GetState(module);
}

#define _tokenize_get_state_by_type(type) \
    get_tokenize_state(_PyType_GetModuleByDef(type, &_tokenizemodule))

#include "clinic/Python-tokenize.c.h"

/*[clinic input]
module _tokenizer
class _tokenizer.tokenizeriter "tokenizeriterobject *" "_tokenize_get_state_by_type(type)->TokenizerIter"
[clinic start generated code]*/
/*[clinic end generated code: output=da39a3ee5e6b4b0d input=96d98ee2fef7a8bc]*/
Comment on lines +24 to +25

Contributor:
Is this expected?

Member Author:
Yeah, check other modules that have classes created with Argument Clinic. Maybe I am missing what you mean though 😅


typedef struct {
    PyObject_HEAD
    struct tok_state* tok;
} tokenizeriterobject;

/*[clinic input]
@classmethod
_tokenizer.tokenizeriter.__new__ as tokenizeriter_new

    source: str
[clinic start generated code]*/

static PyObject *
tokenizeriter_new_impl(PyTypeObject *type, const char *source)
/*[clinic end generated code: output=7fd9f46cf9263cbb input=4384b368407375c6]*/
{
    tokenizeriterobject* self = (tokenizeriterobject*)type->tp_alloc(type, 0);
    if (self == NULL) {
        return NULL;
    }
    PyObject* filename = PyUnicode_FromString("<string>");
Contributor:
This returns a new reference, don't you have to decref it in line 53?

Member Author (@pablogsal, Aug 24, 2021):
No, the tokenizer free function does it.

Ah, I misread the comment. Yeah, we are missing a decref in the error path.

Member Author:
Addressed in #27935

    if (filename == NULL) {
        return NULL;
    }
    self->tok = PyTokenizer_FromUTF8(source, 1);
    if (self->tok == NULL) {
        return NULL;
    }
    self->tok->filename = filename;
    return (PyObject*)self;
}

static PyObject*
tokenizeriter_next(tokenizeriterobject* it)
{
    const char* start;
    const char* end;
    int type = PyTokenizer_Get(it->tok, &start, &end);
    if (type == ERRORTOKEN && PyErr_Occurred()) {
        return NULL;
    }
    if (type == ERRORTOKEN || type == ENDMARKER) {
        PyErr_SetString(PyExc_StopIteration, "EOF");
        return NULL;
    }
    PyObject* str = NULL;
    if (start == NULL || end == NULL) {
        str = PyUnicode_FromString("");
    } else {
        str = PyUnicode_FromStringAndSize(start, end - start);
    }
    if (str == NULL) {
        return NULL;
    }

    Py_ssize_t size = it->tok->inp - it->tok->buf;
    PyObject* line = PyUnicode_DecodeUTF8(it->tok->buf, size, "replace");
    if (line == NULL) {
        Py_DECREF(str);
        return NULL;
    }
    const char* line_start = type == STRING ? it->tok->multi_line_start : it->tok->line_start;
    int lineno = type == STRING ? it->tok->first_lineno : it->tok->lineno;
    int end_lineno = it->tok->lineno;
    int col_offset = -1;
    int end_col_offset = -1;
    if (start != NULL && start >= line_start) {
        col_offset = (int)(start - line_start);
    }
    if (end != NULL && end >= it->tok->line_start) {
        end_col_offset = (int)(end - it->tok->line_start);
    }

    return Py_BuildValue("(NiiiiiN)", str, type, lineno, end_lineno, col_offset, end_col_offset, line);
}

static void
tokenizeriter_dealloc(tokenizeriterobject* it)
{
    PyTypeObject* tp = Py_TYPE(it);
    PyTokenizer_Free(it->tok);
    tp->tp_free(it);
    Py_DECREF(tp);
}

static PyType_Slot tokenizeriter_slots[] = {
        {Py_tp_new, tokenizeriter_new},
        {Py_tp_dealloc, tokenizeriter_dealloc},
        {Py_tp_getattro, PyObject_GenericGetAttr},
        {Py_tp_iter, PyObject_SelfIter},
        {Py_tp_iternext, tokenizeriter_next},
        {0, NULL},
Contributor:
Why are some of the initializers 8-char indented while others are 4-char indented?

Member Author:
Hmmm, I think it's because I messed up the format. All should be 4-char. Will correct this in another PR unless you manage to do it before.

Member Author:
Addressed in #27935

};

static PyType_Spec tokenizeriter_spec = {
    .name = "_tokenize.TokenizerIter",
    .basicsize = sizeof(tokenizeriterobject),
    .flags = (Py_TPFLAGS_DEFAULT | Py_TPFLAGS_IMMUTABLETYPE),
    .slots = tokenizeriter_slots,
};


static int
tokenizemodule_exec(PyObject* m)
{
    tokenize_state* state = get_tokenize_state(m);
    if (state == NULL) {
        return -1;
    }

    state->TokenizerIter = (PyTypeObject *)PyType_FromModuleAndSpec(
        m, &tokenizeriter_spec, NULL);
    if (state->TokenizerIter == NULL) {
        return -1;
    }
    if (PyModule_AddType(m, state->TokenizerIter) < 0) {
        return -1;
    }

    return 0;
}

static PyMethodDef tokenize_methods[] = {
    {NULL, NULL, 0, NULL} /* Sentinel */
};

static PyModuleDef_Slot tokenizemodule_slots[] = {
    {Py_mod_exec, tokenizemodule_exec},
Contributor:
Example of a 4-indented initializer.

    {0, NULL}
};

static int
tokenizemodule_traverse(PyObject *m, visitproc visit, void *arg)
{
    tokenize_state *state = get_tokenize_state(m);
    Py_VISIT(state->TokenizerIter);
    return 0;
}

static int
tokenizemodule_clear(PyObject *m)
{
    tokenize_state *state = get_tokenize_state(m);
    Py_CLEAR(state->TokenizerIter);
    return 0;
}

static void
tokenizemodule_free(void *m)
{
    tokenizemodule_clear((PyObject *)m);
}

static struct PyModuleDef _tokenizemodule = {
    PyModuleDef_HEAD_INIT,
    .m_name = "_tokenize",
    .m_size = sizeof(tokenize_state),
    .m_slots = tokenizemodule_slots,
    .m_methods = tokenize_methods,
    .m_traverse = tokenizemodule_traverse,
    .m_clear = tokenizemodule_clear,
    .m_free = tokenizemodule_free,
};

PyMODINIT_FUNC
PyInit__tokenize(void)
{
    return PyModuleDef_Init(&_tokenizemodule);
}
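
Per the Py_BuildValue format "(NiiiiiN)" in tokenizeriter_next, each iteration yields a 7-tuple ordered (string, type, lineno, end_lineno, col_offset, end_col_offset, line); _generate_tokens_from_c_tokenizer in Lib/tokenize.py then reorders these fields into a TokenInfo. A rough sketch of driving the iterator directly (private module; the input string is illustrative):

# Sketch: iterate the C tokenizer directly via the private module.
# Tuple order follows Py_BuildValue("(NiiiiiN)", ...) above.
import _tokenize

for info in _tokenize.TokenizerIter("a + b\n"):
    tok, tok_type, lineno, end_lineno, col_off, end_col_off, line = info
    print(tok_type, repr(tok), (lineno, col_off), (end_lineno, end_col_off))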
41 changes: 41 additions & 0 deletions Python/clinic/Python-tokenize.c.h

Some generated files are not rendered by default.

1 change: 1 addition & 0 deletions Python/stdlib_module_names.h
@@ -80,6 +80,7 @@ static const char* _Py_stdlib_module_names[] = {
"_thread",
"_threading_local",
"_tkinter",
"_tokenize",
"_tracemalloc",
"_typing",
"_uuid",
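
This generated list backs sys.stdlib_module_names (available since Python 3.10), so the new private module is also discoverable there; a one-line check (a sketch, assuming a build with this change):

# _Py_stdlib_module_names backs sys.stdlib_module_names (3.10+).
import sys
print("_tokenize" in sys.stdlib_module_names)  # expected: True on this build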