Add tests for the C tokenizer and expose it as a private module #27924
Changes from all commits: deee43b, 0e47958, 832bf92, 0fa7650, 7abe25a
@@ -0,0 +1,195 @@
#include "Python.h"
#include "../Parser/tokenizer.h"

static struct PyModuleDef _tokenizemodule;

typedef struct {
    PyTypeObject* TokenizerIter;
} tokenize_state;

static tokenize_state*
get_tokenize_state(PyObject* module)
{
    return (tokenize_state*)PyModule_GetState(module);
}

#define _tokenize_get_state_by_type(type) \
    get_tokenize_state(_PyType_GetModuleByDef(type, &_tokenizemodule))

#include "clinic/Python-tokenize.c.h"

/*[clinic input]
module _tokenizer
class _tokenizer.tokenizeriter "tokenizeriterobject *" "_tokenize_get_state_by_type(type)->TokenizerIter"
[clinic start generated code]*/
/*[clinic end generated code: output=da39a3ee5e6b4b0d input=96d98ee2fef7a8bc]*/

typedef struct {
    PyObject_HEAD
    struct tok_state* tok;
} tokenizeriterobject;

/*[clinic input]
@classmethod
_tokenizer.tokenizeriter.__new__ as tokenizeriter_new

    source: str
[clinic start generated code]*/

static PyObject *
tokenizeriter_new_impl(PyTypeObject *type, const char *source)
/*[clinic end generated code: output=7fd9f46cf9263cbb input=4384b368407375c6]*/
{
    tokenizeriterobject* self = (tokenizeriterobject*)type->tp_alloc(type, 0);
    if (self == NULL) {
        return NULL;
    }
    PyObject* filename = PyUnicode_FromString("<string>");
Review comment: This returns a new reference, don't you have to decref it in line 53?
Reply: Ah, I misread the comment. Yeah, we are missing a decref in the error path.
Reply: Addressed in #27935.
    if (filename == NULL) {
        return NULL;
    }
    self->tok = PyTokenizer_FromUTF8(source, 1);
    if (self->tok == NULL) {
        return NULL;
    }
    self->tok->filename = filename;
    return (PyObject*)self;
}
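Not part of the patch: a minimal sketch of the cleanup discussed in the review thread above, assuming the same surroundings as tokenizeriter_new_impl(). The actual fix landed in #27935 and may differ in detail.

```c
    /* Hedged sketch of the error path flagged above: if building the
     * tokenizer fails, drop the reference to the filename string created
     * a few lines earlier instead of leaking it. */
    self->tok = PyTokenizer_FromUTF8(source, 1);
    if (self->tok == NULL) {
        Py_DECREF(filename);             /* the missing decref in the error path */
        return NULL;
    }
    self->tok->filename = filename;      /* on success, the tokenizer owns it */
```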

static PyObject*
tokenizeriter_next(tokenizeriterobject* it)
{
    const char* start;
    const char* end;
    int type = PyTokenizer_Get(it->tok, &start, &end);
    if (type == ERRORTOKEN && PyErr_Occurred()) {
        return NULL;
    }
    if (type == ERRORTOKEN || type == ENDMARKER) {
        PyErr_SetString(PyExc_StopIteration, "EOF");
        return NULL;
    }
    PyObject* str = NULL;
    if (start == NULL || end == NULL) {
        str = PyUnicode_FromString("");
    } else {
        str = PyUnicode_FromStringAndSize(start, end - start);
    }
    if (str == NULL) {
        return NULL;
    }

    Py_ssize_t size = it->tok->inp - it->tok->buf;
    PyObject* line = PyUnicode_DecodeUTF8(it->tok->buf, size, "replace");
    if (line == NULL) {
        Py_DECREF(str);
        return NULL;
    }
    const char* line_start = type == STRING ? it->tok->multi_line_start : it->tok->line_start;
    int lineno = type == STRING ? it->tok->first_lineno : it->tok->lineno;
    int end_lineno = it->tok->lineno;
    int col_offset = -1;
    int end_col_offset = -1;
    if (start != NULL && start >= line_start) {
        col_offset = (int)(start - line_start);
    }
    if (end != NULL && end >= it->tok->line_start) {
        end_col_offset = (int)(end - it->tok->line_start);
    }

    return Py_BuildValue("(NiiiiiN)", str, type, lineno, end_lineno, col_offset, end_col_offset, line);
}
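A side note on the format string above (general Py_BuildValue behaviour, not specific to this patch): "N" packs a Python object and steals its reference, while "i" copies a C int, which is why neither str nor line needs an explicit Py_DECREF on the success path. Spelled out:

```c
    /* Same call as above, annotated.  "N" steals the reference to the
     * object it packs; "i" converts a C int to a Python int. */
    return Py_BuildValue("(NiiiiiN)",
                         str,             /* token text (reference stolen)  */
                         type,            /* token type constant            */
                         lineno,          /* start line of the token        */
                         end_lineno,      /* end line of the token          */
                         col_offset,      /* start column, or -1 if unknown */
                         end_col_offset,  /* end column, or -1 if unknown   */
                         line);           /* current line (reference stolen)*/
```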

static void
tokenizeriter_dealloc(tokenizeriterobject* it)
{
    PyTypeObject* tp = Py_TYPE(it);
    PyTokenizer_Free(it->tok);
    tp->tp_free(it);
    Py_DECREF(tp);
}

static PyType_Slot tokenizeriter_slots[] = {
        {Py_tp_new, tokenizeriter_new},
        {Py_tp_dealloc, tokenizeriter_dealloc},
        {Py_tp_getattro, PyObject_GenericGetAttr},
        {Py_tp_iter, PyObject_SelfIter},
        {Py_tp_iternext, tokenizeriter_next},
        {0, NULL},
};

Review comment: Why are some of the initializers 8-char indented while others are 4-char indented?
Reply: Hummm, I think it is because I messed up the format. All should be 4 chars. Will correct this in another PR unless you manage to do it before.
Reply: Addressed in #27935.

static PyType_Spec tokenizeriter_spec = {
    .name = "_tokenize.TokenizerIter",
    .basicsize = sizeof(tokenizeriterobject),
    .flags = (Py_TPFLAGS_DEFAULT | Py_TPFLAGS_IMMUTABLETYPE),
    .slots = tokenizeriter_slots,
};


static int
tokenizemodule_exec(PyObject* m)
{
    tokenize_state* state = get_tokenize_state(m);
    if (state == NULL) {
        return -1;
    }

    state->TokenizerIter = (PyTypeObject *)PyType_FromModuleAndSpec(
        m, &tokenizeriter_spec, NULL);
    if (state->TokenizerIter == NULL) {
        return -1;
    }
    if (PyModule_AddType(m, state->TokenizerIter) < 0) {
        return -1;
    }

    return 0;
}

static PyMethodDef tokenize_methods[] = {
    {NULL, NULL, 0, NULL} /* Sentinel */
};

static PyModuleDef_Slot tokenizemodule_slots[] = {
    {Py_mod_exec, tokenizemodule_exec},
    {0, NULL}
};

Review comment (on the {Py_mod_exec, tokenizemodule_exec} line): Example of a 4-indented initializer.

static int
tokenizemodule_traverse(PyObject *m, visitproc visit, void *arg)
{
    tokenize_state *state = get_tokenize_state(m);
    Py_VISIT(state->TokenizerIter);
    return 0;
}

static int
tokenizemodule_clear(PyObject *m)
{
    tokenize_state *state = get_tokenize_state(m);
    Py_CLEAR(state->TokenizerIter);
    return 0;
}

static void
tokenizemodule_free(void *m)
{
    tokenizemodule_clear((PyObject *)m);
}

static struct PyModuleDef _tokenizemodule = {
    PyModuleDef_HEAD_INIT,
    .m_name = "_tokenize",
    .m_size = sizeof(tokenize_state),
    .m_slots = tokenizemodule_slots,
    .m_methods = tokenize_methods,
    .m_traverse = tokenizemodule_traverse,
    .m_clear = tokenizemodule_clear,
    .m_free = tokenizemodule_free,
};

PyMODINIT_FUNC
PyInit__tokenize(void)
{
    return PyModuleDef_Init(&_tokenizemodule);
}
A generated file (the argument-clinic output included above as clinic/Python-tokenize.c.h) is not rendered by default.

Review comment: Is this expected?
Reply: Yeah, check other modules that have classes created with the argument clinic. Maybe I am missing what you mean though 😅
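To round out the picture, a hedged sketch (not from the PR) of how the new iterator could be driven from C through the abstract object API, assuming an initialized interpreter in which the _tokenize extension above is built in:

```c
#include "Python.h"

/* Sketch only: import _tokenize, instantiate TokenizerIter for a source
 * string, and print each 7-tuple produced by tokenizeriter_next() above. */
static int
dump_tokens(const char *source)
{
    PyObject *mod = PyImport_ImportModule("_tokenize");
    if (mod == NULL) {
        return -1;
    }
    PyObject *it = PyObject_CallMethod(mod, "TokenizerIter", "s", source);
    Py_DECREF(mod);
    if (it == NULL) {
        return -1;
    }
    PyObject *token;
    while ((token = PyIter_Next(it)) != NULL) {
        /* token == (str, type, lineno, end_lineno,
         *           col_offset, end_col_offset, line) */
        PyObject_Print(token, stdout, Py_PRINT_RAW);
        fputc('\n', stdout);
        Py_DECREF(token);
    }
    Py_DECREF(it);
    /* PyIter_Next() returns NULL both at exhaustion and on error;
     * distinguish the two by checking for a pending exception. */
    return PyErr_Occurred() ? -1 : 0;
}
```

From Python the same type is importable as _tokenize.TokenizerIter, the private module the PR title refers to.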