Simplify _PyPegen_fill_token in pegen.c #25295

Merged 1 commit on Apr 9, 2021

122 changes: 64 additions & 58 deletions Parser/pegen.c
@@ -625,6 +625,64 @@ growable_comment_array_deallocate(growable_comment_array *arr) {
PyMem_Free(arr->items);
}

+static int
+initialize_token(Parser *p, Token *token, const char *start, const char *end, int token_type) {
+    assert(token != NULL);
+
+    token->type = (token_type == NAME) ? _get_keyword_or_name_type(p, start, (int)(end - start)) : token_type;
+    token->bytes = PyBytes_FromStringAndSize(start, end - start);
+    if (token->bytes == NULL) {
+        return -1;
+    }
+
+    if (_PyArena_AddPyObject(p->arena, token->bytes) < 0) {
+        Py_DECREF(token->bytes);
+        return -1;
+    }
+
+    const char *line_start = token_type == STRING ? p->tok->multi_line_start : p->tok->line_start;
+    int lineno = token_type == STRING ? p->tok->first_lineno : p->tok->lineno;
+    int end_lineno = p->tok->lineno;
+
+    int col_offset = (start != NULL && start >= line_start) ? (int)(start - line_start) : -1;
+    int end_col_offset = (end != NULL && end >= p->tok->line_start) ? (int)(end - p->tok->line_start) : -1;
+
+    token->lineno = p->starting_lineno + lineno;
+    token->col_offset = p->tok->lineno == 1 ? p->starting_col_offset + col_offset : col_offset;
+    token->end_lineno = p->starting_lineno + end_lineno;
+    token->end_col_offset = p->tok->lineno == 1 ? p->starting_col_offset + end_col_offset : end_col_offset;
+
+    p->fill += 1;
+
+    if (token_type == ERRORTOKEN && p->tok->done == E_DECODE) {
+        return raise_decode_error(p);
+    }
+
+    return (token_type == ERRORTOKEN ? tokenizer_error(p) : 0);
+}
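
Editorial aside: the column bookkeeping above is plain pointer arithmetic. A token's column is its distance from the start of its line, with -1 as the sentinel when the start pointer falls outside the line. A minimal, self-contained sketch of that computation (column_offset and the sample input are illustrative, not pegen API):

#include <assert.h>

/* Illustrative helper, not part of pegen.c: compute a 0-based column
 * offset for a token starting at tok_start on the line beginning at
 * line_start, or -1 if the token lies outside the line. */
static int
column_offset(const char *tok_start, const char *line_start)
{
    return (tok_start != NULL && tok_start >= line_start)
               ? (int)(tok_start - line_start)
               : -1;
}

int
main(void)
{
    const char *line = "x = 42\n";
    assert(column_offset(line + 4, line) == 4);  /* "42" starts at column 4 */
    assert(column_offset(NULL, line) == -1);     /* no start pointer */
    return 0;
}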

+static int
+_resize_tokens_array(Parser *p) {
+    int newsize = p->size * 2;
+    Token **new_tokens = PyMem_Realloc(p->tokens, newsize * sizeof(Token *));
+    if (new_tokens == NULL) {
+        PyErr_NoMemory();
+        return -1;
+    }
+    p->tokens = new_tokens;
+
+    for (int i = p->size; i < newsize; i++) {
+        p->tokens[i] = PyMem_Calloc(1, sizeof(Token));
+        if (p->tokens[i] == NULL) {
+            p->size = i; // Needed, in order to cleanup correctly after parser fails
+            PyErr_NoMemory();
+            return -1;
+        }
+    }
+    p->size = newsize;
+    return 0;
+}
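
Editorial aside: the helper doubles the array rather than growing it one slot at a time, so repeated fills cost amortized O(1) allocations per token, and recording the last successfully allocated slot in p->size keeps teardown safe after a mid-loop failure. The same pattern in a self-contained sketch (Buffer, Slot, and buffer_grow are made-up names, using plain libc allocators rather than PyMem):

#include <stdlib.h>

/* Made-up types for illustration; pegen uses Parser and Token instead. */
typedef struct { int type; } Slot;
typedef struct {
    Slot **items;
    int fill;   /* number of slots in use */
    int size;   /* number of slots allocated; assumed >= 1, as in pegen */
} Buffer;

static int
buffer_grow(Buffer *b)
{
    int newsize = b->size * 2;
    Slot **items = realloc(b->items, newsize * sizeof(Slot *));
    if (items == NULL) {
        return -1;
    }
    b->items = items;  /* realloc succeeded; adopt the new array */

    for (int i = b->size; i < newsize; i++) {
        b->items[i] = calloc(1, sizeof(Slot));  /* zero-initialized slot */
        if (b->items[i] == NULL) {
            /* Record how far we got so the caller frees only valid slots. */
            b->size = i;
            return -1;
        }
    }
    b->size = newsize;
    return 0;
}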

int
_PyPegen_fill_token(Parser *p)
{
@@ -650,7 +708,8 @@ _PyPegen_fill_token(Parser *p)
type = PyTokenizer_Get(p->tok, &start, &end);
}

-    if (type == ENDMARKER && p->start_rule == Py_single_input && p->parsing_started) {
+    // If we have reached the end and we are in single input mode we need to insert a newline and reset the parsing
+    if (p->start_rule == Py_single_input && type == ENDMARKER && p->parsing_started) {
type = NEWLINE; /* Add an extra newline */
p->parsing_started = 0;

@@ -663,66 +722,13 @@ p->parsing_started = 1;
p->parsing_started = 1;
}

-    if (p->fill == p->size) {
-        int newsize = p->size * 2;
-        Token **new_tokens = PyMem_Realloc(p->tokens, newsize * sizeof(Token *));
-        if (new_tokens == NULL) {
-            PyErr_NoMemory();
-            return -1;
-        }
-        p->tokens = new_tokens;
-
-        for (int i = p->size; i < newsize; i++) {
-            p->tokens[i] = PyMem_Malloc(sizeof(Token));
-            if (p->tokens[i] == NULL) {
-                p->size = i; // Needed, in order to cleanup correctly after parser fails
-                PyErr_NoMemory();
-                return -1;
-            }
-            memset(p->tokens[i], '\0', sizeof(Token));
-        }
-        p->size = newsize;
-    }

-    Token *t = p->tokens[p->fill];
-    t->type = (type == NAME) ? _get_keyword_or_name_type(p, start, (int)(end - start)) : type;
-    t->bytes = PyBytes_FromStringAndSize(start, end - start);
-    if (t->bytes == NULL) {
-        return -1;
-    }
-    if (_PyArena_AddPyObject(p->arena, t->bytes) < 0) {
-        Py_DECREF(t->bytes);
+    // Check if we are at the limit of the token array capacity and resize if needed
+    if ((p->fill == p->size) && (_resize_tokens_array(p) != 0)) {
return -1;
}

-    int lineno = type == STRING ? p->tok->first_lineno : p->tok->lineno;
-    const char *line_start = type == STRING ? p->tok->multi_line_start : p->tok->line_start;
-    int end_lineno = p->tok->lineno;
-    int col_offset = -1;
-    int end_col_offset = -1;
-    if (start != NULL && start >= line_start) {
-        col_offset = (int)(start - line_start);
-    }
-    if (end != NULL && end >= p->tok->line_start) {
-        end_col_offset = (int)(end - p->tok->line_start);
-    }
-
-    t->lineno = p->starting_lineno + lineno;
-    t->col_offset = p->tok->lineno == 1 ? p->starting_col_offset + col_offset : col_offset;
-    t->end_lineno = p->starting_lineno + end_lineno;
-    t->end_col_offset = p->tok->lineno == 1 ? p->starting_col_offset + end_col_offset : end_col_offset;
-
-    p->fill += 1;
-
-    if (type == ERRORTOKEN) {
-        if (p->tok->done == E_DECODE) {
-            return raise_decode_error(p);
-        }
-        return tokenizer_error(p);
-
-    }
-
-    return 0;
+    Token *t = p->tokens[p->fill];
+    return initialize_token(p, t, start, end, type);
}
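
Editorial aside: with the two helpers factored out, every failure path still surfaces as -1 with a Python exception set, so call sites only need a sign check. A hypothetical caller sketching that contract (peek_token is not a real pegen function; Parser, Token, p->fill, and p->tokens are as in the diff above):

/* Hypothetical caller, not in pegen.c: fill tokens lazily until
 * position `pos` is available, relying on _PyPegen_fill_token's
 * 0-on-success / -1-with-exception-set contract. */
static Token *
peek_token(Parser *p, int pos)
{
    while (p->fill <= pos) {
        if (_PyPegen_fill_token(p) < 0) {
            return NULL;  /* exception already set */
        }
    }
    return p->tokens[pos];
}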

// Instrumentation to count the effectiveness of memoization.