Skip to content

gh-105017: Include CRLF lines in strings and column numbers #105030

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 10 commits into from
May 28, 2023
19 changes: 14 additions & 5 deletions Lib/test/test_tokenize.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,11 +85,20 @@ def test_basic(self):
DEDENT '' (5, 0) (5, 0)
""")

self.check_tokenize("foo='bar'\r\n", """\
NAME 'foo' (1, 0) (1, 3)
OP '=' (1, 3) (1, 4)
STRING "'bar'" (1, 4) (1, 9)
NEWLINE '\\n' (1, 9) (1, 10)
self.check_tokenize("if True:\r\n # NL\r\n foo='bar'\r\n\r\n", """\
NAME 'if' (1, 0) (1, 2)
NAME 'True' (1, 3) (1, 7)
OP ':' (1, 7) (1, 8)
NEWLINE '\\r\\n' (1, 8) (1, 10)
COMMENT '# NL' (2, 4) (2, 8)
NL '\\r\\n' (2, 8) (2, 10)
INDENT ' ' (3, 0) (3, 4)
NAME 'foo' (3, 4) (3, 7)
OP '=' (3, 7) (3, 8)
STRING "\'bar\'" (3, 8) (3, 13)
NEWLINE '\\r\\n' (3, 13) (3, 15)
NL '\\r\\n' (4, 0) (4, 2)
DEDENT '' (5, 0) (5, 0)
""")

indent_error_file = b"""\
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Show CRLF line endings in the string attribute of both NL and NEWLINE tokens produced by tokenize. Patch by Marta Gómez.
23 changes: 8 additions & 15 deletions Parser/tokenizer.c
Original file line number Diff line number Diff line change
Expand Up @@ -773,7 +773,6 @@ translate_into_utf8(const char* str, const char* enc) {

static char *
translate_newlines(const char *s, int exec_input, struct tok_state *tok) {
int skip_next_lf = 0;
size_t needed_length = strlen(s) + 2, final_length;
char *buf, *current;
char c = '\0';
Expand All @@ -784,18 +783,8 @@ translate_newlines(const char *s, int exec_input, struct tok_state *tok) {
}
for (current = buf; *s; s++, current++) {
c = *s;
if (skip_next_lf) {
skip_next_lf = 0;
if (c == '\n') {
c = *++s;
if (!c)
break;
}
}
if (c == '\r') {
skip_next_lf = 1;
c = '\n';
}
if (!c)
break;
*current = c;
}
/* If this is exec input, add a newline to the end of the string if
Expand Down Expand Up @@ -1693,7 +1682,7 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t
}
}
tok_backup(tok, c);
if (c == '#' || c == '\n') {
if (c == '#' || c == '\n' || c == '\r') {
/* Lines with only whitespace and/or comments
shouldn't affect the indentation and are
not passed to the parser as NEWLINE tokens,
Expand Down Expand Up @@ -1822,7 +1811,7 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t
const char *prefix, *type_start;
int current_starting_col_offset;

while (c != EOF && c != '\n') {
while (c != EOF && c != '\n' && c != '\r') {
c = tok_nextc(tok);
}

Expand Down Expand Up @@ -2002,6 +1991,10 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t
return MAKE_TOKEN(NAME);
}

if (c == '\r') {
c = tok_nextc(tok);
}

/* Newline */
if (c == '\n') {
tok->atbol = 1;
Expand Down
6 changes: 5 additions & 1 deletion Python/Python-tokenize.c
Original file line number Diff line number Diff line change
Expand Up @@ -240,7 +240,11 @@ tokenizeriter_next(tokenizeriterobject *it)
type = NAME;
}
else if (type == NEWLINE) {
str = PyUnicode_FromString("\n");
if (it->tok->start[0] == '\r') {
str = PyUnicode_FromString("\r\n");
} else {
str = PyUnicode_FromString("\n");
}
end_col_offset++;
}
}
Expand Down