Skip to content

Commit 7829bba

Browse files
ammaraskar authored and taleinat committed
[2.7] bpo-33899: Make tokenize module mirror end-of-file is end-of-line behavior (GH-7891) (#8133)
Most of the change involves fixing up the test suite, which previously made the assumption that there wouldn't be a new line if the input didn't end in one. Contributed by Ammar Askar. (cherry picked from commit c4ef489)
1 parent 9720f60 commit 7829bba

File tree

3 files changed

+47
-12
lines changed

3 files changed

+47
-12
lines changed

Lib/test/test_tokenize.py

Lines changed: 34 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -1,32 +1,54 @@
11
from test import test_support
2-
from tokenize import (untokenize, generate_tokens, NUMBER, NAME, OP,
2+
from tokenize import (untokenize, generate_tokens, NUMBER, NAME, OP, NEWLINE,
33
STRING, ENDMARKER, tok_name, Untokenizer, tokenize)
44
from StringIO import StringIO
55
import os
66
from unittest import TestCase
77

88

9+
# Converts a source string into a list of textual representation
10+
# of the tokens such as:
11+
# ` NAME 'if' (1, 0) (1, 2)`
12+
# to make writing tests easier.
13+
def stringify_tokens_from_source(token_generator, source_string):
14+
result = []
15+
num_lines = len(source_string.splitlines())
16+
missing_trailing_nl = source_string[-1] not in '\r\n'
17+
18+
for type, token, start, end, line in token_generator:
19+
if type == ENDMARKER:
20+
break
21+
# Ignore the new line on the last line if the input lacks one
22+
if missing_trailing_nl and type == NEWLINE and end[0] == num_lines:
23+
continue
24+
type = tok_name[type]
25+
result.append(" %(type)-10.10s %(token)-13.13r %(start)s %(end)s" %
26+
locals())
27+
28+
return result
29+
930
class TokenizeTest(TestCase):
1031
# Tests for the tokenize module.
1132

1233
# The tests can be really simple. Given a small fragment of source
13-
# code, print out a table with tokens. The ENDMARKER is omitted for
14-
# brevity.
34+
# code, print out a table with tokens. The ENDMARKER, ENCODING and
35+
# final NEWLINE are omitted for brevity.
1536

1637
def check_tokenize(self, s, expected):
1738
# Format the tokens in s in a table format.
18-
# The ENDMARKER is omitted.
19-
result = []
2039
f = StringIO(s)
21-
for type, token, start, end, line in generate_tokens(f.readline):
22-
if type == ENDMARKER:
23-
break
24-
type = tok_name[type]
25-
result.append(" %(type)-10.10s %(token)-13.13r %(start)s %(end)s" %
26-
locals())
40+
result = stringify_tokens_from_source(generate_tokens(f.readline), s)
41+
2742
self.assertEqual(result,
2843
expected.rstrip().splitlines())
2944

45+
def test_implicit_newline(self):
46+
# Make sure that the tokenizer puts in an implicit NEWLINE
47+
# when the input lacks a trailing new line.
48+
f = StringIO("x")
49+
tokens = list(generate_tokens(f.readline))
50+
self.assertEqual(tokens[-2][0], NEWLINE)
51+
self.assertEqual(tokens[-1][0], ENDMARKER)
3052

3153
def test_basic(self):
3254
self.check_tokenize("1 + 1", """\
@@ -616,7 +638,7 @@ def test_roundtrip(self):
616638
self.check_roundtrip("if x == 1:\n"
617639
" print x\n")
618640
self.check_roundtrip("# This is a comment\n"
619-
"# This also")
641+
"# This also\n")
620642

621643
# Some people use different formatting conventions, which makes
622644
# untokenize a little trickier. Note that this test involves trailing

Lib/tokenize.py

Lines changed: 10 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -306,8 +306,15 @@ def generate_tokens(readline):
306306
contline = None
307307
indents = [0]
308308

309+
last_line = b''
310+
line = b''
309311
while 1: # loop over lines in stream
310312
try:
313+
# We capture the value of the line variable here because
314+
# readline uses the empty string '' to signal end of input,
315+
# hence `line` itself will always be overwritten at the end
316+
# of this loop.
317+
last_line = line
311318
line = readline()
312319
except StopIteration:
313320
line = ''
@@ -437,6 +444,9 @@ def generate_tokens(readline):
437444
(lnum, pos), (lnum, pos+1), line)
438445
pos += 1
439446

447+
# Add an implicit NEWLINE if the input doesn't end in one
448+
if last_line and last_line[-1] not in '\r\n':
449+
yield (NEWLINE, '', (lnum - 1, len(last_line)), (lnum - 1, len(last_line) + 1), '')
440450
for indent in indents[1:]: # pop remaining indent levels
441451
yield (DEDENT, '', (lnum, 0), (lnum, 0), '')
442452
yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')
Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,3 @@
1+
Tokenize module now implicitly emits a NEWLINE when provided with input that
2+
does not have a trailing new line. This behavior now matches what the C
3+
tokenizer does internally. Contributed by Ammar Askar.

0 commit comments

Comments (0)