Skip to content

bpo-36520: Email header folded incorrectly #13608

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 9 commits into from
Jun 6, 2019
Merged
1 change: 1 addition & 0 deletions Lib/email/_header_value_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -2661,6 +2661,7 @@ def _refold_parse_tree(parse_tree, *, policy):
newline = _steal_trailing_WSP_if_exists(lines)
if newline or part.startswith_fws():
lines.append(newline + tstr)
last_ew = None
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Github won't let me comment on line not in this diff, but Line 2683 of this diff does the similar wrapping with a newline and I was wondering if there could be a code path to trigger that resulting in a similar bug?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That code path never gets triggered when there are UTF-8 characters in the input. Line 2607 collapses the UnstructuredTokenList into a byte string containing the entire input text, and then lines 2609-2619 determine that the text contains UTF-8 characters and sets want_encoding to True. This always sends it down the code path that calls _fold_as_ew() to fold a line with encoded words and moves on to the next token regardless of whether individual tokens are ASCII or UTF-8.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, thanks for detailed explanation! My bad for not looking closely enough.

continue
if not hasattr(part, 'encode'):
# It's not a terminal, try folding the subparts.
Expand Down
131 changes: 131 additions & 0 deletions Lib/test/test_email/test_message.py
Original file line number Diff line number Diff line change
Expand Up @@ -784,6 +784,137 @@ def test_str_defaults_to_utf8(self):
m['Subject'] = 'unicöde'
self.assertEqual(str(m), 'Subject: unicöde\n\n')

def test_folding_with_utf8_encoding_1(self):
# bpo-36520
#
# Fold a line that contains UTF-8 words before
# and after the whitespace fold point, where the
# line length limit is reached within an ASCII
# word.

m = EmailMessage()
m['Subject'] = 'Hello Wörld! Hello Wörld! ' \
'Hello Wörld! Hello Wörld!Hello Wörld!'
self.assertEqual(bytes(m),
b'Subject: Hello =?utf-8?q?W=C3=B6rld!_Hello_W'
b'=C3=B6rld!_Hello_W=C3=B6rld!?=\n'
b' Hello =?utf-8?q?W=C3=B6rld!Hello_W=C3=B6rld!?=\n\n')


def test_folding_with_utf8_encoding_2(self):
# bpo-36520
#
# Fold a line that contains UTF-8 words before
# and after the whitespace fold point, where the
# line length limit is reached at the end of an
# encoded word.

m = EmailMessage()
m['Subject'] = 'Hello Wörld! Hello Wörld! ' \
'Hello Wörlds123! Hello Wörld!Hello Wörld!'
self.assertEqual(bytes(m),
b'Subject: Hello =?utf-8?q?W=C3=B6rld!_Hello_W'
b'=C3=B6rld!_Hello_W=C3=B6rlds123!?=\n'
b' Hello =?utf-8?q?W=C3=B6rld!Hello_W=C3=B6rld!?=\n\n')

def test_folding_with_utf8_encoding_3(self):
# bpo-36520
#
# Fold a line that contains UTF-8 words before
# and after the whitespace fold point, where the
# line length limit is reached at the end of the
# first word.

m = EmailMessage()
m['Subject'] = 'Hello-Wörld!-Hello-Wörld!-Hello-Wörlds123! ' \
'Hello Wörld!Hello Wörld!'
self.assertEqual(bytes(m), \
b'Subject: =?utf-8?q?Hello-W=C3=B6rld!-Hello-W'
b'=C3=B6rld!-Hello-W=C3=B6rlds123!?=\n'
b' Hello =?utf-8?q?W=C3=B6rld!Hello_W=C3=B6rld!?=\n\n')

def test_folding_with_utf8_encoding_4(self):
# bpo-36520
#
# Fold a line that contains UTF-8 words before
# and after the fold point, where the first
# word is UTF-8 and the fold point is within
# the word.

m = EmailMessage()
m['Subject'] = 'Hello-Wörld!-Hello-Wörld!-Hello-Wörlds123!-Hello' \
' Wörld!Hello Wörld!'
self.assertEqual(bytes(m),
b'Subject: =?utf-8?q?Hello-W=C3=B6rld!-Hello-W'
b'=C3=B6rld!-Hello-W=C3=B6rlds123!?=\n'
b' =?utf-8?q?-Hello_W=C3=B6rld!Hello_W=C3=B6rld!?=\n\n')

def test_folding_with_utf8_encoding_5(self):
# bpo-36520
#
# Fold a line that contains a UTF-8 word after
# the fold point.

m = EmailMessage()
m['Subject'] = '123456789 123456789 123456789 123456789 123456789' \
' 123456789 123456789 Hello Wörld!'
self.assertEqual(bytes(m),
b'Subject: 123456789 123456789 123456789 123456789'
b' 123456789 123456789 123456789\n'
b' Hello =?utf-8?q?W=C3=B6rld!?=\n\n')

def test_folding_with_utf8_encoding_6(self):
# bpo-36520
#
# Fold a line that contains a UTF-8 word before
# the fold point and ASCII words after

m = EmailMessage()
m['Subject'] = '123456789 123456789 123456789 123456789 Hello Wörld!' \
' 123456789 123456789 123456789 123456789 123456789' \
' 123456789'
self.assertEqual(bytes(m),
b'Subject: 123456789 123456789 123456789 123456789'
b' Hello =?utf-8?q?W=C3=B6rld!?=\n 123456789 '
b'123456789 123456789 123456789 123456789 '
b'123456789\n\n')

def test_folding_with_utf8_encoding_7(self):
# bpo-36520
#
# Fold a line twice that contains UTF-8 words before
# and after the first fold point, and ASCII words
# after the second fold point.

m = EmailMessage()
m['Subject'] = '123456789 123456789 Hello Wörld! Hello Wörld! ' \
'123456789-123456789 123456789 Hello Wörld! 123456789' \
' 123456789'
self.assertEqual(bytes(m),
b'Subject: 123456789 123456789 Hello =?utf-8?q?'
b'W=C3=B6rld!_Hello_W=C3=B6rld!?=\n'
b' 123456789-123456789 123456789 Hello '
b'=?utf-8?q?W=C3=B6rld!?= 123456789\n 123456789\n\n')

def test_folding_with_utf8_encoding_8(self):
# bpo-36520
#
# Fold a line twice that contains UTF-8 words before
# the first fold point, and ASCII words after the
# first fold point, and UTF-8 words after the second
# fold point.

m = EmailMessage()
m['Subject'] = '123456789 123456789 Hello Wörld! Hello Wörld! ' \
'123456789 123456789 123456789 123456789 123456789 ' \
'123456789-123456789 123456789 Hello Wörld! 123456789' \
' 123456789'
self.assertEqual(bytes(m),
b'Subject: 123456789 123456789 Hello '
b'=?utf-8?q?W=C3=B6rld!_Hello_W=C3=B6rld!?=\n 123456789 '
b'123456789 123456789 123456789 123456789 '
b'123456789-123456789\n 123456789 Hello '
b'=?utf-8?q?W=C3=B6rld!?= 123456789 123456789\n\n')

class TestMIMEPart(TestEmailMessageBase, TestEmailBase):
# Doing the full test run here may seem a bit redundant, since the two
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Lengthy email headers with UTF-8 characters are now properly encoded when they are folded. Patch by Jeffrey Kintscher.