Skip to content

Commit 330c2d9

Browse files
committed
Make the lexer UTF-8 RFC 3629 correct re: prefix octets
RFC 2279 states that, in UTF-8: "The octet values FE and FF never appear." RFC 3629 states that, in UTF-8: "The octet values C0, C1, F5 to FF never appear." Generalize the check to advance past invalid starting bytes for a UTF-8 sequence to fix a crash in the lexer.
1 parent 796994c commit 330c2d9

File tree

4 files changed

+37
-9
lines changed

4 files changed

+37
-9
lines changed

lib/Parse/Lexer.cpp

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -96,7 +96,9 @@ static unsigned CLO8(unsigned char C) {
9696
/// isStartOfUTF8Character - Return true if \p C can begin a UTF-8 encoded
/// character: either an ASCII byte (0x00-0x7F) or the lead byte of a
/// multi-byte sequence (0xC2-0xF4).
///
/// Returns false for continuation bytes, which are of the form 0b10XXXXXX
/// (0x80-0xBF), and for the octets that never appear in well-formed UTF-8.
static bool isStartOfUTF8Character(unsigned char C) {
  // RFC 2279: The octet values FE and FF never appear.
  // RFC 3629: The octet values C0, C1, F5 to FF never appear.
  // Note that 0x80 is itself a continuation byte (0b10000000), so the ASCII
  // range has to end at 0x7F rather than 0x80.
  return C < 0x80 || (C >= 0xC2 && C < 0xF5);
}
101103

102104
/// validateUTF8CharacterAndAdvance - Given a pointer to the starting byte of a
@@ -117,11 +119,7 @@ static uint32_t validateUTF8CharacterAndAdvance(const char *&Ptr,
117119

118120
// If this is 0b10XXXXXX, then it is a continuation character.
119121
if (EncodedBytes == 1 ||
120-
// If the number of encoded bytes is > 4, then this is an invalid
121-
// character in the range of 0xF5 and above. These would start an
122-
// encoding for something that couldn't be represented with UTF16
123-
// digraphs, so Unicode rejects them.
124-
EncodedBytes > 4) {
122+
!isStartOfUTF8Character(CurByte)) {
125123
// Skip until we get the start of another character. This is guaranteed to
126124
// at least stop at the nul at the end of the buffer.
127125
while (Ptr < End && !isStartOfUTF8Character(*Ptr))

test/Parse/invalid-utf8.swift

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,3 +4,34 @@ var
44

55
// Make sure we don't stop processing the whole file.
66
static func foo() {} // expected-error{{static methods may only be declared on a type}} {{1-8=}}
7+
8+
// UTF-8 RFC 2279: The octet values FE and FF never appear.
9+
// UTF-8 RFC 3629: The octet values C0, C1, F5 to FF never appear.
10+
// Below this line are such octets that should be skipped by the lexer.
11+
// They may not be rendered correctly by your text editor, if at all.
12+
13+
// Begin magic UTF-8 garbage
14+
// 0xC0
15+
À // expected-error {{invalid UTF-8 found in source file}}
16+
// 0xC1
17+
Á // expected-error {{invalid UTF-8 found in source file}}
18+
// 0xF5
19+
õ // expected-error {{invalid UTF-8 found in source file}}
20+
// 0xF6
21+
ö // expected-error {{invalid UTF-8 found in source file}}
22+
// 0xF7
23+
÷ // expected-error {{invalid UTF-8 found in source file}}
24+
// 0xF8
25+
ø // expected-error {{invalid UTF-8 found in source file}}
26+
// 0xF9
27+
ù // expected-error {{invalid UTF-8 found in source file}}
28+
// 0xFB
29+
û // expected-error {{invalid UTF-8 found in source file}}
30+
// 0xFC
31+
ü // expected-error {{invalid UTF-8 found in source file}}
32+
// 0xFD
33+
ý // expected-error {{invalid UTF-8 found in source file}}
34+
// End magic UTF-8 garbage
35+
36+
// Make sure we don't stop processing the whole file.
37+
static func bar() {} // expected-error{{static methods may only be declared on a type}} {{1-8=}}

validation-test/IDE/crashers/033-swift-identifier-isoperatorslow.swift

Lines changed: 0 additions & 3 deletions
This file was deleted.
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
// RUN: %target-swift-ide-test -code-completion -code-completion-token=A -source-filename=%s
2+
��#^A^#

0 commit comments

Comments
 (0)