Make the lexer UTF-8 RFC 3629 correct re: prefix octets

bitjammer · bitjammer · commit 330c2d96e649 · 2016-12-05T17:21:17.000-08:00
RFC 2279 states that, in UTF-8:
"The octet values FE and FF never appear."

RFC 3629 states that, in UTF-8:
"The octet values C0, C1, F5 to FF never appear."

RFC 3629 obsoletes RFC 2279, so follow its stricter rule: generalize the
lexer's check so that it advances past every octet that can never start
a UTF-8 sequence. This fixes a crash in the lexer.
diff --git a/lib/Parse/Lexer.cpp b/lib/Parse/Lexer.cpp
@@ -96,7 +96,9 @@ static unsigned CLO8(unsigned char C) {
 /// isStartOfUTF8Character - Return true if this isn't a UTF8 continuation
 /// character, which will be of the form 0b10XXXXXX
 static bool isStartOfUTF8Character(unsigned char C) {
-  return (signed char)C >= 0 || C >= 0xC0;  // C0 = 0b11000000
+  // RFC 2279: The octet values FE and FF never appear.
+  // RFC 3629: The octet values C0, C1, F5 to FF never appear.
+  return C <= 0x80 || (C >= 0xC2 && C < 0xF5);
 }
 
 /// validateUTF8CharacterAndAdvance - Given a pointer to the starting byte of a
@@ -117,11 +119,7 @@ static uint32_t validateUTF8CharacterAndAdvance(const char *&Ptr,
   
   // If this is 0b10XXXXXX, then it is a continuation character.
   if (EncodedBytes == 1 ||
-      // If the number of encoded bytes is > 4, then this is an invalid
-      // character in the range of 0xF5 and above.  These would start an
-      // encoding for something that couldn't be represented with UTF16
-      // digraphs, so Unicode rejects them.
-      EncodedBytes > 4) {
+      !isStartOfUTF8Character(CurByte)) {
     // Skip until we get the start of another character.  This is guaranteed to
     // at least stop at the nul at the end of the buffer.
     while (Ptr < End && !isStartOfUTF8Character(*Ptr))
diff --git a/test/Parse/invalid-utf8.swift b/test/Parse/invalid-utf8.swift
@@ -4,3 +4,34 @@ var 
 
 // Make sure we don't stop processing the whole file.
 static func foo() {} // expected-error{{static methods may only be declared on a type}} {{1-8=}}
+
+// UTF-8 RFC 2279: The octet values FE and FF never appear.
+// UTF-8 RFC 3629: The octet values C0, C1, F5 to FF never appear.
+// Below this line are such octets that should be skipped by the lexer.
+// They may not be rendered correctly by your text editor, if at all.
+
+// Begin magic UTF-8 garbage
+// 0xC0
+� // expected-error {{invalid UTF-8 found in source file}}
+// 0xC1
+� // expected-error {{invalid UTF-8 found in source file}}
+// 0xF5
+� // expected-error {{invalid UTF-8 found in source file}}
+// 0xF6
+� // expected-error {{invalid UTF-8 found in source file}}
+// 0xF7
+� // expected-error {{invalid UTF-8 found in source file}}
+// 0xF8
+� // expected-error {{invalid UTF-8 found in source file}}
+// 0xF9
+� // expected-error {{invalid UTF-8 found in source file}}
+// 0xFB
+� // expected-error {{invalid UTF-8 found in source file}}
+// 0xFC
+� // expected-error {{invalid UTF-8 found in source file}}
+// 0xFD
+� // expected-error {{invalid UTF-8 found in source file}}
+// End magic UTF-8 garbage
+
+// Make sure we don't stop processing the whole file.
+static func bar() {} // expected-error{{static methods may only be declared on a type}} {{1-8=}}
diff --git a/validation-test/IDE/crashers/033-swift-identifier-isoperatorslow.swift b/validation-test/IDE/crashers/033-swift-identifier-isoperatorslow.swift
diff --git a/validation-test/IDE/crashers_fixed/033-swift-identifier-isoperatorslow.swift b/validation-test/IDE/crashers_fixed/033-swift-identifier-isoperatorslow.swift
@@ -0,0 +1,2 @@
+// RUN: %target-swift-ide-test -code-completion -code-completion-token=A -source-filename=%s
+��#^A^#

Original file line number	Diff line number	Diff line change
 // Make sure we don't stop processing the whole file.
 static func foo() {} // expected-error{{static methods may only be declared on a type}} {{1-8=}}
++
 +// UTF-8 RFC 2279: The octet values FE and FF never appear.
 +// UTF-8 RFC 3629: The octet values C0, C1, F5 to FF never appear.
 +// Below this line are such octets that should be skipped by the lexer.
 +// They may not be rendered correctly by your text editor, if at all.
++
 +// Begin magic UTF-8 garbage
 +// 0xC0
 +À // expected-error {{invalid UTF-8 found in source file}}
 +// 0xC1
 +Á // expected-error {{invalid UTF-8 found in source file}}
 +// 0xF5
 +õ // expected-error {{invalid UTF-8 found in source file}}
 +// 0xF6
 +ö // expected-error {{invalid UTF-8 found in source file}}
 +// 0xF7
 +÷ // expected-error {{invalid UTF-8 found in source file}}
 +// 0xF8
 +ø // expected-error {{invalid UTF-8 found in source file}}
 +// 0xF9
 +ù // expected-error {{invalid UTF-8 found in source file}}
 +// 0xFB
 +û // expected-error {{invalid UTF-8 found in source file}}
 +// 0xFC
 +ü // expected-error {{invalid UTF-8 found in source file}}
 +// 0xFD
 +ý // expected-error {{invalid UTF-8 found in source file}}
 +// End magic UTF-8 garbage
++
 +// Make sure we don't stop processing the whole file.
 +static func bar() {} // expected-error{{static methods may only be declared on a type}} {{1-8=}}
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,2 @@`
	`1`	`+// RUN: %target-swift-ide-test -code-completion -code-completion-token=A -source-filename=%s`
	`2`	`+��#^A^#`