swiftlang · swift-ci · Dec 6, 2016 · Dec 6, 2016
diff --git a/lib/Parse/Lexer.cpp b/lib/Parse/Lexer.cpp
@@ -96,7 +96,9 @@ static unsigned CLO8(unsigned char C) {
 /// isStartOfUTF8Character - Return true if this isn't a UTF8 continuation
 /// character, which will be of the form 0b10XXXXXX
 static bool isStartOfUTF8Character(unsigned char C) {
-  return (signed char)C >= 0 || C >= 0xC0;  // C0 = 0b11000000
+  // 0x00-0x7F is ASCII; 0x80-0xBF (0b10XXXXXX) are continuation bytes.
+  // RFC 3629: C0, C1, F5 to FF never appear (covers RFC 2279's FE and FF).
+  return C < 0x80 || (C >= 0xC2 && C < 0xF5);
 }
 
 /// validateUTF8CharacterAndAdvance - Given a pointer to the starting byte of a
@@ -117,11 +119,7 @@ static uint32_t validateUTF8CharacterAndAdvance(const char *&Ptr,
 
   // If this is 0b10XXXXXX, then it is a continuation character.
   if (EncodedBytes == 1 ||
-      // If the number of encoded bytes is > 4, then this is an invalid
-      // character in the range of 0xF5 and above.  These would start an
-      // encoding for something that couldn't be represented with UTF16
-      // digraphs, so Unicode rejects them.
-      EncodedBytes > 4) {
+      !isStartOfUTF8Character(CurByte)) {
     // Skip until we get the start of another character.  This is guaranteed to
     // at least stop at the nul at the end of the buffer.
     while (Ptr < End && !isStartOfUTF8Character(*Ptr))

diff --git a/test/Parse/invalid-utf8.swift b/test/Parse/invalid-utf8.swift
@@ -4,3 +4,34 @@ var
 
 // Make sure we don't stop processing the whole file.
 static func foo() {} // expected-error{{static methods may only be declared on a type}} {{1-8=}}
+
+// UTF-8 RFC 2279: The octet values FE and FF never appear.
+// UTF-8 RFC 3629: The octet values C0, C1, F5 to FF never appear.
+// Below this line are such octets that should be skipped by the lexer.
+// They may not be rendered correctly by your text editor, if at all.
+
+// Begin magic UTF-8 garbage
+// 0xC0
+� // expected-error {{invalid UTF-8 found in source file}}
+// 0xC1
+� // expected-error {{invalid UTF-8 found in source file}}
+// 0xF5
+� // expected-error {{invalid UTF-8 found in source file}}
+// 0xF6
+� // expected-error {{invalid UTF-8 found in source file}}
+// 0xF7
+� // expected-error {{invalid UTF-8 found in source file}}
+// 0xF8
+� // expected-error {{invalid UTF-8 found in source file}}
+// 0xF9
+� // expected-error {{invalid UTF-8 found in source file}}
+// 0xFB
+� // expected-error {{invalid UTF-8 found in source file}}
+// 0xFC
+� // expected-error {{invalid UTF-8 found in source file}}
+// 0xFD
+� // expected-error {{invalid UTF-8 found in source file}}
+// End magic UTF-8 garbage
+
+// Make sure we don't stop processing the whole file.
+static func bar() {} // expected-error{{static methods may only be declared on a type}} {{1-8=}}
diff --git a/validation-test/IDE/crashers/033-swift-identifier-isoperatorslow.swift b/validation-test/IDE/crashers/033-swift-identifier-isoperatorslow.swift
diff --git a/validation-test/IDE/crashers_fixed/033-swift-identifier-isoperatorslow.swift b/validation-test/IDE/crashers_fixed/033-swift-identifier-isoperatorslow.swift
@@ -0,0 +1,2 @@
+// RUN: %target-swift-ide-test -code-completion -code-completion-token=A -source-filename=%s
+��#^A^#
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		// RUN: %target-swift-ide-test -code-completion -code-completion-token=A -source-filename=%s
		��#^A^#