swiftlang · omochi · Mar 5, 2018 · rintaro · Mar 6, 2018 · omochi
diff --git a/lib/Parse/Lexer.cpp b/lib/Parse/Lexer.cpp
@@ -2106,59 +2106,72 @@ void Lexer::lexImpl() {
       // start, attempt to recover by eating more continuation characters.
       diagnose(CurPtr-1, diag::lex_invalid_identifier_start_character);
       while (advanceIfValidContinuationOfIdentifier(tmp, BufferEnd));
-    } else {
-      // This character isn't allowed in Swift source.
-      uint32_t codepoint = validateUTF8CharacterAndAdvance(tmp, BufferEnd);
-      if (codepoint == ~0U) {
-        diagnose(CurPtr-1, diag::lex_invalid_utf8)
-          .fixItReplaceChars(getSourceLoc(CurPtr-1), getSourceLoc(tmp), " ");
-        CurPtr = tmp;
-        goto Restart;  // Skip presumed whitespace.
-      } else if (codepoint == 0x0000201D) {
-        // If this is an end curly quote, just diagnose it with a fixit hint.
-        diagnose(CurPtr-1, diag::lex_invalid_curly_quote)
-          .fixItReplaceChars(getSourceLoc(CurPtr-1), getSourceLoc(tmp), "\"");
-      } else if (codepoint == 0x0000201C) {
-        auto endPtr = tmp;
-        // If this is a start curly quote, do a fuzzy match of a string literal
-        // to improve recovery.
-        if (auto tmp2 = findEndOfCurlyQuoteStringLiteral(tmp))
-          tmp = tmp2;
-
-        // Note, we intentionally diagnose the end quote before the start quote,
-        // so that the IDE suggests fixing the end quote before the start quote.
-        // This, in turn, works better with our error recovery because we won't
-        // diagnose an end curly quote in the middle of a straight quoted
-        // literal.
-        diagnose(CurPtr-1, diag::lex_invalid_curly_quote)
-          .fixItReplaceChars(getSourceLoc(CurPtr-1), getSourceLoc(endPtr),"\"");
 
-      } else {
-        diagnose(CurPtr-1, diag::lex_invalid_character)
-          .fixItReplaceChars(getSourceLoc(CurPtr-1), getSourceLoc(tmp), " ");
-
-        char expectedCodepoint;
-        if ((expectedCodepoint =
-            confusable::tryConvertConfusableCharacterToASCII(codepoint))) {
-
-          llvm::SmallString<4> confusedChar;
-          EncodeToUTF8(codepoint, confusedChar);
-          llvm::SmallString<1> expectedChar;
-          expectedChar += expectedCodepoint;
-          diagnose(CurPtr-1, diag::lex_confusable_character,
-                   confusedChar, expectedChar)
-            .fixItReplaceChars(getSourceLoc(CurPtr-1),
-                               getSourceLoc(tmp),
-                               expectedChar);
-        }
+      CurPtr = tmp;
+      return formToken(tok::unknown, TokStart);
+    }
 
-        CurPtr = tmp;
-        goto Restart;  // Skip presumed whitespace.
+    // This character isn't allowed in Swift source.
+    uint32_t codepoint = validateUTF8CharacterAndAdvance(tmp, BufferEnd);
+    if (codepoint == ~0U) {
+      diagnose(CurPtr - 1, diag::lex_invalid_utf8)
+          .fixItReplaceChars(getSourceLoc(CurPtr - 1), getSourceLoc(tmp), " ");
+      CurPtr = tmp;
+      if (TriviaRetention == TriviaRetentionMode::WithTrivia) {
+        size_t TriviaLength = CurPtr - TokStart;
+        LeadingTrivia.push_back(
+            TriviaPiece::garbageText({TokStart, TriviaLength}));
       }
+      goto Restart; // Skip presumed whitespace.
+    } else if (codepoint == 0x0000201D) {
+      // If this is an end curly quote, just diagnose it with a fixit hint.
+      diagnose(CurPtr - 1, diag::lex_invalid_curly_quote)
+          .fixItReplaceChars(getSourceLoc(CurPtr - 1), getSourceLoc(tmp), "\"");
+      CurPtr = tmp;
+      return formToken(tok::unknown, TokStart);
+    } else if (codepoint == 0x0000201C) {
+      auto endPtr = tmp;
+      // If this is a start curly quote, do a fuzzy match of a string literal
+      // to improve recovery.
+      if (auto tmp2 = findEndOfCurlyQuoteStringLiteral(tmp))
+        tmp = tmp2;
+
+      // Note, we intentionally diagnose the end quote before the start quote,
+      // so that the IDE suggests fixing the end quote before the start quote.
+      // This, in turn, works better with our error recovery because we won't
+      // diagnose an end curly quote in the middle of a straight quoted
+      // literal.
+      diagnose(CurPtr - 1, diag::lex_invalid_curly_quote)
+          .fixItReplaceChars(getSourceLoc(CurPtr - 1), getSourceLoc(endPtr),
+                             "\"");
+      CurPtr = tmp;
+      return formToken(tok::unknown, TokStart);
+    }
+
+    diagnose(CurPtr - 1, diag::lex_invalid_character)
+        .fixItReplaceChars(getSourceLoc(CurPtr - 1), getSourceLoc(tmp), " ");
+
+    char expectedCodepoint;
+    if ((expectedCodepoint =
+             confusable::tryConvertConfusableCharacterToASCII(codepoint))) {
+
+      llvm::SmallString<4> confusedChar;
+      EncodeToUTF8(codepoint, confusedChar);
+      llvm::SmallString<1> expectedChar;
+      expectedChar += expectedCodepoint;
+      diagnose(CurPtr - 1, diag::lex_confusable_character, confusedChar,
+               expectedChar)
+          .fixItReplaceChars(getSourceLoc(CurPtr - 1), getSourceLoc(tmp),
+                             expectedChar);
     }
 
     CurPtr = tmp;
-    return formToken(tok::unknown, TokStart);
+    if (TriviaRetention == TriviaRetentionMode::WithTrivia) {
+      size_t TriviaLength = CurPtr - TokStart;
+      LeadingTrivia.push_back(
+          TriviaPiece::garbageText({TokStart, TriviaLength}));
+    }
+    goto Restart; // Skip presumed whitespace.
   }
 
   case '\n':
@@ -2341,7 +2354,6 @@ void Lexer::lexTrivia(syntax::Trivia &Pieces, bool IsForTrailingTrivia) {
 Restart:
   const char *TriviaStart = CurPtr;
 
-  // TODO: Handle invalid UTF8 sequence which is skipped in lexImpl().
   switch (*CurPtr++) {
   case '\n':
     if (IsForTrailingTrivia)

diff --git a/test/Syntax/round_trip_invalids.swift b/test/Syntax/round_trip_invalids.swift
@@ -0,0 +1,23 @@
+// To know about setup, see `tokens_invalids.swift`.
+
+// RUN: cat %s > %t
+// RUN: cat %t | sed 's/'$(echo -ne "\x5a1")'/'$(echo -ne "\xc2")'/g' > %t.sed
+// RUN: cp -f %t.sed %t
+// RUN: cat %t | sed 's/'$(echo -ne "\x5a2")'/'$(echo -ne "\xcc\x82")'/g' > %t.sed
+// RUN: cp -f %t.sed %t
+// RUN: cat %t | sed 's/'$(echo -ne "\x5a3")'/'$(echo -ne "\xe2\x80\x9d")'/g' > %t.sed
+// RUN: cp -f %t.sed %t
+// RUN: cat %t | sed 's/'$(echo -ne "\x5a4")'/'$(echo -ne "\xe2\x80\x9c")'/g' > %t.sed
+// RUN: cp -f %t.sed %t
+// RUN: cat %t | sed 's/'$(echo -ne "\x5a5")'/'$(echo -ne "\xe1\x9a\x80")'/g' > %t.sed
+// RUN: cp -f %t.sed %t
+
+// RUN: %round-trip-syntax-test --swift-syntax-test %swift-syntax-test --file %t
+
+x
+Z1 x
+Z2
+Z3
+Z4
+Z4 abcdef Z3
+Z5 x
diff --git a/test/Syntax/tokens_invalids.swift b/test/Syntax/tokens_invalids.swift
@@ -0,0 +1,79 @@
+// RUN: cat %s > %t
+
+// 5a is Z. "ZN" style marker is used for marker. N is number.
+
+// C2 is utf8 2 byte character start byte.
+// RUN: cat %t | sed 's/'$(echo -ne "\x5a1")'/'$(echo -ne "\xc2")'/g' > %t.sed
+// RUN: cp -f %t.sed %t
+
+// CC 82 is U+0302, invalid for identifier start, valid for identifier body.
+// RUN: cat %t | sed 's/'$(echo -ne "\x5a2")'/'$(echo -ne "\xcc\x82")'/g' > %t.sed
+// RUN: cp -f %t.sed %t
+
+// E2 80 9D is U+201D, right quote.
+// RUN: cat %t | sed 's/'$(echo -ne "\x5a3")'/'$(echo -ne "\xe2\x80\x9d")'/g' > %t.sed
+// RUN: cp -f %t.sed %t
+
+// E2 80 9C is U+201C, left quote.
+// RUN: cat %t | sed 's/'$(echo -ne "\x5a4")'/'$(echo -ne "\xe2\x80\x9c")'/g' > %t.sed
+// RUN: cp -f %t.sed %t
+
+// E1 9A 80 is U+1680, invalid for swift source.
+// RUN: cat %t | sed 's/'$(echo -ne "\x5a5")'/'$(echo -ne "\xe1\x9a\x80")'/g' > %t.sed
+// RUN: cp -f %t.sed %t
+
+// RUN: %swift-syntax-test -input-source-filename %t -dump-full-tokens 2>&1 | %FileCheck %t
+
+x
+Z1 x
+Z2
+Z3
+Z4
+Z4 abcdef Z3
+Z5 x
+
+// test diagnostics.
+
+// CHECK: 28:1: error: invalid UTF-8 found in source file
+// CHECK: 29:1: error: an identifier cannot begin with this character
+// CHECK: 30:1: error: unicode curly quote found
+// CHECK: 31:1: error: unicode curly quote found
+// CHECK: 32:12: error: unicode curly quote found
+// CHECK: 32:1: error: unicode curly quote found
+// CHECK: 33:1: error: invalid character in source file
+
+// test tokens and trivias.
+
+// CHECK-LABEL: 28:3
+// CHECK-NEXT:  (Token identifier
+// CHECK-NEXT:   (trivia newline 1)
+// CHECK-NEXT:   (trivia garbage_text \302)
+// CHECK-NEXT:   (trivia space 1)
+// CHECK-NEXT:   (text="x"))
+
+// CHECK-LABEL: 29:1
+// CHECK-NEXT:  (Token unknown
+// CHECK-NEXT:   (trivia newline 1)
+// CHECK-NEXT:   (text="\xCC\x82"))
+
+// CHECK-LABEL: 30:1
+// CHECK-NEXT:  (Token unknown
+// CHECK-NEXT:   (trivia newline 1)
+// CHECK-NEXT:   (text="\xE2\x80\x9D"))
+
+// CHECK-LABEL: 31:1
+// CHECK-NEXT:  (Token unknown
+// CHECK-NEXT:   (trivia newline 1)
+// CHECK-NEXT:   (text="\xE2\x80\x9C"))
+
+// CHECK-LABEL: 32:1
+// CHECK-NEXT:  (Token unknown
+// CHECK-NEXT:   (trivia newline 1)
+// CHECK-NEXT:   (text="\xE2\x80\x9C abcdef \xE2\x80\x9D"))
+
+// CHECK-LABEL: 33:5
+// CHECK-NEXT:  (Token identifier
+// CHECK-NEXT:   (trivia newline 1)
+// CHECK-NEXT:   (trivia garbage_text \341\232\200)
+// CHECK-NEXT:   (trivia space 1)
+// CHECK-NEXT:   (text="x"))