swiftlang · omochi · Mar 4, 2018 · Mar 4, 2018 · Mar 4, 2018
diff --git a/include/swift/Parse/Lexer.h b/include/swift/Parse/Lexer.h
@@ -525,13 +525,17 @@ class Lexer {
   void lexEscapedIdentifier();
 
   void tryLexEditorPlaceholder();
-  const char *findEndOfCurlyQuoteStringLiteral(const char*);
+  const char *findEndOfCurlyQuoteStringLiteral(const char *,
+                                               bool EmitDiagnostics);
 
   /// Try to lex conflict markers by checking for the presence of the start and
   /// end of the marker in diff3 or Perforce style respectively.
   bool tryLexConflictMarker(bool EatNewline);
 
   NulCharacterKind getNulCharacterKind(const char *Ptr) const;
+
+  /// Lex invalid characters and return whether they should be tokenized.
+  bool lexInvalidCharacters(const char *&Ptr, bool InLexTrivia);
 };
 
 /// Given an ordered token \param Array , get the iterator pointing to the first

diff --git a/lib/Parse/Lexer.cpp b/lib/Parse/Lexer.cpp
@@ -557,6 +557,43 @@ static bool advanceIfValidContinuationOfOperator(char const *&ptr,
   return advanceIf(ptr, end, Identifier::isOperatorContinuationCodePoint);
 }
 
+static bool isStartOfInvalidCharacters(const char *Ptr, const char *EndPtr) {
+  // True if *Ptr starts invalid characters; keep in sync with lexImpl's switch.
+  switch ((signed char)*Ptr) {
+  case '\n': case '\r':
+  case ' ': case '\t': case '\f': case '\v':
+  case -1: case -2:
+  case 0:
+  case '@': case '{': case '[': case '(': case '}': case ']': case ')':
+  case ',': case ';': case ':': case '\\': case '#': case '/': case '%':
+  case '!': case '?': case '<': case '>': case '=':
+  case '-': case '+': case '*': case '&': case '|': case '^': case '~': case '.':
+  case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G':
+  case 'H': case 'I': case 'J': case 'K': case 'L': case 'M': case 'N':
+  case 'O': case 'P': case 'Q': case 'R': case 'S': case 'T': case 'U':
+  case 'V': case 'W': case 'X': case 'Y': case 'Z':
+  case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g':
+  case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n':
+  case 'o': case 'p': case 'q': case 'r': case 's': case 't': case 'u':
+  case 'v': case 'w': case 'x': case 'y': case 'z':
+  case '_': case '$':
+  case '0': case '1': case '2': case '3': case '4':
+  case '5': case '6': case '7': case '8': case '9':
+  case '"': case '\'': case '`':
+    return false; // Every byte listed above is not an invalid-character start.
+  default: {
+    if (advanceIfValidStartOfIdentifier(Ptr, EndPtr)) {
+      return false; // A valid identifier start is not invalid.
+    }
+    if (advanceIfValidStartOfOperator(Ptr, EndPtr)) {
+      return false; // A valid operator start is not invalid.
+    }
+
+    return true; // Neither an identifier start nor an operator start.
+  }
+  }
+}
+
 bool Lexer::isIdentifier(StringRef string) {
   if (string.empty()) return false;
   char const *p = string.data(), *end = string.end();
@@ -1739,8 +1776,9 @@ void Lexer::lexStringLiteral() {
 /// string literal, diagnose the problem and return a pointer to the end of the
 /// entire string literal.  This helps us avoid parsing the body of the string
 /// as program tokens, which will only lead to massive confusion.
-const char *Lexer::findEndOfCurlyQuoteStringLiteral(const char *Body) {
-
+const char *Lexer::findEndOfCurlyQuoteStringLiteral(const char *Body,
+                                                    bool EmitDiagnostics) {
+
   while (true) {
     // Don't bother with string interpolations.
     if (*Body == '\\' && *(Body + 1) == '(')
@@ -1752,7 +1790,7 @@ const char *Lexer::findEndOfCurlyQuoteStringLiteral(const char *Body) {
 
     // Get the next character.
     const char *CharStart = Body;
-    unsigned CharValue = lexCharacter(Body, '\0', false);
+    unsigned CharValue = lexCharacter(Body, '\0', /*EmitDiagnostics=*/false);
     // If the character was incorrectly encoded, give up.
     if (CharValue == ~1U) return nullptr;
 
@@ -1764,8 +1802,11 @@ const char *Lexer::findEndOfCurlyQuoteStringLiteral(const char *Body) {
     // If we found an ending curly quote (common since this thing started with
     // an opening curly quote) diagnose it with a fixit and then return.
     if (CharValue == 0x0000201D) {
-      diagnose(CharStart, diag::lex_invalid_curly_quote)
-        .fixItReplaceChars(getSourceLoc(CharStart), getSourceLoc(Body), "\"");
+      if (EmitDiagnostics) {
+        diagnose(CharStart, diag::lex_invalid_curly_quote)
+            .fixItReplaceChars(getSourceLoc(CharStart), getSourceLoc(Body),
+                               "\"");
+      }
       return Body;
     }
 
@@ -1875,6 +1916,87 @@ Lexer::NulCharacterKind Lexer::getNulCharacterKind(const char *Ptr) const {
   return NulCharacterKind::Embedded;
 }
 
+bool Lexer::lexInvalidCharacters(const char *&Ptr, bool InLexTrivia) {
+  // Returns true if [StartPtr, Ptr) should become a token, false to skip it.
+  // In lexTrivia, diagnose only the characters that will not be tokenized.
+  assert(Ptr != nullptr);
+
+  const char *const StartPtr = Ptr;
+
+  if (advanceIfValidContinuationOfIdentifier(Ptr, BufferEnd)) {
+    // If this is a valid identifier continuation, but not a valid identifier
+    // start, attempt to recover by eating more continuation characters.
+    if (!InLexTrivia) {
+      diagnose(StartPtr, diag::lex_invalid_identifier_start_character);
+    }
+    while (advanceIfValidContinuationOfIdentifier(Ptr, BufferEnd))
+      ;
+    return true;
+  }
+
+  // This character isn't allowed in Swift source.
+  uint32_t codepoint = validateUTF8CharacterAndAdvance(Ptr, BufferEnd);
+  if (codepoint == ~0U) {
+    diagnose(StartPtr, diag::lex_invalid_utf8)
+        .fixItReplaceChars(getSourceLoc(StartPtr), getSourceLoc(Ptr), " ");
+    // Skip presumed whitespace.
+    return false;
+  }
+
+  if (codepoint == 0x0000201D) {
+    // If this is an end curly quote, just diagnose it with a fixit hint.
+    if (!InLexTrivia) {
+      diagnose(StartPtr, diag::lex_invalid_curly_quote)
+          .fixItReplaceChars(getSourceLoc(StartPtr), getSourceLoc(Ptr), "\"");
+    }
+    return true;
+  }
+
+  if (codepoint == 0x0000201C) {
+    const char *const LeftQuoteEndPtr = Ptr;
+    bool EmitDiagnostics = !InLexTrivia;
+
+    // If this is a start curly quote, do a fuzzy match of a string literal
+    // to improve recovery.
+    if (const char *const RightQuoteEndPtr =
+            findEndOfCurlyQuoteStringLiteral(Ptr, EmitDiagnostics)) {
+      Ptr = RightQuoteEndPtr;
+    }
+
+    // Note, we intentionally diagnose the end quote before the start quote,
+    // so that the IDE suggests fixing the end quote before the start quote.
+    // This, in turn, works better with our error recovery because we won't
+    // diagnose an end curly quote in the middle of a straight quoted
+    // literal.
+    if (EmitDiagnostics) {
+      diagnose(StartPtr, diag::lex_invalid_curly_quote)
+          .fixItReplaceChars(getSourceLoc(StartPtr),
+                             getSourceLoc(LeftQuoteEndPtr), "\"");
+    }
+    return true;
+  }
+
+  diagnose(StartPtr, diag::lex_invalid_character)
+      .fixItReplaceChars(getSourceLoc(StartPtr), getSourceLoc(Ptr), " ");
+
+  char expectedCodepoint;
+  if ((expectedCodepoint =
+           confusable::tryConvertConfusableCharacterToASCII(codepoint))) {
+
+    llvm::SmallString<4> confusedChar;
+    EncodeToUTF8(codepoint, confusedChar);
+    llvm::SmallString<1> expectedChar;
+    expectedChar += expectedCodepoint;
+    diagnose(StartPtr, diag::lex_confusable_character, confusedChar,
+             expectedChar)
+        .fixItReplaceChars(getSourceLoc(StartPtr), getSourceLoc(Ptr),
+                           expectedChar);
+  }
+
+  // Skip presumed whitespace.
+  return false;
+}
+
 void Lexer::tryLexEditorPlaceholder() {
   assert(CurPtr[-1] == '<' && CurPtr[0] == '#');
   const char *TokStart = CurPtr-1;
@@ -2100,65 +2222,15 @@ void Lexer::lexImpl() {
 
     if (advanceIfValidStartOfOperator(tmp, BufferEnd))
       return lexOperatorIdentifier();
-
-    if (advanceIfValidContinuationOfIdentifier(tmp, BufferEnd)) {
-      // If this is a valid identifier continuation, but not a valid identifier
-      // start, attempt to recover by eating more continuation characters.
-      diagnose(CurPtr-1, diag::lex_invalid_identifier_start_character);
-      while (advanceIfValidContinuationOfIdentifier(tmp, BufferEnd));
-    } else {
-      // This character isn't allowed in Swift source.
-      uint32_t codepoint = validateUTF8CharacterAndAdvance(tmp, BufferEnd);
-      if (codepoint == ~0U) {
-        diagnose(CurPtr-1, diag::lex_invalid_utf8)
-          .fixItReplaceChars(getSourceLoc(CurPtr-1), getSourceLoc(tmp), " ");
-        CurPtr = tmp;
-        goto Restart;  // Skip presumed whitespace.
-      } else if (codepoint == 0x0000201D) {
-        // If this is an end curly quote, just diagnose it with a fixit hint.
-        diagnose(CurPtr-1, diag::lex_invalid_curly_quote)
-          .fixItReplaceChars(getSourceLoc(CurPtr-1), getSourceLoc(tmp), "\"");
-      } else if (codepoint == 0x0000201C) {
-        auto endPtr = tmp;
-        // If this is a start curly quote, do a fuzzy match of a string literal
-        // to improve recovery.
-        if (auto tmp2 = findEndOfCurlyQuoteStringLiteral(tmp))
-          tmp = tmp2;
-
-        // Note, we intentionally diagnose the end quote before the start quote,
-        // so that the IDE suggests fixing the end quote before the start quote.
-        // This, in turn, works better with our error recovery because we won't
-        // diagnose an end curly quote in the middle of a straight quoted
-        // literal.
-        diagnose(CurPtr-1, diag::lex_invalid_curly_quote)
-          .fixItReplaceChars(getSourceLoc(CurPtr-1), getSourceLoc(endPtr),"\"");
 
-      } else {
-        diagnose(CurPtr-1, diag::lex_invalid_character)
-          .fixItReplaceChars(getSourceLoc(CurPtr-1), getSourceLoc(tmp), " ");
-
-        char expectedCodepoint;
-        if ((expectedCodepoint =
-            confusable::tryConvertConfusableCharacterToASCII(codepoint))) {
-
-          llvm::SmallString<4> confusedChar;
-          EncodeToUTF8(codepoint, confusedChar);
-          llvm::SmallString<1> expectedChar;
-          expectedChar += expectedCodepoint;
-          diagnose(CurPtr-1, diag::lex_confusable_character,
-                   confusedChar, expectedChar)
-            .fixItReplaceChars(getSourceLoc(CurPtr-1),
-                               getSourceLoc(tmp),
-                               expectedChar);
-        }
+    bool ShouldTokenize = lexInvalidCharacters(tmp, /*InLexTrivia=*/false);
+    CurPtr = tmp;
 
-        CurPtr = tmp;
-        goto Restart;  // Skip presumed whitespace.
-      }
+    if (ShouldTokenize) {
+      return formToken(tok::unknown, TokStart);
     }
 
-    CurPtr = tmp;
-    return formToken(tok::unknown, TokStart);
+    goto Restart;
   }
 
   case '\n':
@@ -2341,7 +2413,6 @@ void Lexer::lexTrivia(syntax::Trivia &Pieces, bool IsForTrailingTrivia) {
 Restart:
   const char *TriviaStart = CurPtr;
 
-  // TODO: Handle invalid UTF8 sequence which is skipped in lexImpl().
   switch (*CurPtr++) {
   case '\n':
     if (IsForTrailingTrivia)
@@ -2433,8 +2504,22 @@ void Lexer::lexTrivia(syntax::Trivia &Pieces, bool IsForTrailingTrivia) {
       break;
     }
     break;
-  default:
-    break;
+  default: {
+    const char *Ptr = CurPtr - 1;
+    if (!isStartOfInvalidCharacters(Ptr, BufferEnd)) {
+      break;
+    }
+
+    bool ShouldTokenize = lexInvalidCharacters(Ptr, /*InLexTrivia=*/true);
+    if (ShouldTokenize) {
+      break;
+    }
+
+    CurPtr = Ptr;
+    size_t Length = CurPtr - TriviaStart;
+    Pieces.push_back(TriviaPiece::garbageText({TriviaStart, Length}));
+    goto Restart;
+  }
   }
   // Reset the cursor.
   --CurPtr;

diff --git a/test/Syntax/round_trip_invalids.swift b/test/Syntax/round_trip_invalids.swift
@@ -0,0 +1,23 @@
+// For an explanation of the setup below, see `tokens_invalids.swift`.
+
+// RUN: cat %s > %t
+// RUN: cat %t | sed 's/'$(echo -ne "\x5a1")'/'$(echo -ne "\xc2")'/g' > %t.sed
+// RUN: cp -f %t.sed %t
+// RUN: cat %t | sed 's/'$(echo -ne "\x5a2")'/'$(echo -ne "\xcc\x82")'/g' > %t.sed
+// RUN: cp -f %t.sed %t
+// RUN: cat %t | sed 's/'$(echo -ne "\x5a3")'/'$(echo -ne "\xe2\x80\x9d")'/g' > %t.sed
+// RUN: cp -f %t.sed %t
+// RUN: cat %t | sed 's/'$(echo -ne "\x5a4")'/'$(echo -ne "\xe2\x80\x9c")'/g' > %t.sed
+// RUN: cp -f %t.sed %t
+// RUN: cat %t | sed 's/'$(echo -ne "\x5a5")'/'$(echo -ne "\xe1\x9a\x80")'/g' > %t.sed
+// RUN: cp -f %t.sed %t
+
+// RUN: %round-trip-syntax-test --swift-syntax-test %swift-syntax-test --file %t
+
+x
+Z1 x
+Z2
+Z3
+Z4
+Z4 abcdef Z3
+Z5 x
diff --git a/test/Syntax/tokens_invalids.swift b/test/Syntax/tokens_invalids.swift
@@ -0,0 +1,79 @@
+// RUN: cat %s > %t
+
+// 0x5a is ASCII 'Z'. Markers are written in "ZN" style, where N is a number.
+
+// C2 is the start byte of a 2-byte UTF-8 character.
+// RUN: cat %t | sed 's/'$(echo -ne "\x5a1")'/'$(echo -ne "\xc2")'/g' > %t.sed
+// RUN: cp -f %t.sed %t
+
+// CC 82 is U+0302, invalid for identifier start, valid for identifier body.
+// RUN: cat %t | sed 's/'$(echo -ne "\x5a2")'/'$(echo -ne "\xcc\x82")'/g' > %t.sed
+// RUN: cp -f %t.sed %t
+
+// E2 80 9D is U+201D, right quote.
+// RUN: cat %t | sed 's/'$(echo -ne "\x5a3")'/'$(echo -ne "\xe2\x80\x9d")'/g' > %t.sed
+// RUN: cp -f %t.sed %t
+
+// E2 80 9C is U+201C, left quote.
+// RUN: cat %t | sed 's/'$(echo -ne "\x5a4")'/'$(echo -ne "\xe2\x80\x9c")'/g' > %t.sed
+// RUN: cp -f %t.sed %t
+
+// E1 9A 80 is U+1680, invalid for swift source.
+// RUN: cat %t | sed 's/'$(echo -ne "\x5a5")'/'$(echo -ne "\xe1\x9a\x80")'/g' > %t.sed
+// RUN: cp -f %t.sed %t
+
+// RUN: %swift-syntax-test -input-source-filename %t -dump-full-tokens 2>&1 | %FileCheck %t
+
+x
+Z1 x
+Z2
+Z3
+Z4
+Z4 abcdef Z3
+Z5 x
+
+// test diagnostics.
+
+// CHECK: 28:1: error: invalid UTF-8 found in source file
+// CHECK: 29:1: error: an identifier cannot begin with this character
+// CHECK: 30:1: error: unicode curly quote found
+// CHECK: 31:1: error: unicode curly quote found
+// CHECK: 32:12: error: unicode curly quote found
+// CHECK: 32:1: error: unicode curly quote found
+// CHECK: 33:1: error: invalid character in source file
+
+// test tokens and trivias.
+
+// CHECK-LABEL: 28:3
+// CHECK-NEXT:  (Token identifier
+// CHECK-NEXT:   (trivia newline 1)
+// CHECK-NEXT:   (trivia garbage_text \302)
+// CHECK-NEXT:   (trivia space 1)
+// CHECK-NEXT:   (text="x"))
+
+// CHECK-LABEL: 29:1
+// CHECK-NEXT:  (Token unknown
+// CHECK-NEXT:   (trivia newline 1)
+// CHECK-NEXT:   (text="\xCC\x82"))
+
+// CHECK-LABEL: 30:1
+// CHECK-NEXT:  (Token unknown
+// CHECK-NEXT:   (trivia newline 1)
+// CHECK-NEXT:   (text="\xE2\x80\x9D"))
+
+// CHECK-LABEL: 31:1
+// CHECK-NEXT:  (Token unknown
+// CHECK-NEXT:   (trivia newline 1)
+// CHECK-NEXT:   (text="\xE2\x80\x9C"))
+
+// CHECK-LABEL: 32:1
+// CHECK-NEXT:  (Token unknown
+// CHECK-NEXT:   (trivia newline 1)
+// CHECK-NEXT:   (text="\xE2\x80\x9C abcdef \xE2\x80\x9D"))
+
+// CHECK-LABEL: 33:5
+// CHECK-NEXT:  (Token identifier
+// CHECK-NEXT:   (trivia newline 1)
+// CHECK-NEXT:   (trivia garbage_text \341\232\200)
+// CHECK-NEXT:   (trivia space 1)
+// CHECK-NEXT:   (text="x"))