[Parse] handle invalid chars in lexTrivia

omochi · omochi · commit c56ae83485a9 · 2018-03-05T19:23:29.000+09:00
diff --git a/lib/Parse/Lexer.cpp b/lib/Parse/Lexer.cpp
@@ -557,6 +557,43 @@ static bool advanceIfValidContinuationOfOperator(char const *&ptr,
   return advanceIf(ptr, end, Identifier::isOperatorContinuationCodePoint);
 }
 
+static bool isStartOfInvalidCharacters(const char *Ptr, const char *EndPtr) {
+  // True if Ptr starts an invalid character. Must mirror the switch in lexImpl.
+  switch ((signed char)*Ptr) {
+  case '\n': case '\r':
+  case ' ': case '\t': case '\f': case '\v':
+  case -1: case -2: // Bytes 0xFF and 0xFE when viewed as signed char.
+  case 0: // NUL byte.
+  case '@': case '{': case '[': case '(': case '}': case ']': case ')':
+  case ',': case ';': case ':': case '\\': case '#': case '/': case '%':
+  case '!': case '?': case '<': case '>': case '=':
+  case '-': case '+': case '*': case '&': case '|': case '^': case '~': case '.':
+  case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G':
+  case 'H': case 'I': case 'J': case 'K': case 'L': case 'M': case 'N':
+  case 'O': case 'P': case 'Q': case 'R': case 'S': case 'T': case 'U':
+  case 'V': case 'W': case 'X': case 'Y': case 'Z':
+  case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g':
+  case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n':
+  case 'o': case 'p': case 'q': case 'r': case 's': case 't': case 'u':
+  case 'v': case 'w': case 'x': case 'y': case 'z':
+  case '_': case '$':
+  case '0': case '1': case '2': case '3': case '4':
+  case '5': case '6': case '7': case '8': case '9':
+  case '"': case '\'': case '`':
+    return false; // Every byte listed above is reported as "not invalid".
+  default: { // Invalid unless it starts an identifier or an operator.
+    if (advanceIfValidStartOfIdentifier(Ptr, EndPtr)) {
+      return false;
+    }
+    if (advanceIfValidStartOfOperator(Ptr, EndPtr)) {
+      return false;
+    }
+
+    return true; // Neither a valid identifier start nor operator start.
+  }
+  }
+}
+
 bool Lexer::isIdentifier(StringRef string) {
   if (string.empty()) return false;
   char const *p = string.data(), *end = string.end();
@@ -2376,7 +2413,6 @@ void Lexer::lexTrivia(syntax::Trivia &Pieces, bool IsForTrailingTrivia) {
 Restart:
   const char *TriviaStart = CurPtr;
 
-  // TODO: Handle invalid UTF8 sequence which is skipped in lexImpl().
   switch (*CurPtr++) {
   case '\n':
     if (IsForTrailingTrivia)
@@ -2468,8 +2504,22 @@ void Lexer::lexTrivia(syntax::Trivia &Pieces, bool IsForTrailingTrivia) {
       break;
     }
     break;
-  default:
-    break;
+  default: {
+    const char *Ptr = CurPtr - 1;
+    if (!isStartOfInvalidCharacters(Ptr, BufferEnd)) {
+      break;
+    }
+
+    bool ShouldTokenize = lexInvalidCharacters(Ptr, /*InLexTrivia=*/true);
+    if (ShouldTokenize) {
+      break;
+    }
+
+    CurPtr = Ptr;
+    size_t Length = CurPtr - TriviaStart;
+    Pieces.push_back(TriviaPiece::garbageText({TriviaStart, Length}));
+    goto Restart;
+  }
   }
   // Reset the cursor.
   --CurPtr;
diff --git a/test/Syntax/round_trip_invalids.swift b/test/Syntax/round_trip_invalids.swift
@@ -0,0 +1,23 @@
+// For an explanation of the setup, see `tokens_invalids.swift`.
+
+// RUN: cat %s > %t
+// RUN: cat %t | sed 's/'$(echo -ne "\x5a1")'/'$(echo -ne "\xc2")'/g' > %t.sed
+// RUN: cp -f %t.sed %t
+// RUN: cat %t | sed 's/'$(echo -ne "\x5a2")'/'$(echo -ne "\xcc\x82")'/g' > %t.sed
+// RUN: cp -f %t.sed %t
+// RUN: cat %t | sed 's/'$(echo -ne "\x5a3")'/'$(echo -ne "\xe2\x80\x9d")'/g' > %t.sed
+// RUN: cp -f %t.sed %t
+// RUN: cat %t | sed 's/'$(echo -ne "\x5a4")'/'$(echo -ne "\xe2\x80\x9c")'/g' > %t.sed
+// RUN: cp -f %t.sed %t
+// RUN: cat %t | sed 's/'$(echo -ne "\x5a5")'/'$(echo -ne "\xe1\x9a\x80")'/g' > %t.sed
+// RUN: cp -f %t.sed %t
+
+// RUN: %round-trip-syntax-test --swift-syntax-test %swift-syntax-test --file %t
+
+x
+Z1 x
+Z2
+Z3
+Z4
+Z4 abcdef Z3
+Z5 x
diff --git a/test/Syntax/tokens_invalids.swift b/test/Syntax/tokens_invalids.swift
@@ -0,0 +1,79 @@
+// RUN: cat %s > %t
+
+// 0x5a is 'Z'. Markers of the form "ZN" (N is a number) are replaced with raw bytes by the sed commands below.
+
+// C2 is the lead byte of a 2-byte UTF-8 sequence.
+// RUN: cat %t | sed 's/'$(echo -ne "\x5a1")'/'$(echo -ne "\xc2")'/g' > %t.sed
+// RUN: cp -f %t.sed %t
+
+// CC 82 is U+0302, invalid for identifier start, valid for identifier body.
+// RUN: cat %t | sed 's/'$(echo -ne "\x5a2")'/'$(echo -ne "\xcc\x82")'/g' > %t.sed
+// RUN: cp -f %t.sed %t
+
+// E2 80 9D is U+201D, right quote.
+// RUN: cat %t | sed 's/'$(echo -ne "\x5a3")'/'$(echo -ne "\xe2\x80\x9d")'/g' > %t.sed
+// RUN: cp -f %t.sed %t
+
+// E2 80 9C is U+201C, left quote.
+// RUN: cat %t | sed 's/'$(echo -ne "\x5a4")'/'$(echo -ne "\xe2\x80\x9c")'/g' > %t.sed
+// RUN: cp -f %t.sed %t
+
+// E1 9A 80 is U+1680, invalid for swift source.
+// RUN: cat %t | sed 's/'$(echo -ne "\x5a5")'/'$(echo -ne "\xe1\x9a\x80")'/g' > %t.sed
+// RUN: cp -f %t.sed %t
+
+// RUN: %swift-syntax-test -input-source-filename %t -dump-full-tokens 2>&1 | %FileCheck %t
+
+x
+Z1 x
+Z2
+Z3
+Z4
+Z4 abcdef Z3
+Z5 x
+
+// test diagnostics.
+
+// CHECK: 28:1: error: invalid UTF-8 found in source file
+// CHECK: 29:1: error: an identifier cannot begin with this character
+// CHECK: 30:1: error: unicode curly quote found
+// CHECK: 31:1: error: unicode curly quote found
+// CHECK: 32:12: error: unicode curly quote found
+// CHECK: 32:1: error: unicode curly quote found
+// CHECK: 33:1: error: invalid character in source file
+
+// test tokens and trivias.
+
+// CHECK-LABEL: 28:3
+// CHECK-NEXT:  (Token identifier
+// CHECK-NEXT:   (trivia newline 1)
+// CHECK-NEXT:   (trivia garbage_text \302)
+// CHECK-NEXT:   (trivia space 1)
+// CHECK-NEXT:   (text="x"))
+
+// CHECK-LABEL: 29:1
+// CHECK-NEXT:  (Token unknown
+// CHECK-NEXT:   (trivia newline 1)
+// CHECK-NEXT:   (text="\xCC\x82"))
+
+// CHECK-LABEL: 30:1
+// CHECK-NEXT:  (Token unknown
+// CHECK-NEXT:   (trivia newline 1)
+// CHECK-NEXT:   (text="\xE2\x80\x9D"))
+
+// CHECK-LABEL: 31:1
+// CHECK-NEXT:  (Token unknown
+// CHECK-NEXT:   (trivia newline 1)
+// CHECK-NEXT:   (text="\xE2\x80\x9C"))
+
+// CHECK-LABEL: 32:1
+// CHECK-NEXT:  (Token unknown
+// CHECK-NEXT:   (trivia newline 1)
+// CHECK-NEXT:   (text="\xE2\x80\x9C abcdef \xE2\x80\x9D"))
+
+// CHECK-LABEL: 33:5
+// CHECK-NEXT:  (Token identifier
+// CHECK-NEXT:   (trivia newline 1)
+// CHECK-NEXT:   (trivia garbage_text \341\232\200)
+// CHECK-NEXT:   (trivia space 1)
+// CHECK-NEXT:   (text="x"))