[Syntax] Parse invalid chars as trivia

omochi · nkcsgexi · commit 22cddbf033e0 · 2018-03-06T08:25:09.000-08:00
diff --git a/lib/Parse/Lexer.cpp b/lib/Parse/Lexer.cpp
@@ -2365,7 +2365,6 @@ void Lexer::lexTrivia(syntax::Trivia &Pieces, bool IsForTrailingTrivia) {
 Restart:
   const char *TriviaStart = CurPtr;
 
-  // TODO: Handle invalid UTF8 sequence which is skipped in lexImpl().
   switch (*CurPtr++) {
   case '\n':
     if (IsForTrailingTrivia)
@@ -2457,8 +2456,46 @@ void Lexer::lexTrivia(syntax::Trivia &Pieces, bool IsForTrailingTrivia) {
       break;
     }
     break;
-  default:
+  // Start character of tokens.
+  case -1: case -2:
+  case '@': case '{': case '[': case '(': case '}': case ']': case ')':
+  case ',': case ';': case ':': case '\\': case '$':
+  case '0': case '1': case '2': case '3': case '4':
+  case '5': case '6': case '7': case '8': case '9':
+  case '"': case '\'': case '`':
+  // Start of identifiers.
+  case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G':
+  case 'H': case 'I': case 'J': case 'K': case 'L': case 'M': case 'N':
+  case 'O': case 'P': case 'Q': case 'R': case 'S': case 'T': case 'U':
+  case 'V': case 'W': case 'X': case 'Y': case 'Z':
+  case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g':
+  case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n':
+  case 'o': case 'p': case 'q': case 'r': case 's': case 't': case 'u':
+  case 'v': case 'w': case 'x': case 'y': case 'z':
+  case '_':
+  // Start of operators.
+  case '%': case '!': case '?': case '=':
+  case '-': case '+': case '*':
+  case '&': case '|': case '^': case '~': case '.':
     break;
+  default:
+    const char *Tmp = CurPtr - 1;
+    if (advanceIfValidStartOfIdentifier(Tmp, BufferEnd)) {
+      break;
+    }
+    if (advanceIfValidStartOfOperator(Tmp, BufferEnd)) {
+      break;
+    }
+
+    bool ShouldTokenize = lexUnknown(/*EmitDiagnosticsIfToken=*/false);
+    if (ShouldTokenize) {
+      CurPtr = Tmp;
+      return;
+    }
+
+    size_t Length = CurPtr - TriviaStart;
+    Pieces.push_back(TriviaPiece::garbageText({TriviaStart, Length}));
+    goto Restart;
   }
   // Reset the cursor.
   --CurPtr;
diff --git a/test/Syntax/tokens_unknown_and_invalid.swift b/test/Syntax/tokens_unknown_and_invalid.swift
@@ -0,0 +1,127 @@
+// To embed the test byte sequences, the markers in this source are first
+// replaced with the byte sequences at run time.
+// Each marker has the form `ZN`: a literal Z followed by a number N.
+// The byte sequences are written as escape sequences.
+// To keep sed from replacing the markers inside the sed command itself,
+// the markers are also written as escape sequences.
+
+// RUN: cat %s | sed \
+
+// [0xC2] is the start byte of a 2-byte UTF-8 character.
+// 0xC2 without a second byte is an invalid UTF-8 sequence.
+// It becomes garbage text trivia.
+// Marker(1) is replaced with this sequence.
+
+// RUN: -e 's/'$(echo -ne "\x5a1")'/'$(echo -ne "\xc2")'/g' \
+
+// [0xCC, 0x82] in UTF-8 is U+0302.
+// This character is invalid as an identifier start, but valid in an identifier body.
+// It becomes an unknown token.
+// If characters of this type are contiguous, they are concatenated into one long unknown token.
+// Marker(2) is replaced with this sequence.
+
+// RUN: -e 's/'$(echo -ne "\x5a2")'/'$(echo -ne "\xcc\x82")'/g' \
+
+// [0xE2, 0x80, 0x9C] in UTF-8 is U+201C, the left curly quote.
+// On its own it becomes a single-character unknown token.
+// If a left quote and a right quote enclose text,
+// the whole quoted span becomes one long unknown token.
+// Marker(3) is replaced with this sequence.
+
+// RUN: -e 's/'$(echo -ne "\x5a3")'/'$(echo -ne "\xe2\x80\x9c")'/g' \
+
+// [0xE2, 0x80, 0x9D] in UTF-8 is U+201D, the right curly quote.
+// It becomes a single-character unknown token.
+// Marker(4) is replaced with this sequence.
+
+// RUN: -e 's/'$(echo -ne "\x5a4")'/'$(echo -ne "\xe2\x80\x9d")'/g' \
+
+// [0xE1, 0x9A, 0x80] in UTF-8 is U+1680.
+// This character is invalid in Swift source.
+// It becomes garbage text trivia.
+// Marker(5) is replaced with this sequence.
+
+// RUN: -e 's/'$(echo -ne "\x5a5")'/'$(echo -ne "\xe1\x9a\x80")'/g' \
+
+// RUN: > %t
+
+// RUN: %swift-syntax-test -input-source-filename %t -dump-full-tokens 2>&1 | %FileCheck %t
+// RUN: %round-trip-syntax-test --swift-syntax-test %swift-syntax-test --file %t
+
+aaa
+Z1 bbb Z1
+
+ccc Z2
+
+ddd Z2Z2Z2Z2
+
+eee Z3Z3
+
+fff Z3hello worldZ4
+
+ggg Z4
+
+hhh
+Z5 iii Z5
+jjj
+
+// Diagnostics
+// CHECK: 52:1: error: invalid UTF-8 found in source file
+// CHECK: 52:7: error: invalid UTF-8 found in source file
+// CHECK: 54:5: error: an identifier cannot begin with this character
+// CHECK: 56:5: error: an identifier cannot begin with this character
+// CHECK: 58:5: error: unicode curly quote found
+// CHECK: 58:8: error: unicode curly quote found
+// CHECK: 60:19: error: unicode curly quote found
+// CHECK: 60:5: error: unicode curly quote found
+// CHECK: 62:5: error: unicode curly quote found
+// CHECK: 65:1: error: invalid character in source file
+// CHECK: 65:9: error: invalid character in source file
+
+// Checks around bbb
+// CHECK-LABEL: 52:3
+// CHECK-NEXT:  (Token identifier
+// CHECK-NEXT:   (trivia newline 1)
+// CHECK-NEXT:   (trivia garbage_text \302)
+// CHECK-NEXT:   (trivia space 1)
+// CHECK-NEXT:   (text="bbb")
+// CHECK-NEXT:   (trivia space 1)
+// CHECK-NEXT:   (trivia garbage_text \302))
+
+// Checks around ccc
+// CHECK-LABEL: 54:5
+// CHECK-NEXT:  (Token unknown
+// CHECK-NEXT:   (text="\xCC\x82"))
+
+// Checks around ddd
+// CHECK-LABEL: 56:5
+// CHECK-NEXT:  (Token unknown
+// CHECK-NEXT:   (text="\xCC\x82\xCC\x82\xCC\x82\xCC\x82"))
+
+// Checks around eee
+// CHECK-LABEL: 58:5
+// CHECK-NEXT:  (Token unknown
+// CHECK-NEXT:   (text="\xE2\x80\x9C"))
+// CHECK-LABEL: 58:8
+// CHECK-NEXT:  (Token unknown
+// CHECK-NEXT:   (text="\xE2\x80\x9C"))
+
+// Checks around fff
+// CHECK-LABEL: 60:5
+// CHECK-NEXT:  (Token unknown
+// CHECK-NEXT:   (text="\xE2\x80\x9Chello world\xE2\x80\x9D"))
+
+// Checks around ggg
+// CHECK-LABEL: 62:5
+// CHECK-NEXT:  (Token unknown
+// CHECK-NEXT:   (text="\xE2\x80\x9D"))
+
+// Checks around iii
+// CHECK-LABEL: 65:5
+// CHECK-NEXT:  (Token identifier
+// CHECK-NEXT:   (trivia newline 1)
+// CHECK-NEXT:   (trivia garbage_text \341\232\200)
+// CHECK-NEXT:   (trivia space 1)
+// CHECK-NEXT:   (text="iii")
+// CHECK-NEXT:   (trivia space 1)
+// CHECK-NEXT:   (trivia garbage_text \341\232\200))