Skip to content

Commit c56ae83

Browse files
committed
[Parse] handle invalid chars in lexTrivia
1 parent ca19fe4 commit c56ae83

File tree

3 files changed

+155
-3
lines changed

3 files changed

+155
-3
lines changed

lib/Parse/Lexer.cpp

Lines changed: 53 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -557,6 +557,43 @@ static bool advanceIfValidContinuationOfOperator(char const *&ptr,
557557
return advanceIf(ptr, end, Identifier::isOperatorContinuationCodePoint);
558558
}
559559

560+
static bool isStartOfInvalidCharacters(const char *Ptr, const char *EndPtr) {
561+
// This logic must equals to switch-case in lexImpl.
562+
switch ((signed char)*Ptr) {
563+
case '\n': case '\r':
564+
case ' ': case '\t': case '\f': case '\v':
565+
case -1: case -2:
566+
case 0:
567+
case '@': case '{': case '[': case '(': case '}': case ']': case ')':
568+
case ',': case ';': case ':': case '\\': case '#': case '/': case '%':
569+
case '!': case '?': case '<': case '>': case '=':
570+
case '-': case '+': case '*': case '&': case '|': case '^': case '~': case '.':
571+
case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G':
572+
case 'H': case 'I': case 'J': case 'K': case 'L': case 'M': case 'N':
573+
case 'O': case 'P': case 'Q': case 'R': case 'S': case 'T': case 'U':
574+
case 'V': case 'W': case 'X': case 'Y': case 'Z':
575+
case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g':
576+
case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n':
577+
case 'o': case 'p': case 'q': case 'r': case 's': case 't': case 'u':
578+
case 'v': case 'w': case 'x': case 'y': case 'z':
579+
case '_': case '$':
580+
case '0': case '1': case '2': case '3': case '4':
581+
case '5': case '6': case '7': case '8': case '9':
582+
case '"': case '\'': case '`':
583+
return false;
584+
default: {
585+
if (advanceIfValidStartOfIdentifier(Ptr, EndPtr)) {
586+
return false;
587+
}
588+
if (advanceIfValidStartOfOperator(Ptr, EndPtr)) {
589+
return false;
590+
}
591+
592+
return true;
593+
}
594+
}
595+
}
596+
560597
bool Lexer::isIdentifier(StringRef string) {
561598
if (string.empty()) return false;
562599
char const *p = string.data(), *end = string.end();
@@ -2376,7 +2413,6 @@ void Lexer::lexTrivia(syntax::Trivia &Pieces, bool IsForTrailingTrivia) {
23762413
Restart:
23772414
const char *TriviaStart = CurPtr;
23782415

2379-
// TODO: Handle invalid UTF8 sequence which is skipped in lexImpl().
23802416
switch (*CurPtr++) {
23812417
case '\n':
23822418
if (IsForTrailingTrivia)
@@ -2468,8 +2504,22 @@ void Lexer::lexTrivia(syntax::Trivia &Pieces, bool IsForTrailingTrivia) {
24682504
break;
24692505
}
24702506
break;
2471-
default:
2472-
break;
2507+
default: {
2508+
const char *Ptr = CurPtr - 1;
2509+
if (!isStartOfInvalidCharacters(Ptr, BufferEnd)) {
2510+
break;
2511+
}
2512+
2513+
bool ShouldTokenize = lexInvalidCharacters(Ptr, /*InLexTrivia=*/true);
2514+
if (ShouldTokenize) {
2515+
break;
2516+
}
2517+
2518+
CurPtr = Ptr;
2519+
size_t Length = CurPtr - TriviaStart;
2520+
Pieces.push_back(TriviaPiece::garbageText({TriviaStart, Length}));
2521+
goto Restart;
2522+
}
24732523
}
24742524
// Reset the cursor.
24752525
--CurPtr;

test/Syntax/round_trip_invalids.swift

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
// For an explanation of the setup, see `tokens_invalids.swift`.
2+
3+
// RUN: cat %s > %t
4+
// RUN: cat %t | sed 's/'$(echo -ne "\x5a1")'/'$(echo -ne "\xc2")'/g' > %t.sed
5+
// RUN: cp -f %t.sed %t
6+
// RUN: cat %t | sed 's/'$(echo -ne "\x5a2")'/'$(echo -ne "\xcc\x82")'/g' > %t.sed
7+
// RUN: cp -f %t.sed %t
8+
// RUN: cat %t | sed 's/'$(echo -ne "\x5a3")'/'$(echo -ne "\xe2\x80\x9d")'/g' > %t.sed
9+
// RUN: cp -f %t.sed %t
10+
// RUN: cat %t | sed 's/'$(echo -ne "\x5a4")'/'$(echo -ne "\xe2\x80\x9c")'/g' > %t.sed
11+
// RUN: cp -f %t.sed %t
12+
// RUN: cat %t | sed 's/'$(echo -ne "\x5a5")'/'$(echo -ne "\xe1\x9a\x80")'/g' > %t.sed
13+
// RUN: cp -f %t.sed %t
14+
15+
// RUN: %round-trip-syntax-test --swift-syntax-test %swift-syntax-test --file %t
16+
17+
x
18+
Z1 x
19+
Z2
20+
Z3
21+
Z4
22+
Z4 abcdef Z3
23+
Z5 x

test/Syntax/tokens_invalids.swift

Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,79 @@
1+
// RUN: cat %s > %t
2+
3+
// 0x5a is 'Z'. Markers of the form "ZN", where N is a digit, are replaced by the RUN lines below.
4+
5+
// C2 is the start byte of a 2-byte UTF-8 character.
6+
// RUN: cat %t | sed 's/'$(echo -ne "\x5a1")'/'$(echo -ne "\xc2")'/g' > %t.sed
7+
// RUN: cp -f %t.sed %t
8+
9+
// CC 82 is U+0302, invalid for identifier start, valid for identifier body.
10+
// RUN: cat %t | sed 's/'$(echo -ne "\x5a2")'/'$(echo -ne "\xcc\x82")'/g' > %t.sed
11+
// RUN: cp -f %t.sed %t
12+
13+
// E2 80 9D is U+201D, right quote.
14+
// RUN: cat %t | sed 's/'$(echo -ne "\x5a3")'/'$(echo -ne "\xe2\x80\x9d")'/g' > %t.sed
15+
// RUN: cp -f %t.sed %t
16+
17+
// E2 80 9C is U+201C, left quote.
18+
// RUN: cat %t | sed 's/'$(echo -ne "\x5a4")'/'$(echo -ne "\xe2\x80\x9c")'/g' > %t.sed
19+
// RUN: cp -f %t.sed %t
20+
21+
// E1 9A 80 is U+1680, which is invalid in Swift source.
22+
// RUN: cat %t | sed 's/'$(echo -ne "\x5a5")'/'$(echo -ne "\xe1\x9a\x80")'/g' > %t.sed
23+
// RUN: cp -f %t.sed %t
24+
25+
// RUN: %swift-syntax-test -input-source-filename %t -dump-full-tokens 2>&1 | %FileCheck %t
26+
27+
x
28+
Z1 x
29+
Z2
30+
Z3
31+
Z4
32+
Z4 abcdef Z3
33+
Z5 x
34+
35+
// test diagnostics.
36+
37+
// CHECK: 28:1: error: invalid UTF-8 found in source file
38+
// CHECK: 29:1: error: an identifier cannot begin with this character
39+
// CHECK: 30:1: error: unicode curly quote found
40+
// CHECK: 31:1: error: unicode curly quote found
41+
// CHECK: 32:12: error: unicode curly quote found
42+
// CHECK: 32:1: error: unicode curly quote found
43+
// CHECK: 33:1: error: invalid character in source file
44+
45+
// test tokens and trivia.
46+
47+
// CHECK-LABEL: 28:3
48+
// CHECK-NEXT: (Token identifier
49+
// CHECK-NEXT: (trivia newline 1)
50+
// CHECK-NEXT: (trivia garbage_text \302)
51+
// CHECK-NEXT: (trivia space 1)
52+
// CHECK-NEXT: (text="x"))
53+
54+
// CHECK-LABEL: 29:1
55+
// CHECK-NEXT: (Token unknown
56+
// CHECK-NEXT: (trivia newline 1)
57+
// CHECK-NEXT: (text="\xCC\x82"))
58+
59+
// CHECK-LABEL: 30:1
60+
// CHECK-NEXT: (Token unknown
61+
// CHECK-NEXT: (trivia newline 1)
62+
// CHECK-NEXT: (text="\xE2\x80\x9D"))
63+
64+
// CHECK-LABEL: 31:1
65+
// CHECK-NEXT: (Token unknown
66+
// CHECK-NEXT: (trivia newline 1)
67+
// CHECK-NEXT: (text="\xE2\x80\x9C"))
68+
69+
// CHECK-LABEL: 32:1
70+
// CHECK-NEXT: (Token unknown
71+
// CHECK-NEXT: (trivia newline 1)
72+
// CHECK-NEXT: (text="\xE2\x80\x9C abcdef \xE2\x80\x9D"))
73+
74+
// CHECK-LABEL: 33:5
75+
// CHECK-NEXT: (Token identifier
76+
// CHECK-NEXT: (trivia newline 1)
77+
// CHECK-NEXT: (trivia garbage_text \341\232\200)
78+
// CHECK-NEXT: (trivia space 1)
79+
// CHECK-NEXT: (text="x"))

0 commit comments

Comments
 (0)