Skip to content

Commit 22cddbf

Browse files
omochi authored and nkcsgexi committed
[Syntax] Parse invalid chars as trivia
1 parent b4192d8 commit 22cddbf

File tree

2 files changed

+166
-2
lines changed

2 files changed

+166
-2
lines changed

lib/Parse/Lexer.cpp

Lines changed: 39 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2365,7 +2365,6 @@ void Lexer::lexTrivia(syntax::Trivia &Pieces, bool IsForTrailingTrivia) {
23652365
Restart:
23662366
const char *TriviaStart = CurPtr;
23672367

2368-
// TODO: Handle invalid UTF8 sequence which is skipped in lexImpl().
23692368
switch (*CurPtr++) {
23702369
case '\n':
23712370
if (IsForTrailingTrivia)
@@ -2457,8 +2456,46 @@ void Lexer::lexTrivia(syntax::Trivia &Pieces, bool IsForTrailingTrivia) {
24572456
break;
24582457
}
24592458
break;
2460-
default:
2459+
// Start character of tokens.
2460+
case -1: case -2:
2461+
case '@': case '{': case '[': case '(': case '}': case ']': case ')':
2462+
case ',': case ';': case ':': case '\\': case '$':
2463+
case '0': case '1': case '2': case '3': case '4':
2464+
case '5': case '6': case '7': case '8': case '9':
2465+
case '"': case '\'': case '`':
2466+
// Start of identifiers.
2467+
case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G':
2468+
case 'H': case 'I': case 'J': case 'K': case 'L': case 'M': case 'N':
2469+
case 'O': case 'P': case 'Q': case 'R': case 'S': case 'T': case 'U':
2470+
case 'V': case 'W': case 'X': case 'Y': case 'Z':
2471+
case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g':
2472+
case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n':
2473+
case 'o': case 'p': case 'q': case 'r': case 's': case 't': case 'u':
2474+
case 'v': case 'w': case 'x': case 'y': case 'z':
2475+
case '_':
2476+
// Start of operators.
2477+
case '%': case '!': case '?': case '=':
2478+
case '-': case '+': case '*':
2479+
case '&': case '|': case '^': case '~': case '.':
24612480
break;
2481+
default:
2482+
const char *Tmp = CurPtr - 1;
2483+
if (advanceIfValidStartOfIdentifier(Tmp, BufferEnd)) {
2484+
break;
2485+
}
2486+
if (advanceIfValidStartOfOperator(Tmp, BufferEnd)) {
2487+
break;
2488+
}
2489+
2490+
bool ShouldTokenize = lexUnknown(/*EmitDiagnosticsIfToken=*/false);
2491+
if (ShouldTokenize) {
2492+
CurPtr = Tmp;
2493+
return;
2494+
}
2495+
2496+
size_t Length = CurPtr - TriviaStart;
2497+
Pieces.push_back(TriviaPiece::garbageText({TriviaStart, Length}));
2498+
goto Restart;
24622499
}
24632500
// Reset the cursor.
24642501
--CurPtr;
Lines changed: 127 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,127 @@
1+
// To embed the test byte sequences,
2+
// this source first has its markers replaced with byte sequences at run time.
3+
// Marker(N) has the format `ZN`: a literal Z followed by the number N.
4+
// Each byte sequence is written as an escape sequence.
5+
// To prevent sed from replacing the markers inside the sed command itself,
6+
// each marker is also written as an escape sequence.
7+
8+
// RUN: cat %s | sed \
9+
10+
// [0xC2] is the lead byte of a 2-byte UTF-8 character.
11+
// 0xC2 without a second byte is an invalid UTF-8 sequence.
12+
// It becomes garbage text trivia.
13+
// Marker(1) is replaced to this sequence.
14+
15+
// RUN: -e 's/'$(echo -ne "\x5a1")'/'$(echo -ne "\xc2")'/g' \
16+
17+
// [0xCC, 0x82] in UTF-8 is U+0302.
18+
// This character is invalid for identifier start, but valid for identifier body.
19+
// It becomes an unknown token.
20+
// If characters of this type are contiguous, they are concatenated into one long unknown token.
21+
// Marker(2) is replaced to this sequence.
22+
23+
// RUN: -e 's/'$(echo -ne "\x5a2")'/'$(echo -ne "\xcc\x82")'/g' \
24+
25+
// [0xE2, 0x80, 0x9C] in UTF-8 is U+201C, left quote.
26+
// It becomes a single-character unknown token.
27+
// If a left quote and a right quote enclose text,
28+
// they become one long unknown token.
29+
// Marker(3) is replaced to this sequence.
30+
31+
// RUN: -e 's/'$(echo -ne "\x5a3")'/'$(echo -ne "\xe2\x80\x9c")'/g' \
32+
33+
// [0xE2, 0x80, 0x9D] in UTF-8 is U+201D, right quote.
34+
// It becomes a single-character unknown token.
35+
// Marker(4) is replaced to this sequence.
36+
37+
// RUN: -e 's/'$(echo -ne "\x5a4")'/'$(echo -ne "\xe2\x80\x9d")'/g' \
38+
39+
// [0xE1, 0x9A, 0x80] in UTF-8 is U+1680.
40+
// This character is invalid in Swift source.
41+
// It becomes garbage trivia.
42+
// Marker(5) is replaced to this sequence.
43+
44+
// RUN: -e 's/'$(echo -ne "\x5a5")'/'$(echo -ne "\xe1\x9a\x80")'/g' \
45+
46+
// RUN: > %t
47+
48+
// RUN: %swift-syntax-test -input-source-filename %t -dump-full-tokens 2>&1 | %FileCheck %t
49+
// RUN: %round-trip-syntax-test --swift-syntax-test %swift-syntax-test --file %t
50+
51+
aaa
52+
Z1 bbb Z1
53+
54+
ccc Z2
55+
56+
ddd Z2Z2Z2Z2
57+
58+
eee Z3Z3
59+
60+
fff Z3hello worldZ4
61+
62+
ggg Z4
63+
64+
hhh
65+
Z5 iii Z5
66+
jjj
67+
68+
// Diagnostics
69+
// CHECK: 52:1: error: invalid UTF-8 found in source file
70+
// CHECK: 52:7: error: invalid UTF-8 found in source file
71+
// CHECK: 54:5: error: an identifier cannot begin with this character
72+
// CHECK: 56:5: error: an identifier cannot begin with this character
73+
// CHECK: 58:5: error: unicode curly quote found
74+
// CHECK: 58:8: error: unicode curly quote found
75+
// CHECK: 60:19: error: unicode curly quote found
76+
// CHECK: 60:5: error: unicode curly quote found
77+
// CHECK: 62:5: error: unicode curly quote found
78+
// CHECK: 65:1: error: invalid character in source file
79+
// CHECK: 65:9: error: invalid character in source file
80+
81+
// Checks around bbb
82+
// CHECK-LABEL: 52:3
83+
// CHECK-NEXT: (Token identifier
84+
// CHECK-NEXT: (trivia newline 1)
85+
// CHECK-NEXT: (trivia garbage_text \302)
86+
// CHECK-NEXT: (trivia space 1)
87+
// CHECK-NEXT: (text="bbb")
88+
// CHECK-NEXT: (trivia space 1)
89+
// CHECK-NEXT: (trivia garbage_text \302))
90+
91+
// Checks around ccc
92+
// CHECK-LABEL: 54:5
93+
// CHECK-NEXT: (Token unknown
94+
// CHECK-NEXT: (text="\xCC\x82"))
95+
96+
// Checks around ddd
97+
// CHECK-LABEL: 56:5
98+
// CHECK-NEXT: (Token unknown
99+
// CHECK-NEXT: (text="\xCC\x82\xCC\x82\xCC\x82\xCC\x82"))
100+
101+
// Checks around eee
102+
// CHECK-LABEL: 58:5
103+
// CHECK-NEXT: (Token unknown
104+
// CHECK-NEXT: (text="\xE2\x80\x9C"))
105+
// CHECK-LABEL: 58:8
106+
// CHECK-NEXT: (Token unknown
107+
// CHECK-NEXT: (text="\xE2\x80\x9C"))
108+
109+
// Checks around fff
110+
// CHECK-LABEL: 60:5
111+
// CHECK-NEXT: (Token unknown
112+
// CHECK-NEXT: (text="\xE2\x80\x9Chello world\xE2\x80\x9D"))
113+
114+
// Checks around ggg
115+
// CHECK-LABEL: 62:5
116+
// CHECK-NEXT: (Token unknown
117+
// CHECK-NEXT: (text="\xE2\x80\x9D"))
118+
119+
// Checks around iii
120+
// CHECK-LABEL: 65:5
121+
// CHECK-NEXT: (Token identifier
122+
// CHECK-NEXT: (trivia newline 1)
123+
// CHECK-NEXT: (trivia garbage_text \341\232\200)
124+
// CHECK-NEXT: (trivia space 1)
125+
// CHECK-NEXT: (text="iii")
126+
// CHECK-NEXT: (trivia space 1)
127+
// CHECK-NEXT: (trivia garbage_text \341\232\200))

0 commit comments

Comments
 (0)