Skip to content

[Do not merge] [Syntax] parse invalid chars as trivia #14979

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
108 changes: 60 additions & 48 deletions lib/Parse/Lexer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2106,59 +2106,72 @@ void Lexer::lexImpl() {
// start, attempt to recover by eating more continuation characters.
diagnose(CurPtr-1, diag::lex_invalid_identifier_start_character);
while (advanceIfValidContinuationOfIdentifier(tmp, BufferEnd));
} else {
// This character isn't allowed in Swift source.
uint32_t codepoint = validateUTF8CharacterAndAdvance(tmp, BufferEnd);
if (codepoint == ~0U) {
diagnose(CurPtr-1, diag::lex_invalid_utf8)
.fixItReplaceChars(getSourceLoc(CurPtr-1), getSourceLoc(tmp), " ");
CurPtr = tmp;
goto Restart; // Skip presumed whitespace.
} else if (codepoint == 0x0000201D) {
// If this is an end curly quote, just diagnose it with a fixit hint.
diagnose(CurPtr-1, diag::lex_invalid_curly_quote)
.fixItReplaceChars(getSourceLoc(CurPtr-1), getSourceLoc(tmp), "\"");
} else if (codepoint == 0x0000201C) {
auto endPtr = tmp;
// If this is a start curly quote, do a fuzzy match of a string literal
// to improve recovery.
if (auto tmp2 = findEndOfCurlyQuoteStringLiteral(tmp))
tmp = tmp2;

// Note, we intentionally diagnose the end quote before the start quote,
// so that the IDE suggests fixing the end quote before the start quote.
// This, in turn, works better with our error recovery because we won't
// diagnose an end curly quote in the middle of a straight quoted
// literal.
diagnose(CurPtr-1, diag::lex_invalid_curly_quote)
.fixItReplaceChars(getSourceLoc(CurPtr-1), getSourceLoc(endPtr),"\"");

} else {
diagnose(CurPtr-1, diag::lex_invalid_character)
.fixItReplaceChars(getSourceLoc(CurPtr-1), getSourceLoc(tmp), " ");

char expectedCodepoint;
if ((expectedCodepoint =
confusable::tryConvertConfusableCharacterToASCII(codepoint))) {

llvm::SmallString<4> confusedChar;
EncodeToUTF8(codepoint, confusedChar);
llvm::SmallString<1> expectedChar;
expectedChar += expectedCodepoint;
diagnose(CurPtr-1, diag::lex_confusable_character,
confusedChar, expectedChar)
.fixItReplaceChars(getSourceLoc(CurPtr-1),
getSourceLoc(tmp),
expectedChar);
}
CurPtr = tmp;
return formToken(tok::unknown, TokStart);
}

CurPtr = tmp;
goto Restart; // Skip presumed whitespace.
// This character isn't allowed in Swift source.
uint32_t codepoint = validateUTF8CharacterAndAdvance(tmp, BufferEnd);
if (codepoint == ~0U) {
diagnose(CurPtr - 1, diag::lex_invalid_utf8)
.fixItReplaceChars(getSourceLoc(CurPtr - 1), getSourceLoc(tmp), " ");
CurPtr = tmp;
if (TriviaRetention == TriviaRetentionMode::WithTrivia) {
size_t TriviaLength = CurPtr - TokStart;
LeadingTrivia.push_back(
TriviaPiece::garbageText({TokStart, TriviaLength}));
}
goto Restart; // Skip presumed whitespace.
} else if (codepoint == 0x0000201D) {
// If this is an end curly quote, just diagnose it with a fixit hint.
diagnose(CurPtr - 1, diag::lex_invalid_curly_quote)
.fixItReplaceChars(getSourceLoc(CurPtr - 1), getSourceLoc(tmp), "\"");
CurPtr = tmp;
return formToken(tok::unknown, TokStart);
} else if (codepoint == 0x0000201C) {
auto endPtr = tmp;
// If this is a start curly quote, do a fuzzy match of a string literal
// to improve recovery.
if (auto tmp2 = findEndOfCurlyQuoteStringLiteral(tmp))
tmp = tmp2;

// Note, we intentionally diagnose the end quote before the start quote,
// so that the IDE suggests fixing the end quote before the start quote.
// This, in turn, works better with our error recovery because we won't
// diagnose an end curly quote in the middle of a straight quoted
// literal.
diagnose(CurPtr - 1, diag::lex_invalid_curly_quote)
.fixItReplaceChars(getSourceLoc(CurPtr - 1), getSourceLoc(endPtr),
"\"");
CurPtr = tmp;
return formToken(tok::unknown, TokStart);
}

diagnose(CurPtr - 1, diag::lex_invalid_character)
.fixItReplaceChars(getSourceLoc(CurPtr - 1), getSourceLoc(tmp), " ");

char expectedCodepoint;
if ((expectedCodepoint =
confusable::tryConvertConfusableCharacterToASCII(codepoint))) {

llvm::SmallString<4> confusedChar;
EncodeToUTF8(codepoint, confusedChar);
llvm::SmallString<1> expectedChar;
expectedChar += expectedCodepoint;
diagnose(CurPtr - 1, diag::lex_confusable_character, confusedChar,
expectedChar)
.fixItReplaceChars(getSourceLoc(CurPtr - 1), getSourceLoc(tmp),
expectedChar);
}

CurPtr = tmp;
return formToken(tok::unknown, TokStart);
if (TriviaRetention == TriviaRetentionMode::WithTrivia) {
size_t TriviaLength = CurPtr - TokStart;
LeadingTrivia.push_back(
TriviaPiece::garbageText({TokStart, TriviaLength}));
}
goto Restart; // Skip presumed whitespace.
}

case '\n':
Expand Down Expand Up @@ -2341,7 +2354,6 @@ void Lexer::lexTrivia(syntax::Trivia &Pieces, bool IsForTrailingTrivia) {
Restart:
const char *TriviaStart = CurPtr;

// TODO: Handle invalid UTF8 sequence which is skipped in lexImpl().
switch (*CurPtr++) {
case '\n':
if (IsForTrailingTrivia)
Expand Down
23 changes: 23 additions & 0 deletions test/Syntax/round_trip_invalids.swift
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
// To know about setup, see `tokens_invalids.swift`.

// RUN: cat %s > %t
// RUN: cat %t | sed 's/'$(echo -ne "\x5a1")'/'$(echo -ne "\xc2")'/g' > %t.sed
// RUN: cp -f %t.sed %t
// RUN: cat %t | sed 's/'$(echo -ne "\x5a2")'/'$(echo -ne "\xcc\x82")'/g' > %t.sed
// RUN: cp -f %t.sed %t
// RUN: cat %t | sed 's/'$(echo -ne "\x5a3")'/'$(echo -ne "\xe2\x80\x9d")'/g' > %t.sed
// RUN: cp -f %t.sed %t
// RUN: cat %t | sed 's/'$(echo -ne "\x5a4")'/'$(echo -ne "\xe2\x80\x9c")'/g' > %t.sed
// RUN: cp -f %t.sed %t
// RUN: cat %t | sed 's/'$(echo -ne "\x5a5")'/'$(echo -ne "\xe1\x9a\x80")'/g' > %t.sed
// RUN: cp -f %t.sed %t

// RUN: %round-trip-syntax-test --swift-syntax-test %swift-syntax-test --file %t

x
Z1 x
Z2
Z3
Z4
Z4 abcdef Z3
Z5 x
79 changes: 79 additions & 0 deletions test/Syntax/tokens_invalids.swift
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
// RUN: cat %s > %t

// 5a is Z. "ZN" style marker is used for marker. N is number.

// C2 is utf8 2 byte character start byte.
// RUN: cat %t | sed 's/'$(echo -ne "\x5a1")'/'$(echo -ne "\xc2")'/g' > %t.sed
Copy link
Member

@rintaro rintaro Mar 6, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

// RUN: cat %t | sed 's/Z1/'$(echo -ne "\xc2")'/g' > %t.sed
                        ^^

doesn't work? I think it's easier to read.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No. If I do so, Z1 in sed line also replaced by this sed itself.
It makes problematic input source.

If Lexer is correct, it also skipped as line comment of course.
But for example it introduce other complex discussion about diagnostics about invalid character in comment.

// RUN: cp -f %t.sed %t

// CC 82 is U+0302, invalid for identifier start, valid for identifier body.
// RUN: cat %t | sed 's/'$(echo -ne "\x5a2")'/'$(echo -ne "\xcc\x82")'/g' > %t.sed
// RUN: cp -f %t.sed %t

// E2 80 9D is U+201D, right quote.
// RUN: cat %t | sed 's/'$(echo -ne "\x5a3")'/'$(echo -ne "\xe2\x80\x9d")'/g' > %t.sed
// RUN: cp -f %t.sed %t

// E2 80 9C is U+201C, left quote.
// RUN: cat %t | sed 's/'$(echo -ne "\x5a4")'/'$(echo -ne "\xe2\x80\x9c")'/g' > %t.sed
// RUN: cp -f %t.sed %t

// E1 9A 80 is U+1680, invalid for swift source.
// RUN: cat %t | sed 's/'$(echo -ne "\x5a5")'/'$(echo -ne "\xe1\x9a\x80")'/g' > %t.sed
// RUN: cp -f %t.sed %t

// RUN: %swift-syntax-test -input-source-filename %t -dump-full-tokens 2>&1 | %FileCheck %t

x
Z1 x
Z2
Z3
Z4
Z4 abcdef Z3
Z5 x

// test diagnostics.

// CHECK: 28:1: error: invalid UTF-8 found in source file
// CHECK: 29:1: error: an identifier cannot begin with this character
// CHECK: 30:1: error: unicode curly quote found
// CHECK: 31:1: error: unicode curly quote found
// CHECK: 32:12: error: unicode curly quote found
// CHECK: 32:1: error: unicode curly quote found
// CHECK: 33:1: error: invalid character in source file

// test tokens and trivias.

// CHECK-LABEL: 28:3
// CHECK-NEXT: (Token identifier
// CHECK-NEXT: (trivia newline 1)
// CHECK-NEXT: (trivia garbage_text \302)
// CHECK-NEXT: (trivia space 1)
// CHECK-NEXT: (text="x"))

// CHECK-LABEL: 29:1
// CHECK-NEXT: (Token unknown
// CHECK-NEXT: (trivia newline 1)
// CHECK-NEXT: (text="\xCC\x82"))

// CHECK-LABEL: 30:1
// CHECK-NEXT: (Token unknown
// CHECK-NEXT: (trivia newline 1)
// CHECK-NEXT: (text="\xE2\x80\x9D"))

// CHECK-LABEL: 31:1
// CHECK-NEXT: (Token unknown
// CHECK-NEXT: (trivia newline 1)
// CHECK-NEXT: (text="\xE2\x80\x9C"))
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are these characters invalid? If so, should they be garbage_text trivia instead of token text?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@harlanhaskins
Ah.. no.

E2 80 9C is U+201C, unicode style left quote.
http://unicode.org/cldr/utility/character.jsp?a=201C

E2 80 9D is U+201D, unicode style right quote.
https://unicode.org/cldr/utility/character.jsp?a=201D

They becomes to single character unknown token, or,
If found that pattern they enclosure text, it becomes long unknown token.
e.g. <201C>hello world<201D

The reason of this test in here, just that the logic about handling them are there in invalid character detection control flow.

And these characters are safe in text file,
I will split test case file about U+201C/201D.


// CHECK-LABEL: 32:1
// CHECK-NEXT: (Token unknown
// CHECK-NEXT: (trivia newline 1)
// CHECK-NEXT: (text="\xE2\x80\x9C abcdef \xE2\x80\x9D"))

// CHECK-LABEL: 33:5
// CHECK-NEXT: (Token identifier
// CHECK-NEXT: (trivia newline 1)
// CHECK-NEXT: (trivia garbage_text \341\232\200)
// CHECK-NEXT: (trivia space 1)
// CHECK-NEXT: (text="x"))