Skip to content

[Do not merge] [Syntax] support invalid characters as trivia #14967

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 3 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion include/swift/Parse/Lexer.h
Original file line number Diff line number Diff line change
Expand Up @@ -525,13 +525,17 @@ class Lexer {
void lexEscapedIdentifier();

void tryLexEditorPlaceholder();
const char *findEndOfCurlyQuoteStringLiteral(const char*);
const char *findEndOfCurlyQuoteStringLiteral(const char *,
bool EmitDiagnostics);

/// Try to lex conflict markers by checking for the presence of the start and
/// end of the marker in diff3 or Perforce style respectively.
bool tryLexConflictMarker(bool EatNewline);

NulCharacterKind getNulCharacterKind(const char *Ptr) const;

/// Lex invalid characters starting at \p Ptr and return whether they should be tokenized.
bool lexInvalidCharacters(const char *&Ptr, bool InLexTrivia);
};

/// Given an ordered token \param Array , get the iterator pointing to the first
Expand Down
211 changes: 148 additions & 63 deletions lib/Parse/Lexer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -557,6 +557,43 @@ static bool advanceIfValidContinuationOfOperator(char const *&ptr,
return advanceIf(ptr, end, Identifier::isOperatorContinuationCodePoint);
}

/// Returns true when the character beginning at \p Ptr cannot start anything
/// the lexer dispatches on: it is none of the explicitly listed bytes, not a
/// valid identifier start, and not a valid operator start.
///
/// \param Ptr    First byte of the character to classify.
/// \param EndPtr End of the buffer, forwarded to the UTF-8 aware helpers.
static bool isStartOfInvalidCharacters(const char *Ptr, const char *EndPtr) {
  // NOTE: this list has to stay in sync with the switch-case in lexImpl.
  switch (static_cast<signed char>(*Ptr)) {
  default:
    // Anything not listed below is invalid unless it begins an identifier or
    // an operator. Identifier start is checked first, then operator start.
    return !advanceIfValidStartOfIdentifier(Ptr, EndPtr) &&
           !advanceIfValidStartOfOperator(Ptr, EndPtr);

  // NUL and the 0xFF / 0xFE bytes (presumably BOM markers -- confirm against
  // lexImpl).
  case 0:
  case -1: case -2:
  // Whitespace and line breaks.
  case ' ': case '\t': case '\f': case '\v':
  case '\n': case '\r':
  // Punctuation.
  case '(': case ')': case '[': case ']': case '{': case '}': case '@':
  case ',': case ';': case ':': case '\\': case '#':
  // ASCII operator characters.
  case '/': case '%': case '!': case '?': case '<': case '>': case '=':
  case '-': case '+': case '*': case '&': case '|': case '^': case '~':
  case '.':
  // Digits.
  case '0': case '1': case '2': case '3': case '4':
  case '5': case '6': case '7': case '8': case '9':
  // ASCII identifier starts.
  case '_': case '$':
  case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g':
  case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n':
  case 'o': case 'p': case 'q': case 'r': case 's': case 't': case 'u':
  case 'v': case 'w': case 'x': case 'y': case 'z':
  case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G':
  case 'H': case 'I': case 'J': case 'K': case 'L': case 'M': case 'N':
  case 'O': case 'P': case 'Q': case 'R': case 'S': case 'T': case 'U':
  case 'V': case 'W': case 'X': case 'Y': case 'Z':
  // Quotes and backtick.
  case '"': case '\'': case '`':
    return false;
  }
}

bool Lexer::isIdentifier(StringRef string) {
if (string.empty()) return false;
char const *p = string.data(), *end = string.end();
Expand Down Expand Up @@ -1739,8 +1776,9 @@ void Lexer::lexStringLiteral() {
/// string literal, diagnose the problem and return a pointer to the end of the
/// entire string literal. This helps us avoid parsing the body of the string
/// as program tokens, which will only lead to massive confusion.
const char *Lexer::findEndOfCurlyQuoteStringLiteral(const char *Body) {

const char *Lexer::findEndOfCurlyQuoteStringLiteral(const char *Body,
bool EmitDiagnostics) {

while (true) {
// Don't bother with string interpolations.
if (*Body == '\\' && *(Body + 1) == '(')
Expand All @@ -1752,7 +1790,7 @@ const char *Lexer::findEndOfCurlyQuoteStringLiteral(const char *Body) {

// Get the next character.
const char *CharStart = Body;
unsigned CharValue = lexCharacter(Body, '\0', false);
unsigned CharValue = lexCharacter(Body, '\0', /*EmitDiagnostics=*/false);
// If the character was incorrectly encoded, give up.
if (CharValue == ~1U) return nullptr;

Expand All @@ -1764,8 +1802,11 @@ const char *Lexer::findEndOfCurlyQuoteStringLiteral(const char *Body) {
// If we found an ending curly quote (common since this thing started with
// an opening curly quote) diagnose it with a fixit and then return.
if (CharValue == 0x0000201D) {
diagnose(CharStart, diag::lex_invalid_curly_quote)
.fixItReplaceChars(getSourceLoc(CharStart), getSourceLoc(Body), "\"");
if (EmitDiagnostics) {
diagnose(CharStart, diag::lex_invalid_curly_quote)
.fixItReplaceChars(getSourceLoc(CharStart), getSourceLoc(Body),
"\"");
}
return Body;
}

Expand Down Expand Up @@ -1875,6 +1916,87 @@ Lexer::NulCharacterKind Lexer::getNulCharacterKind(const char *Ptr) const {
return NulCharacterKind::Embedded;
}

/// Lex a run of characters that cannot start any valid token, beginning at
/// \p Ptr, and advance \p Ptr past everything that was consumed.
///
/// \param Ptr         In/out cursor. On entry it points at the first invalid
///                    byte; on return it points just past the consumed text.
/// \param InLexTrivia True when called while lexing trivia. In that mode a
///                    diagnostic is emitted only for text that will NOT be
///                    tokenized (i.e. when this function returns false).
///                    Diagnostics for tokenized text are suppressed here,
///                    presumably so they are reported once when the same text
///                    is lexed as a token.
/// \returns true if the consumed text should be formed into a token by the
///          caller, false if it should be skipped like whitespace.
bool Lexer::lexInvalidCharacters(const char *&Ptr, bool InLexTrivia) {
  assert(Ptr != nullptr);

  // Every diagnostic and fix-it below is anchored at this position rather
  // than at the lexer's own cursor, so the function works for any caller.
  const char *const StartPtr = Ptr;

  if (advanceIfValidContinuationOfIdentifier(Ptr, BufferEnd)) {
    // If this is a valid identifier continuation, but not a valid identifier
    // start, attempt to recover by eating more continuation characters.
    if (!InLexTrivia) {
      diagnose(StartPtr, diag::lex_invalid_identifier_start_character);
    }
    while (advanceIfValidContinuationOfIdentifier(Ptr, BufferEnd))
      ;
    return true;
  }

  // This character isn't allowed in Swift source.
  uint32_t codepoint = validateUTF8CharacterAndAdvance(Ptr, BufferEnd);
  if (codepoint == ~0U) {
    // Malformed UTF-8 is never tokenized, so it is diagnosed in both modes.
    diagnose(StartPtr, diag::lex_invalid_utf8)
        .fixItReplaceChars(getSourceLoc(StartPtr), getSourceLoc(Ptr), " ");
    // Skip presumed whitespace.
    return false;
  }

  if (codepoint == 0x0000201D) {
    // If this is an end curly quote, just diagnose it with a fixit hint.
    // The diagnostic is anchored at StartPtr (not CurPtr - 1) so that it
    // matches the fix-it range and does not depend on the member cursor.
    if (!InLexTrivia) {
      diagnose(StartPtr, diag::lex_invalid_curly_quote)
          .fixItReplaceChars(getSourceLoc(StartPtr), getSourceLoc(Ptr), "\"");
    }
    return true;
  }

  if (codepoint == 0x0000201C) {
    const char *const LeftQuoteEndPtr = Ptr;
    const bool EmitDiagnostics = !InLexTrivia;

    // If this is a start curly quote, do a fuzzy match of a string literal
    // to improve recovery.
    if (const char *const RightQuoteEndPtr =
            findEndOfCurlyQuoteStringLiteral(Ptr, EmitDiagnostics)) {
      Ptr = RightQuoteEndPtr;
    }

    // Note, we intentionally diagnose the end quote before the start quote,
    // so that the IDE suggests fixing the end quote before the start quote.
    // This, in turn, works better with our error recovery because we won't
    // diagnose an end curly quote in the middle of a straight quoted
    // literal.
    if (EmitDiagnostics) {
      diagnose(StartPtr, diag::lex_invalid_curly_quote)
          .fixItReplaceChars(getSourceLoc(StartPtr),
                             getSourceLoc(LeftQuoteEndPtr), "\"");
    }
    return true;
  }

  // Any other disallowed character is never tokenized, so it is diagnosed in
  // both modes.
  diagnose(StartPtr, diag::lex_invalid_character)
      .fixItReplaceChars(getSourceLoc(StartPtr), getSourceLoc(Ptr), " ");

  // If the character is easily confused with an ASCII one, additionally
  // suggest replacing it with that ASCII character.
  if (const char expectedCodepoint =
          confusable::tryConvertConfusableCharacterToASCII(codepoint)) {
    llvm::SmallString<4> confusedChar;
    EncodeToUTF8(codepoint, confusedChar);
    llvm::SmallString<1> expectedChar;
    expectedChar += expectedCodepoint;
    diagnose(StartPtr, diag::lex_confusable_character, confusedChar,
             expectedChar)
        .fixItReplaceChars(getSourceLoc(StartPtr), getSourceLoc(Ptr),
                           expectedChar);
  }

  // Skip presumed whitespace.
  return false;
}

void Lexer::tryLexEditorPlaceholder() {
assert(CurPtr[-1] == '<' && CurPtr[0] == '#');
const char *TokStart = CurPtr-1;
Expand Down Expand Up @@ -2100,65 +2222,15 @@ void Lexer::lexImpl() {

if (advanceIfValidStartOfOperator(tmp, BufferEnd))
return lexOperatorIdentifier();

if (advanceIfValidContinuationOfIdentifier(tmp, BufferEnd)) {
// If this is a valid identifier continuation, but not a valid identifier
// start, attempt to recover by eating more continuation characters.
diagnose(CurPtr-1, diag::lex_invalid_identifier_start_character);
while (advanceIfValidContinuationOfIdentifier(tmp, BufferEnd));
} else {
// This character isn't allowed in Swift source.
uint32_t codepoint = validateUTF8CharacterAndAdvance(tmp, BufferEnd);
if (codepoint == ~0U) {
diagnose(CurPtr-1, diag::lex_invalid_utf8)
.fixItReplaceChars(getSourceLoc(CurPtr-1), getSourceLoc(tmp), " ");
CurPtr = tmp;
goto Restart; // Skip presumed whitespace.
} else if (codepoint == 0x0000201D) {
// If this is an end curly quote, just diagnose it with a fixit hint.
diagnose(CurPtr-1, diag::lex_invalid_curly_quote)
.fixItReplaceChars(getSourceLoc(CurPtr-1), getSourceLoc(tmp), "\"");
} else if (codepoint == 0x0000201C) {
auto endPtr = tmp;
// If this is a start curly quote, do a fuzzy match of a string literal
// to improve recovery.
if (auto tmp2 = findEndOfCurlyQuoteStringLiteral(tmp))
tmp = tmp2;

// Note, we intentionally diagnose the end quote before the start quote,
// so that the IDE suggests fixing the end quote before the start quote.
// This, in turn, works better with our error recovery because we won't
// diagnose an end curly quote in the middle of a straight quoted
// literal.
diagnose(CurPtr-1, diag::lex_invalid_curly_quote)
.fixItReplaceChars(getSourceLoc(CurPtr-1), getSourceLoc(endPtr),"\"");

} else {
diagnose(CurPtr-1, diag::lex_invalid_character)
.fixItReplaceChars(getSourceLoc(CurPtr-1), getSourceLoc(tmp), " ");

char expectedCodepoint;
if ((expectedCodepoint =
confusable::tryConvertConfusableCharacterToASCII(codepoint))) {

llvm::SmallString<4> confusedChar;
EncodeToUTF8(codepoint, confusedChar);
llvm::SmallString<1> expectedChar;
expectedChar += expectedCodepoint;
diagnose(CurPtr-1, diag::lex_confusable_character,
confusedChar, expectedChar)
.fixItReplaceChars(getSourceLoc(CurPtr-1),
getSourceLoc(tmp),
expectedChar);
}
bool ShouldTokenize = lexInvalidCharacters(tmp, /*InLexTrivia=*/false);
CurPtr = tmp;

CurPtr = tmp;
goto Restart; // Skip presumed whitespace.
}
if (ShouldTokenize) {
return formToken(tok::unknown, TokStart);
}

CurPtr = tmp;
return formToken(tok::unknown, TokStart);
goto Restart;
}

case '\n':
Expand Down Expand Up @@ -2341,7 +2413,6 @@ void Lexer::lexTrivia(syntax::Trivia &Pieces, bool IsForTrailingTrivia) {
Restart:
const char *TriviaStart = CurPtr;

// TODO: Handle invalid UTF8 sequence which is skipped in lexImpl().
switch (*CurPtr++) {
case '\n':
if (IsForTrailingTrivia)
Expand Down Expand Up @@ -2433,8 +2504,22 @@ void Lexer::lexTrivia(syntax::Trivia &Pieces, bool IsForTrailingTrivia) {
break;
}
break;
default:
break;
default: {
const char *Ptr = CurPtr - 1;
if (!isStartOfInvalidCharacters(Ptr, BufferEnd)) {
break;
}

bool ShouldTokenize = lexInvalidCharacters(Ptr, /*InLexTrivia=*/true);
if (ShouldTokenize) {
break;
}

CurPtr = Ptr;
size_t Length = CurPtr - TriviaStart;
Pieces.push_back(TriviaPiece::garbageText({TriviaStart, Length}));
goto Restart;
}
}
// Reset the cursor.
--CurPtr;
Expand Down
23 changes: 23 additions & 0 deletions test/Syntax/round_trip_invalids.swift
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
// To know about setup, see `tokens_invalids.swift`.

// RUN: cat %s > %t
// RUN: cat %t | sed 's/'$(echo -ne "\x5a1")'/'$(echo -ne "\xc2")'/g' > %t.sed
// RUN: cp -f %t.sed %t
// RUN: cat %t | sed 's/'$(echo -ne "\x5a2")'/'$(echo -ne "\xcc\x82")'/g' > %t.sed
// RUN: cp -f %t.sed %t
// RUN: cat %t | sed 's/'$(echo -ne "\x5a3")'/'$(echo -ne "\xe2\x80\x9d")'/g' > %t.sed
// RUN: cp -f %t.sed %t
// RUN: cat %t | sed 's/'$(echo -ne "\x5a4")'/'$(echo -ne "\xe2\x80\x9c")'/g' > %t.sed
// RUN: cp -f %t.sed %t
// RUN: cat %t | sed 's/'$(echo -ne "\x5a5")'/'$(echo -ne "\xe1\x9a\x80")'/g' > %t.sed
// RUN: cp -f %t.sed %t

// RUN: %round-trip-syntax-test --swift-syntax-test %swift-syntax-test --file %t

x
Z1 x
Z2
Z3
Z4
Z4 abcdef Z3
Z5 x
79 changes: 79 additions & 0 deletions test/Syntax/tokens_invalids.swift
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
// RUN: cat %s > %t

// 0x5A is 'Z'. Markers of the form "ZN", where N is a digit, are placeholders that the RUN lines below replace with raw bytes.

// C2 is utf8 2 byte character start byte.
// RUN: cat %t | sed 's/'$(echo -ne "\x5a1")'/'$(echo -ne "\xc2")'/g' > %t.sed
// RUN: cp -f %t.sed %t

// CC 82 is U+0302, invalid for identifier start, valid for identifier body.
// RUN: cat %t | sed 's/'$(echo -ne "\x5a2")'/'$(echo -ne "\xcc\x82")'/g' > %t.sed
// RUN: cp -f %t.sed %t

// E2 80 9D is U+201D, right quote.
// RUN: cat %t | sed 's/'$(echo -ne "\x5a3")'/'$(echo -ne "\xe2\x80\x9d")'/g' > %t.sed
// RUN: cp -f %t.sed %t

// E2 80 9C is U+201C, left quote.
// RUN: cat %t | sed 's/'$(echo -ne "\x5a4")'/'$(echo -ne "\xe2\x80\x9c")'/g' > %t.sed
// RUN: cp -f %t.sed %t

// E1 9A 80 is U+1680, invalid for swift source.
// RUN: cat %t | sed 's/'$(echo -ne "\x5a5")'/'$(echo -ne "\xe1\x9a\x80")'/g' > %t.sed
// RUN: cp -f %t.sed %t

// RUN: %swift-syntax-test -input-source-filename %t -dump-full-tokens 2>&1 | %FileCheck %t

x
Z1 x
Z2
Z3
Z4
Z4 abcdef Z3
Z5 x

// test diagnostics.

// CHECK: 28:1: error: invalid UTF-8 found in source file
// CHECK: 29:1: error: an identifier cannot begin with this character
// CHECK: 30:1: error: unicode curly quote found
// CHECK: 31:1: error: unicode curly quote found
// CHECK: 32:12: error: unicode curly quote found
// CHECK: 32:1: error: unicode curly quote found
// CHECK: 33:1: error: invalid character in source file

// test tokens and trivias.

// CHECK-LABEL: 28:3
// CHECK-NEXT: (Token identifier
// CHECK-NEXT: (trivia newline 1)
// CHECK-NEXT: (trivia garbage_text \302)
// CHECK-NEXT: (trivia space 1)
// CHECK-NEXT: (text="x"))

// CHECK-LABEL: 29:1
// CHECK-NEXT: (Token unknown
// CHECK-NEXT: (trivia newline 1)
// CHECK-NEXT: (text="\xCC\x82"))

// CHECK-LABEL: 30:1
// CHECK-NEXT: (Token unknown
// CHECK-NEXT: (trivia newline 1)
// CHECK-NEXT: (text="\xE2\x80\x9D"))

// CHECK-LABEL: 31:1
// CHECK-NEXT: (Token unknown
// CHECK-NEXT: (trivia newline 1)
// CHECK-NEXT: (text="\xE2\x80\x9C"))

// CHECK-LABEL: 32:1
// CHECK-NEXT: (Token unknown
// CHECK-NEXT: (trivia newline 1)
// CHECK-NEXT: (text="\xE2\x80\x9C abcdef \xE2\x80\x9D"))

// CHECK-LABEL: 33:5
// CHECK-NEXT: (Token identifier
// CHECK-NEXT: (trivia newline 1)
// CHECK-NEXT: (trivia garbage_text \341\232\200)
// CHECK-NEXT: (trivia space 1)
// CHECK-NEXT: (text="x"))