Skip to content

Commit 4ee8506

Browse files
committed
[Parse] split lexInvalidCharacters from lexImpl
1 parent db47cb1 commit 4ee8506

File tree

2 files changed

+81
-55
lines changed

2 files changed

+81
-55
lines changed

include/swift/Parse/Lexer.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -532,6 +532,9 @@ class Lexer {
532532
bool tryLexConflictMarker(bool EatNewline);
533533

534534
NulCharacterKind getNulCharacterKind(const char *Ptr) const;
535+
536+
/// Lex invalid characters and return whether they should be tokenized.
537+
bool lexInvalidCharacters(const char *&Ptr);
535538
};
536539

537540
/// Given an ordered token \param Array , get the iterator pointing to the first

lib/Parse/Lexer.cpp

Lines changed: 78 additions & 55 deletions
Original file line numberDiff line numberDiff line change
@@ -1875,6 +1875,79 @@ Lexer::NulCharacterKind Lexer::getNulCharacterKind(const char *Ptr) const {
18751875
return NulCharacterKind::Embedded;
18761876
}
18771877

1878+
bool Lexer::lexInvalidCharacters(const char *&Ptr) {
1879+
assert(Ptr != nullptr);
1880+
1881+
const char *const StartPtr = Ptr;
1882+
1883+
if (advanceIfValidContinuationOfIdentifier(Ptr, BufferEnd)) {
1884+
// If this is a valid identifier continuation, but not a valid identifier
1885+
// start, attempt to recover by eating more continuation characters.
1886+
diagnose(StartPtr, diag::lex_invalid_identifier_start_character);
1887+
while (advanceIfValidContinuationOfIdentifier(Ptr, BufferEnd))
1888+
;
1889+
return true;
1890+
}
1891+
1892+
// This character isn't allowed in Swift source.
1893+
uint32_t codepoint = validateUTF8CharacterAndAdvance(Ptr, BufferEnd);
1894+
if (codepoint == ~0U) {
1895+
diagnose(StartPtr, diag::lex_invalid_utf8)
1896+
.fixItReplaceChars(getSourceLoc(StartPtr), getSourceLoc(Ptr), " ");
1897+
// Skip presumed whitespace.
1898+
return false;
1899+
}
1900+
1901+
if (codepoint == 0x0000201D) {
1902+
// If this is an end curly quote, just diagnose it with a fixit hint.
1903+
diagnose(CurPtr - 1, diag::lex_invalid_curly_quote)
1904+
.fixItReplaceChars(getSourceLoc(StartPtr), getSourceLoc(Ptr), "\"");
1905+
return true;
1906+
}
1907+
1908+
if (codepoint == 0x0000201C) {
1909+
const char *const LeftQuoteEndPtr = Ptr;
1910+
1911+
// If this is a start curly quote, do a fuzzy match of a string literal
1912+
// to improve recovery.
1913+
if (const char *const RightQuoteEndPtr =
1914+
findEndOfCurlyQuoteStringLiteral(Ptr)) {
1915+
Ptr = RightQuoteEndPtr;
1916+
}
1917+
1918+
// Note, we intentionally diagnose the end quote before the start quote,
1919+
// so that the IDE suggests fixing the end quote before the start quote.
1920+
// This, in turn, works better with our error recovery because we won't
1921+
// diagnose an end curly quote in the middle of a straight quoted
1922+
// literal.
1923+
diagnose(StartPtr, diag::lex_invalid_curly_quote)
1924+
.fixItReplaceChars(getSourceLoc(StartPtr),
1925+
getSourceLoc(LeftQuoteEndPtr), "\"");
1926+
1927+
return true;
1928+
}
1929+
1930+
diagnose(StartPtr, diag::lex_invalid_character)
1931+
.fixItReplaceChars(getSourceLoc(StartPtr), getSourceLoc(Ptr), " ");
1932+
1933+
char expectedCodepoint;
1934+
if ((expectedCodepoint =
1935+
confusable::tryConvertConfusableCharacterToASCII(codepoint))) {
1936+
1937+
llvm::SmallString<4> confusedChar;
1938+
EncodeToUTF8(codepoint, confusedChar);
1939+
llvm::SmallString<1> expectedChar;
1940+
expectedChar += expectedCodepoint;
1941+
diagnose(StartPtr, diag::lex_confusable_character, confusedChar,
1942+
expectedChar)
1943+
.fixItReplaceChars(getSourceLoc(StartPtr), getSourceLoc(Ptr),
1944+
expectedChar);
1945+
}
1946+
1947+
// Skip presumed whitespace.
1948+
return false;
1949+
}
1950+
18781951
void Lexer::tryLexEditorPlaceholder() {
18791952
assert(CurPtr[-1] == '<' && CurPtr[0] == '#');
18801953
const char *TokStart = CurPtr-1;
@@ -2100,65 +2173,15 @@ void Lexer::lexImpl() {
21002173

21012174
if (advanceIfValidStartOfOperator(tmp, BufferEnd))
21022175
return lexOperatorIdentifier();
2103-
2104-
if (advanceIfValidContinuationOfIdentifier(tmp, BufferEnd)) {
2105-
// If this is a valid identifier continuation, but not a valid identifier
2106-
// start, attempt to recover by eating more continuation characters.
2107-
diagnose(CurPtr-1, diag::lex_invalid_identifier_start_character);
2108-
while (advanceIfValidContinuationOfIdentifier(tmp, BufferEnd));
2109-
} else {
2110-
// This character isn't allowed in Swift source.
2111-
uint32_t codepoint = validateUTF8CharacterAndAdvance(tmp, BufferEnd);
2112-
if (codepoint == ~0U) {
2113-
diagnose(CurPtr-1, diag::lex_invalid_utf8)
2114-
.fixItReplaceChars(getSourceLoc(CurPtr-1), getSourceLoc(tmp), " ");
2115-
CurPtr = tmp;
2116-
goto Restart; // Skip presumed whitespace.
2117-
} else if (codepoint == 0x0000201D) {
2118-
// If this is an end curly quote, just diagnose it with a fixit hint.
2119-
diagnose(CurPtr-1, diag::lex_invalid_curly_quote)
2120-
.fixItReplaceChars(getSourceLoc(CurPtr-1), getSourceLoc(tmp), "\"");
2121-
} else if (codepoint == 0x0000201C) {
2122-
auto endPtr = tmp;
2123-
// If this is a start curly quote, do a fuzzy match of a string literal
2124-
// to improve recovery.
2125-
if (auto tmp2 = findEndOfCurlyQuoteStringLiteral(tmp))
2126-
tmp = tmp2;
2127-
2128-
// Note, we intentionally diagnose the end quote before the start quote,
2129-
// so that the IDE suggests fixing the end quote before the start quote.
2130-
// This, in turn, works better with our error recovery because we won't
2131-
// diagnose an end curly quote in the middle of a straight quoted
2132-
// literal.
2133-
diagnose(CurPtr-1, diag::lex_invalid_curly_quote)
2134-
.fixItReplaceChars(getSourceLoc(CurPtr-1), getSourceLoc(endPtr),"\"");
21352176

2136-
} else {
2137-
diagnose(CurPtr-1, diag::lex_invalid_character)
2138-
.fixItReplaceChars(getSourceLoc(CurPtr-1), getSourceLoc(tmp), " ");
2139-
2140-
char expectedCodepoint;
2141-
if ((expectedCodepoint =
2142-
confusable::tryConvertConfusableCharacterToASCII(codepoint))) {
2143-
2144-
llvm::SmallString<4> confusedChar;
2145-
EncodeToUTF8(codepoint, confusedChar);
2146-
llvm::SmallString<1> expectedChar;
2147-
expectedChar += expectedCodepoint;
2148-
diagnose(CurPtr-1, diag::lex_confusable_character,
2149-
confusedChar, expectedChar)
2150-
.fixItReplaceChars(getSourceLoc(CurPtr-1),
2151-
getSourceLoc(tmp),
2152-
expectedChar);
2153-
}
2177+
bool ShouldTokenize = lexInvalidCharacters(tmp);
2178+
CurPtr = tmp;
21542179

2155-
CurPtr = tmp;
2156-
goto Restart; // Skip presumed whitespace.
2157-
}
2180+
if (ShouldTokenize) {
2181+
return formToken(tok::unknown, TokStart);
21582182
}
21592183

2160-
CurPtr = tmp;
2161-
return formToken(tok::unknown, TokStart);
2184+
goto Restart;
21622185
}
21632186

21642187
case '\n':

0 commit comments

Comments
 (0)