@@ -1875,6 +1875,79 @@ Lexer::NulCharacterKind Lexer::getNulCharacterKind(const char *Ptr) const {
1875
1875
return NulCharacterKind::Embedded;
1876
1876
}
1877
1877
1878
+ bool Lexer::lexInvalidCharacters (const char *&Ptr) {
1879
+ assert (Ptr != nullptr );
1880
+
1881
+ const char *const StartPtr = Ptr;
1882
+
1883
+ if (advanceIfValidContinuationOfIdentifier (Ptr, BufferEnd)) {
1884
+ // If this is a valid identifier continuation, but not a valid identifier
1885
+ // start, attempt to recover by eating more continuation characters.
1886
+ diagnose (StartPtr, diag::lex_invalid_identifier_start_character);
1887
+ while (advanceIfValidContinuationOfIdentifier (Ptr, BufferEnd))
1888
+ ;
1889
+ return true ;
1890
+ }
1891
+
1892
+ // This character isn't allowed in Swift source.
1893
+ uint32_t codepoint = validateUTF8CharacterAndAdvance (Ptr, BufferEnd);
1894
+ if (codepoint == ~0U ) {
1895
+ diagnose (StartPtr, diag::lex_invalid_utf8)
1896
+ .fixItReplaceChars (getSourceLoc (StartPtr), getSourceLoc (Ptr), " " );
1897
+ // Skip presumed whitespace.
1898
+ return false ;
1899
+ }
1900
+
1901
+ if (codepoint == 0x0000201D ) {
1902
+ // If this is an end curly quote, just diagnose it with a fixit hint.
1903
+ diagnose (CurPtr - 1 , diag::lex_invalid_curly_quote)
1904
+ .fixItReplaceChars (getSourceLoc (StartPtr), getSourceLoc (Ptr), " \" " );
1905
+ return true ;
1906
+ }
1907
+
1908
+ if (codepoint == 0x0000201C ) {
1909
+ const char *const LeftQuoteEndPtr = Ptr;
1910
+
1911
+ // If this is a start curly quote, do a fuzzy match of a string literal
1912
+ // to improve recovery.
1913
+ if (const char *const RightQuoteEndPtr =
1914
+ findEndOfCurlyQuoteStringLiteral (Ptr)) {
1915
+ Ptr = RightQuoteEndPtr;
1916
+ }
1917
+
1918
+ // Note, we intentionally diagnose the end quote before the start quote,
1919
+ // so that the IDE suggests fixing the end quote before the start quote.
1920
+ // This, in turn, works better with our error recovery because we won't
1921
+ // diagnose an end curly quote in the middle of a straight quoted
1922
+ // literal.
1923
+ diagnose (StartPtr, diag::lex_invalid_curly_quote)
1924
+ .fixItReplaceChars (getSourceLoc (StartPtr),
1925
+ getSourceLoc (LeftQuoteEndPtr), " \" " );
1926
+
1927
+ return true ;
1928
+ }
1929
+
1930
+ diagnose (StartPtr, diag::lex_invalid_character)
1931
+ .fixItReplaceChars (getSourceLoc (StartPtr), getSourceLoc (Ptr), " " );
1932
+
1933
+ char expectedCodepoint;
1934
+ if ((expectedCodepoint =
1935
+ confusable::tryConvertConfusableCharacterToASCII (codepoint))) {
1936
+
1937
+ llvm::SmallString<4 > confusedChar;
1938
+ EncodeToUTF8 (codepoint, confusedChar);
1939
+ llvm::SmallString<1 > expectedChar;
1940
+ expectedChar += expectedCodepoint;
1941
+ diagnose (StartPtr, diag::lex_confusable_character, confusedChar,
1942
+ expectedChar)
1943
+ .fixItReplaceChars (getSourceLoc (StartPtr), getSourceLoc (Ptr),
1944
+ expectedChar);
1945
+ }
1946
+
1947
+ // Skip presumed whitespace.
1948
+ return false ;
1949
+ }
1950
+
1878
1951
void Lexer::tryLexEditorPlaceholder () {
1879
1952
assert (CurPtr[-1 ] == ' <' && CurPtr[0 ] == ' #' );
1880
1953
const char *TokStart = CurPtr-1 ;
@@ -2100,65 +2173,15 @@ void Lexer::lexImpl() {
2100
2173
2101
2174
if (advanceIfValidStartOfOperator (tmp, BufferEnd))
2102
2175
return lexOperatorIdentifier ();
2103
-
2104
- if (advanceIfValidContinuationOfIdentifier (tmp, BufferEnd)) {
2105
- // If this is a valid identifier continuation, but not a valid identifier
2106
- // start, attempt to recover by eating more continuation characters.
2107
- diagnose (CurPtr-1 , diag::lex_invalid_identifier_start_character);
2108
- while (advanceIfValidContinuationOfIdentifier (tmp, BufferEnd));
2109
- } else {
2110
- // This character isn't allowed in Swift source.
2111
- uint32_t codepoint = validateUTF8CharacterAndAdvance (tmp, BufferEnd);
2112
- if (codepoint == ~0U ) {
2113
- diagnose (CurPtr-1 , diag::lex_invalid_utf8)
2114
- .fixItReplaceChars (getSourceLoc (CurPtr-1 ), getSourceLoc (tmp), " " );
2115
- CurPtr = tmp;
2116
- goto Restart; // Skip presumed whitespace.
2117
- } else if (codepoint == 0x0000201D ) {
2118
- // If this is an end curly quote, just diagnose it with a fixit hint.
2119
- diagnose (CurPtr-1 , diag::lex_invalid_curly_quote)
2120
- .fixItReplaceChars (getSourceLoc (CurPtr-1 ), getSourceLoc (tmp), " \" " );
2121
- } else if (codepoint == 0x0000201C ) {
2122
- auto endPtr = tmp;
2123
- // If this is a start curly quote, do a fuzzy match of a string literal
2124
- // to improve recovery.
2125
- if (auto tmp2 = findEndOfCurlyQuoteStringLiteral (tmp))
2126
- tmp = tmp2;
2127
-
2128
- // Note, we intentionally diagnose the end quote before the start quote,
2129
- // so that the IDE suggests fixing the end quote before the start quote.
2130
- // This, in turn, works better with our error recovery because we won't
2131
- // diagnose an end curly quote in the middle of a straight quoted
2132
- // literal.
2133
- diagnose (CurPtr-1 , diag::lex_invalid_curly_quote)
2134
- .fixItReplaceChars (getSourceLoc (CurPtr-1 ), getSourceLoc (endPtr)," \" " );
2135
2176
2136
- } else {
2137
- diagnose (CurPtr-1 , diag::lex_invalid_character)
2138
- .fixItReplaceChars (getSourceLoc (CurPtr-1 ), getSourceLoc (tmp), " " );
2139
-
2140
- char expectedCodepoint;
2141
- if ((expectedCodepoint =
2142
- confusable::tryConvertConfusableCharacterToASCII (codepoint))) {
2143
-
2144
- llvm::SmallString<4 > confusedChar;
2145
- EncodeToUTF8 (codepoint, confusedChar);
2146
- llvm::SmallString<1 > expectedChar;
2147
- expectedChar += expectedCodepoint;
2148
- diagnose (CurPtr-1 , diag::lex_confusable_character,
2149
- confusedChar, expectedChar)
2150
- .fixItReplaceChars (getSourceLoc (CurPtr-1 ),
2151
- getSourceLoc (tmp),
2152
- expectedChar);
2153
- }
2177
+ bool ShouldTokenize = lexInvalidCharacters (tmp);
2178
+ CurPtr = tmp;
2154
2179
2155
- CurPtr = tmp;
2156
- goto Restart; // Skip presumed whitespace.
2157
- }
2180
+ if (ShouldTokenize) {
2181
+ return formToken (tok::unknown, TokStart);
2158
2182
}
2159
2183
2160
- CurPtr = tmp;
2161
- return formToken (tok::unknown, TokStart);
2184
+ goto Restart;
2162
2185
}
2163
2186
2164
2187
case ' \n ' :
0 commit comments