@@ -1206,6 +1206,38 @@ static bool maybeConsumeNewlineEscape(const char *&CurPtr, ssize_t Offset) {
1206
1206
}
1207
1207
}
1208
1208
1209
+ // / diagnoseZeroWidth - check for and error zero-width characters in delimiters
1210
+ static bool diagnoseZeroWidth (const char *&CurPtr, DiagnosticEngine *Diags) {
1211
+ const unsigned char *TmpPtr = (const unsigned char *)CurPtr;
1212
+ // zero-width set assumed to be: U+200B, U+200C, U+200D, U+2060, U+FEFF
1213
+ while ((TmpPtr[0 ] == 0xE2 && ((TmpPtr[1 ] == 0x80 &&
1214
+ (TmpPtr[2 ] == 0x8B || TmpPtr[2 ] == 0x8C || TmpPtr[2 ] == 0x8D )) ||
1215
+ (TmpPtr[1 ] == 0x81 && TmpPtr[2 ] == 0xA0 ))) ||
1216
+ (TmpPtr[0 ] == 0xEF && TmpPtr[1 ] == 0xBB && TmpPtr[2 ] == 0xBF )) {
1217
+ if (Diags)
1218
+ Diags->diagnose (Lexer::getSourceLoc (CurPtr),
1219
+ diag::lex_zerowidth_in_string_delimiter)
1220
+ .fixItRemoveChars (Lexer::getSourceLoc (CurPtr),
1221
+ Lexer::getSourceLoc (CurPtr + 3 ));
1222
+ TmpPtr += 3 ;
1223
+ CurPtr += 3 ;
1224
+ }
1225
+ return true ;
1226
+ }
1227
+
1228
+ // / advanceIfMultilineDelimiter - centralized check for multiline delimiter
1229
+ static bool advanceIfMultilineDelimiter (const char *&CurPtr,
1230
+ DiagnosticEngine *Diags) {
1231
+ const char *TmpPtr = CurPtr - 1 ;
1232
+ if (*TmpPtr++ == ' "' && diagnoseZeroWidth (TmpPtr, Diags) &&
1233
+ *TmpPtr++ == ' "' && diagnoseZeroWidth (TmpPtr, Diags) &&
1234
+ *TmpPtr++ == ' "' ) {
1235
+ CurPtr = TmpPtr;
1236
+ return true ;
1237
+ }
1238
+ return false ;
1239
+ }
1240
+
1209
1241
// / extractStringDelimiterLength - Extracts/detects any custom delimiter on
1210
1242
// / opening a string literal and advances CurPtr if a delimiter is found and
1211
1243
// / returns a non-zero delimiter length. CurPtr[-1] generally '#' when called.
@@ -1226,13 +1258,15 @@ static unsigned extractStringDelimiterLength(const char *&CurPtr) {
1226
1258
// / interpolation inside a "raw" string. Normal/cooked string processing is
1227
1259
// / the degenerate case of there being no # characters surrounding the quotes.
1228
1260
// / If delimiter matches, advances byte pointer passed in and returns true.
1229
- static bool delimiterMatches (unsigned DelimiterLength, const char *&BytesPtr) {
1261
+ static bool delimiterMatches (unsigned DelimiterLength, const char *&BytesPtr,
1262
+ DiagnosticEngine *Diags) {
1230
1263
if (!DelimiterLength)
1231
1264
return true ;
1265
+ const char *TmpPtr = BytesPtr;
1232
1266
for (unsigned i = 0 ; i < DelimiterLength; i++)
1233
- if (BytesPtr[i] != ' #' )
1267
+ if (diagnoseZeroWidth (TmpPtr, Diags) && *TmpPtr++ != ' #' )
1234
1268
return false ;
1235
- BytesPtr += DelimiterLength ;
1269
+ BytesPtr = TmpPtr ;
1236
1270
return true ;
1237
1271
}
1238
1272
@@ -1295,7 +1329,7 @@ unsigned Lexer::lexCharacter(const char *&CurPtr, char StopQuote,
1295
1329
diagnose (CurPtr-1 , diag::lex_unterminated_string);
1296
1330
return ~1U ;
1297
1331
case ' \\ ' : // Escapes.
1298
- if (!delimiterMatches (DelimiterLength, CurPtr))
1332
+ if (!delimiterMatches (DelimiterLength, CurPtr, Diags ))
1299
1333
return ' \\ ' ;
1300
1334
break ;
1301
1335
}
@@ -1413,7 +1447,8 @@ static const char *skipToEndOfInterpolatedExpression(const char *CurPtr,
1413
1447
if (!AllowNewline.back () && inStringLiteral ()) {
1414
1448
unsigned InnerDelimiter = CustomDelimiter.back ();
1415
1449
if (OpenDelimiters.back () == CurPtr[-1 ] && (!InnerDelimiter ||
1416
- (delimiterMatches (InnerDelimiter, CurPtr) && *CurPtr != ' #' ))) {
1450
+ (delimiterMatches (InnerDelimiter, CurPtr, Diags)
1451
+ && *CurPtr != ' #' ))) {
1417
1452
// Closing single line string literal.
1418
1453
OpenDelimiters.pop_back ();
1419
1454
AllowNewline.pop_back ();
@@ -1423,10 +1458,7 @@ static const char *skipToEndOfInterpolatedExpression(const char *CurPtr,
1423
1458
continue ;
1424
1459
}
1425
1460
1426
- bool isMultilineQuote = (
1427
- *CurPtr == ' "' && *(CurPtr + 1 ) == ' "' && *(CurPtr - 1 ) == ' "' );
1428
- if (isMultilineQuote)
1429
- CurPtr += 2 ;
1461
+ bool isMultilineQuote = advanceIfMultilineDelimiter (CurPtr, Diags);
1430
1462
1431
1463
if (!inStringLiteral ()) {
1432
1464
// Open string literal
@@ -1439,7 +1471,7 @@ static const char *skipToEndOfInterpolatedExpression(const char *CurPtr,
1439
1471
// We are in multiline string literal.
1440
1472
assert (AllowNewline.back () && " other cases must be handled above" );
1441
1473
if (isMultilineQuote &&
1442
- delimiterMatches (CustomDelimiter.back (), CurPtr)) {
1474
+ delimiterMatches (CustomDelimiter.back (), CurPtr, Diags )) {
1443
1475
// Close multiline string literal.
1444
1476
OpenDelimiters.pop_back ();
1445
1477
AllowNewline.pop_back ();
@@ -1451,7 +1483,7 @@ static const char *skipToEndOfInterpolatedExpression(const char *CurPtr,
1451
1483
}
1452
1484
case ' \\ ' :
1453
1485
if (inStringLiteral () &&
1454
- delimiterMatches (CustomDelimiter.back (), CurPtr)) {
1486
+ delimiterMatches (CustomDelimiter.back (), CurPtr, Diags )) {
1455
1487
char escapedChar = *CurPtr++;
1456
1488
switch (escapedChar) {
1457
1489
case ' (' :
@@ -1716,23 +1748,18 @@ void Lexer::lexStringLiteral(unsigned DelimiterLength) {
1716
1748
// diagnostics about changing them to double quotes.
1717
1749
1718
1750
bool wasErroneous = false , MultilineString = false ;
1719
- SmallString<8 > ExtraTermination;
1720
1751
1721
1752
// Is this the start of a multiline string literal?
1722
- if (*TokStart == ' "' && *CurPtr == ' "' && *(CurPtr + 1 ) == ' "' ) {
1723
- MultilineString = true ;
1724
- CurPtr += 2 ;
1753
+ if ((MultilineString = advanceIfMultilineDelimiter (CurPtr, Diags))) {
1725
1754
if (*CurPtr != ' \n ' && *CurPtr != ' \r ' )
1726
1755
diagnose (CurPtr, diag::lex_illegal_multiline_string_start)
1727
1756
.fixItInsert (Lexer::getSourceLoc (CurPtr), " \n " );
1728
- ExtraTermination.append (2 , *TokStart);
1729
1757
}
1730
- ExtraTermination.append (DelimiterLength, ' #' );
1731
1758
1732
1759
while (true ) {
1733
1760
const char *TmpPtr = CurPtr + 1 ;
1734
1761
if (*CurPtr == ' \\ ' &&
1735
- delimiterMatches (DelimiterLength, TmpPtr) && *TmpPtr == ' (' ) {
1762
+ delimiterMatches (DelimiterLength, TmpPtr, Diags ) && *TmpPtr == ' (' ) {
1736
1763
// Consume tokens until we hit the corresponding ')'.
1737
1764
CurPtr = TmpPtr + 1 ;
1738
1765
const char *EndPtr =
@@ -1801,10 +1828,10 @@ void Lexer::lexStringLiteral(unsigned DelimiterLength) {
1801
1828
}
1802
1829
1803
1830
// Is this the end of multiline/delimited string literal?
1804
- if (StringRef (CurPtr, BufferEnd - CurPtr).startswith (ExtraTermination) &&
1805
- (!DelimiterLength || *(CurPtr + ExtraTermination.size ()) != ' #' )) {
1831
+ if ((!MultilineString || advanceIfMultilineDelimiter (CurPtr, Diags)) &&
1832
+ (!DelimiterLength || (delimiterMatches (DelimiterLength, CurPtr, Diags)
1833
+ && *CurPtr != ' #' ))) {
1806
1834
TokStart -= DelimiterLength;
1807
- CurPtr += ExtraTermination.size ();
1808
1835
if (wasErroneous)
1809
1836
return formToken (tok::unknown, TokStart);
1810
1837
@@ -2127,7 +2154,8 @@ StringRef Lexer::getEncodedStringSegment(StringRef Bytes,
2127
2154
continue ;
2128
2155
}
2129
2156
2130
- if (CurChar != ' \\ ' || !delimiterMatches (DelimiterLength, BytesPtr)) {
2157
+ if (CurChar != ' \\ ' ||
2158
+ !delimiterMatches (DelimiterLength, BytesPtr, nullptr )) {
2131
2159
TempString.push_back (CurChar);
2132
2160
continue ;
2133
2161
}
@@ -2212,7 +2240,8 @@ void Lexer::getStringLiteralSegments(
2212
2240
while ((pos = Bytes.find (' \\ ' , BytesPtr-Bytes.begin ())) != StringRef::npos) {
2213
2241
BytesPtr = Bytes.begin () + pos + 1 ;
2214
2242
2215
- if (!delimiterMatches (DelimiterLength, BytesPtr) || *BytesPtr++ != ' (' )
2243
+ if (!delimiterMatches (DelimiterLength, BytesPtr, Diags) ||
2244
+ *BytesPtr++ != ' (' )
2216
2245
continue ;
2217
2246
2218
2247
// String interpolation.
0 commit comments