Skip to content

Commit 3fc43bc

Browse files
committed
Check for zero-width characters in delimiters
1 parent 032d865 commit 3fc43bc

File tree

2 files changed

+55
-24
lines changed

2 files changed

+55
-24
lines changed

include/swift/AST/DiagnosticsParse.def

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -139,7 +139,9 @@ ERROR(lex_invalid_u_escape,none,
139139
ERROR(lex_invalid_u_escape_rbrace,none,
140140
"expected '}' in \\u{...} escape sequence", ())
141141
ERROR(lex_invalid_delimiter_escape,none,
142-
"Too many # characters in delimited escape", ())
142+
"too many # characters in delimited escape", ())
143+
ERROR(lex_zerowidth_in_string_delimiter,none,
144+
"zero-width character detected in string delimiter", ())
143145

144146
ERROR(lex_invalid_unicode_scalar,none,
145147
"invalid unicode scalar", ())

lib/Parse/Lexer.cpp

Lines changed: 52 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -1206,6 +1206,38 @@ static bool maybeConsumeNewlineEscape(const char *&CurPtr, ssize_t Offset) {
12061206
}
12071207
}
12081208

1209+
/// diagnoseZeroWidth - check for and error zero-width characters in delimiters
1210+
static bool diagnoseZeroWidth(const char *&CurPtr, DiagnosticEngine *Diags) {
1211+
const unsigned char *TmpPtr = (const unsigned char *)CurPtr;
1212+
// zero-width set assumed to be: U+200B, U+200C, U+200D, U+2060, U+FEFF
1213+
while ((TmpPtr[0] == 0xE2 && ((TmpPtr[1] == 0x80 &&
1214+
(TmpPtr[2] == 0x8B || TmpPtr[2] == 0x8C || TmpPtr[2] == 0x8D)) ||
1215+
(TmpPtr[1] == 0x81 && TmpPtr[2] == 0xA0))) ||
1216+
(TmpPtr[0] == 0xEF && TmpPtr[1] == 0xBB && TmpPtr[2] == 0xBF)) {
1217+
if (Diags)
1218+
Diags->diagnose(Lexer::getSourceLoc(CurPtr),
1219+
diag::lex_zerowidth_in_string_delimiter)
1220+
.fixItRemoveChars(Lexer::getSourceLoc(CurPtr),
1221+
Lexer::getSourceLoc(CurPtr + 3));
1222+
TmpPtr += 3;
1223+
CurPtr += 3;
1224+
}
1225+
return true;
1226+
}
1227+
1228+
/// advanceIfMultilineDelimiter - centralized check for multiline delimiter
1229+
static bool advanceIfMultilineDelimiter(const char *&CurPtr,
1230+
DiagnosticEngine *Diags) {
1231+
const char *TmpPtr = CurPtr - 1;
1232+
if (*TmpPtr++ == '"' && diagnoseZeroWidth(TmpPtr, Diags) &&
1233+
*TmpPtr++ == '"' && diagnoseZeroWidth(TmpPtr, Diags) &&
1234+
*TmpPtr++ == '"') {
1235+
CurPtr = TmpPtr;
1236+
return true;
1237+
}
1238+
return false;
1239+
}
1240+
12091241
/// extractStringDelimiterLength - Extracts/detects any custom delimiter on
12101242
/// opening a string literal and advances CurPtr if a delimiter is found and
12111243
/// returns a non-zero delimiter length. CurPtr[-1] generally '#' when called.
@@ -1226,13 +1258,15 @@ static unsigned extractStringDelimiterLength(const char *&CurPtr) {
12261258
/// interpolation inside a "raw" string. Normal/cooked string processing is
12271259
/// the degenerate case of there being no # characters surrounding the quotes.
12281260
/// If delimiter matches, advances byte pointer passed in and returns true.
1229-
static bool delimiterMatches(unsigned DelimiterLength, const char *&BytesPtr) {
1261+
static bool delimiterMatches(unsigned DelimiterLength, const char *&BytesPtr,
1262+
DiagnosticEngine *Diags) {
12301263
if (!DelimiterLength)
12311264
return true;
1265+
const char *TmpPtr = BytesPtr;
12321266
for (unsigned i = 0; i < DelimiterLength; i++)
1233-
if (BytesPtr[i] != '#')
1267+
if (diagnoseZeroWidth(TmpPtr, Diags) && *TmpPtr++ != '#')
12341268
return false;
1235-
BytesPtr += DelimiterLength;
1269+
BytesPtr = TmpPtr;
12361270
return true;
12371271
}
12381272

@@ -1295,7 +1329,7 @@ unsigned Lexer::lexCharacter(const char *&CurPtr, char StopQuote,
12951329
diagnose(CurPtr-1, diag::lex_unterminated_string);
12961330
return ~1U;
12971331
case '\\': // Escapes.
1298-
if (!delimiterMatches(DelimiterLength, CurPtr))
1332+
if (!delimiterMatches(DelimiterLength, CurPtr, Diags))
12991333
return '\\';
13001334
break;
13011335
}
@@ -1413,7 +1447,8 @@ static const char *skipToEndOfInterpolatedExpression(const char *CurPtr,
14131447
if (!AllowNewline.back() && inStringLiteral()) {
14141448
unsigned InnerDelimiter = CustomDelimiter.back();
14151449
if (OpenDelimiters.back() == CurPtr[-1] && (!InnerDelimiter ||
1416-
(delimiterMatches(InnerDelimiter, CurPtr) && *CurPtr != '#'))) {
1450+
(delimiterMatches(InnerDelimiter, CurPtr, Diags)
1451+
&& *CurPtr != '#'))) {
14171452
// Closing single line string literal.
14181453
OpenDelimiters.pop_back();
14191454
AllowNewline.pop_back();
@@ -1423,10 +1458,7 @@ static const char *skipToEndOfInterpolatedExpression(const char *CurPtr,
14231458
continue;
14241459
}
14251460

1426-
bool isMultilineQuote = (
1427-
*CurPtr == '"' && *(CurPtr + 1) == '"' && *(CurPtr - 1) == '"');
1428-
if (isMultilineQuote)
1429-
CurPtr += 2;
1461+
bool isMultilineQuote = advanceIfMultilineDelimiter(CurPtr, Diags);
14301462

14311463
if (!inStringLiteral()) {
14321464
// Open string literal
@@ -1439,7 +1471,7 @@ static const char *skipToEndOfInterpolatedExpression(const char *CurPtr,
14391471
// We are in multiline string literal.
14401472
assert(AllowNewline.back() && "other cases must be handled above");
14411473
if (isMultilineQuote &&
1442-
delimiterMatches(CustomDelimiter.back(), CurPtr)) {
1474+
delimiterMatches(CustomDelimiter.back(), CurPtr, Diags)) {
14431475
// Close multiline string literal.
14441476
OpenDelimiters.pop_back();
14451477
AllowNewline.pop_back();
@@ -1451,7 +1483,7 @@ static const char *skipToEndOfInterpolatedExpression(const char *CurPtr,
14511483
}
14521484
case '\\':
14531485
if (inStringLiteral() &&
1454-
delimiterMatches(CustomDelimiter.back(), CurPtr)) {
1486+
delimiterMatches(CustomDelimiter.back(), CurPtr, Diags)) {
14551487
char escapedChar = *CurPtr++;
14561488
switch (escapedChar) {
14571489
case '(':
@@ -1716,23 +1748,18 @@ void Lexer::lexStringLiteral(unsigned DelimiterLength) {
17161748
// diagnostics about changing them to double quotes.
17171749

17181750
bool wasErroneous = false, MultilineString = false;
1719-
SmallString<8> ExtraTermination;
17201751

17211752
// Is this the start of a multiline string literal?
1722-
if (*TokStart == '"' && *CurPtr == '"' && *(CurPtr + 1) == '"') {
1723-
MultilineString = true;
1724-
CurPtr += 2;
1753+
if ((MultilineString = advanceIfMultilineDelimiter(CurPtr, Diags))) {
17251754
if (*CurPtr != '\n' && *CurPtr != '\r')
17261755
diagnose(CurPtr, diag::lex_illegal_multiline_string_start)
17271756
.fixItInsert(Lexer::getSourceLoc(CurPtr), "\n");
1728-
ExtraTermination.append(2, *TokStart);
17291757
}
1730-
ExtraTermination.append(DelimiterLength, '#');
17311758

17321759
while (true) {
17331760
const char *TmpPtr = CurPtr + 1;
17341761
if (*CurPtr == '\\' &&
1735-
delimiterMatches(DelimiterLength, TmpPtr) && *TmpPtr == '(') {
1762+
delimiterMatches(DelimiterLength, TmpPtr, Diags) && *TmpPtr == '(') {
17361763
// Consume tokens until we hit the corresponding ')'.
17371764
CurPtr = TmpPtr + 1;
17381765
const char *EndPtr =
@@ -1801,10 +1828,10 @@ void Lexer::lexStringLiteral(unsigned DelimiterLength) {
18011828
}
18021829

18031830
// Is this the end of multiline/delimited string literal?
1804-
if (StringRef(CurPtr, BufferEnd - CurPtr).startswith(ExtraTermination) &&
1805-
(!DelimiterLength || *(CurPtr + ExtraTermination.size()) != '#')) {
1831+
if ((!MultilineString || advanceIfMultilineDelimiter(CurPtr, Diags)) &&
1832+
(!DelimiterLength || (delimiterMatches(DelimiterLength, CurPtr, Diags)
1833+
&& *CurPtr != '#'))) {
18061834
TokStart -= DelimiterLength;
1807-
CurPtr += ExtraTermination.size();
18081835
if (wasErroneous)
18091836
return formToken(tok::unknown, TokStart);
18101837

@@ -2127,7 +2154,8 @@ StringRef Lexer::getEncodedStringSegment(StringRef Bytes,
21272154
continue;
21282155
}
21292156

2130-
if (CurChar != '\\' || !delimiterMatches(DelimiterLength, BytesPtr)) {
2157+
if (CurChar != '\\' ||
2158+
!delimiterMatches(DelimiterLength, BytesPtr, nullptr)) {
21312159
TempString.push_back(CurChar);
21322160
continue;
21332161
}
@@ -2212,7 +2240,8 @@ void Lexer::getStringLiteralSegments(
22122240
while ((pos = Bytes.find('\\', BytesPtr-Bytes.begin())) != StringRef::npos) {
22132241
BytesPtr = Bytes.begin() + pos + 1;
22142242

2215-
if (!delimiterMatches(DelimiterLength, BytesPtr) || *BytesPtr++ != '(')
2243+
if (!delimiterMatches(DelimiterLength, BytesPtr, Diags) ||
2244+
*BytesPtr++ != '(')
22162245
continue;
22172246

22182247
// String interpolation.

0 commit comments

Comments
 (0)