Skip to content

Commit 2317048

Browse files
committed
Alternative implementation for raw strings
1 parent 14213b8 commit 2317048

File tree

4 files changed

+137
-74
lines changed

4 files changed

+137
-74
lines changed

include/swift/Parse/Lexer.h

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -355,21 +355,21 @@ class Lexer {
355355
enum : char { Literal, Expr } Kind;
356356
// Loc+Length for the segment inside the string literal, without quotes.
357357
SourceLoc Loc;
358-
unsigned Length, IndentToStrip;
359-
bool IsFirstSegment, IsLastSegment, RawString;
358+
unsigned Length, IndentToStrip, DelimiterLength;
359+
bool IsFirstSegment, IsLastSegment;
360360

361361
static StringSegment getLiteral(SourceLoc Loc, unsigned Length,
362362
bool IsFirstSegment, bool IsLastSegment,
363363
unsigned IndentToStrip,
364-
bool RawString) {
364+
unsigned DelimiterLength) {
365365
StringSegment Result;
366366
Result.Kind = Literal;
367367
Result.Loc = Loc;
368368
Result.Length = Length;
369369
Result.IsFirstSegment = IsFirstSegment;
370370
Result.IsLastSegment = IsLastSegment;
371371
Result.IndentToStrip = IndentToStrip;
372-
Result.RawString = RawString;
372+
Result.DelimiterLength = DelimiterLength;
373373
return Result;
374374
}
375375

@@ -381,7 +381,7 @@ class Lexer {
381381
Result.IsFirstSegment = false;
382382
Result.IsLastSegment = false;
383383
Result.IndentToStrip = 0;
384-
Result.RawString = false;
384+
Result.DelimiterLength = 0;
385385
return Result;
386386
}
387387

@@ -399,13 +399,13 @@ class Lexer {
399399
bool IsFirstSegment = false,
400400
bool IsLastSegment = false,
401401
unsigned IndentToStrip = 0,
402-
bool RawString = false);
402+
unsigned DelimiterLength = 0);
403403
StringRef getEncodedStringSegment(StringSegment Segment,
404404
SmallVectorImpl<char> &Buffer) const {
405405
return getEncodedStringSegment(
406406
StringRef(getBufferPtrForSourceLoc(Segment.Loc), Segment.Length),
407407
Buffer, Segment.IsFirstSegment, Segment.IsLastSegment,
408-
Segment.IndentToStrip, Segment.RawString);
408+
Segment.IndentToStrip, Segment.DelimiterLength);
409409
}
410410

411411
/// \brief Given a string literal token, separate it into string/expr segments
@@ -470,7 +470,7 @@ class Lexer {
470470
}
471471

472472
void formToken(tok Kind, const char *TokStart, bool MultilineString = false,
473-
bool RawString = false, size_t DelimiterLength = 0);
473+
unsigned DelimiterLength = 0);
474474
void formEscapedIdentifierToken(const char *TokStart);
475475

476476
/// Advance to the end of the line.
@@ -496,8 +496,8 @@ class Lexer {
496496

497497
unsigned lexCharacter(const char *&CurPtr,
498498
char StopQuote, bool EmitDiagnostics,
499-
bool Multiline = false, bool RawString = false);
500-
void lexStringLiteral(bool RawString = false, std::string Delimiter = "");
499+
bool Multiline = false, unsigned DelimiterLength = 0);
500+
void lexStringLiteral(unsigned DelimiterLength = 0);
501501
void lexEscapedIdentifier();
502502

503503
void tryLexEditorPlaceholder();

include/swift/Parse/Token.h

Lines changed: 10 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -44,9 +44,11 @@ class Token {
4444

4545
/// Modifiers for string literals
4646
unsigned MultilineString : 1;
47-
unsigned RawString : 1;
4847

49-
// Padding bits == 32 - sizeof(Kind) * 8 - 4;
48+
/// Length of custom delimiter of "raw" string literals
49+
unsigned StringDelimiterLength : 8;
50+
51+
// Padding bits == 32 - 11;
5052

5153
/// \brief The length of the comment that precedes the token.
5254
unsigned CommentLength;
@@ -63,7 +65,7 @@ class Token {
6365
public:
6466
Token(tok Kind, StringRef Text, unsigned CommentLength = 0)
6567
: Kind(Kind), AtStartOfLine(false), EscapedIdentifier(false),
66-
MultilineString(false), RawString(false),
68+
MultilineString(false), StringDelimiterLength(0),
6769
CommentLength(CommentLength), Text(Text) {}
6870

6971
Token() : Token(tok::NUM_TOKENS, {}, 0) {}
@@ -267,21 +269,22 @@ class Token {
267269

268270
/// \brief Set the token to the specified kind and source range.
269271
void setToken(tok K, StringRef T, unsigned CommentLength = 0,
270-
bool MultilineString = false, bool RawString = false) {
272+
bool MultilineString = false, unsigned DelimiterLength = 0) {
271273
Kind = K;
272274
Text = T;
273275
this->CommentLength = CommentLength;
274276
EscapedIdentifier = false;
275277
this->MultilineString = MultilineString;
276-
this->RawString = RawString;
278+
StringDelimiterLength = DelimiterLength;
279+
assert(StringDelimiterLength == DelimiterLength && "delimiter too long");
277280
}
278281

279282
bool IsMultilineString() const {
280283
return MultilineString;
281284
}
282285

283-
bool IsRawString() const {
284-
return RawString;
286+
unsigned DelimiterLength() const {
287+
return StringDelimiterLength;
285288
}
286289
};
287290

lib/Parse/Lexer.cpp

Lines changed: 51 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -268,8 +268,7 @@ Token Lexer::getTokenAt(SourceLoc Loc) {
268268
}
269269

270270
void Lexer::formToken(tok Kind, const char *TokStart,
271-
bool MultilineString, bool RawString,
272-
size_t DelimiterLength) {
271+
bool MultilineString, unsigned DelimiterLength) {
273272
assert(CurPtr >= BufferStart &&
274273
CurPtr <= BufferEnd && "Current pointer out of range!");
275274

@@ -296,7 +295,7 @@ void Lexer::formToken(tok Kind, const char *TokStart,
296295
}
297296

298297
NextToken.setToken(Kind, TokenText, CommentLength,
299-
MultilineString, RawString);
298+
MultilineString, DelimiterLength);
300299
CurPtr += DelimiterLength;
301300
}
302301

@@ -1208,6 +1207,21 @@ static bool maybeConsumeNewlineEscape(const char *&CurPtr, ssize_t Offset) {
12081207
}
12091208
}
12101209

1210+
/// delimiterMatches - Does custom delimiter (# characters surrounding quotes)
1211+
/// match the number of # characters after \ inside the string? This allows
1212+
/// interpolation inside a "raw" string. Normal/cooked string processing is
1213+
/// the degenerate case of there being no # characters surrounding the quotes.
1214+
/// If delimiter matches, advances byte pointer passed in and returns true.
1215+
static bool delimiterMatches(unsigned DelimiterLength, const char *&BytesPtr) {
1216+
if (!DelimiterLength)
1217+
return true;
1218+
for (unsigned i = 0; i < DelimiterLength ; i++)
1219+
if (BytesPtr[i] != '#')
1220+
return false;
1221+
BytesPtr += DelimiterLength;
1222+
return true;
1223+
}
1224+
12111225
/// lexCharacter - Read a character and return its UTF32 code. If this is the
12121226
/// end of enclosing string/character sequence (i.e. the character is equal to
12131227
/// 'StopQuote'), this returns ~0U and leaves 'CurPtr' pointing to the terminal
@@ -1218,7 +1232,7 @@ static bool maybeConsumeNewlineEscape(const char *&CurPtr, ssize_t Offset) {
12181232
/// character_escape ::= unicode_character_escape
12191233
unsigned Lexer::lexCharacter(const char *&CurPtr, char StopQuote,
12201234
bool EmitDiagnostics, bool MultilineString,
1221-
bool RawString) {
1235+
unsigned DelimiterLength) {
12221236
const char *CharStart = CurPtr;
12231237

12241238
switch (*CurPtr++) {
@@ -1267,7 +1281,7 @@ unsigned Lexer::lexCharacter(const char *&CurPtr, char StopQuote,
12671281
diagnose(CurPtr-1, diag::lex_unterminated_string);
12681282
return ~1U;
12691283
case '\\': // Escapes.
1270-
if (RawString)
1284+
if (!delimiterMatches(DelimiterLength, CurPtr))
12711285
return '\\';
12721286
break;
12731287
}
@@ -1496,7 +1510,7 @@ getMultilineTrailingIndent(const Token &Str, DiagnosticEngine *Diags) {
14961510
auto string = StringRef(start, end - start);
14971511

14981512
// Disallow escaped newline in the last line.
1499-
if (Diags && !Str.IsRawString()) {
1513+
if (Diags && Str.DelimiterLength() == 0) {
15001514
auto *Ptr = start - 1;
15011515
if (*Ptr == '\n') --Ptr;
15021516
if (*Ptr == '\r') --Ptr;
@@ -1652,30 +1666,34 @@ static void validateMultilineIndents(const Token &Str,
16521666
/// lexStringLiteral:
16531667
/// string_literal ::= ["]([^"\\\n\r]|character_escape)*["]
16541668
/// string_literal ::= ["]["]["].*["]["]["] - approximately
1655-
void Lexer::lexStringLiteral(bool RawString, std::string Delimiter) {
1656-
CurPtr += Delimiter.length();
1669+
void Lexer::lexStringLiteral(unsigned DelimiterLength) {
16571670
const char *TokStart = CurPtr-1;
16581671
assert((*TokStart == '"' || *TokStart == '\'') && "Unexpected start");
16591672
// NOTE: We only allow single-quote string literals so we can emit useful
16601673
// diagnostics about changing them to double quotes.
16611674

16621675
bool wasErroneous = false, MultilineString = false;
1663-
Delimiter.insert(0, 1, *TokStart);
1676+
std::string Delimiter;
1677+
Delimiter.push_back(*TokStart);
16641678

16651679
// Is this the start of a multiline string literal?
16661680
if (*TokStart == '"' && *CurPtr == '"' && *(CurPtr + 1) == '"') {
16671681
MultilineString = true;
16681682
CurPtr += 2;
1669-
Delimiter.insert(0, 2, *TokStart);
1683+
Delimiter.push_back(*TokStart);
1684+
Delimiter.push_back(*TokStart);
16701685
if (*CurPtr != '\n' && *CurPtr != '\r')
16711686
diagnose(CurPtr, diag::lex_illegal_multiline_string_start)
16721687
.fixItInsert(Lexer::getSourceLoc(CurPtr), "\n");
16731688
}
1689+
Delimiter.insert(Delimiter.size(), DelimiterLength, '#');
16741690

16751691
while (true) {
1676-
if (*CurPtr == '\\' && *(CurPtr + 1) == '(' && !RawString) {
1692+
const char *TmpPtr = CurPtr + 1;
1693+
if (*CurPtr == '\\' &&
1694+
delimiterMatches(DelimiterLength, TmpPtr) && *TmpPtr == '(') {
16771695
// Consume tokens until we hit the corresponding ')'.
1678-
CurPtr += 2;
1696+
CurPtr = TmpPtr + 1;
16791697
const char *EndPtr =
16801698
skipToEndOfInterpolatedExpression(CurPtr, BufferEnd,
16811699
Diags, MultilineString);
@@ -1698,7 +1716,7 @@ void Lexer::lexStringLiteral(bool RawString, std::string Delimiter) {
16981716
}
16991717

17001718
unsigned CharValue = lexCharacter(CurPtr, *TokStart, true,
1701-
MultilineString, RawString);
1719+
MultilineString, DelimiterLength);
17021720
wasErroneous |= CharValue == ~1U;
17031721

17041722
// If this is the end of string, we are done. If it is a normal character
@@ -1747,14 +1765,14 @@ void Lexer::lexStringLiteral(bool RawString, std::string Delimiter) {
17471765
if (MultilineString) {
17481766
CurPtr += 2;
17491767
formToken(tok::string_literal, TokStart,
1750-
MultilineString, RawString, Delimiter.length() - 3);
1768+
MultilineString, DelimiterLength);
17511769
if (Diags)
17521770
validateMultilineIndents(NextToken, Diags);
17531771
return;
17541772
}
17551773
else
17561774
return formToken(tok::string_literal, TokStart,
1757-
MultilineString, RawString, Delimiter.length() - 1);
1775+
MultilineString, DelimiterLength);
17581776
}
17591777
}
17601778
}
@@ -2021,7 +2039,7 @@ StringRef Lexer::getEncodedStringSegment(StringRef Bytes,
20212039
bool IsFirstSegment,
20222040
bool IsLastSegment,
20232041
unsigned IndentToStrip,
2024-
bool RawString) {
2042+
unsigned DelmiterLength) {
20252043

20262044
TempString.clear();
20272045
// Note that it is always safe to read one over the end of "Bytes" because
@@ -2048,7 +2066,7 @@ StringRef Lexer::getEncodedStringSegment(StringRef Bytes,
20482066
continue;
20492067
}
20502068

2051-
if (CurChar != '\\' || RawString) {
2069+
if (CurChar != '\\' || !delimiterMatches(DelmiterLength, BytesPtr)) {
20522070
TempString.push_back(CurChar);
20532071
continue;
20542072
}
@@ -2119,7 +2137,7 @@ void Lexer::getStringLiteralSegments(
21192137
// Are substitutions required either for indent stripping or line ending
21202138
// normalization?
21212139
bool MultilineString = Str.IsMultilineString(), IsFirstSegment = true;
2122-
unsigned IndentToStrip = 0;
2140+
unsigned IndentToStrip = 0, DelimiterLength = Str.DelimiterLength();
21232141
if (MultilineString)
21242142
IndentToStrip =
21252143
std::get<0>(getMultilineTrailingIndent(Str, /*Diags=*/nullptr)).size();
@@ -2130,20 +2148,20 @@ void Lexer::getStringLiteralSegments(
21302148
const char *SegmentStartPtr = Bytes.begin();
21312149
const char *BytesPtr = SegmentStartPtr;
21322150
size_t pos;
2133-
while (!Str.IsRawString() &&
2134-
(pos = Bytes.find('\\', BytesPtr-Bytes.begin())) != StringRef::npos) {
2151+
while ((pos = Bytes.find('\\', BytesPtr-Bytes.begin())) != StringRef::npos) {
21352152
BytesPtr = Bytes.begin() + pos + 1;
21362153

2137-
if (*BytesPtr++ != '(')
2154+
if (!delimiterMatches(DelimiterLength, BytesPtr) || *BytesPtr++ != '(')
21382155
continue;
21392156

21402157
// String interpolation.
21412158

21422159
// Push the current segment.
21432160
Segments.push_back(
21442161
StringSegment::getLiteral(getSourceLoc(SegmentStartPtr),
2145-
BytesPtr-SegmentStartPtr-2,
2146-
IsFirstSegment, false, IndentToStrip, false));
2162+
BytesPtr-SegmentStartPtr-2-DelimiterLength,
2163+
IsFirstSegment, false, IndentToStrip,
2164+
DelimiterLength));
21472165
IsFirstSegment = false;
21482166

21492167
// Find the closing ')'.
@@ -2167,14 +2185,7 @@ void Lexer::getStringLiteralSegments(
21672185
StringSegment::getLiteral(getSourceLoc(SegmentStartPtr),
21682186
Bytes.end()-SegmentStartPtr,
21692187
IsFirstSegment, true, IndentToStrip,
2170-
Str.IsRawString()));
2171-
}
2172-
2173-
/// A custom delimiter is zero or more # characters surrounding a quoted string
2174-
static bool isDelimitedString(const char *CurPtr, std::string &delimiter) {
2175-
while (*CurPtr == '#')
2176-
delimiter.push_back(*CurPtr++);
2177-
return *CurPtr == '"';
2188+
DelimiterLength));
21782189
}
21792190

21802191
//===----------------------------------------------------------------------===//
@@ -2268,19 +2279,17 @@ void Lexer::lexImpl() {
22682279
case ',': return formToken(tok::comma, TokStart);
22692280
case ';': return formToken(tok::semi, TokStart);
22702281
case ':': return formToken(tok::colon, TokStart);
2271-
case '\\': {
2272-
std::string Delimiter;
2273-
if (isDelimitedString(CurPtr, Delimiter)) {
2274-
CurPtr++;
2275-
return lexStringLiteral(true, Delimiter);
2276-
}
2277-
}
2278-
return formToken(tok::backslash, TokStart);
2282+
case '\\': return formToken(tok::backslash, TokStart);
22792283

22802284
case '#': {
2281-
std::string Delimiter;
2282-
if (isDelimitedString(CurPtr - 1, Delimiter))
2283-
return lexStringLiteral(false, Delimiter);
2285+
const char *Lookahead = CurPtr;
2286+
while (*Lookahead == '#')
2287+
Lookahead++;
2288+
if (*Lookahead++ == '"') {
2289+
unsigned DelimiterLength = Lookahead - CurPtr;
2290+
CurPtr = Lookahead;
2291+
return lexStringLiteral(DelimiterLength);
2292+
}
22842293
}
22852294
return lexHash();
22862295

0 commit comments

Comments
 (0)