Skip to content

Commit 14213b8

Browse files
committed
Revised implementation for raw strings
1 parent 5459e0d commit 14213b8

File tree

5 files changed

+126
-39
lines changed

5 files changed

+126
-39
lines changed

include/swift/Parse/Lexer.h

Lines changed: 12 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -356,18 +356,20 @@ class Lexer {
356356
// Loc+Length for the segment inside the string literal, without quotes.
357357
SourceLoc Loc;
358358
unsigned Length, IndentToStrip;
359-
bool IsFirstSegment, IsLastSegment;
359+
bool IsFirstSegment, IsLastSegment, RawString;
360360

361361
static StringSegment getLiteral(SourceLoc Loc, unsigned Length,
362362
bool IsFirstSegment, bool IsLastSegment,
363-
unsigned IndentToStrip) {
363+
unsigned IndentToStrip,
364+
bool RawString) {
364365
StringSegment Result;
365366
Result.Kind = Literal;
366367
Result.Loc = Loc;
367368
Result.Length = Length;
368369
Result.IsFirstSegment = IsFirstSegment;
369370
Result.IsLastSegment = IsLastSegment;
370371
Result.IndentToStrip = IndentToStrip;
372+
Result.RawString = RawString;
371373
return Result;
372374
}
373375

@@ -379,6 +381,7 @@ class Lexer {
379381
Result.IsFirstSegment = false;
380382
Result.IsLastSegment = false;
381383
Result.IndentToStrip = 0;
384+
Result.RawString = false;
382385
return Result;
383386
}
384387

@@ -395,13 +398,14 @@ class Lexer {
395398
SmallVectorImpl<char> &Buffer,
396399
bool IsFirstSegment = false,
397400
bool IsLastSegment = false,
398-
unsigned IndentToStrip = 0);
401+
unsigned IndentToStrip = 0,
402+
bool RawString = false);
399403
StringRef getEncodedStringSegment(StringSegment Segment,
400404
SmallVectorImpl<char> &Buffer) const {
401405
return getEncodedStringSegment(
402406
StringRef(getBufferPtrForSourceLoc(Segment.Loc), Segment.Length),
403407
Buffer, Segment.IsFirstSegment, Segment.IsLastSegment,
404-
Segment.IndentToStrip);
408+
Segment.IndentToStrip, Segment.RawString);
405409
}
406410

407411
/// \brief Given a string literal token, separate it into string/expr segments
@@ -465,7 +469,8 @@ class Lexer {
465469
return diagnose(Loc, Diagnostic(DiagID, std::forward<ArgTypes>(Args)...));
466470
}
467471

468-
void formToken(tok Kind, const char *TokStart, bool MultilineString = false);
472+
void formToken(tok Kind, const char *TokStart, bool MultilineString = false,
473+
bool RawString = false, size_t DelimiterLength = 0);
469474
void formEscapedIdentifierToken(const char *TokStart);
470475

471476
/// Advance to the end of the line.
@@ -491,8 +496,8 @@ class Lexer {
491496

492497
unsigned lexCharacter(const char *&CurPtr,
493498
char StopQuote, bool EmitDiagnostics,
494-
bool MultilineString = false);
495-
void lexStringLiteral();
499+
bool Multiline = false, bool RawString = false);
500+
void lexStringLiteral(bool RawString = false, std::string Delimiter = "");
496501
void lexEscapedIdentifier();
497502

498503
void tryLexEditorPlaceholder();

include/swift/Parse/Token.h

Lines changed: 10 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -44,8 +44,9 @@ class Token {
4444

4545
/// Modifiers for string literals
4646
unsigned MultilineString : 1;
47+
unsigned RawString : 1;
4748

48-
// Padding bits == 32 - sizeof(Kind) * 8 - 3;
49+
// Padding bits == 32 - sizeof(Kind) * 8 - 4;
4950

5051
/// \brief The length of the comment that precedes the token.
5152
unsigned CommentLength;
@@ -62,8 +63,8 @@ class Token {
6263
public:
6364
Token(tok Kind, StringRef Text, unsigned CommentLength = 0)
6465
: Kind(Kind), AtStartOfLine(false), EscapedIdentifier(false),
65-
MultilineString(false), CommentLength(CommentLength),
66-
Text(Text) {}
66+
MultilineString(false), RawString(false),
67+
CommentLength(CommentLength), Text(Text) {}
6768

6869
Token() : Token(tok::NUM_TOKENS, {}, 0) {}
6970

@@ -266,17 +267,22 @@ class Token {
266267

267268
/// \brief Set the token to the specified kind and source range.
268269
void setToken(tok K, StringRef T, unsigned CommentLength = 0,
269-
bool MultilineString = false) {
270+
bool MultilineString = false, bool RawString = false) {
270271
Kind = K;
271272
Text = T;
272273
this->CommentLength = CommentLength;
273274
EscapedIdentifier = false;
274275
this->MultilineString = MultilineString;
276+
this->RawString = RawString;
275277
}
276278

277279
bool IsMultilineString() const {
278280
return MultilineString;
279281
}
282+
283+
bool IsRawString() const {
284+
return RawString;
285+
}
280286
};
281287

282288
} // end namespace swift

lib/Parse/Lexer.cpp

Lines changed: 54 additions & 25 deletions
Original file line number | Diff line number | Diff line change
@@ -267,7 +267,9 @@ Token Lexer::getTokenAt(SourceLoc Loc) {
267267
return Result;
268268
}
269269

270-
void Lexer::formToken(tok Kind, const char *TokStart, bool MultilineString) {
270+
void Lexer::formToken(tok Kind, const char *TokStart,
271+
bool MultilineString, bool RawString,
272+
size_t DelimiterLength) {
271273
assert(CurPtr >= BufferStart &&
272274
CurPtr <= BufferEnd && "Current pointer out of range!");
273275

@@ -293,7 +295,9 @@ void Lexer::formToken(tok Kind, const char *TokStart, bool MultilineString) {
293295
lexTrivia(TrailingTrivia, /* IsForTrailingTrivia */ true);
294296
}
295297

296-
NextToken.setToken(Kind, TokenText, CommentLength, MultilineString);
298+
NextToken.setToken(Kind, TokenText, CommentLength,
299+
MultilineString, RawString);
300+
CurPtr += DelimiterLength;
297301
}
298302

299303
void Lexer::formEscapedIdentifierToken(const char *TokStart) {
@@ -1213,7 +1217,8 @@ static bool maybeConsumeNewlineEscape(const char *&CurPtr, ssize_t Offset) {
12131217
/// character_escape ::= [\][\] | [\]t | [\]n | [\]r | [\]" | [\]' | [\]0
12141218
/// character_escape ::= unicode_character_escape
12151219
unsigned Lexer::lexCharacter(const char *&CurPtr, char StopQuote,
1216-
bool EmitDiagnostics, bool MultilineString) {
1220+
bool EmitDiagnostics, bool MultilineString,
1221+
bool RawString) {
12171222
const char *CharStart = CurPtr;
12181223

12191224
switch (*CurPtr++) {
@@ -1262,6 +1267,8 @@ unsigned Lexer::lexCharacter(const char *&CurPtr, char StopQuote,
12621267
diagnose(CurPtr-1, diag::lex_unterminated_string);
12631268
return ~1U;
12641269
case '\\': // Escapes.
1270+
if (RawString)
1271+
return '\\';
12651272
break;
12661273
}
12671274

@@ -1489,7 +1496,7 @@ getMultilineTrailingIndent(const Token &Str, DiagnosticEngine *Diags) {
14891496
auto string = StringRef(start, end - start);
14901497

14911498
// Disallow escaped newline in the last line.
1492-
if (Diags) {
1499+
if (Diags && !Str.IsRawString()) {
14931500
auto *Ptr = start - 1;
14941501
if (*Ptr == '\n') --Ptr;
14951502
if (*Ptr == '\r') --Ptr;
@@ -1645,25 +1652,28 @@ static void validateMultilineIndents(const Token &Str,
16451652
/// lexStringLiteral:
16461653
/// string_literal ::= ["]([^"\\\n\r]|character_escape)*["]
16471654
/// string_literal ::= ["]["]["].*["]["]["] - approximately
1648-
void Lexer::lexStringLiteral() {
1655+
void Lexer::lexStringLiteral(bool RawString, std::string Delimiter) {
1656+
CurPtr += Delimiter.length();
16491657
const char *TokStart = CurPtr-1;
16501658
assert((*TokStart == '"' || *TokStart == '\'') && "Unexpected start");
16511659
// NOTE: We only allow single-quote string literals so we can emit useful
16521660
// diagnostics about changing them to double quotes.
16531661

16541662
bool wasErroneous = false, MultilineString = false;
1663+
Delimiter.insert(0, 1, *TokStart);
16551664

16561665
// Is this the start of a multiline string literal?
16571666
if (*TokStart == '"' && *CurPtr == '"' && *(CurPtr + 1) == '"') {
16581667
MultilineString = true;
16591668
CurPtr += 2;
1669+
Delimiter.insert(0, 2, *TokStart);
16601670
if (*CurPtr != '\n' && *CurPtr != '\r')
16611671
diagnose(CurPtr, diag::lex_illegal_multiline_string_start)
16621672
.fixItInsert(Lexer::getSourceLoc(CurPtr), "\n");
16631673
}
16641674

16651675
while (true) {
1666-
if (*CurPtr == '\\' && *(CurPtr + 1) == '(') {
1676+
if (*CurPtr == '\\' && *(CurPtr + 1) == '(' && !RawString) {
16671677
// Consume tokens until we hit the corresponding ')'.
16681678
CurPtr += 2;
16691679
const char *EndPtr =
@@ -1687,7 +1697,8 @@ void Lexer::lexStringLiteral() {
16871697
return formToken(tok::unknown, TokStart);
16881698
}
16891699

1690-
unsigned CharValue = lexCharacter(CurPtr, *TokStart, true, MultilineString);
1700+
unsigned CharValue = lexCharacter(CurPtr, *TokStart, true,
1701+
MultilineString, RawString);
16911702
wasErroneous |= CharValue == ~1U;
16921703

16931704
// If this is the end of string, we are done. If it is a normal character
@@ -1731,20 +1742,20 @@ void Lexer::lexStringLiteral() {
17311742
replacement);
17321743
}
17331744

1734-
// Is this the end of a multiline string literal?
1735-
if (MultilineString) {
1736-
if (*CurPtr == '"' && *(CurPtr + 1) == '"' && *(CurPtr + 2) != '"') {
1745+
// Is this the end of a delimited/multiline string literal?
1746+
if(StringRef(CurPtr - 1, Delimiter.length()) == Delimiter) {
1747+
if (MultilineString) {
17371748
CurPtr += 2;
1738-
formToken(tok::string_literal, TokStart, MultilineString);
1749+
formToken(tok::string_literal, TokStart,
1750+
MultilineString, RawString, Delimiter.length() - 3);
17391751
if (Diags)
17401752
validateMultilineIndents(NextToken, Diags);
17411753
return;
17421754
}
17431755
else
1744-
continue;
1756+
return formToken(tok::string_literal, TokStart,
1757+
MultilineString, RawString, Delimiter.length() - 1);
17451758
}
1746-
1747-
return formToken(tok::string_literal, TokStart, MultilineString);
17481759
}
17491760
}
17501761
}
@@ -2009,7 +2020,8 @@ StringRef Lexer::getEncodedStringSegment(StringRef Bytes,
20092020
SmallVectorImpl<char> &TempString,
20102021
bool IsFirstSegment,
20112022
bool IsLastSegment,
2012-
unsigned IndentToStrip) {
2023+
unsigned IndentToStrip,
2024+
bool RawString) {
20132025

20142026
TempString.clear();
20152027
// Note that it is always safe to read one over the end of "Bytes" because
@@ -2036,7 +2048,7 @@ StringRef Lexer::getEncodedStringSegment(StringRef Bytes,
20362048
continue;
20372049
}
20382050

2039-
if (CurChar != '\\') {
2051+
if (CurChar != '\\' || RawString) {
20402052
TempString.push_back(CurChar);
20412053
continue;
20422054
}
@@ -2117,11 +2129,10 @@ void Lexer::getStringLiteralSegments(
21172129
// range check subscripting on the StringRef.
21182130
const char *SegmentStartPtr = Bytes.begin();
21192131
const char *BytesPtr = SegmentStartPtr;
2120-
// FIXME: Use SSE to scan for '\'.
2121-
while (BytesPtr != Bytes.end()) {
2122-
char CurChar = *BytesPtr++;
2123-
if (CurChar != '\\')
2124-
continue;
2132+
size_t pos;
2133+
while (!Str.IsRawString() &&
2134+
(pos = Bytes.find('\\', BytesPtr-Bytes.begin())) != StringRef::npos) {
2135+
BytesPtr = Bytes.begin() + pos + 1;
21252136

21262137
if (*BytesPtr++ != '(')
21272138
continue;
@@ -2132,7 +2143,7 @@ void Lexer::getStringLiteralSegments(
21322143
Segments.push_back(
21332144
StringSegment::getLiteral(getSourceLoc(SegmentStartPtr),
21342145
BytesPtr-SegmentStartPtr-2,
2135-
IsFirstSegment, false, IndentToStrip));
2146+
IsFirstSegment, false, IndentToStrip, false));
21362147
IsFirstSegment = false;
21372148

21382149
// Find the closing ')'.
@@ -2155,9 +2166,16 @@ void Lexer::getStringLiteralSegments(
21552166
Segments.push_back(
21562167
StringSegment::getLiteral(getSourceLoc(SegmentStartPtr),
21572168
Bytes.end()-SegmentStartPtr,
2158-
IsFirstSegment, true, IndentToStrip));
2169+
IsFirstSegment, true, IndentToStrip,
2170+
Str.IsRawString()));
21592171
}
21602172

2173+
/// A custom delimiter is zero or more # characters surrounding a quoted string
2174+
static bool isDelimitedString(const char *CurPtr, std::string &delimiter) {
2175+
while (*CurPtr == '#')
2176+
delimiter.push_back(*CurPtr++);
2177+
return *CurPtr == '"';
2178+
}
21612179

21622180
//===----------------------------------------------------------------------===//
21632181
// Main Lexer Loop
@@ -2250,9 +2268,20 @@ void Lexer::lexImpl() {
22502268
case ',': return formToken(tok::comma, TokStart);
22512269
case ';': return formToken(tok::semi, TokStart);
22522270
case ':': return formToken(tok::colon, TokStart);
2253-
case '\\': return formToken(tok::backslash, TokStart);
2271+
case '\\': {
2272+
std::string Delimiter;
2273+
if (isDelimitedString(CurPtr, Delimiter)) {
2274+
CurPtr++;
2275+
return lexStringLiteral(true, Delimiter);
2276+
}
2277+
}
2278+
return formToken(tok::backslash, TokStart);
22542279

2255-
case '#':
2280+
case '#': {
2281+
std::string Delimiter;
2282+
if (isDelimitedString(CurPtr - 1, Delimiter))
2283+
return lexStringLiteral(false, Delimiter);
2284+
}
22562285
return lexHash();
22572286

22582287
// Operator characters.

lib/Parse/Parser.cpp

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -373,7 +373,7 @@ class TokenRecorder: public ConsumeTokenReceiver {
373373
}
374374

375375
void relexComment(CharSourceRange CommentRange,
376-
llvm::SmallVectorImpl<Token> &Scracth) {
376+
llvm::SmallVectorImpl<Token> &Scratch) {
377377
Lexer L(Ctx.LangOpts, Ctx.SourceMgr, BufferID, nullptr, /*InSILMode=*/false,
378378
CommentRetentionMode::ReturnAsTokens,
379379
TriviaRetentionMode::WithoutTrivia,
@@ -384,8 +384,8 @@ class TokenRecorder: public ConsumeTokenReceiver {
384384
L.lex(Result);
385385
if (Result.is(tok::eof))
386386
break;
387-
assert(Result.is(tok::comment));
388-
Scracth.push_back(Result);
387+
if(Result.is(tok::comment)) // interacts badly with custom delimiters
388+
Scratch.push_back(Result);
389389
}
390390
}
391391

test/Parse/raw_string.swift

Lines changed: 47 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,47 @@
1+
// RUN: %target-swift-frontend -dump-ast %s 2>&1 | %FileCheck %s
2+
3+
import Swift
4+
5+
// ===---------- Multiline RawString --------===
6+
7+
_ = ##"""
8+
One
9+
""Alpha""
10+
"""##
11+
// CHECK: "One\n\"\"Alpha\"\""
12+
13+
_ = ##"""
14+
Two
15+
Beta
16+
"""##
17+
// CHECK: " Two\nBeta"
18+
19+
_ = \"""
20+
Three\r
21+
Gamma\
22+
"""
23+
// CHECK: " Three\\r\n Gamma\\"
24+
25+
_ = \###"""
26+
Four \(foo)
27+
Delta
28+
"""###
29+
// CHECK: " Four \\(foo)\n Delta"
30+
31+
_ = ##"""
32+
print("""
33+
Five\n\n\nEpsilon
34+
""")
35+
"""##
36+
// CHECK: "print(\"\"\"\n Five\n\n\nEpsilon\n \"\"\")"
37+
38+
// ===---------- Single line --------===
39+
40+
_ = #""Zeta""#
41+
// CHECK: "\"Zeta\""
42+
43+
_ = #""Eta"\n\n\n\""#
44+
// CHECK: "\"Eta\"\n\n\n\"
45+
46+
_ = \#""Iota"\n\n\n\""#
47+
// CHECK: "\"Iota\"\\n\\n\\n\\\""

0 commit comments

Comments
 (0)