Skip to content

Commit e180bf9

Browse files
authored
[Parse] Tweak a utility function that relies on reading past the end (#19230)
Lexer::getEncodedStringSegment (now getEncodedStringSegmentImpl) assumes that it can read one byte past the end of a string segment in order to avoid bounds-checks on things like "is this a \r\n sequence?". However, the function was being used for strings that did not come from source where this assumption was not always valid. Change the reusable form of the function to always copy into a temporary buffer, allowing the fast path to continue to be used for normal parsing. Caught by ASan! rdar://problem/44306756
1 parent d414ece commit e180bf9

File tree

4 files changed

+95
-22
lines changed

4 files changed

+95
-22
lines changed

include/swift/Parse/Lexer.h

Lines changed: 33 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -399,24 +399,48 @@ class Lexer {
399399
}
400400

401401
};
402-
402+
403+
/// Implementation of getEncodedStringSegment. Note that \p Str must support
404+
/// reading one byte past the end.
405+
static StringRef getEncodedStringSegmentImpl(StringRef Str,
406+
SmallVectorImpl<char> &Buffer,
407+
bool IsFirstSegment,
408+
bool IsLastSegment,
409+
unsigned IndentToStrip,
410+
unsigned CustomDelimiterLen);
411+
403412
/// \brief Compute the bytes that the actual string literal should codegen to.
404413
/// If a copy needs to be made, it will be allocated out of the provided
405-
/// Buffer.
406-
static StringRef getEncodedStringSegment(StringRef Str,
407-
SmallVectorImpl<char> &Buffer,
408-
bool IsFirstSegment = false,
409-
bool IsLastSegment = false,
410-
unsigned IndentToStrip = 0,
411-
unsigned CustomDelimiterLen = 0);
414+
/// \p Buffer.
412415
StringRef getEncodedStringSegment(StringSegment Segment,
413416
SmallVectorImpl<char> &Buffer) const {
414-
return getEncodedStringSegment(
417+
return getEncodedStringSegmentImpl(
415418
StringRef(getBufferPtrForSourceLoc(Segment.Loc), Segment.Length),
416419
Buffer, Segment.IsFirstSegment, Segment.IsLastSegment,
417420
Segment.IndentToStrip, Segment.CustomDelimiterLen);
418421
}
419422

423+
/// \brief Given a string encoded with escapes like a string literal, compute
424+
/// the byte content.
425+
///
426+
/// If a copy needs to be made, it will be allocated out of the provided
427+
/// \p Buffer.
428+
static StringRef getEncodedStringSegment(StringRef Str,
429+
SmallVectorImpl<char> &Buffer) {
430+
SmallString<128> TerminatedStrBuf(Str);
431+
TerminatedStrBuf.push_back('\0');
432+
StringRef TerminatedStr = StringRef(TerminatedStrBuf).drop_back();
433+
StringRef Result = getEncodedStringSegmentImpl(TerminatedStr, Buffer,
434+
/*IsFirstSegment*/false,
435+
/*IsLastSegment*/false,
436+
/*IndentToStrip*/0,
437+
/*CustomDelimiterLen*/0);
438+
if (Result == TerminatedStr)
439+
return Str;
440+
assert(Result.data() == Buffer.data());
441+
return Result;
442+
}
443+
420444
/// \brief Given a string literal token, separate it into string/expr segments
421445
/// of a potentially interpolated string.
422446
static void getStringLiteralSegments(

lib/Parse/Lexer.cpp

Lines changed: 10 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -2095,17 +2095,18 @@ void Lexer::tryLexEditorPlaceholder() {
20952095
lexOperatorIdentifier();
20962096
}
20972097

2098-
StringRef Lexer::getEncodedStringSegment(StringRef Bytes,
2099-
SmallVectorImpl<char> &TempString,
2100-
bool IsFirstSegment,
2101-
bool IsLastSegment,
2102-
unsigned IndentToStrip,
2103-
unsigned CustomDelimiterLen) {
2098+
StringRef Lexer::getEncodedStringSegmentImpl(StringRef Bytes,
2099+
SmallVectorImpl<char> &TempString,
2100+
bool IsFirstSegment,
2101+
bool IsLastSegment,
2102+
unsigned IndentToStrip,
2103+
unsigned CustomDelimiterLen) {
21042104

21052105
TempString.clear();
2106-
// Note that it is always safe to read one over the end of "Bytes" because
2107-
// we know that there is a terminating " character. Use BytesPtr to avoid a
2108-
// range check subscripting on the StringRef.
2106+
// Note that it is always safe to read one over the end of "Bytes" because we
2107+
// know that there is a terminating " character (or null byte for an
2108+
// unterminated literal or a segment that doesn't come from source). Use
2109+
// BytesPtr to avoid a range check subscripting on the StringRef.
21092110
const char *BytesPtr = Bytes.begin();
21102111

21112112
bool IsEscapedNewline = false;

lib/ParseSIL/ParseSIL.cpp

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2378,18 +2378,22 @@ bool SILParser::parseSILInstruction(SILBuilder &B) {
23782378
return true;
23792379
}
23802380

2381-
// Drop the double quotes.
2382-
StringRef rawString = P.Tok.getText().drop_front().drop_back();
2381+
// Parse the string.
2382+
SmallVector<Lexer::StringSegment, 1> segments;
2383+
P.L->getStringLiteralSegments(P.Tok, segments);
2384+
assert(segments.size() == 1);
23832385

23842386
P.consumeToken(tok::string_literal);
23852387
if (parseSILDebugLocation(InstLoc, B))
23862388
return true;
23872389

2388-
// Ask the lexer to interpret the entire string as a literal segment.
23892390
SmallVector<char, 128> stringBuffer;
23902391

23912392
if (encoding == StringLiteralInst::Encoding::Bytes) {
23922393
// Decode hex bytes.
2394+
CharSourceRange rawStringRange(segments.front().Loc,
2395+
segments.front().Length);
2396+
StringRef rawString = P.SourceMgr.extractText(rawStringRange);
23932397
if (rawString.size() & 1) {
23942398
P.diagnose(P.Tok, diag::expected_tok_in_sil_instr,
23952399
"even number of hex bytes");
@@ -2411,7 +2415,8 @@ bool SILParser::parseSILInstruction(SILBuilder &B) {
24112415
break;
24122416
}
24132417

2414-
StringRef string = P.L->getEncodedStringSegment(rawString, stringBuffer);
2418+
StringRef string = P.L->getEncodedStringSegment(segments.front(),
2419+
stringBuffer);
24152420
ResultVal = B.createStringLiteral(InstLoc, string, encoding);
24162421
break;
24172422
}

unittests/Parse/LexerTests.cpp

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,21 @@
11
#include "swift/AST/DiagnosticConsumer.h"
22
#include "swift/AST/DiagnosticEngine.h"
3+
#include "swift/Basic/Defer.h"
34
#include "swift/Basic/LangOptions.h"
45
#include "swift/Basic/SourceManager.h"
56
#include "swift/Parse/Lexer.h"
67
#include "swift/Subsystems.h"
78
#include "llvm/Support/MemoryBuffer.h"
9+
#include "llvm/Support/Process.h"
810
#include "gtest/gtest.h"
911

12+
#if __has_include(<sys/mman.h>)
13+
# include <sys/mman.h>
14+
# define HAS_MMAP 1
15+
#else
16+
# define HAS_MMAP 0
17+
#endif
18+
1019
using namespace swift;
1120
using namespace llvm;
1221

@@ -806,3 +815,37 @@ TEST_F(LexerTest, DiagnoseEmbeddedNulOffset) {
806815
ASSERT_FALSE(containsPrefix(
807816
DiagConsumer.messages, "1, 4: nul character embedded in middle of file"));
808817
}
818+
819+
#if HAS_MMAP
820+
821+
// This test requires mmap because llvm::sys::Memory doesn't support protecting
822+
// pages to have no permissions.
823+
TEST_F(LexerTest, EncodedStringSegmentPastTheEnd) {
824+
size_t PageSize = llvm::sys::Process::getPageSize();
825+
826+
void *FirstPage = mmap(/*addr*/nullptr, PageSize * 2, PROT_NONE,
827+
MAP_PRIVATE | MAP_ANON, /*fd*/-1, /*offset*/0);
828+
SWIFT_DEFER { (void)munmap(FirstPage, PageSize * 2); };
829+
ASSERT_NE(FirstPage, MAP_FAILED);
830+
int ProtectResult = mprotect(FirstPage, PageSize, PROT_READ | PROT_WRITE);
831+
ASSERT_EQ(ProtectResult, 0);
832+
833+
auto check = [FirstPage, PageSize](StringRef Input, StringRef Expected) {
834+
char *StartPtr = static_cast<char *>(FirstPage) + PageSize - Input.size();
835+
memcpy(StartPtr, Input.data(), Input.size());
836+
837+
SmallString<64> Buffer;
838+
StringRef Escaped = Lexer::getEncodedStringSegment({StartPtr, Input.size()},
839+
Buffer);
840+
EXPECT_EQ(Escaped, Expected);
841+
};
842+
843+
check("needs escaping\\r",
844+
"needs escaping\r");
845+
check("does not need escaping",
846+
"does not need escaping");
847+
check("invalid escape at the end \\",
848+
"invalid escape at the end ");
849+
}
850+
851+
#endif // HAS_MMAP

0 commit comments

Comments
 (0)