Skip to content

[Parse] Tweak a utility function that relies on reading past the end #19230

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Sep 12, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 33 additions & 9 deletions include/swift/Parse/Lexer.h
Original file line number Diff line number Diff line change
Expand Up @@ -399,24 +399,48 @@ class Lexer {
}

};


/// Implementation of getEncodedStringSegment. Note that \p Str must support
/// reading one byte past the end.
static StringRef getEncodedStringSegmentImpl(StringRef Str,
SmallVectorImpl<char> &Buffer,
bool IsFirstSegment,
bool IsLastSegment,
unsigned IndentToStrip,
unsigned CustomDelimiterLen);

/// \brief Compute the bytes that the actual string literal should codegen to.
/// If a copy needs to be made, it will be allocated out of the provided
/// Buffer.
static StringRef getEncodedStringSegment(StringRef Str,
SmallVectorImpl<char> &Buffer,
bool IsFirstSegment = false,
bool IsLastSegment = false,
unsigned IndentToStrip = 0,
unsigned CustomDelimiterLen = 0);
/// \p Buffer.
StringRef getEncodedStringSegment(StringSegment Segment,
SmallVectorImpl<char> &Buffer) const {
return getEncodedStringSegment(
return getEncodedStringSegmentImpl(
StringRef(getBufferPtrForSourceLoc(Segment.Loc), Segment.Length),
Buffer, Segment.IsFirstSegment, Segment.IsLastSegment,
Segment.IndentToStrip, Segment.CustomDelimiterLen);
}

/// \brief Decode a string written with string-literal style escapes into its
/// byte content.
///
/// When decoding has to produce a copy, that copy is stored in the provided
/// \p Buffer; otherwise \p Str itself is handed back.
static StringRef getEncodedStringSegment(StringRef Str,
                                         SmallVectorImpl<char> &Buffer) {
  // The implementation requires that one byte past the end of its input is
  // readable, so decode from a private copy that carries a trailing NUL which
  // is excluded from the view passed down.
  SmallString<128> Scratch(Str);
  Scratch.push_back('\0');
  const StringRef Padded = StringRef(Scratch).drop_back();

  const StringRef Decoded = getEncodedStringSegmentImpl(
      Padded, Buffer,
      /*IsFirstSegment*/false,
      /*IsLastSegment*/false,
      /*IndentToStrip*/0,
      /*CustomDelimiterLen*/0);

  if (Decoded != Padded) {
    // Anything that differs from the input is expected to live in Buffer.
    assert(Decoded.data() == Buffer.data());
    return Decoded;
  }

  // Same contents as the input: return the caller's string rather than a view
  // into the local scratch copy, which dies with this call.
  return Str;
}

/// \brief Given a string literal token, separate it into string/expr segments
/// of a potentially interpolated string.
static void getStringLiteralSegments(
Expand Down
19 changes: 10 additions & 9 deletions lib/Parse/Lexer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2095,17 +2095,18 @@ void Lexer::tryLexEditorPlaceholder() {
lexOperatorIdentifier();
}

StringRef Lexer::getEncodedStringSegment(StringRef Bytes,
SmallVectorImpl<char> &TempString,
bool IsFirstSegment,
bool IsLastSegment,
unsigned IndentToStrip,
unsigned CustomDelimiterLen) {
StringRef Lexer::getEncodedStringSegmentImpl(StringRef Bytes,
SmallVectorImpl<char> &TempString,
bool IsFirstSegment,
bool IsLastSegment,
unsigned IndentToStrip,
unsigned CustomDelimiterLen) {

TempString.clear();
// Note that it is always safe to read one over the end of "Bytes" because
// we know that there is a terminating " character. Use BytesPtr to avoid a
// range check subscripting on the StringRef.
// Note that it is always safe to read one over the end of "Bytes" because we
// know that there is a terminating " character (or null byte for an
// unterminated literal or a segment that doesn't come from source). Use
// BytesPtr to avoid a range check subscripting on the StringRef.
const char *BytesPtr = Bytes.begin();

bool IsEscapedNewline = false;
Expand Down
13 changes: 9 additions & 4 deletions lib/ParseSIL/ParseSIL.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2378,18 +2378,22 @@ bool SILParser::parseSILInstruction(SILBuilder &B) {
return true;
}

// Drop the double quotes.
StringRef rawString = P.Tok.getText().drop_front().drop_back();
// Parse the string.
SmallVector<Lexer::StringSegment, 1> segments;
P.L->getStringLiteralSegments(P.Tok, segments);
assert(segments.size() == 1);

P.consumeToken(tok::string_literal);
if (parseSILDebugLocation(InstLoc, B))
return true;

// Ask the lexer to interpret the entire string as a literal segment.
SmallVector<char, 128> stringBuffer;

if (encoding == StringLiteralInst::Encoding::Bytes) {
// Decode hex bytes.
CharSourceRange rawStringRange(segments.front().Loc,
segments.front().Length);
StringRef rawString = P.SourceMgr.extractText(rawStringRange);
if (rawString.size() & 1) {
P.diagnose(P.Tok, diag::expected_tok_in_sil_instr,
"even number of hex bytes");
Expand All @@ -2411,7 +2415,8 @@ bool SILParser::parseSILInstruction(SILBuilder &B) {
break;
}

StringRef string = P.L->getEncodedStringSegment(rawString, stringBuffer);
StringRef string = P.L->getEncodedStringSegment(segments.front(),
stringBuffer);
ResultVal = B.createStringLiteral(InstLoc, string, encoding);
break;
}
Expand Down
43 changes: 43 additions & 0 deletions unittests/Parse/LexerTests.cpp
Original file line number Diff line number Diff line change
@@ -1,12 +1,21 @@
#include "swift/AST/DiagnosticConsumer.h"
#include "swift/AST/DiagnosticEngine.h"
#include "swift/Basic/Defer.h"
#include "swift/Basic/LangOptions.h"
#include "swift/Basic/SourceManager.h"
#include "swift/Parse/Lexer.h"
#include "swift/Subsystems.h"
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/Process.h"
#include "gtest/gtest.h"

#if __has_include(<sys/mman.h>)
# include <sys/mman.h>
# define HAS_MMAP 1
#else
# define HAS_MMAP 0
#endif

using namespace swift;
using namespace llvm;

Expand Down Expand Up @@ -806,3 +815,37 @@ TEST_F(LexerTest, DiagnoseEmbeddedNulOffset) {
ASSERT_FALSE(containsPrefix(
DiagConsumer.messages, "1, 4: nul character embedded in middle of file"));
}

#if HAS_MMAP

// This test requires mmap because llvm::sys::Memory doesn't support protecting
// pages to have no permissions.
//
// Two adjacent pages are mapped with no permissions and only the first one is
// then made readable/writable. Each input is copied so that it ends exactly at
// the end of the first page, which means any read past the end of the input
// lands in the still-inaccessible second page.
TEST_F(LexerTest, EncodedStringSegmentPastTheEnd) {
  size_t PageSize = llvm::sys::Process::getPageSize();

  void *FirstPage = mmap(/*addr*/nullptr, PageSize * 2, PROT_NONE,
                         MAP_PRIVATE | MAP_ANON, /*fd*/-1, /*offset*/0);
  ASSERT_NE(FirstPage, MAP_FAILED);
  // Register the cleanup only after the mapping is known to be valid, so a
  // failed mmap never results in munmap being called on MAP_FAILED.
  SWIFT_DEFER { (void)munmap(FirstPage, PageSize * 2); };

  int ProtectResult = mprotect(FirstPage, PageSize, PROT_READ | PROT_WRITE);
  ASSERT_EQ(ProtectResult, 0);

  // Places Input flush against the end of the accessible page, decodes it, and
  // compares the decoded bytes with Expected.
  auto check = [FirstPage, PageSize](StringRef Input, StringRef Expected) {
    char *StartPtr = static_cast<char *>(FirstPage) + PageSize - Input.size();
    memcpy(StartPtr, Input.data(), Input.size());

    SmallString<64> Buffer;
    StringRef Escaped = Lexer::getEncodedStringSegment({StartPtr, Input.size()},
                                                       Buffer);
    EXPECT_EQ(Escaped, Expected);
  };

  check("needs escaping\\r",
        "needs escaping\r");
  check("does not need escaping",
        "does not need escaping");
  check("invalid escape at the end \\",
        "invalid escape at the end ");
}

#endif // HAS_MMAP