swiftlang · jrose-apple · Sep 12, 2018 · Sep 10, 2018
diff --git a/include/swift/Parse/Lexer.h b/include/swift/Parse/Lexer.h
@@ -399,24 +399,48 @@ class Lexer {
     }
 
   };
-
+
+  /// Implementation of getEncodedStringSegment. Note that \p Str must support
+  /// reading one byte past the end.
+  static StringRef getEncodedStringSegmentImpl(StringRef Str,
+                                               SmallVectorImpl<char> &Buffer,
+                                               bool IsFirstSegment,
+                                               bool IsLastSegment,
+                                               unsigned IndentToStrip,
+                                               unsigned CustomDelimiterLen);
+
   /// \brief Compute the bytes that the actual string literal should codegen to.
   /// If a copy needs to be made, it will be allocated out of the provided
-  /// Buffer.
-  static StringRef getEncodedStringSegment(StringRef Str,
-                                           SmallVectorImpl<char> &Buffer,
-                                           bool IsFirstSegment = false,
-                                           bool IsLastSegment = false,
-                                           unsigned IndentToStrip = 0,
-                                           unsigned CustomDelimiterLen = 0);
+  /// \p Buffer.
   StringRef getEncodedStringSegment(StringSegment Segment,
                                     SmallVectorImpl<char> &Buffer) const {
-    return getEncodedStringSegment(
+    return getEncodedStringSegmentImpl(
         StringRef(getBufferPtrForSourceLoc(Segment.Loc), Segment.Length),
         Buffer, Segment.IsFirstSegment, Segment.IsLastSegment,
         Segment.IndentToStrip, Segment.CustomDelimiterLen);
   }
 
+  /// \brief Given a string encoded with escapes like a string literal, compute
+  /// the byte content.
+  ///
+  /// If a copy needs to be made, it will be allocated out of the provided
+  /// \p Buffer.
+  static StringRef getEncodedStringSegment(StringRef Str,
+                                           SmallVectorImpl<char> &Buffer) {
+    SmallString<128> TerminatedStrBuf(Str);
+    TerminatedStrBuf.push_back('\0');
+    StringRef TerminatedStr = StringRef(TerminatedStrBuf).drop_back();
+    StringRef Result = getEncodedStringSegmentImpl(TerminatedStr, Buffer,
+                                                   /*IsFirstSegment*/false,
+                                                   /*IsLastSegment*/false,
+                                                   /*IndentToStrip*/0,
+                                                   /*CustomDelimiterLen*/0);
+    if (Result == TerminatedStr)
+      return Str;
+    assert(Result.data() == Buffer.data());
+    return Result;
+  }
+
   /// \brief Given a string literal token, separate it into string/expr segments
   /// of a potentially interpolated string.
   static void getStringLiteralSegments(

diff --git a/lib/Parse/Lexer.cpp b/lib/Parse/Lexer.cpp
@@ -2095,17 +2095,18 @@ void Lexer::tryLexEditorPlaceholder() {
   lexOperatorIdentifier();
 }
 
-StringRef Lexer::getEncodedStringSegment(StringRef Bytes,
-                                         SmallVectorImpl<char> &TempString,
-                                         bool IsFirstSegment,
-                                         bool IsLastSegment,
-                                         unsigned IndentToStrip,
-                                         unsigned CustomDelimiterLen) {
+StringRef Lexer::getEncodedStringSegmentImpl(StringRef Bytes,
+                                             SmallVectorImpl<char> &TempString,
+                                             bool IsFirstSegment,
+                                             bool IsLastSegment,
+                                             unsigned IndentToStrip,
+                                             unsigned CustomDelimiterLen) {
 
   TempString.clear();
-  // Note that it is always safe to read one over the end of "Bytes" because
-  // we know that there is a terminating " character.  Use BytesPtr to avoid a
-  // range check subscripting on the StringRef.
+  // Note that it is always safe to read one over the end of "Bytes" because we
+  // know that there is a terminating " character (or null byte for an
+  // unterminated literal or a segment that doesn't come from source). Use
+  // BytesPtr to avoid a range check subscripting on the StringRef.
   const char *BytesPtr = Bytes.begin();
 
   bool IsEscapedNewline = false;

diff --git a/lib/ParseSIL/ParseSIL.cpp b/lib/ParseSIL/ParseSIL.cpp
@@ -2378,18 +2378,22 @@ bool SILParser::parseSILInstruction(SILBuilder &B) {
       return true;
     }
 
-    // Drop the double quotes.
-    StringRef rawString = P.Tok.getText().drop_front().drop_back();
+    // Parse the string.
+    SmallVector<Lexer::StringSegment, 1> segments;
+    P.L->getStringLiteralSegments(P.Tok, segments);
+    assert(segments.size() == 1);
 
     P.consumeToken(tok::string_literal);
     if (parseSILDebugLocation(InstLoc, B))
       return true;
 
-    // Ask the lexer to interpret the entire string as a literal segment.
     SmallVector<char, 128> stringBuffer;
 
     if (encoding == StringLiteralInst::Encoding::Bytes) {
       // Decode hex bytes.
+      CharSourceRange rawStringRange(segments.front().Loc,
+                                     segments.front().Length);
+      StringRef rawString = P.SourceMgr.extractText(rawStringRange);
       if (rawString.size() & 1) {
         P.diagnose(P.Tok, diag::expected_tok_in_sil_instr,
                    "even number of hex bytes");
@@ -2411,7 +2415,8 @@ bool SILParser::parseSILInstruction(SILBuilder &B) {
       break;
     }
 
-    StringRef string = P.L->getEncodedStringSegment(rawString, stringBuffer);
+    StringRef string = P.L->getEncodedStringSegment(segments.front(),
+                                                    stringBuffer);
     ResultVal = B.createStringLiteral(InstLoc, string, encoding);
     break;
   }

diff --git a/unittests/Parse/LexerTests.cpp b/unittests/Parse/LexerTests.cpp
@@ -1,12 +1,21 @@
 #include "swift/AST/DiagnosticConsumer.h"
 #include "swift/AST/DiagnosticEngine.h"
+#include "swift/Basic/Defer.h"
 #include "swift/Basic/LangOptions.h"
 #include "swift/Basic/SourceManager.h"
 #include "swift/Parse/Lexer.h"
 #include "swift/Subsystems.h"
 #include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/Process.h"
 #include "gtest/gtest.h"
 
+#if __has_include(<sys/mman.h>)
+# include <sys/mman.h>
+# define HAS_MMAP 1
+#else
+# define HAS_MMAP 0
+#endif
+
 using namespace swift;
 using namespace llvm;
 
@@ -806,3 +815,37 @@ TEST_F(LexerTest, DiagnoseEmbeddedNulOffset) {
   ASSERT_FALSE(containsPrefix(
       DiagConsumer.messages, "1, 4: nul character embedded in middle of file"));
 }
+
+#if HAS_MMAP
+
+// This test requires mmap because llvm::sys::Memory doesn't support protecting
+// pages to have no permissions.
+TEST_F(LexerTest, EncodedStringSegmentPastTheEnd) {
+  size_t PageSize = llvm::sys::Process::getPageSize();
+
+  void *FirstPage = mmap(/*addr*/nullptr, PageSize * 2, PROT_NONE,
+                         MAP_PRIVATE | MAP_ANON, /*fd*/-1, /*offset*/0);
+  SWIFT_DEFER { (void)munmap(FirstPage, PageSize * 2); };
+  ASSERT_NE(FirstPage, MAP_FAILED);
+  int ProtectResult = mprotect(FirstPage, PageSize, PROT_READ | PROT_WRITE);
+  ASSERT_EQ(ProtectResult, 0);
+
+  auto check = [FirstPage, PageSize](StringRef Input, StringRef Expected) {
+    char *StartPtr = static_cast<char *>(FirstPage) + PageSize - Input.size();
+    memcpy(StartPtr, Input.data(), Input.size());
+
+    SmallString<64> Buffer;
+    StringRef Escaped = Lexer::getEncodedStringSegment({StartPtr, Input.size()},
+                                                       Buffer);
+    EXPECT_EQ(Escaped, Expected);
+  };
+
+  check("needs escaping\\r",
+        "needs escaping\r");
+  check("does not need escaping",
+        "does not need escaping");
+  check("invalid escape at the end \\",
+        "invalid escape at the end ");
+}
+
+#endif // HAS_MMAP