[Parse] Tweak a utility function that relies on reading past the end (#19230)

jrose-apple · web-flow · commit e180bf9d4e61 · 2018-09-11T20:11:58.000-07:00
Lexer::getEncodedStringSegment (now getEncodedStringSegmentImpl)
assumes that it can read one byte past the end of a string segment in
order to avoid bounds-checks on things like "is this a \r\n
sequence?". However, the function was also being used for strings that
did not come from source, for which this assumption is not always valid.
Change the reusable form of the function to always copy into a
temporary buffer, allowing the fast path to continue to be used for
normal parsing.

Caught by ASan!

rdar://problem/44306756
diff --git a/include/swift/Parse/Lexer.h b/include/swift/Parse/Lexer.h
@@ -399,24 +399,48 @@ class Lexer {
     }
 
   };
-  
+
+  /// Implementation shared by the getEncodedStringSegment overloads. The byte
+  /// just past the end of \p Str must be readable, since it may be inspected.
+  static StringRef getEncodedStringSegmentImpl(StringRef Str,
+                                               SmallVectorImpl<char> &Buffer,
+                                               bool IsFirstSegment,
+                                               bool IsLastSegment,
+                                               unsigned IndentToStrip,
+                                               unsigned CustomDelimiterLen);
+
   /// \brief Compute the bytes that the actual string literal should codegen to.
   /// If a copy needs to be made, it will be allocated out of the provided
-  /// Buffer.
-  static StringRef getEncodedStringSegment(StringRef Str,
-                                           SmallVectorImpl<char> &Buffer,
-                                           bool IsFirstSegment = false,
-                                           bool IsLastSegment = false,
-                                           unsigned IndentToStrip = 0,
-                                           unsigned CustomDelimiterLen = 0);
+  /// \p Buffer.
   StringRef getEncodedStringSegment(StringSegment Segment,
                                     SmallVectorImpl<char> &Buffer) const {
-    return getEncodedStringSegment(
+    return getEncodedStringSegmentImpl(
         StringRef(getBufferPtrForSourceLoc(Segment.Loc), Segment.Length),
         Buffer, Segment.IsFirstSegment, Segment.IsLastSegment,
         Segment.IndentToStrip, Segment.CustomDelimiterLen);
   }
 
+  /// \brief Decode the escape sequences in \p Str, which is written like the
+  /// contents of a string literal, and return the resulting bytes.
+  ///
+  /// \p Str is decoded from a null-terminated copy, so it need not be readable
+  /// past its end. The result is \p Str itself or is stored in \p Buffer.
+  static StringRef getEncodedStringSegment(StringRef Str,
+                                           SmallVectorImpl<char> &Buffer) {
+    SmallString<128> Scratch(Str);
+    Scratch.push_back('\0');
+    const StringRef Padded(Scratch.data(), Str.size());
+    const StringRef Decoded = getEncodedStringSegmentImpl(
+        Padded, Buffer, /*IsFirstSegment*/false, /*IsLastSegment*/false,
+        /*IndentToStrip*/0, /*CustomDelimiterLen*/0);
+    // Never return a view of Scratch; it dies when this function returns.
+    if (Decoded != Padded) {
+      assert(Decoded.data() == Buffer.data());
+      return Decoded;
+    }
+    return Str;
+  }
+
   /// \brief Given a string literal token, separate it into string/expr segments
   /// of a potentially interpolated string.
   static void getStringLiteralSegments(
diff --git a/lib/Parse/Lexer.cpp b/lib/Parse/Lexer.cpp
@@ -2095,17 +2095,18 @@ void Lexer::tryLexEditorPlaceholder() {
   lexOperatorIdentifier();
 }
 
-StringRef Lexer::getEncodedStringSegment(StringRef Bytes,
-                                         SmallVectorImpl<char> &TempString,
-                                         bool IsFirstSegment,
-                                         bool IsLastSegment,
-                                         unsigned IndentToStrip,
-                                         unsigned CustomDelimiterLen) {
+StringRef Lexer::getEncodedStringSegmentImpl(StringRef Bytes,
+                                             SmallVectorImpl<char> &TempString,
+                                             bool IsFirstSegment,
+                                             bool IsLastSegment,
+                                             unsigned IndentToStrip,
+                                             unsigned CustomDelimiterLen) {
 
   TempString.clear();
-  // Note that it is always safe to read one over the end of "Bytes" because
-  // we know that there is a terminating " character.  Use BytesPtr to avoid a
-  // range check subscripting on the StringRef.
+  // Note that it is always safe to read one over the end of "Bytes" because we
+  // know that there is a terminating " character (or null byte for an
+  // unterminated literal or a segment that doesn't come from source). Use
+  // BytesPtr to avoid a range check subscripting on the StringRef.
   const char *BytesPtr = Bytes.begin();
 
   bool IsEscapedNewline = false;
diff --git a/lib/ParseSIL/ParseSIL.cpp b/lib/ParseSIL/ParseSIL.cpp
@@ -2378,18 +2378,22 @@ bool SILParser::parseSILInstruction(SILBuilder &B) {
       return true;
     }
 
-    // Drop the double quotes.
-    StringRef rawString = P.Tok.getText().drop_front().drop_back();
+    // Parse the string.
+    SmallVector<Lexer::StringSegment, 1> segments;
+    P.L->getStringLiteralSegments(P.Tok, segments);
+    assert(segments.size() == 1);
 
     P.consumeToken(tok::string_literal);
     if (parseSILDebugLocation(InstLoc, B))
       return true;
 
-    // Ask the lexer to interpret the entire string as a literal segment.
     SmallVector<char, 128> stringBuffer;
 
     if (encoding == StringLiteralInst::Encoding::Bytes) {
       // Decode hex bytes.
+      CharSourceRange rawStringRange(segments.front().Loc,
+                                     segments.front().Length);
+      StringRef rawString = P.SourceMgr.extractText(rawStringRange);
       if (rawString.size() & 1) {
         P.diagnose(P.Tok, diag::expected_tok_in_sil_instr,
                    "even number of hex bytes");
@@ -2411,7 +2415,8 @@ bool SILParser::parseSILInstruction(SILBuilder &B) {
       break;
     }
 
-    StringRef string = P.L->getEncodedStringSegment(rawString, stringBuffer);
+    StringRef string = P.L->getEncodedStringSegment(segments.front(),
+                                                    stringBuffer);
     ResultVal = B.createStringLiteral(InstLoc, string, encoding);
     break;
   }
diff --git a/unittests/Parse/LexerTests.cpp b/unittests/Parse/LexerTests.cpp
@@ -1,12 +1,21 @@
 #include "swift/AST/DiagnosticConsumer.h"
 #include "swift/AST/DiagnosticEngine.h"
+#include "swift/Basic/Defer.h"
 #include "swift/Basic/LangOptions.h"
 #include "swift/Basic/SourceManager.h"
 #include "swift/Parse/Lexer.h"
 #include "swift/Subsystems.h"
 #include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/Process.h"
 #include "gtest/gtest.h"
 
+#if __has_include(<sys/mman.h>)
+# include <sys/mman.h>
+# define HAS_MMAP 1
+#else
+# define HAS_MMAP 0
+#endif
+
 using namespace swift;
 using namespace llvm;
 
@@ -806,3 +815,37 @@ TEST_F(LexerTest, DiagnoseEmbeddedNulOffset) {
   ASSERT_FALSE(containsPrefix(
       DiagConsumer.messages, "1, 4: nul character embedded in middle of file"));
 }
+
+#if HAS_MMAP
+
+// Places each input flush against a PROT_NONE page so that reading past its end
+// faults. Requires mmap: llvm::sys::Memory can't strip all page permissions.
+TEST_F(LexerTest, EncodedStringSegmentPastTheEnd) {
+  size_t PageSize = llvm::sys::Process::getPageSize();
+
+  void *FirstPage = mmap(/*addr*/nullptr, PageSize * 2, PROT_NONE,
+                         MAP_PRIVATE | MAP_ANON, /*fd*/-1, /*offset*/0);
+  // Bail out before registering cleanup so MAP_FAILED is never unmapped.
+  ASSERT_NE(FirstPage, MAP_FAILED);
+  SWIFT_DEFER { (void)munmap(FirstPage, PageSize * 2); };
+  ASSERT_EQ(mprotect(FirstPage, PageSize, PROT_READ | PROT_WRITE), 0);
+
+  auto check = [FirstPage, PageSize](StringRef Input, StringRef Expected) {
+    char *StartPtr = static_cast<char *>(FirstPage) + PageSize - Input.size();
+    memcpy(StartPtr, Input.data(), Input.size());
+
+    SmallString<64> Buffer;
+    StringRef Escaped = Lexer::getEncodedStringSegment({StartPtr, Input.size()},
+                                                       Buffer);
+    EXPECT_EQ(Escaped, Expected);
+  };
+
+  check("needs escaping\\r",
+        "needs escaping\r");
+  check("does not need escaping",
+        "does not need escaping");
+  check("invalid escape at the end \\",
+        "invalid escape at the end ");
+}
+
+#endif // HAS_MMAP