swiftlang · rintaro · Mar 5, 2018 · Mar 3, 2018 · Mar 4, 2018
diff --git a/include/swift/Parse/Lexer.h b/include/swift/Parse/Lexer.h
@@ -461,6 +461,16 @@ class Lexer {
   };
 
 private:
+  /// Classifies what a nul ('\0') character in the buffer represents.
+  enum class NulCharacterKind {
+    /// The nul located at BufferEnd, i.e. the string buffer terminator.
+    BufferEnd,
+    /// A nul that is neither the terminator nor the completion marker.
+    Embedded,
+    /// The nul located at CodeCompletionPtr, i.e. the code completion marker.
+    CodeCompletion
+  };
+
   /// For a source location in the current buffer, returns the corresponding
   /// pointer.
   const char *getBufferPtrForSourceLoc(SourceLoc Loc) const {
@@ -520,6 +530,8 @@ class Lexer {
   /// Try to lex conflict markers by checking for the presence of the start and
   /// end of the marker in diff3 or Perforce style respectively.
   bool tryLexConflictMarker(bool EatNewline);
+
+  NulCharacterKind getNulCharacterKind(const char *Ptr) const;
 };
 
 /// Given an ordered token \param Array , get the iterator pointing to the first

diff --git a/lib/Parse/Lexer.cpp b/lib/Parse/Lexer.cpp
@@ -351,16 +351,19 @@ void Lexer::skipToEndOfLine(bool EatNewline) {
       }
       break;   // Otherwise, eat other characters.
     case 0:
-      // If this is a random nul character in the middle of a buffer, skip it as
-      // whitespace.
-      if (CurPtr-1 != BufferEnd) {
+      switch (getNulCharacterKind(CurPtr - 1)) {
+      case NulCharacterKind::Embedded:
+        // If this is a random nul character in the middle of a buffer, skip it
+        // as whitespace.
         diagnoseEmbeddedNul(Diags, CurPtr-1);
-        break;
+        LLVM_FALLTHROUGH;
+      case NulCharacterKind::CodeCompletion:
+        continue;
+      case NulCharacterKind::BufferEnd:
+        // Otherwise, the last line of the file does not have a newline.
+        --CurPtr;
+        return;
       }
-
-      // Otherwise, the last line of the file does not have a newline.
-      --CurPtr;
-      return;
     }
   }
 }
@@ -422,26 +425,30 @@ void Lexer::skipSlashStarComment() {
 
       break;   // Otherwise, eat other characters.
     case 0:
-      // If this is a random nul character in the middle of a buffer, skip it as
-      // whitespace.
-      if (CurPtr-1 != BufferEnd) {
-        diagnoseEmbeddedNul(Diags, CurPtr-1);
-        break;
-      }
-
-      // Otherwise, we have an unterminated /* comment.
-      --CurPtr;
+      switch (getNulCharacterKind(CurPtr - 1)) {
+      case NulCharacterKind::Embedded:
+        // If this is a random nul character in the middle of a buffer, skip it
+        // as whitespace.
+        diagnoseEmbeddedNul(Diags, CurPtr - 1);
+        LLVM_FALLTHROUGH;
+      case NulCharacterKind::CodeCompletion:
+        continue;
+      case NulCharacterKind::BufferEnd: {
+        // Otherwise, we have an unterminated /* comment.
+        --CurPtr;
 
-      // Count how many levels deep we are.
-      llvm::SmallString<8> Terminator("*/");
-      while (--Depth != 0)
-        Terminator += "*/";
+        // Count how many levels deep we are.
+        llvm::SmallString<8> Terminator("*/");
+        while (--Depth != 0)
+          Terminator += "*/";
 
-      const char *EOL = (CurPtr[-1] == '\n') ? (CurPtr - 1) : CurPtr;
-      diagnose(EOL, diag::lex_unterminated_block_comment)
-        .fixItInsert(getSourceLoc(EOL), Terminator);
-      diagnose(StartPtr, diag::lex_comment_start);
-      return;
+        const char *EOL = (CurPtr[-1] == '\n') ? (CurPtr - 1) : CurPtr;
+        diagnose(EOL, diag::lex_unterminated_block_comment)
+            .fixItInsert(getSourceLoc(EOL), Terminator);
+        diagnose(StartPtr, diag::lex_comment_start);
+        return;
+      }
+      }
     }
   }
 }
@@ -1857,6 +1864,16 @@ bool Lexer::tryLexConflictMarker(bool EatNewline) {
   return false;
 }
 
+/// Classify the nul character at \p Ptr (marker, terminator, or embedded).
+Lexer::NulCharacterKind Lexer::getNulCharacterKind(const char *Ptr) const {
+  assert(Ptr != nullptr && *Ptr == 0 && "expected a nul character");
+  // Check the completion marker first so it wins if it is also at BufferEnd.
+  if (Ptr == CodeCompletionPtr)
+    return NulCharacterKind::CodeCompletion;
+  if (Ptr == BufferEnd)
+    return NulCharacterKind::BufferEnd;
+  return NulCharacterKind::Embedded;
+}
 
 void Lexer::tryLexEditorPlaceholder() {
   assert(CurPtr[-1] == '<' && CurPtr[0] == '#');
@@ -2164,22 +2181,23 @@ void Lexer::lexImpl() {
     return formToken(tok::unknown, TokStart);
 
   case 0:
-    if (CurPtr-1 == CodeCompletionPtr)
+    switch (getNulCharacterKind(CurPtr - 1)) {
+    case NulCharacterKind::CodeCompletion:
       return formToken(tok::code_complete, TokStart);
 
-    // If this is a random nul character in the middle of a buffer, skip it as
-    // whitespace.
-    if (CurPtr-1 != BufferEnd) {
+    case NulCharacterKind::Embedded:
+      // If this is a random nul character in the middle of a buffer, skip it as
+      // whitespace.
       diagnoseEmbeddedNul(Diags, CurPtr-1);
       goto Restart;
+    case NulCharacterKind::BufferEnd:
+      // Otherwise, this is the real end of the buffer.  Put CurPtr back into
+      // buffer bounds.
+      --CurPtr;
+      // Return EOF.
+      return formToken(tok::eof, TokStart);
     }
 
-    // Otherwise, this is the real end of the buffer.  Put CurPtr back into
-    // buffer bounds.
-    --CurPtr;
-    // Return EOF.
-    return formToken(tok::eof, TokStart);
-
   case '@': return formToken(tok::at_sign, TokStart);
   case '{': return formToken(tok::l_brace, TokStart);
   case '[': {
@@ -2323,7 +2341,6 @@ void Lexer::lexTrivia(syntax::Trivia &Pieces, bool IsForTrailingTrivia) {
 Restart:
   const char *TriviaStart = CurPtr;
 
-  // TODO: Handle random nul('\0') character in the middle of a buffer.
   // TODO: Handle invalid UTF8 sequence which is skipped in lexImpl().
   switch (*CurPtr++) {
   case '\n':
@@ -2403,6 +2420,19 @@ void Lexer::lexTrivia(syntax::Trivia &Pieces, bool IsForTrailingTrivia) {
       goto Restart;
     }
     break;
+  case 0:
+    switch (getNulCharacterKind(CurPtr - 1)) {
+    case NulCharacterKind::Embedded: {
+      diagnoseEmbeddedNul(Diags, CurPtr - 1);
+      size_t Length = CurPtr - TriviaStart;
+      Pieces.push_back(TriviaPiece::garbageText({TriviaStart, Length}));
+      goto Restart;
+    }
+    case NulCharacterKind::CodeCompletion:
+    case NulCharacterKind::BufferEnd:
+      break;
+    }
+    break;
   default:
     break;
   }

diff --git a/test/Syntax/lexer_invalid_nul.swift b/test/Syntax/lexer_invalid_nul.swift
diff --git a/test/Syntax/round_trip_nul.swift b/test/Syntax/round_trip_nul.swift
@@ -0,0 +1,5 @@
+// RUN: cat %s | tr '\132' '\0' > %t.tr
+// RUN: cp -f %t.tr %t
+// RUN: %round-trip-syntax-test --swift-syntax-test %swift-syntax-test --file %t
+let a = Z3Z // nul(Z)
+func b() {}
diff --git a/test/Syntax/tokens_nul.swift b/test/Syntax/tokens_nul.swift
@@ -0,0 +1,28 @@
+// RUN: cat %s | tr '\132' '\0' > %t.tmp
+// RUN: cp -f %t.tmp %t
+// RUN: %swift-syntax-test -input-source-filename %t -dump-full-tokens 2>&1 | %FileCheck %t
+let a = Z3Z // nul(Z)
+func b() {}
+
+// CHECK: 4:9: warning: nul character embedded in middle of file
+// CHECK: 4:11: warning: nul character embedded in middle of file
+// CHECK: 4:20: warning: nul character embedded in middle of file
+
+// CHECK-LABEL: 4:7
+// CHECK-NEXT:(Token equal
+// CHECK-NEXT: (text="=")
+// CHECK-NEXT: (trivia space 1)
+// CHECK-NEXT: (trivia garbage_text \000))
+
+// CHECK-LABEL: 4:10
+// CHECK-NEXT:(Token integer_literal
+// CHECK-NEXT: (text="3")
+// CHECK-NEXT: (trivia garbage_text \000)
+// CHECK-NEXT: (trivia space 1))
+
+// CHECK-LABEL: 5:1
+// CHECK-NEXT:(Token kw_func
+// CHECK-NEXT: (trivia line_comment // nul(\000))
+// CHECK-NEXT: (trivia newline 1)
+// CHECK-NEXT: (text="func")
+// CHECK-NEXT: (trivia space 1))