Preserve garbage text trivia when pretty printing.

allevato · allevato · commit 91070505ded7 · 2020-01-26T21:30:33.000-08:00
Garbage text usually refers to unrecognized characters in the input,
which we have generally not cared about. However, some uses of garbage
text don't represent "garbage" per se, but rather content that the
lexer/parser ignores and that we should still preserve during
formatting:

* a hashbang line at the beginning of the file
* a Unicode byte-order marker at the beginning of the file
* source control conflict markers

When we see these (or any other garbage text), we treat them as verbatim
content so that we can handle them as nondestructively as possible.

This change also makes some small fixes to verbatim handling, and
removes a convenience constructor for verbatim tokens so that we force
the caller to specify the indentation behavior instead of relying on an
arbitrary default.
diff --git a/Sources/SwiftFormatPrettyPrint/PrettyPrint.swift b/Sources/SwiftFormatPrettyPrint/PrettyPrint.swift
@@ -553,14 +553,7 @@ public class PrettyPrinter {
         total += wasEndOfLine ? 0 : comment.length
 
       case .verbatim(let verbatim):
-        var length: Int
-        if verbatim.lines.count > 1 {
-          length = maxLineLength
-        } else if verbatim.lines.count == 0 {
-          length = 0
-        } else {
-          length = verbatim.lines[0].count
-        }
+        let length = verbatim.prettyPrintingLength(maximum: maxLineLength)
         lengths.append(length)
         total += length
 
diff --git a/Sources/SwiftFormatPrettyPrint/Token.swift b/Sources/SwiftFormatPrettyPrint/Token.swift
@@ -201,8 +201,4 @@ enum Token {
   static func `break`(_ kind: BreakKind, newlines: NewlineBehavior) -> Token {
     return .break(kind, size: 1, newlines: newlines)
   }
-
-  static func verbatim(text: String) -> Token {
-    return Token.verbatim(Verbatim(text: text))
-  }
 }
diff --git a/Sources/SwiftFormatPrettyPrint/TokenStreamCreator.swift b/Sources/SwiftFormatPrettyPrint/TokenStreamCreator.swift
@@ -132,7 +132,7 @@ private final class TokenStreamCreator: SyntaxVisitor {
       appendBeforeTokens(firstToken)
     }
 
-    appendToken(.verbatim(Verbatim(text: node.description)))
+    appendToken(.verbatim(Verbatim(text: node.description, indentingBehavior: .allLines)))
 
     if let lastToken = node.lastToken {
       // Extract any comments that trail the verbatim block since they belong to the next syntax
@@ -1978,6 +1978,7 @@ private final class TokenStreamCreator: SyntaxVisitor {
       appendToken(.syntax(token.text))
     }
 
+    appendTrailingTrivia(token)
     appendAfterTokensAndTrailingComments(token)
 
     // It doesn't matter what we return here, tokens do not have children.
@@ -2033,6 +2034,43 @@ private final class TokenStreamCreator: SyntaxVisitor {
     }
   }
 
+  /// Handle trailing trivia that might contain garbage text that we don't want to indiscriminately
+  /// discard.
+  ///
+  /// In syntactically valid code, trailing trivia will only contain spaces or tabs, so we can
+  /// usually ignore it entirely. If there is garbage text after a token, however, then we preserve
+  /// it (and any whitespace immediately before it) and "glue" it to the end of the preceding token
+  /// using a `verbatim` formatting token. Any whitespace following the last garbage text in the
+  /// trailing trivia will be discarded, with the assumption that the formatter will have inserted
+  /// some kind of break there that would be more appropriate (and we want to avoid inserting
+  /// trailing whitespace on a line).
+  ///
+  /// The choices above are admittedly somewhat arbitrary, but given that garbage text in trailing
+  /// trivia represents a malformed input (as opposed to garbage text in leading trivia, which has
+  /// some legitimate uses), this is a reasonable compromise to keep the garbage text roughly in the
+  /// same place but still let surrounding formatting occur somewhat as expected.
+  private func appendTrailingTrivia(_ token: TokenSyntax) {
+    let trailingTrivia = Array(token.trailingTrivia)
+    guard let lastGarbageIndex = trailingTrivia.lastIndex(where: { $0.isGarbageText }) else {
+      return
+    }
+
+    var verbatimText = ""
+    for piece in trailingTrivia[...lastGarbageIndex] {
+      switch piece {
+      case .garbageText, .spaces, .tabs, .formfeeds, .verticalTabs:
+        piece.write(to: &verbatimText)
+      default:
+        // The implementation of the lexer today ensures that newlines, carriage returns, and
+        // comments will not be present in trailing trivia. Ignore them for now (rather than assert,
+        // in case that changes in a future version).
+        break
+      }
+    }
+
+    appendToken(.verbatim(Verbatim(text: verbatimText, indentingBehavior: .none)))
+  }
+
   /// Appends the after-tokens and trailing comments (if present) of the given syntax token
   /// to the token stream.
   ///
@@ -2399,7 +2437,11 @@ private final class TokenStreamCreator: SyntaxVisitor {
       }
     }
 
-    var lastPieceWasLineComment = false
+    // Updated throughout the loop to indicate whether the next newline *must* be honored (for
+    // example, even if discretionary newlines are discarded). This is the case when the preceding
+    // trivia was a line comment or garbage text.
+    var requiresNextNewline = false
+
     for (index, piece) in trivia.enumerated() {
       if let cutoff = cutoffIndex, index == cutoff { break }
       switch piece {
@@ -2409,7 +2451,7 @@ private final class TokenStreamCreator: SyntaxVisitor {
           appendNewlines(.soft)
           isStartOfFile = false
         }
-        lastPieceWasLineComment = true
+        requiresNextNewline = true
 
       case .blockComment(let text):
         if index > 0 || isStartOfFile {
@@ -2420,39 +2462,51 @@ private final class TokenStreamCreator: SyntaxVisitor {
           appendToken(.break(.same, size: 0))
           isStartOfFile = false
         }
-        lastPieceWasLineComment = false
+        requiresNextNewline = false
 
       case .docLineComment(let text):
         appendToken(.comment(Comment(kind: .docLine, text: text), wasEndOfLine: false))
         appendNewlines(.soft)
         isStartOfFile = false
-        lastPieceWasLineComment = true
+        requiresNextNewline = true
 
       case .docBlockComment(let text):
         appendToken(.comment(Comment(kind: .docBlock, text: text), wasEndOfLine: false))
         appendNewlines(.soft)
         isStartOfFile = false
-        lastPieceWasLineComment = false
+        requiresNextNewline = false
 
       case .newlines(let count), .carriageReturns(let count), .carriageReturnLineFeeds(let count):
         guard !isStartOfFile else { break }
-        // Even if we aren't respecting discretionary newlines, there must always be a newline after
-        // a line comment.
-        if lastPieceWasLineComment ||
+
+        if requiresNextNewline ||
           (config.respectsExistingLineBreaks && isDiscretionaryNewlineAllowed(before: token))
         {
           appendNewlines(.soft(count: count, discretionary: true))
         } else {
           // Even if discretionary line breaks are not being respected, we still respect multiple
           // line breaks in order to keep blank separator lines that the user might want.
           // TODO: It would be nice to restrict this to only allow multiple lines between statements
-          // and declarations; as currently implemented, multiple newlines will locally the
+          // and declarations; as currently implemented, multiple newlines will locally ignore the
           // configuration setting.
           if count > 1 {
             appendNewlines(.soft(count: count, discretionary: true))
           }
         }
 
+      case .garbageText(let text):
+        // Garbage text in leading trivia might be something meaningful that would be disruptive to
+        // throw away when formatting the file, like a hashbang line or Unicode byte-order marker at
+        // the beginning of a file, or source control conflict markers. Keep it as verbatim text so
+        // that it is printed exactly as we got it.
+        appendToken(.verbatim(Verbatim(text: text, indentingBehavior: .none)))
+
+        // Unicode byte-order markers shouldn't allow leading newlines to otherwise appear in the
+        // file, nor should they modify our detection of the beginning of the file.
+        let isBOM = text == "\u{feff}"
+        requiresNextNewline = !isBOM
+        isStartOfFile = isStartOfFile && isBOM
+
       default:
         break
       }
@@ -2958,6 +3012,16 @@ extension Collection {
   }
 }
 
+extension TriviaPiece {
+  /// True if the trivia piece is garbage text.
+  fileprivate var isGarbageText: Bool {
+    switch self {
+    case .garbageText: return true
+    default: return false
+    }
+  }
+}
+
 /// Returns whether the given trivia includes a directive to ignore formatting for the next node.
 ///
 /// - Parameter trivia: Leading trivia for a node that the formatter supports ignoring.
diff --git a/Sources/SwiftFormatPrettyPrint/Verbatim.swift b/Sources/SwiftFormatPrettyPrint/Verbatim.swift
@@ -26,31 +26,69 @@ enum IndentingBehavior {
 }
 
 struct Verbatim {
-  let indentingBehavior: IndentingBehavior
-  var lines: [String] = []
-  var leadingWhitespaceCounts: [Int] = []
+  /// The behavior used to adjust indentation when printing verbatim content.
+  private let indentingBehavior: IndentingBehavior
 
-  init(text: String, indentingBehavior: IndentingBehavior = .allLines) {
+  /// The lines of verbatim text.
+  private let lines: [String]
+
+  /// The number of leading whitespaces to print for each line of verbatim content, not including
+  /// any additional indentation requested externally.
+  private let leadingWhitespaceCounts: [Int]
+
+  init(text: String, indentingBehavior: IndentingBehavior) {
     self.indentingBehavior = indentingBehavior
-    tokenizeTextAndTrimWhitespace(text: text)
-  }
 
-  mutating func tokenizeTextAndTrimWhitespace(text: String) {
-    lines = text.split(separator: "\n", omittingEmptySubsequences: false).map { String($0) }
+    var originalLines = text.split(separator: "\n", omittingEmptySubsequences: false)
 
     // Prevents an extra leading new line from being created.
-    if lines[0] == "" {
-      lines.remove(at: 0)
+    if originalLines[0].isEmpty {
+      originalLines.remove(at: 0)
+    }
+
+    // If we have no lines left (or none with any content), just initialize everything empty and
+    // exit.
+    guard
+      !originalLines.isEmpty,
+      let index = originalLines.firstIndex(where: { !$0.isEmpty })
+    else {
+      self.lines = []
+      self.leadingWhitespaceCounts = []
+      return
     }
 
-    guard lines.count > 0, let index = lines.firstIndex(where: { $0 != "" }) else { return }
+    // If our indenting behavior is `none`, then keep the original lines _exactly_ as is---don't
+    // attempt to calculate or trim their leading indentation.
+    guard indentingBehavior != .none else {
+      self.lines = originalLines.map(String.init)
+      self.leadingWhitespaceCounts = [Int](repeating: 0, count: originalLines.count)
+      return
+    }
 
-    // Get the number of leading whitespaces of the first line, and subract this from the number of
-    // leading whitespaces for subsequent lines (if possible). Record the new leading whitespaces
-    // counts, and trim off whitespace from the ends of the strings.
-    let count = countLeadingWhitespaces(text: lines[index])
-    leadingWhitespaceCounts = lines.map { max(countLeadingWhitespaces(text: $0) - count, 0) }
-    lines = lines.map { $0.trimmingCharacters(in: CharacterSet(charactersIn: " ")) }
+    // Otherwise, we're in one of the indentation compensating modes. Get the number of leading
+    // whitespaces of the first line, and subtract this from the number of leading whitespaces for
+    // subsequent lines (if possible). Record the new leading whitespaces counts, and trim off
+    // whitespace from the ends of the strings.
+    let firstLineLeadingSpaceCount = numberOfLeadingSpaces(in: originalLines[index])
+    self.leadingWhitespaceCounts = originalLines.map {
+      max(numberOfLeadingSpaces(in: $0) - firstLineLeadingSpaceCount, 0)
+    }
+    self.lines = originalLines.map { $0.trimmingCharacters(in: CharacterSet(charactersIn: " ")) }
+  }
+
+  /// Returns the length that the pretty printer should use when determining layout for this
+  /// verbatim content.
+  ///
+  /// Specifically, multiline content should have a length equal to the maximum (to force breaking),
+  /// while single-line content should have its natural length.
+  func prettyPrintingLength(maximum: Int) -> Int {
+    if lines.isEmpty {
+      return 0
+    }
+    if lines.count > 1 {
+      return maximum
+    }
+    return lines[0].count
   }
 
   func print(indent: [Indent]) -> String {
@@ -60,11 +98,12 @@ struct Verbatim {
         switch indentingBehavior {
         case .firstLine where i == 0, .allLines:
           output += indent.indentation()
-          break
         case .none, .firstLine:
           break
         }
-        output += String(repeating: " ", count: leadingWhitespaceCounts[i])
+        if leadingWhitespaceCounts[i] > 0 {
+          output += String(repeating: " ", count: leadingWhitespaceCounts[i])
+        }
         output += lines[i]
       }
       if i < lines.count - 1 {
@@ -73,12 +112,13 @@ struct Verbatim {
     }
     return output
   }
+}
 
-  func countLeadingWhitespaces(text: String) -> Int {
-    var count = 0
-    for char in text {
-      if char == " " { count += 1 } else { break }
-    }
-    return count
+/// Returns the leading number of spaces in the given string.
+fileprivate func numberOfLeadingSpaces(in text: Substring) -> Int {
+  var count = 0
+  for char in text {
+    if char == " " { count += 1 } else { break }
   }
+  return count
 }
diff --git a/Tests/SwiftFormatPrettyPrintTests/GarbageTextTests.swift b/Tests/SwiftFormatPrettyPrintTests/GarbageTextTests.swift
diff --git a/Tests/SwiftFormatPrettyPrintTests/XCTestManifests.swift b/Tests/SwiftFormatPrettyPrintTests/XCTestManifests.swift

Original file line number	Diff line number	Diff line change
`@@ -201,8 +201,4 @@ enum Token {`
`201`	`201`	static func `break`(_ kind: BreakKind, newlines: NewlineBehavior) -> Token {
`202`	`202`	`return .break(kind, size: 1, newlines: newlines)`
`203`	`203`	`}`
`204`		`-`
`205`		`- static func verbatim(text: String) -> Token {`
`206`		`- return Token.verbatim(Verbatim(text: text))`
`207`		`- }`
`208`	`204`	`}`