Merge pull request swiftlang#127 from allevato/garbage-correction

allevato · web-flow · commit 63366a87f6c2 · 2020-01-28T12:30:48.000-08:00
Preserve garbage text trivia when pretty printing.
diff --git a/Sources/SwiftFormatPrettyPrint/PrettyPrint.swift b/Sources/SwiftFormatPrettyPrint/PrettyPrint.swift
@@ -554,14 +554,7 @@ public class PrettyPrinter {
         total += wasEndOfLine ? 0 : comment.length
 
       case .verbatim(let verbatim):
-        var length: Int
-        if verbatim.lines.count > 1 {
-          length = maxLineLength
-        } else if verbatim.lines.count == 0 {
-          length = 0
-        } else {
-          length = verbatim.lines[0].count
-        }
+        let length = verbatim.prettyPrintingLength(maximum: maxLineLength)
         lengths.append(length)
         total += length
 
diff --git a/Sources/SwiftFormatPrettyPrint/Token.swift b/Sources/SwiftFormatPrettyPrint/Token.swift
@@ -201,8 +201,4 @@ enum Token {
   static func `break`(_ kind: BreakKind, newlines: NewlineBehavior) -> Token {
     return .break(kind, size: 1, newlines: newlines)
   }
-
-  static func verbatim(text: String) -> Token {
-    return Token.verbatim(Verbatim(text: text))
-  }
 }
diff --git a/Sources/SwiftFormatPrettyPrint/TokenStreamCreator.swift b/Sources/SwiftFormatPrettyPrint/TokenStreamCreator.swift
@@ -132,7 +132,7 @@ private final class TokenStreamCreator: SyntaxVisitor {
       appendBeforeTokens(firstToken)
     }
 
-    appendToken(.verbatim(Verbatim(text: node.description)))
+    appendToken(.verbatim(Verbatim(text: node.description, indentingBehavior: .allLines)))
 
     if let lastToken = node.lastToken {
       // Extract any comments that trail the verbatim block since they belong to the next syntax
@@ -1978,6 +1978,7 @@ private final class TokenStreamCreator: SyntaxVisitor {
       appendToken(.syntax(token.text))
     }
 
+    appendTrailingTrivia(token)
     appendAfterTokensAndTrailingComments(token)
 
     // It doesn't matter what we return here, tokens do not have children.
@@ -2033,6 +2034,43 @@ private final class TokenStreamCreator: SyntaxVisitor {
     }
   }
 
+  /// Handle trailing trivia that might contain garbage text that we don't want to indiscriminately
+  /// discard.
+  ///
+  /// In syntactically valid code, trailing trivia will only contain spaces or tabs, so we can
+  /// usually ignore it entirely. If there is garbage text after a token, however, then we preserve
+  /// it (and any whitespace immediately before it) and "glue" it to the end of the preceding token
+  /// using a `verbatim` formatting token. Any whitespace following the last garbage text in the
+  /// trailing trivia will be discarded, with the assumption that the formatter will have inserted
+  /// some kind of break there that would be more appropriate (and we want to avoid inserting
+  /// trailing whitespace on a line).
+  ///
+  /// The choices above are admittedly somewhat arbitrary, but given that garbage text in trailing
+  /// trivia represents a malformed input (as opposed to garbage text in leading trivia, which has
+  /// some legitimate uses), this is a reasonable compromise to keep the garbage text roughly in the
+  /// same place but still let surrounding formatting occur somewhat as expected.
+  private func appendTrailingTrivia(_ token: TokenSyntax) {
+    let trailingTrivia = Array(token.trailingTrivia)
+    guard let lastGarbageIndex = trailingTrivia.lastIndex(where: { $0.isGarbageText }) else {
+      return
+    }
+
+    var verbatimText = ""
+    for piece in trailingTrivia[...lastGarbageIndex] {
+      switch piece {
+      case .garbageText, .spaces, .tabs, .formfeeds, .verticalTabs:
+        piece.write(to: &verbatimText)
+      default:
+        // The implementation of the lexer today ensures that newlines, carriage returns, and
+        // comments will not be present in trailing trivia. Ignore them for now (rather than assert,
+        // in case that changes in a future version).
+        break
+      }
+    }
+
+    appendToken(.verbatim(Verbatim(text: verbatimText, indentingBehavior: .none)))
+  }
+
   /// Appends the after-tokens and trailing comments (if present) of the given syntax token
   /// to the token stream.
   ///
@@ -2397,7 +2435,11 @@ private final class TokenStreamCreator: SyntaxVisitor {
       }
     }
 
-    var lastPieceWasLineComment = false
+    // Updated throughout the loop to indicate whether the next newline *must* be honored (for
+    // example, even if discretionary newlines are discarded). This is the case when the preceding
+    // trivia was a line comment or garbage text.
+    var requiresNextNewline = false
+
     for (index, piece) in trivia.enumerated() {
       if let cutoff = cutoffIndex, index == cutoff { break }
       switch piece {
@@ -2407,7 +2449,7 @@ private final class TokenStreamCreator: SyntaxVisitor {
           appendNewlines(.soft)
           isStartOfFile = false
         }
-        lastPieceWasLineComment = true
+        requiresNextNewline = true
 
       case .blockComment(let text):
         if index > 0 || isStartOfFile {
@@ -2418,39 +2460,51 @@ private final class TokenStreamCreator: SyntaxVisitor {
           appendToken(.break(.same, size: 0))
           isStartOfFile = false
         }
-        lastPieceWasLineComment = false
+        requiresNextNewline = false
 
       case .docLineComment(let text):
         appendToken(.comment(Comment(kind: .docLine, text: text), wasEndOfLine: false))
         appendNewlines(.soft)
         isStartOfFile = false
-        lastPieceWasLineComment = true
+        requiresNextNewline = true
 
       case .docBlockComment(let text):
         appendToken(.comment(Comment(kind: .docBlock, text: text), wasEndOfLine: false))
         appendNewlines(.soft)
         isStartOfFile = false
-        lastPieceWasLineComment = false
+        requiresNextNewline = false
 
       case .newlines(let count), .carriageReturns(let count), .carriageReturnLineFeeds(let count):
         guard !isStartOfFile else { break }
-        // Even if we aren't respecting discretionary newlines, there must always be a newline after
-        // a line comment.
-        if lastPieceWasLineComment ||
+
+        if requiresNextNewline ||
           (config.respectsExistingLineBreaks && isDiscretionaryNewlineAllowed(before: token))
         {
           appendNewlines(.soft(count: count, discretionary: true))
         } else {
           // Even if discretionary line breaks are not being respected, we still respect multiple
           // line breaks in order to keep blank separator lines that the user might want.
           // TODO: It would be nice to restrict this to only allow multiple lines between statements
-          // and declarations; as currently implemented, multiple newlines will locally the
+          // and declarations; as currently implemented, multiple newlines will locally ignore the
           // configuration setting.
           if count > 1 {
             appendNewlines(.soft(count: count, discretionary: true))
           }
         }
 
+      case .garbageText(let text):
+        // Garbage text in leading trivia might be something meaningful that would be disruptive to
+        // throw away when formatting the file, like a hashbang line or Unicode byte-order marker at
+        // the beginning of a file, or source control conflict markers. Keep it as verbatim text so
+        // that it is printed exactly as we got it.
+        appendToken(.verbatim(Verbatim(text: text, indentingBehavior: .none)))
+
+        // Unicode byte-order markers shouldn't allow leading newlines to otherwise appear in the
+        // file, nor should they modify our detection of the beginning of the file.
+        let isBOM = text == "\u{feff}"
+        requiresNextNewline = !isBOM
+        isStartOfFile = isStartOfFile && isBOM
+
       default:
         break
       }
@@ -2956,6 +3010,16 @@ extension Collection {
   }
 }
 
+extension TriviaPiece {
+  /// True if the trivia piece is garbage text.
+  fileprivate var isGarbageText: Bool {
+    switch self {
+    case .garbageText: return true
+    default: return false
+    }
+  }
+}
+
 /// Returns whether the given trivia includes a directive to ignore formatting for the next node.
 ///
 /// - Parameter trivia: Leading trivia for a node that the formatter supports ignoring.
diff --git a/Sources/SwiftFormatPrettyPrint/Verbatim.swift b/Sources/SwiftFormatPrettyPrint/Verbatim.swift
@@ -26,31 +26,69 @@ enum IndentingBehavior {
 }
 
 struct Verbatim {
-  let indentingBehavior: IndentingBehavior
-  var lines: [String] = []
-  var leadingWhitespaceCounts: [Int] = []
+  /// The behavior used to adjust indentation when printing verbatim content.
+  private let indentingBehavior: IndentingBehavior
 
-  init(text: String, indentingBehavior: IndentingBehavior = .allLines) {
+  /// The lines of verbatim text.
+  private let lines: [String]
+
+  /// The number of leading spaces to print for each line of verbatim content, not including
+  /// any additional indentation requested externally.
+  private let leadingWhitespaceCounts: [Int]
+
+  init(text: String, indentingBehavior: IndentingBehavior) {
     self.indentingBehavior = indentingBehavior
-    tokenizeTextAndTrimWhitespace(text: text)
-  }
 
-  mutating func tokenizeTextAndTrimWhitespace(text: String) {
-    lines = text.split(separator: "\n", omittingEmptySubsequences: false).map { String($0) }
+    var originalLines = text.split(separator: "\n", omittingEmptySubsequences: false)
 
     // Prevents an extra leading new line from being created.
-    if lines[0] == "" {
-      lines.remove(at: 0)
+    if originalLines[0].isEmpty {
+      originalLines.remove(at: 0)
+    }
+
+    // If we have no lines left (or none with any content), just initialize everything empty and
+    // exit.
+    guard
+      !originalLines.isEmpty,
+      let index = originalLines.firstIndex(where: { !$0.isEmpty })
+    else {
+      self.lines = []
+      self.leadingWhitespaceCounts = []
+      return
     }
 
-    guard lines.count > 0, let index = lines.firstIndex(where: { $0 != "" }) else { return }
+    // If our indenting behavior is `none`, then keep the original lines _exactly_ as is---don't
+    // attempt to calculate or trim their leading indentation.
+    guard indentingBehavior != .none else {
+      self.lines = originalLines.map(String.init)
+      self.leadingWhitespaceCounts = [Int](repeating: 0, count: originalLines.count)
+      return
+    }
 
-    // Get the number of leading whitespaces of the first line, and subract this from the number of
-    // leading whitespaces for subsequent lines (if possible). Record the new leading whitespaces
-    // counts, and trim off whitespace from the ends of the strings.
-    let count = countLeadingWhitespaces(text: lines[index])
-    leadingWhitespaceCounts = lines.map { max(countLeadingWhitespaces(text: $0) - count, 0) }
-    lines = lines.map { $0.trimmingCharacters(in: CharacterSet(charactersIn: " ")) }
+    // Otherwise, we're in one of the indentation compensating modes. Get the number of leading
+    // spaces of the first non-empty line, and subtract this from the number of leading spaces for
+    // every line (clamping at zero). Record the new leading space counts, and trim off spaces
+    // from both ends of the strings.
+    let firstLineLeadingSpaceCount = numberOfLeadingSpaces(in: originalLines[index])
+    self.leadingWhitespaceCounts = originalLines.map {
+      max(numberOfLeadingSpaces(in: $0) - firstLineLeadingSpaceCount, 0)
+    }
+    self.lines = originalLines.map { $0.trimmingCharacters(in: CharacterSet(charactersIn: " ")) }
+  }
+
+  /// Returns the length that the pretty printer should use when determining layout for this
+  /// verbatim content.
+  ///
+  /// Specifically, multiline content should have a length equal to the maximum (to force breaking),
+  /// while single-line content should have its natural length.
+  func prettyPrintingLength(maximum: Int) -> Int {
+    if lines.isEmpty {
+      return 0
+    }
+    if lines.count > 1 {
+      return maximum
+    }
+    return lines[0].count
   }
 
   func print(indent: [Indent]) -> String {
@@ -60,11 +98,12 @@ struct Verbatim {
         switch indentingBehavior {
         case .firstLine where i == 0, .allLines:
           output += indent.indentation()
-          break
         case .none, .firstLine:
           break
         }
-        output += String(repeating: " ", count: leadingWhitespaceCounts[i])
+        if leadingWhitespaceCounts[i] > 0 {
+          output += String(repeating: " ", count: leadingWhitespaceCounts[i])
+        }
         output += lines[i]
       }
       if i < lines.count - 1 {
@@ -73,12 +112,13 @@ struct Verbatim {
     }
     return output
   }
+}
 
-  func countLeadingWhitespaces(text: String) -> Int {
-    var count = 0
-    for char in text {
-      if char == " " { count += 1 } else { break }
-    }
-    return count
+/// Returns the number of leading spaces in the given string.
+fileprivate func numberOfLeadingSpaces(in text: Substring) -> Int {
+  var count = 0
+  for char in text {
+    if char == " " { count += 1 } else { break }
   }
+  return count
 }
diff --git a/Tests/SwiftFormatPrettyPrintTests/GarbageTextTests.swift b/Tests/SwiftFormatPrettyPrintTests/GarbageTextTests.swift
diff --git a/Tests/SwiftFormatPrettyPrintTests/XCTestManifests.swift b/Tests/SwiftFormatPrettyPrintTests/XCTestManifests.swift

Original file line number	Diff line number	Diff line change
`@@ -201,8 +201,4 @@ enum Token {`
`201`	`201`	static func `break`(_ kind: BreakKind, newlines: NewlineBehavior) -> Token {
`202`	`202`	`return .break(kind, size: 1, newlines: newlines)`
`203`	`203`	`}`
`204`		`-`
`205`		`- static func verbatim(text: String) -> Token {`
`206`		`- return Token.verbatim(Verbatim(text: text))`
`207`		`- }`
`208`	`204`	`}`