Skip to content

Commit 9107050

Browse files
committed
Preserve garbage text trivia when pretty printing.
Garbage text typically refers to unrecognized characters in the input, which we haven't typically cared about. However, there are also some uses of garbage text that don't represent "garbage" per se, but content that the lexer/parser doesn't care about but which we should still preserve during formatting:

* a hashbang line at the beginning of the file
* a Unicode byte-order marker at the beginning of the file
* source control conflict markers

When we see these (or any garbage), we treat them as verbatim content so that we can handle them as nondestructively as possible.

This change also makes some small fixes to verbatim handling, and removes a convenience constructor for verbatim tokens so that we force the caller to specify the indentation behavior instead of relying on an arbitrary default.
1 parent 6dc8fc2 commit 9107050

File tree

6 files changed

+302
-47
lines changed

6 files changed

+302
-47
lines changed

Sources/SwiftFormatPrettyPrint/PrettyPrint.swift

Lines changed: 1 addition & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -553,14 +553,7 @@ public class PrettyPrinter {
553553
total += wasEndOfLine ? 0 : comment.length
554554

555555
case .verbatim(let verbatim):
556-
var length: Int
557-
if verbatim.lines.count > 1 {
558-
length = maxLineLength
559-
} else if verbatim.lines.count == 0 {
560-
length = 0
561-
} else {
562-
length = verbatim.lines[0].count
563-
}
556+
let length = verbatim.prettyPrintingLength(maximum: maxLineLength)
564557
lengths.append(length)
565558
total += length
566559

Sources/SwiftFormatPrettyPrint/Token.swift

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -201,8 +201,4 @@ enum Token {
201201
static func `break`(_ kind: BreakKind, newlines: NewlineBehavior) -> Token {
202202
return .break(kind, size: 1, newlines: newlines)
203203
}
204-
205-
static func verbatim(text: String) -> Token {
206-
return Token.verbatim(Verbatim(text: text))
207-
}
208204
}

Sources/SwiftFormatPrettyPrint/TokenStreamCreator.swift

Lines changed: 74 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -132,7 +132,7 @@ private final class TokenStreamCreator: SyntaxVisitor {
132132
appendBeforeTokens(firstToken)
133133
}
134134

135-
appendToken(.verbatim(Verbatim(text: node.description)))
135+
appendToken(.verbatim(Verbatim(text: node.description, indentingBehavior: .allLines)))
136136

137137
if let lastToken = node.lastToken {
138138
// Extract any comments that trail the verbatim block since they belong to the next syntax
@@ -1978,6 +1978,7 @@ private final class TokenStreamCreator: SyntaxVisitor {
19781978
appendToken(.syntax(token.text))
19791979
}
19801980

1981+
appendTrailingTrivia(token)
19811982
appendAfterTokensAndTrailingComments(token)
19821983

19831984
// It doesn't matter what we return here, tokens do not have children.
@@ -2033,6 +2034,43 @@ private final class TokenStreamCreator: SyntaxVisitor {
20332034
}
20342035
}
20352036

2037+
/// Handle trailing trivia that might contain garbage text that we don't want to indiscriminately
2038+
/// discard.
2039+
///
2040+
/// In syntactically valid code, trailing trivia will only contain spaces or tabs, so we can
2041+
/// usually ignore it entirely. If there is garbage text after a token, however, then we preserve
2042+
/// it (and any whitespace immediately before it) and "glue" it to the end of the preceding token
2043+
/// using a `verbatim` formatting token. Any whitespace following the last garbage text in the
2044+
/// trailing trivia will be discarded, with the assumption that the formatter will have inserted
2045+
/// some kind of break there that would be more appropriate (and we want to avoid inserting
2046+
/// trailing whitespace on a line).
2047+
///
2048+
/// The choices above are admittedly somewhat arbitrary, but given that garbage text in trailing
2049+
/// trivia represents a malformed input (as opposed to garbage text in leading trivia, which has
2050+
/// some legitimate uses), this is a reasonable compromise to keep the garbage text roughly in the
2051+
/// same place but still let surrounding formatting occur somewhat as expected.
2052+
private func appendTrailingTrivia(_ token: TokenSyntax) {
2053+
let trailingTrivia = Array(token.trailingTrivia)
2054+
guard let lastGarbageIndex = trailingTrivia.lastIndex(where: { $0.isGarbageText }) else {
2055+
return
2056+
}
2057+
2058+
var verbatimText = ""
2059+
for piece in trailingTrivia[...lastGarbageIndex] {
2060+
switch piece {
2061+
case .garbageText, .spaces, .tabs, .formfeeds, .verticalTabs:
2062+
piece.write(to: &verbatimText)
2063+
default:
2064+
// The implementation of the lexer today ensures that newlines, carriage returns, and
2065+
// comments will not be present in trailing trivia. Ignore them for now (rather than assert,
2066+
// in case that changes in a future version).
2067+
break
2068+
}
2069+
}
2070+
2071+
appendToken(.verbatim(Verbatim(text: verbatimText, indentingBehavior: .none)))
2072+
}
2073+
20362074
/// Appends the after-tokens and trailing comments (if present) of the given syntax token
20372075
/// to the token stream.
20382076
///
@@ -2399,7 +2437,11 @@ private final class TokenStreamCreator: SyntaxVisitor {
23992437
}
24002438
}
24012439

2402-
var lastPieceWasLineComment = false
2440+
// Updated throughout the loop to indicate whether the next newline *must* be honored (for
2441+
// example, even if discretionary newlines are discarded). This is the case when the preceding
2442+
// trivia was a line comment or garbage text.
2443+
var requiresNextNewline = false
2444+
24032445
for (index, piece) in trivia.enumerated() {
24042446
if let cutoff = cutoffIndex, index == cutoff { break }
24052447
switch piece {
@@ -2409,7 +2451,7 @@ private final class TokenStreamCreator: SyntaxVisitor {
24092451
appendNewlines(.soft)
24102452
isStartOfFile = false
24112453
}
2412-
lastPieceWasLineComment = true
2454+
requiresNextNewline = true
24132455

24142456
case .blockComment(let text):
24152457
if index > 0 || isStartOfFile {
@@ -2420,39 +2462,51 @@ private final class TokenStreamCreator: SyntaxVisitor {
24202462
appendToken(.break(.same, size: 0))
24212463
isStartOfFile = false
24222464
}
2423-
lastPieceWasLineComment = false
2465+
requiresNextNewline = false
24242466

24252467
case .docLineComment(let text):
24262468
appendToken(.comment(Comment(kind: .docLine, text: text), wasEndOfLine: false))
24272469
appendNewlines(.soft)
24282470
isStartOfFile = false
2429-
lastPieceWasLineComment = true
2471+
requiresNextNewline = true
24302472

24312473
case .docBlockComment(let text):
24322474
appendToken(.comment(Comment(kind: .docBlock, text: text), wasEndOfLine: false))
24332475
appendNewlines(.soft)
24342476
isStartOfFile = false
2435-
lastPieceWasLineComment = false
2477+
requiresNextNewline = false
24362478

24372479
case .newlines(let count), .carriageReturns(let count), .carriageReturnLineFeeds(let count):
24382480
guard !isStartOfFile else { break }
2439-
// Even if we aren't respecting discretionary newlines, there must always be a newline after
2440-
// a line comment.
2441-
if lastPieceWasLineComment ||
2481+
2482+
if requiresNextNewline ||
24422483
(config.respectsExistingLineBreaks && isDiscretionaryNewlineAllowed(before: token))
24432484
{
24442485
appendNewlines(.soft(count: count, discretionary: true))
24452486
} else {
24462487
// Even if discretionary line breaks are not being respected, we still respect multiple
24472488
// line breaks in order to keep blank separator lines that the user might want.
24482489
// TODO: It would be nice to restrict this to only allow multiple lines between statements
2449-
// and declarations; as currently implemented, multiple newlines will locally the
2490+
// and declarations; as currently implemented, multiple newlines will locally ignore the
24502491
// configuration setting.
24512492
if count > 1 {
24522493
appendNewlines(.soft(count: count, discretionary: true))
24532494
}
24542495
}
24552496

2497+
case .garbageText(let text):
2498+
// Garbage text in leading trivia might be something meaningful that would be disruptive to
2499+
// throw away when formatting the file, like a hashbang line or Unicode byte-order marker at
2500+
// the beginning of a file, or source control conflict markers. Keep it as verbatim text so
2501+
// that it is printed exactly as we got it.
2502+
appendToken(.verbatim(Verbatim(text: text, indentingBehavior: .none)))
2503+
2504+
// Unicode byte-order markers shouldn't allow leading newlines to otherwise appear in the
2505+
// file, nor should they modify our detection of the beginning of the file.
2506+
let isBOM = text == "\u{feff}"
2507+
requiresNextNewline = !isBOM
2508+
isStartOfFile = isStartOfFile && isBOM
2509+
24562510
default:
24572511
break
24582512
}
@@ -2958,6 +3012,16 @@ extension Collection {
29583012
}
29593013
}
29603014

3015+
extension TriviaPiece {
3016+
/// True if the trivia piece is garbage text.
3017+
fileprivate var isGarbageText: Bool {
3018+
switch self {
3019+
case .garbageText: return true
3020+
default: return false
3021+
}
3022+
}
3023+
}
3024+
29613025
/// Returns whether the given trivia includes a directive to ignore formatting for the next node.
29623026
///
29633027
/// - Parameter trivia: Leading trivia for a node that the formatter supports ignoring.

Sources/SwiftFormatPrettyPrint/Verbatim.swift

Lines changed: 65 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -26,31 +26,69 @@ enum IndentingBehavior {
2626
}
2727

2828
struct Verbatim {
29-
let indentingBehavior: IndentingBehavior
30-
var lines: [String] = []
31-
var leadingWhitespaceCounts: [Int] = []
29+
/// The behavior used to adjust indentation when printing verbatim content.
30+
private let indentingBehavior: IndentingBehavior
3231

33-
init(text: String, indentingBehavior: IndentingBehavior = .allLines) {
32+
/// The lines of verbatim text.
33+
private let lines: [String]
34+
35+
/// The number of leading whitespaces to print for each line of verbatim content, not including
36+
/// any additional indentation requested externally.
37+
private let leadingWhitespaceCounts: [Int]
38+
39+
init(text: String, indentingBehavior: IndentingBehavior) {
3440
self.indentingBehavior = indentingBehavior
35-
tokenizeTextAndTrimWhitespace(text: text)
36-
}
3741

38-
mutating func tokenizeTextAndTrimWhitespace(text: String) {
39-
lines = text.split(separator: "\n", omittingEmptySubsequences: false).map { String($0) }
42+
var originalLines = text.split(separator: "\n", omittingEmptySubsequences: false)
4043

4144
// Prevents an extra leading new line from being created.
42-
if lines[0] == "" {
43-
lines.remove(at: 0)
45+
if originalLines[0].isEmpty {
46+
originalLines.remove(at: 0)
47+
}
48+
49+
// If we have no lines left (or none with any content), just initialize everything empty and
50+
// exit.
51+
guard
52+
!originalLines.isEmpty,
53+
let index = originalLines.firstIndex(where: { !$0.isEmpty })
54+
else {
55+
self.lines = []
56+
self.leadingWhitespaceCounts = []
57+
return
4458
}
4559

46-
guard lines.count > 0, let index = lines.firstIndex(where: { $0 != "" }) else { return }
60+
// If our indenting behavior is `none`, then keep the original lines _exactly_ as is---don't
61+
// attempt to calculate or trim their leading indentation.
62+
guard indentingBehavior != .none else {
63+
self.lines = originalLines.map(String.init)
64+
self.leadingWhitespaceCounts = [Int](repeating: 0, count: originalLines.count)
65+
return
66+
}
4767

48-
// Get the number of leading whitespaces of the first line, and subract this from the number of
49-
// leading whitespaces for subsequent lines (if possible). Record the new leading whitespaces
50-
// counts, and trim off whitespace from the ends of the strings.
51-
let count = countLeadingWhitespaces(text: lines[index])
52-
leadingWhitespaceCounts = lines.map { max(countLeadingWhitespaces(text: $0) - count, 0) }
53-
lines = lines.map { $0.trimmingCharacters(in: CharacterSet(charactersIn: " ")) }
68+
// Otherwise, we're in one of the indentation compensating modes. Get the number of leading
69+
// whitespaces of the first line, and subtract this from the number of leading whitespaces for
70+
// subsequent lines (if possible). Record the new leading whitespaces counts, and trim off
71+
// whitespace from the ends of the strings.
72+
let firstLineLeadingSpaceCount = numberOfLeadingSpaces(in: originalLines[index])
73+
self.leadingWhitespaceCounts = originalLines.map {
74+
max(numberOfLeadingSpaces(in: $0) - firstLineLeadingSpaceCount, 0)
75+
}
76+
self.lines = originalLines.map { $0.trimmingCharacters(in: CharacterSet(charactersIn: " ")) }
77+
}
78+
79+
/// Returns the length that the pretty printer should use when determining layout for this
80+
/// verbatim content.
81+
///
82+
/// Specifically, multiline content should have a length equal to the maximum (to force breaking),
83+
/// while single-line content should have its natural length.
84+
func prettyPrintingLength(maximum: Int) -> Int {
85+
if lines.isEmpty {
86+
return 0
87+
}
88+
if lines.count > 1 {
89+
return maximum
90+
}
91+
return lines[0].count
5492
}
5593

5694
func print(indent: [Indent]) -> String {
@@ -60,11 +98,12 @@ struct Verbatim {
6098
switch indentingBehavior {
6199
case .firstLine where i == 0, .allLines:
62100
output += indent.indentation()
63-
break
64101
case .none, .firstLine:
65102
break
66103
}
67-
output += String(repeating: " ", count: leadingWhitespaceCounts[i])
104+
if leadingWhitespaceCounts[i] > 0 {
105+
output += String(repeating: " ", count: leadingWhitespaceCounts[i])
106+
}
68107
output += lines[i]
69108
}
70109
if i < lines.count - 1 {
@@ -73,12 +112,13 @@ struct Verbatim {
73112
}
74113
return output
75114
}
115+
}
76116

77-
func countLeadingWhitespaces(text: String) -> Int {
78-
var count = 0
79-
for char in text {
80-
if char == " " { count += 1 } else { break }
81-
}
82-
return count
117+
/// Returns the number of leading spaces in the given string.
118+
fileprivate func numberOfLeadingSpaces(in text: Substring) -> Int {
119+
var count = 0
120+
for char in text {
121+
if char == " " { count += 1 } else { break }
83122
}
123+
return count
84124
}

0 commit comments

Comments
 (0)