Skip to content

Commit a2606bb

Browse files
committed
Switch whitespace linter to iterate over UTF-8 code units.
Keeping the actual/expected sources as `String`s is unnecessary because we're only scanning them for runs of differing whitespace. This switches to storing arrays of UTF-8 code units and working in terms of those arrays and `ArraySlice`s, avoiding all the overhead of `String` indexing w.r.t. grapheme boundaries.
1 parent ca86e5d commit a2606bb

File tree

1 file changed

+43
-32
lines changed

1 file changed

+43
-32
lines changed

Sources/SwiftFormatWhitespaceLinter/WhitespaceLinter.swift

Lines changed: 43 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -14,15 +14,18 @@ import SwiftFormatConfiguration
1414
import SwiftFormatCore
1515
import SwiftSyntax
1616

17+
/// The UTF-8 code unit for the ASCII line feed character (`\n`).
private let utf8Newline = UTF8.CodeUnit(ascii: "\n")
18+
/// The UTF-8 code unit for the ASCII horizontal tab character (`\t`).
private let utf8Tab = UTF8.CodeUnit(ascii: "\t")
19+
1720
/// Emits linter errors for whitespace style violations by comparing the raw text of the input Swift
1821
/// code with formatted text.
1922
public class WhitespaceLinter {
2023

2124
/// The text of the input source code to be linted.
22-
private let userText: String
25+
private let userText: [UTF8.CodeUnit]
2326

2427
/// The formatted version of `userText`.
25-
private let formattedText: String
28+
private let formattedText: [UTF8.CodeUnit]
2629

2730
/// The Context object containing the DiagnosticEngine.
2831
private let context: Context
@@ -37,8 +40,8 @@ public class WhitespaceLinter {
3740
/// - formatted: The formatted text to compare to `user`.
3841
/// - context: The context object containing the DiagnosticEngine instance we wish to use.
3942
public init(user: String, formatted: String, context: Context) {
40-
self.userText = user
41-
self.formattedText = formatted
43+
self.userText = Array(user.utf8)
44+
self.formattedText = Array(formatted.utf8)
4245
self.context = context
4346
self.isLineTooLong = false
4447
}
@@ -48,7 +51,7 @@ public class WhitespaceLinter {
4851
var userOffset = 0
4952
var formOffset = 0
5053
var isFirstCharater = true
51-
var lastChar: Character?
54+
var lastChar: UTF8.CodeUnit?
5255

5356
repeat {
5457
let userNext = nextCharacter(offset: userOffset, data: self.userText)
@@ -90,13 +93,12 @@ public class WhitespaceLinter {
9093
/// - userWs: The user leading whitespace buffer at the current character.
9194
/// - formattedWs: The formatted leading whitespace buffer at the current character.
9295
private func compareWhitespace(
93-
userOffset: Int, formOffset: Int, isFirstCharacter: Bool, userWs: String, formattedWs: String
96+
userOffset: Int, formOffset: Int, isFirstCharacter: Bool,
97+
userWs: [UTF8.CodeUnit], formattedWs: [UTF8.CodeUnit]
9498
) {
9599
// e.g. "\n" -> ["", ""], and "" -> [""]
96-
let userTokens = userWs.split(
97-
separator: "\n", omittingEmptySubsequences: false).map(String.init)
98-
let formTokens = formattedWs.split(
99-
separator: "\n", omittingEmptySubsequences: false).map(String.init)
100+
let userTokens = userWs.split(separator: utf8Newline, omittingEmptySubsequences: false)
101+
let formTokens = formattedWs.split(separator: utf8Newline, omittingEmptySubsequences: false)
100102

101103
checkForLineLengthErrors(
102104
userOffset: userOffset,
@@ -135,7 +137,8 @@ public class WhitespaceLinter {
135137
/// - user: The tokenized user whitespace buffer.
136138
/// - form: The tokenized formatted whitespace buffer.
137139
private func checkForLineLengthErrors(
138-
userOffset: Int, formOffset: Int, isFirstCharacter: Bool, user: [String], form: [String]
140+
userOffset: Int, formOffset: Int, isFirstCharacter: Bool,
141+
user: [ArraySlice<UTF8.CodeUnit>], form: [ArraySlice<UTF8.CodeUnit>]
139142
) {
140143
// Only run this check at the start of a line.
141144
guard
@@ -164,7 +167,7 @@ public class WhitespaceLinter {
164167
let char = userText[index]
165168

166169
// Count UTF-8 code units up to the newline.
167-
if char == "\n" { break } else { userLength += 1 }
170+
if char == utf8Newline { break } else { userLength += 1 }
168171
}
169172
}
170173

@@ -188,7 +191,7 @@ public class WhitespaceLinter {
188191
let char = formattedText[index]
189192

190193
// Count characters up to the newline.
191-
if char == "\n" { break } else { formLength += 1 }
194+
if char == utf8Newline { break } else { formLength += 1 }
192195
}
193196
}
194197

@@ -218,7 +221,8 @@ public class WhitespaceLinter {
218221
/// - user: The tokenized user whitespace buffer.
219222
/// - form: The tokenized formatted whitespace buffer.
220223
private func checkForIndentationErrors(
221-
userOffset: Int, isFirstCharacter: Bool, user: [String], form: [String]
224+
userOffset: Int, isFirstCharacter: Bool,
225+
user: [ArraySlice<UTF8.CodeUnit>], form: [ArraySlice<UTF8.CodeUnit>]
222226
) {
223227
guard form.count > 1 && user.count > 1 else {
224228
// Ordinarily, we only look for indentation spacing following a newline. The first character
@@ -242,8 +246,8 @@ public class WhitespaceLinter {
242246
}
243247
if form.last != user.last {
244248
let pos = calculatePosition(offset: userOffset + offset, data: self.userText)
245-
let actual = indentation(of: user.last ?? "")
246-
let expected = indentation(of: form.last ?? "")
249+
let actual = indentation(of: user.last ?? [])
250+
let expected = indentation(of: form.last ?? [])
247251
diagnose(
248252
.indentationError(expected: expected, actual: actual),
249253
line: pos.line,
@@ -258,7 +262,9 @@ public class WhitespaceLinter {
258262
/// - userOffset: The current character offset within the user text.
259263
/// - user: The tokenized user whitespace buffer.
260264
/// - form: The tokenized formatted whitespace buffer.
261-
private func checkForTrailingWhitespaceErrors(userOffset: Int, user: [String], form: [String]) {
265+
private func checkForTrailingWhitespaceErrors(
266+
userOffset: Int, user: [ArraySlice<UTF8.CodeUnit>], form: [ArraySlice<UTF8.CodeUnit>]
267+
) {
262268
guard form.count > 1 && user.count > 1 else { return }
263269
var offset = 0
264270
for i in 0..<(user.count - 1) {
@@ -282,14 +288,15 @@ public class WhitespaceLinter {
282288
/// - user: The tokenized user whitespace buffer.
283289
/// - form: The tokenized formatted whitespace buffer.
284290
private func checkForSpacingErrors(
285-
userOffset: Int, isFirstCharacter: Bool, user: [String], form: [String]
291+
userOffset: Int, isFirstCharacter: Bool,
292+
user: [ArraySlice<UTF8.CodeUnit>], form: [ArraySlice<UTF8.CodeUnit>]
286293
) {
287294
// The spaces in front of the first character of a file are indentation and not spacing related.
288295
guard form.count == 1 && user.count == 1 && !isFirstCharacter else { return }
289296
guard form[0] != user[0] else { return }
290297

291298
let pos = calculatePosition(offset: userOffset, data: self.userText)
292-
let illegalSpacingCharacters = ["\t"]
299+
let illegalSpacingCharacters: [UTF8.CodeUnit] = [utf8Tab]
293300
if illegalSpacingCharacters.contains(where: { user[0].contains($0) }) {
294301
diagnose(.spacingCharError, line: pos.line, column: pos.column, utf8Offset: 0)
295302
} else if form[0].count != user[0].count {
@@ -315,7 +322,9 @@ public class WhitespaceLinter {
315322
/// - userOffset: The current character offset within the user text.
316323
/// - user: The tokenized user whitespace buffer.
317324
/// - form: The tokenized formatted whitespace buffer.
318-
private func checkForRemoveLineErrors(userOffset: Int, user: [String], form: [String]) {
325+
private func checkForRemoveLineErrors(
326+
userOffset: Int, user: [ArraySlice<UTF8.CodeUnit>], form: [ArraySlice<UTF8.CodeUnit>]
327+
) {
319328
guard form.count < user.count else { return }
320329
var offset = 0
321330
for i in 0..<(user.count - form.count) {
@@ -343,7 +352,9 @@ public class WhitespaceLinter {
343352
/// - userOffset: The current character offset within the user text.
344353
/// - user: The tokenized user whitespace buffer.
345354
/// - form: The tokenized formatted whitespace buffer.
346-
private func checkForAddLineErrors(userOffset: Int, user: [String], form: [String]) {
355+
private func checkForAddLineErrors(
356+
userOffset: Int, user: [ArraySlice<UTF8.CodeUnit>], form: [ArraySlice<UTF8.CodeUnit>]
357+
) {
347358
guard form.count > user.count && !isLineTooLong else { return }
348359
let pos = calculatePosition(offset: userOffset, data: self.userText)
349360
diagnose(
@@ -363,17 +374,17 @@ public class WhitespaceLinter {
363374
/// - data: The input text, as an array of UTF-8 code units.
364375
/// - Returns: A tuple of the new offset, the non-whitespace code unit we landed on, and an array of UTF-8 code units
365376
/// containing the leading whitespace.
366-
private func nextCharacter(offset: Int, data: String)
367-
-> (offset: Int, char: Character?, whitespace: String)
377+
private func nextCharacter(offset: Int, data: [UTF8.CodeUnit])
378+
-> (offset: Int, char: UTF8.CodeUnit?, whitespace: [UTF8.CodeUnit])
368379
{
369-
var whitespaceBuffer = ""
380+
var whitespaceBuffer = [UTF8.CodeUnit]()
370381

371382
for i in offset..<data.count {
372383
let index = data.index(data.startIndex, offsetBy: i)
373384
let char = data[index]
374385

375-
if char.isWhitespace {
376-
whitespaceBuffer += String(char)
386+
if UnicodeScalar(char).properties.isWhitespace {
387+
whitespaceBuffer.append(char)
377388
} else {
378389
return (offset: i, char: char, whitespace: whitespaceBuffer)
379390
}
@@ -387,12 +398,12 @@ public class WhitespaceLinter {
387398
/// - offset: The printable character offset.
388399
/// - data: The input text, as UTF-8 code units, for which we want the line and column numbers.
389400
/// - Returns: A tuple with the line and column numbers within `data`.
390-
private func calculatePosition(offset: Int, data: String) -> (line: Int, column: Int) {
401+
private func calculatePosition(offset: Int, data: [UTF8.CodeUnit]) -> (line: Int, column: Int) {
391402
var line = 1
392403
var column = 0
393404

394405
for (index, char) in data.enumerated() {
395-
if char == "\n" {
406+
if char == utf8Newline {
396407
line += 1
397408
column = 0
398409
} else {
@@ -432,15 +443,15 @@ public class WhitespaceLinter {
432443
/// leading spacing for a line.
433444
///
434445
/// A return value of `.none` indicates that there was no indentation.
435-
private func indentation(of whitespace: String) -> WhitespaceIndentation {
446+
private func indentation(of whitespace: ArraySlice<UTF8.CodeUnit>) -> WhitespaceIndentation {
436447
if whitespace.count == 0 {
437448
return .none
438449
}
439450

440-
var orderedRuns: [(char: Character, count: Int)] = []
451+
var orderedRuns: [(char: UTF8.CodeUnit, count: Int)] = []
441452
for char in whitespace {
442453
// Any non-whitespace character indicates the end of the indentation whitespace.
443-
guard char.isWhitespace else { break }
454+
guard UnicodeScalar(char).properties.isWhitespace else { break }
444455

445456
let lastRun = orderedRuns.last
446457
if lastRun?.char == char {
@@ -452,7 +463,7 @@ public class WhitespaceLinter {
452463

453464
let indents = orderedRuns.map { run in
454465
// Assumes any non-tab whitespace character is some type of space.
455-
return run.char == "\t" ? Indent.tabs(run.count) : Indent.spaces(run.count)
466+
return run.char == utf8Tab ? Indent.tabs(run.count) : Indent.spaces(run.count)
456467
}
457468
if indents.count == 1, let onlyIndent = indents.first {
458469
return .homogeneous(onlyIndent)

0 commit comments

Comments
 (0)