Skip to content

Commit fdcc576

Browse files
committed
Make SourceLocationConverter handle invalid UTF-8
The `SourceLocationConverter` previously worked on `String`, which is lossy if the source contains invalid UTF-8. Make it work on `SyntaxText` and `RawTriviaPiece` instead, so that it is source-accurate.
1 parent 20f8f83 commit fdcc576

File tree

2 files changed

+63
-16
lines changed

2 files changed

+63
-16
lines changed

Sources/SwiftSyntax/SourceLocation.swift

Lines changed: 17 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -138,7 +138,9 @@ public final class SourceLocationConverter {
138138
public init(file: String, source: String) {
139139
self.file = file
140140
self.source = Array(source.utf8)
141-
(self.lines, endOfFile) = computeLines(source)
141+
(self.lines, endOfFile) = self.source.withUnsafeBufferPointer { buf in
142+
return computeLines(SyntaxText(buffer: buf))
143+
}
142144
assert(source.utf8.count == endOfFile.utf8Offset)
143145
}
144146

@@ -397,7 +399,7 @@ fileprivate func computeLines(
397399
return (lines, position)
398400
}
399401

400-
fileprivate func computeLines(_ source: String) -> ([AbsolutePosition], AbsolutePosition) {
402+
fileprivate func computeLines(_ source: SyntaxText) -> ([AbsolutePosition], AbsolutePosition) {
401403
var lines: [AbsolutePosition] = []
402404
// First line starts from the beginning.
403405
lines.append(.startOfFile)
@@ -412,26 +414,25 @@ fileprivate func computeLines(_ source: String) -> ([AbsolutePosition], Absolute
412414
return (lines, position)
413415
}
414416

415-
fileprivate extension String {
417+
fileprivate extension SyntaxText {
416418
/// Walks and passes to `body` the `SourceLength` for every detected line,
417419
/// with the newline character included.
418420
/// - Returns: The leftover `SourceLength` at the end of the walk.
419421
func forEachLineLength(
420422
prefix: SourceLength = .zero,
421423
body: (SourceLength) -> ()
422424
) -> SourceLength {
423-
let utf8 = self.utf8
424-
let startIndex = utf8.startIndex
425-
let endIndex = utf8.endIndex
425+
// let startIndex = utf8.startIndex
426+
// let endIndex = utf8.endIndex
426427
var curIdx = startIndex
427428
var lineLength = prefix
428429
let advanceLengthByOne = { () -> () in
429430
lineLength += SourceLength(utf8Length: 1)
430-
curIdx = utf8.index(after: curIdx)
431+
curIdx = self.index(after: curIdx)
431432
}
432433

433434
while curIdx < endIndex {
434-
let char = utf8[curIdx]
435+
let char = self[curIdx]
435436
advanceLengthByOne()
436437

437438
/// From https://docs.swift.org/swift-book/ReferenceManual/LexicalStructure.html#grammar_line-break
@@ -441,7 +442,7 @@ fileprivate extension String {
441442
let isNewline = { () -> Bool in
442443
if char == 10 { return true }
443444
if char == 13 {
444-
if curIdx < endIndex && utf8[curIdx] == 10 { advanceLengthByOne() }
445+
if curIdx < endIndex && self[curIdx] == 10 { advanceLengthByOne() }
445446
return true
446447
}
447448
return false
@@ -456,11 +457,11 @@ fileprivate extension String {
456457
}
457458

458459
func containsSwiftNewline() -> Bool {
459-
return utf8.contains { $0 == 10 || $0 == 13 }
460+
return self.contains { $0 == 10 || $0 == 13 }
460461
}
461462
}
462463

463-
fileprivate extension TriviaPiece {
464+
fileprivate extension RawTriviaPiece {
464465
/// Walks and passes to `body` the `SourceLength` for every detected line,
465466
/// with the newline character included.
466467
/// - Returns: The leftover `SourceLength` at the end of the walk.
@@ -495,7 +496,7 @@ fileprivate extension TriviaPiece {
495496
let .docLineComment(text):
496497
// Line comments are not supposed to contain newlines.
497498
assert(!text.containsSwiftNewline(), "line comment created that contained a new-line character")
498-
lineLength += SourceLength(utf8Length: text.utf8.count)
499+
lineLength += SourceLength(utf8Length: text.count)
499500
case let .blockComment(text),
500501
let .docBlockComment(text),
501502
let .unexpectedText(text):
@@ -505,7 +506,7 @@ fileprivate extension TriviaPiece {
505506
}
506507
}
507508

508-
fileprivate extension Trivia {
509+
fileprivate extension Array where Element == RawTriviaPiece {
509510
/// Walks and passes to `body` the `SourceLength` for every detected line,
510511
/// with the newline character included.
511512
/// - Returns: The leftover `SourceLength` at the end of the walk.
@@ -530,9 +531,9 @@ fileprivate extension TokenSyntax {
530531
body: (SourceLength) -> ()
531532
) -> SourceLength {
532533
var curPrefix = prefix
533-
curPrefix = self.leadingTrivia.forEachLineLength(prefix: curPrefix, body: body)
534-
curPrefix = self.text.forEachLineLength(prefix: curPrefix, body: body)
535-
curPrefix = self.trailingTrivia.forEachLineLength(prefix: curPrefix, body: body)
534+
curPrefix = self.tokenView.leadingRawTriviaPieces.forEachLineLength(prefix: curPrefix, body: body)
535+
curPrefix = self.tokenView.rawText.forEachLineLength(prefix: curPrefix, body: body)
536+
curPrefix = self.tokenView.trailingRawTriviaPieces.forEachLineLength(prefix: curPrefix, body: body)
536537
return curPrefix
537538
}
538539
}
Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
//===----------------------------------------------------------------------===//
2+
//
3+
// This source file is part of the Swift.org open source project
4+
//
5+
// Copyright (c) 2014 - 2023 Apple Inc. and the Swift project authors
6+
// Licensed under Apache License v2.0 with Runtime Library Exception
7+
//
8+
// See https://swift.org/LICENSE.txt for license information
9+
// See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
10+
//
11+
//===----------------------------------------------------------------------===//
12+
13+
import XCTest
14+
@_spi(RawSyntax) import SwiftSyntax
15+
16+
final class SourceLocationConverterTests: XCTestCase {
17+
func testInvalidUtf8() {
18+
let eofToken = withExtendedLifetime(SyntaxArena()) { arena in
19+
let leadingTriviaText = [UInt8(0xfd)].withUnsafeBufferPointer { buf in
20+
arena.intern(SyntaxText(buffer: buf))
21+
}
22+
23+
let nodeWithInvalidUtf8 = RawTokenSyntax(
24+
kind: .eof,
25+
text: "",
26+
leadingTriviaPieces: [
27+
.unexpectedText(leadingTriviaText)
28+
],
29+
presence: .present,
30+
arena: arena
31+
)
32+
33+
return Syntax(raw: nodeWithInvalidUtf8.raw).cast(TokenSyntax.self)
34+
}
35+
36+
let tree = SourceFileSyntax(statements: [], eofToken: eofToken)
37+
38+
// This used to violate the following assertion in the SourceLocationConverter's
39+
// initializer, because we were using `String`, which was lossy when handling the
40+
// invalid UTF-8:
41+
// ```
42+
// assert(tree.byteSize == endOfFile.utf8Offset)
43+
// ```
44+
_ = SourceLocationConverter(file: "", tree: tree)
45+
}
46+
}

0 commit comments

Comments
 (0)