Skip to content

Commit 101eaf0

Browse files
authored
Merge pull request #1286 from ahoppen/ahoppen/invalid-utf8
Fix two assertion failures related to invalid UTF-8
2 parents e9e0b24 + fdcc576 commit 101eaf0

File tree

5 files changed

+167
-75
lines changed

Sources/SwiftParser/Lexer/Cursor.swift

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2102,7 +2102,7 @@ extension Lexer.Cursor {
21022102
/// valid operator start, advance the cursor by what can be considered a
21032103
/// lexeme.
21042104
mutating func lexUnknown() -> UnknownCharactersClassification {
2105-
assert(self.peekScalar()?.isValidIdentifierStartCodePoint == false && self.peekScalar()?.isOperatorStartCodePoint == false)
2105+
assert(!(self.peekScalar()?.isValidIdentifierStartCodePoint ?? false) && !(self.peekScalar()?.isOperatorStartCodePoint ?? false))
21062106
var tmp = self
21072107
if tmp.advance(if: { Unicode.Scalar($0).isValidIdentifierContinuationCodePoint }) {
21082108
// If this is a valid identifier continuation, but not a valid identifier

Sources/SwiftSyntax/SourceLocation.swift

Lines changed: 17 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -138,7 +138,9 @@ public final class SourceLocationConverter {
138138
public init(file: String, source: String) {
139139
self.file = file
140140
self.source = Array(source.utf8)
141-
(self.lines, endOfFile) = computeLines(source)
141+
(self.lines, endOfFile) = self.source.withUnsafeBufferPointer { buf in
142+
return computeLines(SyntaxText(buffer: buf))
143+
}
142144
assert(source.utf8.count == endOfFile.utf8Offset)
143145
}
144146

@@ -397,7 +399,7 @@ fileprivate func computeLines(
397399
return (lines, position)
398400
}
399401

400-
fileprivate func computeLines(_ source: String) -> ([AbsolutePosition], AbsolutePosition) {
402+
fileprivate func computeLines(_ source: SyntaxText) -> ([AbsolutePosition], AbsolutePosition) {
401403
var lines: [AbsolutePosition] = []
402404
// First line starts from the beginning.
403405
lines.append(.startOfFile)
@@ -412,26 +414,25 @@ fileprivate func computeLines(_ source: String) -> ([AbsolutePosition], Absolute
412414
return (lines, position)
413415
}
414416

415-
fileprivate extension String {
417+
fileprivate extension SyntaxText {
416418
/// Walks and passes to `body` the `SourceLength` for every detected line,
417419
/// with the newline character included.
418420
/// - Returns: The leftover `SourceLength` at the end of the walk.
419421
func forEachLineLength(
420422
prefix: SourceLength = .zero,
421423
body: (SourceLength) -> ()
422424
) -> SourceLength {
423-
let utf8 = self.utf8
424-
let startIndex = utf8.startIndex
425-
let endIndex = utf8.endIndex
425+
// let startIndex = utf8.startIndex
426+
// let endIndex = utf8.endIndex
426427
var curIdx = startIndex
427428
var lineLength = prefix
428429
let advanceLengthByOne = { () -> () in
429430
lineLength += SourceLength(utf8Length: 1)
430-
curIdx = utf8.index(after: curIdx)
431+
curIdx = self.index(after: curIdx)
431432
}
432433

433434
while curIdx < endIndex {
434-
let char = utf8[curIdx]
435+
let char = self[curIdx]
435436
advanceLengthByOne()
436437

437438
/// From https://docs.swift.org/swift-book/ReferenceManual/LexicalStructure.html#grammar_line-break
@@ -441,7 +442,7 @@ fileprivate extension String {
441442
let isNewline = { () -> Bool in
442443
if char == 10 { return true }
443444
if char == 13 {
444-
if curIdx < endIndex && utf8[curIdx] == 10 { advanceLengthByOne() }
445+
if curIdx < endIndex && self[curIdx] == 10 { advanceLengthByOne() }
445446
return true
446447
}
447448
return false
@@ -456,11 +457,11 @@ fileprivate extension String {
456457
}
457458

458459
func containsSwiftNewline() -> Bool {
459-
return utf8.contains { $0 == 10 || $0 == 13 }
460+
return self.contains { $0 == 10 || $0 == 13 }
460461
}
461462
}
462463

463-
fileprivate extension TriviaPiece {
464+
fileprivate extension RawTriviaPiece {
464465
/// Walks and passes to `body` the `SourceLength` for every detected line,
465466
/// with the newline character included.
466467
/// - Returns: The leftover `SourceLength` at the end of the walk.
@@ -495,7 +496,7 @@ fileprivate extension TriviaPiece {
495496
let .docLineComment(text):
496497
// Line comments are not supposed to contain newlines.
497498
assert(!text.containsSwiftNewline(), "line comment created that contained a new-line character")
498-
lineLength += SourceLength(utf8Length: text.utf8.count)
499+
lineLength += SourceLength(utf8Length: text.count)
499500
case let .blockComment(text),
500501
let .docBlockComment(text),
501502
let .unexpectedText(text):
@@ -505,7 +506,7 @@ fileprivate extension TriviaPiece {
505506
}
506507
}
507508

508-
fileprivate extension Trivia {
509+
fileprivate extension Array where Element == RawTriviaPiece {
509510
/// Walks and passes to `body` the `SourceLength` for every detected line,
510511
/// with the newline character included.
511512
/// - Returns: The leftover `SourceLength` at the end of the walk.
@@ -530,9 +531,9 @@ fileprivate extension TokenSyntax {
530531
body: (SourceLength) -> ()
531532
) -> SourceLength {
532533
var curPrefix = prefix
533-
curPrefix = self.leadingTrivia.forEachLineLength(prefix: curPrefix, body: body)
534-
curPrefix = self.text.forEachLineLength(prefix: curPrefix, body: body)
535-
curPrefix = self.trailingTrivia.forEachLineLength(prefix: curPrefix, body: body)
534+
curPrefix = self.tokenView.leadingRawTriviaPieces.forEachLineLength(prefix: curPrefix, body: body)
535+
curPrefix = self.tokenView.rawText.forEachLineLength(prefix: curPrefix, body: body)
536+
curPrefix = self.tokenView.trailingRawTriviaPieces.forEachLineLength(prefix: curPrefix, body: body)
536537
return curPrefix
537538
}
538539
}

Sources/swift-parser-cli/swift-parser-cli.swift

Lines changed: 11 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -134,15 +134,19 @@ class VerifyRoundTrip: ParsableCommand {
134134
) throws {
135135
let tree = Parser.parse(source: source)
136136

137-
_ = ParseDiagnosticsGenerator.diagnostics(for: tree)
137+
var diags = ParseDiagnosticsGenerator.diagnostics(for: tree)
138138

139139
let resultTree: Syntax
140140
if foldSequences {
141-
resultTree = foldAllSequences(tree).0
141+
let folded = foldAllSequences(tree)
142+
resultTree = folded.0
143+
diags += folded.1
142144
} else {
143145
resultTree = Syntax(tree)
144146
}
145147

148+
_ = DiagnosticsFormatter.annotatedSource(tree: tree, diags: diags)
149+
146150
if resultTree.syntaxTextBytes != [UInt8](source) {
147151
throw Error.roundTripFailed
148152
}
@@ -207,6 +211,9 @@ class PrintDiags: ParsableCommand {
207211
source.withUnsafeBufferPointer { sourceBuffer in
208212
let tree = Parser.parse(source: sourceBuffer)
209213
var diags = ParseDiagnosticsGenerator.diagnostics(for: tree)
214+
if foldSequences {
215+
diags += foldAllSequences(tree).1
216+
}
210217
let annotatedSource = DiagnosticsFormatter.annotatedSource(
211218
tree: tree,
212219
diags: diags,
@@ -215,10 +222,6 @@ class PrintDiags: ParsableCommand {
215222

216223
print(annotatedSource)
217224

218-
if foldSequences {
219-
diags += foldAllSequences(tree).1
220-
}
221-
222225
if diags.isEmpty {
223226
print("No diagnostics produced")
224227
}
@@ -424,7 +427,7 @@ class Reduce: ParsableCommand {
424427
if verbose {
425428
printerr("Reduced from \(source.count) to \(reduced.count) characters in \(checks) iterations")
426429
}
427-
let reducedString = String(decoding: reduced, as: UTF8.self)
428-
print(reducedString)
430+
431+
FileHandle.standardOutput.write(Data(reduced))
429432
}
430433
}

Tests/SwiftParserTest/LexerTests.swift

Lines changed: 92 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,45 @@ import XCTest
1414
@_spi(RawSyntax) import SwiftSyntax
1515
@_spi(RawSyntax) import SwiftParser
1616

17+
fileprivate func lex(_ sourceBytes: [UInt8]) -> [Lexer.Lexeme] {
18+
return sourceBytes.withUnsafeBufferPointer { buf in
19+
var lexemes = [Lexer.Lexeme]()
20+
for token in Lexer.tokenize(buf, from: 0) {
21+
lexemes.append(token)
22+
23+
if token.rawTokenKind == .eof {
24+
break
25+
}
26+
}
27+
return lexemes
28+
}
29+
}
30+
31+
/// `LexemeSpec` heavily relies on string literals to represent the expected
32+
/// values for trivia and text. While this is good for most cases, string
33+
/// literals can't contain invalid UTF-8. Thus, we need a different assert
34+
/// function working on byte arrays to test source code containing invalid UTF-8.
35+
fileprivate func AssertRawBytesLexeme(
36+
_ lexeme: Lexer.Lexeme,
37+
kind: RawTokenKind,
38+
leadingTrivia: [UInt8] = [],
39+
text: [UInt8],
40+
trailingTrivia: [UInt8] = [],
41+
file: StaticString = #file,
42+
line: UInt = #line
43+
) {
44+
XCTAssertEqual(lexeme.rawTokenKind, kind, file: file, line: line)
45+
leadingTrivia.withUnsafeBufferPointer { leadingTrivia in
46+
XCTAssertEqual(lexeme.leadingTriviaText, SyntaxText(buffer: leadingTrivia), file: file, line: line)
47+
}
48+
text.withUnsafeBufferPointer { text in
49+
XCTAssertEqual(lexeme.tokenText, SyntaxText(buffer: text), file: file, line: line)
50+
}
51+
trailingTrivia.withUnsafeBufferPointer { trailingTrivia in
52+
XCTAssertEqual(lexeme.trailingTriviaText, SyntaxText(buffer: trailingTrivia), file: file, line: line)
53+
}
54+
}
55+
1756
public class LexerTests: XCTestCase {
1857
func testIdentifiers() {
1958
AssertLexemes(
@@ -756,76 +795,79 @@ public class LexerTests: XCTestCase {
756795

757796
func testBOMAtStartOfFile() throws {
758797
let sourceBytes: [UInt8] = [0xef, 0xbb, 0xbf]
759-
let lexemes = sourceBytes.withUnsafeBufferPointer { buf in
760-
var lexemes = [Lexer.Lexeme]()
761-
for token in Lexer.tokenize(buf, from: 0) {
762-
lexemes.append(token)
798+
let lexemes = lex(sourceBytes)
763799

764-
if token.rawTokenKind == .eof {
765-
break
766-
}
767-
}
768-
return lexemes
800+
guard lexemes.count == 1 else {
801+
return XCTFail("Expected 1 lexeme, got \(lexemes.count)")
769802
}
770803

771-
XCTAssertEqual(lexemes.count, 1)
772-
let lexeme = try XCTUnwrap(lexemes.first)
773-
XCTAssertEqual(lexeme.rawTokenKind, .eof)
774-
775-
let bomBytes: [UInt8] = [0xef, 0xbb, 0xbf]
776-
bomBytes.withUnsafeBufferPointer { bomBytes in
777-
XCTAssertEqual(lexeme.leadingTriviaText, SyntaxText(buffer: bomBytes))
778-
}
804+
AssertRawBytesLexeme(
805+
lexemes[0],
806+
kind: .eof,
807+
leadingTrivia: sourceBytes,
808+
text: []
809+
)
779810
}
780811

781812
func testBOMInTheMiddleOfIdentifier() throws {
782813
let sourceBytes: [UInt8] = [UInt8(ascii: "a"), 0xef, 0xbb, 0xbf, UInt8(ascii: "b")]
783-
let lexemes = sourceBytes.withUnsafeBufferPointer { buf in
784-
var lexemes = [Lexer.Lexeme]()
785-
for token in Lexer.tokenize(buf, from: 0) {
786-
lexemes.append(token)
814+
let lexemes = lex(sourceBytes)
787815

788-
if token.rawTokenKind == .eof {
789-
break
790-
}
791-
}
792-
return lexemes
816+
guard lexemes.count == 2 else {
817+
return XCTFail("Expected 2 lexemes, got \(lexemes.count)")
793818
}
794819

795-
XCTAssertEqual(lexemes.count, 2)
796-
let lexeme = try XCTUnwrap(lexemes.first)
797-
XCTAssertEqual(lexeme.rawTokenKind, .identifier)
798-
799-
sourceBytes.withUnsafeBufferPointer { sourceBytes in
800-
XCTAssertEqual(lexeme.tokenText, SyntaxText(buffer: sourceBytes))
801-
}
820+
AssertRawBytesLexeme(
821+
lexemes[0],
822+
kind: .identifier,
823+
text: sourceBytes
824+
)
802825
}
803826

804827
func testBOMAsLeadingTriviaInSourceFile() throws {
805828
let sourceBytes: [UInt8] = [UInt8(ascii: "1"), UInt8(ascii: " "), UInt8(ascii: "+"), UInt8(ascii: " "), 0xef, 0xbb, 0xbf, UInt8(ascii: "2")]
806-
let lexemes = sourceBytes.withUnsafeBufferPointer { buf in
807-
var lexemes = [Lexer.Lexeme]()
808-
for token in Lexer.tokenize(buf, from: 0) {
809-
lexemes.append(token)
829+
let lexemes = lex(sourceBytes)
810830

811-
if token.rawTokenKind == .eof {
812-
break
813-
}
814-
}
815-
return lexemes
831+
guard lexemes.count == 4 else {
832+
return XCTFail("Expected 4 lexemes, got \(lexemes.count)")
816833
}
817834

818-
guard lexemes.count == 4 else {
819-
return XCTFail("Expected 4 lexemes")
835+
AssertRawBytesLexeme(
836+
lexemes[1],
837+
kind: .binaryOperator,
838+
text: [UInt8(ascii: "+")],
839+
trailingTrivia: [UInt8(ascii: " "), 0xef, 0xbb, 0xbf]
840+
)
841+
}
842+
843+
func testInvalidUtf8() {
844+
let sourceBytes: [UInt8] = [0xef, 0xfb, 0xbd, 0x0a]
845+
846+
let lexemes = lex(sourceBytes)
847+
guard lexemes.count == 1 else {
848+
return XCTFail("Expected 1 lexeme, got \(lexemes.count)")
820849
}
821-
let lexeme = lexemes[1]
822-
XCTAssertEqual(lexeme.rawTokenKind, .binaryOperator)
850+
AssertRawBytesLexeme(
851+
lexemes[0],
852+
kind: .eof,
853+
leadingTrivia: sourceBytes,
854+
text: []
855+
)
856+
}
857+
858+
func testInvalidUtf8_2() {
859+
let sourceBytes: [UInt8] = [0xfd]
823860

824-
let expectedTrailingTrivia: [UInt8] = [UInt8(ascii: " "), 0xef, 0xbb, 0xbf]
825-
expectedTrailingTrivia.withUnsafeBufferPointer { expectedTrailingTrivia in
826-
XCTAssertEqual(lexeme.trailingTriviaText, SyntaxText(buffer: expectedTrailingTrivia))
827-
XCTAssertEqual(lexeme.tokenText, "+")
861+
let lexemes = lex(sourceBytes)
862+
guard lexemes.count == 1 else {
863+
return XCTFail("Expected 1 lexeme, got \(lexemes.count)")
828864
}
865+
AssertRawBytesLexeme(
866+
lexemes[0],
867+
kind: .eof,
868+
leadingTrivia: sourceBytes,
869+
text: []
870+
)
829871
}
830872

831873
func testInterpolatedString() {
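(New file — its path heading is missing from this capture; the file defines `SourceLocationConverterTests`.)

Lines changed: 46 additions & 0 deletions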
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
//===----------------------------------------------------------------------===//
2+
//
3+
// This source file is part of the Swift.org open source project
4+
//
5+
// Copyright (c) 2014 - 2023 Apple Inc. and the Swift project authors
6+
// Licensed under Apache License v2.0 with Runtime Library Exception
7+
//
8+
// See https://swift.org/LICENSE.txt for license information
9+
// See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
10+
//
11+
//===----------------------------------------------------------------------===//
12+
13+
import XCTest
14+
@_spi(RawSyntax) import SwiftSyntax
15+
16+
final class SourceLocationConverterTests: XCTestCase {
17+
func testInvalidUtf8() {
18+
let eofToken = withExtendedLifetime(SyntaxArena()) { arena in
19+
let leadingTriviaText = [UInt8(0xfd)].withUnsafeBufferPointer { buf in
20+
arena.intern(SyntaxText(buffer: buf))
21+
}
22+
23+
let nodeWithInvalidUtf8 = RawTokenSyntax(
24+
kind: .eof,
25+
text: "",
26+
leadingTriviaPieces: [
27+
.unexpectedText(leadingTriviaText)
28+
],
29+
presence: .present,
30+
arena: arena
31+
)
32+
33+
return Syntax(raw: nodeWithInvalidUtf8.raw).cast(TokenSyntax.self)
34+
}
35+
36+
let tree = SourceFileSyntax(statements: [], eofToken: eofToken)
37+
38+
// This used to violate the following assertion in the SourceLocationConverter's
39+
// initializer, because we were using `String` which was lossy when handling the
40+
// invalid UTF-8:
41+
// ```
42+
// assert(tree.byteSize == endOfFile.utf8Offset)
43+
// ```
44+
_ = SourceLocationConverter(file: "", tree: tree)
45+
}
46+
}

0 commit comments

Comments
 (0)