Skip to content

Commit 19c68cb

Browse files
committed
Diagnose non-breaking space, invalid identifier start and invalid characters
1 parent 1463460 commit 19c68cb

File tree

9 files changed

+88
-61
lines changed

9 files changed

+88
-61
lines changed

Sources/SwiftParser/Lexer/Cursor.swift

Lines changed: 29 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -289,8 +289,11 @@ extension Lexer.Cursor {
289289
// Leading trivia.
290290
let leadingTriviaStart = self
291291
let newlineInLeadingTrivia: NewlinePresence
292+
var error: LexerError? = nil
292293
if let leadingTriviaMode = self.currentState.leadingTriviaLexingMode(cursor: self) {
293-
newlineInLeadingTrivia = self.lexTrivia(mode: leadingTriviaMode)
294+
let triviaResult = self.lexTrivia(mode: leadingTriviaMode)
295+
newlineInLeadingTrivia = triviaResult.newlinePresence
296+
error = error ?? triviaResult.error.map { LexerError($0.kind, byteOffset: cursor.distance(to: $0.position)) }
294297
} else {
295298
newlineInLeadingTrivia = .absent
296299
}
@@ -325,7 +328,8 @@ extension Lexer.Cursor {
325328
// Trailing trivia.
326329
let trailingTriviaStart = self
327330
if let trailingTriviaMode = result.trailingTriviaLexingMode ?? currentState.trailingTriviaLexingMode(cursor: self) {
328-
_ = self.lexTrivia(mode: trailingTriviaMode)
331+
let triviaResult = self.lexTrivia(mode: trailingTriviaMode)
332+
error = error ?? triviaResult.error.map { LexerError($0.kind, byteOffset: cursor.distance(to: $0.position)) }
329333
}
330334

331335
if self.currentState.shouldPopStateWhenReachingNewlineInTrailingTrivia && self.is(at: "\r", "\n") {
@@ -338,9 +342,7 @@ extension Lexer.Cursor {
338342
}
339343

340344
self.previousTokenKind = result.tokenKind.base
341-
let error = result.error.map { error in
342-
return LexerError(error.kind, byteOffset: cursor.distance(to: error.position))
343-
}
345+
error = error ?? result.error.map { LexerError($0.kind, byteOffset: cursor.distance(to: $0.position)) }
344346

345347
return .init(
346348
tokenKind: result.tokenKind,
@@ -999,41 +1001,47 @@ extension Lexer.Cursor {
9991001
case escapedNewlineInMultiLineStringLiteral
10001002
}
10011003

1002-
fileprivate mutating func lexTrivia(mode: TriviaLexingMode) -> NewlinePresence {
1004+
fileprivate struct TriviaResult {
1005+
let newlinePresence: NewlinePresence
1006+
let error: (kind: LexerError.Kind, position: Lexer.Cursor)?
1007+
}
1008+
1009+
fileprivate mutating func lexTrivia(mode: TriviaLexingMode) -> TriviaResult {
1010+
var newlinePresence = NewlinePresence.absent
1011+
var error: (kind: LexerError.Kind, position: Lexer.Cursor)? = nil
10031012
if mode == .escapedNewlineInMultiLineStringLiteral {
10041013
_ = self.advance(matching: "\\")
10051014
self.advance(while: { $0 == "#" })
10061015
self.advance(while: { $0 == " " || $0 == "\t" })
10071016
if self.advance(matching: "\r") {
10081017
_ = self.advance(matching: "\n")
1009-
return .present
1018+
return TriviaResult(newlinePresence: .present, error: nil)
10101019
} else if self.advance(matching: "\n") {
1011-
return .present
1020+
return TriviaResult(newlinePresence: .present, error: nil)
10121021
} else {
1013-
return .absent
1022+
return TriviaResult(newlinePresence: .absent, error: nil)
10141023
}
10151024
}
10161025

1017-
var hasNewline = false
10181026
while true {
10191027
let start = self
10201028

10211029
switch self.advance() {
1022-
// 'continue' - the character is a part of the trivia.
1030+
// 'continue' - the character is a part of the trivia.
10231031
// 'break' - the character should be a part of the token text.
10241032
case nil:
10251033
break
10261034
case UInt8(ascii: "\n"):
10271035
if mode == .noNewlines {
10281036
break
10291037
}
1030-
hasNewline = true
1038+
newlinePresence = .present
10311039
continue
10321040
case UInt8(ascii: "\r"):
10331041
if mode == .noNewlines {
10341042
break
10351043
}
1036-
hasNewline = true
1044+
newlinePresence = .present
10371045
continue
10381046

10391047
case UInt8(ascii: " "):
@@ -1129,7 +1137,8 @@ extension Lexer.Cursor {
11291137

11301138
// `lexUnknown` expects that the first character has not been consumed yet.
11311139
self = start
1132-
if case .trivia = self.lexUnknown() {
1140+
if case .trivia(let unknownError) = self.lexUnknown() {
1141+
error = error ?? unknownError
11331142
continue
11341143
} else {
11351144
break
@@ -1139,7 +1148,7 @@ extension Lexer.Cursor {
11391148
// `break` means the character was not a trivia. Reset the cursor and
11401149
// return the result.
11411150
self = start
1142-
return hasNewline ? .present : .absent
1151+
return TriviaResult(newlinePresence: newlinePresence, error: error)
11431152
}
11441153
}
11451154
}
@@ -2104,7 +2113,7 @@ extension Lexer.Cursor {
21042113

21052114
enum UnknownCharactersClassification {
21062115
/// The characters consumed by `lexUnknown` should be classified as trivia
2107-
case trivia
2116+
case trivia(error: (kind: LexerError.Kind, position: Lexer.Cursor))
21082117
/// The characters consumed by `lexUnknown` should be classified as the contents of a lexeme
21092118
case lexemeContents(Lexer.Result)
21102119
}
@@ -2129,24 +2138,17 @@ extension Lexer.Cursor {
21292138

21302139
// This character isn't allowed in Swift source.
21312140
guard let codepoint = tmp.advanceValidatingUTF8Character() else {
2132-
// diagnose(CurPtr - 1, diag::lex_invalid_utf8)
2133-
// .fixItReplaceChars(getSourceLoc(CurPtr - 1), getSourceLoc(Tmp), " ")
21342141
self = tmp
2135-
return .trivia
2142+
return .trivia(error: (kind: .invalidUtf8, position: start))
21362143
}
21372144
if codepoint.value == 0xA0 { // Non-breaking whitespace (U+00A0)
21382145
while tmp.is(at: 0xC2) && tmp.is(offset: 1, at: 0xA0) {
21392146
_ = tmp.advance()
21402147
_ = tmp.advance()
21412148
}
21422149

2143-
// SmallString<8> Spaces
2144-
// Spaces.assign((Tmp - CurPtr + 1) / 2, ' ')
2145-
// diagnose(CurPtr - 1, diag::lex_nonbreaking_space)
2146-
// .fixItReplaceChars(getSourceLoc(CurPtr - 1), getSourceLoc(Tmp),
2147-
// Spaces)
21482150
self = tmp
2149-
return .trivia
2151+
return .trivia(error: (kind: .nonBreakingSpace, position: start))
21502152
} else if codepoint.value == 0x201D { // Closing curly quote (U+201D)
21512153
// If this is an end curly quote, just diagnose it with a fixit hint.
21522154
self = tmp
@@ -2166,26 +2168,9 @@ extension Lexer.Cursor {
21662168
return .lexemeContents(Lexer.Result(.identifier, error: (.unicodeCurlyQuote, position: start)))
21672169
}
21682170

2169-
// diagnose(CurPtr - 1, diag::lex_invalid_character)
2170-
// .fixItReplaceChars(getSourceLoc(CurPtr - 1), getSourceLoc(Tmp), " ")
2171-
2172-
// char ExpectedCodepoint
2173-
// if ((ExpectedCodepoint =
2174-
// confusable::tryConvertConfusableCharacterToASCII(Codepoint))) {
2175-
//
2176-
// llvm::SmallString<4> ConfusedChar
2177-
// EncodeToUTF8(Codepoint, ConfusedChar)
2178-
// llvm::SmallString<1> ExpectedChar
2179-
// ExpectedChar += ExpectedCodepoint
2180-
// auto charNames = confusable::getConfusableAndBaseCodepointNames(Codepoint)
2181-
// diagnose(CurPtr - 1, diag::lex_confusable_character, ConfusedChar,
2182-
// charNames.first, ExpectedChar, charNames.second)
2183-
// .fixItReplaceChars(getSourceLoc(CurPtr - 1), getSourceLoc(Tmp),
2184-
// ExpectedChar)
2185-
// }
2186-
2171+
// TODO: Try to map confusables to ASCII characters
21872172
self = tmp
2188-
return .trivia
2173+
return .trivia(error: (kind: .invalidCharacter, position: start))
21892174
}
21902175

21912176
enum ConflictMarker {

Sources/SwiftParserDiagnostics/LexerDiagnosticMessages.swift

Lines changed: 18 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@ public enum StaticLexerError: String, DiagnosticMessage {
4242
case expectedClosingBraceInUnicodeEscape = #"expected '}' in \u{...} escape sequence"#
4343
case expectedDigitInFloatLiteral = "expected a digit in floating point exponent"
4444
case expectedHexCodeInUnicodeEscape = #"expected hexadecimal code in \u{...} escape sequence"#
45+
case invalidCharacter = "invalid character in source file"
4546
case invalidEscapeSequenceInStringLiteral = "invalid escape sequence in literal"
4647
case invalidIdentifierStartCharacter = "an identifier cannot begin with this character"
4748
case invalidNumberOfHexDigitsInUnicodeEscape = #"\u{...} escape sequence expects between 1 and 8 hex digits"#
@@ -126,6 +127,7 @@ public extension SwiftSyntax.LexerError {
126127
// here in case the error is not diagnosed.
127128
return InvalidIndentationInMultiLineStringLiteralError(kind: .insufficientIndentation, lines: 1)
128129
case .invalidBinaryDigitInIntegerLiteral: return InvalidDigitInIntegerLiteral(kind: .binary(scalarAtErrorOffset))
130+
case .invalidCharacter: return StaticLexerError.invalidCharacter
129131
case .invalidDecimalDigitInIntegerLiteral: return InvalidDigitInIntegerLiteral(kind: .decimal(scalarAtErrorOffset))
130132
case .invalidEscapeSequenceInStringLiteral: return StaticLexerError.invalidEscapeSequenceInStringLiteral
131133
case .invalidFloatingPointExponentCharacter: return InvalidFloatingPointExponentDigit(kind: .character(scalarAtErrorOffset))
@@ -150,16 +152,29 @@ public extension SwiftSyntax.LexerError {
150152
func fixIts(in token: TokenSyntax) -> [FixIt] {
151153
switch self.kind {
152154
case .nonBreakingSpace:
153-
return []
155+
let replaceNonBreakingSpace = { (piece: TriviaPiece) -> TriviaPiece in
156+
if piece == .unexpectedText("\u{a0}") {
157+
return .spaces(1)
158+
} else {
159+
return piece
160+
}
161+
}
162+
let fixedToken =
163+
token
164+
.with(\.leadingTrivia, Trivia(pieces: token.leadingTrivia.map(replaceNonBreakingSpace)))
165+
.with(\.trailingTrivia, Trivia(pieces: token.trailingTrivia.map(replaceNonBreakingSpace)))
166+
return [
167+
FixIt(message: .replaceNonBreakingSpaceBySpace, changes: [[.replace(oldNode: Syntax(token), newNode: Syntax(fixedToken))]])
168+
]
154169
case .unicodeCurlyQuote:
155170
let (rawKind, text) = token.tokenKind.decomposeToRaw()
156171
guard let text = text else {
157172
return []
158173
}
159174
let replacedText =
160175
text
161-
.replaceFirstOccuranceOf("", with: #"""#)
162-
.replaceLastOccuranceOf("", with: #"""#)
176+
.replacingFirstOccurance(of: "", with: #"""#)
177+
.replacingLastOccurance(of: "", with: #"""#)
163178

164179
let fixedToken = token.withKind(TokenKind.fromRaw(kind: rawKind, text: replacedText))
165180
return [

Sources/SwiftParserDiagnostics/ParserDiagnosticMessages.swift

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -477,6 +477,9 @@ extension FixItMessage where Self == StaticParserFixIt {
477477
public static var replaceCurlyQuoteByNormalQuote: Self {
478478
.init(#"replace curly quotes by '"'"#)
479479
}
480+
public static var replaceNonBreakingSpaceBySpace: Self {
481+
.init("replace non-breaking space by ' '")
482+
}
480483
public static var wrapInBackticks: Self {
481484
.init("if this name is unavoidable, use backticks to escape it")
482485
}

Sources/SwiftParserDiagnostics/Utils.swift

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,14 +19,14 @@ extension String {
1919
}
2020
}
2121

22-
func replaceFirstOccuranceOf(_ character: Character, with replacement: Character) -> String {
22+
func replacingFirstOccurance(of character: Character, with replacement: Character) -> String {
2323
guard let match = self.firstIndex(of: character) else {
2424
return self
2525
}
2626
return self[startIndex..<match] + String(replacement) + self[index(after: match)...]
2727
}
2828

29-
func replaceLastOccuranceOf(_ character: Character, with replacement: Character) -> String {
29+
func replacingLastOccurance(of character: Character, with replacement: Character) -> String {
3030
guard let match = self.lastIndex(of: character) else {
3131
return self
3232
}

Sources/SwiftSyntax/LexerError.swift

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ public struct LexerError: Hashable {
2323
case expectedHexCodeInUnicodeEscape
2424
case insufficientIndentationInMultilineStringLiteral
2525
case invalidBinaryDigitInIntegerLiteral
26+
case invalidCharacter
2627
case invalidDecimalDigitInIntegerLiteral
2728
case invalidEscapeSequenceInStringLiteral
2829
case invalidFloatingPointExponentCharacter

Tests/SwiftParserTest/ExpressionTests.swift

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1153,6 +1153,16 @@ final class ExpressionTests: XCTestCase {
11531153
"""
11541154
)
11551155
}
1156+
1157+
func testNonBreakingSpace() {
1158+
AssertParse(
1159+
"a 1️⃣\u{a0}+ 2",
1160+
diagnostics: [
1161+
DiagnosticSpec(message: "non-breaking space (U+00A0) used instead of regular space", fixIts: ["replace non-breaking space by ' '"])
1162+
],
1163+
fixedSource: "a + 2"
1164+
)
1165+
}
11561166
}
11571167

11581168
final class MemberExprTests: XCTestCase {

Tests/SwiftParserTest/LexerTests.swift

Lines changed: 19 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -426,10 +426,10 @@ public class LexerTests: XCTestCase {
426426

427427
func testUnexpectedLexing() {
428428
AssertLexemes(
429-
"static func �() {}",
429+
"static func 1️⃣�() {}",
430430
lexemes: [
431431
LexemeSpec(.keyword(.static), text: "static", trailing: " "),
432-
LexemeSpec(.keyword(.func), text: "func", trailing: ""),
432+
LexemeSpec(.keyword(.func), text: "func", trailing: "", error: "invalid character in source file"),
433433
LexemeSpec(.leftParen, text: "("),
434434
LexemeSpec(.rightParen, text: ")", trailing: " "),
435435
LexemeSpec(.leftBrace, text: "{"),
@@ -635,9 +635,9 @@ public class LexerTests: XCTestCase {
635635
)
636636

637637
AssertLexemes(
638-
"y\u{fffe} + z",
638+
"y1️⃣\u{fffe} + z",
639639
lexemes: [
640-
LexemeSpec(.identifier, text: "y", trailing: "\u{fffe} "),
640+
LexemeSpec(.identifier, text: "y", trailing: "\u{fffe} ", error: "invalid character in source file"),
641641
LexemeSpec(.binaryOperator, text: "+", trailing: " "),
642642
LexemeSpec(.identifier, text: "z"),
643643
]
@@ -861,7 +861,8 @@ public class LexerTests: XCTestCase {
861861
lexemes[0],
862862
kind: .eof,
863863
leadingTrivia: sourceBytes,
864-
text: []
864+
text: [],
865+
error: LexerError(.invalidUtf8, byteOffset: 0)
865866
)
866867
}
867868
}
@@ -877,7 +878,8 @@ public class LexerTests: XCTestCase {
877878
lexemes[0],
878879
kind: .eof,
879880
leadingTrivia: sourceBytes,
880-
text: []
881+
text: [],
882+
error: LexerError(.invalidUtf8, byteOffset: 0)
881883
)
882884
}
883885
}
@@ -1195,4 +1197,15 @@ public class LexerTests: XCTestCase {
11951197
]
11961198
)
11971199
}
1200+
1201+
func testNonBreakingSpace() {
1202+
AssertLexemes(
1203+
"a 1️⃣\u{a0} b",
1204+
lexemes: [
1205+
LexemeSpec(.identifier, text: "a", trailing: " \u{a0} ", error: "non-breaking space (U+00A0) used instead of regular space"),
1206+
LexemeSpec(.identifier, text: "b"),
1207+
]
1208+
)
1209+
}
1210+
11981211
}

Tests/SwiftParserTest/translated/IdentifiersTests.swift

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -68,13 +68,13 @@ final class IdentifiersTests: XCTestCase {
6868
}
6969

7070
func testIdentifiers6() {
71+
// Private-use characters aren't valid in Swift source.
7172
AssertParse(
7273
"""
73-
// Private-use characters aren't valid in Swift source.
74-
()
74+
1️⃣()
7575
""",
7676
diagnostics: [
77-
// TODO: Old parser expected error on line 2: invalid character in source file, Fix-It replacements: 1 - 4 = ' '
77+
DiagnosticSpec(message: "invalid character in source file")
7878
]
7979
)
8080
}

Tests/SwiftParserTest/translated/RecoveryTests.swift

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1949,13 +1949,13 @@ final class RecoveryTests: XCTestCase {
19491949
}
19501950

19511951
func testRecovery160() {
1952+
// <rdar://problem/21196171> compiler should recover better from "unicode Specials" characters
19521953
AssertParse(
19531954
#"""
1954-
// <rdar://problem/21196171> compiler should recover better from "unicode Specials" characters
1955-
let tryx = 123
1955+
let 1️⃣tryx = 123
19561956
"""#,
19571957
diagnostics: [
1958-
// TODO: Old parser expected error on line 2: invalid character in source file, Fix-It replacements: 5 - 8 = ' '
1958+
DiagnosticSpec(message: "invalid character in source file")
19591959
]
19601960
)
19611961
}

0 commit comments

Comments
 (0)