Skip to content

Commit ae0da24

Browse files
authored
Merge pull request #1245 from ahoppen/ahoppen/99-lexing-backlog
Implement more lexer diagnostics
2 parents 8bc71f0 + 139f9df commit ae0da24

16 files changed

+723
-346
lines changed

Sources/SwiftParser/Lexer/Cursor.swift

Lines changed: 161 additions & 195 deletions
Large diffs are not rendered by default.

Sources/SwiftParserDiagnostics/LexerDiagnosticMessages.swift

Lines changed: 88 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -39,8 +39,20 @@ public extension LexerError {
3939
/// Please order the cases in this enum alphabetically by case name.
4040
public enum StaticLexerError: String, DiagnosticMessage {
4141
case expectedBinaryExponentInHexFloatLiteral = "hexadecimal floating point literal must end with an exponent"
42+
case expectedClosingBraceInUnicodeEscape = #"expected '}' in \u{...} escape sequence"#
4243
case expectedDigitInFloatLiteral = "expected a digit in floating point exponent"
44+
case expectedHexCodeInUnicodeEscape = #"expected hexadecimal code in \u{...} escape sequence"#
45+
case expectedHexDigitInHexLiteral = "expected hexadecimal digit (0-9, A-F) in integer literal"
46+
case invalidCharacter = "invalid character in source file"
47+
case invalidEscapeSequenceInStringLiteral = "invalid escape sequence in literal"
48+
case invalidIdentifierStartCharacter = "an identifier cannot begin with this character"
49+
case invalidNumberOfHexDigitsInUnicodeEscape = #"\u{...} escape sequence expects between 1 and 8 hex digits"#
50+
case invalidUtf8 = "invalid UTF-8 found in source file"
4351
case lexerErrorOffsetOverflow = "the lexer dicovered an error in this token but was not able to represent its offset due to overflow; please split the token"
52+
case sourceConflictMarker = "source control conflict marker in source file"
53+
case unexpectedBlockCommentEnd = "unexpected end of block comment"
54+
case unicodeCurlyQuote = #"unicode curly quote found; use '"' instead"#
55+
case unprintableAsciiCharacter = "unprintable ASCII character found in source file"
4456

4557
public var message: String { self.rawValue }
4658

@@ -51,6 +63,20 @@ public enum StaticLexerError: String, DiagnosticMessage {
5163
public var severity: DiagnosticSeverity { .error }
5264
}
5365

66+
/// Lexer diagnostics with a fixed message that are reported as warnings.
///
/// Please order the cases in this enum alphabetically by case name.
public enum StaticLexerWarning: String, DiagnosticMessage {
  case nonBreakingSpace = "non-breaking space (U+00A0) used instead of regular space"
  case nulCharacter = "nul character embedded in middle of file"

  /// The message text, which is the raw value of the case.
  public var message: String {
    return rawValue
  }

  /// An ID in `diagnosticDomain` built from this type's name and the case name.
  public var diagnosticID: MessageID {
    let identifier = "\(type(of: self)).\(self)"
    return MessageID(domain: diagnosticDomain, id: identifier)
  }

  /// Every case of this enum is a warning.
  public var severity: DiagnosticSeverity {
    return .warning
  }
}
79+
5480
public struct InvalidFloatingPointExponentDigit: LexerError {
5581
public enum Kind {
5682
case digit(Unicode.Scalar)
@@ -98,45 +124,82 @@ public extension SwiftSyntax.LexerError {
98124
/// `tokenText` is the entire text of the token in which the `LexerError`
99125
/// occurred, including trivia.
100126
@_spi(RawSyntax)
101-
func diagnostic(wholeTextBytes: [UInt8]) -> DiagnosticMessage {
127+
func diagnosticMessage(wholeTextBytes: [UInt8]) -> DiagnosticMessage {
102128
/// The Unicode scalar that starts at this error's `byteOffset` inside
/// `wholeTextBytes`.
var scalarAtErrorOffset: UnicodeScalar {
  // Fall back to the Unicode replacement character U+FFFD in case we can't
  // lex the unicode character at `byteOffset`. It's the best we can do.
  // The fallback is written as an escape sequence so that it cannot be lost
  // by tools that drop or mangle non-ASCII characters.
  Unicode.Scalar.lexing(from: wholeTextBytes[Int(self.byteOffset)...]) ?? UnicodeScalar("\u{FFFD}")
}
107133

108134
switch self.kind {
109-
case .expectedBinaryExponentInHexFloatLiteral:
110-
return StaticLexerError.expectedBinaryExponentInHexFloatLiteral
111-
case .expectedDigitInFloatLiteral:
112-
return StaticLexerError.expectedDigitInFloatLiteral
135+
case .expectedBinaryExponentInHexFloatLiteral: return StaticLexerError.expectedBinaryExponentInHexFloatLiteral
136+
case .expectedClosingBraceInUnicodeEscape: return StaticLexerError.expectedClosingBraceInUnicodeEscape
137+
case .expectedDigitInFloatLiteral: return StaticLexerError.expectedDigitInFloatLiteral
138+
case .expectedHexCodeInUnicodeEscape: return StaticLexerError.expectedHexCodeInUnicodeEscape
139+
case .expectedHexDigitInHexLiteral: return StaticLexerError.expectedHexDigitInHexLiteral
113140
case .insufficientIndentationInMultilineStringLiteral:
114141
// This should be diagnosed when visiting the `StringLiteralExprSyntax`
115142
// inside `ParseDiagnosticsGenerator` but fall back to an error message
116143
// here in case the error is not diagnosed.
117144
return InvalidIndentationInMultiLineStringLiteralError(kind: .insufficientIndentation, lines: 1)
118-
case .invalidBinaryDigitInIntegerLiteral:
119-
return InvalidDigitInIntegerLiteral(kind: .binary(scalarAtErrorOffset))
120-
case .invalidDecimalDigitInIntegerLiteral:
121-
return InvalidDigitInIntegerLiteral(kind: .decimal(scalarAtErrorOffset))
122-
case .invalidFloatingPointCharacter:
123-
fatalError()
124-
case .invalidFloatingPointDigit:
125-
fatalError()
126-
case .invalidFloatingPointExponentCharacter:
127-
return InvalidFloatingPointExponentDigit(kind: .character(scalarAtErrorOffset))
128-
case .invalidFloatingPointExponentDigit:
129-
return InvalidFloatingPointExponentDigit(kind: .digit(scalarAtErrorOffset))
130-
case .invalidHexDigitInIntegerLiteral:
131-
return InvalidDigitInIntegerLiteral(kind: .hex(scalarAtErrorOffset))
132-
case .invalidOctalDigitInIntegerLiteral:
133-
return InvalidDigitInIntegerLiteral(kind: .octal(scalarAtErrorOffset))
134-
case .lexerErrorOffsetOverflow:
135-
return StaticLexerError.lexerErrorOffsetOverflow
145+
case .invalidBinaryDigitInIntegerLiteral: return InvalidDigitInIntegerLiteral(kind: .binary(scalarAtErrorOffset))
146+
case .invalidCharacter: return StaticLexerError.invalidCharacter
147+
case .invalidDecimalDigitInIntegerLiteral: return InvalidDigitInIntegerLiteral(kind: .decimal(scalarAtErrorOffset))
148+
case .invalidEscapeSequenceInStringLiteral: return StaticLexerError.invalidEscapeSequenceInStringLiteral
149+
case .invalidFloatingPointExponentCharacter: return InvalidFloatingPointExponentDigit(kind: .character(scalarAtErrorOffset))
150+
case .invalidFloatingPointExponentDigit: return InvalidFloatingPointExponentDigit(kind: .digit(scalarAtErrorOffset))
151+
case .invalidHexDigitInIntegerLiteral: return InvalidDigitInIntegerLiteral(kind: .hex(scalarAtErrorOffset))
152+
case .invalidIdentifierStartCharacter: return StaticLexerError.invalidIdentifierStartCharacter
153+
case .invalidNumberOfHexDigitsInUnicodeEscape: return StaticLexerError.invalidNumberOfHexDigitsInUnicodeEscape
154+
case .invalidOctalDigitInIntegerLiteral: return InvalidDigitInIntegerLiteral(kind: .octal(scalarAtErrorOffset))
155+
case .invalidUtf8: return StaticLexerError.invalidUtf8
156+
case .lexerErrorOffsetOverflow: return StaticLexerError.lexerErrorOffsetOverflow
157+
case .nonBreakingSpace: return StaticLexerWarning.nonBreakingSpace
158+
case .nulCharacter: return StaticLexerWarning.nulCharacter
159+
case .sourceConflictMarker: return StaticLexerError.sourceConflictMarker
160+
case .unexpectedBlockCommentEnd: return StaticLexerError.unexpectedBlockCommentEnd
161+
case .unicodeCurlyQuote: return StaticLexerError.unicodeCurlyQuote
162+
case .unprintableAsciiCharacter: return StaticLexerError.unprintableAsciiCharacter
136163
}
137164
}
138165

139-
func diagnostic(in token: TokenSyntax) -> DiagnosticMessage {
140-
return self.diagnostic(wholeTextBytes: token.syntaxTextBytes)
166+
/// Convenience overload that builds the diagnostic message from the syntax
/// text bytes of `token`.
func diagnosticMessage(in token: TokenSyntax) -> DiagnosticMessage {
  let tokenBytes = token.syntaxTextBytes
  return diagnosticMessage(wholeTextBytes: tokenBytes)
}
169+
170+
/// Returns the Fix-Its that can repair this lexer error in `token`.
///
/// Only `.nonBreakingSpace` and `.unicodeCurlyQuote` offer a Fix-It; every
/// other kind returns an empty array.
func fixIts(in token: TokenSyntax) -> [FixIt] {
  switch self.kind {
  case .nonBreakingSpace:
    // Turn each trivia piece that is exactly one non-breaking space into a
    // single regular space; leave every other piece untouched.
    let replaceNonBreakingSpace = { (piece: TriviaPiece) -> TriviaPiece in
      if piece == .unexpectedText("\u{a0}") {
        return .spaces(1)
      } else {
        return piece
      }
    }
    let fixedToken =
      token
      .with(\.leadingTrivia, Trivia(pieces: token.leadingTrivia.map(replaceNonBreakingSpace)))
      .with(\.trailingTrivia, Trivia(pieces: token.trailingTrivia.map(replaceNonBreakingSpace)))
    return [
      FixIt(message: .replaceNonBreakingSpaceBySpace, changes: [[.replace(oldNode: Syntax(token), newNode: Syntax(fixedToken))]])
    ]
  case .unicodeCurlyQuote:
    let (rawKind, text) = token.tokenKind.decomposeToRaw()
    guard let text = text else {
      // Without token text there is nothing to rewrite.
      return []
    }
    // Replace the first left curly quote (U+201C) and the last right curly
    // quote (U+201D) with a plain `"`. The curly quotes are written as escape
    // sequences so that they cannot be lost by tools that mangle non-ASCII
    // characters.
    let replacedText =
      text
      .replacingFirstOccurence(of: "\u{201C}", with: #"""#)
      .replacingLastOccurence(of: "\u{201D}", with: #"""#)

    let fixedToken = token.withKind(TokenKind.fromRaw(kind: rawKind, text: replacedText))
    return [
      FixIt(message: .replaceCurlyQuoteByNormalQuote, changes: [[.replace(oldNode: Syntax(token), newNode: Syntax(fixedToken))]])
    ]
  default:
    return []
  }
}
142205
}

Sources/SwiftParserDiagnostics/ParseDiagnosticsGenerator.swift

Lines changed: 22 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,19 @@ fileprivate extension TokenSyntax {
3030
}
3131
}
3232

33+
fileprivate extension DiagnosticSeverity {
  /// Returns `true` if this severity and `lexerErrorSeverity` are both `error`
  /// or both `warning`, and `false` for every other combination.
  func matches(_ lexerErrorSeverity: SwiftSyntax.LexerError.Severity) -> Bool {
    switch (self, lexerErrorSeverity) {
    case (.error, .error), (.warning, .warning):
      return true
    default:
      return false
    }
  }
}
45+
3346
public class ParseDiagnosticsGenerator: SyntaxAnyVisitor {
3447
private var diagnostics: [Diagnostic] = []
3548

@@ -101,7 +114,7 @@ public class ParseDiagnosticsGenerator: SyntaxAnyVisitor {
101114
/// Whether the node should be skipped for diagnostic emission.
102115
/// Every visit method must check this at the beginning.
103116
func shouldSkip<T: SyntaxProtocol>(_ node: T) -> Bool {
104-
if !node.hasError {
117+
if !node.hasError && !node.hasWarning {
105118
return true
106119
}
107120
return handledNodes.contains(node.id)
@@ -347,7 +360,14 @@ public class ParseDiagnosticsGenerator: SyntaxAnyVisitor {
347360
handleMissingToken(token)
348361
} else {
349362
if let lexerError = token.lexerError {
350-
self.addDiagnostic(token, position: token.position.advanced(by: Int(lexerError.byteOffset)), lexerError.diagnostic(in: token))
363+
let message = lexerError.diagnosticMessage(in: token)
364+
assert(message.severity.matches(lexerError.severity))
365+
self.addDiagnostic(
366+
token,
367+
position: token.position.advanced(by: Int(lexerError.byteOffset)),
368+
message,
369+
fixIts: lexerError.fixIts(in: token)
370+
)
351371
}
352372
}
353373

Sources/SwiftParserDiagnostics/ParserDiagnosticMessages.swift

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -474,6 +474,12 @@ extension FixItMessage where Self == StaticParserFixIt {
474474
public static var removeOperatorBody: Self {
475475
.init("remove operator body")
476476
}
477+
/// Fix-It message for replacing curly quotes by a regular `"`.
public static var replaceCurlyQuoteByNormalQuote: Self {
  let text = #"replace curly quotes by '"'"#
  return .init(text)
}
480+
/// Fix-It message for replacing a non-breaking space by a regular space.
public static var replaceNonBreakingSpaceBySpace: Self {
  let text = "replace non-breaking space by ' '"
  return .init(text)
}
477483
public static var wrapInBackticks: Self {
478484
.init("if this name is unavoidable, use backticks to escape it")
479485
}

Sources/SwiftParserDiagnostics/Utils.swift

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,20 @@ extension String {
1818
return self
1919
}
2020
}
21+
22+
/// Returns a copy of this string in which the first occurrence of `character`
/// is replaced by `replacement`. If `character` does not occur, the string is
/// returned unchanged.
func replacingFirstOccurence(of character: Character, with replacement: Character) -> String {
  if let position = firstIndex(of: character) {
    // Rebuild the string as: text before the match, the replacement, text after the match.
    var result = String(self[..<position])
    result.append(replacement)
    result.append(contentsOf: self[index(after: position)...])
    return result
  }
  return self
}
28+
29+
/// Returns a copy of this string in which the last occurrence of `character`
/// is replaced by `replacement`. If `character` does not occur, the string is
/// returned unchanged.
func replacingLastOccurence(of character: Character, with replacement: Character) -> String {
  if let position = lastIndex(of: character) {
    // Rebuild the string as: text before the match, the replacement, text after the match.
    var result = String(self[..<position])
    result.append(replacement)
    result.append(contentsOf: self[index(after: position)...])
    return result
  }
  return self
}
2135
}
2236

2337
extension Collection {

Sources/SwiftSyntax/LexerError.swift

Lines changed: 48 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,22 +14,39 @@
1414
/// `lexerErrorOffset` in the token will specify at which offset the error
1515
/// occurred.
1616
public struct LexerError: Hashable {
17+
/// How severe a `LexerError` is.
public enum Severity {
  case error, warning
}
21+
1722
public enum Kind {
1823
// Please order these alphabetically
1924

2025
case expectedBinaryExponentInHexFloatLiteral
26+
case expectedClosingBraceInUnicodeEscape
2127
case expectedDigitInFloatLiteral
28+
case expectedHexCodeInUnicodeEscape
29+
case expectedHexDigitInHexLiteral
2230
case insufficientIndentationInMultilineStringLiteral
2331
case invalidBinaryDigitInIntegerLiteral
32+
case invalidCharacter
2433
case invalidDecimalDigitInIntegerLiteral
25-
case invalidFloatingPointCharacter
26-
case invalidFloatingPointDigit
34+
case invalidEscapeSequenceInStringLiteral
2735
case invalidFloatingPointExponentCharacter
2836
case invalidFloatingPointExponentDigit
2937
case invalidHexDigitInIntegerLiteral
38+
case invalidIdentifierStartCharacter
39+
case invalidNumberOfHexDigitsInUnicodeEscape
3040
case invalidOctalDigitInIntegerLiteral
41+
case invalidUtf8
3142
/// The lexer discovered an error but was not able to represent the offset of the error because it would overflow `LexerErrorOffset`.
3243
case lexerErrorOffsetOverflow
44+
case nonBreakingSpace
45+
case nulCharacter
46+
case sourceConflictMarker
47+
case unexpectedBlockCommentEnd
48+
case unicodeCurlyQuote
49+
case unprintableAsciiCharacter
3350
}
3451

3552
public let kind: Kind
@@ -54,4 +71,33 @@ public struct LexerError: Hashable {
5471
self.byteOffset = UInt16(byteOffset)
5572
}
5673
}
74+
75+
/// The severity of this error, derived from its `kind`.
///
/// `.nonBreakingSpace` and `.nulCharacter` are warnings; every other kind is
/// an error. The switch is kept exhaustive (no `default`) so that adding a new
/// kind forces a decision about its severity.
public var severity: Severity {
  switch kind {
  case .nonBreakingSpace,
    .nulCharacter:
    return .warning
  case .expectedBinaryExponentInHexFloatLiteral,
    .expectedClosingBraceInUnicodeEscape,
    .expectedDigitInFloatLiteral,
    .expectedHexCodeInUnicodeEscape,
    .expectedHexDigitInHexLiteral,
    .insufficientIndentationInMultilineStringLiteral,
    .invalidBinaryDigitInIntegerLiteral,
    .invalidCharacter,
    .invalidDecimalDigitInIntegerLiteral,
    .invalidEscapeSequenceInStringLiteral,
    .invalidFloatingPointExponentCharacter,
    .invalidFloatingPointExponentDigit,
    .invalidHexDigitInIntegerLiteral,
    .invalidIdentifierStartCharacter,
    .invalidNumberOfHexDigitsInUnicodeEscape,
    .invalidOctalDigitInIntegerLiteral,
    .invalidUtf8,
    .lexerErrorOffsetOverflow,
    .sourceConflictMarker,
    .unexpectedBlockCommentEnd,
    .unicodeCurlyQuote,
    .unprintableAsciiCharacter:
    return .error
  }
}
57103
}

Sources/SwiftSyntax/Raw/RawSyntax.swift

Lines changed: 18 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -23,10 +23,16 @@ fileprivate extension SyntaxKind {
2323
struct RecursiveRawSyntaxFlags: OptionSet {
2424
let rawValue: UInt8
2525

26-
/// Whether the tree contained by this layout has any missing or unexpected nodes.
26+
/// Whether the tree contained by this layout has any
27+
/// - missing nodes or
28+
/// - unexpected nodes or
29+
/// - tokens with a `LexerError` of severity `error`
2730
static let hasError = RecursiveRawSyntaxFlags(rawValue: 1 << 0)
28-
static let hasSequenceExpr = RecursiveRawSyntaxFlags(rawValue: 1 << 1)
29-
static let hasMaximumNestingLevelOverflow = RecursiveRawSyntaxFlags(rawValue: 1 << 2)
31+
/// Whether the tree contained by this layout has any tokens with a `LexerError`
32+
/// of severity `warning`.
33+
static let hasWarning = RecursiveRawSyntaxFlags(rawValue: 1 << 1)
34+
static let hasSequenceExpr = RecursiveRawSyntaxFlags(rawValue: 1 << 2)
35+
static let hasMaximumNestingLevelOverflow = RecursiveRawSyntaxFlags(rawValue: 1 << 3)
3036
}
3137

3238
/// Node data for RawSyntax tree. Tagged union plus common data.
@@ -227,9 +233,17 @@ extension RawSyntax {
227233
switch view {
228234
case .token(let tokenView):
229235
var recursiveFlags: RecursiveRawSyntaxFlags = []
230-
if tokenView.lexerError != nil || tokenView.presence == .missing {
236+
if tokenView.presence == .missing {
231237
recursiveFlags.insert(.hasError)
232238
}
239+
switch tokenView.lexerError?.severity {
240+
case .error:
241+
recursiveFlags.insert(.hasError)
242+
case .warning:
243+
recursiveFlags.insert(.hasWarning)
244+
case nil:
245+
break
246+
}
233247
return recursiveFlags
234248
case .layout(let layoutView):
235249
return layoutView.recursiveFlags

Sources/SwiftSyntax/Syntax.swift

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -283,11 +283,20 @@ public extension SyntaxProtocol {
283283
return raw.kind.isSyntaxCollection
284284
}
285285

286-
/// Whether this tree contains a missing token or unexpected node.
286+
/// Whether this tree's recursive flags contain `hasError`, i.e. whether the
/// tree has any
///  - missing nodes or
///  - unexpected nodes or
///  - tokens with a `LexerError` of severity `error`
var hasError: Bool {
  raw.recursiveFlags.contains(.hasError)
}
290293

294+
/// Whether this tree's recursive flags contain `hasWarning`, i.e. whether the
/// tree has any tokens with a `LexerError` of severity `warning`.
var hasWarning: Bool {
  raw.recursiveFlags.contains(.hasWarning)
}
299+
291300
/// Whether the `hasSequenceExpr` flag is set in this tree's recursive flags.
292301
var hasSequenceExpr: Bool {
293302
return raw.recursiveFlags.contains(.hasSequenceExpr)

0 commit comments

Comments
 (0)