Skip to content

Commit 82ff647

Browse files
authored
Merge pull request #1274 from ahoppen/ahoppen/lexer-error-offset-relative-to-leading-trivia
Store lexer error offset relative to leading trivia start
2 parents 78d7bc1 + 14e23d5 commit 82ff647

File tree

8 files changed

+148
-91
lines changed

8 files changed

+148
-91
lines changed

Sources/SwiftParser/Lexer/Cursor.swift

Lines changed: 39 additions & 82 deletions
Original file line numberDiff line numberDiff line change
@@ -255,13 +255,15 @@ extension Lexer {
255255
struct Result {
256256
let tokenKind: RawTokenKind
257257
let flags: Lexer.Lexeme.Flags
258-
let error: LexerError?
258+
/// The error kind and the cursor pointing to the character at which the
259+
/// error occurred
260+
let error: (kind: LexerError.Kind, position: Lexer.Cursor)?
259261
let stateTransition: StateTransition?
260262

261263
init(
262264
_ tokenKind: RawTokenKind,
263265
flags: Lexer.Lexeme.Flags = [],
264-
error: LexerError? = nil,
266+
error: (kind: LexerError.Kind, position: Cursor)? = nil,
265267
stateTransition: StateTransition? = nil
266268
) {
267269
self.tokenKind = tokenKind
@@ -335,10 +337,14 @@ extension Lexer.Cursor {
335337
flags.insert(.isAtStartOfLine)
336338
}
337339

340+
let error = result.error.map { error in
341+
return LexerError(error.kind, byteOffset: cursor.distance(to: error.position))
342+
}
343+
338344
return .init(
339345
tokenKind: result.tokenKind,
340346
flags: flags,
341-
error: result.error,
347+
error: error,
342348
start: leadingTriviaStart.pointer,
343349
leadingTriviaLength: leadingTriviaStart.distance(to: textStart),
344350
textLength: textStart.distance(to: trailingTriviaStart),
@@ -668,62 +674,7 @@ extension Lexer.Cursor {
668674
/// that case bytes are consumed until we reach the next start of a UTF-8
669675
/// character.
670676
mutating func advanceValidatingUTF8Character() -> Unicode.Scalar? {
671-
guard let curByte = self.advance() else {
672-
return nil
673-
}
674-
675-
if (curByte < 0x80) {
676-
return Unicode.Scalar(curByte)
677-
}
678-
679-
// Read the number of high bits set, which indicates the number of bytes in
680-
// the character.
681-
let encodedBytes = (~(UInt32(curByte) << 24)).leadingZeroBitCount
682-
683-
// If this is 0b10XXXXXX, then it is a continuation character.
684-
if encodedBytes == 1 || !Unicode.Scalar(curByte).isStartOfUTF8Character {
685-
// Skip until we get the start of another character. This is guaranteed to
686-
// at least stop at the nul at the end of the buffer.
687-
self.advance(while: { !$0.isStartOfUTF8Character })
688-
return nil
689-
}
690-
691-
// Drop the high bits indicating the # bytes of the result.
692-
var charValue = UInt32(curByte << encodedBytes) >> encodedBytes
693-
694-
// Read and validate the continuation bytes.
695-
for _ in 1..<encodedBytes {
696-
guard let curByte = self.peek() else {
697-
return nil
698-
}
699-
// If the high bit isn't set or the second bit isn't clear, then this is not
700-
// a continuation byte!
701-
if (curByte < 0x80 || curByte >= 0xC0) {
702-
return nil
703-
}
704-
705-
// Accumulate our result.
706-
charValue <<= 6
707-
charValue |= UInt32(curByte & 0x3F)
708-
_ = self.advance()
709-
}
710-
711-
// UTF-16 surrogate pair values are not valid code points.
712-
if (charValue >= 0xD800 && charValue <= 0xDFFF) {
713-
return nil
714-
}
715-
716-
// If we got here, we read the appropriate number of accumulated bytes.
717-
// Verify that the encoding was actually minimal.
718-
// Number of bits in the value, ignoring leading zeros.
719-
let numBits = 32 - charValue.leadingZeroBitCount
720-
if numBits <= 5 + 6 {
721-
return encodedBytes == 2 ? Unicode.Scalar(charValue) : nil
722-
}
723-
if numBits <= 4 + 6 + 6 {
724-
return encodedBytes == 3 ? Unicode.Scalar(charValue) : nil
725-
}
726-
return encodedBytes == 4 ? Unicode.Scalar(charValue) : nil
677+
return Unicode.Scalar.lexing(advance: { self.advance() }, peek: { self.peek(at: 0) })
727678
}
728679

729680
/// Revert the lexer by `offset` bytes. This should only be used by `resetForSplit`.
@@ -1194,11 +1145,11 @@ extension Lexer.Cursor {
11941145
let oConsumed = self.advance(matching: "o") // Consume 'o'
11951146
assert(zeroConsumed && oConsumed)
11961147
if let peeked = self.peek(), peeked < UInt8(ascii: "0") || peeked > UInt8(ascii: "7") {
1197-
let errorOffset = tokenStart.distance(to: self)
1148+
let errorPos = self
11981149
self.advance(while: { $0.isValidIdentifierContinuationCodePoint })
11991150
return Lexer.Result(
12001151
.integerLiteral,
1201-
error: LexerError(.invalidOctalDigitInIntegerLiteral, byteOffset: errorOffset)
1152+
error: (.invalidOctalDigitInIntegerLiteral, errorPos)
12021153
)
12031154
}
12041155

@@ -1208,11 +1159,11 @@ extension Lexer.Cursor {
12081159

12091160
let tmp = self
12101161
if self.advance(if: { $0.isValidIdentifierContinuationCodePoint }) {
1211-
let errorOffset = tokenStart.distance(to: tmp)
1162+
let errorPos = tmp
12121163
self.advance(while: { $0.isValidIdentifierContinuationCodePoint })
12131164
return Lexer.Result(
12141165
.integerLiteral,
1215-
error: LexerError(.invalidOctalDigitInIntegerLiteral, byteOffset: errorOffset)
1166+
error: (.invalidOctalDigitInIntegerLiteral, errorPos)
12161167
)
12171168
}
12181169

@@ -1225,11 +1176,11 @@ extension Lexer.Cursor {
12251176
let bConsumed = self.advance(matching: "b") // Consume 'b'
12261177
assert(zeroConsumed && bConsumed)
12271178
if self.is(notAt: "0", "1") {
1228-
let errorOffset = tokenStart.distance(to: self)
1179+
let errorPos = self
12291180
self.advance(while: { $0.isValidIdentifierContinuationCodePoint })
12301181
return Lexer.Result(
12311182
.integerLiteral,
1232-
error: LexerError(.invalidBinaryDigitInIntegerLiteral, byteOffset: errorOffset)
1183+
error: (.invalidBinaryDigitInIntegerLiteral, errorPos)
12331184
)
12341185
}
12351186

@@ -1239,11 +1190,11 @@ extension Lexer.Cursor {
12391190

12401191
let tmp = self
12411192
if self.advance(if: { $0.isValidIdentifierContinuationCodePoint }) {
1242-
let errorOffset = tokenStart.distance(to: tmp)
1193+
let errorPos = tmp
12431194
self.advance(while: { $0.isValidIdentifierContinuationCodePoint })
12441195
return Lexer.Result(
12451196
.integerLiteral,
1246-
error: LexerError(.invalidBinaryDigitInIntegerLiteral, byteOffset: errorOffset)
1197+
error: (.invalidBinaryDigitInIntegerLiteral, errorPos)
12471198
)
12481199
}
12491200

@@ -1268,11 +1219,11 @@ extension Lexer.Cursor {
12681219
// something else, then this is the end of the token.
12691220
let tmp = self
12701221
if self.advance(if: { $0.isValidIdentifierContinuationCodePoint }) {
1271-
let errorOffset = tokenStart.distance(to: tmp)
1222+
let errorPos = tmp
12721223
self.advance(while: { $0.isValidIdentifierContinuationCodePoint })
12731224
return Lexer.Result(
12741225
.integerLiteral,
1275-
error: LexerError(.invalidDecimalDigitInIntegerLiteral, byteOffset: errorOffset)
1226+
error: (.invalidDecimalDigitInIntegerLiteral, errorPos)
12761227
)
12771228
}
12781229

@@ -1305,20 +1256,23 @@ extension Lexer.Cursor {
13051256
errorKind = .expectedDigitInFloatLiteral
13061257
}
13071258

1308-
let errorOffset = tokenStart.distance(to: tmp)
1259+
let errorPos = tmp
13091260
self.advance(while: { $0.isValidIdentifierContinuationCodePoint })
1310-
return Lexer.Result(.floatingLiteral, error: LexerError(errorKind, byteOffset: errorOffset))
1261+
return Lexer.Result(
1262+
.floatingLiteral,
1263+
error: (errorKind, errorPos)
1264+
)
13111265
}
13121266

13131267
self.advance(while: { $0.isDigit || $0 == Unicode.Scalar("_") })
13141268

13151269
let tmp = self
13161270
if self.advance(if: { $0.isValidIdentifierContinuationCodePoint }) {
1317-
let errorOffset = tokenStart.distance(to: tmp)
1271+
let errorPos = tmp
13181272
self.advance(while: { $0.isValidIdentifierContinuationCodePoint })
13191273
return Lexer.Result(
13201274
.floatingLiteral,
1321-
error: LexerError(.invalidFloatingPointExponentDigit, byteOffset: errorOffset)
1275+
error: (.invalidFloatingPointExponentDigit, errorPos)
13221276
)
13231277
}
13241278
}
@@ -1339,11 +1293,11 @@ extension Lexer.Cursor {
13391293
return Lexer.Result(.integerLiteral)
13401294
}
13411295
guard let peeked = self.peek(), Unicode.Scalar(peeked).isHexDigit else {
1342-
let errorOffset = tokStart.distance(to: self)
1296+
let errorPos = self
13431297
self.advance(while: { $0.isValidIdentifierContinuationCodePoint })
13441298
return Lexer.Result(
13451299
.integerLiteral,
1346-
error: LexerError(.invalidHexDigitInIntegerLiteral, byteOffset: errorOffset)
1300+
error: (.invalidHexDigitInIntegerLiteral, errorPos)
13471301
)
13481302
}
13491303

@@ -1352,11 +1306,11 @@ extension Lexer.Cursor {
13521306
if self.isAtEndOfFile || self.is(notAt: ".", "p", "P") {
13531307
let tmp = self
13541308
if self.advance(if: { $0.isValidIdentifierContinuationCodePoint }) {
1355-
let errorOffset = tokStart.distance(to: tmp)
1309+
let errorPos = tmp
13561310
self.advance(while: { $0.isValidIdentifierContinuationCodePoint })
13571311
return Lexer.Result(
13581312
.integerLiteral,
1359-
error: LexerError(.invalidHexDigitInIntegerLiteral, byteOffset: errorOffset)
1313+
error: (.invalidHexDigitInIntegerLiteral, errorPos)
13601314
)
13611315
} else {
13621316
return Lexer.Result(.integerLiteral)
@@ -1385,7 +1339,7 @@ extension Lexer.Cursor {
13851339
}
13861340
return Lexer.Result(
13871341
.integerLiteral,
1388-
error: LexerError(.expectedBinaryExponentInHexFloatLiteral, byteOffset: tokStart.distance(to: self))
1342+
error: (.expectedBinaryExponentInHexFloatLiteral, self)
13891343
)
13901344
}
13911345
} else {
@@ -1424,20 +1378,23 @@ extension Lexer.Cursor {
14241378
} else {
14251379
errorKind = .expectedDigitInFloatLiteral
14261380
}
1427-
let errorOffset = tokStart.distance(to: tmp)
1381+
let errorPos = tmp
14281382
self.advance(while: { $0.isValidIdentifierContinuationCodePoint })
1429-
return Lexer.Result(.floatingLiteral, error: LexerError(errorKind, byteOffset: errorOffset))
1383+
return Lexer.Result(
1384+
.floatingLiteral,
1385+
error: (errorKind, errorPos)
1386+
)
14301387
}
14311388

14321389
self.advance(while: { $0.isDigit || $0 == Unicode.Scalar("_") })
14331390

14341391
let tmp = self
14351392
if self.advance(if: { $0.isValidIdentifierContinuationCodePoint }) {
1436-
let errorOffset = tokStart.distance(to: tmp)
1393+
let errorPos = tmp
14371394
self.advance(while: { $0.isValidIdentifierContinuationCodePoint })
14381395
return Lexer.Result(
14391396
.floatingLiteral,
1440-
error: LexerError(.invalidFloatingPointExponentDigit, byteOffset: errorOffset)
1397+
error: (.invalidFloatingPointExponentDigit, errorPos)
14411398
)
14421399
}
14431400
return Lexer.Result(.floatingLiteral)

Sources/SwiftParser/Lexer/UnicodeScalarExtensions.swift

Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -157,3 +157,91 @@ extension Unicode.Scalar {
157157
return self.value <= 0x80 || (self.value >= 0xC2 && self.value < 0xF5)
158158
}
159159
}
160+
161+
extension Unicode.Scalar {
162+
/// Lex a single unicode scalar, which might consist of multiple bytes.
///
/// - Parameters:
///   - advance: Returns the current byte in the lexer and advances the lexer
///     by one byte, or returns `nil` if there is no byte left.
///   - peek: Returns the current byte in the lexer without advancing it, or
///     `nil` if there is no byte left.
/// - Returns: The decoded scalar, or `nil` if the input is exhausted or the
///   bytes at the current position are not a valid, minimally-encoded UTF-8
///   sequence. When the first byte cannot start a character, all following
///   bytes that cannot start a character either are consumed as well, so the
///   next call begins at a potential character start. When a continuation
///   byte is missing or malformed, the offending byte is left unconsumed.
@inline(__always)
static func lexing(advance: () -> UInt8?, peek: () -> UInt8?) -> Self? {
  guard let curByte = advance() else {
    return nil
  }

  // ASCII fast path: a single byte encodes the scalar directly.
  if curByte < 0x80 {
    return Unicode.Scalar(curByte)
  }

  // Read the number of high bits set, which indicates the number of bytes in
  // the character.
  let encodedBytes = (~(UInt32(curByte) << 24)).leadingZeroBitCount

  // If this is 0b10XXXXXX, then it is a continuation character.
  if encodedBytes == 1 || !Unicode.Scalar(curByte).isStartOfUTF8Character {
    // Skip until we get the start of another character. The loop must keep
    // going while the upcoming byte is NOT a character start; it ends at the
    // first byte that can start a character or when `peek` reports that no
    // byte is left.
    while let peeked = peek(), !Unicode.Scalar(peeked).isStartOfUTF8Character {
      _ = advance()
    }
    return nil
  }

  // Drop the high bits indicating the # bytes of the result.
  var charValue = UInt32(curByte << encodedBytes) >> encodedBytes

  // Read and validate the continuation bytes.
  for _ in 1..<encodedBytes {
    guard let continuationByte = peek() else {
      return nil
    }
    // If the high bit isn't set or the second bit isn't clear, then this is not
    // a continuation byte!
    if continuationByte < 0x80 || continuationByte >= 0xC0 {
      return nil
    }

    // Accumulate our result.
    charValue <<= 6
    charValue |= UInt32(continuationByte & 0x3F)
    _ = advance()
  }

  // UTF-16 surrogate pair values are not valid code points.
  if charValue >= 0xD800 && charValue <= 0xDFFF {
    return nil
  }

  // If we got here, we read the appropriate number of accumulated bytes.
  // Verify that the encoding was actually minimal.
  // Number of bits in the value, ignoring leading zeros.
  let numBits = 32 - charValue.leadingZeroBitCount
  if numBits <= 5 + 6 {
    return encodedBytes == 2 ? Unicode.Scalar(charValue) : nil
  }
  if numBits <= 4 + 6 + 6 {
    return encodedBytes == 3 ? Unicode.Scalar(charValue) : nil
  }
  return encodedBytes == 4 ? Unicode.Scalar(charValue) : nil
}
227+
228+
/// Returns the first unicode scalar in `byteSequence`, which may span multiple bytes.
///
/// - Parameter byteSequence: The bytes to decode, starting at the collection's
///   `startIndex`.
/// - Returns: The scalar produced by `lexing(advance:peek:)` for the leading
///   bytes of `byteSequence`, or `nil` if the collection is empty or that
///   function rejects the bytes.
public static func lexing<S: Collection>(from byteSequence: S) -> Self? where S.Element == UInt8 {
  var index = byteSequence.startIndex
  let peek = { () -> UInt8? in
    index < byteSequence.endIndex ? byteSequence[index] : nil
  }
  let advance = { () -> UInt8? in
    // Never move past `endIndex`: `Collection.index(after:)` requires an
    // index that is less than `endIndex`, so advancing at the end of the
    // input (e.g. for an empty collection) must be a no-op.
    guard index < byteSequence.endIndex else {
      return nil
    }
    defer {
      index = byteSequence.index(after: index)
    }
    return byteSequence[index]
  }

  return self.lexing(advance: advance, peek: peek)
}
247+
}

Sources/SwiftParser/Parser.swift

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -512,7 +512,7 @@ extension Parser {
512512

513513
let endIndex = current.textRange.lowerBound.advanced(by: prefix.count)
514514
var lexerError = current.error
515-
if let error = lexerError, error.byteOffset > prefix.count {
515+
if let error = lexerError, error.byteOffset > prefix.count + current.leadingTriviaByteLength {
516516
// The lexer error isn't in the prefix. Drop it.
517517
lexerError = nil
518518
}

Sources/SwiftParserDiagnostics/LexerDiagnosticMessages.swift

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -98,9 +98,11 @@ public extension SwiftSyntax.LexerError {
9898
/// `tokenText` is the entire text of the token in which the `LexerError`
9999
/// occurred, including trivia.
100100
@_spi(RawSyntax)
101-
func diagnostic(tokenText: SyntaxText) -> DiagnosticMessage {
101+
func diagnostic(wholeTextBytes: [UInt8]) -> DiagnosticMessage {
102102
var scalarAtErrorOffset: UnicodeScalar {
103-
Unicode.Scalar(tokenText[Int(self.byteOffset)])
103+
// Fall back to the Unicode replacement character U+FFFD in case we can't
104+
// lex the unicode character at `byteOffset`. It's the best we can do
105+
Unicode.Scalar.lexing(from: wholeTextBytes[Int(self.byteOffset)...]) ?? UnicodeScalar("\u{FFFD}")
104106
}
105107

106108
switch self.kind {
@@ -130,6 +132,6 @@ public extension SwiftSyntax.LexerError {
130132
}
131133

132134
func diagnostic(in token: TokenSyntax) -> DiagnosticMessage {
133-
return self.diagnostic(tokenText: token.tokenView.rawText)
135+
return self.diagnostic(wholeTextBytes: token.syntaxTextBytes)
134136
}
135137
}

Sources/SwiftParserDiagnostics/ParseDiagnosticsGenerator.swift

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -338,7 +338,7 @@ public class ParseDiagnosticsGenerator: SyntaxAnyVisitor {
338338
handleMissingToken(token)
339339
} else {
340340
if let lexerError = token.lexerError {
341-
self.addDiagnostic(token, position: token.positionAfterSkippingLeadingTrivia.advanced(by: Int(lexerError.byteOffset)), lexerError.diagnostic(in: token))
341+
self.addDiagnostic(token, position: token.position.advanced(by: Int(lexerError.byteOffset)), lexerError.diagnostic(in: token))
342342
}
343343
}
344344

0 commit comments

Comments
 (0)