@@ -255,13 +255,15 @@ extension Lexer {
255
255
struct Result {
256
256
let tokenKind : RawTokenKind
257
257
let flags : Lexer . Lexeme . Flags
258
- let error : LexerError ?
258
+ /// The error kind and the cursor pointing to the character at which the
259
+ /// error occurred
260
+ let error : ( kind: LexerError . Kind , position: Lexer . Cursor ) ?
259
261
let stateTransition : StateTransition ?
260
262
261
263
init (
262
264
_ tokenKind: RawTokenKind ,
263
265
flags: Lexer . Lexeme . Flags = [ ] ,
264
- error: LexerError ? = nil ,
266
+ error: ( kind : LexerError . Kind , position : Cursor ) ? = nil ,
265
267
stateTransition: StateTransition ? = nil
266
268
) {
267
269
self . tokenKind = tokenKind
@@ -335,10 +337,14 @@ extension Lexer.Cursor {
335
337
flags. insert ( . isAtStartOfLine)
336
338
}
337
339
340
+ let error = result. error. map { error in
341
+ return LexerError ( error. kind, byteOffset: cursor. distance ( to: error. position) )
342
+ }
343
+
338
344
return . init(
339
345
tokenKind: result. tokenKind,
340
346
flags: flags,
341
- error: result . error,
347
+ error: error,
342
348
start: leadingTriviaStart. pointer,
343
349
leadingTriviaLength: leadingTriviaStart. distance ( to: textStart) ,
344
350
textLength: textStart. distance ( to: trailingTriviaStart) ,
@@ -668,62 +674,7 @@ extension Lexer.Cursor {
668
674
/// that case bytes are consumed until we reach the next start of a UTF-8
669
675
/// character.
670
676
mutating func advanceValidatingUTF8Character( ) -> Unicode . Scalar ? {
671
- guard let curByte = self . advance ( ) else {
672
- return nil
673
- }
674
-
675
- if ( curByte < 0x80 ) {
676
- return Unicode . Scalar ( curByte)
677
- }
678
-
679
- // Read the number of high bits set, which indicates the number of bytes in
680
- // the character.
681
- let encodedBytes = ( ~ ( UInt32 ( curByte) << 24 ) ) . leadingZeroBitCount
682
-
683
- // If this is 0b10XXXXXX, then it is a continuation character.
684
- if encodedBytes == 1 || !Unicode. Scalar ( curByte) . isStartOfUTF8Character {
685
- // Skip until we get the start of another character. This is guaranteed to
686
- // at least stop at the nul at the end of the buffer.
687
- self . advance ( while: { !$0. isStartOfUTF8Character } )
688
- return nil
689
- }
690
-
691
- // Drop the high bits indicating the # bytes of the result.
692
- var charValue = UInt32 ( curByte << encodedBytes) >> encodedBytes
693
-
694
- // Read and validate the continuation bytes.
695
- for _ in 1 ..< encodedBytes {
696
- guard let curByte = self . peek ( ) else {
697
- return nil
698
- }
699
- // If the high bit isn't set or the second bit isn't clear, then this is not
700
- // a continuation byte!
701
- if ( curByte < 0x80 || curByte >= 0xC0 ) {
702
- return nil
703
- }
704
-
705
- // Accumulate our result.
706
- charValue <<= 6
707
- charValue |= UInt32 ( curByte & 0x3F )
708
- _ = self . advance ( )
709
- }
710
-
711
- // UTF-16 surrogate pair values are not valid code points.
712
- if ( charValue >= 0xD800 && charValue <= 0xDFFF ) {
713
- return nil
714
- }
715
-
716
- // If we got here, we read the appropriate number of accumulated bytes.
717
- // Verify that the encoding was actually minimal.
718
- // Number of bits in the value, ignoring leading zeros.
719
- let numBits = 32 - charValue. leadingZeroBitCount
720
- if numBits <= 5 + 6 {
721
- return encodedBytes == 2 ? Unicode . Scalar ( charValue) : nil
722
- }
723
- if numBits <= 4 + 6 + 6 {
724
- return encodedBytes == 3 ? Unicode . Scalar ( charValue) : nil
725
- }
726
- return encodedBytes == 4 ? Unicode . Scalar ( charValue) : nil
677
+ return Unicode . Scalar. lexing ( advance: { self . advance ( ) } , peek: { self . peek ( at: 0 ) } )
727
678
}
728
679
729
680
/// Revert the lexer by `offset` bytes. This should only be used by `resetForSplit`.
@@ -1194,11 +1145,11 @@ extension Lexer.Cursor {
1194
1145
let oConsumed = self . advance ( matching: " o " ) // Consome 'o'
1195
1146
assert ( zeroConsumed && oConsumed)
1196
1147
if let peeked = self . peek ( ) , peeked < UInt8 ( ascii: " 0 " ) || peeked > UInt8 ( ascii: " 7 " ) {
1197
- let errorOffset = tokenStart . distance ( to : self )
1148
+ let errorPos = self
1198
1149
self . advance ( while: { $0. isValidIdentifierContinuationCodePoint } )
1199
1150
return Lexer . Result (
1200
1151
. integerLiteral,
1201
- error: LexerError ( . invalidOctalDigitInIntegerLiteral, byteOffset : errorOffset )
1152
+ error: ( . invalidOctalDigitInIntegerLiteral, errorPos )
1202
1153
)
1203
1154
}
1204
1155
@@ -1208,11 +1159,11 @@ extension Lexer.Cursor {
1208
1159
1209
1160
let tmp = self
1210
1161
if self . advance ( if: { $0. isValidIdentifierContinuationCodePoint } ) {
1211
- let errorOffset = tokenStart . distance ( to : tmp)
1162
+ let errorPos = tmp
1212
1163
self . advance ( while: { $0. isValidIdentifierContinuationCodePoint } )
1213
1164
return Lexer . Result (
1214
1165
. integerLiteral,
1215
- error: LexerError ( . invalidOctalDigitInIntegerLiteral, byteOffset : errorOffset )
1166
+ error: ( . invalidOctalDigitInIntegerLiteral, errorPos )
1216
1167
)
1217
1168
}
1218
1169
@@ -1225,11 +1176,11 @@ extension Lexer.Cursor {
1225
1176
let bConsumed = self . advance ( matching: " b " ) // Consume 'b'
1226
1177
assert ( zeroConsumed && bConsumed)
1227
1178
if self . is ( notAt: " 0 " , " 1 " ) {
1228
- let errorOffset = tokenStart . distance ( to : self )
1179
+ let errorPos = self
1229
1180
self . advance ( while: { $0. isValidIdentifierContinuationCodePoint } )
1230
1181
return Lexer . Result (
1231
1182
. integerLiteral,
1232
- error: LexerError ( . invalidBinaryDigitInIntegerLiteral, byteOffset : errorOffset )
1183
+ error: ( . invalidBinaryDigitInIntegerLiteral, errorPos )
1233
1184
)
1234
1185
}
1235
1186
@@ -1239,11 +1190,11 @@ extension Lexer.Cursor {
1239
1190
1240
1191
let tmp = self
1241
1192
if self . advance ( if: { $0. isValidIdentifierContinuationCodePoint } ) {
1242
- let errorOffset = tokenStart . distance ( to : tmp)
1193
+ let errorPos = tmp
1243
1194
self . advance ( while: { $0. isValidIdentifierContinuationCodePoint } )
1244
1195
return Lexer . Result (
1245
1196
. integerLiteral,
1246
- error: LexerError ( . invalidBinaryDigitInIntegerLiteral, byteOffset : errorOffset )
1197
+ error: ( . invalidBinaryDigitInIntegerLiteral, errorPos )
1247
1198
)
1248
1199
}
1249
1200
@@ -1268,11 +1219,11 @@ extension Lexer.Cursor {
1268
1219
// something else, then this is the end of the token.
1269
1220
let tmp = self
1270
1221
if self . advance ( if: { $0. isValidIdentifierContinuationCodePoint } ) {
1271
- let errorOffset = tokenStart . distance ( to : tmp)
1222
+ let errorPos = tmp
1272
1223
self . advance ( while: { $0. isValidIdentifierContinuationCodePoint } )
1273
1224
return Lexer . Result (
1274
1225
. integerLiteral,
1275
- error: LexerError ( . invalidDecimalDigitInIntegerLiteral, byteOffset : errorOffset )
1226
+ error: ( . invalidDecimalDigitInIntegerLiteral, errorPos )
1276
1227
)
1277
1228
}
1278
1229
@@ -1305,20 +1256,23 @@ extension Lexer.Cursor {
1305
1256
errorKind = . expectedDigitInFloatLiteral
1306
1257
}
1307
1258
1308
- let errorOffset = tokenStart . distance ( to : tmp)
1259
+ let errorPos = tmp
1309
1260
self . advance ( while: { $0. isValidIdentifierContinuationCodePoint } )
1310
- return Lexer . Result ( . floatingLiteral, error: LexerError ( errorKind, byteOffset: errorOffset) )
1261
+ return Lexer . Result (
1262
+ . floatingLiteral,
1263
+ error: ( errorKind, errorPos)
1264
+ )
1311
1265
}
1312
1266
1313
1267
self . advance ( while: { $0. isDigit || $0 == Unicode . Scalar ( " _ " ) } )
1314
1268
1315
1269
let tmp = self
1316
1270
if self . advance ( if: { $0. isValidIdentifierContinuationCodePoint } ) {
1317
- let errorOffset = tokenStart . distance ( to : tmp)
1271
+ let errorPos = tmp
1318
1272
self . advance ( while: { $0. isValidIdentifierContinuationCodePoint } )
1319
1273
return Lexer . Result (
1320
1274
. floatingLiteral,
1321
- error: LexerError ( . invalidFloatingPointExponentDigit, byteOffset : errorOffset )
1275
+ error: ( . invalidFloatingPointExponentDigit, errorPos )
1322
1276
)
1323
1277
}
1324
1278
}
@@ -1339,11 +1293,11 @@ extension Lexer.Cursor {
1339
1293
return Lexer . Result ( . integerLiteral)
1340
1294
}
1341
1295
guard let peeked = self . peek ( ) , Unicode . Scalar ( peeked) . isHexDigit else {
1342
- let errorOffset = tokStart . distance ( to : self )
1296
+ let errorPos = self
1343
1297
self . advance ( while: { $0. isValidIdentifierContinuationCodePoint } )
1344
1298
return Lexer . Result (
1345
1299
. integerLiteral,
1346
- error: LexerError ( . invalidHexDigitInIntegerLiteral, byteOffset : errorOffset )
1300
+ error: ( . invalidHexDigitInIntegerLiteral, errorPos )
1347
1301
)
1348
1302
}
1349
1303
@@ -1352,11 +1306,11 @@ extension Lexer.Cursor {
1352
1306
if self . isAtEndOfFile || self . is ( notAt: " . " , " p " , " P " ) {
1353
1307
let tmp = self
1354
1308
if self . advance ( if: { $0. isValidIdentifierContinuationCodePoint } ) {
1355
- let errorOffset = tokStart . distance ( to : tmp)
1309
+ let errorPos = tmp
1356
1310
self . advance ( while: { $0. isValidIdentifierContinuationCodePoint } )
1357
1311
return Lexer . Result (
1358
1312
. integerLiteral,
1359
- error: LexerError ( . invalidHexDigitInIntegerLiteral, byteOffset : errorOffset )
1313
+ error: ( . invalidHexDigitInIntegerLiteral, errorPos )
1360
1314
)
1361
1315
} else {
1362
1316
return Lexer . Result ( . integerLiteral)
@@ -1385,7 +1339,7 @@ extension Lexer.Cursor {
1385
1339
}
1386
1340
return Lexer . Result (
1387
1341
. integerLiteral,
1388
- error: LexerError ( . expectedBinaryExponentInHexFloatLiteral, byteOffset : tokStart . distance ( to : self ) )
1342
+ error: ( . expectedBinaryExponentInHexFloatLiteral, self )
1389
1343
)
1390
1344
}
1391
1345
} else {
@@ -1424,20 +1378,23 @@ extension Lexer.Cursor {
1424
1378
} else {
1425
1379
errorKind = . expectedDigitInFloatLiteral
1426
1380
}
1427
- let errorOffset = tokStart . distance ( to : tmp)
1381
+ let errorPos = tmp
1428
1382
self . advance ( while: { $0. isValidIdentifierContinuationCodePoint } )
1429
- return Lexer . Result ( . floatingLiteral, error: LexerError ( errorKind, byteOffset: errorOffset) )
1383
+ return Lexer . Result (
1384
+ . floatingLiteral,
1385
+ error: ( errorKind, errorPos)
1386
+ )
1430
1387
}
1431
1388
1432
1389
self . advance ( while: { $0. isDigit || $0 == Unicode . Scalar ( " _ " ) } )
1433
1390
1434
1391
let tmp = self
1435
1392
if self . advance ( if: { $0. isValidIdentifierContinuationCodePoint } ) {
1436
- let errorOffset = tokStart . distance ( to : tmp)
1393
+ let errorPos = tmp
1437
1394
self . advance ( while: { $0. isValidIdentifierContinuationCodePoint } )
1438
1395
return Lexer . Result (
1439
1396
. floatingLiteral,
1440
- error: LexerError ( . invalidFloatingPointExponentDigit, byteOffset : errorOffset )
1397
+ error: ( . invalidFloatingPointExponentDigit, errorPos )
1441
1398
)
1442
1399
}
1443
1400
return Lexer . Result ( . floatingLiteral)
0 commit comments