@@ -59,7 +59,6 @@ case emptyInput
59
59
}
60
60
}
61
61
62
-
63
62
extension UTF8 {
64
63
enum Classification : UInt8 {
65
64
@inline ( __always)
@@ -101,6 +100,9 @@ extension UTF8 {
101
100
return isValid3BytePrefix ( c0, c1) || isValid4BytePrefix ( c0, c1)
102
101
}
103
102
103
+ // FIXME: Benchmark parse1ForwardOpenCoded vs parse1Forward and decide which
104
+ // implementation strategy to keep.
105
+
104
106
/// Parses one scalar forward from `input`.
105
107
///
106
108
/// - Parameters:
@@ -114,7 +116,7 @@ extension UTF8 {
114
116
/// - knownCountExceeds3: true if and only if the input is known to be at least
115
117
/// 4 elements long. If so, we can skip end checks. Note: pass a
116
118
/// compile-time constant here or you will just slow the algorithm down!
117
- static func parseForward < C: Collection > (
119
+ static func parse1ForwardOpenCoded < C: Collection > (
118
120
_ input: C ,
119
121
knownValid: Bool = false ,
120
122
knownCountExceeds3: Bool = false
@@ -212,16 +214,20 @@ extension UTF8 {
212
214
return UInt8 ( _leadingZeros ( ( ~ x) . _value) )
213
215
}
214
216
215
- static func maskLeading1s( _ x: UInt8 ) -> UInt8 {
216
- return x & ( ( 1 << ( 7 &- leading1s ( x) ) ) - 1 )
217
/// Extracts the payload bits from the lead byte of a multibyte UTF-8
/// sequence by clearing its leading run of 1 bits.
///
/// - Parameter x: A valid leading byte of a multibyte sequence.
/// - Returns: `x` with its leading 1 bits cleared.
/// - Note: Given any other byte, the result is unspecified.
static func maskLeadByte(_ x: UInt8) -> UInt8 {
  // Bit 5 is clear for a 2-byte lead (110xxxxx) and set for 3- and 4-byte
  // leads (1110xxxx, 11110xxx), so it tells us whether to keep the low five
  // bits or only the low four.
  let narrowing = (x >> 5) & 1
  // For a 4-byte lead the 4-bit mask still retains bit 3, which is already
  // zero in any valid 11110xxx byte, so no third mask width is needed.
  let payloadMask: UInt8 = 0b11111 >> narrowing
  return x & payloadMask
}
218
224
219
225
/// Parses one scalar forward from `input`.
220
226
///
221
227
/// - Parameter knownCountExceeds3: true if and only if the input is known
222
228
/// be at least 4 elements long. If so, we can skip end checks. Note: pass
223
229
/// a compile-time constant here or you will just slow the algorithm down!
224
- static func parseForwardStateMachine < C: Collection > (
230
+ static func parse1Forward < C: Collection > (
225
231
_ input: C , knownCountExceeds3: Bool = false
226
232
) -> ParseResult < UInt32 , C . Index >
227
233
where C. Iterator. Element == UTF8 . CodeUnit {
@@ -246,7 +252,7 @@ extension UTF8 {
246
252
i = j // even if there are errors, we eat 1 byte
247
253
248
254
// Begin accumulating result
249
- var r = UInt32 ( maskLeading1s ( u0) )
255
+ var r = UInt32 ( maskLeadByte ( u0) )
250
256
251
257
252
258
// Mark one more token recognized and get the next lookahead token iff it
@@ -309,7 +315,7 @@ extension UTF8 {
309
315
/// - Parameter knownCountExceeds3: true if and only if the input is known
310
316
/// be at least 4 elements long. If so, we can skip end checks. Note: pass
311
317
/// a compile-time constant here or you will just slow the algorithm down!
312
- static func parseReverseStateMachine < C: BidirectionalCollection > (
318
+ static func parse1Reverse < C: BidirectionalCollection > (
313
319
_ input: C , knownCountExceeds3: Bool = false
314
320
) -> ParseResult < UInt32 , C . Index >
315
321
where C. Iterator. Element == UTF8 . CodeUnit ,
@@ -352,7 +358,7 @@ extension UTF8 {
352
358
@inline ( __always)
353
359
func accept( _ pat: ClosedRange < UInt8 > ) -> ParseResult < UInt32 , C . Index > ? {
354
360
if _fastPath ( pat. contains ( u) ) {
355
- r |= UInt32 ( maskLeading1s ( u) ) << shift
361
+ r |= UInt32 ( maskLeadByte ( u) ) << shift
356
362
return . valid( r, resumptionPoint: j)
357
363
}
358
364
return nil
@@ -556,17 +562,12 @@ struct TestUTF8 : UnicodeCodec {
556
562
while let x = input. next ( ) { buffer. append ( x) }
557
563
558
564
UTF8 . parseForward ( buffer, using: TestUTF8 . parse) {
559
- print ( $0. valid. map { String ( $0, radix: 16 ) } ?? String ( describing: $0) , terminator: " " )
560
565
results. append (
561
566
$0. valid. map { . scalarValue( UnicodeScalar ( $0) !) } ?? . error
562
567
)
563
568
}
564
- print ( )
565
- print ( " input: \( buffer. map { String ( $0, radix: 16 ) } ) " )
566
- print ( " output: \( results) " )
567
569
}
568
- if let next = results. popFirst ( ) { return next }
569
- return . emptyInput
570
+ return results. popFirst ( ) ?? . emptyInput
570
571
}
571
572
572
573
/// Encodes a Unicode scalar as a series of code units by calling the given
@@ -643,7 +644,7 @@ func addUTF8Suite(name: String, parser: @escaping UTF8ParseFunction) {
643
644
let ret = checkDecodeUTF ( TestUTF8 . self, expectedHead, expectedRepairedTail, utf8Str)
644
645
645
646
var reverseResult : [ UInt32 ] = [ ]
646
- UTF8 . parseReverse ( utf8Str, using: UTF8 . parseReverseStateMachine ) {
647
+ UTF8 . parseReverse ( utf8Str, using: UTF8 . parse1Reverse ) {
647
648
reverseResult. append ( $0. valid ?? 0xFFFD )
648
649
}
649
650
let expected = expectedHead + expectedRepairedTail
@@ -1731,11 +1732,11 @@ func addUTF8Suite(name: String, parser: @escaping UTF8ParseFunction) {
1731
1732
1732
1733
addUTF8Suite (
1733
1734
name: " OpenCodedUTF8Decoder " ,
1734
- parser: { UTF8 . parseForward ( $0, knownCountExceeds3: $1) } )
1735
+ parser: { UTF8 . parse1ForwardOpenCoded ( $0, knownCountExceeds3: $1) } )
1735
1736
1736
1737
addUTF8Suite (
1737
1738
name: " UnrolledStateMachineUTF8Decoder " ,
1738
- parser: UTF8 . parseForwardStateMachine )
1739
+ parser: UTF8 . parse1Forward )
1739
1740
1740
1741
runAllTests ( )
1741
1742
0 commit comments