Skip to content

Commit 497d481

Browse files
author
Dave Abrahams
committed
[stdlib] Bidirectional UTF-8 Prototype: tweaks/cleanups
Some small optimizations and reorganization.
1 parent 6e5127e commit 497d481

File tree

1 file changed

+18
-17
lines changed

1 file changed

+18
-17
lines changed

test/Prototypes/UTF8.swift

Lines changed: 18 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,6 @@ case emptyInput
5959
}
6060
}
6161

62-
6362
extension UTF8 {
6463
enum Classification : UInt8 {
6564
@inline(__always)
@@ -101,6 +100,9 @@ extension UTF8 {
101100
return isValid3BytePrefix(c0, c1) || isValid4BytePrefix(c0, c1)
102101
}
103102

103+
// FIXME: Benchmark parse1ForwardOpenCoded vs parse1Forward and decide which
104+
// implementation strategy to keep.
105+
104106
/// Parses one scalar forward from `input`.
105107
///
106108
/// - Parameters:
@@ -114,7 +116,7 @@ extension UTF8 {
114116
/// - knownCountExceeds3: true if and only if the input is known to be at least
115117
/// 4 elements long. If so, we can skip end checks. Note: pass a
116118
/// compile-time constant here or you will just slow the algorithm down!
117-
static func parseForward<C: Collection>(
119+
static func parse1ForwardOpenCoded<C: Collection>(
118120
_ input: C,
119121
knownValid: Bool = false,
120122
knownCountExceeds3: Bool = false
@@ -212,16 +214,20 @@ extension UTF8 {
212214
return UInt8(_leadingZeros((~x)._value))
213215
}
214216

215-
static func maskLeading1s(_ x: UInt8) -> UInt8 {
216-
return x & ((1 << (7 &- leading1s(x))) - 1)
217+
/// Given a valid leading byte of a multibyte sequence, strip the leading 1
218+
/// bits.
219+
///
220+
/// - Note: Given any other byte, the result is unspecified.
221+
static func maskLeadByte(_ x: UInt8) -> UInt8 {
222+
return x & (0b11111 >> (x >> 5 & 1))
217223
}
218224

219225
/// Parses one scalar forward from `input`.
220226
///
221227
/// - Parameter knownCountExceeds3: true if and only if the input is known
222228
/// to be at least 4 elements long. If so, we can skip end checks. Note: pass
223229
/// a compile-time constant here or you will just slow the algorithm down!
224-
static func parseForwardStateMachine<C: Collection>(
230+
static func parse1Forward<C: Collection>(
225231
_ input: C, knownCountExceeds3: Bool = false
226232
) -> ParseResult<UInt32, C.Index>
227233
where C.Iterator.Element == UTF8.CodeUnit {
@@ -246,7 +252,7 @@ extension UTF8 {
246252
i = j // even if there are errors, we eat 1 byte
247253

248254
// Begin accumulating result
249-
var r = UInt32(maskLeading1s(u0))
255+
var r = UInt32(maskLeadByte(u0))
250256

251257

252258
// Mark one more token recognized and get the next lookahead token iff it
@@ -309,7 +315,7 @@ extension UTF8 {
309315
/// - Parameter knownCountExceeds3: true if and only if the input is known
310316
/// to be at least 4 elements long. If so, we can skip end checks. Note: pass
311317
/// a compile-time constant here or you will just slow the algorithm down!
312-
static func parseReverseStateMachine<C: BidirectionalCollection>(
318+
static func parse1Reverse<C: BidirectionalCollection>(
313319
_ input: C, knownCountExceeds3: Bool = false
314320
) -> ParseResult<UInt32, C.Index>
315321
where C.Iterator.Element == UTF8.CodeUnit,
@@ -352,7 +358,7 @@ extension UTF8 {
352358
@inline(__always)
353359
func accept(_ pat: ClosedRange<UInt8>) -> ParseResult<UInt32, C.Index>? {
354360
if _fastPath(pat.contains(u)) {
355-
r |= UInt32(maskLeading1s(u)) << shift
361+
r |= UInt32(maskLeadByte(u)) << shift
356362
return .valid(r, resumptionPoint: j)
357363
}
358364
return nil
@@ -556,17 +562,12 @@ struct TestUTF8 : UnicodeCodec {
556562
while let x = input.next() { buffer.append(x) }
557563

558564
UTF8.parseForward(buffer, using: TestUTF8.parse) {
559-
print($0.valid.map { String($0, radix: 16) } ?? String(describing: $0), terminator: " ")
560565
results.append(
561566
$0.valid.map { .scalarValue(UnicodeScalar($0)!) } ?? .error
562567
)
563568
}
564-
print()
565-
print("input: \(buffer.map { String($0, radix: 16) })")
566-
print("output: \(results)")
567569
}
568-
if let next = results.popFirst() { return next }
569-
return .emptyInput
570+
return results.popFirst() ?? .emptyInput
570571
}
571572

572573
/// Encodes a Unicode scalar as a series of code units by calling the given
@@ -643,7 +644,7 @@ func addUTF8Suite(name: String, parser: @escaping UTF8ParseFunction) {
643644
let ret = checkDecodeUTF(TestUTF8.self, expectedHead, expectedRepairedTail, utf8Str)
644645

645646
var reverseResult: [UInt32] = []
646-
UTF8.parseReverse(utf8Str, using: UTF8.parseReverseStateMachine) {
647+
UTF8.parseReverse(utf8Str, using: UTF8.parse1Reverse) {
647648
reverseResult.append($0.valid ?? 0xFFFD)
648649
}
649650
let expected = expectedHead + expectedRepairedTail
@@ -1731,11 +1732,11 @@ func addUTF8Suite(name: String, parser: @escaping UTF8ParseFunction) {
17311732

17321733
addUTF8Suite(
17331734
name: "OpenCodedUTF8Decoder",
1734-
parser: { UTF8.parseForward($0, knownCountExceeds3: $1) })
1735+
parser: { UTF8.parse1ForwardOpenCoded($0, knownCountExceeds3: $1) })
17351736

17361737
addUTF8Suite(
17371738
name: "UnrolledStateMachineUTF8Decoder",
1738-
parser: UTF8.parseForwardStateMachine)
1739+
parser: UTF8.parse1Forward)
17391740

17401741
runAllTests()
17411742

0 commit comments

Comments
 (0)