Skip to content

Commit c04dcf3

Browse files
committed
[String] More efficient breadcrumb-scanning code.
Rather than rely on the UTF16View, scan between breadcrumbs by hand for a decent 20% speedup. This code will also make it more obvious how to slot in a vectorized solution later.
1 parent 3820393 commit c04dcf3

File tree

2 files changed

+86
-31
lines changed

2 files changed

+86
-31
lines changed

stdlib/public/core/StringUTF16View.swift

Lines changed: 44 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -186,8 +186,8 @@ extension String.UTF16View: BidirectionalCollection {
186186
return _foreignIndex(i, offsetBy: n)
187187
}
188188

189-
let lowerOffset = _getOffset(for: i)
190-
let result = _getIndex(for: lowerOffset + n)
189+
let lowerOffset = _nativeGetOffset(for: i)
190+
let result = _nativeGetIndex(for: lowerOffset + n)
191191
return result
192192
}
193193

@@ -199,8 +199,8 @@ extension String.UTF16View: BidirectionalCollection {
199199
return _foreignIndex(i, offsetBy: n, limitedBy: limit)
200200
}
201201

202-
let iOffset = _getOffset(for: i)
203-
let limitOffset = _getOffset(for: limit)
202+
let iOffset = _nativeGetOffset(for: i)
203+
let limitOffset = _nativeGetOffset(for: limit)
204204

205205
// If distance < 0, limit has no effect if it is greater than i.
206206
if _slowPath(n < 0 && limit <= i && limitOffset > iOffset + n) {
@@ -211,7 +211,7 @@ extension String.UTF16View: BidirectionalCollection {
211211
return nil
212212
}
213213

214-
let result = _getIndex(for: iOffset + n)
214+
let result = _nativeGetIndex(for: iOffset + n)
215215
return result
216216
}
217217

@@ -221,8 +221,8 @@ extension String.UTF16View: BidirectionalCollection {
221221
return _foreignDistance(from: start, to: end)
222222
}
223223

224-
let lower = _getOffset(for: start)
225-
let upper = _getOffset(for: end)
224+
let lower = _nativeGetOffset(for: start)
225+
let upper = _nativeGetOffset(for: end)
226226
return upper &- lower
227227
}
228228

@@ -231,7 +231,7 @@ extension String.UTF16View: BidirectionalCollection {
231231
if _slowPath(_guts.isForeign) {
232232
return _foreignCount()
233233
}
234-
return _getOffset(for: endIndex)
234+
return _nativeGetOffset(for: endIndex)
235235
}
236236

237237
/// Accesses the code unit at the given position.
@@ -458,7 +458,7 @@ extension String.UTF16View {
458458

459459
@usableFromInline
460460
@_effects(releasenone)
461-
internal func _getOffset(for idx: Index) -> Int {
461+
internal func _nativeGetOffset(for idx: Index) -> Int {
462462
// Trivial and common: start
463463
if idx == startIndex { return 0 }
464464

@@ -478,7 +478,7 @@ extension String.UTF16View {
478478

479479
@usableFromInline
480480
@_effects(releasenone)
481-
internal func _getIndex(for offset: Int) -> Index {
481+
internal func _nativeGetIndex(for offset: Int) -> Index {
482482
// Trivial and common: start
483483
if offset == 0 { return startIndex }
484484

@@ -493,7 +493,40 @@ extension String.UTF16View {
493493
// Otherwise, find the nearest lower-bound breadcrumb and advance that
494494
let (crumb, remaining) = breadcrumbsPtr.pointee.getBreadcrumb(
495495
forOffset: offset)
496-
return _index(crumb, offsetBy: remaining)
496+
if remaining == 0 { return crumb }
497+
498+
return _guts.withFastUTF8 { utf8 in
499+
var readIdx = crumb.encodedOffset
500+
var readEnd = utf8.count
501+
_sanityCheck(readIdx < readEnd)
502+
503+
var utf16I = 0
504+
let utf16End: Int = remaining
505+
506+
// Adjust for sub-scalar initial transcoding: If we're starting the scan
507+
// at a trailing surrogate, then we set our starting count to be -1 to
508+
// offset counting the leading surrogate.
509+
if crumb.transcodedOffset != 0 {
510+
utf16I = -1
511+
}
512+
513+
while true {
514+
let len = _utf8ScalarLength(utf8[readIdx])
515+
let utf16Len = len == 4 ? 2 : 1
516+
utf16I &+= utf16Len
517+
518+
if utf16I >= utf16End {
519+
// Uncommon: final sub-scalar transcoded offset
520+
if _slowPath(utf16I > utf16End) {
521+
_sanityCheck(utf16Len == 2)
522+
return Index(encodedOffset: readIdx, transcodedOffset: 1)
523+
}
524+
return Index(encodedOffset: readIdx &+ len)
525+
}
526+
527+
readIdx &+= len
528+
}
529+
}
497530
}
498531
}
499532

validation-test/stdlib/StringBreadcrumbs.swift

Lines changed: 42 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -28,42 +28,64 @@ let largeString: String = {
2828
return result
2929
}()
3030

31+
extension FixedWidthInteger {
32+
var hexStr: String { return "0x\(String(self, radix: 16, uppercase: true))" }
33+
}
34+
3135
let StringBreadcrumbsTests = TestSuite("StringBreadcrumbsTests")
3236

33-
StringBreadcrumbsTests.test("largeString") {
34-
var utf16CodeUnits = Array(largeString.utf16)
35-
var utf16Indices = Array(largeString.utf16.indices)
37+
func validateBreadcrumbs(_ str: String) {
38+
var utf16CodeUnits = Array(str.utf16)
39+
var utf16Indices = Array(str.utf16.indices)
3640
var outputBuffer = Array<UInt16>(repeating: 0, count: utf16CodeUnits.count)
3741

3842
for i in 0..<(utf16CodeUnits.count-1) {
3943
for j in (i+1)..<utf16CodeUnits.count {
4044
let range = Range(uncheckedBounds: (i, j))
41-
let indexRange = largeString._toUTF16Indices(range)
4245

43-
// Range<String.Index> from Range<Int>
46+
let indexRange = str._toUTF16Indices(range)
47+
// Range<String.Index> <=> Range<Int>
48+
expectEqual(utf16Indices[i], indexRange.lowerBound)
49+
expectEqual(utf16Indices[j], indexRange.upperBound)
4450
expectEqualSequence(
45-
utf16CodeUnits[i..<j], largeString.utf16[indexRange])
51+
utf16CodeUnits[i..<j], str.utf16[indexRange])
52+
let roundTripOffsets = str._toUTF16Offsets(indexRange)
53+
expectEqualSequence(range, roundTripOffsets)
4654

47-
// Copy characters
55+
// Single Int <=> String.Index
56+
expectEqual(indexRange.lowerBound, str._toUTF16Index(i))
57+
expectEqual(indexRange.upperBound, str._toUTF16Index(j))
58+
expectEqual(i, str._toUTF16Offset(indexRange.lowerBound))
59+
expectEqual(j, str._toUTF16Offset(indexRange.upperBound))
60+
61+
// Copy characters
4862
outputBuffer.withUnsafeMutableBufferPointer {
49-
largeString._copyUTF16CodeUnits(into: $0, range: range)
63+
str._copyUTF16CodeUnits(into: $0, range: range)
5064
}
5165
expectEqualSequence(utf16CodeUnits[i..<j], outputBuffer[..<range.count])
52-
53-
// Range<Int> from Range<String.Index>
54-
let roundTripOffsets = largeString._toUTF16Offsets(indexRange)
55-
expectEqualSequence(range, roundTripOffsets)
56-
57-
// Single Int <=> String.Index
58-
expectEqual(indexRange.lowerBound, largeString._toUTF16Index(i))
59-
expectEqual(indexRange.upperBound, largeString._toUTF16Index(j))
60-
expectEqual(i, largeString._toUTF16Offset(indexRange.lowerBound))
61-
expectEqual(j, largeString._toUTF16Offset(indexRange.upperBound))
6266
}
6367
}
6468
}
6569

66-
// TODO(String testing): hammer breadcrumb boundaries more, maybe internals too
70+
StringBreadcrumbsTests.test("largeString") {
71+
validateBreadcrumbs(largeString)
72+
}
6773

68-
runAllTests()
74+
// Test various boundary conditions with surrogate pairs aligning or not
75+
// aligning
76+
StringBreadcrumbsTests.test("surrogates-heavy") {
77+
let nonBMP = String(repeating: "𓀀", count: 1 + (64 / 2))
6978

79+
// Mis-align the hieroglyphics by 1,2,3 UTF-8 and UTF-16 code units
80+
validateBreadcrumbs(nonBMP)
81+
validateBreadcrumbs("a" + nonBMP)
82+
validateBreadcrumbs("ab" + nonBMP)
83+
validateBreadcrumbs("abc" + nonBMP)
84+
validateBreadcrumbs("é" + nonBMP)
85+
validateBreadcrumbs("" + nonBMP)
86+
validateBreadcrumbs("" + nonBMP)
87+
}
88+
89+
// TODO(String testing): test breadcrumb validity after mutation
90+
91+
runAllTests()

0 commit comments

Comments
 (0)