Skip to content

Commit 4ffc5fe

Browse files
authored
Merge pull request #62717 from lorentey/string-utf16-speedup
[stdlib] Speed up short UTF-16 distance calculations
2 parents e7fc160 + e46f8f8 commit 4ffc5fe

File tree

4 files changed

+226
-20
lines changed

4 files changed

+226
-20
lines changed

stdlib/public/core/StringBreadcrumbs.swift

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,8 @@
1313

1414
// @opaque
1515
internal final class _StringBreadcrumbs {
16+
/// The distance between successive breadcrumbs, measured in UTF-16 code
17+
/// units.
1618
internal static var breadcrumbStride: Int { 64 }
1719

1820
internal var utf16Length: Int

stdlib/public/core/StringBridge.swift

Lines changed: 20 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -791,23 +791,33 @@ extension StringProtocol {
791791
return self.utf16.index(self.utf16.startIndex, offsetBy: offset)
792792
}
793793

794-
@_specialize(where Self == String)
795-
@_specialize(where Self == Substring)
796794
/// Converts a range of indices in this string into the corresponding range
/// of UTF-16 offsets.
///
/// `String` and `Substring` receivers are forwarded to
/// `String.UTF16View._offsetRange(for:from:)` with the receiver's
/// `startIndex` as the origin. Any other conforming type converts each bound
/// separately with `_toUTF16Offset`.
///
/// - Parameter indices: The index range to convert.
/// - Returns: A range of UTF-16 offsets, built with `Range(uncheckedBounds:)`.
public // SPI(Foundation)
797795
func _toUTF16Offsets(_ indices: Range<Index>) -> Range<Int> {
// Rendered diff: the lines after the `NNN-` markers are the removed
// pre-change body, which converted the lower bound and then measured the
// range's length with `utf16.distance(from:to:)`.
798-
let lowerbound = _toUTF16Offset(indices.lowerBound)
799-
let length = self.utf16.distance(
800-
from: indices.lowerBound, to: indices.upperBound)
801-
return Range(
802-
uncheckedBounds: (lower: lowerbound, upper: lowerbound + length))
// Added replacement body (lines after the `NNN+` markers).
// The bit-cast below is only reached when `Self` is exactly `String`.
796+
if Self.self == String.self {
797+
let s = unsafeBitCast(self, to: String.self)
798+
return s.utf16._offsetRange(for: indices, from: s.startIndex)
799+
}
// Substring: use the base string's UTF-16 view (`_slice._base`), passing the
// substring's own `startIndex` as the origin for the offsets.
800+
if Self.self == Substring.self {
801+
let s = unsafeBitCast(self, to: Substring.self)
802+
return s._slice._base.utf16._offsetRange(for: indices, from: s.startIndex)
803+
}
// Generic fallback for other conforming types: convert each bound on its own.
804+
let startOffset = _toUTF16Offset(indices.lowerBound)
805+
let endOffset = _toUTF16Offset(indices.upperBound)
806+
return Range(uncheckedBounds: (lower: startOffset, upper: endOffset))
803807
}
804808

805-
@_specialize(where Self == String)
806-
@_specialize(where Self == Substring)
807809
/// Converts a range of UTF-16 offsets into the corresponding range of indices
/// in this string.
///
/// `String` and `Substring` receivers are forwarded to
/// `String.UTF16View._indexRange(for:from:)` with the receiver's `startIndex`
/// as the origin. Any other conforming type converts each bound separately
/// with `_toUTF16Index`.
///
/// - Parameter range: The UTF-16 offset range to convert.
/// - Returns: A range of indices, built with `Range(uncheckedBounds:)`.
public // SPI(Foundation)
808810
func _toUTF16Indices(_ range: Range<Int>) -> Range<Index> {
// The bit-cast below is only reached when `Self` is exactly `String`.
811+
if Self.self == String.self {
812+
let s = unsafeBitCast(self, to: String.self)
813+
return s.utf16._indexRange(for: range, from: s.startIndex)
814+
}
// Substring: use the base string's UTF-16 view (`_slice._base`), passing the
// substring's own `startIndex` as the origin for the offsets.
815+
if Self.self == Substring.self {
816+
let s = unsafeBitCast(self, to: Substring.self)
817+
return s._slice._base.utf16._indexRange(for: range, from: s.startIndex)
818+
}
// Generic fallback for other conforming types: convert each bound on its own.
809819
let lowerbound = _toUTF16Index(range.lowerBound)
// Rendered diff: the `810-` line is the removed form of the upper-bound
// computation; the `820+` line that follows is its replacement.
810-
let upperbound = _toUTF16Index(range.lowerBound + range.count)
820+
let upperbound = _toUTF16Index(range.upperBound)
811821
return Range(uncheckedBounds: (lower: lowerbound, upper: upperbound))
812822
}
813823
}

stdlib/public/core/StringUTF16View.swift

Lines changed: 166 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -138,6 +138,9 @@ extension String.UTF16View: BidirectionalCollection {
138138
/// The end index of this view, forwarded from `_guts.endIndex`.
@inlinable @inline(__always)
139139
public var endIndex: Index { return _guts.endIndex }
140140

141+
/// Shorthand for `_StringBreadcrumbs.breadcrumbStride`, used by the
/// short-distance heuristics in this view.
@inline(__always)
142+
internal var _breadcrumbStride: Int { _StringBreadcrumbs.breadcrumbStride }
143+
141144
@inlinable @inline(__always)
142145
public func index(after idx: Index) -> Index {
143146
var idx = _guts.ensureMatchingEncoding(idx)
@@ -194,31 +197,61 @@ extension String.UTF16View: BidirectionalCollection {
194197
return idx.encoded(offsetBy: -len)._scalarAligned._knownUTF8
195198
}
196199

200+
/// Returns the index that is `n` positions away from `i` in this UTF-16 view.
///
/// Four paths are taken, in order:
/// 1. Foreign strings defer to `_foreignIndex(_:offsetBy:)`.
/// 2. ASCII strings add `n` directly to the encoded offset of `i`.
/// 3. Moves shorter than a threshold are computed directly with
///    `_index(_:offsetBy:)`, without consulting breadcrumbs.
/// 4. Longer moves convert `i` to a UTF-16 offset, add `n`, and convert back
///    through `_nativeGetOffset(for:)` / `_nativeGetIndex(for:)`.
///
/// - Parameters:
///   - i: The starting index; it must not be past `endIndex`.
///   - n: The distance to move; its magnitude is what the threshold tests.
/// - Returns: The resulting index.
@_effects(releasenone)
197201
public func index(_ i: Index, offsetBy n: Int) -> Index {
// Rendered diff: `198-` is the removed `let` binding; `202+` rebinds `i` as
// `var` so that it can be re-aligned further down.
198-
let i = _guts.ensureMatchingEncoding(i)
202+
var i = _guts.ensureMatchingEncoding(i)
199203
_precondition(i <= endIndex, "String index is out of bounds")
204+
200205
if _slowPath(_guts.isForeign) {
201206
return _foreignIndex(i, offsetBy: n)
202207
}
203208

// ASCII fast path: the result is computed by plain arithmetic on the encoded
// offset.
// NOTE(review): no bounds check is visible here on `i._encodedOffset + n`;
// the `_precondition` above only covers the incoming `i`. Confirm that
// negative or past-the-end results are rejected elsewhere, or add a check
// against `0 ... _guts.count` so this path traps like the others.
209+
if _guts.isASCII {
210+
return Index(
211+
_encodedOffset: i._encodedOffset + n
212+
)._scalarAligned._encodingIndependent
213+
}
214+
// The threshold is half a breadcrumb stride when starting from `startIndex`
// and a full stride otherwise — presumably because the breadcrumb route is
// cheaper from the start, where `_nativeGetOffset` returns 0 immediately.
215+
i = _utf16AlignNativeIndex(i)
216+
let threshold = (
217+
i == startIndex ? _breadcrumbStride / 2 : _breadcrumbStride)
218+
if n.magnitude < threshold {
219+
// Do not use breadcrumbs if directly computing the result is expected
220+
// to be cheaper.
221+
return _index(i, offsetBy: n)._knownUTF8
222+
}
223+
// Breadcrumb route: index -> UTF-16 offset, add `n`, then offset -> index.
204224
let lowerOffset = _nativeGetOffset(for: i)
205225
let result = _nativeGetIndex(for: lowerOffset + n)
206226
return result
207227
}
208228

229+
@_effects(releasenone)
209230
public func index(
210231
_ i: Index, offsetBy n: Int, limitedBy limit: Index
211232
) -> Index? {
212-
let limit = _guts.ensureMatchingEncoding(limit)
233+
var limit = _guts.ensureMatchingEncoding(limit)
213234
guard _fastPath(limit <= endIndex) else { return index(i, offsetBy: n) }
214235

215-
let i = _guts.ensureMatchingEncoding(i)
236+
var i = _guts.ensureMatchingEncoding(i)
216237
_precondition(i <= endIndex, "String index is out of bounds")
217238

218239
if _slowPath(_guts.isForeign) {
219240
return _foreignIndex(i, offsetBy: n, limitedBy: limit)
220241
}
221242

243+
if !_guts.isASCII { // We have ASCII fast paths below
244+
limit = _utf16AlignNativeIndex(limit)
245+
i = _utf16AlignNativeIndex(i)
246+
let threshold = (
247+
_breadcrumbStride + (i == startIndex ? 0 : _breadcrumbStride / 2))
248+
if n.magnitude < threshold {
249+
// Do not use breadcrumbs if directly computing the result is expected
250+
// to be cheaper.
251+
return _index(i, offsetBy: n, limitedBy: limit)?._knownUTF8
252+
}
253+
}
254+
222255
let iOffset = _nativeGetOffset(for: i)
223256
let limitOffset = _nativeGetOffset(for: limit)
224257

@@ -235,9 +268,10 @@ extension String.UTF16View: BidirectionalCollection {
235268
return result
236269
}
237270

271+
@_effects(releasenone)
238272
public func distance(from start: Index, to end: Index) -> Int {
239-
let start = _guts.ensureMatchingEncoding(start)
240-
let end = _guts.ensureMatchingEncoding(end)
273+
var start = _guts.ensureMatchingEncoding(start)
274+
var end = _guts.ensureMatchingEncoding(end)
241275

242276
// FIXME: This method used to not properly validate indices before 5.7;
243277
// temporarily allow older binaries to keep invoking undefined behavior as
@@ -255,6 +289,29 @@ extension String.UTF16View: BidirectionalCollection {
255289
return _foreignDistance(from: start, to: end)
256290
}
257291

292+
let utf8Distance = end._encodedOffset - start._encodedOffset
293+
294+
if _guts.isASCII {
295+
return utf8Distance
296+
}
297+
298+
let threshold = (start == startIndex || end == startIndex
299+
? _breadcrumbStride / 2
300+
: _breadcrumbStride)
301+
if utf8Distance.magnitude < threshold {
302+
// Do not use breadcrumbs if directly computing the result is expected to
303+
// be cheaper. The conservative threshold above assumes that each UTF-16
304+
// code unit will map to a single UTF-8 code unit, i.e., the worst
305+
// possible (a.k.a. most compact) case with all ASCII scalars.
306+
// FIXME: Figure out if a more optimistic threshold would work better.
307+
start = _utf16AlignNativeIndex(start)
308+
end = _utf16AlignNativeIndex(end)
309+
guard start <= end else {
310+
return -_utf16Distance(from: end, to: start)
311+
}
312+
return _utf16Distance(from: start, to: end)
313+
}
314+
258315
let lower = _nativeGetOffset(for: start)
259316
let upper = _nativeGetOffset(for: end)
260317
return upper &- lower
@@ -268,6 +325,86 @@ extension String.UTF16View: BidirectionalCollection {
268325
return _nativeGetOffset(for: endIndex)
269326
}
270327

328+
/// Converts a range of UTF-16 offsets, measured from `start`, into a range of
/// indices in this view.
///
/// - Parameters:
///   - offsets: UTF-16 offsets relative to `start`.
///   - start: The origin the offsets are measured from. Its encoding is
///     expected to match this string's (checked by an internal invariant).
/// - Returns: A range of indices, built with `Range(uncheckedBounds:)`.
internal func _indexRange(
329+
for offsets: Range<Int>,
330+
from start: Index
331+
) -> Range<Index> {
332+
_internalInvariant(_guts.hasMatchingEncoding(start))
// Foreign strings: find the lower bound with the public `index(_:offsetBy:)`,
// then advance `offsets.count` further with `_foreignIndex`.
333+
if _slowPath(_guts.isForeign) {
334+
let lower = self.index(start, offsetBy: offsets.lowerBound)
335+
let upper = _foreignIndex(lower, offsetBy: offsets.count)
336+
return Range(uncheckedBounds: (lower, upper))
337+
}
338+
// ASCII strings: both bounds go through `index(_:offsetBy:)`.
339+
if _guts.isASCII {
340+
let lower = self.index(start, offsetBy: offsets.lowerBound)
341+
let upper = self.index(lower, offsetBy: offsets.count)
342+
return Range(uncheckedBounds: (lower, upper))
343+
}
344+
// Short ranges (fewer than half a breadcrumb stride of code units): walk from
// the lower bound to the upper bound directly instead of using breadcrumbs.
345+
if offsets.count < _breadcrumbStride / 2 {
346+
let lower = self.index(start, offsetBy: offsets.lowerBound)
347+
let upper = _index(lower, offsetBy: offsets.count)._knownUTF8
348+
return Range(uncheckedBounds: (lower, upper))
349+
}
350+
// Long ranges: `bias` is the UTF-16 offset of `start` itself, so
// `bias + offset` is the position handed to `_nativeGetIndex`.
// NOTE(review): `offsets.lowerBound` already appears to be relative to `start`
// (the other branch adds `bias` to it), so subtracting `bias` in the
// heuristic below looks unintended — confirm. Both branches locate the same
// position, so this would only affect which route is taken.
// NOTE(review): the `_index(start, offsetBy:)` branch does not apply
// `._knownUTF8`, unlike the short-range path above — confirm that is intended.
351+
let bias = _nativeGetOffset(for: start)
352+
let lower = (
353+
offsets.lowerBound - bias <= _breadcrumbStride / 2
354+
? _index(start, offsetBy: offsets.lowerBound)
355+
: _nativeGetIndex(for: bias + offsets.lowerBound))
356+
let upper = _nativeGetIndex(for: bias + offsets.upperBound)
357+
return Range(uncheckedBounds: (lower, upper))
358+
}
359+
360+
/// Converts a range of indices into a range of UTF-16 offsets measured from
/// `start`.
///
/// - Parameters:
///   - range: The index range to convert. Both bounds are passed through
///     `_guts.ensureMatchingEncoding` and checked against `_guts.count`.
///   - start: The origin the resulting offsets are measured from. Its
///     encoding is expected to match this string's (checked by an internal
///     invariant).
/// - Returns: A range of UTF-16 offsets, built with
///   `Range(uncheckedBounds:)`.
internal func _offsetRange(
361+
for range: Range<Index>,
362+
from start: Index
363+
) -> Range<Int> {
364+
var lower = _guts.ensureMatchingEncoding(range.lowerBound)
365+
var upper = _guts.ensureMatchingEncoding(range.upperBound)
366+
_internalInvariant(_guts.hasMatchingEncoding(start))
367+
// Bounds validation; these preconditions are conditional on
// `ifLinkedOnOrAfter: .v5_7_0`.
368+
_precondition(
369+
ifLinkedOnOrAfter: .v5_7_0,
370+
lower._encodedOffset <= _guts.count,
371+
"String index is out of bounds")
372+
_precondition(
373+
ifLinkedOnOrAfter: .v5_7_0,
374+
upper._encodedOffset <= _guts.count,
375+
"String index is out of bounds")
376+
// Foreign strings: measure start -> lower and lower -> upper with
// `_foreignDistance`.
377+
if _slowPath(_guts.isForeign) {
378+
let lowerOffset = _foreignDistance(from: start, to: lower)
379+
let distance = _foreignDistance(from: lower, to: upper)
380+
return Range(uncheckedBounds: (lowerOffset, lowerOffset + distance))
381+
}
382+
383+
let utf8Distance = upper._encodedOffset - lower._encodedOffset
384+
// ASCII strings: the differences between encoded offsets are used directly as
// the UTF-16 offsets.
385+
if _guts.isASCII {
386+
let lowerOffset = lower._encodedOffset - start._encodedOffset
387+
return Range(uncheckedBounds: (lowerOffset, lowerOffset + utf8Distance))
388+
}
389+
// Short ranges (at most half a breadcrumb stride, measured in encoded
// offsets): count lower -> upper directly with `_utf16Distance`.
// Note: `distance(from:to:)` on the first statement is this view's method;
// the local constant named `distance` is only declared on the statement
// after it.
390+
if utf8Distance.magnitude <= _breadcrumbStride / 2 {
391+
lower = _utf16AlignNativeIndex(lower)
392+
upper = _utf16AlignNativeIndex(upper)
393+
let lowerOffset = distance(from: start, to: lower)
394+
let distance = _utf16Distance(from: lower, to: upper)
395+
return Range(uncheckedBounds: (lowerOffset, lowerOffset + distance))
396+
}
397+
// Long ranges: `bias` is the UTF-16 offset of `start` itself; it is subtracted
// from the values returned by `_nativeGetOffset` to make them relative to
// `start`. The lower bound is counted directly from `start` when it lies
// within half a stride of it (in encoded offsets).
398+
let bias = _nativeGetOffset(for: start)
399+
let utf8StartOffset = lower._encodedOffset - start._encodedOffset
400+
let lowerOffset = (
401+
utf8StartOffset <= _breadcrumbStride / 2
402+
? _utf16Distance(from: start, to: lower)
403+
: _nativeGetOffset(for: lower) - bias)
404+
let upperOffset = _nativeGetOffset(for: upper) - bias
405+
return Range(uncheckedBounds: (lowerOffset, upperOffset))
406+
}
407+
271408
/// Accesses the code unit at the given position.
272409
///
273410
/// The following example uses the subscript to print the value of a
@@ -618,8 +755,7 @@ extension String.UTF16View {
618755
return utf16Count
619756
}
620757
#endif
621-
622-
@inline(__always)
758+
623759
internal func _utf16Distance(from start: Index, to end: Index) -> Int {
624760
_internalInvariant(end.transcodedOffset == 0 || end.transcodedOffset == 1)
625761

@@ -691,17 +827,24 @@ extension String.UTF16View {
691827
}
692828
}
693829

830+
/// Return the UTF-16 offset corresponding to `idx`, measured from the
831+
/// start of this string, which must be a native UTF-8 string.
832+
///
833+
/// - Complexity: This measures the UTF-16 distance of `idx` from its nearest
834+
/// breadcrumb index (rounding down), so on average it needs to look at
835+
/// `breadcrumbStride / 2` UTF-16 code units. (In addition to the O(log(n))
836+
/// cost of looking up the nearest breadcrumb, and the amortizable O(n)
837+
/// cost of generating the breadcrumbs in the first place.)
694838
@usableFromInline
695839
@_effects(releasenone)
696840
internal func _nativeGetOffset(for idx: Index) -> Int {
697841
_internalInvariant(idx._encodedOffset <= _guts.count)
698-
// Trivial and common: start
699-
if idx == startIndex { return 0 }
700-
701842
if _guts.isASCII {
702843
_internalInvariant(idx.transcodedOffset == 0)
703844
return idx._encodedOffset
704845
}
846+
// Trivial and common: start
847+
if idx == startIndex { return 0 }
705848

706849
let idx = _utf16AlignNativeIndex(idx)
707850

@@ -714,11 +857,22 @@ extension String.UTF16View {
714857
if idx == endIndex { return breadcrumbsPtr.pointee.utf16Length }
715858

716859
// Otherwise, find the nearest lower-bound breadcrumb and count from there
860+
// FIXME: Starting from the upper-bound crumb when that is closer would cut
861+
// the average cost of the subsequent iteration by 50%.
717862
let (crumb, crumbOffset) = breadcrumbsPtr.pointee.getBreadcrumb(
718863
forIndex: idx)
719864
return crumbOffset + _utf16Distance(from: crumb, to: idx)
720865
}
721866

867+
/// Return the index at the given UTF-16 offset, measured from the
868+
/// start of this string, which must be a native UTF-8 string.
869+
///
870+
/// - Complexity: This iterates UTF-16 code units starting from the
871+
/// nearest breadcrumb to `offset` (rounding down), so on
872+
/// average it needs to look at `breadcrumbStride / 2` UTF-16 code
873+
/// units. (In addition to the O(1) cost of looking up the nearest
874+
/// breadcrumb, and the amortizable O(n) cost of generating the
875+
/// breadcrumbs in the first place.)
722876
@usableFromInline
723877
@_effects(releasenone)
724878
internal func _nativeGetIndex(for offset: Int) -> Index {
@@ -742,6 +896,8 @@ extension String.UTF16View {
742896
if offset == breadcrumbsPtr.pointee.utf16Length { return endIndex }
743897

744898
// Otherwise, find the nearest lower-bound breadcrumb and advance that
899+
// FIXME: Starting from the upper-bound crumb when that is closer would cut
900+
// the average cost of the subsequent iteration by 50%.
745901
let (crumb, remaining) = breadcrumbsPtr.pointee.getBreadcrumb(
746902
forOffset: offset)
747903
if remaining == 0 { return crumb }

test/stdlib/StringIndex.swift

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -995,6 +995,44 @@ suite.test("Index encoding correction/UTF-8→16/conversions/UTF-16") {
995995
}
996996
#endif
997997

998+
suite.test("UTF-16 breadcrumbs") {
999+
1000+
let string = #"""
1001+
The powerful programming language that is also easy to learn.
1002+
손쉽게 학습할 수 있는 강력한 프로그래밍 언어.
1003+
🪙 A 🥞 short 🍰 piece 🫘 of 🌰 text 👨‍👨‍👧‍👧 with 👨‍👩‍👦 some 🚶🏽 emoji 🇺🇸🇨🇦 characters 🧈
1004+
some🔩times 🛺 placed 🎣 in 🥌 the 🆘 mid🔀dle 🇦🇶or🏁 around 🏳️‍🌈 a 🍇 w🍑o🥒r🥨d
1005+
Unicode is such fun!
1006+
U̷n̷i̷c̷o̴d̴e̷ ̶i̸s̷ ̸s̵u̵c̸h̷ ̸f̵u̷n̴!̵
1007+
U̴̡̲͋̾n̵̻̳͌ì̶̠̕c̴̭̈͘ǫ̷̯͋̊d̸͖̩̈̈́ḛ̴́ ̴̟͎͐̈i̴̦̓s̴̜̱͘ ̶̲̮̚s̶̙̞͘u̵͕̯̎̽c̵̛͕̜̓h̶̘̍̽ ̸̜̞̿f̵̤̽ṷ̴͇̎͘ń̷͓̒!̷͍̾̚
1008+
U̷̢̢̧̨̼̬̰̪͓̞̠͔̗̼̙͕͕̭̻̗̮̮̥̣͉̫͉̬̲̺͍̺͊̂ͅ\#
1009+
n̶̨̢̨̯͓̹̝̲̣̖̞̼̺̬̤̝̊̌́̑̋̋͜͝ͅ\#
1010+
ḭ̸̦̺̺͉̳͎́͑\#
1011+
c̵̛̘̥̮̙̥̟̘̝͙̤̮͉͔̭̺̺̅̀̽̒̽̏̊̆͒͌̂͌̌̓̈́̐̔̿̂͑͠͝͝ͅ\#
1012+
"""#
1013+
1014+
print(string.utf16.count)
1015+
let indices = Array(string.utf16.indices) + [string.utf16.endIndex]
1016+
for i in 0 ..< indices.count {
1017+
for j in 0 ..< indices.count {
1018+
let distance = string.utf16.distance(from: indices[i], to: indices[j])
1019+
expectEqual(distance, j - i,
1020+
"""
1021+
i: \(i), indices[i]: \(indices[i]._description)
1022+
j: \(j), indices[j]: \(indices[j]._description)
1023+
""")
1024+
1025+
let target = string.utf16.index(indices[i], offsetBy: j - i)
1026+
expectEqual(target, indices[j],
1027+
"""
1028+
i: \(i), indices[i]: \(indices[i]._description)
1029+
j: \(j), indices[j]: \(indices[j]._description)
1030+
target: \(target._description)
1031+
""")
1032+
}
1033+
}
1034+
}
1035+
9981036
suite.test("String.replaceSubrange index validation")
9991037
.forEach(in: examples) { string in
10001038
guard #available(SwiftStdlib 5.7, *) else {

0 commit comments

Comments
 (0)