Skip to content

Commit b7d1174

Browse files
committed
[stdlib] Optimize StringProtocol._toUTF16Indices/_toUTF16Offsets
Speed up conversion between UTF-16 offset ranges and string index ranges by carefully switching between absolute and relative index calculations, depending on the distance we need to go. It is a surprisingly tricky puzzle to do this correctly while avoiding redundant calculations. Offset ranges within substrings bring the additional complication of having to bias offset values with the absolute offset of the substring’s start index.
1 parent 6fee1b3 commit b7d1174

File tree

2 files changed

+97
-12
lines changed

2 files changed

+97
-12
lines changed

stdlib/public/core/StringBridge.swift

Lines changed: 20 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -795,19 +795,33 @@ extension StringProtocol {
795795
@_specialize(where Self == Substring)
796796
public // SPI(Foundation)
797797
func _toUTF16Offsets(_ indices: Range<Index>) -> Range<Int> {
798-
let lowerbound = _toUTF16Offset(indices.lowerBound)
799-
let length = self.utf16.distance(
800-
from: indices.lowerBound, to: indices.upperBound)
801-
return Range(
802-
uncheckedBounds: (lower: lowerbound, upper: lowerbound + length))
798+
if Self.self == String.self {
799+
let s = unsafeBitCast(self, to: String.self)
800+
return s.utf16._offsetRange(for: indices, from: s.startIndex)
801+
}
802+
if Self.self == Substring.self {
803+
let s = unsafeBitCast(self, to: Substring.self)
804+
return s._slice._base.utf16._offsetRange(for: indices, from: s.startIndex)
805+
}
806+
let startOffset = _toUTF16Offset(indices.lowerBound)
807+
let endOffset = _toUTF16Offset(indices.upperBound)
808+
return Range(uncheckedBounds: (lower: startOffset, upper: endOffset))
803809
}
804810

805811
@_specialize(where Self == String)
806812
@_specialize(where Self == Substring)
807813
public // SPI(Foundation)
808814
func _toUTF16Indices(_ range: Range<Int>) -> Range<Index> {
815+
if Self.self == String.self {
816+
let s = unsafeBitCast(self, to: String.self)
817+
return s.utf16._indexRange(for: range, from: s.startIndex)
818+
}
819+
if Self.self == Substring.self {
820+
let s = unsafeBitCast(self, to: Substring.self)
821+
return s._slice._base.utf16._indexRange(for: range, from: s.startIndex)
822+
}
809823
let lowerbound = _toUTF16Index(range.lowerBound)
810-
let upperbound = self.utf16.index(lowerbound, offsetBy: range.count)
824+
let upperbound = _toUTF16Index(range.upperBound)
811825
return Range(uncheckedBounds: (lower: lowerbound, upper: upperbound))
812826
}
813827
}

stdlib/public/core/StringUTF16View.swift

Lines changed: 77 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -138,6 +138,9 @@ extension String.UTF16View: BidirectionalCollection {
138138
@inlinable @inline(__always)
139139
public var endIndex: Index { return _guts.endIndex }
140140

141+
// Forwards to `_StringBreadcrumbs.breadcrumbStride`, giving the threshold
// computations in this extension a short name for the stride.
@inline(__always)
142+
internal var _breadcrumbStride: Int { _StringBreadcrumbs.breadcrumbStride }
143+
141144
@inlinable @inline(__always)
142145
public func index(after idx: Index) -> Index {
143146
var idx = _guts.ensureMatchingEncoding(idx)
@@ -201,7 +204,9 @@ extension String.UTF16View: BidirectionalCollection {
201204
return _foreignIndex(i, offsetBy: n)
202205
}
203206

204-
if n.magnitude <= _StringBreadcrumbs.breadcrumbStride, !_guts.isASCII {
207+
let threshold = (
208+
i == startIndex ? _breadcrumbStride / 2 : _breadcrumbStride)
209+
if n.magnitude < threshold, !_guts.isASCII {
205210
// Do not use breadcrumbs if directly computing the result is expected to
206211
// be cheaper.
207212
return _index(i, offsetBy: n)._knownUTF8
@@ -225,7 +230,9 @@ extension String.UTF16View: BidirectionalCollection {
225230
return _foreignIndex(i, offsetBy: n, limitedBy: limit)
226231
}
227232

228-
if n.magnitude <= _StringBreadcrumbs.breadcrumbStride, !_guts.isASCII {
233+
let threshold = (
234+
_breadcrumbStride + (i == startIndex ? 0 : _breadcrumbStride / 2))
235+
if n.magnitude < threshold, !_guts.isASCII {
229236
// Do not use breadcrumbs if directly computing the result is expected to
230237
// be cheaper.
231238
return _index(i, offsetBy: n, limitedBy: limit)?._knownUTF8
@@ -268,10 +275,10 @@ extension String.UTF16View: BidirectionalCollection {
268275
}
269276

270277
let utf8Distance = end._encodedOffset - start._encodedOffset
271-
if
272-
utf8Distance.magnitude <= _StringBreadcrumbs.breadcrumbStride,
273-
!_guts.isASCII
274-
{
278+
let threshold = (start == startIndex || end == startIndex
279+
? _breadcrumbStride / 2
280+
: _breadcrumbStride)
281+
if utf8Distance.magnitude < threshold, !_guts.isASCII {
275282
// Do not use breadcrumbs if directly computing the result is expected to
276283
// be cheaper. The conservative threshold above assumes that each UTF-16
277284
// code unit will map to a single UTF-8 code unit, i.e., the worst
@@ -292,6 +299,70 @@ extension String.UTF16View: BidirectionalCollection {
292299
return _nativeGetOffset(for: endIndex)
293300
}
294301

302+
// Converts a range of UTF-16 offsets, counted from `start`, into the
// corresponding range of indices in this view.
//
// - Parameter offsets: UTF-16 offsets relative to `start`.
// - Parameter start: The index the offsets are measured from; its encoding
//   must already match the storage (checked by the internal invariant below).
// - Returns: The index range, built with `uncheckedBounds`.
internal func _indexRange(
303+
for offsets: Range<Int>,
304+
from start: Index
305+
) -> Range<Index> {
306+
_internalInvariant(_guts.hasMatchingEncoding(start))
307+
// Foreign strings: resolve the lower bound from `start`, then step from it
// by the length of the range.
if _slowPath(_guts.isForeign) {
308+
let lower = self.index(start, offsetBy: offsets.lowerBound)
309+
let upper = _foreignIndex(lower, offsetBy: offsets.count)
310+
return Range(uncheckedBounds: (lower, upper))
311+
}
312+
// Non-ASCII range shorter than half a breadcrumb stride: compute the upper
// bound relative to the lower bound instead of from an absolute offset.
if offsets.count < _breadcrumbStride / 2, !_guts.isASCII {
313+
let lower = self.index(start, offsetBy: offsets.lowerBound)
314+
let upper = _index(lower, offsetBy: offsets.count)._knownUTF8
315+
return Range(uncheckedBounds: (lower, upper))
316+
}
317+
318+
// `bias` is the offset of `start` as reported by `_nativeGetOffset`; adding
// it to a `start`-relative offset gives the value passed to
// `_nativeGetIndex` below.
let bias = _nativeGetOffset(for: start)
319+
// NOTE(review): `offsets.lowerBound` is used as a `start`-relative offset on
// the next line, so subtracting `bias` from it in this comparison looks
// inconsistent with `_offsetRange`, which compares a `start`-relative
// distance against the same threshold. Only the choice of strategy is
// affected, not the resulting index — confirm whether
// `offsets.lowerBound <= _breadcrumbStride / 2` was intended.
let lower = (
320+
offsets.lowerBound - bias <= _breadcrumbStride / 2
321+
? _index(start, offsetBy: offsets.lowerBound)
322+
: _nativeGetIndex(for: bias + offsets.lowerBound))
323+
let upper = _nativeGetIndex(for: bias + offsets.upperBound)
324+
return Range(uncheckedBounds: (lower, upper))
325+
}
326+
327+
// Converts a range of indices into the corresponding range of UTF-16 offsets,
// counted from `start`.
//
// - Parameter range: The index range to convert; both bounds are passed
//   through `_guts.ensureMatchingEncoding` before use.
// - Parameter start: The index that offsets are measured from; its encoding
//   must already match the storage (checked by the internal invariant below).
// - Returns: The offset range, built with `uncheckedBounds`.
internal func _offsetRange(
328+
for range: Range<Index>,
329+
from start: Index
330+
) -> Range<Int> {
331+
let lower = _guts.ensureMatchingEncoding(range.lowerBound)
332+
let upper = _guts.ensureMatchingEncoding(range.upperBound)
333+
_internalInvariant(_guts.hasMatchingEncoding(start))
334+
335+
// Bounds checks: neither bound's encoded offset may exceed `_guts.count`.
// Both are gated on `ifLinkedOnOrAfter: .v5_7_0`.
_precondition(
336+
ifLinkedOnOrAfter: .v5_7_0,
337+
lower._encodedOffset <= _guts.count,
338+
"String index is out of bounds")
339+
_precondition(
340+
ifLinkedOnOrAfter: .v5_7_0,
341+
upper._encodedOffset <= _guts.count,
342+
"String index is out of bounds")
343+
344+
// Foreign strings: measure start→lower and lower→upper separately and add.
if _slowPath(_guts.isForeign) {
345+
let lowerOffset = _foreignDistance(from: start, to: lower)
346+
let distance = _foreignDistance(from: lower, to: upper)
347+
return Range(uncheckedBounds: (lowerOffset, lowerOffset + distance))
348+
}
349+
350+
// Span between the bounds in encoded-offset units (named as a UTF-8
// distance). For a non-ASCII string where this span is at most half a
// breadcrumb stride, the upper offset is derived from the lower one.
let utf8Distance = upper._encodedOffset - lower._encodedOffset
351+
if utf8Distance.magnitude <= _breadcrumbStride / 2, !_guts.isASCII {
352+
// `distance(from:to:)` on this line is the view's method; the local
// constant named `distance` is only declared on the following line.
let lowerOffset = distance(from: start, to: lower)
353+
let distance = _utf16Distance(from: lower, to: upper)
354+
return Range(uncheckedBounds: (lowerOffset, lowerOffset + distance))
355+
}
356+
// Otherwise take offsets from `_nativeGetOffset` and subtract the offset of
// `start` (`bias`) to make them relative to `start`. The lower bound is
// instead measured directly from `start` when it lies within half a stride
// of it in encoded-offset units.
let bias = _nativeGetOffset(for: start)
357+
let utf8StartOffset = lower._encodedOffset - start._encodedOffset
358+
let lowerOffset = (
359+
utf8StartOffset <= _breadcrumbStride / 2
360+
? _utf16Distance(from: start, to: lower)
361+
: _nativeGetOffset(for: lower) - bias)
362+
let upperOffset = _nativeGetOffset(for: upper) - bias
363+
return Range(uncheckedBounds: (lowerOffset, upperOffset))
364+
}
365+
295366
/// Accesses the code unit at the given position.
296367
///
297368
/// The following example uses the subscript to print the value of a

0 commit comments

Comments
 (0)