Vectorize UTF16 offset calculations #41866

Merged (1 commit, Mar 18, 2022)
109 changes: 104 additions & 5 deletions stdlib/public/core/StringUTF16View.swift
@@ -137,7 +137,7 @@ extension String.UTF16View: BidirectionalCollection {
  /// In an empty UTF-16 view, `endIndex` is equal to `startIndex`.
  @inlinable @inline(__always)
  public var endIndex: Index { return _guts.endIndex }

  @inlinable @inline(__always)
  public func index(after idx: Index) -> Index {
    if _slowPath(_guts.isForeign) { return _foreignIndex(after: idx) }
@@ -149,6 +149,7 @@
    // TODO: If transcoded is 1, can we just skip ahead 4?

    let idx = _utf16AlignNativeIndex(idx)

    let len = _guts.fastUTF8ScalarLength(startingAt: idx._encodedOffset)
    if len == 4 && idx.transcodedOffset == 0 {
      return idx.nextTranscoded
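
(Editorial aside, not part of the diff: the `transcodedOffset` above exists because a 4-byte UTF-8 scalar expands to two UTF-16 code units, a surrogate pair, so a UTF-16 index needs two positions inside a single scalar. A minimal illustration using only public API; the literal is just an example.)

```swift
let emoji = "😀"                 // one Unicode scalar, four UTF-8 bytes
print(emoji.utf8.count)          // 4
print(emoji.utf16.count)         // 2, a surrogate pair
print(emoji.utf16.map { String($0, radix: 16) })  // ["d83d", "de00"]
```
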
@@ -518,6 +519,105 @@ extension _StringGuts {
}

extension String.UTF16View {

  @inline(__always)
  internal func _utf16Length<U: SIMD, S: SIMD>(
    readPtr: inout UnsafeRawPointer,
    endPtr: UnsafeRawPointer,
    unsignedSIMDType: U.Type,
    signedSIMDType: S.Type
  ) -> Int where U.Scalar == UInt8, S.Scalar == Int8 {
    var utf16Count = 0

    while readPtr + MemoryLayout<U>.stride < endPtr {
      //Find the number of continuations (0b10xxxxxx)
      let sValue = Builtin.loadRaw(readPtr._rawValue) as S
      let continuations = S.zero.replacing(with: S.one, where: sValue .< -65 + 1)
      let continuationCount = Int(continuations.wrappedSum())

      //Find the number of 4 byte code points (0b11110xxx)
      let uValue = Builtin.loadRaw(readPtr._rawValue) as U
      let fourBytes = U.zero.replacing(with: U.one, where: uValue .>= 0b11110000)
      let fourByteCount = Int(fourBytes.wrappedSum())

      utf16Count &+= (U.scalarCount - continuationCount) + fourByteCount

stephentyrone (Contributor), Mar 18, 2022:

Horizontal operations (wrappedSum) are expensive; usually in SIMD code we keep the accumulation in vector form until forced to reduce to a scalar, rather than accumulating on every loop iteration. It's a little bit of a pain with Int8 because you can only do 127 accumulations before you have to worry about overflow, but doing one horizontal operation every N iterations is still much better than two per iteration.

The easy change here would be to do:

&+= U.scalarCount + (fourByteCount &- continuationCount).wrappedSum()

so that you only do half as many as currently.
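
(Editorial aside: to make the suggestion concrete, here is a standalone sketch, not the stdlib code and not the follow-up patch, with illustrative names. It keeps the per-lane counts in a SIMD register and does a single `wrappedSum()` per chunk of at most 127 iterations so the `Int8` lanes cannot overflow.)

```swift
// Illustrative only: counts UTF-16 code units for the full SIMD8-sized
// prefix of a UTF-8 buffer, reducing to a scalar once per chunk.
func utf16LengthSketch(_ utf8: UnsafeBufferPointer<UInt8>) -> Int {
  let width = SIMD8<UInt8>.scalarCount
  var count = 0
  var i = 0
  while i + width <= utf8.count {
    var acc = SIMD8<Int8>.zero               // per-lane running total
    var iterations = 0
    while i + width <= utf8.count && iterations < 127 {
      var v = SIMD8<UInt8>()
      for lane in 0..<width { v[lane] = utf8[i + lane] }
      // +1 per lead byte of a 4-byte scalar (0b11110xxx): extra surrogate.
      acc.replace(with: acc &+ 1, where: v .>= 0b1111_0000)
      // -1 per continuation byte (0b10xxxxxx): not a new code unit.
      acc.replace(with: acc &- 1, where: v & 0b1100_0000 .== 0b1000_0000)
      i += width
      iterations += 1
    }
    // One horizontal reduction per chunk instead of two per iteration.
    count += width * iterations + Int(acc.wrappedSum())
  }
  return count  // bytes past the last full chunk still need a scalar tail loop
}
```

Whether this actually beats the per-iteration reduction would need benchmarking against the SIMD8 loop in this diff.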

Contributor Author:

Sure, I'll make a followup patch.

stephentyrone (Contributor):

(The optimizer might be managing to pull this apart for you already, but it would be nice to make it explicit anyway.)

Contributor Author:

(I'm hoping that once wider vectors are feasible, the common case here, post-breadcrumb offsets, will only ever iterate once, because we can just do the entire 64 bytes at once.)
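
(Editorial aside: if wider vectors do become profitable, the change would presumably be just a wider instantiation of the same generic helper at the existing call site; a hypothetical sketch, not part of this diff.)

```swift
// Hypothetical wider instantiation: 64 bytes per iteration.
utf16Count &+= _utf16Length(
  readPtr: &readPtr,
  endPtr: endPtr,
  unsignedSIMDType: SIMD64<UInt8>.self,
  signedSIMDType: SIMD64<Int8>.self
)
```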


      readPtr += MemoryLayout<U>.stride
    }

    return utf16Count
  }

  @inline(__always)
  internal func _utf16Distance(from start: Index, to end: Index) -> Int {
    _internalInvariant(end.transcodedOffset == 0 || end.transcodedOffset == 1)

    return (end.transcodedOffset - start.transcodedOffset) + _guts.withFastUTF8(
      range: start._encodedOffset ..< end._encodedOffset
    ) { utf8 in
      let rawBuffer = UnsafeRawBufferPointer(utf8)
      guard rawBuffer.count > 0 else { return 0 }

      var utf16Count = 0
      var readPtr = rawBuffer.baseAddress.unsafelyUnwrapped
      let initialReadPtr = readPtr
      let endPtr = readPtr + rawBuffer.count

      //eat leading continuations
      while readPtr < endPtr {
        let byte = readPtr.load(as: UInt8.self)
        if !UTF8.isContinuation(byte) {
          break
        }
        readPtr += 1
      }

      // TODO: Currently, using SIMD sizes above SIMD8 is slower
      // Once that's fixed we should go up to SIMD64 here

      utf16Count &+= _utf16Length(
        readPtr: &readPtr,
        endPtr: endPtr,
        unsignedSIMDType: SIMD8<UInt8>.self,
        signedSIMDType: SIMD8<Int8>.self
      )

      //TO CONSIDER: SIMD widths <8 here

      //back up to the start of the current scalar if we may have a trailing
      //incomplete scalar
      if utf16Count > 0 && UTF8.isContinuation(readPtr.load(as: UInt8.self)) {
        while readPtr > initialReadPtr && UTF8.isContinuation(readPtr.load(as: UInt8.self)) {
          readPtr -= 1
        }

        //The trailing scalar may be incomplete, subtract it out and check below
        let byte = readPtr.load(as: UInt8.self)
        let len = _utf8ScalarLength(byte)
        utf16Count &-= len == 4 ? 2 : 1
        if readPtr == initialReadPtr {
          //if we backed up all the way and didn't hit a non-continuation, then
          //we don't have any complete scalars, and we should bail.
          return 0
        }
      }

      //trailing bytes
      while readPtr < endPtr {
        let byte = readPtr.load(as: UInt8.self)
        let len = _utf8ScalarLength(byte)
        // if we don't have enough bytes left, we don't have a complete scalar,
        // so don't add it to the count.
        if readPtr + len <= endPtr {
          utf16Count &+= len == 4 ? 2 : 1
        }
        readPtr += len
      }

      return utf16Count
    }
  }
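
(Editorial aside: the classification both loops rely on is that every non-continuation byte contributes one UTF-16 code unit and every 4-byte lead byte contributes one extra. A tiny self-contained check using public API only; the sample string is arbitrary.)

```swift
let sample = "héllo 😀"
let bytes = Array(sample.utf8)
let nonContinuations = bytes.filter { $0 & 0b1100_0000 != 0b1000_0000 }.count
let fourByteLeads = bytes.filter { $0 >= 0b1111_0000 }.count
print(nonContinuations + fourByteLeads)  // 8
print(sample.utf16.count)                // 8
```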

  @usableFromInline
  @_effects(releasenone)
  internal func _nativeGetOffset(for idx: Index) -> Int {
@@ -532,9 +632,7 @@
     let idx = _utf16AlignNativeIndex(idx)

     guard _guts._useBreadcrumbs(forEncodedOffset: idx._encodedOffset) else {
-      // TODO: Generic _distance is still very slow. We should be able to
-      // skip over ASCII substrings quickly
-      return _distance(from: startIndex, to: idx)
+      return _utf16Distance(from: startIndex, to: idx)
     }

     // Simple and common: endIndex aka `length`.
@@ -544,7 +642,8 @@
     // Otherwise, find the nearest lower-bound breadcrumb and count from there
     let (crumb, crumbOffset) = breadcrumbsPtr.pointee.getBreadcrumb(
       forIndex: idx)
-    return crumbOffset + _distance(from: crumb, to: idx)
+
+    return crumbOffset + _utf16Distance(from: crumb, to: idx)
   }

  @usableFromInline
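
(Editorial aside: the user-visible path this patch speeds up is UTF-16 offset math on native strings that have no breadcrumbs yet, or between breadcrumbs, e.g. turning a `String.Index` into an NSRange-style UTF-16 offset. A small usage example; the string contents are arbitrary.)

```swift
let text = "café 😀 naïve"
let idx = text.firstIndex(of: "😀")!
let offset = text.utf16.distance(from: text.utf16.startIndex, to: idx)
print(offset)  // 5, because "café " is five UTF-16 code units
```
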
2 changes: 0 additions & 2 deletions validation-test/stdlib/StringBreadcrumbs.swift
@@ -1,5 +1,3 @@
-// rdar://84233775
-// REQUIRES: rdar84233775

// RUN: %target-run-stdlib-swift
// REQUIRES: executable_test,optimized_stdlib