Skip to content

Add a private implementation of a String initializer with access to uninitialized storage (https://github.com/apple/swift-evolution/pull/1022) and use it to speed up uppercased() and lowercased() #26007

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
64 changes: 64 additions & 0 deletions benchmark/single-source/NSStringConversion.swift
Original file line number Diff line number Diff line change
Expand Up @@ -30,13 +30,41 @@ public let NSStringConversion = [
runFunction: run_NSMutableStringConversion,
tags: [.validation, .api, .String, .bridging],
setUpFunction: { test = NSMutableString(cString: "test", encoding: String.Encoding.ascii.rawValue)! }),
BenchmarkInfo(name: "NSStringConversion.Medium",
runFunction: run_NSStringConversion_medium,
tags: [.validation, .api, .String, .bridging],
setUpFunction: { test = NSString(cString: "aaaaaaaaaaaaaaa", encoding: String.Encoding.ascii.rawValue)! } ),
BenchmarkInfo(name: "NSStringConversion.Long",
runFunction: run_NSStringConversion_long,
tags: [.validation, .api, .String, .bridging],
setUpFunction: { test = NSString(cString: "The quick brown fox jumps over the lazy dog", encoding: String.Encoding.ascii.rawValue)! } ),
BenchmarkInfo(name: "NSStringConversion.LongUTF8",
runFunction: run_NSStringConversion_longNonASCII,
tags: [.validation, .api, .String, .bridging],
setUpFunction: { test = NSString(cString: "Thë qüick bröwn föx jumps over the lazy dög", encoding: String.Encoding.utf8.rawValue)! } ),
BenchmarkInfo(name: "NSStringConversion.Rebridge",
runFunction: run_NSStringConversion_rebridge,
tags: [.validation, .api, .String, .bridging],
setUpFunction: { test = NSString(cString: "test", encoding: String.Encoding.ascii.rawValue)! }),
BenchmarkInfo(name: "NSStringConversion.Rebridge.UTF8",
runFunction: run_NSStringConversion_nonASCII_rebridge,
tags: [.validation, .api, .String, .bridging],
setUpFunction: { test = NSString(cString: "tëst", encoding: String.Encoding.utf8.rawValue)! }),
BenchmarkInfo(name: "NSStringConversion.Rebridge.Mutable",
runFunction: run_NSMutableStringConversion_rebridge,
tags: [.validation, .api, .String, .bridging],
setUpFunction: { test = NSMutableString(cString: "test", encoding: String.Encoding.ascii.rawValue)! }),
BenchmarkInfo(name: "NSStringConversion.Rebridge.Medium",
runFunction: run_NSStringConversion_medium_rebridge,
tags: [.validation, .api, .String, .bridging],
setUpFunction: { test = NSString(cString: "aaaaaaaaaaaaaaa", encoding: String.Encoding.ascii.rawValue)! } ),
BenchmarkInfo(name: "NSStringConversion.Rebridge.Long",
runFunction: run_NSStringConversion_long_rebridge,
tags: [.validation, .api, .String, .bridging],
setUpFunction: { test = NSString(cString: "The quick brown fox jumps over the lazy dog", encoding: String.Encoding.ascii.rawValue)! } ),
BenchmarkInfo(name: "NSStringConversion.Rebridge.LongUTF8",
runFunction: run_NSStringConversion_longNonASCII_rebridge,
tags: [.validation, .api, .String, .bridging],
setUpFunction: { test = NSString(cString: "Thë qüick bröwn föx jumps over the lazy dög", encoding: String.Encoding.utf8.rawValue)! })]

public func run_NSStringConversion(_ N: Int) {
Expand All @@ -63,6 +91,10 @@ public func run_NSMutableStringConversion(_ N: Int) {
innerLoop(test, N)
}

/// Benchmark entry point registered as "NSStringConversion.Medium".
///
/// Passes the global `test` string, `N`, and the constant 1000 to `innerLoop`.
/// NOTE(review): the third argument is presumably a per-`N` iteration scale,
/// as it is for `innerRebridge` — confirm against `innerLoop`'s declaration.
///
/// - Parameter N: Value supplied by the benchmark harness and forwarded as-is.
public func run_NSStringConversion_medium(_ N: Int) {
innerLoop(test, N, 1000)
}

/// Benchmark entry point registered as "NSStringConversion.Long".
///
/// Passes the global `test` string, `N`, and the constant 1000 to `innerLoop`.
/// NOTE(review): the third argument is presumably a per-`N` iteration scale,
/// as it is for `innerRebridge` — confirm against `innerLoop`'s declaration.
///
/// - Parameter N: Value supplied by the benchmark harness and forwarded as-is.
public func run_NSStringConversion_long(_ N: Int) {
innerLoop(test, N, 1000)
}
Expand All @@ -71,4 +103,36 @@ public func run_NSStringConversion_longNonASCII(_ N: Int) {
innerLoop(test, N, 300)
}

/// Repeatedly round-trips `str` through `String` and back to `NSString`.
///
/// Every pass bridges `str` to a native `String`, then bridges that value
/// back to `NSString`; both results are handed to `blackHole`.
///
/// - Parameters:
///   - str: The string bridged on every pass.
///   - N: Workload multiplier supplied by the caller.
///   - scale: Number of passes performed per unit of `N` (defaults to 5000).
fileprivate func innerRebridge(_ str: NSString, _ N: Int, _ scale: Int = 5000) {
  // Total pass count; the closed range below requires it to be at least 1.
  let passCount = N * scale
  for _ in 1...passCount {
    let native = identity(str) as String
    blackHole(native)
    blackHole(native as NSString)
  }
}

/// Benchmark entry point registered as "NSStringConversion.Rebridge".
///
/// Round-trips the global `test` string through `innerRebridge`,
/// performing 2500 passes per unit of `N`.
///
/// - Parameter N: Workload multiplier supplied by the benchmark harness.
public func run_NSStringConversion_rebridge(_ N: Int) {
  let passesPerUnit = 2500
  innerRebridge(test, N, passesPerUnit)
}

/// Benchmark entry point registered as "NSStringConversion.Rebridge.UTF8".
///
/// Round-trips the global `test` string through `innerRebridge`,
/// performing 2500 passes per unit of `N`.
///
/// - Parameter N: Workload multiplier supplied by the benchmark harness.
public func run_NSStringConversion_nonASCII_rebridge(_ N: Int) {
  let passesPerUnit = 2500
  innerRebridge(test, N, passesPerUnit)
}

/// Benchmark entry point registered as "NSStringConversion.Rebridge.Mutable".
///
/// Round-trips the global `test` string through `innerRebridge`, leaving the
/// `scale` argument at `innerRebridge`'s default value.
///
/// - Parameter N: Workload multiplier supplied by the benchmark harness.
public func run_NSMutableStringConversion_rebridge(_ N: Int) {
innerRebridge(test, N)
}

/// Benchmark entry point registered as "NSStringConversion.Rebridge.Medium".
///
/// Round-trips the global `test` string through `innerRebridge`,
/// performing 1000 passes per unit of `N`.
///
/// - Parameter N: Workload multiplier supplied by the benchmark harness.
public func run_NSStringConversion_medium_rebridge(_ N: Int) {
  let passesPerUnit = 1000
  innerRebridge(test, N, passesPerUnit)
}

/// Benchmark entry point registered as "NSStringConversion.Rebridge.Long".
///
/// Round-trips the global `test` string through `innerRebridge`,
/// performing 1000 passes per unit of `N`.
///
/// - Parameter N: Workload multiplier supplied by the benchmark harness.
public func run_NSStringConversion_long_rebridge(_ N: Int) {
  let passesPerUnit = 1000
  innerRebridge(test, N, passesPerUnit)
}

/// Benchmark entry point registered as "NSStringConversion.Rebridge.LongUTF8".
///
/// Round-trips the global `test` string through `innerRebridge`,
/// performing 300 passes per unit of `N`.
///
/// - Parameter N: Workload multiplier supplied by the benchmark harness.
public func run_NSStringConversion_longNonASCII_rebridge(_ N: Int) {
  let passesPerUnit = 300
  innerRebridge(test, N, passesPerUnit)
}

#endif
19 changes: 18 additions & 1 deletion stdlib/public/core/SmallString.swift
Original file line number Diff line number Diff line change
Expand Up @@ -218,6 +218,10 @@ extension _SmallString {
return try f(UnsafeMutableBufferPointer(
start: ptr, count: _SmallString.capacity))
}
if len == 0 {
self = _SmallString()
return
}
_internalInvariant(len <= _SmallString.capacity)

let (leading, trailing) = self.zeroTerminatedRawCodeUnits
Expand Down Expand Up @@ -260,6 +264,19 @@ extension _SmallString {

self.init(leading: leading, trailing: trailing, count: count)
}

/// Builds a small string by letting `initializer` fill its storage.
///
/// Starts from an empty `_SmallString`, hands the buffer exposed by
/// `withMutableCapacity` to `initializer`, and then runs `_invariantCheck()`
/// on the result.
///
/// - Parameter initializer: Closure that receives the mutable buffer and
///   returns an `Int`, which is forwarded to `withMutableCapacity` as that
///   closure's result.
/// - Throws: Rethrows any error thrown by `initializer`.
@inline(__always)
internal init(
  initializingUTF8With initializer: (
    _ buffer: UnsafeMutableBufferPointer<UInt8>
  ) throws -> Int
) rethrows {
  self.init()
  try self.withMutableCapacity { storage in
    try initializer(storage)
  }
  self._invariantCheck()
}

@usableFromInline // @testable
internal init?(_ base: _SmallString, appending other: _SmallString) {
Expand Down Expand Up @@ -292,7 +309,7 @@ extension _SmallString {
self.init()
self.withMutableCapacity {
let len = _bridgeTagged(cocoa, intoUTF8: $0)
_internalInvariant(len != nil && len! < _SmallString.capacity,
_internalInvariant(len != nil && len! <= _SmallString.capacity,
"Internal invariant violated: large tagged NSStrings")
return len._unsafelyUnwrappedUnchecked
}
Expand Down
93 changes: 81 additions & 12 deletions stdlib/public/core/String.swift
Original file line number Diff line number Diff line change
Expand Up @@ -427,6 +427,77 @@ extension String {
}
return
}

/// Creates a new String with the specified capacity in UTF-8 code units then
/// calls the given closure with a buffer covering the String's uninitialized
/// memory.
///
/// The closure should return the number of initialized code units,
/// or 0 if it couldn't initialize the buffer (for example if the
/// requested capacity was too small).
///
/// This method replaces ill-formed UTF-8 sequences with the Unicode
/// replacement character (`"\u{FFFD}"`); this may require resizing
/// the buffer beyond its original capacity.
///
/// The following examples use this initializer with the contents of two
/// different `UInt8` arrays---the first with well-formed UTF-8 code unit
/// sequences and the second with an ill-formed sequence at the end.
///
/// let validUTF8: [UInt8] = [67, 97, 102, 195, 169]
/// let s = String(uninitializedCapacity: validUTF8.count,
/// initializingUTF8With: { ptr in
/// _ = ptr.initialize(from: validUTF8)
/// return validUTF8.count
/// })
/// // Prints "Café"
///
/// let invalidUTF8: [UInt8] = [67, 97, 102, 195]
/// let s = String(uninitializedCapacity: invalidUTF8.count,
/// initializingUTF8With: { ptr in
/// _ = ptr.initialize(from: invalidUTF8)
/// return invalidUTF8.count
/// })
/// // Prints "Caf�"
///
/// let s = String(uninitializedCapacity: invalidUTF8.count,
/// initializingUTF8With: { ptr in
/// _ = ptr.initialize(from: invalidUTF8)
/// return 0
/// })
/// // Prints ""
///
/// - Parameters:
/// - capacity: The number of UTF-8 code units worth of memory to allocate
/// for the String.
/// - initializer: A closure that initializes elements and sets the count of
/// the new String
/// - Parameters:
/// - buffer: A buffer covering uninitialized memory with room for the
/// specified number of UTF-8 code units.
@inline(__always)
internal init(
uninitializedCapacity capacity: Int,
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this requires an underscored prefix as per stdlib naming convention, does it not?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's intended to be made public; there's a S-E pitch thread.

initializingUTF8With initializer: (
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nit

Suggested change
initializingUTF8With initializer: (
initializingUTF8With initializer: (

_ buffer: UnsafeMutableBufferPointer<UInt8>
) throws -> Int
) rethrows {
if _fastPath(capacity <= _SmallString.capacity) {
let smol = try _SmallString(initializingUTF8With: initializer)
// Fast case where we fit in a _SmallString and don't need UTF8 validation
if _fastPath(smol.isASCII) {
self = String(_StringGuts(smol))
} else {
// We succeeded in making a _SmallString, but may need to repair UTF8
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Space after //

self = smol.withUTF8 { String._fromUTF8Repairing($0).result }
}
return
}

self = try String._fromLargeUTF8Repairing(
uninitializedCapacity: capacity,
initializingWith: initializer)
}

/// Calls the given closure with a pointer to the contents of the string,
/// represented as a null-terminated sequence of code units.
Expand Down Expand Up @@ -715,13 +786,12 @@ extension String {
public func lowercased() -> String {
if _fastPath(_guts.isFastASCII) {
return _guts.withFastUTF8 { utf8 in
// TODO(String performance): We can directly call appendInPlace
var result = String()
result.reserveCapacity(utf8.count)
for u8 in utf8 {
result._guts.append(String(Unicode.Scalar(_lowercaseASCII(u8)))._guts)
return String(uninitializedCapacity: utf8.count) { buffer in
for i in 0 ..< utf8.count {
buffer[i] = _lowercaseASCII(utf8[i])
}
return utf8.count
}
return result
}
}

Expand Down Expand Up @@ -776,13 +846,12 @@ extension String {
public func uppercased() -> String {
if _fastPath(_guts.isFastASCII) {
return _guts.withFastUTF8 { utf8 in
// TODO(String performance): code-unit appendInPlace on guts
var result = String()
result.reserveCapacity(utf8.count)
for u8 in utf8 {
result._guts.append(String(Unicode.Scalar(_uppercaseASCII(u8)))._guts)
return String(uninitializedCapacity: utf8.count) { buffer in
for i in 0 ..< utf8.count {
buffer[i] = _uppercaseASCII(utf8[i])
}
return utf8.count
}
return result
}
}

Expand Down
29 changes: 26 additions & 3 deletions stdlib/public/core/StringCreate.swift
Original file line number Diff line number Diff line change
Expand Up @@ -79,13 +79,36 @@ extension String {
) -> (result: String, repairsMade: Bool) {
switch validateUTF8(input) {
case .success(let extraInfo):
return (String._uncheckedFromUTF8(
input, asciiPreScanResult: extraInfo.isASCII
), false)
return (String._uncheckedFromUTF8(
input, asciiPreScanResult: extraInfo.isASCII
), false)
case .error(let initialRange):
return (repairUTF8(input, firstKnownBrokenRange: initialRange), true)
}
}

internal static func _fromLargeUTF8Repairing(
uninitializedCapacity capacity: Int,
initializingWith initializer: (
_ buffer: UnsafeMutableBufferPointer<UInt8>
) throws -> Int
) rethrows -> String {
let result = try __StringStorage.create(
uninitializedCapacity: capacity,
initializingUncheckedUTF8With: initializer)

switch validateUTF8(result.codeUnits) {
case .success(let info):
result._updateCountAndFlags(
newCount: result.count,
newIsASCII: info.isASCII
)
return result.asString
case .error(let initialRange):
// This could be optimized to use excess tail capacity
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Space after //

return repairUTF8(result.codeUnits, firstKnownBrokenRange: initialRange)
}
}

@usableFromInline
internal static func _uncheckedFromUTF8(
Expand Down
42 changes: 35 additions & 7 deletions stdlib/public/core/StringStorage.swift
Original file line number Diff line number Diff line change
Expand Up @@ -398,6 +398,34 @@ extension __StringStorage {
return __StringStorage.create(
realCodeUnitCapacity: realCapacity, countAndFlags: countAndFlags)
}

// The caller is expected to check UTF8 validity and ASCII-ness and update
// the resulting StringStorage accordingly
internal static func create(
uninitializedCapacity capacity: Int,
initializingUncheckedUTF8With initializer: (
_ buffer: UnsafeMutableBufferPointer<UInt8>
) throws -> Int
) rethrows -> __StringStorage {
let storage = __StringStorage.create(
capacity: capacity,
countAndFlags: CountAndFlags(mortalCount: 0, isASCII: false)
)
let buffer = UnsafeMutableBufferPointer(start: storage.mutableStart,
count: capacity)
let count = try initializer(buffer)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Shouldn't this just end in a call to _updateCountAndFlags or something like that?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

_updateCountAndFlags checks invariants, and this particular initializer doesn't guarantee that all logically invariant things (for example "has ascii contents and claims to be ascii") hold, which is why I didn't use it in this one place.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

One day I'll refactor all this logic to flow more reasonably, make the stored properties private, etc., and then have a dedicated bit for "this has passed through some sanctioned create call".

let countAndFlags = CountAndFlags(mortalCount: count, isASCII: false)
#if arch(i386) || arch(arm)
storage._count = countAndFlags.count
storage._flags = countAndFlags.flags
#else
storage._countAndFlags = countAndFlags
#endif

storage.terminator.pointee = 0 // nul-terminated
return storage
}

@_effects(releasenone)
internal static func create(
Expand Down Expand Up @@ -453,7 +481,7 @@ extension __StringStorage {
}

@inline(__always)
private var codeUnits: UnsafeBufferPointer<UInt8> {
internal var codeUnits: UnsafeBufferPointer<UInt8> {
return UnsafeBufferPointer(start: start, count: count)
}

Expand Down Expand Up @@ -518,7 +546,7 @@ extension __StringStorage {
extension __StringStorage {
// Perform common post-RRC adjustments and invariant enforcement.
@_effects(releasenone)
private func _postRRCAdjust(newCount: Int, newIsASCII: Bool) {
internal func _updateCountAndFlags(newCount: Int, newIsASCII: Bool) {
let countAndFlags = CountAndFlags(
mortalCount: newCount, isASCII: newIsASCII)
#if arch(i386) || arch(arm)
Expand All @@ -540,7 +568,7 @@ extension __StringStorage {
appendedCount: Int, appendedIsASCII isASCII: Bool
) {
let oldTerminator = self.terminator
_postRRCAdjust(
_updateCountAndFlags(
newCount: self.count + appendedCount, newIsASCII: self.isASCII && isASCII)
_internalInvariant(oldTerminator + appendedCount == self.terminator)
}
Expand Down Expand Up @@ -570,7 +598,7 @@ extension __StringStorage {
}

internal func clear() {
_postRRCAdjust(newCount: 0, newIsASCII: true)
_updateCountAndFlags(newCount: 0, newIsASCII: true)
}
}

Expand All @@ -585,7 +613,7 @@ extension __StringStorage {
let tailCount = mutableEnd - upperPtr
lowerPtr.moveInitialize(from: upperPtr, count: tailCount)

_postRRCAdjust(
_updateCountAndFlags(
newCount: self.count &- (upper &- lower), newIsASCII: self.isASCII)
}

Expand Down Expand Up @@ -622,7 +650,7 @@ extension __StringStorage {
count: replCount)

let isASCII = self.isASCII && _allASCII(replacement)
_postRRCAdjust(newCount: lower + replCount + tailCount, newIsASCII: isASCII)
_updateCountAndFlags(newCount: lower + replCount + tailCount, newIsASCII: isASCII)
}


Expand Down Expand Up @@ -651,7 +679,7 @@ extension __StringStorage {
}
_internalInvariant(srcCount == replCount)

_postRRCAdjust(
_updateCountAndFlags(
newCount: lower + replCount + tailCount, newIsASCII: isASCII)
}
}
Expand Down