-
Notifications
You must be signed in to change notification settings - Fork 10.5k
Add a private implementation of a String initializer with access to uninitialized storage (https://github.com/apple/swift-evolution/pull/1022) and use it to speed up uppercased() and lowercased() #26007
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
|
@@ -427,6 +427,77 @@ extension String { | |||||
} | ||||||
return | ||||||
} | ||||||
|
||||||
/// Creates a new String with the specified capacity in UTF-8 code units then | ||||||
/// calls the given closure with a buffer covering the String's uninitialized | ||||||
/// memory. | ||||||
/// | ||||||
/// The closure should return the number of initialized code units, | ||||||
/// or 0 if it couldn't initialize the buffer (for example if the | ||||||
/// requested capacity was too small). | ||||||
/// | ||||||
/// This method replaces ill-formed UTF-8 sequences with the Unicode | ||||||
/// replacement character (`"\u{FFFD}"`); This may require resizing | ||||||
/// the buffer beyond its original capacity. | ||||||
/// | ||||||
/// The following examples use this initializer with the contents of two | ||||||
/// different `UInt8` arrays---the first with well-formed UTF-8 code unit | ||||||
/// sequences and the second with an ill-formed sequence at the end. | ||||||
/// | ||||||
/// let validUTF8: [UInt8] = [67, 97, 102, -61, -87, 0] | ||||||
/// let s = String(uninitializedCapacity: validUTF8.count, | ||||||
/// initializingUTF8With: { ptr in | ||||||
/// ptr.initializeFrom(validUTF8) | ||||||
/// return validUTF8.count | ||||||
/// }) | ||||||
/// // Prints "Café" | ||||||
/// | ||||||
/// let invalidUTF8: [UInt8] = [67, 97, 102, -61, 0] | ||||||
/// let s = String(uninitializedCapacity: invalidUTF8.count, | ||||||
/// initializingUTF8With: { ptr in | ||||||
/// ptr.initializeFrom(invalidUTF8) | ||||||
/// return invalidUTF8.count | ||||||
/// }) | ||||||
/// // Prints "Caf�" | ||||||
/// | ||||||
/// let s = String(uninitializedCapacity: invalidUTF8.count, | ||||||
/// initializingUTF8With: { ptr in | ||||||
/// ptr.initializeFrom(invalidUTF8) | ||||||
/// return 0 | ||||||
/// }) | ||||||
/// // Prints "" | ||||||
/// | ||||||
/// - Parameters: | ||||||
/// - capacity: The number of UTF-8 code units worth of memory to allocate | ||||||
/// for the String. | ||||||
/// - initializer: A closure that initializes elements and sets the count of | ||||||
/// the new String | ||||||
/// - Parameters: | ||||||
/// - buffer: A buffer covering uninitialized memory with room for the | ||||||
/// specified number of UTF-8 code units. | ||||||
@inline(__always) | ||||||
internal init( | ||||||
uninitializedCapacity capacity: Int, | ||||||
initializingUTF8With initializer: ( | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Nit
Suggested change
|
||||||
_ buffer: UnsafeMutableBufferPointer<UInt8> | ||||||
) throws -> Int | ||||||
) rethrows { | ||||||
if _fastPath(capacity <= _SmallString.capacity) { | ||||||
let smol = try _SmallString(initializingUTF8With: initializer) | ||||||
// Fast case where we fit in a _SmallString and don't need UTF8 validation | ||||||
if _fastPath(smol.isASCII) { | ||||||
self = String(_StringGuts(smol)) | ||||||
} else { | ||||||
//We succeeded in making a _SmallString, but may need to repair UTF8 | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Space after |
||||||
self = smol.withUTF8 { String._fromUTF8Repairing($0).result } | ||||||
} | ||||||
return | ||||||
} | ||||||
|
||||||
self = try String._fromLargeUTF8Repairing( | ||||||
uninitializedCapacity: capacity, | ||||||
initializingWith: initializer) | ||||||
} | ||||||
|
||||||
/// Calls the given closure with a pointer to the contents of the string, | ||||||
/// represented as a null-terminated sequence of code units. | ||||||
|
@@ -715,13 +786,12 @@ extension String { | |||||
public func lowercased() -> String { | ||||||
if _fastPath(_guts.isFastASCII) { | ||||||
return _guts.withFastUTF8 { utf8 in | ||||||
// TODO(String performance): We can directly call appendInPlace | ||||||
var result = String() | ||||||
result.reserveCapacity(utf8.count) | ||||||
for u8 in utf8 { | ||||||
result._guts.append(String(Unicode.Scalar(_lowercaseASCII(u8)))._guts) | ||||||
return String(uninitializedCapacity: utf8.count) { buffer in | ||||||
for i in 0 ..< utf8.count { | ||||||
buffer[i] = _lowercaseASCII(utf8[i]) | ||||||
} | ||||||
return utf8.count | ||||||
Catfish-Man marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||
} | ||||||
return result | ||||||
} | ||||||
} | ||||||
|
||||||
|
@@ -776,13 +846,12 @@ extension String { | |||||
public func uppercased() -> String { | ||||||
if _fastPath(_guts.isFastASCII) { | ||||||
return _guts.withFastUTF8 { utf8 in | ||||||
// TODO(String performance): code-unit appendInPlace on guts | ||||||
var result = String() | ||||||
result.reserveCapacity(utf8.count) | ||||||
for u8 in utf8 { | ||||||
result._guts.append(String(Unicode.Scalar(_uppercaseASCII(u8)))._guts) | ||||||
return String(uninitializedCapacity: utf8.count) { buffer in | ||||||
for i in 0 ..< utf8.count { | ||||||
buffer[i] = _uppercaseASCII(utf8[i]) | ||||||
} | ||||||
return utf8.count | ||||||
} | ||||||
return result | ||||||
} | ||||||
} | ||||||
|
||||||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -79,13 +79,36 @@ extension String { | |
) -> (result: String, repairsMade: Bool) { | ||
switch validateUTF8(input) { | ||
case .success(let extraInfo): | ||
return (String._uncheckedFromUTF8( | ||
input, asciiPreScanResult: extraInfo.isASCII | ||
), false) | ||
return (String._uncheckedFromUTF8( | ||
input, asciiPreScanResult: extraInfo.isASCII | ||
), false) | ||
case .error(let initialRange): | ||
return (repairUTF8(input, firstKnownBrokenRange: initialRange), true) | ||
} | ||
} | ||
|
||
internal static func _fromLargeUTF8Repairing( | ||
uninitializedCapacity capacity: Int, | ||
initializingWith initializer: ( | ||
_ buffer: UnsafeMutableBufferPointer<UInt8> | ||
) throws -> Int | ||
) rethrows -> String { | ||
let result = try __StringStorage.create( | ||
uninitializedCapacity: capacity, | ||
initializingUncheckedUTF8With: initializer) | ||
|
||
switch validateUTF8(result.codeUnits) { | ||
case .success(let info): | ||
result._updateCountAndFlags( | ||
newCount: result.count, | ||
newIsASCII: info.isASCII | ||
) | ||
return result.asString | ||
case .error(let initialRange): | ||
//This could be optimized to use excess tail capacity | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Space after |
||
return repairUTF8(result.codeUnits, firstKnownBrokenRange: initialRange) | ||
} | ||
} | ||
Catfish-Man marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
@usableFromInline | ||
internal static func _uncheckedFromUTF8( | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -398,6 +398,34 @@ extension __StringStorage { | |
return __StringStorage.create( | ||
realCodeUnitCapacity: realCapacity, countAndFlags: countAndFlags) | ||
} | ||
|
||
// The caller is expected to check UTF8 validity and ASCII-ness and update | ||
// the resulting StringStorage accordingly | ||
internal static func create( | ||
uninitializedCapacity capacity: Int, | ||
initializingUncheckedUTF8With initializer: ( | ||
_ buffer: UnsafeMutableBufferPointer<UInt8> | ||
) throws -> Int | ||
) rethrows -> __StringStorage { | ||
let storage = __StringStorage.create( | ||
capacity: capacity, | ||
countAndFlags: CountAndFlags(mortalCount: 0, isASCII: false) | ||
) | ||
let buffer = UnsafeMutableBufferPointer(start: storage.mutableStart, | ||
count: capacity) | ||
let count = try initializer(buffer) | ||
|
||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Shouldn't this just end in a call to There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. _updateCountAndFlags checks invariants, and this particular initializer doesn't guarantee that all logically invariant things (for example "has ascii contents and claims to be ascii") hold, which is why I didn't use it in this one place. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. One day I'll refactor all this logic to flow more reasonably, make the stored properties private, etc., and then have a dedicated bit for "this has passed through some sanctioned create call". |
||
let countAndFlags = CountAndFlags(mortalCount: count, isASCII: false) | ||
#if arch(i386) || arch(arm) | ||
storage._count = countAndFlags.count | ||
storage._flags = countAndFlags.flags | ||
#else | ||
storage._countAndFlags = countAndFlags | ||
#endif | ||
|
||
storage.terminator.pointee = 0 // nul-terminated | ||
return storage | ||
Catfish-Man marked this conversation as resolved.
Show resolved
Hide resolved
|
||
} | ||
|
||
@_effects(releasenone) | ||
internal static func create( | ||
|
@@ -453,7 +481,7 @@ extension __StringStorage { | |
} | ||
|
||
@inline(__always) | ||
private var codeUnits: UnsafeBufferPointer<UInt8> { | ||
internal var codeUnits: UnsafeBufferPointer<UInt8> { | ||
return UnsafeBufferPointer(start: start, count: count) | ||
} | ||
|
||
|
@@ -518,7 +546,7 @@ extension __StringStorage { | |
extension __StringStorage { | ||
// Perform common post-RRC adjustments and invariant enforcement. | ||
@_effects(releasenone) | ||
private func _postRRCAdjust(newCount: Int, newIsASCII: Bool) { | ||
internal func _updateCountAndFlags(newCount: Int, newIsASCII: Bool) { | ||
let countAndFlags = CountAndFlags( | ||
mortalCount: newCount, isASCII: newIsASCII) | ||
#if arch(i386) || arch(arm) | ||
|
@@ -540,7 +568,7 @@ extension __StringStorage { | |
appendedCount: Int, appendedIsASCII isASCII: Bool | ||
) { | ||
let oldTerminator = self.terminator | ||
_postRRCAdjust( | ||
_updateCountAndFlags( | ||
newCount: self.count + appendedCount, newIsASCII: self.isASCII && isASCII) | ||
_internalInvariant(oldTerminator + appendedCount == self.terminator) | ||
} | ||
|
@@ -570,7 +598,7 @@ extension __StringStorage { | |
} | ||
|
||
internal func clear() { | ||
_postRRCAdjust(newCount: 0, newIsASCII: true) | ||
_updateCountAndFlags(newCount: 0, newIsASCII: true) | ||
} | ||
} | ||
|
||
|
@@ -585,7 +613,7 @@ extension __StringStorage { | |
let tailCount = mutableEnd - upperPtr | ||
lowerPtr.moveInitialize(from: upperPtr, count: tailCount) | ||
|
||
_postRRCAdjust( | ||
_updateCountAndFlags( | ||
newCount: self.count &- (upper &- lower), newIsASCII: self.isASCII) | ||
} | ||
|
||
|
@@ -622,7 +650,7 @@ extension __StringStorage { | |
count: replCount) | ||
|
||
let isASCII = self.isASCII && _allASCII(replacement) | ||
_postRRCAdjust(newCount: lower + replCount + tailCount, newIsASCII: isASCII) | ||
_updateCountAndFlags(newCount: lower + replCount + tailCount, newIsASCII: isASCII) | ||
} | ||
|
||
|
||
|
@@ -651,7 +679,7 @@ extension __StringStorage { | |
} | ||
_internalInvariant(srcCount == replCount) | ||
|
||
_postRRCAdjust( | ||
_updateCountAndFlags( | ||
newCount: lower + replCount + tailCount, newIsASCII: isASCII) | ||
} | ||
} | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think this requires an underscored prefix as per stdlib naming convention, does it not?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It's intended to be made public; there's a S-E pitch thread.