Skip to content

Commit 68687e7

Browse files
authored
Merge pull request #26007 from Catfish-Man/uninitialized-initialize-uppercase
Add a private implementation of a String initializer with access to uninitialized storage (swiftlang/swift-evolution#1022) and use it to speed up uppercased() and lowercased()
2 parents 20378a7 + b06137b commit 68687e7

File tree

5 files changed

+224
-23
lines changed

5 files changed

+224
-23
lines changed

benchmark/single-source/NSStringConversion.swift

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,13 +30,41 @@ public let NSStringConversion = [
3030
runFunction: run_NSMutableStringConversion,
3131
tags: [.validation, .api, .String, .bridging],
3232
setUpFunction: { test = NSMutableString(cString: "test", encoding: String.Encoding.ascii.rawValue)! }),
33+
BenchmarkInfo(name: "NSStringConversion.Medium",
34+
runFunction: run_NSStringConversion_medium,
35+
tags: [.validation, .api, .String, .bridging],
36+
setUpFunction: { test = NSString(cString: "aaaaaaaaaaaaaaa", encoding: String.Encoding.ascii.rawValue)! } ),
3337
BenchmarkInfo(name: "NSStringConversion.Long",
3438
runFunction: run_NSStringConversion_long,
3539
tags: [.validation, .api, .String, .bridging],
3640
setUpFunction: { test = NSString(cString: "The quick brown fox jumps over the lazy dog", encoding: String.Encoding.ascii.rawValue)! } ),
3741
BenchmarkInfo(name: "NSStringConversion.LongUTF8",
3842
runFunction: run_NSStringConversion_longNonASCII,
3943
tags: [.validation, .api, .String, .bridging],
44+
setUpFunction: { test = NSString(cString: "Thë qüick bröwn föx jumps over the lazy dög", encoding: String.Encoding.utf8.rawValue)! } ),
45+
BenchmarkInfo(name: "NSStringConversion.Rebridge",
46+
runFunction: run_NSStringConversion_rebridge,
47+
tags: [.validation, .api, .String, .bridging],
48+
setUpFunction: { test = NSString(cString: "test", encoding: String.Encoding.ascii.rawValue)! }),
49+
BenchmarkInfo(name: "NSStringConversion.Rebridge.UTF8",
50+
runFunction: run_NSStringConversion_nonASCII_rebridge,
51+
tags: [.validation, .api, .String, .bridging],
52+
setUpFunction: { test = NSString(cString: "tëst", encoding: String.Encoding.utf8.rawValue)! }),
53+
BenchmarkInfo(name: "NSStringConversion.Rebridge.Mutable",
54+
runFunction: run_NSMutableStringConversion_rebridge,
55+
tags: [.validation, .api, .String, .bridging],
56+
setUpFunction: { test = NSMutableString(cString: "test", encoding: String.Encoding.ascii.rawValue)! }),
57+
BenchmarkInfo(name: "NSStringConversion.Rebridge.Medium",
58+
runFunction: run_NSStringConversion_medium_rebridge,
59+
tags: [.validation, .api, .String, .bridging],
60+
setUpFunction: { test = NSString(cString: "aaaaaaaaaaaaaaa", encoding: String.Encoding.ascii.rawValue)! } ),
61+
BenchmarkInfo(name: "NSStringConversion.Rebridge.Long",
62+
runFunction: run_NSStringConversion_long_rebridge,
63+
tags: [.validation, .api, .String, .bridging],
64+
setUpFunction: { test = NSString(cString: "The quick brown fox jumps over the lazy dog", encoding: String.Encoding.ascii.rawValue)! } ),
65+
BenchmarkInfo(name: "NSStringConversion.Rebridge.LongUTF8",
66+
runFunction: run_NSStringConversion_longNonASCII_rebridge,
67+
tags: [.validation, .api, .String, .bridging],
4068
setUpFunction: { test = NSString(cString: "Thë qüick bröwn föx jumps over the lazy dög", encoding: String.Encoding.utf8.rawValue)! })]
4169

4270
public func run_NSStringConversion(_ N: Int) {
@@ -63,6 +91,10 @@ public func run_NSMutableStringConversion(_ N: Int) {
6391
innerLoop(test, N)
6492
}
6593

94+
public func run_NSStringConversion_medium(_ N: Int) {
95+
innerLoop(test, N, 1000)
96+
}
97+
6698
public func run_NSStringConversion_long(_ N: Int) {
6799
innerLoop(test, N, 1000)
68100
}
@@ -71,4 +103,36 @@ public func run_NSStringConversion_longNonASCII(_ N: Int) {
71103
innerLoop(test, N, 300)
72104
}
73105

106+
fileprivate func innerRebridge(_ str: NSString, _ N: Int, _ scale: Int = 5000) {
107+
for _ in 1...N * scale {
108+
let bridged = identity(str) as String
109+
blackHole(bridged)
110+
blackHole(bridged as NSString)
111+
}
112+
}
113+
114+
public func run_NSStringConversion_rebridge(_ N: Int) {
115+
innerRebridge(test, N, 2500)
116+
}
117+
118+
public func run_NSStringConversion_nonASCII_rebridge(_ N: Int) {
119+
innerRebridge(test, N, 2500)
120+
}
121+
122+
public func run_NSMutableStringConversion_rebridge(_ N: Int) {
123+
innerRebridge(test, N)
124+
}
125+
126+
public func run_NSStringConversion_medium_rebridge(_ N: Int) {
127+
innerRebridge(test, N, 1000)
128+
}
129+
130+
public func run_NSStringConversion_long_rebridge(_ N: Int) {
131+
innerRebridge(test, N, 1000)
132+
}
133+
134+
public func run_NSStringConversion_longNonASCII_rebridge(_ N: Int) {
135+
innerRebridge(test, N, 300)
136+
}
137+
74138
#endif

stdlib/public/core/SmallString.swift

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -218,6 +218,10 @@ extension _SmallString {
218218
return try f(UnsafeMutableBufferPointer(
219219
start: ptr, count: _SmallString.capacity))
220220
}
221+
if len == 0 {
222+
self = _SmallString()
223+
return
224+
}
221225
_internalInvariant(len <= _SmallString.capacity)
222226

223227
let (leading, trailing) = self.zeroTerminatedRawCodeUnits
@@ -260,6 +264,19 @@ extension _SmallString {
260264

261265
self.init(leading: leading, trailing: trailing, count: count)
262266
}
267+
268+
@inline(__always)
269+
internal init(
270+
initializingUTF8With initializer: (
271+
_ buffer: UnsafeMutableBufferPointer<UInt8>
272+
) throws -> Int
273+
) rethrows {
274+
self.init()
275+
try self.withMutableCapacity {
276+
return try initializer($0)
277+
}
278+
self._invariantCheck()
279+
}
263280

264281
@usableFromInline // @testable
265282
internal init?(_ base: _SmallString, appending other: _SmallString) {
@@ -292,7 +309,7 @@ extension _SmallString {
292309
self.init()
293310
self.withMutableCapacity {
294311
let len = _bridgeTagged(cocoa, intoUTF8: $0)
295-
_internalInvariant(len != nil && len! < _SmallString.capacity,
312+
_internalInvariant(len != nil && len! <= _SmallString.capacity,
296313
"Internal invariant violated: large tagged NSStrings")
297314
return len._unsafelyUnwrappedUnchecked
298315
}

stdlib/public/core/String.swift

Lines changed: 81 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -427,6 +427,77 @@ extension String {
427427
}
428428
return
429429
}
430+
431+
/// Creates a new String with the specified capacity in UTF-8 code units then
432+
/// calls the given closure with a buffer covering the String's uninitialized
433+
/// memory.
434+
///
435+
/// The closure should return the number of initialized code units,
436+
/// or 0 if it couldn't initialize the buffer (for example if the
437+
/// requested capacity was too small).
438+
///
439+
/// This method replaces ill-formed UTF-8 sequences with the Unicode
440+
/// replacement character (`"\u{FFFD}"`); this may require resizing
441+
/// the buffer beyond its original capacity.
442+
///
443+
/// The following examples use this initializer with the contents of two
444+
/// different `UInt8` arrays---the first with well-formed UTF-8 code unit
445+
/// sequences and the second with an ill-formed sequence at the end.
446+
///
447+
/// let validUTF8: [UInt8] = [67, 97, 102, 195, 169]
448+
/// let s = String(uninitializedCapacity: validUTF8.count,
449+
/// initializingUTF8With: { ptr in
450+
/// _ = ptr.initialize(from: validUTF8)
451+
/// return validUTF8.count
452+
/// })
453+
/// // Prints "Café"
454+
///
455+
/// let invalidUTF8: [UInt8] = [67, 97, 102, 195]
456+
/// let s = String(uninitializedCapacity: invalidUTF8.count,
457+
/// initializingUTF8With: { ptr in
458+
/// _ = ptr.initialize(from: invalidUTF8)
459+
/// return invalidUTF8.count
460+
/// })
461+
/// // Prints "Caf�"
462+
///
463+
/// let s = String(uninitializedCapacity: invalidUTF8.count,
464+
/// initializingUTF8With: { ptr in
465+
/// _ = ptr.initialize(from: invalidUTF8)
466+
/// return 0
467+
/// })
468+
/// // Prints ""
469+
///
470+
/// - Parameters:
471+
/// - capacity: The number of UTF-8 code units worth of memory to allocate
472+
/// for the String.
473+
/// - initializer: A closure that initializes elements and returns the count of
474+
/// the new String
475+
/// - Parameters:
476+
/// - buffer: A buffer covering uninitialized memory with room for the
477+
/// specified number of UTF-8 code units.
478+
@inline(__always)
479+
internal init(
480+
uninitializedCapacity capacity: Int,
481+
initializingUTF8With initializer: (
482+
_ buffer: UnsafeMutableBufferPointer<UInt8>
483+
) throws -> Int
484+
) rethrows {
485+
if _fastPath(capacity <= _SmallString.capacity) {
486+
let smol = try _SmallString(initializingUTF8With: initializer)
487+
// Fast case where we fit in a _SmallString and don't need UTF8 validation
488+
if _fastPath(smol.isASCII) {
489+
self = String(_StringGuts(smol))
490+
} else {
491+
// We succeeded in making a _SmallString, but may need to repair UTF8
492+
self = smol.withUTF8 { String._fromUTF8Repairing($0).result }
493+
}
494+
return
495+
}
496+
497+
self = try String._fromLargeUTF8Repairing(
498+
uninitializedCapacity: capacity,
499+
initializingWith: initializer)
500+
}
430501

431502
/// Calls the given closure with a pointer to the contents of the string,
432503
/// represented as a null-terminated sequence of code units.
@@ -715,13 +786,12 @@ extension String {
715786
public func lowercased() -> String {
716787
if _fastPath(_guts.isFastASCII) {
717788
return _guts.withFastUTF8 { utf8 in
718-
// TODO(String performance): We can directly call appendInPlace
719-
var result = String()
720-
result.reserveCapacity(utf8.count)
721-
for u8 in utf8 {
722-
result._guts.append(String(Unicode.Scalar(_lowercaseASCII(u8)))._guts)
789+
return String(uninitializedCapacity: utf8.count) { buffer in
790+
for i in 0 ..< utf8.count {
791+
buffer[i] = _lowercaseASCII(utf8[i])
792+
}
793+
return utf8.count
723794
}
724-
return result
725795
}
726796
}
727797

@@ -776,13 +846,12 @@ extension String {
776846
public func uppercased() -> String {
777847
if _fastPath(_guts.isFastASCII) {
778848
return _guts.withFastUTF8 { utf8 in
779-
// TODO(String performance): code-unit appendInPlace on guts
780-
var result = String()
781-
result.reserveCapacity(utf8.count)
782-
for u8 in utf8 {
783-
result._guts.append(String(Unicode.Scalar(_uppercaseASCII(u8)))._guts)
849+
return String(uninitializedCapacity: utf8.count) { buffer in
850+
for i in 0 ..< utf8.count {
851+
buffer[i] = _uppercaseASCII(utf8[i])
852+
}
853+
return utf8.count
784854
}
785-
return result
786855
}
787856
}
788857

stdlib/public/core/StringCreate.swift

Lines changed: 26 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -79,13 +79,36 @@ extension String {
7979
) -> (result: String, repairsMade: Bool) {
8080
switch validateUTF8(input) {
8181
case .success(let extraInfo):
82-
return (String._uncheckedFromUTF8(
83-
input, asciiPreScanResult: extraInfo.isASCII
84-
), false)
82+
return (String._uncheckedFromUTF8(
83+
input, asciiPreScanResult: extraInfo.isASCII
84+
), false)
8585
case .error(let initialRange):
8686
return (repairUTF8(input, firstKnownBrokenRange: initialRange), true)
8787
}
8888
}
89+
90+
internal static func _fromLargeUTF8Repairing(
91+
uninitializedCapacity capacity: Int,
92+
initializingWith initializer: (
93+
_ buffer: UnsafeMutableBufferPointer<UInt8>
94+
) throws -> Int
95+
) rethrows -> String {
96+
let result = try __StringStorage.create(
97+
uninitializedCapacity: capacity,
98+
initializingUncheckedUTF8With: initializer)
99+
100+
switch validateUTF8(result.codeUnits) {
101+
case .success(let info):
102+
result._updateCountAndFlags(
103+
newCount: result.count,
104+
newIsASCII: info.isASCII
105+
)
106+
return result.asString
107+
case .error(let initialRange):
108+
// This could be optimized to use excess tail capacity
109+
return repairUTF8(result.codeUnits, firstKnownBrokenRange: initialRange)
110+
}
111+
}
89112

90113
@usableFromInline
91114
internal static func _uncheckedFromUTF8(

stdlib/public/core/StringStorage.swift

Lines changed: 35 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -398,6 +398,34 @@ extension __StringStorage {
398398
return __StringStorage.create(
399399
realCodeUnitCapacity: realCapacity, countAndFlags: countAndFlags)
400400
}
401+
402+
// The caller is expected to check UTF8 validity and ASCII-ness and update
403+
// the resulting StringStorage accordingly
404+
internal static func create(
405+
uninitializedCapacity capacity: Int,
406+
initializingUncheckedUTF8With initializer: (
407+
_ buffer: UnsafeMutableBufferPointer<UInt8>
408+
) throws -> Int
409+
) rethrows -> __StringStorage {
410+
let storage = __StringStorage.create(
411+
capacity: capacity,
412+
countAndFlags: CountAndFlags(mortalCount: 0, isASCII: false)
413+
)
414+
let buffer = UnsafeMutableBufferPointer(start: storage.mutableStart,
415+
count: capacity)
416+
let count = try initializer(buffer)
417+
418+
let countAndFlags = CountAndFlags(mortalCount: count, isASCII: false)
419+
#if arch(i386) || arch(arm)
420+
storage._count = countAndFlags.count
421+
storage._flags = countAndFlags.flags
422+
#else
423+
storage._countAndFlags = countAndFlags
424+
#endif
425+
426+
storage.terminator.pointee = 0 // nul-terminated
427+
return storage
428+
}
401429

402430
@_effects(releasenone)
403431
internal static func create(
@@ -453,7 +481,7 @@ extension __StringStorage {
453481
}
454482

455483
@inline(__always)
456-
private var codeUnits: UnsafeBufferPointer<UInt8> {
484+
internal var codeUnits: UnsafeBufferPointer<UInt8> {
457485
return UnsafeBufferPointer(start: start, count: count)
458486
}
459487

@@ -518,7 +546,7 @@ extension __StringStorage {
518546
extension __StringStorage {
519547
// Perform common post-RRC adjustments and invariant enforcement.
520548
@_effects(releasenone)
521-
private func _postRRCAdjust(newCount: Int, newIsASCII: Bool) {
549+
internal func _updateCountAndFlags(newCount: Int, newIsASCII: Bool) {
522550
let countAndFlags = CountAndFlags(
523551
mortalCount: newCount, isASCII: newIsASCII)
524552
#if arch(i386) || arch(arm)
@@ -540,7 +568,7 @@ extension __StringStorage {
540568
appendedCount: Int, appendedIsASCII isASCII: Bool
541569
) {
542570
let oldTerminator = self.terminator
543-
_postRRCAdjust(
571+
_updateCountAndFlags(
544572
newCount: self.count + appendedCount, newIsASCII: self.isASCII && isASCII)
545573
_internalInvariant(oldTerminator + appendedCount == self.terminator)
546574
}
@@ -570,7 +598,7 @@ extension __StringStorage {
570598
}
571599

572600
internal func clear() {
573-
_postRRCAdjust(newCount: 0, newIsASCII: true)
601+
_updateCountAndFlags(newCount: 0, newIsASCII: true)
574602
}
575603
}
576604

@@ -585,7 +613,7 @@ extension __StringStorage {
585613
let tailCount = mutableEnd - upperPtr
586614
lowerPtr.moveInitialize(from: upperPtr, count: tailCount)
587615

588-
_postRRCAdjust(
616+
_updateCountAndFlags(
589617
newCount: self.count &- (upper &- lower), newIsASCII: self.isASCII)
590618
}
591619

@@ -622,7 +650,7 @@ extension __StringStorage {
622650
count: replCount)
623651

624652
let isASCII = self.isASCII && _allASCII(replacement)
625-
_postRRCAdjust(newCount: lower + replCount + tailCount, newIsASCII: isASCII)
653+
_updateCountAndFlags(newCount: lower + replCount + tailCount, newIsASCII: isASCII)
626654
}
627655

628656

@@ -651,7 +679,7 @@ extension __StringStorage {
651679
}
652680
_internalInvariant(srcCount == replCount)
653681

654-
_postRRCAdjust(
682+
_updateCountAndFlags(
655683
newCount: lower + replCount + tailCount, newIsASCII: isASCII)
656684
}
657685
}

0 commit comments

Comments
 (0)