Skip to content

Commit fd1c861

Browse files
committed
[string] ASCII/UTF-8 fast paths for String.init(decoding:as:)
Add fast paths to String.init(decoding:as:) for inputs whose UTF-8 code units are stored contiguously. This dramatically speeds up creation when the String happens to be ASCII, and it also lets us form more small strings.
1 parent e3d69d1 commit fd1c861

File tree

4 files changed

+99
-21
lines changed

4 files changed

+99
-21
lines changed

stdlib/public/core/CString.swift

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -174,7 +174,7 @@ internal func _decodeValidCString(
174174
return cString.withMemoryRebound(to: UInt8.self, capacity: len) {
175175
(ptr: UnsafePointer<UInt8>) -> String in
176176
let bufPtr = UnsafeBufferPointer(start: ptr, count: len)
177-
return String._fromWellFormedUTF8CodeUnitSequence(bufPtr, repair: repair)
177+
return String._fromWellFormedUTF8(bufPtr, repair: repair)
178178
}
179179
}
180180

@@ -183,7 +183,7 @@ internal func _decodeValidCString(
183183
) -> String {
184184
let len = UTF8._nullCodeUnitOffset(in: cString)
185185
let bufPtr = UnsafeBufferPointer(start: cString, count: len)
186-
return String._fromWellFormedUTF8CodeUnitSequence(bufPtr, repair: repair)
186+
return String._fromWellFormedUTF8(bufPtr, repair: repair)
187187
}
188188

189189
internal func _decodeCString(
@@ -193,7 +193,7 @@ internal func _decodeCString(
193193
return cString.withMemoryRebound(to: UInt8.self, capacity: len) {
194194
(ptr: UnsafePointer<UInt8>) -> String? in
195195
let bufPtr = UnsafeBufferPointer(start: ptr, count: len)
196-
return String._fromUTF8CodeUnitSequence(bufPtr, repair: repair)
196+
return String._fromUTF8(bufPtr, repair: repair)
197197
}
198198
}
199199

@@ -202,7 +202,7 @@ internal func _decodeCString(
202202
) -> String? {
203203
let len = UTF8._nullCodeUnitOffset(in: cString)
204204
let bufPtr = UnsafeBufferPointer(start: cString, count: len)
205-
return String._fromUTF8CodeUnitSequence(bufPtr, repair: repair)
205+
return String._fromUTF8(bufPtr, repair: repair)
206206
}
207207

208208
/// Creates a new string by copying the null-terminated data referenced by

stdlib/public/core/InputStream.swift

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,7 @@ public func readLine(strippingNewline: Bool = true) -> String? {
6464
}
6565
}
6666
}
67-
let result = String._fromUTF8CodeUnitSequence(
67+
let result = String._fromUTF8(
6868
UnsafeBufferPointer(start: linePtr, count: readBytes),
6969
repair: true)!
7070
_stdlib_free(linePtr)

stdlib/public/core/StaticString.swift

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -272,7 +272,7 @@ public struct StaticString
272272
if isASCII {
273273
return String._fromASCII(buffer)
274274
} else {
275-
return String._fromWellFormedUTF8CodeUnitSequence(buffer)
275+
return String._fromWellFormedUTF8(buffer)
276276
}
277277
}
278278
}

stdlib/public/core/String.swift

Lines changed: 93 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -138,8 +138,7 @@ extension String {
138138
>(
139139
_ input: Input,
140140
encoding: Encoding.Type,
141-
repairIllFormedSequences: Bool,
142-
minimumCapacity: Int = 0
141+
repairIllFormedSequences: Bool
143142
) -> String?
144143
where Input.Element == Encoding.CodeUnit {
145144

@@ -154,7 +153,7 @@ extension String {
154153
return nil
155154
}
156155

157-
let capacity = Swift.max(utf16Count, minimumCapacity)
156+
let capacity = utf16Count
158157
if isASCII {
159158
if let small = _SmallUTF8String(
160159
_fromCodeUnits: input,
@@ -201,6 +200,43 @@ extension String {
201200
}
202201
}
203202

203+
/// Builds a string from UTF-8 code units that the caller has already
/// determined are not entirely ASCII.
///
/// The small-string representation is tried first. If the input does not
/// fit, the code units are measured and then transcoded into freshly
/// allocated UTF-16 storage.
///
/// - Parameters:
///   - input: The UTF-8 code units to decode.
///   - repair: Forwarded to the length computation as
///     `repairingIllFormedSequences`, and (negated) to the transcoder as
///     `stoppingOnError`.
/// - Returns: The decoded string, or `nil` when the length computation
///   returns `nil`.
internal static func _fromNonASCIIUTF8(
  _ input: UnsafeBufferPointer<UInt8>, repair: Bool
) -> String? {
  // Prefer the small form whenever the input can be stored that way.
  if let small = _SmallUTF8String(input) {
    return String(_StringGuts(small))
  }

  // First pass: determine how many UTF-16 code units are required.
  // TODO: Replace with much, much faster length check
  guard let (codeUnitCount, sawOnlyASCII) = UTF16.transcodedLength(
    of: input.makeIterator(),
    decodedAs: UTF8.self,
    repairingIllFormedSequences: repair)
  else {
    return nil
  }
  _sanityCheck(!sawOnlyASCII, "was given ASCII UTF-8")

  // Allocate storage sized exactly to the transcoded length.
  let storage = _SwiftStringStorage<UTF16.CodeUnit>.create(
    capacity: codeUnitCount,
    count: codeUnitCount)

  // Second pass: write each transcoded code unit into the storage in order.
  // TODO: Replace with much, much faster transcoding
  var cursor = storage.start
  _ = transcode(
    input.makeIterator(),
    from: UTF8.self, to: UTF16.self,
    stoppingOnError: !repair
  ) { codeUnit in
    cursor.pointee = codeUnit
    cursor += 1
  }
  return String(_largeStorage: storage)
}
239+
204240
/// Creates a string from the given Unicode code units in the specified
205241
/// encoding.
206242
///
@@ -210,9 +246,24 @@ extension String {
210246
/// - sourceEncoding: The encoding in which `codeUnits` should be
211247
/// interpreted.
212248
@inlinable // FIXME(sil-serialize-all)
@inline(__always) // Eliminate dynamic type check when possible
public init<C: Collection, Encoding: Unicode.Encoding>(
  decoding codeUnits: C, as sourceEncoding: Encoding.Type
) where C.Iterator.Element == Encoding.CodeUnit {
  // Fast path: the input exposes contiguous bytes and is being decoded as
  // UTF-8, so it can be handed directly to the UTF-8 specific entry point.
  if let contigBytes = codeUnits as? _HasContiguousBytes,
    sourceEncoding == UTF8.self
  {
    self = contigBytes.withUnsafeBytes { rawBufPtr in
      // An empty raw buffer may carry a nil base address. Unwrapping it
      // unchecked would be undefined behavior, and decoding zero code units
      // yields the empty string anyway.
      guard let base = rawBufPtr.baseAddress else { return String() }
      return String._fromUTF8(
        UnsafeBufferPointer(
          start: base.assumingMemoryBound(to: UInt8.self),
          count: rawBufPtr.count),
        repair: true).unsafelyUnwrapped
    }
    return
  }

  // General path: any collection of code units in any encoding.
  self = String._fromCodeUnits(
    codeUnits, encoding: sourceEncoding, repairIllFormedSequences: true)!
}
@@ -629,20 +680,47 @@ internal func _isAllASCII(_ input: UnsafeBufferPointer<UInt8>) -> Bool {
629680
return true
630681
}
631682

683+
// TODO: re-organize a bit before merging...
684+
685+
@usableFromInline
686+
internal protocol _HasContiguousBytes {
687+
func withUnsafeBytes<R>(
688+
_ body: (UnsafeRawBufferPointer) throws -> R
689+
) rethrows -> R
690+
}
691+
extension Array: _HasContiguousBytes {}
692+
extension UnsafeBufferPointer: _HasContiguousBytes {
693+
@inlinable
694+
@inline(__always)
695+
func withUnsafeBytes<R>(
696+
_ body: (UnsafeRawBufferPointer) throws -> R
697+
) rethrows -> R {
698+
let ptr = UnsafeRawPointer(self.baseAddress._unsafelyUnwrappedUnchecked)
699+
let len = self.count &* MemoryLayout<Element>.stride
700+
return try body(UnsafeRawBufferPointer(start: ptr, count: len))
701+
}
702+
}
703+
extension UnsafeMutableBufferPointer: _HasContiguousBytes {
704+
@inlinable
705+
@inline(__always)
706+
func withUnsafeBytes<R>(
707+
_ body: (UnsafeRawBufferPointer) throws -> R
708+
) rethrows -> R {
709+
let ptr = UnsafeRawPointer(self.baseAddress._unsafelyUnwrappedUnchecked)
710+
let len = self.count &* MemoryLayout<Element>.stride
711+
return try body(UnsafeRawBufferPointer(start: ptr, count: len))
712+
}
713+
}
714+
632715
extension String {
633-
static func _fromUTF8CodeUnitSequence(
716+
@usableFromInline
717+
static func _fromUTF8(
634718
_ input: UnsafeBufferPointer<UInt8>, repair: Bool
635719
) -> String? {
636720
if _isAllASCII(input) {
637721
return _fromASCII(input)
638722
}
639-
640-
if let smol = _SmallUTF8String(input) {
641-
return String(_StringGuts(smol))
642-
}
643-
644-
return String._fromCodeUnits(
645-
input, encoding: UTF8.self, repairIllFormedSequences: repair)
723+
return _fromNonASCIIUTF8(input, repair: repair)
646724
}
647725

648726
@usableFromInline
@@ -659,10 +737,10 @@ extension String {
659737
}
660738

661739
@usableFromInline
662-
static func _fromWellFormedUTF8CodeUnitSequence(
740+
static func _fromWellFormedUTF8(
663741
_ input: UnsafeBufferPointer<UInt8>, repair: Bool = false
664742
) -> String {
665-
return String._fromUTF8CodeUnitSequence(input, repair: repair)!
743+
return String._fromUTF8(input, repair: repair)!
666744
}
667745
}
668746

@@ -759,7 +837,7 @@ extension String : _ExpressibleByBuiltinStringLiteral {
759837
self = String(_StringGuts(_large: _UnmanagedString(bufPtr)))
760838
return
761839
}
762-
self = String._fromWellFormedUTF8CodeUnitSequence(bufPtr)
840+
self = String._fromWellFormedUTF8(bufPtr)
763841
}
764842
}
765843

@@ -951,7 +1029,7 @@ extension String {
9511029
utf8CodeUnitCount: Int
9521030
) {
9531031
resultStorage.initialize(to:
954-
String._fromWellFormedUTF8CodeUnitSequence(
1032+
String._fromWellFormedUTF8(
9551033
UnsafeBufferPointer(start: start, count: utf8CodeUnitCount)))
9561034
}
9571035
}

0 commit comments

Comments
 (0)