Skip to content

[string] ASCII/UTF-8 fast paths for String.init(decoding:as:) #17244

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Jun 18, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions stdlib/public/core/CString.swift
Original file line number Diff line number Diff line change
Expand Up @@ -174,7 +174,7 @@ internal func _decodeValidCString(
return cString.withMemoryRebound(to: UInt8.self, capacity: len) {
(ptr: UnsafePointer<UInt8>) -> String in
let bufPtr = UnsafeBufferPointer(start: ptr, count: len)
return String._fromWellFormedUTF8CodeUnitSequence(bufPtr, repair: repair)
return String._fromWellFormedUTF8(bufPtr, repair: repair)
}
}

Expand All @@ -183,7 +183,7 @@ internal func _decodeValidCString(
) -> String {
let len = UTF8._nullCodeUnitOffset(in: cString)
let bufPtr = UnsafeBufferPointer(start: cString, count: len)
return String._fromWellFormedUTF8CodeUnitSequence(bufPtr, repair: repair)
return String._fromWellFormedUTF8(bufPtr, repair: repair)
}

internal func _decodeCString(
Expand All @@ -193,7 +193,7 @@ internal func _decodeCString(
return cString.withMemoryRebound(to: UInt8.self, capacity: len) {
(ptr: UnsafePointer<UInt8>) -> String? in
let bufPtr = UnsafeBufferPointer(start: ptr, count: len)
return String._fromUTF8CodeUnitSequence(bufPtr, repair: repair)
return String._fromUTF8(bufPtr, repair: repair)
}
}

Expand All @@ -202,7 +202,7 @@ internal func _decodeCString(
) -> String? {
let len = UTF8._nullCodeUnitOffset(in: cString)
let bufPtr = UnsafeBufferPointer(start: cString, count: len)
return String._fromUTF8CodeUnitSequence(bufPtr, repair: repair)
return String._fromUTF8(bufPtr, repair: repair)
}

/// Creates a new string by copying the null-terminated data referenced by
Expand Down
2 changes: 1 addition & 1 deletion stdlib/public/core/InputStream.swift
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ public func readLine(strippingNewline: Bool = true) -> String? {
}
}
}
let result = String._fromUTF8CodeUnitSequence(
let result = String._fromUTF8(
UnsafeBufferPointer(start: linePtr, count: readBytes),
repair: true)!
_stdlib_free(linePtr)
Expand Down
2 changes: 1 addition & 1 deletion stdlib/public/core/StaticString.swift
Original file line number Diff line number Diff line change
Expand Up @@ -272,7 +272,7 @@ public struct StaticString
if isASCII {
return String._fromASCII(buffer)
} else {
return String._fromWellFormedUTF8CodeUnitSequence(buffer)
return String._fromWellFormedUTF8(buffer)
}
}
}
Expand Down
108 changes: 93 additions & 15 deletions stdlib/public/core/String.swift
Original file line number Diff line number Diff line change
Expand Up @@ -138,8 +138,7 @@ extension String {
>(
_ input: Input,
encoding: Encoding.Type,
repairIllFormedSequences: Bool,
minimumCapacity: Int = 0
repairIllFormedSequences: Bool
) -> String?
where Input.Element == Encoding.CodeUnit {

Expand All @@ -154,7 +153,7 @@ extension String {
return nil
}

let capacity = Swift.max(utf16Count, minimumCapacity)
let capacity = utf16Count
if isASCII {
if let small = _SmallUTF8String(
_fromCodeUnits: input,
Expand Down Expand Up @@ -201,6 +200,43 @@ extension String {
}
}

/// Creates a string from UTF-8 code units that are expected to contain at
/// least one non-ASCII code unit.
///
/// - Parameters:
///   - input: The UTF-8 code units to decode.
///   - repair: Passed to the length computation as
///     `repairingIllFormedSequences`, and negated to form the transcoder's
///     `stoppingOnError` argument.
/// - Returns: The decoded string, or `nil` when `UTF16.transcodedLength`
///   produces no result for `input`.
internal static func _fromNonASCIIUTF8(
  _ input: UnsafeBufferPointer<UInt8>, repair: Bool
) -> String? {
  // Use the small-string representation whenever one can be formed.
  if let small = _SmallUTF8String(input) {
    return String(_StringGuts(small))
  }

  // First pass: determine how many UTF-16 code units the result occupies.
  // TODO: Replace with much, much faster length check
  guard let (utf16Length, sawOnlyASCII) = UTF16.transcodedLength(
    of: input.makeIterator(),
    decodedAs: UTF8.self,
    repairingIllFormedSequences: repair)
  else {
    return nil
  }
  _sanityCheck(!sawOnlyASCII, "was given ASCII UTF-8")

  // Allocate storage whose capacity and count both equal the measured length.
  let utf16Storage = _SwiftStringStorage<UTF16.CodeUnit>.create(
    capacity: utf16Length,
    count: utf16Length)

  // Second pass: transcode directly into the storage, advancing a write
  // cursor by one code unit per emitted value.
  var cursor = utf16Storage.start
  // TODO: Replace with much, much faster transcoding
  _ = transcode(
    input.makeIterator(),
    from: UTF8.self, to: UTF16.self,
    stoppingOnError: !repair,
    into: { (unit: UTF16.CodeUnit) -> Void in
      cursor.pointee = unit
      cursor += 1
    })
  return String(_largeStorage: utf16Storage)
}

/// Creates a string from the given Unicode code units in the specified
/// encoding.
///
Expand All @@ -210,9 +246,24 @@ extension String {
/// - sourceEncoding: The encoding in which `codeUnits` should be
/// interpreted.
@inlinable // FIXME(sil-serialize-all)
@inline(__always) // Eliminate dynamic type check when possible
public init<C: Collection, Encoding: Unicode.Encoding>(
  decoding codeUnits: C, as sourceEncoding: Encoding.Type
) where C.Iterator.Element == Encoding.CodeUnit {
  // Fast path: UTF-8 input that exposes contiguous bytes is decoded straight
  // from memory. The encoding comparison is tested first so that callers
  // using any other encoding never reach the dynamic cast.
  if sourceEncoding == UTF8.self,
    let contigBytes = codeUnits as? _HasContiguousBytes
  {
    self = contigBytes.withUnsafeBytes { rawBufPtr in
      // A raw buffer may have a nil base address only when it is empty, so
      // the decoded result is the empty string. Unwrapping the pointer
      // unchecked here would be undefined behavior for such input.
      guard let base = rawBufPtr.baseAddress else {
        return String()
      }
      // `repair: true` is requested, so the optional result is unwrapped
      // without a check, exactly as the general path below force-unwraps.
      return String._fromUTF8(
        UnsafeBufferPointer(
          start: base.assumingMemoryBound(to: UInt8.self),
          count: rawBufPtr.count),
        repair: true).unsafelyUnwrapped
    }
    return
  }

  // General path: decode element by element in the requested encoding,
  // repairing ill-formed sequences.
  self = String._fromCodeUnits(
    codeUnits, encoding: sourceEncoding, repairIllFormedSequences: true)!
}
Expand Down Expand Up @@ -629,20 +680,47 @@ internal func _isAllASCII(_ input: UnsafeBufferPointer<UInt8>) -> Bool {
return true
}

// TODO: re-organize a bit before merging...

/// An internal abstraction over values that can expose their contents as a
/// single raw byte buffer. `String.init(decoding:as:)` attempts a cast to
/// this protocol to reach its contiguous UTF-8 decoding path.
@usableFromInline
internal protocol _HasContiguousBytes {
/// Calls `body` with a raw-byte view of the receiver and returns the value
/// `body` produces; an error thrown by `body` is rethrown.
func withUnsafeBytes<R>(
_ body: (UnsafeRawBufferPointer) throws -> R
) rethrows -> R
}
// `Array` adopts `_HasContiguousBytes` through an empty extension, so the
// `withUnsafeBytes` requirement has to be satisfied by a declaration outside
// this extension (presumably Array's existing `withUnsafeBytes` — confirm).
extension Array: _HasContiguousBytes {}
extension UnsafeBufferPointer: _HasContiguousBytes {
  /// Calls `body` with a raw-byte view of the memory this buffer references.
  ///
  /// The view spans `count &* MemoryLayout<Element>.stride` bytes. A buffer
  /// whose `baseAddress` is `nil` (permitted only for empty buffers) yields a
  /// raw buffer with a `nil` base address instead of unwrapping the missing
  /// pointer.
  ///
  /// - Parameter body: A closure that receives the raw bytes.
  /// - Returns: The value returned by `body`; errors from `body` are rethrown.
  @inlinable
  @inline(__always)
  func withUnsafeBytes<R>(
    _ body: (UnsafeRawBufferPointer) throws -> R
  ) rethrows -> R {
    // The optional-to-optional pointer conversion keeps a nil base address
    // nil, avoiding an unchecked unwrap on empty buffers.
    let ptr = UnsafeRawPointer(self.baseAddress)
    let len = self.count &* MemoryLayout<Element>.stride
    return try body(UnsafeRawBufferPointer(start: ptr, count: len))
  }
}
extension UnsafeMutableBufferPointer: _HasContiguousBytes {
  /// Calls `body` with a read-only raw-byte view of the memory this buffer
  /// references.
  ///
  /// The view spans `count &* MemoryLayout<Element>.stride` bytes. A buffer
  /// whose `baseAddress` is `nil` (permitted only for empty buffers) yields a
  /// raw buffer with a `nil` base address instead of unwrapping the missing
  /// pointer.
  ///
  /// - Parameter body: A closure that receives the raw bytes.
  /// - Returns: The value returned by `body`; errors from `body` are rethrown.
  @inlinable
  @inline(__always)
  func withUnsafeBytes<R>(
    _ body: (UnsafeRawBufferPointer) throws -> R
  ) rethrows -> R {
    // The optional-to-optional pointer conversion keeps a nil base address
    // nil, avoiding an unchecked unwrap on empty buffers.
    let ptr = UnsafeRawPointer(self.baseAddress)
    let len = self.count &* MemoryLayout<Element>.stride
    return try body(UnsafeRawBufferPointer(start: ptr, count: len))
  }
}

extension String {
static func _fromUTF8CodeUnitSequence(
@usableFromInline
static func _fromUTF8(
_ input: UnsafeBufferPointer<UInt8>, repair: Bool
) -> String? {
if _isAllASCII(input) {
return _fromASCII(input)
}

if let smol = _SmallUTF8String(input) {
return String(_StringGuts(smol))
}

return String._fromCodeUnits(
input, encoding: UTF8.self, repairIllFormedSequences: repair)
return _fromNonASCIIUTF8(input, repair: repair)
}

@usableFromInline
Expand All @@ -659,10 +737,10 @@ extension String {
}

@usableFromInline
static func _fromWellFormedUTF8CodeUnitSequence(
static func _fromWellFormedUTF8(
_ input: UnsafeBufferPointer<UInt8>, repair: Bool = false
) -> String {
return String._fromUTF8CodeUnitSequence(input, repair: repair)!
return String._fromUTF8(input, repair: repair)!
}
}

Expand Down Expand Up @@ -759,7 +837,7 @@ extension String : _ExpressibleByBuiltinStringLiteral {
self = String(_StringGuts(_large: _UnmanagedString(bufPtr)))
return
}
self = String._fromWellFormedUTF8CodeUnitSequence(bufPtr)
self = String._fromWellFormedUTF8(bufPtr)
}
}

Expand Down Expand Up @@ -951,7 +1029,7 @@ extension String {
utf8CodeUnitCount: Int
) {
resultStorage.initialize(to:
String._fromWellFormedUTF8CodeUnitSequence(
String._fromWellFormedUTF8(
UnsafeBufferPointer(start: start, count: utf8CodeUnitCount)))
}
}
Expand Down