Skip to content

String initialization with encoding and C-string interop #9412

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 5 commits into from
May 9, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions stdlib/public/core/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,7 @@ set(SWIFTLIB_ESSENTIAL
Reverse.swift
Runtime.swift.gyb
SipHash.swift.gyb
SentinelCollection.swift
Sequence.swift
SequenceAlgorithms.swift.gyb
SequenceWrapper.swift
Expand Down
1 change: 1 addition & 0 deletions stdlib/public/core/GroupInfo.json
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@
"Repeat.swift",
"Sort.swift",
"Range.swift",
"SentinelCollection.swift",
"ClosedRange.swift",
"CollectionOfOne.swift",
"HeapBuffer.swift",
Expand Down
109 changes: 109 additions & 0 deletions stdlib/public/core/SentinelCollection.swift
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
//===--- SentinelCollection.swift -----------------------------------------===//
//
// This source file is part of the Swift.org open source project
//
// Copyright (c) 2014 - 2017 Apple Inc. and the Swift project authors
// Licensed under Apache License v2.0 with Runtime Library Exception
//
// See https://swift.org/LICENSE.txt for license information
// See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
//
//===----------------------------------------------------------------------===//
/// A type whose values behave like a single-argument function: `apply(_:)`
/// maps an `Input` to an `Output`.
public // @testable
protocol _Function {
/// The type of value accepted by `apply(_:)`.
associatedtype Input
/// The type of value returned by `apply(_:)`.
associatedtype Output
/// Returns the `Output` corresponding to the given `Input`.
func apply(_: Input) -> Output
}

/// A `_Function` whose `Output` is `Bool`.
protocol _Predicate : _Function where Output == Bool { }

/// An iterator over the elements of `Base` that finishes just before the
/// first element for which the `IsSentinel` predicate returns `true`.
///
/// Every element produced by the base iterator is force-unwrapped, so a
/// sentinel element must be reached before `Base` is exhausted.
struct _SentinelIterator<
  Base: IteratorProtocol,
  IsSentinel : _Predicate
> : IteratorProtocol, Sequence
  where IsSentinel.Input == Base.Element {
  var _base: Base
  var _isSentinel: IsSentinel
  /// `true` once the sentinel has been seen; no further elements are
  /// produced after that.
  var _expired: Bool = false

  /// Creates an iterator over `base` that stops at the first element
  /// satisfying `condition`.
  init(_ base: Base, until condition: IsSentinel) {
    self._isSentinel = condition
    self._base = base
  }

  /// Returns the next element of the base iterator, or `nil` if the sentinel
  /// has been reached, either now or on an earlier call.
  mutating func next() -> Base.Element? {
    if _fastPath(!_expired) {
      let candidate = _base.next()
      // There is intentionally no `nil` check on `candidate`: it is a
      // precondition that the sentinel will be found before the base
      // iterator runs out of elements.
      if _fastPath(!_isSentinel.apply(candidate!)) { return candidate }
      _expired = true
    }
    return nil
  }
}

/// A collection presenting the elements of `Base` up to, but not including,
/// the first element for which the `IsSentinel` predicate returns `true`.
///
/// Positions are located by reading `Base` without ever comparing against
/// `_base.endIndex`, so a sentinel element must be present in `Base`.
struct _SentinelCollection<
  Base: Collection,
  IsSentinel : _Predicate
> : Collection
  where IsSentinel.Input == Base.Iterator.Element {
  let _isSentinel: IsSentinel
  var _base : Base

  typealias IndexDistance = Base.IndexDistance

  /// Returns an iterator that stops just before the first sentinel element.
  func makeIterator() -> _SentinelIterator<Base.Iterator, IsSentinel> {
    return _SentinelIterator(_base.makeIterator(), until: _isSentinel)
  }

  /// A position in a `_SentinelCollection`.
  ///
  /// `_impl` is `nil` for the end index; otherwise it stores the position in
  /// the base collection together with the element found at that position.
  struct Index : Comparable {
    var _impl: (position: Base.Index, element: Base.Iterator.Element)?

    /// Two indices are equal when both are the end index or both refer to
    /// the same base position.
    static func == (lhs: Index, rhs: Index) -> Bool {
      if rhs._impl == nil { return lhs._impl == nil }
      return lhs._impl != nil && rhs._impl!.position == lhs._impl!.position
    }

    /// Orders indices by their base position; the end index is greater than
    /// every other index.
    static func < (lhs: Index, rhs: Index) -> Bool {
      // Anything that is not itself the end index precedes the end index.
      if rhs._impl == nil { return lhs._impl != nil }
      // `rhs` is a real position here. The end index is never less than it,
      // and otherwise `lhs` precedes `rhs` exactly when its base position
      // does. (The operands of this comparison were previously swapped,
      // which inverted the ordering.)
      return lhs._impl != nil && lhs._impl!.position < rhs._impl!.position
    }
  }

  /// The position of the first element, or `endIndex` if the first element
  /// of the base collection is already a sentinel.
  var startIndex : Index {
    return _index(at: _base.startIndex)
  }

  /// The position of the sentinel, represented by an index with no payload.
  var endIndex : Index {
    return Index(_impl: nil)
  }

  /// Returns the element cached in `i`. `i` must not be `endIndex`.
  subscript(i: Index) -> Base.Iterator.Element {
    return i._impl!.element
  }

  /// Returns the position following `i`. `i` must not be `endIndex`.
  func index(after i: Index) -> Index {
    return _index(at: _base.index(after: i._impl!.position))
  }

  /// Returns the index for base position `i`, or `endIndex` if the element
  /// at `i` is a sentinel.
  func _index(at i: Base.Index) -> Index {
    // We don't need this check if it's a precondition that the sentinel will
    // be found
    // guard _fastPath(i != _base.endIndex) else { return endIndex }
    let e = _base[i]
    guard _fastPath(!_isSentinel.apply(e)) else { return endIndex }
    return Index(_impl: (position: i, element: e))
  }

  /// Creates a view of `base` that ends at the first element satisfying
  /// `condition`.
  init(_ base: Base, until condition: IsSentinel) {
    _base = base
    _isSentinel = condition
  }
}

/// A predicate that is satisfied by integer values equal to zero.
struct _IsZero<T : BinaryInteger> : _Predicate {
  /// Returns `true` if `value` is zero.
  func apply(_ value: T) -> Bool {
    return value == 0
  }
}
175 changes: 174 additions & 1 deletion stdlib/public/core/String.swift
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,45 @@ public protocol StringProtocol

func lowercased() -> String
func uppercased() -> String

/// Constructs a `String` having the same contents as `codeUnits`.
///
/// - Parameter codeUnits: a collection of code units in
/// the given `encoding`.
/// - Parameter encoding: describes the encoding in which the code units
/// should be interpreted.
init<C: Collection, Encoding: UnicodeEncoding>(
codeUnits: C, encoding: Encoding.Type
)
where C.Iterator.Element == Encoding.CodeUnit

/// Constructs a `String` having the same contents as `nulTerminatedUTF8`.
///
/// - Parameter nulTerminatedUTF8: a sequence of contiguous UTF-8 encoded
/// bytes ending just before the first zero byte (NUL character).
init(cString nulTerminatedUTF8: UnsafePointer<CChar>)

/// Constructs a `String` having the same contents as `nulTerminatedCodeUnits`.
///
/// - Parameter nulTerminatedCodeUnits: a sequence of contiguous code units in
/// the given `encoding`, ending just before the first zero code unit.
/// - Parameter encoding: describes the encoding in which the code units
/// should be interpreted.
init<Encoding: UnicodeEncoding>(
cString nulTerminatedCodeUnits: UnsafePointer<Encoding.CodeUnit>,
encoding: Encoding.Type)

/// Invokes the given closure on the contents of the string, represented as a
/// pointer to a null-terminated sequence of UTF-8 code units.
func withCString<Result>(
_ body: (UnsafePointer<CChar>) throws -> Result) rethrows -> Result

/// Invokes the given closure on the contents of the string, represented as a
/// pointer to a null-terminated sequence of code units in the given encoding.
func withCString<Result, Encoding: UnicodeEncoding>(
encoding: Encoding.Type,
_ body: (UnsafePointer<Encoding.CodeUnit>) throws -> Result
) rethrows -> Result
}

extension StringProtocol {
Expand All @@ -52,7 +91,141 @@ extension StringProtocol {
}
}

// FIXME: complexity documentation for most of methods on String is ought to be
/// Invokes `body` with a pointer to a zero-terminated sequence of
/// `TargetEncoding.CodeUnit`s representing the same string as `source`, where
/// `source` is interpreted as being encoded with `SourceEncoding`.
///
/// - Returns: whatever `body` returns.
internal func _withCString<
  Source : Collection,
  SourceEncoding : UnicodeEncoding,
  TargetEncoding : UnicodeEncoding,
  Result
>(
  encodedAs targetEncoding: TargetEncoding.Type,
  from source: Source,
  encodedAs sourceEncoding: SourceEncoding.Type,
  execute body : (UnsafePointer<TargetEncoding.CodeUnit>) throws -> Result
) rethrows -> Result
  where Source.Iterator.Element == SourceEncoding.CodeUnit {
  // Delegate to the length-reporting variant and discard the length.
  return try _withCStringAndLength(
    encodedAs: targetEncoding,
    from: source,
    encodedAs: sourceEncoding,
    execute: { start, _ in try body(start) })
}

/// Invokes `body` with a pointer to a zero-terminated sequence of
/// `TargetEncoding.CodeUnit`s transcoded from `source`, together with the
/// number of transcoded code units (the terminator is not counted).
///
/// `source` is parsed twice: once to size the buffer and once to fill it.
///
/// - Returns: whatever `body` returns.
internal func _withCStringAndLength<
  Source : Collection,
  SourceEncoding : UnicodeEncoding,
  TargetEncoding : UnicodeEncoding,
  Result
>(
  encodedAs targetEncoding: TargetEncoding.Type,
  from source: Source,
  encodedAs sourceEncoding: SourceEncoding.Type,
  execute body : (UnsafePointer<TargetEncoding.CodeUnit>, Int) throws -> Result
) rethrows -> Result
  where Source.Iterator.Element == SourceEncoding.CodeUnit {
  // First pass: count the code units the transcoded string occupies. The
  // zero terminator is deliberately excluded from this count.
  var codeUnitCount = 0
  var countingPass = source.makeIterator()
  SourceEncoding.ForwardParser.parse(&countingPass) {
    codeUnitCount += numericCast(
      targetEncoding.transcode($0, from: SourceEncoding.self).count)
  }

  // Second pass: transcode into storage reserved for the content plus one
  // extra slot for the terminator.
  var storage: [TargetEncoding.CodeUnit] = []
  storage.reserveCapacity(codeUnitCount + 1)
  var fillingPass = source.makeIterator()
  SourceEncoding.ForwardParser.parse(&fillingPass) {
    storage.append(
      contentsOf: targetEncoding.transcode($0, from: SourceEncoding.self))
  }
  storage.append(0)

  return try body(storage, codeUnitCount)
}

extension _StringCore {
  /// Invokes `body` on a zero-terminated sequence of code units in the given
  /// encoding corresponding to the substring in `bounds`.
  ///
  /// - Returns: whatever `body` returns.
  internal func _withCSubstring<Result, TargetEncoding: UnicodeEncoding>(
    in bounds: Range<Index>,
    encoding targetEncoding: TargetEncoding.Type,
    _ body: (UnsafePointer<TargetEncoding.CodeUnit>) throws -> Result
  ) rethrows -> Result {
    // Delegate to the length-reporting variant and discard the length.
    return try _withCSubstringAndLength(
      in: bounds,
      encoding: targetEncoding,
      { start, _ in try body(start) })
  }

  /// Invokes `body` on a zero-terminated sequence of code units in the given
  /// encoding corresponding to the substring in `bounds`, passing along the
  /// length reported by `_withCStringAndLength`.
  ///
  /// - Returns: whatever `body` returns.
  internal func _withCSubstringAndLength<
    Result, TargetEncoding: UnicodeEncoding
  >(
    in bounds: Range<Index>,
    encoding targetEncoding: TargetEncoding.Type,
    _ body: (UnsafePointer<TargetEncoding.CodeUnit>, Int) throws -> Result
  ) rethrows -> Result {
    // Without contiguous storage, transcode from the core's own slice, whose
    // elements are treated as UTF-16 code units.
    guard _fastPath(hasContiguousStorage) else {
      return try Swift._withCStringAndLength(
        encodedAs: targetEncoding,
        from: self[bounds],
        encodedAs: _Unicode.UTF16.self,
        execute: body
      )
    }

    // NOTE(review): presumably this keeps the storage alive while the raw
    // buffer pointers below are in use; confirm against `_fixLifetime`.
    defer { _fixLifetime(self) }

    if isASCII {
      let asciiUnits = UnsafeBufferPointer(start: startASCII, count: count)
      return try Swift._withCStringAndLength(
        encodedAs: targetEncoding,
        from: asciiUnits[bounds],
        encodedAs: _Unicode.ASCII.self,
        execute: body
      )
    }

    let utf16Units = UnsafeBufferPointer(start: startUTF16, count: count)
    return try Swift._withCStringAndLength(
      encodedAs: targetEncoding,
      from: utf16Units[bounds],
      encodedAs: _Unicode.UTF16.self,
      execute: body
    )
  }
}

extension String {
  /// Constructs a `String` having the same contents as `codeUnits`.
  ///
  /// - Parameter codeUnits: a collection of code units in
  ///   the given `encoding`.
  /// - Parameter encoding: describes the encoding in which the code units
  ///   should be interpreted.
  public init<C: Collection, Encoding: UnicodeEncoding>(
    codeUnits: C, encoding: Encoding.Type
  ) where C.Iterator.Element == Encoding.CodeUnit {
    let transcoded = _StringBuffer.fromCodeUnits(
      codeUnits, encoding: encoding, repairIllFormedSequences: true)
    // NOTE(review): the buffer is force-unwrapped; presumably requesting
    // repair of ill-formed sequences guarantees a non-nil buffer. Confirm
    // against `fromCodeUnits`.
    self = String(_StringCore(transcoded.0!))
  }

  /// Constructs a `String` having the same contents as
  /// `nulTerminatedCodeUnits`.
  ///
  /// - Parameter nulTerminatedCodeUnits: a sequence of contiguous code units
  ///   in the given `encoding`, ending just before the first zero code unit.
  /// - Parameter encoding: describes the encoding in which the code units
  ///   should be interpreted.
  public init<Encoding: UnicodeEncoding>(
    cString nulTerminatedCodeUnits: UnsafePointer<Encoding.CodeUnit>,
    encoding: Encoding.Type) {

    // View the memory as an unbounded buffer, then cut it off at the first
    // zero code unit.
    let unbounded = UnsafeBufferPointer(
      _unboundedStartingAt: nulTerminatedCodeUnits)
    let codeUnits = _SentinelCollection(unbounded, until: _IsZero())
    self.init(codeUnits: codeUnits, encoding: encoding)
  }

  /// Invokes the given closure on the contents of the string, represented as
  /// a pointer to a null-terminated sequence of code units in the given
  /// encoding.
  ///
  /// - Returns: whatever `body` returns.
  public func withCString<Result, TargetEncoding: UnicodeEncoding>(
    encoding targetEncoding: TargetEncoding.Type,
    _ body: (UnsafePointer<TargetEncoding.CodeUnit>) throws -> Result
  ) rethrows -> Result {
    let wholeString = _core.startIndex..<_core.endIndex
    return try _core._withCSubstring(
      in: wholeString, encoding: targetEncoding, body)
  }
}
// FIXME: complexity documentation for most of methods on String ought to be
// qualified with "amortized" at least, as Characters are variable-length.

/// A Unicode string value.
Expand Down
7 changes: 2 additions & 5 deletions stdlib/public/core/StringBuffer.swift
Original file line number Diff line number Diff line change
Expand Up @@ -91,14 +91,11 @@ public struct _StringBuffer {
= ((_storage._capacity() - capacityBump) &<< 1) + elementShift
}

static func fromCodeUnits<Input, Encoding>(
static func fromCodeUnits<Input : Sequence, Encoding : UnicodeEncoding>(
_ input: Input, encoding: Encoding.Type, repairIllFormedSequences: Bool,
minimumCapacity: Int = 0
) -> (_StringBuffer?, hadError: Bool)
where
Input : Collection, // Sequence?
Encoding : UnicodeEncoding,
Input.Iterator.Element == Encoding.CodeUnit {
where Input.Iterator.Element == Encoding.CodeUnit {
// Determine how many UTF-16 code units we'll need
let inputStream = input.makeIterator()
guard let (utf16Count, isAscii) = UTF16.transcodedLength(
Expand Down
2 changes: 1 addition & 1 deletion stdlib/public/core/StringCore.swift
Original file line number Diff line number Diff line change
Expand Up @@ -387,7 +387,7 @@ public struct _StringCore {
}
}
else if let content = _unmanagedUTF16 {
var i = content.makeIterator()
var i = content.makeIterator()
_Unicode.UTF16.ForwardParser.parse(&i) {
Encoding.transcode($0, from: UTF16.self).forEach(processCodeUnit)
}
Expand Down
Loading