Skip to content

Commit fd8cfea

Browse files
author
Dave Abrahams
committed
[stdlib] String initialization with encoding and CString interop
1 parent cb5b5ba commit fd8cfea

File tree

5 files changed

+284
-40
lines changed

5 files changed

+284
-40
lines changed

stdlib/public/core/String.swift

Lines changed: 174 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,45 @@ public protocol StringProtocol
4242

4343
func lowercased() -> String
4444
func uppercased() -> String
45+
46+
/// Constructs a `String` having the same contents as `codeUnits`.
47+
///
48+
/// - Parameter codeUnits: a collection of code units in
49+
/// the given `encoding`.
50+
/// - Parameter encoding: describes the encoding in which the code units
51+
/// should be interpreted.
52+
init<C: Collection, Encoding: UnicodeEncoding>(
53+
codeUnits: C, encoding: Encoding.Type
54+
)
55+
where C.Iterator.Element == Encoding.CodeUnit
56+
57+
/// Constructs a `String` having the same contents as `nulTerminatedUTF8`.
58+
///
59+
/// - Parameter nulTerminatedUTF8: a sequence of contiguous UTF-8 encoded
60+
/// bytes ending just before the first zero byte (NUL character).
61+
init(cString nulTerminatedUTF8: UnsafePointer<CChar>)
62+
63+
/// Constructs a `String` having the same contents as `nulTerminatedCodeUnits`.
64+
///
65+
/// - Parameter nulTerminatedCodeUnits: a sequence of contiguous code units in
66+
/// the given `encoding`, ending just before the first zero code unit.
67+
/// - Parameter encoding: describes the encoding in which the code units
68+
/// should be interpreted.
69+
init<Encoding: UnicodeEncoding>(
70+
cString nulTerminatedCodeUnits: UnsafePointer<Encoding.CodeUnit>,
71+
encoding: Encoding.Type)
72+
73+
/// Invokes the given closure on the contents of the string, represented as a
74+
/// pointer to a null-terminated sequence of UTF-8 code units.
75+
func withCString<Result>(
76+
_ body: (UnsafePointer<CChar>) throws -> Result) rethrows -> Result
77+
78+
/// Invokes the given closure on the contents of the string, represented as a
79+
/// pointer to a null-terminated sequence of code units in the given encoding.
80+
func withCString<Result, Encoding: UnicodeEncoding>(
81+
encoding: Encoding.Type,
82+
_ body: (UnsafePointer<Encoding.CodeUnit>) throws -> Result
83+
) rethrows -> Result
4584
}
4685

4786
extension StringProtocol {
@@ -52,7 +91,141 @@ extension StringProtocol {
5291
}
5392
}
5493

55-
// FIXME: complexity documentation for most of methods on String is ought to be
94+
/// Calls `body` with a pointer to a zero-terminated sequence of
95+
/// `TargetEncoding.CodeUnit` representing the same string as `source`, when
96+
/// `source` is interpreted as being encoded with `SourceEncoding`.
97+
internal func _withCString<
98+
Source : Collection,
99+
SourceEncoding : UnicodeEncoding,
100+
TargetEncoding : UnicodeEncoding,
101+
Result
102+
>(
103+
encodedAs targetEncoding: TargetEncoding.Type,
104+
from source: Source,
105+
encodedAs sourceEncoding: SourceEncoding.Type,
106+
execute body : (UnsafePointer<TargetEncoding.CodeUnit>) throws -> Result
107+
) rethrows -> Result
108+
where Source.Iterator.Element == SourceEncoding.CodeUnit {
109+
return try _withCStringAndLength(
110+
encodedAs: targetEncoding,
111+
from: source,
112+
encodedAs: sourceEncoding) { p, _ in try body(p) }
113+
}
114+
115+
internal func _withCStringAndLength<
116+
Source : Collection,
117+
SourceEncoding : UnicodeEncoding,
118+
TargetEncoding : UnicodeEncoding,
119+
Result
120+
>(
121+
encodedAs targetEncoding: TargetEncoding.Type,
122+
from source: Source,
123+
encodedAs sourceEncoding: SourceEncoding.Type,
124+
execute body : (UnsafePointer<TargetEncoding.CodeUnit>, Int) throws -> Result
125+
) rethrows -> Result
126+
where Source.Iterator.Element == SourceEncoding.CodeUnit {
127+
var targetLength = 0 // counts code units only; excludes the nul terminator
128+
var i = source.makeIterator()
129+
SourceEncoding.ForwardParser.parse(&i) {
130+
targetLength += numericCast(
131+
targetEncoding.transcode($0, from: SourceEncoding.self).count)
132+
}
133+
var a: [TargetEncoding.CodeUnit] = []
134+
a.reserveCapacity(targetLength + 1)
135+
i = source.makeIterator()
136+
SourceEncoding.ForwardParser.parse(&i) {
137+
a.append(
138+
contentsOf: targetEncoding.transcode($0, from: SourceEncoding.self))
139+
}
140+
a.append(0)
141+
return try body(a, targetLength)
142+
}
143+
144+
extension _StringCore {
145+
/// Invokes `body` on a null-terminated sequence of code units in the given
146+
/// encoding corresponding to the substring in `bounds`.
147+
internal func _withCSubstring<Result, TargetEncoding: UnicodeEncoding>(
148+
in bounds: Range<Index>,
149+
encoding targetEncoding: TargetEncoding.Type,
150+
_ body: (UnsafePointer<TargetEncoding.CodeUnit>) throws -> Result
151+
) rethrows -> Result {
152+
return try _withCSubstringAndLength(in: bounds, encoding: targetEncoding) {
153+
p,_ in try body(p)
154+
}
155+
}
156+
157+
internal func _withCSubstringAndLength<
158+
Result, TargetEncoding: UnicodeEncoding
159+
>(
160+
in bounds: Range<Index>,
161+
encoding targetEncoding: TargetEncoding.Type,
162+
_ body: (UnsafePointer<TargetEncoding.CodeUnit>, Int) throws -> Result
163+
) rethrows -> Result {
164+
if _fastPath(hasContiguousStorage) {
165+
defer { _fixLifetime(self) }
166+
if isASCII {
167+
return try Swift._withCStringAndLength(
168+
encodedAs: targetEncoding,
169+
from: UnsafeBufferPointer(start: startASCII, count: count)[bounds],
170+
encodedAs: _Unicode.ASCII.self,
171+
execute: body
172+
)
173+
}
174+
else {
175+
return try Swift._withCStringAndLength(
176+
encodedAs: targetEncoding,
177+
from: UnsafeBufferPointer(start: startUTF16, count: count)[bounds],
178+
encodedAs: _Unicode.UTF16.self,
179+
execute: body
180+
)
181+
}
182+
}
183+
return try Swift._withCStringAndLength(
184+
encodedAs: targetEncoding,
185+
from: self[bounds],
186+
encodedAs: _Unicode.UTF16.self,
187+
execute: body
188+
)
189+
}
190+
}
191+
192+
extension String {
193+
public init<C: Collection, Encoding: UnicodeEncoding>(
194+
codeUnits: C, encoding: Encoding.Type
195+
) where C.Iterator.Element == Encoding.CodeUnit {
196+
let (b,_) = _StringBuffer.fromCodeUnits(
197+
codeUnits, encoding: encoding, repairIllFormedSequences: true)
198+
self = String(_StringCore(b!))
199+
}
200+
201+
/// Constructs a `String` having the same contents as `nulTerminatedCodeUnits`.
202+
///
203+
/// - Parameter nulTerminatedCodeUnits: a sequence of contiguous code units in
204+
/// the given `encoding`, ending just before the first zero code unit.
205+
/// - Parameter encoding: describes the encoding in which the code units
206+
/// should be interpreted.
207+
public init<Encoding: UnicodeEncoding>(
208+
cString nulTerminatedCodeUnits: UnsafePointer<Encoding.CodeUnit>,
209+
encoding: Encoding.Type) {
210+
211+
let codeUnits = _SentinelCollection(
212+
UnsafeBufferPointer(_unboundedStartingAt: nulTerminatedCodeUnits),
213+
until: _IsZero()
214+
)
215+
self.init(codeUnits: codeUnits, encoding: encoding)
216+
}
217+
218+
/// Invokes the given closure on the contents of the string, represented as a
219+
/// pointer to a null-terminated sequence of code units in the given encoding.
220+
public func withCString<Result, TargetEncoding: UnicodeEncoding>(
221+
encoding targetEncoding: TargetEncoding.Type,
222+
_ body: (UnsafePointer<TargetEncoding.CodeUnit>) throws -> Result
223+
) rethrows -> Result {
224+
return try _core._withCSubstring(
225+
in: _core.startIndex..<_core.endIndex, encoding: targetEncoding, body)
226+
}
227+
}
228+
// FIXME: complexity documentation for most of methods on String ought to be
56229
// qualified with "amortized" at least, as Characters are variable-length.
57230

58231
/// A Unicode string value.

stdlib/public/core/StringCore.swift

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -387,7 +387,7 @@ public struct _StringCore {
387387
}
388388
}
389389
else if let content = _unmanagedUTF16 {
390-
var i = content.makeIterator()
390+
var i = content.makeIterator()
391391
_Unicode.UTF16.ForwardParser.parse(&i) {
392392
Encoding.transcode($0, from: UTF16.self).forEach(processCodeUnit)
393393
}

stdlib/public/core/Substring.swift.gyb

Lines changed: 51 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,9 @@ public struct Substring : StringProtocol {
3232
_slice = RangeReplaceableBidirectionalSlice(base: base, bounds: bounds)
3333
}
3434

35-
internal init<R: RangeExpression>(_base base: String, _ bounds: R) where R.Bound == Index {
35+
internal init<R: RangeExpression>(
36+
_base base: String, _ bounds: R
37+
) where R.Bound == Index {
3638
self.init(_base: base, bounds.relative(to: base))
3739
}
3840

@@ -99,6 +101,52 @@ public struct Substring : StringProtocol {
99101
}
100102

101103
% end
104+
105+
public init<C: Collection, Encoding: UnicodeEncoding>(
106+
codeUnits: C, encoding: Encoding.Type
107+
) where C.Iterator.Element == Encoding.CodeUnit {
108+
self.init(String(codeUnits: codeUnits, encoding: encoding))
109+
}
110+
111+
public init(cString nulTerminatedUTF8: UnsafePointer<CChar>) {
112+
self.init(String(cString: nulTerminatedUTF8))
113+
}
114+
115+
/// Constructs a `Substring` having the same contents as `nulTerminatedCodeUnits`.
116+
///
117+
/// - Parameter nulTerminatedCodeUnits: a sequence of contiguous code units in
118+
/// the given `encoding`, ending just before the first zero code unit.
119+
/// - Parameter encoding: describes the encoding in which the code units
120+
/// should be interpreted.
121+
public init<Encoding: UnicodeEncoding>(
122+
cString nulTerminatedCodeUnits: UnsafePointer<Encoding.CodeUnit>,
123+
encoding: Encoding.Type) {
124+
self.init(String(cString: nulTerminatedCodeUnits, encoding: encoding))
125+
}
126+
127+
/// Invokes the given closure on the contents of the string, represented as a
128+
/// pointer to a null-terminated sequence of UTF-8 code units.
129+
public func withCString<Result>(
130+
_ body: (UnsafePointer<CChar>) throws -> Result) rethrows -> Result {
131+
return try _slice._base._core._withCSubstringAndLength(
132+
in: startIndex._base._position..<endIndex._base._position,
133+
encoding: UTF8.self) {
134+
p, length in try p.withMemoryRebound(to: CChar.self, capacity: length) {
135+
try body($0)
136+
}
137+
}
138+
}
139+
140+
/// Invokes the given closure on the contents of the string, represented as a
141+
/// pointer to a null-terminated sequence of code units in the given encoding.
142+
public func withCString<Result, TargetEncoding: UnicodeEncoding>(
143+
encoding targetEncoding: TargetEncoding.Type,
144+
_ body: (UnsafePointer<TargetEncoding.CodeUnit>) throws -> Result
145+
) rethrows -> Result {
146+
return try _slice._base._core._withCSubstring(
147+
in: startIndex._base._position..<endIndex._base._position,
148+
encoding: targetEncoding, body)
149+
}
102150
}
103151

104152

@@ -127,8 +175,8 @@ extension Substring : CustomDebugStringConvertible {
127175
}
128176

129177
extension Substring : LosslessStringConvertible {
130-
public init?(_ description: String) {
131-
self.init(_base: description, description.startIndex ..< description.endIndex)
178+
public init(_ content: String) {
179+
self.init(_base: content, content.startIndex ..< content.endIndex)
132180
}
133181
}
134182

stdlib/public/core/UnicodeEncoding.swift

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -62,13 +62,15 @@ extension _UnicodeEncoding {
6262
}
6363

6464
/// Converts from encoding-independent to encoded representation, returning
65-
/// `nil` if the scalar can't be represented in this encoding.
65+
/// `encodedReplacementCharacter` if the scalar can't be represented in this
66+
/// encoding.
6667
public static func encode(_ content: UnicodeScalar) -> EncodedScalar {
6768
return encodeIfRepresentable(content) ?? encodedReplacementCharacter
6869
}
6970

7071
/// Converts a scalar from another encoding's representation, returning
71-
/// `nil` if the scalar can't be represented in this encoding.
72+
/// `encodedReplacementCharacter` if the scalar can't be represented in this
73+
/// encoding.
7274
public static func transcode<FromEncoding : UnicodeEncoding>(
7375
_ content: FromEncoding.EncodedScalar, from _: FromEncoding.Type
7476
) -> EncodedScalar {

test/Prototypes/UnicodeDecoders.swift

Lines changed: 54 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -148,37 +148,6 @@ extension _Unicode.DefaultScalarView : Collection {
148148
}
149149
}
150150

151-
/// An iterator that can be much faster than the iterator of a reversed slice.
152-
// TODO: See about using this in more places
153-
@_fixed_layout
154-
public struct _ReverseIndexingIterator<
155-
Elements : BidirectionalCollection
156-
> : IteratorProtocol, Sequence {
157-
158-
@_inlineable
159-
@inline(__always)
160-
/// Creates an iterator over the given collection.
161-
public /// @testable
162-
init(_elements: Elements, _position: Elements.Index) {
163-
self._elements = _elements
164-
self._position = _position
165-
}
166-
167-
@_inlineable
168-
@inline(__always)
169-
public mutating func next() -> Elements._Element? {
170-
guard _fastPath(_position != _elements.startIndex) else { return nil }
171-
_position = _elements.index(before: _position)
172-
return _elements[_position]
173-
}
174-
175-
@_versioned
176-
internal let _elements: Elements
177-
@_versioned
178-
internal var _position: Elements.Index
179-
}
180-
181-
182151
extension _Unicode.DefaultScalarView : BidirectionalCollection {
183152
@inline(__always)
184153
public func index(before i: Index) -> Index {
@@ -210,9 +179,47 @@ extension _Unicode.DefaultScalarView : BidirectionalCollection {
210179
import StdlibUnittest
211180
import SwiftPrivate
212181

182+
func utf32<S : StringProtocol>(_ s: S) -> [UInt32] {
183+
return s.unicodeScalars.map { $0.value }
184+
}
185+
186+
func checkStringProtocol<S : StringProtocol, Encoding: UnicodeEncoding>(
187+
_ s: S,
188+
_ utfStr: [Encoding.CodeUnit],
189+
encodedAs: Encoding.Type,
190+
expectingUTF32 expected: [UInt32]
191+
) {
192+
expectEqualSequence(
193+
expected, utf32(s), "\(S.self) init(codeUnits:encoding:)")
194+
195+
if !utfStr.contains(0) {
196+
if Encoding.self == UTF8.self {
197+
var ntbs = utfStr.map { CChar(extendingOrTruncating: $0) }
198+
ntbs.append(0)
199+
expectEqualSequence(
200+
expected, utf32(S(cString: ntbs)), "\(S.self) init(cString:)")
201+
}
202+
203+
var ntbs = Array(utfStr); ntbs.append(0)
204+
expectEqualSequence(
205+
expected, utf32(S(cString: ntbs, encoding: Encoding.self)),
206+
"\(S.self) init(cString:encoding:)"
207+
)
208+
209+
s.withCString {
210+
expectEqual(s, S(cString: $0), "\(S.self) withCString(_:)")
211+
}
212+
213+
s.withCString(encoding: Encoding.self) {
214+
expectEqual(s, S(cString: $0, encoding: Encoding.self),
215+
"\(S.self) withCString(encoding:_:)")
216+
}
217+
}
218+
}
219+
213220
func checkDecodeUTF<Codec : UnicodeCodec & UnicodeEncoding>(
214-
_ codec: Codec.Type, _ expectedHead: [UInt32],
215-
_ expectedRepairedTail: [UInt32], _ utfStr: [Codec.CodeUnit]
221+
_ codec: Codec.Type, _ expectedHead: [UInt32],
222+
_ expectedRepairedTail: [UInt32], _ utfStr: [Codec.CodeUnit]
216223
) -> AssertionResult {
217224
var decoded = [UInt32]()
218225
var expected = expectedHead
@@ -303,6 +310,20 @@ func checkDecodeUTF<Codec : UnicodeCodec & UnicodeEncoding>(
303310
else { expectNotEqual(0, errorCount) }
304311
}
305312
check(expected.reversed(), "reverse, repairing: true")
313+
314+
//===--- String/Substring Construction and C-String interop -------------===//
315+
do {
316+
let s = String(codeUnits: utfStr, encoding: Codec.self)
317+
checkStringProtocol(
318+
s, utfStr, encodedAs: Codec.self, expectingUTF32: expected)
319+
}
320+
321+
do {
322+
let s0 = "\n" + String(codeUnits: utfStr, encoding: Codec.self) + "\n"
323+
checkStringProtocol(
324+
s0.dropFirst().dropLast(),
325+
utfStr, encodedAs: Codec.self, expectingUTF32: expected)
326+
}
306327

307328
//===--- Transcoded Scalars ---------------------------------------------===//
308329
for x in decoded.lazy.map({ UnicodeScalar($0)! }) {

0 commit comments

Comments
 (0)