Skip to content

Commit f7ff8ce

Browse files
authored
Merge pull request #9381 from apple/direct-transcoding
2 parents 5b3e077 + c5d6880 commit f7ff8ce

14 files changed

+263
-118
lines changed

stdlib/public/core/ASCII.swift

Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,88 @@
1+
//===--- ASCII.swift ------------------------------------------------------===//
2+
//
3+
// This source file is part of the Swift.org open source project
4+
//
5+
// Copyright (c) 2014 - 2017 Apple Inc. and the Swift project authors
6+
// Licensed under Apache License v2.0 with Runtime Library Exception
7+
//
8+
// See https://swift.org/LICENSE.txt for license information
9+
// See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
10+
//
11+
//===----------------------------------------------------------------------===//
// Declares the `ASCII` type inside the `_Unicode` namespace. It is an enum
// with no cases, so no value of it can be created; it serves only as a type
// that extensions hang static members and nested types on.
12+
extension _Unicode {
13+
@_fixed_layout
14+
public enum ASCII {}
15+
}
16+
// `UnicodeEncoding` conformance for ASCII. A code unit is a `UInt8`, and an
// encoded scalar is exactly one code unit wrapped in a `CollectionOfOne`.
17+
extension _Unicode.ASCII : UnicodeEncoding {
18+
public typealias CodeUnit = UInt8
19+
public typealias EncodedScalar = CollectionOfOne<CodeUnit>
20+
21+
/// The replacement value for this encoding: the single code unit 0x1A.
public static var encodedReplacementCharacter : EncodedScalar {
22+
return EncodedScalar(0x1a) // U+001A SUBSTITUTE; best we can do for ASCII
23+
}
24+
/// Always returns `true`, regardless of `x`.
// NOTE(review): presumably because every ASCII code unit is a complete
// scalar on its own; confirm against the `UnicodeEncoding` requirement.
25+
@inline(__always)
26+
@_inlineable
27+
public static func _isScalar(_ x: CodeUnit) -> Bool {
28+
return true
29+
}
30+
/// Returns the `UnicodeScalar` whose value is the single code unit held by
/// `source`, widened to `UInt32`.
///
/// The scalar is built with the `_unchecked:` initializer, and
/// `source.first` is unwrapped without a nil check; a `CollectionOfOne`
/// always holds exactly one element.
31+
@inline(__always)
32+
@_inlineable
33+
public static func decode(_ source: EncodedScalar) -> UnicodeScalar {
34+
return UnicodeScalar(_unchecked: UInt32(
35+
source.first._unsafelyUnwrappedUnchecked))
36+
}
37+
/// Returns `source` as a one-code-unit `EncodedScalar`, or `nil` when
/// `source.value` is 128 or greater.
38+
@inline(__always)
39+
@_inlineable
40+
public static func encodeIfRepresentable(
41+
_ source: UnicodeScalar
42+
) -> EncodedScalar? {
// `1&<<7` is 128; anything below it fits in seven bits, so the
// truncating conversion on the next line cannot lose information.
43+
guard source.value < (1&<<7) else { return nil }
44+
return EncodedScalar(UInt8(extendingOrTruncating: source.value))
45+
}
46+
/// Converts one scalar encoded in `FromEncoding` into an ASCII
/// `EncodedScalar`, returning `nil` when the relevant range check fails.
///
/// UTF-16 and UTF-8 inputs are handled by bit-casting `content` and
/// inspecting its `_storage` directly. Any other encoding is decoded to a
/// `UnicodeScalar` and passed to `encodeIfRepresentable`.
47+
@inline(__always)
48+
public static func transcodeIfRepresentable<FromEncoding : UnicodeEncoding>(
49+
_ content: FromEncoding.EncodedScalar, from _: FromEncoding.Type
50+
) -> EncodedScalar? {
51+
if _fastPath(FromEncoding.self == UTF16.self) {
// The metatype check above guards this cast: `content` is reinterpreted
// as a `UTF16.EncodedScalar`.
52+
let c = unsafeBitCast(content, to: UTF16.EncodedScalar.self)
// Reject when any of bits 7...15 of `_storage` is set.
// NOTE(review): assumes the first code unit sits in the low 16 bits of
// `_storage`; confirm against the `UTF16.EncodedScalar` definition.
53+
guard (c._storage & 0xFF80 == 0) else { return nil }
54+
return EncodedScalar(CodeUnit(c._storage & 0x7f))
55+
}
56+
else if _fastPath(FromEncoding.self == UTF8.self) {
57+
let c = unsafeBitCast(content, to: UTF8.EncodedScalar.self)
// Reject when bit 7 of `_storage` is set.
// NOTE(review): assumes the first code unit sits in the low 8 bits of
// `_storage`; confirm against the `UTF8.EncodedScalar` definition.
58+
guard (c._storage & 0x80 == 0) else { return nil }
59+
return EncodedScalar(CodeUnit(c._storage & 0x7f))
60+
}
// Generic fallback: decode to a scalar, then apply the ASCII range check.
61+
return encodeIfRepresentable(FromEncoding.decode(content))
62+
}
63+
/// Parser type for ASCII. It declares no stored properties here; its
/// `UnicodeParser` conformance is supplied in a separate extension.
64+
public struct Parser {
65+
public init() { }
66+
}
67+
// The same `Parser` type is used for both parsing directions.
68+
public typealias ForwardParser = Parser
69+
public typealias ReverseParser = Parser
70+
}
71+
// `UnicodeParser` conformance for the ASCII parser.
72+
extension _Unicode.ASCII.Parser : UnicodeParser {
73+
public typealias Encoding = _Unicode.ASCII
74+
75+
/// Parses a single Unicode scalar value from `input`.
///
/// Calls `input.next()` exactly once.
///
/// - Parameter input: An iterator of ASCII code units (`UInt8`).
/// - Returns: `.emptyInput` when `input.next()` returns `nil`;
///   `.invalid(length: 1)` when the code unit has its high bit set;
///   otherwise `.valid` wrapping the code unit as an `EncodedScalar`.
76+
public mutating func parseScalar<I : IteratorProtocol>(
77+
from input: inout I
78+
) -> _Unicode.ParseResult<Encoding.EncodedScalar>
79+
where I.Element == Encoding.CodeUnit {
80+
let n = input.next()
81+
if _fastPath(n != nil), let x = n {
// Reinterpreting the byte as `Int8` makes 0x80...0xFF negative, so
// `>= 0` tests that the high bit is clear (the value is below 128).
82+
guard _fastPath(Int8(extendingOrTruncating: x) >= 0)
83+
else { return .invalid(length: 1) }
84+
return .valid(_Unicode.ASCII.EncodedScalar(x))
85+
}
86+
return .emptyInput
87+
}
88+
}

stdlib/public/core/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ set(SWIFTLIB_ESSENTIAL
2525
ArrayCast.swift
2626
Arrays.swift.gyb
2727
ArrayType.swift
28+
ASCII.swift
2829
Assert.swift
2930
AssertCommon.swift
3031
BidirectionalCollection.swift

stdlib/public/core/CString.swift

Lines changed: 11 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -46,8 +46,8 @@ extension String {
4646
public init(cString: UnsafePointer<CChar>) {
4747
let len = UTF8._nullCodeUnitOffset(in: cString)
4848
let (result, _) = cString.withMemoryRebound(to: UInt8.self, capacity: len) {
49-
_decodeCString($0, as: UTF8.self, length: len,
50-
repairingInvalidCodeUnits: true)!
49+
_decodeCString(
50+
$0, as: UTF8.self, length: len, repairingInvalidCodeUnits: true)!
5151
}
5252
self = result
5353
}
@@ -58,8 +58,8 @@ extension String {
5858
/// This is identical to `init(cString: UnsafePointer<CChar>)` but operates on an
5959
/// unsigned sequence of bytes.
6060
public init(cString: UnsafePointer<UInt8>) {
61-
self = String.decodeCString(cString, as: UTF8.self,
62-
repairingInvalidCodeUnits: true)!.result
61+
self = String.decodeCString(
62+
cString, as: UTF8.self, repairingInvalidCodeUnits: true)!.result
6363
}
6464

6565
/// Creates a new string by copying and validating the null-terminated UTF-8
@@ -145,7 +145,7 @@ extension String {
145145
/// ill-formed sequence is detected, this method returns `nil`.
146146
///
147147
/// - SeeAlso: `UnicodeCodec`
148-
public static func decodeCString<Encoding : UnicodeCodec>(
148+
public static func decodeCString<Encoding : UnicodeEncoding>(
149149
_ cString: UnsafePointer<Encoding.CodeUnit>?,
150150
as encoding: Encoding.Type,
151151
repairingInvalidCodeUnits isRepairing: Bool = true)
@@ -154,8 +154,11 @@ extension String {
154154
guard let cString = cString else {
155155
return nil
156156
}
157-
let len = encoding._nullCodeUnitOffset(in: cString)
158-
return _decodeCString(cString, as: encoding, length: len,
157+
var end = cString
158+
while end.pointee != 0 { end += 1 }
159+
let len = end - cString
160+
return _decodeCString(
161+
cString, as: encoding, length: len,
159162
repairingInvalidCodeUnits: isRepairing)
160163
}
161164

@@ -180,7 +183,7 @@ public func _persistCString(_ p: UnsafePointer<CChar>?) -> [CChar]? {
180183
/// the given pointer using the specified encoding.
181184
///
182185
/// This internal helper takes the string length as an argument.
183-
func _decodeCString<Encoding : UnicodeCodec>(
186+
func _decodeCString<Encoding : UnicodeEncoding>(
184187
_ cString: UnsafePointer<Encoding.CodeUnit>,
185188
as encoding: Encoding.Type, length: Int,
186189
repairingInvalidCodeUnits isRepairing: Bool = true)

stdlib/public/core/GroupInfo.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
"AssertCommon.swift"
55
],
66
"String": [
7+
"ASCII.swift",
78
"CString.swift",
89
"Character.swift",
910
"StaticString.swift",

stdlib/public/core/String.swift

Lines changed: 13 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -344,23 +344,22 @@ public struct String {
344344

345345
extension String {
346346
public // @testable
347-
static func _fromWellFormedCodeUnitSequence<Encoding, Input>(
347+
static func _fromWellFormedCodeUnitSequence<
348+
Encoding : UnicodeEncoding, Input : Collection
349+
>(
348350
_ encoding: Encoding.Type, input: Input
349351
) -> String
350-
where
351-
Encoding: UnicodeCodec,
352-
Input: Collection,
353-
Input.Iterator.Element == Encoding.CodeUnit {
352+
where Input.Iterator.Element == Encoding.CodeUnit {
354353
return String._fromCodeUnitSequence(encoding, input: input)!
355354
}
356355

357356
public // @testable
358-
static func _fromCodeUnitSequence<Encoding, Input>(
357+
static func _fromCodeUnitSequence<
358+
Encoding : UnicodeEncoding, Input : Collection
359+
>(
359360
_ encoding: Encoding.Type, input: Input
360361
) -> String?
361362
where
362-
Encoding: UnicodeCodec,
363-
Input: Collection,
364363
Input.Iterator.Element == Encoding.CodeUnit {
365364
let (stringBufferOptional, _) =
366365
_StringBuffer.fromCodeUnits(input, encoding: encoding,
@@ -369,12 +368,12 @@ extension String {
369368
}
370369

371370
public // @testable
372-
static func _fromCodeUnitSequenceWithRepair<Encoding, Input>(
371+
static func _fromCodeUnitSequenceWithRepair<
372+
Encoding : UnicodeEncoding, Input : Collection
373+
>(
373374
_ encoding: Encoding.Type, input: Input
374375
) -> (String, hadError: Bool)
375376
where
376-
Encoding: UnicodeCodec,
377-
Input: Collection,
378377
Input.Iterator.Element == Encoding.CodeUnit {
379378

380379
let (stringBuffer, hadError) =
@@ -485,23 +484,21 @@ extension String {
485484
/// Returns the number of code units occupied by this string
486485
/// in the given encoding.
487486
func _encodedLength<
488-
Encoding: UnicodeCodec
487+
Encoding: UnicodeEncoding
489488
>(_ encoding: Encoding.Type) -> Int {
490489
var codeUnitCount = 0
491490
self._encode(encoding, into: { _ in codeUnitCount += 1 })
492491
return codeUnitCount
493492
}
494493

495-
// FIXME: this function does not handle the case when a wrapped NSString
494+
// FIXME: this function may not handle the case when a wrapped NSString
496495
// contains unpaired surrogates. Fix this before exposing this function as a
497496
// public API. But it is unclear if it is valid to have such an NSString in
498497
// the first place. If it is not, we should not be crashing in an obscure
499498
// way -- add a test for that.
500499
// Related: <rdar://problem/17340917> Please document how NSString interacts
501500
// with unpaired surrogates
502-
func _encode<
503-
Encoding: UnicodeCodec
504-
>(
501+
func _encode<Encoding: UnicodeEncoding>(
505502
_ encoding: Encoding.Type,
506503
into processCodeUnit: (Encoding.CodeUnit) -> Void
507504
) {

stdlib/public/core/StringBuffer.swift

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -97,7 +97,7 @@ public struct _StringBuffer {
9797
) -> (_StringBuffer?, hadError: Bool)
9898
where
9999
Input : Collection, // Sequence?
100-
Encoding : UnicodeCodec,
100+
Encoding : UnicodeEncoding,
101101
Input.Iterator.Element == Encoding.CodeUnit {
102102
// Determine how many UTF-16 code units we'll need
103103
let inputStream = input.makeIterator()

stdlib/public/core/StringCore.swift

Lines changed: 47 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -336,32 +336,60 @@ public struct _StringCore {
336336
}
337337
}
338338

/// A buffer view of this core's storage as one-byte code units, or `nil`
/// when `_baseAddress` is `nil` or `elementWidth` is not 1.
///
/// The buffer spans `count` elements starting at `_baseAddress`. An
/// `UnsafeBufferPointer` does not keep its memory alive.
// NOTE(review): callers presumably must keep the core alive while using the
// buffer (`encode` in this diff does so with `_fixLifetime(self)`); confirm.
339+
var _unmanagedASCII : UnsafeBufferPointer<_Unicode.ASCII.CodeUnit>? {
340+
@inline(__always)
341+
get {
342+
guard _fastPath(_baseAddress != nil && elementWidth == 1) else {
343+
return nil
344+
}
// The force-unwrap is safe here: the guard above established that
// `_baseAddress` is non-nil.
345+
return UnsafeBufferPointer(
346+
start: _baseAddress!.assumingMemoryBound(
347+
to: _Unicode.ASCII.CodeUnit.self),
348+
count: count
349+
)
350+
}
351+
}
352+
/// A buffer view of this core's storage as UTF-16 code units, or `nil`
/// when `_baseAddress` is `nil` or `elementWidth` is 1.
///
/// The buffer spans `count` elements starting at `_baseAddress`. An
/// `UnsafeBufferPointer` does not keep its memory alive.
// NOTE(review): callers presumably must keep the core alive while using the
// buffer (`encode` in this diff does so with `_fixLifetime(self)`); confirm.
353+
var _unmanagedUTF16 : UnsafeBufferPointer<UTF16.CodeUnit>? {
354+
@inline(__always)
355+
get {
356+
guard _fastPath(_baseAddress != nil && elementWidth != 1) else {
357+
return nil
358+
}
// The force-unwrap is safe here: the guard above established that
// `_baseAddress` is non-nil.
359+
return UnsafeBufferPointer(
360+
start: _baseAddress!.assumingMemoryBound(to: UTF16.CodeUnit.self),
361+
count: count
362+
)
363+
}
364+
}
365+
339366
/// Write the string, in the given encoding, to output.
340-
func encode<Encoding: UnicodeCodec>(
367+
func encode<Encoding: UnicodeEncoding>(
341368
_ encoding: Encoding.Type,
342369
into processCodeUnit: (Encoding.CodeUnit) -> Void)
343370
{
344-
if _fastPath(_baseAddress != nil) {
345-
if _fastPath(elementWidth == 1) {
346-
for x in UnsafeBufferPointer(
347-
start: _baseAddress!.assumingMemoryBound(to: UTF8.CodeUnit.self),
348-
count: count
349-
) {
350-
Encoding.encode(UnicodeScalar(x), into: processCodeUnit)
371+
defer { _fixLifetime(self) }
372+
if let bytes = _unmanagedASCII {
373+
if encoding == _Unicode.ASCII.self
374+
|| encoding == _Unicode.UTF8.self
375+
|| encoding == _Unicode.UTF16.self
376+
|| encoding == _Unicode.UTF32.self {
377+
bytes.forEach {
378+
processCodeUnit(Encoding.CodeUnit(extendingOrTruncating: $0))
351379
}
352380
}
353381
else {
354-
let hadError = transcode(
355-
UnsafeBufferPointer(
356-
start: _baseAddress!.assumingMemoryBound(to: UTF16.CodeUnit.self),
357-
count: count
358-
).makeIterator(),
359-
from: UTF16.self,
360-
to: encoding,
361-
stoppingOnError: true,
362-
into: processCodeUnit
363-
)
364-
_sanityCheck(!hadError, "Swift.String with native storage should not have unpaired surrogates")
382+
// TODO: be sure tests exercise this code path.
383+
for b in bytes {
384+
Encoding.encode(
385+
UnicodeScalar(_unchecked: UInt32(b))).forEach(processCodeUnit)
386+
}
387+
}
388+
}
389+
else if let content = _unmanagedUTF16 {
390+
var i = content.makeIterator()
391+
_Unicode.UTF16.ForwardParser.parse(&i) {
392+
Encoding.transcode($0, from: UTF16.self).forEach(processCodeUnit)
365393
}
366394
}
367395
else if hasCocoaBuffer {

stdlib/public/core/UTF16.swift

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,9 @@ extension _Unicode.UTF16 : UnicodeEncoding {
3737
return UnicodeScalar(_unchecked: value)
3838
}
3939

40-
public static func encode(_ source: UnicodeScalar) -> EncodedScalar? {
40+
public static func encodeIfRepresentable(
41+
_ source: UnicodeScalar
42+
) -> EncodedScalar? {
4143
let x = source.value
4244
if _fastPath(x < (1 << 16)) {
4345
return EncodedScalar(_storage: x, _bitCount: 16)
@@ -50,7 +52,7 @@ extension _Unicode.UTF16 : UnicodeEncoding {
5052
}
5153

5254
@inline(__always)
53-
public static func transcode<FromEncoding : UnicodeEncoding>(
55+
public static func transcodeIfRepresentable<FromEncoding : UnicodeEncoding>(
5456
_ content: FromEncoding.EncodedScalar, from _: FromEncoding.Type
5557
) -> EncodedScalar? {
5658
if _fastPath(FromEncoding.self == UTF8.self) {
@@ -84,12 +86,12 @@ extension _Unicode.UTF16 : UnicodeEncoding {
8486
s &>>= 8
8587
r |= s & 0b0__11_1111
8688
r &= (1 &<< 21) - 1
87-
return encode(UnicodeScalar(_unchecked: r))
89+
return encodeIfRepresentable(UnicodeScalar(_unchecked: r))
8890
}
8991
else if _fastPath(FromEncoding.self == UTF16.self) {
9092
return unsafeBitCast(content, to: UTF16.EncodedScalar.self)
9193
}
92-
return encode(FromEncoding.decode(content))
94+
return encodeIfRepresentable(FromEncoding.decode(content))
9395
}
9496

9597
public struct ForwardParser {

stdlib/public/core/UTF32.swift

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,9 @@ extension _Unicode.UTF32 : UnicodeEncoding {
3434
}
3535

3636
@inline(__always)
37-
public static func encode(_ source: UnicodeScalar) -> EncodedScalar? {
37+
public static func encodeIfRepresentable(
38+
_ source: UnicodeScalar
39+
) -> EncodedScalar? {
3840
return EncodedScalar(source.value)
3941
}
4042

stdlib/public/core/UTF8.swift

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,9 @@ extension _Unicode.UTF8 : UnicodeEncoding {
5858

5959
@inline(__always)
6060
@_inlineable
61-
public static func encode(_ source: UnicodeScalar) -> EncodedScalar? {
61+
public static func encodeIfRepresentable(
62+
_ source: UnicodeScalar
63+
) -> EncodedScalar? {
6264
var c = source.value
6365
if _fastPath(c < (1&<<7)) {
6466
return EncodedScalar(_storage: c, _bitCount: 8)
@@ -86,7 +88,7 @@ extension _Unicode.UTF8 : UnicodeEncoding {
8688
}
8789

8890
@inline(__always)
89-
public static func transcode<FromEncoding : UnicodeEncoding>(
91+
public static func transcodeIfRepresentable<FromEncoding : UnicodeEncoding>(
9092
_ content: FromEncoding.EncodedScalar, from _: FromEncoding.Type
9193
) -> EncodedScalar? {
9294
if _fastPath(FromEncoding.self == UTF16.self) {
@@ -116,7 +118,7 @@ extension _Unicode.UTF8 : UnicodeEncoding {
116118
else if _fastPath(FromEncoding.self == UTF8.self) {
117119
return unsafeBitCast(content, to: UTF8.EncodedScalar.self)
118120
}
119-
return encode(FromEncoding.decode(content))
121+
return encodeIfRepresentable(FromEncoding.decode(content))
120122
}
121123

122124
@_fixed_layout

0 commit comments

Comments
 (0)