22
22
// FIXME(ABI)#73 : The UTF-8 string view should have a custom iterator type to
23
23
// allow performance optimizations of linear traversals.
24
24
25
- extension _StringCore {
26
- /// An integral type that holds a sequence of UTF-8 code units, starting in
27
- /// its low byte.
28
- internal typealias _UTF8Chunk = UInt64
29
-
30
- /// Encode text starting at `i` as UTF-8. Returns a pair whose first
31
- /// element is the index of the text following whatever got encoded,
32
- /// and the second element contains the encoded UTF-8 starting in its
33
- /// low byte. Any unused high bytes in the result will be set to
34
- /// 0xFF.
35
- @inline ( __always)
36
- func _encodeSomeUTF8( from i: Int ) -> ( Int , _UTF8Chunk ) {
37
- _sanityCheck ( i <= count)
38
-
39
- if let asciiBuffer = self . asciiBuffer {
40
- // How many UTF-16 code units might we use before we've filled up
41
- // our _UTF8Chunk with UTF-8 code units?
42
- let utf16Count =
43
- Swift . min ( MemoryLayout< _UTF8Chunk> . size, asciiBuffer. count - i)
44
-
45
- var result : _UTF8Chunk = ~ 0 // Start with all bits set
46
-
47
- _memcpy (
48
- dest: UnsafeMutableRawPointer ( Builtin . addressof ( & result) ) ,
49
- src: asciiBuffer. baseAddress! + i,
50
- size: numericCast ( utf16Count) )
51
-
52
- // Convert the _UTF8Chunk into host endianness.
53
- return ( i + utf16Count, _UTF8Chunk ( littleEndian: result) )
54
- } else if _fastPath ( _baseAddress != nil ) {
55
- // Transcoding should return a _UTF8Chunk in host endianness.
56
- return _encodeSomeContiguousUTF16AsUTF8 ( from: i)
57
- } else {
58
- #if _runtime(_ObjC)
59
- return _encodeSomeNonContiguousUTF16AsUTF8 ( from: i)
60
- #else
61
- _sanityCheckFailure ( " _encodeSomeUTF8: Unexpected cocoa string " )
62
- #endif
63
- }
64
- }
65
-
66
- /// Helper for `_encodeSomeUTF8`, above. Handles the case where the
67
- /// storage is contiguous UTF-16.
68
- func _encodeSomeContiguousUTF16AsUTF8( from i: Int ) -> ( Int , _UTF8Chunk ) {
69
- _sanityCheck ( elementWidth == 2 )
70
- _sanityCheck ( _baseAddress != nil )
71
-
72
- let storage = UnsafeBufferPointer ( start: startUTF16, count: self . count)
73
- return _transcodeSomeUTF16AsUTF8 ( storage, i)
74
- }
75
-
76
- #if _runtime(_ObjC)
77
- /// Helper for `_encodeSomeUTF8`, above. Handles the case where the
78
- /// storage is non-contiguous UTF-16.
79
- func _encodeSomeNonContiguousUTF16AsUTF8( from i: Int ) -> ( Int , _UTF8Chunk ) {
80
- _sanityCheck ( elementWidth == 2 )
81
- _sanityCheck ( _baseAddress == nil )
82
-
83
- let storage = _CollectionOf < Int , UInt16 > (
84
- _startIndex: 0 , endIndex: self . count
85
- ) {
86
- ( i: Int ) -> UInt16 in
87
- return _cocoaStringSubscript ( self , i)
88
- }
89
- return _transcodeSomeUTF16AsUTF8 ( storage, i)
90
- }
91
- #endif
92
- }
93
-
94
25
extension String {
95
26
/// A view of a string's contents as a collection of UTF-8 code units.
96
27
///
@@ -170,6 +101,7 @@ extension String {
170
101
: Collection ,
171
102
CustomStringConvertible ,
172
103
CustomDebugStringConvertible {
104
+ @_versioned
173
105
internal let _core : _StringCore
174
106
175
107
init ( _ _core: _StringCore ) {
@@ -195,56 +127,97 @@ extension String {
195
127
return Index ( encodedOffset: _core. endIndex)
196
128
}
197
129
130
+ @_versioned
198
131
internal func _index( atEncodedOffset n: Int ) -> Index {
199
132
if _fastPath ( _core. isASCII) { return Index ( encodedOffset: n) }
133
+ if n == _core. endIndex { return endIndex }
134
+
200
135
var p = UTF16 . ForwardParser ( )
201
136
var i = _core [ n... ] . makeIterator ( )
202
- let s = p. parseScalar ( from: & i)
203
-
204
- if case . valid( let u16) = s {
205
- _onFastPath ( )
206
- let u8 = UTF8 . transcode ( u16, from: UTF16 . self)
207
- _sanityCheck ( u16. count >= 0 && u16. count <= 2 )
208
- let stride = UInt8 ( extendingOrTruncating: u16. count)
209
- return Index (
210
- encodedOffset: n,
211
- . utf8(
212
- encodedScalar: u8. _unsafelyUnwrappedUnchecked, stride: stride) )
213
- }
214
-
215
- if case . error( let stride) = s {
216
- return Index (
217
- encodedOffset: n,
218
- . utf8(
219
- encodedScalar: UTF8 . encodedReplacementCharacter,
220
- stride: UInt8 ( extendingOrTruncating: stride) ) )
137
+ var buffer = Index . _UTF8Buffer ( )
138
+ Loop:
139
+ while true {
140
+ switch p. parseScalar ( from: & i) {
141
+ case . valid( let u16) :
142
+ let u8 = Unicode . UTF8. transcode ( u16, from: Unicode . UTF16. self)
143
+ . _unsafelyUnwrappedUnchecked
144
+ if buffer. count + u8. count > buffer. capacity { break Loop }
145
+ buffer. append ( contentsOf: u8)
146
+ case . error:
147
+ let u8 = Unicode . UTF8. encodedReplacementCharacter
148
+ if buffer. count + u8. count > buffer. capacity { break Loop }
149
+ buffer. append ( contentsOf: u8)
150
+ case . emptyInput:
151
+ break Loop
152
+ }
221
153
}
222
- _onFastPath ( )
223
- return Index ( encodedOffset: n)
154
+ return Index ( encodedOffset: n, . utf8( buffer: buffer) )
224
155
}
225
156
226
157
/// Returns the next consecutive position after `i`.
227
158
///
228
159
/// - Precondition: The next position is representable.
160
+ @inline ( __always)
229
161
public func index( after i: Index ) -> Index {
230
- _precondition ( i != endIndex, " Can't advance past endIndex " )
231
162
if _fastPath ( _core. isASCII) {
163
+ precondition ( i. encodedOffset < _core. count)
232
164
return Index ( encodedOffset: i. encodedOffset + 1 )
233
165
}
166
+
234
167
var j = i
235
168
while true {
236
- if case . utf8( let encodedScalar , let stride ) = j. _cache {
169
+ if case . utf8( let buffer ) = j. _cache {
237
170
_onFastPath ( )
238
- j. _transcodedOffset += 1
239
- if _fastPath ( j. _transcodedOffset < encodedScalar. count) {
240
- return j
171
+ var scalarLength16 = 1
172
+ let b0 = buffer. first. _unsafelyUnwrappedUnchecked
173
+ var nextBuffer = buffer
174
+
175
+ let leading1s = ( ~ b0) . leadingZeroBitCount
176
+ if leading1s == 0 {
177
+ nextBuffer. removeFirst ( )
178
+ }
179
+ else {
180
+ let n8 = j. _transcodedOffset + 1
181
+ // If we haven't reached a scalar boundary...
182
+ if _fastPath ( n8 < leading1s) {
183
+ return Index (
184
+ encodedOffset: j. encodedOffset,
185
+ transcodedOffset: n8, . utf8( buffer: nextBuffer) )
186
+ }
187
+ scalarLength16 = n8 >> 2 + 1
188
+ nextBuffer. removeFirst ( n8)
241
189
}
242
- return _index ( atEncodedOffset: j. encodedOffset &+ numericCast ( stride) )
190
+ if _fastPath ( !nextBuffer. isEmpty) {
191
+ return Index (
192
+ encodedOffset: j. encodedOffset + scalarLength16,
193
+ . utf8( buffer: nextBuffer) )
194
+ }
195
+ return _index ( atEncodedOffset: j. encodedOffset + scalarLength16)
243
196
}
244
197
j = _index ( atEncodedOffset: j. encodedOffset)
198
+ precondition ( j != endIndex, " index out of bounds " )
245
199
}
246
200
}
247
201
202
+ public func distance( from i: Index , to j: Index ) -> IndexDistance {
203
+ if _fastPath ( _core. isASCII) {
204
+ return j. encodedOffset - i. encodedOffset
205
+ }
206
+ return j >= i
207
+ ? _forwardDistance ( from: i, to: j) : - _forwardDistance( from: j, to: i)
208
+ }
209
+
210
+ @_versioned
211
+ @inline ( __always)
212
+ internal func _forwardDistance( from i: Index , to j: Index ) -> IndexDistance {
213
+ var r : IndexDistance = j. _transcodedOffset - i. _transcodedOffset
214
+ UTF8 . _transcode (
215
+ _core [ i. encodedOffset..< j. encodedOffset] , from: UTF16 . self) {
216
+ r += $0. count
217
+ }
218
+ return r
219
+ }
220
+
248
221
/// Accesses the code unit at the given position.
249
222
///
250
223
/// The following example uses the subscript to print the value of a
@@ -258,27 +231,27 @@ extension String {
258
231
/// - Parameter position: A valid index of the view. `position`
259
232
/// must be less than the view's end index.
260
233
public subscript( position: Index ) -> UTF8 . CodeUnit {
261
- _precondition ( position != endIndex, " cannot subscript using endIndex " )
262
- if _fastPath ( _core. isASCII) {
263
- return UTF8 . CodeUnit ( _core [ position. encodedOffset] )
264
- }
265
- var j = position
266
- while true {
267
- if case let . utf8( encodedScalar, _) = j. _cache {
268
- _onFastPath ( )
269
- _sanityCheck ( ( 0 ..< 4 ) . contains ( j. _transcodedOffset) )
270
-
271
- let i = encodedScalar. index (
272
- encodedScalar. startIndex, offsetBy: j. _transcodedOffset)
273
-
274
- return encodedScalar [ i]
234
+ @inline ( __always)
235
+ get {
236
+ if _fastPath ( _core. asciiBuffer != nil ) , let ascii = _core. asciiBuffer {
237
+ _precondition ( position < endIndex, " index out of bounds " )
238
+ return ascii [ position. encodedOffset]
239
+ }
240
+ var j = position
241
+ while true {
242
+ if case . utf8( let buffer) = j. _cache {
243
+ _onFastPath ( )
244
+ return buffer [
245
+ buffer. index ( buffer. startIndex, offsetBy: j. _transcodedOffset) ]
246
+ }
247
+ j = _index ( atEncodedOffset: j. encodedOffset)
248
+ precondition ( j < endIndex, " index out of bounds " )
275
249
}
276
- j = _index ( atEncodedOffset: j. encodedOffset)
277
250
}
278
251
}
279
252
280
253
public var description : String {
281
- return String . _fromCodeUnitSequenceWithRepair ( UTF8 . self , input : self ) . 0
254
+ return String ( _core )
282
255
}
283
256
284
257
public var debugDescription : String {
@@ -445,7 +418,6 @@ extension String.UTF8View.Iterator : IteratorProtocol {
445
418
refillingFrom source: Source
446
419
) -> Unicode . UTF8 . CodeUnit ?
447
420
where Source. Element == Unicode . UTF16 . CodeUnit ,
448
- Source. _Element == Unicode . UTF16 . CodeUnit ,
449
421
Source. Index == Int
450
422
{
451
423
_sanityCheck ( _buffer == 0 )
0 commit comments