Skip to content

Commit 283775e

Browse files
author
Dave Abrahams
committed
[stdlib] Rebuild String.Index for UTF8View
1 parent 576b8de commit 283775e

File tree

7 files changed

+151
-218
lines changed

7 files changed

+151
-218
lines changed

stdlib/public/core/StringCore.swift

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -428,7 +428,6 @@ public struct _StringCore {
428428
// In order to grow the substring in place, this _StringCore should point
429429
// at the substring at the end of a _StringBuffer. Otherwise, some other
430430
// String is using parts of the buffer beyond our last byte.
431-
let usedStart = _pointer(toElementAt:0)
432431
let usedEnd = _pointer(toElementAt:count)
433432

434433
// Attempt to claim unused capacity in the buffer

stdlib/public/core/StringIndex.swift

Lines changed: 26 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -13,17 +13,36 @@ extension String {
1313
/// A position of a character or code unit in a string.
1414
public struct Index {
1515
internal var _compoundOffset : UInt64
16+
@_versioned
1617
internal var _cache: _Cache
17-
18+
19+
internal typealias _UTF8Buffer = _ValidUTF8Buffer<UInt64>
20+
@_versioned
1821
internal enum _Cache {
1922
case utf16
20-
case utf8(encodedScalar: Unicode.UTF8.EncodedScalar, stride: UInt8)
23+
case utf8(buffer: _UTF8Buffer)
2124
case character(stride: UInt16)
2225
case unicodeScalar(value: Unicode.Scalar)
2326
}
2427
}
2528
}
2629

30+
/// Convenience accessors: each returns the payload of the like-named case, or `nil` when `self` is a different case
31+
extension String.Index._Cache {
32+
var utf16: Void? {
33+
if case .utf16 = self { return () } else { return nil }
34+
}
35+
var utf8: String.Index._UTF8Buffer? {
36+
if case .utf8(let r) = self { return r } else { return nil }
37+
}
38+
var character: UInt16? {
39+
if case .character(let r) = self { return r } else { return nil }
40+
}
41+
var unicodeScalar: UnicodeScalar? {
42+
if case .unicodeScalar(let r) = self { return r } else { return nil }
43+
}
44+
}
45+
2746
extension String.Index : Equatable {
2847
public static func == (lhs: String.Index, rhs: String.Index) -> Bool {
2948
return lhs._compoundOffset == rhs._compoundOffset
@@ -46,9 +65,10 @@ extension String.Index {
4665
_compoundOffset = UInt64(offset << _Self._strideBits)
4766
_cache = .utf16
4867
}
49-
50-
internal init(encodedOffset o: Int, _ c: _Cache) {
51-
_compoundOffset = UInt64(o << _Self._strideBits)
68+
69+
@_versioned
70+
internal init(encodedOffset o: Int, transcodedOffset: Int = 0, _ c: _Cache) {
71+
_compoundOffset = UInt64(o << _Self._strideBits | transcodedOffset)
5272
_cache = c
5373
}
5474

@@ -65,6 +85,7 @@ extension String.Index {
6585
}
6686

6787
/// The offset of this index past `encodedOffset`, counted in code units of whatever encoding the string is being viewed as
88+
@_versioned
6889
internal var _transcodedOffset : Int {
6990
get {
7091
return Int(_compoundOffset & _Self._mask)

stdlib/public/core/StringUTF8.swift

Lines changed: 85 additions & 113 deletions
Original file line numberDiff line numberDiff line change
@@ -22,75 +22,6 @@
2222
// FIXME(ABI)#73 : The UTF-8 string view should have a custom iterator type to
2323
// allow performance optimizations of linear traversals.
2424

25-
extension _StringCore {
26-
/// An integral type that holds a sequence of UTF-8 code units, starting in
27-
/// its low byte.
28-
internal typealias _UTF8Chunk = UInt64
29-
30-
/// Encode text starting at `i` as UTF-8. Returns a pair whose first
31-
/// element is the index of the text following whatever got encoded,
32-
/// and the second element contains the encoded UTF-8 starting in its
33-
/// low byte. Any unused high bytes in the result will be set to
34-
/// 0xFF.
35-
@inline(__always)
36-
func _encodeSomeUTF8(from i: Int) -> (Int, _UTF8Chunk) {
37-
_sanityCheck(i <= count)
38-
39-
if let asciiBuffer = self.asciiBuffer {
40-
// How many UTF-16 code units might we use before we've filled up
41-
// our _UTF8Chunk with UTF-8 code units?
42-
let utf16Count =
43-
Swift.min(MemoryLayout<_UTF8Chunk>.size, asciiBuffer.count - i)
44-
45-
var result: _UTF8Chunk = ~0 // Start with all bits set
46-
47-
_memcpy(
48-
dest: UnsafeMutableRawPointer(Builtin.addressof(&result)),
49-
src: asciiBuffer.baseAddress! + i,
50-
size: numericCast(utf16Count))
51-
52-
// Convert the _UTF8Chunk into host endianness.
53-
return (i + utf16Count, _UTF8Chunk(littleEndian: result))
54-
} else if _fastPath(_baseAddress != nil) {
55-
// Transcoding should return a _UTF8Chunk in host endianness.
56-
return _encodeSomeContiguousUTF16AsUTF8(from: i)
57-
} else {
58-
#if _runtime(_ObjC)
59-
return _encodeSomeNonContiguousUTF16AsUTF8(from: i)
60-
#else
61-
_sanityCheckFailure("_encodeSomeUTF8: Unexpected cocoa string")
62-
#endif
63-
}
64-
}
65-
66-
/// Helper for `_encodeSomeUTF8`, above. Handles the case where the
67-
/// storage is contiguous UTF-16.
68-
func _encodeSomeContiguousUTF16AsUTF8(from i: Int) -> (Int, _UTF8Chunk) {
69-
_sanityCheck(elementWidth == 2)
70-
_sanityCheck(_baseAddress != nil)
71-
72-
let storage = UnsafeBufferPointer(start: startUTF16, count: self.count)
73-
return _transcodeSomeUTF16AsUTF8(storage, i)
74-
}
75-
76-
#if _runtime(_ObjC)
77-
/// Helper for `_encodeSomeUTF8`, above. Handles the case where the
78-
/// storage is non-contiguous UTF-16.
79-
func _encodeSomeNonContiguousUTF16AsUTF8(from i: Int) -> (Int, _UTF8Chunk) {
80-
_sanityCheck(elementWidth == 2)
81-
_sanityCheck(_baseAddress == nil)
82-
83-
let storage = _CollectionOf<Int, UInt16>(
84-
_startIndex: 0, endIndex: self.count
85-
) {
86-
(i: Int) -> UInt16 in
87-
return _cocoaStringSubscript(self, i)
88-
}
89-
return _transcodeSomeUTF16AsUTF8(storage, i)
90-
}
91-
#endif
92-
}
93-
9425
extension String {
9526
/// A view of a string's contents as a collection of UTF-8 code units.
9627
///
@@ -170,6 +101,7 @@ extension String {
170101
: Collection,
171102
CustomStringConvertible,
172103
CustomDebugStringConvertible {
104+
@_versioned
173105
internal let _core: _StringCore
174106

175107
init(_ _core: _StringCore) {
@@ -195,56 +127,97 @@ extension String {
195127
return Index(encodedOffset: _core.endIndex)
196128
}
197129

130+
@_versioned
198131
internal func _index(atEncodedOffset n: Int) -> Index {
199132
if _fastPath(_core.isASCII) { return Index(encodedOffset: n) }
133+
if n == _core.endIndex { return endIndex }
134+
200135
var p = UTF16.ForwardParser()
201136
var i = _core[n...].makeIterator()
202-
let s = p.parseScalar(from: &i)
203-
204-
if case .valid(let u16) = s {
205-
_onFastPath()
206-
let u8 = UTF8.transcode(u16, from: UTF16.self)
207-
_sanityCheck(u16.count >= 0 && u16.count <= 2)
208-
let stride = UInt8(extendingOrTruncating: u16.count)
209-
return Index(
210-
encodedOffset: n,
211-
.utf8(
212-
encodedScalar: u8._unsafelyUnwrappedUnchecked, stride: stride))
213-
}
214-
215-
if case .error(let stride) = s {
216-
return Index(
217-
encodedOffset: n,
218-
.utf8(
219-
encodedScalar: UTF8.encodedReplacementCharacter,
220-
stride: UInt8(extendingOrTruncating: stride)))
137+
var buffer = Index._UTF8Buffer()
138+
Loop:
139+
while true {
140+
switch p.parseScalar(from: &i) {
141+
case .valid(let u16):
142+
let u8 = Unicode.UTF8.transcode(u16, from: Unicode.UTF16.self)
143+
._unsafelyUnwrappedUnchecked
144+
if buffer.count + u8.count > buffer.capacity { break Loop }
145+
buffer.append(contentsOf: u8)
146+
case .error:
147+
let u8 = Unicode.UTF8.encodedReplacementCharacter
148+
if buffer.count + u8.count > buffer.capacity { break Loop }
149+
buffer.append(contentsOf: u8)
150+
case .emptyInput:
151+
break Loop
152+
}
221153
}
222-
_onFastPath()
223-
return Index(encodedOffset: n)
154+
return Index(encodedOffset: n, .utf8(buffer: buffer))
224155
}
225156

226157
/// Returns the next consecutive position after `i`.
227158
///
228159
/// - Precondition: The next position is representable.
160+
@inline(__always)
229161
public func index(after i: Index) -> Index {
230-
_precondition(i != endIndex, "Can't advance past endIndex")
231162
if _fastPath(_core.isASCII) {
163+
precondition(i.encodedOffset < _core.count)
232164
return Index(encodedOffset: i.encodedOffset + 1)
233165
}
166+
234167
var j = i
235168
while true {
236-
if case .utf8(let encodedScalar, let stride) = j._cache {
169+
if case .utf8(let buffer) = j._cache {
237170
_onFastPath()
238-
j._transcodedOffset += 1
239-
if _fastPath(j._transcodedOffset < encodedScalar.count) {
240-
return j
171+
var scalarLength16 = 1
172+
let b0 = buffer.first._unsafelyUnwrappedUnchecked
173+
var nextBuffer = buffer
174+
175+
let leading1s = (~b0).leadingZeroBitCount
176+
if leading1s == 0 {
177+
nextBuffer.removeFirst()
178+
}
179+
else {
180+
let n8 = j._transcodedOffset + 1
181+
// If we haven't reached a scalar boundary...
182+
if _fastPath(n8 < leading1s) {
183+
return Index(
184+
encodedOffset: j.encodedOffset,
185+
transcodedOffset: n8, .utf8(buffer: nextBuffer))
186+
}
187+
scalarLength16 = n8 >> 2 + 1
188+
nextBuffer.removeFirst(n8)
241189
}
242-
return _index(atEncodedOffset: j.encodedOffset &+ numericCast(stride))
190+
if _fastPath(!nextBuffer.isEmpty) {
191+
return Index(
192+
encodedOffset: j.encodedOffset + scalarLength16,
193+
.utf8(buffer: nextBuffer))
194+
}
195+
return _index(atEncodedOffset: j.encodedOffset + scalarLength16)
243196
}
244197
j = _index(atEncodedOffset: j.encodedOffset)
198+
precondition(j != endIndex, "index out of bounds")
245199
}
246200
}
247201

202+
public func distance(from i: Index, to j: Index) -> IndexDistance {
203+
if _fastPath(_core.isASCII) {
204+
return j.encodedOffset - i.encodedOffset
205+
}
206+
return j >= i
207+
? _forwardDistance(from: i, to: j) : -_forwardDistance(from: j, to: i)
208+
}
209+
210+
@_versioned
211+
@inline(__always)
212+
internal func _forwardDistance(from i: Index, to j: Index) -> IndexDistance {
213+
var r: IndexDistance = j._transcodedOffset - i._transcodedOffset
214+
UTF8._transcode(
215+
_core[i.encodedOffset..<j.encodedOffset], from: UTF16.self) {
216+
r += $0.count
217+
}
218+
return r
219+
}
220+
248221
/// Accesses the code unit at the given position.
249222
///
250223
/// The following example uses the subscript to print the value of a
@@ -258,27 +231,27 @@ extension String {
258231
/// - Parameter position: A valid index of the view. `position`
259232
/// must be less than the view's end index.
260233
public subscript(position: Index) -> UTF8.CodeUnit {
261-
_precondition(position != endIndex, "cannot subscript using endIndex")
262-
if _fastPath(_core.isASCII) {
263-
return UTF8.CodeUnit(_core[position.encodedOffset])
264-
}
265-
var j = position
266-
while true {
267-
if case let .utf8(encodedScalar, _) = j._cache {
268-
_onFastPath()
269-
_sanityCheck((0..<4).contains(j._transcodedOffset))
270-
271-
let i = encodedScalar.index(
272-
encodedScalar.startIndex, offsetBy: j._transcodedOffset)
273-
274-
return encodedScalar[i]
234+
@inline(__always)
235+
get {
236+
if _fastPath(_core.asciiBuffer != nil), let ascii = _core.asciiBuffer {
237+
_precondition(position < endIndex, "index out of bounds")
238+
return ascii[position.encodedOffset]
239+
}
240+
var j = position
241+
while true {
242+
if case .utf8(let buffer) = j._cache {
243+
_onFastPath()
244+
return buffer[
245+
buffer.index(buffer.startIndex, offsetBy: j._transcodedOffset)]
246+
}
247+
j = _index(atEncodedOffset: j.encodedOffset)
248+
precondition(j < endIndex, "index out of bounds")
275249
}
276-
j = _index(atEncodedOffset: j.encodedOffset)
277250
}
278251
}
279252

280253
public var description: String {
281-
return String._fromCodeUnitSequenceWithRepair(UTF8.self, input: self).0
254+
return String(_core)
282255
}
283256

284257
public var debugDescription: String {
@@ -445,7 +418,6 @@ extension String.UTF8View.Iterator : IteratorProtocol {
445418
refillingFrom source: Source
446419
) -> Unicode.UTF8.CodeUnit?
447420
where Source.Element == Unicode.UTF16.CodeUnit,
448-
Source._Element == Unicode.UTF16.CodeUnit,
449421
Source.Index == Int
450422
{
451423
_sanityCheck(_buffer == 0)

stdlib/public/core/UIntBuffer.swift

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -172,6 +172,13 @@ extension _UIntBuffer : RangeReplaceableCollection {
172172
_storage |= Storage(newElement) &<< _bitCount
173173
_bitCount = _bitCount &+ _elementWidth
174174
}
175+
176+
@inline(__always)
177+
public mutating func removeFirst() {
178+
_debugPrecondition(!isEmpty)
179+
_bitCount = _bitCount &- _elementWidth
180+
_storage = _storage._fullShiftRight(_elementWidth)
181+
}
175182

176183
@inline(__always)
177184
public mutating func replaceSubrange<C: Collection>(

0 commit comments

Comments
 (0)