Skip to content

Commit c5b7a3d

Browse files
lorentey authored and milseman committed
String.UTF8View: Migrate to guts
1 parent 781b3eb commit c5b7a3d

File tree

2 files changed

+81
-132
lines changed

2 files changed

+81
-132
lines changed

stdlib/public/core/StringUTF8.swift

Lines changed: 76 additions & 132 deletions
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,4 @@
1-
//===--- StringUTF8.swift - A UTF8 view of _LegacyStringCore --------------------===//
1+
//===--- StringUTF8.swift - A UTF8 view of String -------------------------===//
22
//
33
// This source file is part of the Swift.org open source project
44
//
@@ -9,12 +9,6 @@
99
// See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
1010
//
1111
//===----------------------------------------------------------------------===//
12-
//
13-
// _LegacyStringCore currently has three representations: Native ASCII,
14-
// Native UTF-16, and Opaque Cocoa. Expose each of these as UTF-8 in a
15-
// way that will hopefully be efficient to traverse
16-
//
17-
//===----------------------------------------------------------------------===//
1812

1913
extension String {
2014
/// A view of a string's contents as a collection of UTF-8 code units.
@@ -101,13 +95,7 @@ extension String {
10195
@_versioned
10296
internal var _guts: _StringGuts
10397

104-
@_versioned
105-
internal var _core: _LegacyStringCore {
106-
get { return _guts._legacyCore }
107-
set { self._guts = _StringGuts(newValue) }
108-
}
109-
110-
/// Distances to `(startIndex, endIndex)` from the endpoints of _core,
98+
/// Distances to `(startIndex, endIndex)` from the endpoints of _guts,
11199
/// measured in UTF-8 code units.
112100
///
113101
/// Note: this is *only* here to support legacy Swift3-style slicing where
@@ -177,10 +165,11 @@ extension String {
177165
@_versioned
178166
internal func _index(atEncodedOffset n: Int) -> Index {
179167
if _fastPath(_guts.isASCII) { return Index(encodedOffset: n) }
180-
if n == _guts.endIndex { return endIndex }
181-
168+
let count = _guts.count
169+
if n == count { return endIndex }
170+
182171
var p = UTF16.ForwardParser()
183-
var i = _core[n...].makeIterator()
172+
var i = _guts.makeIterator(in: n..<count)
184173
var buffer = Index._UTF8Buffer()
185174
Loop:
186175
while true {
@@ -270,20 +259,11 @@ extension String {
270259
}
271260

272261
// Handle the scalar boundary the same way as the not-a-utf8-index case.
273-
262+
_precondition(i.encodedOffset > 0, "Can't move before startIndex")
263+
274264
// Parse a single scalar
275-
var p = Unicode.UTF16.ReverseParser()
276-
var s = _core[..<i.encodedOffset].reversed().makeIterator()
277-
let u8: Unicode.UTF8.EncodedScalar
278-
switch p.parseScalar(from: &s) {
279-
case .valid(let u16):
280-
u8 = Unicode.UTF8.transcode(
281-
u16, from: Unicode.UTF16.self)._unsafelyUnwrappedUnchecked
282-
case .error:
283-
u8 = Unicode.UTF8.encodedReplacementCharacter
284-
case .emptyInput:
285-
_preconditionFailure("Index out of bounds")
286-
}
265+
let u = _guts.unicodeScalar(endingAt: i.encodedOffset)
266+
let u8 = Unicode.UTF8.encode(u)._unsafelyUnwrappedUnchecked
287267
return Index(
288268
encodedOffset: i.encodedOffset &- (u8.count < 4 ? 1 : 2),
289269
transcodedOffset: u8.count &- 1,
@@ -304,12 +284,9 @@ extension String {
304284
@_versioned
305285
@inline(__always)
306286
internal func _forwardDistance(from i: Index, to j: Index) -> Int {
307-
var r = j._transcodedOffset - i._transcodedOffset
308-
UTF8._transcode(
309-
_core[i.encodedOffset..<j.encodedOffset], from: UTF16.self) {
310-
r += $0.count
311-
}
312-
return r
287+
return j._transcodedOffset - i._transcodedOffset +
288+
_count(fromUTF16: IteratorSequence(_guts.makeIterator(
289+
in: i.encodedOffset..<j.encodedOffset)))
313290
}
314291

315292
/// Accesses the code unit at the given position.
@@ -399,10 +376,8 @@ extension String {
399376
internal func _withUnsafeBufferPointerToUTF8<R>(
400377
_ body: (UnsafeBufferPointer<UTF8.CodeUnit>) throws -> R
401378
) rethrows -> R {
402-
if let asciiBuffer = self._core.asciiBuffer {
403-
return try body(UnsafeBufferPointer(
404-
start: asciiBuffer.baseAddress,
405-
count: asciiBuffer.count))
379+
if _guts.isASCII {
380+
return try body(_guts._unmanagedASCIIView.buffer)
406381
}
407382
var nullTerminatedUTF8 = ContiguousArray<UTF8.CodeUnit>()
408383
nullTerminatedUTF8.reserveCapacity(utf8.count + 1)
@@ -479,132 +454,102 @@ extension String.UTF8View : _SwiftStringView {
479454
extension String.UTF8View {
480455
@_fixed_layout // FIXME(sil-serialize-all)
481456
public struct Iterator {
482-
internal typealias _OutputBuffer = UInt64
457+
internal typealias _OutputBuffer = _ValidUTF8Buffer<UInt64>
483458
@_versioned
484-
internal var _guts: _StringGuts
459+
internal let _guts: _StringGuts
460+
@_versioned
461+
internal let _endOffset: Int
485462
@_versioned // FIXME(sil-serialize-all)
486-
internal var _sourceIndex: Int
463+
internal var _nextOffset: Int
487464
@_versioned // FIXME(sil-serialize-all)
488465
internal var _buffer: _OutputBuffer
489-
490-
@_versioned
491-
internal var _endIndex: Int
492-
493-
@_versioned
494-
internal var _asciiPointer: UnsafePointer<UInt8>?
495466
}
496467

497468
public func makeIterator() -> Iterator {
498-
return Iterator(_guts)
469+
return Iterator(self)
499470
}
500471
}
501472

502473
extension String.UTF8View.Iterator : IteratorProtocol {
474+
public typealias Element = String.UTF8View.Element
475+
503476
@_inlineable // FIXME(sil-serialize-all)
504477
@_versioned // FIXME(sil-serialize-all)
505-
internal init(_ guts: _StringGuts) {
506-
self._guts = guts
507-
self._sourceIndex = 0
508-
self._buffer = 0
509-
if _fastPath(guts._isContiguous && guts.isASCII) {
510-
let ascii = guts._unmanagedASCIIView
511-
self._endIndex = ascii.count
512-
self._asciiPointer = ascii.start
513-
} else {
514-
self._endIndex = self._guts.count
515-
self._asciiPointer = nil
516-
}
478+
internal init(_ utf8: String.UTF8View) {
479+
self._guts = utf8._guts
480+
self._nextOffset = 0
481+
self._buffer = _OutputBuffer()
482+
self._endOffset = utf8._guts.count
517483
}
518-
484+
519485
@_inlineable // FIXME(sil-serialize-all)
520486
public mutating func next() -> Unicode.UTF8.CodeUnit? {
521-
if _fastPath(_buffer != 0) {
522-
let r = UInt8(truncatingIfNeeded: _buffer) &- 1
523-
_buffer >>= 8
524-
return r
487+
if _fastPath(!_buffer.isEmpty) {
488+
return _buffer.removeFirst()
525489
}
526-
if _slowPath(_sourceIndex == _endIndex) { return nil }
490+
if _nextOffset == _endOffset { return nil }
491+
return _fillBuffer()
492+
}
527493

494+
@_versioned
495+
@inline(never)
496+
internal mutating func _fillBuffer() -> Unicode.UTF8.CodeUnit {
497+
_sanityCheck(_buffer.isEmpty)
498+
_sanityCheck(_nextOffset < _endOffset)
528499
defer { _fixLifetime(_guts) }
529-
530-
if _fastPath(self._asciiPointer != nil) {
531-
let ascii = self._asciiPointer._unsafelyUnwrappedUnchecked
532-
let result = ascii[_sourceIndex]
533-
_sourceIndex += 1
534-
for i in 0 ..< _OutputBuffer.bitWidth>>3 {
535-
if _sourceIndex == _endIndex { break }
536-
_buffer |= _OutputBuffer(ascii[_sourceIndex] &+ 1) &<< (i << 3)
537-
_sourceIndex += 1
500+
if _guts.isASCII {
501+
// FIXME: Measure if it's worth inlining this path
502+
let ascii = _guts._unmanagedASCIIView.buffer
503+
let result = ascii[_nextOffset]
504+
_nextOffset += 1
505+
let fillCount = min(_buffer.capacity, _endOffset - _nextOffset)
506+
for _ in 0 ..< fillCount {
507+
_buffer.append(ascii[_nextOffset])
508+
_nextOffset += 1
538509
}
539510
return result
540511
}
541-
542-
if _fastPath(_guts._isContiguous) {
543-
return _next(refillingFrom: _guts._unmanagedUTF16View.buffer)
512+
if _guts._isContiguous {
513+
return _fillBuffer(from: _guts._unmanagedUTF16View)
544514
}
545-
546-
return _next(refillingFrom: _guts._legacyCore)
515+
return _fillBuffer(from: _guts._asOpaque())
547516
}
548517

549-
@_inlineable // FIXME(sil-serialize-all)
550-
@_versioned // FIXME(sil-serialize-all)
551-
internal mutating func _next<Source: Collection>(
552-
refillingFrom source: Source
553-
) -> Unicode.UTF8.CodeUnit?
554-
where Source.Element == Unicode.UTF16.CodeUnit,
555-
Source.Index == Int
556-
{
557-
_sanityCheck(_buffer == 0)
558-
var shift = 0
559-
560-
// ASCII fastpath
561-
while _sourceIndex != _endIndex && shift < _OutputBuffer.bitWidth {
562-
let u = _guts[_sourceIndex]
563-
if u >= 0x80 { break }
564-
_buffer |= _OutputBuffer(UInt8(truncatingIfNeeded: u &+ 1)) &<< shift
565-
_sourceIndex += 1
566-
shift = shift &+ 8
518+
// NOT @_versioned
519+
internal mutating func _fillBuffer<V: _StringVariant>(
520+
from variant: V
521+
) -> Unicode.UTF8.CodeUnit {
522+
// Eat as many ASCII characters as possible
523+
let asciiEnd = Swift.min(_nextOffset + _buffer.capacity, _endOffset)
524+
for cu in variant[_nextOffset..<asciiEnd] {
525+
if !UTF16._isASCII(cu) { break }
526+
_buffer.append(UInt8(truncatingIfNeeded: cu))
527+
_nextOffset += 1
567528
}
568-
569-
var i = IndexingIterator(_elements: source, _position: _sourceIndex)
570-
var parser = Unicode.UTF16.ForwardParser()
571-
Loop:
572-
while true {
573-
let u8: UTF8.EncodedScalar
574-
switch parser.parseScalar(from: &i) {
575-
case .valid(let s):
576-
u8 = UTF8.transcode(s, from: UTF16.self)._unsafelyUnwrappedUnchecked
577-
case .error(_):
578-
u8 = UTF8.encodedReplacementCharacter
579-
case .emptyInput:
580-
break Loop
581-
}
582-
var newBuffer = _buffer
583-
for x in u8 {
584-
newBuffer |= _OutputBuffer(x &+ 1) &<< shift
585-
shift = shift &+ 8
586-
}
587-
guard _fastPath(shift <= _OutputBuffer.bitWidth) else { break Loop }
588-
_buffer = newBuffer
589-
_sourceIndex = i._position &- parser._buffer.count
529+
if _nextOffset == asciiEnd {
530+
return _buffer.removeFirst()
590531
}
591-
guard _fastPath(_buffer != 0) else { return nil }
592-
let result = UInt8(truncatingIfNeeded: _buffer) &- 1
593-
_buffer >>= 8
594-
return result
532+
// Decode UTF-16, encode UTF-8
533+
for scalar in IteratorSequence(
534+
variant[_nextOffset..<_endOffset].makeUnicodeScalarIterator()) {
535+
let u8 = UTF8.encode(scalar)._unsafelyUnwrappedUnchecked
536+
let c8 = u8.count
537+
guard _buffer.count + c8 <= _buffer.capacity else { break }
538+
_buffer.append(contentsOf: u8)
539+
_nextOffset += 1 &+ (c8 &>> 2)
540+
}
541+
return _buffer.removeFirst()
595542
}
596543
}
597544

598545
extension String.UTF8View {
599546
@_inlineable // FIXME(sil-serialize-all)
600547
public var count: Int {
601548
if _fastPath(_guts.isASCII) { return _guts.count }
602-
let b = _core._unmanagedUTF16
603-
if _fastPath(b != nil) {
604-
defer { _fixLifetime(_core) }
605-
return _count(fromUTF16: b!)
549+
if _guts._isContiguous {
550+
return _count(fromUTF16: _guts._unmanagedUTF16View)
606551
}
607-
return _count(fromUTF16: self._core)
552+
return _count(fromUTF16: _guts._asOpaque())
608553
}
609554

610555
@_inlineable // FIXME(sil-serialize-all)
@@ -786,4 +731,3 @@ extension String.UTF8View {
786731
return self[bounds.relative(to: self)]
787732
}
788733
}
789-

stdlib/public/core/UTF16.swift

Lines changed: 5 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -31,6 +31,11 @@ extension Unicode.UTF16 : Unicode.Encoding {
3131
return EncodedScalar(_storage: 0xFFFD, _bitCount: 16)
3232
}
3333

34+
@_inlineable // FIXME(sil-serialize-all)
35+
public static func _isASCII(_ x: CodeUnit) -> Bool {
36+
return x <= 0x7f
37+
}
38+
3439
@_inlineable // FIXME(sil-serialize-all)
3540
public static func _isScalar(_ x: CodeUnit) -> Bool {
3641
return x & 0xf800 != 0xd800

0 commit comments

Comments
 (0)