Skip to content

Commit 4cd1e81

Browse files
committed
[String] Scalar-alignment bug fixes.
Fixes a general category (pun intended) of scalar-alignment bugs surrounding exchanging non-scalar-aligned indices between views and for slicing. SE-0180 unifies the Index type of String and all its views and allows non-scalar-aligned indices to be used across views. In order to guarantee behavior, we often have to check and perform scalar alignment. To speed up these checks, we allocate a bit denoting known-to-be-aligned, so that the alignment check can skip the load. The below shows what views need to check for alignment before they can operate, and whether the indices they produce are aligned. ┌───────────────╥────────────────────┬──────────────────────────┐ │ View ║ Requires Alignment │ Produces Aligned Indices │ ╞═══════════════╬════════════════════╪══════════════════════════╡ │ Native UTF8 ║ no │ no │ ├───────────────╫────────────────────┼──────────────────────────┤ │ Native UTF16 ║ yes │ no │ ╞═══════════════╬════════════════════╪══════════════════════════╡ │ Foreign UTF8 ║ yes │ no │ ├───────────────╫────────────────────┼──────────────────────────┤ │ Foreign UTF16 ║ no │ no │ ╞═══════════════╬════════════════════╪══════════════════════════╡ │ UnicodeScalar ║ yes │ yes │ ├───────────────╫────────────────────┼──────────────────────────┤ │ Character ║ yes │ yes │ └───────────────╨────────────────────┴──────────────────────────┘ The "requires alignment" applies to any operation taking a String.Index that's not defined entirely in terms of other operations taking a String.Index. These include: * index(after:) * index(before:) * subscript * distance(from:to:) (since `to` is compared against directly) * UTF16View._nativeGetOffset(for:)
1 parent 93d65fc commit 4cd1e81

15 files changed

+547
-166
lines changed

stdlib/public/core/LegacyABI.swift

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,3 +71,11 @@ internal func _branchHint(_ actual: Bool, expected: Bool) -> Bool {
7171
// value without any branch hint.
7272
return actual
7373
}
74+
75+
extension String {
76+
@usableFromInline // Never actually used in inlinable code...
77+
internal func _nativeCopyUTF16CodeUnits(
78+
into buffer: UnsafeMutableBufferPointer<UInt16>,
79+
range: Range<String.Index>
80+
) { fatalError() }
81+
}

stdlib/public/core/StringBridge.swift

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -420,6 +420,6 @@ extension String {
420420
) {
421421
_internalInvariant(buffer.count >= range.count)
422422
let indexRange = self._toUTF16Indices(range)
423-
self._nativeCopyUTF16CodeUnits(into: buffer, range: indexRange)
423+
self.utf16._nativeCopy(into: buffer, alignedRange: indexRange)
424424
}
425425
}

stdlib/public/core/StringCharacterView.swift

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -60,13 +60,14 @@ extension String: BidirectionalCollection {
6060
_precondition(i < endIndex, "String index is out of bounds")
6161

6262
// TODO: known-ASCII fast path, single-scalar-grapheme fast path, etc.
63+
let i = _guts.scalarAlign(i)
6364
let stride = _characterStride(startingAt: i)
6465
let nextOffset = i._encodedOffset &+ stride
6566
let nextStride = _characterStride(
66-
startingAt: Index(_encodedOffset: nextOffset))
67+
startingAt: Index(_encodedOffset: nextOffset).aligned)
6768

6869
return Index(
69-
encodedOffset: nextOffset, characterStride: nextStride)
70+
encodedOffset: nextOffset, characterStride: nextStride).aligned
7071
}
7172

7273
/// Returns the position immediately before the given index.
@@ -78,9 +79,10 @@ extension String: BidirectionalCollection {
7879
_precondition(i > startIndex, "String index is out of bounds")
7980

8081
// TODO: known-ASCII fast path, single-scalar-grapheme fast path, etc.
82+
let i = _guts.scalarAlign(i)
8183
let stride = _characterStride(endingAt: i)
8284
let priorOffset = i._encodedOffset &- stride
83-
return Index(encodedOffset: priorOffset, characterStride: stride)
85+
return Index(encodedOffset: priorOffset, characterStride: stride).aligned
8486
}
8587
/// Returns an index that is the specified distance from the given index.
8688
///
@@ -167,7 +169,7 @@ extension String: BidirectionalCollection {
167169
@inlinable @inline(__always)
168170
public func distance(from start: Index, to end: Index) -> IndexDistance {
169171
// TODO: known-ASCII and single-scalar-grapheme fast path, etc.
170-
return _distance(from: start, to: end)
172+
return _distance(from: _guts.scalarAlign(start), to: _guts.scalarAlign(end))
171173
}
172174

173175
/// Accesses the character at the given position.
@@ -191,12 +193,15 @@ extension String: BidirectionalCollection {
191193

192194
let i = _guts.scalarAlign(i)
193195
let distance = _characterStride(startingAt: i)
196+
194197
return _guts.errorCorrectedCharacter(
195198
startingAt: i._encodedOffset, endingAt: i._encodedOffset &+ distance)
196199
}
197200

198201
@inlinable @inline(__always)
199202
internal func _characterStride(startingAt i: Index) -> Int {
203+
_internalInvariant(i.isAligned)
204+
200205
// Fast check if it's already been measured, otherwise check resiliently
201206
if let d = i.characterStride { return d }
202207

@@ -207,6 +212,8 @@ extension String: BidirectionalCollection {
207212

208213
@inlinable @inline(__always)
209214
internal func _characterStride(endingAt i: Index) -> Int {
215+
_internalInvariant(i.isAligned)
216+
210217
if i == startIndex { return 0 }
211218

212219
return _guts._opaqueCharacterStride(endingAt: i._encodedOffset)

stdlib/public/core/StringGuts.swift

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -268,11 +268,11 @@ extension _StringGuts {
268268

269269
@inlinable @inline(__always)
270270
internal var startIndex: String.Index {
271-
return Index(_encodedOffset: 0)
271+
return Index(_encodedOffset: 0).aligned
272272
}
273273
@inlinable @inline(__always)
274274
internal var endIndex: String.Index {
275-
return Index(_encodedOffset: self.count)
275+
return Index(_encodedOffset: self.count).aligned
276276
}
277277
}
278278

stdlib/public/core/StringIndex.swift

Lines changed: 76 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -16,21 +16,25 @@ import SwiftShims
1616

1717
String's Index has the following layout:
1818

19-
┌──────────┬───────────────────┬────────────────┬──────────┐
20-
│ b63:b16 │ b15:b14 │ b13:b8 │ b7:b0 │
21-
├──────────┼───────────────────┼────────────────┼──────────┤
22-
│ position │ transcoded offset │ grapheme cache │ reserved │
23-
└──────────┴───────────────────┴────────────────┴──────────┘
24-
25-
- grapheme cache: A 6-bit value remembering the distance to the next grapheme
19+
┌──────────┬───────────────────┬─────────╥────────────────┬──────────┐
20+
│ b63:b16 │ b15:b14 │ b13 ║ b12:b8 │ b6:b0 │
21+
├──────────┼───────────────────┼─────────╫────────────────┼──────────┤
22+
│ position │ transcoded offset │ aligned ║ grapheme cache │ reserved │
23+
└──────────┴───────────────────┴─────────╨────────────────┴──────────┘
24+
25+
Position, transcoded offset, and aligned are fully exposed in the ABI. Grapheme
26+
cache and reserved are partially resilient: the fact that there are 13 bits with
27+
a default value of `0` is ABI, but not the layout, construction, or
28+
interpretation of those bits. All use of grapheme cache should be behind
29+
non-inlinable function calls.
30+
31+
- position aka `encodedOffset`: A 48-bit offset into the string's code units
32+
- transcoded offset: a 2-bit sub-scalar offset, derived from transcoding
33+
- aligned, whether this index is known to be scalar-aligned (see below)
34+
<resilience barrier>
35+
- grapheme cache: A 5-bit value remembering the distance to the next grapheme
2636
boundary
27-
- position aka `encodedOffset`: An offset into the string's code units
28-
- transcoded offset: a sub-scalar offset, derived from transcoding
29-
30-
The use and interpretation of both `reserved` and `grapheme cache` is not part
31-
of Index's ABI; it should be hidden behind non-inlinable calls. However, the
32-
position of the sequence of 14 bits allocated is part of Index's ABI, as well as
33-
the default value being `0`.
37+
- reserved: 8-bit for future use.
3438

3539
*/
3640
extension String {
@@ -82,7 +86,7 @@ extension String.Index {
8286

8387
@usableFromInline
8488
internal var characterStride: Int? {
85-
let value = (_rawBits & 0x3F00) &>> 8
89+
let value = (_rawBits & 0x1F00) &>> 8
8690
return value > 0 ? Int(truncatingIfNeeded: value) : nil
8791
}
8892

@@ -132,9 +136,7 @@ extension String.Index {
132136
encodedOffset: Int, transcodedOffset: Int, characterStride: Int
133137
) {
134138
self.init(encodedOffset: encodedOffset, transcodedOffset: transcodedOffset)
135-
if _slowPath(characterStride > 63) { return }
136-
137-
_internalInvariant(characterStride == characterStride & 0x3F)
139+
if _slowPath(characterStride > 0x1F) { return }
138140
self._rawBits |= UInt64(truncatingIfNeeded: characterStride &<< 8)
139141
self._invariantCheck()
140142
}
@@ -150,6 +152,9 @@ extension String.Index {
150152
@usableFromInline @inline(never) @_effects(releasenone)
151153
internal func _invariantCheck() {
152154
_internalInvariant(_encodedOffset >= 0)
155+
if self.isAligned {
156+
_internalInvariant(transcodedOffset == 0)
157+
}
153158
}
154159
#endif // INTERNAL_CHECKS_ENABLED
155160
}
@@ -188,6 +193,7 @@ extension String.Index {
188193
transcodedOffset: self.transcodedOffset &- 1)
189194
}
190195

196+
191197
// Get an index with an encoded offset relative to this one.
192198
// Note: strips any transcoded offset.
193199
@inlinable @inline(__always)
@@ -200,7 +206,59 @@ extension String.Index {
200206
_internalInvariant(self.transcodedOffset == 0)
201207
return String.Index(encodedOffset: self._encodedOffset, transcodedOffset: n)
202208
}
209+
}
203210

211+
/*
212+
Index Alignment
213+
214+
SE-0180 unifies the Index type of String and all its views and allows
215+
non-scalar-aligned indices to be used across views. In order to guarantee
216+
behavior, we often have to check and perform scalar alignment. To speed up
217+
these checks, we allocate a bit denoting known-to-be-aligned, so that the
218+
alignment check can skip the load. The below shows what views need to check
219+
for alignment before they can operate, and whether the indices they produce
220+
are aligned.
221+
222+
┌───────────────╥────────────────────┬──────────────────────────┐
223+
│ View ║ Requires Alignment │ Produces Aligned Indices │
224+
╞═══════════════╬════════════════════╪══════════════════════════╡
225+
│ Native UTF8 ║ no │ no │
226+
├───────────────╫────────────────────┼──────────────────────────┤
227+
│ Native UTF16 ║ yes │ no │
228+
╞═══════════════╬════════════════════╪══════════════════════════╡
229+
│ Foreign UTF8 ║ yes │ no │
230+
├───────────────╫────────────────────┼──────────────────────────┤
231+
│ Foreign UTF16 ║ no │ no │
232+
╞═══════════════╬════════════════════╪══════════════════════════╡
233+
│ UnicodeScalar ║ yes │ yes │
234+
├───────────────╫────────────────────┼──────────────────────────┤
235+
│ Character ║ yes │ yes │
236+
└───────────────╨────────────────────┴──────────────────────────┘
237+
238+
The "requires alignment" applies to any operation taking a String.Index that's
239+
not defined entirely in terms of other operations taking a String.Index. These
240+
include:
241+
242+
* index(after:)
243+
* index(before:)
244+
* subscript
245+
* distance(from:to:) (since `to` is compared against directly)
246+
* UTF16View._nativeGetOffset(for:)
247+
248+
*/
249+
extension String.Index {
250+
@_alwaysEmitIntoClient // Swift 5.1
251+
@inline(__always)
252+
internal var isAligned: Bool { return 0 != _rawBits & 0x2000 }
253+
254+
@_alwaysEmitIntoClient // Swift 5.1
255+
@inline(__always)
256+
internal var aligned: String.Index {
257+
var idx = self
258+
idx._rawBits |= 0x2000
259+
idx._invariantCheck()
260+
return idx
261+
}
204262
}
205263

206264
extension String.Index: Equatable {

0 commit comments

Comments
 (0)