Skip to content

[String] Scalar-alignment bug fixes. #23834

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Jun 27, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions stdlib/public/core/LegacyABI.swift
Original file line number Diff line number Diff line change
Expand Up @@ -71,3 +71,11 @@ internal func _branchHint(_ actual: Bool, expected: Bool) -> Bool {
// value without any branch hint.
return actual
}

extension String {
@usableFromInline // Never actually used in inlinable code...
internal func _nativeCopyUTF16CodeUnits(
into buffer: UnsafeMutableBufferPointer<UInt16>,
range: Range<String.Index>
) { fatalError() }
}
14 changes: 7 additions & 7 deletions stdlib/public/core/StringBridge.swift
Original file line number Diff line number Diff line change
Expand Up @@ -173,17 +173,17 @@ internal enum _KnownCocoaString {
#if !(arch(i386) || arch(arm))
case tagged
#endif

@inline(__always)
init(_ str: _CocoaString) {

#if !(arch(i386) || arch(arm))
if _isObjCTaggedPointer(str) {
self = .tagged
return
}
#endif

switch unsafeBitCast(_swift_classOfObjCHeapObject(str), to: UInt.self) {
case unsafeBitCast(__StringStorage.self, to: UInt.self):
self = .storage
Expand Down Expand Up @@ -272,21 +272,21 @@ internal func _bridgeCocoaString(_ cocoaString: _CocoaString) -> _StringGuts {
// 3) If it's mutable with associated information, must make the call
let immutableCopy
= _stdlib_binary_CFStringCreateCopy(cocoaString) as AnyObject

#if !(arch(i386) || arch(arm))
if _isObjCTaggedPointer(immutableCopy) {
return _StringGuts(_SmallString(taggedCocoa: immutableCopy))
}
#endif

let (fastUTF8, isASCII): (Bool, Bool)
switch _getCocoaStringPointer(immutableCopy) {
case .ascii(_): (fastUTF8, isASCII) = (true, true)
case .utf8(_): (fastUTF8, isASCII) = (true, false)
default: (fastUTF8, isASCII) = (false, false)
}
let length = _stdlib_binary_CFStringGetLength(immutableCopy)

return _StringGuts(
cocoa: immutableCopy,
providesFastUTF8: fastUTF8,
Expand Down Expand Up @@ -420,6 +420,6 @@ extension String {
) {
_internalInvariant(buffer.count >= range.count)
let indexRange = self._toUTF16Indices(range)
self._nativeCopyUTF16CodeUnits(into: buffer, range: indexRange)
self.utf16._nativeCopy(into: buffer, alignedRange: indexRange)
}
}
15 changes: 11 additions & 4 deletions stdlib/public/core/StringCharacterView.swift
Original file line number Diff line number Diff line change
Expand Up @@ -60,13 +60,14 @@ extension String: BidirectionalCollection {
_precondition(i < endIndex, "String index is out of bounds")

// TODO: known-ASCII fast path, single-scalar-grapheme fast path, etc.
let i = _guts.scalarAlign(i)
let stride = _characterStride(startingAt: i)
let nextOffset = i._encodedOffset &+ stride
let nextStride = _characterStride(
startingAt: Index(_encodedOffset: nextOffset))
startingAt: Index(_encodedOffset: nextOffset)._aligned)

return Index(
encodedOffset: nextOffset, characterStride: nextStride)
encodedOffset: nextOffset, characterStride: nextStride)._aligned
}

/// Returns the position immediately before the given index.
Expand All @@ -78,9 +79,10 @@ extension String: BidirectionalCollection {
_precondition(i > startIndex, "String index is out of bounds")

// TODO: known-ASCII fast path, single-scalar-grapheme fast path, etc.
let i = _guts.scalarAlign(i)
let stride = _characterStride(endingAt: i)
let priorOffset = i._encodedOffset &- stride
return Index(encodedOffset: priorOffset, characterStride: stride)
return Index(encodedOffset: priorOffset, characterStride: stride)._aligned
}
/// Returns an index that is the specified distance from the given index.
///
Expand Down Expand Up @@ -167,7 +169,7 @@ extension String: BidirectionalCollection {
@inlinable @inline(__always)
public func distance(from start: Index, to end: Index) -> IndexDistance {
// TODO: known-ASCII and single-scalar-grapheme fast path, etc.
return _distance(from: start, to: end)
return _distance(from: _guts.scalarAlign(start), to: _guts.scalarAlign(end))
}

/// Accesses the character at the given position.
Expand All @@ -191,12 +193,15 @@ extension String: BidirectionalCollection {

let i = _guts.scalarAlign(i)
let distance = _characterStride(startingAt: i)

return _guts.errorCorrectedCharacter(
startingAt: i._encodedOffset, endingAt: i._encodedOffset &+ distance)
}

@inlinable @inline(__always)
internal func _characterStride(startingAt i: Index) -> Int {
_internalInvariant(i._isAligned)

// Fast check if it's already been measured, otherwise check resiliently
if let d = i.characterStride { return d }

Expand All @@ -207,6 +212,8 @@ extension String: BidirectionalCollection {

@inlinable @inline(__always)
internal func _characterStride(endingAt i: Index) -> Int {
_internalInvariant(i._isAligned)

if i == startIndex { return 0 }

return _guts._opaqueCharacterStride(endingAt: i._encodedOffset)
Expand Down
59 changes: 34 additions & 25 deletions stdlib/public/core/StringGraphemeBreaking.swift
Original file line number Diff line number Diff line change
Expand Up @@ -86,44 +86,50 @@ private func _hasGraphemeBreakBetween(
private func _measureCharacterStrideICU(
of utf8: UnsafeBufferPointer<UInt8>, startingAt i: Int
) -> Int {
let iterator = _ThreadLocalStorage.getUBreakIterator(utf8)
let offset = __swift_stdlib_ubrk_following(
iterator, Int32(truncatingIfNeeded: i))
// ICU will gives us a different result if we feed in the whole buffer, so
// slice it appropriately.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

😱 I remember hearing about this, but this still gives me goosebumps. Does it merely jump back to the nearest scalar boundary, or does it also look at previous scalars? Should we have a dedicated test for the difference in behavior?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We care about the behavior of String, for which we have tests. Why would we care if a detail of an ICU API that we no longer use changes?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The value of that test would be to call out the specific difference in behaviour between 5.0 and 5.1. This has an observable effect on how indexing works, doesn't it?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We already have that test in this PR, and it would fail on 5.0. We don't usually add test cases trying to mimic prior bugs by subverting the stdlib's interfaces.

let utf8Slice = UnsafeBufferPointer(rebasing: utf8[i...])
let iterator = _ThreadLocalStorage.getUBreakIterator(utf8Slice)
let offset = __swift_stdlib_ubrk_following(iterator, 0)

// ubrk_following returns -1 (UBRK_DONE) when it hits the end of the buffer.
if _fastPath(offset != -1) {
// The offset into our buffer is the distance.
_internalInvariant(offset > i, "zero-sized grapheme?")
return Int(truncatingIfNeeded: offset) &- i
}
_internalInvariant(utf8.count > i)
return utf8.count &- i
guard _fastPath(offset != -1) else { return utf8Slice.count }

// The offset into our buffer is the distance.
_internalInvariant(offset > 0, "zero-sized grapheme?")
return Int(truncatingIfNeeded: offset)
}

@inline(never) // slow-path
@_effects(releasenone)
private func _measureCharacterStrideICU(
of utf16: UnsafeBufferPointer<UInt16>, startingAt i: Int
) -> Int {
let iterator = _ThreadLocalStorage.getUBreakIterator(utf16)
let offset = __swift_stdlib_ubrk_following(
iterator, Int32(truncatingIfNeeded: i))
// ICU will gives us a different result if we feed in the whole buffer, so
// slice it appropriately.
let utf16Slice = UnsafeBufferPointer(rebasing: utf16[i...])
let iterator = _ThreadLocalStorage.getUBreakIterator(utf16Slice)
let offset = __swift_stdlib_ubrk_following(iterator, 0)

// ubrk_following returns -1 (UBRK_DONE) when it hits the end of the buffer.
if _fastPath(offset != -1) {
// The offset into our buffer is the distance.
_internalInvariant(offset > i, "zero-sized grapheme?")
return Int(truncatingIfNeeded: offset) &- i
}
return utf16.count &- i
guard _fastPath(offset != -1) else { return utf16Slice.count }

// The offset into our buffer is the distance.
_internalInvariant(offset > 0, "zero-sized grapheme?")
return Int(truncatingIfNeeded: offset)
}

@inline(never) // slow-path
@_effects(releasenone)
private func _measureCharacterStrideICU(
of utf8: UnsafeBufferPointer<UInt8>, endingAt i: Int
) -> Int {
let iterator = _ThreadLocalStorage.getUBreakIterator(utf8)
let offset = __swift_stdlib_ubrk_preceding(
iterator, Int32(truncatingIfNeeded: i))
// Slice backwards as well, even though ICU currently seems to give the same
// answer as unsliced.
let utf8Slice = UnsafeBufferPointer(rebasing: utf8[..<i])
let iterator = _ThreadLocalStorage.getUBreakIterator(utf8Slice)
let offset = __swift_stdlib_ubrk_preceding(iterator, Int32(utf8Slice.count))

// ubrk_following returns -1 (UBRK_DONE) when it hits the end of the buffer.
if _fastPath(offset != -1) {
// The offset into our buffer is the distance.
Expand All @@ -138,9 +144,12 @@ private func _measureCharacterStrideICU(
private func _measureCharacterStrideICU(
of utf16: UnsafeBufferPointer<UInt16>, endingAt i: Int
) -> Int {
let iterator = _ThreadLocalStorage.getUBreakIterator(utf16)
let offset = __swift_stdlib_ubrk_preceding(
iterator, Int32(truncatingIfNeeded: i))
// Slice backwards as well, even though ICU currently seems to give the same
// answer as unsliced.
let utf16Slice = UnsafeBufferPointer(rebasing: utf16[..<i])
let iterator = _ThreadLocalStorage.getUBreakIterator(utf16Slice)
let offset = __swift_stdlib_ubrk_preceding(iterator, Int32(utf16Slice.count))

// ubrk_following returns -1 (UBRK_DONE) when it hits the end of the buffer.
if _fastPath(offset != -1) {
// The offset into our buffer is the distance.
Expand Down
4 changes: 2 additions & 2 deletions stdlib/public/core/StringGuts.swift
Original file line number Diff line number Diff line change
Expand Up @@ -268,11 +268,11 @@ extension _StringGuts {

@inlinable @inline(__always)
internal var startIndex: String.Index {
return Index(_encodedOffset: 0)
return Index(_encodedOffset: 0)._aligned
}
@inlinable @inline(__always)
internal var endIndex: String.Index {
return Index(_encodedOffset: self.count)
return Index(_encodedOffset: self.count)._aligned
}
}

Expand Down
94 changes: 76 additions & 18 deletions stdlib/public/core/StringIndex.swift
Original file line number Diff line number Diff line change
Expand Up @@ -16,21 +16,25 @@ import SwiftShims

String's Index has the following layout:

┌──────────┬───────────────────┬────────────────┬──────────┐
│ b63:b16 │ b15:b14 │ b13:b8 │ b7:b0 │
├──────────┼───────────────────┼────────────────┼──────────┤
│ position │ transcoded offset │ grapheme cache │ reserved │
└──────────┴───────────────────┴────────────────┴──────────┘

- grapheme cache: A 6-bit value remembering the distance to the next grapheme
┌──────────┬───────────────────┬─────────╥────────────────┬──────────┐
│ b63:b16 │ b15:b14 │ b13 ║ b12:b8 │ b6:b0 │
├──────────┼───────────────────┼─────────╫────────────────┼──────────┤
│ position │ transcoded offset │ aligned ║ grapheme cache │ reserved │
└──────────┴───────────────────┴─────────╨────────────────┴──────────┘

Position, transcoded offset, and aligned are fully exposed in the ABI. Grapheme
cache and reserved are partially resilient: the fact that there are 13 bits with
a default value of `0` is ABI, but not the layout, construction, or
interpretation of those bits. All use of grapheme cache should be behind
non-inlinable function calls.

- position aka `encodedOffset`: A 48-bit offset into the string's code units
- transcoded offset: a 2-bit sub-scalar offset, derived from transcoding
- aligned, whether this index is known to be scalar-aligned (see below)
<resilience barrier>
- grapheme cache: A 5-bit value remembering the distance to the next grapheme
boundary
- position aka `encodedOffset`: An offset into the string's code units
- transcoded offset: a sub-scalar offset, derived from transcoding

The use and interpretation of both `reserved` and `grapheme cache` is not part
of Index's ABI; it should be hidden behind non-inlinable calls. However, the
position of the sequence of 14 bits allocated is part of Index's ABI, as well as
the default value being `0`.
- reserved: 8-bit for future use.

*/
extension String {
Expand Down Expand Up @@ -82,7 +86,7 @@ extension String.Index {

@usableFromInline
internal var characterStride: Int? {
let value = (_rawBits & 0x3F00) &>> 8
let value = (_rawBits & 0x1F00) &>> 8
return value > 0 ? Int(truncatingIfNeeded: value) : nil
}

Expand Down Expand Up @@ -132,9 +136,7 @@ extension String.Index {
encodedOffset: Int, transcodedOffset: Int, characterStride: Int
) {
self.init(encodedOffset: encodedOffset, transcodedOffset: transcodedOffset)
if _slowPath(characterStride > 63) { return }

_internalInvariant(characterStride == characterStride & 0x3F)
if _slowPath(characterStride > 0x1F) { return }
self._rawBits |= UInt64(truncatingIfNeeded: characterStride &<< 8)
self._invariantCheck()
}
Expand All @@ -150,6 +152,9 @@ extension String.Index {
@usableFromInline @inline(never) @_effects(releasenone)
internal func _invariantCheck() {
_internalInvariant(_encodedOffset >= 0)
if self._isAligned {
_internalInvariant(transcodedOffset == 0)
}
}
#endif // INTERNAL_CHECKS_ENABLED
}
Expand Down Expand Up @@ -188,6 +193,7 @@ extension String.Index {
transcodedOffset: self.transcodedOffset &- 1)
}


// Get an index with an encoded offset relative to this one.
// Note: strips any transcoded offset.
@inlinable @inline(__always)
Expand All @@ -200,7 +206,59 @@ extension String.Index {
_internalInvariant(self.transcodedOffset == 0)
return String.Index(encodedOffset: self._encodedOffset, transcodedOffset: n)
}
}

/*
Index Alignment

SE-0180 unifies the Index type of String and all its views and allows
non-scalar-aligned indices to be used across views. In order to guarantee
behavior, we often have to check and perform scalar alignment. To speed up
these checks, we allocate a bit denoting known-to-be-aligned, so that the
alignment check can skip the load. The below shows what views need to check
for alignment before they can operate, and whether the indices they produce
are aligned.

┌───────────────╥────────────────────┬──────────────────────────┐
│ View ║ Requires Alignment │ Produces Aligned Indices │
╞═══════════════╬════════════════════╪══════════════════════════╡
│ Native UTF8 ║ no │ no │
├───────────────╫────────────────────┼──────────────────────────┤
│ Native UTF16 ║ yes │ no │
╞═══════════════╬════════════════════╪══════════════════════════╡
│ Foreign UTF8 ║ yes │ no │
├───────────────╫────────────────────┼──────────────────────────┤
│ Foreign UTF16 ║ no │ no │
╞═══════════════╬════════════════════╪══════════════════════════╡
│ UnicodeScalar ║ yes │ yes │
├───────────────╫────────────────────┼──────────────────────────┤
│ Character ║ yes │ yes │
└───────────────╨────────────────────┴──────────────────────────┘

The "requires alignment" applies to any operation taking a String.Index that's
not defined entirely in terms of other operations taking a String.Index. These
include:

* index(after:)
* index(before:)
* subscript
* distance(from:to:) (since `to` is compared against directly)
* UTF16View._nativeGetOffset(for:)

*/
extension String.Index {
@_alwaysEmitIntoClient // Swift 5.1
@inline(__always)
internal var _isAligned: Bool { return 0 != _rawBits & 0x2000 }

@_alwaysEmitIntoClient // Swift 5.1
@inline(__always)
internal var _aligned: String.Index {
var idx = self
idx._rawBits |= 0x2000
idx._invariantCheck()
return idx
}
}

extension String.Index: Equatable {
Expand Down
8 changes: 4 additions & 4 deletions stdlib/public/core/StringStorage.swift
Original file line number Diff line number Diff line change
Expand Up @@ -141,9 +141,9 @@ extension _AbstractStringStorage {
// one of ours.

defer { _fixLifetime(other) }

let otherUTF16Length = _stdlib_binary_CFStringGetLength(other)

// CFString will only give us ASCII bytes here, but that's fine.
// We already handled non-ASCII UTF8 strings earlier since they're Swift.
if let otherStart = _cocoaUTF8Pointer(other) {
Expand All @@ -154,11 +154,11 @@ extension _AbstractStringStorage {
return (start == otherStart ||
(memcmp(start, otherStart, count) == 0)) ? 1 : 0
}

if UTF16Length != otherUTF16Length {
return 0
}

/*
The abstract implementation of -isEqualToString: falls back to -compare:
immediately, so when we run out of fast options to try, do the same.
Expand Down
Loading