Skip to content

[stdlib] Make String.Index(_:within:) initializers more permissive #42442

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 6 commits into from
Apr 20, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
89 changes: 61 additions & 28 deletions stdlib/public/core/StringGuts.swift
Original file line number Diff line number Diff line change
Expand Up @@ -301,19 +301,13 @@ extension _StringGuts {
// Encoding
extension _StringGuts {
/// Returns whether this string has a UTF-8 storage representation.
/// If this returns false, then the string is encoded in UTF-16.
///
/// This always returns a value corresponding to the string's actual encoding.
@_alwaysEmitIntoClient
@inline(__always)
internal var isUTF8: Bool { _object.isUTF8 }

/// Returns whether this string has a UTF-16 storage representation.
///
/// This always returns a value corresponding to the string's actual encoding.
@_alwaysEmitIntoClient
@inline(__always)
internal var isUTF16: Bool { _object.isUTF16 }

@_alwaysEmitIntoClient // Swift 5.7
@inline(__always)
internal func markEncoding(_ i: String.Index) -> String.Index {
Expand All @@ -333,41 +327,75 @@ extension _StringGuts {
i._hasMatchingEncoding(isUTF8: isUTF8)
}

/// Return an index whose encoding can be assumed to match that of `self`.
/// Return an index whose encoding can be assumed to match that of `self`,
/// trapping if `i` has an incompatible encoding.
///
/// If `i` is UTF-8 encoded, but `self` is a UTF-16 string, then trap.
///
/// If `i` is UTF-16 encoded, but `self` is a UTF-8 string, then transcode
/// `i`'s offset to UTF-8 and return the resulting index. This allows the use
/// of indices from a bridged Cocoa string after the string has been converted
/// to a native Swift string. (Such indices are technically still considered
/// invalid, but we allow this specific case to keep compatibility with
/// existing code that assumes otherwise.)
///
/// Detecting an encoding mismatch isn't always possible -- older binaries did
/// not set the flags that this method relies on. However, false positives
/// cannot happen: if this method detects a mismatch, then it is guaranteed to
/// be a real one.
@_alwaysEmitIntoClient
@inline(__always)
internal func ensureMatchingEncoding(_ i: String.Index) -> String.Index {
internal func ensureMatchingEncoding(_ i: Index) -> Index {
if _fastPath(hasMatchingEncoding(i)) { return i }
if let i = _slowEnsureMatchingEncoding(i) { return i }
// Note that this trap is not guaranteed to trigger when the process
// includes client binaries compiled with a previous Swift release.
// (`i._canBeUTF16` can sometimes return true in that case even if the index
// actually came from a UTF-8 string.) However, the trap will still often
// trigger in this case, as long as the index was initialized by code that
// was compiled with 5.7+.
//
// This trap will rarely if ever trigger on OSes that have stdlibs <= 5.6,
// because those versions never set the `isKnownUTF16` flag in
// `_StringObject`. (The flag may still be set within inlinable code,
// though.)
_preconditionFailure("Invalid string index")
}

/// Return an index that corresponds to the same position as `i`, but whose
/// encoding can be assumed to match that of `self`, returning `nil` if `i`
/// has incompatible encoding.
///
/// If `i` is UTF-8 encoded, but `self` is a UTF-16 string, then return nil.
///
/// If `i` is UTF-16 encoded, but `self` is a UTF-8 string, then transcode
/// `i`'s offset to UTF-8 and return the resulting index. This allows the use
/// of indices from a bridged Cocoa string after the string has been converted
/// to a native Swift string. (Such indices are technically still considered
/// invalid, but we allow this specific case to keep compatibility with
/// existing code that assumes otherwise.)
///
/// Detecting an encoding mismatch isn't always possible -- older binaries did
/// not set the flags that this method relies on. However, false positives
/// cannot happen: if this method detects a mismatch, then it is guaranteed to
/// be a real one.
internal func ensureMatchingEncodingNoTrap(_ i: Index) -> Index? {
  // Common case: the index already agrees with this string's encoding, so it
  // can be handed back untouched.
  guard hasMatchingEncoding(i) else {
    // Mismatch: defer to the never-inlined slow path, which either produces a
    // converted index or reports failure with `nil` instead of trapping.
    return _slowEnsureMatchingEncoding(i)
  }
  return i
}

@_alwaysEmitIntoClient
@inline(never)
@_effects(releasenone)
internal func _slowEnsureMatchingEncoding(_ i: String.Index) -> String.Index {
internal func _slowEnsureMatchingEncoding(_ i: Index) -> Index? {
guard isUTF8 else {
// Attempt to use a UTF-8 index on a UTF-16 string. Strings don't usually
// get converted to UTF-16 storage, so it seems okay to trap in this case
// -- the index most likely comes from an unrelated string. (Trapping here
// may still turn out to affect binary compatibility with broken code in
// get converted to UTF-16 storage, so it seems okay to reject this case
// -- the index most likely comes from an unrelated string. (This may
// still turn out to affect binary compatibility with broken code in
// existing binaries running with new stdlibs. If so, we can replace this
// with the same transcoding hack as in the UTF-16->8 case below.)
//
// Note that this trap is not guaranteed to trigger when the process
// includes client binaries compiled with a previous Swift release.
// (`i._canBeUTF16` can sometimes return true in that case even if the
// index actually came from an UTF-8 string.) However, the trap will still
// often trigger in this case, as long as the index was initialized by
// code that was compiled with 5.7+.
//
// This trap can never trigger on OSes that have stdlibs <= 5.6, because
// those versions never set the `isKnownUTF16` flag in `_StringObject`.
_preconditionFailure("Invalid string index")
return nil
}
// Attempt to use a UTF-16 index on a UTF-8 string.
//
Expand All @@ -383,10 +411,15 @@ extension _StringGuts {
// FIXME: Consider emitting a runtime warning here.
// FIXME: Consider performing a linked-on-or-after check & trapping if the
// client executable was built on some particular future Swift release.
let utf16 = String(self).utf16
let base = utf16.index(utf16.startIndex, offsetBy: i._encodedOffset)
if i.transcodedOffset == 0 { return base }
return base.encoded(offsetBy: i.transcodedOffset)._knownUTF8
let utf16 = String.UTF16View(self)
var r = utf16.index(utf16.startIndex, offsetBy: i._encodedOffset)
if i.transcodedOffset != 0 {
r = r.encoded(offsetBy: i.transcodedOffset)
} else {
// Preserve alignment bits if possible.
r = r._copyingAlignment(from: i)
}
return r._knownUTF8
}
}

Expand Down
7 changes: 3 additions & 4 deletions stdlib/public/core/StringGutsRangeReplaceable.swift
Original file line number Diff line number Diff line change
Expand Up @@ -466,10 +466,9 @@ extension _StringGuts {
_internalInvariant(
subrange.lowerBound >= startIndex && subrange.upperBound <= endIndex)

if _slowPath(isUTF16) {
// UTF-16 (i.e., foreign) string. The mutation will convert this to the
// native UTF-8 encoding, so we need to do some extra work to preserve our
// bounds.
guard _fastPath(isUTF8) else {
// UTF-16 string. The mutation will convert this to the native UTF-8
// encoding, so we need to do some extra work to preserve our bounds.
let utf8StartOffset = String(self).utf8.distance(
from: self.startIndex, to: startIndex)
let oldUTF8Count = String(self).utf8.distance(
Expand Down
22 changes: 9 additions & 13 deletions stdlib/public/core/StringIndex.swift
Original file line number Diff line number Diff line change
Expand Up @@ -365,6 +365,14 @@ extension String.Index {
}
}

extension String.Index {
  /// Returns a copy of this index whose scalar- and character-alignment bits
  /// are replaced by the corresponding bits of `index`; every other raw bit
  /// of `self` is kept as is.
  @_alwaysEmitIntoClient // Swift 5.7
  internal func _copyingAlignment(from index: Self) -> Self {
    let alignmentBits = Self.__scalarAlignmentBit | Self.__characterAlignmentBit
    // Bits of `self` outside the alignment flags.
    let retained = _rawBits & ~alignmentBits
    // Alignment flags taken from the donor index.
    let adopted = index._rawBits & alignmentBits
    return Self(retained | adopted)
  }
}

// ### Index Encoding
//
// Swift 5.7 introduced bookkeeping to keep track of the Unicode encoding
Expand Down Expand Up @@ -473,24 +481,12 @@ extension String.Index {
}

@_alwaysEmitIntoClient // Swift 5.7
internal func _copyEncoding(from index: Self) -> Self {
internal func _copyingEncoding(from index: Self) -> Self {
let mask = Self.__utf8Bit | Self.__utf16Bit
return Self((_rawBits & ~mask) | (index._rawBits & mask))
}
}

extension String.Index {
@_alwaysEmitIntoClient @inline(__always) // Swift 5.7
internal var _isUTF8CharacterIndex: Bool {
_canBeUTF8 && _isCharacterAligned
}

@_alwaysEmitIntoClient @inline(__always) // Swift 5.7
internal var _isUTF8ScalarIndex: Bool {
_canBeUTF8 && _isScalarAligned
}
}

extension String.Index: Equatable {
@inlinable @inline(__always)
public static func == (lhs: String.Index, rhs: String.Index) -> Bool {
Expand Down
26 changes: 22 additions & 4 deletions stdlib/public/core/StringIndexConversions.swift
Original file line number Diff line number Diff line change
Expand Up @@ -50,8 +50,17 @@ extension String.Index {
/// of `target`.
/// - target: The string referenced by the resulting index.
public init?(_ sourcePosition: String.Index, within target: String) {
guard target._isValidIndex(sourcePosition) else { return nil }
self = sourcePosition._characterAligned
// As a special exception, we allow `sourcePosition` to be a UTF-16 index
// when `target` is a UTF-8 string, to preserve compatibility with (broken)
// code that keeps using indices from a bridged string after converting the
// string to a native representation. Such indices are invalid, but
// returning nil here can break code that appeared to work fine for ASCII
// strings in Swift releases prior to 5.7.
guard
let i = target._guts.ensureMatchingEncodingNoTrap(sourcePosition),
target._isValidIndex(i)
else { return nil }
self = i._characterAligned
}

/// Creates an index in the given string that corresponds exactly to the
Expand Down Expand Up @@ -101,8 +110,17 @@ extension String.Index {
return
}
if let str = target as? Substring {
guard str._isValidIndex(sourcePosition) else { return nil }
self = sourcePosition
// As a special exception, we allow `sourcePosition` to be a UTF-16 index
// when `target` is a UTF-8 string, to preserve compatibility with (broken)
// code that keeps using indices from a bridged string after converting
// the string to a native representation. Such indices are invalid, but
// returning nil here can break code that appeared to work fine for ASCII
// strings in Swift releases prior to 5.7.
guard
let i = str._wholeGuts.ensureMatchingEncodingNoTrap(sourcePosition),
str._isValidIndex(i)
else { return nil }
self = i
return
}
self.init(sourcePosition, within: String(target))
Expand Down
10 changes: 1 addition & 9 deletions stdlib/public/core/StringObject.swift
Original file line number Diff line number Diff line change
Expand Up @@ -1009,6 +1009,7 @@ extension _StringObject {
}

/// Returns whether this string has a UTF-8 storage representation.
/// If this returns false, then the string is encoded in UTF-16.
///
/// This always returns a value corresponding to the string's actual encoding.
@_alwaysEmitIntoClient
Expand All @@ -1030,15 +1031,6 @@ extension _StringObject {
providesFastUTF8 || _countAndFlags.isForeignUTF8
}

/// Returns whether this string has a UTF-16 storage representation.
///
/// This always returns a value corresponding to the string's actual encoding.
@_alwaysEmitIntoClient
@inline(__always) // Swift 5.7
internal var isUTF16: Bool {
!isUTF8
}

// Get access to fast UTF-8 contents for large strings which provide it.
@inlinable @inline(__always)
internal var fastUTF8: UnsafeBufferPointer<UInt8> {
Expand Down
24 changes: 20 additions & 4 deletions stdlib/public/core/StringUTF16View.swift
Original file line number Diff line number Diff line change
Expand Up @@ -402,12 +402,28 @@ extension String.UTF16View.Index {
public init?(
_ idx: String.Index, within target: String.UTF16View
) {
guard target._guts.hasMatchingEncoding(idx) else { return nil }
guard idx._encodedOffset <= target._guts.count else { return nil }
// As a special exception, we allow `idx` to be a UTF-16 index when `target`
// is a UTF-8 string, to preserve compatibility with (broken) code that
// keeps using indices from a bridged string after converting the string to
// a native representation. Such indices are invalid, but returning nil here
// can break code that appeared to work fine for ASCII strings in Swift
// releases prior to 5.7.
guard
let idx = target._guts.ensureMatchingEncodingNoTrap(idx),
idx._encodedOffset <= target._guts.count
else { return nil }

if _slowPath(target._guts.isForeign) {
guard idx._foreignIsWithin(target) else { return nil }
} else {
guard target._guts.isOnUnicodeScalarBoundary(idx) else { return nil }
} else { // fast UTF-8
guard (
// If the transcoded offset is non-zero, then `idx` addresses a trailing
// surrogate, so its encoding offset is on a scalar boundary, and it's a
// valid UTF-16 index.
idx.transcodedOffset != 0
// Otherwise we need to reject indices that aren't scalar aligned.
|| target._guts.isOnUnicodeScalarBoundary(idx)
) else { return nil }
}

self = idx
Expand Down
13 changes: 11 additions & 2 deletions stdlib/public/core/StringUTF8View.swift
Original file line number Diff line number Diff line change
Expand Up @@ -359,8 +359,17 @@ extension String.UTF8View.Index {
public init?(_ idx: String.Index, within target: String.UTF8View) {
// Note: This method used to be inlinable until Swift 5.7.

guard target._guts.hasMatchingEncoding(idx) else { return nil }
guard idx._encodedOffset <= target._guts.count else { return nil }
// As a special exception, we allow `idx` to be a UTF-16 index when `target`
// is a UTF-8 string, to preserve compatibility with (broken) code that
// keeps using indices from a bridged string after converting the string to
// a native representation. Such indices are invalid, but returning nil here
// can break code that appeared to work fine for ASCII strings in Swift
// releases prior to 5.7.
guard
let idx = target._guts.ensureMatchingEncodingNoTrap(idx),
idx._encodedOffset <= target._guts.count
else { return nil }

if _slowPath(target._guts.isForeign) {
guard idx._foreignIsWithin(target) else { return nil }
} else {
Expand Down
8 changes: 4 additions & 4 deletions stdlib/public/core/StringUnicodeScalarView.swift
Original file line number Diff line number Diff line change
Expand Up @@ -429,13 +429,13 @@ extension String.UnicodeScalarIndex {
within unicodeScalars: String.UnicodeScalarView
) {
guard
unicodeScalars._guts.hasMatchingEncoding(sourcePosition),
sourcePosition._encodedOffset <= unicodeScalars._guts.count,
unicodeScalars._guts.isOnUnicodeScalarBoundary(sourcePosition)
let i = unicodeScalars._guts.ensureMatchingEncodingNoTrap(sourcePosition),
i._encodedOffset <= unicodeScalars._guts.count,
unicodeScalars._guts.isOnUnicodeScalarBoundary(i)
else {
return nil
}
self = sourcePosition
self = i
}

/// Returns the position in the given string that corresponds exactly to this
Expand Down
2 changes: 1 addition & 1 deletion stdlib/public/core/UnicodeHelpers.swift
Original file line number Diff line number Diff line change
Expand Up @@ -167,7 +167,7 @@ extension _StringGuts {
result = idx
} else {
// TODO(String performance): isASCII check
result = scalarAlignSlow(idx)._scalarAligned._copyEncoding(from: idx)
result = scalarAlignSlow(idx)._scalarAligned._copyingEncoding(from: idx)
}

_internalInvariant(isOnUnicodeScalarBoundary(result),
Expand Down
Loading