Skip to content

Commit 54d2bbf

Browse files
authored
Merge pull request #42509 from lorentey/better-index-conversions-5.7
[5.7][stdlib] Make String.Index(_:within:) initializers more permissive
2 parents 3c533ff + c73faab commit 54d2bbf

10 files changed

+257
-71
lines changed

stdlib/public/core/StringGuts.swift

Lines changed: 61 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -302,19 +302,13 @@ extension _StringGuts {
302302
// Encoding
303303
extension _StringGuts {
304304
/// Returns whether this string has a UTF-8 storage representation.
305+
/// If this returns false, then the string is encoded in UTF-16.
305306
///
306307
/// This always returns a value corresponding to the string's actual encoding.
307308
@_alwaysEmitIntoClient
308309
@inline(__always)
309310
internal var isUTF8: Bool { _object.isUTF8 }
310311

311-
/// Returns whether this string has a UTF-16 storage representation.
312-
///
313-
/// This always returns a value corresponding to the string's actual encoding.
314-
@_alwaysEmitIntoClient
315-
@inline(__always)
316-
internal var isUTF16: Bool { _object.isUTF16 }
317-
318312
@_alwaysEmitIntoClient // Swift 5.7
319313
@inline(__always)
320314
internal func markEncoding(_ i: String.Index) -> String.Index {
@@ -334,41 +328,75 @@ extension _StringGuts {
334328
i._hasMatchingEncoding(isUTF8: isUTF8)
335329
}
336330

337-
/// Return an index whose encoding can be assumed to match that of `self`.
331+
/// Return an index whose encoding can be assumed to match that of `self`,
332+
/// trapping if `i` has an incompatible encoding.
333+
///
334+
/// If `i` is UTF-8 encoded, but `self` is a UTF-16 string, then trap.
335+
///
336+
/// If `i` is UTF-16 encoded, but `self` is a UTF-8 string, then transcode
337+
/// `i`'s offset to UTF-8 and return the resulting index. This allows the use
338+
/// of indices from a bridged Cocoa string after the string has been converted
339+
/// to a native Swift string. (Such indices are technically still considered
340+
/// invalid, but we allow this specific case to keep compatibility with
341+
/// existing code that assumes otherwise.)
338342
///
339343
/// Detecting an encoding mismatch isn't always possible -- older binaries did
340344
/// not set the flags that this method relies on. However, false positives
341345
/// cannot happen: if this method detects a mismatch, then it is guaranteed to
342346
/// be a real one.
343347
@_alwaysEmitIntoClient
344348
@inline(__always)
345-
internal func ensureMatchingEncoding(_ i: String.Index) -> String.Index {
349+
internal func ensureMatchingEncoding(_ i: Index) -> Index {
346350
if _fastPath(hasMatchingEncoding(i)) { return i }
351+
if let i = _slowEnsureMatchingEncoding(i) { return i }
352+
// Note that this trap is not guaranteed to trigger when the process
353+
// includes client binaries compiled with a previous Swift release.
354+
// (`i._canBeUTF16` can sometimes return true in that case even if the index
355+
// actually came from a UTF-8 string.) However, the trap will still often
356+
// trigger in this case, as long as the index was initialized by code that
357+
// was compiled with 5.7+.
358+
//
359+
// This trap will rarely if ever trigger on OSes that have stdlibs <= 5.6,
360+
// because those versions never set the `isKnownUTF16` flag in
361+
// `_StringObject`. (The flag may still be set within inlinable code,
362+
// though.)
363+
_preconditionFailure("Invalid string index")
364+
}
365+
366+
/// Return an index that corresponds to the same position as `i`, but whose
367+
/// encoding can be assumed to match that of `self`, returning `nil` if `i`
368+
/// has incompatible encoding.
369+
///
370+
/// If `i` is UTF-8 encoded, but `self` is a UTF-16 string, then return nil.
371+
///
372+
/// If `i` is UTF-16 encoded, but `self` is a UTF-8 string, then transcode
373+
/// `i`'s offset to UTF-8 and return the resulting index. This allows the use
374+
/// of indices from a bridged Cocoa string after the string has been converted
375+
/// to a native Swift string. (Such indices are technically still considered
376+
/// invalid, but we allow this specific case to keep compatibility with
377+
/// existing code that assumes otherwise.)
378+
///
379+
/// Detecting an encoding mismatch isn't always possible -- older binaries did
380+
/// not set the flags that this method relies on. However, false positives
381+
/// cannot happen: if this method detects a mismatch, then it is guaranteed to
382+
/// be a real one.
383+
internal func ensureMatchingEncodingNoTrap(_ i: Index) -> Index? {
384+
if hasMatchingEncoding(i) { return i }
347385
return _slowEnsureMatchingEncoding(i)
348386
}
349387

350388
@_alwaysEmitIntoClient
351389
@inline(never)
352390
@_effects(releasenone)
353-
internal func _slowEnsureMatchingEncoding(_ i: String.Index) -> String.Index {
391+
internal func _slowEnsureMatchingEncoding(_ i: Index) -> Index? {
354392
guard isUTF8 else {
355393
// Attempt to use a UTF-8 index on a UTF-16 string. Strings don't usually
356-
// get converted to UTF-16 storage, so it seems okay to trap in this case
357-
// -- the index most likely comes from an unrelated string. (Trapping here
358-
// may still turn out to affect binary compatibility with broken code in
394+
// get converted to UTF-16 storage, so it seems okay to reject this case
395+
// -- the index most likely comes from an unrelated string. (This may
396+
// still turn out to affect binary compatibility with broken code in
359397
// existing binaries running with new stdlibs. If so, we can replace this
360398
// with the same transcoding hack as in the UTF-16->8 case below.)
361-
//
362-
// Note that this trap is not guaranteed to trigger when the process
363-
// includes client binaries compiled with a previous Swift release.
364-
// (`i._canBeUTF16` can sometimes return true in that case even if the
365-
// index actually came from an UTF-8 string.) However, the trap will still
366-
// often trigger in this case, as long as the index was initialized by
367-
// code that was compiled with 5.7+.
368-
//
369-
// This trap can never trigger on OSes that have stdlibs <= 5.6, because
370-
// those versions never set the `isKnownUTF16` flag in `_StringObject`.
371-
_preconditionFailure("Invalid string index")
399+
return nil
372400
}
373401
// Attempt to use a UTF-16 index on a UTF-8 string.
374402
//
@@ -384,10 +412,15 @@ extension _StringGuts {
384412
// FIXME: Consider emitting a runtime warning here.
385413
// FIXME: Consider performing a linked-on-or-after check & trapping if the
386414
// client executable was built on some particular future Swift release.
387-
let utf16 = String(self).utf16
388-
let base = utf16.index(utf16.startIndex, offsetBy: i._encodedOffset)
389-
if i.transcodedOffset == 0 { return base }
390-
return base.encoded(offsetBy: i.transcodedOffset)._knownUTF8
415+
let utf16 = String.UTF16View(self)
416+
var r = utf16.index(utf16.startIndex, offsetBy: i._encodedOffset)
417+
if i.transcodedOffset != 0 {
418+
r = r.encoded(offsetBy: i.transcodedOffset)
419+
} else {
420+
// Preserve alignment bits if possible.
421+
r = r._copyingAlignment(from: i)
422+
}
423+
return r._knownUTF8
391424
}
392425
}
393426

stdlib/public/core/StringGutsRangeReplaceable.swift

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -466,10 +466,9 @@ extension _StringGuts {
466466
_internalInvariant(
467467
subrange.lowerBound >= startIndex && subrange.upperBound <= endIndex)
468468

469-
if _slowPath(isUTF16) {
470-
// UTF-16 (i.e., foreign) string. The mutation will convert this to the
471-
// native UTF-8 encoding, so we need to do some extra work to preserve our
472-
// bounds.
469+
guard _fastPath(isUTF8) else {
470+
// UTF-16 string. The mutation will convert this to the native UTF-8
471+
// encoding, so we need to do some extra work to preserve our bounds.
473472
let utf8StartOffset = String(self).utf8.distance(
474473
from: self.startIndex, to: startIndex)
475474
let oldUTF8Count = String(self).utf8.distance(

stdlib/public/core/StringIndex.swift

Lines changed: 9 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -365,6 +365,14 @@ extension String.Index {
365365
}
366366
}
367367

368+
extension String.Index {
369+
@_alwaysEmitIntoClient // Swift 5.7
370+
internal func _copyingAlignment(from index: Self) -> Self {
371+
let mask = Self.__scalarAlignmentBit | Self.__characterAlignmentBit
372+
return Self((_rawBits & ~mask) | (index._rawBits & mask))
373+
}
374+
}
375+
368376
// ### Index Encoding
369377
//
370378
// Swift 5.7 introduced bookkeeping to keep track of the Unicode encoding
@@ -473,24 +481,12 @@ extension String.Index {
473481
}
474482

475483
@_alwaysEmitIntoClient // Swift 5.7
476-
internal func _copyEncoding(from index: Self) -> Self {
484+
internal func _copyingEncoding(from index: Self) -> Self {
477485
let mask = Self.__utf8Bit | Self.__utf16Bit
478486
return Self((_rawBits & ~mask) | (index._rawBits & mask))
479487
}
480488
}
481489

482-
extension String.Index {
483-
@_alwaysEmitIntoClient @inline(__always) // Swift 5.7
484-
internal var _isUTF8CharacterIndex: Bool {
485-
_canBeUTF8 && _isCharacterAligned
486-
}
487-
488-
@_alwaysEmitIntoClient @inline(__always) // Swift 5.7
489-
internal var _isUTF8ScalarIndex: Bool {
490-
_canBeUTF8 && _isScalarAligned
491-
}
492-
}
493-
494490
extension String.Index: Equatable {
495491
@inlinable @inline(__always)
496492
public static func == (lhs: String.Index, rhs: String.Index) -> Bool {

stdlib/public/core/StringIndexConversions.swift

Lines changed: 22 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -50,8 +50,17 @@ extension String.Index {
5050
/// of `target`.
5151
/// - target: The string referenced by the resulting index.
5252
public init?(_ sourcePosition: String.Index, within target: String) {
53-
guard target._isValidIndex(sourcePosition) else { return nil }
54-
self = sourcePosition._characterAligned
53+
// As a special exception, we allow `sourcePosition` to be a UTF-16 index
54+
// when `target` is a UTF-8 string, to preserve compatibility with (broken)
55+
// code that keeps using indices from a bridged string after converting the
56+
// string to a native representation. Such indices are invalid, but
57+
// returning nil here can break code that appeared to work fine for ASCII
58+
// strings in Swift releases prior to 5.7.
59+
guard
60+
let i = target._guts.ensureMatchingEncodingNoTrap(sourcePosition),
61+
target._isValidIndex(i)
62+
else { return nil }
63+
self = i._characterAligned
5564
}
5665

5766
/// Creates an index in the given string that corresponds exactly to the
@@ -101,8 +110,17 @@ extension String.Index {
101110
return
102111
}
103112
if let str = target as? Substring {
104-
guard str._isValidIndex(sourcePosition) else { return nil }
105-
self = sourcePosition
113+
// As a special exception, we allow `sourcePosition` to be a UTF-16 index
114+
// when `target` is a UTF-8 string, to preserve compatibility with (broken)
115+
// code that keeps using indices from a bridged string after converting
116+
// the string to a native representation. Such indices are invalid, but
117+
// returning nil here can break code that appeared to work fine for ASCII
118+
// strings in Swift releases prior to 5.7.
119+
guard
120+
let i = str._wholeGuts.ensureMatchingEncodingNoTrap(sourcePosition),
121+
str._isValidIndex(i)
122+
else { return nil }
123+
self = i
106124
return
107125
}
108126
self.init(sourcePosition, within: String(target))

stdlib/public/core/StringObject.swift

Lines changed: 1 addition & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1009,6 +1009,7 @@ extension _StringObject {
10091009
}
10101010

10111011
/// Returns whether this string has a UTF-8 storage representation.
1012+
/// If this returns false, then the string is encoded in UTF-16.
10121013
///
10131014
/// This always returns a value corresponding to the string's actual encoding.
10141015
@_alwaysEmitIntoClient
@@ -1030,15 +1031,6 @@ extension _StringObject {
10301031
providesFastUTF8 || _countAndFlags.isForeignUTF8
10311032
}
10321033

1033-
/// Returns whether this string has a UTF-16 storage representation.
1034-
///
1035-
/// This always returns a value corresponding to the string's actual encoding.
1036-
@_alwaysEmitIntoClient
1037-
@inline(__always) // Swift 5.7
1038-
internal var isUTF16: Bool {
1039-
!isUTF8
1040-
}
1041-
10421034
// Get access to fast UTF-8 contents for large strings which provide it.
10431035
@inlinable @inline(__always)
10441036
internal var fastUTF8: UnsafeBufferPointer<UInt8> {

stdlib/public/core/StringUTF16View.swift

Lines changed: 20 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -402,12 +402,28 @@ extension String.UTF16View.Index {
402402
public init?(
403403
_ idx: String.Index, within target: String.UTF16View
404404
) {
405-
guard target._guts.hasMatchingEncoding(idx) else { return nil }
406-
guard idx._encodedOffset <= target._guts.count else { return nil }
405+
// As a special exception, we allow `idx` to be a UTF-16 index when `target`
406+
// is a UTF-8 string, to preserve compatibility with (broken) code that
407+
// keeps using indices from a bridged string after converting the string to
408+
// a native representation. Such indices are invalid, but returning nil here
409+
// can break code that appeared to work fine for ASCII strings in Swift
410+
// releases prior to 5.7.
411+
guard
412+
let idx = target._guts.ensureMatchingEncodingNoTrap(idx),
413+
idx._encodedOffset <= target._guts.count
414+
else { return nil }
415+
407416
if _slowPath(target._guts.isForeign) {
408417
guard idx._foreignIsWithin(target) else { return nil }
409-
} else {
410-
guard target._guts.isOnUnicodeScalarBoundary(idx) else { return nil }
418+
} else { // fast UTF-8
419+
guard (
420+
// If the transcoded offset is non-zero, then `idx` addresses a trailing
421+
// surrogate, so its encoding offset is on a scalar boundary, and it's a
422+
// valid UTF-16 index.
423+
idx.transcodedOffset != 0
424+
// Otherwise we need to reject indices that aren't scalar aligned.
425+
|| target._guts.isOnUnicodeScalarBoundary(idx)
426+
) else { return nil }
411427
}
412428

413429
self = idx

stdlib/public/core/StringUTF8View.swift

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -359,8 +359,17 @@ extension String.UTF8View.Index {
359359
public init?(_ idx: String.Index, within target: String.UTF8View) {
360360
// Note: This method used to be inlinable until Swift 5.7.
361361

362-
guard target._guts.hasMatchingEncoding(idx) else { return nil }
363-
guard idx._encodedOffset <= target._guts.count else { return nil }
362+
// As a special exception, we allow `idx` to be a UTF-16 index when `target`
363+
// is a UTF-8 string, to preserve compatibility with (broken) code that
364+
// keeps using indices from a bridged string after converting the string to
365+
// a native representation. Such indices are invalid, but returning nil here
366+
// can break code that appeared to work fine for ASCII strings in Swift
367+
// releases prior to 5.7.
368+
guard
369+
let idx = target._guts.ensureMatchingEncodingNoTrap(idx),
370+
idx._encodedOffset <= target._guts.count
371+
else { return nil }
372+
364373
if _slowPath(target._guts.isForeign) {
365374
guard idx._foreignIsWithin(target) else { return nil }
366375
} else {

stdlib/public/core/StringUnicodeScalarView.swift

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -429,13 +429,13 @@ extension String.UnicodeScalarIndex {
429429
within unicodeScalars: String.UnicodeScalarView
430430
) {
431431
guard
432-
unicodeScalars._guts.hasMatchingEncoding(sourcePosition),
433-
sourcePosition._encodedOffset <= unicodeScalars._guts.count,
434-
unicodeScalars._guts.isOnUnicodeScalarBoundary(sourcePosition)
432+
let i = unicodeScalars._guts.ensureMatchingEncodingNoTrap(sourcePosition),
433+
i._encodedOffset <= unicodeScalars._guts.count,
434+
unicodeScalars._guts.isOnUnicodeScalarBoundary(i)
435435
else {
436436
return nil
437437
}
438-
self = sourcePosition
438+
self = i
439439
}
440440

441441
/// Returns the position in the given string that corresponds exactly to this

stdlib/public/core/UnicodeHelpers.swift

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -167,7 +167,7 @@ extension _StringGuts {
167167
result = idx
168168
} else {
169169
// TODO(String performance): isASCII check
170-
result = scalarAlignSlow(idx)._scalarAligned._copyEncoding(from: idx)
170+
result = scalarAlignSlow(idx)._scalarAligned._copyingEncoding(from: idx)
171171
}
172172

173173
_internalInvariant(isOnUnicodeScalarBoundary(result),

0 commit comments

Comments
 (0)