Skip to content

Commit 2574d78

Browse files
authored
Merge pull request #42442 from lorentey/better-index-conversions
2 parents 5c5e80c + 3eed534 commit 2574d78

10 files changed

+257
-71
lines changed

stdlib/public/core/StringGuts.swift

Lines changed: 61 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -301,19 +301,13 @@ extension _StringGuts {
301301
// Encoding
302302
extension _StringGuts {
303303
/// Returns whether this string has a UTF-8 storage representation.
304+
/// If this returns false, then the string is encoded in UTF-16.
304305
///
305306
/// This always returns a value corresponding to the string's actual encoding.
306307
@_alwaysEmitIntoClient
307308
@inline(__always)
308309
internal var isUTF8: Bool { _object.isUTF8 }
309310

310-
/// Returns whether this string has a UTF-16 storage representation.
311-
///
312-
/// This always returns a value corresponding to the string's actual encoding.
313-
@_alwaysEmitIntoClient
314-
@inline(__always)
315-
internal var isUTF16: Bool { _object.isUTF16 }
316-
317311
@_alwaysEmitIntoClient // Swift 5.7
318312
@inline(__always)
319313
internal func markEncoding(_ i: String.Index) -> String.Index {
@@ -333,41 +327,75 @@ extension _StringGuts {
333327
i._hasMatchingEncoding(isUTF8: isUTF8)
334328
}
335329

336-
/// Return an index whose encoding can be assumed to match that of `self`.
330+
/// Return an index whose encoding can be assumed to match that of `self`,
331+
/// trapping if `i` has an incompatible encoding.
332+
///
333+
/// If `i` is UTF-8 encoded, but `self` is an UTF-16 string, then trap.
334+
///
335+
/// If `i` is UTF-16 encoded, but `self` is an UTF-8 string, then transcode
336+
/// `i`'s offset to UTF-8 and return the resulting index. This allows the use
337+
/// of indices from a bridged Cocoa string after the string has been converted
338+
/// to a native Swift string. (Such indices are technically still considered
339+
/// invalid, but we allow this specific case to keep compatibility with
340+
/// existing code that assumes otherwise.)
337341
///
338342
/// Detecting an encoding mismatch isn't always possible -- older binaries did
339343
/// not set the flags that this method relies on. However, false positives
340344
/// cannot happen: if this method detects a mismatch, then it is guaranteed to
341345
/// be a real one.
342346
@_alwaysEmitIntoClient
343347
@inline(__always)
344-
internal func ensureMatchingEncoding(_ i: String.Index) -> String.Index {
348+
internal func ensureMatchingEncoding(_ i: Index) -> Index {
345349
if _fastPath(hasMatchingEncoding(i)) { return i }
350+
if let i = _slowEnsureMatchingEncoding(i) { return i }
351+
// Note that this trap is not guaranteed to trigger when the process
352+
// includes client binaries compiled with a previous Swift release.
353+
// (`i._canBeUTF16` can sometimes return true in that case even if the index
354+
// actually came from an UTF-8 string.) However, the trap will still often
355+
// trigger in this case, as long as the index was initialized by code that
356+
// was compiled with 5.7+.
357+
//
358+
// This trap will rarely if ever trigger on OSes that have stdlibs <= 5.6,
359+
// because those versions never set the `isKnownUTF16` flag in
360+
// `_StringObject`. (The flag may still be set within inlinable code,
361+
// though.)
362+
_preconditionFailure("Invalid string index")
363+
}
364+
365+
/// Return an index that corresponds to the same position as `i`, but whose
366+
/// encoding can be assumed to match that of `self`, returning `nil` if `i`
367+
/// has incompatible encoding.
368+
///
369+
/// If `i` is UTF-8 encoded, but `self` is an UTF-16 string, then return nil.
370+
///
371+
/// If `i` is UTF-16 encoded, but `self` is an UTF-8 string, then transcode
372+
/// `i`'s offset to UTF-8 and return the resulting index. This allows the use
373+
/// of indices from a bridged Cocoa string after the string has been converted
374+
/// to a native Swift string. (Such indices are technically still considered
375+
/// invalid, but we allow this specific case to keep compatibility with
376+
/// existing code that assumes otherwise.)
377+
///
378+
/// Detecting an encoding mismatch isn't always possible -- older binaries did
379+
/// not set the flags that this method relies on. However, false positives
380+
/// cannot happen: if this method detects a mismatch, then it is guaranteed to
381+
/// be a real one.
382+
internal func ensureMatchingEncodingNoTrap(_ i: Index) -> Index? {
383+
if hasMatchingEncoding(i) { return i }
346384
return _slowEnsureMatchingEncoding(i)
347385
}
348386

349387
@_alwaysEmitIntoClient
350388
@inline(never)
351389
@_effects(releasenone)
352-
internal func _slowEnsureMatchingEncoding(_ i: String.Index) -> String.Index {
390+
internal func _slowEnsureMatchingEncoding(_ i: Index) -> Index? {
353391
guard isUTF8 else {
354392
// Attempt to use an UTF-8 index on a UTF-16 string. Strings don't usually
355-
// get converted to UTF-16 storage, so it seems okay to trap in this case
356-
// -- the index most likely comes from an unrelated string. (Trapping here
357-
// may still turn out to affect binary compatibility with broken code in
393+
// get converted to UTF-16 storage, so it seems okay to reject this case
394+
// -- the index most likely comes from an unrelated string. (This may
395+
// still turn out to affect binary compatibility with broken code in
358396
// existing binaries running with new stdlibs. If so, we can replace this
359397
// with the same transcoding hack as in the UTF-16->8 case below.)
360-
//
361-
// Note that this trap is not guaranteed to trigger when the process
362-
// includes client binaries compiled with a previous Swift release.
363-
// (`i._canBeUTF16` can sometimes return true in that case even if the
364-
// index actually came from an UTF-8 string.) However, the trap will still
365-
// often trigger in this case, as long as the index was initialized by
366-
// code that was compiled with 5.7+.
367-
//
368-
// This trap can never trigger on OSes that have stdlibs <= 5.6, because
369-
// those versions never set the `isKnownUTF16` flag in `_StringObject`.
370-
_preconditionFailure("Invalid string index")
398+
return nil
371399
}
372400
// Attempt to use an UTF-16 index on a UTF-8 string.
373401
//
@@ -383,10 +411,15 @@ extension _StringGuts {
383411
// FIXME: Consider emitting a runtime warning here.
384412
// FIXME: Consider performing a linked-on-or-after check & trapping if the
385413
// client executable was built on some particular future Swift release.
386-
let utf16 = String(self).utf16
387-
let base = utf16.index(utf16.startIndex, offsetBy: i._encodedOffset)
388-
if i.transcodedOffset == 0 { return base }
389-
return base.encoded(offsetBy: i.transcodedOffset)._knownUTF8
414+
let utf16 = String.UTF16View(self)
415+
var r = utf16.index(utf16.startIndex, offsetBy: i._encodedOffset)
416+
if i.transcodedOffset != 0 {
417+
r = r.encoded(offsetBy: i.transcodedOffset)
418+
} else {
419+
// Preserve alignment bits if possible.
420+
r = r._copyingAlignment(from: i)
421+
}
422+
return r._knownUTF8
390423
}
391424
}
392425

stdlib/public/core/StringGutsRangeReplaceable.swift

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -466,10 +466,9 @@ extension _StringGuts {
466466
_internalInvariant(
467467
subrange.lowerBound >= startIndex && subrange.upperBound <= endIndex)
468468

469-
if _slowPath(isUTF16) {
470-
// UTF-16 (i.e., foreign) string. The mutation will convert this to the
471-
// native UTF-8 encoding, so we need to do some extra work to preserve our
472-
// bounds.
469+
guard _fastPath(isUTF8) else {
470+
// UTF-16 string. The mutation will convert this to the native UTF-8
471+
// encoding, so we need to do some extra work to preserve our bounds.
473472
let utf8StartOffset = String(self).utf8.distance(
474473
from: self.startIndex, to: startIndex)
475474
let oldUTF8Count = String(self).utf8.distance(

stdlib/public/core/StringIndex.swift

Lines changed: 9 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -365,6 +365,14 @@ extension String.Index {
365365
}
366366
}
367367

368+
extension String.Index {
369+
@_alwaysEmitIntoClient // Swift 5.7
370+
internal func _copyingAlignment(from index: Self) -> Self {
371+
let mask = Self.__scalarAlignmentBit | Self.__characterAlignmentBit
372+
return Self((_rawBits & ~mask) | (index._rawBits & mask))
373+
}
374+
}
375+
368376
// ### Index Encoding
369377
//
370378
// Swift 5.7 introduced bookkeeping to keep track of the Unicode encoding
@@ -473,24 +481,12 @@ extension String.Index {
473481
}
474482

475483
@_alwaysEmitIntoClient // Swift 5.7
476-
internal func _copyEncoding(from index: Self) -> Self {
484+
internal func _copyingEncoding(from index: Self) -> Self {
477485
let mask = Self.__utf8Bit | Self.__utf16Bit
478486
return Self((_rawBits & ~mask) | (index._rawBits & mask))
479487
}
480488
}
481489

482-
extension String.Index {
483-
@_alwaysEmitIntoClient @inline(__always) // Swift 5.7
484-
internal var _isUTF8CharacterIndex: Bool {
485-
_canBeUTF8 && _isCharacterAligned
486-
}
487-
488-
@_alwaysEmitIntoClient @inline(__always) // Swift 5.7
489-
internal var _isUTF8ScalarIndex: Bool {
490-
_canBeUTF8 && _isScalarAligned
491-
}
492-
}
493-
494490
extension String.Index: Equatable {
495491
@inlinable @inline(__always)
496492
public static func == (lhs: String.Index, rhs: String.Index) -> Bool {

stdlib/public/core/StringIndexConversions.swift

Lines changed: 22 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -50,8 +50,17 @@ extension String.Index {
5050
/// of `target`.
5151
/// - target: The string referenced by the resulting index.
5252
public init?(_ sourcePosition: String.Index, within target: String) {
53-
guard target._isValidIndex(sourcePosition) else { return nil }
54-
self = sourcePosition._characterAligned
53+
// As a special exception, we allow `sourcePosition` to be an UTF-16 index
54+
// when `self` is a UTF-8 string, to preserve compatibility with (broken)
55+
// code that keeps using indices from a bridged string after converting the
56+
// string to a native representation. Such indices are invalid, but
57+
// returning nil here can break code that appeared to work fine for ASCII
58+
// strings in Swift releases prior to 5.7.
59+
guard
60+
let i = target._guts.ensureMatchingEncodingNoTrap(sourcePosition),
61+
target._isValidIndex(i)
62+
else { return nil }
63+
self = i._characterAligned
5564
}
5665

5766
/// Creates an index in the given string that corresponds exactly to the
@@ -101,8 +110,17 @@ extension String.Index {
101110
return
102111
}
103112
if let str = target as? Substring {
104-
guard str._isValidIndex(sourcePosition) else { return nil }
105-
self = sourcePosition
113+
// As a special exception, we allow `sourcePosition` to be an UTF-16 index
114+
// when `self` is a UTF-8 string, to preserve compatibility with (broken)
115+
// code that keeps using indices from a bridged string after converting
116+
// the string to a native representation. Such indices are invalid, but
117+
// returning nil here can break code that appeared to work fine for ASCII
118+
// strings in Swift releases prior to 5.7.
119+
guard
120+
let i = str._wholeGuts.ensureMatchingEncodingNoTrap(sourcePosition),
121+
str._isValidIndex(i)
122+
else { return nil }
123+
self = i
106124
return
107125
}
108126
self.init(sourcePosition, within: String(target))

stdlib/public/core/StringObject.swift

Lines changed: 1 addition & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1009,6 +1009,7 @@ extension _StringObject {
10091009
}
10101010

10111011
/// Returns whether this string has a UTF-8 storage representation.
1012+
/// If this returns false, then the string is encoded in UTF-16.
10121013
///
10131014
/// This always returns a value corresponding to the string's actual encoding.
10141015
@_alwaysEmitIntoClient
@@ -1030,15 +1031,6 @@ extension _StringObject {
10301031
providesFastUTF8 || _countAndFlags.isForeignUTF8
10311032
}
10321033

1033-
/// Returns whether this string has a UTF-16 storage representation.
1034-
///
1035-
/// This always returns a value corresponding to the string's actual encoding.
1036-
@_alwaysEmitIntoClient
1037-
@inline(__always) // Swift 5.7
1038-
internal var isUTF16: Bool {
1039-
!isUTF8
1040-
}
1041-
10421034
// Get access to fast UTF-8 contents for large strings which provide it.
10431035
@inlinable @inline(__always)
10441036
internal var fastUTF8: UnsafeBufferPointer<UInt8> {

stdlib/public/core/StringUTF16View.swift

Lines changed: 20 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -402,12 +402,28 @@ extension String.UTF16View.Index {
402402
public init?(
403403
_ idx: String.Index, within target: String.UTF16View
404404
) {
405-
guard target._guts.hasMatchingEncoding(idx) else { return nil }
406-
guard idx._encodedOffset <= target._guts.count else { return nil }
405+
// As a special exception, we allow `idx` to be an UTF-16 index when `self`
406+
// is a UTF-8 string, to preserve compatibility with (broken) code that
407+
// keeps using indices from a bridged string after converting the string to
408+
// a native representation. Such indices are invalid, but returning nil here
409+
// can break code that appeared to work fine for ASCII strings in Swift
410+
// releases prior to 5.7.
411+
guard
412+
let idx = target._guts.ensureMatchingEncodingNoTrap(idx),
413+
idx._encodedOffset <= target._guts.count
414+
else { return nil }
415+
407416
if _slowPath(target._guts.isForeign) {
408417
guard idx._foreignIsWithin(target) else { return nil }
409-
} else {
410-
guard target._guts.isOnUnicodeScalarBoundary(idx) else { return nil }
418+
} else { // fast UTF-8
419+
guard (
420+
// If the transcoded offset is non-zero, then `idx` addresses a trailing
421+
// surrogate, so its encoding offset is on a scalar boundary, and it's a
422+
// valid UTF-16 index.
423+
idx.transcodedOffset != 0
424+
/// Otherwise we need to reject indices that aren't scalar aligned.
425+
|| target._guts.isOnUnicodeScalarBoundary(idx)
426+
) else { return nil }
411427
}
412428

413429
self = idx

stdlib/public/core/StringUTF8View.swift

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -359,8 +359,17 @@ extension String.UTF8View.Index {
359359
public init?(_ idx: String.Index, within target: String.UTF8View) {
360360
// Note: This method used to be inlinable until Swift 5.7.
361361

362-
guard target._guts.hasMatchingEncoding(idx) else { return nil }
363-
guard idx._encodedOffset <= target._guts.count else { return nil }
362+
// As a special exception, we allow `idx` to be an UTF-16 index when `self`
363+
// is a UTF-8 string, to preserve compatibility with (broken) code that
364+
// keeps using indices from a bridged string after converting the string to
365+
// a native representation. Such indices are invalid, but returning nil here
366+
// can break code that appeared to work fine for ASCII strings in Swift
367+
// releases prior to 5.7.
368+
guard
369+
let idx = target._guts.ensureMatchingEncodingNoTrap(idx),
370+
idx._encodedOffset <= target._guts.count
371+
else { return nil }
372+
364373
if _slowPath(target._guts.isForeign) {
365374
guard idx._foreignIsWithin(target) else { return nil }
366375
} else {

stdlib/public/core/StringUnicodeScalarView.swift

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -429,13 +429,13 @@ extension String.UnicodeScalarIndex {
429429
within unicodeScalars: String.UnicodeScalarView
430430
) {
431431
guard
432-
unicodeScalars._guts.hasMatchingEncoding(sourcePosition),
433-
sourcePosition._encodedOffset <= unicodeScalars._guts.count,
434-
unicodeScalars._guts.isOnUnicodeScalarBoundary(sourcePosition)
432+
let i = unicodeScalars._guts.ensureMatchingEncodingNoTrap(sourcePosition),
433+
i._encodedOffset <= unicodeScalars._guts.count,
434+
unicodeScalars._guts.isOnUnicodeScalarBoundary(i)
435435
else {
436436
return nil
437437
}
438-
self = sourcePosition
438+
self = i
439439
}
440440

441441
/// Returns the position in the given string that corresponds exactly to this

stdlib/public/core/UnicodeHelpers.swift

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -167,7 +167,7 @@ extension _StringGuts {
167167
result = idx
168168
} else {
169169
// TODO(String performance): isASCII check
170-
result = scalarAlignSlow(idx)._scalarAligned._copyEncoding(from: idx)
170+
result = scalarAlignSlow(idx)._scalarAligned._copyingEncoding(from: idx)
171171
}
172172

173173
_internalInvariant(isOnUnicodeScalarBoundary(result),

0 commit comments

Comments
 (0)