Skip to content

Commit 09afcac

Browse files
committed
[stdlib] Fix String indexing edge cases, anomalies & validation bugs
Cherry-picked from #41417
1 parent 7fb1fd1 commit 09afcac

25 files changed

+3391
-752
lines changed

stdlib/private/StdlibUnicodeUnittest/StdlibUnicodeUnittest.swift

Lines changed: 139 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -779,3 +779,142 @@ public let utf16Tests = [
779779
],
780780
]
781781

782+
extension String {
  /// Prints a separator line, a debug rendering of this string, and then one
  /// line per index for each of its four views: characters, Unicode scalars,
  /// UTF-8 code units and UTF-16 code units.
  ///
  /// This is a debugging aid for string indexing issues.
  public func dumpIndices() {
    print("-------------------------------------------------------------------")
    print("String: \(String(reflecting: self))")

    print("Characters:")
    for idx in indices {
      let character = self[idx]
      print(" \(idx) -> \(String(reflecting: character))")
    }

    print("Scalars:")
    let scalars = unicodeScalars
    for idx in scalars.indices {
      let scalar = scalars[idx]
      let hex = String(scalar.value, radix: 16, uppercase: true)
      // Left-pad the code point to at least four hex digits; longer values
      // (above U+FFFF) get no padding.
      let zeros = String(repeating: "0", count: max(0, 4 - hex.count))
      // Fall back to the scalar's debug description when it has no name.
      let label = scalar.properties.name ?? "\(scalar.debugDescription)"
      print(" \(idx) -> U+\(zeros)\(hex) \(label)")
    }

    print("UTF-8:")
    let bytes = utf8
    for idx in bytes.indices {
      let hex = String(bytes[idx], radix: 16, uppercase: true)
      // Code units are printed as two hex digits.
      let zeros: String
      if hex.count < 2 {
        zeros = "0"
      } else {
        zeros = ""
      }
      print(" \(idx) -> \(zeros)\(hex)")
    }

    print("UTF-16:")
    let units = utf16
    for idx in units.indices {
      let hex = String(units[idx], radix: 16, uppercase: true)
      // Code units are printed as four hex digits.
      let zeros = String(repeating: "0", count: 4 - hex.count)
      print(" \(idx) -> \(zeros)\(hex)")
    }
  }

  /// Returns the indices of every view of this string, concatenated in the
  /// order: characters, Unicode scalars, UTF-8, UTF-16.
  ///
  /// Equal indices coming from different views are deliberately kept (not
  /// deduplicated) because they may have different grapheme size caches or
  /// flags etc.
  ///
  /// - Parameter includingEnd: When `true` (the default), each view's
  ///   `endIndex` is appended right after that view's indices.
  /// - Returns: The collected indices.
  public func allIndices(includingEnd: Bool = true) -> [String.Index] {
    var result: [String.Index] = []

    result.append(contentsOf: indices)
    if includingEnd { result.append(endIndex) }

    result.append(contentsOf: unicodeScalars.indices)
    if includingEnd { result.append(unicodeScalars.endIndex) }

    result.append(contentsOf: utf8.indices)
    if includingEnd { result.append(utf8.endIndex) }

    result.append(contentsOf: utf16.indices)
    if includingEnd { result.append(utf16.endIndex) }

    return result
  }
}
832+
833+
extension Substring {
  /// Returns the indices of every view of this substring, concatenated in the
  /// order: characters, Unicode scalars, UTF-8, UTF-16.
  ///
  /// Equal indices coming from different views are deliberately kept (not
  /// deduplicated) because they may have different grapheme size caches or
  /// flags etc.
  ///
  /// - Parameter includingEnd: When `true` (the default), each view's
  ///   `endIndex` is appended right after that view's indices.
  /// - Returns: The collected indices.
  public func allIndices(includingEnd: Bool = true) -> [String.Index] {
    var result: [String.Index] = []

    result.append(contentsOf: indices)
    if includingEnd { result.append(endIndex) }

    result.append(contentsOf: unicodeScalars.indices)
    if includingEnd { result.append(unicodeScalars.endIndex) }

    result.append(contentsOf: utf8.indices)
    if includingEnd { result.append(utf8.endIndex) }

    result.append(contentsOf: utf16.indices)
    if includingEnd { result.append(utf16.endIndex) }

    return result
  }
}
849+
850+
extension Collection {
  /// Walks `other` index by index and hands `body` the part of `self` that
  /// falls between each index of `other` and the one following it.
  ///
  /// Both collections are assumed to share one index space. `other` must
  /// start with an item that is less than or equal to the first item in
  /// `self`.
  ///
  /// - Parameters:
  ///   - other: The collection whose indices delimit the groups.
  ///   - body: Called once per index of `other` with that index, the slice of
  ///     `self` that begins where the previous group stopped and ends at the
  ///     next index of `other` (or at `self.endIndex`, whichever comes
  ///     first), and the zero-based ordinal of the group.
  /// - Throws: Rethrows whatever `body` throws.
  func forEachIndexGroup<G: Collection>(
    by other: G,
    body: (G.Index, Self.SubSequence, Int) throws -> Void
  ) rethrows
  where G.Index == Self.Index
  {
    guard !other.isEmpty else {
      assert(self.isEmpty)
      return
    }

    var groupStart = other.startIndex
    var cursor = self.startIndex
    var ordinal = 0

    // `other` is known to be non-empty here, so the first pass always runs.
    repeat {
      let groupEnd = other.index(after: groupStart)
      let sliceStart = cursor
      // Advance through `self` until reaching the next boundary in `other`
      // or running out of elements.
      while cursor < groupEnd && cursor < self.endIndex {
        self.formIndex(after: &cursor)
      }
      try body(groupStart, self[sliceStart ..< cursor], ordinal)
      ordinal += 1
      groupStart = groupEnd
    } while groupStart != other.endIndex
  }
}
883+
884+
extension String {
  /// Builds a lookup table from every valid index of this string (in its
  /// UTF-8, UTF-16 and character views, plus `endIndex`) to the scalar
  /// boundary at or before it.
  ///
  /// - Returns: A dictionary whose values pair the rounded-down scalar index
  ///   with that scalar's zero-based offset. `endIndex` maps to itself with
  ///   the total scalar count as its offset.
  public func scalarMap() -> [Index: (index: Index, offset: Int)] {
    var result: [Index: (index: Index, offset: Int)] = [:]
    let scalars = unicodeScalars

    utf8.forEachIndexGroup(by: scalars) { boundary, group, position in
      for member in group.indices {
        result[member] = (index: boundary, offset: position)
      }
    }
    utf16.forEachIndexGroup(by: scalars) { boundary, group, position in
      for member in group.indices {
        result[member] = (index: boundary, offset: position)
      }
    }
    self.forEachIndexGroup(by: scalars) { boundary, group, position in
      for member in group.indices {
        result[member] = (index: boundary, offset: position)
      }
    }

    result[endIndex] = (index: endIndex, offset: scalars.count)
    return result
  }

  /// Builds a lookup table from every valid index of this string (in its
  /// UTF-8, UTF-16 and Unicode scalar views, plus `endIndex`) to the
  /// character boundary at or before it.
  ///
  /// - Returns: A dictionary whose values pair the rounded-down character
  ///   index with that character's zero-based offset. `endIndex` maps to
  ///   itself with the total character count as its offset.
  public func characterMap() -> [Index: (index: Index, offset: Int)] {
    var result: [Index: (index: Index, offset: Int)] = [:]

    utf8.forEachIndexGroup(by: self) { boundary, group, position in
      for member in group.indices {
        result[member] = (index: boundary, offset: position)
      }
    }
    utf16.forEachIndexGroup(by: self) { boundary, group, position in
      for member in group.indices {
        result[member] = (index: boundary, offset: position)
      }
    }
    unicodeScalars.forEachIndexGroup(by: self) { boundary, group, position in
      for member in group.indices {
        result[member] = (index: boundary, offset: position)
      }
    }

    result[endIndex] = (index: endIndex, offset: count)
    return result
  }
}
920+

stdlib/public/core/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -157,6 +157,7 @@ set(SWIFTLIB_ESSENTIAL
157157
StringProtocol.swift
158158
StringIndex.swift
159159
StringIndexConversions.swift
160+
StringIndexValidation.swift
160161
StringInterpolation.swift
161162
StringLegacy.swift
162163
StringNormalization.swift

stdlib/public/core/GroupInfo.json

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -26,6 +26,7 @@
2626
"StringHashable.swift",
2727
"StringIndex.swift",
2828
"StringIndexConversions.swift",
29+
"StringIndexValidation.swift",
2930
"StringInterpolation.swift",
3031
"StringLegacy.swift",
3132
"StringNormalization.swift",

stdlib/public/core/Range.swift

Lines changed: 11 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1027,3 +1027,14 @@ extension PartialRangeUpTo: Sendable where Bound: Sendable { }
10271027
extension PartialRangeThrough: Sendable where Bound: Sendable { }
10281028
extension PartialRangeFrom: Sendable where Bound: Sendable { }
10291029
extension PartialRangeFrom.Iterator: Sendable where Bound: Sendable { }
1030+
1031+
extension Range where Bound == String.Index {
  /// The range formed by the `_encodedOffset` of each bound.
  ///
  /// An internal invariant checks that both bounds report `_canBeUTF8` or
  /// both report `_canBeUTF16`. The resulting range is built with
  /// `_uncheckedBounds`, i.e. without validating the order of the offsets.
  @_alwaysEmitIntoClient // Swift 5.7
  internal var _encodedOffsetRange: Range<Int> {
    let lower = lowerBound
    let upper = upperBound
    _internalInvariant(
      (lower._canBeUTF8 && upper._canBeUTF8)
      || (lower._canBeUTF16 && upper._canBeUTF16))
    let offsets = (lower: lower._encodedOffset, upper: upper._encodedOffset)
    return Range<Int>(_uncheckedBounds: offsets)
  }
}

stdlib/public/core/Slice.swift

Lines changed: 12 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -135,6 +135,11 @@ public struct Slice<Base: Collection> {
135135
public var base: Base {
136136
return _base
137137
}
138+
139+
/// The range from `_startIndex` to `_endIndex`, built with
/// `_uncheckedBounds` (no validation of the bounds' order).
@_alwaysEmitIntoClient @inline(__always)
internal var _bounds: Range<Base.Index> {
  return Range(_uncheckedBounds: (lower: _startIndex, upper: _endIndex))
}
138143
}
139144

140145
extension Slice: Collection {
@@ -157,21 +162,21 @@ extension Slice: Collection {
157162
@inlinable // generic-performance
158163
public subscript(index: Index) -> Base.Element {
159164
get {
160-
_failEarlyRangeCheck(index, bounds: startIndex..<endIndex)
165+
_failEarlyRangeCheck(index, bounds: _bounds)
161166
return _base[index]
162167
}
163168
}
164169

165170
@inlinable // generic-performance
166171
public subscript(bounds: Range<Index>) -> Slice<Base> {
167172
get {
168-
_failEarlyRangeCheck(bounds, bounds: startIndex..<endIndex)
173+
_failEarlyRangeCheck(bounds, bounds: _bounds)
169174
return Slice(base: _base, bounds: bounds)
170175
}
171176
}
172177

173-
public var indices: Indices {
174-
return _base.indices[_startIndex..<_endIndex]
178+
public var indices: Indices {
179+
return _base.indices[_bounds]
175180
}
176181

177182
@inlinable // generic-performance
@@ -264,11 +269,11 @@ extension Slice: MutableCollection where Base: MutableCollection {
264269
@inlinable // generic-performance
265270
public subscript(index: Index) -> Base.Element {
266271
get {
267-
_failEarlyRangeCheck(index, bounds: startIndex..<endIndex)
272+
_failEarlyRangeCheck(index, bounds: _bounds)
268273
return _base[index]
269274
}
270275
set {
271-
_failEarlyRangeCheck(index, bounds: startIndex..<endIndex)
276+
_failEarlyRangeCheck(index, bounds: _bounds)
272277
_base[index] = newValue
273278
// MutableSlice requires that the underlying collection's subscript
274279
// setter does not invalidate indices, so our `startIndex` and `endIndex`
@@ -279,7 +284,7 @@ extension Slice: MutableCollection where Base: MutableCollection {
279284
@inlinable // generic-performance
280285
public subscript(bounds: Range<Index>) -> Slice<Base> {
281286
get {
282-
_failEarlyRangeCheck(bounds, bounds: startIndex..<endIndex)
287+
_failEarlyRangeCheck(bounds, bounds: _bounds)
283288
return Slice(base: _base, bounds: bounds)
284289
}
285290
set {

0 commit comments

Comments
 (0)