Skip to content

Commit d1a9c63

Browse files
authored
Merge pull request #42402 from lorentey/the-horror-of-se-0180-5.7
[5.7][stdlib] Fix String indexing edge cases, anomalies & validation bugs
2 parents 7fb1fd1 + 7f34490 commit d1a9c63

28 files changed

+3418
-753
lines changed

stdlib/private/StdlibUnicodeUnittest/StdlibUnicodeUnittest.swift

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -778,4 +778,3 @@ public let utf16Tests = [
778778
[ 0xDC00, 0xD800, 0xD800, 0xDC00 ]),
779779
],
780780
]
781-

stdlib/private/StdlibUnittest/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@ add_swift_target_library(swiftStdlibUnittest ${SWIFT_STDLIB_LIBRARY_BUILD_TYPES}
4242
Statistics.swift
4343
StdlibCoreExtras.swift
4444
StringConvertible.swift
45+
StringTestHelpers.swift
4546
SymbolLookup.swift
4647
TestHelpers.swift
4748
TypeIndexed.swift
Lines changed: 139 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,139 @@
1+
extension String {
  /// Prints every index of every view of this string (characters, Unicode
  /// scalars, UTF-8 code units and UTF-16 code units), each followed by the
  /// element stored at that index.
  ///
  /// Intended as a debugging aid for string indexing issues.
  public func dumpIndices() {
    print("-------------------------------------------------------------------")
    print("String: \(String(reflecting: self))")

    print("Characters:")
    for position in indices {
      let character = self[position]
      print(" \(position) -> \(String(reflecting: character))")
    }

    print("Scalars:")
    for position in unicodeScalars.indices {
      let scalar = unicodeScalars[position]
      let hex = String(scalar.value, radix: 16, uppercase: true)
      // Scalar values can need more than four hex digits, so clamp the
      // amount of zero padding at zero.
      let zeros = String(repeating: "0", count: Swift.max(0, 4 - hex.count))
      let label = scalar.properties.name ?? "\(scalar.debugDescription)"
      print(" \(position) -> U+\(zeros)\(hex) \(label)")
    }

    print("UTF-8:")
    for position in utf8.indices {
      let hex = String(utf8[position], radix: 16, uppercase: true)
      let zeros = hex.count < 2 ? "0" : ""
      print(" \(position) -> \(zeros)\(hex)")
    }

    print("UTF-16:")
    for position in utf16.indices {
      let hex = String(utf16[position], radix: 16, uppercase: true)
      let zeros = String(repeating: "0", count: 4 - hex.count)
      print(" \(position) -> \(zeros)\(hex)")
    }
  }

  /// Returns every valid index of every view of this string, in the order
  /// characters, Unicode scalars, UTF-8, UTF-16.
  ///
  /// Indices that compare equal but come from different views are all kept,
  /// because they may have different grapheme size caches or flags etc.
  ///
  /// - Parameter includingEnd: When `true` (the default), each view's
  ///   `endIndex` is appended right after that view's indices.
  public func allIndices(includingEnd: Bool = true) -> [String.Index] {
    var result: [String.Index] = []

    result.append(contentsOf: indices)
    if includingEnd { result.append(endIndex) }

    result.append(contentsOf: unicodeScalars.indices)
    if includingEnd { result.append(unicodeScalars.endIndex) }

    result.append(contentsOf: utf8.indices)
    if includingEnd { result.append(utf8.endIndex) }

    result.append(contentsOf: utf16.indices)
    if includingEnd { result.append(utf16.endIndex) }

    return result
  }
}
51+
52+
extension Substring {
  /// Returns every valid index of every view of this substring, in the order
  /// characters, Unicode scalars, UTF-8, UTF-16.
  ///
  /// Indices that compare equal but come from different views are all kept,
  /// because they may have different grapheme size caches or flags etc.
  ///
  /// - Parameter includingEnd: When `true` (the default), each view's
  ///   `endIndex` is appended right after that view's indices.
  public func allIndices(includingEnd: Bool = true) -> [String.Index] {
    var collected: [String.Index] = []

    collected += indices
    if includingEnd { collected.append(endIndex) }

    collected += unicodeScalars.indices
    if includingEnd { collected.append(unicodeScalars.endIndex) }

    collected += utf8.indices
    if includingEnd { collected.append(utf8.endIndex) }

    collected += utf16.indices
    if includingEnd { collected.append(utf16.endIndex) }

    return collected
  }
}
68+
69+
extension Collection {
  // Walks `other` and `self` in lockstep, assuming both share one index
  // space. For every index `i` of `other`, `body` receives `i`, the slice of
  // `self` running from the current position up to (but not including) the
  // index that follows `i` in `other`, and the zero-based ordinal of `i`
  // within `other`.
  //
  // The first index of `other` must be less than or equal to the first index
  // of `self`. If `other` is empty, `self` is expected to be empty too and
  // `body` is never called.
  func forEachIndexGroup<G: Collection>(
    by other: G,
    body: (G.Index, Self.SubSequence, Int) throws -> Void
  ) rethrows
  where G.Index == Self.Index
  {
    guard !other.isEmpty else {
      assert(self.isEmpty)
      return
    }

    var next = other.startIndex
    var cursor = startIndex
    var ordinal = 0

    while next != other.endIndex {
      let groupIndex = next
      other.formIndex(after: &next)

      // Advance through `self` until reaching the next group boundary, never
      // stepping past the end of `self`.
      let sliceStart = cursor
      while cursor < next && cursor < endIndex {
        formIndex(after: &cursor)
      }

      try body(groupIndex, self[sliceStart ..< cursor], ordinal)
      ordinal += 1
    }
  }
}
102+
103+
extension String {
  /// Returns a dictionary that maps every valid index of this string (from
  /// the UTF-8, UTF-16 and character views) to the index of the nearest
  /// scalar boundary, rounding down, paired with that scalar's offset.
  ///
  /// `endIndex` maps to itself, with the total scalar count as its offset.
  public func scalarMap() -> [Index: (index: Index, offset: Int)] {
    var result: [Index: (index: Index, offset: Int)] = [:]

    utf8.forEachIndexGroup(by: unicodeScalars) { boundary, group, ordinal in
      for position in group.indices {
        result[position] = (index: boundary, offset: ordinal)
      }
    }
    utf16.forEachIndexGroup(by: unicodeScalars) { boundary, group, ordinal in
      for position in group.indices {
        result[position] = (index: boundary, offset: ordinal)
      }
    }
    self.forEachIndexGroup(by: unicodeScalars) { boundary, group, ordinal in
      for position in group.indices {
        result[position] = (index: boundary, offset: ordinal)
      }
    }

    result[endIndex] = (index: endIndex, offset: unicodeScalars.count)
    return result
  }

  /// Returns a dictionary that maps every valid index of this string (from
  /// the UTF-8, UTF-16 and Unicode scalar views) to the index of the nearest
  /// character boundary, rounding down, paired with that character's offset.
  ///
  /// `endIndex` maps to itself, with the total character count as its offset.
  public func characterMap() -> [Index: (index: Index, offset: Int)] {
    var result: [Index: (index: Index, offset: Int)] = [:]

    utf8.forEachIndexGroup(by: self) { boundary, group, ordinal in
      for position in group.indices {
        result[position] = (index: boundary, offset: ordinal)
      }
    }
    utf16.forEachIndexGroup(by: self) { boundary, group, ordinal in
      for position in group.indices {
        result[position] = (index: boundary, offset: ordinal)
      }
    }
    unicodeScalars.forEachIndexGroup(by: self) { boundary, group, ordinal in
      for position in group.indices {
        result[position] = (index: boundary, offset: ordinal)
      }
    }

    result[endIndex] = (index: endIndex, offset: count)
    return result
  }
}
139+

stdlib/public/core/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -157,6 +157,7 @@ set(SWIFTLIB_ESSENTIAL
157157
StringProtocol.swift
158158
StringIndex.swift
159159
StringIndexConversions.swift
160+
StringIndexValidation.swift
160161
StringInterpolation.swift
161162
StringLegacy.swift
162163
StringNormalization.swift

stdlib/public/core/GroupInfo.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
"StringHashable.swift",
2727
"StringIndex.swift",
2828
"StringIndexConversions.swift",
29+
"StringIndexValidation.swift",
2930
"StringInterpolation.swift",
3031
"StringLegacy.swift",
3132
"StringNormalization.swift",

stdlib/public/core/Range.swift

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1027,3 +1027,14 @@ extension PartialRangeUpTo: Sendable where Bound: Sendable { }
10271027
extension PartialRangeThrough: Sendable where Bound: Sendable { }
10281028
extension PartialRangeFrom: Sendable where Bound: Sendable { }
10291029
extension PartialRangeFrom.Iterator: Sendable where Bound: Sendable { }
1030+
1031+
extension Range where Bound == String.Index {
  /// This range's bounds expressed as a range of their `_encodedOffset`
  /// values.
  ///
  /// An internal invariant check verifies that both bounds report
  /// `_canBeUTF8`, or that both report `_canBeUTF16`. The resulting range is
  /// built without checking the order of its bounds.
  @_alwaysEmitIntoClient // Swift 5.7
  internal var _encodedOffsetRange: Range<Int> {
    _internalInvariant(
      (lowerBound._canBeUTF8 && upperBound._canBeUTF8)
      || (lowerBound._canBeUTF16 && upperBound._canBeUTF16))
    let lowerOffset = lowerBound._encodedOffset
    let upperOffset = upperBound._encodedOffset
    return Range<Int>(_uncheckedBounds: (lowerOffset, upperOffset))
  }
}

stdlib/public/core/Slice.swift

Lines changed: 12 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -135,6 +135,11 @@ public struct Slice<Base: Collection> {
135135
public var base: Base {
136136
return _base
137137
}
138+
139+
/// The slice's bounds as a single range, built from `_startIndex` and
/// `_endIndex` with `Range(_uncheckedBounds:)`, i.e. without checking the
/// order of the two indices.
@_alwaysEmitIntoClient @inline(__always)
140+
internal var _bounds: Range<Base.Index> {
141+
Range(_uncheckedBounds: (_startIndex, _endIndex))
142+
}
138143
}
139144

140145
extension Slice: Collection {
@@ -157,21 +162,21 @@ extension Slice: Collection {
157162
@inlinable // generic-performance
158163
public subscript(index: Index) -> Base.Element {
159164
get {
160-
_failEarlyRangeCheck(index, bounds: startIndex..<endIndex)
165+
_failEarlyRangeCheck(index, bounds: _bounds)
161166
return _base[index]
162167
}
163168
}
164169

165170
@inlinable // generic-performance
166171
public subscript(bounds: Range<Index>) -> Slice<Base> {
167172
get {
168-
_failEarlyRangeCheck(bounds, bounds: startIndex..<endIndex)
173+
_failEarlyRangeCheck(bounds, bounds: _bounds)
169174
return Slice(base: _base, bounds: bounds)
170175
}
171176
}
172177

173-
public var indices: Indices {
174-
return _base.indices[_startIndex..<_endIndex]
178+
public var indices: Indices {
179+
return _base.indices[_bounds]
175180
}
176181

177182
@inlinable // generic-performance
@@ -264,11 +269,11 @@ extension Slice: MutableCollection where Base: MutableCollection {
264269
@inlinable // generic-performance
265270
public subscript(index: Index) -> Base.Element {
266271
get {
267-
_failEarlyRangeCheck(index, bounds: startIndex..<endIndex)
272+
_failEarlyRangeCheck(index, bounds: _bounds)
268273
return _base[index]
269274
}
270275
set {
271-
_failEarlyRangeCheck(index, bounds: startIndex..<endIndex)
276+
_failEarlyRangeCheck(index, bounds: _bounds)
272277
_base[index] = newValue
273278
// MutableSlice requires that the underlying collection's subscript
274279
// setter does not invalidate indices, so our `startIndex` and `endIndex`
@@ -279,7 +284,7 @@ extension Slice: MutableCollection where Base: MutableCollection {
279284
@inlinable // generic-performance
280285
public subscript(bounds: Range<Index>) -> Slice<Base> {
281286
get {
282-
_failEarlyRangeCheck(bounds, bounds: startIndex..<endIndex)
287+
_failEarlyRangeCheck(bounds, bounds: _bounds)
283288
return Slice(base: _base, bounds: bounds)
284289
}
285290
set {

0 commit comments

Comments
 (0)