Skip to content

[5.7][stdlib] Fix String indexing edge cases, anomalies & validation bugs #42402

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Apr 16, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -778,4 +778,3 @@ public let utf16Tests = [
[ 0xDC00, 0xD800, 0xD800, 0xDC00 ]),
],
]

1 change: 1 addition & 0 deletions stdlib/private/StdlibUnittest/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ add_swift_target_library(swiftStdlibUnittest ${SWIFT_STDLIB_LIBRARY_BUILD_TYPES}
Statistics.swift
StdlibCoreExtras.swift
StringConvertible.swift
StringTestHelpers.swift
SymbolLookup.swift
TestHelpers.swift
TypeIndexed.swift
Expand Down
139 changes: 139 additions & 0 deletions stdlib/private/StdlibUnittest/StringTestHelpers.swift
Original file line number Diff line number Diff line change
@@ -0,0 +1,139 @@
extension String {
  /// Prints every index of every view of this string (characters, Unicode
  /// scalars, UTF-8 code units and UTF-16 code units), one per line, together
  /// with the element stored at that index.
  ///
  /// Intended as a debugging aid when investigating string indexing issues.
  public func dumpIndices() {
    print("-------------------------------------------------------------------")
    print("String: \(String(reflecting: self))")

    print("Characters:")
    for position in indices {
      print(" \(position) -> \(String(reflecting: self[position]))")
    }

    print("Scalars:")
    for position in unicodeScalars.indices {
      let scalar = unicodeScalars[position]
      let hex = String(scalar.value, radix: 16, uppercase: true)
      // Scalar values can need more than four hex digits, so clamp the
      // padding width at zero.
      let zeros = String(repeating: "0", count: max(0, 4 - hex.count))
      let label = scalar.properties.name ?? scalar.debugDescription
      print(" \(position) -> U+\(zeros)\(hex) \(label)")
    }

    print("UTF-8:")
    for position in utf8.indices {
      let hex = String(utf8[position], radix: 16, uppercase: true)
      let zeros = hex.count >= 2 ? "" : "0"
      print(" \(position) -> \(zeros)\(hex)")
    }

    print("UTF-16:")
    for position in utf16.indices {
      let hex = String(utf16[position], radix: 16, uppercase: true)
      // A UTF-16 code unit never needs more than four hex digits.
      let zeros = String(repeating: "0", count: 4 - hex.count)
      print(" \(position) -> \(zeros)\(hex)")
    }
  }

  /// Returns every valid index of every view of this string, in the order
  /// characters, Unicode scalars, UTF-8, UTF-16.
  ///
  /// Indices that compare equal but come from different views are all kept,
  /// because they may carry different grapheme size caches or flags.
  ///
  /// - Parameter includingEnd: When `true` (the default), each view's
  ///   `endIndex` is appended right after that view's indices.
  public func allIndices(includingEnd: Bool = true) -> [String.Index] {
    var collected: [String.Index] = []

    // Appends one view's indices, followed by its end index if requested.
    func gather<View: Collection>(_ view: View)
    where View.Index == String.Index {
      collected.append(contentsOf: view.indices)
      if includingEnd { collected.append(view.endIndex) }
    }

    gather(self)
    gather(unicodeScalars)
    gather(utf8)
    gather(utf16)
    return collected
  }
}

extension Substring {
  /// Returns every valid index of every view of this substring, in the order
  /// characters, Unicode scalars, UTF-8, UTF-16.
  ///
  /// Indices that compare equal but come from different views are all kept,
  /// because they may carry different grapheme size caches or flags.
  ///
  /// - Parameter includingEnd: When `true` (the default), each view's
  ///   `endIndex` is appended right after that view's indices.
  public func allIndices(includingEnd: Bool = true) -> [String.Index] {
    var collected: [String.Index] = []

    // Appends one view's indices, followed by its end index if requested.
    func gather<View: Collection>(_ view: View)
    where View.Index == String.Index {
      collected.append(contentsOf: view.indices)
      if includingEnd { collected.append(view.endIndex) }
    }

    gather(self)
    gather(unicodeScalars)
    gather(utf8)
    gather(utf16)
    return collected
  }
}

extension Collection {
  // Walks `other` and `self` in lockstep, assuming both share one index
  // space. For each index `i` of `other`, `body` receives `i`, the slice of
  // `self` that starts where the previous group ended and stops at the index
  // following `i` in `other` (or at `self.endIndex`, whichever comes first),
  // and the zero-based position of `i` within `other`.
  //
  // `other` must start with an item that is less than or equal to the first
  // item in `self`.
  func forEachIndexGroup<G: Collection>(
    by other: G,
    body: (G.Index, Self.SubSequence, Int) throws -> Void
  ) rethrows
  where G.Index == Self.Index
  {
    guard !other.isEmpty else {
      // Nothing to group by; `self` is expected to be empty as well.
      assert(self.isEmpty)
      return
    }

    var groupStart = self.startIndex
    var ordinal = 0
    var boundary = other.startIndex
    while boundary != other.endIndex {
      let nextBoundary = other.index(after: boundary)

      // Advance through `self` until reaching the next boundary in `other`
      // or running out of elements.
      var groupEnd = groupStart
      while groupEnd < nextBoundary && groupEnd < self.endIndex {
        groupEnd = self.index(after: groupEnd)
      }

      try body(boundary, self[groupStart ..< groupEnd], ordinal)

      groupStart = groupEnd
      ordinal += 1
      boundary = nextBoundary
    }
  }
}

extension String {
  /// Builds a lookup table from every valid index of this string (across its
  /// UTF-8, UTF-16 and character views) to the Unicode scalar boundary at or
  /// before it, paired with that scalar's zero-based offset.
  ///
  /// `endIndex` maps to itself with an offset equal to the scalar count.
  public func scalarMap() -> [Index: (index: Index, offset: Int)] {
    var table: [Index: (index: Index, offset: Int)] = [:]

    // Records, for each index in `view`, the scalar that contains it.
    func record<View: Collection>(_ view: View)
    where View.Index == String.Index {
      view.forEachIndexGroup(by: unicodeScalars) { boundary, group, ordinal in
        for position in group.indices {
          table[position] = (boundary, ordinal)
        }
      }
    }

    record(utf8)
    record(utf16)
    record(self)
    table[endIndex] = (endIndex, unicodeScalars.count)
    return table
  }

  /// Builds a lookup table from every valid index of this string (across its
  /// UTF-8, UTF-16 and Unicode scalar views) to the character boundary at or
  /// before it, paired with that character's zero-based offset.
  ///
  /// `endIndex` maps to itself with an offset equal to the character count.
  public func characterMap() -> [Index: (index: Index, offset: Int)] {
    var table: [Index: (index: Index, offset: Int)] = [:]

    // Records, for each index in `view`, the character that contains it.
    func record<View: Collection>(_ view: View)
    where View.Index == String.Index {
      view.forEachIndexGroup(by: self) { boundary, group, ordinal in
        for position in group.indices {
          table[position] = (boundary, ordinal)
        }
      }
    }

    record(utf8)
    record(utf16)
    record(unicodeScalars)
    table[endIndex] = (endIndex, count)
    return table
  }
}

1 change: 1 addition & 0 deletions stdlib/public/core/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,7 @@ set(SWIFTLIB_ESSENTIAL
StringProtocol.swift
StringIndex.swift
StringIndexConversions.swift
StringIndexValidation.swift
StringInterpolation.swift
StringLegacy.swift
StringNormalization.swift
Expand Down
1 change: 1 addition & 0 deletions stdlib/public/core/GroupInfo.json
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
"StringHashable.swift",
"StringIndex.swift",
"StringIndexConversions.swift",
"StringIndexValidation.swift",
"StringInterpolation.swift",
"StringLegacy.swift",
"StringNormalization.swift",
Expand Down
11 changes: 11 additions & 0 deletions stdlib/public/core/Range.swift
Original file line number Diff line number Diff line change
Expand Up @@ -1027,3 +1027,14 @@ extension PartialRangeUpTo: Sendable where Bound: Sendable { }
extension PartialRangeThrough: Sendable where Bound: Sendable { }
extension PartialRangeFrom: Sendable where Bound: Sendable { }
extension PartialRangeFrom.Iterator: Sendable where Bound: Sendable { }

// Helper available only on ranges whose bounds are `String.Index` values.
extension Range where Bound == String.Index {
/// The `_encodedOffset` of the lower and upper bound, packaged as a
/// `Range<Int>`.
///
/// The result is created with `_uncheckedBounds`, so no ordering check is
/// performed on the two offsets here.
@_alwaysEmitIntoClient // Swift 5.7
internal var _encodedOffsetRange: Range<Int> {
// Both bounds must agree on at least one encoding: either both report
// `_canBeUTF8` or both report `_canBeUTF16`.
// NOTE(review): presumably this guards against mixing offsets measured in
// different code-unit encodings -- confirm against StringIndex.swift.
_internalInvariant(
(lowerBound._canBeUTF8 && upperBound._canBeUTF8)
|| (lowerBound._canBeUTF16 && upperBound._canBeUTF16))
return Range<Int>(
_uncheckedBounds: (lowerBound._encodedOffset, upperBound._encodedOffset))
}
}
19 changes: 12 additions & 7 deletions stdlib/public/core/Slice.swift
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,11 @@ public struct Slice<Base: Collection> {
public var base: Base {
return _base
}

/// The slice's stored `_startIndex` and `_endIndex` expressed as a range.
///
/// Built with `_uncheckedBounds`, so the two indices are not compared with
/// each other when the range is formed.
@_alwaysEmitIntoClient @inline(__always)
internal var _bounds: Range<Base.Index> {
Range(_uncheckedBounds: (_startIndex, _endIndex))
}
}

extension Slice: Collection {
Expand All @@ -157,21 +162,21 @@ extension Slice: Collection {
@inlinable // generic-performance
public subscript(index: Index) -> Base.Element {
get {
_failEarlyRangeCheck(index, bounds: startIndex..<endIndex)
_failEarlyRangeCheck(index, bounds: _bounds)
return _base[index]
}
}

@inlinable // generic-performance
public subscript(bounds: Range<Index>) -> Slice<Base> {
get {
_failEarlyRangeCheck(bounds, bounds: startIndex..<endIndex)
_failEarlyRangeCheck(bounds, bounds: _bounds)
return Slice(base: _base, bounds: bounds)
}
}

public var indices: Indices {
return _base.indices[_startIndex..<_endIndex]
public var indices: Indices {
return _base.indices[_bounds]
}

@inlinable // generic-performance
Expand Down Expand Up @@ -264,11 +269,11 @@ extension Slice: MutableCollection where Base: MutableCollection {
@inlinable // generic-performance
public subscript(index: Index) -> Base.Element {
get {
_failEarlyRangeCheck(index, bounds: startIndex..<endIndex)
_failEarlyRangeCheck(index, bounds: _bounds)
return _base[index]
}
set {
_failEarlyRangeCheck(index, bounds: startIndex..<endIndex)
_failEarlyRangeCheck(index, bounds: _bounds)
_base[index] = newValue
// MutableSlice requires that the underlying collection's subscript
// setter does not invalidate indices, so our `startIndex` and `endIndex`
Expand All @@ -279,7 +284,7 @@ extension Slice: MutableCollection where Base: MutableCollection {
@inlinable // generic-performance
public subscript(bounds: Range<Index>) -> Slice<Base> {
get {
_failEarlyRangeCheck(bounds, bounds: startIndex..<endIndex)
_failEarlyRangeCheck(bounds, bounds: _bounds)
return Slice(base: _base, bounds: bounds)
}
set {
Expand Down
Loading