Skip to content

Commit 6f469b2

Browse files
committed
[String.Index] Deprecate encodedOffset var/init
String.Index has an encodedOffset-based initializer and computed property that exist for serialization purposes. It was documented as UTF-16 in the SE proposal introducing it, which was String's underlying encoding at the time, but the dream of String even then was to abstract away whatever encoding happened to be used. Serialization needs an explicit encoding for serialized indices to make sense: the offsets need to align with the view. With String utilizing UTF-8 encoding for native contents in Swift 5, serialization isn't necessarily the most efficient in UTF-16. Furthermore, the majority of usage of encodedOffset in the wild is buggy and operates under the assumption that a UTF-16 code unit is a Swift Character, which isn't even valid if the String is known to be all-ASCII (because CR-LF is a single Character). This change introduces a pair of semantics-preserving alternatives to encodedOffset that explicitly call out the UTF-16 assumption. These serve as a gentle off-ramp for current misuses of encodedOffset.
1 parent 817dff3 commit 6f469b2

23 files changed

+223
-111
lines changed

stdlib/public/SDK/Foundation/URLComponents.swift

Lines changed: 1 addition & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -192,16 +192,8 @@ public struct URLComponents : ReferenceConvertible, Hashable, Equatable, _Mutabl
192192

193193
@available(macOS 10.11, iOS 9.0, *)
194194
private func _toStringRange(_ r : NSRange) -> Range<String.Index>? {
195-
guard r.location != NSNotFound else { return nil }
196-
197-
let utf16Start = String.UTF16View.Index(encodedOffset: r.location)
198-
let utf16End = String.UTF16View.Index(encodedOffset: r.location + r.length)
199-
200195
guard let s = self.string else { return nil }
201-
guard let start = String.Index(utf16Start, within: s) else { return nil }
202-
guard let end = String.Index(utf16End, within: s) else { return nil }
203-
204-
return start..<end
196+
return Range(r, in: s)
205197
}
206198

207199
/// Returns the character range of the scheme in the string returned by `var string`.

stdlib/public/SDK/NaturalLanguage/NLTagger.swift

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,15 +18,15 @@ extension NLTagger {
1818
@nonobjc
1919
public func tokenRange(at index: String.Index, unit: NLTokenUnit) -> Range<String.Index> {
2020
let str = self.string ?? ""
21-
let characterIndex = index.encodedOffset
21+
let characterIndex = index.utf16Offset(in: str)
2222
let nsrange = self.__tokenRange(at: characterIndex, unit: unit)
2323
return Range(nsrange, in: str)!
2424
}
2525

2626
@nonobjc
2727
public func tag(at index: String.Index, unit: NLTokenUnit, scheme: NLTagScheme) -> (NLTag?, Range<String.Index>) {
2828
let str = self.string ?? ""
29-
let characterIndex = index.encodedOffset
29+
let characterIndex = index.utf16Offset(in: str)
3030
let rangePointer = NSRangePointer.allocate(capacity: 1)
3131
rangePointer.initialize(to: NSMakeRange(0, 0))
3232
let tag = self.__tag(at: characterIndex, unit: unit, scheme: scheme, tokenRange: rangePointer)

stdlib/public/SDK/NaturalLanguage/NLTokenizer.swift

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ extension NLTokenizer {
1818
@nonobjc
1919
public func tokenRange(at index: String.Index) -> Range<String.Index> {
2020
let str = self.string ?? ""
21-
let characterIndex = index.encodedOffset
21+
let characterIndex = index.utf16Offset(in: str)
2222
let nsrange = self.__tokenRange(at:characterIndex)
2323
return Range(nsrange, in: str)!
2424
}

stdlib/public/core/String.swift

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -923,8 +923,8 @@ internal func _fastWithNormalizedCodeUnitsImpl(
923923
var icuInputBuffer = icuInputBuffer
924924
var icuOutputBuffer = icuOutputBuffer
925925

926-
var index = String.Index(encodedOffset: 0)
927-
let cachedEndIndex = String.Index(encodedOffset: sourceBuffer.count)
926+
var index = String.Index(_encodedOffset: 0)
927+
let cachedEndIndex = String.Index(_encodedOffset: sourceBuffer.count)
928928

929929
var hasBufferOwnership = false
930930

stdlib/public/core/StringBreadcrumbs.swift

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -79,8 +79,8 @@ extension _StringBreadcrumbs {
7979
internal func getBreadcrumb(
8080
forIndex idx: String.Index
8181
) -> (lowerBound: String.Index, offset: Int) {
82-
var lowerBound = idx.encodedOffset / 3 / stride
83-
var upperBound = Swift.min(1 + (idx.encodedOffset / stride), crumbs.count)
82+
var lowerBound = idx._encodedOffset / 3 / stride
83+
var upperBound = Swift.min(1 + (idx._encodedOffset / stride), crumbs.count)
8484
_internalInvariant(crumbs[lowerBound] <= idx)
8585
_internalInvariant(upperBound == crumbs.count || crumbs[upperBound] >= idx)
8686

stdlib/public/core/StringCharacterView.swift

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -66,9 +66,9 @@ extension String: BidirectionalCollection {
6666

6767
// TODO: known-ASCII fast path, single-scalar-grapheme fast path, etc.
6868
let stride = _characterStride(startingAt: i)
69-
let nextOffset = i.encodedOffset &+ stride
69+
let nextOffset = i._encodedOffset &+ stride
7070
let nextStride = _characterStride(
71-
startingAt: Index(encodedOffset: nextOffset))
71+
startingAt: Index(_encodedOffset: nextOffset))
7272

7373
return Index(
7474
encodedOffset: nextOffset, characterStride: nextStride)
@@ -84,7 +84,7 @@ extension String: BidirectionalCollection {
8484

8585
// TODO: known-ASCII fast path, single-scalar-grapheme fast path, etc.
8686
let stride = _characterStride(endingAt: i)
87-
let priorOffset = i.encodedOffset &- stride
87+
let priorOffset = i._encodedOffset &- stride
8888
return Index(encodedOffset: priorOffset, characterStride: stride)
8989
}
9090
/// Returns an index that is the specified distance from the given index.
@@ -198,7 +198,7 @@ extension String: BidirectionalCollection {
198198
let i = _guts.scalarAlign(i)
199199
let distance = _characterStride(startingAt: i)
200200
return _guts.errorCorrectedCharacter(
201-
startingAt: i.encodedOffset, endingAt: i.encodedOffset &+ distance)
201+
startingAt: i._encodedOffset, endingAt: i._encodedOffset &+ distance)
202202
}
203203
}
204204

@@ -209,14 +209,14 @@ extension String: BidirectionalCollection {
209209

210210
if i == endIndex { return 0 }
211211

212-
return _guts._opaqueCharacterStride(startingAt: i.encodedOffset)
212+
return _guts._opaqueCharacterStride(startingAt: i._encodedOffset)
213213
}
214214

215215
@inlinable @inline(__always)
216216
internal func _characterStride(endingAt i: Index) -> Int {
217217
if i == startIndex { return 0 }
218218

219-
return _guts._opaqueCharacterStride(endingAt: i.encodedOffset)
219+
return _guts._opaqueCharacterStride(endingAt: i._encodedOffset)
220220
}
221221
}
222222

stdlib/public/core/StringComparison.swift

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -328,10 +328,10 @@ extension _StringGutsSlice {
328328
if _fastPath(self.isFastUTF8 && other.isFastUTF8) {
329329
return self.withFastUTF8 { leftUTF8 in
330330
other.withFastUTF8 { rightUTF8 in
331-
let leftStartIndex = String.Index(encodedOffset: 0)
332-
let rightStartIndex = String.Index(encodedOffset: 0)
333-
let leftEndIndex = String.Index(encodedOffset: leftUTF8.count)
334-
let rightEndIndex = String.Index(encodedOffset: rightUTF8.count)
331+
let leftStartIndex = String.Index(_encodedOffset: 0)
332+
let rightStartIndex = String.Index(_encodedOffset: 0)
333+
let leftEndIndex = String.Index(_encodedOffset: leftUTF8.count)
334+
let rightEndIndex = String.Index(_encodedOffset: rightUTF8.count)
335335
return _normalizedCompareImpl(
336336
left_outputBuffer: _castOutputBuffer(&left_output),
337337
left_icuInputBuffer: _castOutputBuffer(&left_icuInput),

stdlib/public/core/StringGraphemeBreaking.swift

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -156,7 +156,7 @@ extension _StringGuts {
156156
internal func isOnGraphemeClusterBoundary(_ i: String.Index) -> Bool {
157157
guard i.transcodedOffset == 0 else { return false }
158158

159-
let offset = i.encodedOffset
159+
let offset = i._encodedOffset
160160
if offset == 0 || offset == self.count { return true }
161161

162162
guard isOnUnicodeScalarBoundary(i) else { return false }
@@ -197,7 +197,7 @@ extension _StringGuts {
197197
let count = _object.largeCount
198198
let cocoa = _object.cocoaObject
199199

200-
let startIdx = String.Index(encodedOffset: i)
200+
let startIdx = String.Index(_encodedOffset: i)
201201
let (sc1, len) = foreignErrorCorrectedScalar(startingAt: startIdx)
202202
if i &+ len == count {
203203
// Last scalar is last grapheme
@@ -263,7 +263,7 @@ extension _StringGuts {
263263
let count = _object.largeCount
264264
let cocoa = _object.cocoaObject
265265

266-
let endIdx = String.Index(encodedOffset: i)
266+
let endIdx = String.Index(_encodedOffset: i)
267267
let (sc2, len) = foreignErrorCorrectedScalar(endingAt: endIdx)
268268
if i &- len == 0 {
269269
// First scalar is first grapheme

stdlib/public/core/StringGuts.swift

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -274,11 +274,11 @@ extension _StringGuts {
274274

275275
@inlinable
276276
internal var startIndex: String.Index {
277-
@inline(__always) get { return Index(encodedOffset: 0) }
277+
@inline(__always) get { return Index(_encodedOffset: 0) }
278278
}
279279
@inlinable
280280
internal var endIndex: String.Index {
281-
@inline(__always) get { return Index(encodedOffset: self.count) }
281+
@inline(__always) get { return Index(_encodedOffset: self.count) }
282282
}
283283
}
284284

stdlib/public/core/StringGutsRangeReplaceable.swift

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -218,8 +218,8 @@ extension _StringGuts {
218218
}
219219

220220
internal mutating func remove(from lower: Index, to upper: Index) {
221-
let lowerOffset = lower.encodedOffset
222-
let upperOffset = upper.encodedOffset
221+
let lowerOffset = lower._encodedOffset
222+
let upperOffset = upper._encodedOffset
223223
_internalInvariant(lower.transcodedOffset == 0 && upper.transcodedOffset == 0)
224224
_internalInvariant(lowerOffset <= upperOffset && upperOffset <= self.count)
225225

@@ -279,16 +279,16 @@ extension _StringGuts {
279279
isASCII: Bool
280280
) {
281281
let neededCapacity =
282-
bounds.lowerBound.encodedOffset
283-
+ codeUnits.count + (self.count - bounds.upperBound.encodedOffset)
282+
bounds.lowerBound._encodedOffset
283+
+ codeUnits.count + (self.count - bounds.upperBound._encodedOffset)
284284
reserveCapacity(neededCapacity)
285285

286286
_internalInvariant(bounds.lowerBound.transcodedOffset == 0)
287287
_internalInvariant(bounds.upperBound.transcodedOffset == 0)
288288

289289
_object.nativeStorage.replace(
290-
from: bounds.lowerBound.encodedOffset,
291-
to: bounds.upperBound.encodedOffset,
290+
from: bounds.lowerBound._encodedOffset,
291+
to: bounds.upperBound._encodedOffset,
292292
with: codeUnits)
293293
self = _StringGuts(_object.nativeStorage)
294294
}
@@ -300,16 +300,16 @@ extension _StringGuts {
300300
let replCount = codeUnits.count
301301

302302
let neededCapacity =
303-
bounds.lowerBound.encodedOffset
304-
+ replCount + (self.count - bounds.upperBound.encodedOffset)
303+
bounds.lowerBound._encodedOffset
304+
+ replCount + (self.count - bounds.upperBound._encodedOffset)
305305
reserveCapacity(neededCapacity)
306306

307307
_internalInvariant(bounds.lowerBound.transcodedOffset == 0)
308308
_internalInvariant(bounds.upperBound.transcodedOffset == 0)
309309

310310
_object.nativeStorage.replace(
311-
from: bounds.lowerBound.encodedOffset,
312-
to: bounds.upperBound.encodedOffset,
311+
from: bounds.lowerBound._encodedOffset,
312+
to: bounds.upperBound._encodedOffset,
313313
with: codeUnits,
314314
replacementCount: replCount)
315315
self = _StringGuts(_object.nativeStorage)

stdlib/public/core/StringGutsSlice.swift

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -74,8 +74,8 @@ internal struct _StringGutsSlice {
7474
@inlinable
7575
internal var range: Range<String.Index> {
7676
@inline(__always) get {
77-
return String.Index(encodedOffset: _offsetRange.lowerBound)
78-
..< String.Index(encodedOffset: _offsetRange.upperBound)
77+
return String.Index(_encodedOffset: _offsetRange.lowerBound)
78+
..< String.Index(_encodedOffset: _offsetRange.upperBound)
7979
}
8080
}
8181

stdlib/public/core/StringIndex.swift

Lines changed: 47 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -62,10 +62,22 @@ extension String.Index {
6262
@inline(__always) get { return orderingValue == 0 }
6363
}
6464

65+
/// The UTF-16 code unit offset corresponding to this index in `s`.
66+
public func utf16Offset<S: StringProtocol>(in s: S) -> Int {
67+
return s.utf16.distance(from: s.utf16.startIndex, to: self)
68+
}
69+
6570
/// The offset into a string's code units for this index.
71+
@available(swift, deprecated: 4.2, message: """
72+
encodedOffset has been deprecated as most common usage is incorrect. \
73+
Use utf16Offset(in:) to achieve the same behavior.
74+
""")
6675
@inlinable
67-
public var encodedOffset: Int {
68-
@inline(__always) get { return Int(truncatingIfNeeded: _rawBits &>> 16) }
76+
public var encodedOffset: Int { return _encodedOffset }
77+
78+
@inlinable @inline(__always)
79+
internal var _encodedOffset: Int {
80+
return Int(truncatingIfNeeded: _rawBits &>> 16)
6981
}
7082

7183
@inlinable
@@ -91,12 +103,35 @@ extension String.Index {
91103
self.init((pos &<< 16) | (trans &<< 14))
92104
}
93105

106+
/// Creates a new index at the specified UTF-16 code unit offset in `s`.
107+
///
108+
/// - Parameter offset: An offset in UTF-16 code units.
109+
public init<S: StringProtocol>(utf16Offset offset: Int, in s: S) {
110+
let (start, end) = (s.utf16.startIndex, s.utf16.endIndex)
111+
guard offset >= 0,
112+
let idx = s.utf16.index(start, offsetBy: offset, limitedBy: end)
113+
else {
114+
self = end.nextEncoded
115+
return
116+
}
117+
self = idx
118+
}
119+
94120
/// Creates a new index at the specified code unit offset.
95121
///
96122
/// - Parameter offset: An offset in code units.
123+
@available(swift, deprecated: 4.2, message: """
124+
encodedOffset has been deprecated as most common usage is incorrect. \
125+
Use String.Index(utf16Offset:in:) to achieve the same behavior.
126+
""")
127+
@inlinable
128+
public init(encodedOffset offset: Int) {
129+
self.init(_encodedOffset: offset)
130+
}
131+
97132
@inlinable @inline(__always)
98-
public init(encodedOffset: Int) {
99-
self.init(encodedOffset: encodedOffset, transcodedOffset: 0)
133+
internal init(_encodedOffset offset: Int) {
134+
self.init(encodedOffset: offset, transcodedOffset: 0)
100135
}
101136

102137
@usableFromInline
@@ -121,7 +156,7 @@ extension String.Index {
121156
#else
122157
@usableFromInline @inline(never) @_effects(releasenone)
123158
internal func _invariantCheck() {
124-
_internalInvariant(encodedOffset >= 0)
159+
_internalInvariant(_encodedOffset >= 0)
125160
}
126161
#endif // INTERNAL_CHECKS_ENABLED
127162
}
@@ -132,31 +167,31 @@ extension String.Index {
132167
@inlinable
133168
internal var strippingTranscoding: String.Index {
134169
@inline(__always) get {
135-
return String.Index(encodedOffset: self.encodedOffset)
170+
return String.Index(_encodedOffset: self._encodedOffset)
136171
}
137172
}
138173

139174
@inlinable
140175
internal var nextEncoded: String.Index {
141176
@inline(__always) get {
142177
_internalInvariant(self.transcodedOffset == 0)
143-
return String.Index(encodedOffset: self.encodedOffset &+ 1)
178+
return String.Index(_encodedOffset: self._encodedOffset &+ 1)
144179
}
145180
}
146181

147182
@inlinable
148183
internal var priorEncoded: String.Index {
149184
@inline(__always) get {
150185
_internalInvariant(self.transcodedOffset == 0)
151-
return String.Index(encodedOffset: self.encodedOffset &- 1)
186+
return String.Index(_encodedOffset: self._encodedOffset &- 1)
152187
}
153188
}
154189

155190
@inlinable
156191
internal var nextTranscoded: String.Index {
157192
@inline(__always) get {
158193
return String.Index(
159-
encodedOffset: self.encodedOffset,
194+
encodedOffset: self._encodedOffset,
160195
transcodedOffset: self.transcodedOffset &+ 1)
161196
}
162197
}
@@ -165,7 +200,7 @@ extension String.Index {
165200
internal var priorTranscoded: String.Index {
166201
@inline(__always) get {
167202
return String.Index(
168-
encodedOffset: self.encodedOffset,
203+
encodedOffset: self._encodedOffset,
169204
transcodedOffset: self.transcodedOffset &- 1)
170205
}
171206
}
@@ -174,13 +209,13 @@ extension String.Index {
174209
// Note: strips any transcoded offset.
175210
@inlinable @inline(__always)
176211
internal func encoded(offsetBy n: Int) -> String.Index {
177-
return String.Index(encodedOffset: self.encodedOffset &+ n)
212+
return String.Index(_encodedOffset: self._encodedOffset &+ n)
178213
}
179214

180215
@inlinable @inline(__always)
181216
internal func transcoded(withOffset n: Int) -> String.Index {
182217
_internalInvariant(self.transcodedOffset == 0)
183-
return String.Index(encodedOffset: self.encodedOffset, transcodedOffset: n)
218+
return String.Index(encodedOffset: self._encodedOffset, transcodedOffset: n)
184219
}
185220

186221
}

0 commit comments

Comments
 (0)