Skip to content

Commit 251a60a

Browse files
author
Dave Abrahams
committed
More WIP
1 parent 6141fa7 commit 251a60a

File tree

3 files changed

+79
-29
lines changed

3 files changed

+79
-29
lines changed

stdlib/public/core/Unicode2.swift

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -39,6 +39,8 @@ case emptyInput
3939
}
4040
}
4141

42+
// FIXME: closure-taking methods should rethrow.
43+
4244
public protocol AnyUnicodeEncoding {
4345
// FIXME: a single scalar might not be the most efficient buffer to use here.
4446
// SIMD instructions can be used to decode UTF-8 much more efficiently, which

stdlib/public/core/UnicodeStorage.swift

Lines changed: 75 additions & 29 deletions
Original file line number | Diff line number | Diff line change
@@ -25,6 +25,15 @@ public func _debugLog(_ arg0: @autoclosure ()->Any, _ arg1: @autoclosure ()->Any
2525
/// An index type for views onto random access collections whose elements are
2626
/// effectively variable-width.
2727
public protocol UnicodeIndexProtocol {
28+
29+
// FIXME: it's not clear that there is always enough information to construct
30+
// these from just an offset without also using the Collection into which they
31+
// are indexing (e.g. when an index caches information). If not, this
32+
// requirement would need to be replaced by a requirement on the collection.
33+
// In all such scenarios we've found *so far*, indices can have an empty cache
34+
// that will be filled on demand without loss of efficiency.
35+
init(codeUnitOffset: Int64)
36+
2837
var codeUnitOffset: Int64 { get }
2938
}
3039

@@ -90,6 +99,20 @@ extension UnicodeStorage.EncodedScalars {
9099
// practice.
91100
let nextStride: UInt8
92101

102+
public init(codeUnitOffset: Int64) {
103+
self.init(offset: numericCast(codeUnitOffset))
104+
}
105+
106+
internal init(
107+
offset: CodeUnits.IndexDistance,
108+
nextStride: UInt8 = 0,
109+
scalar: Encoding.EncodedScalar? = nil
110+
) {
111+
self.offset = offset
112+
self.nextStride = nextStride
113+
self.scalar = scalar
114+
}
115+
93116
public var codeUnitOffset: Int64 { return numericCast(offset) }
94117

95118
var nextOffset: CodeUnits.IndexDistance {
@@ -100,74 +123,86 @@ extension UnicodeStorage.EncodedScalars {
100123
// EncodedScalar so as not to waste a separate bool here.
101124
let scalar: Encoding.EncodedScalar?
102125
}
126+
127+
internal func _base(_ i: Index) -> CodeUnits.Index {
128+
return codeUnits.index(atOffset: i.offset)
129+
}
130+
131+
internal func _next(_ i: Index) -> CodeUnits.Index {
132+
return codeUnits.index(atOffset: i.nextOffset)
133+
}
134+
135+
internal func _index(
136+
base: CodeUnits.Index, next: CodeUnits.Index, scalar: Encoding.EncodedScalar?
137+
) -> Index {
138+
return Index(
139+
offset: codeUnits.offset(of: base),
140+
nextStride: numericCast(codeUnits[base..<next].count),
141+
scalar: scalar
142+
)
143+
}
103144
}
104145

105146
/// Collection Conformance
106147
extension UnicodeStorage.EncodedScalars : BidirectionalCollection {
107148
public var startIndex: Index {
108149
if _slowPath(codeUnits.isEmpty) { return endIndex }
109-
return index(after: Index(offset: 0, nextStride: 0, scalar: nil))
150+
let s = codeUnits.startIndex
151+
return index(after: _index(base: s, next: s, scalar: nil))
110152
}
111153

112154
public var endIndex: Index {
113-
return Index(offset: codeUnits.count, nextStride: 0, scalar: nil)
155+
let s = codeUnits.endIndex
156+
return _index(base: s, next: s, scalar: nil)
114157
}
115158

116159
public subscript(i: Index) -> Encoding.EncodedScalar {
117160
if let r = i.scalar {
118161
return r
119162
}
120163
return index(after:
121-
Index(offset: i.offset, nextStride: 0, scalar: nil)).scalar!
164+
_index(base: _base(i), next: _next(i), scalar: nil)
165+
).scalar!
122166
}
123167

124168
public func index(after i: Index) -> Index {
125-
let p = codeUnits.index(atOffset: i.nextOffset)
126-
var remainder = codeUnits[p...]
169+
var remainder = codeUnits[_next(i)..<codeUnits.endIndex]
127170
while true {
128171
switch Encoding.parse1Forward(remainder, knownCount: 0) {
129172
case .valid(let scalar, let nextIndex):
130-
return Index(
131-
offset: i.nextOffset,
132-
nextStride: numericCast(remainder.offset(of: nextIndex)),
133-
scalar: scalar)
173+
return _index(base: _next(i), next: nextIndex, scalar: scalar)
134174
case .error(let nextIndex):
135175
// FIXME: don't go through UnicodeScalar once this is in the stdlib
136176
if let replacement = Encoding.encode(
137177
UTF32.EncodedScalar(UnicodeScalar(0xFFFD)!)) {
138-
return Index(
139-
offset: i.nextOffset,
140-
nextStride: numericCast(remainder.offset(of: nextIndex)),
141-
scalar: replacement)
178+
return _index(base: _next(i), next: nextIndex, scalar: replacement)
142179
}
143-
remainder = remainder.dropFirst()
180+
// If we get here, the encoding couldn't represent a replacement
181+
// character, so the best we can do is to drop that scalar on the floor
182+
// and keep going.
183+
remainder = codeUnits[nextIndex...]
144184
case .emptyInput:
145185
return endIndex
146186
}
147187
}
148188
}
149189

150190
public func index(before i: Index) -> Index {
151-
var remainder = codeUnits[..<codeUnits.index(atOffset: i.offset)]
191+
var remainder = codeUnits[..<_base(i)]
152192
while true {
153193
switch Encoding.parse1Reverse(remainder, knownCount: 0) {
154194
case .valid(let scalar, let priorIndex):
155-
let stride = remainder[priorIndex...].count
156-
return Index(
157-
offset: i.offset - numericCast(stride),
158-
nextStride: numericCast(stride),
159-
scalar: scalar)
195+
return _index(base: priorIndex, next: _base(i), scalar: scalar)
160196
case .error(let priorIndex):
161-
let stride = remainder[priorIndex...].count
162197
// FIXME: don't go through UnicodeScalar once this is in the stdlib
163198
if let replacement = Encoding.encode(
164199
UTF32.EncodedScalar(UnicodeScalar(0xFFFD)!)) {
165-
return Index(
166-
offset: i.offset - numericCast(stride),
167-
nextStride: numericCast(stride),
168-
scalar: replacement)
200+
return _index(base: priorIndex, next: _base(i), scalar: replacement)
169201
}
170-
remainder = remainder.dropLast()
202+
// If we get here, the encoding couldn't represent a replacement
203+
// character, so the best we can do is to drop that scalar on the floor
204+
// and keep going.
205+
remainder = codeUnits[..<priorIndex]
171206
case .emptyInput:
172207
fatalError("Indexing past start of code units")
173208
}
@@ -247,6 +282,7 @@ extension UnicodeStorage : _UTextable {
247282
_ deep: Bool, _ status: UnsafeMutablePointer<_UErrorCode>?
248283
) -> UnsafeMutablePointer<_UText> {
249284
UnsafeMutablePointer(mutating: src)[0].validate()
285+
_sanityCheck(!deep, "deep cloning not supported")
250286
// _debugLog("_clone with dst = \(String(describing: dst))")
251287
// _debugLog("src: \(src[0])")
252288
let r = dst
@@ -258,6 +294,14 @@ extension UnicodeStorage : _UTextable {
258294
return r
259295
}
260296

297+
// A helper for translating indices out of the result of _parsedSuffix
298+
internal var _indexBase
299+
: UnicodeStorage<CodeUnits.SubSequence, Encoding>.EncodedScalars {
300+
return UnicodeStorage<
301+
CodeUnits.SubSequence, Encoding
302+
>(codeUnits[...]).scalars
303+
}
304+
261305
internal func _access(
262306
_ u: inout _UText, _ nativeTargetIndex: Int64, _ forward: Bool
263307
) -> Bool {
@@ -280,6 +324,8 @@ extension UnicodeStorage : _UTextable {
280324
return true
281325
}
282326
// _debugLog("_access: filling buffer")
327+
328+
// FIXME: should we use parseForward/parseReverse on some slice?
283329

284330
guard (0...codeUnits.count^).contains(nativeTargetIndex)
285331
else { return false }
@@ -301,7 +347,7 @@ extension UnicodeStorage : _UTextable {
301347
buffer[u.chunkLength^] = unit
302348
u.chunkLength += 1
303349
}
304-
u.chunkNativeLimit = i.nextOffset^
350+
u.chunkNativeLimit = codeUnits.offset(of: _indexBase._next(i))^
305351
}
306352
}
307353
else {
@@ -320,7 +366,7 @@ extension UnicodeStorage : _UTextable {
320366
buffer[u.chunkLength^] = unit
321367
u.chunkLength += 1
322368
}
323-
u.chunkNativeStart = i.codeUnitOffset
369+
u.chunkNativeStart = codeUnits.offset(of: _indexBase._base(i))^
324370
u.chunkOffset = u.chunkLength
325371
}
326372
var b = buffer // copy due to https://bugs.swift.org/browse/SR-3782
@@ -382,7 +428,7 @@ extension UnicodeStorage : _UTextable {
382428
for i in chunkSource.indices {
383429
chunkOffset += chunkSource[i].utf16.count
384430
if chunkOffset == u[0].chunkOffset^ {
385-
return i.nextOffset^
431+
return codeUnits.offset(of: _indexBase._next(i))^
386432
}
387433
}
388434
fatalError("supposed to be unreachable")

test/Prototypes/Unicode.swift

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -123,6 +123,8 @@ t.test("basic") {
123123
let s16to8 = UnicodeStorage.TranscodedView(s16, from: UTF16.self, to: UTF8.self)
124124
let s8to16 = UnicodeStorage.TranscodedView(s8, from: UTF8.self, to: UTF16.self)
125125
let s8Vto16 = UnicodeStorage.TranscodedView(s8, from: ValidUTF8.self, to: UTF16.self)
126+
print(Array(s32))
127+
print(Array(s16to32))
126128
expectTrue(s32.elementsEqual(s16to32))
127129
expectTrue(s8.elementsEqual(s16to8))
128130
expectTrue(s16.elementsEqual(s8to16))

0 commit comments

Comments
 (0)