Skip to content

Commit 05c5564

Browse files
committed
wip: closer to proposal
1 parent ca47be5 commit 05c5564

File tree

7 files changed

+482
-636
lines changed

7 files changed

+482
-636
lines changed

stdlib/public/core/UTF8EncodingError.swift

Lines changed: 0 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -99,7 +99,6 @@ extension Unicode.UTF8 {
9999
@available(SwiftStdlib 6.1, *)
100100
@frozen
101101
public struct EncodingError: Error, Sendable, Hashable
102-
// TODO: embedded? , Codable
103102
{
104103
/// The kind of encoding error
105104
public var kind: Unicode.UTF8.EncodingError.Kind
@@ -129,7 +128,6 @@ extension UTF8.EncodingError {
129128
/// The kind of encoding error encountered during validation
130129
@frozen
131130
public struct Kind: Error, Sendable, Hashable, RawRepresentable
132-
// FIXME: error unavailable in embedded swift, Codable
133131
{
134132
public var rawValue: UInt8
135133

@@ -199,52 +197,3 @@ extension UTF8.EncodingError: CustomStringConvertible {
199197
"UTF8.EncodingError(\(kind), \(range))"
200198
}
201199
}
202-
203-
@available(SwiftStdlib 6.1, *)
204-
extension UTF8 {
205-
public // For demo purposes
206-
static func _checkAllErrors(
207-
_ s: some Sequence<UInt8>
208-
) -> some Sequence<UTF8.EncodingError> {
209-
// TODO: Span fast path
210-
// TODO: Fixed size buffer for non-contig inputs
211-
// TODO: Lifetime-dependent result variant
212-
let cus = Array(s)
213-
return cus.withUnsafeBytes {
214-
var bufPtr = $0
215-
var start = 0
216-
var errors: Array<UTF8.EncodingError> = []
217-
218-
// Remember the previous error, so that we can
219-
// apply it to subsequent bytes instead of reporting
220-
// just `.unexpectedContinuation`.
221-
var priorError: UTF8.EncodingError? = nil
222-
while true {
223-
do throws(UTF8.EncodingError) {
224-
_ = try bufPtr.baseAddress!._validateUTF8(limitedBy: bufPtr.count)
225-
return errors
226-
} catch {
227-
let adjustedRange =
228-
error.range.lowerBound + start ..< error.range.upperBound + start
229-
230-
let kind: UTF8.EncodingError.Kind
231-
if let prior = priorError,
232-
prior.range.upperBound == adjustedRange.lowerBound,
233-
error.kind == .unexpectedContinuationByte
234-
{
235-
kind = prior.kind
236-
} else {
237-
kind = error.kind
238-
}
239-
let adjustedErr = UTF8.EncodingError(kind, adjustedRange)
240-
priorError = adjustedErr
241-
242-
let errEnd = error.range.upperBound
243-
start += errEnd
244-
bufPtr = .init(rebasing: bufPtr[errEnd...])
245-
errors.append(adjustedErr)
246-
}
247-
}
248-
}
249-
}
250-
}

stdlib/public/core/UTF8Span.swift

Lines changed: 73 additions & 83 deletions
Original file line numberDiff line numberDiff line change
@@ -5,35 +5,20 @@
55
@frozen
66
@available(SwiftStdlib 6.1, *)
77
public struct UTF8Span: Copyable, ~Escapable, BitwiseCopyable {
8-
public var unsafeBaseAddress: UnsafeRawPointer?
8+
@usableFromInline
9+
internal var _unsafeBaseAddress: UnsafeRawPointer?
910

1011
/*
1112
A bit-packed count and flags (such as isASCII)
1213

13-
╔═══════╦═════╦═════╦═════╦══════════╦═══════╗
14-
║ b63 ║ b62 ║ b61 ║ b60 ║ b59:56 ║ b55:0 ║
15-
╠═══════╬═════╬═════╬═════╬══════════╬═══════╣
16-
║ ASCII ║ NFC ║ SSC ║ NUL ║ reserved ║ count ║
17-
╚═══════╩═════╩═════╩═════╩══════════╩═══════╝
18-
19-
ASCII means the contents are all-ASCII (<=0x7F).
20-
NFC means contents are in normal form C for fast comparisons.
21-
SSC means single-scalar Characters (i.e. grapheme clusters): every
22-
`Character` holds only a single `Unicode.Scalar`.
23-
NUL means the contents are a null-terminated C string (that is,
24-
there is a guaranteed, borrowed NULL byte after the end of `count`).
25-
26-
TODO: NUL means both no-interior and null-terminator, so does this
27-
mean that String doesn't ever set it because we don't want to scan
28-
for interior nulls? I think this is the only viable option...
29-
30-
TODO: Contains-newline would be useful for Regex `.`
31-
32-
Question: Should we have null-termination support?
33-
A null-terminated UTF8Span has a NUL byte after its contents
34-
and contains no interior NULs. How would we ensure the
35-
NUL byte is exclusively borrowed by us?
14+
╔═══════╦═════╦══════════╦═══════╗
15+
║ b63 ║ b62 ║ b61:56 ║ b55:0 ║
16+
╠═══════╬═════╬══════════╬═══════╣
17+
║ ASCII ║ NFC ║ reserved ║ count ║
18+
╚═══════╩═════╩══════════╩═══════╝
3619

20+
ASCII means the contents are known to be all-ASCII (<=0x7F).
21+
NFC means contents are known to be in normal form C for fast comparisons.
3722
*/
3823
@usableFromInline
3924
internal var _countAndFlags: UInt64
@@ -45,7 +30,7 @@ public struct UTF8Span: Copyable, ~Escapable, BitwiseCopyable {
4530
_unsafeAssumingValidUTF8 start: borrowing UnsafeRawPointer,
4631
_countAndFlags: UInt64
4732
) {
48-
self.unsafeBaseAddress = copy start
33+
self._unsafeBaseAddress = copy start
4934
self._countAndFlags = _countAndFlags
5035

5136
_invariantCheck()
@@ -55,8 +40,11 @@ public struct UTF8Span: Copyable, ~Escapable, BitwiseCopyable {
5540
// at least check the count first
5641
@_alwaysEmitIntoClient
5742
internal func _start() -> UnsafeRawPointer {
58-
unsafeBaseAddress._unsafelyUnwrappedUnchecked
43+
_unsafeBaseAddress._unsafelyUnwrappedUnchecked
5944
}
45+
46+
// HACK: working around lack of internals
47+
internal var _str: String { _start()._str(0..<count) }
6048
}
6149

6250
// TODO: init strategy: underscored public that use lifetime annotations
@@ -65,13 +53,23 @@ public struct UTF8Span: Copyable, ~Escapable, BitwiseCopyable {
6553

6654
@available(SwiftStdlib 6.1, *)
6755
extension UTF8Span {
56+
/// Creates a UTF8Span containing `codeUnits`. Validates that the input is
57+
/// valid UTF-8, otherwise throws an error.
58+
///
59+
/// The resulting UTF8Span has the same lifetime constraints as `codeUnits`.
60+
public init(
61+
validating codeUnits: consuming Span<UInt8>
62+
) throws(UTF8.EncodingError) {
63+
try self.init(_validating: codeUnits)
64+
}
65+
6866
// TODO: this doesn't need to be underscored, I don't think
6967
@lifetime(codeUnits)
70-
public init(
68+
internal init(
7169
_validating codeUnits: consuming Span<UInt8>
7270
) throws(UTF8.EncodingError) {
7371
guard let ptr = codeUnits._pointer else {
74-
self.unsafeBaseAddress = nil
72+
self._unsafeBaseAddress = nil
7573
self._countAndFlags = 0
7674
return
7775
}
@@ -81,14 +79,37 @@ extension UTF8Span {
8179
let count = codeUnits._count
8280
let isASCII = try basePtr._validateUTF8(limitedBy: count)
8381

84-
self.unsafeBaseAddress = .init(basePtr)
82+
self._unsafeBaseAddress = .init(basePtr)
8583
self._countAndFlags = UInt64(truncatingIfNeeded: count)
8684
if isASCII {
8785
_setIsASCII()
8886
}
8987
_internalInvariant(self.count == codeUnits.count)
9088
}
9189

90+
// TODO: SPI?
91+
internal init(
92+
_uncheckedAssumingValidUTF8 codeUnits: consuming Span<UInt8>,
93+
isKnownASCII: Bool,
94+
isKnownNFC: Bool
95+
) {
96+
guard let ptr = codeUnits._pointer else {
97+
self._unsafeBaseAddress = nil
98+
self._countAndFlags = 0
99+
return
100+
}
101+
102+
self._unsafeBaseAddress = codeUnits._start()
103+
self._countAndFlags = UInt64(truncatingIfNeeded: codeUnits.count)
104+
if isKnownASCII {
105+
_setIsASCII()
106+
}
107+
if isKnownNFC {
108+
_setIsNFC()
109+
}
110+
_internalInvariant(self.count == codeUnits.count)
111+
}
112+
92113
// @_alwaysEmitIntoClient
93114
// public init<Owner: ~Copyable & ~Escapable>(
94115
// validatingUnsafe codeUnits: UnsafeBufferPointer<UInt8>,
@@ -161,30 +182,6 @@ extension UTF8Span {
161182
}
162183

163184

164-
// MARK: Canonical comparison
165-
166-
@_unavailableInEmbedded
167-
@available(SwiftStdlib 6.1, *)
168-
extension UTF8Span {
169-
// HACK: working around lack of internals
170-
internal var _str: String { _start()._str(0..<count) }
171-
172-
/// Whether `self` is equivalent to `other` under Unicode Canonical
173-
/// Equivalence.
174-
public func isCanonicallyEquivalent(
175-
to other: UTF8Span
176-
) -> Bool {
177-
self._str == other._str
178-
}
179-
180-
/// Whether `self` orders less than `other` under Unicode Canonical
181-
/// Equivalence using normalized code-unit order (in NFC).
182-
public func isCanonicallyLessThan(
183-
_ other: UTF8Span
184-
) -> Bool {
185-
self._str < other._str
186-
}
187-
}
188185

189186

190187

@@ -206,7 +203,7 @@ extension UTF8Span {
206203
/// parameter is valid only for the duration of its execution.
207204
/// - Returns: The return value of the `body` closure parameter.
208205
@_alwaysEmitIntoClient
209-
borrowing public func withUnsafeBufferPointer<
206+
borrowing public func _withUnsafeBufferPointer<
210207
E: Error, Result: ~Copyable //& ~Escapable
211208
>(
212209
_ body: (_ buffer: /*borrowing*/ UnsafeBufferPointer<UInt8>) throws(E) -> Result
@@ -233,45 +230,38 @@ extension UTF8Span {
233230
}
234231
}
235232

236-
#if false
237-
extension RawSpan {
238-
public func parseUTF8(from start: Int, length: Int) throws -> UTF8Span {
239-
let span = self[
240-
uncheckedOffsets: start ..< start &+ length
241-
].view(as: UInt8.self)
242-
return try UTF8Span(validating: span)
233+
@available(SwiftStdlib 6.1, *)
234+
extension UTF8Span {
235+
public var isEmpty: Bool {
236+
self.count == 0
243237
}
244238

245-
// TODO: Below are contingent on how we want to handle NUL-termination
246-
public func parseNullTerminatedUTF8() throws -> UTF8Span {
247-
fatalError()
239+
public var span: Span<UInt8> {
240+
Span(_unchecked: _unsafeBaseAddress, count: self.count)
248241
}
242+
243+
249244
}
250245

251-
// TODO: Below is contingent on a Cursor or Iterator type
252-
extension RawSpan.Cursor {
253-
public mutating func parseUTF8(length: Int) throws -> UTF8Span {
254-
fatalError()
255-
}
256-
public mutating func parseNullTerminatedUTF8() throws -> UTF8Span {
257-
fatalError()
258-
}
246+
func TODO(_ message: String) -> Never {
247+
fatalError("TODO: message")
259248
}
260-
#endif
261249

250+
// TODO(toolchain): decide if we rebase on top of Guillaume's work
262251
@available(SwiftStdlib 6.1, *)
263-
extension UTF8Span {
264-
public static func ~=(_ lhs: StaticString, _ rhs: UTF8Span) -> Bool {
265-
return lhs.withUTF8Buffer { str in
266-
rhs.withUnsafeBufferPointer { span in
267-
str.elementsEqual(span)
268-
}
269-
}
252+
extension String {
253+
public var utf8Span: UTF8Span {
254+
TODO("Decide when to rebase on top of Guillaume's PR")
270255
}
256+
}
271257

272-
// Not doing == between two UTFSpan, as pointerness
273-
// Note: above might not be possible
258+
@available(SwiftStdlib 6.1, *)
259+
extension Substring {
260+
public var utf8Span: UTF8Span {
261+
TODO("Decide when to rebase on top of Guillaume's PR")
262+
}
274263
}
275264

276265

277-
// TODO: cString var, or something like that
266+
267+

0 commit comments

Comments
 (0)