Skip to content

Commit 1cfce45

Browse files
author
Dave Abrahams
committed
[stdlib] UnicodeDecoders: generalization preparing for UTF16
1 parent 4e802e2 commit 1cfce45

File tree

1 file changed

+27
-10
lines changed

1 file changed

+27
-10
lines changed

test/Prototypes/UnicodeDecoders.swift

Lines changed: 27 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -503,31 +503,47 @@ public protocol UnicodeEncoding {
503503
where ReverseDecoder.CodeUnit == CodeUnit
504504
}
505505

506+
public protocol _UTFDecoderBase : UnicodeDecoder
507+
where Buffer == EncodedScalar {
508+
associatedtype BufferStorage = UInt32
509+
}
510+
511+
public protocol _UTFDecoder : _UTFDecoderBase
512+
where Buffer == _UIntBuffer<BufferStorage, CodeUnit>,
513+
BufferStorage == UInt32
514+
{
515+
static func _isScalar(_: CodeUnit) -> Bool
516+
func _parseMultipleCodeUnits() -> (isValid: Bool, bitCount: UInt8)
517+
var buffer: Buffer { get set }
518+
}
506519

507-
public protocol _UTF8Decoder : UnicodeDecoder where Buffer == EncodedScalar {
508-
func _parseNonASCII() -> (isValid: Bool, bitCount: UInt8)
520+
public protocol _UTF8Decoder : _UTFDecoder {
509521
var buffer: Buffer { get set }
510522
}
511523

512-
extension _UTF8Decoder where Buffer == _UIntBuffer<UInt32, UInt8> {
524+
extension _UTF8Decoder {
525+
public static func _isScalar(_ x: CodeUnit) -> Bool { return x & 0x80 == 0 }
526+
}
527+
528+
extension _UTFDecoder {
513529
public mutating func parseOne<I : IteratorProtocol>(
514530
_ input: inout I
515531
) -> Unicode.ParseResult<EncodedScalar>
516-
where I.Element == Unicode.UTF8.CodeUnit {
532+
where I.Element == CodeUnit {
517533

518534
// Bufferless ASCII fastpath.
519535
if _fastPath(buffer.isEmpty) {
520536
guard let codeUnit = input.next() else { return .emptyInput }
521537
// ASCII, return immediately.
522-
if codeUnit & 0x80 == 0 {
538+
if Self._isScalar(codeUnit) {
523539
return .valid(EncodedScalar(containing: codeUnit))
524540
}
525541
// Non-ASCII, proceed to buffering mode.
526542
buffer.append(codeUnit)
527543
} else if buffer._storage & 0x80 == 0 {
528544
// ASCII in buffer. We don't refill the buffer so we can return
529545
// to bufferless mode once we've exhausted it.
530-
let codeUnit = UInt8(extendingOrTruncating: buffer._storage)
546+
let codeUnit = CodeUnit(extendingOrTruncating: buffer._storage)
531547
buffer.remove(at: buffer.startIndex)
532548
return .valid(EncodedScalar(containing: codeUnit))
533549
}
@@ -544,8 +560,9 @@ extension _UTF8Decoder where Buffer == _UIntBuffer<UInt32, UInt8> {
544560
} while buffer._bitCount < 32
545561

546562
// Find one unicode scalar.
547-
let (isValid, scalarBitCount) = _parseNonASCII()
548-
_sanityCheck(scalarBitCount % 8 == 0 && 1...4 ~= scalarBitCount / 8)
563+
let (isValid, scalarBitCount) = _parseMultipleCodeUnits()
564+
_sanityCheck(scalarBitCount % numericCast(CodeUnit.bitWidth) == 0)
565+
_sanityCheck(1...4 ~= scalarBitCount / 8)
549566
_sanityCheck(scalarBitCount <= buffer._bitCount)
550567

551568
// Consume the decoded bytes (or maximal subpart of ill-formed sequence).
@@ -609,7 +626,7 @@ extension UTF8.ReverseDecoder : _UTF8Decoder {
609626
}
610627

611628
public // @testable
612-
func _parseNonASCII() -> (isValid: Bool, bitCount: UInt8) {
629+
func _parseMultipleCodeUnits() -> (isValid: Bool, bitCount: UInt8) {
613630
_sanityCheck(buffer._storage & 0x80 != 0) // this case handled elsewhere
614631
if buffer._storage & 0b0__1110_0000__1100_0000
615632
== 0b0__1100_0000__1000_0000 {
@@ -681,7 +698,7 @@ extension Unicode.UTF8.ForwardDecoder : _UTF8Decoder {
681698
}
682699

683700
public // @testable
684-
func _parseNonASCII() -> (isValid: Bool, bitCount: UInt8) {
701+
func _parseMultipleCodeUnits() -> (isValid: Bool, bitCount: UInt8) {
685702
_sanityCheck(buffer._storage & 0x80 != 0) // this case handled elsewhere
686703

687704
if buffer._storage & 0b0__1100_0000__1110_0000

0 commit comments

Comments
 (0)