Skip to content

Commit 144dc55

Browse files
authored
Merge pull request #9297 from apple/integrate-utf16
2 parents 0db10e9 + bea71aa commit 144dc55

File tree

2 files changed

+25
-62
lines changed

2 files changed

+25
-62
lines changed

stdlib/public/core/UTF16.swift

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,14 @@
99
// See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
1010
//
1111
//===----------------------------------------------------------------------===//
12+
extension _Unicode {
13+
public enum UTF16 {
14+
case _swift3Buffer(_Unicode.UTF16.ForwardParser)
15+
}
16+
}
17+
1218
extension _Unicode.UTF16 : UnicodeEncoding {
19+
public typealias CodeUnit = UInt16
1320
public typealias EncodedScalar = _UIntBuffer<UInt32, UInt16>
1421

1522
public static var encodedReplacementCharacter : EncodedScalar {
@@ -37,8 +44,8 @@ extension _Unicode.UTF16 : UnicodeEncoding {
3744
}
3845
let x1 = x - (1 << 16)
3946
var r = (0xdc00 + (x1 & 0x3ff))
40-
r <<= 16
41-
r |= (0xd800 + (x1 >> 10 & 0x3ff))
47+
r &<<= 16
48+
r |= (0xd800 + (x1 &>> 10 & 0x3ff))
4249
return EncodedScalar(_storage: r, _bitCount: 32)
4350
}
4451

stdlib/public/core/Unicode.swift

Lines changed: 16 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -195,7 +195,7 @@ extension _Unicode.UTF8 : UnicodeCodec {
195195
_ input: inout I
196196
) -> UnicodeDecodingResult where I.Element == CodeUnit {
197197
guard case ._swift3Buffer(var parser) = self else {
198-
fatalError("unreachable")
198+
Builtin.unreachable()
199199
}
200200
defer { self = ._swift3Buffer(parser) }
201201

@@ -315,15 +315,9 @@ public typealias UTF8 = _Unicode.UTF8
315315

316316
/// A codec for translating between Unicode scalar values and UTF-16 code
317317
/// units.
318-
public struct UTF16 : UnicodeCodec {
319-
/// A type that can hold code unit values for this encoding.
320-
public typealias CodeUnit = UInt16
321-
318+
extension _Unicode.UTF16 : UnicodeCodec {
322319
/// Creates an instance of the UTF-16 codec.
323-
public init() {}
324-
325-
/// A lookahead buffer for one UTF-16 code unit.
326-
internal var _decodeLookahead: UInt16?
320+
public init() { self = ._swift3Buffer(ForwardParser()) }
327321

328322
/// Starts or continues decoding a UTF-16 sequence.
329323
///
@@ -369,47 +363,15 @@ public struct UTF16 : UnicodeCodec {
369363
public mutating func decode<I : IteratorProtocol>(
370364
_ input: inout I
371365
) -> UnicodeDecodingResult where I.Element == CodeUnit {
372-
// Note: maximal subpart of ill-formed sequence for UTF-16 can only have
373-
// length 1. Length 0 does not make sense. Neither does length 2 -- in
374-
// that case the sequence is valid.
375-
376-
let unit0: UInt16
377-
if _fastPath(_decodeLookahead == nil) {
378-
guard let next = input.next() else { return .emptyInput }
379-
unit0 = next
380-
} else { // Consume lookahead first.
381-
unit0 = _decodeLookahead!
382-
_decodeLookahead = nil
383-
}
384-
385-
// A well-formed pair of surrogates looks like this:
386-
// high-surrogate low-surrogate
387-
// [1101 10xx xxxx xxxx] [1101 11xx xxxx xxxx]
388-
389-
// Common case first, non-surrogate -- just a sequence of 1 code unit.
390-
if _fastPath((unit0 &>> 11) != 0b1101_1) {
391-
return .scalarValue(UnicodeScalar(
392-
_unchecked: UInt32(extendingOrTruncating: unit0)))
366+
guard case ._swift3Buffer(var parser) = self else {
367+
Builtin.unreachable()
393368
}
394-
395-
// Ensure `unit0` is a high-surrogate.
396-
guard _fastPath((unit0 &>> 10) == 0b1101_10) else { return .error }
397-
398-
// We already have a high-surrogate, so there should be a next code unit.
399-
guard let unit1 = input.next() else { return .error }
400-
401-
// `unit0` is a high-surrogate, so `unit1` should be a low-surrogate.
402-
guard _fastPath((unit1 &>> 10) == 0b1101_11) else {
403-
// Invalid sequence, discard `unit0` and store `unit1` for the next call.
404-
_decodeLookahead = unit1
405-
return .error
369+
defer { self = ._swift3Buffer(parser) }
370+
switch parser.parseScalar(from: &input) {
371+
case .valid(let s): return .scalarValue(UTF16.decode(s))
372+
case .invalid: return .error
373+
case .emptyInput: return .emptyInput
406374
}
407-
408-
// We have a well-formed surrogate pair, decode it.
409-
let result = 0x10000 + (
410-
(UInt32(extendingOrTruncating: unit0 & 0x03ff) &<< 10) |
411-
UInt32(extendingOrTruncating: unit1 & 0x03ff))
412-
return .scalarValue(UnicodeScalar(_unchecked: result))
413375
}
414376

415377
/// Try to decode one Unicode scalar, and return the actual number of code
@@ -452,19 +414,14 @@ public struct UTF16 : UnicodeCodec {
452414
_ input: UnicodeScalar,
453415
into processCodeUnit: (CodeUnit) -> Void
454416
) {
455-
let scalarValue: UInt32 = UInt32(input)
456-
457-
if scalarValue <= UInt32(extendingOrTruncating: UInt16.max) {
458-
processCodeUnit(UInt16(extendingOrTruncating: scalarValue))
459-
}
460-
else {
461-
let lead_offset =
462-
(0xd800 as UInt32) - UInt32(extendingOrTruncating: 0x10000 &>> 10)
463-
processCodeUnit(UInt16(lead_offset + (scalarValue &>> (10 as UInt32))))
464-
processCodeUnit(UInt16(0xdc00 + (scalarValue & 0x3ff)))
465-
}
417+
var s = encode(input)._storage
418+
processCodeUnit(UInt16(extendingOrTruncating: s))
419+
s &>>= 16
420+
if _fastPath(s == 0) { return }
421+
processCodeUnit(UInt16(extendingOrTruncating: s))
466422
}
467423
}
424+
public typealias UTF16 = _Unicode.UTF16
468425

469426
/// A codec for translating between Unicode scalar values and UTF-32 code
470427
/// units.
@@ -1060,7 +1017,6 @@ extension UTF16 {
10601017

10611018
/// A namespace for Unicode utilities.
10621019
public enum _Unicode {
1063-
public typealias UTF16 = Swift.UTF16
10641020
public typealias UTF32 = Swift.UTF32
10651021
}
10661022

0 commit comments

Comments
 (0)