Skip to content

Commit 56785e8

Browse files
committed
Merge pull request #1477 from PatrickPijnappel/patch-3
[stdlib] Rewrite UTF8._isValidUTF8()
2 parents fbd74a3 + 7c7be3d commit 56785e8

File tree

3 files changed

+89
-140
lines changed

3 files changed

+89
-140
lines changed

stdlib/public/core/StringUTF8.swift

Lines changed: 1 addition & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -159,8 +159,7 @@ extension String {
159159
/// True iff the index is at the end of its view or if the next
160160
/// byte begins a new UnicodeScalar.
161161
internal var _isOnUnicodeScalarBoundary : Bool {
162-
let next = UTF8.CodeUnit(truncatingBitPattern: _buffer)
163-
return UTF8._numTrailingBytes(next) != 4 || _isAtEnd
162+
return UTF8._isValidUTF8(UInt32(truncatingBitPattern: _buffer)) || _isAtEnd
164163
}
165164

166165
/// True iff the index is at the end of its view

stdlib/public/core/Unicode.swift

Lines changed: 57 additions & 116 deletions
Original file line number | Diff line number | Diff line change
@@ -78,48 +78,6 @@ public struct UTF8 : UnicodeCodecType {
7878

7979
public init() {}
8080

81-
/// Returns the number of expected trailing bytes for a given first byte: 0,
82-
/// 1, 2 or 3. If the first byte cannot start a valid UTF-8 code unit
83-
/// sequence, returns 4.
84-
@warn_unused_result
85-
public static func _numTrailingBytes(cu0: CodeUnit) -> UInt8 {
86-
if _fastPath(cu0 & 0x80 == 0) {
87-
// 0x00 -- 0x7f: 1-byte sequences.
88-
return 0
89-
}
90-
91-
// 0xc0 -- 0xc1: invalid first byte.
92-
// 0xc2 -- 0xdf: 2-byte sequences.
93-
// 0xe0 -- 0xef: 3-byte sequences.
94-
// 0xf0 -- 0xf4: 4-byte sequences.
95-
// 0xf5 -- 0xff: invalid first byte.
96-
97-
// The rules above are represented as a lookup table. The lookup table
98-
// consists of two words, where `high` contains the high bit of the result,
99-
// `low` contains the low bit.
100-
//
101-
// Bit patterns:
102-
// high | low | meaning
103-
// -----+-----+----------------
104-
// 0 | 0 | 2-byte sequence
105-
// 0 | 1 | 3-byte sequence
106-
// 1 | 0 | 4-byte sequence
107-
// 1 | 1 | invalid
108-
//
109-
// This implementation allows us to handle these cases without branches.
110-
111-
// ---------0xf?------- ---------0xe?------- ---------0xd?------- ---------0xc?-------
112-
let low: UInt64 =
113-
0b1111_1111__1110_0000__1111_1111__1111_1111__0000_0000__0000_0000__0000_0000__0000_0011
114-
let high: UInt64 =
115-
0b1111_1111__1111_1111__0000_0000__0000_0000__0000_0000__0000_0000__0000_0000__0000_0011
116-
117-
let index = UInt64(max(0, Int(cu0) - 0xc0))
118-
let highBit = ((high >> index) & 1) << 1
119-
let lowBit = (low >> index) & 1
120-
return UInt8(1 + (highBit | lowBit))
121-
}
122-
12381
/// Lookahead buffer used for UTF-8 decoding. New bytes are inserted at LSB,
12482
/// and bytes are read at MSB.
12583
var _decodeLookahead: UInt32 = 0
@@ -141,83 +99,66 @@ public struct UTF8 : UnicodeCodecType {
14199
/// buffer with a shift, and update flags with a single-bit right shift.
142100
var _lookaheadFlags: UInt8 = 0
143101

144-
/// Returns `true` if the LSB bytes in `buffer` are well-formed UTF-8 code
145-
/// unit sequence.
102+
103+
/// Returns `true` if the LSB bytes in `buffer` are a well-formed UTF-8 code
104+
/// unit sequence. The lowest byte is considered the first code unit.
105+
///
106+
/// - Requires: There is at least one used byte in `buffer`, and the unused
107+
/// space in `buffer` is filled with some value not matching the UTF-8
108+
/// continuation byte form (`0b10xxxxxx`).
146109
@warn_unused_result
147-
static func _isValidUTF8Impl(buffer: UInt32, length: UInt8) -> Bool {
148-
switch length {
149-
case 4:
150-
let cu3 = UInt8((buffer >> 24) & 0xff)
151-
if cu3 < 0x80 || cu3 > 0xbf {
152-
return false
153-
}
154-
fallthrough
155-
case 3:
156-
let cu2 = UInt8((buffer >> 16) & 0xff)
157-
if cu2 < 0x80 || cu2 > 0xbf {
158-
return false
159-
}
160-
fallthrough
161-
case 2:
162-
let cu0 = UInt8(buffer & 0xff)
163-
let cu1 = UInt8((buffer >> 8) & 0xff)
164-
switch cu0 {
165-
case 0xe0:
166-
if cu1 < 0xa0 || cu1 > 0xbf {
167-
return false
168-
}
169-
case 0xed:
170-
if cu1 < 0x80 || cu1 > 0x9f {
171-
return false
172-
}
173-
case 0xf0:
174-
if cu1 < 0x90 || cu1 > 0xbf {
175-
return false
176-
}
177-
case 0xf4:
178-
if cu1 < 0x80 || cu1 > 0x8f {
179-
return false
180-
}
181-
default:
182-
_sanityCheck(cu0 >= 0xc2 && cu0 <= 0xf4,
183-
"invalid first bytes should be handled in the caller")
184-
if cu1 < 0x80 || cu1 > 0xbf {
185-
return false
186-
}
187-
}
188-
return true
110+
public // @testable
111+
static func _isValidUTF8(buffer: UInt32) -> Bool {
189112

190-
default:
191-
_sanityCheckFailure("one-byte sequences should be handled in the caller")
113+
if _fastPath(buffer & 0x80 == 0) {
114+
return true // 0x00 -- 0x7f: 1-byte sequences (ASCII).
192115
}
193-
}
194116

195-
/// Returns `true` if the LSB bytes in `buffer` are well-formed UTF-8 code
196-
/// unit sequence.
197-
@warn_unused_result
198-
static func _isValidUTF8(buffer: UInt32, validBytes: UInt8) -> Bool {
199-
_sanityCheck(validBytes & 0b0000_1111 != 0,
200-
"input buffer should not be empty")
201-
202-
let cu0 = UInt8(buffer & 0xff)
203-
let trailingBytes = _numTrailingBytes(cu0)
204-
switch trailingBytes {
205-
case 0:
117+
// Determine sequence length using high 5 bits of 1st byte. We use a
118+
// look-up table to branch less. 1-byte sequences are handled above.
119+
//
120+
// case | pattern | description
121+
// ----------------------------
122+
// 00 | 110xx | 2-byte sequence
123+
// 01 | 1110x | 3-byte sequence
124+
// 10 | 11110 | 4-byte sequence
125+
// 11 | other | invalid
126+
//
127+
// 11xxx 10xxx 01xxx 00xxx
128+
let lut0: UInt32 = 0b1011_0000__1111_1111__1111_1111__1111_1111
129+
let lut1: UInt32 = 0b1100_0000__1111_1111__1111_1111__1111_1111
130+
131+
let index = (buffer >> 3) & 0x1f
132+
let bit0 = (lut0 >> index) & 1
133+
let bit1 = (lut1 >> index) & 1
134+
135+
switch (bit1, bit0) {
136+
case (0, 0): // 2-byte sequence.
137+
// Require 10xx xxxx 110x xxxx.
138+
if buffer & 0xc0e0 != 0x80c0 { return false }
139+
// Disallow xxxx xxxx xxx0 000x (<= 7 bits case).
140+
if buffer & 0x001e == 0x0000 { return false }
206141
return true
207-
208-
case 1, 2, 3:
209-
// We *don't* need to check the if the buffer actually contains at least
210-
// `trailingBytes` bytes. Here's why.
211-
//
212-
// If the buffer is not full -- contains fewer than 4 bytes, we are at
213-
// EOF, and the buffer will be padded with 0x00. Thus, an incomplete
214-
// code unit sequence just before EOF would be seen by code below as
215-
// padded with nuls. This sequence will be rejected by the logic in
216-
// `_isValidUTF8Impl`, because the nul byte is not a valid continuation
217-
// byte for UTF-8.
218-
return _isValidUTF8Impl(buffer, length: trailingBytes + 1)
219-
220-
default:
142+
case (0, 1): // 3-byte sequence.
143+
// Require 10xx xxxx 10xx xxxx 1110 xxxx.
144+
if buffer & 0xc0c0f0 != 0x8080e0 { return false }
145+
// Disallow xxxx xxxx xx0x xxxx xxxx 0000 (<= 11 bits case).
146+
if buffer & 0x00200f == 0x000000 { return false }
147+
// Disallow xxxx xxxx xx1x xxxx xxxx 1101 (surrogate code points).
148+
if buffer & 0x00200f == 0x00200d { return false }
149+
return true
150+
case (1, 0): // 4-byte sequence.
151+
// Require 10xx xxxx 10xx xxxx 10xx xxxx 1111 0xxx.
152+
if buffer & 0xc0c0c0f8 != 0x808080f0 { return false }
153+
// Disallow xxxx xxxx xxxx xxxx xx00 xxxx xxxx x000 (<= 16 bits case).
154+
if buffer & 0x00003007 == 0x00000000 { return false }
155+
// Case xxxx xxxx xxxx xxxx xxxx xxxx xxxx x1xx.
156+
if buffer & 0x00000004 == 0x00000004 {
157+
// Require xxxx xxxx xxxx xxxx xx00 xxxx xxxx xx00 (<= 0x10FFFF).
158+
if buffer & 0x00003003 != 0x00000000 { return false }
159+
}
160+
return true
161+
default: // Invalid sequence.
221162
return false
222163
}
223164
}
@@ -237,7 +178,7 @@ public struct UTF8 : UnicodeCodecType {
237178

238179
_sanityCheck(validBytes != 0,
239180
"input buffer should not be empty")
240-
_sanityCheck(!UTF8._isValidUTF8(buffer, validBytes: validBytes),
181+
_sanityCheck(!UTF8._isValidUTF8(buffer),
241182
"input sequence should be ill-formed UTF-8")
242183

243184
// Unicode 6.3.0, D93b:
@@ -391,7 +332,7 @@ public struct UTF8 : UnicodeCodecType {
391332
// The first byte to read is located at MSB of `_decodeLookahead`. Get a
392333
// representation of the buffer where we can read bytes starting from LSB.
393334
var buffer = _decodeLookahead.byteSwapped
394-
if _slowPath(!UTF8._isValidUTF8(buffer, validBytes: _lookaheadFlags)) {
335+
if _slowPath(!UTF8._isValidUTF8(buffer)) {
395336
// The code unit sequence is ill-formed. According to Unicode
396337
// recommendation, replace the maximal subpart of ill-formed sequence
397338
// with one replacement character.

validation-test/stdlib/Unicode.swift

Lines changed: 31 additions & 22 deletions
Original file line number | Diff line number | Diff line change
@@ -787,32 +787,41 @@ UnicodeScalarTests.test("init") {
787787

788788
var UTF8Decoder = TestSuite("UTF8Decoder")
789789

790-
UTF8Decoder.test("Internal/_numTrailingBytes") {
791-
for i in UInt8(0x00)...UInt8(0x7f) {
792-
expectEqual(0, UTF8._numTrailingBytes(i), "i=\(i)")
793-
}
794-
for i in UInt8(0x80)...UInt8(0xc1) {
795-
expectEqual(4, UTF8._numTrailingBytes(i), "i=\(i)")
796-
}
797-
for i in UInt8(0xc2)...UInt8(0xdf) {
798-
expectEqual(1, UTF8._numTrailingBytes(i), "i=\(i)")
799-
}
800-
for i in UInt8(0xe0)...UInt8(0xef) {
801-
expectEqual(2, UTF8._numTrailingBytes(i), "i=\(i)")
802-
}
803-
for i in UInt8(0xf0)...UInt8(0xf4) {
804-
expectEqual(3, UTF8._numTrailingBytes(i), "i=\(i)")
790+
UTF8Decoder.test("Internal/_isValidUTF8") {
791+
792+
// Ensure we accept all valid scalars
793+
func ensureValid(scalar: UnicodeScalar) {
794+
var data: UInt32 = 0
795+
var i: UInt32 = 0
796+
Swift.UTF8.encode(scalar) { cp in
797+
data |= UInt32(cp) << (i*8)
798+
i += 1
799+
}
800+
expectEqual(UTF8._isValidUTF8(data), true, "data=\(asHex(data))")
805801
}
806-
for i in UInt8(0xf5)...UInt8(0xfe) {
807-
expectEqual(4, UTF8._numTrailingBytes(i), "i=\(i)")
802+
803+
for i in 0..<0xd800 { ensureValid(UnicodeScalar(i)) }
804+
for i in 0xe000...0x10ffff { ensureValid(UnicodeScalar(i)) }
805+
806+
// Ensure we have no false positives
807+
var n = 0
808+
func countValidSequences(head head: Range<UInt32>, tail: Range<UInt32>) {
809+
for cu0 in head {
810+
for rest in tail {
811+
let data = rest << 8 | cu0
812+
if UTF8._isValidUTF8(data) { n += 1 }
813+
}
814+
}
808815
}
809-
// Separate test for 0xff because of:
810-
// <rdar://problem/17376512> Range UInt8(0x00)...UInt8(0xff) invokes a
811-
// runtime trap
812-
var i = UInt8(0xff)
813-
expectEqual(4, UTF8._numTrailingBytes(i), "i=\(i)")
816+
817+
countValidSequences(head: 0x00...0x7f, tail: 0...0)
818+
countValidSequences(head: 0xc0...0xdf, tail: 0...0xff)
819+
countValidSequences(head: 0xe0...0xef, tail: 0...0xffff)
820+
countValidSequences(head: 0xf0...0xf7, tail: 0...0xffffff)
821+
expectEqual(n, 0x10f800, "n=\(asHex(n))") // 0x110000 code points minus 0x800 surrogates
814822
}
815823

824+
816825
UTF8Decoder.test("Empty") {
817826
expectTrue(checkDecodeUTF8([], [], []))
818827
}

0 commit comments

Comments
 (0)