Skip to content

[stdlib] Rewrite UTF8._isValidUTF8() #1477

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 7 commits into from
Feb 29, 2016
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions stdlib/public/core/StringUTF8.swift
Original file line number Diff line number Diff line change
Expand Up @@ -159,8 +159,7 @@ extension String {
/// True iff the index is at the end of its view or if the next
/// byte begins a new UnicodeScalar.
internal var _isOnUnicodeScalarBoundary : Bool {
let next = UTF8.CodeUnit(truncatingBitPattern: _buffer)
return UTF8._numTrailingBytes(next) != 4 || _isAtEnd
return UTF8._isValidUTF8(UInt32(truncatingBitPattern: _buffer)) || _isAtEnd
}

/// True iff the index is at the end of its view
Expand Down
173 changes: 57 additions & 116 deletions stdlib/public/core/Unicode.swift
Original file line number Diff line number Diff line change
Expand Up @@ -78,48 +78,6 @@ public struct UTF8 : UnicodeCodecType {

public init() {}

/// Returns the number of expected trailing bytes for a given first byte: 0,
/// 1, 2 or 3. If the first byte cannot start a valid UTF-8 code unit
/// sequence, returns 4.
@warn_unused_result
public static func _numTrailingBytes(cu0: CodeUnit) -> UInt8 {
if _fastPath(cu0 & 0x80 == 0) {
// 0x00 -- 0x7f: 1-byte sequences.
return 0
}

// 0xc0 -- 0xc1: invalid first byte.
// 0xc2 -- 0xdf: 2-byte sequences.
// 0xe0 -- 0xef: 3-byte sequences.
// 0xf0 -- 0xf4: 4-byte sequences.
// 0xf5 -- 0xff: invalid first byte.

// The rules above are represented as a lookup table. The lookup table
// consists of two words, where `high` contains the high bit of the result,
// `low` contains the low bit.
//
// Bit patterns:
// high | low | meaning
// -----+-----+----------------
// 0 | 0 | 2-byte sequence
// 0 | 1 | 3-byte sequence
// 1 | 0 | 4-byte sequence
// 1 | 1 | invalid
//
// This implementation allows us to handle these cases without branches.

// ---------0xf?------- ---------0xe?------- ---------0xd?------- ---------0xc?-------
let low: UInt64 =
0b1111_1111__1110_0000__1111_1111__1111_1111__0000_0000__0000_0000__0000_0000__0000_0011
let high: UInt64 =
0b1111_1111__1111_1111__0000_0000__0000_0000__0000_0000__0000_0000__0000_0000__0000_0011

let index = UInt64(max(0, Int(cu0) - 0xc0))
let highBit = ((high >> index) & 1) << 1
let lowBit = (low >> index) & 1
return UInt8(1 + (highBit | lowBit))
}

/// Lookahead buffer used for UTF-8 decoding. New bytes are inserted at LSB,
/// and bytes are read at MSB.
var _decodeLookahead: UInt32 = 0
Expand All @@ -141,83 +99,66 @@ public struct UTF8 : UnicodeCodecType {
/// buffer with a shift, and update flags with a single-bit right shift.
var _lookaheadFlags: UInt8 = 0

/// Returns `true` if the LSB bytes in `buffer` are a well-formed UTF-8 code
/// unit sequence.

/// Returns `true` if the LSB bytes in `buffer` are a well-formed UTF-8 code
/// unit sequence. The lowest byte is considered the first code unit.
///
/// - Requires: There is at least one used byte in `buffer`, and the unused
/// space in `buffer` is filled with some value not matching the UTF-8
/// continuation byte form (`0b10xxxxxx`).
@warn_unused_result
static func _isValidUTF8Impl(buffer: UInt32, length: UInt8) -> Bool {
switch length {
case 4:
let cu3 = UInt8((buffer >> 24) & 0xff)
if cu3 < 0x80 || cu3 > 0xbf {
return false
}
fallthrough
case 3:
let cu2 = UInt8((buffer >> 16) & 0xff)
if cu2 < 0x80 || cu2 > 0xbf {
return false
}
fallthrough
case 2:
let cu0 = UInt8(buffer & 0xff)
let cu1 = UInt8((buffer >> 8) & 0xff)
switch cu0 {
case 0xe0:
if cu1 < 0xa0 || cu1 > 0xbf {
return false
}
case 0xed:
if cu1 < 0x80 || cu1 > 0x9f {
return false
}
case 0xf0:
if cu1 < 0x90 || cu1 > 0xbf {
return false
}
case 0xf4:
if cu1 < 0x80 || cu1 > 0x8f {
return false
}
default:
_sanityCheck(cu0 >= 0xc2 && cu0 <= 0xf4,
"invalid first bytes should be handled in the caller")
if cu1 < 0x80 || cu1 > 0xbf {
return false
}
}
return true
public // @testable
static func _isValidUTF8(buffer: UInt32) -> Bool {

default:
_sanityCheckFailure("one-byte sequences should be handled in the caller")
if _fastPath(buffer & 0x80 == 0) {
return true // 0x00 -- 0x7f: 1-byte sequences (ASCII).
}
}

/// Returns `true` if the LSB bytes in `buffer` are a well-formed UTF-8 code
/// unit sequence.
@warn_unused_result
static func _isValidUTF8(buffer: UInt32, validBytes: UInt8) -> Bool {
_sanityCheck(validBytes & 0b0000_1111 != 0,
"input buffer should not be empty")

let cu0 = UInt8(buffer & 0xff)
let trailingBytes = _numTrailingBytes(cu0)
switch trailingBytes {
case 0:
// Determine sequence length using high 5 bits of 1st byte. We use a
// look-up table to branch less. 1-byte sequences are handled above.
//
// case | pattern | description
// ----------------------------
// 00 | 110xx | 2-byte sequence
// 01 | 1110x | 3-byte sequence
// 10 | 11110 | 4-byte sequence
// 11 | other | invalid
//
// 11xxx 10xxx 01xxx 00xxx
let lut0: UInt32 = 0b1011_0000__1111_1111__1111_1111__1111_1111
let lut1: UInt32 = 0b1100_0000__1111_1111__1111_1111__1111_1111

let index = (buffer >> 3) & 0x1f
let bit0 = (lut0 >> index) & 1
let bit1 = (lut1 >> index) & 1

switch (bit1, bit0) {
case (0, 0): // 2-byte sequence.
// Require 10xx xxxx 110x xxxx.
if buffer & 0xc0e0 != 0x80c0 { return false }
// Disallow xxxx xxxx xxx0 000x (<= 7 bits case).
if buffer & 0x001e == 0x0000 { return false }
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sorry, I don't understand this case. I think you meant to test against 0x1f00 instead of 0x001e.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The bytes come in reverse order. Never mind.

return true

case 1, 2, 3:
// We *don't* need to check whether the buffer actually contains at least
// `trailingBytes` bytes. Here's why.
//
// If the buffer is not full -- contains fewer than 4 bytes, we are at
// EOF, and the buffer will be padded with 0x00. Thus, an incomplete
// code unit sequence just before EOF would be seen by code below as
// padded with nuls. This sequence will be rejected by the logic in
// `_isValidUTF8Impl`, because the nul byte is not a valid continuation
// byte for UTF-8.
return _isValidUTF8Impl(buffer, length: trailingBytes + 1)

default:
case (0, 1): // 3-byte sequence.
// Require 10xx xxxx 10xx xxxx 1110 xxxx.
if buffer & 0xc0c0f0 != 0x8080e0 { return false }
// Disallow xxxx xxxx xx0x xxxx xxxx 0000 (<= 11 bits case).
if buffer & 0x00200f == 0x000000 { return false }
// Disallow xxxx xxxx xx1x xxxx xxxx 1101 (surrogate code points).
if buffer & 0x00200f == 0x00200d { return false }
return true
case (1, 0): // 4-byte sequence.
// Require 10xx xxxx 10xx xxxx 10xx xxxx 1111 0xxx.
if buffer & 0xc0c0c0f8 != 0x808080f0 { return false }
// Disallow xxxx xxxx xxxx xxxx xx00 xxxx xxxx x000 (<= 16 bits case).
if buffer & 0x00003007 == 0x00000000 { return false }
// Case xxxx xxxx xxxx xxxx xxxx xxxx xxxx x1xx.
if buffer & 0x00000004 == 0x00000004 {
// Require xxxx xxxx xxxx xxxx xx00 xxxx xxxx xx00 (<= 0x10FFFF).
if buffer & 0x00003003 != 0x00000000 { return false }
}
return true
default: // Invalid sequence.
return false
}
}
Expand All @@ -237,7 +178,7 @@ public struct UTF8 : UnicodeCodecType {

_sanityCheck(validBytes != 0,
"input buffer should not be empty")
_sanityCheck(!UTF8._isValidUTF8(buffer, validBytes: validBytes),
_sanityCheck(!UTF8._isValidUTF8(buffer),
"input sequence should be ill-formed UTF-8")

// Unicode 6.3.0, D93b:
Expand Down Expand Up @@ -391,7 +332,7 @@ public struct UTF8 : UnicodeCodecType {
// The first byte to read is located at MSB of `_decodeLookahead`. Get a
// representation of the buffer where we can read bytes starting from LSB.
var buffer = _decodeLookahead.byteSwapped
if _slowPath(!UTF8._isValidUTF8(buffer, validBytes: _lookaheadFlags)) {
if _slowPath(!UTF8._isValidUTF8(buffer)) {
// The code unit sequence is ill-formed. According to Unicode
// recommendation, replace the maximal subpart of ill-formed sequence
// with one replacement character.
Expand Down
53 changes: 31 additions & 22 deletions validation-test/stdlib/Unicode.swift
Original file line number Diff line number Diff line change
Expand Up @@ -787,32 +787,41 @@ UnicodeScalarTests.test("init") {

var UTF8Decoder = TestSuite("UTF8Decoder")

UTF8Decoder.test("Internal/_numTrailingBytes") {
for i in UInt8(0x00)...UInt8(0x7f) {
expectEqual(0, UTF8._numTrailingBytes(i), "i=\(i)")
}
for i in UInt8(0x80)...UInt8(0xc1) {
expectEqual(4, UTF8._numTrailingBytes(i), "i=\(i)")
}
for i in UInt8(0xc2)...UInt8(0xdf) {
expectEqual(1, UTF8._numTrailingBytes(i), "i=\(i)")
}
for i in UInt8(0xe0)...UInt8(0xef) {
expectEqual(2, UTF8._numTrailingBytes(i), "i=\(i)")
}
for i in UInt8(0xf0)...UInt8(0xf4) {
expectEqual(3, UTF8._numTrailingBytes(i), "i=\(i)")
UTF8Decoder.test("Internal/_isValidUTF8") {

// Ensure we accept all valid scalars
func ensureValid(scalar: UnicodeScalar) {
var data: UInt32 = 0
var i: UInt32 = 0
Swift.UTF8.encode(scalar) { cp in
data |= UInt32(cp) << (i*8)
i += 1
}
expectEqual(UTF8._isValidUTF8(data), true, "data=\(asHex(data))")
}
for i in UInt8(0xf5)...UInt8(0xfe) {
expectEqual(4, UTF8._numTrailingBytes(i), "i=\(i)")

for i in 0..<0xd800 { ensureValid(UnicodeScalar(i)) }
for i in 0xe000...0x10ffff { ensureValid(UnicodeScalar(i)) }

// Ensure we have no false positives
var n = 0
func countValidSequences(head head: Range<UInt32>, tail: Range<UInt32>) {
for cu0 in head {
for rest in tail {
let data = rest << 8 | cu0
if UTF8._isValidUTF8(data) { n += 1 }
}
}
}
// Separate test for 0xff because of:
// <rdar://problem/17376512> Range UInt8(0x00)...UInt8(0xff) invokes a
// runtime trap
var i = UInt8(0xff)
expectEqual(4, UTF8._numTrailingBytes(i), "i=\(i)")

countValidSequences(head: 0x00...0x7f, tail: 0...0)
countValidSequences(head: 0xc0...0xdf, tail: 0...0xff)
countValidSequences(head: 0xe0...0xef, tail: 0...0xffff)
countValidSequences(head: 0xf0...0xf7, tail: 0...0xffffff)
expectEqual(n, 0x10f800, "n=\(asHex(n))") // 0x10ffff minus surrogates
}


UTF8Decoder.test("Empty") {
expectTrue(checkDecodeUTF8([], [], []))
}
Expand Down