@@ -78,48 +78,6 @@ public struct UTF8 : UnicodeCodecType {
78
78
79
79
public init ( ) { }
80
80
81
- /// Returns the number of expected trailing bytes for a given first byte: 0,
82
- /// 1, 2 or 3. If the first byte cannot start a valid UTF-8 code unit
83
- /// sequence, returns 4.
84
- @warn_unused_result
85
- public static func _numTrailingBytes( cu0: CodeUnit ) -> UInt8 {
86
- if _fastPath ( cu0 & 0x80 == 0 ) {
87
- // 0x00 -- 0x7f: 1-byte sequences.
88
- return 0
89
- }
90
-
91
- // 0xc0 -- 0xc1: invalid first byte.
92
- // 0xc2 -- 0xdf: 2-byte sequences.
93
- // 0xe0 -- 0xef: 3-byte sequences.
94
- // 0xf0 -- 0xf4: 4-byte sequences.
95
- // 0xf5 -- 0xff: invalid first byte.
96
-
97
- // The rules above are represented as a lookup table. The lookup table
98
- // consists of two words, where `high` contains the high bit of the result,
99
- // `low` contains the low bit.
100
- //
101
- // Bit patterns:
102
- // high | low | meaning
103
- // -----+-----+----------------
104
- // 0 | 0 | 2-byte sequence
105
- // 0 | 1 | 3-byte sequence
106
- // 1 | 0 | 4-byte sequence
107
- // 1 | 1 | invalid
108
- //
109
- // This implementation allows us to handle these cases without branches.
110
-
111
- // ---------0xf?------- ---------0xe?------- ---------0xd?------- ---------0xc?-------
112
- let low : UInt64 =
113
- 0b1111_1111__1110_0000__1111_1111__1111_1111__0000_0000__0000_0000__0000_0000__0000_0011
114
- let high : UInt64 =
115
- 0b1111_1111__1111_1111__0000_0000__0000_0000__0000_0000__0000_0000__0000_0000__0000_0011
116
-
117
- let index = UInt64 ( max ( 0 , Int ( cu0) - 0xc0 ) )
118
- let highBit = ( ( high >> index) & 1 ) << 1
119
- let lowBit = ( low >> index) & 1
120
- return UInt8 ( 1 + ( highBit | lowBit) )
121
- }
122
-
123
81
/// Lookahead buffer used for UTF-8 decoding. New bytes are inserted at LSB,
124
82
/// and bytes are read at MSB.
125
83
var _decodeLookahead : UInt32 = 0
@@ -141,83 +99,66 @@ public struct UTF8 : UnicodeCodecType {
141
99
/// buffer with a shift, and update flags with a single-bit right shift.
142
100
var _lookaheadFlags : UInt8 = 0
143
101
144
- /// Returns `true` if the LSB bytes in `buffer` are well-formed UTF-8 code
145
- /// unit sequence.
102
+
103
+ /// Returns `true` if the LSB bytes in `buffer` are a well-formed UTF-8 code
104
+ /// unit sequence. The lowest byte is considered the first code unit.
105
+ ///
106
+ /// - Requires: There is at least one used byte in `buffer`, and the unused
107
+ /// space in `buffer` is filled with some value not matching the UTF-8
108
+ /// continuation byte form (`0b10xxxxxx`).
146
109
@warn_unused_result
147
- static func _isValidUTF8Impl( buffer: UInt32 , length: UInt8 ) -> Bool {
148
- switch length {
149
- case 4 :
150
- let cu3 = UInt8 ( ( buffer >> 24 ) & 0xff )
151
- if cu3 < 0x80 || cu3 > 0xbf {
152
- return false
153
- }
154
- fallthrough
155
- case 3 :
156
- let cu2 = UInt8 ( ( buffer >> 16 ) & 0xff )
157
- if cu2 < 0x80 || cu2 > 0xbf {
158
- return false
159
- }
160
- fallthrough
161
- case 2 :
162
- let cu0 = UInt8 ( buffer & 0xff )
163
- let cu1 = UInt8 ( ( buffer >> 8 ) & 0xff )
164
- switch cu0 {
165
- case 0xe0 :
166
- if cu1 < 0xa0 || cu1 > 0xbf {
167
- return false
168
- }
169
- case 0xed :
170
- if cu1 < 0x80 || cu1 > 0x9f {
171
- return false
172
- }
173
- case 0xf0 :
174
- if cu1 < 0x90 || cu1 > 0xbf {
175
- return false
176
- }
177
- case 0xf4 :
178
- if cu1 < 0x80 || cu1 > 0x8f {
179
- return false
180
- }
181
- default :
182
- _sanityCheck ( cu0 >= 0xc2 && cu0 <= 0xf4 ,
183
- " invalid first bytes should be handled in the caller " )
184
- if cu1 < 0x80 || cu1 > 0xbf {
185
- return false
186
- }
187
- }
188
- return true
110
+ public // @testable
111
+ static func _isValidUTF8( buffer: UInt32 ) -> Bool {
189
112
190
- default :
191
- _sanityCheckFailure ( " one-byte sequences should be handled in the caller " )
113
+ if _fastPath ( buffer & 0x80 == 0 ) {
114
+ return true // 0x00 -- 0x7f: 1-byte sequences (ASCII).
192
115
}
193
- }
194
116
195
- /// Returns `true` if the LSB bytes in `buffer` are well-formed UTF-8 code
196
- /// unit sequence.
197
- @warn_unused_result
198
- static func _isValidUTF8( buffer: UInt32 , validBytes: UInt8 ) -> Bool {
199
- _sanityCheck ( validBytes & 0b0000_1111 != 0 ,
200
- " input buffer should not be empty " )
201
-
202
- let cu0 = UInt8 ( buffer & 0xff )
203
- let trailingBytes = _numTrailingBytes ( cu0)
204
- switch trailingBytes {
205
- case 0 :
117
+ // Determine sequence length using high 5 bits of 1st byte. We use a
118
+ // look-up table to branch less. 1-byte sequences are handled above.
119
+ //
120
+ // case (bit1 bit0) | pattern | description
121
+ // ----------------------------
122
+ // 00 | 110xx | 2-byte sequence
123
+ // 01 | 1110x | 3-byte sequence
124
+ // 10 | 11110 | 4-byte sequence
125
+ // 11 | other | invalid
126
+ //
127
+ // index: 11xxx 10xxx 01xxx 00xxx (lut0 supplies bit0, lut1 supplies bit1)
128
+ let lut0 : UInt32 = 0b1011_0000__1111_1111__1111_1111__1111_1111
129
+ let lut1 : UInt32 = 0b1100_0000__1111_1111__1111_1111__1111_1111
130
+
131
+ let index = ( buffer >> 3 ) & 0x1f
132
+ let bit0 = ( lut0 >> index) & 1
133
+ let bit1 = ( lut1 >> index) & 1
134
+
135
+ switch ( bit1, bit0) {
136
+ case ( 0 , 0 ) : // 2-byte sequence.
137
+ // Require 10xx xxxx 110x xxxx.
138
+ if buffer & 0xc0e0 != 0x80c0 { return false }
139
+ // Disallow xxxx xxxx xxx0 000x (overlong: value fits in <= 7 bits).
140
+ if buffer & 0x001e == 0x0000 { return false }
206
141
return true
207
-
208
- case 1 , 2 , 3 :
209
- // We *don't* need to check the if the buffer actually contains at least
210
- // `trailingBytes` bytes. Here's why.
211
- //
212
- // If the buffer is not full -- contains fewer than 4 bytes, we are at
213
- // EOF, and the buffer will be padded with 0x00. Thus, an incomplete
214
- // code unit sequence just before EOF would be seen by code below as
215
- // padded with nuls. This sequence will be rejected by the logic in
216
- // `_isValidUTF8Impl`, because the nul byte is not a valid continuation
217
- // byte for UTF-8.
218
- return _isValidUTF8Impl ( buffer, length: trailingBytes + 1 )
219
-
220
- default :
142
+ case ( 0 , 1 ) : // 3-byte sequence.
143
+ // Require 10xx xxxx 10xx xxxx 1110 xxxx.
144
+ if buffer & 0xc0c0f0 != 0x8080e0 { return false }
145
+ // Disallow xxxx xxxx xx0x xxxx xxxx 0000 (overlong: value fits in <= 11 bits).
146
+ if buffer & 0x00200f == 0x000000 { return false }
147
+ // Disallow xxxx xxxx xx1x xxxx xxxx 1101 (surrogate code points).
148
+ if buffer & 0x00200f == 0x00200d { return false }
149
+ return true
150
+ case ( 1 , 0 ) : // 4-byte sequence.
151
+ // Require 10xx xxxx 10xx xxxx 10xx xxxx 1111 0xxx.
152
+ if buffer & 0xc0c0c0f8 != 0x808080f0 { return false }
153
+ // Disallow xxxx xxxx xxxx xxxx xx00 xxxx xxxx x000 (overlong: value fits in <= 16 bits).
154
+ if buffer & 0x00003007 == 0x00000000 { return false }
155
+ // Case xxxx xxxx xxxx xxxx xxxx xxxx xxxx x1xx (first byte 0xf4 -- 0xf7).
156
+ if buffer & 0x00000004 == 0x00000004 {
157
+ // Require xxxx xxxx xxxx xxxx xx00 xxxx xxxx xx00 (only 0xf4 followed by 0x80 -- 0x8f, i.e. <= 0x10FFFF).
158
+ if buffer & 0x00003003 != 0x00000000 { return false }
159
+ }
160
+ return true
161
+ default : // Invalid first byte (continuation byte 0x80 -- 0xbf, or 0xf8 -- 0xff).
221
162
return false
222
163
}
223
164
}
@@ -237,7 +178,7 @@ public struct UTF8 : UnicodeCodecType {
237
178
238
179
_sanityCheck ( validBytes != 0 ,
239
180
" input buffer should not be empty " )
240
- _sanityCheck ( !UTF8. _isValidUTF8 ( buffer, validBytes : validBytes ) ,
181
+ _sanityCheck ( !UTF8. _isValidUTF8 ( buffer) ,
241
182
" input sequence should be ill-formed UTF-8 " )
242
183
243
184
// Unicode 6.3.0, D93b:
@@ -391,7 +332,7 @@ public struct UTF8 : UnicodeCodecType {
391
332
// The first byte to read is located at MSB of `_decodeLookahead`. Get a
392
333
// representation of the buffer where we can read bytes starting from LSB.
393
334
var buffer = _decodeLookahead. byteSwapped
394
- if _slowPath ( !UTF8. _isValidUTF8 ( buffer, validBytes : _lookaheadFlags ) ) {
335
+ if _slowPath ( !UTF8. _isValidUTF8 ( buffer) ) {
395
336
// The code unit sequence is ill-formed. According to Unicode
396
337
// recommendation, replace the maximal subpart of ill-formed sequence
397
338
// with one replacement character.
0 commit comments