@@ -156,12 +156,6 @@ extension Unicode.Scalar {
156
156
// including and above the DEL character U+7F.
157
157
return self . value >= 0x20 && self . value < 0x7F
158
158
}
159
-
160
- var isStartOfUTF8Character : Bool {
161
- // RFC 2279: The octet values FE and FF never appear.
162
- // RFC 3629: The octet values C0, C1, F5 to FF never appear.
163
- return self . value <= 0x80 || ( self . value >= 0xC2 && self . value < 0xF5 )
164
- }
165
159
}
166
160
167
161
extension Unicode . Scalar {
@@ -179,20 +173,22 @@ extension Unicode.Scalar {
179
173
return Unicode . Scalar ( curByte)
180
174
}
181
175
182
- // Read the number of high bits set, which indicates the number of bytes in
183
- // the character.
184
- let encodedBytes = ( ~ ( UInt32 ( curByte) << 24 ) ) . leadingZeroBitCount
185
-
186
- // If this is 0b10XXXXXX, then it is a continuation character.
187
- if encodedBytes == 1 || !Unicode. Scalar ( curByte) . isStartOfUTF8Character {
176
+ // If this is not the start of a UTF-8 character,
177
+ // then it is either a continuation byte or a byte that never appears in valid UTF-8.
178
+ if !curByte. isStartOfUTF8Character {
188
179
// Skip until we get the start of another character. This is guaranteed to
189
180
// at least stop at the nul at the end of the buffer.
190
- while let peeked = peek ( ) , !Unicode . Scalar ( peeked) . isStartOfUTF8Character {
181
+ while let peeked = peek ( ) , !peeked. isStartOfUTF8Character {
191
182
_ = advance ( )
192
183
}
193
184
return nil
194
185
}
195
186
187
+ // Read the number of high bits set, which indicates the number of bytes in
188
+ // the character.
189
+ let encodedBytes = ( ~ curByte) . leadingZeroBitCount
190
+ precondition ( encodedBytes >= 2 && encodedBytes <= 4 )
191
+
196
192
// Drop the high bits indicating the # bytes of the result.
197
193
var charValue = UInt32 ( curByte << encodedBytes) >> encodedBytes
198
194
@@ -252,3 +248,11 @@ extension Unicode.Scalar {
252
248
return self . lexing ( advance: advance, peek: peek)
253
249
}
254
250
}
251
+
252
extension UInt8 {
  /// Whether this byte may begin a UTF-8 encoded character.
  ///
  /// The result is `true` for bytes below 0x80 and for bytes in the range
  /// 0xC2 through 0xF4; it is `false` for every other byte value.
  var isStartOfUTF8Character: Bool {
    switch self {
    case 0x00...0x7F, 0xC2...0xF4:
      return true
    default:
      // Bytes 0x80 through 0xC1 and 0xF5 through 0xFF are rejected.
      // RFC 2279: The octet values FE and FF never appear.
      // RFC 3629: The octet values C0, C1, F5 to FF never appear.
      return false
    }
  }
}
0 commit comments