@@ -120,6 +120,87 @@ const fn contains_nonascii(x: usize) -> bool {
120
120
( x & NONASCII_MASK ) != 0
121
121
}
122
122
123
+ /// Reads the first code point out of a byte slice validating whether it’s
124
+ /// valid.
125
+ ///
126
+ /// This is different than [`next_code_point`] in that it doesn’t assume
127
+ /// argument is well-formed UTF-8-like string. Together with the character its
128
+ /// encoded length is returned.
129
+ ///
130
+ /// If front of the bytes slice doesn’t contain valid UTF-8 bytes sequence (that
131
+ /// includes a WTF-8 encoded surrogate) returns `None`.
132
+ ///
133
+ /// ```
134
+ /// #![feature(str_internals)]
135
+ /// use core::str::try_next_code_point;
136
+ ///
137
+ /// assert_eq!(Some(('f', 1)), try_next_code_point(b"foo".as_ref()));
138
+ /// assert_eq!(Some(('Ż', 2)), try_next_code_point("Żółw".as_bytes()));
139
+ /// assert_eq!(None, try_next_code_point(b"\xffoo".as_ref()));
140
+ /// ```
141
+ #[ unstable( feature = "str_internals" , issue = "none" ) ]
142
+ #[ inline]
143
+ pub const fn try_next_code_point ( bytes : & [ u8 ] ) -> Option < ( char , usize ) > {
144
+ let first = match bytes. first ( ) {
145
+ Some ( & byte) => byte,
146
+ None => return None ,
147
+ } ;
148
+ let ( value, length) = if first < 0x80 {
149
+ ( first as u32 , 1 )
150
+ } else if let Ok ( ( cp, len) ) = try_finish_byte_sequence ( first, bytes, 0 ) {
151
+ ( cp, len)
152
+ } else {
153
+ return None ;
154
+ } ;
155
+ // SAFETY: We’ve just verified value is correct Unicode scalar value.
156
+ // Either ASCII (first branch of the if-else-if-else) or non-ASCII Unicode
157
+ // character (second branch).
158
+ Some ( ( unsafe { char:: from_u32_unchecked ( value) } , length) )
159
+ }
160
+
161
+ /// Reads the last code point out of a byte slice validating whether it’s
162
+ /// valid.
163
+ ///
164
+ /// This is different than `next_code_point_reverse` in that it doesn’t assume
165
+ /// argument is well-formed UTF-8-like string. Together with the character its
166
+ /// encoded length is returned.
167
+ ///
168
+ /// If back of the bytes slice doesn’t contain valid UTF-8 bytes sequence (that
169
+ /// includes a WTF-8 encoded surrogate) returns `None`.
170
+ ///
171
+ /// ```
172
+ /// #![feature(str_internals)]
173
+ /// use core::str::try_next_code_point_reverse;
174
+ ///
175
+ /// assert_eq!(Some(('o', 1)), try_next_code_point_reverse(b"foo".as_ref()));
176
+ /// assert_eq!(Some(('‽', 3)), try_next_code_point_reverse("Uh‽".as_bytes()));
177
+ /// assert_eq!(None, try_next_code_point_reverse(b"foo\xff".as_ref()));
178
+ /// ```
179
+ #[ unstable( feature = "str_internals" , issue = "none" ) ]
180
+ #[ inline]
181
+ pub const fn try_next_code_point_reverse ( bytes : & [ u8 ] ) -> Option < ( char , usize ) > {
182
+ let mut n = 1 ;
183
+ let limit = bytes. len ( ) ;
184
+ let limit = if limit < 4 { limit } else { 4 } ; // not .min(4) because of const
185
+ while n <= limit && !bytes[ bytes. len ( ) - n] . is_utf8_char_boundary ( ) {
186
+ n += 1 ;
187
+ }
188
+ if n <= limit {
189
+ // It’s not clear to me why, but range indexing isn’t const here,
190
+ // i.e. `&bytes[bytes.len() - n..]` doesn’t compile. Because of that
191
+ // I’m resorting to unsafe block with from_raw_parts.
192
+ // SAFETY: n ≤ limit ≤ bytes.len() thus bytes.len() - n ≥ 0 and we
193
+ // have n remaining bytes.
194
+ let bytes = unsafe { crate :: slice:: from_raw_parts ( bytes. as_ptr ( ) . add ( bytes. len ( ) - n) , n) } ;
195
+ if let Some ( ( chr, len) ) = try_next_code_point ( bytes) {
196
+ if n == len {
197
+ return Some ( ( chr, len) ) ;
198
+ }
199
+ }
200
+ }
201
+ None
202
+ }
203
+
123
204
/// Walks through `v` checking that it's a valid UTF-8 sequence,
124
205
/// returning `Ok(())` in that case, or, if it is invalid, `Err(err)`.
125
206
#[ inline( always) ]
@@ -134,78 +215,13 @@ pub(super) const fn run_utf8_validation(v: &[u8]) -> Result<(), Utf8Error> {
134
215
let align = v. as_ptr ( ) . align_offset ( usize_bytes) ;
135
216
136
217
while index < len {
137
- let old_offset = index;
138
- macro_rules! err {
139
- ( $error_len: expr) => {
140
- return Err ( Utf8Error { valid_up_to: old_offset, error_len: $error_len } )
141
- } ;
142
- }
143
-
144
- macro_rules! next {
145
- ( ) => { {
146
- index += 1 ;
147
- // we needed data, but there was none: error!
148
- if index >= len {
149
- err!( None )
150
- }
151
- v[ index]
152
- } } ;
153
- }
154
-
218
+ let valid_up_to = index;
155
219
let first = v[ index] ;
156
220
if first >= 128 {
157
- let w = utf8_char_width ( first) ;
158
- // 2-byte encoding is for codepoints \u{0080} to \u{07ff}
159
- // first C2 80 last DF BF
160
- // 3-byte encoding is for codepoints \u{0800} to \u{ffff}
161
- // first E0 A0 80 last EF BF BF
162
- // excluding surrogates codepoints \u{d800} to \u{dfff}
163
- // ED A0 80 to ED BF BF
164
- // 4-byte encoding is for codepoints \u{1000}0 to \u{10ff}ff
165
- // first F0 90 80 80 last F4 8F BF BF
166
- //
167
- // Use the UTF-8 syntax from the RFC
168
- //
169
- // https://tools.ietf.org/html/rfc3629
170
- // UTF8-1 = %x00-7F
171
- // UTF8-2 = %xC2-DF UTF8-tail
172
- // UTF8-3 = %xE0 %xA0-BF UTF8-tail / %xE1-EC 2( UTF8-tail ) /
173
- // %xED %x80-9F UTF8-tail / %xEE-EF 2( UTF8-tail )
174
- // UTF8-4 = %xF0 %x90-BF 2( UTF8-tail ) / %xF1-F3 3( UTF8-tail ) /
175
- // %xF4 %x80-8F 2( UTF8-tail )
176
- match w {
177
- 2 => {
178
- if next ! ( ) as i8 >= -64 {
179
- err ! ( Some ( 1 ) )
180
- }
181
- }
182
- 3 => {
183
- match ( first, next ! ( ) ) {
184
- ( 0xE0 , 0xA0 ..=0xBF )
185
- | ( 0xE1 ..=0xEC , 0x80 ..=0xBF )
186
- | ( 0xED , 0x80 ..=0x9F )
187
- | ( 0xEE ..=0xEF , 0x80 ..=0xBF ) => { }
188
- _ => err ! ( Some ( 1 ) ) ,
189
- }
190
- if next ! ( ) as i8 >= -64 {
191
- err ! ( Some ( 2 ) )
192
- }
193
- }
194
- 4 => {
195
- match ( first, next ! ( ) ) {
196
- ( 0xF0 , 0x90 ..=0xBF ) | ( 0xF1 ..=0xF3 , 0x80 ..=0xBF ) | ( 0xF4 , 0x80 ..=0x8F ) => { }
197
- _ => err ! ( Some ( 1 ) ) ,
198
- }
199
- if next ! ( ) as i8 >= -64 {
200
- err ! ( Some ( 2 ) )
201
- }
202
- if next ! ( ) as i8 >= -64 {
203
- err ! ( Some ( 3 ) )
204
- }
205
- }
206
- _ => err ! ( Some ( 1 ) ) ,
221
+ match try_finish_byte_sequence ( first, v, index) {
222
+ Ok ( ( _value, length) ) => index += length,
223
+ Err ( error_len) => return Err ( Utf8Error { valid_up_to, error_len } ) ,
207
224
}
208
- index += 1 ;
209
225
} else {
210
226
// Ascii case, try to skip forward quickly.
211
227
// When the pointer is aligned, read 2 words of data per iteration
@@ -241,6 +257,93 @@ pub(super) const fn run_utf8_validation(v: &[u8]) -> Result<(), Utf8Error> {
241
257
Ok ( ( ) )
242
258
}
243
259
260
+ /// Try to finish an UTF-8 byte sequence.
261
+ ///
262
+ /// Assumes that `bytes[index] == first` and than `first >= 128`, i.e. that
263
+ /// `index` points at the beginning of a non-ASCII UTF-8 sequence in `bytes`.
264
+ ///
265
+ /// If the byte sequence at the index is correct, returns decoded code point and
266
+ /// length of the sequence. If it was invalid returns number of invalid bytes
267
+ /// or None if read was cut short.
268
+ #[ inline( always) ]
269
+ #[ rustc_const_unstable( feature = "str_internals" , issue = "none" ) ]
270
+ const fn try_finish_byte_sequence (
271
+ first : u8 ,
272
+ bytes : & [ u8 ] ,
273
+ index : usize ,
274
+ ) -> Result < ( u32 , usize ) , Option < u8 > > {
275
+ macro_rules! get {
276
+ ( raw $offset: expr) => {
277
+ if index + $offset < bytes. len( ) {
278
+ bytes[ index + $offset]
279
+ } else {
280
+ return Err ( None )
281
+ }
282
+ } ;
283
+ ( cont $offset: expr) => { {
284
+ let byte = get!( raw $offset) ;
285
+ if !utf8_is_cont_byte( byte) {
286
+ return Err ( Some ( $offset as u8 ) )
287
+ }
288
+ byte
289
+ } }
290
+ }
291
+
292
+ // 2-byte encoding is for codepoints \u{0080} to \u{07ff}
293
+ // first C2 80 last DF BF
294
+ // 3-byte encoding is for codepoints \u{0800} to \u{ffff}
295
+ // first E0 A0 80 last EF BF BF
296
+ // excluding surrogates codepoints \u{d800} to \u{dfff}
297
+ // ED A0 80 to ED BF BF
298
+ // 4-byte encoding is for codepoints \u{1000}0 to \u{10ff}ff
299
+ // first F0 90 80 80 last F4 8F BF BF
300
+ //
301
+ // Use the UTF-8 syntax from the RFC
302
+ //
303
+ // https://tools.ietf.org/html/rfc3629
304
+ // UTF8-1 = %x00-7F
305
+ // UTF8-2 = %xC2-DF UTF8-tail
306
+ // UTF8-3 = %xE0 %xA0-BF UTF8-tail / %xE1-EC 2( UTF8-tail ) /
307
+ // %xED %x80-9F UTF8-tail / %xEE-EF 2( UTF8-tail )
308
+ // UTF8-4 = %xF0 %x90-BF 2( UTF8-tail ) / %xF1-F3 3( UTF8-tail ) /
309
+ // %xF4 %x80-8F 2( UTF8-tail )
310
+ match utf8_char_width ( first) {
311
+ 2 => {
312
+ let second = get ! ( cont 1 ) ;
313
+ let value = utf8_first_byte ( first, 3 ) ;
314
+ let value = utf8_acc_cont_byte ( value, second) ;
315
+ Ok ( ( value, 2 ) )
316
+ }
317
+ 3 => {
318
+ let second = get ! ( raw 1 ) ;
319
+ match ( first, second) {
320
+ ( 0xE0 , 0xA0 ..=0xBF )
321
+ | ( 0xE1 ..=0xEC , 0x80 ..=0xBF )
322
+ | ( 0xED , 0x80 ..=0x9F )
323
+ | ( 0xEE ..=0xEF , 0x80 ..=0xBF ) => { }
324
+ _ => return Err ( Some ( 1 ) ) ,
325
+ }
326
+ let value = utf8_first_byte ( first, 3 ) ;
327
+ let value = utf8_acc_cont_byte ( value, second) ;
328
+ let value = utf8_acc_cont_byte ( value, get ! ( cont 2 ) ) ;
329
+ Ok ( ( value, 3 ) )
330
+ }
331
+ 4 => {
332
+ let second = get ! ( raw 1 ) ;
333
+ match ( first, second) {
334
+ ( 0xF0 , 0x90 ..=0xBF ) | ( 0xF1 ..=0xF3 , 0x80 ..=0xBF ) | ( 0xF4 , 0x80 ..=0x8F ) => { }
335
+ _ => return Err ( Some ( 1 ) ) ,
336
+ }
337
+ let value = utf8_first_byte ( first, 4 ) ;
338
+ let value = utf8_acc_cont_byte ( value, second) ;
339
+ let value = utf8_acc_cont_byte ( value, get ! ( cont 2 ) ) ;
340
+ let value = utf8_acc_cont_byte ( value, get ! ( cont 3 ) ) ;
341
+ Ok ( ( value, 4 ) )
342
+ }
343
+ _ => Err ( Some ( 1 ) ) ,
344
+ }
345
+ }
346
+
244
347
// https://tools.ietf.org/html/rfc3629
245
348
const UTF8_CHAR_WIDTH : & [ u8 ; 256 ] = & [
246
349
// 1 2 3 4 5 6 7 8 9 A B C D E F
0 commit comments