Skip to content

Commit 4149fcd

Browse files
committed
core: add try_next_code_point{,_reverse} internal functions
1 parent d01dc93 commit 4149fcd

File tree

2 files changed

+175
-70
lines changed

2 files changed

+175
-70
lines changed

library/core/src/str/mod.rs

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,9 @@ pub use iter::SplitAsciiWhitespace;
6969
pub use iter::SplitInclusive;
7070

7171
#[unstable(feature = "str_internals", issue = "none")]
72-
pub use validations::{next_code_point, utf8_char_width};
72+
pub use validations::{
73+
next_code_point, try_next_code_point, try_next_code_point_reverse, utf8_char_width,
74+
};
7375

7476
use iter::MatchIndicesInternal;
7577
use iter::MatchesInternal;

library/core/src/str/validations.rs

Lines changed: 172 additions & 69 deletions
Original file line numberDiff line numberDiff line change
@@ -120,6 +120,87 @@ const fn contains_nonascii(x: usize) -> bool {
120120
(x & NONASCII_MASK) != 0
121121
}
122122

123+
/// Reads the first code point out of a byte slice validating whether it’s
124+
/// valid.
125+
///
126+
/// This is different than [`next_code_point`] in that it doesn’t assume
127+
/// argument is well-formed UTF-8-like string. Together with the character its
128+
/// encoded length is returned.
129+
///
130+
/// If front of the bytes slice doesn’t contain valid UTF-8 bytes sequence (that
131+
/// includes a WTF-8 encoded surrogate) returns `None`.
132+
///
133+
/// ```
134+
/// #![feature(str_internals)]
135+
/// use core::str::try_next_code_point;
136+
///
137+
/// assert_eq!(Some(('f', 1)), try_next_code_point(b"foo".as_ref()));
138+
/// assert_eq!(Some(('Ż', 2)), try_next_code_point("Żółw".as_bytes()));
139+
/// assert_eq!(None, try_next_code_point(b"\xffoo".as_ref()));
140+
/// ```
141+
#[unstable(feature = "str_internals", issue = "none")]
142+
#[inline]
143+
pub const fn try_next_code_point(bytes: &[u8]) -> Option<(char, usize)> {
144+
let first = match bytes.first() {
145+
Some(&byte) => byte,
146+
None => return None,
147+
};
148+
let (value, length) = if first < 0x80 {
149+
(first as u32, 1)
150+
} else if let Ok((cp, len)) = try_finish_byte_sequence(first, bytes, 0) {
151+
(cp, len)
152+
} else {
153+
return None;
154+
};
155+
// SAFETY: We’ve just verified value is correct Unicode scalar value.
156+
// Either ASCII (first branch of the if-else-if-else) or non-ASCII Unicode
157+
// character (second branch).
158+
Some((unsafe { char::from_u32_unchecked(value) }, length))
159+
}
160+
161+
/// Reads the last code point out of a byte slice validating whether it’s
162+
/// valid.
163+
///
164+
/// This is different than `next_code_point_reverse` in that it doesn’t assume
165+
/// argument is well-formed UTF-8-like string. Together with the character its
166+
/// encoded length is returned.
167+
///
168+
/// If back of the bytes slice doesn’t contain valid UTF-8 bytes sequence (that
169+
/// includes a WTF-8 encoded surrogate) returns `None`.
170+
///
171+
/// ```
172+
/// #![feature(str_internals)]
173+
/// use core::str::try_next_code_point_reverse;
174+
///
175+
/// assert_eq!(Some(('o', 1)), try_next_code_point_reverse(b"foo".as_ref()));
176+
/// assert_eq!(Some(('‽', 3)), try_next_code_point_reverse("Uh‽".as_bytes()));
177+
/// assert_eq!(None, try_next_code_point_reverse(b"foo\xff".as_ref()));
178+
/// ```
179+
#[unstable(feature = "str_internals", issue = "none")]
180+
#[inline]
181+
pub const fn try_next_code_point_reverse(bytes: &[u8]) -> Option<(char, usize)> {
182+
let mut n = 1;
183+
let limit = bytes.len();
184+
let limit = if limit < 4 { limit } else { 4 }; // not .min(4) because of const
185+
while n <= limit && !bytes[bytes.len() - n].is_utf8_char_boundary() {
186+
n += 1;
187+
}
188+
if n <= limit {
189+
// It’s not clear to me why, but range indexing isn’t const here,
190+
// i.e. `&bytes[bytes.len() - n..]` doesn’t compile. Because of that
191+
// I’m resorting to unsafe block with from_raw_parts.
192+
// SAFETY: n ≤ limit ≤ bytes.len() thus bytes.len() - n ≥ 0 and we
193+
// have n remaining bytes.
194+
let bytes = unsafe { crate::slice::from_raw_parts(bytes.as_ptr().add(bytes.len() - n), n) };
195+
if let Some((chr, len)) = try_next_code_point(bytes) {
196+
if n == len {
197+
return Some((chr, len));
198+
}
199+
}
200+
}
201+
None
202+
}
203+
123204
/// Walks through `v` checking that it's a valid UTF-8 sequence,
124205
/// returning `Ok(())` in that case, or, if it is invalid, `Err(err)`.
125206
#[inline(always)]
@@ -134,78 +215,13 @@ pub(super) const fn run_utf8_validation(v: &[u8]) -> Result<(), Utf8Error> {
134215
let align = v.as_ptr().align_offset(usize_bytes);
135216

136217
while index < len {
137-
let old_offset = index;
138-
macro_rules! err {
139-
($error_len: expr) => {
140-
return Err(Utf8Error { valid_up_to: old_offset, error_len: $error_len })
141-
};
142-
}
143-
144-
macro_rules! next {
145-
() => {{
146-
index += 1;
147-
// we needed data, but there was none: error!
148-
if index >= len {
149-
err!(None)
150-
}
151-
v[index]
152-
}};
153-
}
154-
218+
let valid_up_to = index;
155219
let first = v[index];
156220
if first >= 128 {
157-
let w = utf8_char_width(first);
158-
// 2-byte encoding is for codepoints \u{0080} to \u{07ff}
159-
// first C2 80 last DF BF
160-
// 3-byte encoding is for codepoints \u{0800} to \u{ffff}
161-
// first E0 A0 80 last EF BF BF
162-
// excluding surrogates codepoints \u{d800} to \u{dfff}
163-
// ED A0 80 to ED BF BF
164-
// 4-byte encoding is for codepoints \u{1000}0 to \u{10ff}ff
165-
// first F0 90 80 80 last F4 8F BF BF
166-
//
167-
// Use the UTF-8 syntax from the RFC
168-
//
169-
// https://tools.ietf.org/html/rfc3629
170-
// UTF8-1 = %x00-7F
171-
// UTF8-2 = %xC2-DF UTF8-tail
172-
// UTF8-3 = %xE0 %xA0-BF UTF8-tail / %xE1-EC 2( UTF8-tail ) /
173-
// %xED %x80-9F UTF8-tail / %xEE-EF 2( UTF8-tail )
174-
// UTF8-4 = %xF0 %x90-BF 2( UTF8-tail ) / %xF1-F3 3( UTF8-tail ) /
175-
// %xF4 %x80-8F 2( UTF8-tail )
176-
match w {
177-
2 => {
178-
if next!() as i8 >= -64 {
179-
err!(Some(1))
180-
}
181-
}
182-
3 => {
183-
match (first, next!()) {
184-
(0xE0, 0xA0..=0xBF)
185-
| (0xE1..=0xEC, 0x80..=0xBF)
186-
| (0xED, 0x80..=0x9F)
187-
| (0xEE..=0xEF, 0x80..=0xBF) => {}
188-
_ => err!(Some(1)),
189-
}
190-
if next!() as i8 >= -64 {
191-
err!(Some(2))
192-
}
193-
}
194-
4 => {
195-
match (first, next!()) {
196-
(0xF0, 0x90..=0xBF) | (0xF1..=0xF3, 0x80..=0xBF) | (0xF4, 0x80..=0x8F) => {}
197-
_ => err!(Some(1)),
198-
}
199-
if next!() as i8 >= -64 {
200-
err!(Some(2))
201-
}
202-
if next!() as i8 >= -64 {
203-
err!(Some(3))
204-
}
205-
}
206-
_ => err!(Some(1)),
221+
match try_finish_byte_sequence(first, v, index) {
222+
Ok((_value, length)) => index += length,
223+
Err(error_len) => return Err(Utf8Error { valid_up_to, error_len }),
207224
}
208-
index += 1;
209225
} else {
210226
// Ascii case, try to skip forward quickly.
211227
// When the pointer is aligned, read 2 words of data per iteration
@@ -241,6 +257,93 @@ pub(super) const fn run_utf8_validation(v: &[u8]) -> Result<(), Utf8Error> {
241257
Ok(())
242258
}
243259

260+
/// Try to finish an UTF-8 byte sequence.
261+
///
262+
/// Assumes that `bytes[index] == first` and than `first >= 128`, i.e. that
263+
/// `index` points at the beginning of a non-ASCII UTF-8 sequence in `bytes`.
264+
///
265+
/// If the byte sequence at the index is correct, returns decoded code point and
266+
/// length of the sequence. If it was invalid returns number of invalid bytes
267+
/// or None if read was cut short.
268+
#[inline(always)]
269+
#[rustc_const_unstable(feature = "str_internals", issue = "none")]
270+
const fn try_finish_byte_sequence(
271+
first: u8,
272+
bytes: &[u8],
273+
index: usize,
274+
) -> Result<(u32, usize), Option<u8>> {
275+
macro_rules! get {
276+
(raw $offset:expr) => {
277+
if index + $offset < bytes.len() {
278+
bytes[index + $offset]
279+
} else {
280+
return Err(None)
281+
}
282+
};
283+
(cont $offset:expr) => {{
284+
let byte = get!(raw $offset);
285+
if !utf8_is_cont_byte(byte) {
286+
return Err(Some($offset as u8))
287+
}
288+
byte
289+
}}
290+
}
291+
292+
// 2-byte encoding is for codepoints \u{0080} to \u{07ff}
293+
// first C2 80 last DF BF
294+
// 3-byte encoding is for codepoints \u{0800} to \u{ffff}
295+
// first E0 A0 80 last EF BF BF
296+
// excluding surrogates codepoints \u{d800} to \u{dfff}
297+
// ED A0 80 to ED BF BF
298+
// 4-byte encoding is for codepoints \u{1000}0 to \u{10ff}ff
299+
// first F0 90 80 80 last F4 8F BF BF
300+
//
301+
// Use the UTF-8 syntax from the RFC
302+
//
303+
// https://tools.ietf.org/html/rfc3629
304+
// UTF8-1 = %x00-7F
305+
// UTF8-2 = %xC2-DF UTF8-tail
306+
// UTF8-3 = %xE0 %xA0-BF UTF8-tail / %xE1-EC 2( UTF8-tail ) /
307+
// %xED %x80-9F UTF8-tail / %xEE-EF 2( UTF8-tail )
308+
// UTF8-4 = %xF0 %x90-BF 2( UTF8-tail ) / %xF1-F3 3( UTF8-tail ) /
309+
// %xF4 %x80-8F 2( UTF8-tail )
310+
match utf8_char_width(first) {
311+
2 => {
312+
let second = get!(cont 1);
313+
let value = utf8_first_byte(first, 3);
314+
let value = utf8_acc_cont_byte(value, second);
315+
Ok((value, 2))
316+
}
317+
3 => {
318+
let second = get!(raw 1);
319+
match (first, second) {
320+
(0xE0, 0xA0..=0xBF)
321+
| (0xE1..=0xEC, 0x80..=0xBF)
322+
| (0xED, 0x80..=0x9F)
323+
| (0xEE..=0xEF, 0x80..=0xBF) => {}
324+
_ => return Err(Some(1)),
325+
}
326+
let value = utf8_first_byte(first, 3);
327+
let value = utf8_acc_cont_byte(value, second);
328+
let value = utf8_acc_cont_byte(value, get!(cont 2));
329+
Ok((value, 3))
330+
}
331+
4 => {
332+
let second = get!(raw 1);
333+
match (first, second) {
334+
(0xF0, 0x90..=0xBF) | (0xF1..=0xF3, 0x80..=0xBF) | (0xF4, 0x80..=0x8F) => {}
335+
_ => return Err(Some(1)),
336+
}
337+
let value = utf8_first_byte(first, 4);
338+
let value = utf8_acc_cont_byte(value, second);
339+
let value = utf8_acc_cont_byte(value, get!(cont 2));
340+
let value = utf8_acc_cont_byte(value, get!(cont 3));
341+
Ok((value, 4))
342+
}
343+
_ => Err(Some(1)),
344+
}
345+
}
346+
244347
// https://tools.ietf.org/html/rfc3629
245348
const UTF8_CHAR_WIDTH: &[u8; 256] = &[
246349
// 1 2 3 4 5 6 7 8 9 A B C D E F

0 commit comments

Comments
 (0)