Skip to content

Commit fdb0aa6

Browse files
committed
core: add concept of Flavour to core::str_bytes
Since the core::str_bytes module cannot assume that the byte slices it deals with are well-formed UTF-8 (or even WTF-8), the code must be defensive and accept invalid sequences. This eliminates optimisations which would otherwise be possible. Introduce a `Flavour` trait which tags the `Bytes` type with information about the byte sequence. For example, if a `Bytes` object is created from a `&str`, it’s tagged with the `Utf8` flavour, which gives the code freedom to assume the data is well-formed UTF-8. This brings back all the optimisations removed in the previous commit.
1 parent c9dce1b commit fdb0aa6

File tree

4 files changed

+484
-168
lines changed

4 files changed

+484
-168
lines changed

library/core/src/str/mod.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,7 @@ pub use iter::SplitAsciiWhitespace;
6868
#[stable(feature = "split_inclusive", since = "1.51.0")]
6969
pub use iter::SplitInclusive;
7070

71+
pub(crate) use validations::next_code_point_reverse;
7172
#[unstable(feature = "str_internals", issue = "none")]
7273
pub use validations::{
7374
next_code_point, try_next_code_point, try_next_code_point_reverse, utf8_char_width,

library/core/src/str/pattern.rs

Lines changed: 6 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -91,7 +91,7 @@ impl<'a> Haystack for &'a str {
9191

9292
/// Associated type for `<char as Pattern<H>>::Searcher`.
9393
#[derive(Clone, Debug)]
94-
pub struct CharSearcher<'a>(str_bytes::CharSearcher<'a>);
94+
pub struct CharSearcher<'a>(str_bytes::CharSearcher<'a, str_bytes::Utf8>);
9595

9696
impl<'a> CharSearcher<'a> {
9797
fn new(haystack: &'a str, chr: char) -> Self {
@@ -102,9 +102,7 @@ impl<'a> CharSearcher<'a> {
102102
unsafe impl<'a> Searcher<&'a str> for CharSearcher<'a> {
103103
#[inline]
104104
fn haystack(&self) -> &'a str {
105-
// SAFETY: self.0’s haystack was created from &str thus it is valid
106-
// UTF-8.
107-
unsafe { super::from_utf8_unchecked(self.0.haystack().as_bytes()) }
105+
self.0.haystack().into()
108106
}
109107
#[inline]
110108
fn next(&mut self) -> SearchStep {
@@ -165,12 +163,7 @@ impl<'a> Pattern<&'a str> for char {
165163

166164
#[inline]
167165
fn strip_prefix_of(self, haystack: &'a str) -> Option<&'a str> {
168-
self.strip_prefix_of(str_bytes::Bytes::from(haystack)).map(|bytes| {
169-
// SAFETY: Bytes were created from &str and Bytes never splits
170-
// inside of UTF-8 bytes sequences thus `bytes` is still valid
171-
// UTF-8.
172-
unsafe { super::from_utf8_unchecked(bytes.as_bytes()) }
173-
})
166+
self.strip_prefix_of(str_bytes::Bytes::from(haystack)).map(<&str>::from)
174167
}
175168

176169
#[inline]
@@ -180,12 +173,7 @@ impl<'a> Pattern<&'a str> for char {
180173

181174
#[inline]
182175
fn strip_suffix_of(self, haystack: &'a str) -> Option<&'a str> {
183-
self.strip_suffix_of(str_bytes::Bytes::from(haystack)).map(|bytes| {
184-
// SAFETY: Bytes were created from &str and Bytes never splits
185-
// inside of UTF-8 bytes sequences thus `bytes` is still valid
186-
// UTF-8.
187-
unsafe { super::from_utf8_unchecked(bytes.as_bytes()) }
188-
})
176+
self.strip_suffix_of(str_bytes::Bytes::from(haystack)).map(<&str>::from)
189177
}
190178
}
191179

@@ -613,7 +601,7 @@ impl<'a, 'b> Pattern<&'a str> for &'b str {
613601

614602
#[derive(Clone, Debug)]
615603
/// Associated type for `<&str as Pattern<&'a str>>::Searcher`.
616-
pub struct StrSearcher<'a, 'b>(crate::str_bytes::StrSearcher<'a, 'b>);
604+
pub struct StrSearcher<'a, 'b>(crate::str_bytes::StrSearcher<'a, 'b, crate::str_bytes::Utf8>);
617605

618606
impl<'a, 'b> StrSearcher<'a, 'b> {
619607
fn new(haystack: &'a str, needle: &'b str) -> StrSearcher<'a, 'b> {
@@ -625,9 +613,7 @@ impl<'a, 'b> StrSearcher<'a, 'b> {
625613
unsafe impl<'a, 'b> Searcher<&'a str> for StrSearcher<'a, 'b> {
626614
#[inline]
627615
fn haystack(&self) -> &'a str {
628-
let bytes = self.0.haystack().as_bytes();
629-
// SAFETY: self.0.haystack() was created from a &str.
630-
unsafe { crate::str::from_utf8_unchecked(bytes) }
616+
self.0.haystack().into()
631617
}
632618

633619
#[inline]

library/core/src/str/validations.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,7 @@ pub unsafe fn next_code_point<'a, I: Iterator<Item = &'a u8>>(bytes: &mut I) ->
7676
///
7777
/// `bytes` must produce a valid UTF-8-like (UTF-8 or WTF-8) string
7878
#[inline]
79-
pub(super) unsafe fn next_code_point_reverse<'a, I>(bytes: &mut I) -> Option<u32>
79+
pub(crate) unsafe fn next_code_point_reverse<'a, I>(bytes: &mut I) -> Option<u32>
8080
where
8181
I: DoubleEndedIterator<Item = &'a u8>,
8282
{

0 commit comments

Comments
 (0)