core: add concept of Flavour to core::str_bytes

mina86 · mina86 · commit fdb0aa6c8762 · 2023-05-05T22:12:20.000+02:00
Since the core::str_bytes module cannot assume that the byte slices it
deals with are well-formed UTF-8 (or even WTF-8), the code must be
defensive and accept invalid sequences.  This rules out optimisations
which would otherwise be possible.

Introduce a `Flavour` trait which tags the `Bytes` type with information
about the byte sequence.  For example, if a `Bytes` object is created
from `&str`, it’s tagged with the `Utf8` flavour, which gives the code
freedom to assume the data is well-formed UTF-8.

This brings back all the optimisations removed in the previous commit.
diff --git a/library/core/src/str/mod.rs b/library/core/src/str/mod.rs
@@ -68,6 +68,7 @@ pub use iter::SplitAsciiWhitespace;
 #[stable(feature = "split_inclusive", since = "1.51.0")]
 pub use iter::SplitInclusive;
 
+pub(crate) use validations::next_code_point_reverse;
 #[unstable(feature = "str_internals", issue = "none")]
 pub use validations::{
     next_code_point, try_next_code_point, try_next_code_point_reverse, utf8_char_width,
diff --git a/library/core/src/str/pattern.rs b/library/core/src/str/pattern.rs
@@ -91,7 +91,7 @@ impl<'a> Haystack for &'a str {
 
 /// Associated type for `<char as Pattern<H>>::Searcher`.
 #[derive(Clone, Debug)]
-pub struct CharSearcher<'a>(str_bytes::CharSearcher<'a>);
+pub struct CharSearcher<'a>(str_bytes::CharSearcher<'a, str_bytes::Utf8>);
 
 impl<'a> CharSearcher<'a> {
     fn new(haystack: &'a str, chr: char) -> Self {
@@ -102,9 +102,7 @@ impl<'a> CharSearcher<'a> {
 unsafe impl<'a> Searcher<&'a str> for CharSearcher<'a> {
     #[inline]
     fn haystack(&self) -> &'a str {
-        // SAFETY: self.0’s haystack was created from &str thus it is valid
-        // UTF-8.
-        unsafe { super::from_utf8_unchecked(self.0.haystack().as_bytes()) }
+        self.0.haystack().into()
     }
     #[inline]
     fn next(&mut self) -> SearchStep {
@@ -165,12 +163,7 @@ impl<'a> Pattern<&'a str> for char {
 
     #[inline]
     fn strip_prefix_of(self, haystack: &'a str) -> Option<&'a str> {
-        self.strip_prefix_of(str_bytes::Bytes::from(haystack)).map(|bytes| {
-            // SAFETY: Bytes were created from &str and Bytes never splits
-            // inside of UTF-8 bytes sequences thus `bytes` is still valid
-            // UTF-8.
-            unsafe { super::from_utf8_unchecked(bytes.as_bytes()) }
-        })
+        self.strip_prefix_of(str_bytes::Bytes::from(haystack)).map(<&str>::from)
     }
 
     #[inline]
@@ -180,12 +173,7 @@ impl<'a> Pattern<&'a str> for char {
 
     #[inline]
     fn strip_suffix_of(self, haystack: &'a str) -> Option<&'a str> {
-        self.strip_suffix_of(str_bytes::Bytes::from(haystack)).map(|bytes| {
-            // SAFETY: Bytes were created from &str and Bytes never splits
-            // inside of UTF-8 bytes sequences thus `bytes` is still valid
-            // UTF-8.
-            unsafe { super::from_utf8_unchecked(bytes.as_bytes()) }
-        })
+        self.strip_suffix_of(str_bytes::Bytes::from(haystack)).map(<&str>::from)
     }
 }
 
@@ -613,7 +601,7 @@ impl<'a, 'b> Pattern<&'a str> for &'b str {
 
 #[derive(Clone, Debug)]
 /// Associated type for `<&str as Pattern<&'a str>>::Searcher`.
-pub struct StrSearcher<'a, 'b>(crate::str_bytes::StrSearcher<'a, 'b>);
+pub struct StrSearcher<'a, 'b>(crate::str_bytes::StrSearcher<'a, 'b, crate::str_bytes::Utf8>);
 
 impl<'a, 'b> StrSearcher<'a, 'b> {
     fn new(haystack: &'a str, needle: &'b str) -> StrSearcher<'a, 'b> {
@@ -625,9 +613,7 @@ impl<'a, 'b> StrSearcher<'a, 'b> {
 unsafe impl<'a, 'b> Searcher<&'a str> for StrSearcher<'a, 'b> {
     #[inline]
     fn haystack(&self) -> &'a str {
-        let bytes = self.0.haystack().as_bytes();
-        // SAFETY: self.0.haystack() was created from a &str.
-        unsafe { crate::str::from_utf8_unchecked(bytes) }
+        self.0.haystack().into()
     }
 
     #[inline]
diff --git a/library/core/src/str/validations.rs b/library/core/src/str/validations.rs
@@ -76,7 +76,7 @@ pub unsafe fn next_code_point<'a, I: Iterator<Item = &'a u8>>(bytes: &mut I) ->
 ///
 /// `bytes` must produce a valid UTF-8-like (UTF-8 or WTF-8) string
 #[inline]
-pub(super) unsafe fn next_code_point_reverse<'a, I>(bytes: &mut I) -> Option<u32>
+pub(crate) unsafe fn next_code_point_reverse<'a, I>(bytes: &mut I) -> Option<u32>
 where
     I: DoubleEndedIterator<Item = &'a u8>,
 {
diff --git a/library/core/src/str_bytes.rs b/library/core/src/str_bytes.rs

Original file line number	Diff line number	Diff line change
`@@ -76,7 +76,7 @@ pub unsafe fn next_code_point<'a, I: Iterator<Item = &'a u8>>(bytes: &mut I) ->`
`76`	`76`	`///`
`77`	`77`	/// `bytes` must produce a valid UTF-8-like (UTF-8 or WTF-8) string
`78`	`78`	`#[inline]`
`79`		`-pub(super) unsafe fn next_code_point_reverse<'a, I>(bytes: &mut I) -> Option<u32>`
	`79`	`+pub(crate) unsafe fn next_code_point_reverse<'a, I>(bytes: &mut I) -> Option<u32>`
`80`	`80`	`where`
`81`	`81`	`I: DoubleEndedIterator<Item = &'a u8>,`
`82`	`82`	`{`