Inline memchr2 logic into Mycroft's algorithm

purplesyringa · purplesyringa · commit 5496579070cd · 2024-08-11T15:23:04.000+03:00
diff --git a/src/read.rs b/src/read.rs
@@ -426,53 +426,61 @@ impl<'a> SliceRead<'a> {
         }
     }
 
-    #[inline(always)]
+    /// Advances `self.index` to the next `"` or `\` in `self.slice`, or to the end of the slice
+    /// if neither occurs. With `forbid_control_characters` set, the scan also stops at the
+    /// first byte in `0x00..=0x1F`.
     fn skip_to_escape(&mut self, forbid_control_characters: bool) {
         let rest = &self.slice[self.index..];
-        let end = self.index + memchr::memchr2(b'"', b'\\', rest).unwrap_or(rest.len());
 
         if !forbid_control_characters {
-            self.index = end;
+            self.index += memchr::memchr2(b'"', b'\\', rest).unwrap_or(rest.len());
             return;
         }
 
-        // We now wish to check if the chunk contains a byte in range 0x00..=0x1F. Ideally, this
-        // would be integrated this into the memchr2 check above, but memchr does not support this
-        // at the moment. Therefore, use a variation on Mycroft's algorithm [1] to provide
-        // performance better than a naive loop. It runs faster than just a single memchr call on
-        // benchmarks and is faster than both SSE2 and AVX-based code, and it's cross-platform, so
-        // probably the right fit.
+        // We wish to find the first byte in range 0x00..=0x1F or " or \. Ideally, we'd use
+        // something akin to memchr3, but the memchr crate does not support this at the moment.
+        // Therefore, we use a variation on Mycroft's algorithm [1] to provide performance better
+        // than a naive loop. It runs faster than equivalent two-pass memchr2+SWAR code on
+        // benchmarks and it's cross-platform, so probably the right fit.
         // [1]: https://groups.google.com/forum/#!original/comp.lang.c/2HtQXvg7iKc/xOJeipH6KLMJ
-        const STEP: usize = mem::size_of::<usize>();
-
-        // Moving this to a local variable removes a spill in the hot loop.
-        let mut index = self.index;
-
-        if self.slice.len() >= STEP {
-            while index < end.min(self.slice.len() - STEP + 1) {
-                // We can safely overread past end in most cases. This ensures that SWAR code is
-                // used to handle the tail in the hot path.
-                const ONE_BYTES: usize = usize::MAX / 255;
-                let chars = usize::from_ne_bytes(self.slice[index..][..STEP].try_into().unwrap());
-                let mask = chars.wrapping_sub(ONE_BYTES * 0x20) & !chars & (ONE_BYTES << 7);
-
-                if mask != 0 {
-                    index += mask.trailing_zeros() as usize / 8;
-                    break;
-                }
-
-                index += STEP;
-            }
-        }
-
-        if index < end {
-            if let Some(offset) = self.slice[index..end].iter().position(|&c| c <= 0x1F) {
-                self.index = index + offset;
+        type Chunk = usize;
+        const STEP: usize = mem::size_of::<Chunk>();
+        const ONE_BYTES: Chunk = Chunk::MAX / 255; // 0x0101...01
+
+        for chunk in rest.chunks_exact(STEP) {
+            // Load little-endian on every target so the first byte in memory is the least
+            // significant one; `trailing_zeros` below then finds the earliest match, and a borrow
+            // from `wrapping_sub` can only raise false positives after a real match.
+            let chars = Chunk::from_le_bytes(chunk.try_into().unwrap());
+            let contains_ctrl = chars.wrapping_sub(ONE_BYTES * 0x20) & !chars;
+            let chars_quote = chars ^ (ONE_BYTES * Chunk::from(b'"'));
+            let contains_quote = chars_quote.wrapping_sub(ONE_BYTES) & !chars_quote;
+            let chars_backslash = chars ^ (ONE_BYTES * Chunk::from(b'\\'));
+            let contains_backslash = chars_backslash.wrapping_sub(ONE_BYTES) & !chars_backslash;
+            let masked = (contains_ctrl | contains_quote | contains_backslash) & (ONE_BYTES << 7);
+            if masked != 0 {
+                // SAFETY: `chunk` borrows from `rest`, itself a subslice of `self.slice`, so both
+                // pointers are in the same allocation and the offset is non-negative.
+                self.index = unsafe { chunk.as_ptr().offset_from(self.slice.as_ptr()) } as usize
+                    + masked.trailing_zeros() as usize / 8;
                 return;
             }
         }
 
-        self.index = end;
+        // No full chunk held a match, so step over all of them; only the tail of fewer than
+        // STEP bytes is left for the byte-at-a-time fallback.
+        self.index += rest.len() / STEP * STEP;
+        self.skip_to_escape_slow();
+    }
+
+    /// Byte-at-a-time fallback for `skip_to_escape`: advances `self.index` until it reaches a
+    /// byte for which `is_escape` returns true or the end of `self.slice`.
+    #[cold]
+    #[inline(never)]
+    fn skip_to_escape_slow(&mut self) {
+        while self.index < self.slice.len() && !is_escape(self.slice[self.index]) {
+            self.index += 1;
+        }
     }
 
     /// The big optimization here over IoRead is that if the string contains no
@@ -823,8 +819,6 @@ pub trait Fused: private::Sealed {}
 impl<'a> Fused for SliceRead<'a> {}
 impl<'a> Fused for StrRead<'a> {}
 
-// This is only used in IoRead. SliceRead hardcodes the arguments to memchr.
-#[cfg(feature = "std")]
 fn is_escape(ch: u8) -> bool {
     ch == b'"' || ch == b'\\' || ch < 0x20
 }