Merge pull request #1161 from iex-rs/vectorized-string-parsing

dtolnay · web-flow · commit 859ead8e6d60 · 2024-08-11T10:44:24.000-07:00
Vectorize string parsing
diff --git a/src/lib.rs b/src/lib.rs
@@ -340,6 +340,7 @@
     clippy::wildcard_imports,
     // things are often more readable this way
     clippy::cast_lossless,
+    clippy::items_after_statements,
     clippy::module_name_repetitions,
     clippy::redundant_else,
     clippy::shadow_unrelated,
diff --git a/src/read.rs b/src/read.rs
@@ -2,6 +2,7 @@ use crate::error::{Error, ErrorCode, Result};
 use alloc::vec::Vec;
 use core::char;
 use core::cmp;
+use core::mem;
 use core::ops::Deref;
 use core::str;
 
@@ -221,7 +222,7 @@ where
     {
         loop {
             let ch = tri!(next_or_eof(self));
-            if !ESCAPE[ch as usize] {
+            if !is_escape(ch, true) {
                 scratch.push(ch);
                 continue;
             }
@@ -342,7 +343,7 @@ where
     fn ignore_str(&mut self) -> Result<()> {
         loop {
             let ch = tri!(next_or_eof(self));
-            if !ESCAPE[ch as usize] {
+            if !is_escape(ch, true) {
                 continue;
             }
             match ch {
@@ -425,6 +426,65 @@ impl<'a> SliceRead<'a> {
         }
     }
 
+    /// Advances `self.index` to the next `"` or `\` in `self.slice` (or to the next byte below
+    /// 0x20 as well, when `forbid_control_characters` is set). If there is none, `self.index`
+    /// ends up equal to `self.slice.len()`.
+    fn skip_to_escape(&mut self, forbid_control_characters: bool) {
+        // Immediately bail out on empty strings and consecutive escapes (e.g. \u041b\u0435)
+        if self.index == self.slice.len()
+            || is_escape(self.slice[self.index], forbid_control_characters)
+        {
+            return;
+        }
+        self.index += 1;
+
+        let rest = &self.slice[self.index..];
+
+        if !forbid_control_characters {
+            self.index += memchr::memchr2(b'"', b'\\', rest).unwrap_or(rest.len());
+            return;
+        }
+
+        // We wish to find the first byte in range 0x00..=0x1F or " or \. Ideally, we'd use
+        // something akin to memchr3, but the memchr crate does not support this at the moment.
+        // Therefore, we use a variation on Mycroft's algorithm [1] to provide performance better
+        // than a naive loop. It runs faster than equivalent two-pass memchr2+SWAR code on
+        // benchmarks and it's cross-platform, so probably the right fit.
+        // [1]: https://groups.google.com/forum/#!original/comp.lang.c/2HtQXvg7iKc/xOJeipH6KLMJ
+        type Chunk = usize;
+        const STEP: usize = mem::size_of::<Chunk>();
+        const ONE_BYTES: Chunk = Chunk::MAX / 255; // 0x0101...01
+
+        for chunk in rest.chunks_exact(STEP) {
+            // Always load as little-endian so the first byte in memory is the least significant.
+            // Borrows only travel upward, so the lowest flagged byte is never a false positive.
+            let chars = Chunk::from_le_bytes(chunk.try_into().unwrap());
+            let contains_ctrl = chars.wrapping_sub(ONE_BYTES * 0x20) & !chars;
+            let chars_quote = chars ^ (ONE_BYTES * Chunk::from(b'"'));
+            let contains_quote = chars_quote.wrapping_sub(ONE_BYTES) & !chars_quote;
+            let chars_backslash = chars ^ (ONE_BYTES * Chunk::from(b'\\'));
+            let contains_backslash = chars_backslash.wrapping_sub(ONE_BYTES) & !chars_backslash;
+            let masked = (contains_ctrl | contains_quote | contains_backslash) & (ONE_BYTES << 7);
+            if masked != 0 {
+                // SAFETY: chunk is a subslice of self.slice, so it is in-bounds for slice
+                self.index = unsafe { chunk.as_ptr().offset_from(self.slice.as_ptr()) } as usize
+                    + masked.trailing_zeros() as usize / 8;
+                return;
+            }
+        }
+
+        self.index += rest.len() / STEP * STEP;
+        self.skip_to_escape_slow();
+    }
+
+    /// Byte-at-a-time scan: moves `index` to the next `"`, `\` or byte below 0x20, or to the end.
+    #[cold]
+    #[inline(never)]
+    fn skip_to_escape_slow(&mut self) {
+        let tail = self.slice.get(self.index..).unwrap_or_default();
+        self.index += tail.iter().take_while(|&&b| !is_escape(b, true)).count();
+    }
+
     /// The big optimization here over IoRead is that if the string contains no
     /// backslash escape sequences, the returned &str is a slice of the raw JSON
     /// data so we avoid copying into the scratch space.
@@ -442,9 +502,7 @@ impl<'a> SliceRead<'a> {
         let mut start = self.index;
 
         loop {
-            while self.index < self.slice.len() && !ESCAPE[self.slice[self.index] as usize] {
-                self.index += 1;
-            }
+            self.skip_to_escape(validate);
             if self.index == self.slice.len() {
                 return error(self, ErrorCode::EofWhileParsingString);
             }
@@ -470,9 +528,7 @@ impl<'a> SliceRead<'a> {
                 }
                 _ => {
                     self.index += 1;
-                    if validate {
-                        return error(self, ErrorCode::ControlCharacterWhileParsingString);
-                    }
+                    return error(self, ErrorCode::ControlCharacterWhileParsingString);
                 }
             }
         }
@@ -538,9 +594,7 @@ impl<'a> Read<'a> for SliceRead<'a> {
 
     fn ignore_str(&mut self) -> Result<()> {
         loop {
-            while self.index < self.slice.len() && !ESCAPE[self.slice[self.index] as usize] {
-                self.index += 1;
-            }
+            self.skip_to_escape(true);
             if self.index == self.slice.len() {
                 return error(self, ErrorCode::EofWhileParsingString);
             }
@@ -779,33 +833,9 @@ pub trait Fused: private::Sealed {}
 impl<'a> Fused for SliceRead<'a> {}
 impl<'a> Fused for StrRead<'a> {}
 
-// Lookup table of bytes that must be escaped. A value of true at index i means
-// that byte i requires an escape sequence in the input.
-static ESCAPE: [bool; 256] = {
-    const CT: bool = true; // control character \x00..=\x1F
-    const QU: bool = true; // quote \x22
-    const BS: bool = true; // backslash \x5C
-    const __: bool = false; // allow unescaped
-    [
-        //   1   2   3   4   5   6   7   8   9   A   B   C   D   E   F
-        CT, CT, CT, CT, CT, CT, CT, CT, CT, CT, CT, CT, CT, CT, CT, CT, // 0
-        CT, CT, CT, CT, CT, CT, CT, CT, CT, CT, CT, CT, CT, CT, CT, CT, // 1
-        __, __, QU, __, __, __, __, __, __, __, __, __, __, __, __, __, // 2
-        __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 3
-        __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 4
-        __, __, __, __, __, __, __, __, __, __, __, __, BS, __, __, __, // 5
-        __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 6
-        __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 7
-        __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 8
-        __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 9
-        __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // A
-        __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // B
-        __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // C
-        __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // D
-        __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // E
-        __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // F
-    ]
-};
+fn is_escape(ch: u8, including_control_characters: bool) -> bool {
+    matches!(ch, b'"' | b'\\') || (including_control_characters && ch < 0x20)
+}
 
 fn next_or_eof<'de, R>(read: &mut R) -> Result<u8>
 where
diff --git a/tests/test.rs b/tests/test.rs
@@ -2497,3 +2497,22 @@ fn hash_positive_and_negative_zero() {
         assert_eq!(rand.hash_one(k1), rand.hash_one(k2));
     }
 }
+
+#[test]
+fn test_control_character_search() {
+    // Put the newline at every offset 0..16 with 0..16 bytes after it, so that it is met at
+    // many alignments and at many distances from the end of the input.
+    for (n, m) in (0..16).flat_map(|n| (0..16).map(move |m| (n, m))) {
+        let input = format!("\"{}\n{}\"", ".".repeat(n), ".".repeat(m));
+        test_parse_err::<String>(&[(
+            &input,
+            "control character (\\u0000-\\u001F) found while parsing a string at line 2 column 0",
+        )]);
+    }
+
+    // Several control characters in a row: the error is expected at the first one.
+    test_parse_err::<String>(&[(
+        "\"\t\n\r\"",
+        "control character (\\u0000-\\u001F) found while parsing a string at line 1 column 2",
+    )]);
+}