Skip to content

Commit 859ead8

Browse files
authored
Merge pull request #1161 from iex-rs/vectorized-string-parsing
Vectorize string parsing
2 parents 54381d6 + e43da5e commit 859ead8

File tree

3 files changed

+88
-38
lines changed

3 files changed

+88
-38
lines changed

src/lib.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -340,6 +340,7 @@
340340
clippy::wildcard_imports,
341341
// things are often more readable this way
342342
clippy::cast_lossless,
343+
clippy::items_after_statements,
343344
clippy::module_name_repetitions,
344345
clippy::redundant_else,
345346
clippy::shadow_unrelated,

src/read.rs

Lines changed: 68 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ use crate::error::{Error, ErrorCode, Result};
22
use alloc::vec::Vec;
33
use core::char;
44
use core::cmp;
5+
use core::mem;
56
use core::ops::Deref;
67
use core::str;
78

@@ -221,7 +222,7 @@ where
221222
{
222223
loop {
223224
let ch = tri!(next_or_eof(self));
224-
if !ESCAPE[ch as usize] {
225+
if !is_escape(ch, true) {
225226
scratch.push(ch);
226227
continue;
227228
}
@@ -342,7 +343,7 @@ where
342343
fn ignore_str(&mut self) -> Result<()> {
343344
loop {
344345
let ch = tri!(next_or_eof(self));
345-
if !ESCAPE[ch as usize] {
346+
if !is_escape(ch, true) {
346347
continue;
347348
}
348349
match ch {
@@ -425,6 +426,65 @@ impl<'a> SliceRead<'a> {
425426
}
426427
}
427428

429+
fn skip_to_escape(&mut self, forbid_control_characters: bool) {
430+
// Immediately bail-out on empty strings and consecutive escapes (e.g. \u041b\u0435)
431+
if self.index == self.slice.len()
432+
|| is_escape(self.slice[self.index], forbid_control_characters)
433+
{
434+
return;
435+
}
436+
self.index += 1;
437+
438+
let rest = &self.slice[self.index..];
439+
440+
if !forbid_control_characters {
441+
self.index += memchr::memchr2(b'"', b'\\', rest).unwrap_or(rest.len());
442+
return;
443+
}
444+
445+
// We wish to find the first byte in range 0x00..=0x1F or " or \. Ideally, we'd use
446+
// something akin to memchr3, but the memchr crate does not support this at the moment.
447+
// Therefore, we use a variation on Mycroft's algorithm [1] to provide performance better
448+
// than a naive loop. It runs faster than equivalent two-pass memchr2+SWAR code on
449+
// benchmarks and it's cross-platform, so probably the right fit.
450+
// [1]: https://groups.google.com/forum/#!original/comp.lang.c/2HtQXvg7iKc/xOJeipH6KLMJ
451+
type Chunk = usize;
452+
const STEP: usize = mem::size_of::<Chunk>();
453+
const ONE_BYTES: Chunk = Chunk::MAX / 255; // 0x0101...01
454+
455+
for chunk in rest.chunks_exact(STEP) {
456+
let chars = Chunk::from_ne_bytes(chunk.try_into().unwrap());
457+
let contains_ctrl = chars.wrapping_sub(ONE_BYTES * 0x20) & !chars;
458+
let chars_quote = chars ^ (ONE_BYTES * Chunk::from(b'"'));
459+
let contains_quote = chars_quote.wrapping_sub(ONE_BYTES) & !chars_quote;
460+
let chars_backslash = chars ^ (ONE_BYTES * Chunk::from(b'\\'));
461+
let contains_backslash = chars_backslash.wrapping_sub(ONE_BYTES) & !chars_backslash;
462+
let masked = (contains_ctrl | contains_quote | contains_backslash) & (ONE_BYTES << 7);
463+
if masked != 0 {
464+
let addresswise_first_bit = if cfg!(target_endian = "little") {
465+
masked.trailing_zeros()
466+
} else {
467+
masked.leading_zeros()
468+
};
469+
// SAFETY: chunk is in-bounds for slice
470+
self.index = unsafe { chunk.as_ptr().offset_from(self.slice.as_ptr()) } as usize
471+
+ addresswise_first_bit as usize / 8;
472+
return;
473+
}
474+
}
475+
476+
self.index += rest.len() / STEP * STEP;
477+
self.skip_to_escape_slow();
478+
}
479+
480+
#[cold]
481+
#[inline(never)]
482+
fn skip_to_escape_slow(&mut self) {
483+
while self.index < self.slice.len() && !is_escape(self.slice[self.index], true) {
484+
self.index += 1;
485+
}
486+
}
487+
428488
/// The big optimization here over IoRead is that if the string contains no
429489
/// backslash escape sequences, the returned &str is a slice of the raw JSON
430490
/// data so we avoid copying into the scratch space.
@@ -442,9 +502,7 @@ impl<'a> SliceRead<'a> {
442502
let mut start = self.index;
443503

444504
loop {
445-
while self.index < self.slice.len() && !ESCAPE[self.slice[self.index] as usize] {
446-
self.index += 1;
447-
}
505+
self.skip_to_escape(validate);
448506
if self.index == self.slice.len() {
449507
return error(self, ErrorCode::EofWhileParsingString);
450508
}
@@ -470,9 +528,7 @@ impl<'a> SliceRead<'a> {
470528
}
471529
_ => {
472530
self.index += 1;
473-
if validate {
474-
return error(self, ErrorCode::ControlCharacterWhileParsingString);
475-
}
531+
return error(self, ErrorCode::ControlCharacterWhileParsingString);
476532
}
477533
}
478534
}
@@ -538,9 +594,7 @@ impl<'a> Read<'a> for SliceRead<'a> {
538594

539595
fn ignore_str(&mut self) -> Result<()> {
540596
loop {
541-
while self.index < self.slice.len() && !ESCAPE[self.slice[self.index] as usize] {
542-
self.index += 1;
543-
}
597+
self.skip_to_escape(true);
544598
if self.index == self.slice.len() {
545599
return error(self, ErrorCode::EofWhileParsingString);
546600
}
@@ -779,33 +833,9 @@ pub trait Fused: private::Sealed {}
779833
impl<'a> Fused for SliceRead<'a> {}
780834
impl<'a> Fused for StrRead<'a> {}
781835

782-
// Lookup table of bytes that must be escaped. A value of true at index i means
783-
// that byte i requires an escape sequence in the input.
784-
static ESCAPE: [bool; 256] = {
785-
const CT: bool = true; // control character \x00..=\x1F
786-
const QU: bool = true; // quote \x22
787-
const BS: bool = true; // backslash \x5C
788-
const __: bool = false; // allow unescaped
789-
[
790-
// 1 2 3 4 5 6 7 8 9 A B C D E F
791-
CT, CT, CT, CT, CT, CT, CT, CT, CT, CT, CT, CT, CT, CT, CT, CT, // 0
792-
CT, CT, CT, CT, CT, CT, CT, CT, CT, CT, CT, CT, CT, CT, CT, CT, // 1
793-
__, __, QU, __, __, __, __, __, __, __, __, __, __, __, __, __, // 2
794-
__, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 3
795-
__, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 4
796-
__, __, __, __, __, __, __, __, __, __, __, __, BS, __, __, __, // 5
797-
__, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 6
798-
__, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 7
799-
__, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 8
800-
__, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 9
801-
__, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // A
802-
__, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // B
803-
__, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // C
804-
__, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // D
805-
__, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // E
806-
__, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // F
807-
]
808-
};
836+
/// Reports whether `ch` interrupts a run of plain string bytes.
///
/// A quote (`"`) and a backslash (`\`) always do. Bytes in `0x00..=0x1F`
/// do only when `including_control_characters` is true.
fn is_escape(ch: u8, including_control_characters: bool) -> bool {
    match ch {
        b'"' | b'\\' => true,
        0x00..=0x1F => including_control_characters,
        _ => false,
    }
}
809839

810840
fn next_or_eof<'de, R>(read: &mut R) -> Result<u8>
811841
where

tests/test.rs

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2497,3 +2497,22 @@ fn hash_positive_and_negative_zero() {
24972497
assert_eq!(rand.hash_one(k1), rand.hash_one(k2));
24982498
}
24992499
}
2500+
2501+
/// Checks that an unescaped control character inside a string is reported at
/// the right position, whatever amount of padding surrounds it.
#[test]
fn test_control_character_search() {
    const NEWLINE_ERROR: &str =
        "control character (\\u0000-\\u001F) found while parsing a string at line 2 column 0";

    // Different space circumstances
    for n in 0..16 {
        let before = ".".repeat(n);
        for m in 0..16 {
            let after = ".".repeat(m);
            let input = format!("\"{}\n{}\"", before, after);
            test_parse_err::<String>(&[(&input, NEWLINE_ERROR)]);
        }
    }

    // Multiple occurrences
    let several_controls = "\"\t\n\r\"";
    test_parse_err::<String>(&[(
        several_controls,
        "control character (\\u0000-\\u001F) found while parsing a string at line 1 column 2",
    )]);
}

0 commit comments

Comments
 (0)