@@ -2,6 +2,7 @@ use crate::error::{Error, ErrorCode, Result};
2
2
use alloc:: vec:: Vec ;
3
3
use core:: char;
4
4
use core:: cmp;
5
+ use core:: mem;
5
6
use core:: ops:: Deref ;
6
7
use core:: str;
7
8
@@ -221,7 +222,7 @@ where
221
222
{
222
223
loop {
223
224
let ch = tri ! ( next_or_eof( self ) ) ;
224
- if !ESCAPE [ ch as usize ] {
225
+ if !is_escape ( ch , true ) {
225
226
scratch. push ( ch) ;
226
227
continue ;
227
228
}
@@ -342,7 +343,7 @@ where
342
343
fn ignore_str ( & mut self ) -> Result < ( ) > {
343
344
loop {
344
345
let ch = tri ! ( next_or_eof( self ) ) ;
345
- if !ESCAPE [ ch as usize ] {
346
+ if !is_escape ( ch , true ) {
346
347
continue ;
347
348
}
348
349
match ch {
@@ -425,6 +426,65 @@ impl<'a> SliceRead<'a> {
425
426
}
426
427
}
427
428
429
+ fn skip_to_escape ( & mut self , forbid_control_characters : bool ) {
430
+ // Immediately bail-out on empty strings and consecutive escapes (e.g. \u041b\u0435)
431
+ if self . index == self . slice . len ( )
432
+ || is_escape ( self . slice [ self . index ] , forbid_control_characters)
433
+ {
434
+ return ;
435
+ }
436
+ self . index += 1 ;
437
+
438
+ let rest = & self . slice [ self . index ..] ;
439
+
440
+ if !forbid_control_characters {
441
+ self . index += memchr:: memchr2 ( b'"' , b'\\' , rest) . unwrap_or ( rest. len ( ) ) ;
442
+ return ;
443
+ }
444
+
445
+ // We wish to find the first byte in range 0x00..=0x1F or " or \. Ideally, we'd use
446
+ // something akin to memchr3, but the memchr crate does not support this at the moment.
447
+ // Therefore, we use a variation on Mycroft's algorithm [1] to provide performance better
448
+ // than a naive loop. It runs faster than equivalent two-pass memchr2+SWAR code on
449
+ // benchmarks and it's cross-platform, so probably the right fit.
450
+ // [1]: https://groups.google.com/forum/#!original/comp.lang.c/2HtQXvg7iKc/xOJeipH6KLMJ
451
+ type Chunk = usize ;
452
+ const STEP : usize = mem:: size_of :: < Chunk > ( ) ;
453
+ const ONE_BYTES : Chunk = Chunk :: MAX / 255 ; // 0x0101...01
454
+
455
+ for chunk in rest. chunks_exact ( STEP ) {
456
+ let chars = Chunk :: from_ne_bytes ( chunk. try_into ( ) . unwrap ( ) ) ;
457
+ let contains_ctrl = chars. wrapping_sub ( ONE_BYTES * 0x20 ) & !chars;
458
+ let chars_quote = chars ^ ( ONE_BYTES * Chunk :: from ( b'"' ) ) ;
459
+ let contains_quote = chars_quote. wrapping_sub ( ONE_BYTES ) & !chars_quote;
460
+ let chars_backslash = chars ^ ( ONE_BYTES * Chunk :: from ( b'\\' ) ) ;
461
+ let contains_backslash = chars_backslash. wrapping_sub ( ONE_BYTES ) & !chars_backslash;
462
+ let masked = ( contains_ctrl | contains_quote | contains_backslash) & ( ONE_BYTES << 7 ) ;
463
+ if masked != 0 {
464
+ let addresswise_first_bit = if cfg ! ( target_endian = "little" ) {
465
+ masked. trailing_zeros ( )
466
+ } else {
467
+ masked. leading_zeros ( )
468
+ } ;
469
+ // SAFETY: chunk is in-bounds for slice
470
+ self . index = unsafe { chunk. as_ptr ( ) . offset_from ( self . slice . as_ptr ( ) ) } as usize
471
+ + addresswise_first_bit as usize / 8 ;
472
+ return ;
473
+ }
474
+ }
475
+
476
+ self . index += rest. len ( ) / STEP * STEP ;
477
+ self . skip_to_escape_slow ( ) ;
478
+ }
479
+
480
+ #[ cold]
481
+ #[ inline( never) ]
482
+ fn skip_to_escape_slow ( & mut self ) {
483
+ while self . index < self . slice . len ( ) && !is_escape ( self . slice [ self . index ] , true ) {
484
+ self . index += 1 ;
485
+ }
486
+ }
487
+
428
488
/// The big optimization here over IoRead is that if the string contains no
429
489
/// backslash escape sequences, the returned &str is a slice of the raw JSON
430
490
/// data so we avoid copying into the scratch space.
@@ -442,9 +502,7 @@ impl<'a> SliceRead<'a> {
442
502
let mut start = self . index ;
443
503
444
504
loop {
445
- while self . index < self . slice . len ( ) && !ESCAPE [ self . slice [ self . index ] as usize ] {
446
- self . index += 1 ;
447
- }
505
+ self . skip_to_escape ( validate) ;
448
506
if self . index == self . slice . len ( ) {
449
507
return error ( self , ErrorCode :: EofWhileParsingString ) ;
450
508
}
@@ -470,9 +528,7 @@ impl<'a> SliceRead<'a> {
470
528
}
471
529
_ => {
472
530
self . index += 1 ;
473
- if validate {
474
- return error ( self , ErrorCode :: ControlCharacterWhileParsingString ) ;
475
- }
531
+ return error ( self , ErrorCode :: ControlCharacterWhileParsingString ) ;
476
532
}
477
533
}
478
534
}
@@ -538,9 +594,7 @@ impl<'a> Read<'a> for SliceRead<'a> {
538
594
539
595
fn ignore_str ( & mut self ) -> Result < ( ) > {
540
596
loop {
541
- while self . index < self . slice . len ( ) && !ESCAPE [ self . slice [ self . index ] as usize ] {
542
- self . index += 1 ;
543
- }
597
+ self . skip_to_escape ( true ) ;
544
598
if self . index == self . slice . len ( ) {
545
599
return error ( self , ErrorCode :: EofWhileParsingString ) ;
546
600
}
@@ -779,33 +833,9 @@ pub trait Fused: private::Sealed {}
779
833
impl < ' a > Fused for SliceRead < ' a > { }
780
834
impl < ' a > Fused for StrRead < ' a > { }
781
835
782
- // Lookup table consulted by the string-scanning loops, indexed by byte value. A value of true at index i means
783
- // that byte i cannot be skipped over as ordinary string content: the scanning loops stop at it.
784
- static ESCAPE : [ bool ; 256 ] = {
785
- const CT : bool = true ; // control character \x00..=\x1F
786
- const QU : bool = true ; // quote \x22
787
- const BS : bool = true ; // backslash \x5C
788
- const __: bool = false ; // allow unescaped
789
- [
790
- // Columns are the low nibble of the byte (0 1 2 3 4 5 6 7 8 9 A B C D E F); the trailing comment on each row is the high nibble.
791
- CT , CT , CT , CT , CT , CT , CT , CT , CT , CT , CT , CT , CT , CT , CT , CT , // 0
792
- CT , CT , CT , CT , CT , CT , CT , CT , CT , CT , CT , CT , CT , CT , CT , CT , // 1
793
- __, __, QU , __, __, __, __, __, __, __, __, __, __, __, __, __, // 2
794
- __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 3
795
- __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 4
796
- __, __, __, __, __, __, __, __, __, __, __, __, BS , __, __, __, // 5
797
- __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 6
798
- __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 7
799
- __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 8
800
- __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 9
801
- __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // A
802
- __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // B
803
- __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // C
804
- __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // D
805
- __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // E
806
- __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // F
807
- ]
808
- } ;
836
/// Reports whether `ch` interrupts a run of plain string bytes.
///
/// A double quote (`"`) and a backslash (`\`) always do. Bytes below `0x20` do only when
/// `including_control_characters` is `true`.
fn is_escape(ch: u8, including_control_characters: bool) -> bool {
    match ch {
        b'"' | b'\\' => true,
        0x00..=0x1F => including_control_characters,
        _ => false,
    }
}
809
839
810
840
fn next_or_eof < ' de , R > ( read : & mut R ) -> Result < u8 >
811
841
where
0 commit comments