@@ -443,32 +443,36 @@ impl<'a> SliceRead<'a> {
443
443
// benchmarks and is faster than both SSE2 and AVX-based code, and it's cross-platform, so
444
444
// probably the right fit.
445
445
// [1]: https://groups.google.com/forum/#!original/comp.lang.c/2HtQXvg7iKc/xOJeipH6KLMJ
446
+ const STEP : usize = mem:: size_of :: < usize > ( ) ;
447
+
448
+ // Moving this to a local variable removes a spill in the hot loop.
449
+ let mut index = self . index ;
450
+
451
+ if self . slice . len ( ) >= STEP {
452
+ while index < end. min ( self . slice . len ( ) - STEP + 1 ) {
453
+ // We can safely overread past end in most cases. This ensures that SWAR code is
454
+ // used to handle the tail in the hot path.
455
+ const ONE_BYTES : usize = usize:: MAX / 255 ;
456
+ let chars = usize:: from_ne_bytes ( self . slice [ index..] [ ..STEP ] . try_into ( ) . unwrap ( ) ) ;
457
+ let mask = chars. wrapping_sub ( ONE_BYTES * 0x20 ) & !chars & ( ONE_BYTES << 7 ) ;
458
+
459
+ if mask != 0 {
460
+ index += mask. trailing_zeros ( ) as usize / 8 ;
461
+ break ;
462
+ }
446
463
447
- // Pad the chunk to a whole count of units if possible. This ensures that SWAR code is used
448
- // to handle the tail in the hot path.
449
- let block_end = ( self . index + ( end - self . index ) . next_multiple_of ( mem:: size_of :: < usize > ( ) ) )
450
- . min ( self . slice . len ( ) ) ;
451
- let mut block = & self . slice [ self . index ..block_end] ;
452
-
453
- while let Some ( ( chars, block_remainder) ) = block. split_first_chunk ( ) {
454
- const ONE_BYTES : usize = usize:: MAX / 255 ;
455
- let chars = usize:: from_ne_bytes ( * chars) ;
456
- let mask = chars. wrapping_sub ( ONE_BYTES * 0x20 ) & !chars & ( ONE_BYTES << 7 ) ;
464
+ index += STEP ;
465
+ }
466
+ }
457
467
458
- if mask != 0 {
459
- let control_index = block_end - block . len ( ) + mask . trailing_zeros ( ) as usize / 8 ;
460
- self . index = control_index . min ( end ) ;
468
+ if index < end {
469
+ if let Some ( offset ) = self . slice [ index..end ] . iter ( ) . position ( | & c| c <= 0x1F ) {
470
+ self . index = index + offset ;
461
471
return ;
462
472
}
463
-
464
- block = block_remainder;
465
473
}
466
474
467
- if let Some ( offset) = block. iter ( ) . position ( |& c| c <= 0x1F ) {
468
- self . index = ( block_end - block. len ( ) + offset) . min ( end) ;
469
- } else {
470
- self . index = end;
471
- }
475
+ self . index = end;
472
476
}
473
477
474
478
/// The big optimization here over IoRead is that if the string contains no
0 commit comments