@@ -114,11 +114,11 @@ macro_rules! utf8_first_byte(
114
114
115
115
// Return the value of $ch updated with continuation byte $byte:
// $ch is shifted left by 6 bits and the low 6 bits of $byte (the bits
// kept by the continuation mask, 63 == 0b0011_1111) are OR-ed in as a u32.
116
116
macro_rules! utf8_acc_cont_byte(
117
- ( $ch: expr, $byte: expr) => ( ( $ch << 6 ) | ( $byte & 63u8 ) as u32 )
117
+ ( $ch: expr, $byte: expr) => ( ( $ch << 6 ) | ( $byte & CONT_MASK ) as u32 )
118
118
)
119
119
120
120
// Evaluates to true when $byte carries the continuation-byte tag, i.e. its
// top two bits (the bits outside the 6-bit value mask, 192 == 0b1100_0000)
// are exactly 10 (128 == 0b1000_0000).
macro_rules! utf8_is_cont_byte(
121
- ( $byte: expr) => ( ( $byte & 192u8 ) == 128 )
121
+ ( $byte: expr) => ( ( $byte & ! CONT_MASK ) == TAG_CONT_U8 )
122
122
)
123
123
124
124
#[ inline]
@@ -137,20 +137,20 @@ impl<'a> Iterator<char> for Chars<'a> {
137
137
fn decode_multibyte < ' a > ( x : u8 , it : & mut slice:: Items < ' a , u8 > ) -> char {
138
138
// NOTE: Performance is very sensitive to the exact formulation here
139
139
// Decode from a byte combination out of: [[[x y] z] w]
140
- let cont_mask = 0x3F ; // continuation byte mask
141
140
let init = utf8_first_byte ! ( x, 2 ) ;
142
141
let y = unwrap_or_0 ( it. next ( ) ) ;
143
142
let mut ch = utf8_acc_cont_byte ! ( init, y) ;
144
143
if x >= 0xE0 {
145
- /* [[x y z] w] case */
144
+ /* [[x y z] w] case
145
+ * 5th bit in 0xE0 .. 0xEF is always clear, so `init` is still valid */
146
146
let z = unwrap_or_0 ( it. next ( ) ) ;
147
-
148
- let y_z = ( ( ( y & cont_mask) as u32 ) << 6 ) | ( z & cont_mask) as u32 ;
147
+ let y_z = utf8_acc_cont_byte ! ( ( y & CONT_MASK ) as u32 , z) ;
149
148
ch = init << 12 | y_z;
150
149
if x >= 0xF0 {
151
- /* [x y z w] case */
150
+ /* [x y z w] case
151
+ * use only the lower 3 bits of `init` */
152
152
let w = unwrap_or_0 ( it. next ( ) ) ;
153
- ch = ( init & 7 ) << 18 | y_z << 6 | ( w & cont_mask ) as u32 ;
153
+ ch = ( init & 7 ) << 18 | utf8_acc_cont_byte ! ( y_z, w ) ;
154
154
}
155
155
}
156
156
unsafe {
@@ -754,9 +754,9 @@ fn run_utf8_validation_iterator(iter: &mut slice::Items<u8>) -> bool {
754
754
// UTF8-4 = %xF0 %x90-BF 2( UTF8-tail ) / %xF1-F3 3( UTF8-tail ) /
755
755
// %xF4 %x80-8F 2( UTF8-tail )
756
756
match w {
757
- 2 => if second & 192 != TAG_CONT_U8 { err ! ( ) } ,
757
+ 2 => if second & ! CONT_MASK != TAG_CONT_U8 { err ! ( ) } ,
758
758
3 => {
759
- match ( first, second, next ! ( ) & 192 ) {
759
+ match ( first, second, next ! ( ) & ! CONT_MASK ) {
760
760
( 0xE0 , 0xA0 .. 0xBF , TAG_CONT_U8 ) |
761
761
( 0xE1 .. 0xEC , 0x80 .. 0xBF , TAG_CONT_U8 ) |
762
762
( 0xED , 0x80 .. 0x9F , TAG_CONT_U8 ) |
@@ -765,7 +765,7 @@ fn run_utf8_validation_iterator(iter: &mut slice::Items<u8>) -> bool {
765
765
}
766
766
}
767
767
4 => {
768
- match ( first, second, next ! ( ) & 192 , next ! ( ) & 192 ) {
768
+ match ( first, second, next ! ( ) & ! CONT_MASK , next ! ( ) & ! CONT_MASK ) {
769
769
( 0xF0 , 0x90 .. 0xBF , TAG_CONT_U8 , TAG_CONT_U8 ) |
770
770
( 0xF1 .. 0xF3 , 0x80 .. 0xBF , TAG_CONT_U8 , TAG_CONT_U8 ) |
771
771
( 0xF4 , 0x80 .. 0x8F , TAG_CONT_U8 , TAG_CONT_U8 ) => { }
@@ -962,7 +962,10 @@ pub struct CharRange {
962
962
pub next : uint ,
963
963
}
964
964
965
- static TAG_CONT_U8 : u8 = 128u8 ;
965
+ /// Mask of the value bits of a continuation byte (the low six bits)
966
+ static CONT_MASK : u8 = 0b0011_1111u8 ;
967
+ /// Value of the tag bits of a continuation byte; the tag mask is
+ /// !CONT_MASK (0b1100_0000), so a continuation byte looks like 10xxxxxx
968
+ static TAG_CONT_U8 : u8 = 0b1000_0000u8 ;
966
969
967
970
/// Unsafe operations
968
971
pub mod raw {
@@ -1898,7 +1901,7 @@ impl<'a> StrSlice<'a> for &'a str {
1898
1901
// Multibyte case is a fn to allow char_range_at_reverse to inline cleanly
1899
1902
fn multibyte_char_range_at_reverse ( s : & str , mut i : uint ) -> CharRange {
1900
1903
// step backwards while the byte at index `i` is a continuation byte (10xxxxxx)
1901
- while i > 0 && s. as_bytes ( ) [ i] & 192u8 == TAG_CONT_U8 {
1904
+ while i > 0 && s. as_bytes ( ) [ i] & ! CONT_MASK == TAG_CONT_U8 {
1902
1905
i -= 1 u;
1903
1906
}
1904
1907
0 commit comments