Skip to content

Commit 0ad91f7

Browse files
committed
Simplify u8::to_ascii_{upp,low}ercase while keeping it fast
1 parent 4a3241a commit 0ad91f7

File tree

2 files changed

+27
-43
lines changed

2 files changed

+27
-43
lines changed

src/libcore/benches/ascii.rs

Lines changed: 23 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,26 @@
1-
// See comments in `u8::to_ascii_uppercase` in `src/libcore/num/mod.rs`.
1+
// Lower-case ASCII 'a' is the first byte that has its highest bit set
2+
// after wrap-adding 0x1F:
3+
//
4+
// b'a' + 0x1F == 0x80 == 0b1000_0000
5+
// b'z' + 0x1F == 0x99 == 0b1001_1001
6+
//
7+
// Lower-case ASCII 'z' is the last byte that has its highest bit unset
8+
// after wrap-adding 0x05:
9+
//
10+
// b'a' + 0x05 == 0x66 == 0b0110_0110
11+
// b'z' + 0x05 == 0x7F == 0b0111_1111
12+
//
13+
// … except for 0xFB to 0xFF, but those are in the range of bytes
14+
// that have the highest bit unset again after adding 0x1F.
15+
//
16+
// So `(byte + 0x1f) & !(byte + 5)` has its highest bit set
17+
// iff `byte` is a lower-case ASCII letter.
18+
//
19+
// Lower-case ASCII letters all have the 0x20 bit set.
20+
// (Two positions right of 0x80, the highest bit.)
21+
// Unsetting that bit produces the same letter, in upper-case.
22+
//
23+
// Therefore:
224
fn branchless_to_ascii_upper_case(byte: u8) -> u8 {
325
byte &
426
!(

src/libcore/num/mod.rs

Lines changed: 4 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -3794,39 +3794,8 @@ impl u8 {
37943794
#[stable(feature = "ascii_methods_on_intrinsics", since = "1.23.0")]
37953795
#[inline]
37963796
pub fn to_ascii_uppercase(&self) -> u8 {
3797-
// See benchmarks in src/libcore/benches/ascii_case.rs
3798-
3799-
// Lower-case ASCII 'a' is the first byte that has its highest bit set
3800-
// after wrap-adding 0x1F:
3801-
//
3802-
// b'a' + 0x1F == 0x80 == 0b1000_0000
3803-
// b'z' + 0x1F == 0x99 == 0b1001_1001
3804-
//
3805-
// Lower-case ASCII 'z' is the last byte that has its highest bit unset
3806-
// after wrap-adding 0x05:
3807-
//
3808-
// b'a' + 0x05 == 0x66 == 0b0110_0110
3809-
// b'z' + 0x05 == 0x7F == 0b0111_1111
3810-
//
3811-
// … except for 0xFB to 0xFF, but those are in the range of bytes
3812-
// that have the highest bit unset again after adding 0x1F.
3813-
//
3814-
// So `(byte + 0x1f) & !(byte + 5)` has its highest bit set
3815-
// iff `byte` is a lower-case ASCII letter.
3816-
//
3817-
// Lower-case ASCII letters all have the 0x20 bit set.
3818-
// (Two positions right of 0x80, the highest bit.)
3819-
// Unsetting that bit produces the same letter, in upper-case.
3820-
//
3821-
// Therefore:
3822-
*self &
3823-
!(
3824-
(
3825-
self.wrapping_add(0x1f) &
3826-
!self.wrapping_add(0x05) &
3827-
0x80
3828-
) >> 2
3829-
)
3797+
// Unset the fifth bit (0x20) if this is a lowercase letter
3798+
*self & !((self.is_ascii_lowercase() as u8) << 5)
38303799
}
38313800

38323801
/// Makes a copy of the value in its ASCII lower case equivalent.
@@ -3848,15 +3817,8 @@ impl u8 {
38483817
#[stable(feature = "ascii_methods_on_intrinsics", since = "1.23.0")]
38493818
#[inline]
38503819
pub fn to_ascii_lowercase(&self) -> u8 {
3851-
// See comments in to_ascii_uppercase above.
3852-
*self |
3853-
(
3854-
(
3855-
self.wrapping_add(0x3f) &
3856-
!self.wrapping_add(0x25) &
3857-
0x80
3858-
) >> 2
3859-
)
3820+
// Set the fifth bit (0x20) if this is an uppercase letter
3821+
*self | ((self.is_ascii_uppercase() as u8) << 5)
38603822
}
38613823

38623824
/// Checks that two values are an ASCII case-insensitive match.

0 commit comments

Comments
 (0)