
Commit b0e121d

Shrink bitset words through functional mapping
Previously, all words in the (deduplicated) bitset would be stored raw -- a full 64 bits (8 bytes). Now, words that are equivalent to another word through a specific mapping are stored separately and "mapped" back to the original when loading; this shrinks the table sizes significantly, as each mapped word is stored in 2 bytes (a 4x decrease from the previous 8).

The new encoding is still potentially non-optimal: the "mapped" byte is frequently repeated, as in practice many mapped words use the same base word.

Currently we only support two forms of mapping: rotation and inversion. Note that both of these are guaranteed to map transitively if they map at all; supporting mappings for which this is not true may require a more interesting algorithm for choosing the optimal pairing.

Updated sizes:

Alphabetic     : 2622 bytes (-414 bytes)
Case_Ignorable : 1803 bytes (-330 bytes)
Cased          :  808 bytes (-126 bytes)
Cc             :   32 bytes
Grapheme_Extend: 1508 bytes (-252 bytes)
Lowercase      :  901 bytes  (-84 bytes)
N              : 1064 bytes (-156 bytes)
Uppercase      :  838 bytes  (-96 bytes)
White_Space    :   91 bytes   (-6 bytes)

Total table sizes: 9667 bytes (-1,464 bytes)
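The per-property savings line up with the 2-byte encoding: each word that moves from the raw 8-byte form to the mapped form saves 6 bytes, so for example Alphabetic's 414-byte reduction corresponds to 69 newly mapped words (69 × 6 = 414). The sketch below shows how a table generator could decide whether a word is expressible as a 2-byte (base index, mapping) entry; it mirrors the decoder added to range_search in this commit (bit 7 of the mapping byte requests inversion, the low 7 bits give a left-rotation amount), but the function name and structure are illustrative, not the actual generator code.

// Hypothetical generator-side search (illustrative; not the real
// table-generator code). Returns a 2-byte (base index, mapping) entry
// if `word` can be recovered from an existing canonical word by
// inversion and/or rotation, matching the decoder in `range_search`.
fn find_mapping(word: u64, canonical: &[u64]) -> Option<(u8, u8)> {
    for (idx, &base) in canonical.iter().enumerate().take(u8::MAX as usize) {
        for rotate in 0..64u32 {
            // Plain rotation: the decoder computes base.rotate_left(rotate).
            if base.rotate_left(rotate) == word {
                return Some((idx as u8, rotate as u8));
            }
            // Inversion then rotation: the decoder computes (!base).rotate_left(rotate).
            if (!base).rotate_left(rotate) == word {
                return Some((idx as u8, (1 << 7) | rotate as u8));
            }
        }
    }
    None
}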
1 parent 6c7691a commit b0e121d

4 files changed: +1211 -432 lines changed

src/libcore/unicode/mod.rs

Lines changed: 24 additions & 4 deletions
@@ -34,12 +34,19 @@ pub use unicode_data::uppercase::lookup as Uppercase;
 pub use unicode_data::white_space::lookup as White_Space;
 
 #[inline(always)]
-fn range_search<const N: usize, const CHUNK_SIZE: usize, const N1: usize, const N2: usize>(
+fn range_search<
+    const N: usize,
+    const CHUNK_SIZE: usize,
+    const N1: usize,
+    const CANONICAL: usize,
+    const CANONICALIZED: usize,
+>(
     needle: u32,
     chunk_idx_map: &[u8; N],
     (last_chunk_idx, last_chunk_mapping): (u16, u8),
     bitset_chunk_idx: &[[u8; CHUNK_SIZE]; N1],
-    bitset: &[u64; N2],
+    bitset_canonical: &[u64; CANONICAL],
+    bitset_canonicalized: &[(u8, u8); CANONICALIZED],
 ) -> bool {
     let bucket_idx = (needle / 64) as usize;
     let chunk_map_idx = bucket_idx / CHUNK_SIZE;
@@ -53,7 +60,20 @@ fn range_search<const N: usize, const CHUNK_SIZE: usize, const N1: usize, const
     } else {
         chunk_idx_map[chunk_map_idx]
     };
-    let idx = bitset_chunk_idx[(chunk_idx as usize)][chunk_piece];
-    let word = bitset[(idx as usize)];
+    let idx = bitset_chunk_idx[(chunk_idx as usize)][chunk_piece] as usize;
+    let word = if idx < CANONICAL {
+        bitset_canonical[idx]
+    } else {
+        let (real_idx, mapping) = bitset_canonicalized[idx - CANONICAL];
+        let mut word = bitset_canonical[real_idx as usize];
+        let should_invert = mapping & (1 << 7) != 0;
+        if should_invert {
+            word = !word;
+        }
+        // Unset the inversion bit
+        let rotate_by = mapping & !(1 << 7);
+        word = word.rotate_left(rotate_by as u32);
+        word
+    };
     (word & (1 << (needle % 64) as u64)) != 0
 }
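For reference, the mapped-word branch added above can be exercised on its own. This standalone snippet (with made-up values, not entries from the shipped tables) reproduces the decode step: invert the canonical base word when bit 7 of the mapping byte is set, then rotate left by the low 7 bits.

// Standalone illustration of the decode step in `range_search`;
// the base word and mapping byte below are invented for the example.
fn decode(base: u64, mapping: u8) -> u64 {
    let mut word = base;
    if mapping & (1 << 7) != 0 {
        word = !word; // bit 7 requests inversion
    }
    word.rotate_left((mapping & !(1 << 7)) as u32) // low 7 bits: rotation amount
}

fn main() {
    let base = 0x0000_0000_ffff_ffff_u64;
    let mapping = (1 << 7) | 8u8; // invert, then rotate left by 8
    assert_eq!(decode(base, mapping), 0xffff_ff00_0000_00ff_u64);
}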
