Skip to content

Commit 233ab2f

Browse files
Push the byte of LAST_CHUNK_MAP into the array
This optimizes slightly better.

    Alphabetic     : 2536 bytes
    Case_Ignorable : 1771 bytes
    Cased          : 788 bytes
    Cc             : 24 bytes
    Grapheme_Extend: 1488 bytes
    Lowercase      : 863 bytes
    N              : 1038 bytes
    Uppercase      : 776 bytes
    White_Space    : 83 bytes
    Total table sizes: 9367 bytes (-18 bytes; 2 bytes per set)
1 parent 5f71d98 commit 233ab2f

File tree

3 files changed

+49
-46
lines changed

3 files changed

+49
-46
lines changed

src/libcore/unicode/unicode_data.rs

Lines changed: 35 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -10,22 +10,22 @@ fn range_search<
1010
>(
1111
needle: u32,
1212
chunk_idx_map: &[u8; N],
13-
(last_chunk_idx, last_chunk_mapping): (u16, u8),
13+
last_chunk_idx: u16,
1414
bitset_chunk_idx: &[[u8; CHUNK_SIZE]; N1],
1515
bitset_canonical: &[u64; CANONICAL],
1616
bitset_canonicalized: &[(u8, u8); CANONICALIZED],
1717
) -> bool {
1818
let bucket_idx = (needle / 64) as usize;
1919
let chunk_map_idx = bucket_idx / CHUNK_SIZE;
2020
let chunk_piece = bucket_idx % CHUNK_SIZE;
21-
let chunk_idx = if chunk_map_idx >= N {
22-
if chunk_map_idx == last_chunk_idx as usize {
23-
last_chunk_mapping
24-
} else {
25-
return false;
26-
}
27-
} else {
21+
// The last entry of `chunk_idx_map` actually should be at `last_chunk_idx`,
22+
// so we need to remap it
23+
let chunk_idx = if chunk_map_idx < (chunk_idx_map.len() - 1) {
2824
chunk_idx_map[chunk_map_idx]
25+
} else if chunk_map_idx == last_chunk_idx as usize {
26+
chunk_idx_map[chunk_idx_map.len() - 1]
27+
} else {
28+
return false;
2929
};
3030
let idx = bitset_chunk_idx[(chunk_idx as usize)][chunk_piece] as usize;
3131
let word = if idx < CANONICAL {
@@ -54,8 +54,8 @@ pub const UNICODE_VERSION: (u32, u32, u32) = (13, 0, 0);
5454

5555
#[rustfmt::skip]
5656
pub mod alphabetic {
57-
static BITSET_LAST_CHUNK_MAP: (u16, u8) = (393, 13);
58-
static BITSET_CHUNKS_MAP: [u8; 393] = [
57+
const BITSET_LAST_CHUNK_MAP: u16 = 393;
58+
static BITSET_CHUNKS_MAP: [u8; 394] = [
5959
61, 18, 2, 35, 46, 39, 38, 74, 37, 25, 70, 34, 36, 73, 66, 5, 52, 58, 54, 58, 58, 58, 69,
6060
64, 43, 58, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
6161
6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 10, 6, 6, 23,
@@ -70,7 +70,7 @@ pub mod alphabetic {
7070
6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
7171
6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 21, 6, 6, 6, 6,
7272
6, 6, 6, 15, 72, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 8,
73-
58, 58, 58, 58, 58, 58, 6, 62, 58, 58, 6, 6, 6, 6, 6, 6, 6, 6, 6,
73+
58, 58, 58, 58, 58, 58, 6, 62, 58, 58, 6, 6, 6, 6, 6, 6, 6, 6, 6, 13,
7474
];
7575
static BITSET_INDEX_CHUNKS: [[u8; 8]; 75] = [
7676
[0, 252, 121, 172, 14, 172, 172, 172], [13, 51, 125, 172, 79, 35, 166, 172],
@@ -312,8 +312,8 @@ pub mod alphabetic {
312312

313313
#[rustfmt::skip]
314314
pub mod case_ignorable {
315-
static BITSET_LAST_CHUNK_MAP: (u16, u8) = (1792, 2);
316-
static BITSET_CHUNKS_MAP: [u8; 250] = [
315+
const BITSET_LAST_CHUNK_MAP: u16 = 1792;
316+
static BITSET_CHUNKS_MAP: [u8; 251] = [
317317
14, 28, 47, 22, 19, 11, 4, 13, 9, 40, 39, 32, 49, 23, 15, 36, 18, 39, 39, 39, 39, 39, 27,
318318
26, 12, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39,
319319
39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39,
@@ -324,7 +324,7 @@ pub mod case_ignorable {
324324
39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 44, 39, 35, 39, 39,
325325
39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39,
326326
39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 29, 39, 39, 39, 39, 39, 39, 39, 39, 39,
327-
34, 48, 39, 39, 39, 0, 39, 39, 21, 43, 39, 39, 45, 39, 39, 39, 39, 37,
327+
34, 48, 39, 39, 39, 0, 39, 39, 21, 43, 39, 39, 45, 39, 39, 39, 39, 37, 2,
328328
];
329329
static BITSET_INDEX_CHUNKS: [[u8; 8]; 52] = [
330330
[3, 75, 88, 142, 142, 142, 142, 142], [5, 110, 38, 181, 142, 142, 12, 182],
@@ -503,14 +503,14 @@ pub mod case_ignorable {
503503

504504
#[rustfmt::skip]
505505
pub mod cased {
506-
static BITSET_LAST_CHUNK_MAP: (u16, u8) = (124, 12);
507-
static BITSET_CHUNKS_MAP: [u8; 123] = [
506+
const BITSET_LAST_CHUNK_MAP: u16 = 124;
507+
static BITSET_CHUNKS_MAP: [u8; 124] = [
508508
4, 0, 18, 18, 6, 18, 18, 9, 5, 8, 18, 3, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18,
509509
18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 14, 15, 18, 18, 18, 18,
510510
18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 17, 16, 18, 1, 18, 10, 18, 18,
511511
7, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 13, 18,
512512
18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18,
513-
18, 2, 18, 18, 18, 18, 11,
513+
18, 2, 18, 18, 18, 18, 11, 12,
514514
];
515515
static BITSET_INDEX_CHUNKS: [[u8; 16]; 19] = [
516516
[5, 5, 7, 5, 50, 10, 40, 58, 58, 58, 58, 58, 58, 58, 58, 58],
@@ -594,9 +594,9 @@ pub mod cased {
594594

595595
#[rustfmt::skip]
596596
pub mod cc {
597-
static BITSET_LAST_CHUNK_MAP: (u16, u8) = (2, 0);
598-
static BITSET_CHUNKS_MAP: [u8; 2] = [
599-
0, 1,
597+
const BITSET_LAST_CHUNK_MAP: u16 = 2;
598+
static BITSET_CHUNKS_MAP: [u8; 3] = [
599+
0, 1, 0,
600600
];
601601
static BITSET_INDEX_CHUNKS: [[u8; 1]; 3] = [
602602
[0], [1], [2],
@@ -623,8 +623,8 @@ pub mod cc {
623623

624624
#[rustfmt::skip]
625625
pub mod grapheme_extend {
626-
static BITSET_LAST_CHUNK_MAP: (u16, u8) = (1792, 3);
627-
static BITSET_CHUNKS_MAP: [u8; 245] = [
626+
const BITSET_LAST_CHUNK_MAP: u16 = 1792;
627+
static BITSET_CHUNKS_MAP: [u8; 246] = [
628628
34, 30, 41, 44, 17, 11, 0, 12, 9, 36, 34, 29, 43, 20, 13, 34, 21, 34, 34, 34, 34, 34, 26,
629629
34, 16, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34,
630630
34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34,
@@ -635,7 +635,7 @@ pub mod grapheme_extend {
635635
34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 40, 34, 33, 34,
636636
34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34,
637637
34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 24, 34, 34, 34, 34, 34, 34, 34, 34,
638-
34, 32, 42, 34, 34, 34, 1, 34, 34, 19, 38, 34, 34, 39,
638+
34, 32, 42, 34, 34, 34, 1, 34, 34, 19, 38, 34, 34, 39, 3,
639639
];
640640
static BITSET_INDEX_CHUNKS: [[u8; 8]; 45] = [
641641
[1, 85, 27, 86, 34, 84, 100, 88], [4, 60, 71, 120, 120, 120, 120, 120],
@@ -785,12 +785,12 @@ pub mod grapheme_extend {
785785

786786
#[rustfmt::skip]
787787
pub mod lowercase {
788-
static BITSET_LAST_CHUNK_MAP: (u16, u8) = (122, 5);
789-
static BITSET_CHUNKS_MAP: [u8; 118] = [
788+
const BITSET_LAST_CHUNK_MAP: u16 = 122;
789+
static BITSET_CHUNKS_MAP: [u8; 119] = [
790790
16, 2, 9, 9, 4, 9, 9, 15, 3, 12, 9, 1, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
791791
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 11, 7, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
792792
9, 9, 9, 8, 10, 9, 0, 9, 14, 9, 9, 13, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
793-
9, 9, 6, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 17,
793+
9, 9, 6, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 17, 5,
794794
];
795795
static BITSET_INDEX_CHUNKS: [[u8; 16]; 18] = [
796796
[10, 55, 52, 6, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52],
@@ -886,8 +886,8 @@ pub mod lowercase {
886886

887887
#[rustfmt::skip]
888888
pub mod n {
889-
static BITSET_LAST_CHUNK_MAP: (u16, u8) = (253, 21);
890-
static BITSET_CHUNKS_MAP: [u8; 249] = [
889+
const BITSET_LAST_CHUNK_MAP: u16 = 253;
890+
static BITSET_CHUNKS_MAP: [u8; 250] = [
891891
45, 19, 19, 39, 23, 40, 6, 37, 33, 17, 19, 12, 42, 32, 41, 19, 8, 19, 2, 16, 19, 19, 13,
892892
19, 1, 43, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19,
893893
19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19,
@@ -898,7 +898,7 @@ pub mod n {
898898
19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 38, 19, 30, 19,
899899
19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19,
900900
19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19,
901-
19, 19, 27, 19, 18, 19, 19, 19, 19, 22, 26, 19, 19, 29, 19, 3, 19, 24,
901+
19, 19, 27, 19, 18, 19, 19, 19, 19, 22, 26, 19, 19, 29, 19, 3, 19, 24, 21,
902902
];
903903
static BITSET_INDEX_CHUNKS: [[u8; 8]; 47] = [
904904
[12, 52, 44, 44, 44, 44, 44, 44], [27, 44, 44, 44, 44, 44, 67, 44],
@@ -993,13 +993,13 @@ pub mod n {
993993

994994
#[rustfmt::skip]
995995
pub mod uppercase {
996-
static BITSET_LAST_CHUNK_MAP: (u16, u8) = (124, 3);
997-
static BITSET_CHUNKS_MAP: [u8; 123] = [
996+
const BITSET_LAST_CHUNK_MAP: u16 = 124;
997+
static BITSET_CHUNKS_MAP: [u8; 124] = [
998998
12, 15, 5, 5, 0, 5, 5, 2, 4, 11, 5, 14, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
999999
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 8, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
10001000
5, 5, 5, 6, 5, 13, 5, 10, 5, 5, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
10011001
5, 7, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 16, 5, 5,
1002-
5, 5, 9,
1002+
5, 5, 9, 3,
10031003
];
10041004
static BITSET_INDEX_CHUNKS: [[u8; 16]; 17] = [
10051005
[41, 41, 5, 33, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 5, 0],
@@ -1083,9 +1083,9 @@ pub mod uppercase {
10831083

10841084
#[rustfmt::skip]
10851085
pub mod white_space {
1086-
static BITSET_LAST_CHUNK_MAP: (u16, u8) = (32, 3);
1087-
static BITSET_CHUNKS_MAP: [u8; 22] = [
1088-
0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 2, 2, 2, 2, 1,
1086+
const BITSET_LAST_CHUNK_MAP: u16 = 32;
1087+
static BITSET_CHUNKS_MAP: [u8; 23] = [
1088+
0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 2, 2, 2, 2, 1, 3,
10891089
];
10901090
static BITSET_INDEX_CHUNKS: [[u8; 6]; 4] = [
10911091
[1, 4, 2, 4, 4, 4], [4, 4, 0, 3, 4, 4], [4, 4, 4, 4, 4, 4], [5, 4, 4, 4, 4, 4],

src/tools/unicode-table-generator/src/range_search.rs

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -8,22 +8,22 @@ fn range_search<
88
>(
99
needle: u32,
1010
chunk_idx_map: &[u8; N],
11-
(last_chunk_idx, last_chunk_mapping): (u16, u8),
11+
last_chunk_idx: u16,
1212
bitset_chunk_idx: &[[u8; CHUNK_SIZE]; N1],
1313
bitset_canonical: &[u64; CANONICAL],
1414
bitset_canonicalized: &[(u8, u8); CANONICALIZED],
1515
) -> bool {
1616
let bucket_idx = (needle / 64) as usize;
1717
let chunk_map_idx = bucket_idx / CHUNK_SIZE;
1818
let chunk_piece = bucket_idx % CHUNK_SIZE;
19-
let chunk_idx = if chunk_map_idx >= N {
20-
if chunk_map_idx == last_chunk_idx as usize {
21-
last_chunk_mapping
22-
} else {
23-
return false;
24-
}
25-
} else {
19+
// The last entry of `chunk_idx_map` actually should be at `last_chunk_idx`,
20+
// so we need to remap it
21+
let chunk_idx = if chunk_map_idx < (chunk_idx_map.len() - 1) {
2622
chunk_idx_map[chunk_map_idx]
23+
} else if chunk_map_idx == last_chunk_idx as usize {
24+
chunk_idx_map[chunk_idx_map.len() - 1]
25+
} else {
26+
return false;
2727
};
2828
let idx = bitset_chunk_idx[(chunk_idx as usize)][chunk_piece] as usize;
2929
let word = if idx < CANONICAL {

src/tools/unicode-table-generator/src/raw_emitter.rs

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -150,19 +150,22 @@ impl RawEmitter {
150150
while zero_chunk_idx.is_some() && chunk_indices.last().cloned() == zero_chunk_idx {
151151
chunk_indices.pop();
152152
}
153+
// We do not count the LAST_CHUNK_MAP as adding bytes because it's a
154+
// small constant whose values are inlined directly into the instruction
155+
// stream.
153156
writeln!(
154157
&mut self.file,
155-
"static BITSET_LAST_CHUNK_MAP: (u16, u8) = ({}, {});",
158+
"const BITSET_LAST_CHUNK_MAP: u16 = {};",
156159
chunk_indices.len() - 1,
157-
chunk_indices.pop().unwrap(),
158160
)
159161
.unwrap();
160-
self.bytes_used += 3;
162+
let nonzero = chunk_indices.pop().unwrap();
161163
// Try to pop again, now that we've recorded a non-zero pointing index
162164
// into the LAST_CHUNK_MAP.
163165
while zero_chunk_idx.is_some() && chunk_indices.last().cloned() == zero_chunk_idx {
164166
chunk_indices.pop();
165167
}
168+
chunk_indices.push(nonzero);
166169
writeln!(
167170
&mut self.file,
168171
"static BITSET_CHUNKS_MAP: [u8; {}] = [{}];",

0 commit comments

Comments
 (0)