
Commit e06a3bb

translation: Compress as unicode, not bytes
By treating each Unicode code point as a single symbol for Huffman compression, the overall compression ratio improves somewhat without changing the algorithm. On the decompression side, when compressed values above 127 are encountered, they must be converted from a 16-bit Unicode code point back into a UTF-8 byte sequence.

Doing this returns approximately 1.5 kB of flash storage with the zh_Latn_pinyin translation (292 -> 1768 bytes remaining in my build of trinket_m0). Other "more ASCII" translations benefit less, and in fact zh_Latn_pinyin is no longer the most constrained translation! (de_DE: 1156 -> 1384 bytes free in flash; I didn't check others before pushing for CI.)

English is slightly pessimized, 2840 -> 2788 bytes, probably mostly because the "values" array was changed from uint8_t to uint16_t, which is not strictly required for an all-ASCII translation. This could probably be avoided in that case, but as English is not the most constrained translation it doesn't really matter.

Testing performed: built for Feather nRF52840 Express and Trinket M0 in English and zh_Latn_pinyin; ran and verified that localized messages such as "Àn xià rènhé jiàn jìnrù REPL. Shǐyòng CTRL-D chóngxīn jiāzài." and "Press any key to enter the REPL. Use CTRL-D to reload." were displayed properly.
1 parent 83ecb1b commit e06a3bb
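
For context, the gain comes from the symbol alphabet: a non-ASCII character that occupies two or three UTF-8 bytes previously consumed two or three Huffman codes, while compressing over code points assigns it a single code. A minimal sketch of the difference in symbol streams (the sample string is purely illustrative, not taken from the translation files):

from collections import Counter

# Illustrative sample mixing ASCII and accented pinyin characters.
msg = "Àn xià rènhé jiàn jìnrù REPL."

# Old scheme: Huffman symbols are UTF-8 bytes, so every accented character
# contributes two byte symbols and therefore at least two Huffman codes.
byte_symbols = msg.encode("utf-8")

# New scheme: Huffman symbols are Unicode code points, so each accented
# character is a single symbol that receives exactly one Huffman code.
codepoint_symbols = msg

print(len(byte_symbols), "byte symbols vs", len(codepoint_symbols), "code-point symbols")
print("distinct symbols:", len(Counter(byte_symbols)), "vs", len(Counter(codepoint_symbols)))

Fewer emitted codes per message is the effect behind the ~1.5 kB of flash reported for zh_Latn_pinyin above.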

File tree

2 files changed: +34, -22 lines


py/makeqstrdata.py

Lines changed: 16 additions & 20 deletions
@@ -103,14 +103,10 @@ def compute_huffman_coding(translations, qstrs, compression_filename):
     # go through each qstr and print it out
     for _, _, qstr in qstrs.values():
         all_strings.append(qstr)
-    all_strings_concat = "".join(all_strings).encode("utf-8")
+    all_strings_concat = "".join(all_strings)
     counts = collections.Counter(all_strings_concat)
-    # add other values
-    for i in range(256):
-        if i not in counts:
-            counts[i] = 0
     cb = huffman.codebook(counts.items())
-    values = bytearray()
+    values = []
     length_count = {}
     renumbered = 0
     last_l = None
@@ -124,26 +120,26 @@ def compute_huffman_coding(translations, qstrs, compression_filename):
         if last_l:
             renumbered <<= (l - last_l)
         canonical[ch] = '{0:0{width}b}'.format(renumbered, width=l)
-        if chr(ch) in C_ESCAPES:
-            s = C_ESCAPES[chr(ch)]
-        else:
-            s = chr(ch)
-        print("//", ch, s, counts[ch], canonical[ch], renumbered)
+        s = C_ESCAPES.get(ch, ch)
+        print("//", ord(ch), s, counts[ch], canonical[ch], renumbered)
         renumbered += 1
         last_l = l
     lengths = bytearray()
-    for i in range(1, max(length_count) + 1):
+    print("// length count", length_count)
+    for i in range(1, max(length_count) + 2):
         lengths.append(length_count.get(i, 0))
+    print("// values", values, "lengths", len(lengths), lengths)
+    print("// estimated total memory size", len(lengths) + 2*len(values) + sum(len(cb[u]) for u in all_strings_concat))
     print("//", values, lengths)
     with open(compression_filename, "w") as f:
         f.write("const uint8_t lengths[] = {{ {} }};\n".format(", ".join(map(str, lengths))))
-        f.write("const uint8_t values[256] = {{ {} }};\n".format(", ".join(map(str, values))))
+        f.write("const uint16_t values[] = {{ {} }};\n".format(", ".join(str(ord(u)) for u in values)))
     return values, lengths
 
 def decompress(encoding_table, length, encoded):
     values, lengths = encoding_table
     #print(l, encoded)
-    dec = bytearray(length)
+    dec = []
     this_byte = 0
     this_bit = 7
     b = encoded[this_byte]
@@ -173,14 +169,14 @@ def decompress(encoding_table, length, encoded):
             searched_length += lengths[bit_length]
 
         v = values[searched_length + bits - max_code]
-        dec[i] = v
-    return dec
+        dec.append(v)
+    return ''.join(dec)
 
 def compress(encoding_table, decompressed):
-    if not isinstance(decompressed, bytes):
+    if not isinstance(decompressed, str):
         raise TypeError()
     values, lengths = encoding_table
-    enc = bytearray(len(decompressed) * 2)
+    enc = bytearray(len(decompressed) * 3)
     #print(decompressed)
     #print(lengths)
     current_bit = 7
@@ -347,9 +343,9 @@ def print_qstr_data(encoding_table, qcfgs, qstrs, i18ns):
     total_text_compressed_size = 0
     for original, translation in i18ns:
         translation_encoded = translation.encode("utf-8")
-        compressed = compress(encoding_table, translation_encoded)
+        compressed = compress(encoding_table, translation)
         total_text_compressed_size += len(compressed)
-        decompressed = decompress(encoding_table, len(translation_encoded), compressed).decode("utf-8")
+        decompressed = decompress(encoding_table, len(translation_encoded), compressed)
         for c in C_ESCAPES:
             decompressed = decompressed.replace(c, C_ESCAPES[c])
         print("TRANSLATION(\"{}\", {}, {{ {} }}) // {}".format(original, len(translation_encoded)+1, ", ".join(["0x{:02x}".format(x) for x in compressed]), decompressed))

supervisor/shared/translate.c

Lines changed: 18 additions & 2 deletions
@@ -42,12 +42,28 @@ void serial_write_compressed(const compressed_string_t* compressed) {
     serial_write(decompressed);
 }
 
+STATIC int put_utf8(char *buf, int u) {
+    if(u <= 0x7f) {
+        *buf = u;
+        return 1;
+    } else if(u <= 0x07ff) {
+        *buf++ = 0b11000000 | (u >> 6);
+        *buf = 0b10000000 | (u & 0b00111111);
+        return 2;
+    } else { // u <= 0xffff
+        *buf++ = 0b11100000 | (u >> 12);
+        *buf++ = 0b10000000 | ((u >> 6) & 0b00111111);
+        *buf = 0b10000000 | (u & 0b00111111);
+        return 3;
+    }
+}
+
 char* decompress(const compressed_string_t* compressed, char* decompressed) {
     uint8_t this_byte = 0;
     uint8_t this_bit = 7;
     uint8_t b = compressed->data[this_byte];
     // Stop one early because the last byte is always NULL.
-    for (uint16_t i = 0; i < compressed->length - 1; i++) {
+    for (uint16_t i = 0; i < compressed->length - 1;) {
         uint32_t bits = 0;
         uint8_t bit_length = 0;
         uint32_t max_code = lengths[0];
@@ -72,7 +88,7 @@ char* decompress(const compressed_string_t* compressed, char* decompressed) {
             max_code = (max_code << 1) + lengths[bit_length];
             searched_length += lengths[bit_length];
         }
-        decompressed[i] = values[searched_length + bits - max_code];
+        i += put_utf8(decompressed + i, values[searched_length + bits - max_code]);
     }
 
     decompressed[compressed->length-1] = '\0';
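
The new put_utf8() helper is the decompression-side counterpart of compressing over code points: each decoded value is written back as a UTF-8 byte sequence, with values above 127 expanding to two or three bytes. A minimal Python sketch of the same BMP-only encoding (code points up to 0xFFFF), cross-checked against Python's built-in encoder:

def put_utf8(u: int) -> bytes:
    # Encode one Unicode code point (BMP only, u <= 0xFFFF) as UTF-8 bytes,
    # using the same bit layout as the C helper above.
    if u <= 0x7F:
        return bytes([u])
    elif u <= 0x07FF:
        return bytes([0b11000000 | (u >> 6),
                      0b10000000 | (u & 0b00111111)])
    else:  # u <= 0xFFFF
        return bytes([0b11100000 | (u >> 12),
                      0b10000000 | ((u >> 6) & 0b00111111),
                      0b10000000 | (u & 0b00111111)])

# Cross-check representative 1-, 2- and 3-byte cases.
for ch in "A", "À", "ǐ", "按":
    assert put_utf8(ord(ch)) == ch.encode("utf-8")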
