Skip to content

Commit 15886b1

Browse files
authored
Merge pull request #2345 from jepler/compressed-unicode
translation: Compress as unicode, not bytes
2 parents fce81e6 + 1a0dcb5 commit 15886b1

File tree

2 files changed

+36
-23
lines changed

2 files changed

+36
-23
lines changed

py/makeqstrdata.py

Lines changed: 18 additions & 21 deletions
Original file line number | Diff line number | Diff line change
@@ -103,14 +103,10 @@ def compute_huffman_coding(translations, qstrs, compression_filename):
103103
# go through each qstr and print it out
104104
for _, _, qstr in qstrs.values():
105105
all_strings.append(qstr)
106-
all_strings_concat = "".join(all_strings).encode("utf-8")
106+
all_strings_concat = "".join(all_strings)
107107
counts = collections.Counter(all_strings_concat)
108-
# add other values
109-
for i in range(256):
110-
if i not in counts:
111-
counts[i] = 0
112108
cb = huffman.codebook(counts.items())
113-
values = bytearray()
109+
values = []
114110
length_count = {}
115111
renumbered = 0
116112
last_l = None
@@ -124,26 +120,27 @@ def compute_huffman_coding(translations, qstrs, compression_filename):
124120
if last_l:
125121
renumbered <<= (l - last_l)
126122
canonical[ch] = '{0:0{width}b}'.format(renumbered, width=l)
127-
if chr(ch) in C_ESCAPES:
128-
s = C_ESCAPES[chr(ch)]
129-
else:
130-
s = chr(ch)
131-
print("//", ch, s, counts[ch], canonical[ch], renumbered)
123+
s = C_ESCAPES.get(ch, ch)
124+
print("//", ord(ch), s, counts[ch], canonical[ch], renumbered)
132125
renumbered += 1
133126
last_l = l
134127
lengths = bytearray()
135-
for i in range(1, max(length_count) + 1):
128+
print("// length count", length_count)
129+
for i in range(1, max(length_count) + 2):
136130
lengths.append(length_count.get(i, 0))
131+
print("// values", values, "lengths", len(lengths), lengths)
132+
print("// estimated total memory size", len(lengths) + 2*len(values) + sum(len(cb[u]) for u in all_strings_concat))
137133
print("//", values, lengths)
134+
values_type = "uint16_t" if max(ord(u) for u in values) > 255 else "uint8_t"
138135
with open(compression_filename, "w") as f:
139136
f.write("const uint8_t lengths[] = {{ {} }};\n".format(", ".join(map(str, lengths))))
140-
f.write("const uint8_t values[256] = {{ {} }};\n".format(", ".join(map(str, values))))
137+
f.write("const {} values[] = {{ {} }};\n".format(values_type, ", ".join(str(ord(u)) for u in values)))
141138
return values, lengths
142139

143140
def decompress(encoding_table, length, encoded):
144141
values, lengths = encoding_table
145142
#print(l, encoded)
146-
dec = bytearray(length)
143+
dec = []
147144
this_byte = 0
148145
this_bit = 7
149146
b = encoded[this_byte]
@@ -173,14 +170,14 @@ def decompress(encoding_table, length, encoded):
173170
searched_length += lengths[bit_length]
174171

175172
v = values[searched_length + bits - max_code]
176-
dec[i] = v
177-
return dec
173+
dec.append(v)
174+
return ''.join(dec)
178175

179176
def compress(encoding_table, decompressed):
180-
if not isinstance(decompressed, bytes):
177+
if not isinstance(decompressed, str):
181178
raise TypeError()
182179
values, lengths = encoding_table
183-
enc = bytearray(len(decompressed) * 2)
180+
enc = bytearray(len(decompressed) * 3)
184181
#print(decompressed)
185182
#print(lengths)
186183
current_bit = 7
@@ -228,7 +225,7 @@ def compress(encoding_table, decompressed):
228225
if current_bit != 7:
229226
current_byte += 1
230227
if current_byte > len(decompressed):
231-
print("Note: compression increased length", repr(decompressed.decode('utf-8')), len(decompressed), current_byte, file=sys.stderr)
228+
print("Note: compression increased length", repr(decompressed), len(decompressed), current_byte, file=sys.stderr)
232229
return enc[:current_byte]
233230

234231
def qstr_escape(qst):
@@ -347,9 +344,9 @@ def print_qstr_data(encoding_table, qcfgs, qstrs, i18ns):
347344
total_text_compressed_size = 0
348345
for original, translation in i18ns:
349346
translation_encoded = translation.encode("utf-8")
350-
compressed = compress(encoding_table, translation_encoded)
347+
compressed = compress(encoding_table, translation)
351348
total_text_compressed_size += len(compressed)
352-
decompressed = decompress(encoding_table, len(translation_encoded), compressed).decode("utf-8")
349+
decompressed = decompress(encoding_table, len(translation_encoded), compressed)
353350
for c in C_ESCAPES:
354351
decompressed = decompressed.replace(c, C_ESCAPES[c])
355352
print("TRANSLATION(\"{}\", {}, {{ {} }}) // {}".format(original, len(translation_encoded)+1, ", ".join(["0x{:02x}".format(x) for x in compressed]), decompressed))

supervisor/shared/translate.c

Lines changed: 18 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -42,12 +42,28 @@ void serial_write_compressed(const compressed_string_t* compressed) {
4242
serial_write(decompressed);
4343
}
4444

45+
// Encode the code point `u` as UTF-8 into `buf` and return the number of
// bytes written (1, 2 or 3). No NUL terminator is written; the caller must
// provide room for up to 3 bytes.
// NOTE(review): code points above 0xffff (4-byte sequences) are not handled
// here -- presumably the generated `values` table never exceeds 16 bits;
// confirm against the table generator.
STATIC int put_utf8(char *buf, int u) {
    if (u <= 0x7f) {
        // 1 byte: 0xxxxxxx
        *buf = u;
        return 1;
    } else if (u <= 0x07ff) {
        // 2 bytes: 110xxxxx 10xxxxxx
        *buf++ = 0b11000000 | (u >> 6);
        *buf = 0b10000000 | (u & 0b00111111);
        return 2;
    } else { // u <= 0xffff
        // 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx
        // The lead byte must carry the 1110 prefix, and each continuation
        // byte must land in its own slot, so advance buf after every store
        // except the last.
        *buf++ = 0b11100000 | (u >> 12);
        *buf++ = 0b10000000 | ((u >> 6) & 0b00111111);
        *buf = 0b10000000 | (u & 0b00111111);
        return 3;
    }
}
60+
4561
char* decompress(const compressed_string_t* compressed, char* decompressed) {
4662
uint8_t this_byte = 0;
4763
uint8_t this_bit = 7;
4864
uint8_t b = compressed->data[this_byte];
4965
// Stop one early because the last byte is always NULL.
50-
for (uint16_t i = 0; i < compressed->length - 1; i++) {
66+
for (uint16_t i = 0; i < compressed->length - 1;) {
5167
uint32_t bits = 0;
5268
uint8_t bit_length = 0;
5369
uint32_t max_code = lengths[0];
@@ -72,7 +88,7 @@ char* decompress(const compressed_string_t* compressed, char* decompressed) {
7288
max_code = (max_code << 1) + lengths[bit_length];
7389
searched_length += lengths[bit_length];
7490
}
75-
decompressed[i] = values[searched_length + bits - max_code];
91+
i += put_utf8(decompressed + i, values[searched_length + bits - max_code]);
7692
}
7793

7894
decompressed[compressed->length-1] = '\0';

0 commit comments

Comments
 (0)