Skip to content

Commit 6ace4ee

Browse files
authored
Merge pull request #2968 from jepler/more-efficient-translation
More efficient translation
2 parents d98151a + d0f9b59 commit 6ace4ee

File tree

8 files changed

+104
-27
lines changed

8 files changed

+104
-27
lines changed

main.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -185,7 +185,7 @@ bool maybe_run_list(const char ** filenames, pyexec_result_t* exec_result) {
185185
}
186186
mp_hal_stdout_tx_str(filename);
187187
const compressed_string_t* compressed = translate(" output:\n");
188-
char decompressed[compressed->length];
188+
char decompressed[decompress_length(compressed)];
189189
decompress(compressed, decompressed);
190190
mp_hal_stdout_tx_str(decompressed);
191191
pyexec_file(filename, exec_result);

py/builtinhelp.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -135,7 +135,7 @@ STATIC void mp_help_print_modules(void) {
135135

136136
// let the user know there may be other modules available from the filesystem
137137
const compressed_string_t* compressed = translate("Plus any modules on the filesystem\n");
138-
char decompressed[compressed->length];
138+
char decompressed[decompress_length(compressed)];
139139
decompress(compressed, decompressed);
140140
mp_print_str(MP_PYTHON_PRINTER, decompressed);
141141
}
@@ -181,7 +181,7 @@ STATIC mp_obj_t mp_builtin_help(size_t n_args, const mp_obj_t *args) {
181181
// print a general help message. Translate only works on single strings on one line.
182182
const compressed_string_t* compressed =
183183
translate("Welcome to Adafruit CircuitPython %s!\n\nPlease visit learn.adafruit.com/category/circuitpython for project guides.\n\nTo list built-in modules please do `help(\"modules\")`.\n");
184-
char decompressed[compressed->length];
184+
char decompressed[decompress_length(compressed)];
185185
decompress(compressed, decompressed);
186186
mp_printf(MP_PYTHON_PRINTER, decompressed, MICROPY_GIT_TAG);
187187
} else {

py/makeqstrdata.py

Lines changed: 47 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,10 @@
11
"""
22
Process raw qstr file and output qstr data with length, hash and data bytes.
33
4-
This script works with Python 2.6, 2.7, 3.3 and 3.4.
4+
This script works with Python 2.7, 3.3 and 3.4.
5+
6+
For documentation about the format of compressed translated strings, see
7+
supervisor/shared/translate.h
58
"""
69

710
from __future__ import print_function
@@ -132,19 +135,37 @@ def compute_huffman_coding(translations, qstrs, compression_filename):
132135
print("// estimated total memory size", len(lengths) + 2*len(values) + sum(len(cb[u]) for u in all_strings_concat))
133136
print("//", values, lengths)
134137
values_type = "uint16_t" if max(ord(u) for u in values) > 255 else "uint8_t"
138+
max_translation_encoded_length = max(len(translation.encode("utf-8")) for original,translation in translations)
135139
with open(compression_filename, "w") as f:
136140
f.write("const uint8_t lengths[] = {{ {} }};\n".format(", ".join(map(str, lengths))))
137141
f.write("const {} values[] = {{ {} }};\n".format(values_type, ", ".join(str(ord(u)) for u in values)))
142+
f.write("#define compress_max_length_bits ({})\n".format(max_translation_encoded_length.bit_length()))
138143
return values, lengths
139144

140-
def decompress(encoding_table, length, encoded):
145+
def decompress(encoding_table, encoded, encoded_length_bits):
141146
values, lengths = encoding_table
142-
#print(l, encoded)
143147
dec = []
144148
this_byte = 0
145149
this_bit = 7
146150
b = encoded[this_byte]
147-
for i in range(length):
151+
bits = 0
152+
for i in range(encoded_length_bits):
153+
bits <<= 1
154+
if 0x80 & b:
155+
bits |= 1
156+
157+
b <<= 1
158+
if this_bit == 0:
159+
this_bit = 7
160+
this_byte += 1
161+
if this_byte < len(encoded):
162+
b = encoded[this_byte]
163+
else:
164+
this_bit -= 1
165+
length = bits
166+
167+
i = 0
168+
while i < length:
148169
bits = 0
149170
bit_length = 0
150171
max_code = lengths[0]
@@ -170,10 +191,11 @@ def decompress(encoding_table, length, encoded):
170191
searched_length += lengths[bit_length]
171192

172193
v = values[searched_length + bits - max_code]
194+
i += len(v.encode('utf-8'))
173195
dec.append(v)
174196
return ''.join(dec)
175197

176-
def compress(encoding_table, decompressed):
198+
def compress(encoding_table, decompressed, encoded_length_bits, len_translation_encoded):
177199
if not isinstance(decompressed, str):
178200
raise TypeError()
179201
values, lengths = encoding_table
@@ -182,6 +204,19 @@ def compress(encoding_table, decompressed):
182204
#print(lengths)
183205
current_bit = 7
184206
current_byte = 0
207+
208+
code = len_translation_encoded
209+
bits = encoded_length_bits+1
210+
for i in range(bits - 1, 0, -1):
211+
if len_translation_encoded & (1 << (i - 1)):
212+
enc[current_byte] |= 1 << current_bit
213+
if current_bit == 0:
214+
current_bit = 7
215+
#print("packed {0:0{width}b}".format(enc[current_byte], width=8))
216+
current_byte += 1
217+
else:
218+
current_bit -= 1
219+
185220
for c in decompressed:
186221
#print()
187222
#print("char", c, values.index(c))
@@ -342,14 +377,17 @@ def print_qstr_data(encoding_table, qcfgs, qstrs, i18ns):
342377

343378
total_text_size = 0
344379
total_text_compressed_size = 0
380+
max_translation_encoded_length = max(len(translation.encode("utf-8")) for original, translation in i18ns)
381+
encoded_length_bits = max_translation_encoded_length.bit_length()
345382
for original, translation in i18ns:
346383
translation_encoded = translation.encode("utf-8")
347-
compressed = compress(encoding_table, translation)
384+
compressed = compress(encoding_table, translation, encoded_length_bits, len(translation_encoded))
348385
total_text_compressed_size += len(compressed)
349-
decompressed = decompress(encoding_table, len(translation_encoded), compressed)
386+
decompressed = decompress(encoding_table, compressed, encoded_length_bits)
387+
assert decompressed == translation
350388
for c in C_ESCAPES:
351389
decompressed = decompressed.replace(c, C_ESCAPES[c])
352-
print("TRANSLATION(\"{}\", {}, {{ {} }}) // {}".format(original, len(translation_encoded)+1, ", ".join(["0x{:02x}".format(x) for x in compressed]), decompressed))
390+
print("TRANSLATION(\"{}\", {}) // {}".format(original, ", ".join(["{:d}".format(x) for x in compressed]), decompressed))
353391
total_text_size += len(translation.encode("utf-8"))
354392

355393
print()
@@ -385,6 +423,7 @@ def print_qstr_enums(qstrs):
385423

386424
qcfgs, qstrs, i18ns = parse_input_headers(args.infiles)
387425
if args.translation:
426+
i18ns = sorted(i18ns)
388427
translations = translate(args.translation, i18ns)
389428
encoding_table = compute_huffman_coding(translations, qstrs, args.compression_filename)
390429
print_qstr_data(encoding_table, qcfgs, qstrs, translations)

py/moduerrno.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -158,7 +158,7 @@ const char *mp_common_errno_to_str(mp_obj_t errno_val, char *buf, size_t len) {
158158
case ENOSPC: desc = translate("No space left on device"); break;
159159
case EROFS: desc = translate("Read-only filesystem"); break;
160160
}
161-
if (desc != NULL && desc->length <= len) {
161+
if (desc != NULL && decompress_length(desc) <= len) {
162162
decompress(desc, buf);
163163
return buf;
164164
}

py/obj.c

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -94,17 +94,17 @@ void mp_obj_print_exception(const mp_print_t *print, mp_obj_t exc) {
9494
assert(n % 3 == 0);
9595
// Decompress the format strings
9696
const compressed_string_t* traceback = translate("Traceback (most recent call last):\n");
97-
char decompressed[traceback->length];
97+
char decompressed[decompress_length(traceback)];
9898
decompress(traceback, decompressed);
9999
#if MICROPY_ENABLE_SOURCE_LINE
100100
const compressed_string_t* frame = translate(" File \"%q\", line %d");
101101
#else
102102
const compressed_string_t* frame = translate(" File \"%q\"");
103103
#endif
104-
char decompressed_frame[frame->length];
104+
char decompressed_frame[decompress_length(frame)];
105105
decompress(frame, decompressed_frame);
106106
const compressed_string_t* block_fmt = translate(", in %q\n");
107-
char decompressed_block[block_fmt->length];
107+
char decompressed_block[decompress_length(block_fmt)];
108108
decompress(block_fmt, decompressed_block);
109109

110110
// Print the traceback

py/objexcept.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -400,7 +400,7 @@ mp_obj_t mp_obj_new_exception_msg_vlist(const mp_obj_type_t *exc_type, const com
400400

401401
// Try to allocate memory for the message
402402
mp_obj_str_t *o_str = m_new_obj_maybe(mp_obj_str_t);
403-
size_t o_str_alloc = fmt->length + 1;
403+
size_t o_str_alloc = decompress_length(fmt);
404404
byte *o_str_buf = m_new_maybe(byte, o_str_alloc);
405405

406406
bool used_emg_buf = false;
@@ -433,7 +433,7 @@ mp_obj_t mp_obj_new_exception_msg_vlist(const mp_obj_type_t *exc_type, const com
433433
// We have some memory to format the string
434434
struct _exc_printer_t exc_pr = {!used_emg_buf, o_str_alloc, 0, o_str_buf};
435435
mp_print_t print = {&exc_pr, exc_add_strn};
436-
char fmt_decompressed[fmt->length];
436+
char fmt_decompressed[decompress_length(fmt)];
437437
decompress(fmt, fmt_decompressed);
438438
mp_vprintf(&print, fmt_decompressed, ap);
439439
exc_pr.buf[exc_pr.len] = '\0';

supervisor/shared/translate.c

Lines changed: 18 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@
3737
#include "supervisor/serial.h"
3838

3939
void serial_write_compressed(const compressed_string_t* compressed) {
40-
char decompressed[compressed->length];
40+
char decompressed[decompress_length(compressed)];
4141
decompress(compressed, decompressed);
4242
serial_write(decompressed);
4343
}
@@ -58,12 +58,22 @@ STATIC int put_utf8(char *buf, int u) {
5858
}
5959
}
6060

61+
uint16_t decompress_length(const compressed_string_t* compressed) {
62+
if (compress_max_length_bits <= 8) {
63+
return 1 + (compressed->data >> (8 - compress_max_length_bits));
64+
} else {
65+
return 1 + ((compressed->data * 256 + compressed->tail[0]) >> (16 - compress_max_length_bits));
66+
}
67+
}
68+
6169
char* decompress(const compressed_string_t* compressed, char* decompressed) {
62-
uint8_t this_byte = 0;
63-
uint8_t this_bit = 7;
64-
uint8_t b = compressed->data[this_byte];
70+
uint8_t this_byte = compress_max_length_bits / 8;
71+
uint8_t this_bit = 7 - compress_max_length_bits % 8;
72+
uint8_t b = (&compressed->data)[this_byte];
73+
uint16_t length = decompress_length(compressed);
74+
6575
// Stop one early because the last byte is always NUL.
66-
for (uint16_t i = 0; i < compressed->length - 1;) {
76+
for (uint16_t i = 0; i < length - 1;) {
6777
uint32_t bits = 0;
6878
uint8_t bit_length = 0;
6979
uint32_t max_code = lengths[0];
@@ -78,7 +88,7 @@ char* decompress(const compressed_string_t* compressed, char* decompressed) {
7888
if (this_bit == 0) {
7989
this_bit = 7;
8090
this_byte += 1;
81-
b = compressed->data[this_byte]; // This may read past the end but its never used.
91+
b = (&compressed->data)[this_byte]; // This may read past the end but its never used.
8292
} else {
8393
this_bit -= 1;
8494
}
@@ -91,14 +101,14 @@ char* decompress(const compressed_string_t* compressed, char* decompressed) {
91101
i += put_utf8(decompressed + i, values[searched_length + bits - max_code]);
92102
}
93103

94-
decompressed[compressed->length-1] = '\0';
104+
decompressed[length-1] = '\0';
95105
return decompressed;
96106
}
97107

98108
inline __attribute__((always_inline)) const compressed_string_t* translate(const char* original) {
99109
#ifndef NO_QSTR
100110
#define QDEF(id, str)
101-
#define TRANSLATION(id, len, compressed...) if (strcmp(original, id) == 0) { static const compressed_string_t v = {.length = len, .data = compressed}; return &v; } else
111+
#define TRANSLATION(id, firstbyte, ...) if (strcmp(original, id) == 0) { static const compressed_string_t v = { .data = firstbyte, .tail = { __VA_ARGS__ } }; return &v; } else
102112
#include "genhdr/qstrdefs.generated.h"
103113
#undef TRANSLATION
104114
#undef QDEF

supervisor/shared/translate.h

Lines changed: 30 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -29,13 +29,41 @@
2929

3030
#include <stdint.h>
3131

32+
// The format of the compressed data is:
33+
// - the size of the uncompressed string in UTF-8 bytes, encoded as a
34+
// (compress_max_length_bits)-bit number. compress_max_length_bits is
35+
// computed during dictionary generation time, and happens to be 8
36+
// for all current platforms. However, it'll probably end up being
37+
// 9 in some translations sometime in the future. This length excludes
38+
// the trailing NUL, though notably decompress_length includes it.
39+
//
40+
// - followed by the huffman encoding of the individual UTF-16 code
41+
// points that make up the string. The trailing "\0" is not
42+
// represented by a huffman code, but is implied by the length.
43+
// (building the huffman encoding on UTF-16 code points gave better
44+
// compression than building it on UTF-8 bytes)
45+
//
46+
// The "data" / "tail" construct is so that the struct's last member is a
47+
// "flexible array". However, the _only_ member is not permitted to be
48+
// a flexible member, so we have to declare the first byte as a separate
49+
// member of the structure.
50+
//
51+
// For translations where length needs 8 bits, this saves about 1.5
52+
// bytes per string on average compared to a structure of {uint16_t,
53+
// flexible array}. The format is also future-proofed against strings
54+
// with UTF-8 length above 255 (which need a longer length field), where
55+
// the savings are about 1.375 bytes per string.
3256
typedef struct {
33-
uint16_t length;
34-
const uint8_t data[];
57+
uint8_t data;
58+
const uint8_t tail[];
3559
} compressed_string_t;
3660

61+
// Return the compressed, translated version of a source string
62+
// Usually, due to LTO, this is optimized into a load of a constant
63+
// pointer.
3764
const compressed_string_t* translate(const char* c);
3865
void serial_write_compressed(const compressed_string_t* compressed);
3966
char* decompress(const compressed_string_t* compressed, char* decompressed);
67+
uint16_t decompress_length(const compressed_string_t* compressed);
4068

4169
#endif // MICROPY_INCLUDED_SUPERVISOR_TRANSLATE_H

0 commit comments

Comments
 (0)