Commit d59a28d

Compress word offset table
By storing a "count of words by length", the long `wends` table can be replaced with a short `wlencount` table. This saves flash storage space. Also extend the range of string lengths that can be in the dictionary: originally it was 2 to 9; at one point it was changed to 3 to 9. Putting the lower bound back at 2 has a positive impact on the French translation (a number of 2-character words, such as "ch", "\r\n", and "%q", are used). Increasing the maximum length to 11 picks up 'mpossible', ' doit être ', and 'CircuitPyth' at the long end. This adds a bit of processing time to makeqstrdata. The specific 2/11 values are again empirical, based on the French translation on the adafruit_proxlight_trinkey_m0 build.
1 parent 063e394 commit d59a28d
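
For concreteness, here is a minimal sketch of the two table shapes on a toy dictionary (the four words below are made up; the real tables are emitted by `compute_huffman_coding` in py/makeqstrdata.py):

```python
# Toy dictionary, sorted shortest to longest (as this commit now requires).
words = ["ch", "%q", "ing", "tion"]

# Old scheme: one cumulative end offset per word (one uint8_t each).
wends = [len(w) - 2 for w in words]
for i in range(1, len(wends)):
    wends[i] += wends[i - 1]
# wends == [0, 0, 1, 3] -- grows with the number of words

# New scheme: one count per word length, from minlen to maxlen.
minlen, maxlen = len(words[0]), len(words[-1])
wlencount = [sum(1 for w in words if len(w) == l) for l in range(minlen, maxlen + 1)]
# wlencount == [2, 1, 1] -- grows only with the range of lengths
```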

3 files changed: +43, -34 lines

py/makeqstrdata.py

Lines changed: 19 additions & 25 deletions
```diff
@@ -333,12 +333,9 @@ def compute_huffman_coding(translations, compression_filename):
 
     bits_per_codepoint = 16 if max_ord > 255 else 8
     values_type = "uint16_t" if max_ord > 255 else "uint8_t"
-    max_words_len = 160 if max_ord > 255 else 255
-
-    sum_len = 0
-    while True:
+    while len(words) < max_words:
         # Until the dictionary is filled to capacity, use a heuristic to find
-        # the best "word" (3- to 9-gram) to add to it.
+        # the best "word" (2- to 11-gram) to add to it.
         #
         # The TextSplitter allows us to avoid considering parts of the text
         # that are already covered by a previously chosen word, for example
@@ -369,7 +366,8 @@ def est_len(occ):
         # the Huffman tree bumps up the encoding lengths of all words in the
         # same subtree. In the extreme case when the new word is so frequent
         # that it gets a one-bit encoding, all other words will cost an extra
-        # bit each.
+        # bit each. This is empirically modeled by the constant factor added to
+        # cost, but the specific value used isn't "proven" to be correct.
         #
         # Another source of inaccuracy is that compressed strings end up
         # on byte boundaries, not bit boundaries, so saving 1 bit somewhere
@@ -383,14 +381,14 @@ def est_len(occ):
         # The difference between the two is the estimated net savings, in bits.
         def est_net_savings(s, occ):
             savings = occ * (bit_length(s) - est_len(occ))
-            cost = len(s) * bits_per_codepoint
+            cost = len(s) * bits_per_codepoint + 24
             return savings - cost
 
         counter = collections.Counter()
         for t in texts:
             for (found, word) in extractor.iter_words(t):
                 if not found:
-                    for substr in iter_substrings(word, minlen=3, maxlen=9):
+                    for substr in iter_substrings(word, minlen=2, maxlen=11):
                         counter[substr] += 1
 
         # Score the candidates we found. This is a semi-empirical formula that
@@ -410,16 +408,9 @@ def est_net_savings(s, occ):
             break
 
         word = scores[0][0]
-
-        # If we can successfully add it to the dictionary, do so. Otherwise,
-        # we've filled the dictionary to capacity and are done.
-        if sum_len + len(word) - 2 > max_words_len:
-            break
-        if len(words) == max_words:
-            break
         words.append(word)
-        sum_len += len(word) - 2
 
+    words.sort(key=len)
     extractor = TextSplitter(words)
     counter = collections.Counter()
     for t in texts:
@@ -469,30 +460,33 @@ def est_net_savings(s, occ):
         len(translation.encode("utf-8")) for (original, translation) in translations
     )
 
-    wends = list(len(w) - 2 for w in words)
-    for i in range(1, len(wends)):
-        wends[i] += wends[i - 1]
+    maxlen = len(words[-1])
+    minlen = len(words[0])
+    wlencount = [len([None for w in words if len(w) == l]) for l in range(minlen, maxlen + 1)]
 
     with open(compression_filename, "w") as f:
+        f.write("typedef {} mchar_t;".format(values_type))
         f.write("const uint8_t lengths[] = {{ {} }};\n".format(", ".join(map(str, lengths))))
         f.write(
-            "const {} values[] = {{ {} }};\n".format(
-                values_type, ", ".join(str(ord(u)) for u in values)
-            )
+            "const mchar_t values[] = {{ {} }};\n".format(", ".join(str(ord(u)) for u in values))
         )
         f.write(
            "#define compress_max_length_bits ({})\n".format(
                max_translation_encoded_length.bit_length()
            )
        )
         f.write(
-            "const {} words[] = {{ {} }};\n".format(
-                values_type, ", ".join(str(ord(c)) for w in words for c in w)
+            "const mchar_t words[] = {{ {} }};\n".format(
+                ", ".join(str(ord(c)) for w in words for c in w)
             )
         )
-        f.write("const uint8_t wends[] = {{ {} }};\n".format(", ".join(str(p) for p in wends)))
+        f.write(
+            "const uint8_t wlencount[] = {{ {} }};\n".format(", ".join(str(p) for p in wlencount))
+        )
         f.write("#define word_start {}\n".format(word_start))
         f.write("#define word_end {}\n".format(word_end))
+        f.write("#define minlen {}\n".format(minlen))
+        f.write("#define maxlen {}\n".format(maxlen))
 
     return (values, lengths, words, canonical, extractor)
```
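The `+ 24` added to `cost` above is the empirical constant the new comment refers to. A rough, self-contained model of how it biases word selection (here `bit_length` and `est_len` are simplified stand-ins for the real estimators in makeqstrdata.py, and the example words and counts are invented):

```python
bits_per_codepoint = 8  # assume an all-ASCII translation

def bit_length(s):
    # Stand-in: pretend each code point currently costs about 9 bits.
    return 9 * len(s)

def est_len(occ):
    # Stand-in: more frequent words get shorter Huffman codes.
    return max(2, 14 - occ.bit_length())

def est_net_savings(s, occ):
    savings = occ * (bit_length(s) - est_len(occ))
    # Without the +24, a marginal word with tiny per-use savings could
    # still score positive; the constant models the hidden cost of
    # lengthening every other code in the Huffman tree.
    cost = len(s) * bits_per_codepoint + 24
    return savings - cost

print(est_net_savings("e ", 50))  # frequent bigram: clearly positive (460)
print(est_net_savings("qz", 2))   # rare bigram: negative (-28), not worth adding
```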

supervisor/shared/translate.c

Lines changed: 18 additions & 6 deletions
```diff
@@ -43,22 +43,34 @@ void serial_write_compressed(const compressed_string_t *compressed) {
     serial_write(decompressed);
 }
 
+STATIC void get_word(int n, const mchar_t **pos, const mchar_t **end) {
+    int len = minlen;
+    int i = 0;
+    *pos = words;
+    while (wlencount[i] <= n) {
+        n -= wlencount[i];
+        *pos += len * wlencount[i];
+        i++;
+        len++;
+    }
+    *pos += len * n;
+    *end = *pos + len;
+}
+
 STATIC int put_utf8(char *buf, int u) {
     if (u <= 0x7f) {
         *buf = u;
         return 1;
     } else if (word_start <= u && u <= word_end) {
         uint n = (u - word_start);
-        size_t pos = 0;
-        if (n > 0) {
-            pos = wends[n - 1] + (n * 2);
-        }
+        const mchar_t *pos, *end;
+        get_word(n, &pos, &end);
         int ret = 0;
         // note that at present, entries in the words table are
         // guaranteed not to represent words themselves, so this adds
         // at most 1 level of recursive call
-        for (; pos < wends[n] + (n + 1) * 2; pos++) {
-            int len = put_utf8(buf, words[pos]);
+        for (; pos < end; pos++) {
+            int len = put_utf8(buf, *pos);
             buf += len;
             ret += len;
         }
```
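To see how the decoder recovers a word from `wlencount` alone, here is a Python model of the `get_word()` loop above, run against a hand-built table (the word list is made up; the C version walks pointers into the generated `words[]` array instead of string indices):

```python
words = ["ch", "%q", "th", "ing", "tion", " doit "]  # sorted shortest to longest
minlen, maxlen = len(words[0]), len(words[-1])
wlencount = [sum(1 for w in words if len(w) == l) for l in range(minlen, maxlen + 1)]
flat = "".join(words)  # the concatenated words[] table has no separators

def get_word(n):
    length = minlen
    pos = 0
    i = 0
    # Skip over whole groups of words shorter than the one we want.
    while wlencount[i] <= n:
        n -= wlencount[i]
        pos += length * wlencount[i]
        i += 1
        length += 1
    pos += length * n  # then index within the group of equal-length words
    return flat[pos:pos + length]

assert get_word(3) == "ing"
assert get_word(5) == " doit "  # zero-count lengths (here 5) are skipped correctly
```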

supervisor/shared/translate.h

Lines changed: 6 additions & 3 deletions
```diff
@@ -50,11 +50,14 @@
 // are computed with a heuristic based on frequent substrings of 2 to
 // 9 code points. These are called "words" but are not, grammatically
 // speaking, words. They're just spans of code points that frequently
-// occur together.
+// occur together. They are ordered shortest to longest.
 //
 // - dictionary entries are non-overlapping, and the _ending_ index of each
-//   entry is stored in an array. Since the index given is the ending
-//   index, the array is called "wends".
+//   entry is stored in an array. A count of words of each length, from
+//   minlen to maxlen, is given in the array called wlencount. From
+//   this small array, the start and end of the N'th word can be
+//   calculated by an efficient, small loop. (A bit of time is traded
+//   to reduce the size of this table indicating lengths)
 //
 // The "data" / "tail" construct is so that the struct's last member is a
 // "flexible array". However, the _only_ member is not permitted to be
```
