Compress word offset table

jepler · jepler · commit d59a28db9774 · 2021-08-07T09:23:35.000-05:00
By storing "count of words by length", the long `wends` table can be
replaced with a short `wlencount` table.  This saves flash storage space.

Extend the range of string lengths that can be in the dictionary.
Originally it was 2 to 9; at one point it was changed to 3 to 9.
Putting the lower bound back at 2 has a positive impact on the French
translation (a bunch of them, such as "ch", "\r\n", "%q", are used).
Increasing the maximum length gets 'mpossible', ' doit être ',
and 'CircuitPyth' at the long end.  This adds a bit of processing time
to makeqstrdata. The specific 2/11 values are again empirical, based on
the French translation on the adafruit_proxlight_trinkey_m0.
diff --git a/py/makeqstrdata.py b/py/makeqstrdata.py
@@ -333,12 +333,9 @@ def compute_huffman_coding(translations, compression_filename):
 
     bits_per_codepoint = 16 if max_ord > 255 else 8
     values_type = "uint16_t" if max_ord > 255 else "uint8_t"
-    max_words_len = 160 if max_ord > 255 else 255
-
-    sum_len = 0
-    while True:
+    while len(words) < max_words:
         # Until the dictionary is filled to capacity, use a heuristic to find
-        # the best "word" (3- to 9-gram) to add to it.
+        # the best "word" (2- to 11-gram) to add to it.
         #
         # The TextSplitter allows us to avoid considering parts of the text
         # that are already covered by a previously chosen word, for example
@@ -369,7 +366,8 @@ def est_len(occ):
         # the Huffman tree bumps up the encoding lengths of all words in the
         # same subtree.  In the extreme case when the new word is so frequent
         # that it gets a one-bit encoding, all other words will cost an extra
-        # bit each.
+        # bit each. This is empirically modeled by the constant factor added to
+        # cost, but the specific value used isn't "proven" to be correct.
         #
         # Another source of inaccuracy is that compressed strings end up
         # on byte boundaries, not bit boundaries, so saving 1 bit somewhere
@@ -383,14 +381,14 @@ def est_len(occ):
         # The difference between the two is the estimated net savings, in bits.
         def est_net_savings(s, occ):
             savings = occ * (bit_length(s) - est_len(occ))
-            cost = len(s) * bits_per_codepoint
+            cost = len(s) * bits_per_codepoint + 24
             return savings - cost
 
         counter = collections.Counter()
         for t in texts:
             for (found, word) in extractor.iter_words(t):
                 if not found:
-                    for substr in iter_substrings(word, minlen=3, maxlen=9):
+                    for substr in iter_substrings(word, minlen=2, maxlen=11):
                         counter[substr] += 1
 
         # Score the candidates we found.  This is a semi-empirical formula that
@@ -410,16 +408,9 @@ def est_net_savings(s, occ):
             break
 
         word = scores[0][0]
-
-        # If we can successfully add it to the dictionary, do so.  Otherwise,
-        # we've filled the dictionary to capacity and are done.
-        if sum_len + len(word) - 2 > max_words_len:
-            break
-        if len(words) == max_words:
-            break
         words.append(word)
-        sum_len += len(word) - 2
 
+    words.sort(key=len)
     extractor = TextSplitter(words)
     counter = collections.Counter()
     for t in texts:
@@ -469,30 +460,33 @@ def est_net_savings(s, occ):
         len(translation.encode("utf-8")) for (original, translation) in translations
     )
 
-    wends = list(len(w) - 2 for w in words)
-    for i in range(1, len(wends)):
-        wends[i] += wends[i - 1]
+    maxlen = len(words[-1])
+    minlen = len(words[0])
+    wlencount = [len([None for w in words if len(w) == l]) for l in range(minlen, maxlen + 1)]
 
     with open(compression_filename, "w") as f:
+        f.write("typedef {} mchar_t;".format(values_type))
         f.write("const uint8_t lengths[] = {{ {} }};\n".format(", ".join(map(str, lengths))))
         f.write(
-            "const {} values[] = {{ {} }};\n".format(
-                values_type, ", ".join(str(ord(u)) for u in values)
-            )
+            "const mchar_t values[] = {{ {} }};\n".format(", ".join(str(ord(u)) for u in values))
         )
         f.write(
             "#define compress_max_length_bits ({})\n".format(
                 max_translation_encoded_length.bit_length()
             )
         )
         f.write(
-            "const {} words[] = {{ {} }};\n".format(
-                values_type, ", ".join(str(ord(c)) for w in words for c in w)
+            "const mchar_t words[] = {{ {} }};\n".format(
+                ", ".join(str(ord(c)) for w in words for c in w)
             )
         )
-        f.write("const uint8_t wends[] = {{ {} }};\n".format(", ".join(str(p) for p in wends)))
+        f.write(
+            "const uint8_t wlencount[] = {{ {} }};\n".format(", ".join(str(p) for p in wlencount))
+        )
         f.write("#define word_start {}\n".format(word_start))
         f.write("#define word_end {}\n".format(word_end))
+        f.write("#define minlen {}\n".format(minlen))
+        f.write("#define maxlen {}\n".format(maxlen))
 
     return (values, lengths, words, canonical, extractor)
 
diff --git a/supervisor/shared/translate.c b/supervisor/shared/translate.c
@@ -43,22 +43,34 @@ void serial_write_compressed(const compressed_string_t *compressed) {
     serial_write(decompressed);
 }
 
+STATIC void get_word(int n, const mchar_t **pos, const mchar_t **end) {
+    int len = minlen; // length of the words in the group currently being examined
+    int i = 0; // index into wlencount; entry i counts the words of length minlen + i
+    *pos = words; // start at the beginning of the length-sorted words table
+    while (wlencount[i] <= n) { // word n is not in group i, so skip the whole group
+        n -= wlencount[i];
+        *pos += len * wlencount[i];
+        i++;
+        len++;
+    }
+    *pos += len * n; // n is now the word's index within the group of len-long words
+    *end = *pos + len; // one past the last code point of the word
+}
+
 STATIC int put_utf8(char *buf, int u) {
     if (u <= 0x7f) {
         *buf = u;
         return 1;
     } else if (word_start <= u && u <= word_end) {
         uint n = (u - word_start);
-        size_t pos = 0;
-        if (n > 0) {
-            pos = wends[n - 1] + (n * 2);
-        }
+        const mchar_t *pos, *end;
+        get_word(n, &pos, &end);
         int ret = 0;
         // note that at present, entries in the words table are
         // guaranteed not to represent words themselves, so this adds
         // at most 1 level of recursive call
-        for (; pos < wends[n] + (n + 1) * 2; pos++) {
-            int len = put_utf8(buf, words[pos]);
+        for (; pos < end; pos++) {
+            int len = put_utf8(buf, *pos);
             buf += len;
             ret += len;
         }
diff --git a/supervisor/shared/translate.h b/supervisor/shared/translate.h
@@ -50,11 +50,14 @@
 //   are computed with a heuristic based on frequent substrings of 2 to
 //   9 code points.  These are called "words" but are not, grammatically
 //   speaking, words.  They're just spans of code points that frequently
-//   occur together.
+//   occur together.  They are ordered shortest to longest.
 //
 // - dictionary entries are non-overlapping, and the _ending_ index of each
-//   entry is stored in an array.  Since the index given is the ending
-//   index, the array is called "wends".
+//   entry is stored in an array.  A count of words of each length, from
+//   minlen to maxlen, is given in the array called wlencount.  From
+//   this small array, the start and end of the N'th word can be
+//   calculated by an efficient, small loop.  (A bit of time is traded
+//   to reduce the size of this table indicating lengths)
 //
 // The "data" / "tail" construct is so that the struct's last member is a
 // "flexible array".  However, the _only_ member is not permitted to be