
Commit e06a3bb

translation: Compress as unicode, not bytes
By treating each Unicode code point as a single symbol for Huffman compression, the overall compression ratio improves somewhat without changing the algorithm. On the decompression side, when compressed values above 127 are encountered, they must be converted from a 16-bit Unicode code point back into a UTF-8 byte sequence.

Doing this returns approximately 1.5 kB of flash storage with the zh_Latn_pinyin translation (292 -> 1768 bytes remaining in my build of trinket_m0). Other "more ASCII" translations benefit less, and in fact zh_Latn_pinyin is no longer the most constrained translation! (de_DE: 1156 -> 1384 bytes free in flash; I didn't check others before pushing for CI.)

English is slightly pessimized, 2840 -> 2788 bytes, probably mostly because the "values" array was changed from uint8_t to uint16_t, which is not strictly required for an all-ASCII translation. This could probably be avoided in that case, but as English is not the most constrained translation it doesn't really matter.

Testing performed: built for Feather nRF52840 Express and Trinket M0 in English and zh_Latn_pinyin; ran and verified that localized messages such as "Àn xià rènhé jiàn jìnrù REPL. Shǐyòng CTRL-D chóngxīn jiāzài." and "Press any key to enter the REPL. Use CTRL-D to reload." were displayed properly.
1 parent 83ecb1b commit e06a3bb
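
For context, the gain comes from the symbol alphabet: a non-ASCII character that occupies two or three UTF-8 bytes previously consumed two or three Huffman codes, while compressing over code points assigns it a single code. A minimal sketch of the difference in symbol streams (the sample string is purely illustrative, not taken from the translation files):

from collections import Counter

# Illustrative sample mixing ASCII and accented pinyin characters.
msg = "Àn xià rènhé jiàn jìnrù REPL."

# Old scheme: Huffman symbols are UTF-8 bytes, so every accented character
# contributes two byte symbols and therefore at least two Huffman codes.
byte_symbols = msg.encode("utf-8")

# New scheme: Huffman symbols are Unicode code points, so each accented
# character is a single symbol that receives exactly one Huffman code.
codepoint_symbols = msg

print(len(byte_symbols), "byte symbols vs", len(codepoint_symbols), "code-point symbols")
print("distinct symbols:", len(Counter(byte_symbols)), "vs", len(Counter(codepoint_symbols)))

Fewer emitted codes per message is the effect behind the ~1.5 kB of flash reported for zh_Latn_pinyin above.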

File tree

2 files changed: +34, -22 lines


py/makeqstrdata.py

Lines changed: 16 additions & 20 deletions
@@ -103,14 +103,10 @@ def compute_huffman_coding(translations, qstrs, compression_filename):
     # go through each qstr and print it out
     for _, _, qstr in qstrs.values():
         all_strings.append(qstr)
-    all_strings_concat = "".join(all_strings).encode("utf-8")
+    all_strings_concat = "".join(all_strings)
     counts = collections.Counter(all_strings_concat)
-    # add other values
-    for i in range(256):
-        if i not in counts:
-            counts[i] = 0
     cb = huffman.codebook(counts.items())
-    values = bytearray()
+    values = []
     length_count = {}
     renumbered = 0
     last_l = None
@@ -124,26 +120,26 @@ def compute_huffman_coding(translations, qstrs, compression_filename):
         if last_l:
             renumbered <<= (l - last_l)
         canonical[ch] = '{0:0{width}b}'.format(renumbered, width=l)
-        if chr(ch) in C_ESCAPES:
-            s = C_ESCAPES[chr(ch)]
-        else:
-            s = chr(ch)
-        print("//", ch, s, counts[ch], canonical[ch], renumbered)
+        s = C_ESCAPES.get(ch, ch)
+        print("//", ord(ch), s, counts[ch], canonical[ch], renumbered)
         renumbered += 1
         last_l = l
     lengths = bytearray()
-    for i in range(1, max(length_count) + 1):
+    print("// length count", length_count)
+    for i in range(1, max(length_count) + 2):
         lengths.append(length_count.get(i, 0))
+    print("// values", values, "lengths", len(lengths), lengths)
+    print("// estimated total memory size", len(lengths) + 2*len(values) + sum(len(cb[u]) for u in all_strings_concat))
     print("//", values, lengths)
     with open(compression_filename, "w") as f:
         f.write("const uint8_t lengths[] = {{ {} }};\n".format(", ".join(map(str, lengths))))
-        f.write("const uint8_t values[256] = {{ {} }};\n".format(", ".join(map(str, values))))
+        f.write("const uint16_t values[] = {{ {} }};\n".format(", ".join(str(ord(u)) for u in values)))
     return values, lengths
 
 def decompress(encoding_table, length, encoded):
     values, lengths = encoding_table
     #print(l, encoded)
-    dec = bytearray(length)
+    dec = []
     this_byte = 0
     this_bit = 7
     b = encoded[this_byte]
@@ -173,14 +169,14 @@ def decompress(encoding_table, length, encoded):
             searched_length += lengths[bit_length]
 
         v = values[searched_length + bits - max_code]
-        dec[i] = v
-    return dec
+        dec.append(v)
+    return ''.join(dec)
 
 def compress(encoding_table, decompressed):
-    if not isinstance(decompressed, bytes):
+    if not isinstance(decompressed, str):
         raise TypeError()
     values, lengths = encoding_table
-    enc = bytearray(len(decompressed) * 2)
+    enc = bytearray(len(decompressed) * 3)
     #print(decompressed)
     #print(lengths)
     current_bit = 7
@@ -347,9 +343,9 @@ def print_qstr_data(encoding_table, qcfgs, qstrs, i18ns):
     total_text_compressed_size = 0
     for original, translation in i18ns:
         translation_encoded = translation.encode("utf-8")
-        compressed = compress(encoding_table, translation_encoded)
+        compressed = compress(encoding_table, translation)
         total_text_compressed_size += len(compressed)
-        decompressed = decompress(encoding_table, len(translation_encoded), compressed).decode("utf-8")
+        decompressed = decompress(encoding_table, len(translation_encoded), compressed)
         for c in C_ESCAPES:
             decompressed = decompressed.replace(c, C_ESCAPES[c])
         print("TRANSLATION(\"{}\", {}, {{ {} }}) // {}".format(original, len(translation_encoded)+1, ", ".join(["0x{:02x}".format(x) for x in compressed]), decompressed))

supervisor/shared/translate.c

Lines changed: 18 additions & 2 deletions
@@ -42,12 +42,28 @@ void serial_write_compressed(const compressed_string_t* compressed) {
     serial_write(decompressed);
 }
 
+STATIC int put_utf8(char *buf, int u) {
+    if(u <= 0x7f) {
+        *buf = u;
+        return 1;
+    } else if(u <= 0x07ff) {
+        *buf++ = 0b11000000 | (u >> 6);
+        *buf = 0b10000000 | (u & 0b00111111);
+        return 2;
+    } else { // u <= 0xffff
+        *buf++ = 0b11100000 | (u >> 12);
+        *buf++ = 0b10000000 | ((u >> 6) & 0b00111111);
+        *buf = 0b10000000 | (u & 0b00111111);
+        return 3;
+    }
+}
+
 char* decompress(const compressed_string_t* compressed, char* decompressed) {
     uint8_t this_byte = 0;
     uint8_t this_bit = 7;
     uint8_t b = compressed->data[this_byte];
     // Stop one early because the last byte is always NULL.
-    for (uint16_t i = 0; i < compressed->length - 1; i++) {
+    for (uint16_t i = 0; i < compressed->length - 1;) {
         uint32_t bits = 0;
         uint8_t bit_length = 0;
         uint32_t max_code = lengths[0];
@@ -72,7 +88,7 @@ char* decompress(const compressed_string_t* compressed, char* decompressed) {
             max_code = (max_code << 1) + lengths[bit_length];
             searched_length += lengths[bit_length];
         }
-        decompressed[i] = values[searched_length + bits - max_code];
+        i += put_utf8(decompressed + i, values[searched_length + bits - max_code]);
     }
 
     decompressed[compressed->length-1] = '\0';
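
The new put_utf8() helper is the decompression-side counterpart of compressing over code points: each decoded value is written back as a UTF-8 byte sequence, with values above 127 expanding to two or three bytes. A minimal Python sketch of the same BMP-only encoding (code points up to 0xFFFF), cross-checked against Python's built-in encoder:

def put_utf8(u: int) -> bytes:
    # Encode one Unicode code point (BMP only, u <= 0xFFFF) as UTF-8 bytes,
    # using the same bit layout as the C helper above.
    if u <= 0x7F:
        return bytes([u])
    elif u <= 0x07FF:
        return bytes([0b11000000 | (u >> 6),
                      0b10000000 | (u & 0b00111111)])
    else:  # u <= 0xFFFF
        return bytes([0b11100000 | (u >> 12),
                      0b10000000 | ((u >> 6) & 0b00111111),
                      0b10000000 | (u & 0b00111111)])

# Cross-check representative 1-, 2- and 3-byte cases.
for ch in "A", "À", "ǐ", "按":
    assert put_utf8(ord(ch)) == ch.encode("utf-8")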
