Merge pull request #2345 from jepler/compressed-unicode

tannewt · web-flow · commit 15886b1505d8 · 2019-12-02T17:11:49.000-08:00
translation: Compress as unicode, not bytes
diff --git a/py/makeqstrdata.py b/py/makeqstrdata.py
@@ -103,14 +103,10 @@ def compute_huffman_coding(translations, qstrs, compression_filename):
     # go through each qstr and print it out
     for _, _, qstr in qstrs.values():
         all_strings.append(qstr)
-    all_strings_concat = "".join(all_strings).encode("utf-8")
+    all_strings_concat = "".join(all_strings)
     counts = collections.Counter(all_strings_concat)
-    # add other values
-    for i in range(256):
-        if i not in counts:
-            counts[i] = 0
     cb = huffman.codebook(counts.items())
-    values = bytearray()
+    values = []
     length_count = {}
     renumbered = 0
     last_l = None
@@ -124,26 +120,27 @@ def compute_huffman_coding(translations, qstrs, compression_filename):
         if last_l:
             renumbered <<= (l - last_l)
         canonical[ch] = '{0:0{width}b}'.format(renumbered, width=l)
-        if chr(ch) in C_ESCAPES:
-            s = C_ESCAPES[chr(ch)]
-        else:
-            s = chr(ch)
-        print("//", ch, s, counts[ch], canonical[ch], renumbered)
+        s = C_ESCAPES.get(ch, ch)
+        print("//", ord(ch), s, counts[ch], canonical[ch], renumbered)
         renumbered += 1
         last_l = l
     lengths = bytearray()
-    for i in range(1, max(length_count) + 1):
+    print("// length count", length_count)
+    for i in range(1, max(length_count) + 2):
         lengths.append(length_count.get(i, 0))
+    print("// values", values, "lengths", len(lengths), lengths)
+    print("// estimated total memory size", len(lengths) + 2*len(values) + sum(len(cb[u]) for u in all_strings_concat))
     print("//", values, lengths)
+    values_type = "uint16_t" if max(ord(u) for u in values) > 255 else "uint8_t"
     with open(compression_filename, "w") as f:
         f.write("const uint8_t lengths[] = {{ {} }};\n".format(", ".join(map(str, lengths))))
-        f.write("const uint8_t values[256] = {{ {} }};\n".format(", ".join(map(str, values))))
+        f.write("const {} values[] = {{ {} }};\n".format(values_type, ", ".join(str(ord(u)) for u in values)))
     return values, lengths
 
 def decompress(encoding_table, length, encoded):
     values, lengths = encoding_table
     #print(l, encoded)
-    dec = bytearray(length)
+    dec = []
     this_byte = 0
     this_bit = 7
     b = encoded[this_byte]
@@ -173,14 +170,14 @@ def decompress(encoding_table, length, encoded):
             searched_length += lengths[bit_length]
 
         v = values[searched_length + bits - max_code]
-        dec[i] = v
-    return dec
+        dec.append(v)
+    return ''.join(dec)
 
 def compress(encoding_table, decompressed):
-    if not isinstance(decompressed, bytes):
+    if not isinstance(decompressed, str):
         raise TypeError()
     values, lengths = encoding_table
-    enc = bytearray(len(decompressed) * 2)
+    enc = bytearray(len(decompressed) * 3)
     #print(decompressed)
     #print(lengths)
     current_bit = 7
@@ -228,7 +225,7 @@ def compress(encoding_table, decompressed):
     if current_bit != 7:
         current_byte += 1
     if current_byte > len(decompressed):
-        print("Note: compression increased length", repr(decompressed.decode('utf-8')), len(decompressed), current_byte, file=sys.stderr)
+        print("Note: compression increased length", repr(decompressed), len(decompressed), current_byte, file=sys.stderr)
     return enc[:current_byte]
 
 def qstr_escape(qst):
@@ -347,9 +344,9 @@ def print_qstr_data(encoding_table, qcfgs, qstrs, i18ns):
     total_text_compressed_size = 0
     for original, translation in i18ns:
         translation_encoded = translation.encode("utf-8")
-        compressed = compress(encoding_table, translation_encoded)
+        compressed = compress(encoding_table, translation)
         total_text_compressed_size += len(compressed)
-        decompressed = decompress(encoding_table, len(translation_encoded), compressed).decode("utf-8")
+        decompressed = decompress(encoding_table, len(translation_encoded), compressed)
         for c in C_ESCAPES:
             decompressed = decompressed.replace(c, C_ESCAPES[c])
         print("TRANSLATION(\"{}\", {}, {{ {} }}) // {}".format(original, len(translation_encoded)+1, ", ".join(["0x{:02x}".format(x) for x in compressed]), decompressed))
diff --git a/supervisor/shared/translate.c b/supervisor/shared/translate.c
@@ -42,12 +42,28 @@ void serial_write_compressed(const compressed_string_t* compressed) {
     serial_write(decompressed);
 }
 
+STATIC int put_utf8(char *buf, int u) { // Encode code point u (expected <= 0xffff) as UTF-8 at buf; returns bytes written (1-3), no NUL added.
+    if(u <= 0x7f) { // 1 byte: 0xxxxxxx (plain ASCII)
+        *buf = u;
+        return 1;
+    } else if(u <= 0x07ff) { // 2 bytes: 110xxxxx 10xxxxxx
+        *buf++ = 0b11000000 | (u >> 6);
+        *buf   = 0b10000000 | (u & 0b00111111);
+        return 2;
+    } else { // u <= 0xffff, 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx
+        *buf++ = 0b11100000 | (u >> 12); // lead byte must carry the 1110 prefix, not the 2-byte 110 prefix
+        *buf++ = 0b10000000 | ((u >> 6) & 0b00111111); // advance so the last byte does not overwrite this one
+        *buf   = 0b10000000 | (u & 0b00111111);
+        return 3;
+    }
+}
+
 char* decompress(const compressed_string_t* compressed, char* decompressed) {
     uint8_t this_byte = 0;
     uint8_t this_bit = 7;
     uint8_t b = compressed->data[this_byte];
     // Stop one early because the last byte is always NULL.
-    for (uint16_t i = 0; i < compressed->length - 1; i++) {
+    for (uint16_t i = 0; i < compressed->length - 1;) {
         uint32_t bits = 0;
         uint8_t bit_length = 0;
         uint32_t max_code = lengths[0];
@@ -72,7 +88,7 @@ char* decompress(const compressed_string_t* compressed, char* decompressed) {
             max_code = (max_code << 1) + lengths[bit_length];
             searched_length += lengths[bit_length];
         }
-        decompressed[i] = values[searched_length + bits - max_code];
+        i += put_utf8(decompressed + i, values[searched_length + bits - max_code]);
     }
 
     decompressed[compressed->length-1] = '\0';