
Commit 750bc1e

Merge pull request #3398 from jepler/better-dictionary-compression

compression: Implement @ciscorn's dictionary approach

2 parents: d774678 + a8e98cd

3 files changed: +191 −102 lines changed

py/makeqstrdata.py — 163 additions & 96 deletions
```diff
@@ -12,10 +12,14 @@
 import re
 import sys
 
+from math import log
 import collections
 import gettext
 import os.path
 
+sys.stdout.reconfigure(encoding='utf-8')
+sys.stderr.reconfigure(errors='backslashreplace')
+
 py = os.path.dirname(sys.argv[0])
 top = os.path.dirname(py)
 
```
```diff
@@ -100,77 +104,173 @@ def translate(translation_file, i18ns):
         translations.append((original, translation))
     return translations
 
-def frequent_ngrams(corpus, sz, n):
-    return collections.Counter(corpus[i:i+sz] for i in range(len(corpus)-sz)).most_common(n)
+class TextSplitter:
+    def __init__(self, words):
+        words.sort(key=lambda x: len(x), reverse=True)
+        self.words = set(words)
+        self.pat = re.compile("|".join(re.escape(w) for w in words) + "|.", flags=re.DOTALL)
+
+    def iter_words(self, text):
+        s = []
+        words = self.words
+        for m in self.pat.finditer(text):
+            t = m.group(0)
+            if t in words:
+                if s:
+                    yield (False, "".join(s))
+                    s = []
+                yield (True, t)
+            else:
+                s.append(t)
+        if s:
+            yield (False, "".join(s))
+
+    def iter(self, text):
+        for m in self.pat.finditer(text):
+            yield m.group(0)
+
+def iter_substrings(s, minlen, maxlen):
+    len_s = len(s)
+    maxlen = min(len_s, maxlen)
+    for n in range(minlen, maxlen + 1):
+        for begin in range(0, len_s - n + 1):
+            yield s[begin : begin + n]
+
+def compute_huffman_coding(translations, compression_filename):
+    texts = [t[1] for t in translations]
+    words = []
+
+    start_unused = 0x80
+    end_unused = 0xff
+    max_ord = 0
+    for text in texts:
+        for c in text:
+            ord_c = ord(c)
+            max_ord = max(ord_c, max_ord)
+            if 0x80 <= ord_c < 0xff:
+                end_unused = min(ord_c, end_unused)
+    max_words = end_unused - 0x80
+
+    values_type = "uint16_t" if max_ord > 255 else "uint8_t"
+    max_words_len = 160 if max_ord > 255 else 255
+
+    sum_len = 0
+    while True:
+        # Until the dictionary is filled to capacity, use a heuristic to find
+        # the best "word" (2- to 9-gram) to add to it.
+        #
+        # The TextSplitter allows us to avoid considering parts of the text
+        # that are already covered by a previously chosen word, for example
+        # if "the" is in words then not only will "the" not be considered
+        # again, neither will "there" or "wither", since they have "the"
+        # as substrings.
+        extractor = TextSplitter(words)
+        counter = collections.Counter()
+        for t in texts:
+            for (found, word) in extractor.iter_words(t):
+                if not found:
+                    for substr in iter_substrings(word, minlen=2, maxlen=9):
+                        counter[substr] += 1
+
+        # Score the candidates we found. This is an empirical formula only,
+        # chosen for its effectiveness.
+        scores = sorted(
+            (
+                (s, (len(s) - 1) ** log(max(occ - 2, 1)), occ)
+                for (s, occ) in counter.items()
+            ),
+            key=lambda x: x[1],
+            reverse=True,
+        )
+
+        # Do we have a "word" that occurred 5 times and got a score of at
+        # least 5? Hooray. Pick the one with the highest score.
+        word = None
+        for (s, score, occ) in scores:
+            if occ < 5:
+                continue
+            if score < 5:
+                break
+            word = s
+            break
+
+        # If we can successfully add it to the dictionary, do so. Otherwise,
+        # we've filled the dictionary to capacity and are done.
+        if not word:
+            break
+        if sum_len + len(word) - 2 > max_words_len:
+            break
+        if len(words) == max_words:
+            break
+        words.append(word)
+        sum_len += len(word) - 2
+
+    extractor = TextSplitter(words)
+    counter = collections.Counter()
+    for t in texts:
+        for atom in extractor.iter(t):
+            counter[atom] += 1
+    cb = huffman.codebook(counter.items())
+
+    word_start = start_unused
+    word_end = word_start + len(words) - 1
+    print("// # words", len(words))
+    print("// words", words)
 
-def encode_ngrams(translation, ngrams):
-    if len(ngrams) > 32:
-        start = 0xe000
-    else:
-        start = 0x80
-    for i, g in enumerate(ngrams):
-        translation = translation.replace(g, chr(start + i))
-    return translation
-
-def decode_ngrams(compressed, ngrams):
-    if len(ngrams) > 32:
-        start, end = 0xe000, 0xf8ff
-    else:
-        start, end = 0x80, 0x9f
-    return "".join(ngrams[ord(c) - start] if (start <= ord(c) <= end) else c for c in compressed)
-
-def compute_huffman_coding(translations, qstrs, compression_filename):
-    all_strings = [x[1] for x in translations]
-    all_strings_concat = "".join(all_strings)
-    ngrams = [i[0] for i in frequent_ngrams(all_strings_concat, 2, 32)]
-    all_strings_concat = encode_ngrams(all_strings_concat, ngrams)
-    counts = collections.Counter(all_strings_concat)
-    cb = huffman.codebook(counts.items())
```
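To see what the greedy tokenizer does, here is a minimal standalone sketch of `TextSplitter` with an invented word list and sample string (not taken from the translation corpus):

```python
import re

# Condensed copy of TextSplitter from the hunk above, with a toy dictionary.
class TextSplitter:
    def __init__(self, words):
        words.sort(key=lambda x: len(x), reverse=True)  # try longer words first
        self.words = set(words)
        self.pat = re.compile("|".join(re.escape(w) for w in words) + "|.", flags=re.DOTALL)

    def iter_words(self, text):
        s = []
        for m in self.pat.finditer(text):
            t = m.group(0)
            if t in self.words:
                if s:
                    yield (False, "".join(s))
                    s = []
                yield (True, t)
            else:
                s.append(t)
        if s:
            yield (False, "".join(s))

splitter = TextSplitter(["the", "err"])
print(list(splitter.iter_words("there was an error")))
# [(True, 'the'), (False, 're was an '), (True, 'err'), (False, 'or')]
```

Note that only the `(False, ...)` runs feed the substring counter, which is how already-chosen words are kept out of later rounds. For the score `(len(s) - 1) ** log(max(occ - 2, 1))`, `log` is the natural logarithm from `math`: a 4-character candidate seen 10 times scores `3 ** ln(8)` ≈ 9.8, while any candidate seen 3 times or fewer scores exactly 1 and can never pass the threshold. The hunk continues below with the canonical code assignment and the new output tables.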
```diff
     values = []
     length_count = {}
     renumbered = 0
-    last_l = None
+    last_length = None
     canonical = {}
-    for ch, code in sorted(cb.items(), key=lambda x: (len(x[1]), x[0])):
-        values.append(ch)
-        l = len(code)
-        if l not in length_count:
-            length_count[l] = 0
-        length_count[l] += 1
-        if last_l:
-            renumbered <<= (l - last_l)
-        canonical[ch] = '{0:0{width}b}'.format(renumbered, width=l)
-        s = C_ESCAPES.get(ch, ch)
-        print("//", ord(ch), s, counts[ch], canonical[ch], renumbered)
+    for atom, code in sorted(cb.items(), key=lambda x: (len(x[1]), x[0])):
+        values.append(atom)
+        length = len(code)
+        if length not in length_count:
+            length_count[length] = 0
+        length_count[length] += 1
+        if last_length:
+            renumbered <<= (length - last_length)
+        canonical[atom] = '{0:0{width}b}'.format(renumbered, width=length)
+        # print(f"atom={repr(atom)} code={code}", file=sys.stderr)
+        if len(atom) > 1:
+            o = words.index(atom) + 0x80
+            s = "".join(C_ESCAPES.get(ch1, ch1) for ch1 in atom)
+        else:
+            s = C_ESCAPES.get(atom, atom)
+            o = ord(atom)
+        print("//", o, s, counter[atom], canonical[atom], renumbered)
         renumbered += 1
-        last_l = l
+        last_length = length
     lengths = bytearray()
     print("// length count", length_count)
-    print("// bigrams", ngrams)
+
     for i in range(1, max(length_count) + 2):
         lengths.append(length_count.get(i, 0))
     print("// values", values, "lengths", len(lengths), lengths)
-    ngramdata = [ord(ni) for i in ngrams for ni in i]
-    print("// estimated total memory size", len(lengths) + 2*len(values) + 2 * len(ngramdata) + sum((len(cb[u]) + 7)//8 for u in all_strings_concat))
+
     print("//", values, lengths)
-    values_type = "uint16_t" if max(ord(u) for u in values) > 255 else "uint8_t"
-    max_translation_encoded_length = max(len(translation.encode("utf-8")) for original,translation in translations)
+    values = [(atom if len(atom) == 1 else chr(0x80 + words.index(atom))) for atom in values]
+    print("//", values, lengths)
+    max_translation_encoded_length = max(
+        len(translation.encode("utf-8")) for (original, translation) in translations)
+
+    wends = list(len(w) - 2 for w in words)
+    for i in range(1, len(wends)):
+        wends[i] += wends[i - 1]
+
     with open(compression_filename, "w") as f:
         f.write("const uint8_t lengths[] = {{ {} }};\n".format(", ".join(map(str, lengths))))
         f.write("const {} values[] = {{ {} }};\n".format(values_type, ", ".join(str(ord(u)) for u in values)))
         f.write("#define compress_max_length_bits ({})\n".format(max_translation_encoded_length.bit_length()))
-        f.write("const {} bigrams[] = {{ {} }};\n".format(values_type, ", ".join(str(u) for u in ngramdata)))
-        if len(ngrams) > 32:
-            bigram_start = 0xe000
-        else:
-            bigram_start = 0x80
-        bigram_end = bigram_start + len(ngrams) - 1 # End is inclusive
-        f.write("#define bigram_start {}\n".format(bigram_start))
-        f.write("#define bigram_end {}\n".format(bigram_end))
-    return values, lengths, ngrams
+        f.write("const {} words[] = {{ {} }};\n".format(values_type, ", ".join(str(ord(c)) for w in words for c in w)))
+        f.write("const uint8_t wends[] = {{ {} }};\n".format(", ".join(str(p) for p in wends)))
+        f.write("#define word_start {}\n".format(word_start))
+        f.write("#define word_end {}\n".format(word_end))
+
+    return (values, lengths, words, canonical, extractor)
 
 def decompress(encoding_table, encoded, encoded_length_bits):
-    values, lengths, ngrams = encoding_table
+    (values, lengths, words, _, _) = encoding_table
     dec = []
     this_byte = 0
     this_bit = 7
```
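The renumbering loop above assigns a canonical Huffman code: atoms are sorted by (code length, atom), each successive code is the previous one plus one, left-shifted whenever the code length grows. A toy run of the same loop, with invented code lengths standing in for `len(cb[atom])`:

```python
# Hypothetical code lengths; in the real script these come from huffman.codebook.
lengths_by_atom = {"a": 2, "b": 2, "c": 3, "d": 3}
renumbered = 0
last_length = None
canonical = {}
for atom, length in sorted(lengths_by_atom.items(), key=lambda x: (x[1], x[0])):
    if last_length:
        renumbered <<= (length - last_length)  # widen when length increases
    canonical[atom] = "{0:0{width}b}".format(renumbered, width=length)
    renumbered += 1
    last_length = length
print(canonical)  # {'a': '00', 'b': '01', 'c': '100', 'd': '101'}
```

Because each code is strictly greater than the previous one after shifting, the resulting codes are prefix-free, and the decoder only needs the per-length counts (the `lengths` bytearray) plus the sorted `values` to reconstruct them.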
```diff
@@ -218,74 +318,41 @@ def decompress(encoding_table, encoded, encoded_length_bits):
             searched_length += lengths[bit_length]
 
         v = values[searched_length + bits - max_code]
-        v = decode_ngrams(v, ngrams)
+        if v >= chr(0x80) and v < chr(0x80 + len(words)):
+            v = words[ord(v) - 0x80]
         i += len(v.encode('utf-8'))
         dec.append(v)
     return ''.join(dec)
 
 def compress(encoding_table, decompressed, encoded_length_bits, len_translation_encoded):
     if not isinstance(decompressed, str):
         raise TypeError()
-    values, lengths, ngrams = encoding_table
-    decompressed = encode_ngrams(decompressed, ngrams)
+    (_, _, _, canonical, extractor) = encoding_table
+
     enc = bytearray(len(decompressed) * 3)
-    #print(decompressed)
-    #print(lengths)
     current_bit = 7
     current_byte = 0
 
-    code = len_translation_encoded
-    bits = encoded_length_bits+1
+    bits = encoded_length_bits + 1
     for i in range(bits - 1, 0, -1):
         if len_translation_encoded & (1 << (i - 1)):
             enc[current_byte] |= 1 << current_bit
         if current_bit == 0:
             current_bit = 7
-            #print("packed {0:0{width}b}".format(enc[current_byte], width=8))
             current_byte += 1
         else:
             current_bit -= 1
 
-    for c in decompressed:
-        #print()
-        #print("char", c, values.index(c))
-        start = 0
-        end = lengths[0]
-        bits = 1
-        compressed = None
-        code = 0
-        while compressed is None:
-            s = start
-            e = end
-            #print("{0:0{width}b}".format(code, width=bits))
-            # Binary search!
-            while e > s:
-                midpoint = (s + e) // 2
-                #print(s, e, midpoint)
-                if values[midpoint] == c:
-                    compressed = code + (midpoint - start)
-                    #print("found {0:0{width}b}".format(compressed, width=bits))
-                    break
-                elif c < values[midpoint]:
-                    e = midpoint
-                else:
-                    s = midpoint + 1
-            code += end - start
-            code <<= 1
-            start = end
-            end += lengths[bits]
-            bits += 1
-            #print("next bit", bits)
-
-        for i in range(bits - 1, 0, -1):
-            if compressed & (1 << (i - 1)):
+    for atom in extractor.iter(decompressed):
+        for b in canonical[atom]:
+            if b == "1":
                 enc[current_byte] |= 1 << current_bit
             if current_bit == 0:
                 current_bit = 7
-                #print("packed {0:0{width}b}".format(enc[current_byte], width=8))
                 current_byte += 1
             else:
                 current_bit -= 1
+
     if current_bit != 7:
         current_byte += 1
     return enc[:current_byte]
```
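At the bit level, the new `compress` simply concatenates the canonical code strings of the atoms that the splitter yields; the old per-character binary search over `values` is gone. Ignoring the byte packing and the length header, a hypothetical round trip looks like this (the codebook is hand-made for illustration, not one `huffman.codebook` would necessarily emit):

```python
import re

# "the" is the single dictionary word; everything else is a 1-char atom.
canonical = {"the": "0", " ": "10", "c": "110", "a": "1110", "t": "1111"}
pat = re.compile("the|.", flags=re.DOTALL)  # same shape as TextSplitter.pat

def compress_str(text):
    # Concatenate canonical code strings, as the rewritten compress() does.
    return "".join(canonical[m.group(0)] for m in pat.finditer(text))

decode = {v: k for k, v in canonical.items()}

def decompress_str(bits):
    out, code = [], ""
    for b in bits:
        code += b
        if code in decode:  # canonical codes are prefix-free
            out.append(decode[code])
            code = ""
    return "".join(out)

assert decompress_str(compress_str("the cat")) == "the cat"
```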
```diff
@@ -452,7 +519,7 @@ def print_qstr_enums(qstrs):
     if args.translation:
         i18ns = sorted(i18ns)
         translations = translate(args.translation, i18ns)
-        encoding_table = compute_huffman_coding(translations, qstrs, args.compression_filename)
+        encoding_table = compute_huffman_coding(translations, args.compression_filename)
         print_qstr_data(encoding_table, qcfgs, qstrs, translations)
     else:
         print_qstr_enums(qstrs)
```

supervisor/shared/translate.c — 15 additions & 6 deletions

```diff
@@ -47,13 +47,22 @@ STATIC int put_utf8(char *buf, int u) {
     if(u <= 0x7f) {
         *buf = u;
         return 1;
-    } else if(bigram_start <= u && u <= bigram_end) {
-        int n = (u - 0x80) * 2;
-        // (note that at present, entries in the bigrams table are
-        // guaranteed not to represent bigrams themselves, so this adds
+    } else if(word_start <= u && u <= word_end) {
+        uint n = (u - word_start);
+        size_t pos = 0;
+        if (n > 0) {
+            pos = wends[n - 1] + (n * 2);
+        }
+        int ret = 0;
+        // note that at present, entries in the words table are
+        // guaranteed not to represent words themselves, so this adds
         // at most 1 level of recursive call
-        int ret = put_utf8(buf, bigrams[n]);
-        return ret + put_utf8(buf + ret, bigrams[n+1]);
+        for(; pos < wends[n] + (n + 1) * 2; pos++) {
+            int len = put_utf8(buf, words[pos]);
+            buf += len;
+            ret += len;
+        }
+        return ret;
     } else if(u <= 0x07ff) {
         *buf++ = 0b11000000 | (u >> 6);
         *buf = 0b10000000 | (u & 0b00111111);
```
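The index arithmetic in `put_utf8` recovers word `n` from the flattened `words[]` array using only the cumulative `wends[]` table: each entry stores the running sum of `len(w) - 2`, and since every dictionary word is at least 2 code points long, adding back `n * 2` (or `(n + 1) * 2` for the exclusive end) yields the true offsets, with no separate start-offset table. A Python model of the same indexing, with an invented dictionary:

```python
# Build the two tables the same way makeqstrdata.py writes them out.
words = ["tion", "ing ", "the"]            # hypothetical dictionary
flat = [ord(c) for w in words for c in w]  # the C `words[]` array
wends = []
total = 0
for w in words:
    total += len(w) - 2
    wends.append(total)                    # the C `wends[]` array -> [2, 4, 5]

def word_span(n):
    # Mirrors the loop bounds in put_utf8 above.
    start = wends[n - 1] + n * 2 if n > 0 else 0
    end = wends[n] + (n + 1) * 2
    return "".join(chr(u) for u in flat[start:end])

assert [word_span(n) for n in range(len(words))] == words
```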

supervisor/shared/translate.h — 13 additions & 0 deletions

```diff
@@ -43,6 +43,19 @@
 // (building the huffman encoding on UTF-16 code points gave better
 // compression than building it on UTF-8 bytes)
 //
+// - code points starting at 128 (word_start) and potentially extending
+//   to 255 (word_end) (but never interfering with the target
+//   language's used code points) stand for dictionary entries in a
+//   dictionary with size up to 256 code points. The dictionary entries
+//   are computed with a heuristic based on frequent substrings of 2 to
+//   9 code points. These are called "words" but are not, grammatically
+//   speaking, words. They're just spans of code points that frequently
+//   occur together.
+//
+// - dictionary entries are non-overlapping, and the _ending_ index of each
+//   entry is stored in an array. Since the index given is the ending
+//   index, the array is called "wends".
+//
 // The "data" / "tail" construct is so that the struct's last member is a
 // "flexible array". However, the _only_ member is not permitted to be
 // a flexible member, so we have to declare the first byte as a separate
```
