Skip to content

Commit 260d5ca

Browse files
committed
makeqstrdata: Use a better dictionary heuristic
This gets us very close to measuring the actual size of the
1 parent 16f82b9 commit 260d5ca

File tree

1 file changed

+29
-9
lines changed

1 file changed

+29
-9
lines changed

py/makeqstrdata.py

Lines changed: 29 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
import re
1313
import sys
1414

15-
from math import log
15+
import bisect
1616
import collections
1717
import gettext
1818
import os.path
@@ -156,6 +156,7 @@ def compute_huffman_coding(translations, compression_filename):
156156
end_unused = min(ord_c, end_unused)
157157
max_words = end_unused - 0x80
158158

159+
bits_per_codepoint = 16 if max_ord > 255 else 8
159160
values_type = "uint16_t" if max_ord > 255 else "uint8_t"
160161
max_words_len = 160 if max_ord > 255 else 255
161162

@@ -170,31 +171,50 @@ def compute_huffman_coding(translations, compression_filename):
170171
# again, neither will "there" or "wither", since they have "the"
171172
# as substrings.
172173
extractor = TextSplitter(words)
174+
counter = collections.Counter()
175+
for t in texts:
176+
for atom in extractor.iter(t):
177+
counter[atom] += 1
178+
cb = huffman.codebook(counter.items())
179+
lengths = sorted(dict((v, len(cb[k])) for k, v in counter.items()).items())
180+
181+
def bit_length(s):
182+
return sum(len(cb[c]) for c in s)
183+
184+
def est_len(occ):
185+
idx = bisect.bisect_left(lengths, (occ, 0))
186+
return lengths[idx][1] + 1
187+
173188
counter = collections.Counter()
174189
for t in texts:
175190
for (found, word) in extractor.iter_words(t):
176191
if not found:
177192
for substr in iter_substrings(word, minlen=2, maxlen=9):
178193
counter[substr] += 1
179194

180-
# Score the candidates we found. This is an empirical formula only,
181-
# chosen for its effectiveness.
195+
# Score the candidates we found. This is a semi-empirical formula that
196+
# attempts to model the number of bits saved as closely as possible.
197+
#
198+
# It attempts to compute the codeword lengths of the original word
199+
# to the codeword length the dictionary entry would get, times
200+
# the number of occurrences, less the overhead of the entries in the
201+
# words[] array.
182202
scores = sorted(
183203
(
184-
(s, (len(s) - 1) ** log(max(occ - 2, 1)), occ)
204+
(s, occ * (bit_length(s) - est_len(occ) - 1) - len(s) * bits_per_codepoint, occ)
185205
for (s, occ) in counter.items()
186206
),
187207
key=lambda x: x[1],
188208
reverse=True,
189209
)
190210

191-
# Do we have a "word" that occurred 5 times and got a score of at least
192-
# 5? Hooray. Pick the one with the highest score.
211+
# Pick the word with the best score (savings in bits). It also has to
212+
# occur more than once and save at least an estimated 2 bytes.
193213
word = None
194214
for (s, score, occ) in scores:
195-
if occ < 5:
196-
continue
197-
if score < 5:
215+
if occ < 2:
216+
break
217+
if score < 16:
198218
break
199219
word = s
200220
break

0 commit comments

Comments
 (0)