makeqstrdata: comment my understanding of @ciscorn's code

jepler · jepler · commit 23009bd08e28 · 2020-09-16T07:58:55.000-05:00
diff --git a/py/makeqstrdata.py b/py/makeqstrdata.py
@@ -156,14 +156,24 @@ def compute_huffman_coding(translations, compression_filename):
 
     sum_len = 0
     while True:
+        # Until the dictionary is filled to capacity, use a heuristic to find
+        # the best "word" (2- to 9-gram) to add to it.
+        #
+        # The TextSplitter allows us to avoid considering parts of the text
+        # that are already covered by a previously chosen word, for example
+        # if "the" is in words then not only will "the" not be considered
+        # again, neither will "there" or "wither", since they have "the"
+        # as substrings.
         extractor = TextSplitter(words)
         counter = collections.Counter()
-        for t in texts:
+        for word in texts:
             for (found, word) in extractor.iter_words(t):
                 if not found:
                     for substr in iter_substrings(word, minlen=2, maxlen=9):
                         counter[substr] += 1
 
+        # Score the candidates we found.  This is an empirical formula only,
+        # chosen for its effectiveness.
         scores = sorted(
             (
                 (s, (len(s) - 1) ** log(max(occ - 2, 1)), occ)
@@ -173,6 +183,8 @@ def compute_huffman_coding(translations, compression_filename):
             reverse=True,
         )
 
+        # Do we have a "word" that occurred at least 5 times and got a score
+        # of at least 5?  Hooray.  Pick the one with the highest score.
         word = None
         for (s, score, occ) in scores:
             if occ < 5:
@@ -182,6 +194,8 @@ def compute_huffman_coding(translations, compression_filename):
             word = s
             break
 
+        # If we can successfully add it to the dictionary, do so.  Otherwise,
+        # we've filled the dictionary to capacity and are done.
         if not word:
             break
         if sum_len + len(word) - 2 > max_words_len: