Skip to content

Commit a8e98cd

Browse files
committed
makeqstrdata: comment my understanding of @ciscorn's code
1 parent d9e336d commit a8e98cd

File tree

1 file changed

+14
-0
lines changed

1 file changed

+14
-0
lines changed

py/makeqstrdata.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -156,6 +156,14 @@ def compute_huffman_coding(translations, compression_filename):
156156

157157
sum_len = 0
158158
while True:
159+
# Until the dictionary is filled to capacity, use a heuristic to find
160+
# the best "word" (2- to 9-gram) to add to it.
161+
#
162+
# The TextSplitter allows us to avoid considering parts of the text
163+
# that are already covered by a previously chosen word, for example
164+
# if "the" is in words then not only will "the" not be considered
165+
# again, neither will "there" or "wither", since they have "the"
166+
# as substrings.
159167
extractor = TextSplitter(words)
160168
counter = collections.Counter()
161169
for t in texts:
@@ -164,6 +172,8 @@ def compute_huffman_coding(translations, compression_filename):
164172
for substr in iter_substrings(word, minlen=2, maxlen=9):
165173
counter[substr] += 1
166174

175+
# Score the candidates we found. This is an empirical formula only,
176+
# chosen for its effectiveness.
167177
scores = sorted(
168178
(
169179
(s, (len(s) - 1) ** log(max(occ - 2, 1)), occ)
@@ -173,6 +183,8 @@ def compute_huffman_coding(translations, compression_filename):
173183
reverse=True,
174184
)
175185

186+
# Do we have a "word" that occurred at least 5 times and got a score of
187+
# at least 5? Hooray. Pick the one with the highest score.
176188
word = None
177189
for (s, score, occ) in scores:
178190
if occ < 5:
@@ -182,6 +194,8 @@ def compute_huffman_coding(translations, compression_filename):
182194
word = s
183195
break
184196

197+
# If we can successfully add it to the dictionary, do so. Otherwise,
198+
# we've filled the dictionary to capacity and are done.
185199
if not word:
186200
break
187201
if sum_len + len(word) - 2 > max_words_len:

0 commit comments

Comments
 (0)