@@ -156,14 +156,24 @@ def compute_huffman_coding(translations, compression_filename):
156
156
157
157
sum_len = 0
158
158
while True :
159
+ # Until the dictionary is filled to capacity, use a heuristic to find
160
+ # the best "word" (2- to 9-gram) to add to it.
161
+ #
162
+ # The TextSplitter allows us to avoid considering parts of the text
163
+ # that are already covered by a previously chosen word, for example
164
+ # if "the" is in words then not only will "the" not be considered
165
+ # again, neither will "there" or "wither", since they have "the"
166
+ # as substrings.
159
167
extractor = TextSplitter (words )
160
168
counter = collections .Counter ()
161
- for t in texts :
169
+ for word in texts :
162
170
for (found , word ) in extractor .iter_words (t ):
163
171
if not found :
164
172
for substr in iter_substrings (word , minlen = 2 , maxlen = 9 ):
165
173
counter [substr ] += 1
166
174
175
+ # Score the candidates we found. This is an empirical formula only,
176
+ # chosen for its effectiveness.
167
177
scores = sorted (
168
178
(
169
179
(s , (len (s ) - 1 ) ** log (max (occ - 2 , 1 )), occ )
@@ -173,6 +183,8 @@ def compute_huffman_coding(translations, compression_filename):
173
183
reverse = True ,
174
184
)
175
185
186
+ # Do we have a "word" that occurred 5 times and got a score of at least
187
+ # 5? Hooray. Pick the one with the highest score.
176
188
word = None
177
189
for (s , score , occ ) in scores :
178
190
if occ < 5 :
@@ -182,6 +194,8 @@ def compute_huffman_coding(translations, compression_filename):
182
194
word = s
183
195
break
184
196
197
+ # If we can successfully add it to the dictionary, do so. Otherwise,
198
+ # we've filled the dictionary to capacity and are done.
185
199
if not word :
186
200
break
187
201
if sum_len + len (word ) - 2 > max_words_len :
0 commit comments