@@ -166,7 +166,7 @@ def compute_huffman_coding(translations, compression_filename):
166
166
sum_len = 0
167
167
while True :
168
168
# Until the dictionary is filled to capacity, use a heuristic to find
169
- # the best "word" (2 - to 9-gram) to add to it.
169
+ # the best "word" (3 - to 9-gram) to add to it.
170
170
#
171
171
# The TextSplitter allows us to avoid considering parts of the text
172
172
# that are already covered by a previously chosen word, for example
@@ -178,32 +178,25 @@ def compute_huffman_coding(translations, compression_filename):
178
178
for t in texts :
179
179
for (found , word ) in extractor .iter_words (t ):
180
180
if not found :
181
- for substr in iter_substrings (word , minlen = 2 , maxlen = 9 ):
181
+ for substr in iter_substrings (word , minlen = 3 , maxlen = 9 ):
182
182
counter [substr ] += 1
183
183
184
184
# Score the candidates we found. This is an empirical formula only,
185
185
# chosen for its effectiveness.
186
186
scores = sorted (
187
- ((s , (len (s ) - 1 ) ** (occ + 4 ), occ ) for (s , occ ) in counter .items ()),
187
+ ((s , (len (s ) - 1 ) ** (occ + 4 )) for (s , occ ) in counter .items () if occ > 4 ),
188
188
key = lambda x : x [1 ],
189
189
reverse = True ,
190
190
)
191
191
192
- # Do we have a "word" that occurred 5 times and got a score of at least
193
- # 5? Horray. Pick the one with the highest score.
194
- word = None
195
- for (s , score , occ ) in scores :
196
- if occ < 5 :
197
- continue
198
- if score < 5 :
199
- break
200
- word = s
192
+ # Pick the one with the highest score.
193
+ if not scores :
201
194
break
202
195
196
+ word = scores [0 ][0 ]
197
+
203
198
# If we can successfully add it to the dictionary, do so. Otherwise,
204
199
# we've filled the dictionary to capacity and are done.
205
- if not word :
206
- break
207
200
if sum_len + len (word ) - 2 > max_words_len :
208
201
break
209
202
if len (words ) == max_words :
0 commit comments