Skip to content

Commit dcee89a

Browse files
committed
build: simplify compute_huffman_coding()
No functional change.
1 parent 6892068 commit dcee89a

File tree

1 file changed

+7
-14
lines changed

1 file changed

+7
-14
lines changed

py/makeqstrdata.py

Lines changed: 7 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -166,7 +166,7 @@ def compute_huffman_coding(translations, compression_filename):
166166
sum_len = 0
167167
while True:
168168
# Until the dictionary is filled to capacity, use a heuristic to find
169-
# the best "word" (2- to 9-gram) to add to it.
169+
# the best "word" (3- to 9-gram) to add to it.
170170
#
171171
# The TextSplitter allows us to avoid considering parts of the text
172172
# that are already covered by a previously chosen word, for example
@@ -178,32 +178,25 @@ def compute_huffman_coding(translations, compression_filename):
178178
for t in texts:
179179
for (found, word) in extractor.iter_words(t):
180180
if not found:
181-
for substr in iter_substrings(word, minlen=2, maxlen=9):
181+
for substr in iter_substrings(word, minlen=3, maxlen=9):
182182
counter[substr] += 1
183183

184184
# Score the candidates we found. This is an empirical formula only,
185185
# chosen for its effectiveness.
186186
scores = sorted(
187-
((s, (len(s) - 1) ** (occ + 4), occ) for (s, occ) in counter.items()),
187+
((s, (len(s) - 1) ** (occ + 4)) for (s, occ) in counter.items() if occ > 4),
188188
key=lambda x: x[1],
189189
reverse=True,
190190
)
191191

192-
# Do we have a "word" that occurred 5 times and got a score of at least
193-
# 5? Hooray. Pick the one with the highest score.
194-
word = None
195-
for (s, score, occ) in scores:
196-
if occ < 5:
197-
continue
198-
if score < 5:
199-
break
200-
word = s
192+
# Pick the one with the highest score.
193+
if not scores:
201194
break
202195

196+
word = scores[0][0]
197+
203198
# If we can successfully add it to the dictionary, do so. Otherwise,
204199
# we've filled the dictionary to capacity and are done.
205-
if not word:
206-
break
207200
if sum_len + len(word) - 2 > max_words_len:
208201
break
209202
if len(words) == max_words:

0 commit comments

Comments
 (0)