Skip to content

[build] Simplify compute_huffman_coding() #4623

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 1 commit into from
Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 7 additions & 12 deletions py/makeqstrdata.py
Original file line number Diff line number Diff line change
Expand Up @@ -167,7 +167,7 @@ def compute_huffman_coding(translations, compression_filename):
sum_len = 0
while True:
# Until the dictionary is filled to capacity, use a heuristic to find
# the best "word" (2- to 9-gram) to add to it.
# the best "word" (3- to 9-gram) to add to it.
#
# The TextSplitter allows us to avoid considering parts of the text
# that are already covered by a previously chosen word, for example
Expand All @@ -179,32 +179,27 @@ def compute_huffman_coding(translations, compression_filename):
for t in texts:
for (found, word) in extractor.iter_words(t):
if not found:
for substr in iter_substrings(word, minlen=2, maxlen=9):
for substr in iter_substrings(word, minlen=3, maxlen=9):
counter[substr] += 1

# Score the candidates we found. This is an empirical formula only,
# chosen for its effectiveness.
scores = sorted(
((s, (len(s) - 1) ** log(max(occ - 2, 1)), occ) for (s, occ) in counter.items()),
((s, (len(s) - 1) ** log(occ - 2))
for (s, occ) in counter.items() if occ > 4),
key=lambda x: x[1],
reverse=True,
)

# Do we have a "word" that occurred at least 5 times and got a score of at
# least 5? Hooray. Pick the one with the highest score.
word = None
for (s, score, occ) in scores:
if occ < 5:
continue
if score < 5:
break
word = s
if not scores or scores[0][1] < 5:
break

word = scores[0][0]

# If we can successfully add it to the dictionary, do so. Otherwise,
# we've filled the dictionary to capacity and are done.
if not word:
break
if sum_len + len(word) - 2 > max_words_len:
break
if len(words) == max_words:
Expand Down