Skip to content

Commit 260d5ca

Browse files
committed
makeqstrdata: Use a better dictionary heuristic
This gets us very close to measuring the actual size of the
1 parent 16f82b9 commit 260d5ca

File tree

1 file changed

+29
-9
lines changed

1 file changed

+29
-9
lines changed

py/makeqstrdata.py

Lines changed: 29 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
import re
1313
import sys
1414

15-
from math import log
15+
import bisect
1616
import collections
1717
import gettext
1818
import os.path
@@ -156,6 +156,7 @@ def compute_huffman_coding(translations, compression_filename):
156156
end_unused = min(ord_c, end_unused)
157157
max_words = end_unused - 0x80
158158

159+
bits_per_codepoint = 16 if max_ord > 255 else 8
159160
values_type = "uint16_t" if max_ord > 255 else "uint8_t"
160161
max_words_len = 160 if max_ord > 255 else 255
161162

@@ -170,31 +171,50 @@ def compute_huffman_coding(translations, compression_filename):
170171
# again, neither will "there" or "wither", since they have "the"
171172
# as substrings.
172173
extractor = TextSplitter(words)
174+
counter = collections.Counter()
175+
for t in texts:
176+
for atom in extractor.iter(t):
177+
counter[atom] += 1
178+
cb = huffman.codebook(counter.items())
179+
lengths = sorted(dict((v, len(cb[k])) for k, v in counter.items()).items())
180+
181+
def bit_length(s):
182+
return sum(len(cb[c]) for c in s)
183+
184+
def est_len(occ):
185+
idx = bisect.bisect_left(lengths, (occ, 0))
186+
return lengths[idx][1] + 1
187+
173188
counter = collections.Counter()
174189
for t in texts:
175190
for (found, word) in extractor.iter_words(t):
176191
if not found:
177192
for substr in iter_substrings(word, minlen=2, maxlen=9):
178193
counter[substr] += 1
179194

180-
# Score the candidates we found. This is an empirical formula only,
181-
# chosen for its effectiveness.
195+
# Score the candidates we found. This is a semi-empirical formula that
196+
# attempts to model the number of bits saved as closely as possible.
197+
#
198+
# It attempts to compute the codeword lengths of the original word
199+
# to the codeword length the dictionary entry would get, times
200+
# the number of occurrences, less the overhead of the entries in the
201+
# words[] array.
182202
scores = sorted(
183203
(
184-
(s, (len(s) - 1) ** log(max(occ - 2, 1)), occ)
204+
(s, occ * (bit_length(s) - est_len(occ) - 1) - len(s) * bits_per_codepoint, occ)
185205
for (s, occ) in counter.items()
186206
),
187207
key=lambda x: x[1],
188208
reverse=True,
189209
)
190210

191-
# Do we have a "word" that occurred 5 times and got a score of at least
192-
# 5? Hooray. Pick the one with the highest score.
211+
# Pick the word with the best score (savings in bits). It also has to
212+
# occur more than once and save at least an estimated 2 bytes.
193213
word = None
194214
for (s, score, occ) in scores:
195-
if occ < 5:
196-
continue
197-
if score < 5:
215+
if occ < 2:
216+
break
217+
if score < 16:
198218
break
199219
word = s
200220
break

0 commit comments

Comments
 (0)