12
12
import re
13
13
import sys
14
14
15
- from math import log
15
+ import bisect
16
16
import collections
17
17
import gettext
18
18
import os .path
@@ -156,6 +156,7 @@ def compute_huffman_coding(translations, compression_filename):
156
156
end_unused = min (ord_c , end_unused )
157
157
max_words = end_unused - 0x80
158
158
159
+ bits_per_codepoint = 16 if max_ord > 255 else 8
159
160
values_type = "uint16_t" if max_ord > 255 else "uint8_t"
160
161
max_words_len = 160 if max_ord > 255 else 255
161
162
@@ -170,31 +171,50 @@ def compute_huffman_coding(translations, compression_filename):
170
171
# again, neither will "there" or "wither", since they have "the"
171
172
# as substrings.
172
173
extractor = TextSplitter (words )
174
+ counter = collections .Counter ()
175
+ for t in texts :
176
+ for atom in extractor .iter (t ):
177
+ counter [atom ] += 1
178
+ cb = huffman .codebook (counter .items ())
179
+ lengths = sorted (dict ((v , len (cb [k ])) for k , v in counter .items ()).items ())
180
+
181
+ def bit_length (s ):
182
+ return sum (len (cb [c ]) for c in s )
183
+
184
+ def est_len (occ ):
185
+ idx = bisect .bisect_left (lengths , (occ , 0 ))
186
+ return lengths [idx ][1 ] + 1
187
+
173
188
counter = collections .Counter ()
174
189
for t in texts :
175
190
for (found , word ) in extractor .iter_words (t ):
176
191
if not found :
177
192
for substr in iter_substrings (word , minlen = 2 , maxlen = 9 ):
178
193
counter [substr ] += 1
179
194
180
- # Score the candidates we found. This is an empirical formula only,
181
- # chosen for its effectiveness.
195
+ # Score the candidates we found. This is a semi-empirical formula that
196
+ # attempts to model the number of bits saved as closely as possible.
197
+ #
198
+ # It attempts to compare the codeword lengths of the original word
199
+ # to the codeword length the dictionary entry would get, times
200
+ # the number of occurrences, less the overhead of the entries in the
201
+ # words[] array.
182
202
scores = sorted (
183
203
(
184
- (s , ( len (s ) - 1 ) ** log ( max ( occ - 2 , 1 )) , occ )
204
+ (s , occ * ( bit_length (s ) - est_len ( occ ) - 1 ) - len ( s ) * bits_per_codepoint , occ )
185
205
for (s , occ ) in counter .items ()
186
206
),
187
207
key = lambda x : x [1 ],
188
208
reverse = True ,
189
209
)
190
210
191
- # Do we have a " word" that occurred 5 times and got a score of at least
192
- # 5? Horray. Pick the one with the highest score .
211
+ # Pick the word with the best score (savings in bits). It also has to
212
+ # occur more than once and save at least an estimated 2 bytes .
193
213
word = None
194
214
for (s , score , occ ) in scores :
195
- if occ < 5 :
196
- continue
197
- if score < 5 :
215
+ if occ < 2 :
216
+ break
217
+ if score < 16 :
198
218
break
199
219
word = s
200
220
break
0 commit comments