|
28 | 28 |
|
29 | 29 | import huffman
|
30 | 30 | from html.entities import codepoint2name
|
| 31 | +import math |
| 32 | + |
31 | 33 |
|
32 | 34 | codepoint2name[ord("-")] = "hyphen"
|
33 | 35 |
|
@@ -220,6 +222,15 @@ def remove_offset(c):
|
220 | 222 | f"Translation {translation_name} expected to fit in 8 bits but required 16 bits"
|
221 | 223 | )
|
222 | 224 |
|
| 225 | + # Prune the qstrs to only those that appear in the texts |
| 226 | + qstr_counters = collections.Counter() |
| 227 | + qstr_extractor = TextSplitter(qstr_strs) |
| 228 | + for t in texts: |
| 229 | + for qstr in qstr_extractor.iter(t): |
| 230 | + if qstr in qstr_strs: |
| 231 | + qstr_counters[qstr] += 1 |
| 232 | + qstr_strs = list(qstr_counters.keys()) |
| 233 | + |
223 | 234 | while len(words) < max_words:
|
224 | 235 | # Until the dictionary is filled to capacity, use a heuristic to find
|
225 | 236 | # the best "word" (2- to 11-gram) to add to it.
|
@@ -287,9 +298,15 @@ def est_net_savings(s, occ):
|
287 | 298 | # to the codeword length the dictionary entry would get, times
|
288 | 299 | # the number of occurrences, less the overhead of the entries in the
|
289 | 300 | # words[] array.
|
| 301 | + # |
| 302 | + # The set of candidates is pruned by estimating their relative value and |
 | 303 | + # picking the top 100 scores. |
290 | 304 |
|
| 305 | + counter = sorted(counter.items(), key=lambda x: math.log(x[1]) * len(x[0]), reverse=True)[ |
| 306 | + :100 |
| 307 | + ] |
291 | 308 | scores = sorted(
|
292 |
| - ((s, -est_net_savings(s, occ)) for (s, occ) in counter.items() if occ > 1), |
| 309 | + ((s, -est_net_savings(s, occ)) for (s, occ) in counter if occ > 1), |
293 | 310 | key=lambda x: x[1],
|
294 | 311 | )
|
295 | 312 |
|
|
0 commit comments