@@ -333,12 +333,9 @@ def compute_huffman_coding(translations, compression_filename):
333
333
334
334
bits_per_codepoint = 16 if max_ord > 255 else 8
335
335
values_type = "uint16_t" if max_ord > 255 else "uint8_t"
336
- max_words_len = 160 if max_ord > 255 else 255
337
-
338
- sum_len = 0
339
- while True :
336
+ while len (words ) < max_words :
340
337
# Until the dictionary is filled to capacity, use a heuristic to find
341
- # the best "word" (3- to 9-gram) to add to it.
338
+ # the best "word" (2- to 11-gram) to add to it.
342
339
#
343
340
# The TextSplitter allows us to avoid considering parts of the text
344
341
# that are already covered by a previously chosen word, for example
@@ -369,7 +366,8 @@ def est_len(occ):
369
366
# the Huffman tree bumps up the encoding lengths of all words in the
370
367
# same subtree. In the extreme case when the new word is so frequent
371
368
# that it gets a one-bit encoding, all other words will cost an extra
372
- # bit each.
369
+ # bit each. This is empirically modeled by the constant term added to
370
+ # cost, but the specific value used isn't "proven" to be correct.
373
371
#
374
372
# Another source of inaccuracy is that compressed strings end up
375
373
# on byte boundaries, not bit boundaries, so saving 1 bit somewhere
@@ -383,14 +381,14 @@ def est_len(occ):
383
381
# The difference between the two is the estimated net savings, in bits.
384
382
def est_net_savings (s , occ ):
385
383
savings = occ * (bit_length (s ) - est_len (occ ))
386
- cost = len (s ) * bits_per_codepoint
384
+ cost = len (s ) * bits_per_codepoint + 24
387
385
return savings - cost
388
386
389
387
counter = collections .Counter ()
390
388
for t in texts :
391
389
for (found , word ) in extractor .iter_words (t ):
392
390
if not found :
393
- for substr in iter_substrings (word , minlen = 3 , maxlen = 9 ):
391
+ for substr in iter_substrings (word , minlen = 2 , maxlen = 11 ):
394
392
counter [substr ] += 1
395
393
396
394
# Score the candidates we found. This is a semi-empirical formula that
@@ -410,16 +408,9 @@ def est_net_savings(s, occ):
410
408
break
411
409
412
410
word = scores [0 ][0 ]
413
-
414
- # If we can successfully add it to the dictionary, do so. Otherwise,
415
- # we've filled the dictionary to capacity and are done.
416
- if sum_len + len (word ) - 2 > max_words_len :
417
- break
418
- if len (words ) == max_words :
419
- break
420
411
words .append (word )
421
- sum_len += len (word ) - 2
422
412
413
+ words .sort (key = len )
423
414
extractor = TextSplitter (words )
424
415
counter = collections .Counter ()
425
416
for t in texts :
@@ -469,30 +460,33 @@ def est_net_savings(s, occ):
469
460
len (translation .encode ("utf-8" )) for (original , translation ) in translations
470
461
)
471
462
472
- wends = list ( len (w ) - 2 for w in words )
473
- for i in range ( 1 , len (wends )):
474
- wends [ i ] += wends [ i - 1 ]
463
+ maxlen = len (words [ - 1 ] )
464
+ minlen = len (words [ 0 ])
465
+ wlencount = [ len ([ None for w in words if len ( w ) == l ]) for l in range ( minlen , maxlen + 1 ) ]
475
466
476
467
with open (compression_filename , "w" ) as f :
468
+ f .write ("typedef {} mchar_t;" .format (values_type ))
477
469
f .write ("const uint8_t lengths[] = {{ {} }};\n " .format (", " .join (map (str , lengths ))))
478
470
f .write (
479
- "const {} values[] = {{ {} }};\n " .format (
480
- values_type , ", " .join (str (ord (u )) for u in values )
481
- )
471
+ "const mchar_t values[] = {{ {} }};\n " .format (", " .join (str (ord (u )) for u in values ))
482
472
)
483
473
f .write (
484
474
"#define compress_max_length_bits ({})\n " .format (
485
475
max_translation_encoded_length .bit_length ()
486
476
)
487
477
)
488
478
f .write (
489
- "const {} words[] = {{ {} }};\n " .format (
490
- values_type , ", " .join (str (ord (c )) for w in words for c in w )
479
+ "const mchar_t words[] = {{ {} }};\n " .format (
480
+ ", " .join (str (ord (c )) for w in words for c in w )
491
481
)
492
482
)
493
- f .write ("const uint8_t wends[] = {{ {} }};\n " .format (", " .join (str (p ) for p in wends )))
483
+ f .write (
484
+ "const uint8_t wlencount[] = {{ {} }};\n " .format (", " .join (str (p ) for p in wlencount ))
485
+ )
494
486
f .write ("#define word_start {}\n " .format (word_start ))
495
487
f .write ("#define word_end {}\n " .format (word_end ))
488
+ f .write ("#define minlen {}\n " .format (minlen ))
489
+ f .write ("#define maxlen {}\n " .format (maxlen ))
496
490
497
491
return (values , lengths , words , canonical , extractor )
498
492
0 commit comments