10
10
from __future__ import print_function
11
11
12
12
import bisect
13
+ from dataclasses import dataclass
13
14
import re
14
15
import sys
15
16
@@ -146,7 +147,40 @@ def iter_substrings(s, minlen, maxlen):
146
147
yield s [begin : begin + n ]
147
148
148
149
149
# Locales whose character repertoire cannot be folded into a single byte and
# therefore need 16-bit mchar_t values in the generated C table.
# NOTE(review): "el" and "ru" were removed here, presumably because the new
# compute_unicode_offset() remapping lets them fit in 8 bits — confirm against
# the generated output for those locales.
translation_requires_uint16 = {"cs", "fr", "ja", "ko", "pl", "tr", "zh_Latn_pinyin"}
151
+
152
+
153
def compute_unicode_offset(texts):
    """Find a remapping that folds codepoints above 255 into unused byte values.

    Joins all translation strings (space-separated) and looks at two bands of
    the character set: codepoints in [160, 255) and codepoints above 255. If
    the high band is narrow enough to slide down into the byte range without
    colliding with the characters already used in [160, 255), returns
    ``(offstart, offset)`` such that every codepoint ``>= offstart`` can be
    stored as ``codepoint - offset`` in a single byte. Returns ``(0, 0)`` when
    no remapping is needed (no high codepoints) or none is possible (the high
    band does not fit).
    """
    chars = set(" ".join(texts))
    # Codepoints strictly above 255 (note: 255 itself is deliberately excluded
    # from both bands, matching the original filters).
    high = [ord(ch) for ch in chars if ord(ch) > 255]
    if not high:
        # Everything already fits in a byte; no offset required.
        return 0, 0
    low_high = min(high)
    span = max(high) - low_high + 1

    # Codepoints already occupying the upper byte range [160, 255).
    used_upper = [ord(ch) for ch in chars if 160 <= ord(ch) < 255]
    if used_upper:
        # Folded range must start just past the highest occupied byte value.
        fold_start = max(used_upper) + 1
    else:
        # Nothing occupies [160, 255): start as late as possible (but not
        # below 160) so the folded span ends at the top of the byte range.
        fold_start = max(160, 255 - span)

    if fold_start + span > 256:
        # The high band is too wide to fit after the occupied values.
        return 0, 0

    return fold_start, low_high - fold_start
173
+
174
+
175
@dataclass
class EncodingTable:
    """Bundle of everything produced by compute_huffman_coding().

    Replaces the previous positional 5-tuple so callers (compress/decompress)
    can access members by name, plus the two offset-mapping callables.
    """

    # Huffman leaf values (characters/atoms), in canonical order.
    values: object
    # Count of codes per code length, used to walk the canonical code table.
    lengths: object
    # Multi-character dictionary words referenced by codes >= 0x80.
    words: object
    # Mapping atom -> canonical code bitstring.
    canonical: object
    # Regex/extractor used by compress() to split text into atoms.
    extractor: object
    # Callable mapping a real character into the folded byte range
    # (see compute_unicode_offset).
    apply_offset: object
    # Inverse of apply_offset: folded character back to the real codepoint.
    remove_offset: object
150
184
151
185
152
186
def compute_huffman_coding (translation_name , translations , f ):
@@ -156,8 +190,26 @@ def compute_huffman_coding(translation_name, translations, f):
156
190
start_unused = 0x80
157
191
end_unused = 0xFF
158
192
max_ord = 0
193
+ offstart , offset = compute_unicode_offset (texts )
194
+
195
+ def apply_offset (c ):
196
+ oc = ord (c )
197
+ if oc >= offstart :
198
+ oc += offset
199
+ return chr (oc )
200
+
201
+ def remove_offset (c ):
202
+ oc = ord (c )
203
+ if oc >= offstart :
204
+ oc = oc - offset
205
+ try :
206
+ return chr (oc )
207
+ except Exception as e :
208
+ raise ValueError (f"remove_offset { offstart = } { oc = } " ) from e
209
+
159
210
for text in texts :
160
211
for c in text :
212
+ c = remove_offset (c )
161
213
ord_c = ord (c )
162
214
max_ord = max (ord_c , max_ord )
163
215
if 0x80 <= ord_c < 0xFF :
@@ -276,15 +328,17 @@ def est_net_savings(s, occ):
276
328
length_count [length ] += 1
277
329
if last_length :
278
330
renumbered <<= length - last_length
279
- canonical [atom ] = "{0:0{width}b}" .format (renumbered , width = length )
280
331
# print(f"atom={repr(atom)} code={code}", file=sys.stderr)
332
+ canonical [atom ] = "{0:0{width}b}" .format (renumbered , width = length )
281
333
if len (atom ) > 1 :
282
334
o = words .index (atom ) + 0x80
283
335
s = "" .join (C_ESCAPES .get (ch1 , ch1 ) for ch1 in atom )
336
+ f .write (f"// { o } { s } { counter [atom ]} { canonical [atom ]} { renumbered } \n " )
284
337
else :
285
338
s = C_ESCAPES .get (atom , atom )
339
+ canonical [atom ] = "{0:0{width}b}" .format (renumbered , width = length )
286
340
o = ord (atom )
287
- f .write (f"// { o } { s } { counter [atom ]} { canonical [atom ]} { renumbered } \n " )
341
+ f .write (f"// { o } { s } { counter [atom ]} { canonical [atom ]} { renumbered } \n " )
288
342
renumbered += 1
289
343
last_length = length
290
344
lengths = bytearray ()
@@ -306,28 +360,37 @@ def est_net_savings(s, occ):
306
360
307
361
f .write ("typedef {} mchar_t;\n " .format (values_type ))
308
362
f .write ("const uint8_t lengths[] = {{ {} }};\n " .format (", " .join (map (str , lengths ))))
309
- f .write ("const mchar_t values[] = {{ {} }};\n " .format (", " .join (str (ord (u )) for u in values )))
363
+ f .write (
364
+ "const mchar_t values[] = {{ {} }};\n " .format (
365
+ ", " .join (str (ord (remove_offset (u ))) for u in values )
366
+ )
367
+ )
310
368
f .write (
311
369
"#define compress_max_length_bits ({})\n " .format (
312
370
max_translation_encoded_length .bit_length ()
313
371
)
314
372
)
315
373
f .write (
316
374
"const mchar_t words[] = {{ {} }};\n " .format (
317
- ", " .join (str (ord (c )) for w in words for c in w )
375
+ ", " .join (str (ord (remove_offset ( c ) )) for w in words for c in w )
318
376
)
319
377
)
320
378
f .write ("const uint8_t wlencount[] = {{ {} }};\n " .format (", " .join (str (p ) for p in wlencount )))
321
379
f .write ("#define word_start {}\n " .format (word_start ))
322
380
f .write ("#define word_end {}\n " .format (word_end ))
323
381
f .write ("#define minlen {}\n " .format (minlen ))
324
382
f .write ("#define maxlen {}\n " .format (maxlen ))
383
+ f .write ("#define offstart {}\n " .format (offstart ))
384
+ f .write ("#define offset {}\n " .format (offset ))
325
385
326
- return (values , lengths , words , canonical , extractor )
386
+ return EncodingTable (values , lengths , words , canonical , extractor , apply_offset , remove_offset )
327
387
328
388
329
389
def decompress (encoding_table , encoded , encoded_length_bits ):
330
- (values , lengths , words , _ , _ ) = encoding_table
390
+ values = encoding_table .values
391
+ lengths = encoding_table .lengths
392
+ words = encoding_table .words
393
+
331
394
dec = []
332
395
this_byte = 0
333
396
this_bit = 7
@@ -385,7 +448,8 @@ def decompress(encoding_table, encoded, encoded_length_bits):
385
448
def compress (encoding_table , decompressed , encoded_length_bits , len_translation_encoded ):
386
449
if not isinstance (decompressed , str ):
387
450
raise TypeError ()
388
- (_ , _ , _ , canonical , extractor ) = encoding_table
451
+ canonical = encoding_table .canonical
452
+ extractor = encoding_table .extractor
389
453
390
454
enc = bytearray (len (decompressed ) * 3 )
391
455
current_bit = 7
0 commit comments