@@ -103,14 +103,10 @@ def compute_huffman_coding(translations, qstrs, compression_filename):
103
103
# go through each qstr and print it out
104
104
for _ , _ , qstr in qstrs .values ():
105
105
all_strings .append (qstr )
106
- all_strings_concat = "" .join (all_strings ). encode ( "utf-8" )
106
+ all_strings_concat = "" .join (all_strings )
107
107
counts = collections .Counter (all_strings_concat )
108
- # add other values
109
- for i in range (256 ):
110
- if i not in counts :
111
- counts [i ] = 0
112
108
cb = huffman .codebook (counts .items ())
113
- values = bytearray ()
109
+ values = []
114
110
length_count = {}
115
111
renumbered = 0
116
112
last_l = None
@@ -124,26 +120,27 @@ def compute_huffman_coding(translations, qstrs, compression_filename):
124
120
if last_l :
125
121
renumbered <<= (l - last_l )
126
122
canonical [ch ] = '{0:0{width}b}' .format (renumbered , width = l )
127
- if chr (ch ) in C_ESCAPES :
128
- s = C_ESCAPES [chr (ch )]
129
- else :
130
- s = chr (ch )
131
- print ("//" , ch , s , counts [ch ], canonical [ch ], renumbered )
123
+ s = C_ESCAPES .get (ch , ch )
124
+ print ("//" , ord (ch ), s , counts [ch ], canonical [ch ], renumbered )
132
125
renumbered += 1
133
126
last_l = l
134
127
lengths = bytearray ()
135
- for i in range (1 , max (length_count ) + 1 ):
128
+ print ("// length count" , length_count )
129
+ for i in range (1 , max (length_count ) + 2 ):
136
130
lengths .append (length_count .get (i , 0 ))
131
+ print ("// values" , values , "lengths" , len (lengths ), lengths )
132
+ print ("// estimated total memory size" , len (lengths ) + 2 * len (values ) + sum (len (cb [u ]) for u in all_strings_concat ))
137
133
print ("//" , values , lengths )
134
+ values_type = "uint16_t" if max (ord (u ) for u in values ) > 255 else "uint8_t"
138
135
with open (compression_filename , "w" ) as f :
139
136
f .write ("const uint8_t lengths[] = {{ {} }};\n " .format (", " .join (map (str , lengths ))))
140
- f .write ("const uint8_t values[256 ] = {{ {} }};\n " .format (", " .join (map ( str , values ) )))
137
+ f .write ("const {} values[] = {{ {} }};\n " .format (values_type , ", " .join (str ( ord ( u )) for u in values )))
141
138
return values , lengths
142
139
143
140
def decompress (encoding_table , length , encoded ):
144
141
values , lengths = encoding_table
145
142
#print(l, encoded)
146
- dec = bytearray ( length )
143
+ dec = []
147
144
this_byte = 0
148
145
this_bit = 7
149
146
b = encoded [this_byte ]
@@ -173,14 +170,14 @@ def decompress(encoding_table, length, encoded):
173
170
searched_length += lengths [bit_length ]
174
171
175
172
v = values [searched_length + bits - max_code ]
176
- dec [ i ] = v
177
- return dec
173
+ dec . append ( v )
174
+ return '' . join ( dec )
178
175
179
176
def compress (encoding_table , decompressed ):
180
- if not isinstance (decompressed , bytes ):
177
+ if not isinstance (decompressed , str ):
181
178
raise TypeError ()
182
179
values , lengths = encoding_table
183
- enc = bytearray (len (decompressed ) * 2 )
180
+ enc = bytearray (len (decompressed ) * 3 )
184
181
#print(decompressed)
185
182
#print(lengths)
186
183
current_bit = 7
@@ -228,7 +225,7 @@ def compress(encoding_table, decompressed):
228
225
if current_bit != 7 :
229
226
current_byte += 1
230
227
if current_byte > len (decompressed ):
231
- print ("Note: compression increased length" , repr (decompressed . decode ( 'utf-8' ) ), len (decompressed ), current_byte , file = sys .stderr )
228
+ print ("Note: compression increased length" , repr (decompressed ), len (decompressed ), current_byte , file = sys .stderr )
232
229
return enc [:current_byte ]
233
230
234
231
def qstr_escape (qst ):
@@ -347,9 +344,9 @@ def print_qstr_data(encoding_table, qcfgs, qstrs, i18ns):
347
344
total_text_compressed_size = 0
348
345
for original , translation in i18ns :
349
346
translation_encoded = translation .encode ("utf-8" )
350
- compressed = compress (encoding_table , translation_encoded )
347
+ compressed = compress (encoding_table , translation )
351
348
total_text_compressed_size += len (compressed )
352
- decompressed = decompress (encoding_table , len (translation_encoded ), compressed ). decode ( "utf-8" )
349
+ decompressed = decompress (encoding_table , len (translation_encoded ), compressed )
353
350
for c in C_ESCAPES :
354
351
decompressed = decompressed .replace (c , C_ESCAPES [c ])
355
352
print ("TRANSLATION(\" {}\" , {}, {{ {} }}) // {}" .format (original , len (translation_encoded )+ 1 , ", " .join (["0x{:02x}" .format (x ) for x in compressed ]), decompressed ))
0 commit comments