 12 |  12 | import re
 13 |  13 | import sys
 14 |  14 |
    |  15 | +from math import log
 15 |  16 | import collections
 16 |  17 | import gettext
 17 |  18 | import os.path
 18 |  19 |
    |  20 | +sys.stdout.reconfigure(encoding='utf-8')
    |  21 | +sys.stderr.reconfigure(errors='backslashreplace')
    |  22 | +
 19 |  23 | py = os.path.dirname(sys.argv[0])
 20 |  24 | top = os.path.dirname(py)
 21 |  25 |
@@ -100,77 +104,173 @@ def translate(translation_file, i18ns):
100 | 104 |             translations.append((original, translation))
101 | 105 |         return translations
102 | 106 |
103 |     | -def frequent_ngrams(corpus, sz, n):
104 |     | -    return collections.Counter(corpus[i:i+sz] for i in range(len(corpus)-sz)).most_common(n)
    | 107 | +class TextSplitter:
    | 108 | +    def __init__(self, words):
    | 109 | +        words.sort(key=lambda x: len(x), reverse=True)
    | 110 | +        self.words = set(words)
    | 111 | +        self.pat = re.compile("|".join(re.escape(w) for w in words) + "|.", flags=re.DOTALL)
    | 112 | +
    | 113 | +    def iter_words(self, text):
    | 114 | +        s = []
    | 115 | +        words = self.words
    | 116 | +        for m in self.pat.finditer(text):
    | 117 | +            t = m.group(0)
    | 118 | +            if t in words:
    | 119 | +                if s:
    | 120 | +                    yield (False, "".join(s))
    | 121 | +                    s = []
    | 122 | +                yield (True, t)
    | 123 | +            else:
    | 124 | +                s.append(t)
    | 125 | +        if s:
    | 126 | +            yield (False, "".join(s))
    | 127 | +
    | 128 | +    def iter(self, text):
    | 129 | +        for m in self.pat.finditer(text):
    | 130 | +            yield m.group(0)
    | 131 | +
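
To make the splitting behavior concrete, here is a small illustrative run against the `TextSplitter` defined above; it is not part of the diff, and the word list is made up:

```python
# Hypothetical dictionary, purely for illustration.
splitter = TextSplitter(["the", "import"])

# iter_words() tags each piece: True for dictionary words, False for the
# literal runs between them.
print(list(splitter.iter_words("then import it")))
# -> [(True, 'the'), (False, 'n '), (True, 'import'), (False, ' it')]

# iter() yields the final atom stream: whole words or single characters.
print(list(splitter.iter("then import it")))
# -> ['the', 'n', ' ', 'import', ' ', 'i', 't']
```

The longest-first sort in `__init__` matters because Python's regex alternation is ordered: without it, a shorter word could shadow a longer one that starts at the same position.
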
    | 132 | +def iter_substrings(s, minlen, maxlen):
    | 133 | +    len_s = len(s)
    | 134 | +    maxlen = min(len_s, maxlen)
    | 135 | +    for n in range(minlen, maxlen + 1):
    | 136 | +        for begin in range(0, len_s - n + 1):
    | 137 | +            yield s[begin : begin + n]
    | 138 | +
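
For reference, `iter_substrings` enumerates every candidate n-gram of a run, shortest first; a quick sketch (not part of the diff):

```python
print(list(iter_substrings("hello", minlen=2, maxlen=3)))
# -> ['he', 'el', 'll', 'lo', 'hel', 'ell', 'llo']
```
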
    | 139 | +def compute_huffman_coding(translations, compression_filename):
    | 140 | +    texts = [t[1] for t in translations]
    | 141 | +    words = []
    | 142 | +
    | 143 | +    start_unused = 0x80
    | 144 | +    end_unused = 0xff
    | 145 | +    max_ord = 0
    | 146 | +    for text in texts:
    | 147 | +        for c in text:
    | 148 | +            ord_c = ord(c)
    | 149 | +            max_ord = max(ord_c, max_ord)
    | 150 | +            if 0x80 <= ord_c < 0xff:
    | 151 | +                end_unused = min(ord_c, end_unused)
    | 152 | +    max_words = end_unused - 0x80
    | 153 | +
    | 154 | +    values_type = "uint16_t" if max_ord > 255 else "uint8_t"
    | 155 | +    max_words_len = 160 if max_ord > 255 else 255
    | 156 | +
    | 157 | +    sum_len = 0
    | 158 | +    while True:
    | 159 | +        # Until the dictionary is filled to capacity, use a heuristic to find
    | 160 | +        # the best "word" (2- to 9-gram) to add to it.
    | 161 | +        #
    | 162 | +        # The TextSplitter allows us to avoid considering parts of the text
    | 163 | +        # that are already covered by a previously chosen word, for example
    | 164 | +        # if "the" is in words then not only will "the" not be considered
    | 165 | +        # again, neither will "there" or "wither", since they have "the"
    | 166 | +        # as substrings.
    | 167 | +        extractor = TextSplitter(words)
    | 168 | +        counter = collections.Counter()
    | 169 | +        for t in texts:
    | 170 | +            for (found, word) in extractor.iter_words(t):
    | 171 | +                if not found:
    | 172 | +                    for substr in iter_substrings(word, minlen=2, maxlen=9):
    | 173 | +                        counter[substr] += 1
    | 174 | +
    | 175 | +        # Score the candidates we found. This is an empirical formula only,
    | 176 | +        # chosen for its effectiveness.
    | 177 | +        scores = sorted(
    | 178 | +            (
    | 179 | +                (s, (len(s) - 1) ** log(max(occ - 2, 1)), occ)
    | 180 | +                for (s, occ) in counter.items()
    | 181 | +            ),
    | 182 | +            key=lambda x: x[1],
    | 183 | +            reverse=True,
    | 184 | +        )
    | 185 | +
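
The score grows with both candidate length and occurrence count. A worked instance of the formula (natural log; the candidate strings are hypothetical):

```python
from math import log

def score(s, occ):
    # Same expression as in the diff above.
    return (len(s) - 1) ** log(max(occ - 2, 1))

print(round(score("self", 30), 1))  # 38.9, i.e. 3 ** log(28)
print(score("ab", 1000))            # 1.0: (2 - 1) ** anything is 1
```

Note that, as written, a 2-character candidate always scores 1.0, so it can never clear the score-of-5 bar applied below; in effect only candidates of 3+ characters can enter the dictionary.
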
    | 186 | +        # Do we have a "word" that occurred at least 5 times and scored at
    | 187 | +        # least 5? Hooray. Pick the one with the highest score.
    | 188 | +        word = None
    | 189 | +        for (s, score, occ) in scores:
    | 190 | +            if occ < 5:
    | 191 | +                continue
    | 192 | +            if score < 5:
    | 193 | +                break
    | 194 | +            word = s
    | 195 | +            break
    | 196 | +
    | 197 | +        # If we can successfully add it to the dictionary, do so. Otherwise,
    | 198 | +        # we've filled the dictionary to capacity and are done.
    | 199 | +        if not word:
    | 200 | +            break
    | 201 | +        if sum_len + len(word) - 2 > max_words_len:
    | 202 | +            break
    | 203 | +        if len(words) == max_words:
    | 204 | +            break
    | 205 | +        words.append(word)
    | 206 | +        sum_len += len(word) - 2
    | 207 | +
    | 208 | +    extractor = TextSplitter(words)
    | 209 | +    counter = collections.Counter()
    | 210 | +    for t in texts:
    | 211 | +        for atom in extractor.iter(t):
    | 212 | +            counter[atom] += 1
    | 213 | +    cb = huffman.codebook(counter.items())
    | 214 | +
    | 215 | +    word_start = start_unused
    | 216 | +    word_end = word_start + len(words) - 1
    | 217 | +    print("// # words", len(words))
    | 218 | +    print("// words", words)
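
`huffman.codebook` (presumably the `huffman` package from PyPI, which the script already uses) maps each atom to a bit string; only the code *lengths* survive into the canonical renumbering below. A minimal sketch of the call, with made-up input:

```python
import collections
import huffman

# 'a' is most frequent, so it should get the shortest code.
cb = huffman.codebook(collections.Counter("aaaabbc").items())
print(cb)  # e.g. {'a': '0', 'b': '10', 'c': '11'} -- exact bits may vary
```
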
105 | 219 |
106 |     | -def encode_ngrams(translation, ngrams):
107 |     | -    if len(ngrams) > 32:
108 |     | -        start = 0xe000
109 |     | -    else:
110 |     | -        start = 0x80
111 |     | -    for i, g in enumerate(ngrams):
112 |     | -        translation = translation.replace(g, chr(start + i))
113 |     | -    return translation
114 |     | -
115 |     | -def decode_ngrams(compressed, ngrams):
116 |     | -    if len(ngrams) > 32:
117 |     | -        start, end = 0xe000, 0xf8ff
118 |     | -    else:
119 |     | -        start, end = 0x80, 0x9f
120 |     | -    return "".join(ngrams[ord(c) - start] if (start <= ord(c) <= end) else c for c in compressed)
121 |     | -
122 |     | -def compute_huffman_coding(translations, qstrs, compression_filename):
123 |     | -    all_strings = [x[1] for x in translations]
124 |     | -    all_strings_concat = "".join(all_strings)
125 |     | -    ngrams = [i[0] for i in frequent_ngrams(all_strings_concat, 2, 32)]
126 |     | -    all_strings_concat = encode_ngrams(all_strings_concat, ngrams)
127 |     | -    counts = collections.Counter(all_strings_concat)
128 |     | -    cb = huffman.codebook(counts.items())
129 | 220 |     values = []
130 | 221 |     length_count = {}
131 | 222 |     renumbered = 0
132 |     | -    last_l = None
    | 223 | +    last_length = None
133 | 224 |     canonical = {}
134 |     | -    for ch, code in sorted(cb.items(), key=lambda x: (len(x[1]), x[0])):
135 |     | -        values.append(ch)
136 |     | -        l = len(code)
137 |     | -        if l not in length_count:
138 |     | -            length_count[l] = 0
139 |     | -        length_count[l] += 1
140 |     | -        if last_l:
141 |     | -            renumbered <<= (l - last_l)
142 |     | -        canonical[ch] = '{0:0{width}b}'.format(renumbered, width=l)
143 |     | -        s = C_ESCAPES.get(ch, ch)
144 |     | -        print("//", ord(ch), s, counts[ch], canonical[ch], renumbered)
    | 225 | +    for atom, code in sorted(cb.items(), key=lambda x: (len(x[1]), x[0])):
    | 226 | +        values.append(atom)
    | 227 | +        length = len(code)
    | 228 | +        if length not in length_count:
    | 229 | +            length_count[length] = 0
    | 230 | +        length_count[length] += 1
    | 231 | +        if last_length:
    | 232 | +            renumbered <<= (length - last_length)
    | 233 | +        canonical[atom] = '{0:0{width}b}'.format(renumbered, width=length)
    | 234 | +        # print(f"atom={repr(atom)} code={code}", file=sys.stderr)
    | 235 | +        if len(atom) > 1:
    | 236 | +            o = words.index(atom) + 0x80
    | 237 | +            s = "".join(C_ESCAPES.get(ch1, ch1) for ch1 in atom)
    | 238 | +        else:
    | 239 | +            s = C_ESCAPES.get(atom, atom)
    | 240 | +            o = ord(atom)
    | 241 | +        print("//", o, s, counter[atom], canonical[atom], renumbered)
145 | 242 |         renumbered += 1
146 |     | -        last_l = l
    | 243 | +        last_length = length
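
The loop above assigns canonical Huffman codes: entries are ordered by (code length, symbol) and numbered consecutively, left-shifting whenever the length increases. A standalone sketch of the same renumbering, with made-up code lengths (not part of the diff):

```python
# Pretend huffman.codebook gave these code lengths.
lengths = {"e": 2, "t": 2, "a": 3, "o": 3}

renumbered, last_length, canonical = 0, None, {}
for atom, length in sorted(lengths.items(), key=lambda x: (x[1], x[0])):
    if last_length:
        # Moving to a longer code: shift the running value left.
        renumbered <<= (length - last_length)
    canonical[atom] = "{0:0{width}b}".format(renumbered, width=length)
    renumbered += 1
    last_length = length

print(canonical)  # {'e': '00', 't': '01', 'a': '100', 'o': '101'}
```

Because the codes are canonical, the C decoder only needs the per-length counts (`lengths`) and the sorted symbol list (`values`) to reconstruct them, not the bit strings themselves.
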
147 | 244 |     lengths = bytearray()
148 | 245 |     print("// length count", length_count)
149 |     | -    print("// bigrams", ngrams)
    | 246 | +
150 | 247 |     for i in range(1, max(length_count) + 2):
151 | 248 |         lengths.append(length_count.get(i, 0))
152 | 249 |     print("// values", values, "lengths", len(lengths), lengths)
153 |     | -    ngramdata = [ord(ni) for i in ngrams for ni in i]
154 |     | -    print("// estimated total memory size", len(lengths) + 2*len(values) + 2 * len(ngramdata) + sum((len(cb[u]) + 7)//8 for u in all_strings_concat))
    | 250 | +
155 | 251 |     print("//", values, lengths)
156 |     | -    values_type = "uint16_t" if max(ord(u) for u in values) > 255 else "uint8_t"
157 |     | -    max_translation_encoded_length = max(len(translation.encode("utf-8")) for original,translation in translations)
    | 252 | +    values = [(atom if len(atom) == 1 else chr(0x80 + words.index(atom))) for atom in values]
    | 253 | +    print("//", values, lengths)
    | 254 | +    max_translation_encoded_length = max(
    | 255 | +        len(translation.encode("utf-8")) for (original, translation) in translations)
    | 256 | +
    | 257 | +    wends = list(len(w) - 2 for w in words)
    | 258 | +    for i in range(1, len(wends)):
    | 259 | +        wends[i] += wends[i - 1]
    | 260 | +
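
`wends` is a running total of `len(word) - 2` per word, the same quantity budgeted by `sum_len` above, so the decoder can locate word `i`'s span inside the flat `words[]` array. With a hypothetical dictionary:

```python
words = ["import", "error", "the"]  # hypothetical dictionary
wends = [len(w) - 2 for w in words]  # [4, 3, 1]
for i in range(1, len(wends)):
    wends[i] += wends[i - 1]         # prefix sums
print(wends)  # [4, 7, 8]
```
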
158 | 261 |     with open(compression_filename, "w") as f:
159 | 262 |         f.write("const uint8_t lengths[] = {{ {} }};\n".format(", ".join(map(str, lengths))))
160 | 263 |         f.write("const {} values[] = {{ {} }};\n".format(values_type, ", ".join(str(ord(u)) for u in values)))
161 | 264 |         f.write("#define compress_max_length_bits ({})\n".format(max_translation_encoded_length.bit_length()))
162 |     | -        f.write("const {} bigrams[] = {{ {} }};\n".format(values_type, ", ".join(str(u) for u in ngramdata)))
163 |     | -        if len(ngrams) > 32:
164 |     | -            bigram_start = 0xe000
165 |     | -        else:
166 |     | -            bigram_start = 0x80
167 |     | -        bigram_end = bigram_start + len(ngrams) - 1  # End is inclusive
168 |     | -        f.write("#define bigram_start {}\n".format(bigram_start))
169 |     | -        f.write("#define bigram_end {}\n".format(bigram_end))
170 |     | -    return values, lengths, ngrams
    | 265 | +        f.write("const {} words[] = {{ {} }};\n".format(values_type, ", ".join(str(ord(c)) for w in words for c in w)))
    | 266 | +        f.write("const uint8_t wends[] = {{ {} }};\n".format(", ".join(str(p) for p in wends)))
    | 267 | +        f.write("#define word_start {}\n".format(word_start))
    | 268 | +        f.write("#define word_end {}\n".format(word_end))
    | 269 | +
    | 270 | +    return (values, lengths, words, canonical, extractor)
171 | 271 |
172 | 272 | def decompress(encoding_table, encoded, encoded_length_bits):
173 |     | -    values, lengths, ngrams = encoding_table
    | 273 | +    (values, lengths, words, _, _) = encoding_table
174 | 274 |     dec = []
175 | 275 |     this_byte = 0
176 | 276 |     this_bit = 7
@@ -218,74 +318,41 @@ def decompress(encoding_table, encoded, encoded_length_bits):
218 | 318 |             searched_length += lengths[bit_length]
219 | 319 |
220 | 320 |         v = values[searched_length + bits - max_code]
221 |     | -        v = decode_ngrams(v, ngrams)
    | 321 | +        if v >= chr(0x80) and v < chr(0x80 + len(words)):
    | 322 | +            v = words[ord(v) - 0x80]
222 | 323 |         i += len(v.encode('utf-8'))
223 | 324 |         dec.append(v)
224 | 325 |     return ''.join(dec)
225 | 326 |
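
In the decompressor, code points from 0x80 up to `0x80 + len(words) - 1` now act as escapes for whole dictionary words instead of bigrams; everything else decodes as a literal character. A small sketch of that substitution (not part of the diff):

```python
words = ["import", "error"]  # hypothetical dictionary

def expand(v, words):
    # Mirrors the substitution in decompress() above.
    if chr(0x80) <= v < chr(0x80 + len(words)):
        return words[ord(v) - 0x80]
    return v

print(expand(chr(0x81), words))  # 'error'
print(expand("x", words))        # 'x' (literal, below 0x80)
```
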
226 | 327 | def compress(encoding_table, decompressed, encoded_length_bits, len_translation_encoded):
227 | 328 |     if not isinstance(decompressed, str):
228 | 329 |         raise TypeError()
229 |     | -    values, lengths, ngrams = encoding_table
230 |     | -    decompressed = encode_ngrams(decompressed, ngrams)
    | 330 | +    (_, _, _, canonical, extractor) = encoding_table
    | 331 | +
231 | 332 |     enc = bytearray(len(decompressed) * 3)
232 |     | -    #print(decompressed)
233 |     | -    #print(lengths)
234 | 333 |     current_bit = 7
235 | 334 |     current_byte = 0
236 | 335 |
237 |     | -    code = len_translation_encoded
238 |     | -    bits = encoded_length_bits+1
    | 336 | +    bits = encoded_length_bits + 1
239 | 337 |     for i in range(bits - 1, 0, -1):
240 | 338 |         if len_translation_encoded & (1 << (i - 1)):
241 | 339 |             enc[current_byte] |= 1 << current_bit
242 | 340 |         if current_bit == 0:
243 | 341 |             current_bit = 7
244 |     | -            #print("packed {0:0{width}b}".format(enc[current_byte], width=8))
245 | 342 |             current_byte += 1
246 | 343 |         else:
247 | 344 |             current_bit -= 1
248 | 345 |
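
The header loop above packs the translation's encoded length MSB-first into the output buffer. The same bit-packing idiom, isolated into a sketch (not the shipped code; the buffer is sized just for this 9-bit example):

```python
def pack_bits(bits_str):
    enc, current_bit, current_byte = bytearray(2), 7, 0
    for b in bits_str:
        if b == "1":
            enc[current_byte] |= 1 << current_bit
        if current_bit == 0:   # byte full: move to the next one
            current_bit = 7
            current_byte += 1
        else:
            current_bit -= 1
    return bytes(enc)

print(pack_bits("101000011").hex())  # 'a180'
```
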
249 |     | -    for c in decompressed:
250 |     | -        #print()
251 |     | -        #print("char", c, values.index(c))
252 |     | -        start = 0
253 |     | -        end = lengths[0]
254 |     | -        bits = 1
255 |     | -        compressed = None
256 |     | -        code = 0
257 |     | -        while compressed is None:
258 |     | -            s = start
259 |     | -            e = end
260 |     | -            #print("{0:0{width}b}".format(code, width=bits))
261 |     | -            # Binary search!
262 |     | -            while e > s:
263 |     | -                midpoint = (s + e) // 2
264 |     | -                #print(s, e, midpoint)
265 |     | -                if values[midpoint] == c:
266 |     | -                    compressed = code + (midpoint - start)
267 |     | -                    #print("found {0:0{width}b}".format(compressed, width=bits))
268 |     | -                    break
269 |     | -                elif c < values[midpoint]:
270 |     | -                    e = midpoint
271 |     | -                else:
272 |     | -                    s = midpoint + 1
273 |     | -            code += end - start
274 |     | -            code <<= 1
275 |     | -            start = end
276 |     | -            end += lengths[bits]
277 |     | -            bits += 1
278 |     | -            #print("next bit", bits)
279 |     | -
280 |     | -        for i in range(bits - 1, 0, -1):
281 |     | -            if compressed & (1 << (i - 1)):
    | 346 | +    for atom in extractor.iter(decompressed):
    | 347 | +        for b in canonical[atom]:
    | 348 | +            if b == "1":
282 | 349 |                 enc[current_byte] |= 1 << current_bit
283 | 350 |             if current_bit == 0:
284 | 351 |                 current_bit = 7
285 |     | -                #print("packed {0:0{width}b}".format(enc[current_byte], width=8))
286 | 352 |                 current_byte += 1
287 | 353 |             else:
288 | 354 |                 current_bit -= 1
    | 355 | +
289 | 356 |     if current_bit != 7:
290 | 357 |         current_byte += 1
291 | 358 |     return enc[:current_byte]
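
Taken together, a hypothetical round trip through the new table (a sketch only: `encoding_table` is the tuple returned by `compute_huffman_coding` above, `encoded_length_bits` must match the generated `compress_max_length_bits`, and `text` must be one of the translated strings so every atom has a code):

```python
text = "connection refused"  # assumed to be in the translated corpus
encoded = compress(encoding_table, text, encoded_length_bits,
                   len(text.encode("utf-8")))
assert decompress(encoding_table, encoded, encoded_length_bits) == text
```
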
@@ -452,7 +519,7 @@ def print_qstr_enums(qstrs):
452 | 519 |     if args.translation:
453 | 520 |         i18ns = sorted(i18ns)
454 | 521 |         translations = translate(args.translation, i18ns)
455 |     | -        encoding_table = compute_huffman_coding(translations, qstrs, args.compression_filename)
    | 522 | +        encoding_table = compute_huffman_coding(translations, args.compression_filename)
456 | 523 |         print_qstr_data(encoding_table, qcfgs, qstrs, translations)
457 | 524 |     else:
458 | 525 |         print_qstr_enums(qstrs)