
Commit 750bc1e

Merge pull request #3398 from jepler/better-dictionary-compression

compression: Implement @ciscorn's dictionary approach

2 parents: d774678 + a8e98cd

3 files changed: +191 −102 lines changed

py/makeqstrdata.py — 163 additions & 96 deletions
```diff
@@ -12,10 +12,14 @@
 import re
 import sys
 
+from math import log
 import collections
 import gettext
 import os.path
 
+sys.stdout.reconfigure(encoding='utf-8')
+sys.stderr.reconfigure(errors='backslashreplace')
+
 py = os.path.dirname(sys.argv[0])
 top = os.path.dirname(py)
 
```
```diff
@@ -100,77 +104,173 @@ def translate(translation_file, i18ns):
         translations.append((original, translation))
     return translations
 
-def frequent_ngrams(corpus, sz, n):
-    return collections.Counter(corpus[i:i+sz] for i in range(len(corpus)-sz)).most_common(n)
+class TextSplitter:
+    def __init__(self, words):
+        words.sort(key=lambda x: len(x), reverse=True)
+        self.words = set(words)
+        self.pat = re.compile("|".join(re.escape(w) for w in words) + "|.", flags=re.DOTALL)
+
+    def iter_words(self, text):
+        s = []
+        words = self.words
+        for m in self.pat.finditer(text):
+            t = m.group(0)
+            if t in words:
+                if s:
+                    yield (False, "".join(s))
+                    s = []
+                yield (True, t)
+            else:
+                s.append(t)
+        if s:
+            yield (False, "".join(s))
+
+    def iter(self, text):
+        for m in self.pat.finditer(text):
+            yield m.group(0)
+
+def iter_substrings(s, minlen, maxlen):
+    len_s = len(s)
+    maxlen = min(len_s, maxlen)
+    for n in range(minlen, maxlen + 1):
+        for begin in range(0, len_s - n + 1):
+            yield s[begin : begin + n]
+
+def compute_huffman_coding(translations, compression_filename):
+    texts = [t[1] for t in translations]
+    words = []
+
+    start_unused = 0x80
+    end_unused = 0xff
+    max_ord = 0
+    for text in texts:
+        for c in text:
+            ord_c = ord(c)
+            max_ord = max(ord_c, max_ord)
+            if 0x80 <= ord_c < 0xff:
+                end_unused = min(ord_c, end_unused)
+    max_words = end_unused - 0x80
+
+    values_type = "uint16_t" if max_ord > 255 else "uint8_t"
+    max_words_len = 160 if max_ord > 255 else 255
+
+    sum_len = 0
+    while True:
+        # Until the dictionary is filled to capacity, use a heuristic to find
+        # the best "word" (2- to 9-gram) to add to it.
+        #
+        # The TextSplitter allows us to avoid considering parts of the text
+        # that are already covered by a previously chosen word, for example
+        # if "the" is in words then not only will "the" not be considered
+        # again, neither will "there" or "wither", since they have "the"
+        # as substrings.
+        extractor = TextSplitter(words)
+        counter = collections.Counter()
+        for t in texts:
+            for (found, word) in extractor.iter_words(t):
+                if not found:
+                    for substr in iter_substrings(word, minlen=2, maxlen=9):
+                        counter[substr] += 1
+
+        # Score the candidates we found. This is an empirical formula only,
+        # chosen for its effectiveness.
+        scores = sorted(
+            (
+                (s, (len(s) - 1) ** log(max(occ - 2, 1)), occ)
+                for (s, occ) in counter.items()
+            ),
+            key=lambda x: x[1],
+            reverse=True,
+        )
+
+        # Do we have a "word" that occurred 5 times and got a score of at
+        # least 5? Hooray. Pick the one with the highest score.
+        word = None
+        for (s, score, occ) in scores:
+            if occ < 5:
+                continue
+            if score < 5:
+                break
+            word = s
+            break
+
+        # If we can successfully add it to the dictionary, do so. Otherwise,
+        # we've filled the dictionary to capacity and are done.
+        if not word:
+            break
+        if sum_len + len(word) - 2 > max_words_len:
+            break
+        if len(words) == max_words:
+            break
+        words.append(word)
+        sum_len += len(word) - 2
+
+    extractor = TextSplitter(words)
+    counter = collections.Counter()
+    for t in texts:
+        for atom in extractor.iter(t):
+            counter[atom] += 1
+    cb = huffman.codebook(counter.items())
+
+    word_start = start_unused
+    word_end = word_start + len(words) - 1
+    print("// # words", len(words))
+    print("// words", words)
 
-def encode_ngrams(translation, ngrams):
-    if len(ngrams) > 32:
-        start = 0xe000
-    else:
-        start = 0x80
-    for i, g in enumerate(ngrams):
-        translation = translation.replace(g, chr(start + i))
-    return translation
-
-def decode_ngrams(compressed, ngrams):
-    if len(ngrams) > 32:
-        start, end = 0xe000, 0xf8ff
-    else:
-        start, end = 0x80, 0x9f
-    return "".join(ngrams[ord(c) - start] if (start <= ord(c) <= end) else c for c in compressed)
-
-def compute_huffman_coding(translations, qstrs, compression_filename):
-    all_strings = [x[1] for x in translations]
-    all_strings_concat = "".join(all_strings)
-    ngrams = [i[0] for i in frequent_ngrams(all_strings_concat, 2, 32)]
-    all_strings_concat = encode_ngrams(all_strings_concat, ngrams)
-    counts = collections.Counter(all_strings_concat)
-    cb = huffman.codebook(counts.items())
```
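To see what the greedy tokenizer does, here is a minimal standalone sketch of `TextSplitter` with an invented word list and sample string (not taken from the translation corpus):

```python
import re

# Condensed copy of TextSplitter from the hunk above, with a toy dictionary.
class TextSplitter:
    def __init__(self, words):
        words.sort(key=lambda x: len(x), reverse=True)  # try longer words first
        self.words = set(words)
        self.pat = re.compile("|".join(re.escape(w) for w in words) + "|.", flags=re.DOTALL)

    def iter_words(self, text):
        s = []
        for m in self.pat.finditer(text):
            t = m.group(0)
            if t in self.words:
                if s:
                    yield (False, "".join(s))
                    s = []
                yield (True, t)
            else:
                s.append(t)
        if s:
            yield (False, "".join(s))

splitter = TextSplitter(["the", "err"])
print(list(splitter.iter_words("there was an error")))
# [(True, 'the'), (False, 're was an '), (True, 'err'), (False, 'or')]
```

Note that only the `(False, ...)` runs feed the substring counter, which is how already-chosen words are kept out of later rounds. For the score `(len(s) - 1) ** log(max(occ - 2, 1))`, `log` is the natural logarithm from `math`: a 4-character candidate seen 10 times scores `3 ** ln(8)` ≈ 9.8, while any candidate seen 3 times or fewer scores exactly 1 and can never pass the threshold. The hunk continues below with the canonical code assignment and the new output tables.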
```diff
     values = []
     length_count = {}
     renumbered = 0
-    last_l = None
+    last_length = None
     canonical = {}
-    for ch, code in sorted(cb.items(), key=lambda x: (len(x[1]), x[0])):
-        values.append(ch)
-        l = len(code)
-        if l not in length_count:
-            length_count[l] = 0
-        length_count[l] += 1
-        if last_l:
-            renumbered <<= (l - last_l)
-        canonical[ch] = '{0:0{width}b}'.format(renumbered, width=l)
-        s = C_ESCAPES.get(ch, ch)
-        print("//", ord(ch), s, counts[ch], canonical[ch], renumbered)
+    for atom, code in sorted(cb.items(), key=lambda x: (len(x[1]), x[0])):
+        values.append(atom)
+        length = len(code)
+        if length not in length_count:
+            length_count[length] = 0
+        length_count[length] += 1
+        if last_length:
+            renumbered <<= (length - last_length)
+        canonical[atom] = '{0:0{width}b}'.format(renumbered, width=length)
+        # print(f"atom={repr(atom)} code={code}", file=sys.stderr)
+        if len(atom) > 1:
+            o = words.index(atom) + 0x80
+            s = "".join(C_ESCAPES.get(ch1, ch1) for ch1 in atom)
+        else:
+            s = C_ESCAPES.get(atom, atom)
+            o = ord(atom)
+        print("//", o, s, counter[atom], canonical[atom], renumbered)
         renumbered += 1
-        last_l = l
+        last_length = length
     lengths = bytearray()
     print("// length count", length_count)
-    print("// bigrams", ngrams)
+
     for i in range(1, max(length_count) + 2):
         lengths.append(length_count.get(i, 0))
     print("// values", values, "lengths", len(lengths), lengths)
-    ngramdata = [ord(ni) for i in ngrams for ni in i]
-    print("// estimated total memory size", len(lengths) + 2*len(values) + 2 * len(ngramdata) + sum((len(cb[u]) + 7)//8 for u in all_strings_concat))
+
     print("//", values, lengths)
-    values_type = "uint16_t" if max(ord(u) for u in values) > 255 else "uint8_t"
-    max_translation_encoded_length = max(len(translation.encode("utf-8")) for original,translation in translations)
+    values = [(atom if len(atom) == 1 else chr(0x80 + words.index(atom))) for atom in values]
+    print("//", values, lengths)
+    max_translation_encoded_length = max(
+        len(translation.encode("utf-8")) for (original, translation) in translations)
+
+    wends = list(len(w) - 2 for w in words)
+    for i in range(1, len(wends)):
+        wends[i] += wends[i - 1]
+
     with open(compression_filename, "w") as f:
         f.write("const uint8_t lengths[] = {{ {} }};\n".format(", ".join(map(str, lengths))))
         f.write("const {} values[] = {{ {} }};\n".format(values_type, ", ".join(str(ord(u)) for u in values)))
         f.write("#define compress_max_length_bits ({})\n".format(max_translation_encoded_length.bit_length()))
-        f.write("const {} bigrams[] = {{ {} }};\n".format(values_type, ", ".join(str(u) for u in ngramdata)))
-        if len(ngrams) > 32:
-            bigram_start = 0xe000
-        else:
-            bigram_start = 0x80
-        bigram_end = bigram_start + len(ngrams) - 1 # End is inclusive
-        f.write("#define bigram_start {}\n".format(bigram_start))
-        f.write("#define bigram_end {}\n".format(bigram_end))
-    return values, lengths, ngrams
+        f.write("const {} words[] = {{ {} }};\n".format(values_type, ", ".join(str(ord(c)) for w in words for c in w)))
+        f.write("const uint8_t wends[] = {{ {} }};\n".format(", ".join(str(p) for p in wends)))
+        f.write("#define word_start {}\n".format(word_start))
+        f.write("#define word_end {}\n".format(word_end))
+
+    return (values, lengths, words, canonical, extractor)
 
 def decompress(encoding_table, encoded, encoded_length_bits):
-    values, lengths, ngrams = encoding_table
+    (values, lengths, words, _, _) = encoding_table
     dec = []
     this_byte = 0
     this_bit = 7
```
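The renumbering loop above assigns a canonical Huffman code: atoms are sorted by (code length, atom), each successive code is the previous one plus one, left-shifted whenever the code length grows. A toy run of the same loop, with invented code lengths standing in for `len(cb[atom])`:

```python
# Hypothetical code lengths; in the real script these come from huffman.codebook.
lengths_by_atom = {"a": 2, "b": 2, "c": 3, "d": 3}
renumbered = 0
last_length = None
canonical = {}
for atom, length in sorted(lengths_by_atom.items(), key=lambda x: (x[1], x[0])):
    if last_length:
        renumbered <<= (length - last_length)  # widen when length increases
    canonical[atom] = "{0:0{width}b}".format(renumbered, width=length)
    renumbered += 1
    last_length = length
print(canonical)  # {'a': '00', 'b': '01', 'c': '100', 'd': '101'}
```

Because each code is strictly greater than the previous one after shifting, the resulting codes are prefix-free, and the decoder only needs the per-length counts (the `lengths` bytearray) plus the sorted `values` to reconstruct them.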
```diff
@@ -218,74 +318,41 @@ def decompress(encoding_table, encoded, encoded_length_bits):
             searched_length += lengths[bit_length]
 
         v = values[searched_length + bits - max_code]
-        v = decode_ngrams(v, ngrams)
+        if v >= chr(0x80) and v < chr(0x80 + len(words)):
+            v = words[ord(v) - 0x80]
         i += len(v.encode('utf-8'))
         dec.append(v)
     return ''.join(dec)
 
 def compress(encoding_table, decompressed, encoded_length_bits, len_translation_encoded):
     if not isinstance(decompressed, str):
         raise TypeError()
-    values, lengths, ngrams = encoding_table
-    decompressed = encode_ngrams(decompressed, ngrams)
+    (_, _, _, canonical, extractor) = encoding_table
+
     enc = bytearray(len(decompressed) * 3)
-    #print(decompressed)
-    #print(lengths)
     current_bit = 7
     current_byte = 0
 
-    code = len_translation_encoded
-    bits = encoded_length_bits+1
+    bits = encoded_length_bits + 1
     for i in range(bits - 1, 0, -1):
         if len_translation_encoded & (1 << (i - 1)):
             enc[current_byte] |= 1 << current_bit
         if current_bit == 0:
             current_bit = 7
-            #print("packed {0:0{width}b}".format(enc[current_byte], width=8))
             current_byte += 1
         else:
             current_bit -= 1
 
-    for c in decompressed:
-        #print()
-        #print("char", c, values.index(c))
-        start = 0
-        end = lengths[0]
-        bits = 1
-        compressed = None
-        code = 0
-        while compressed is None:
-            s = start
-            e = end
-            #print("{0:0{width}b}".format(code, width=bits))
-            # Binary search!
-            while e > s:
-                midpoint = (s + e) // 2
-                #print(s, e, midpoint)
-                if values[midpoint] == c:
-                    compressed = code + (midpoint - start)
-                    #print("found {0:0{width}b}".format(compressed, width=bits))
-                    break
-                elif c < values[midpoint]:
-                    e = midpoint
-                else:
-                    s = midpoint + 1
-            code += end - start
-            code <<= 1
-            start = end
-            end += lengths[bits]
-            bits += 1
-            #print("next bit", bits)
-
-        for i in range(bits - 1, 0, -1):
-            if compressed & (1 << (i - 1)):
+    for atom in extractor.iter(decompressed):
+        for b in canonical[atom]:
+            if b == "1":
                 enc[current_byte] |= 1 << current_bit
             if current_bit == 0:
                 current_bit = 7
-                #print("packed {0:0{width}b}".format(enc[current_byte], width=8))
                 current_byte += 1
             else:
                 current_bit -= 1
+
     if current_bit != 7:
         current_byte += 1
     return enc[:current_byte]
```
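At the bit level, the new `compress` simply concatenates the canonical code strings of the atoms that the splitter yields; the old per-character binary search over `values` is gone. Ignoring the byte packing and the length header, a hypothetical round trip looks like this (the codebook is hand-made for illustration, not one `huffman.codebook` would necessarily emit):

```python
import re

# "the" is the single dictionary word; everything else is a 1-char atom.
canonical = {"the": "0", " ": "10", "c": "110", "a": "1110", "t": "1111"}
pat = re.compile("the|.", flags=re.DOTALL)  # same shape as TextSplitter.pat

def compress_str(text):
    # Concatenate canonical code strings, as the rewritten compress() does.
    return "".join(canonical[m.group(0)] for m in pat.finditer(text))

decode = {v: k for k, v in canonical.items()}

def decompress_str(bits):
    out, code = [], ""
    for b in bits:
        code += b
        if code in decode:  # canonical codes are prefix-free
            out.append(decode[code])
            code = ""
    return "".join(out)

assert decompress_str(compress_str("the cat")) == "the cat"
```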
```diff
@@ -452,7 +519,7 @@ def print_qstr_enums(qstrs):
     if args.translation:
         i18ns = sorted(i18ns)
         translations = translate(args.translation, i18ns)
-        encoding_table = compute_huffman_coding(translations, qstrs, args.compression_filename)
+        encoding_table = compute_huffman_coding(translations, args.compression_filename)
         print_qstr_data(encoding_table, qcfgs, qstrs, translations)
     else:
         print_qstr_enums(qstrs)
```

supervisor/shared/translate.c — 15 additions & 6 deletions

```diff
@@ -47,13 +47,22 @@ STATIC int put_utf8(char *buf, int u) {
     if(u <= 0x7f) {
         *buf = u;
         return 1;
-    } else if(bigram_start <= u && u <= bigram_end) {
-        int n = (u - 0x80) * 2;
-        // (note that at present, entries in the bigrams table are
-        // guaranteed not to represent bigrams themselves, so this adds
+    } else if(word_start <= u && u <= word_end) {
+        uint n = (u - word_start);
+        size_t pos = 0;
+        if (n > 0) {
+            pos = wends[n - 1] + (n * 2);
+        }
+        int ret = 0;
+        // note that at present, entries in the words table are
+        // guaranteed not to represent words themselves, so this adds
         // at most 1 level of recursive call
-        int ret = put_utf8(buf, bigrams[n]);
-        return ret + put_utf8(buf + ret, bigrams[n+1]);
+        for(; pos < wends[n] + (n + 1) * 2; pos++) {
+            int len = put_utf8(buf, words[pos]);
+            buf += len;
+            ret += len;
+        }
+        return ret;
     } else if(u <= 0x07ff) {
         *buf++ = 0b11000000 | (u >> 6);
         *buf = 0b10000000 | (u & 0b00111111);
```
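The index arithmetic in `put_utf8` recovers word `n` from the flattened `words[]` array using only the cumulative `wends[]` table: each entry stores the running sum of `len(w) - 2`, and since every dictionary word is at least 2 code points long, adding back `n * 2` (or `(n + 1) * 2` for the exclusive end) yields the true offsets, with no separate start-offset table. A Python model of the same indexing, with an invented dictionary:

```python
# Build the two tables the same way makeqstrdata.py writes them out.
words = ["tion", "ing ", "the"]            # hypothetical dictionary
flat = [ord(c) for w in words for c in w]  # the C `words[]` array
wends = []
total = 0
for w in words:
    total += len(w) - 2
    wends.append(total)                    # the C `wends[]` array -> [2, 4, 5]

def word_span(n):
    # Mirrors the loop bounds in put_utf8 above.
    start = wends[n - 1] + n * 2 if n > 0 else 0
    end = wends[n] + (n + 1) * 2
    return "".join(chr(u) for u in flat[start:end])

assert [word_span(n) for n in range(len(words))] == words
```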

supervisor/shared/translate.h — 13 additions & 0 deletions

```diff
@@ -43,6 +43,19 @@
 // (building the huffman encoding on UTF-16 code points gave better
 // compression than building it on UTF-8 bytes)
 //
+// - code points starting at 128 (word_start) and potentially extending
+//   to 255 (word_end) (but never interfering with the target
+//   language's used code points) stand for dictionary entries in a
+//   dictionary with size up to 256 code points. The dictionary entries
+//   are computed with a heuristic based on frequent substrings of 2 to
+//   9 code points. These are called "words" but are not, grammatically
+//   speaking, words. They're just spans of code points that frequently
+//   occur together.
+//
+// - dictionary entries are non-overlapping, and the _ending_ index of each
+//   entry is stored in an array. Since the index given is the ending
+//   index, the array is called "wends".
+//
 // The "data" / "tail" construct is so that the struct's last member is a
 // "flexible array". However, the _only_ member is not permitted to be
 // a flexible member, so we have to declare the first byte as a separate
```
