Skip to content

Commit 5a6941e

Browse files
authored
Merge pull request #8514 from eightycc/xlate
Improve make translation data performance.
2 parents ec678a5 + 6725be4 commit 5a6941e

File tree

1 file changed

+18
-1
lines changed

1 file changed

+18
-1
lines changed

py/maketranslationdata.py

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,8 @@
2828

2929
import huffman
3030
from html.entities import codepoint2name
31+
import math
32+
3133

3234
codepoint2name[ord("-")] = "hyphen"
3335

@@ -220,6 +222,15 @@ def remove_offset(c):
220222
f"Translation {translation_name} expected to fit in 8 bits but required 16 bits"
221223
)
222224

225+
# Prune the qstrs to only those that appear in the texts
226+
qstr_counters = collections.Counter()
227+
qstr_extractor = TextSplitter(qstr_strs)
228+
for t in texts:
229+
for qstr in qstr_extractor.iter(t):
230+
if qstr in qstr_strs:
231+
qstr_counters[qstr] += 1
232+
qstr_strs = list(qstr_counters.keys())
233+
223234
while len(words) < max_words:
224235
# Until the dictionary is filled to capacity, use a heuristic to find
225236
# the best "word" (2- to 11-gram) to add to it.
@@ -287,9 +298,15 @@ def est_net_savings(s, occ):
287298
# to the codeword length the dictionary entry would get, times
288299
# the number of occurrences, less the overhead of the entries in the
289300
# words[] array.
301+
#
302+
# The set of candidates is pruned by estimating their relative value and
303+
# picking the top 100 scores.
290304

305+
counter = sorted(counter.items(), key=lambda x: math.log(x[1]) * len(x[0]), reverse=True)[
306+
:100
307+
]
291308
scores = sorted(
292-
((s, -est_net_savings(s, occ)) for (s, occ) in counter.items() if occ > 1),
309+
((s, -est_net_savings(s, occ)) for (s, occ) in counter if occ > 1),
293310
key=lambda x: x[1],
294311
)
295312

0 commit comments

Comments
 (0)