makeqstrdata: use an extremely accurate dictionary heuristic

Try to accurately measure the costs of including a word in the dictionary
vs the gains from using it in messages.

This saves about 160 bytes on trinket_m0 ja, the fullest translation
for that board.  Other translations on the same board all show savings,
ranging from 24 to 228 bytes.  (The table lists free bytes before and
after; Savings is the difference.)

```
Translation     Before  After   Savings
ja              1164    1324    160
de_DE           1260    1396    136
fr              1424    1652    228
zh_Latn_pinyin  1448    1520    72
pt_BR           1584    1736    152
pl              1592    1640    48
es              1724    1816    92
ko              1724    1816    92
fil             1764    1800    36
it_IT           1896    2040    144
nl              1956    2136    180
ID              2072    2180    108
cs              2124    2148    24
sv              2340    2448    108
en_x_pirate     2644    2740    96
en_GB           2652    2752    100
el              2656    2768    112
en_US           2656    2768    112
hi              2656    2768    112
```
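
To make the tradeoff concrete, here is a minimal sketch of the cost/benefit
calculation the new heuristic performs.  The function name and all bit counts
below are invented for illustration; the real script (see the diff below)
measures bit-lengths from the Huffman codebook it builds over the corpus.

```
# Illustrative only: the bit counts are hypothetical, while the real
# makeqstrdata.py measures them from its Huffman codebook.
bits_per_codepoint = 8  # words[] entries are 8 bits when max_ord <= 255


def net_savings(word_bits, dict_word_bits, occurrences, word_len):
    # Each occurrence saves the difference between the word's original
    # Huffman bit-length and the single codeword it gets as a
    # dictionary entry...
    savings = occurrences * (word_bits - dict_word_bits)
    # ...but storing the word in the words[] array costs one codepoint
    # per character.
    cost = word_len * bits_per_codepoint
    return savings - cost


# A 4-character word that Huffman-codes to 22 bits, occurs 30 times in
# the translation, and would get a ~7-bit dictionary codeword:
print(net_savings(22, 7, 30, 4))  # 30 * (22 - 7) - 4 * 8 = 418 bits
```

A word whose net savings come out negative costs more to store than it saves,
something the old empirical score `(len(s) - 1) ** (occ + 4)` had no way to
express.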
Jeff Epler 2021-07-09 11:23:55 -05:00
parent 7c2fd270a2
commit 99abd03b7a

```
@@ -9,6 +9,7 @@ supervisor/shared/translate.h
 from __future__ import print_function
+import bisect
 import re
 import sys
@@ -330,6 +331,7 @@ def compute_huffman_coding(translations, compression_filename):
         end_unused = min(ord_c, end_unused)
     max_words = end_unused - 0x80
+    bits_per_codepoint = 16 if max_ord > 255 else 8
     values_type = "uint16_t" if max_ord > 255 else "uint8_t"
     max_words_len = 160 if max_ord > 255 else 255
@@ -344,6 +346,31 @@ def compute_huffman_coding(translations, compression_filename):
         # again, neither will "there" or "wither", since they have "the"
         # as substrings.
         extractor = TextSplitter(words)
+        counter = collections.Counter()
+        for t in texts:
+            for atom in extractor.iter(t):
+                counter[atom] += 1
+        cb = huffman.codebook(counter.items())
+        lengths = sorted(dict((v, len(cb[k])) for k, v in counter.items()).items())
+
+        def bit_length(s):
+            return sum(len(cb[c]) for c in s)
+
+        def est_len(occ):
+            idx = bisect.bisect_left(lengths, (occ, 0))
+            return lengths[idx][1] + 1
+
+        # The cost of adding a dictionary word is just its storage size
+        # while its savings is the difference between the original
+        # huffman bit-length of the string and the estimated bit-length
+        # of the dictionary word, times the number of times the word appears.
+        #
+        # The difference between the two is the net savings, in bits.
+        def est_net_savings(s, occ):
+            savings = occ * (bit_length(s) - est_len(occ))
+            cost = len(s) * bits_per_codepoint
+            return savings - cost
+
         counter = collections.Counter()
         for t in texts:
             for (found, word) in extractor.iter_words(t):
@@ -351,12 +378,16 @@ def compute_huffman_coding(translations, compression_filename):
                     for substr in iter_substrings(word, minlen=3, maxlen=9):
                         counter[substr] += 1
 
-        # Score the candidates we found. This is an empirical formula only,
-        # chosen for its effectiveness.
+        # Score the candidates we found. This is a semi-empirical formula that
+        # attempts to model the number of bits saved as closely as possible.
+        #
+        # It compares the codeword length of the original word to
+        # the codeword length the dictionary entry would get, times
+        # the number of occurrences, less the overhead of the entries in the
+        # words[] array.
+
         scores = sorted(
-            ((s, (len(s) - 1) ** (occ + 4)) for (s, occ) in counter.items() if occ > 4),
-            key=lambda x: x[1],
-            reverse=True,
+            ((s, -est_net_savings(s, occ)) for (s, occ) in counter.items()), key=lambda x: x[1]
         )
 
         # Pick the one with the highest score.
```
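
The subtlest piece above is `est_len`: it has to guess the codeword length a
word would get *before* that word is in the codebook, and it does so by
bisecting into the sorted `(occurrences, codeword length)` pairs of symbols
that are already coded.  A toy version of that lookup, with an invented
`lengths` table (the real one is derived from `cb` as in the diff):

```
import bisect

# Hypothetical (occurrence_count, codeword_length) pairs, sorted by
# occurrence count.  Huffman coding gives more frequent symbols shorter
# codewords, so the lengths are non-increasing down the list.
lengths = [(2, 11), (5, 9), (12, 7), (40, 5), (90, 3)]


def est_len(occ):
    # Find the first existing symbol at least as frequent as occ; a new
    # symbol occurring occ times should get a codeword of about that
    # length.  The +1 presumably allows for codes lengthening slightly
    # when the new symbol is inserted into the tree.
    idx = bisect.bisect_left(lengths, (occ, 0))
    return lengths[idx][1] + 1


print(est_len(10))  # -> 8: as frequent as the 7-bit symbol, plus one
```

Note that the bisect assumes some already-coded symbol occurs at least `occ`
times; in practice a common single character should always do so, keeping the
index in range.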