From 99abd03b7a4b77ba0cb4e8b1e03db016a7c008a8 Mon Sep 17 00:00:00 2001
From: Jeff Epler
Date: Fri, 9 Jul 2021 11:23:55 -0500
Subject: [PATCH] makeqstrdata: use an extremely accurate dictionary heuristic

Try to accurately measure the costs of including a word in the dictionary
vs the gains from using it in messages. This saves about 160 bytes on
trinket_m0 ja, the fullest translation for that board. Other translations
on the same board all show savings, ranging from 24 to 228 bytes. The
table lists flash bytes free before and after:

```
Translation      Before   After   Savings
ja                 1164    1324       160
de_DE              1260    1396       136
fr                 1424    1652       228
zh_Latn_pinyin     1448    1520        72
pt_BR              1584    1736       152
pl                 1592    1640        48
es                 1724    1816        92
ko                 1724    1816        92
fil                1764    1800        36
it_IT              1896    2040       144
nl                 1956    2136       180
ID                 2072    2180       108
cs                 2124    2148        24
sv                 2340    2448       108
en_x_pirate        2644    2740        96
en_GB              2652    2752       100
el                 2656    2768       112
en_US              2656    2768       112
hi                 2656    2768       112
```
---
 py/makeqstrdata.py | 41 ++++++++++++++++++++++++++++++++++++-----
 1 file changed, 36 insertions(+), 5 deletions(-)

diff --git a/py/makeqstrdata.py b/py/makeqstrdata.py
index 02f0bea7eb..dc004d2df3 100644
--- a/py/makeqstrdata.py
+++ b/py/makeqstrdata.py
@@ -9,6 +9,7 @@ supervisor/shared/translate.h
 
 from __future__ import print_function
 
+import bisect
 import re
 import sys
 
@@ -330,6 +331,7 @@ def compute_huffman_coding(translations, compression_filename):
                 end_unused = min(ord_c, end_unused)
     max_words = end_unused - 0x80
 
+    bits_per_codepoint = 16 if max_ord > 255 else 8
     values_type = "uint16_t" if max_ord > 255 else "uint8_t"
     max_words_len = 160 if max_ord > 255 else 255
 
@@ -344,6 +346,31 @@ def compute_huffman_coding(translations, compression_filename):
     # again, neither will "there" or "wither", since they have "the"
     # as substrings.
     extractor = TextSplitter(words)
+    counter = collections.Counter()
+    for t in texts:
+        for atom in extractor.iter(t):
+            counter[atom] += 1
+    cb = huffman.codebook(counter.items())
+    lengths = sorted(dict((v, len(cb[k])) for k, v in counter.items()).items())
+
+    def bit_length(s):
+        return sum(len(cb[c]) for c in s)
+
+    def est_len(occ):
+        idx = bisect.bisect_left(lengths, (occ, 0))
+        return lengths[idx][1] + 1
+
+    # The cost of adding a dictionary word is just its storage size,
+    # while its savings is the difference between the original
+    # huffman bit-length of the string and the estimated bit-length
+    # of the dictionary word, times the number of times the word appears.
+    #
+    # The difference between the two is the net savings, in bits.
+    def est_net_savings(s, occ):
+        savings = occ * (bit_length(s) - est_len(occ))
+        cost = len(s) * bits_per_codepoint
+        return savings - cost
+
     counter = collections.Counter()
     for t in texts:
         for (found, word) in extractor.iter_words(t):
@@ -351,12 +378,16 @@ def compute_huffman_coding(translations, compression_filename):
             for substr in iter_substrings(word, minlen=3, maxlen=9):
                 counter[substr] += 1
 
-    # Score the candidates we found. This is an empirical formula only,
-    # chosen for its effectiveness.
+    # Score the candidates we found. This is a semi-empirical formula that
+    # attempts to model the number of bits saved as closely as possible.
+    #
+    # It compares the huffman codeword length of the original word to the
+    # codeword length the dictionary entry would get, times the number of
+    # occurrences, less the overhead of the entries in the
+    # words[] array.
+
     scores = sorted(
-        ((s, (len(s) - 1) ** (occ + 4)) for (s, occ) in counter.items() if occ > 4),
-        key=lambda x: x[1],
-        reverse=True,
+        ((s, -est_net_savings(s, occ)) for (s, occ) in counter.items()), key=lambda x: x[1]
     )
 
     # Pick the one with the highest score.
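For anyone who wants to experiment with the heuristic outside the build, here is
a minimal standalone sketch of the same net-savings computation. The codebook
`cb` and the occurrence counts below are invented for illustration, and the
sketch scores plain characters rather than the multi-character atoms the real
script gets from `extractor.iter()`; in makeqstrdata.py the real `cb` and
`lengths` come from `huffman.codebook()` over the translation corpus, as in the
patch above.

```
import bisect

# Toy per-character Huffman codebook (symbol -> bit string) and toy
# occurrence counts. Both are invented for illustration; makeqstrdata.py
# derives the real ones from the translation corpus.
cb = {"t": "00", "h": "010", "e": "011", "r": "100", " ": "101", "i": "110", "s": "111"}
counts = {"t": 9, "h": 7, "e": 8, "r": 3, " ": 10, "i": 4, "s": 5}

bits_per_codepoint = 8  # storage cost per character of a dictionary entry

# (occurrences, codeword length) pairs sorted by occurrence count, so a
# bisect can answer "how long a codeword does a symbol seen occ times get?"
lengths = sorted((occ, len(cb[sym])) for sym, occ in counts.items())


def bit_length(s):
    # Bits needed to emit s with the current per-character codebook.
    return sum(len(cb[c]) for c in s)


def est_len(occ):
    # Estimated codeword length of a new dictionary entry used occ times:
    # about as long as existing codewords of similar frequency, plus one
    # bit of slack. As in the patch, this assumes occ does not exceed the
    # most frequent symbol's count.
    idx = bisect.bisect_left(lengths, (occ, 0))
    return lengths[idx][1] + 1


def est_net_savings(s, occ):
    # Bits saved by replacing occ occurrences of s with one codeword,
    # minus the bits spent storing s itself in the words[] array.
    savings = occ * (bit_length(s) - est_len(occ))
    cost = len(s) * bits_per_codepoint
    return savings - cost


print(est_net_savings("the", 9))   # 9 * (8 - 3) - 24 = 21: pays for itself
print(est_net_savings("this", 2))  # 2 * (11 - 4) - 32 = -18: not worth adding
```

A positive result means the candidate saves more bits in the encoded messages
than its entry in words[] costs; the patch sorts candidates by
`-est_net_savings(s, occ)` so the best net saver comes first.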