From 99abd03b7a4b77ba0cb4e8b1e03db016a7c008a8 Mon Sep 17 00:00:00 2001
From: Jeff Epler
Date: Fri, 9 Jul 2021 11:23:55 -0500
Subject: [PATCH 1/5] makeqstrdata: use an extremely accurate dictionary
 heuristic

Try to accurately measure the costs of including a word in the
dictionary vs the gains from using it in messages.

This saves about 160 bytes on trinket_m0 ja, the fullest translation
for that board.  Other translations on the same board all have savings,
ranging from 24 to 228 bytes.  (The Before and After columns are free
flash bytes, so higher is better; Savings = After - Before.)

```
Translation      Before   After  Savings
ja                 1164    1324      160
de_DE              1260    1396      136
fr                 1424    1652      228
zh_Latn_pinyin     1448    1520       72
pt_BR              1584    1736      152
pl                 1592    1640       48
es                 1724    1816       92
ko                 1724    1816       92
fil                1764    1800       36
it_IT              1896    2040      144
nl                 1956    2136      180
ID                 2072    2180      108
cs                 2124    2148       24
sv                 2340    2448      108
en_x_pirate        2644    2740       96
en_GB              2652    2752      100
el                 2656    2768      112
en_US              2656    2768      112
hi                 2656    2768      112
```
---
 py/makeqstrdata.py | 41 ++++++++++++++++++++++++++++++++++++-----
 1 file changed, 36 insertions(+), 5 deletions(-)

diff --git a/py/makeqstrdata.py b/py/makeqstrdata.py
index 02f0bea7eb..dc004d2df3 100644
--- a/py/makeqstrdata.py
+++ b/py/makeqstrdata.py
@@ -9,6 +9,7 @@ supervisor/shared/translate.h
 
 from __future__ import print_function
 
+import bisect
 import re
 import sys
 
@@ -330,6 +331,7 @@ def compute_huffman_coding(translations, compression_filename):
                 end_unused = min(ord_c, end_unused)
     max_words = end_unused - 0x80
 
+    bits_per_codepoint = 16 if max_ord > 255 else 8
     values_type = "uint16_t" if max_ord > 255 else "uint8_t"
     max_words_len = 160 if max_ord > 255 else 255
 
@@ -344,6 +346,31 @@ def compute_huffman_coding(translations, compression_filename):
     # again, neither will "there" or "wither", since they have "the"
     # as substrings.
     extractor = TextSplitter(words)
+    counter = collections.Counter()
+    for t in texts:
+        for atom in extractor.iter(t):
+            counter[atom] += 1
+    cb = huffman.codebook(counter.items())
+    lengths = sorted(dict((v, len(cb[k])) for k, v in counter.items()).items())
+
+    def bit_length(s):
+        return sum(len(cb[c]) for c in s)
+
+    def est_len(occ):
+        idx = bisect.bisect_left(lengths, (occ, 0))
+        return lengths[idx][1] + 1
+
+    # The cost of adding a dictionary word is just its storage size
+    # while its savings is the difference between the original
+    # huffman bit-length of the string and the estimated bit-length
+    # of the dictionary word, times the number of times the word appears.
+    #
+    # The difference between the two is the net savings, in bits.
+    def est_net_savings(s, occ):
+        savings = occ * (bit_length(s) - est_len(occ))
+        cost = len(s) * bits_per_codepoint
+        return savings - cost
+
     counter = collections.Counter()
     for t in texts:
         for (found, word) in extractor.iter_words(t):
@@ -351,12 +378,16 @@ def compute_huffman_coding(translations, compression_filename):
             for substr in iter_substrings(word, minlen=3, maxlen=9):
                 counter[substr] += 1
 
-    # Score the candidates we found.  This is an empirical formula only,
-    # chosen for its effectiveness.
+    # Score the candidates we found.  This is a semi-empirical formula that
+    # attempts to model the number of bits saved as closely as possible.
+    #
+    # It compares the Huffman-coded length of the original word
+    # to the codeword length the dictionary entry would get, times
+    # the number of occurrences, less the overhead of the entries in the
+    # words[] array.
     scores = sorted(
-        ((s, (len(s) - 1) ** (occ + 4)) for (s, occ) in counter.items() if occ > 4),
-        key=lambda x: x[1],
-        reverse=True,
+        ((s, -est_net_savings(s, occ)) for (s, occ) in counter.items()), key=lambda x: x[1]
     )
 
     # Pick the one with the highest score.
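The estimate in PATCH 1/5 is easier to follow with concrete numbers. Below is a minimal, self-contained sketch of the same calculation, not part of the patch itself: it substitutes a tiny heapq-based Huffman builder for the `huffman` package the build actually imports, the corpus is invented, and `est_len` is clamped here for safety (the original relies on no candidate occurring more often than the most frequent atom).

```
import bisect
import heapq
import itertools
from collections import Counter


def toy_codebook(weights):
    # Tiny Huffman coder returning {symbol: bitstring}.
    # Stand-in for huffman.codebook() from the PyPI "huffman" package.
    heap = [[w, i, {s: ""}] for i, (s, w) in enumerate(weights.items())]
    heapq.heapify(heap)
    tiebreak = itertools.count(len(heap))
    while len(heap) > 1:
        w1, _, c1 = heapq.heappop(heap)
        w2, _, c2 = heapq.heappop(heap)
        merged = {s: "0" + code for s, code in c1.items()}
        merged.update({s: "1" + code for s, code in c2.items()})
        heapq.heappush(heap, [w1 + w2, next(tiebreak), merged])
    return heap[0][2]


texts = ["the quick brown fox", "then the dog", "there was the end"] * 3
counts = Counter(itertools.chain.from_iterable(texts))  # character frequencies
cb = toy_codebook(counts)

# (occurrences, codeword length) pairs, ascending: frequent symbols get short
# codes, so a word seen `occ` times should get a code about as long as the
# codes of symbols with a similar frequency.
lengths = sorted({v: len(cb[k]) for k, v in counts.items()}.items())


def bit_length(s):
    # Bits needed to spell s out one character at a time.
    return sum(len(cb[c]) for c in s)


def est_len(occ):
    # Estimated codeword length if s were added as a single dictionary symbol.
    idx = min(bisect.bisect_left(lengths, (occ, 0)), len(lengths) - 1)
    return lengths[idx][1] + 1


def est_net_savings(s, occ, bits_per_codepoint=8):
    # occurrences * (bits saved per use) - bits to store s in words[]
    return occ * (bit_length(s) - est_len(occ)) - len(s) * bits_per_codepoint


occ = sum(t.count("the") for t in texts)
print("'the' occurs", occ, "times; estimated net savings:",
      est_net_savings("the", occ), "bits")
```

A positive result means the word pays for its storage; PATCH 3/5 below turns exactly that condition into a stopping rule.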
From 8836198ff1d03254f8630703e0c8c941c0f733b7 Mon Sep 17 00:00:00 2001
From: Jeff Epler
Date: Wed, 3 Feb 2021 17:18:47 -0600
Subject: [PATCH 2/5] TextSplitter: don't mutate 'words'

I was puzzled by why the dictionary words were sorted by length.
It was because TextSplitter sorted its parameter in place, instead of
sorting a copy.

This doesn't affect the encoded size, but it does affect the code
numbers assigned to the found words.  We'll deliberately restore
sorting by length next, for other reasons, but not by spooky action
at a distance.
---
 py/makeqstrdata.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/py/makeqstrdata.py b/py/makeqstrdata.py
index dc004d2df3..bbd298e931 100644
--- a/py/makeqstrdata.py
+++ b/py/makeqstrdata.py
@@ -280,7 +280,7 @@ def translate(translation_file, i18ns):
 
 class TextSplitter:
     def __init__(self, words):
-        words.sort(key=lambda x: len(x), reverse=True)
+        words = sorted(words, key=lambda x: len(x), reverse=True)
        self.words = set(words)
         if words:
             pat = "|".join(re.escape(w) for w in words) + "|."
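Why sort by length at all? Python's `re` alternation takes the first branch that matches, not the longest, so if shorter words precede longer ones in the pattern, the longer words can never match. A quick sketch of the behavior, not part of the patch (the pattern construction mirrors the `"|".join(...) + "|."` line above):

```
import re

words = ["the", "then"]

# Unsorted: "the" comes first in the alternation and shadows "then".
unsorted_pat = re.compile("|".join(re.escape(w) for w in words) + "|.")
# Sorted longest-first, as TextSplitter does:
sorted_pat = re.compile(
    "|".join(re.escape(w) for w in sorted(words, key=len, reverse=True)) + "|."
)

print(unsorted_pat.findall("then"))  # ['the', 'n'] -- "then" never matches
print(sorted_pat.findall("then"))    # ['then']
```

The fix keeps that longest-first order but builds it from a copy with sorted(), so the caller's list is left untouched.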
From 52e75c645d614379fd7ea404cb00b40e9fc30b59 Mon Sep 17 00:00:00 2001
From: Jeff Epler
Date: Fri, 9 Jul 2021 14:16:14 -0500
Subject: [PATCH 3/5] makeqstrdata: Don't include strings that are a net loss!
---
 py/makeqstrdata.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/py/makeqstrdata.py b/py/makeqstrdata.py
index bbd298e931..9073fc5667 100644
--- a/py/makeqstrdata.py
+++ b/py/makeqstrdata.py
@@ -390,8 +390,8 @@ def compute_huffman_coding(translations, compression_filename):
         ((s, -est_net_savings(s, occ)) for (s, occ) in counter.items()), key=lambda x: x[1]
     )
 
-    # Pick the one with the highest score.
-    if not scores:
+    # Pick the one with the highest score.  The score (a negated saving) must be negative.
+    if not scores or scores[0][-1] >= 0:
         break
 
     word = scores[0][0]

From fd4a7fce91ea26b2b1014e509a7bfc5c3402aaa0 Mon Sep 17 00:00:00 2001
From: Jeff Epler
Date: Sat, 10 Jul 2021 19:11:47 -0500
Subject: [PATCH 4/5] add tool to parse out firmware sizes for comparison
---
 tools/fwsizes.py | 38 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 38 insertions(+)
 create mode 100644 tools/fwsizes.py

diff --git a/tools/fwsizes.py b/tools/fwsizes.py
new file mode 100644
index 0000000000..64d1af8b00
--- /dev/null
+++ b/tools/fwsizes.py
@@ -0,0 +1,38 @@
+#!/usr/bin/env python3
+# Run this from within an unzipped directory of build logs from GitHub
+# to get CSV output full of firmware sizes.  Super useful to compare
+# sizes from various builds.
+# Taken from https://github.com/adafruit/circuitpython/pull/4564#issuecomment-816655101
+import os, re
+
+for fn in os.listdir():
+    if os.path.isfile(fn) and ("build-arm " in fn or "build-riscv " in fn):
+        board = re.split("[()]", fn)[1]
+        if board in (
+            "spresense",
+            "teensy40",
+            "teensy41",
+            "feather_m7_1011",
+            "feather_mimxrt1011",
+            "feather_mimxrt1062",
+            "imxrt1010_evk",
+            "imxrt1020_evk",
+            "imxrt1060_evk",
+            "metro_m7_1011",
+        ):
+            continue
+        with open(fn, "r") as f:
+            head = "Build " + board + " for "
+            lines = iter(f)
+            for line in lines:
+                if head in line:
+                    tr = line.split(head)[1].split()[0]
+                    assert "make: Entering directory" in next(lines)
+                    assert "Use make V=1, make V=2" in next(lines)
+                    while re.search(
+                        r"\{\}|QSTR updated|FREEZE|\{'sku':|hex\tfilename|boot2.elf|Including User C Module from|Font missing|section `.bss' type changed to PROGBITS",
+                        next(lines),
+                    ):
+                        pass
+                    free = next(lines).split("bytes used, ")[1].split()[0]
+                    print(board + "," + tr + "," + free)

From 0b8b16f6ac6d18fdbe1f61bd53428befa00879d8 Mon Sep 17 00:00:00 2001
From: Jeff Epler
Date: Sun, 11 Jul 2021 08:57:27 -0500
Subject: [PATCH 5/5] expand comment on accuracy of the net savings estimate
 function

Thanks to tyomitch for suggesting the comment could be more accurate.
---
 py/makeqstrdata.py | 19 +++++++++++++++++--
 1 file changed, 17 insertions(+), 2 deletions(-)

diff --git a/py/makeqstrdata.py b/py/makeqstrdata.py
index 9073fc5667..d2a10d29db 100644
--- a/py/makeqstrdata.py
+++ b/py/makeqstrdata.py
@@ -361,11 +361,26 @@ def compute_huffman_coding(translations, compression_filename):
         return lengths[idx][1] + 1
 
     # The cost of adding a dictionary word is just its storage size
-    # while its savings is the difference between the original
+    # while its savings is close to the difference between the original
     # huffman bit-length of the string and the estimated bit-length
     # of the dictionary word, times the number of times the word appears.
     #
-    # The difference between the two is the net savings, in bits.
+    # The savings is not strictly accurate because including a word into
+    # the Huffman tree bumps up the encoding lengths of all words in the
+    # same subtree.  In the extreme case when the new word is so frequent
+    # that it gets a one-bit encoding, all other words will cost an extra
+    # bit each.
+    #
+    # Another source of inaccuracy is that compressed strings end up
+    # on byte boundaries, not bit boundaries, so saving 1 bit somewhere
+    # might not save a byte.
+    #
+    # In fact, when this change was first made, some translations (luckily,
+    # ones on boards not at all close to full) wasted up to 40 bytes,
+    # while the most constrained boards typically gained 100 bytes or
+    # more.
+    #
+    # The difference between the two is the estimated net savings, in bits.
     def est_net_savings(s, occ):
         savings = occ * (bit_length(s) - est_len(occ))
         cost = len(s) * bits_per_codepoint
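The "bumps up the encoding lengths of all words in the same subtree" effect described in the new comment is easy to reproduce. A small sketch, not part of the patch, using the same `huffman` package that makeqstrdata.py imports (called exactly as the script calls it, `huffman.codebook(items)`); the weights are invented:

```
import huffman  # the PyPI package makeqstrdata.py already depends on

weights = {"a": 10, "b": 9, "c": 8, "d": 7}
before = huffman.codebook(weights.items())

weights["WORD"] = 100  # a dictionary word so frequent it earns a 1-bit code
after = huffman.codebook(weights.items())

for sym in "abcd":
    print(sym, len(before[sym]), "->", len(after[sym]))
```

With these weights every original symbol's code grows from 2 bits to 3 once WORD joins the tree. That growth is the cost est_net_savings cannot see, which is why the comment now calls the estimate close rather than exact.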