diff --git a/py/makeqstrdata.py b/py/makeqstrdata.py
index 02f0bea7eb..d2a10d29db 100644
--- a/py/makeqstrdata.py
+++ b/py/makeqstrdata.py
@@ -9,6 +9,7 @@ supervisor/shared/translate.h
 
 from __future__ import print_function
 
+import bisect
 import re
 import sys
 
@@ -279,7 +280,7 @@ def translate(translation_file, i18ns):
 
 class TextSplitter:
     def __init__(self, words):
-        words.sort(key=lambda x: len(x), reverse=True)
+        words = sorted(words, key=lambda x: len(x), reverse=True)
         self.words = set(words)
         if words:
             pat = "|".join(re.escape(w) for w in words) + "|."
@@ -330,6 +331,7 @@ def compute_huffman_coding(translations, compression_filename):
             end_unused = min(ord_c, end_unused)
     max_words = end_unused - 0x80
 
+    bits_per_codepoint = 16 if max_ord > 255 else 8
     values_type = "uint16_t" if max_ord > 255 else "uint8_t"
     max_words_len = 160 if max_ord > 255 else 255
 
@@ -344,6 +346,46 @@ def compute_huffman_coding(translations, compression_filename):
         # again, neither will "there" or "wither", since they have "the"
         # as substrings.
         extractor = TextSplitter(words)
+        counter = collections.Counter()
+        for t in texts:
+            for atom in extractor.iter(t):
+                counter[atom] += 1
+        cb = huffman.codebook(counter.items())
+        lengths = sorted(dict((v, len(cb[k])) for k, v in counter.items()).items())
+
+        def bit_length(s):
+            return sum(len(cb[c]) for c in s)
+
+        def est_len(occ):
+            idx = bisect.bisect_left(lengths, (occ, 0))
+            return lengths[idx][1] + 1
+
+        # The cost of adding a dictionary word is just its storage size,
+        # while its savings is close to the difference between the original
+        # Huffman bit-length of the string and the estimated bit-length
+        # of the dictionary word, times the number of times the word appears.
+        #
+        # The savings estimate is not strictly accurate because including a
+        # word in the Huffman tree bumps up the encoding lengths of all words
+        # in the same subtree.  In the extreme case when the new word is so
+        # frequent that it gets a one-bit encoding, all other words will cost
+        # an extra bit each.
+        #
+        # Another source of inaccuracy is that compressed strings end up
+        # on byte boundaries, not bit boundaries, so saving 1 bit somewhere
+        # might not save a byte.
+        #
+        # In fact, when this change was first made, some translations (luckily,
+        # ones on boards not at all close to full) wasted up to 40 bytes,
+        # while the most constrained boards typically gained 100 bytes or
+        # more.
+        #
+        # The difference between the two is the estimated net savings, in bits.
+        def est_net_savings(s, occ):
+            savings = occ * (bit_length(s) - est_len(occ))
+            cost = len(s) * bits_per_codepoint
+            return savings - cost
+
         counter = collections.Counter()
         for t in texts:
             for (found, word) in extractor.iter_words(t):
@@ -351,16 +393,20 @@ def compute_huffman_coding(translations, compression_filename):
                 for substr in iter_substrings(word, minlen=3, maxlen=9):
                     counter[substr] += 1
 
-        # Score the candidates we found.  This is an empirical formula only,
-        # chosen for its effectiveness.
+        # Score the candidates we found.  This is a semi-empirical formula that
+        # attempts to model the number of bits saved as closely as possible.
+        #
+        # It compares the Huffman codeword lengths of the original word
+        # to the codeword length the dictionary entry would get, times
+        # the number of occurrences, less the overhead of the entries in the
+        # words[] array.
+
         scores = sorted(
-            ((s, (len(s) - 1) ** (occ + 4)) for (s, occ) in counter.items() if occ > 4),
-            key=lambda x: x[1],
-            reverse=True,
+            ((s, -est_net_savings(s, occ)) for (s, occ) in counter.items()), key=lambda x: x[1]
         )
 
-        # Pick the one with the highest score.
-        if not scores:
+        # Pick the one with the highest net savings (the most negative score).
+        if not scores or scores[0][-1] >= 0:
             break
 
         word = scores[0][0]
diff --git a/tools/fwsizes.py b/tools/fwsizes.py
new file mode 100644
index 0000000000..64d1af8b00
--- /dev/null
+++ b/tools/fwsizes.py
@@ -0,0 +1,38 @@
+#!/usr/bin/env python3
+# Run this from within an unzipped directory of build logs from GitHub
+# to get a CSV file full of firmware sizes.  Super useful to compare sizes
+# from various builds.
+# Taken from https://github.com/adafruit/circuitpython/pull/4564#issuecomment-816655101
+import os, re
+
+for fn in os.listdir():
+    if os.path.isfile(fn) and ("build-arm " in fn or "build-riscv " in fn):
+        board = re.split("[()]", fn)[1]
+        if board in (
+            "spresense",
+            "teensy40",
+            "teensy41",
+            "feather_m7_1011",
+            "feather_mimxrt1011",
+            "feather_mimxrt1062",
+            "imxrt1010_evk",
+            "imxrt1020_evk",
+            "imxrt1060_evk",
+            "metro_m7_1011",
+        ):
+            continue
+        with open(fn, "r") as f:
+            head = "Build " + board + " for "
+            lines = iter(f)
+            for line in lines:
+                if head in line:
+                    tr = line.split(head)[1].split()[0]
+                    assert "make: Entering directory" in next(lines)
+                    assert "Use make V=1, make V=2" in next(lines)
+                    while re.search(
+                        r"\{\}|QSTR updated|FREEZE|\{'sku':|hex\tfilename|boot2.elf|Including User C Module from|Font missing|section `.bss' type changed to PROGBITS",
+                        next(lines),
+                    ):
+                        pass
+                    free = next(lines).split("bytes used, ")[1].split()[0]
+                    print(board + "," + tr + "," + free)
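
For readers following the scoring change: below is a minimal standalone sketch of the net-savings heuristic the compute_huffman_coding() hunks above introduce. The corpus and candidate words are invented, bits_per_codepoint is fixed at 8 for the all-ASCII toy input, and the min() clamp in est_len() is an extra guard this tiny corpus needs; otherwise bit_length(), est_len() and est_net_savings() mirror the definitions in the patch, using the same huffman PyPI package that makeqstrdata.py imports.

# toy_net_savings.py -- illustrative sketch only; the corpus and the
# candidate words are made up, but the three functions mirror the patch.
import bisect
import collections

import huffman  # the same PyPI package makeqstrdata.py uses

texts = ["the quick brown fox", "then the fox thawed", "lather, rinse, repeat"]
bits_per_codepoint = 8  # all-ASCII toy input (max_ord <= 255)

# With no dictionary words chosen yet, every atom is a single character.
counter = collections.Counter()
for t in texts:
    counter.update(t)

cb = huffman.codebook(counter.items())  # atom -> code string, e.g. "1011"

# (occurrence count, code length) pairs, sorted by occurrence count.
lengths = sorted(dict((v, len(cb[k])) for k, v in counter.items()).items())

def bit_length(s):
    # Bits needed to encode s atom by atom with the current codebook.
    return sum(len(cb[c]) for c in s)

def est_len(occ):
    # Estimated code length for a new atom seen occ times: one bit more
    # than an existing atom of similar frequency.  The clamp is only
    # needed because this toy corpus is so small.
    idx = min(bisect.bisect_left(lengths, (occ, 0)), len(lengths) - 1)
    return lengths[idx][1] + 1

def est_net_savings(s, occ):
    # Bits saved by encoding s as a single dictionary atom, less the
    # cost of storing s itself in the words[] array.
    savings = occ * (bit_length(s) - est_len(occ))
    cost = len(s) * bits_per_codepoint
    return savings - cost

for cand in ("the", "fox", "rinse"):
    occ = sum(t.count(cand) for t in texts)
    print(cand, occ, est_net_savings(cand, occ))

On the real translation corpus the atoms also include previously chosen dictionary words, which is why the patch recounts atoms with extractor.iter(t) and rebuilds the codebook on every pass of the word-picking loop.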
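
As for tools/fwsizes.py, the only log shape it assumes is a "Build <board> for <translation>" header, a few known noise lines, and then a size summary containing "bytes used, " followed by the free-byte figure. A made-up line, shaped only by what the script's final split() calls require:

# The log line below is invented for illustration; fwsizes.py relies only
# on the "bytes used, " anchor and the whitespace-delimited token after it.
line = "1019344 bytes used, 29232 bytes free in flash firmware space\n"
free = line.split("bytes used, ")[1].split()[0]
assert free == "29232"
# Each match becomes one CSV row on stdout: board,translation,free

The assert statements and the noise-skipping regex are what make the script fail loudly if the CI log format drifts, rather than silently emitting wrong numbers.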