Merge pull request #4978 from jepler/dictionary-better-heuristic

makeqstrdata: use an extremely accurate dictionary heuristic
commit 22e8a50585
Jeff Epler, 2021-07-11 12:49:03 -05:00, committed by GitHub
2 changed files with 92 additions and 8 deletions

py/makeqstrdata.py

@@ -9,6 +9,7 @@ supervisor/shared/translate.h
from __future__ import print_function
import bisect
import re
import sys
@@ -279,7 +280,7 @@ def translate(translation_file, i18ns):
class TextSplitter:
    def __init__(self, words):
-        words.sort(key=lambda x: len(x), reverse=True)
+        words = sorted(words, key=lambda x: len(x), reverse=True)
        self.words = set(words)
        if words:
            pat = "|".join(re.escape(w) for w in words) + "|."
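For intuition (this example is not part of the commit): because longer words
sort first and the trailing "|." alternative matches any single character, the
compiled pattern splits text greedily into the longest matching dictionary
words, with everything else falling out as one-character atoms. A minimal
standalone sketch, assuming a two-word dictionary:

import re

words = sorted(["the", "ther"], key=lambda x: len(x), reverse=True)
pat = re.compile("|".join(re.escape(w) for w in words) + "|.")

# "ther" is tried before "the", so the longest dictionary word wins;
# any character that starts no word is emitted as a single-char atom.
print(pat.findall("weather"))  # ['w', 'e', 'a', 'ther']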
@@ -330,6 +331,7 @@ def compute_huffman_coding(translations, compression_filename):
        end_unused = min(ord_c, end_unused)
    max_words = end_unused - 0x80
    bits_per_codepoint = 16 if max_ord > 255 else 8
    values_type = "uint16_t" if max_ord > 255 else "uint8_t"
    max_words_len = 160 if max_ord > 255 else 255
@@ -344,6 +346,46 @@ def compute_huffman_coding(translations, compression_filename):
        # again, neither will "there" or "wither", since they have "the"
        # as substrings.
        extractor = TextSplitter(words)
        counter = collections.Counter()
        for t in texts:
            for atom in extractor.iter(t):
                counter[atom] += 1
        cb = huffman.codebook(counter.items())
        lengths = sorted(dict((v, len(cb[k])) for k, v in counter.items()).items())

        def bit_length(s):
            return sum(len(cb[c]) for c in s)

        def est_len(occ):
            idx = bisect.bisect_left(lengths, (occ, 0))
            return lengths[idx][1] + 1
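
        # (For intuition, with illustrative numbers not from the commit: given
        # atom counts {"e": 40, "t": 20, "a": 10, "q": 1}, huffman.codebook
        # assigns codeword lengths of 1, 2, 3 and 3 bits respectively, so
        # `lengths` is [(1, 3), (10, 3), (20, 2), (40, 1)]. est_len(15) then
        # bisects to the (20, 2) entry and returns 2 + 1 = 3: a new word seen
        # 15 times should code at about the length of the nearest atom at
        # least as frequent, plus one bit as a pessimistic margin.)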
        # The cost of adding a dictionary word is just its storage size,
        # while its savings is close to the difference between the original
        # Huffman bit-length of the string and the estimated bit-length
        # of the dictionary word, times the number of times the word appears.
        #
        # The savings is not strictly accurate because including a word in
        # the Huffman tree bumps up the encoding lengths of all words in the
        # same subtree. In the extreme case, when the new word is so frequent
        # that it gets a one-bit encoding, all other words cost an extra
        # bit each.
        #
        # Another source of inaccuracy is that compressed strings end up
        # on byte boundaries, not bit boundaries, so saving 1 bit somewhere
        # might not save a byte.
        #
        # In fact, when this change was first made, some translations (luckily,
        # ones on boards not at all close to full) wasted up to 40 bytes,
        # while the most constrained boards typically gained 100 bytes or
        # more.
        #
        # The difference between the two is the estimated net savings, in bits.
        def est_net_savings(s, occ):
            savings = occ * (bit_length(s) - est_len(occ))
            cost = len(s) * bits_per_codepoint
            return savings - cost
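
        # (Worked example with illustrative numbers, not from the commit: a
        # 3-character candidate seen 10 times whose characters currently cost
        # bit_length(s) == 15 bits, with est_len(10) == 7, saves
        # 10 * (15 - 7) = 80 bits against a storage cost of 3 * 8 = 24 bits
        # at 8-bit codepoints, for an estimated net savings of 56 bits.)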

        counter = collections.Counter()
        for t in texts:
            for (found, word) in extractor.iter_words(t):
@@ -351,16 +393,20 @@ def compute_huffman_coding(translations, compression_filename):
                for substr in iter_substrings(word, minlen=3, maxlen=9):
                    counter[substr] += 1

-        # Score the candidates we found. This is an empirical formula only,
-        # chosen for its effectiveness.
+        # Score the candidates we found. This is a semi-empirical formula that
+        # attempts to model the number of bits saved as closely as possible.
+        #
+        # It attempts to compare the codeword length of the original word
+        # to the codeword length the dictionary entry would get, times
+        # the number of occurrences, less the overhead of the entries in the
+        # words[] array.
        scores = sorted(
-            ((s, (len(s) - 1) ** (occ + 4)) for (s, occ) in counter.items() if occ > 4),
-            key=lambda x: x[1],
-            reverse=True,
+            ((s, -est_net_savings(s, occ)) for (s, occ) in counter.items()), key=lambda x: x[1]
        )

-        # Pick the one with the highest score.
-        if not scores:
+        # Pick the best score (most negative, i.e. the greatest net savings).
+        if not scores or scores[0][-1] >= 0:
            break

        word = scores[0][0]
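
Taken together, the hunks above implement a greedy loop: each pass re-tokenizes
the texts against the current dictionary, rebuilds the Huffman codebook, scores
every candidate substring with est_net_savings, and keeps the best candidate
only if it actually saves bits. A condensed sketch of that control flow, where
collect_candidates is a hypothetical stand-in for the counting code above and
the loop bound is simplified:

words = []
while len(words) < max_words:
    # Re-count candidate substrings against the current dictionary.
    counter = collect_candidates(texts, words)  # hypothetical helper
    scores = sorted(
        ((s, -est_net_savings(s, occ)) for (s, occ) in counter.items()),
        key=lambda x: x[1],
    )
    # Scores hold negated net savings, so the best candidate sorts first;
    # a non-negative best score means no remaining candidate saves bits.
    if not scores or scores[0][-1] >= 0:
        break
    words.append(scores[0][0])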

tools/fwsizes.py (new file)

@@ -0,0 +1,38 @@
#!/usr/bin/env python3
# Run this from within an unzipped directory of build logs from github
# to get a CSV file full of firmware sizes. Super useful to compare sizes
# from various builds.
# Taken from https://github.com/adafruit/circuitpython/pull/4564#issuecomment-816655101
import os, re
for fn in os.listdir():
    if os.path.isfile(fn) and ("build-arm " in fn or "build-riscv " in fn):
        board = re.split("[()]", fn)[1]
        if board in (
            "spresense",
            "teensy40",
            "teensy41",
            "feather_m7_1011",
            "feather_mimxrt1011",
            "feather_mimxrt1062",
            "imxrt1010_evk",
            "imxrt1020_evk",
            "imxrt1060_evk",
            "metro_m7_1011",
        ):
            continue
        with open(fn, "r") as f:
            head = "Build " + board + " for "
            lines = iter(f)
            for line in lines:
                if head in line:
                    tr = line.split(head)[1].split()[0]
                    assert "make: Entering directory" in next(lines)
                    assert "Use make V=1, make V=2" in next(lines)
                    while re.search(
                        r"\{\}|QSTR updated|FREEZE|\{'sku':|hex\tfilename|boot2.elf|Including User C Module from|Font missing|section `.bss' type changed to PROGBITS",
                        next(lines),
                    ):
                        pass
                    free = next(lines).split("bytes used, ")[1].split()[0]
                    print(board + "," + tr + "," + free)
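
Usage note (not part of the committed file): download a run's build logs from
GitHub Actions, unzip them, and run this script from inside the resulting
directory, as its header comment describes. It prints one
board,translation,bytes-free row per build, so redirecting stdout to a file
such as fwsizes.csv gives a spreadsheet-ready comparison of firmware sizes
between two builds.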