Merge pull request #4978 from jepler/dictionary-better-heuristic
makeqstrdata: use an extremely accurate dictionary heuristic
commit 22e8a50585
@@ -9,6 +9,7 @@ supervisor/shared/translate.h
 
 from __future__ import print_function
 
+import bisect
 import re
 import sys
 
@@ -279,7 +280,7 @@ def translate(translation_file, i18ns):
 
 class TextSplitter:
     def __init__(self, words):
-        words.sort(key=lambda x: len(x), reverse=True)
+        words = sorted(words, key=lambda x: len(x), reverse=True)
         self.words = set(words)
         if words:
             pat = "|".join(re.escape(w) for w in words) + "|."
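
For illustration, a minimal standalone sketch of the splitting idea (a hypothetical simplification, not the committed class): Python's re alternation picks the first alternative that matches, so sorting the words longest-first makes the pattern behave like a greedy longest-match tokenizer, and the trailing "|." turns every unmatched character into its own atom.

import re

class TextSplitterSketch:
    def __init__(self, words):
        # Longest first, so "there" is tried before "the" at each position.
        words = sorted(words, key=len, reverse=True)
        # Assumes a non-empty word list; the final "|." emits single characters.
        self.pat = re.compile("|".join(re.escape(w) for w in words) + "|.", re.DOTALL)

    def iter(self, text):
        # Yield dictionary words and single characters, left to right.
        return (m.group(0) for m in self.pat.finditer(text))

print(list(TextSplitterSketch(["the", "there"]).iter("hithere")))
# -> ['h', 'i', 'there']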
@@ -330,6 +331,7 @@ def compute_huffman_coding(translations, compression_filename):
                 end_unused = min(ord_c, end_unused)
     max_words = end_unused - 0x80
 
+    bits_per_codepoint = 16 if max_ord > 255 else 8
     values_type = "uint16_t" if max_ord > 255 else "uint8_t"
     max_words_len = 160 if max_ord > 255 else 255
 
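
A quick worked note on the added line: bits_per_codepoint is the storage width of one codepoint in the words[] array, and it feeds the cost term of the estimate added below (cost = len(s) * bits_per_codepoint). The sample codepoints here are hypothetical:

# Entry cost = len(word) * bits_per_codepoint, per the cost formula below.
for max_ord in (200, 0x4E2D):  # hypothetical: ASCII-only vs. a CJK codepoint
    bits_per_codepoint = 16 if max_ord > 255 else 8
    print(max_ord, len("the") * bits_per_codepoint)  # -> 24 bits, then 48 bits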
@@ -344,6 +346,46 @@ def compute_huffman_coding(translations, compression_filename):
         # again, neither will "there" or "wither", since they have "the"
         # as substrings.
         extractor = TextSplitter(words)
+        counter = collections.Counter()
+        for t in texts:
+            for atom in extractor.iter(t):
+                counter[atom] += 1
+        cb = huffman.codebook(counter.items())
+        lengths = sorted(dict((v, len(cb[k])) for k, v in counter.items()).items())
+
+        def bit_length(s):
+            return sum(len(cb[c]) for c in s)
+
+        def est_len(occ):
+            idx = bisect.bisect_left(lengths, (occ, 0))
+            return lengths[idx][1] + 1
+
+        # The cost of adding a dictionary word is just its storage size
+        # while its savings is close to the difference between the original
+        # huffman bit-length of the string and the estimated bit-length
+        # of the dictionary word, times the number of times the word appears.
+        #
+        # The savings is not strictly accurate because including a word into
+        # the Huffman tree bumps up the encoding lengths of all words in the
+        # same subtree. In the extreme case when the new word is so frequent
+        # that it gets a one-bit encoding, all other words will cost an extra
+        # bit each.
+        #
+        # Another source of inaccuracy is that compressed strings end up
+        # on byte boundaries, not bit boundaries, so saving 1 bit somewhere
+        # might not save a byte.
+        #
+        # In fact, when this change was first made, some translations (luckily,
+        # ones on boards not at all close to full) wasted up to 40 bytes,
+        # while the most constrained boards typically gained 100 bytes or
+        # more.
+        #
+        # The difference between the two is the estimated net savings, in bits.
+        def est_net_savings(s, occ):
+            savings = occ * (bit_length(s) - est_len(occ))
+            cost = len(s) * bits_per_codepoint
+            return savings - cost
+
         counter = collections.Counter()
         for t in texts:
             for (found, word) in extractor.iter_words(t):
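
To make the estimate concrete, a worked example with invented numbers (none of these figures come from a real translation):

# Invented figures: a 7-character candidate in an 8-bit translation, seen 20 times.
bits_per_codepoint = 8
occ = 20
word_bits = 30   # assumed bit_length(): total Huffman bits of its characters today
entry_bits = 7   # assumed est_len(occ): codeword length the new entry would get
savings = occ * (word_bits - entry_bits)   # 20 * 23 = 460 bits
cost = 7 * bits_per_codepoint              # 56 bits of words[] storage
print(savings - cost)                      # -> 404 estimated net bits saved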
@@ -351,16 +393,20 @@ def compute_huffman_coding(translations, compression_filename):
                     for substr in iter_substrings(word, minlen=3, maxlen=9):
                         counter[substr] += 1
 
-        # Score the candidates we found. This is an empirical formula only,
-        # chosen for its effectiveness.
+        # Score the candidates we found. This is a semi-empirical formula that
+        # attempts to model the number of bits saved as closely as possible.
+        #
+        # It compares the codeword length of the original word
+        # to the codeword length the dictionary entry would get, times
+        # the number of occurrences, less the overhead of the entries in the
+        # words[] array.
+
         scores = sorted(
-            ((s, (len(s) - 1) ** (occ + 4)) for (s, occ) in counter.items() if occ > 4),
-            key=lambda x: x[1],
-            reverse=True,
+            ((s, -est_net_savings(s, occ)) for (s, occ) in counter.items()), key=lambda x: x[1]
         )
 
-        # Pick the one with the highest score.
-        if not scores:
+        # Pick the one with the highest score. The score must be negative.
+        if not scores or scores[0][-1] >= 0:
             break
 
         word = scores[0][0]
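
Reading the new selection logic: the scores are negated savings, so the ascending sort puts the biggest saver at scores[0], and a best score of zero or more means no remaining candidate pays for itself. A self-contained sketch with a made-up scoring function (the real est_net_savings is defined in the diff above):

candidates = {"version": 20, "import": 11, "zzz": 1}  # hypothetical occurrence counts

def est_net_savings(s, occ):  # stand-in with invented arithmetic, for shape only
    return occ * (4 * len(s) - 7) - 8 * len(s)

scores = sorted(((s, -est_net_savings(s, occ)) for (s, occ) in candidates.items()),
                key=lambda x: x[1])
if not scores or scores[0][-1] >= 0:
    print("stop: nothing saves space")
else:
    print("add to dictionary:", scores[0][0])  # -> version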
tools/fwsizes.py (new file, 38 lines)
@@ -0,0 +1,38 @@
+#!/usr/bin/env python3
+# Run this from within an unzipped directory of build logs from github
+# to get a CSV file full of firmware sizes. Super useful to compare sizes
+# from various builds.
+# Taken from https://github.com/adafruit/circuitpython/pull/4564#issuecomment-816655101
+import os, re
+
+for fn in os.listdir():
+    if os.path.isfile(fn) and ("build-arm " in fn or "build-riscv " in fn):
+        board = re.split("[()]", fn)[1]
+        if board in (
+            "spresense",
+            "teensy40",
+            "teensy41",
+            "feather_m7_1011",
+            "feather_mimxrt1011",
+            "feather_mimxrt1062",
+            "imxrt1010_evk",
+            "imxrt1020_evk",
+            "imxrt1060_evk",
+            "metro_m7_1011",
+        ):
+            continue
+        with open(fn, "r") as f:
+            head = "Build " + board + " for "
+            lines = iter(f)
+            for line in lines:
+                if head in line:
+                    tr = line.split(head)[1].split()[0]
+                    assert "make: Entering directory" in next(lines)
+                    assert "Use make V=1, make V=2" in next(lines)
+                    while re.search(
+                        r"\{\}|QSTR updated|FREEZE|\{'sku':|hex\tfilename|boot2.elf|Including User C Module from|Font missing|section `.bss' type changed to PROGBITS",
+                        next(lines),
+                    ):
+                        pass
+                    free = next(lines).split("bytes used, ")[1].split()[0]
+                    print(board + "," + tr + "," + free)
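
The parsing hinges on one line of the build log; a sketch of that step on a sample line whose exact wording is an assumption, inferred from the split() calls above:

# Assumed sample of the size line a CircuitPython build prints:
line = "1040384 bytes used, 28672 bytes free in flash firmware space out of 1069056 bytes."
free = line.split("bytes used, ")[1].split()[0]
print(free)  # -> 28672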