Use qstrs to improve compression

Any qstr more than 3 chars long is a candidate for use; the input byte value
1 is used to indicate that a fixed-length qstr number follows.

Before accounting for the code size changes, this saves ~100 bytes
on trinket m0, but it may end up being a wash due to added code.

A cutoff of length>2 is slightly better for ja and ru and worse for others.
This has to do with the relative frequency of latin characters vs non-latin
characters, is my guess.

lang    after   before  change (- = more free space)
en_US   3132    2996    -136
fr      2136    2060    -76
ja      1916    1724    -192
ru      2196    2000    -196
This commit is contained in:
Jeff Epler 2023-08-23 14:01:02 -05:00
parent e08ad22ef6
commit 4d8b354c13
No known key found for this signature in database
GPG Key ID: D5BF15AB975AB4DE
3 changed files with 144 additions and 100 deletions

View File

@ -1,7 +1,9 @@
"""
Process raw qstr file and output qstr data with length, hash and data bytes.
This script works with Python 2.7, 3.3 and 3.4.
This script is only regularly tested with the same version of Python used
during CI, typically the latest "3.x". However, incompatibilities with any
supported CPython version are unintended.
For documentation about the format of compressed translated strings, see
supervisor/shared/translate/translate.h
@ -16,31 +18,16 @@ import sys
import collections
import gettext
import os.path
import pathlib
if hasattr(sys.stdout, "reconfigure"):
sys.stdout.reconfigure(encoding="utf-8")
sys.stderr.reconfigure(errors="backslashreplace")
py = os.path.dirname(sys.argv[0])
top = os.path.dirname(py)
sys.path.append(os.path.join(top, "tools/huffman"))
sys.path.append(str(pathlib.Path(__file__).parent.parent / "tools/huffman"))
import huffman
# Python 2/3 compatibility:
# - iterating through bytes is different
# - codepoint2name lives in a different module
import platform
if platform.python_version_tuple()[0] == "2":
bytes_cons = lambda val, enc=None: bytearray(val)
from htmlentitydefs import codepoint2name
elif platform.python_version_tuple()[0] == "3":
bytes_cons = bytes
from html.entities import codepoint2name
# end compatibility code
codepoint2name[ord("-")] = "hyphen"
@ -182,9 +169,15 @@ class EncodingTable:
extractor: object
apply_offset: object
remove_offset: object
translation_qstr_bits: int
qstrs: object
qstrs_inv: object
def compute_huffman_coding(translation_name, translations, f):
def compute_huffman_coding(qstrs, translation_name, translations, f):
# possible future improvement: some languages compress better when considering len(k) > 2. try both?
qstrs = dict((k, v) for k, v in qstrs.items() if len(k) > 3)
qstr_strs = list(qstrs.keys())
texts = [t[1] for t in translations]
words = []
@ -234,10 +227,12 @@ def compute_huffman_coding(translation_name, translations, f):
# if "the" is in words then not only will "the" not be considered
# again, neither will "there" or "wither", since they have "the"
# as substrings.
extractor = TextSplitter(words)
extractor = TextSplitter(words + qstr_strs)
counter = collections.Counter()
for t in texts:
for atom in extractor.iter(t):
if atom in qstrs:
atom = "\1"
counter[atom] += 1
cb = huffman.codebook(counter.items())
lengths = sorted(dict((v, len(cb[k])) for k, v in counter.items()).items())
@ -304,10 +299,14 @@ def compute_huffman_coding(translation_name, translations, f):
words.append(word)
words.sort(key=len)
extractor = TextSplitter(words)
extractor = TextSplitter(words + qstr_strs)
counter = collections.Counter()
used_qstr = 0
for t in texts:
for atom in extractor.iter(t):
if atom in qstrs:
used_qstr = max(used_qstr, qstrs[atom])
atom = "\1"
counter[atom] += 1
cb = huffman.codebook(counter.items())
@ -322,6 +321,8 @@ def compute_huffman_coding(translation_name, translations, f):
last_length = None
canonical = {}
for atom, code in sorted(cb.items(), key=lambda x: (len(x[1]), x[0])):
if atom in qstr_strs:
atom = "\1"
values.append(atom)
length = len(code)
if length not in length_count:
@ -359,6 +360,8 @@ def compute_huffman_coding(translation_name, translations, f):
minlen = len(words[0])
wlencount = [len([None for w in words if len(w) == l]) for l in range(minlen, maxlen + 1)]
translation_qstr_bits = used_qstr.bit_length()
f.write("typedef {} mchar_t;\n".format(values_type))
f.write("const uint8_t lengths[] = {{ {} }};\n".format(", ".join(map(str, lengths))))
f.write(
@ -383,34 +386,44 @@ def compute_huffman_coding(translation_name, translations, f):
f.write("#define maxlen {}\n".format(maxlen))
f.write("#define translation_offstart {}\n".format(offstart))
f.write("#define translation_offset {}\n".format(offset))
f.write("#define translation_qstr_bits {}\n".format(translation_qstr_bits))
return EncodingTable(values, lengths, words, canonical, extractor, apply_offset, remove_offset)
qstrs_inv = dict((v, k) for k, v in qstrs.items())
return EncodingTable(
values,
lengths,
words,
canonical,
extractor,
apply_offset,
remove_offset,
translation_qstr_bits,
qstrs,
qstrs_inv,
)
def decompress(encoding_table, encoded, encoded_length_bits):
qstrs_inv = encoding_table.qstrs_inv
values = encoding_table.values
lengths = encoding_table.lengths
words = encoding_table.words
dec = []
this_byte = 0
this_bit = 7
b = encoded[this_byte]
bits = 0
for i in range(encoded_length_bits):
bits <<= 1
if 0x80 & b:
bits |= 1
def bititer():
for byte in encoded:
for bit in (0x80, 0x40, 0x20, 0x10, 0x8, 0x4, 0x2, 0x1):
yield bool(byte & bit)
b <<= 1
if this_bit == 0:
this_bit = 7
this_byte += 1
if this_byte < len(encoded):
b = encoded[this_byte]
else:
this_bit -= 1
length = bits
nextbit = bititer().__next__
def getnbits(n):
bits = 0
for i in range(n):
bits = (bits << 1) | nextbit()
return bits
dec = []
length = getnbits(encoded_length_bits)
i = 0
while i < length:
@ -419,19 +432,8 @@ def decompress(encoding_table, encoded, encoded_length_bits):
max_code = lengths[0]
searched_length = lengths[0]
while True:
bits <<= 1
if 0x80 & b:
bits |= 1
b <<= 1
bits = (bits << 1) | nextbit()
bit_length += 1
if this_bit == 0:
this_bit = 7
this_byte += 1
if this_byte < len(encoded):
b = encoded[this_byte]
else:
this_bit -= 1
if max_code > 0 and bits < max_code:
# print('{0:0{width}b}'.format(bits, width=bit_length))
break
@ -439,7 +441,10 @@ def decompress(encoding_table, encoded, encoded_length_bits):
searched_length += lengths[bit_length]
v = values[searched_length + bits - max_code]
if v >= chr(0x80) and v < chr(0x80 + len(words)):
if v == chr(1):
qstr_idx = getnbits(encoding_table.translation_qstr_bits)
v = qstrs_inv[qstr_idx]
elif v >= chr(0x80) and v < chr(0x80 + len(words)):
v = words[ord(v) - 0x80]
i += len(v.encode("utf-8"))
dec.append(v)
@ -449,36 +454,37 @@ def decompress(encoding_table, encoded, encoded_length_bits):
def compress(encoding_table, decompressed, encoded_length_bits, len_translation_encoded):
if not isinstance(decompressed, str):
raise TypeError()
qstrs = encoding_table.qstrs
canonical = encoding_table.canonical
extractor = encoding_table.extractor
enc = bytearray(len(decompressed) * 3)
current_bit = 7
current_byte = 0
enc = 1
bits = encoded_length_bits + 1
for i in range(bits - 1, 0, -1):
if len_translation_encoded & (1 << (i - 1)):
enc[current_byte] |= 1 << current_bit
if current_bit == 0:
current_bit = 7
current_byte += 1
else:
current_bit -= 1
def put_bit(enc, b):
return (enc << 1) | bool(b)
def put_bits(enc, b, n):
for i in range(n - 1, -1, -1):
enc = put_bit(enc, b & (1 << i))
return enc
enc = put_bits(enc, len_translation_encoded, encoded_length_bits)
for atom in extractor.iter(decompressed):
for b in canonical[atom]:
if b == "1":
enc[current_byte] |= 1 << current_bit
if current_bit == 0:
current_bit = 7
current_byte += 1
if atom in qstrs:
can = canonical["\1"]
else:
current_bit -= 1
can = canonical[atom]
for b in can:
enc = put_bit(enc, b == "1")
if atom in qstrs:
enc = put_bits(enc, qstrs[atom], encoding_table.translation_qstr_bits)
if current_bit != 7:
current_byte += 1
return enc[:current_byte]
while enc.bit_length() % 8 != 1:
enc = put_bit(enc, 0)
r = enc.to_bytes((enc.bit_length() + 7) // 8, "big")
return r[1:]
def qstr_escape(qst):
@ -493,10 +499,20 @@ def qstr_escape(qst):
return re.sub(r"[^A-Za-z0-9_]", esc_char, qst)
def parse_qstrs(infile):
    """Map every qstr string found in a qstr definitions file to its index.

    ``infile`` is an open text file object. Its content is scanned for entries
    of the form ``QDEF(NAME, <int>, <int>, "string")``.

    Returns a dict mapping the decoded string of each matching entry to its
    0-based position among the matches, in file order. If the same string
    matches more than once, the last position wins.
    """
    # Local import: only this function needs it.
    import ast

    rx = re.compile(r'QDEF\([A-Za-z0-9_]+,\s*\d+,\s*\d+,\s*(?P<cstr>"(?:[^"\\\\]*|\\.)")\)')
    content = infile.read()
    # The second positional argument of a compiled pattern's findall() is the
    # start position, not a flags value, so no flag may be passed here:
    # passing re.M (== 8) would silently skip the first 8 characters.
    # The pattern uses neither ^ nor $, so MULTILINE would change nothing.
    #
    # The captured text is a double-quoted string literal; literal_eval
    # decodes its escapes without executing arbitrary expressions from the
    # input file the way eval() would.
    return {ast.literal_eval(cstr): i for i, cstr in enumerate(rx.findall(content))}
def parse_input_headers(infiles):
i18ns = set()
# read the qstrs in from the input files
# read the TRANSLATE strings in from the input files
for infile in infiles:
with open(infile, "rt") as f:
for line in f:
@ -516,12 +532,12 @@ def escape_bytes(qstr):
return qstr
else:
# qstr contains non-printable codes so render entire thing as hex pairs
qbytes = bytes_cons(qstr, "utf8")
qbytes = bytes(qstr, "utf8")
return "".join(("\\x%02x" % b) for b in qbytes)
def make_bytes(cfg_bytes_len, cfg_bytes_hash, qstr):
qbytes = bytes_cons(qstr, "utf8")
qbytes = bytes(qstr, "utf8")
qlen = len(qbytes)
qhash = compute_hash(qbytes, cfg_bytes_hash)
if qlen >= (1 << (8 * cfg_bytes_len)):
@ -551,7 +567,7 @@ def output_translation_data(encoding_table, i18ns, out):
)
total_text_compressed_size += len(compressed)
decompressed = decompress(encoding_table, compressed, encoded_length_bits)
assert decompressed == translation
assert decompressed == translation, (decompressed, translation)
for c in C_ESCAPES:
decompressed = decompressed.replace(c, C_ESCAPES[c])
formatted = ["{:d}".format(x) for x in compressed]
@ -572,7 +588,7 @@ if __name__ == "__main__":
import argparse
parser = argparse.ArgumentParser(
description="Process QSTR definitions into headers for compilation"
description="Process TRANSLATE strings into headers for compilation"
)
parser.add_argument(
"infiles", metavar="N", type=str, nargs="+", help="an integer for the accumulator"
@ -590,13 +606,19 @@ if __name__ == "__main__":
type=argparse.FileType("w", encoding="UTF-8"),
help="c file for translation data",
)
parser.add_argument(
"--qstrdefs_filename",
type=argparse.FileType("r", encoding="UTF-8"),
help="",
)
args = parser.parse_args()
qstrs = parse_qstrs(args.qstrdefs_filename)
i18ns = parse_input_headers(args.infiles)
i18ns = sorted(i18ns)
translations = translate(args.translation, i18ns)
encoding_table = compute_huffman_coding(
args.translation, translations, args.compression_filename
qstrs, args.translation, translations, args.compression_filename
)
output_translation_data(encoding_table, translations, args.translation_filename)

View File

@ -270,7 +270,7 @@ $(PY_BUILD)/translations-$(TRANSLATION).c: $(HEADER_BUILD)/compressed_translatio
$(HEADER_BUILD)/compressed_translations.generated.h: $(PY_SRC)/maketranslationdata.py $(HEADER_BUILD)/$(TRANSLATION).mo $(HEADER_BUILD)/qstrdefs.generated.h
$(STEPECHO) "GEN $@"
$(Q)mkdir -p $(PY_BUILD)
$(Q)$(PYTHON) $(PY_SRC)/maketranslationdata.py --compression_filename $(HEADER_BUILD)/compressed_translations.generated.h --translation $(HEADER_BUILD)/$(TRANSLATION).mo --translation_filename $(PY_BUILD)/translations-$(TRANSLATION).c $(HEADER_BUILD)/qstrdefs.preprocessed.h
$(Q)$(PYTHON) $(PY_SRC)/maketranslationdata.py --compression_filename $(HEADER_BUILD)/compressed_translations.generated.h --translation $(HEADER_BUILD)/$(TRANSLATION).mo --translation_filename $(PY_BUILD)/translations-$(TRANSLATION).c --qstrdefs_filename $(HEADER_BUILD)/qstrdefs.generated.h $(HEADER_BUILD)/qstrdefs.preprocessed.h
PY_CORE_O += $(PY_BUILD)/translations-$(TRANSLATION).o

View File

@ -25,6 +25,7 @@
*/
#include "supervisor/shared/translate/translate.h"
#include "py/qstr.h"
#include <stdbool.h>
#include <stdint.h>
@ -99,10 +100,34 @@ uint16_t decompress_length(const compressed_string_t *compressed) {
#endif
}
// Read cursor over a bit stream that is consumed most-significant bit first.
typedef struct {
    const uint8_t *ptr; // byte currently being read
    uint8_t bit;        // single-bit mask selecting the next bit within *ptr
} bitstream_state_t;

// Return the bit currently selected by the cursor and advance by one bit.
// After the least-significant bit of a byte has been consumed, the cursor
// moves to the most-significant bit (0x80) of the following byte.
static bool next_bit(bitstream_state_t *st) {
    bool r = (*st->ptr & st->bit) != 0;
    st->bit >>= 1;
    if (!st->bit) {
        st->bit = 0x80;
        st->ptr++;
    }
    return r;
}

// Read the next n bits from the stream and return them as an integer, with
// the first bit read as the most significant. Returns 0 without touching the
// stream when n <= 0.
static int get_nbits(bitstream_state_t *st, int n) {
    // Accumulate as unsigned so the shift never overflows a signed int, and
    // test n > 0 so a negative count cannot turn into a runaway loop that
    // reads past the end of the data.
    unsigned int r = 0;
    for (; n > 0; n--) {
        r = (r << 1) | next_bit(st);
    }
    return (int)r;
}
char *decompress(const compressed_string_t *compressed, char *decompressed) {
uint8_t this_byte = compress_max_length_bits / 8;
uint8_t this_bit = 7 - compress_max_length_bits % 8;
uint8_t b = (&compressed->data)[this_byte] << (compress_max_length_bits % 8);
bitstream_state_t b = {
.ptr = &(compressed->data) + (compress_max_length_bits >> 3),
.bit = 1 << (7 - ((compress_max_length_bits) & 0x7)),
};
uint16_t length = decompress_length(compressed);
// Stop one early because the last byte is always NULL.
@ -112,26 +137,23 @@ char *decompress(const compressed_string_t *compressed, char *decompressed) {
uint32_t max_code = lengths[0];
uint32_t searched_length = lengths[0];
while (true) {
bits <<= 1;
if ((0x80 & b) != 0) {
bits |= 1;
}
b <<= 1;
bits = (bits << 1) | next_bit(&b);
bit_length += 1;
if (this_bit == 0) {
this_bit = 7;
this_byte += 1;
b = (&compressed->data)[this_byte]; // This may read past the end but its never used.
} else {
this_bit -= 1;
}
if (max_code > 0 && bits < max_code) {
break;
}
max_code = (max_code << 1) + lengths[bit_length];
searched_length += lengths[bit_length];
}
i += put_utf8(decompressed + i, values[searched_length + bits - max_code]);
int v = values[searched_length + bits - max_code];
if (v == 1) {
qstr q = get_nbits(&b, translation_qstr_bits) + 1; // honestly no idea why "+1"...
for (const char *qc = qstr_str(q); *qc;) {
decompressed[i++] = *qc++;
}
} else {
i += put_utf8(decompressed + i, v);
}
}
decompressed[length - 1] = '\0';