Use qstrs to improve compression
Any qstr more than 3 characters long is a candidate for use. The byte value 1 in the input is used to indicate that a fixed-length qstr number follows.

Before accounting for the code size changes, this saves ~100 bytes on trinket m0, but it may end up being a wash due to the added code.

A cutoff of length > 2 is slightly better for ja and ru and worse for the others. My guess is that this has to do with the relative frequency of Latin versus non-Latin characters.

lang   after  before  change (- = more free space)
en_US  3132   2996    -134
fr     2136   2060    -76
ja     1916   1724    -232
ru     2196   2000    -196
This commit is contained in:
parent
e08ad22ef6
commit
4d8b354c13
@ -1,7 +1,9 @@
|
||||
"""
|
||||
Process raw qstr file and output qstr data with length, hash and data bytes.
|
||||
|
||||
This script works with Python 2.7, 3.3 and 3.4.
|
||||
This script is only regularly tested with the same version of Python used
|
||||
during CI, typically the latest "3.x". However, incompatibilities with any
|
||||
supported CPython version are unintended.
|
||||
|
||||
For documentation about the format of compressed translated strings, see
|
||||
supervisor/shared/translate/translate.h
|
||||
@ -16,31 +18,16 @@ import sys
|
||||
|
||||
import collections
|
||||
import gettext
|
||||
import os.path
|
||||
import pathlib
|
||||
|
||||
if hasattr(sys.stdout, "reconfigure"):
|
||||
sys.stdout.reconfigure(encoding="utf-8")
|
||||
sys.stderr.reconfigure(errors="backslashreplace")
|
||||
|
||||
py = os.path.dirname(sys.argv[0])
|
||||
top = os.path.dirname(py)
|
||||
|
||||
sys.path.append(os.path.join(top, "tools/huffman"))
|
||||
sys.path.append(str(pathlib.Path(__file__).parent.parent / "tools/huffman"))
|
||||
|
||||
import huffman
|
||||
|
||||
# Python 2/3 compatibility:
|
||||
# - iterating through bytes is different
|
||||
# - codepoint2name lives in a different module
|
||||
import platform
|
||||
|
||||
if platform.python_version_tuple()[0] == "2":
|
||||
bytes_cons = lambda val, enc=None: bytearray(val)
|
||||
from htmlentitydefs import codepoint2name
|
||||
elif platform.python_version_tuple()[0] == "3":
|
||||
bytes_cons = bytes
|
||||
from html.entities import codepoint2name
|
||||
# end compatibility code
|
||||
from html.entities import codepoint2name
|
||||
|
||||
codepoint2name[ord("-")] = "hyphen"
|
||||
|
||||
@ -182,9 +169,15 @@ class EncodingTable:
|
||||
extractor: object
|
||||
apply_offset: object
|
||||
remove_offset: object
|
||||
translation_qstr_bits: int
|
||||
qstrs: object
|
||||
qstrs_inv: object
|
||||
|
||||
|
||||
def compute_huffman_coding(translation_name, translations, f):
|
||||
def compute_huffman_coding(qstrs, translation_name, translations, f):
|
||||
# possible future improvement: some languages are better when consider len(k) > 2. try both?
|
||||
qstrs = dict((k, v) for k, v in qstrs.items() if len(k) > 3)
|
||||
qstr_strs = list(qstrs.keys())
|
||||
texts = [t[1] for t in translations]
|
||||
words = []
|
||||
|
||||
@ -234,10 +227,12 @@ def compute_huffman_coding(translation_name, translations, f):
|
||||
# if "the" is in words then not only will "the" not be considered
|
||||
# again, neither will "there" or "wither", since they have "the"
|
||||
# as substrings.
|
||||
extractor = TextSplitter(words)
|
||||
extractor = TextSplitter(words + qstr_strs)
|
||||
counter = collections.Counter()
|
||||
for t in texts:
|
||||
for atom in extractor.iter(t):
|
||||
if atom in qstrs:
|
||||
atom = "\1"
|
||||
counter[atom] += 1
|
||||
cb = huffman.codebook(counter.items())
|
||||
lengths = sorted(dict((v, len(cb[k])) for k, v in counter.items()).items())
|
||||
@ -304,10 +299,14 @@ def compute_huffman_coding(translation_name, translations, f):
|
||||
words.append(word)
|
||||
|
||||
words.sort(key=len)
|
||||
extractor = TextSplitter(words)
|
||||
extractor = TextSplitter(words + qstr_strs)
|
||||
counter = collections.Counter()
|
||||
used_qstr = 0
|
||||
for t in texts:
|
||||
for atom in extractor.iter(t):
|
||||
if atom in qstrs:
|
||||
used_qstr = max(used_qstr, qstrs[atom])
|
||||
atom = "\1"
|
||||
counter[atom] += 1
|
||||
cb = huffman.codebook(counter.items())
|
||||
|
||||
@ -322,6 +321,8 @@ def compute_huffman_coding(translation_name, translations, f):
|
||||
last_length = None
|
||||
canonical = {}
|
||||
for atom, code in sorted(cb.items(), key=lambda x: (len(x[1]), x[0])):
|
||||
if atom in qstr_strs:
|
||||
atom = "\1"
|
||||
values.append(atom)
|
||||
length = len(code)
|
||||
if length not in length_count:
|
||||
@ -359,6 +360,8 @@ def compute_huffman_coding(translation_name, translations, f):
|
||||
minlen = len(words[0])
|
||||
wlencount = [len([None for w in words if len(w) == l]) for l in range(minlen, maxlen + 1)]
|
||||
|
||||
translation_qstr_bits = used_qstr.bit_length()
|
||||
|
||||
f.write("typedef {} mchar_t;\n".format(values_type))
|
||||
f.write("const uint8_t lengths[] = {{ {} }};\n".format(", ".join(map(str, lengths))))
|
||||
f.write(
|
||||
@ -383,34 +386,44 @@ def compute_huffman_coding(translation_name, translations, f):
|
||||
f.write("#define maxlen {}\n".format(maxlen))
|
||||
f.write("#define translation_offstart {}\n".format(offstart))
|
||||
f.write("#define translation_offset {}\n".format(offset))
|
||||
f.write("#define translation_qstr_bits {}\n".format(translation_qstr_bits))
|
||||
|
||||
return EncodingTable(values, lengths, words, canonical, extractor, apply_offset, remove_offset)
|
||||
qstrs_inv = dict((v, k) for k, v in qstrs.items())
|
||||
return EncodingTable(
|
||||
values,
|
||||
lengths,
|
||||
words,
|
||||
canonical,
|
||||
extractor,
|
||||
apply_offset,
|
||||
remove_offset,
|
||||
translation_qstr_bits,
|
||||
qstrs,
|
||||
qstrs_inv,
|
||||
)
|
||||
|
||||
|
||||
def decompress(encoding_table, encoded, encoded_length_bits):
|
||||
qstrs_inv = encoding_table.qstrs_inv
|
||||
values = encoding_table.values
|
||||
lengths = encoding_table.lengths
|
||||
words = encoding_table.words
|
||||
|
||||
dec = []
|
||||
this_byte = 0
|
||||
this_bit = 7
|
||||
b = encoded[this_byte]
|
||||
bits = 0
|
||||
for i in range(encoded_length_bits):
|
||||
bits <<= 1
|
||||
if 0x80 & b:
|
||||
bits |= 1
|
||||
def bititer():
|
||||
for byte in encoded:
|
||||
for bit in (0x80, 0x40, 0x20, 0x10, 0x8, 0x4, 0x2, 0x1):
|
||||
yield bool(byte & bit)
|
||||
|
||||
b <<= 1
|
||||
if this_bit == 0:
|
||||
this_bit = 7
|
||||
this_byte += 1
|
||||
if this_byte < len(encoded):
|
||||
b = encoded[this_byte]
|
||||
else:
|
||||
this_bit -= 1
|
||||
length = bits
|
||||
nextbit = bititer().__next__
|
||||
|
||||
def getnbits(n):
|
||||
bits = 0
|
||||
for i in range(n):
|
||||
bits = (bits << 1) | nextbit()
|
||||
return bits
|
||||
|
||||
dec = []
|
||||
length = getnbits(encoded_length_bits)
|
||||
|
||||
i = 0
|
||||
while i < length:
|
||||
@ -419,19 +432,8 @@ def decompress(encoding_table, encoded, encoded_length_bits):
|
||||
max_code = lengths[0]
|
||||
searched_length = lengths[0]
|
||||
while True:
|
||||
bits <<= 1
|
||||
if 0x80 & b:
|
||||
bits |= 1
|
||||
|
||||
b <<= 1
|
||||
bits = (bits << 1) | nextbit()
|
||||
bit_length += 1
|
||||
if this_bit == 0:
|
||||
this_bit = 7
|
||||
this_byte += 1
|
||||
if this_byte < len(encoded):
|
||||
b = encoded[this_byte]
|
||||
else:
|
||||
this_bit -= 1
|
||||
if max_code > 0 and bits < max_code:
|
||||
# print('{0:0{width}b}'.format(bits, width=bit_length))
|
||||
break
|
||||
@ -439,7 +441,10 @@ def decompress(encoding_table, encoded, encoded_length_bits):
|
||||
searched_length += lengths[bit_length]
|
||||
|
||||
v = values[searched_length + bits - max_code]
|
||||
if v >= chr(0x80) and v < chr(0x80 + len(words)):
|
||||
if v == chr(1):
|
||||
qstr_idx = getnbits(encoding_table.translation_qstr_bits)
|
||||
v = qstrs_inv[qstr_idx]
|
||||
elif v >= chr(0x80) and v < chr(0x80 + len(words)):
|
||||
v = words[ord(v) - 0x80]
|
||||
i += len(v.encode("utf-8"))
|
||||
dec.append(v)
|
||||
@ -449,36 +454,37 @@ def decompress(encoding_table, encoded, encoded_length_bits):
|
||||
def compress(encoding_table, decompressed, encoded_length_bits, len_translation_encoded):
|
||||
if not isinstance(decompressed, str):
|
||||
raise TypeError()
|
||||
qstrs = encoding_table.qstrs
|
||||
canonical = encoding_table.canonical
|
||||
extractor = encoding_table.extractor
|
||||
|
||||
enc = bytearray(len(decompressed) * 3)
|
||||
current_bit = 7
|
||||
current_byte = 0
|
||||
enc = 1
|
||||
|
||||
bits = encoded_length_bits + 1
|
||||
for i in range(bits - 1, 0, -1):
|
||||
if len_translation_encoded & (1 << (i - 1)):
|
||||
enc[current_byte] |= 1 << current_bit
|
||||
if current_bit == 0:
|
||||
current_bit = 7
|
||||
current_byte += 1
|
||||
else:
|
||||
current_bit -= 1
|
||||
def put_bit(enc, b):
|
||||
return (enc << 1) | bool(b)
|
||||
|
||||
def put_bits(enc, b, n):
|
||||
for i in range(n - 1, -1, -1):
|
||||
enc = put_bit(enc, b & (1 << i))
|
||||
return enc
|
||||
|
||||
enc = put_bits(enc, len_translation_encoded, encoded_length_bits)
|
||||
|
||||
for atom in extractor.iter(decompressed):
|
||||
for b in canonical[atom]:
|
||||
if b == "1":
|
||||
enc[current_byte] |= 1 << current_bit
|
||||
if current_bit == 0:
|
||||
current_bit = 7
|
||||
current_byte += 1
|
||||
else:
|
||||
current_bit -= 1
|
||||
if atom in qstrs:
|
||||
can = canonical["\1"]
|
||||
else:
|
||||
can = canonical[atom]
|
||||
for b in can:
|
||||
enc = put_bit(enc, b == "1")
|
||||
if atom in qstrs:
|
||||
enc = put_bits(enc, qstrs[atom], encoding_table.translation_qstr_bits)
|
||||
|
||||
if current_bit != 7:
|
||||
current_byte += 1
|
||||
return enc[:current_byte]
|
||||
while enc.bit_length() % 8 != 1:
|
||||
enc = put_bit(enc, 0)
|
||||
|
||||
r = enc.to_bytes((enc.bit_length() + 7) // 8, "big")
|
||||
return r[1:]
|
||||
|
||||
|
||||
def qstr_escape(qst):
|
||||
@ -493,10 +499,20 @@ def qstr_escape(qst):
|
||||
return re.sub(r"[^A-Za-z0-9_]", esc_char, qst)
|
||||
|
||||
|
||||
def parse_qstrs(infile):
    """Map each qstr's text to its position among the matched QDEF entries.

    Args:
        infile: an open text file object containing QDEF(name, number,
            number, "string") lines.

    Returns:
        A dict from the decoded string to the 0-based index of its match.
        When the same string is matched more than once, the last index wins.
    """
    r = {}
    # The string part of the pattern accepts either a run of characters that
    # are neither a quote nor a backslash, or a single backslash escape, but
    # not a mixture of the two. QDEF lines whose string mixes both are not
    # matched and therefore do not consume an index.
    # NOTE(review): the indices may therefore differ from the position of the
    # QDEF line in the file -- confirm what the consumer of these numbers
    # expects before changing the pattern.
    rx = re.compile(r'QDEF\([A-Za-z0-9_]+,\s*\d+,\s*\d+,\s*(?P<cstr>"(?:[^"\\\\]*|\\.)")\)')
    content = infile.read()
    # A compiled pattern's findall() takes (string, pos, endpos), not flags.
    # Passing re.M here used to act as pos=8 and skipped the first 8
    # characters of the input, so the flag argument is dropped (the pattern
    # has no ^ or $ anchors, so multi-line mode would not matter anyway).
    for i, literal in enumerate(rx.findall(content)):
        # NOTE(review): eval() turns the quoted literal into a Python str.
        # The input is expected to be a build-generated header; do not feed
        # untrusted files through this function.
        r[eval(literal)] = i
    return r
|
||||
|
||||
|
||||
def parse_input_headers(infiles):
|
||||
i18ns = set()
|
||||
|
||||
# read the qstrs in from the input files
|
||||
# read the TRANSLATE strings in from the input files
|
||||
for infile in infiles:
|
||||
with open(infile, "rt") as f:
|
||||
for line in f:
|
||||
@ -516,12 +532,12 @@ def escape_bytes(qstr):
|
||||
return qstr
|
||||
else:
|
||||
# qstr contains non-printable codes so render entire thing as hex pairs
|
||||
qbytes = bytes_cons(qstr, "utf8")
|
||||
qbytes = bytes(qstr, "utf8")
|
||||
return "".join(("\\x%02x" % b) for b in qbytes)
|
||||
|
||||
|
||||
def make_bytes(cfg_bytes_len, cfg_bytes_hash, qstr):
|
||||
qbytes = bytes_cons(qstr, "utf8")
|
||||
qbytes = bytes(qstr, "utf8")
|
||||
qlen = len(qbytes)
|
||||
qhash = compute_hash(qbytes, cfg_bytes_hash)
|
||||
if qlen >= (1 << (8 * cfg_bytes_len)):
|
||||
@ -551,7 +567,7 @@ def output_translation_data(encoding_table, i18ns, out):
|
||||
)
|
||||
total_text_compressed_size += len(compressed)
|
||||
decompressed = decompress(encoding_table, compressed, encoded_length_bits)
|
||||
assert decompressed == translation
|
||||
assert decompressed == translation, (decompressed, translation)
|
||||
for c in C_ESCAPES:
|
||||
decompressed = decompressed.replace(c, C_ESCAPES[c])
|
||||
formatted = ["{:d}".format(x) for x in compressed]
|
||||
@ -572,7 +588,7 @@ if __name__ == "__main__":
|
||||
import argparse
|
||||
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Process QSTR definitions into headers for compilation"
|
||||
description="Process TRANSLATE strings into headers for compilation"
|
||||
)
|
||||
parser.add_argument(
|
||||
"infiles", metavar="N", type=str, nargs="+", help="an integer for the accumulator"
|
||||
@ -590,13 +606,19 @@ if __name__ == "__main__":
|
||||
type=argparse.FileType("w", encoding="UTF-8"),
|
||||
help="c file for translation data",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--qstrdefs_filename",
|
||||
type=argparse.FileType("r", encoding="UTF-8"),
|
||||
help="",
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
qstrs = parse_qstrs(args.qstrdefs_filename)
|
||||
i18ns = parse_input_headers(args.infiles)
|
||||
i18ns = sorted(i18ns)
|
||||
translations = translate(args.translation, i18ns)
|
||||
encoding_table = compute_huffman_coding(
|
||||
args.translation, translations, args.compression_filename
|
||||
qstrs, args.translation, translations, args.compression_filename
|
||||
)
|
||||
output_translation_data(encoding_table, translations, args.translation_filename)
|
||||
|
2
py/py.mk
2
py/py.mk
@ -270,7 +270,7 @@ $(PY_BUILD)/translations-$(TRANSLATION).c: $(HEADER_BUILD)/compressed_translatio
|
||||
$(HEADER_BUILD)/compressed_translations.generated.h: $(PY_SRC)/maketranslationdata.py $(HEADER_BUILD)/$(TRANSLATION).mo $(HEADER_BUILD)/qstrdefs.generated.h
|
||||
$(STEPECHO) "GEN $@"
|
||||
$(Q)mkdir -p $(PY_BUILD)
|
||||
$(Q)$(PYTHON) $(PY_SRC)/maketranslationdata.py --compression_filename $(HEADER_BUILD)/compressed_translations.generated.h --translation $(HEADER_BUILD)/$(TRANSLATION).mo --translation_filename $(PY_BUILD)/translations-$(TRANSLATION).c $(HEADER_BUILD)/qstrdefs.preprocessed.h
|
||||
$(Q)$(PYTHON) $(PY_SRC)/maketranslationdata.py --compression_filename $(HEADER_BUILD)/compressed_translations.generated.h --translation $(HEADER_BUILD)/$(TRANSLATION).mo --translation_filename $(PY_BUILD)/translations-$(TRANSLATION).c --qstrdefs_filename $(HEADER_BUILD)/qstrdefs.generated.h $(HEADER_BUILD)/qstrdefs.preprocessed.h
|
||||
|
||||
PY_CORE_O += $(PY_BUILD)/translations-$(TRANSLATION).o
|
||||
|
||||
|
@ -25,6 +25,7 @@
|
||||
*/
|
||||
|
||||
#include "supervisor/shared/translate/translate.h"
|
||||
#include "py/qstr.h"
|
||||
|
||||
#include <stdbool.h>
|
||||
#include <stdint.h>
|
||||
@ -99,10 +100,34 @@ uint16_t decompress_length(const compressed_string_t *compressed) {
|
||||
#endif
|
||||
}
|
||||
|
||||
// Read cursor into a packed bit stream.
typedef struct {
    const uint8_t *ptr; // byte that the next bit is taken from
    uint8_t bit;        // one-bit mask selecting that bit within *ptr
} bitstream_state_t;

// Return the bit selected by st->bit in *st->ptr and advance the cursor by
// one position. The mask moves towards the least-significant bit; once it
// has shifted out of the byte it is reset to 0x80 and ptr moves to the next
// byte. No bounds checking is done: the caller must not request more bits
// than the buffer holds.
static bool next_bit(bitstream_state_t *st) {
    bool r = (*st->ptr & st->bit) != 0;
    st->bit >>= 1;
    if (!st->bit) {
        st->bit = 0x80;
        st->ptr++;
    }
    return r;
}

// Read the next n bits from the stream and return them as an integer, with
// the first bit read ending up as the most-significant bit of the result.
// A count of zero or less reads nothing and returns 0, so a bad count can
// no longer send the loop running past the end of the buffer.
static int get_nbits(bitstream_state_t *st, int n) {
    // Accumulate in an unsigned value so the left shift is always defined.
    unsigned int r = 0;
    while (n-- > 0) {
        r = (r << 1) | (unsigned int)next_bit(st);
    }
    return (int)r;
}
|
||||
|
||||
char *decompress(const compressed_string_t *compressed, char *decompressed) {
|
||||
uint8_t this_byte = compress_max_length_bits / 8;
|
||||
uint8_t this_bit = 7 - compress_max_length_bits % 8;
|
||||
uint8_t b = (&compressed->data)[this_byte] << (compress_max_length_bits % 8);
|
||||
bitstream_state_t b = {
|
||||
.ptr = &(compressed->data) + (compress_max_length_bits >> 3),
|
||||
.bit = 1 << (7 - ((compress_max_length_bits) & 0x7)),
|
||||
};
|
||||
uint16_t length = decompress_length(compressed);
|
||||
|
||||
// Stop one early because the last byte is always NULL.
|
||||
@ -112,26 +137,23 @@ char *decompress(const compressed_string_t *compressed, char *decompressed) {
|
||||
uint32_t max_code = lengths[0];
|
||||
uint32_t searched_length = lengths[0];
|
||||
while (true) {
|
||||
bits <<= 1;
|
||||
if ((0x80 & b) != 0) {
|
||||
bits |= 1;
|
||||
}
|
||||
b <<= 1;
|
||||
bits = (bits << 1) | next_bit(&b);
|
||||
bit_length += 1;
|
||||
if (this_bit == 0) {
|
||||
this_bit = 7;
|
||||
this_byte += 1;
|
||||
b = (&compressed->data)[this_byte]; // This may read past the end but its never used.
|
||||
} else {
|
||||
this_bit -= 1;
|
||||
}
|
||||
if (max_code > 0 && bits < max_code) {
|
||||
break;
|
||||
}
|
||||
max_code = (max_code << 1) + lengths[bit_length];
|
||||
searched_length += lengths[bit_length];
|
||||
}
|
||||
i += put_utf8(decompressed + i, values[searched_length + bits - max_code]);
|
||||
int v = values[searched_length + bits - max_code];
|
||||
if (v == 1) {
|
||||
qstr q = get_nbits(&b, translation_qstr_bits) + 1; // honestly no idea why "+1"...
|
||||
for (const char *qc = qstr_str(q); *qc;) {
|
||||
decompressed[i++] = *qc++;
|
||||
}
|
||||
} else {
|
||||
i += put_utf8(decompressed + i, v);
|
||||
}
|
||||
}
|
||||
|
||||
decompressed[length - 1] = '\0';
|
||||
|
Loading…
x
Reference in New Issue
Block a user