Merge pull request #8327 from jepler/translation-compression-qstr

Use qstrs to improve compression
This commit is contained in:
Scott Shawcroft 2023-09-05 13:12:13 -07:00 committed by GitHub
commit e0fa155037
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 159 additions and 100 deletions

View File

@ -1,7 +1,9 @@
"""
Process raw qstr file and output qstr data with length, hash and data bytes.
This script works with Python 2.7, 3.3 and 3.4.
This script is only regularly tested with the same version of Python used
during CI, typically the latest "3.x". However, incompatibilities with any
supported CPython version are unintended.
For documentation about the format of compressed translated strings, see
supervisor/shared/translate/translate.h
@ -16,31 +18,16 @@ import sys
import collections
import gettext
import os.path
import pathlib
if hasattr(sys.stdout, "reconfigure"):
sys.stdout.reconfigure(encoding="utf-8")
sys.stderr.reconfigure(errors="backslashreplace")
py = os.path.dirname(sys.argv[0])
top = os.path.dirname(py)
sys.path.append(os.path.join(top, "tools/huffman"))
sys.path.append(str(pathlib.Path(__file__).parent.parent / "tools/huffman"))
import huffman
# Python 2/3 compatibility:
# - iterating through bytes is different
# - codepoint2name lives in a different module
import platform
if platform.python_version_tuple()[0] == "2":
bytes_cons = lambda val, enc=None: bytearray(val)
from htmlentitydefs import codepoint2name
elif platform.python_version_tuple()[0] == "3":
bytes_cons = bytes
from html.entities import codepoint2name
# end compatibility code
from html.entities import codepoint2name
codepoint2name[ord("-")] = "hyphen"
@ -182,9 +169,15 @@ class EncodingTable:
extractor: object
apply_offset: object
remove_offset: object
translation_qstr_bits: int
qstrs: object
qstrs_inv: object
def compute_huffman_coding(translation_name, translations, f):
def compute_huffman_coding(qstrs, translation_name, translations, f):
# possible future improvement: some languages compress better when considering len(k) > 2. try both?
qstrs = dict((k, v) for k, v in qstrs.items() if len(k) > 3)
qstr_strs = list(qstrs.keys())
texts = [t[1] for t in translations]
words = []
@ -234,10 +227,12 @@ def compute_huffman_coding(translation_name, translations, f):
# if "the" is in words then not only will "the" not be considered
# again, neither will "there" or "wither", since they have "the"
# as substrings.
extractor = TextSplitter(words)
extractor = TextSplitter(words + qstr_strs)
counter = collections.Counter()
for t in texts:
for atom in extractor.iter(t):
if atom in qstrs:
atom = "\1"
counter[atom] += 1
cb = huffman.codebook(counter.items())
lengths = sorted(dict((v, len(cb[k])) for k, v in counter.items()).items())
@ -304,10 +299,14 @@ def compute_huffman_coding(translation_name, translations, f):
words.append(word)
words.sort(key=len)
extractor = TextSplitter(words)
extractor = TextSplitter(words + qstr_strs)
counter = collections.Counter()
used_qstr = 0
for t in texts:
for atom in extractor.iter(t):
if atom in qstrs:
used_qstr = max(used_qstr, qstrs[atom])
atom = "\1"
counter[atom] += 1
cb = huffman.codebook(counter.items())
@ -322,6 +321,8 @@ def compute_huffman_coding(translation_name, translations, f):
last_length = None
canonical = {}
for atom, code in sorted(cb.items(), key=lambda x: (len(x[1]), x[0])):
if atom in qstr_strs:
atom = "\1"
values.append(atom)
length = len(code)
if length not in length_count:
@ -359,6 +360,8 @@ def compute_huffman_coding(translation_name, translations, f):
minlen = len(words[0])
wlencount = [len([None for w in words if len(w) == l]) for l in range(minlen, maxlen + 1)]
translation_qstr_bits = used_qstr.bit_length()
f.write("typedef {} mchar_t;\n".format(values_type))
f.write("const uint8_t lengths[] = {{ {} }};\n".format(", ".join(map(str, lengths))))
f.write(
@ -383,34 +386,44 @@ def compute_huffman_coding(translation_name, translations, f):
f.write("#define maxlen {}\n".format(maxlen))
f.write("#define translation_offstart {}\n".format(offstart))
f.write("#define translation_offset {}\n".format(offset))
f.write("#define translation_qstr_bits {}\n".format(translation_qstr_bits))
return EncodingTable(values, lengths, words, canonical, extractor, apply_offset, remove_offset)
qstrs_inv = dict((v, k) for k, v in qstrs.items())
return EncodingTable(
values,
lengths,
words,
canonical,
extractor,
apply_offset,
remove_offset,
translation_qstr_bits,
qstrs,
qstrs_inv,
)
def decompress(encoding_table, encoded, encoded_length_bits):
qstrs_inv = encoding_table.qstrs_inv
values = encoding_table.values
lengths = encoding_table.lengths
words = encoding_table.words
dec = []
this_byte = 0
this_bit = 7
b = encoded[this_byte]
bits = 0
for i in range(encoded_length_bits):
bits <<= 1
if 0x80 & b:
bits |= 1
def bititer():
for byte in encoded:
for bit in (0x80, 0x40, 0x20, 0x10, 0x8, 0x4, 0x2, 0x1):
yield bool(byte & bit)
b <<= 1
if this_bit == 0:
this_bit = 7
this_byte += 1
if this_byte < len(encoded):
b = encoded[this_byte]
else:
this_bit -= 1
length = bits
nextbit = bititer().__next__
def getnbits(n):
bits = 0
for i in range(n):
bits = (bits << 1) | nextbit()
return bits
dec = []
length = getnbits(encoded_length_bits)
i = 0
while i < length:
@ -419,19 +432,8 @@ def decompress(encoding_table, encoded, encoded_length_bits):
max_code = lengths[0]
searched_length = lengths[0]
while True:
bits <<= 1
if 0x80 & b:
bits |= 1
b <<= 1
bits = (bits << 1) | nextbit()
bit_length += 1
if this_bit == 0:
this_bit = 7
this_byte += 1
if this_byte < len(encoded):
b = encoded[this_byte]
else:
this_bit -= 1
if max_code > 0 and bits < max_code:
# print('{0:0{width}b}'.format(bits, width=bit_length))
break
@ -439,7 +441,10 @@ def decompress(encoding_table, encoded, encoded_length_bits):
searched_length += lengths[bit_length]
v = values[searched_length + bits - max_code]
if v >= chr(0x80) and v < chr(0x80 + len(words)):
if v == chr(1):
qstr_idx = getnbits(encoding_table.translation_qstr_bits)
v = qstrs_inv[qstr_idx]
elif v >= chr(0x80) and v < chr(0x80 + len(words)):
v = words[ord(v) - 0x80]
i += len(v.encode("utf-8"))
dec.append(v)
@ -449,36 +454,37 @@ def decompress(encoding_table, encoded, encoded_length_bits):
def compress(encoding_table, decompressed, encoded_length_bits, len_translation_encoded):
if not isinstance(decompressed, str):
raise TypeError()
qstrs = encoding_table.qstrs
canonical = encoding_table.canonical
extractor = encoding_table.extractor
enc = bytearray(len(decompressed) * 3)
current_bit = 7
current_byte = 0
enc = 1
bits = encoded_length_bits + 1
for i in range(bits - 1, 0, -1):
if len_translation_encoded & (1 << (i - 1)):
enc[current_byte] |= 1 << current_bit
if current_bit == 0:
current_bit = 7
current_byte += 1
else:
current_bit -= 1
def put_bit(enc, b):
return (enc << 1) | bool(b)
def put_bits(enc, b, n):
for i in range(n - 1, -1, -1):
enc = put_bit(enc, b & (1 << i))
return enc
enc = put_bits(enc, len_translation_encoded, encoded_length_bits)
for atom in extractor.iter(decompressed):
for b in canonical[atom]:
if b == "1":
enc[current_byte] |= 1 << current_bit
if current_bit == 0:
current_bit = 7
current_byte += 1
else:
current_bit -= 1
if atom in qstrs:
can = canonical["\1"]
else:
can = canonical[atom]
for b in can:
enc = put_bit(enc, b == "1")
if atom in qstrs:
enc = put_bits(enc, qstrs[atom], encoding_table.translation_qstr_bits)
if current_bit != 7:
current_byte += 1
return enc[:current_byte]
while enc.bit_length() % 8 != 1:
enc = put_bit(enc, 0)
r = enc.to_bytes((enc.bit_length() + 7) // 8, "big")
return r[1:]
def qstr_escape(qst):
@ -493,10 +499,20 @@ def qstr_escape(qst):
return re.sub(r"[^A-Za-z0-9_]", esc_char, qst)
def parse_qstrs(infile):
    """Map each qstr's text to its position among the matched QDEF entries.

    ``infile`` is an open text file object; its whole content is read.
    Every ``QDEF(NAME, <digits>, <digits>, "<string>")`` entry whose string
    literal is matched by the pattern below is evaluated to a Python string
    and stored as ``{string: index}``, where ``index`` counts matched
    entries only, starting at 0. If the same string occurs more than once,
    the last index wins.

    The pattern accepts a string literal that is either a run of characters
    containing no quote or backslash, or exactly one backslash escape pair.
    Entries with any other literal are not matched and do not consume an
    index. That matching rule is intentionally left unchanged, because the
    resulting numbering is what the generated tables are built from.
    """
    rx = re.compile(r'QDEF\([A-Za-z0-9_]+,\s*\d+,\s*\d+,\s*(?P<cstr>"(?:[^"\\\\]*|\\.)")\)')
    content = infile.read()
    r = {}
    # Pattern.findall() takes (string, pos, endpos). Passing re.M here was
    # interpreted as pos=8 and silently skipped the first 8 characters of the
    # input. The pattern has no ^/$ anchors, so no flag is needed at all.
    for i, mat in enumerate(rx.findall(content)):
        # NOTE(review): eval() of text taken from the input file. The pattern
        # restricts it to a simple double-quoted literal, but this should only
        # ever be run on build-generated headers.
        r[eval(mat)] = i
    return r
def parse_input_headers(infiles):
i18ns = set()
# read the qstrs in from the input files
# read the TRANSLATE strings in from the input files
for infile in infiles:
with open(infile, "rt") as f:
for line in f:
@ -516,12 +532,12 @@ def escape_bytes(qstr):
return qstr
else:
# qstr contains non-printable codes so render entire thing as hex pairs
qbytes = bytes_cons(qstr, "utf8")
qbytes = bytes(qstr, "utf8")
return "".join(("\\x%02x" % b) for b in qbytes)
def make_bytes(cfg_bytes_len, cfg_bytes_hash, qstr):
qbytes = bytes_cons(qstr, "utf8")
qbytes = bytes(qstr, "utf8")
qlen = len(qbytes)
qhash = compute_hash(qbytes, cfg_bytes_hash)
if qlen >= (1 << (8 * cfg_bytes_len)):
@ -551,7 +567,7 @@ def output_translation_data(encoding_table, i18ns, out):
)
total_text_compressed_size += len(compressed)
decompressed = decompress(encoding_table, compressed, encoded_length_bits)
assert decompressed == translation
assert decompressed == translation, (decompressed, translation)
for c in C_ESCAPES:
decompressed = decompressed.replace(c, C_ESCAPES[c])
formatted = ["{:d}".format(x) for x in compressed]
@ -572,7 +588,7 @@ if __name__ == "__main__":
import argparse
parser = argparse.ArgumentParser(
description="Process QSTR definitions into headers for compilation"
description="Process TRANSLATE strings into headers for compilation"
)
parser.add_argument(
"infiles", metavar="N", type=str, nargs="+", help="an integer for the accumulator"
@ -590,13 +606,19 @@ if __name__ == "__main__":
type=argparse.FileType("w", encoding="UTF-8"),
help="c file for translation data",
)
parser.add_argument(
"--qstrdefs_filename",
type=argparse.FileType("r", encoding="UTF-8"),
help="",
)
args = parser.parse_args()
qstrs = parse_qstrs(args.qstrdefs_filename)
i18ns = parse_input_headers(args.infiles)
i18ns = sorted(i18ns)
translations = translate(args.translation, i18ns)
encoding_table = compute_huffman_coding(
args.translation, translations, args.compression_filename
qstrs, args.translation, translations, args.compression_filename
)
output_translation_data(encoding_table, translations, args.translation_filename)

View File

@ -270,7 +270,7 @@ $(PY_BUILD)/translations-$(TRANSLATION).c: $(HEADER_BUILD)/compressed_translatio
$(HEADER_BUILD)/compressed_translations.generated.h: $(PY_SRC)/maketranslationdata.py $(HEADER_BUILD)/$(TRANSLATION).mo $(HEADER_BUILD)/qstrdefs.generated.h
$(STEPECHO) "GEN $@"
$(Q)mkdir -p $(PY_BUILD)
$(Q)$(PYTHON) $(PY_SRC)/maketranslationdata.py --compression_filename $(HEADER_BUILD)/compressed_translations.generated.h --translation $(HEADER_BUILD)/$(TRANSLATION).mo --translation_filename $(PY_BUILD)/translations-$(TRANSLATION).c $(HEADER_BUILD)/qstrdefs.preprocessed.h
$(Q)$(PYTHON) $(PY_SRC)/maketranslationdata.py --compression_filename $(HEADER_BUILD)/compressed_translations.generated.h --translation $(HEADER_BUILD)/$(TRANSLATION).mo --translation_filename $(PY_BUILD)/translations-$(TRANSLATION).c --qstrdefs_filename $(HEADER_BUILD)/qstrdefs.generated.h $(HEADER_BUILD)/qstrdefs.preprocessed.h
PY_CORE_O += $(PY_BUILD)/translations-$(TRANSLATION).o

View File

@ -53,6 +53,13 @@
// speaking, words. They're just spans of code points that frequently
// occur together. They are ordered shortest to longest.
//
// - If the translation uses a lot of code points or widely spaced code points,
// then the huffman table entries are UTF-16 code points. But if the translation
// uses only ASCII 7-bit code points plus a SMALL range of higher code points that
// still fit in 8 bits, translation_offset and translation_offstart are used to
// renumber the code points so that they still fit within 8 bits. (it's very beneficial
// for mchar_t to be 8 bits instead of 16!)
//
// - dictionary entries are non-overlapping, and the _ending_ index of each
// entry is stored in an array. A count of words of each length, from
// minlen to maxlen, is given in the array called wlencount. From
@ -60,6 +67,14 @@
// calculated by an efficient, small loop. (A bit of time is traded
// to reduce the size of this table indicating lengths)
//
// - Value 1 ('\1') is used to indicate that a QSTR number follows. The
// QSTR is encoded as a fixed number of bits (translation_qstr_bits), e.g.,
// 10 bits if the highest core qstr is from 512 to 1023 inclusive.
// (maketranslationdata uses a simple heuristic where any qstr more than 3
// characters long is encoded in this way; this is simple but probably not
// optimal. In fact, a threshold of more than 2 characters is better for SOME
// languages on SOME boards.)
//
// The "data" / "tail" construct is so that the struct's last member is a
// "flexible array". However, the _only_ member is not permitted to be
// a flexible member, so we have to declare the first byte as a separate

View File

@ -25,6 +25,7 @@
*/
#include "supervisor/shared/translate/translate.h"
#include "py/qstr.h"
#include <stdbool.h>
#include <stdint.h>
@ -99,10 +100,34 @@ uint16_t decompress_length(const compressed_string_t *compressed) {
#endif
}
// Cursor into a byte buffer that is consumed one bit at a time, most
// significant bit first.
typedef struct {
    const uint8_t *ptr; // byte that holds the next bit to be read
    uint8_t bit;        // single-bit mask selecting that bit within *ptr
} bitstream_state_t;

// Return the bit currently selected by the cursor and advance the cursor by
// one bit. After the lowest bit of a byte has been consumed, the cursor moves
// to the top bit (0x80) of the following byte.
static bool next_bit(bitstream_state_t *st) {
    const uint8_t mask = st->bit;
    const bool is_set = (*st->ptr & mask) != 0;
    const uint8_t shifted = (uint8_t)(mask >> 1);

    if (shifted != 0) {
        st->bit = shifted;
    } else {
        // Mask ran off the low end of the byte: continue with the next byte.
        st->bit = 0x80;
        st->ptr++;
    }
    return is_set;
}

// Read n bits from the stream and return them as an integer; the first bit
// read becomes the most significant bit of the result. n == 0 reads nothing
// and returns 0.
static int get_nbits(bitstream_state_t *st, int n) {
    int acc = 0;
    for (; n != 0; n--) {
        acc = (acc << 1) | (next_bit(st) ? 1 : 0);
    }
    return acc;
}
char *decompress(const compressed_string_t *compressed, char *decompressed) {
uint8_t this_byte = compress_max_length_bits / 8;
uint8_t this_bit = 7 - compress_max_length_bits % 8;
uint8_t b = (&compressed->data)[this_byte] << (compress_max_length_bits % 8);
bitstream_state_t b = {
.ptr = &(compressed->data) + (compress_max_length_bits >> 3),
.bit = 1 << (7 - ((compress_max_length_bits) & 0x7)),
};
uint16_t length = decompress_length(compressed);
// Stop one early because the last byte is always NULL.
@ -112,26 +137,23 @@ char *decompress(const compressed_string_t *compressed, char *decompressed) {
uint32_t max_code = lengths[0];
uint32_t searched_length = lengths[0];
while (true) {
bits <<= 1;
if ((0x80 & b) != 0) {
bits |= 1;
}
b <<= 1;
bits = (bits << 1) | next_bit(&b);
bit_length += 1;
if (this_bit == 0) {
this_bit = 7;
this_byte += 1;
b = (&compressed->data)[this_byte]; // This may read past the end but its never used.
} else {
this_bit -= 1;
}
if (max_code > 0 && bits < max_code) {
break;
}
max_code = (max_code << 1) + lengths[bit_length];
searched_length += lengths[bit_length];
}
i += put_utf8(decompressed + i, values[searched_length + bits - max_code]);
int v = values[searched_length + bits - max_code];
if (v == 1) {
qstr q = get_nbits(&b, translation_qstr_bits) + 1; // honestly no idea why "+1"...
for (const char *qc = qstr_str(q); *qc;) {
decompressed[i++] = *qc++;
}
} else {
i += put_utf8(decompressed + i, v);
}
}
decompressed[length - 1] = '\0';