Merge pull request #8327 from jepler/translation-compression-qstr

Use qstrs to improve compression
This commit is contained in:
Scott Shawcroft 2023-09-05 13:12:13 -07:00 committed by GitHub
commit e0fa155037
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 159 additions and 100 deletions

View File

@ -1,7 +1,9 @@
""" """
Process raw qstr file and output qstr data with length, hash and data bytes. Process raw qstr file and output qstr data with length, hash and data bytes.
This script works with Python 2.7, 3.3 and 3.4. This script is only regularly tested with the same version of Python used
during CI, typically the latest "3.x". However, incompatibilities with any
supported CPython version are unintended.
For documentation about the format of compressed translated strings, see For documentation about the format of compressed translated strings, see
supervisor/shared/translate/translate.h supervisor/shared/translate/translate.h
@ -16,31 +18,16 @@ import sys
import collections import collections
import gettext import gettext
import os.path import pathlib
if hasattr(sys.stdout, "reconfigure"): if hasattr(sys.stdout, "reconfigure"):
sys.stdout.reconfigure(encoding="utf-8") sys.stdout.reconfigure(encoding="utf-8")
sys.stderr.reconfigure(errors="backslashreplace") sys.stderr.reconfigure(errors="backslashreplace")
py = os.path.dirname(sys.argv[0]) sys.path.append(str(pathlib.Path(__file__).parent.parent / "tools/huffman"))
top = os.path.dirname(py)
sys.path.append(os.path.join(top, "tools/huffman"))
import huffman import huffman
# Python 2/3 compatibility:
# - iterating through bytes is different
# - codepoint2name lives in a different module
import platform
if platform.python_version_tuple()[0] == "2":
bytes_cons = lambda val, enc=None: bytearray(val)
from htmlentitydefs import codepoint2name
elif platform.python_version_tuple()[0] == "3":
bytes_cons = bytes
from html.entities import codepoint2name from html.entities import codepoint2name
# end compatibility code
codepoint2name[ord("-")] = "hyphen" codepoint2name[ord("-")] = "hyphen"
@ -182,9 +169,15 @@ class EncodingTable:
extractor: object extractor: object
apply_offset: object apply_offset: object
remove_offset: object remove_offset: object
translation_qstr_bits: int
qstrs: object
qstrs_inv: object
def compute_huffman_coding(translation_name, translations, f): def compute_huffman_coding(qstrs, translation_name, translations, f):
# possible future improvement: some languages are better when consider len(k) > 2. try both?
qstrs = dict((k, v) for k, v in qstrs.items() if len(k) > 3)
qstr_strs = list(qstrs.keys())
texts = [t[1] for t in translations] texts = [t[1] for t in translations]
words = [] words = []
@ -234,10 +227,12 @@ def compute_huffman_coding(translation_name, translations, f):
# if "the" is in words then not only will "the" not be considered # if "the" is in words then not only will "the" not be considered
# again, neither will "there" or "wither", since they have "the" # again, neither will "there" or "wither", since they have "the"
# as substrings. # as substrings.
extractor = TextSplitter(words) extractor = TextSplitter(words + qstr_strs)
counter = collections.Counter() counter = collections.Counter()
for t in texts: for t in texts:
for atom in extractor.iter(t): for atom in extractor.iter(t):
if atom in qstrs:
atom = "\1"
counter[atom] += 1 counter[atom] += 1
cb = huffman.codebook(counter.items()) cb = huffman.codebook(counter.items())
lengths = sorted(dict((v, len(cb[k])) for k, v in counter.items()).items()) lengths = sorted(dict((v, len(cb[k])) for k, v in counter.items()).items())
@ -304,10 +299,14 @@ def compute_huffman_coding(translation_name, translations, f):
words.append(word) words.append(word)
words.sort(key=len) words.sort(key=len)
extractor = TextSplitter(words) extractor = TextSplitter(words + qstr_strs)
counter = collections.Counter() counter = collections.Counter()
used_qstr = 0
for t in texts: for t in texts:
for atom in extractor.iter(t): for atom in extractor.iter(t):
if atom in qstrs:
used_qstr = max(used_qstr, qstrs[atom])
atom = "\1"
counter[atom] += 1 counter[atom] += 1
cb = huffman.codebook(counter.items()) cb = huffman.codebook(counter.items())
@ -322,6 +321,8 @@ def compute_huffman_coding(translation_name, translations, f):
last_length = None last_length = None
canonical = {} canonical = {}
for atom, code in sorted(cb.items(), key=lambda x: (len(x[1]), x[0])): for atom, code in sorted(cb.items(), key=lambda x: (len(x[1]), x[0])):
if atom in qstr_strs:
atom = "\1"
values.append(atom) values.append(atom)
length = len(code) length = len(code)
if length not in length_count: if length not in length_count:
@ -359,6 +360,8 @@ def compute_huffman_coding(translation_name, translations, f):
minlen = len(words[0]) minlen = len(words[0])
wlencount = [len([None for w in words if len(w) == l]) for l in range(minlen, maxlen + 1)] wlencount = [len([None for w in words if len(w) == l]) for l in range(minlen, maxlen + 1)]
translation_qstr_bits = used_qstr.bit_length()
f.write("typedef {} mchar_t;\n".format(values_type)) f.write("typedef {} mchar_t;\n".format(values_type))
f.write("const uint8_t lengths[] = {{ {} }};\n".format(", ".join(map(str, lengths)))) f.write("const uint8_t lengths[] = {{ {} }};\n".format(", ".join(map(str, lengths))))
f.write( f.write(
@ -383,34 +386,44 @@ def compute_huffman_coding(translation_name, translations, f):
f.write("#define maxlen {}\n".format(maxlen)) f.write("#define maxlen {}\n".format(maxlen))
f.write("#define translation_offstart {}\n".format(offstart)) f.write("#define translation_offstart {}\n".format(offstart))
f.write("#define translation_offset {}\n".format(offset)) f.write("#define translation_offset {}\n".format(offset))
f.write("#define translation_qstr_bits {}\n".format(translation_qstr_bits))
return EncodingTable(values, lengths, words, canonical, extractor, apply_offset, remove_offset) qstrs_inv = dict((v, k) for k, v in qstrs.items())
return EncodingTable(
values,
lengths,
words,
canonical,
extractor,
apply_offset,
remove_offset,
translation_qstr_bits,
qstrs,
qstrs_inv,
)
def decompress(encoding_table, encoded, encoded_length_bits): def decompress(encoding_table, encoded, encoded_length_bits):
qstrs_inv = encoding_table.qstrs_inv
values = encoding_table.values values = encoding_table.values
lengths = encoding_table.lengths lengths = encoding_table.lengths
words = encoding_table.words words = encoding_table.words
dec = [] def bititer():
this_byte = 0 for byte in encoded:
this_bit = 7 for bit in (0x80, 0x40, 0x20, 0x10, 0x8, 0x4, 0x2, 0x1):
b = encoded[this_byte] yield bool(byte & bit)
bits = 0
for i in range(encoded_length_bits):
bits <<= 1
if 0x80 & b:
bits |= 1
b <<= 1 nextbit = bititer().__next__
if this_bit == 0:
this_bit = 7 def getnbits(n):
this_byte += 1 bits = 0
if this_byte < len(encoded): for i in range(n):
b = encoded[this_byte] bits = (bits << 1) | nextbit()
else: return bits
this_bit -= 1
length = bits dec = []
length = getnbits(encoded_length_bits)
i = 0 i = 0
while i < length: while i < length:
@ -419,19 +432,8 @@ def decompress(encoding_table, encoded, encoded_length_bits):
max_code = lengths[0] max_code = lengths[0]
searched_length = lengths[0] searched_length = lengths[0]
while True: while True:
bits <<= 1 bits = (bits << 1) | nextbit()
if 0x80 & b:
bits |= 1
b <<= 1
bit_length += 1 bit_length += 1
if this_bit == 0:
this_bit = 7
this_byte += 1
if this_byte < len(encoded):
b = encoded[this_byte]
else:
this_bit -= 1
if max_code > 0 and bits < max_code: if max_code > 0 and bits < max_code:
# print('{0:0{width}b}'.format(bits, width=bit_length)) # print('{0:0{width}b}'.format(bits, width=bit_length))
break break
@ -439,7 +441,10 @@ def decompress(encoding_table, encoded, encoded_length_bits):
searched_length += lengths[bit_length] searched_length += lengths[bit_length]
v = values[searched_length + bits - max_code] v = values[searched_length + bits - max_code]
if v >= chr(0x80) and v < chr(0x80 + len(words)): if v == chr(1):
qstr_idx = getnbits(encoding_table.translation_qstr_bits)
v = qstrs_inv[qstr_idx]
elif v >= chr(0x80) and v < chr(0x80 + len(words)):
v = words[ord(v) - 0x80] v = words[ord(v) - 0x80]
i += len(v.encode("utf-8")) i += len(v.encode("utf-8"))
dec.append(v) dec.append(v)
@ -449,36 +454,37 @@ def decompress(encoding_table, encoded, encoded_length_bits):
def compress(encoding_table, decompressed, encoded_length_bits, len_translation_encoded): def compress(encoding_table, decompressed, encoded_length_bits, len_translation_encoded):
if not isinstance(decompressed, str): if not isinstance(decompressed, str):
raise TypeError() raise TypeError()
qstrs = encoding_table.qstrs
canonical = encoding_table.canonical canonical = encoding_table.canonical
extractor = encoding_table.extractor extractor = encoding_table.extractor
enc = bytearray(len(decompressed) * 3) enc = 1
current_bit = 7
current_byte = 0
bits = encoded_length_bits + 1 def put_bit(enc, b):
for i in range(bits - 1, 0, -1): return (enc << 1) | bool(b)
if len_translation_encoded & (1 << (i - 1)):
enc[current_byte] |= 1 << current_bit def put_bits(enc, b, n):
if current_bit == 0: for i in range(n - 1, -1, -1):
current_bit = 7 enc = put_bit(enc, b & (1 << i))
current_byte += 1 return enc
else:
current_bit -= 1 enc = put_bits(enc, len_translation_encoded, encoded_length_bits)
for atom in extractor.iter(decompressed): for atom in extractor.iter(decompressed):
for b in canonical[atom]: if atom in qstrs:
if b == "1": can = canonical["\1"]
enc[current_byte] |= 1 << current_bit
if current_bit == 0:
current_bit = 7
current_byte += 1
else: else:
current_bit -= 1 can = canonical[atom]
for b in can:
enc = put_bit(enc, b == "1")
if atom in qstrs:
enc = put_bits(enc, qstrs[atom], encoding_table.translation_qstr_bits)
if current_bit != 7: while enc.bit_length() % 8 != 1:
current_byte += 1 enc = put_bit(enc, 0)
return enc[:current_byte]
r = enc.to_bytes((enc.bit_length() + 7) // 8, "big")
return r[1:]
def qstr_escape(qst): def qstr_escape(qst):
@ -493,10 +499,20 @@ def qstr_escape(qst):
return re.sub(r"[^A-Za-z0-9_]", esc_char, qst) return re.sub(r"[^A-Za-z0-9_]", esc_char, qst)
def parse_qstrs(infile):
    """Map each qstr's text to its position among the QDEF entries of ``infile``.

    ``infile`` is an open text file containing lines of the form
    ``QDEF(NAME, <int>, <int>, "text")``. The returned dict maps the decoded
    ``text`` to the zero-based position of its QDEF among all matched entries.
    If the same text appears more than once, the last position wins.

    ``infile`` may be ``None`` (the command-line option is optional); in that
    case no qstrs are available and an empty dict is returned.
    """
    if infile is None:
        return {}

    # NOTE(review): the string part of this pattern accepts either a run of
    # plain characters or one single escape sequence, not a mixture. Entries
    # that do not match are skipped and do not consume an index -- confirm this
    # is intended.
    rx = re.compile(r'QDEF\([A-Za-z0-9_]+,\s*\d+,\s*\d+,\s*(?P<cstr>"(?:[^"\\\\]*|\\.)")\)')
    content = infile.read()

    r = {}
    # The second positional argument of a compiled pattern's findall() is a
    # start position, not a flags value, so no flag is passed here: the whole
    # file must be searched from its first character.
    for i, mat in enumerate(rx.findall(content)):
        # `mat` is a double-quoted literal as constrained by the pattern above;
        # eval() decodes its escapes. This assumes the input is a trusted,
        # build-generated header.
        r[eval(mat)] = i
    return r
def parse_input_headers(infiles): def parse_input_headers(infiles):
i18ns = set() i18ns = set()
# read the qstrs in from the input files # read the TRANSLATE strings in from the input files
for infile in infiles: for infile in infiles:
with open(infile, "rt") as f: with open(infile, "rt") as f:
for line in f: for line in f:
@ -516,12 +532,12 @@ def escape_bytes(qstr):
return qstr return qstr
else: else:
# qstr contains non-printable codes so render entire thing as hex pairs # qstr contains non-printable codes so render entire thing as hex pairs
qbytes = bytes_cons(qstr, "utf8") qbytes = bytes(qstr, "utf8")
return "".join(("\\x%02x" % b) for b in qbytes) return "".join(("\\x%02x" % b) for b in qbytes)
def make_bytes(cfg_bytes_len, cfg_bytes_hash, qstr): def make_bytes(cfg_bytes_len, cfg_bytes_hash, qstr):
qbytes = bytes_cons(qstr, "utf8") qbytes = bytes(qstr, "utf8")
qlen = len(qbytes) qlen = len(qbytes)
qhash = compute_hash(qbytes, cfg_bytes_hash) qhash = compute_hash(qbytes, cfg_bytes_hash)
if qlen >= (1 << (8 * cfg_bytes_len)): if qlen >= (1 << (8 * cfg_bytes_len)):
@ -551,7 +567,7 @@ def output_translation_data(encoding_table, i18ns, out):
) )
total_text_compressed_size += len(compressed) total_text_compressed_size += len(compressed)
decompressed = decompress(encoding_table, compressed, encoded_length_bits) decompressed = decompress(encoding_table, compressed, encoded_length_bits)
assert decompressed == translation assert decompressed == translation, (decompressed, translation)
for c in C_ESCAPES: for c in C_ESCAPES:
decompressed = decompressed.replace(c, C_ESCAPES[c]) decompressed = decompressed.replace(c, C_ESCAPES[c])
formatted = ["{:d}".format(x) for x in compressed] formatted = ["{:d}".format(x) for x in compressed]
@ -572,7 +588,7 @@ if __name__ == "__main__":
import argparse import argparse
parser = argparse.ArgumentParser( parser = argparse.ArgumentParser(
description="Process QSTR definitions into headers for compilation" description="Process TRANSLATE strings into headers for compilation"
) )
parser.add_argument( parser.add_argument(
"infiles", metavar="N", type=str, nargs="+", help="an integer for the accumulator" "infiles", metavar="N", type=str, nargs="+", help="an integer for the accumulator"
@ -590,13 +606,19 @@ if __name__ == "__main__":
type=argparse.FileType("w", encoding="UTF-8"), type=argparse.FileType("w", encoding="UTF-8"),
help="c file for translation data", help="c file for translation data",
) )
parser.add_argument(
"--qstrdefs_filename",
type=argparse.FileType("r", encoding="UTF-8"),
help="",
)
args = parser.parse_args() args = parser.parse_args()
qstrs = parse_qstrs(args.qstrdefs_filename)
i18ns = parse_input_headers(args.infiles) i18ns = parse_input_headers(args.infiles)
i18ns = sorted(i18ns) i18ns = sorted(i18ns)
translations = translate(args.translation, i18ns) translations = translate(args.translation, i18ns)
encoding_table = compute_huffman_coding( encoding_table = compute_huffman_coding(
args.translation, translations, args.compression_filename qstrs, args.translation, translations, args.compression_filename
) )
output_translation_data(encoding_table, translations, args.translation_filename) output_translation_data(encoding_table, translations, args.translation_filename)

View File

@ -270,7 +270,7 @@ $(PY_BUILD)/translations-$(TRANSLATION).c: $(HEADER_BUILD)/compressed_translatio
$(HEADER_BUILD)/compressed_translations.generated.h: $(PY_SRC)/maketranslationdata.py $(HEADER_BUILD)/$(TRANSLATION).mo $(HEADER_BUILD)/qstrdefs.generated.h $(HEADER_BUILD)/compressed_translations.generated.h: $(PY_SRC)/maketranslationdata.py $(HEADER_BUILD)/$(TRANSLATION).mo $(HEADER_BUILD)/qstrdefs.generated.h
$(STEPECHO) "GEN $@" $(STEPECHO) "GEN $@"
$(Q)mkdir -p $(PY_BUILD) $(Q)mkdir -p $(PY_BUILD)
$(Q)$(PYTHON) $(PY_SRC)/maketranslationdata.py --compression_filename $(HEADER_BUILD)/compressed_translations.generated.h --translation $(HEADER_BUILD)/$(TRANSLATION).mo --translation_filename $(PY_BUILD)/translations-$(TRANSLATION).c $(HEADER_BUILD)/qstrdefs.preprocessed.h $(Q)$(PYTHON) $(PY_SRC)/maketranslationdata.py --compression_filename $(HEADER_BUILD)/compressed_translations.generated.h --translation $(HEADER_BUILD)/$(TRANSLATION).mo --translation_filename $(PY_BUILD)/translations-$(TRANSLATION).c --qstrdefs_filename $(HEADER_BUILD)/qstrdefs.generated.h $(HEADER_BUILD)/qstrdefs.preprocessed.h
PY_CORE_O += $(PY_BUILD)/translations-$(TRANSLATION).o PY_CORE_O += $(PY_BUILD)/translations-$(TRANSLATION).o

View File

@ -53,6 +53,13 @@
// speaking, words. They're just spans of code points that frequently // speaking, words. They're just spans of code points that frequently
// occur together. They are ordered shortest to longest. // occur together. They are ordered shortest to longest.
// //
// - If the translation uses a lot of code points or widely spaced code points,
// then the huffman table entries are UTF-16 code points. But if the translation
// uses only ASCII 7-bit code points plus a SMALL range of higher code points that
// still fit in 8 bits, translation_offset and translation_offstart are used to
// renumber the code points so that they still fit within 8 bits. (it's very beneficial
// for mchar_t to be 8 bits instead of 16!)
//
// - dictionary entries are non-overlapping, and the _ending_ index of each // - dictionary entries are non-overlapping, and the _ending_ index of each
// entry is stored in an array. A count of words of each length, from // entry is stored in an array. A count of words of each length, from
// minlen to maxlen, is given in the array called wlencount. From // minlen to maxlen, is given in the array called wlencount. From
@ -60,6 +67,14 @@
// calculated by an efficient, small loop. (A bit of time is traded // calculated by an efficient, small loop. (A bit of time is traded
// to reduce the size of this table indicating lengths) // to reduce the size of this table indicating lengths)
// //
// - Value 1 ('\1') is used to indicate that a QSTR number follows. The
// QSTR is encoded as a fixed number of bits (translation_qstr_bits), e.g.,
// 10 bits if the highest qstr number used is from 512 to 1023 inclusive.
// (maketranslationdata uses a simple heuristic where any qstr more than 3
// characters long is encoded in this way; this is simple but probably not
// optimal. In fact, the rule of more than 2 characters is better for SOME
// languages on SOME boards.)
//
// The "data" / "tail" construct is so that the struct's last member is a // The "data" / "tail" construct is so that the struct's last member is a
// "flexible array". However, the _only_ member is not permitted to be // "flexible array". However, the _only_ member is not permitted to be
// a flexible member, so we have to declare the first byte as a separate // a flexible member, so we have to declare the first byte as a separate

View File

@ -25,6 +25,7 @@
*/ */
#include "supervisor/shared/translate/translate.h" #include "supervisor/shared/translate/translate.h"
#include "py/qstr.h"
#include <stdbool.h> #include <stdbool.h>
#include <stdint.h> #include <stdint.h>
@ -99,10 +100,34 @@ uint16_t decompress_length(const compressed_string_t *compressed) {
#endif #endif
} }
// Read cursor over a packed bit sequence, consumed most-significant bit first.
typedef struct {
    const uint8_t *ptr; // byte the cursor is currently positioned in
    uint8_t bit;        // mask selecting the next bit to read within *ptr
} bitstream_state_t;

// Return the bit under the cursor, then advance the cursor by one bit.
// When the mask runs off the low end of the byte, the cursor moves to the
// top bit (0x80) of the following byte.
static bool next_bit(bitstream_state_t *st) {
    const bool is_set = (*st->ptr & st->bit) != 0;
    const uint8_t shifted = st->bit >> 1;
    if (shifted == 0) {
        st->bit = 0x80;
        st->ptr++;
    } else {
        st->bit = shifted;
    }
    return is_set;
}

// Read the next n bits and return them as an integer, first bit read being
// the most significant. Reading zero bits yields 0 and leaves the cursor
// where it was.
static int get_nbits(bitstream_state_t *st, int n) {
    int value = 0;
    for (; n != 0; n--) {
        value = (value << 1) | next_bit(st);
    }
    return value;
}
char *decompress(const compressed_string_t *compressed, char *decompressed) { char *decompress(const compressed_string_t *compressed, char *decompressed) {
uint8_t this_byte = compress_max_length_bits / 8; bitstream_state_t b = {
uint8_t this_bit = 7 - compress_max_length_bits % 8; .ptr = &(compressed->data) + (compress_max_length_bits >> 3),
uint8_t b = (&compressed->data)[this_byte] << (compress_max_length_bits % 8); .bit = 1 << (7 - ((compress_max_length_bits) & 0x7)),
};
uint16_t length = decompress_length(compressed); uint16_t length = decompress_length(compressed);
// Stop one early because the last byte is always NULL. // Stop one early because the last byte is always NULL.
@ -112,26 +137,23 @@ char *decompress(const compressed_string_t *compressed, char *decompressed) {
uint32_t max_code = lengths[0]; uint32_t max_code = lengths[0];
uint32_t searched_length = lengths[0]; uint32_t searched_length = lengths[0];
while (true) { while (true) {
bits <<= 1; bits = (bits << 1) | next_bit(&b);
if ((0x80 & b) != 0) {
bits |= 1;
}
b <<= 1;
bit_length += 1; bit_length += 1;
if (this_bit == 0) {
this_bit = 7;
this_byte += 1;
b = (&compressed->data)[this_byte]; // This may read past the end but its never used.
} else {
this_bit -= 1;
}
if (max_code > 0 && bits < max_code) { if (max_code > 0 && bits < max_code) {
break; break;
} }
max_code = (max_code << 1) + lengths[bit_length]; max_code = (max_code << 1) + lengths[bit_length];
searched_length += lengths[bit_length]; searched_length += lengths[bit_length];
} }
i += put_utf8(decompressed + i, values[searched_length + bits - max_code]); int v = values[searched_length + bits - max_code];
if (v == 1) {
qstr q = get_nbits(&b, translation_qstr_bits) + 1; // honestly no idea why "+1"...
for (const char *qc = qstr_str(q); *qc;) {
decompressed[i++] = *qc++;
}
} else {
i += put_utf8(decompressed + i, v);
}
} }
decompressed[length - 1] = '\0'; decompressed[length - 1] = '\0';