diff --git a/py/maketranslationdata.py b/py/maketranslationdata.py index a07f70265c..9b75e06996 100644 --- a/py/maketranslationdata.py +++ b/py/maketranslationdata.py @@ -1,7 +1,9 @@ """ Process raw qstr file and output qstr data with length, hash and data bytes. -This script works with Python 2.7, 3.3 and 3.4. +This script is only regularly tested with the same version of Python used +during CI, typically the latest "3.x". However, incompatibilities with any +supported CPython version are unintended. For documentation about the format of compressed translated strings, see supervisor/shared/translate/translate.h @@ -16,31 +18,16 @@ import sys import collections import gettext -import os.path +import pathlib if hasattr(sys.stdout, "reconfigure"): sys.stdout.reconfigure(encoding="utf-8") sys.stderr.reconfigure(errors="backslashreplace") -py = os.path.dirname(sys.argv[0]) -top = os.path.dirname(py) - -sys.path.append(os.path.join(top, "tools/huffman")) +sys.path.append(str(pathlib.Path(__file__).parent.parent / "tools/huffman")) import huffman - -# Python 2/3 compatibility: -# - iterating through bytes is different -# - codepoint2name lives in a different module -import platform - -if platform.python_version_tuple()[0] == "2": - bytes_cons = lambda val, enc=None: bytearray(val) - from htmlentitydefs import codepoint2name -elif platform.python_version_tuple()[0] == "3": - bytes_cons = bytes - from html.entities import codepoint2name -# end compatibility code +from html.entities import codepoint2name codepoint2name[ord("-")] = "hyphen" @@ -182,9 +169,15 @@ class EncodingTable: extractor: object apply_offset: object remove_offset: object + translation_qstr_bits: int + qstrs: object + qstrs_inv: object -def compute_huffman_coding(translation_name, translations, f): +def compute_huffman_coding(qstrs, translation_name, translations, f): + # possible future improvement: some languages are better when consider len(k) > 2. try both? + qstrs = dict((k, v) for k, v in qstrs.items() if len(k) > 3) + qstr_strs = list(qstrs.keys()) texts = [t[1] for t in translations] words = [] @@ -234,10 +227,12 @@ def compute_huffman_coding(translation_name, translations, f): # if "the" is in words then not only will "the" not be considered # again, neither will "there" or "wither", since they have "the" # as substrings. - extractor = TextSplitter(words) + extractor = TextSplitter(words + qstr_strs) counter = collections.Counter() for t in texts: for atom in extractor.iter(t): + if atom in qstrs: + atom = "\1" counter[atom] += 1 cb = huffman.codebook(counter.items()) lengths = sorted(dict((v, len(cb[k])) for k, v in counter.items()).items()) @@ -304,10 +299,14 @@ def compute_huffman_coding(translation_name, translations, f): words.append(word) words.sort(key=len) - extractor = TextSplitter(words) + extractor = TextSplitter(words + qstr_strs) counter = collections.Counter() + used_qstr = 0 for t in texts: for atom in extractor.iter(t): + if atom in qstrs: + used_qstr = max(used_qstr, qstrs[atom]) + atom = "\1" counter[atom] += 1 cb = huffman.codebook(counter.items()) @@ -322,6 +321,8 @@ def compute_huffman_coding(translation_name, translations, f): last_length = None canonical = {} for atom, code in sorted(cb.items(), key=lambda x: (len(x[1]), x[0])): + if atom in qstr_strs: + atom = "\1" values.append(atom) length = len(code) if length not in length_count: @@ -359,6 +360,8 @@ def compute_huffman_coding(translation_name, translations, f): minlen = len(words[0]) wlencount = [len([None for w in words if len(w) == l]) for l in range(minlen, maxlen + 1)] + translation_qstr_bits = used_qstr.bit_length() + f.write("typedef {} mchar_t;\n".format(values_type)) f.write("const uint8_t lengths[] = {{ {} }};\n".format(", ".join(map(str, lengths)))) f.write( @@ -383,34 +386,44 @@ def compute_huffman_coding(translation_name, translations, f): f.write("#define maxlen {}\n".format(maxlen)) f.write("#define translation_offstart {}\n".format(offstart)) f.write("#define translation_offset {}\n".format(offset)) + f.write("#define translation_qstr_bits {}\n".format(translation_qstr_bits)) - return EncodingTable(values, lengths, words, canonical, extractor, apply_offset, remove_offset) + qstrs_inv = dict((v, k) for k, v in qstrs.items()) + return EncodingTable( + values, + lengths, + words, + canonical, + extractor, + apply_offset, + remove_offset, + translation_qstr_bits, + qstrs, + qstrs_inv, + ) def decompress(encoding_table, encoded, encoded_length_bits): + qstrs_inv = encoding_table.qstrs_inv values = encoding_table.values lengths = encoding_table.lengths words = encoding_table.words - dec = [] - this_byte = 0 - this_bit = 7 - b = encoded[this_byte] - bits = 0 - for i in range(encoded_length_bits): - bits <<= 1 - if 0x80 & b: - bits |= 1 + def bititer(): + for byte in encoded: + for bit in (0x80, 0x40, 0x20, 0x10, 0x8, 0x4, 0x2, 0x1): + yield bool(byte & bit) - b <<= 1 - if this_bit == 0: - this_bit = 7 - this_byte += 1 - if this_byte < len(encoded): - b = encoded[this_byte] - else: - this_bit -= 1 - length = bits + nextbit = bititer().__next__ + + def getnbits(n): + bits = 0 + for i in range(n): + bits = (bits << 1) | nextbit() + return bits + + dec = [] + length = getnbits(encoded_length_bits) i = 0 while i < length: @@ -419,19 +432,8 @@ def decompress(encoding_table, encoded, encoded_length_bits): max_code = lengths[0] searched_length = lengths[0] while True: - bits <<= 1 - if 0x80 & b: - bits |= 1 - - b <<= 1 + bits = (bits << 1) | nextbit() bit_length += 1 - if this_bit == 0: - this_bit = 7 - this_byte += 1 - if this_byte < len(encoded): - b = encoded[this_byte] - else: - this_bit -= 1 if max_code > 0 and bits < max_code: # print('{0:0{width}b}'.format(bits, width=bit_length)) break @@ -439,7 +441,10 @@ def decompress(encoding_table, encoded, encoded_length_bits): searched_length += lengths[bit_length] v = values[searched_length + bits - max_code] - if v >= chr(0x80) and v < chr(0x80 + len(words)): + if v == chr(1): + qstr_idx = getnbits(encoding_table.translation_qstr_bits) + v = qstrs_inv[qstr_idx] + elif v >= chr(0x80) and v < chr(0x80 + len(words)): v = words[ord(v) - 0x80] i += len(v.encode("utf-8")) dec.append(v) @@ -449,36 +454,37 @@ def decompress(encoding_table, encoded, encoded_length_bits): def compress(encoding_table, decompressed, encoded_length_bits, len_translation_encoded): if not isinstance(decompressed, str): raise TypeError() + qstrs = encoding_table.qstrs canonical = encoding_table.canonical extractor = encoding_table.extractor - enc = bytearray(len(decompressed) * 3) - current_bit = 7 - current_byte = 0 + enc = 1 - bits = encoded_length_bits + 1 - for i in range(bits - 1, 0, -1): - if len_translation_encoded & (1 << (i - 1)): - enc[current_byte] |= 1 << current_bit - if current_bit == 0: - current_bit = 7 - current_byte += 1 - else: - current_bit -= 1 + def put_bit(enc, b): + return (enc << 1) | bool(b) + + def put_bits(enc, b, n): + for i in range(n - 1, -1, -1): + enc = put_bit(enc, b & (1 << i)) + return enc + + enc = put_bits(enc, len_translation_encoded, encoded_length_bits) for atom in extractor.iter(decompressed): - for b in canonical[atom]: - if b == "1": - enc[current_byte] |= 1 << current_bit - if current_bit == 0: - current_bit = 7 - current_byte += 1 - else: - current_bit -= 1 + if atom in qstrs: + can = canonical["\1"] + else: + can = canonical[atom] + for b in can: + enc = put_bit(enc, b == "1") + if atom in qstrs: + enc = put_bits(enc, qstrs[atom], encoding_table.translation_qstr_bits) - if current_bit != 7: - current_byte += 1 - return enc[:current_byte] + while enc.bit_length() % 8 != 1: + enc = put_bit(enc, 0) + + r = enc.to_bytes((enc.bit_length() + 7) // 8, "big") + return r[1:] def qstr_escape(qst): @@ -493,10 +499,20 @@ def qstr_escape(qst): return re.sub(r"[^A-Za-z0-9_]", esc_char, qst) +def parse_qstrs(infile): + r = {} + rx = re.compile(r'QDEF\([A-Za-z0-9_]+,\s*\d+,\s*\d+,\s*(?P"(?:[^"\\\\]*|\\.)")\)') + content = infile.read() + for i, mat in enumerate(rx.findall(content, re.M)): + mat = eval(mat) + r[mat] = i + return r + + def parse_input_headers(infiles): i18ns = set() - # read the qstrs in from the input files + # read the TRANSLATE strings in from the input files for infile in infiles: with open(infile, "rt") as f: for line in f: @@ -516,12 +532,12 @@ def escape_bytes(qstr): return qstr else: # qstr contains non-printable codes so render entire thing as hex pairs - qbytes = bytes_cons(qstr, "utf8") + qbytes = bytes(qstr, "utf8") return "".join(("\\x%02x" % b) for b in qbytes) def make_bytes(cfg_bytes_len, cfg_bytes_hash, qstr): - qbytes = bytes_cons(qstr, "utf8") + qbytes = bytes(qstr, "utf8") qlen = len(qbytes) qhash = compute_hash(qbytes, cfg_bytes_hash) if qlen >= (1 << (8 * cfg_bytes_len)): @@ -551,7 +567,7 @@ def output_translation_data(encoding_table, i18ns, out): ) total_text_compressed_size += len(compressed) decompressed = decompress(encoding_table, compressed, encoded_length_bits) - assert decompressed == translation + assert decompressed == translation, (decompressed, translation) for c in C_ESCAPES: decompressed = decompressed.replace(c, C_ESCAPES[c]) formatted = ["{:d}".format(x) for x in compressed] @@ -572,7 +588,7 @@ if __name__ == "__main__": import argparse parser = argparse.ArgumentParser( - description="Process QSTR definitions into headers for compilation" + description="Process TRANSLATE strings into headers for compilation" ) parser.add_argument( "infiles", metavar="N", type=str, nargs="+", help="an integer for the accumulator" @@ -590,13 +606,19 @@ if __name__ == "__main__": type=argparse.FileType("w", encoding="UTF-8"), help="c file for translation data", ) + parser.add_argument( + "--qstrdefs_filename", + type=argparse.FileType("r", encoding="UTF-8"), + help="", + ) args = parser.parse_args() + qstrs = parse_qstrs(args.qstrdefs_filename) i18ns = parse_input_headers(args.infiles) i18ns = sorted(i18ns) translations = translate(args.translation, i18ns) encoding_table = compute_huffman_coding( - args.translation, translations, args.compression_filename + qstrs, args.translation, translations, args.compression_filename ) output_translation_data(encoding_table, translations, args.translation_filename) diff --git a/py/py.mk b/py/py.mk index d26b8752a4..5cbc213414 100644 --- a/py/py.mk +++ b/py/py.mk @@ -270,7 +270,7 @@ $(PY_BUILD)/translations-$(TRANSLATION).c: $(HEADER_BUILD)/compressed_translatio $(HEADER_BUILD)/compressed_translations.generated.h: $(PY_SRC)/maketranslationdata.py $(HEADER_BUILD)/$(TRANSLATION).mo $(HEADER_BUILD)/qstrdefs.generated.h $(STEPECHO) "GEN $@" $(Q)mkdir -p $(PY_BUILD) - $(Q)$(PYTHON) $(PY_SRC)/maketranslationdata.py --compression_filename $(HEADER_BUILD)/compressed_translations.generated.h --translation $(HEADER_BUILD)/$(TRANSLATION).mo --translation_filename $(PY_BUILD)/translations-$(TRANSLATION).c $(HEADER_BUILD)/qstrdefs.preprocessed.h + $(Q)$(PYTHON) $(PY_SRC)/maketranslationdata.py --compression_filename $(HEADER_BUILD)/compressed_translations.generated.h --translation $(HEADER_BUILD)/$(TRANSLATION).mo --translation_filename $(PY_BUILD)/translations-$(TRANSLATION).c --qstrdefs_filename $(HEADER_BUILD)/qstrdefs.generated.h $(HEADER_BUILD)/qstrdefs.preprocessed.h PY_CORE_O += $(PY_BUILD)/translations-$(TRANSLATION).o diff --git a/supervisor/shared/translate/translate.c b/supervisor/shared/translate/translate.c index fced90f058..7d57ab3bf2 100644 --- a/supervisor/shared/translate/translate.c +++ b/supervisor/shared/translate/translate.c @@ -25,6 +25,7 @@ */ #include "supervisor/shared/translate/translate.h" +#include "py/qstr.h" #include #include @@ -99,10 +100,34 @@ uint16_t decompress_length(const compressed_string_t *compressed) { #endif } +typedef struct { + const uint8_t *ptr; + uint8_t bit; +} bitstream_state_t; + +static bool next_bit(bitstream_state_t *st) { + bool r = *st->ptr & st->bit; + st->bit >>= 1; + if (!st->bit) { + st->bit = 0x80; + st->ptr++; + } + return r; +} + +static int get_nbits(bitstream_state_t *st, int n) { + int r = 0; + while (n--) { + r = (r << 1) | next_bit(st); + } + return r; +} + char *decompress(const compressed_string_t *compressed, char *decompressed) { - uint8_t this_byte = compress_max_length_bits / 8; - uint8_t this_bit = 7 - compress_max_length_bits % 8; - uint8_t b = (&compressed->data)[this_byte] << (compress_max_length_bits % 8); + bitstream_state_t b = { + .ptr = &(compressed->data) + (compress_max_length_bits >> 3), + .bit = 1 << (7 - ((compress_max_length_bits) & 0x7)), + }; uint16_t length = decompress_length(compressed); // Stop one early because the last byte is always NULL. @@ -112,26 +137,23 @@ char *decompress(const compressed_string_t *compressed, char *decompressed) { uint32_t max_code = lengths[0]; uint32_t searched_length = lengths[0]; while (true) { - bits <<= 1; - if ((0x80 & b) != 0) { - bits |= 1; - } - b <<= 1; + bits = (bits << 1) | next_bit(&b); bit_length += 1; - if (this_bit == 0) { - this_bit = 7; - this_byte += 1; - b = (&compressed->data)[this_byte]; // This may read past the end but its never used. - } else { - this_bit -= 1; - } if (max_code > 0 && bits < max_code) { break; } max_code = (max_code << 1) + lengths[bit_length]; searched_length += lengths[bit_length]; } - i += put_utf8(decompressed + i, values[searched_length + bits - max_code]); + int v = values[searched_length + bits - max_code]; + if (v == 1) { + qstr q = get_nbits(&b, translation_qstr_bits) + 1; // honestly no idea why "+1"... + for (const char *qc = qstr_str(q); *qc;) { + decompressed[i++] = *qc++; + } + } else { + i += put_utf8(decompressed + i, v); + } } decompressed[length - 1] = '\0';