Use qstrs to improve compression

Any qstr at least 3 characters long is a candidate for use; the input byte value (1) is used to indicate that a fixed-length qstr number follows.

Before accounting for the code size changes, this saves ~100 bytes on trinket m0, but it may end up being a wash due to the added code.

A cutoff of length > 2 is slightly better for ja and ru and worse for the others. My guess is that this has to do with the relative frequency of Latin characters vs. non-Latin characters.

lang    after  before  change (- = more free space)
en_US    3132    2996    -134
fr       2136    2060     -76
ja       1916    1724    -232
ru       2196    2000    -196
2023-08-23 14:01:02 -05:00 · 2023-08-23 14:01:02 -05:00 · 4d8b354c13
commit 4d8b354c13
parent e08ad22ef6
3 changed files with 144 additions and 100 deletions
--- a/py/maketranslationdata.py
+++ b/py/maketranslationdata.py
@ -1,7 +1,9 @@
 """
 Process raw qstr file and output qstr data with length, hash and data bytes.

-This script works with Python 2.7, 3.3 and 3.4.
+This script is only regularly tested with the same version of Python used
+during CI, typically the latest "3.x". However, incompatibilities with any
+supported CPython version are unintended.

 For documentation about the format of compressed translated strings, see
 supervisor/shared/translate/translate.h
@ -16,31 +18,16 @@ import sys

 import collections
 import gettext
-import os.path
+import pathlib

 if hasattr(sys.stdout, "reconfigure"):
    sys.stdout.reconfigure(encoding="utf-8")
    sys.stderr.reconfigure(errors="backslashreplace")

-py = os.path.dirname(sys.argv[0])
-top = os.path.dirname(py)
-
-sys.path.append(os.path.join(top, "tools/huffman"))
+sys.path.append(str(pathlib.Path(__file__).parent.parent / "tools/huffman"))

 import huffman
-
-# Python 2/3 compatibility:
-#   - iterating through bytes is different
-#   - codepoint2name lives in a different module
-import platform
-
-if platform.python_version_tuple()[0] == "2":
-    bytes_cons = lambda val, enc=None: bytearray(val)
-    from htmlentitydefs import codepoint2name
-elif platform.python_version_tuple()[0] == "3":
-    bytes_cons = bytes
 from html.entities import codepoint2name
-# end compatibility code

 codepoint2name[ord("-")] = "hyphen"

@ -182,9 +169,15 @@ class EncodingTable:
    extractor: object
    apply_offset: object
    remove_offset: object
+    translation_qstr_bits: int
+    qstrs: object
+    qstrs_inv: object


-def compute_huffman_coding(translation_name, translations, f):
+def compute_huffman_coding(qstrs, translation_name, translations, f):
+    # Possible future improvement: some languages do better when qstrs with len(k) > 2 are considered. Try both?
+    qstrs = dict((k, v) for k, v in qstrs.items() if len(k) > 3)
+    qstr_strs = list(qstrs.keys())
    texts = [t[1] for t in translations]
    words = []

@ -234,10 +227,12 @@ def compute_huffman_coding(translation_name, translations, f):
        # if "the" is in words then not only will "the" not be considered
        # again, neither will "there" or "wither", since they have "the"
        # as substrings.
-        extractor = TextSplitter(words)
+        extractor = TextSplitter(words + qstr_strs)
        counter = collections.Counter()
        for t in texts:
            for atom in extractor.iter(t):
+                if atom in qstrs:
+                    atom = "\1"
                counter[atom] += 1
        cb = huffman.codebook(counter.items())
        lengths = sorted(dict((v, len(cb[k])) for k, v in counter.items()).items())
@ -304,10 +299,14 @@ def compute_huffman_coding(translation_name, translations, f):
        words.append(word)

    words.sort(key=len)
-    extractor = TextSplitter(words)
+    extractor = TextSplitter(words + qstr_strs)
    counter = collections.Counter()
+    used_qstr = 0
    for t in texts:
        for atom in extractor.iter(t):
+            if atom in qstrs:
+                used_qstr = max(used_qstr, qstrs[atom])
+                atom = "\1"
            counter[atom] += 1
    cb = huffman.codebook(counter.items())

@ -322,6 +321,8 @@ def compute_huffman_coding(translation_name, translations, f):
    last_length = None
    canonical = {}
    for atom, code in sorted(cb.items(), key=lambda x: (len(x[1]), x[0])):
+        if atom in qstr_strs:
+            atom = "\1"
        values.append(atom)
        length = len(code)
        if length not in length_count:
@ -359,6 +360,8 @@ def compute_huffman_coding(translation_name, translations, f):
    minlen = len(words[0])
    wlencount = [len([None for w in words if len(w) == l]) for l in range(minlen, maxlen + 1)]

+    translation_qstr_bits = used_qstr.bit_length()
+
    f.write("typedef {} mchar_t;\n".format(values_type))
    f.write("const uint8_t lengths[] = {{ {} }};\n".format(", ".join(map(str, lengths))))
    f.write(
@ -383,34 +386,44 @@ def compute_huffman_coding(translation_name, translations, f):
    f.write("#define maxlen {}\n".format(maxlen))
    f.write("#define translation_offstart {}\n".format(offstart))
    f.write("#define translation_offset {}\n".format(offset))
+    f.write("#define translation_qstr_bits {}\n".format(translation_qstr_bits))

-    return EncodingTable(values, lengths, words, canonical, extractor, apply_offset, remove_offset)
+    qstrs_inv = dict((v, k) for k, v in qstrs.items())
+    return EncodingTable(
+        values,
+        lengths,
+        words,
+        canonical,
+        extractor,
+        apply_offset,
+        remove_offset,
+        translation_qstr_bits,
+        qstrs,
+        qstrs_inv,
+    )


 def decompress(encoding_table, encoded, encoded_length_bits):
+    qstrs_inv = encoding_table.qstrs_inv
    values = encoding_table.values
    lengths = encoding_table.lengths
    words = encoding_table.words

-    dec = []
-    this_byte = 0
-    this_bit = 7
-    b = encoded[this_byte]
-    bits = 0
-    for i in range(encoded_length_bits):
-        bits <<= 1
-        if 0x80 & b:
-            bits |= 1
+    def bititer():
+        for byte in encoded:
+            for bit in (0x80, 0x40, 0x20, 0x10, 0x8, 0x4, 0x2, 0x1):
+                yield bool(byte & bit)

-        b <<= 1
-        if this_bit == 0:
-            this_bit = 7
-            this_byte += 1
-            if this_byte < len(encoded):
-                b = encoded[this_byte]
-        else:
-            this_bit -= 1
-    length = bits
+    nextbit = bititer().__next__
+
+    def getnbits(n):
+        bits = 0
+        for i in range(n):
+            bits = (bits << 1) | nextbit()
+        return bits
+
+    dec = []
+    length = getnbits(encoded_length_bits)

    i = 0
    while i < length:
@ -419,19 +432,8 @@ def decompress(encoding_table, encoded, encoded_length_bits):
        max_code = lengths[0]
        searched_length = lengths[0]
        while True:
-            bits <<= 1
-            if 0x80 & b:
-                bits |= 1
-
-            b <<= 1
+            bits = (bits << 1) | nextbit()
            bit_length += 1
-            if this_bit == 0:
-                this_bit = 7
-                this_byte += 1
-                if this_byte < len(encoded):
-                    b = encoded[this_byte]
-            else:
-                this_bit -= 1
            if max_code > 0 and bits < max_code:
                # print('{0:0{width}b}'.format(bits, width=bit_length))
                break
@ -439,7 +441,10 @@ def decompress(encoding_table, encoded, encoded_length_bits):
            searched_length += lengths[bit_length]

        v = values[searched_length + bits - max_code]
-        if v >= chr(0x80) and v < chr(0x80 + len(words)):
+        if v == chr(1):
+            qstr_idx = getnbits(encoding_table.translation_qstr_bits)
+            v = qstrs_inv[qstr_idx]
+        elif v >= chr(0x80) and v < chr(0x80 + len(words)):
            v = words[ord(v) - 0x80]
        i += len(v.encode("utf-8"))
        dec.append(v)
@ -449,36 +454,37 @@ def decompress(encoding_table, encoded, encoded_length_bits):
 def compress(encoding_table, decompressed, encoded_length_bits, len_translation_encoded):
    if not isinstance(decompressed, str):
        raise TypeError()
+    qstrs = encoding_table.qstrs
    canonical = encoding_table.canonical
    extractor = encoding_table.extractor

-    enc = bytearray(len(decompressed) * 3)
-    current_bit = 7
-    current_byte = 0
+    enc = 1

-    bits = encoded_length_bits + 1
-    for i in range(bits - 1, 0, -1):
-        if len_translation_encoded & (1 << (i - 1)):
-            enc[current_byte] |= 1 << current_bit
-        if current_bit == 0:
-            current_bit = 7
-            current_byte += 1
-        else:
-            current_bit -= 1
+    def put_bit(enc, b):
+        return (enc << 1) | bool(b)
+
+    def put_bits(enc, b, n):
+        for i in range(n - 1, -1, -1):
+            enc = put_bit(enc, b & (1 << i))
+        return enc
+
+    enc = put_bits(enc, len_translation_encoded, encoded_length_bits)

    for atom in extractor.iter(decompressed):
-        for b in canonical[atom]:
-            if b == "1":
-                enc[current_byte] |= 1 << current_bit
-            if current_bit == 0:
-                current_bit = 7
-                current_byte += 1
+        if atom in qstrs:
+            can = canonical["\1"]
        else:
-                current_bit -= 1
+            can = canonical[atom]
+        for b in can:
+            enc = put_bit(enc, b == "1")
+        if atom in qstrs:
+            enc = put_bits(enc, qstrs[atom], encoding_table.translation_qstr_bits)

-    if current_bit != 7:
-        current_byte += 1
-    return enc[:current_byte]
+    while enc.bit_length() % 8 != 1:
+        enc = put_bit(enc, 0)
+
+    r = enc.to_bytes((enc.bit_length() + 7) // 8, "big")
+    return r[1:]


 def qstr_escape(qst):
@ -493,10 +499,20 @@ def qstr_escape(qst):
    return re.sub(r"[^A-Za-z0-9_]", esc_char, qst)


+def parse_qstrs(infile):
+    """Map each qstr string found in a generated qstr header to its index.
+
+    ``infile`` is an open text file containing entries of the form
+    ``QDEF(MP_QSTR_name, <int>, <int>, "text")``. The quoted text of every
+    matched entry is decoded as a string literal and mapped to the zero-based
+    position of that entry among all matched entries. If the same text occurs
+    more than once, the last position wins.
+    """
+    import ast
+
+    r = {}
+    # The literal body is any mix of ordinary characters and backslash
+    # escapes: the repetition must apply to the whole alternation, otherwise
+    # literals combining plain text and escapes are never matched.
+    rx = re.compile(r'QDEF\([A-Za-z0-9_]+,\s*\d+,\s*\d+,\s*(?P<cstr>"(?:[^"\\]|\\.)*")\)')
+    content = infile.read()
+    # A compiled pattern's findall() takes (string, pos, endpos), not flags;
+    # passing re.M here would silently skip the first 8 characters of input.
+    for i, mat in enumerate(rx.findall(content)):
+        # literal_eval decodes the quoted literal without executing code.
+        r[ast.literal_eval(mat)] = i
+    return r
+
+
 def parse_input_headers(infiles):
    i18ns = set()

-    # read the qstrs in from the input files
+    # read the TRANSLATE strings in from the input files
    for infile in infiles:
        with open(infile, "rt") as f:
            for line in f:
@ -516,12 +532,12 @@ def escape_bytes(qstr):
        return qstr
    else:
        # qstr contains non-printable codes so render entire thing as hex pairs
-        qbytes = bytes_cons(qstr, "utf8")
+        qbytes = bytes(qstr, "utf8")
        return "".join(("\\x%02x" % b) for b in qbytes)


 def make_bytes(cfg_bytes_len, cfg_bytes_hash, qstr):
-    qbytes = bytes_cons(qstr, "utf8")
+    qbytes = bytes(qstr, "utf8")
    qlen = len(qbytes)
    qhash = compute_hash(qbytes, cfg_bytes_hash)
    if qlen >= (1 << (8 * cfg_bytes_len)):
@ -551,7 +567,7 @@ def output_translation_data(encoding_table, i18ns, out):
        )
        total_text_compressed_size += len(compressed)
        decompressed = decompress(encoding_table, compressed, encoded_length_bits)
-        assert decompressed == translation
+        assert decompressed == translation, (decompressed, translation)
        for c in C_ESCAPES:
            decompressed = decompressed.replace(c, C_ESCAPES[c])
        formatted = ["{:d}".format(x) for x in compressed]
@ -572,7 +588,7 @@ if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(
-        description="Process QSTR definitions into headers for compilation"
+        description="Process TRANSLATE strings into headers for compilation"
    )
    parser.add_argument(
        "infiles", metavar="N", type=str, nargs="+", help="an integer for the accumulator"
@ -590,13 +606,19 @@ if __name__ == "__main__":
        type=argparse.FileType("w", encoding="UTF-8"),
        help="c file for translation data",
    )
+    parser.add_argument(
+        "--qstrdefs_filename",
+        type=argparse.FileType("r", encoding="UTF-8"),
+        help="",
+    )

    args = parser.parse_args()

+    qstrs = parse_qstrs(args.qstrdefs_filename)
    i18ns = parse_input_headers(args.infiles)
    i18ns = sorted(i18ns)
    translations = translate(args.translation, i18ns)
    encoding_table = compute_huffman_coding(
-        args.translation, translations, args.compression_filename
+        qstrs, args.translation, translations, args.compression_filename
    )
    output_translation_data(encoding_table, translations, args.translation_filename)
--- a/py/py.mk
+++ b/py/py.mk
@ -270,7 +270,7 @@ $(PY_BUILD)/translations-$(TRANSLATION).c: $(HEADER_BUILD)/compressed_translatio
 $(HEADER_BUILD)/compressed_translations.generated.h: $(PY_SRC)/maketranslationdata.py $(HEADER_BUILD)/$(TRANSLATION).mo $(HEADER_BUILD)/qstrdefs.generated.h
 	$(STEPECHO) "GEN $@"
 	$(Q)mkdir -p $(PY_BUILD)
-	$(Q)$(PYTHON) $(PY_SRC)/maketranslationdata.py --compression_filename $(HEADER_BUILD)/compressed_translations.generated.h --translation $(HEADER_BUILD)/$(TRANSLATION).mo --translation_filename $(PY_BUILD)/translations-$(TRANSLATION).c $(HEADER_BUILD)/qstrdefs.preprocessed.h
+	$(Q)$(PYTHON) $(PY_SRC)/maketranslationdata.py --compression_filename $(HEADER_BUILD)/compressed_translations.generated.h --translation $(HEADER_BUILD)/$(TRANSLATION).mo --translation_filename $(PY_BUILD)/translations-$(TRANSLATION).c --qstrdefs_filename  $(HEADER_BUILD)/qstrdefs.generated.h $(HEADER_BUILD)/qstrdefs.preprocessed.h

 PY_CORE_O += $(PY_BUILD)/translations-$(TRANSLATION).o

--- a/supervisor/shared/translate/translate.c
+++ b/supervisor/shared/translate/translate.c
@ -25,6 +25,7 @@
 */

 #include "supervisor/shared/translate/translate.h"
+#include "py/qstr.h"

 #include <stdbool.h>
 #include <stdint.h>
@ -99,10 +100,34 @@ uint16_t decompress_length(const compressed_string_t *compressed) {
    #endif
 }

+// Read cursor over a packed bitstream; next_bit() consumes it one bit at a
+// time, starting from the most significant bit of each byte.
+typedef struct {
+    const uint8_t *ptr; // byte currently being read
+    uint8_t bit; // single-bit mask selecting the next bit to read within *ptr
+} bitstream_state_t;
+
+// Return the bit selected by st->bit in the current byte, then advance the
+// cursor by one bit.
+static bool next_bit(bitstream_state_t *st) {
+    const bool is_set = (*st->ptr & st->bit) != 0;
+    st->bit >>= 1;
+    if (st->bit == 0) {
+        // The mask moved past the least significant bit: continue at the
+        // most significant bit of the following byte.
+        st->ptr++;
+        st->bit = 0x80;
+    }
+    return is_set;
+}
+
+// Read n bits from the stream and return them as an integer; the first bit
+// read ends up as the most significant bit of the result.
+static int get_nbits(bitstream_state_t *st, int n) {
+    int value = 0;
+    for (; n != 0; n--) {
+        value = (value << 1) | next_bit(st);
+    }
+    return value;
+}
+
 char *decompress(const compressed_string_t *compressed, char *decompressed) {
-    uint8_t this_byte = compress_max_length_bits / 8;
-    uint8_t this_bit = 7 - compress_max_length_bits % 8;
-    uint8_t b = (&compressed->data)[this_byte] << (compress_max_length_bits % 8);
+    bitstream_state_t b = {
+        .ptr = &(compressed->data) + (compress_max_length_bits >> 3),
+        .bit = 1 << (7 - ((compress_max_length_bits) & 0x7)),
+    };
    uint16_t length = decompress_length(compressed);

    // Stop one early because the last byte is always NULL.
@ -112,26 +137,23 @@ char *decompress(const compressed_string_t *compressed, char *decompressed) {
        uint32_t max_code = lengths[0];
        uint32_t searched_length = lengths[0];
        while (true) {
-            bits <<= 1;
-            if ((0x80 & b) != 0) {
-                bits |= 1;
-            }
-            b <<= 1;
+            bits = (bits << 1) | next_bit(&b);
            bit_length += 1;
-            if (this_bit == 0) {
-                this_bit = 7;
-                this_byte += 1;
-                b = (&compressed->data)[this_byte]; // This may read past the end but its never used.
-            } else {
-                this_bit -= 1;
-            }
            if (max_code > 0 && bits < max_code) {
                break;
            }
            max_code = (max_code << 1) + lengths[bit_length];
            searched_length += lengths[bit_length];
        }
-        i += put_utf8(decompressed + i, values[searched_length + bits - max_code]);
+        int v = values[searched_length + bits - max_code];
+        if (v == 1) {
+            qstr q = get_nbits(&b, translation_qstr_bits) + 1; // honestly no idea why "+1"...
+            for (const char *qc = qstr_str(q); *qc;) {
+                decompressed[i++] = *qc++;
+            }
+        } else {
+            i += put_utf8(decompressed + i, v);
+        }
    }

    decompressed[length - 1] = '\0';