From fe3e8d1589e54de999cccc775f269a39443c82d6 Mon Sep 17 00:00:00 2001 From: Jeff Epler Date: Thu, 28 May 2020 07:40:56 -0500 Subject: [PATCH] string compression: save a few bits per string Length was always stored as a 16-bit number. Most translations have a max length far less. For example, US English translation lengths always fit in just 8 bits. Probably all languages fit in 9 bits. This also has the side effect of reducing the alignment of compressed_string_t from 2 bytes to 1. Testing performed: ran in German and English on pyruler; printed messages looked right. Firmware size, en_US Before: 3044 bytes free in flash After: 3408 bytes free in flash Firmware size, de_DE (with #2967 merged to restore translations) Before: 1236 bytes free in flash After: 1600 bytes free in flash --- main.c | 2 +- py/builtinhelp.c | 4 +-- py/makeqstrdata.py | 52 +++++++++++++++++++++++++++++------ py/moduerrno.c | 2 +- py/obj.c | 6 ++-- py/objexcept.c | 4 +-- supervisor/shared/translate.c | 26 ++++++++++++------ supervisor/shared/translate.h | 5 ++-- 8 files changed, 74 insertions(+), 27 deletions(-) diff --git a/main.c b/main.c index bb5a7e5cec..c3787122d3 100755 --- a/main.c +++ b/main.c @@ -185,7 +185,7 @@ bool maybe_run_list(const char ** filenames, pyexec_result_t* exec_result) { } mp_hal_stdout_tx_str(filename); const compressed_string_t* compressed = translate(" output:\n"); - char decompressed[compressed->length]; + char decompressed[decompress_length(compressed)]; decompress(compressed, decompressed); mp_hal_stdout_tx_str(decompressed); pyexec_file(filename, exec_result); diff --git a/py/builtinhelp.c b/py/builtinhelp.c index 9a3407a16f..01c0bc84e0 100644 --- a/py/builtinhelp.c +++ b/py/builtinhelp.c @@ -135,7 +135,7 @@ STATIC void mp_help_print_modules(void) { // let the user know there may be other modules available from the filesystem const compressed_string_t* compressed = translate("Plus any modules on the filesystem\n"); - char decompressed[compressed->length]; + 
char decompressed[decompress_length(compressed)]; decompress(compressed, decompressed); mp_print_str(MP_PYTHON_PRINTER, decompressed); } @@ -181,7 +181,7 @@ STATIC mp_obj_t mp_builtin_help(size_t n_args, const mp_obj_t *args) { // print a general help message. Translate only works on single strings on one line. const compressed_string_t* compressed = translate("Welcome to Adafruit CircuitPython %s!\n\nPlease visit learn.adafruit.com/category/circuitpython for project guides.\n\nTo list built-in modules please do `help(\"modules\")`.\n"); - char decompressed[compressed->length]; + char decompressed[decompress_length(compressed)]; decompress(compressed, decompressed); mp_printf(MP_PYTHON_PRINTER, decompressed, MICROPY_GIT_TAG); } else { diff --git a/py/makeqstrdata.py b/py/makeqstrdata.py index 0d667959d9..64e4d26f45 100644 --- a/py/makeqstrdata.py +++ b/py/makeqstrdata.py @@ -1,7 +1,7 @@ """ Process raw qstr file and output qstr data with length, hash and data bytes. -This script works with Python 2.6, 2.7, 3.3 and 3.4. +This script works with Python 2.7, 3.3 and 3.4. 
""" from __future__ import print_function @@ -132,19 +132,37 @@ def compute_huffman_coding(translations, qstrs, compression_filename): print("// estimated total memory size", len(lengths) + 2*len(values) + sum(len(cb[u]) for u in all_strings_concat)) print("//", values, lengths) values_type = "uint16_t" if max(ord(u) for u in values) > 255 else "uint8_t" + max_translation_encoded_length = max(len(translation.encode("utf-8")) for original,translation in translations) with open(compression_filename, "w") as f: f.write("const uint8_t lengths[] = {{ {} }};\n".format(", ".join(map(str, lengths)))) f.write("const {} values[] = {{ {} }};\n".format(values_type, ", ".join(str(ord(u)) for u in values))) + f.write("#define compress_max_length_bits ({})\n".format(max_translation_encoded_length.bit_length())) return values, lengths -def decompress(encoding_table, length, encoded): +def decompress(encoding_table, encoded, encoded_length_bits): values, lengths = encoding_table - #print(l, encoded) dec = [] this_byte = 0 this_bit = 7 b = encoded[this_byte] - for i in range(length): + bits = 0 + for i in range(encoded_length_bits): + bits <<= 1 + if 0x80 & b: + bits |= 1 + + b <<= 1 + if this_bit == 0: + this_bit = 7 + this_byte += 1 + if this_byte < len(encoded): + b = encoded[this_byte] + else: + this_bit -= 1 + length = bits + + i = 0 + while i < length: bits = 0 bit_length = 0 max_code = lengths[0] @@ -170,10 +188,11 @@ def decompress(encoding_table, length, encoded): searched_length += lengths[bit_length] v = values[searched_length + bits - max_code] + i += len(v.encode('utf-8')) dec.append(v) return ''.join(dec) -def compress(encoding_table, decompressed): +def compress(encoding_table, decompressed, encoded_length_bits, len_translation_encoded): if not isinstance(decompressed, str): raise TypeError() values, lengths = encoding_table @@ -182,6 +201,19 @@ def compress(encoding_table, decompressed): #print(lengths) current_bit = 7 current_byte = 0 + + code = 
len_translation_encoded + bits = encoded_length_bits+1 + for i in range(bits - 1, 0, -1): + if len_translation_encoded & (1 << (i - 1)): + enc[current_byte] |= 1 << current_bit + if current_bit == 0: + current_bit = 7 + #print("packed {0:0{width}b}".format(enc[current_byte], width=8)) + current_byte += 1 + else: + current_bit -= 1 + for c in decompressed: #print() #print("char", c, values.index(c)) @@ -342,14 +374,17 @@ def print_qstr_data(encoding_table, qcfgs, qstrs, i18ns): total_text_size = 0 total_text_compressed_size = 0 + max_translation_encoded_length = max(len(translation.encode("utf-8")) for original, translation in i18ns) + encoded_length_bits = max_translation_encoded_length.bit_length() for original, translation in i18ns: translation_encoded = translation.encode("utf-8") - compressed = compress(encoding_table, translation) + compressed = compress(encoding_table, translation, encoded_length_bits, len(translation_encoded)) total_text_compressed_size += len(compressed) - decompressed = decompress(encoding_table, len(translation_encoded), compressed) + decompressed = decompress(encoding_table, compressed, encoded_length_bits) + assert decompressed == translation for c in C_ESCAPES: decompressed = decompressed.replace(c, C_ESCAPES[c]) - print("TRANSLATION(\"{}\", {}, {{ {} }}) // {}".format(original, len(translation_encoded)+1, ", ".join(["0x{:02x}".format(x) for x in compressed]), decompressed)) + print("TRANSLATION(\"{}\", {}) // {}".format(original, ", ".join(["{:d}".format(x) for x in compressed]), decompressed)) total_text_size += len(translation.encode("utf-8")) print() @@ -385,6 +420,7 @@ if __name__ == "__main__": qcfgs, qstrs, i18ns = parse_input_headers(args.infiles) if args.translation: + i18ns = sorted(i18ns) translations = translate(args.translation, i18ns) encoding_table = compute_huffman_coding(translations, qstrs, args.compression_filename) print_qstr_data(encoding_table, qcfgs, qstrs, translations) diff --git a/py/moduerrno.c 
b/py/moduerrno.c index 7915603e4e..3be5adba1e 100644 --- a/py/moduerrno.c +++ b/py/moduerrno.c @@ -158,7 +158,7 @@ const char *mp_common_errno_to_str(mp_obj_t errno_val, char *buf, size_t len) { case ENOSPC: desc = translate("No space left on device"); break; case EROFS: desc = translate("Read-only filesystem"); break; } - if (desc != NULL && desc->length <= len) { + if (desc != NULL && decompress_length(desc) <= len) { decompress(desc, buf); return buf; } diff --git a/py/obj.c b/py/obj.c index f1e00de1a4..4fa2032dc7 100644 --- a/py/obj.c +++ b/py/obj.c @@ -94,17 +94,17 @@ void mp_obj_print_exception(const mp_print_t *print, mp_obj_t exc) { assert(n % 3 == 0); // Decompress the format strings const compressed_string_t* traceback = translate("Traceback (most recent call last):\n"); - char decompressed[traceback->length]; + char decompressed[decompress_length(traceback)]; decompress(traceback, decompressed); #if MICROPY_ENABLE_SOURCE_LINE const compressed_string_t* frame = translate(" File \"%q\", line %d"); #else const compressed_string_t* frame = translate(" File \"%q\""); #endif - char decompressed_frame[frame->length]; + char decompressed_frame[decompress_length(frame)]; decompress(frame, decompressed_frame); const compressed_string_t* block_fmt = translate(", in %q\n"); - char decompressed_block[block_fmt->length]; + char decompressed_block[decompress_length(block_fmt)]; decompress(block_fmt, decompressed_block); // Print the traceback diff --git a/py/objexcept.c b/py/objexcept.c index b7a536c5e3..796be122fe 100644 --- a/py/objexcept.c +++ b/py/objexcept.c @@ -400,7 +400,7 @@ mp_obj_t mp_obj_new_exception_msg_vlist(const mp_obj_type_t *exc_type, const com // Try to allocate memory for the message mp_obj_str_t *o_str = m_new_obj_maybe(mp_obj_str_t); - size_t o_str_alloc = fmt->length + 1; + size_t o_str_alloc = decompress_length(fmt); byte *o_str_buf = m_new_maybe(byte, o_str_alloc); bool used_emg_buf = false; @@ -433,7 +433,7 @@ mp_obj_t 
mp_obj_new_exception_msg_vlist(const mp_obj_type_t *exc_type, const com // We have some memory to format the string struct _exc_printer_t exc_pr = {!used_emg_buf, o_str_alloc, 0, o_str_buf}; mp_print_t print = {&exc_pr, exc_add_strn}; - char fmt_decompressed[fmt->length]; + char fmt_decompressed[decompress_length(fmt)]; decompress(fmt, fmt_decompressed); mp_vprintf(&print, fmt_decompressed, ap); exc_pr.buf[exc_pr.len] = '\0'; diff --git a/supervisor/shared/translate.c b/supervisor/shared/translate.c index 187d5ff8a5..606f8fa91a 100644 --- a/supervisor/shared/translate.c +++ b/supervisor/shared/translate.c @@ -37,7 +37,7 @@ #include "supervisor/serial.h" void serial_write_compressed(const compressed_string_t* compressed) { - char decompressed[compressed->length]; + char decompressed[decompress_length(compressed)]; decompress(compressed, decompressed); serial_write(decompressed); } @@ -58,12 +58,22 @@ STATIC int put_utf8(char *buf, int u) { } } +uint16_t decompress_length(const compressed_string_t* compressed) { + if (compress_max_length_bits <= 8) { + return 1 + (compressed->data >> (8 - compress_max_length_bits)); + } else { + return 1 + ((compressed->data * 256 + compressed->tail[0]) >> (16 - compress_max_length_bits)); + } +} + char* decompress(const compressed_string_t* compressed, char* decompressed) { - uint8_t this_byte = 0; - uint8_t this_bit = 7; - uint8_t b = compressed->data[this_byte]; + uint8_t this_byte = compress_max_length_bits / 8; + uint8_t this_bit = 7 - compress_max_length_bits % 8; + uint8_t b = (&compressed->data)[this_byte]; + uint16_t length = decompress_length(compressed); + // Stop one early because the last byte is always NULL. 
- for (uint16_t i = 0; i < compressed->length - 1;) { + for (uint16_t i = 0; i < length - 1;) { uint32_t bits = 0; uint8_t bit_length = 0; uint32_t max_code = lengths[0]; @@ -78,7 +88,7 @@ char* decompress(const compressed_string_t* compressed, char* decompressed) { if (this_bit == 0) { this_bit = 7; this_byte += 1; - b = compressed->data[this_byte]; // This may read past the end but its never used. + b = (&compressed->data)[this_byte]; // This may read past the end but its never used. } else { this_bit -= 1; } @@ -91,14 +101,14 @@ char* decompress(const compressed_string_t* compressed, char* decompressed) { i += put_utf8(decompressed + i, values[searched_length + bits - max_code]); } - decompressed[compressed->length-1] = '\0'; + decompressed[length-1] = '\0'; return decompressed; } inline __attribute__((always_inline)) const compressed_string_t* translate(const char* original) { #ifndef NO_QSTR #define QDEF(id, str) - #define TRANSLATION(id, len, compressed...) if (strcmp(original, id) == 0) { static const compressed_string_t v = {.length = len, .data = compressed}; return &v; } else + #define TRANSLATION(id, firstbyte, ...) 
if (strcmp(original, id) == 0) { static const compressed_string_t v = { .data = firstbyte, .tail = { __VA_ARGS__ } }; return &v; } else #include "genhdr/qstrdefs.generated.h" #undef TRANSLATION #undef QDEF diff --git a/supervisor/shared/translate.h b/supervisor/shared/translate.h index 5e8acbb6af..067a44e1fc 100644 --- a/supervisor/shared/translate.h +++ b/supervisor/shared/translate.h @@ -30,12 +30,13 @@ #include typedef struct { - uint16_t length; - const uint8_t data[]; + uint8_t data; + const uint8_t tail[]; } compressed_string_t; const compressed_string_t* translate(const char* c); void serial_write_compressed(const compressed_string_t* compressed); char* decompress(const compressed_string_t* compressed, char* decompressed); +uint16_t decompress_length(const compressed_string_t* compressed); #endif // MICROPY_INCLUDED_SUPERVISOR_TRANSLATE_H