From 6265ee0e8ce0bf79f3cd73be4b15dc631a403a8b Mon Sep 17 00:00:00 2001
From: Scott Shawcroft
Date: Mon, 2 Dec 2019 17:11:49 -0800
Subject: [PATCH] cherry-pick jepler's huffman size reduction

---
 py/makeqstrdata.py            | 39 +++++++++++++++++------------------
 supervisor/shared/translate.c | 20 ++++++++++++++++--
 2 files changed, 37 insertions(+), 22 deletions(-)

diff --git a/py/makeqstrdata.py b/py/makeqstrdata.py
index 5b5ec1c378..0d667959d9 100644
--- a/py/makeqstrdata.py
+++ b/py/makeqstrdata.py
@@ -103,14 +103,10 @@ def compute_huffman_coding(translations, qstrs, compression_filename):
     # go through each qstr and print it out
     for _, _, qstr in qstrs.values():
         all_strings.append(qstr)
-    all_strings_concat = "".join(all_strings).encode("utf-8")
+    all_strings_concat = "".join(all_strings)
     counts = collections.Counter(all_strings_concat)
-    # add other values
-    for i in range(256):
-        if i not in counts:
-            counts[i] = 0
     cb = huffman.codebook(counts.items())
-    values = bytearray()
+    values = []
     length_count = {}
     renumbered = 0
     last_l = None
@@ -124,26 +120,27 @@ def compute_huffman_coding(translations, qstrs, compression_filename):
         if last_l:
             renumbered <<= (l - last_l)
         canonical[ch] = '{0:0{width}b}'.format(renumbered, width=l)
-        if chr(ch) in C_ESCAPES:
-            s = C_ESCAPES[chr(ch)]
-        else:
-            s = chr(ch)
-        print("//", ch, s, counts[ch], canonical[ch], renumbered)
+        s = C_ESCAPES.get(ch, ch)
+        print("//", ord(ch), s, counts[ch], canonical[ch], renumbered)
         renumbered += 1
         last_l = l
     lengths = bytearray()
-    for i in range(1, max(length_count) + 1):
+    print("// length count", length_count)
+    for i in range(1, max(length_count) + 2):
         lengths.append(length_count.get(i, 0))
+    print("// values", values, "lengths", len(lengths), lengths)
+    print("// estimated total memory size", len(lengths) + 2*len(values) + sum(len(cb[u]) for u in all_strings_concat))
     print("//", values, lengths)
+    values_type = "uint16_t" if max(ord(u) for u in values) > 255 else "uint8_t"
     with open(compression_filename, "w") as f:
         f.write("const uint8_t lengths[] = {{ {} }};\n".format(", ".join(map(str, lengths))))
-        f.write("const uint8_t values[256] = {{ {} }};\n".format(", ".join(map(str, values))))
+        f.write("const {} values[] = {{ {} }};\n".format(values_type, ", ".join(str(ord(u)) for u in values)))
     return values, lengths
 
 def decompress(encoding_table, length, encoded):
     values, lengths = encoding_table
     #print(l, encoded)
-    dec = bytearray(length)
+    dec = []
     this_byte = 0
     this_bit = 7
     b = encoded[this_byte]
@@ -173,14 +170,14 @@ def decompress(encoding_table, length, encoded):
             searched_length += lengths[bit_length]
 
         v = values[searched_length + bits - max_code]
-        dec[i] = v
-    return dec
+        dec.append(v)
+    return ''.join(dec)
 
 def compress(encoding_table, decompressed):
-    if not isinstance(decompressed, bytes):
+    if not isinstance(decompressed, str):
         raise TypeError()
     values, lengths = encoding_table
-    enc = bytearray(len(decompressed))
+    enc = bytearray(len(decompressed) * 3)
     #print(decompressed)
     #print(lengths)
     current_bit = 7
@@ -227,6 +224,8 @@ def compress(encoding_table, decompressed):
             current_bit -= 1
     if current_bit != 7:
         current_byte += 1
+    if current_byte > len(decompressed):
+        print("Note: compression increased length", repr(decompressed), len(decompressed), current_byte, file=sys.stderr)
     return enc[:current_byte]
 
 def qstr_escape(qst):
@@ -345,9 +344,9 @@ def print_qstr_data(encoding_table, qcfgs, qstrs, i18ns):
     total_text_compressed_size = 0
     for original, translation in i18ns:
         translation_encoded = translation.encode("utf-8")
-        compressed = compress(encoding_table, translation_encoded)
+        compressed = compress(encoding_table, translation)
         total_text_compressed_size += len(compressed)
-        decompressed = decompress(encoding_table, len(translation_encoded), compressed).decode("utf-8")
+        decompressed = decompress(encoding_table, len(translation_encoded), compressed)
         for c in C_ESCAPES:
             decompressed = decompressed.replace(c, C_ESCAPES[c])
         print("TRANSLATION(\"{}\", {}, {{ {} }}) // {}".format(original, len(translation_encoded)+1, ", ".join(["0x{:02x}".format(x) for x in compressed]), decompressed))
diff --git a/supervisor/shared/translate.c b/supervisor/shared/translate.c
index aa5e4517c2..187d5ff8a5 100644
--- a/supervisor/shared/translate.c
+++ b/supervisor/shared/translate.c
@@ -42,12 +42,28 @@ void serial_write_compressed(const compressed_string_t* compressed) {
     serial_write(decompressed);
 }
 
+STATIC int put_utf8(char *buf, int u) {
+    if(u <= 0x7f) {
+        *buf = u;
+        return 1;
+    } else if(u <= 0x07ff) {
+        *buf++ = 0b11000000 | (u >> 6);
+        *buf = 0b10000000 | (u & 0b00111111);
+        return 2;
+    } else { // u <= 0xffff
+        *buf++ = 0b11100000 | (u >> 12);
+        *buf++ = 0b10000000 | ((u >> 6) & 0b00111111);
+        *buf = 0b10000000 | (u & 0b00111111);
+        return 3;
+    }
+}
+
 char* decompress(const compressed_string_t* compressed, char* decompressed) {
     uint8_t this_byte = 0;
     uint8_t this_bit = 7;
     uint8_t b = compressed->data[this_byte];
     // Stop one early because the last byte is always NULL.
-    for (uint16_t i = 0; i < compressed->length - 1; i++) {
+    for (uint16_t i = 0; i < compressed->length - 1;) {
         uint32_t bits = 0;
         uint8_t bit_length = 0;
         uint32_t max_code = lengths[0];
@@ -72,7 +88,7 @@ char* decompress(const compressed_string_t* compressed, char* decompressed) {
             max_code = (max_code << 1) + lengths[bit_length];
             searched_length += lengths[bit_length];
         }
-        decompressed[i] = values[searched_length + bits - max_code];
+        i += put_utf8(decompressed + i, values[searched_length + bits - max_code]);
     }
     decompressed[compressed->length-1] = '\0';
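Both decompressors walk the same canonical-Huffman table that compute_huffman_coding() emits: lengths[i] counts the codes that are i+1 bits long, and values lists the decoded symbols in canonical order. Below is a minimal standalone sketch of that walk, not part of the patch, using an invented toy table (the real table is generated from the translation strings); it mirrors the loop shared by the Python decompress() and the C decompress().

# Sketch only: toy re-implementation of the canonical-Huffman walk.
# The table here is invented for illustration: 'e' -> 0, 't' -> 10, 'a' -> 11.
def decode(values, lengths, encoded, n_symbols):
    out = []
    this_byte, this_bit = 0, 7
    b = encoded[this_byte]
    for _ in range(n_symbols):
        bits = 0
        bit_length = 0
        max_code = lengths[0]         # one past the last code of the current length
        searched_length = lengths[0]  # entries of values[] covered so far
        while True:
            bits = (bits << 1) | (1 if b & 0x80 else 0)   # consume bits MSB-first
            b = (b << 1) & 0xFF
            bit_length += 1
            if this_bit == 0:
                this_bit = 7
                this_byte += 1
                if this_byte < len(encoded):
                    b = encoded[this_byte]
            else:
                this_bit -= 1
            if max_code > 0 and bits < max_code:
                break                 # bits names a valid code of bit_length bits
            max_code = (max_code << 1) + lengths[bit_length]
            searched_length += lengths[bit_length]
        out.append(values[searched_length + bits - max_code])
    return "".join(out)

values = ["e", "t", "a"]          # canonical order: shorter codes first
lengths = [1, 2, 0]               # one 1-bit code, two 2-bit codes, trailing pad
encoded = bytes([0b10011000])     # "tea" = 10 0 11, packed MSB-first
assert decode(values, lengths, encoded, 3) == "tea"

Because the table now stores Unicode code points rather than bytes, the C decoder advances the output index by put_utf8()'s return value instead of by one. The bit layout put_utf8 produces is plain UTF-8; as a quick sanity check against Python's own encoder (the code point is chosen arbitrarily for illustration):

u = 0x4E2D                                    # U+4E2D needs the three-byte form
manual = bytes([0b11100000 | (u >> 12),
                0b10000000 | ((u >> 6) & 0b00111111),
                0b10000000 | (u & 0b00111111)])
assert manual == chr(u).encode("utf-8")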