From e06a3bbceb548432969bfcfa1076dc4363478151 Mon Sep 17 00:00:00 2001 From: Jeff Epler Date: Mon, 2 Dec 2019 08:41:03 -0600 Subject: [PATCH] translation: Compress as unicode, not bytes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit By treating each unicode code-point as a single entity for huffman compression, the overall compression rate can be somewhat improved without changing the algorithm. On the decompression side, when compressed values above 127 are encountered, they need to be converted from a 16-bit Unicode code point into a UTF-8 byte sequence. Doing this returns approximately 1.5kB of flash storage with the zh_Latn_pinyin translation. (292 -> 1768 bytes remaining in my build of trinket_m0) Other "more ASCII" translations benefit less, and in fact zh_Latn_pinyin is no longer the most constrained translation! (de_DE 1156 -> 1384 bytes free in flash, I didn't check others before pushing for CI) English is slightly pessimized, 2840 -> 2788 bytes, probably mostly because the "values" array was changed from uint8_t to uint16_t, which is strictly not required for an all-ASCII translation. This could probably be avoided in this case, but as English is not the most constrained translation it doesn't really matter. Testing performed: built for feather nRF52840 express and trinket m0 in English and zh_Latn_pinyin; ran and verified the localized messages such as Àn xià rènhé jiàn jìnrù REPL. Shǐyòng CTRL-D chóngxīn jiāzài. and Press any key to enter the REPL. Use CTRL-D to reload. were properly displayed. --- py/makeqstrdata.py | 36 ++++++++++++++++------------------- supervisor/shared/translate.c | 20 +++++++++++++++++-- 2 files changed, 34 insertions(+), 22 deletions(-) diff --git a/py/makeqstrdata.py b/py/makeqstrdata.py index 8e3835d861..b9c0ce3e11 100644 --- a/py/makeqstrdata.py +++ b/py/makeqstrdata.py @@ -103,14 +103,10 @@ def compute_huffman_coding(translations, qstrs, compression_filename): # go through each qstr and print it out for _, _, qstr in qstrs.values(): all_strings.append(qstr) - all_strings_concat = "".join(all_strings).encode("utf-8") + all_strings_concat = "".join(all_strings) counts = collections.Counter(all_strings_concat) - # add other values - for i in range(256): - if i not in counts: - counts[i] = 0 cb = huffman.codebook(counts.items()) - values = bytearray() + values = [] length_count = {} renumbered = 0 last_l = None @@ -124,26 +120,26 @@ def compute_huffman_coding(translations, qstrs, compression_filename): if last_l: renumbered <<= (l - last_l) canonical[ch] = '{0:0{width}b}'.format(renumbered, width=l) - if chr(ch) in C_ESCAPES: - s = C_ESCAPES[chr(ch)] - else: - s = chr(ch) - print("//", ch, s, counts[ch], canonical[ch], renumbered) + s = C_ESCAPES.get(ch, ch) + print("//", ord(ch), s, counts[ch], canonical[ch], renumbered) renumbered += 1 last_l = l lengths = bytearray() - for i in range(1, max(length_count) + 1): + print("// length count", length_count) + for i in range(1, max(length_count) + 2): lengths.append(length_count.get(i, 0)) + print("// values", values, "lengths", len(lengths), lengths) + print("// estimated total memory size", len(lengths) + 2*len(values) + sum(len(cb[u]) for u in all_strings_concat)) print("//", values, lengths) with open(compression_filename, "w") as f: f.write("const uint8_t lengths[] = {{ {} }};\n".format(", ".join(map(str, lengths)))) - f.write("const uint8_t values[256] = {{ {} }};\n".format(", ".join(map(str, values)))) + f.write("const uint16_t values[] = {{ {} }};\n".format(", ".join(str(ord(u)) for u in values))) return values, lengths def decompress(encoding_table, length, encoded): values, lengths = encoding_table #print(l, encoded) - dec = bytearray(length) + dec = [] this_byte = 0 this_bit = 7 b = encoded[this_byte] @@ -173,14 +169,14 @@ def decompress(encoding_table, length, encoded): searched_length += lengths[bit_length] v = values[searched_length + bits - max_code] - dec[i] = v - return dec + dec.append(v) + return ''.join(dec) def compress(encoding_table, decompressed): - if not isinstance(decompressed, bytes): + if not isinstance(decompressed, str): raise TypeError() values, lengths = encoding_table - enc = bytearray(len(decompressed) * 2) + enc = bytearray(len(decompressed) * 3) #print(decompressed) #print(lengths) current_bit = 7 @@ -347,9 +343,9 @@ def print_qstr_data(encoding_table, qcfgs, qstrs, i18ns): total_text_compressed_size = 0 for original, translation in i18ns: translation_encoded = translation.encode("utf-8") - compressed = compress(encoding_table, translation_encoded) + compressed = compress(encoding_table, translation) total_text_compressed_size += len(compressed) - decompressed = decompress(encoding_table, len(translation_encoded), compressed).decode("utf-8") + decompressed = decompress(encoding_table, len(translation_encoded), compressed) for c in C_ESCAPES: decompressed = decompressed.replace(c, C_ESCAPES[c]) print("TRANSLATION(\"{}\", {}, {{ {} }}) // {}".format(original, len(translation_encoded)+1, ", ".join(["0x{:02x}".format(x) for x in compressed]), decompressed)) diff --git a/supervisor/shared/translate.c b/supervisor/shared/translate.c index aa5e4517c2..187d5ff8a5 100644 --- a/supervisor/shared/translate.c +++ b/supervisor/shared/translate.c @@ -42,12 +42,28 @@ void serial_write_compressed(const compressed_string_t* compressed) { serial_write(decompressed); } +STATIC int put_utf8(char *buf, int u) { + if(u <= 0x7f) { + *buf = u; + return 1; + } else if(u <= 0x07ff) { + *buf++ = 0b11000000 | (u >> 6); + *buf = 0b10000000 | (u & 0b00111111); + return 2; + } else { // u <= 0xffff) + *buf++ = 0b11000000 | (u >> 12); + *buf = 0b10000000 | ((u >> 6) & 0b00111111); + *buf = 0b10000000 | (u & 0b00111111); + return 3; + } +} + char* decompress(const compressed_string_t* compressed, char* decompressed) { uint8_t this_byte = 0; uint8_t this_bit = 7; uint8_t b = compressed->data[this_byte]; // Stop one early because the last byte is always NULL. - for (uint16_t i = 0; i < compressed->length - 1; i++) { + for (uint16_t i = 0; i < compressed->length - 1;) { uint32_t bits = 0; uint8_t bit_length = 0; uint32_t max_code = lengths[0]; @@ -72,7 +88,7 @@ char* decompress(const compressed_string_t* compressed, char* decompressed) { max_code = (max_code << 1) + lengths[bit_length]; searched_length += lengths[bit_length]; } - decompressed[i] = values[searched_length + bits - max_code]; + i += put_utf8(decompressed + i, values[searched_length + bits - max_code]); } decompressed[compressed->length-1] = '\0';