cherry-pick jepler's huffman size reduction

This commit is contained in:
Scott Shawcroft 2019-12-02 17:11:49 -08:00 committed by Dan Halbert
parent 3d14c32399
commit 6265ee0e8c
2 changed files with 37 additions and 22 deletions

View File

@ -103,14 +103,10 @@ def compute_huffman_coding(translations, qstrs, compression_filename):
# go through each qstr and print it out # go through each qstr and print it out
for _, _, qstr in qstrs.values(): for _, _, qstr in qstrs.values():
all_strings.append(qstr) all_strings.append(qstr)
all_strings_concat = "".join(all_strings).encode("utf-8") all_strings_concat = "".join(all_strings)
counts = collections.Counter(all_strings_concat) counts = collections.Counter(all_strings_concat)
# add other values
for i in range(256):
if i not in counts:
counts[i] = 0
cb = huffman.codebook(counts.items()) cb = huffman.codebook(counts.items())
values = bytearray() values = []
length_count = {} length_count = {}
renumbered = 0 renumbered = 0
last_l = None last_l = None
@ -124,26 +120,27 @@ def compute_huffman_coding(translations, qstrs, compression_filename):
if last_l: if last_l:
renumbered <<= (l - last_l) renumbered <<= (l - last_l)
canonical[ch] = '{0:0{width}b}'.format(renumbered, width=l) canonical[ch] = '{0:0{width}b}'.format(renumbered, width=l)
if chr(ch) in C_ESCAPES: s = C_ESCAPES.get(ch, ch)
s = C_ESCAPES[chr(ch)] print("//", ord(ch), s, counts[ch], canonical[ch], renumbered)
else:
s = chr(ch)
print("//", ch, s, counts[ch], canonical[ch], renumbered)
renumbered += 1 renumbered += 1
last_l = l last_l = l
lengths = bytearray() lengths = bytearray()
for i in range(1, max(length_count) + 1): print("// length count", length_count)
for i in range(1, max(length_count) + 2):
lengths.append(length_count.get(i, 0)) lengths.append(length_count.get(i, 0))
print("// values", values, "lengths", len(lengths), lengths)
print("// estimated total memory size", len(lengths) + 2*len(values) + sum(len(cb[u]) for u in all_strings_concat))
print("//", values, lengths) print("//", values, lengths)
values_type = "uint16_t" if max(ord(u) for u in values) > 255 else "uint8_t"
with open(compression_filename, "w") as f: with open(compression_filename, "w") as f:
f.write("const uint8_t lengths[] = {{ {} }};\n".format(", ".join(map(str, lengths)))) f.write("const uint8_t lengths[] = {{ {} }};\n".format(", ".join(map(str, lengths))))
f.write("const uint8_t values[256] = {{ {} }};\n".format(", ".join(map(str, values)))) f.write("const {} values[] = {{ {} }};\n".format(values_type, ", ".join(str(ord(u)) for u in values)))
return values, lengths return values, lengths
def decompress(encoding_table, length, encoded): def decompress(encoding_table, length, encoded):
values, lengths = encoding_table values, lengths = encoding_table
#print(l, encoded) #print(l, encoded)
dec = bytearray(length) dec = []
this_byte = 0 this_byte = 0
this_bit = 7 this_bit = 7
b = encoded[this_byte] b = encoded[this_byte]
@ -173,14 +170,14 @@ def decompress(encoding_table, length, encoded):
searched_length += lengths[bit_length] searched_length += lengths[bit_length]
v = values[searched_length + bits - max_code] v = values[searched_length + bits - max_code]
dec[i] = v dec.append(v)
return dec return ''.join(dec)
def compress(encoding_table, decompressed): def compress(encoding_table, decompressed):
if not isinstance(decompressed, bytes): if not isinstance(decompressed, str):
raise TypeError() raise TypeError()
values, lengths = encoding_table values, lengths = encoding_table
enc = bytearray(len(decompressed)) enc = bytearray(len(decompressed) * 3)
#print(decompressed) #print(decompressed)
#print(lengths) #print(lengths)
current_bit = 7 current_bit = 7
@ -227,6 +224,8 @@ def compress(encoding_table, decompressed):
current_bit -= 1 current_bit -= 1
if current_bit != 7: if current_bit != 7:
current_byte += 1 current_byte += 1
if current_byte > len(decompressed):
print("Note: compression increased length", repr(decompressed), len(decompressed), current_byte, file=sys.stderr)
return enc[:current_byte] return enc[:current_byte]
def qstr_escape(qst): def qstr_escape(qst):
@ -345,9 +344,9 @@ def print_qstr_data(encoding_table, qcfgs, qstrs, i18ns):
total_text_compressed_size = 0 total_text_compressed_size = 0
for original, translation in i18ns: for original, translation in i18ns:
translation_encoded = translation.encode("utf-8") translation_encoded = translation.encode("utf-8")
compressed = compress(encoding_table, translation_encoded) compressed = compress(encoding_table, translation)
total_text_compressed_size += len(compressed) total_text_compressed_size += len(compressed)
decompressed = decompress(encoding_table, len(translation_encoded), compressed).decode("utf-8") decompressed = decompress(encoding_table, len(translation_encoded), compressed)
for c in C_ESCAPES: for c in C_ESCAPES:
decompressed = decompressed.replace(c, C_ESCAPES[c]) decompressed = decompressed.replace(c, C_ESCAPES[c])
print("TRANSLATION(\"{}\", {}, {{ {} }}) // {}".format(original, len(translation_encoded)+1, ", ".join(["0x{:02x}".format(x) for x in compressed]), decompressed)) print("TRANSLATION(\"{}\", {}, {{ {} }}) // {}".format(original, len(translation_encoded)+1, ", ".join(["0x{:02x}".format(x) for x in compressed]), decompressed))

View File

@ -42,12 +42,28 @@ void serial_write_compressed(const compressed_string_t* compressed) {
serial_write(decompressed); serial_write(decompressed);
} }
// Encode the code point `u` as UTF-8 into `buf` and return the number of
// bytes written (1, 2 or 3).
//
// `buf` must have room for up to 3 bytes; no NUL terminator is appended.
// Only code points up to 0xffff are supported: anything larger falls into
// the three-byte branch, which cannot represent it.
STATIC int put_utf8(char *buf, int u) {
    if (u <= 0x7f) {
        // One byte: 0xxxxxxx
        *buf = u;
        return 1;
    } else if (u <= 0x07ff) {
        // Two bytes: 110xxxxx 10xxxxxx
        *buf++ = 0xc0 | (u >> 6);
        *buf = 0x80 | (u & 0x3f);
        return 2;
    } else { // u <= 0xffff
        // Three bytes: 1110xxxx 10xxxxxx 10xxxxxx
        // The lead byte must carry the 1110 prefix (0xe0), not the two-byte
        // prefix, and each continuation byte needs its own output slot.
        *buf++ = 0xe0 | (u >> 12);
        *buf++ = 0x80 | ((u >> 6) & 0x3f);
        *buf = 0x80 | (u & 0x3f);
        return 3;
    }
}
char* decompress(const compressed_string_t* compressed, char* decompressed) { char* decompress(const compressed_string_t* compressed, char* decompressed) {
uint8_t this_byte = 0; uint8_t this_byte = 0;
uint8_t this_bit = 7; uint8_t this_bit = 7;
uint8_t b = compressed->data[this_byte]; uint8_t b = compressed->data[this_byte];
// Stop one early because the last byte is always NULL. // Stop one early because the last byte is always NULL.
for (uint16_t i = 0; i < compressed->length - 1; i++) { for (uint16_t i = 0; i < compressed->length - 1;) {
uint32_t bits = 0; uint32_t bits = 0;
uint8_t bit_length = 0; uint8_t bit_length = 0;
uint32_t max_code = lengths[0]; uint32_t max_code = lengths[0];
@ -72,7 +88,7 @@ char* decompress(const compressed_string_t* compressed, char* decompressed) {
max_code = (max_code << 1) + lengths[bit_length]; max_code = (max_code << 1) + lengths[bit_length];
searched_length += lengths[bit_length]; searched_length += lengths[bit_length];
} }
decompressed[i] = values[searched_length + bits - max_code]; i += put_utf8(decompressed + i, values[searched_length + bits - max_code]);
} }
decompressed[compressed->length-1] = '\0'; decompressed[compressed->length-1] = '\0';