cherry-pick jepler's huffman size reduction
parent 3d14c32399
commit 6265ee0e8c
@@ -103,14 +103,10 @@ def compute_huffman_coding(translations, qstrs, compression_filename):
     # go through each qstr and print it out
     for _, _, qstr in qstrs.values():
         all_strings.append(qstr)
-    all_strings_concat = "".join(all_strings).encode("utf-8")
+    all_strings_concat = "".join(all_strings)
     counts = collections.Counter(all_strings_concat)
-    # add other values
-    for i in range(256):
-        if i not in counts:
-            counts[i] = 0
     cb = huffman.codebook(counts.items())
-    values = bytearray()
+    values = []
     length_count = {}
     renumbered = 0
     last_l = None
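
This hunk switches the Huffman corpus from UTF-8 bytes to Unicode characters: the counter now sees one symbol per character, so a multi-byte character gets a single code, and the old padding of all 256 byte values is no longer needed. A minimal sketch of the new counting step (the sample strings are made up; huffman is the same third-party module the script already imports):

    import collections
    import huffman

    sample = ["Dateien auf", "löschen"]     # made-up stand-ins for translation strings
    corpus = "".join(sample)                # a str, not bytes -- no .encode("utf-8")
    counts = collections.Counter(corpus)    # one count per character, e.g. counts["ö"]
    cb = huffman.codebook(counts.items())   # one bit-string code per character
    print(cb["ö"])                          # e.g. '0110' -- a single code for a 2-byte character
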
@@ -124,26 +120,27 @@ def compute_huffman_coding(translations, qstrs, compression_filename):
         if last_l:
             renumbered <<= (l - last_l)
         canonical[ch] = '{0:0{width}b}'.format(renumbered, width=l)
-        if chr(ch) in C_ESCAPES:
-            s = C_ESCAPES[chr(ch)]
-        else:
-            s = chr(ch)
-        print("//", ch, s, counts[ch], canonical[ch], renumbered)
+        s = C_ESCAPES.get(ch, ch)
+        print("//", ord(ch), s, counts[ch], canonical[ch], renumbered)
         renumbered += 1
         last_l = l
     lengths = bytearray()
-    for i in range(1, max(length_count) + 1):
+    print("// length count", length_count)
+    for i in range(1, max(length_count) + 2):
         lengths.append(length_count.get(i, 0))
+    print("// values", values, "lengths", len(lengths), lengths)
+    print("// estimated total memory size", len(lengths) + 2*len(values) + sum(len(cb[u]) for u in all_strings_concat))
     print("//", values, lengths)
+    values_type = "uint16_t" if max(ord(u) for u in values) > 255 else "uint8_t"
     with open(compression_filename, "w") as f:
         f.write("const uint8_t lengths[] = {{ {} }};\n".format(", ".join(map(str, lengths))))
-        f.write("const uint8_t values[256] = {{ {} }};\n".format(", ".join(map(str, values))))
+        f.write("const {} values[] = {{ {} }};\n".format(values_type, ", ".join(str(ord(u)) for u in values)))
     return values, lengths

 def decompress(encoding_table, length, encoded):
     values, lengths = encoding_table
     #print(l, encoded)
-    dec = bytearray(length)
+    dec = []
     this_byte = 0
     this_bit = 7
     b = encoded[this_byte]
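
The canonical-code construction is unchanged, but the hunk adds more diagnostics, appends one extra (zero) entry to the per-length table, and emits the symbol table as uint16_t when any code point exceeds 255. Canonical coding is what keeps the stored data small: only the number of codes of each length (lengths) and the symbols in code order (values) are needed to rebuild every code. A small sketch of the renumbering rule used above (symbols and code lengths are made up):

    def canonical_codes(code_lengths):
        # code_lengths: symbol -> bit length, e.g. {ch: len(code) for ch, code in cb.items()}
        codes = {}
        renumbered = 0
        last_l = None
        for sym, l in sorted(code_lengths.items(), key=lambda x: (x[1], x[0])):
            if last_l:
                renumbered <<= (l - last_l)   # shift left whenever the code length grows
            codes[sym] = '{0:0{width}b}'.format(renumbered, width=l)
            renumbered += 1
            last_l = l
        return codes

    print(canonical_codes({'e': 2, 'n': 2, 'a': 3, 'ö': 3}))
    # {'e': '00', 'n': '01', 'a': '100', 'ö': '101'}
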
@@ -173,14 +170,14 @@ def decompress(encoding_table, length, encoded):
             searched_length += lengths[bit_length]

         v = values[searched_length + bits - max_code]
-        dec[i] = v
-    return dec
+        dec.append(v)
+    return ''.join(dec)

 def compress(encoding_table, decompressed):
-    if not isinstance(decompressed, bytes):
+    if not isinstance(decompressed, str):
         raise TypeError()
     values, lengths = encoding_table
-    enc = bytearray(len(decompressed))
+    enc = bytearray(len(decompressed) * 3)
     #print(decompressed)
     #print(lengths)
     current_bit = 7
@@ -227,6 +224,8 @@ def compress(encoding_table, decompressed):
             current_bit -= 1
     if current_bit != 7:
         current_byte += 1
+    if current_byte > len(decompressed):
+        print("Note: compression increased length", repr(decompressed), len(decompressed), current_byte, file=sys.stderr)
     return enc[:current_byte]

 def qstr_escape(qst):
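
compress() now takes a str, and a rare character's Huffman code can be longer than 8 bits, so the packed output can exceed the character count; that is presumably why enc is over-allocated (* 3) and why the note above is printed to stderr when current_byte exceeds len(decompressed). A small illustration of how skewed counts push a rare symbol past one byte (the weights are made up):

    import huffman

    # Deliberately skewed (Fibonacci-like) symbol weights: the rarest symbol
    # ends up with a 10-bit code, i.e. more than one byte per character.
    weights = {chr(ord('a') + i): w
               for i, w in enumerate([1, 1, 2, 3, 5, 8, 13, 21, 34, 55, 89])}
    cb = huffman.codebook(weights.items())
    print(len(cb['a']))   # 10 bits for the rarest symbol -- longer than a byte
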
@@ -345,9 +344,9 @@ def print_qstr_data(encoding_table, qcfgs, qstrs, i18ns):
     total_text_compressed_size = 0
     for original, translation in i18ns:
         translation_encoded = translation.encode("utf-8")
-        compressed = compress(encoding_table, translation_encoded)
+        compressed = compress(encoding_table, translation)
         total_text_compressed_size += len(compressed)
-        decompressed = decompress(encoding_table, len(translation_encoded), compressed).decode("utf-8")
+        decompressed = decompress(encoding_table, len(translation_encoded), compressed)
         for c in C_ESCAPES:
             decompressed = decompressed.replace(c, C_ESCAPES[c])
         print("TRANSLATION(\"{}\", {}, {{ {} }}) // {}".format(original, len(translation_encoded)+1, ", ".join(["0x{:02x}".format(x) for x in compressed]), decompressed))
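
Compression and the round-trip check now operate on the str directly, but the generated TRANSLATION() entry still records len(translation_encoded)+1: the C decompressor below writes UTF-8 bytes plus a trailing NUL into the destination buffer, so the byte length is what matters there. For non-ASCII translations the two counts differ (made-up example):

    s = "löschen"                  # made-up translation string
    print(len(s))                  # 7 characters (Huffman symbols)
    print(len(s.encode("utf-8")))  # 8 bytes -- what the C-side buffer must hold, plus 1 for '\0'
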
@@ -42,12 +42,28 @@ void serial_write_compressed(const compressed_string_t* compressed) {
     serial_write(decompressed);
 }

+STATIC int put_utf8(char *buf, int u) {
+    if(u <= 0x7f) {
+        *buf = u;
+        return 1;
+    } else if(u <= 0x07ff) {
+        *buf++ = 0b11000000 | (u >> 6);
+        *buf = 0b10000000 | (u & 0b00111111);
+        return 2;
+    } else { // u <= 0xffff
+        *buf++ = 0b11100000 | (u >> 12);
+        *buf++ = 0b10000000 | ((u >> 6) & 0b00111111);
+        *buf = 0b10000000 | (u & 0b00111111);
+        return 3;
+    }
+}
+
 char* decompress(const compressed_string_t* compressed, char* decompressed) {
     uint8_t this_byte = 0;
     uint8_t this_bit = 7;
     uint8_t b = compressed->data[this_byte];
     // Stop one early because the last byte is always NULL.
-    for (uint16_t i = 0; i < compressed->length - 1; i++) {
+    for (uint16_t i = 0; i < compressed->length - 1;) {
         uint32_t bits = 0;
         uint8_t bit_length = 0;
         uint32_t max_code = lengths[0];
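
put_utf8() re-encodes a decoded code point (up to U+FFFF) as UTF-8 on the fly and reports how many bytes it wrote, which is why the loop counter is no longer incremented in the for statement. A quick sanity check of the same bit layout, written in Python for brevity (the helper name is only for this sketch):

    def put_utf8_py(u):
        # Mirrors the C helper: 1, 2 or 3 bytes for code points up to U+FFFF.
        if u <= 0x7f:
            return bytes([u])
        elif u <= 0x07ff:
            return bytes([0b11000000 | (u >> 6),
                          0b10000000 | (u & 0b00111111)])
        else:  # u <= 0xffff
            return bytes([0b11100000 | (u >> 12),
                          0b10000000 | ((u >> 6) & 0b00111111),
                          0b10000000 | (u & 0b00111111)])

    for ch in "aöは":
        assert put_utf8_py(ord(ch)) == ch.encode("utf-8")
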
@@ -72,7 +88,7 @@ char* decompress(const compressed_string_t* compressed, char* decompressed) {
             max_code = (max_code << 1) + lengths[bit_length];
             searched_length += lengths[bit_length];
         }
-        decompressed[i] = values[searched_length + bits - max_code];
+        i += put_utf8(decompressed + i, values[searched_length + bits - max_code]);
     }

     decompressed[compressed->length-1] = '\0';