Merge pull request #2345 from jepler/compressed-unicode

translation: Compress as unicode, not bytes
This commit is contained in:
Scott Shawcroft 2019-12-02 17:11:49 -08:00 committed by GitHub
commit 15886b1505
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 36 additions and 23 deletions

View File

@ -103,14 +103,10 @@ def compute_huffman_coding(translations, qstrs, compression_filename):
# go through each qstr and print it out
for _, _, qstr in qstrs.values():
all_strings.append(qstr)
all_strings_concat = "".join(all_strings).encode("utf-8")
all_strings_concat = "".join(all_strings)
counts = collections.Counter(all_strings_concat)
# add other values
for i in range(256):
if i not in counts:
counts[i] = 0
cb = huffman.codebook(counts.items())
values = bytearray()
values = []
length_count = {}
renumbered = 0
last_l = None
@ -124,26 +120,27 @@ def compute_huffman_coding(translations, qstrs, compression_filename):
if last_l:
renumbered <<= (l - last_l)
canonical[ch] = '{0:0{width}b}'.format(renumbered, width=l)
if chr(ch) in C_ESCAPES:
s = C_ESCAPES[chr(ch)]
else:
s = chr(ch)
print("//", ch, s, counts[ch], canonical[ch], renumbered)
s = C_ESCAPES.get(ch, ch)
print("//", ord(ch), s, counts[ch], canonical[ch], renumbered)
renumbered += 1
last_l = l
lengths = bytearray()
for i in range(1, max(length_count) + 1):
print("// length count", length_count)
for i in range(1, max(length_count) + 2):
lengths.append(length_count.get(i, 0))
print("// values", values, "lengths", len(lengths), lengths)
print("// estimated total memory size", len(lengths) + 2*len(values) + sum(len(cb[u]) for u in all_strings_concat))
print("//", values, lengths)
values_type = "uint16_t" if max(ord(u) for u in values) > 255 else "uint8_t"
with open(compression_filename, "w") as f:
f.write("const uint8_t lengths[] = {{ {} }};\n".format(", ".join(map(str, lengths))))
f.write("const uint8_t values[256] = {{ {} }};\n".format(", ".join(map(str, values))))
f.write("const {} values[] = {{ {} }};\n".format(values_type, ", ".join(str(ord(u)) for u in values)))
return values, lengths
def decompress(encoding_table, length, encoded):
values, lengths = encoding_table
#print(l, encoded)
dec = bytearray(length)
dec = []
this_byte = 0
this_bit = 7
b = encoded[this_byte]
@ -173,14 +170,14 @@ def decompress(encoding_table, length, encoded):
searched_length += lengths[bit_length]
v = values[searched_length + bits - max_code]
dec[i] = v
return dec
dec.append(v)
return ''.join(dec)
def compress(encoding_table, decompressed):
if not isinstance(decompressed, bytes):
if not isinstance(decompressed, str):
raise TypeError()
values, lengths = encoding_table
enc = bytearray(len(decompressed) * 2)
enc = bytearray(len(decompressed) * 3)
#print(decompressed)
#print(lengths)
current_bit = 7
@ -228,7 +225,7 @@ def compress(encoding_table, decompressed):
if current_bit != 7:
current_byte += 1
if current_byte > len(decompressed):
print("Note: compression increased length", repr(decompressed.decode('utf-8')), len(decompressed), current_byte, file=sys.stderr)
print("Note: compression increased length", repr(decompressed), len(decompressed), current_byte, file=sys.stderr)
return enc[:current_byte]
def qstr_escape(qst):
@ -347,9 +344,9 @@ def print_qstr_data(encoding_table, qcfgs, qstrs, i18ns):
total_text_compressed_size = 0
for original, translation in i18ns:
translation_encoded = translation.encode("utf-8")
compressed = compress(encoding_table, translation_encoded)
compressed = compress(encoding_table, translation)
total_text_compressed_size += len(compressed)
decompressed = decompress(encoding_table, len(translation_encoded), compressed).decode("utf-8")
decompressed = decompress(encoding_table, len(translation_encoded), compressed)
for c in C_ESCAPES:
decompressed = decompressed.replace(c, C_ESCAPES[c])
print("TRANSLATION(\"{}\", {}, {{ {} }}) // {}".format(original, len(translation_encoded)+1, ", ".join(["0x{:02x}".format(x) for x in compressed]), decompressed))

View File

@ -42,12 +42,28 @@ void serial_write_compressed(const compressed_string_t* compressed) {
serial_write(decompressed);
}
STATIC int put_utf8(char *buf, int u) {
if(u <= 0x7f) {
*buf = u;
return 1;
} else if(u <= 0x07ff) {
*buf++ = 0b11000000 | (u >> 6);
*buf = 0b10000000 | (u & 0b00111111);
return 2;
} else { // u <= 0xffff)
*buf++ = 0b11000000 | (u >> 12);
*buf = 0b10000000 | ((u >> 6) & 0b00111111);
*buf = 0b10000000 | (u & 0b00111111);
return 3;
}
}
char* decompress(const compressed_string_t* compressed, char* decompressed) {
uint8_t this_byte = 0;
uint8_t this_bit = 7;
uint8_t b = compressed->data[this_byte];
// Stop one early because the last byte is always NULL.
for (uint16_t i = 0; i < compressed->length - 1; i++) {
for (uint16_t i = 0; i < compressed->length - 1;) {
uint32_t bits = 0;
uint8_t bit_length = 0;
uint32_t max_code = lengths[0];
@ -72,7 +88,7 @@ char* decompress(const compressed_string_t* compressed, char* decompressed) {
max_code = (max_code << 1) + lengths[bit_length];
searched_length += lengths[bit_length];
}
decompressed[i] = values[searched_length + bits - max_code];
i += put_utf8(decompressed + i, values[searched_length + bits - max_code]);
}
decompressed[compressed->length-1] = '\0';