Merge pull request #2968 from jepler/more-efficient-translation

More efficient translation
This commit is contained in:
Dan Halbert 2020-05-28 21:37:45 -04:00 committed by GitHub
commit 6ace4ee7e8
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 104 additions and 27 deletions

2
main.c
View File

@ -185,7 +185,7 @@ bool maybe_run_list(const char ** filenames, pyexec_result_t* exec_result) {
}
mp_hal_stdout_tx_str(filename);
const compressed_string_t* compressed = translate(" output:\n");
char decompressed[compressed->length];
char decompressed[decompress_length(compressed)];
decompress(compressed, decompressed);
mp_hal_stdout_tx_str(decompressed);
pyexec_file(filename, exec_result);

View File

@ -135,7 +135,7 @@ STATIC void mp_help_print_modules(void) {
// let the user know there may be other modules available from the filesystem
const compressed_string_t* compressed = translate("Plus any modules on the filesystem\n");
char decompressed[compressed->length];
char decompressed[decompress_length(compressed)];
decompress(compressed, decompressed);
mp_print_str(MP_PYTHON_PRINTER, decompressed);
}
@ -181,7 +181,7 @@ STATIC mp_obj_t mp_builtin_help(size_t n_args, const mp_obj_t *args) {
// print a general help message. Translate only works on single strings on one line.
const compressed_string_t* compressed =
translate("Welcome to Adafruit CircuitPython %s!\n\nPlease visit learn.adafruit.com/category/circuitpython for project guides.\n\nTo list built-in modules please do `help(\"modules\")`.\n");
char decompressed[compressed->length];
char decompressed[decompress_length(compressed)];
decompress(compressed, decompressed);
mp_printf(MP_PYTHON_PRINTER, decompressed, MICROPY_GIT_TAG);
} else {

View File

@ -1,7 +1,10 @@
"""
Process raw qstr file and output qstr data with length, hash and data bytes.
This script works with Python 2.6, 2.7, 3.3 and 3.4.
This script works with Python 2.7, 3.3 and 3.4.
For documentation about the format of compressed translated strings, see
supervisor/shared/translate.h
"""
from __future__ import print_function
@ -132,19 +135,37 @@ def compute_huffman_coding(translations, qstrs, compression_filename):
print("// estimated total memory size", len(lengths) + 2*len(values) + sum(len(cb[u]) for u in all_strings_concat))
print("//", values, lengths)
values_type = "uint16_t" if max(ord(u) for u in values) > 255 else "uint8_t"
max_translation_encoded_length = max(len(translation.encode("utf-8")) for original,translation in translations)
with open(compression_filename, "w") as f:
f.write("const uint8_t lengths[] = {{ {} }};\n".format(", ".join(map(str, lengths))))
f.write("const {} values[] = {{ {} }};\n".format(values_type, ", ".join(str(ord(u)) for u in values)))
f.write("#define compress_max_length_bits ({})\n".format(max_translation_encoded_length.bit_length()))
return values, lengths
def decompress(encoding_table, length, encoded):
def decompress(encoding_table, encoded, encoded_length_bits):
values, lengths = encoding_table
#print(l, encoded)
dec = []
this_byte = 0
this_bit = 7
b = encoded[this_byte]
for i in range(length):
bits = 0
for i in range(encoded_length_bits):
bits <<= 1
if 0x80 & b:
bits |= 1
b <<= 1
if this_bit == 0:
this_bit = 7
this_byte += 1
if this_byte < len(encoded):
b = encoded[this_byte]
else:
this_bit -= 1
length = bits
i = 0
while i < length:
bits = 0
bit_length = 0
max_code = lengths[0]
@ -170,10 +191,11 @@ def decompress(encoding_table, length, encoded):
searched_length += lengths[bit_length]
v = values[searched_length + bits - max_code]
i += len(v.encode('utf-8'))
dec.append(v)
return ''.join(dec)
def compress(encoding_table, decompressed):
def compress(encoding_table, decompressed, encoded_length_bits, len_translation_encoded):
if not isinstance(decompressed, str):
raise TypeError()
values, lengths = encoding_table
@ -182,6 +204,19 @@ def compress(encoding_table, decompressed):
#print(lengths)
current_bit = 7
current_byte = 0
code = len_translation_encoded
bits = encoded_length_bits+1
for i in range(bits - 1, 0, -1):
if len_translation_encoded & (1 << (i - 1)):
enc[current_byte] |= 1 << current_bit
if current_bit == 0:
current_bit = 7
#print("packed {0:0{width}b}".format(enc[current_byte], width=8))
current_byte += 1
else:
current_bit -= 1
for c in decompressed:
#print()
#print("char", c, values.index(c))
@ -342,14 +377,17 @@ def print_qstr_data(encoding_table, qcfgs, qstrs, i18ns):
total_text_size = 0
total_text_compressed_size = 0
max_translation_encoded_length = max(len(translation.encode("utf-8")) for original, translation in i18ns)
encoded_length_bits = max_translation_encoded_length.bit_length()
for original, translation in i18ns:
translation_encoded = translation.encode("utf-8")
compressed = compress(encoding_table, translation)
compressed = compress(encoding_table, translation, encoded_length_bits, len(translation_encoded))
total_text_compressed_size += len(compressed)
decompressed = decompress(encoding_table, len(translation_encoded), compressed)
decompressed = decompress(encoding_table, compressed, encoded_length_bits)
assert decompressed == translation
for c in C_ESCAPES:
decompressed = decompressed.replace(c, C_ESCAPES[c])
print("TRANSLATION(\"{}\", {}, {{ {} }}) // {}".format(original, len(translation_encoded)+1, ", ".join(["0x{:02x}".format(x) for x in compressed]), decompressed))
print("TRANSLATION(\"{}\", {}) // {}".format(original, ", ".join(["{:d}".format(x) for x in compressed]), decompressed))
total_text_size += len(translation.encode("utf-8"))
print()
@ -385,6 +423,7 @@ if __name__ == "__main__":
qcfgs, qstrs, i18ns = parse_input_headers(args.infiles)
if args.translation:
i18ns = sorted(i18ns)
translations = translate(args.translation, i18ns)
encoding_table = compute_huffman_coding(translations, qstrs, args.compression_filename)
print_qstr_data(encoding_table, qcfgs, qstrs, translations)

View File

@ -158,7 +158,7 @@ const char *mp_common_errno_to_str(mp_obj_t errno_val, char *buf, size_t len) {
case ENOSPC: desc = translate("No space left on device"); break;
case EROFS: desc = translate("Read-only filesystem"); break;
}
if (desc != NULL && desc->length <= len) {
if (desc != NULL && decompress_length(desc) <= len) {
decompress(desc, buf);
return buf;
}

View File

@ -94,17 +94,17 @@ void mp_obj_print_exception(const mp_print_t *print, mp_obj_t exc) {
assert(n % 3 == 0);
// Decompress the format strings
const compressed_string_t* traceback = translate("Traceback (most recent call last):\n");
char decompressed[traceback->length];
char decompressed[decompress_length(traceback)];
decompress(traceback, decompressed);
#if MICROPY_ENABLE_SOURCE_LINE
const compressed_string_t* frame = translate(" File \"%q\", line %d");
#else
const compressed_string_t* frame = translate(" File \"%q\"");
#endif
char decompressed_frame[frame->length];
char decompressed_frame[decompress_length(frame)];
decompress(frame, decompressed_frame);
const compressed_string_t* block_fmt = translate(", in %q\n");
char decompressed_block[block_fmt->length];
char decompressed_block[decompress_length(block_fmt)];
decompress(block_fmt, decompressed_block);
// Print the traceback

View File

@ -400,7 +400,7 @@ mp_obj_t mp_obj_new_exception_msg_vlist(const mp_obj_type_t *exc_type, const com
// Try to allocate memory for the message
mp_obj_str_t *o_str = m_new_obj_maybe(mp_obj_str_t);
size_t o_str_alloc = fmt->length + 1;
size_t o_str_alloc = decompress_length(fmt);
byte *o_str_buf = m_new_maybe(byte, o_str_alloc);
bool used_emg_buf = false;
@ -433,7 +433,7 @@ mp_obj_t mp_obj_new_exception_msg_vlist(const mp_obj_type_t *exc_type, const com
// We have some memory to format the string
struct _exc_printer_t exc_pr = {!used_emg_buf, o_str_alloc, 0, o_str_buf};
mp_print_t print = {&exc_pr, exc_add_strn};
char fmt_decompressed[fmt->length];
char fmt_decompressed[decompress_length(fmt)];
decompress(fmt, fmt_decompressed);
mp_vprintf(&print, fmt_decompressed, ap);
exc_pr.buf[exc_pr.len] = '\0';

View File

@ -37,7 +37,7 @@
#include "supervisor/serial.h"
void serial_write_compressed(const compressed_string_t* compressed) {
char decompressed[compressed->length];
char decompressed[decompress_length(compressed)];
decompress(compressed, decompressed);
serial_write(decompressed);
}
@ -58,12 +58,22 @@ STATIC int put_utf8(char *buf, int u) {
}
}
// Return the buffer size decompress() needs for this string: the length
// value stored at the front of the compressed data, plus one.
uint16_t decompress_length(const compressed_string_t* compressed) {
    // The length occupies the top compress_max_length_bits bits of the
    // stream. When that is wider than one byte, combine the first two bytes
    // before shifting the field down.
    if (compress_max_length_bits > 8) {
        int first_two = compressed->data * 256 + compressed->tail[0];
        return 1 + (first_two >> (16 - compress_max_length_bits));
    }
    // Otherwise the whole field sits in the upper bits of the first byte.
    return 1 + (compressed->data >> (8 - compress_max_length_bits));
}
char* decompress(const compressed_string_t* compressed, char* decompressed) {
uint8_t this_byte = 0;
uint8_t this_bit = 7;
uint8_t b = compressed->data[this_byte];
uint8_t this_byte = compress_max_length_bits / 8;
uint8_t this_bit = 7 - compress_max_length_bits % 8;
uint8_t b = (&compressed->data)[this_byte];
uint16_t length = decompress_length(compressed);
// Stop one early because the last byte is always NULL.
for (uint16_t i = 0; i < compressed->length - 1;) {
for (uint16_t i = 0; i < length - 1;) {
uint32_t bits = 0;
uint8_t bit_length = 0;
uint32_t max_code = lengths[0];
@ -78,7 +88,7 @@ char* decompress(const compressed_string_t* compressed, char* decompressed) {
if (this_bit == 0) {
this_bit = 7;
this_byte += 1;
b = compressed->data[this_byte]; // This may read past the end but its never used.
b = (&compressed->data)[this_byte]; // This may read past the end but its never used.
} else {
this_bit -= 1;
}
@ -91,14 +101,14 @@ char* decompress(const compressed_string_t* compressed, char* decompressed) {
i += put_utf8(decompressed + i, values[searched_length + bits - max_code]);
}
decompressed[compressed->length-1] = '\0';
decompressed[length-1] = '\0';
return decompressed;
}
inline __attribute__((always_inline)) const compressed_string_t* translate(const char* original) {
#ifndef NO_QSTR
#define QDEF(id, str)
#define TRANSLATION(id, len, compressed...) if (strcmp(original, id) == 0) { static const compressed_string_t v = {.length = len, .data = compressed}; return &v; } else
#define TRANSLATION(id, firstbyte, ...) if (strcmp(original, id) == 0) { static const compressed_string_t v = { .data = firstbyte, .tail = { __VA_ARGS__ } }; return &v; } else
#include "genhdr/qstrdefs.generated.h"
#undef TRANSLATION
#undef QDEF

View File

@ -29,13 +29,41 @@
#include <stdint.h>
// The format of the compressed data is:
// - the size of the uncompressed string in UTF-8 bytes, encoded as a
// (compress_max_length_bits)-bit number. compress_max_length_bits is
// computed during dictionary generation time, and happens to be 8
// for all current platforms. However, it'll probably end up being
// 9 in some translations sometime in the future. This length excludes
// the trailing NUL, though notably decompress_length includes it.
//
// - followed by the huffman encoding of the individual UTF-16 code
// points that make up the string. The trailing "\0" is not
// represented by a huffman code, but is implied by the length.
// (building the huffman encoding on UTF-16 code points gave better
// compression than building it on UTF-8 bytes)
//
// The "data" / "tail" construct is so that the struct's last member is a
// "flexible array". However, the _only_ member is not permitted to be
// a flexible member, so we have to declare the first byte as a separate
// member of the structure.
//
// For translations where length needs 8 bits, this saves about 1.5
// bytes per string on average compared to a structure of {uint16_t,
// flexible array}, but is also future-proofed against strings with
// UTF-8 length above 256, with a savings of about 1.375 bytes per
// string.
typedef struct {
uint16_t length;
const uint8_t data[];
uint8_t data;
const uint8_t tail[];
} compressed_string_t;
// Return the compressed, translated version of a source string
// Usually, due to LTO, this is optimized into a load of a constant
// pointer.
const compressed_string_t* translate(const char* c);
void serial_write_compressed(const compressed_string_t* compressed);
char* decompress(const compressed_string_t* compressed, char* decompressed);
uint16_t decompress_length(const compressed_string_t* compressed);
#endif // MICROPY_INCLUDED_SUPERVISOR_TRANSLATE_H