Merge pull request #2968 from jepler/more-efficient-translation
More efficient translation
This commit is contained in:
commit
6ace4ee7e8
2
main.c
2
main.c
|
@ -185,7 +185,7 @@ bool maybe_run_list(const char ** filenames, pyexec_result_t* exec_result) {
|
|||
}
|
||||
mp_hal_stdout_tx_str(filename);
|
||||
const compressed_string_t* compressed = translate(" output:\n");
|
||||
char decompressed[compressed->length];
|
||||
char decompressed[decompress_length(compressed)];
|
||||
decompress(compressed, decompressed);
|
||||
mp_hal_stdout_tx_str(decompressed);
|
||||
pyexec_file(filename, exec_result);
|
||||
|
|
|
@ -135,7 +135,7 @@ STATIC void mp_help_print_modules(void) {
|
|||
|
||||
// let the user know there may be other modules available from the filesystem
|
||||
const compressed_string_t* compressed = translate("Plus any modules on the filesystem\n");
|
||||
char decompressed[compressed->length];
|
||||
char decompressed[decompress_length(compressed)];
|
||||
decompress(compressed, decompressed);
|
||||
mp_print_str(MP_PYTHON_PRINTER, decompressed);
|
||||
}
|
||||
|
@ -181,7 +181,7 @@ STATIC mp_obj_t mp_builtin_help(size_t n_args, const mp_obj_t *args) {
|
|||
// print a general help message. Translate only works on single strings on one line.
|
||||
const compressed_string_t* compressed =
|
||||
translate("Welcome to Adafruit CircuitPython %s!\n\nPlease visit learn.adafruit.com/category/circuitpython for project guides.\n\nTo list built-in modules please do `help(\"modules\")`.\n");
|
||||
char decompressed[compressed->length];
|
||||
char decompressed[decompress_length(compressed)];
|
||||
decompress(compressed, decompressed);
|
||||
mp_printf(MP_PYTHON_PRINTER, decompressed, MICROPY_GIT_TAG);
|
||||
} else {
|
||||
|
|
|
@ -1,7 +1,10 @@
|
|||
"""
|
||||
Process raw qstr file and output qstr data with length, hash and data bytes.
|
||||
|
||||
This script works with Python 2.6, 2.7, 3.3 and 3.4.
|
||||
This script works with Python 2.7, 3.3 and 3.4.
|
||||
|
||||
For documentation about the format of compressed translated strings, see
|
||||
supervisor/shared/translate.h
|
||||
"""
|
||||
|
||||
from __future__ import print_function
|
||||
|
@ -132,19 +135,37 @@ def compute_huffman_coding(translations, qstrs, compression_filename):
|
|||
print("// estimated total memory size", len(lengths) + 2*len(values) + sum(len(cb[u]) for u in all_strings_concat))
|
||||
print("//", values, lengths)
|
||||
values_type = "uint16_t" if max(ord(u) for u in values) > 255 else "uint8_t"
|
||||
max_translation_encoded_length = max(len(translation.encode("utf-8")) for original,translation in translations)
|
||||
with open(compression_filename, "w") as f:
|
||||
f.write("const uint8_t lengths[] = {{ {} }};\n".format(", ".join(map(str, lengths))))
|
||||
f.write("const {} values[] = {{ {} }};\n".format(values_type, ", ".join(str(ord(u)) for u in values)))
|
||||
f.write("#define compress_max_length_bits ({})\n".format(max_translation_encoded_length.bit_length()))
|
||||
return values, lengths
|
||||
|
||||
def decompress(encoding_table, length, encoded):
|
||||
def decompress(encoding_table, encoded, encoded_length_bits):
|
||||
values, lengths = encoding_table
|
||||
#print(l, encoded)
|
||||
dec = []
|
||||
this_byte = 0
|
||||
this_bit = 7
|
||||
b = encoded[this_byte]
|
||||
for i in range(length):
|
||||
bits = 0
|
||||
for i in range(encoded_length_bits):
|
||||
bits <<= 1
|
||||
if 0x80 & b:
|
||||
bits |= 1
|
||||
|
||||
b <<= 1
|
||||
if this_bit == 0:
|
||||
this_bit = 7
|
||||
this_byte += 1
|
||||
if this_byte < len(encoded):
|
||||
b = encoded[this_byte]
|
||||
else:
|
||||
this_bit -= 1
|
||||
length = bits
|
||||
|
||||
i = 0
|
||||
while i < length:
|
||||
bits = 0
|
||||
bit_length = 0
|
||||
max_code = lengths[0]
|
||||
|
@ -170,10 +191,11 @@ def decompress(encoding_table, length, encoded):
|
|||
searched_length += lengths[bit_length]
|
||||
|
||||
v = values[searched_length + bits - max_code]
|
||||
i += len(v.encode('utf-8'))
|
||||
dec.append(v)
|
||||
return ''.join(dec)
|
||||
|
||||
def compress(encoding_table, decompressed):
|
||||
def compress(encoding_table, decompressed, encoded_length_bits, len_translation_encoded):
|
||||
if not isinstance(decompressed, str):
|
||||
raise TypeError()
|
||||
values, lengths = encoding_table
|
||||
|
@ -182,6 +204,19 @@ def compress(encoding_table, decompressed):
|
|||
#print(lengths)
|
||||
current_bit = 7
|
||||
current_byte = 0
|
||||
|
||||
code = len_translation_encoded
|
||||
bits = encoded_length_bits+1
|
||||
for i in range(bits - 1, 0, -1):
|
||||
if len_translation_encoded & (1 << (i - 1)):
|
||||
enc[current_byte] |= 1 << current_bit
|
||||
if current_bit == 0:
|
||||
current_bit = 7
|
||||
#print("packed {0:0{width}b}".format(enc[current_byte], width=8))
|
||||
current_byte += 1
|
||||
else:
|
||||
current_bit -= 1
|
||||
|
||||
for c in decompressed:
|
||||
#print()
|
||||
#print("char", c, values.index(c))
|
||||
|
@ -342,14 +377,17 @@ def print_qstr_data(encoding_table, qcfgs, qstrs, i18ns):
|
|||
|
||||
total_text_size = 0
|
||||
total_text_compressed_size = 0
|
||||
max_translation_encoded_length = max(len(translation.encode("utf-8")) for original, translation in i18ns)
|
||||
encoded_length_bits = max_translation_encoded_length.bit_length()
|
||||
for original, translation in i18ns:
|
||||
translation_encoded = translation.encode("utf-8")
|
||||
compressed = compress(encoding_table, translation)
|
||||
compressed = compress(encoding_table, translation, encoded_length_bits, len(translation_encoded))
|
||||
total_text_compressed_size += len(compressed)
|
||||
decompressed = decompress(encoding_table, len(translation_encoded), compressed)
|
||||
decompressed = decompress(encoding_table, compressed, encoded_length_bits)
|
||||
assert decompressed == translation
|
||||
for c in C_ESCAPES:
|
||||
decompressed = decompressed.replace(c, C_ESCAPES[c])
|
||||
print("TRANSLATION(\"{}\", {}, {{ {} }}) // {}".format(original, len(translation_encoded)+1, ", ".join(["0x{:02x}".format(x) for x in compressed]), decompressed))
|
||||
print("TRANSLATION(\"{}\", {}) // {}".format(original, ", ".join(["{:d}".format(x) for x in compressed]), decompressed))
|
||||
total_text_size += len(translation.encode("utf-8"))
|
||||
|
||||
print()
|
||||
|
@ -385,6 +423,7 @@ if __name__ == "__main__":
|
|||
|
||||
qcfgs, qstrs, i18ns = parse_input_headers(args.infiles)
|
||||
if args.translation:
|
||||
i18ns = sorted(i18ns)
|
||||
translations = translate(args.translation, i18ns)
|
||||
encoding_table = compute_huffman_coding(translations, qstrs, args.compression_filename)
|
||||
print_qstr_data(encoding_table, qcfgs, qstrs, translations)
|
||||
|
|
|
@ -158,7 +158,7 @@ const char *mp_common_errno_to_str(mp_obj_t errno_val, char *buf, size_t len) {
|
|||
case ENOSPC: desc = translate("No space left on device"); break;
|
||||
case EROFS: desc = translate("Read-only filesystem"); break;
|
||||
}
|
||||
if (desc != NULL && desc->length <= len) {
|
||||
if (desc != NULL && decompress_length(desc) <= len) {
|
||||
decompress(desc, buf);
|
||||
return buf;
|
||||
}
|
||||
|
|
6
py/obj.c
6
py/obj.c
|
@ -94,17 +94,17 @@ void mp_obj_print_exception(const mp_print_t *print, mp_obj_t exc) {
|
|||
assert(n % 3 == 0);
|
||||
// Decompress the format strings
|
||||
const compressed_string_t* traceback = translate("Traceback (most recent call last):\n");
|
||||
char decompressed[traceback->length];
|
||||
char decompressed[decompress_length(traceback)];
|
||||
decompress(traceback, decompressed);
|
||||
#if MICROPY_ENABLE_SOURCE_LINE
|
||||
const compressed_string_t* frame = translate(" File \"%q\", line %d");
|
||||
#else
|
||||
const compressed_string_t* frame = translate(" File \"%q\"");
|
||||
#endif
|
||||
char decompressed_frame[frame->length];
|
||||
char decompressed_frame[decompress_length(frame)];
|
||||
decompress(frame, decompressed_frame);
|
||||
const compressed_string_t* block_fmt = translate(", in %q\n");
|
||||
char decompressed_block[block_fmt->length];
|
||||
char decompressed_block[decompress_length(block_fmt)];
|
||||
decompress(block_fmt, decompressed_block);
|
||||
|
||||
// Print the traceback
|
||||
|
|
|
@ -400,7 +400,7 @@ mp_obj_t mp_obj_new_exception_msg_vlist(const mp_obj_type_t *exc_type, const com
|
|||
|
||||
// Try to allocate memory for the message
|
||||
mp_obj_str_t *o_str = m_new_obj_maybe(mp_obj_str_t);
|
||||
size_t o_str_alloc = fmt->length + 1;
|
||||
size_t o_str_alloc = decompress_length(fmt);
|
||||
byte *o_str_buf = m_new_maybe(byte, o_str_alloc);
|
||||
|
||||
bool used_emg_buf = false;
|
||||
|
@ -433,7 +433,7 @@ mp_obj_t mp_obj_new_exception_msg_vlist(const mp_obj_type_t *exc_type, const com
|
|||
// We have some memory to format the string
|
||||
struct _exc_printer_t exc_pr = {!used_emg_buf, o_str_alloc, 0, o_str_buf};
|
||||
mp_print_t print = {&exc_pr, exc_add_strn};
|
||||
char fmt_decompressed[fmt->length];
|
||||
char fmt_decompressed[decompress_length(fmt)];
|
||||
decompress(fmt, fmt_decompressed);
|
||||
mp_vprintf(&print, fmt_decompressed, ap);
|
||||
exc_pr.buf[exc_pr.len] = '\0';
|
||||
|
|
|
@ -37,7 +37,7 @@
|
|||
#include "supervisor/serial.h"
|
||||
|
||||
void serial_write_compressed(const compressed_string_t* compressed) {
|
||||
char decompressed[compressed->length];
|
||||
char decompressed[decompress_length(compressed)];
|
||||
decompress(compressed, decompressed);
|
||||
serial_write(decompressed);
|
||||
}
|
||||
|
@ -58,12 +58,22 @@ STATIC int put_utf8(char *buf, int u) {
|
|||
}
|
||||
}
|
||||
|
||||
uint16_t decompress_length(const compressed_string_t* compressed) {
|
||||
if (compress_max_length_bits <= 8) {
|
||||
return 1 + (compressed->data >> (8 - compress_max_length_bits));
|
||||
} else {
|
||||
return 1 + ((compressed->data * 256 + compressed->tail[0]) >> (16 - compress_max_length_bits));
|
||||
}
|
||||
}
|
||||
|
||||
char* decompress(const compressed_string_t* compressed, char* decompressed) {
|
||||
uint8_t this_byte = 0;
|
||||
uint8_t this_bit = 7;
|
||||
uint8_t b = compressed->data[this_byte];
|
||||
uint8_t this_byte = compress_max_length_bits / 8;
|
||||
uint8_t this_bit = 7 - compress_max_length_bits % 8;
|
||||
uint8_t b = (&compressed->data)[this_byte];
|
||||
uint16_t length = decompress_length(compressed);
|
||||
|
||||
// Stop one early because the last byte is always NULL.
|
||||
for (uint16_t i = 0; i < compressed->length - 1;) {
|
||||
for (uint16_t i = 0; i < length - 1;) {
|
||||
uint32_t bits = 0;
|
||||
uint8_t bit_length = 0;
|
||||
uint32_t max_code = lengths[0];
|
||||
|
@ -78,7 +88,7 @@ char* decompress(const compressed_string_t* compressed, char* decompressed) {
|
|||
if (this_bit == 0) {
|
||||
this_bit = 7;
|
||||
this_byte += 1;
|
||||
b = compressed->data[this_byte]; // This may read past the end but its never used.
|
||||
b = (&compressed->data)[this_byte]; // This may read past the end but its never used.
|
||||
} else {
|
||||
this_bit -= 1;
|
||||
}
|
||||
|
@ -91,14 +101,14 @@ char* decompress(const compressed_string_t* compressed, char* decompressed) {
|
|||
i += put_utf8(decompressed + i, values[searched_length + bits - max_code]);
|
||||
}
|
||||
|
||||
decompressed[compressed->length-1] = '\0';
|
||||
decompressed[length-1] = '\0';
|
||||
return decompressed;
|
||||
}
|
||||
|
||||
inline __attribute__((always_inline)) const compressed_string_t* translate(const char* original) {
|
||||
#ifndef NO_QSTR
|
||||
#define QDEF(id, str)
|
||||
#define TRANSLATION(id, len, compressed...) if (strcmp(original, id) == 0) { static const compressed_string_t v = {.length = len, .data = compressed}; return &v; } else
|
||||
#define TRANSLATION(id, firstbyte, ...) if (strcmp(original, id) == 0) { static const compressed_string_t v = { .data = firstbyte, .tail = { __VA_ARGS__ } }; return &v; } else
|
||||
#include "genhdr/qstrdefs.generated.h"
|
||||
#undef TRANSLATION
|
||||
#undef QDEF
|
||||
|
|
|
@ -29,13 +29,41 @@
|
|||
|
||||
#include <stdint.h>
|
||||
|
||||
// The format of the compressed data is:
|
||||
// - the size of the uncompressed string in UTF-8 bytes, encoded as a
|
||||
// (compress_max_length_bits)-bit number. compress_max_length_bits is
|
||||
// computed during dictionary generation time, and happens to be 8
|
||||
// for all current platforms. However, it'll probably end up being
|
||||
// 9 in some translations sometime in the future. This length excludes
|
||||
// the trailing NUL, though notably decompress_length includes it.
|
||||
//
|
||||
// - followed by the huffman encoding of the individual UTF-16 code
|
||||
// points that make up the string. The trailing "\0" is not
|
||||
// represented by a huffman code, but is implied by the length.
|
||||
// (building the huffman encoding on UTF-16 code points gave better
|
||||
// compression than building it on UTF-8 bytes)
|
||||
//
|
||||
// The "data" / "tail" construct is so that the struct's last member is a
|
||||
// "flexible array". However, the _only_ member is not permitted to be
|
||||
// a flexible member, so we have to declare the first byte as a separate
|
||||
// member of the structure.
|
||||
//
|
||||
// For translations where length needs 8 bits, this saves about 1.5
|
||||
// bytes per string on average compared to a structure of {uint16_t,
|
||||
// flexible array}, but is also future-proofed against strings with
|
||||
// UTF-8 length above 256, with a savings of about 1.375 bytes per
|
||||
// string.
|
||||
typedef struct {
|
||||
uint16_t length;
|
||||
const uint8_t data[];
|
||||
uint8_t data;
|
||||
const uint8_t tail[];
|
||||
} compressed_string_t;
|
||||
|
||||
// Return the compressed, translated version of a source string
|
||||
// Usually, due to LTO, this is optimized into a load of a constant
|
||||
// pointer.
|
||||
const compressed_string_t* translate(const char* c);
|
||||
void serial_write_compressed(const compressed_string_t* compressed);
|
||||
char* decompress(const compressed_string_t* compressed, char* decompressed);
|
||||
uint16_t decompress_length(const compressed_string_t* compressed);
|
||||
|
||||
#endif // MICROPY_INCLUDED_SUPERVISOR_TRANSLATE_H
|
||||
|
|
Loading…
Reference in New Issue