Merge pull request #3370 from jepler/compression-bigrams
add bigram compression to makeqstrdata (save ~100 bytes on trinket m0 de_DE)
Commit 1ba28b3edc
@@ -44,15 +44,15 @@ msgstr ""

 #: py/obj.c
 msgid " File \"%q\""
 msgstr " Soubor \"%q\""

 #: py/obj.c
 msgid " File \"%q\", line %d"
 msgstr " Soubor \"%q\", řádek %d"

 #: main.c
 msgid " output:\n"
 msgstr " výstup:\n"

 #: py/objstr.c
 #, c-format
@@ -1971,7 +1971,7 @@ msgstr "wartość kalibracji poza zakresem +/-127"

 #: py/emitinlinethumb.c
 msgid "can only have up to 4 parameters to Thumb assembly"
 msgstr "asembler Thumb może przyjąć do 4 parameterów"

 #: py/emitinlinextensa.c
 msgid "can only have up to 4 parameters to Xtensa assembly"
@@ -3562,7 +3562,7 @@ msgstr ""
 #~ msgstr "Nie udało się odkryć serwisów"

 #~ msgid "Failed to get local address"
 #~ msgstr "Nie udało się uzyskać lokalnego adresu"

 #~ msgid "Failed to get softdevice state"
 #~ msgstr "Nie udało się odczytać stanu softdevice"
@@ -3610,7 +3610,7 @@ msgstr ""
 #~ msgstr "Nie udało się zapisać gatts, błąd 0x%04x"

 #~ msgid "Flash erase failed"
 #~ msgstr "Nie udało się skasować flash"

 #~ msgid "Flash erase failed to start, err 0x%04x"
 #~ msgstr "Nie udało się rozpocząć kasowania flash, błąd 0x%04x"
@@ -100,9 +100,30 @@ def translate(translation_file, i18ns):
             translations.append((original, translation))
     return translations

+def frequent_ngrams(corpus, sz, n):
+    return collections.Counter(corpus[i:i+sz] for i in range(len(corpus)-sz)).most_common(n)
+
+def encode_ngrams(translation, ngrams):
+    if len(ngrams) > 32:
+        start = 0xe000
+    else:
+        start = 0x80
+    for i, g in enumerate(ngrams):
+        translation = translation.replace(g, chr(start + i))
+    return translation
+
+def decode_ngrams(compressed, ngrams):
+    if len(ngrams) > 32:
+        start, end = 0xe000, 0xf8ff
+    else:
+        start, end = 0x80, 0x9f
+    return "".join(ngrams[ord(c) - start] if (start <= ord(c) <= end) else c for c in compressed)
+
 def compute_huffman_coding(translations, qstrs, compression_filename):
     all_strings = [x[1] for x in translations]
     all_strings_concat = "".join(all_strings)
+    ngrams = [i[0] for i in frequent_ngrams(all_strings_concat, 2, 32)]
+    all_strings_concat = encode_ngrams(all_strings_concat, ngrams)
     counts = collections.Counter(all_strings_concat)
     cb = huffman.codebook(counts.items())
     values = []
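A quick sanity check of the new helpers (a standalone sketch: the sample corpus is made up, and the three functions are copied from the hunk above). encode_ngrams maps each of the most frequent bigrams to a single placeholder code point, 0x80–0x9f for up to 32 bigrams or the private-use area starting at 0xe000 otherwise, and decode_ngrams reverses the substitution:

    import collections

    def frequent_ngrams(corpus, sz, n):
        # The n most common length-sz substrings of corpus, as (substring, count) pairs.
        return collections.Counter(corpus[i:i+sz] for i in range(len(corpus)-sz)).most_common(n)

    def encode_ngrams(translation, ngrams):
        if len(ngrams) > 32:
            start = 0xe000
        else:
            start = 0x80
        for i, g in enumerate(ngrams):
            translation = translation.replace(g, chr(start + i))
        return translation

    def decode_ngrams(compressed, ngrams):
        if len(ngrams) > 32:
            start, end = 0xe000, 0xf8ff
        else:
            start, end = 0x80, 0x9f
        return "".join(ngrams[ord(c) - start] if (start <= ord(c) <= end) else c for c in compressed)

    corpus = "Soubor neexistuje. Soubor nelze otevřít."      # made-up sample text
    ngrams = [g for g, _ in frequent_ngrams(corpus, 2, 32)]
    packed = encode_ngrams(corpus, ngrams)
    assert decode_ngrams(packed, ngrams) == corpus            # substitution is lossless
    print(len(corpus), "->", len(packed), "code points")      # packed is shorter

Because the placeholder code points never occur in the source text, the substitution is exactly reversible, which is what lets compress() and decompress() below continue to round-trip every message.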
@@ -125,10 +146,12 @@ def compute_huffman_coding(translations, qstrs, compression_filename):
         last_l = l
     lengths = bytearray()
     print("// length count", length_count)
+    print("// bigrams", ngrams)
     for i in range(1, max(length_count) + 2):
         lengths.append(length_count.get(i, 0))
     print("// values", values, "lengths", len(lengths), lengths)
-    print("// estimated total memory size", len(lengths) + 2*len(values) + sum(len(cb[u]) for u in all_strings_concat))
+    ngramdata = [ord(ni) for i in ngrams for ni in i]
+    print("// estimated total memory size", len(lengths) + 2*len(values) + 2 * len(ngramdata) + sum((len(cb[u]) + 7)//8 for u in all_strings_concat))
     print("//", values, lengths)
     values_type = "uint16_t" if max(ord(u) for u in values) > 255 else "uint8_t"
     max_translation_encoded_length = max(len(translation.encode("utf-8")) for original,translation in translations)
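For reference, ngramdata just flattens the bigram table into a plain list of code points, two per bigram; this is the array that gets written out as bigrams[] in the next hunk. The values here are hypothetical, for illustration only:

    ngrams = ["e ", " n", "ov"]                        # hypothetical bigrams
    ngramdata = [ord(ni) for i in ngrams for ni in i]  # two code points per bigram
    print(ngramdata)                                   # [101, 32, 32, 110, 111, 118]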
@@ -136,10 +159,18 @@ def compute_huffman_coding(translations, qstrs, compression_filename):
         f.write("const uint8_t lengths[] = {{ {} }};\n".format(", ".join(map(str, lengths))))
         f.write("const {} values[] = {{ {} }};\n".format(values_type, ", ".join(str(ord(u)) for u in values)))
         f.write("#define compress_max_length_bits ({})\n".format(max_translation_encoded_length.bit_length()))
-    return values, lengths
+        f.write("const {} bigrams[] = {{ {} }};\n".format(values_type, ", ".join(str(u) for u in ngramdata)))
+        if len(ngrams) > 32:
+            bigram_start = 0xe000
+        else:
+            bigram_start = 0x80
+        bigram_end = bigram_start + len(ngrams) - 1 # End is inclusive
+        f.write("#define bigram_start {}\n".format(bigram_start))
+        f.write("#define bigram_end {}\n".format(bigram_end))
+    return values, lengths, ngrams

 def decompress(encoding_table, encoded, encoded_length_bits):
-    values, lengths = encoding_table
+    values, lengths, ngrams = encoding_table
     dec = []
     this_byte = 0
     this_bit = 7
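Putting the new f.write calls together, the extra lines appended to compression.generated.h would look roughly like the output below for a hypothetical two-entry bigram table (the values are invented for illustration; the real table is computed from the translation corpus):

    ngrams = ["e ", " n"]                               # hypothetical bigrams
    ngramdata = [ord(ni) for i in ngrams for ni in i]   # [101, 32, 32, 110]
    values_type = "uint8_t"                             # assuming no values above 255
    print("const {} bigrams[] = {{ {} }};".format(values_type, ", ".join(str(u) for u in ngramdata)))
    print("#define bigram_start {}".format(0x80))
    print("#define bigram_end {}".format(0x80 + len(ngrams) - 1))
    # Output:
    #   const uint8_t bigrams[] = { 101, 32, 32, 110 };
    #   #define bigram_start 128
    #   #define bigram_end 129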
@@ -187,6 +218,7 @@ def decompress(encoding_table, encoded, encoded_length_bits):
             searched_length += lengths[bit_length]

         v = values[searched_length + bits - max_code]
+        v = decode_ngrams(v, ngrams)
         i += len(v.encode('utf-8'))
         dec.append(v)
     return ''.join(dec)
@@ -194,7 +226,8 @@ def decompress(encoding_table, encoded, encoded_length_bits):
 def compress(encoding_table, decompressed, encoded_length_bits, len_translation_encoded):
     if not isinstance(decompressed, str):
         raise TypeError()
-    values, lengths = encoding_table
+    values, lengths, ngrams = encoding_table
+    decompressed = encode_ngrams(decompressed, ngrams)
     enc = bytearray(len(decompressed) * 3)
     #print(decompressed)
     #print(lengths)
@@ -34,6 +34,7 @@
 #include "genhdr/compression.generated.h"
 #endif

+#include "py/misc.h"
 #include "supervisor/serial.h"

 void serial_write_compressed(const compressed_string_t* compressed) {
@@ -46,13 +47,20 @@ STATIC int put_utf8(char *buf, int u) {
     if(u <= 0x7f) {
         *buf = u;
         return 1;
+    } else if(bigram_start <= u && u <= bigram_end) {
+        int n = (u - 0x80) * 2;
+        // (note that at present, entries in the bigrams table are
+        // guaranteed not to represent bigrams themselves, so this adds
+        // at most 1 level of recursive call
+        int ret = put_utf8(buf, bigrams[n]);
+        return ret + put_utf8(buf + ret, bigrams[n+1]);
     } else if(u <= 0x07ff) {
         *buf++ = 0b11000000 | (u >> 6);
         *buf = 0b10000000 | (u & 0b00111111);
         return 2;
-    } else { // u <= 0xffff)
-        *buf++ = 0b11000000 | (u >> 12);
-        *buf = 0b10000000 | ((u >> 6) & 0b00111111);
+    } else { // u <= 0xffff
+        *buf++ = 0b11100000 | (u >> 12);
+        *buf++ = 0b10000000 | ((u >> 6) & 0b00111111);
         *buf = 0b10000000 | (u & 0b00111111);
         return 3;
     }
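The new branch expands a bigram placeholder by recursively emitting its two stored code points; the rest of the hunk also fixes the three-byte UTF-8 case (the lead byte must be 0b11100000, and the second continuation byte needs the post-increment so the third byte is not overwritten). A small Python model of the revised logic, with a made-up two-entry bigram table, illustrates both; the real table and placeholder range come from compression.generated.h:

    # Python model of the revised put_utf8(); bigram table and range are assumptions.
    bigrams = [ord(c) for c in "e  n"]        # two bigrams: "e " and " n"
    bigram_start, bigram_end = 0x80, 0x81     # inclusive placeholder range

    def put_utf8(u):
        if u <= 0x7f:
            return bytes([u])
        elif bigram_start <= u <= bigram_end:
            n = (u - 0x80) * 2
            # Bigram entries never reference other bigrams, so recursion is at most one level deep.
            return put_utf8(bigrams[n]) + put_utf8(bigrams[n + 1])
        elif u <= 0x07ff:
            return bytes([0b11000000 | (u >> 6),
                          0b10000000 | (u & 0b00111111)])
        else:  # u <= 0xffff
            return bytes([0b11100000 | (u >> 12),
                          0b10000000 | ((u >> 6) & 0b00111111),
                          0b10000000 | (u & 0b00111111)])

    assert put_utf8(0x81) == b" n"                      # placeholder expands to its bigram
    assert put_utf8(0x20AC) == "€".encode("utf-8")      # three-byte case: 0xE0 lead byte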