diff --git a/locale/cs.po b/locale/cs.po index cd81c258fe..3e22ac2514 100644 --- a/locale/cs.po +++ b/locale/cs.po @@ -44,15 +44,15 @@ msgstr "" #: py/obj.c msgid " File \"%q\"" -msgstr "  Soubor \"%q\"" +msgstr " Soubor \"%q\"" #: py/obj.c msgid " File \"%q\", line %d" -msgstr "  Soubor \"%q\", řádek %d" +msgstr " Soubor \"%q\", řádek %d" #: main.c msgid " output:\n" -msgstr " výstup:\n" +msgstr " výstup:\n" #: py/objstr.c #, c-format diff --git a/locale/pl.po b/locale/pl.po index b072629cd1..cd4b905e73 100644 --- a/locale/pl.po +++ b/locale/pl.po @@ -1971,7 +1971,7 @@ msgstr "wartość kalibracji poza zakresem +/-127" #: py/emitinlinethumb.c msgid "can only have up to 4 parameters to Thumb assembly" -msgstr "asembler Thumb może przyjąć do 4 parameterów" +msgstr "asembler Thumb może przyjąć do 4 parameterów" #: py/emitinlinextensa.c msgid "can only have up to 4 parameters to Xtensa assembly" @@ -3562,7 +3562,7 @@ msgstr "" #~ msgstr "Nie udało się odkryć serwisów" #~ msgid "Failed to get local address" -#~ msgstr "Nie udało się uzyskać lokalnego adresu" +#~ msgstr "Nie udało się uzyskać lokalnego adresu" #~ msgid "Failed to get softdevice state" #~ msgstr "Nie udało się odczytać stanu softdevice" @@ -3610,7 +3610,7 @@ msgstr "" #~ msgstr "Nie udało się zapisać gatts, błąd 0x%04x" #~ msgid "Flash erase failed" -#~ msgstr "Nie udało się skasować flash" +#~ msgstr "Nie udało się skasować flash" #~ msgid "Flash erase failed to start, err 0x%04x" #~ msgstr "Nie udało się rozpocząć kasowania flash, błąd 0x%04x" diff --git a/py/makeqstrdata.py b/py/makeqstrdata.py index 04c8938766..721fa83206 100644 --- a/py/makeqstrdata.py +++ b/py/makeqstrdata.py @@ -100,9 +100,30 @@ def translate(translation_file, i18ns): translations.append((original, translation)) return translations +def frequent_ngrams(corpus, sz, n): + return collections.Counter(corpus[i:i+sz] for i in range(len(corpus)-sz)).most_common(n) + +def encode_ngrams(translation, ngrams): + if len(ngrams) > 32: + start = 0xe000 + else: + start = 0x80 + for i, g in enumerate(ngrams): + translation = translation.replace(g, chr(start + i)) + return translation + +def decode_ngrams(compressed, ngrams): + if len(ngrams) > 32: + start, end = 0xe000, 0xf8ff + else: + start, end = 0x80, 0x9f + return "".join(ngrams[ord(c) - start] if (start <= ord(c) <= end) else c for c in compressed) + def compute_huffman_coding(translations, qstrs, compression_filename): all_strings = [x[1] for x in translations] all_strings_concat = "".join(all_strings) + ngrams = [i[0] for i in frequent_ngrams(all_strings_concat, 2, 32)] + all_strings_concat = encode_ngrams(all_strings_concat, ngrams) counts = collections.Counter(all_strings_concat) cb = huffman.codebook(counts.items()) values = [] @@ -125,10 +146,12 @@ def compute_huffman_coding(translations, qstrs, compression_filename): last_l = l lengths = bytearray() print("// length count", length_count) + print("// bigrams", ngrams) for i in range(1, max(length_count) + 2): lengths.append(length_count.get(i, 0)) print("// values", values, "lengths", len(lengths), lengths) - print("// estimated total memory size", len(lengths) + 2*len(values) + sum(len(cb[u]) for u in all_strings_concat)) + ngramdata = [ord(ni) for i in ngrams for ni in i] + print("// estimated total memory size", len(lengths) + 2*len(values) + 2 * len(ngramdata) + sum((len(cb[u]) + 7)//8 for u in all_strings_concat)) print("//", values, lengths) values_type = "uint16_t" if max(ord(u) for u in values) > 255 else "uint8_t" max_translation_encoded_length = max(len(translation.encode("utf-8")) for original,translation in translations) @@ -136,10 +159,18 @@ def compute_huffman_coding(translations, qstrs, compression_filename): f.write("const uint8_t lengths[] = {{ {} }};\n".format(", ".join(map(str, lengths)))) f.write("const {} values[] = {{ {} }};\n".format(values_type, ", ".join(str(ord(u)) for u in values))) f.write("#define compress_max_length_bits ({})\n".format(max_translation_encoded_length.bit_length())) - return values, lengths + f.write("const {} bigrams[] = {{ {} }};\n".format(values_type, ", ".join(str(u) for u in ngramdata))) + if len(ngrams) > 32: + bigram_start = 0xe000 + else: + bigram_start = 0x80 + bigram_end = bigram_start + len(ngrams) - 1 # End is inclusive + f.write("#define bigram_start {}\n".format(bigram_start)) + f.write("#define bigram_end {}\n".format(bigram_end)) + return values, lengths, ngrams def decompress(encoding_table, encoded, encoded_length_bits): - values, lengths = encoding_table + values, lengths, ngrams = encoding_table dec = [] this_byte = 0 this_bit = 7 @@ -187,6 +218,7 @@ def decompress(encoding_table, encoded, encoded_length_bits): searched_length += lengths[bit_length] v = values[searched_length + bits - max_code] + v = decode_ngrams(v, ngrams) i += len(v.encode('utf-8')) dec.append(v) return ''.join(dec) @@ -194,7 +226,8 @@ def decompress(encoding_table, encoded, encoded_length_bits): def compress(encoding_table, decompressed, encoded_length_bits, len_translation_encoded): if not isinstance(decompressed, str): raise TypeError() - values, lengths = encoding_table + values, lengths, ngrams = encoding_table + decompressed = encode_ngrams(decompressed, ngrams) enc = bytearray(len(decompressed) * 3) #print(decompressed) #print(lengths) diff --git a/supervisor/shared/translate.c b/supervisor/shared/translate.c index 606f8fa91a..5cd7b8dd88 100644 --- a/supervisor/shared/translate.c +++ b/supervisor/shared/translate.c @@ -34,6 +34,7 @@ #include "genhdr/compression.generated.h" #endif +#include "py/misc.h" #include "supervisor/serial.h" void serial_write_compressed(const compressed_string_t* compressed) { @@ -46,13 +47,20 @@ STATIC int put_utf8(char *buf, int u) { if(u <= 0x7f) { *buf = u; return 1; + } else if(bigram_start <= u && u <= bigram_end) { + int n = (u - 0x80) * 2; + // (note that at present, entries in the bigrams table are + // guaranteed not to represent bigrams themselves, so this adds + // at most 1 level of recursive call + int ret = put_utf8(buf, bigrams[n]); + return ret + put_utf8(buf + ret, bigrams[n+1]); } else if(u <= 0x07ff) { *buf++ = 0b11000000 | (u >> 6); *buf = 0b10000000 | (u & 0b00111111); return 2; - } else { // u <= 0xffff) - *buf++ = 0b11000000 | (u >> 12); - *buf = 0b10000000 | ((u >> 6) & 0b00111111); + } else { // u <= 0xffff + *buf++ = 0b11100000 | (u >> 12); + *buf++ = 0b10000000 | ((u >> 6) & 0b00111111); *buf = 0b10000000 | (u & 0b00111111); return 3; }