diff --git a/py/makeqstrdata.py b/py/makeqstrdata.py index 40c50b7b43..721fa83206 100644 --- a/py/makeqstrdata.py +++ b/py/makeqstrdata.py @@ -159,7 +159,14 @@ def compute_huffman_coding(translations, qstrs, compression_filename): f.write("const uint8_t lengths[] = {{ {} }};\n".format(", ".join(map(str, lengths)))) f.write("const {} values[] = {{ {} }};\n".format(values_type, ", ".join(str(ord(u)) for u in values))) f.write("#define compress_max_length_bits ({})\n".format(max_translation_encoded_length.bit_length())) - f.write("const {} ngrams[] = {{ {} }};\n".format(values_type, ", ".join(str(u) for u in ngramdata))) + f.write("const {} bigrams[] = {{ {} }};\n".format(values_type, ", ".join(str(u) for u in ngramdata))) + if len(ngrams) > 32: + bigram_start = 0xe000 + else: + bigram_start = 0x80 + bigram_end = bigram_start + len(ngrams) - 1 # End is inclusive + f.write("#define bigram_start {}\n".format(bigram_start)) + f.write("#define bigram_end {}\n".format(bigram_end)) return values, lengths, ngrams def decompress(encoding_table, encoded, encoded_length_bits): diff --git a/supervisor/shared/translate.c b/supervisor/shared/translate.c index 49ee8f143f..0235293bee 100644 --- a/supervisor/shared/translate.c +++ b/supervisor/shared/translate.c @@ -47,19 +47,18 @@ STATIC int put_utf8(char *buf, int u) { if(u <= 0x7f) { *buf = u; return 1; - } else if(MP_ARRAY_SIZE(ngrams) <= 64 && u <= 0xbf) { + } else if(bigram_start <= u && u <= bigram_end) { int n = (u - 0x80) * 2; - int ret = put_utf8(buf, ngrams[n]); - return ret + put_utf8(buf + ret, ngrams[n+1]); + // (note that at present, entries in the bigrams table are + // guaranteed not to represent bigrams themselves, so this adds + // at most 1 level of recursive call + int ret = put_utf8(buf, bigrams[n]); + return ret + put_utf8(buf + ret, bigrams[n+1]); } else if(u <= 0x07ff) { *buf++ = 0b11000000 | (u >> 6); *buf = 0b10000000 | (u & 0b00111111); return 2; - } else if(MP_ARRAY_SIZE(ngrams) > 64 && u >= 0xe000 && u <= 0xf8ff) { - int n = (u - 0xe000) * 2; - int ret = put_utf8(buf, ngrams[n]); - return ret + put_utf8(buf + ret, ngrams[n+1]); - } else { // u <= 0xffff) + } else { // u <= 0xffff *buf++ = 0b11000000 | (u >> 12); *buf = 0b10000000 | ((u >> 6) & 0b00111111); *buf = 0b10000000 | (u & 0b00111111);