translations: Make decompression clearer

The bigram range is now emitted as the `bigram_start` and `bigram_end` defines, which get filled in with values such as 128 (0x80) and 159 (0x9f).
This commit is contained in:
Jeff Epler 2020-09-08 19:07:53 -05:00
parent cbfd38d1ce
commit bdb07adfcc
2 changed files with 15 additions and 9 deletions

View File

@ -159,7 +159,14 @@ def compute_huffman_coding(translations, qstrs, compression_filename):
f.write("const uint8_t lengths[] = {{ {} }};\n".format(", ".join(map(str, lengths))))
f.write("const {} values[] = {{ {} }};\n".format(values_type, ", ".join(str(ord(u)) for u in values)))
f.write("#define compress_max_length_bits ({})\n".format(max_translation_encoded_length.bit_length()))
f.write("const {} ngrams[] = {{ {} }};\n".format(values_type, ", ".join(str(u) for u in ngramdata)))
f.write("const {} bigrams[] = {{ {} }};\n".format(values_type, ", ".join(str(u) for u in ngramdata)))
if len(ngrams) > 32:
bigram_start = 0xe000
else:
bigram_start = 0x80
bigram_end = bigram_start + len(ngrams) - 1 # End is inclusive
f.write("#define bigram_start {}\n".format(bigram_start))
f.write("#define bigram_end {}\n".format(bigram_end))
return values, lengths, ngrams
def decompress(encoding_table, encoded, encoded_length_bits):

View File

@ -47,19 +47,18 @@ STATIC int put_utf8(char *buf, int u) {
if(u <= 0x7f) {
*buf = u;
return 1;
} else if(MP_ARRAY_SIZE(ngrams) <= 64 && u <= 0xbf) {
} else if(bigram_start <= u && u <= bigram_end) {
int n = (u - 0x80) * 2;
int ret = put_utf8(buf, ngrams[n]);
return ret + put_utf8(buf + ret, ngrams[n+1]);
// (Note that at present, entries in the bigrams table are
// guaranteed not to represent bigrams themselves, so this adds
// at most 1 level of recursive call.)
int ret = put_utf8(buf, bigrams[n]);
return ret + put_utf8(buf + ret, bigrams[n+1]);
} else if(u <= 0x07ff) {
*buf++ = 0b11000000 | (u >> 6);
*buf = 0b10000000 | (u & 0b00111111);
return 2;
} else if(MP_ARRAY_SIZE(ngrams) > 64 && u >= 0xe000 && u <= 0xf8ff) {
int n = (u - 0xe000) * 2;
int ret = put_utf8(buf, ngrams[n]);
return ret + put_utf8(buf + ret, ngrams[n+1]);
} else { // u <= 0xffff)
} else { // u <= 0xffff
*buf++ = 0b11000000 | (u >> 12);
*buf = 0b10000000 | ((u >> 6) & 0b00111111);
*buf = 0b10000000 | (u & 0b00111111);