translations: Make decompression clearer
The bigram range check in put_utf8 now uses the generated bigram_start and bigram_end macros, which get filled in with values such as 128 (0x80) and 159 (0x9f).
This commit is contained in:
parent
cbfd38d1ce
commit
bdb07adfcc
@ -159,7 +159,14 @@ def compute_huffman_coding(translations, qstrs, compression_filename):
|
||||
f.write("const uint8_t lengths[] = {{ {} }};\n".format(", ".join(map(str, lengths))))
|
||||
f.write("const {} values[] = {{ {} }};\n".format(values_type, ", ".join(str(ord(u)) for u in values)))
|
||||
f.write("#define compress_max_length_bits ({})\n".format(max_translation_encoded_length.bit_length()))
|
||||
f.write("const {} ngrams[] = {{ {} }};\n".format(values_type, ", ".join(str(u) for u in ngramdata)))
|
||||
f.write("const {} bigrams[] = {{ {} }};\n".format(values_type, ", ".join(str(u) for u in ngramdata)))
|
||||
if len(ngrams) > 32:
|
||||
bigram_start = 0xe000
|
||||
else:
|
||||
bigram_start = 0x80
|
||||
bigram_end = bigram_start + len(ngrams) - 1 # End is inclusive
|
||||
f.write("#define bigram_start {}\n".format(bigram_start))
|
||||
f.write("#define bigram_end {}\n".format(bigram_end))
|
||||
return values, lengths, ngrams
|
||||
|
||||
def decompress(encoding_table, encoded, encoded_length_bits):
|
||||
|
@ -47,19 +47,18 @@ STATIC int put_utf8(char *buf, int u) {
|
||||
if(u <= 0x7f) {
|
||||
*buf = u;
|
||||
return 1;
|
||||
} else if(MP_ARRAY_SIZE(ngrams) <= 64 && u <= 0xbf) {
|
||||
} else if(bigram_start <= u && u <= bigram_end) {
|
||||
int n = (u - 0x80) * 2;
|
||||
int ret = put_utf8(buf, ngrams[n]);
|
||||
return ret + put_utf8(buf + ret, ngrams[n+1]);
|
||||
// (note that at present, entries in the bigrams table are
|
||||
// guaranteed not to represent bigrams themselves, so this adds
|
||||
// at most 1 level of recursive call)
|
||||
int ret = put_utf8(buf, bigrams[n]);
|
||||
return ret + put_utf8(buf + ret, bigrams[n+1]);
|
||||
} else if(u <= 0x07ff) {
|
||||
*buf++ = 0b11000000 | (u >> 6);
|
||||
*buf = 0b10000000 | (u & 0b00111111);
|
||||
return 2;
|
||||
} else if(MP_ARRAY_SIZE(ngrams) > 64 && u >= 0xe000 && u <= 0xf8ff) {
|
||||
int n = (u - 0xe000) * 2;
|
||||
int ret = put_utf8(buf, ngrams[n]);
|
||||
return ret + put_utf8(buf + ret, ngrams[n+1]);
|
||||
} else { // u <= 0xffff)
|
||||
} else { // u <= 0xffff
|
||||
*buf++ = 0b11000000 | (u >> 12);
|
||||
*buf = 0b10000000 | ((u >> 6) & 0b00111111);
|
||||
*buf = 0b10000000 | (u & 0b00111111);
|
||||
|
Loading…
x
Reference in New Issue
Block a user