makeqstrdata: correct range of low code points to 0x80..0x9f inclusive

The previous range (0x80..0xbf) was unintentionally large and overlapped some
characters we'd like to use (and also 0xa0, which we don't intentionally use).
This commit is contained in:
Jeff Epler 2020-09-02 15:52:02 -05:00
parent f1c7389b29
commit c34cb82ecb

View File

@@ -116,7 +116,7 @@ def pua_to_ngrams(compressed, ngrams):
if len(ngrams) > 32: if len(ngrams) > 32:
start, end = 0xe000, 0xf8ff start, end = 0xe000, 0xf8ff
else: else:
start, end = 0x80, 0xbf start, end = 0x80, 0x9f
return "".join(ngrams[ord(c) - start] if (start <= ord(c) <= end) else c for c in compressed) return "".join(ngrams[ord(c) - start] if (start <= ord(c) <= end) else c for c in compressed)
def compute_huffman_coding(translations, qstrs, compression_filename): def compute_huffman_coding(translations, qstrs, compression_filename):
@@ -146,6 +146,7 @@ def compute_huffman_coding(translations, qstrs, compression_filename):
last_l = l last_l = l
lengths = bytearray() lengths = bytearray()
print("// length count", length_count) print("// length count", length_count)
print("// bigrams", ngrams)
for i in range(1, max(length_count) + 2): for i in range(1, max(length_count) + 2):
lengths.append(length_count.get(i, 0)) lengths.append(length_count.get(i, 0))
print("// values", values, "lengths", len(lengths), lengths) print("// values", values, "lengths", len(lengths), lengths)