From c34cb82ecb26ac2d5d85394ec535928194cd9e5e Mon Sep 17 00:00:00 2001 From: Jeff Epler Date: Wed, 2 Sep 2020 15:52:02 -0500 Subject: [PATCH] makeqstrdata: correct range of low code points to 0x80..0x9f inclusive The previous range was unintentionally large and overlapped some characters we'd like to use (as well as 0xa0, which we don't intentionally use). --- py/makeqstrdata.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/py/makeqstrdata.py b/py/makeqstrdata.py index 28aed3df97..350f11c4cb 100644 --- a/py/makeqstrdata.py +++ b/py/makeqstrdata.py @@ -116,7 +116,7 @@ def pua_to_ngrams(compressed, ngrams): if len(ngrams) > 32: start, end = 0xe000, 0xf8ff else: - start, end = 0x80, 0xbf + start, end = 0x80, 0x9f return "".join(ngrams[ord(c) - start] if (start <= ord(c) <= end) else c for c in compressed) def compute_huffman_coding(translations, qstrs, compression_filename): @@ -146,6 +146,7 @@ def compute_huffman_coding(translations, qstrs, compression_filename): last_l = l lengths = bytearray() print("// length count", length_count) + print("// bigrams", ngrams) for i in range(1, max(length_count) + 2): lengths.append(length_count.get(i, 0)) print("// values", values, "lengths", len(lengths), lengths)