makeqstrdata: correct range of low code points to 0x80..0x9f inclusive
The previous range (0x80..0xbf) was unintentionally large and overlapped some characters we'd like to use (and also 0xa0, which we don't intentionally use).
This commit is contained in:
parent
f1c7389b29
commit
c34cb82ecb
@ -116,7 +116,7 @@ def pua_to_ngrams(compressed, ngrams):
|
|||||||
if len(ngrams) > 32:
|
if len(ngrams) > 32:
|
||||||
start, end = 0xe000, 0xf8ff
|
start, end = 0xe000, 0xf8ff
|
||||||
else:
|
else:
|
||||||
start, end = 0x80, 0xbf
|
start, end = 0x80, 0x9f
|
||||||
return "".join(ngrams[ord(c) - start] if (start <= ord(c) <= end) else c for c in compressed)
|
return "".join(ngrams[ord(c) - start] if (start <= ord(c) <= end) else c for c in compressed)
|
||||||
|
|
||||||
def compute_huffman_coding(translations, qstrs, compression_filename):
|
def compute_huffman_coding(translations, qstrs, compression_filename):
|
||||||
@ -146,6 +146,7 @@ def compute_huffman_coding(translations, qstrs, compression_filename):
|
|||||||
last_l = l
|
last_l = l
|
||||||
lengths = bytearray()
|
lengths = bytearray()
|
||||||
print("// length count", length_count)
|
print("// length count", length_count)
|
||||||
|
print("// bigrams", ngrams)
|
||||||
for i in range(1, max(length_count) + 2):
|
for i in range(1, max(length_count) + 2):
|
||||||
lengths.append(length_count.get(i, 0))
|
lengths.append(length_count.get(i, 0))
|
||||||
print("// values", values, "lengths", len(lengths), lengths)
|
print("// values", values, "lengths", len(lengths), lengths)
|
||||||
|
Loading…
Reference in New Issue
Block a user