Rename ngrams_to_pua / pua_to_ngrams to encode_ngrams / decode_ngrams
This commit is contained in:
parent
c34cb82ecb
commit
cbfd38d1ce
@@ -103,7 +103,7 @@ def translate(translation_file, i18ns):
|
|||||||
def frequent_ngrams(corpus, sz, n):
|
def frequent_ngrams(corpus, sz, n):
|
||||||
return collections.Counter(corpus[i:i+sz] for i in range(len(corpus)-sz)).most_common(n)
|
return collections.Counter(corpus[i:i+sz] for i in range(len(corpus)-sz)).most_common(n)
|
||||||
|
|
||||||
def ngrams_to_pua(translation, ngrams):
|
def encode_ngrams(translation, ngrams):
|
||||||
if len(ngrams) > 32:
|
if len(ngrams) > 32:
|
||||||
start = 0xe000
|
start = 0xe000
|
||||||
else:
|
else:
|
||||||
@@ -112,7 +112,7 @@ def ngrams_to_pua(translation, ngrams):
|
|||||||
translation = translation.replace(g, chr(start + i))
|
translation = translation.replace(g, chr(start + i))
|
||||||
return translation
|
return translation
|
||||||
|
|
||||||
def pua_to_ngrams(compressed, ngrams):
|
def decode_ngrams(compressed, ngrams):
|
||||||
if len(ngrams) > 32:
|
if len(ngrams) > 32:
|
||||||
start, end = 0xe000, 0xf8ff
|
start, end = 0xe000, 0xf8ff
|
||||||
else:
|
else:
|
||||||
@@ -123,7 +123,7 @@ def compute_huffman_coding(translations, qstrs, compression_filename):
|
|||||||
all_strings = [x[1] for x in translations]
|
all_strings = [x[1] for x in translations]
|
||||||
all_strings_concat = "".join(all_strings)
|
all_strings_concat = "".join(all_strings)
|
||||||
ngrams = [i[0] for i in frequent_ngrams(all_strings_concat, 2, 32)]
|
ngrams = [i[0] for i in frequent_ngrams(all_strings_concat, 2, 32)]
|
||||||
all_strings_concat = ngrams_to_pua(all_strings_concat, ngrams)
|
all_strings_concat = encode_ngrams(all_strings_concat, ngrams)
|
||||||
counts = collections.Counter(all_strings_concat)
|
counts = collections.Counter(all_strings_concat)
|
||||||
cb = huffman.codebook(counts.items())
|
cb = huffman.codebook(counts.items())
|
||||||
values = []
|
values = []
|
||||||
@@ -211,7 +211,7 @@ def decompress(encoding_table, encoded, encoded_length_bits):
|
|||||||
searched_length += lengths[bit_length]
|
searched_length += lengths[bit_length]
|
||||||
|
|
||||||
v = values[searched_length + bits - max_code]
|
v = values[searched_length + bits - max_code]
|
||||||
v = pua_to_ngrams(v, ngrams)
|
v = decode_ngrams(v, ngrams)
|
||||||
i += len(v.encode('utf-8'))
|
i += len(v.encode('utf-8'))
|
||||||
dec.append(v)
|
dec.append(v)
|
||||||
return ''.join(dec)
|
return ''.join(dec)
|
||||||
@@ -220,7 +220,7 @@ def compress(encoding_table, decompressed, encoded_length_bits, len_translation_
|
|||||||
if not isinstance(decompressed, str):
|
if not isinstance(decompressed, str):
|
||||||
raise TypeError()
|
raise TypeError()
|
||||||
values, lengths, ngrams = encoding_table
|
values, lengths, ngrams = encoding_table
|
||||||
decompressed = ngrams_to_pua(decompressed, ngrams)
|
decompressed = encode_ngrams(decompressed, ngrams)
|
||||||
enc = bytearray(len(decompressed) * 3)
|
enc = bytearray(len(decompressed) * 3)
|
||||||
#print(decompressed)
|
#print(decompressed)
|
||||||
#print(lengths)
|
#print(lengths)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user