Merge pull request #8327 from jepler/translation-compression-qstr

Use qstrs to improve compression

commit e0fa155037
py/maketranslationdata.py:

@@ -1,7 +1,9 @@
 """
 Process raw qstr file and output qstr data with length, hash and data bytes.

-This script works with Python 2.7, 3.3 and 3.4.
+This script is only regularly tested with the same version of Python used
+during CI, typically the latest "3.x". However, incompatibilities with any
+supported CPython version are unintended.

 For documentation about the format of compressed translated strings, see
 supervisor/shared/translate/translate.h
@@ -16,31 +18,16 @@ import sys

 import collections
 import gettext
-import os.path
+import pathlib

 if hasattr(sys.stdout, "reconfigure"):
     sys.stdout.reconfigure(encoding="utf-8")
     sys.stderr.reconfigure(errors="backslashreplace")

-py = os.path.dirname(sys.argv[0])
-top = os.path.dirname(py)
-
-sys.path.append(os.path.join(top, "tools/huffman"))
+sys.path.append(str(pathlib.Path(__file__).parent.parent / "tools/huffman"))

 import huffman
-
-# Python 2/3 compatibility:
-# - iterating through bytes is different
-# - codepoint2name lives in a different module
-import platform
-
-if platform.python_version_tuple()[0] == "2":
-    bytes_cons = lambda val, enc=None: bytearray(val)
-    from htmlentitydefs import codepoint2name
-elif platform.python_version_tuple()[0] == "3":
-    bytes_cons = bytes
-    from html.entities import codepoint2name
-# end compatibility code
+from html.entities import codepoint2name

 codepoint2name[ord("-")] = "hyphen"

@@ -182,9 +169,15 @@ class EncodingTable:
     extractor: object
     apply_offset: object
     remove_offset: object
+    translation_qstr_bits: int
+    qstrs: object
+    qstrs_inv: object


-def compute_huffman_coding(translation_name, translations, f):
+def compute_huffman_coding(qstrs, translation_name, translations, f):
+    # possible future improvement: some languages are better when consider len(k) > 2. try both?
+    qstrs = dict((k, v) for k, v in qstrs.items() if len(k) > 3)
+    qstr_strs = list(qstrs.keys())
     texts = [t[1] for t in translations]
     words = []

@@ -234,10 +227,12 @@ def compute_huffman_coding(translation_name, translations, f):
     # if "the" is in words then not only will "the" not be considered
     # again, neither will "there" or "wither", since they have "the"
     # as substrings.
-    extractor = TextSplitter(words)
+    extractor = TextSplitter(words + qstr_strs)
     counter = collections.Counter()
     for t in texts:
         for atom in extractor.iter(t):
+            if atom in qstrs:
+                atom = "\1"
             counter[atom] += 1
     cb = huffman.codebook(counter.items())
     lengths = sorted(dict((v, len(cb[k])) for k, v in counter.items()).items())
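
(Illustration, not part of this commit: a self-contained sketch of the "\1" substitution above. The qstr names and indices are invented, and a plain str.split stands in for TextSplitter.iter(); the point is that every qstr occurrence feeds one shared Huffman symbol instead of its individual characters.)

    import collections

    qstrs = {"ValueError": 412, "IndexError": 287}   # hypothetical qstr -> index map
    texts = ["ValueError was raised", "IndexError out of range"]

    counter = collections.Counter()
    for t in texts:
        for atom in t.split(" "):        # crude stand-in for extractor.iter(t)
            if atom in qstrs:
                atom = "\1"              # every qstr collapses to the same symbol
            counter[atom] += 1

    assert counter["\1"] == 2            # both qstr hits share one Huffman symbol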
@@ -304,10 +299,14 @@ def compute_huffman_coding(translation_name, translations, f):
             words.append(word)

     words.sort(key=len)
-    extractor = TextSplitter(words)
+    extractor = TextSplitter(words + qstr_strs)
     counter = collections.Counter()
+    used_qstr = 0
     for t in texts:
         for atom in extractor.iter(t):
+            if atom in qstrs:
+                used_qstr = max(used_qstr, qstrs[atom])
+                atom = "\1"
             counter[atom] += 1
     cb = huffman.codebook(counter.items())

@@ -322,6 +321,8 @@ def compute_huffman_coding(translation_name, translations, f):
     last_length = None
     canonical = {}
     for atom, code in sorted(cb.items(), key=lambda x: (len(x[1]), x[0])):
+        if atom in qstr_strs:
+            atom = "\1"
         values.append(atom)
         length = len(code)
         if length not in length_count:
@@ -359,6 +360,8 @@ def compute_huffman_coding(translation_name, translations, f):
     minlen = len(words[0])
     wlencount = [len([None for w in words if len(w) == l]) for l in range(minlen, maxlen + 1)]

+    translation_qstr_bits = used_qstr.bit_length()
+
     f.write("typedef {} mchar_t;\n".format(values_type))
     f.write("const uint8_t lengths[] = {{ {} }};\n".format(", ".join(map(str, lengths))))
     f.write(
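
(Illustration, not from the commit: the fixed field width is just the bit length of the highest qstr index actually referenced by this translation; the index below is invented.)

    used_qstr = 700                       # hypothetical highest referenced qstr index
    translation_qstr_bits = used_qstr.bit_length()
    assert translation_qstr_bits == 10    # any index in 512..1023 needs 10 bits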
@@ -383,34 +386,44 @@ def compute_huffman_coding(translation_name, translations, f):
     f.write("#define maxlen {}\n".format(maxlen))
     f.write("#define translation_offstart {}\n".format(offstart))
     f.write("#define translation_offset {}\n".format(offset))
+    f.write("#define translation_qstr_bits {}\n".format(translation_qstr_bits))

-    return EncodingTable(values, lengths, words, canonical, extractor, apply_offset, remove_offset)
+    qstrs_inv = dict((v, k) for k, v in qstrs.items())
+    return EncodingTable(
+        values,
+        lengths,
+        words,
+        canonical,
+        extractor,
+        apply_offset,
+        remove_offset,
+        translation_qstr_bits,
+        qstrs,
+        qstrs_inv,
+    )


 def decompress(encoding_table, encoded, encoded_length_bits):
+    qstrs_inv = encoding_table.qstrs_inv
     values = encoding_table.values
     lengths = encoding_table.lengths
     words = encoding_table.words

-    dec = []
-    this_byte = 0
-    this_bit = 7
-    b = encoded[this_byte]
-    bits = 0
-    for i in range(encoded_length_bits):
-        bits <<= 1
-        if 0x80 & b:
-            bits |= 1
-
-        b <<= 1
-        if this_bit == 0:
-            this_bit = 7
-            this_byte += 1
-            if this_byte < len(encoded):
-                b = encoded[this_byte]
-        else:
-            this_bit -= 1
-    length = bits
+    def bititer():
+        for byte in encoded:
+            for bit in (0x80, 0x40, 0x20, 0x10, 0x8, 0x4, 0x2, 0x1):
+                yield bool(byte & bit)
+
+    nextbit = bititer().__next__
+
+    def getnbits(n):
+        bits = 0
+        for i in range(n):
+            bits = (bits << 1) | nextbit()
+        return bits
+
+    dec = []
+    length = getnbits(encoded_length_bits)

     i = 0
     while i < length:
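
(Illustration, standalone copy: the generator above yields bits most-significant-first, one per call.)

    def bititer(encoded):
        for byte in encoded:
            for bit in (0x80, 0x40, 0x20, 0x10, 0x8, 0x4, 0x2, 0x1):
                yield bool(byte & bit)

    nextbit = bititer(b"\xa0").__next__              # 0xa0 == 0b1010_0000
    assert [nextbit() for _ in range(3)] == [True, False, True]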
@@ -419,19 +432,8 @@ def decompress(encoding_table, encoded, encoded_length_bits):
         max_code = lengths[0]
         searched_length = lengths[0]
         while True:
-            bits <<= 1
-            if 0x80 & b:
-                bits |= 1
-
-            b <<= 1
+            bits = (bits << 1) | nextbit()
             bit_length += 1
-            if this_bit == 0:
-                this_bit = 7
-                this_byte += 1
-                if this_byte < len(encoded):
-                    b = encoded[this_byte]
-            else:
-                this_bit -= 1
             if max_code > 0 and bits < max_code:
                 # print('{0:0{width}b}'.format(bits, width=bit_length))
                 break
@@ -439,7 +441,10 @@ def decompress(encoding_table, encoded, encoded_length_bits):
             searched_length += lengths[bit_length]

         v = values[searched_length + bits - max_code]
-        if v >= chr(0x80) and v < chr(0x80 + len(words)):
+        if v == chr(1):
+            qstr_idx = getnbits(encoding_table.translation_qstr_bits)
+            v = qstrs_inv[qstr_idx]
+        elif v >= chr(0x80) and v < chr(0x80 + len(words)):
             v = words[ord(v) - 0x80]
         i += len(v.encode("utf-8"))
         dec.append(v)
@@ -449,36 +454,37 @@ def decompress(encoding_table, encoded, encoded_length_bits):
 def compress(encoding_table, decompressed, encoded_length_bits, len_translation_encoded):
     if not isinstance(decompressed, str):
         raise TypeError()
+    qstrs = encoding_table.qstrs
     canonical = encoding_table.canonical
     extractor = encoding_table.extractor

-    enc = bytearray(len(decompressed) * 3)
-    current_bit = 7
-    current_byte = 0
-
-    bits = encoded_length_bits + 1
-    for i in range(bits - 1, 0, -1):
-        if len_translation_encoded & (1 << (i - 1)):
-            enc[current_byte] |= 1 << current_bit
-        if current_bit == 0:
-            current_bit = 7
-            current_byte += 1
-        else:
-            current_bit -= 1
+    enc = 1
+
+    def put_bit(enc, b):
+        return (enc << 1) | bool(b)
+
+    def put_bits(enc, b, n):
+        for i in range(n - 1, -1, -1):
+            enc = put_bit(enc, b & (1 << i))
+        return enc
+
+    enc = put_bits(enc, len_translation_encoded, encoded_length_bits)

     for atom in extractor.iter(decompressed):
-        for b in canonical[atom]:
-            if b == "1":
-                enc[current_byte] |= 1 << current_bit
-            if current_bit == 0:
-                current_bit = 7
-                current_byte += 1
-            else:
-                current_bit -= 1
+        if atom in qstrs:
+            can = canonical["\1"]
+        else:
+            can = canonical[atom]
+        for b in can:
+            enc = put_bit(enc, b == "1")
+        if atom in qstrs:
+            enc = put_bits(enc, qstrs[atom], encoding_table.translation_qstr_bits)

-    if current_bit != 7:
-        current_byte += 1
-    return enc[:current_byte]
+    while enc.bit_length() % 8 != 1:
+        enc = put_bit(enc, 0)
+
+    r = enc.to_bytes((enc.bit_length() + 7) // 8, "big")
+    return r[1:]


 def qstr_escape(qst):
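
(Illustration, values invented: the accumulator trick used by the rewritten compress(). Starting the integer at 1 preserves leading zero bits, the stream is padded out to a byte boundary, and the sentinel byte is dropped at the end.)

    enc = 1                                    # sentinel bit marks the stream start
    for bit in (0, 1, 1):                      # append payload bits MSB-first
        enc = (enc << 1) | bit
    while enc.bit_length() % 8 != 1:           # pad with zero bits to a byte boundary
        enc = (enc << 1) | 0
    data = enc.to_bytes((enc.bit_length() + 7) // 8, "big")[1:]  # drop the sentinel byte
    assert data == b"\x60"                     # payload 011 -> 0b0110_0000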
@@ -493,10 +499,20 @@ def qstr_escape(qst):
     return re.sub(r"[^A-Za-z0-9_]", esc_char, qst)


+def parse_qstrs(infile):
+    r = {}
+    rx = re.compile(r'QDEF\([A-Za-z0-9_]+,\s*\d+,\s*\d+,\s*(?P<cstr>"(?:[^"\\\\]*|\\.)")\)')
+    content = infile.read()
+    for i, mat in enumerate(rx.findall(content, re.M)):
+        mat = eval(mat)
+        r[mat] = i
+    return r
+
+
 def parse_input_headers(infiles):
     i18ns = set()

-    # read the qstrs in from the input files
+    # read the TRANSLATE strings in from the input files
     for infile in infiles:
         with open(infile, "rt") as f:
             for line in f:
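
(Illustration only: roughly what parse_qstrs() extracts from qstrdefs.generated.h. The QDEF lines are invented stand-ins for the generated file, and the regex is written with single-escaped backslashes so the snippet runs standalone; the result maps each qstr string to its ordinal position.)

    import io, re

    sample = io.StringIO(
        'QDEF(MP_QSTR_ValueError, 1234, 10, "ValueError")\n'
        'QDEF(MP_QSTR_deinit, 777, 6, "deinit")\n'
    )
    rx = re.compile(r'QDEF\([A-Za-z0-9_]+,\s*\d+,\s*\d+,\s*(?P<cstr>"(?:[^"\\]*|\\.)")\)')
    mapping = {eval(m): i for i, m in enumerate(rx.findall(sample.read()))}
    assert mapping == {"ValueError": 0, "deinit": 1}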
@@ -516,12 +532,12 @@ def escape_bytes(qstr):
         return qstr
     else:
         # qstr contains non-printable codes so render entire thing as hex pairs
-        qbytes = bytes_cons(qstr, "utf8")
+        qbytes = bytes(qstr, "utf8")
         return "".join(("\\x%02x" % b) for b in qbytes)


 def make_bytes(cfg_bytes_len, cfg_bytes_hash, qstr):
-    qbytes = bytes_cons(qstr, "utf8")
+    qbytes = bytes(qstr, "utf8")
     qlen = len(qbytes)
     qhash = compute_hash(qbytes, cfg_bytes_hash)
     if qlen >= (1 << (8 * cfg_bytes_len)):
@@ -551,7 +567,7 @@ def output_translation_data(encoding_table, i18ns, out):
         )
         total_text_compressed_size += len(compressed)
         decompressed = decompress(encoding_table, compressed, encoded_length_bits)
-        assert decompressed == translation
+        assert decompressed == translation, (decompressed, translation)
         for c in C_ESCAPES:
             decompressed = decompressed.replace(c, C_ESCAPES[c])
         formatted = ["{:d}".format(x) for x in compressed]
@@ -572,7 +588,7 @@ if __name__ == "__main__":
     import argparse

     parser = argparse.ArgumentParser(
-        description="Process QSTR definitions into headers for compilation"
+        description="Process TRANSLATE strings into headers for compilation"
     )
     parser.add_argument(
         "infiles", metavar="N", type=str, nargs="+", help="an integer for the accumulator"
@@ -590,13 +606,19 @@ if __name__ == "__main__":
         type=argparse.FileType("w", encoding="UTF-8"),
         help="c file for translation data",
     )
+    parser.add_argument(
+        "--qstrdefs_filename",
+        type=argparse.FileType("r", encoding="UTF-8"),
+        help="",
+    )

     args = parser.parse_args()

+    qstrs = parse_qstrs(args.qstrdefs_filename)
     i18ns = parse_input_headers(args.infiles)
     i18ns = sorted(i18ns)
     translations = translate(args.translation, i18ns)
     encoding_table = compute_huffman_coding(
-        args.translation, translations, args.compression_filename
+        qstrs, args.translation, translations, args.compression_filename
     )
     output_translation_data(encoding_table, translations, args.translation_filename)
py/py.mk:

@@ -270,7 +270,7 @@ $(PY_BUILD)/translations-$(TRANSLATION).c: $(HEADER_BUILD)/compressed_translatio
 $(HEADER_BUILD)/compressed_translations.generated.h: $(PY_SRC)/maketranslationdata.py $(HEADER_BUILD)/$(TRANSLATION).mo $(HEADER_BUILD)/qstrdefs.generated.h
 	$(STEPECHO) "GEN $@"
 	$(Q)mkdir -p $(PY_BUILD)
-	$(Q)$(PYTHON) $(PY_SRC)/maketranslationdata.py --compression_filename $(HEADER_BUILD)/compressed_translations.generated.h --translation $(HEADER_BUILD)/$(TRANSLATION).mo --translation_filename $(PY_BUILD)/translations-$(TRANSLATION).c $(HEADER_BUILD)/qstrdefs.preprocessed.h
+	$(Q)$(PYTHON) $(PY_SRC)/maketranslationdata.py --compression_filename $(HEADER_BUILD)/compressed_translations.generated.h --translation $(HEADER_BUILD)/$(TRANSLATION).mo --translation_filename $(PY_BUILD)/translations-$(TRANSLATION).c --qstrdefs_filename $(HEADER_BUILD)/qstrdefs.generated.h $(HEADER_BUILD)/qstrdefs.preprocessed.h

 PY_CORE_O += $(PY_BUILD)/translations-$(TRANSLATION).o

supervisor/shared/translate/translate.h:

@@ -53,6 +53,13 @@
 // speaking, words. They're just spans of code points that frequently
 // occur together. They are ordered shortest to longest.
 //
+// - If the translation uses a lot of code points or widely spaced code points,
+// then the huffman table entries are UTF-16 code points. But if the translation
+// uses only ASCII 7-bit code points plus a SMALL range of higher code points that
+// still fit in 8 bits, translation_offset and translation_offstart are used to
+// renumber the code points so that they still fit within 8 bits. (it's very beneficial
+// for mchar_t to be 8 bits instead of 16!)
+//
 // - dictionary entries are non-overlapping, and the _ending_ index of each
 // entry is stored in an array. A count of words of each length, from
 // minlen to maxlen, is given in the array called wlencount. From
@@ -60,6 +67,14 @@
 // calculated by an efficient, small loop. (A bit of time is traded
 // to reduce the size of this table indicating lengths)
 //
+// - Value 1 ('\1') is used to indicate that a QSTR number follows. the
+// QSTR is encoded as a fixed number of bits (translation_qstr_bits), e.g.,
+// 10 bits if the highest core qstr is from 512 to 1023 inclusive.
+// (maketranslationdata uses a simple heuristic where any qstr >= 3
+// characters long is encoded in this way; this is simple but probably not
+// optimal. In fact, the rule of >= 2 characters is better for SOME languages
+// on SOME boards.)
+//
 // The "data" / "tail" construct is so that the struct's last member is a
 // "flexible array". However, the _only_ member is not permitted to be
 // a flexible member, so we have to declare the first byte as a separate
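
(Illustrative arithmetic for the scheme described above, with hypothetical code lengths: if the Huffman code assigned to '\1' is 5 bits and translation_qstr_bits is 10, one occurrence of the 10-character qstr "ValueError" costs 5 + 10 = 15 bits, whereas Huffman-coding those ten characters individually, or via dictionary words, would normally take noticeably more.)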
@@ -25,6 +25,7 @@
  */

 #include "supervisor/shared/translate/translate.h"
+#include "py/qstr.h"

 #include <stdbool.h>
 #include <stdint.h>
@@ -99,10 +100,34 @@ uint16_t decompress_length(const compressed_string_t *compressed) {
     #endif
 }

+typedef struct {
+    const uint8_t *ptr;
+    uint8_t bit;
+} bitstream_state_t;
+
+static bool next_bit(bitstream_state_t *st) {
+    bool r = *st->ptr & st->bit;
+    st->bit >>= 1;
+    if (!st->bit) {
+        st->bit = 0x80;
+        st->ptr++;
+    }
+    return r;
+}
+
+static int get_nbits(bitstream_state_t *st, int n) {
+    int r = 0;
+    while (n--) {
+        r = (r << 1) | next_bit(st);
+    }
+    return r;
+}
+
 char *decompress(const compressed_string_t *compressed, char *decompressed) {
-    uint8_t this_byte = compress_max_length_bits / 8;
-    uint8_t this_bit = 7 - compress_max_length_bits % 8;
-    uint8_t b = (&compressed->data)[this_byte] << (compress_max_length_bits % 8);
+    bitstream_state_t b = {
+        .ptr = &(compressed->data) + (compress_max_length_bits >> 3),
+        .bit = 1 << (7 - ((compress_max_length_bits) & 0x7)),
+    };
     uint16_t length = decompress_length(compressed);

     // Stop one early because the last byte is always NULL.
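
(Illustration, not part of the commit: a standalone program showing how the bitstream helpers above behave. The helpers are copied verbatim; the buffer contents are invented.)

    #include <assert.h>
    #include <stdbool.h>
    #include <stdint.h>

    typedef struct {
        const uint8_t *ptr;
        uint8_t bit;
    } bitstream_state_t;

    static bool next_bit(bitstream_state_t *st) {
        bool r = *st->ptr & st->bit;
        st->bit >>= 1;
        if (!st->bit) {
            st->bit = 0x80;
            st->ptr++;
        }
        return r;
    }

    static int get_nbits(bitstream_state_t *st, int n) {
        int r = 0;
        while (n--) {
            r = (r << 1) | next_bit(st);
        }
        return r;
    }

    int main(void) {
        const uint8_t buf[] = {0xa5, 0x00};        // 0b1010_0101 ...
        bitstream_state_t st = {.ptr = buf, .bit = 0x80};
        assert(get_nbits(&st, 3) == 0x5);          // reads 1,0,1 MSB-first
        assert(get_nbits(&st, 5) == 0x05);         // remaining bits 0,0,1,0,1
        return 0;
    }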
@@ -112,26 +137,23 @@ char *decompress(const compressed_string_t *compressed, char *decompressed) {
         uint32_t max_code = lengths[0];
         uint32_t searched_length = lengths[0];
         while (true) {
-            bits <<= 1;
-            if ((0x80 & b) != 0) {
-                bits |= 1;
-            }
-            b <<= 1;
+            bits = (bits << 1) | next_bit(&b);
             bit_length += 1;
-            if (this_bit == 0) {
-                this_bit = 7;
-                this_byte += 1;
-                b = (&compressed->data)[this_byte]; // This may read past the end but its never used.
-            } else {
-                this_bit -= 1;
-            }
             if (max_code > 0 && bits < max_code) {
                 break;
             }
             max_code = (max_code << 1) + lengths[bit_length];
             searched_length += lengths[bit_length];
         }
-        i += put_utf8(decompressed + i, values[searched_length + bits - max_code]);
+        int v = values[searched_length + bits - max_code];
+        if (v == 1) {
+            qstr q = get_nbits(&b, translation_qstr_bits) + 1; // honestly no idea why "+1"...
+            for (const char *qc = qstr_str(q); *qc;) {
+                decompressed[i++] = *qc++;
+            }
+        } else {
+            i += put_utf8(decompressed + i, v);
+        }
     }

     decompressed[length - 1] = '\0';