diff --git a/py/makeqstrdata.py b/py/makeqstrdata.py
index 721fa83206..39d4a6840f 100644
--- a/py/makeqstrdata.py
+++ b/py/makeqstrdata.py
@@ -12,10 +12,14 @@ from __future__ import print_function
 import re
 import sys
+from math import log
 import collections
 import gettext
 import os.path
+sys.stdout.reconfigure(encoding='utf-8')
+sys.stderr.reconfigure(errors='backslashreplace')
+
 py = os.path.dirname(sys.argv[0])
 top = os.path.dirname(py)
@@ -100,77 +104,173 @@ def translate(translation_file, i18ns):
         translations.append((original, translation))
     return translations
 
-def frequent_ngrams(corpus, sz, n):
-    return collections.Counter(corpus[i:i+sz] for i in range(len(corpus)-sz)).most_common(n)
+class TextSplitter:
+    def __init__(self, words):
+        words.sort(key=lambda x: len(x), reverse=True)
+        self.words = set(words)
+        self.pat = re.compile("|".join(re.escape(w) for w in words) + "|.", flags=re.DOTALL)
 
-def encode_ngrams(translation, ngrams):
-    if len(ngrams) > 32:
-        start = 0xe000
-    else:
-        start = 0x80
-    for i, g in enumerate(ngrams):
-        translation = translation.replace(g, chr(start + i))
-    return translation
+    def iter_words(self, text):
+        s = []
+        words = self.words
+        for m in self.pat.finditer(text):
+            t = m.group(0)
+            if t in words:
+                if s:
+                    yield (False, "".join(s))
+                    s = []
+                yield (True, t)
+            else:
+                s.append(t)
+        if s:
+            yield (False, "".join(s))
 
-def decode_ngrams(compressed, ngrams):
-    if len(ngrams) > 32:
-        start, end = 0xe000, 0xf8ff
-    else:
-        start, end = 0x80, 0x9f
-    return "".join(ngrams[ord(c) - start] if (start <= ord(c) <= end) else c for c in compressed)
+    def iter(self, text):
+        for m in self.pat.finditer(text):
+            yield m.group(0)
+
+def iter_substrings(s, minlen, maxlen):
+    len_s = len(s)
+    maxlen = min(len_s, maxlen)
+    for n in range(minlen, maxlen + 1):
+        for begin in range(0, len_s - n + 1):
+            yield s[begin : begin + n]
+
+def compute_huffman_coding(translations, compression_filename):
+    texts = [t[1] for t in translations]
+    words = []
+
+    start_unused = 0x80
+    end_unused = 0xff
+    max_ord = 0
+    for text in texts:
+        for c in text:
+            ord_c = ord(c)
+            max_ord = max(ord_c, max_ord)
+            if 0x80 <= ord_c < 0xff:
+                end_unused = min(ord_c, end_unused)
+    max_words = end_unused - 0x80
+
+    values_type = "uint16_t" if max_ord > 255 else "uint8_t"
+    max_words_len = 160 if max_ord > 255 else 255
+
+    sum_len = 0
+    while True:
+        # Until the dictionary is filled to capacity, use a heuristic to find
+        # the best "word" (2- to 9-gram) to add to it.
+        #
+        # The TextSplitter allows us to avoid considering parts of the text
+        # that are already covered by a previously chosen word, for example
+        # if "the" is in words then not only will "the" not be considered
+        # again, but neither will "there" nor "wither", since they have "the"
+        # as substrings.
+        extractor = TextSplitter(words)
+        counter = collections.Counter()
+        for t in texts:
+            for (found, word) in extractor.iter_words(t):
+                if not found:
+                    for substr in iter_substrings(word, minlen=2, maxlen=9):
+                        counter[substr] += 1
+
+        # Score the candidates we found. This is an empirical formula only,
+        # chosen for its effectiveness.
+        scores = sorted(
+            (
+                (s, (len(s) - 1) ** log(max(occ - 2, 1)), occ)
+                for (s, occ) in counter.items()
+            ),
+            key=lambda x: x[1],
+            reverse=True,
+        )
+
+        # Do we have a "word" that occurred 5 times and got a score of at least
+        # 5? Hooray. Pick the one with the highest score.
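+        # (Illustrative arithmetic for the score formula above: a 4-character
+        # candidate seen 10 times scores (4-1) ** log(10-2) ~= 9.8, while any
+        # 2-character candidate scores (2-1) ** log(occ-2) == 1 no matter how
+        # often it occurs, so longer repeated spans dominate the ranking.)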
+        word = None
+        for (s, score, occ) in scores:
+            if occ < 5:
+                continue
+            if score < 5:
+                break
+            word = s
+            break
+
+        # If we can successfully add it to the dictionary, do so. Otherwise,
+        # we've filled the dictionary to capacity and are done.
+        if not word:
+            break
+        if sum_len + len(word) - 2 > max_words_len:
+            break
+        if len(words) == max_words:
+            break
+        words.append(word)
+        sum_len += len(word) - 2
+
+    extractor = TextSplitter(words)
+    counter = collections.Counter()
+    for t in texts:
+        for atom in extractor.iter(t):
+            counter[atom] += 1
+    cb = huffman.codebook(counter.items())
+
+    word_start = start_unused
+    word_end = word_start + len(words) - 1
+    print("// # words", len(words))
+    print("// words", words)
 
-def compute_huffman_coding(translations, qstrs, compression_filename):
-    all_strings = [x[1] for x in translations]
-    all_strings_concat = "".join(all_strings)
-    ngrams = [i[0] for i in frequent_ngrams(all_strings_concat, 2, 32)]
-    all_strings_concat = encode_ngrams(all_strings_concat, ngrams)
-    counts = collections.Counter(all_strings_concat)
-    cb = huffman.codebook(counts.items())
     values = []
     length_count = {}
     renumbered = 0
-    last_l = None
+    last_length = None
     canonical = {}
-    for ch, code in sorted(cb.items(), key=lambda x: (len(x[1]), x[0])):
-        values.append(ch)
-        l = len(code)
-        if l not in length_count:
-            length_count[l] = 0
-        length_count[l] += 1
-        if last_l:
-            renumbered <<= (l - last_l)
-        canonical[ch] = '{0:0{width}b}'.format(renumbered, width=l)
-        s = C_ESCAPES.get(ch, ch)
-        print("//", ord(ch), s, counts[ch], canonical[ch], renumbered)
+    for atom, code in sorted(cb.items(), key=lambda x: (len(x[1]), x[0])):
+        values.append(atom)
+        length = len(code)
+        if length not in length_count:
+            length_count[length] = 0
+        length_count[length] += 1
+        if last_length:
+            renumbered <<= (length - last_length)
+        canonical[atom] = '{0:0{width}b}'.format(renumbered, width=length)
+        # print(f"atom={repr(atom)} code={code}", file=sys.stderr)
+        if len(atom) > 1:
+            o = words.index(atom) + 0x80
+            s = "".join(C_ESCAPES.get(ch1, ch1) for ch1 in atom)
+        else:
+            s = C_ESCAPES.get(atom, atom)
+            o = ord(atom)
+        print("//", o, s, counter[atom], canonical[atom], renumbered)
         renumbered += 1
-        last_l = l
+        last_length = length
     lengths = bytearray()
     print("// length count", length_count)
-    print("// bigrams", ngrams)
+
     for i in range(1, max(length_count) + 2):
         lengths.append(length_count.get(i, 0))
     print("// values", values, "lengths", len(lengths), lengths)
-    ngramdata = [ord(ni) for i in ngrams for ni in i]
-    print("// estimated total memory size", len(lengths) + 2*len(values) + 2 * len(ngramdata) + sum((len(cb[u]) + 7)//8 for u in all_strings_concat))
+    print("//", values, lengths)
-    values_type = "uint16_t" if max(ord(u) for u in values) > 255 else "uint8_t"
-    max_translation_encoded_length = max(len(translation.encode("utf-8")) for original,translation in translations)
+    values = [(atom if len(atom) == 1 else chr(0x80 + words.index(atom))) for atom in values]
+    print("//", values, lengths)
+    max_translation_encoded_length = max(
+        len(translation.encode("utf-8")) for (original, translation) in translations)
+
+    wends = list(len(w) - 2 for w in words)
+    for i in range(1, len(wends)):
+        wends[i] += wends[i - 1]
+
     with open(compression_filename, "w") as f:
         f.write("const uint8_t lengths[] = {{ {} }};\n".format(", ".join(map(str, lengths))))
         f.write("const {} values[] = {{ {} }};\n".format(values_type, ", ".join(str(ord(u)) for u in values)))
         f.write("#define compress_max_length_bits ({})\n".format(max_translation_encoded_length.bit_length()))
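+        # The encoded form of each string is its decompressed UTF-8 length,
+        # packed into compress_max_length_bits bits, followed by the canonical
+        # Huffman bitstream; see compress() below.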
-        f.write("const {} bigrams[] = {{ {} }};\n".format(values_type, ", ".join(str(u) for u in ngramdata)))
-        if len(ngrams) > 32:
-            bigram_start = 0xe000
-        else:
-            bigram_start = 0x80
-        bigram_end = bigram_start + len(ngrams) - 1 # End is inclusive
-        f.write("#define bigram_start {}\n".format(bigram_start))
-        f.write("#define bigram_end {}\n".format(bigram_end))
-    return values, lengths, ngrams
+        f.write("const {} words[] = {{ {} }};\n".format(values_type, ", ".join(str(ord(c)) for w in words for c in w)))
+        f.write("const uint8_t wends[] = {{ {} }};\n".format(", ".join(str(p) for p in wends)))
+        f.write("#define word_start {}\n".format(word_start))
+        f.write("#define word_end {}\n".format(word_end))
+
+    return (values, lengths, words, canonical, extractor)
 
 def decompress(encoding_table, encoded, encoded_length_bits):
-    values, lengths, ngrams = encoding_table
+    (values, lengths, words, _, _) = encoding_table
     dec = []
     this_byte = 0
     this_bit = 7
@@ -218,7 +318,8 @@ def decompress(encoding_table, encoded, encoded_length_bits):
             searched_length += lengths[bit_length]
 
         v = values[searched_length + bits - max_code]
-        v = decode_ngrams(v, ngrams)
+        if v >= chr(0x80) and v < chr(0x80 + len(words)):
+            v = words[ord(v) - 0x80]
         i += len(v.encode('utf-8'))
         dec.append(v)
     return ''.join(dec)
@@ -226,66 +327,32 @@ def decompress(encoding_table, encoded, encoded_length_bits):
 def compress(encoding_table, decompressed, encoded_length_bits, len_translation_encoded):
     if not isinstance(decompressed, str):
         raise TypeError()
-    values, lengths, ngrams = encoding_table
-    decompressed = encode_ngrams(decompressed, ngrams)
+    (_, _, _, canonical, extractor) = encoding_table
+
     enc = bytearray(len(decompressed) * 3)
-    #print(decompressed)
-    #print(lengths)
     current_bit = 7
     current_byte = 0
 
-    code = len_translation_encoded
-    bits = encoded_length_bits+1
+    bits = encoded_length_bits + 1
     for i in range(bits - 1, 0, -1):
         if len_translation_encoded & (1 << (i - 1)):
             enc[current_byte] |= 1 << current_bit
         if current_bit == 0:
             current_bit = 7
-            #print("packed {0:0{width}b}".format(enc[current_byte], width=8))
            current_byte += 1
        else:
            current_bit -= 1
 
-    for c in decompressed:
-        #print()
-        #print("char", c, values.index(c))
-        start = 0
-        end = lengths[0]
-        bits = 1
-        compressed = None
-        code = 0
-        while compressed is None:
-            s = start
-            e = end
-            #print("{0:0{width}b}".format(code, width=bits))
-            # Binary search!
-            while e > s:
-                midpoint = (s + e) // 2
-                #print(s, e, midpoint)
-                if values[midpoint] == c:
-                    compressed = code + (midpoint - start)
-                    #print("found {0:0{width}b}".format(compressed, width=bits))
-                    break
-                elif c < values[midpoint]:
-                    e = midpoint
-                else:
-                    s = midpoint + 1
-            code += end - start
-            code <<= 1
-            start = end
-            end += lengths[bits]
-            bits += 1
-            #print("next bit", bits)
-
-    for i in range(bits - 1, 0, -1):
-        if compressed & (1 << (i - 1)):
+    for atom in extractor.iter(decompressed):
+        for b in canonical[atom]:
+            if b == "1":
                 enc[current_byte] |= 1 << current_bit
             if current_bit == 0:
                 current_bit = 7
-                #print("packed {0:0{width}b}".format(enc[current_byte], width=8))
                 current_byte += 1
             else:
                 current_bit -= 1
+
     if current_bit != 7:
         current_byte += 1
     return enc[:current_byte]
@@ -452,7 +519,7 @@ if __name__ == "__main__":
     if args.translation:
         i18ns = sorted(i18ns)
         translations = translate(args.translation, i18ns)
-        encoding_table = compute_huffman_coding(translations, qstrs, args.compression_filename)
+        encoding_table = compute_huffman_coding(translations, args.compression_filename)
         print_qstr_data(encoding_table, qcfgs, qstrs, translations)
     else:
         print_qstr_enums(qstrs)
diff --git a/supervisor/shared/translate.c b/supervisor/shared/translate.c
index 5cd7b8dd88..44544c98dd 100644
--- a/supervisor/shared/translate.c
+++ b/supervisor/shared/translate.c
@@ -47,13 +47,22 @@ STATIC int put_utf8(char *buf, int u) {
     if(u <= 0x7f) {
         *buf = u;
         return 1;
-    } else if(bigram_start <= u && u <= bigram_end) {
-        int n = (u - 0x80) * 2;
-        // (note that at present, entries in the bigrams table are
-        // guaranteed not to represent bigrams themselves, so this adds
+    } else if(word_start <= u && u <= word_end) {
+        uint n = (u - word_start);
+        size_t pos = 0;
+        if (n > 0) {
+            pos = wends[n - 1] + (n * 2);
+        }
+        int ret = 0;
+        // note that at present, entries in the words table are
+        // guaranteed not to represent words themselves, so this adds
         // at most 1 level of recursive call
-        int ret = put_utf8(buf, bigrams[n]);
-        return ret + put_utf8(buf + ret, bigrams[n+1]);
+        for(; pos < wends[n] + (n + 1) * 2; pos++) {
+            int len = put_utf8(buf, words[pos]);
+            buf += len;
+            ret += len;
+        }
+        return ret;
     } else if(u <= 0x07ff) {
         *buf++ = 0b11000000 | (u >> 6);
         *buf = 0b10000000 | (u & 0b00111111);
diff --git a/supervisor/shared/translate.h b/supervisor/shared/translate.h
index 731b26d123..16296a4161 100644
--- a/supervisor/shared/translate.h
+++ b/supervisor/shared/translate.h
@@ -43,6 +43,19 @@
 // (building the huffman encoding on UTF-16 code points gave better
 // compression than building it on UTF-8 bytes)
 //
+// - code points starting at 128 (word_start) and potentially extending
+//   to 255 (word_end) (but never interfering with the target
+//   language's used code points) stand for dictionary entries in a
+//   dictionary with size up to 256 code points.  The dictionary entries
+//   are computed with a heuristic based on frequent substrings of 2 to
+//   9 code points.  These are called "words" but are not, grammatically
+//   speaking, words.  They're just spans of code points that frequently
+//   occur together.
+//
+// - dictionary entries are non-overlapping, and the _ending_ index of each
+//   entry is stored in an array.  Since the index given is the ending
+//   index, the array is called "wends".
+//
 // The "data" / "tail" construct is so that the struct's last member is a
 // "flexible array".
 // However, the _only_ member is not permitted to be a flexible member,
 // so we have to declare the first byte as a separate member.
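
Reviewer note: a minimal sketch (plain Python, not part of the patch) of the
words/wends index arithmetic that put_utf8() relies on. The word list here is
hypothetical; makeqstrdata.py stores len(w) - 2 cumulatively in wends, so word
n occupies positions [wends[n-1] + 2*n, wends[n] + 2*(n+1)) of the flattened
words[] table:

    words = ["the", "in", "tion"]   # hypothetical dictionary entries

    # Build wends exactly as compute_huffman_coding() does.
    wends = [len(w) - 2 for w in words]
    for i in range(1, len(wends)):
        wends[i] += wends[i - 1]

    flat = "".join(words)           # the flattened C words[] array

    def word_at(n):
        # Mirrors the pointer arithmetic in put_utf8().
        pos = wends[n - 1] + n * 2 if n > 0 else 0
        end = wends[n] + (n + 1) * 2
        return flat[pos:end]

    # Every dictionary entry round-trips through the packed table.
    assert [word_at(n) for n in range(len(words))] == words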