Compress word offset table
By storing "count of words by length", the long `wends` table can be replaced with a short `wlencount` table. This saves flash storage space. Extend the range of string lengths that can be in the dictionary. Originally it was to 2 to 9; at one point it was changed to 3 to 9. Putting the lower bound back at 2 has a positive impact on the French translation (a bunch of them, such as "ch", "\r\n", "%q", are used). Increasing the maximum length gets 'mpossible', ' doit être ', and 'CircuitPyth' at the long end. This adds a bit of processing time to makeqstrdata. The specific 2/11 values are again empirical based on the French translation on the adafruit_proxlight_trinkey_m0.
parent 063e3946d6
commit d59a28db97
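The flash saving comes from replacing a table with one entry per dictionary word by a table with one entry per distinct word length. A rough back-of-the-envelope sketch (word count invented, not taken from an actual build):

    # Hypothetical dictionary of 200 words spanning lengths 2..11.
    num_words = 200
    minlen, maxlen = 2, 11

    wends_bytes = num_words                # old: one uint8_t ending offset per word
    wlencount_bytes = maxlen - minlen + 1  # new: one uint8_t count per length

    print(wends_bytes, "->", wlencount_bytes)  # 200 -> 10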
@@ -333,12 +333,9 @@ def compute_huffman_coding(translations, compression_filename):
     bits_per_codepoint = 16 if max_ord > 255 else 8
     values_type = "uint16_t" if max_ord > 255 else "uint8_t"
-    max_words_len = 160 if max_ord > 255 else 255
-
-    sum_len = 0
-    while True:
+    while len(words) < max_words:
         # Until the dictionary is filled to capacity, use a heuristic to find
-        # the best "word" (3- to 9-gram) to add to it.
+        # the best "word" (2- to 11-gram) to add to it.
         #
         # The TextSplitter allows us to avoid considering parts of the text
         # that are already covered by a previously chosen word, for example
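The candidate n-grams are substrings of the spans between already-chosen dictionary entries. A toy stand-in for the iter_substrings helper used further down (the real helper lives in makeqstrdata; this version is illustrative only):

    def iter_substrings_demo(s, minlen, maxlen):
        # Yield every substring of s whose length lies in [minlen, maxlen].
        for n in range(minlen, min(len(s), maxlen) + 1):
            for begin in range(len(s) - n + 1):
                yield s[begin : begin + n]

    # With the widened lower bound, 2-grams become candidates:
    print(sorted(set(iter_substrings_demo("doit", 2, 3))))
    # ['do', 'doi', 'it', 'oi', 'oit']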
@@ -369,7 +366,8 @@ def compute_huffman_coding(translations, compression_filename):
         # the Huffman tree bumps up the encoding lengths of all words in the
         # same subtree. In the extreme case when the new word is so frequent
         # that it gets a one-bit encoding, all other words will cost an extra
-        # bit each.
+        # bit each. This is empirically modeled by the constant factor added to
+        # cost, but the specific value used isn't "proven" to be correct.
         #
         # Another source of inaccuracy is that compressed strings end up
         # on byte boundaries, not bit boundaries, so saving 1 bit somewhere
@@ -383,14 +381,14 @@ def compute_huffman_coding(translations, compression_filename):
         # The difference between the two is the estimated net savings, in bits.
         def est_net_savings(s, occ):
             savings = occ * (bit_length(s) - est_len(occ))
-            cost = len(s) * bits_per_codepoint
+            cost = len(s) * bits_per_codepoint + 24
             return savings - cost
 
         counter = collections.Counter()
         for t in texts:
             for (found, word) in extractor.iter_words(t):
                 if not found:
-                    for substr in iter_substrings(word, minlen=3, maxlen=9):
+                    for substr in iter_substrings(word, minlen=2, maxlen=11):
                         counter[substr] += 1
 
         # Score the candidates we found. This is a semi-empirical formula that
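To see what the new `+ 24` term does, a worked example with invented numbers: a 2-codepoint candidate occurring 40 times, spelled out at 14 bits per occurrence, with an estimated 8-bit word code.

    bits_per_codepoint = 8  # assuming a Latin-1-only translation

    def est_net_savings_demo(s_len, occ, bit_length_s, est_len_occ):
        # Same shape as the patched est_net_savings. The extra 24 bits
        # approximate the Huffman-subtree penalty plus per-word overhead;
        # per the comment above, the value is empirical, not proven.
        savings = occ * (bit_length_s - est_len_occ)
        cost = s_len * bits_per_codepoint + 24
        return savings - cost

    print(est_net_savings_demo(2, 40, 14, 8))  # 40*(14-8) - (2*8 + 24) = 200 bits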
@@ -410,16 +408,9 @@ def compute_huffman_coding(translations, compression_filename):
             break
 
         word = scores[0][0]
-
-        # If we can successfully add it to the dictionary, do so. Otherwise,
-        # we've filled the dictionary to capacity and are done.
-        if sum_len + len(word) - 2 > max_words_len:
-            break
-        if len(words) == max_words:
-            break
         words.append(word)
-        sum_len += len(word) - 2
 
     words.sort(key=len)
     extractor = TextSplitter(words)
     counter = collections.Counter()
     for t in texts:
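With the capacity bookkeeping gone, the dictionary-building loop reduces to a plain greedy shape; schematically (helper name invented):

    def build_dictionary(max_words, best_candidate):
        # The only exits are a full dictionary or no candidate with a
        # positive estimated net savings.
        words = []
        while len(words) < max_words:
            word, score = best_candidate(words)
            if score <= 0:
                break
            words.append(word)
        words.sort(key=len)  # shortest-to-longest, as wlencount requires
        return words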
@@ -469,16 +460,15 @@ def compute_huffman_coding(translations, compression_filename):
         len(translation.encode("utf-8")) for (original, translation) in translations
     )
 
-    wends = list(len(w) - 2 for w in words)
-    for i in range(1, len(wends)):
-        wends[i] += wends[i - 1]
+    maxlen = len(words[-1])
+    minlen = len(words[0])
+    wlencount = [len([None for w in words if len(w) == l]) for l in range(minlen, maxlen + 1)]
 
     with open(compression_filename, "w") as f:
+        f.write("typedef {} mchar_t;".format(values_type))
         f.write("const uint8_t lengths[] = {{ {} }};\n".format(", ".join(map(str, lengths))))
         f.write(
-            "const {} values[] = {{ {} }};\n".format(
-                values_type, ", ".join(str(ord(u)) for u in values)
-            )
+            "const mchar_t values[] = {{ {} }};\n".format(", ".join(str(ord(u)) for u in values))
         )
         f.write(
             "#define compress_max_length_bits ({})\n".format(
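Note that the list comprehension records a zero for every length that happens to be absent between minlen and maxlen; the runtime loop depends on those zeros. A quick check with an invented word list:

    words = sorted(["ch", "%q", "de ", " un ", " doit être "], key=len)
    maxlen = len(words[-1])
    minlen = len(words[0])
    wlencount = [len([None for w in words if len(w) == l]) for l in range(minlen, maxlen + 1)]
    print(minlen, maxlen)  # 2 11
    print(wlencount)       # [2, 1, 1, 0, 0, 0, 0, 0, 0, 1]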
@@ -486,13 +476,17 @@ def compute_huffman_coding(translations, compression_filename):
             )
         )
         f.write(
-            "const {} words[] = {{ {} }};\n".format(
-                values_type, ", ".join(str(ord(c)) for w in words for c in w)
+            "const mchar_t words[] = {{ {} }};\n".format(
+                ", ".join(str(ord(c)) for w in words for c in w)
             )
         )
-        f.write("const uint8_t wends[] = {{ {} }};\n".format(", ".join(str(p) for p in wends)))
+        f.write(
+            "const uint8_t wlencount[] = {{ {} }};\n".format(", ".join(str(p) for p in wlencount))
+        )
         f.write("#define word_start {}\n".format(word_start))
         f.write("#define word_end {}\n".format(word_end))
+        f.write("#define minlen {}\n".format(minlen))
+        f.write("#define maxlen {}\n".format(maxlen))
 
     return (values, lengths, words, canonical, extractor)
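For a tiny invented word list, these f.write calls emit C definitions along these lines (abridged sketch):

    words = ["ch", "%q", "de "]
    wlencount = [2, 1]
    print("const mchar_t words[] = {{ {} }};".format(
        ", ".join(str(ord(c)) for w in words for c in w)))
    print("const uint8_t wlencount[] = {{ {} }};".format(
        ", ".join(str(p) for p in wlencount)))
    # const mchar_t words[] = { 99, 104, 37, 113, 100, 101, 32 };
    # const uint8_t wlencount[] = { 2, 1 };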
@@ -43,22 +43,34 @@ void serial_write_compressed(const compressed_string_t *compressed) {
     serial_write(decompressed);
 }
 
+STATIC void get_word(int n, const mchar_t **pos, const mchar_t **end) {
+    int len = minlen;
+    int i = 0;
+    *pos = words;
+    while (wlencount[i] <= n) {
+        n -= wlencount[i];
+        *pos += len * wlencount[i];
+        i++;
+        len++;
+    }
+    *pos += len * n;
+    *end = *pos + len;
+}
+
 STATIC int put_utf8(char *buf, int u) {
     if (u <= 0x7f) {
         *buf = u;
         return 1;
     } else if (word_start <= u && u <= word_end) {
         uint n = (u - word_start);
-        size_t pos = 0;
-        if (n > 0) {
-            pos = wends[n - 1] + (n * 2);
-        }
+        const mchar_t *pos, *end;
+        get_word(n, &pos, &end);
         int ret = 0;
         // note that at present, entries in the words table are
         // guaranteed not to represent words themselves, so this adds
         // at most 1 level of recursive call
-        for (; pos < wends[n] + (n + 1) * 2; pos++) {
-            int len = put_utf8(buf, words[pos]);
+        for (; pos < end; pos++) {
+            int len = put_utf8(buf, *pos);
             buf += len;
             ret += len;
         }
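A Python transliteration of get_word (test data invented) shows how the loop skips whole groups of equal-length words, including zero-count groups, to find the n'th word's span:

    def get_word(n, flat, wlencount, minlen):
        # Skip over whole groups of words of each length until reaching
        # the group containing word n, then index within that group.
        length = minlen
        pos = 0
        i = 0
        while wlencount[i] <= n:
            n -= wlencount[i]
            pos += length * wlencount[i]
            i += 1
            length += 1
        pos += length * n
        return flat[pos : pos + length]

    words = ["ch", "%q", "de ", " doit "]  # sorted shortest-to-longest
    flat = "".join(words)
    wlencount = [2, 1, 0, 0, 1]            # counts for lengths 2..6
    assert [get_word(n, flat, wlencount, 2) for n in range(4)] == words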
@@ -50,11 +50,14 @@
 // are computed with a heuristic based on frequent substrings of 2 to
 // 9 code points. These are called "words" but are not, grammatically
 // speaking, words. They're just spans of code points that frequently
-// occur together.
+// occur together. They are ordered shortest to longest.
 //
 // - dictionary entries are non-overlapping, and the _ending_ index of each
-//   entry is stored in an array. Since the index given is the ending
-//   index, the array is called "wends".
+//   entry is stored in an array. A count of words of each length, from
+//   minlen to maxlen, is given in the array called wlencount. From
+//   this small array, the start and end of the N'th word can be
+//   calculated by an efficient, small loop. (A bit of time is traded
+//   to reduce the size of this table indicating lengths)
 //
 // The "data" / "tail" construct is so that the struct's last member is a
 // "flexible array". However, the _only_ member is not permitted to be