Compress word offset table

By storing "count of words by length", the long `wends` table can be
replaced with a short `wlencount` table.  This saves flash storage space.

Extend the range of string lengths that can be in the dictionary.
Originally it was 2 to 9; at one point it was changed to 3 to 9.
Putting the lower bound back at 2 has a positive impact on the French
translation (a number of 2-character entries, such as "ch", "\r\n",
and "%q", are used).  Increasing the maximum length picks up
'mpossible', ' doit être ', and 'CircuitPyth' at the long end.  This
adds a bit of processing time to makeqstrdata.  The specific 2/11
values are again empirical, based on the French translation on the
adafruit_proxlight_trinkey_m0.
Author: Jeff Epler
Date:   2021-08-07 09:17:41 -05:00
parent  063e3946d6
commit  d59a28db97
3 changed files with 43 additions and 34 deletions
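To make the flash saving concrete, here is a rough back-of-the-envelope sketch in Python (the word count is an assumption chosen to resemble a typical build, not a measurement from this commit):

    # Old scheme: `wends` stores one cumulative end offset per dictionary word.
    # New scheme: `wlencount` stores one count per distinct word length.
    n_words = 200                            # assumed dictionary size
    minlen, maxlen = 2, 11                   # length range from this commit

    wends_entries = n_words                  # one uint8_t per word
    wlencount_entries = maxlen - minlen + 1  # one uint8_t per length

    print(wends_entries, wlencount_entries)  # 200 vs 10: ~190 bytes saved

The loop that recovers a word's position from these counts appears in the C changes below.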

py/makeqstrdata.py

@@ -333,12 +333,9 @@ def compute_huffman_coding(translations, compression_filename):
     bits_per_codepoint = 16 if max_ord > 255 else 8
     values_type = "uint16_t" if max_ord > 255 else "uint8_t"
     max_words_len = 160 if max_ord > 255 else 255
-    sum_len = 0
-    while True:
+    while len(words) < max_words:
         # Until the dictionary is filled to capacity, use a heuristic to find
-        # the best "word" (3- to 9-gram) to add to it.
+        # the best "word" (2- to 11-gram) to add to it.
         #
         # The TextSplitter allows us to avoid considering parts of the text
         # that are already covered by a previously chosen word, for example
@@ -369,7 +366,8 @@ def compute_huffman_coding(translations, compression_filename):
         # the Huffman tree bumps up the encoding lengths of all words in the
         # same subtree. In the extreme case when the new word is so frequent
         # that it gets a one-bit encoding, all other words will cost an extra
-        # bit each.
+        # bit each. This is empirically modeled by the constant factor added to
+        # cost, but the specific value used isn't "proven" to be correct.
         #
         # Another source of inaccuracy is that compressed strings end up
         # on byte boundaries, not bit boundaries, so saving 1 bit somewhere
@@ -383,14 +381,14 @@ def compute_huffman_coding(translations, compression_filename):
         # The difference between the two is the estimated net savings, in bits.
         def est_net_savings(s, occ):
             savings = occ * (bit_length(s) - est_len(occ))
-            cost = len(s) * bits_per_codepoint
+            cost = len(s) * bits_per_codepoint + 24
             return savings - cost

         counter = collections.Counter()
         for t in texts:
             for (found, word) in extractor.iter_words(t):
                 if not found:
-                    for substr in iter_substrings(word, minlen=3, maxlen=9):
+                    for substr in iter_substrings(word, minlen=2, maxlen=11):
                         counter[substr] += 1

         # Score the candidates we found. This is a semi-empirical formula that
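To see what the added constant does, here is a toy evaluation (bit_length and est_len below are stand-in stubs, not the real estimators in makeqstrdata):

    BITS_PER_CODEPOINT = 8                # assumed: 8-bit build

    def bit_length(s):                    # stub: ~6 bits per codepoint today
        return 6 * len(s)

    def est_len(occ):                     # stub: fixed estimated word-code length
        return 8

    def est_net_savings(s, occ):
        savings = occ * (bit_length(s) - est_len(occ))
        # +24 charges every candidate a flat overhead for the dictionary
        # entry it would occupy, filtering out rarely-used short words
        cost = len(s) * BITS_PER_CODEPOINT + 24
        return savings - cost

    # A 2-gram seen 5 times: savings = 5 * (12 - 8) = 20 bits.
    # Old cost = 16, net +4 (kept); new cost = 16 + 24 = 40, net -20 (rejected).
    print(est_net_savings("ch", 5))       # -20

So rare short candidates, which barely pay for their own table entry, no longer make the cut.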
@@ -410,16 +408,9 @@ def compute_huffman_coding(translations, compression_filename):
             break
         word = scores[0][0]
-        # If we can successfully add it to the dictionary, do so. Otherwise,
-        # we've filled the dictionary to capacity and are done.
-        if sum_len + len(word) - 2 > max_words_len:
-            break
-        if len(words) == max_words:
-            break
         words.append(word)
-        sum_len += len(word) - 2
+        words.sort(key=len)
         extractor = TextSplitter(words)

     counter = collections.Counter()
     for t in texts:
@@ -469,16 +460,15 @@ def compute_huffman_coding(translations, compression_filename):
         len(translation.encode("utf-8")) for (original, translation) in translations
     )

-    wends = list(len(w) - 2 for w in words)
-    for i in range(1, len(wends)):
-        wends[i] += wends[i - 1]
+    maxlen = len(words[-1])
+    minlen = len(words[0])
+    wlencount = [len([None for w in words if len(w) == l]) for l in range(minlen, maxlen + 1)]

     with open(compression_filename, "w") as f:
+        f.write("typedef {} mchar_t;".format(values_type))
         f.write("const uint8_t lengths[] = {{ {} }};\n".format(", ".join(map(str, lengths))))
         f.write(
-            "const {} values[] = {{ {} }};\n".format(
-                values_type, ", ".join(str(ord(u)) for u in values)
-            )
+            "const mchar_t values[] = {{ {} }};\n".format(", ".join(str(ord(u)) for u in values))
         )
         f.write(
             "#define compress_max_length_bits ({})\n".format(
@@ -486,13 +476,17 @@ def compute_huffman_coding(translations, compression_filename):
             )
         )
         f.write(
-            "const {} words[] = {{ {} }};\n".format(
-                values_type, ", ".join(str(ord(c)) for w in words for c in w)
+            "const mchar_t words[] = {{ {} }};\n".format(
+                ", ".join(str(ord(c)) for w in words for c in w)
             )
         )
-        f.write("const uint8_t wends[] = {{ {} }};\n".format(", ".join(str(p) for p in wends)))
+        f.write(
+            "const uint8_t wlencount[] = {{ {} }};\n".format(", ".join(str(p) for p in wlencount))
+        )
         f.write("#define word_start {}\n".format(word_start))
         f.write("#define word_end {}\n".format(word_end))
+        f.write("#define minlen {}\n".format(minlen))
+        f.write("#define maxlen {}\n".format(maxlen))
     return (values, lengths, words, canonical, extractor)
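Run against a toy word list (illustrative values only, not real build output), the new bookkeeping produces:

    words = sorted(["ch", "%q", "est", "tion", " doit être ", "CircuitPyth"], key=len)

    maxlen = len(words[-1])   # 11
    minlen = len(words[0])    # 2
    wlencount = [len([None for w in words if len(w) == l]) for l in range(minlen, maxlen + 1)]

    print(wlencount)          # [2, 1, 1, 0, 0, 0, 0, 0, 0, 2]

Six words would need six `wends` entries, and the counts table already covers every possible length in ten; with hundreds of words the counts table stays exactly this size.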

supervisor/shared/translate.c

@@ -43,22 +43,34 @@ void serial_write_compressed(const compressed_string_t *compressed) {
     serial_write(decompressed);
 }

+STATIC void get_word(int n, const mchar_t **pos, const mchar_t **end) {
+    int len = minlen;
+    int i = 0;
+    *pos = words;
+    while (wlencount[i] <= n) {
+        n -= wlencount[i];
+        *pos += len * wlencount[i];
+        i++;
+        len++;
+    }
+    *pos += len * n;
+    *end = *pos + len;
+}
+
 STATIC int put_utf8(char *buf, int u) {
     if (u <= 0x7f) {
         *buf = u;
         return 1;
     } else if (word_start <= u && u <= word_end) {
         uint n = (u - word_start);
-        size_t pos = 0;
-        if (n > 0) {
-            pos = wends[n - 1] + (n * 2);
-        }
+        const mchar_t *pos, *end;
+        get_word(n, &pos, &end);
         int ret = 0;
         // note that at present, entries in the words table are
         // guaranteed not to represent words themselves, so this adds
         // at most 1 level of recursive call
-        for (; pos < wends[n] + (n + 1) * 2; pos++) {
-            int len = put_utf8(buf, words[pos]);
+        for (; pos < end; pos++) {
+            int len = put_utf8(buf, *pos);
             buf += len;
             ret += len;
         }
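The same walk, transcribed to Python for exposition (a sketch, not code from the commit), with a check that it recovers each word from the flattened table:

    def get_word(n, flat, wlencount, minlen):
        # Return the n'th dictionary word from the flattened character table.
        length = minlen
        pos = 0
        for count in wlencount:
            if n < count:              # the n'th word is in this length group
                break
            n -= count                 # otherwise skip the whole group...
            pos += length * count      # ...and the characters it occupies
            length += 1
        pos += length * n              # step over same-length predecessors
        return flat[pos:pos + length]

    words = ["ch", "%q", "est", "tion"]    # sorted shortest to longest
    flat = "".join(words)                  # how makeqstrdata flattens them
    wlencount = [2, 1, 1]                  # counts for lengths 2, 3, 4

    assert all(get_word(n, flat, wlencount, 2) == w for n, w in enumerate(words))

Note that the old code needed the `+ (n * 2)` and `(n + 1) * 2` fudge terms because `wends` stored each length minus two; the counts-based walk needs no such adjustment.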

supervisor/shared/translate.h

@@ -50,11 +50,14 @@
 // are computed with a heuristic based on frequent substrings of 2 to
 // 9 code points. These are called "words" but are not, grammatically
 // speaking, words. They're just spans of code points that frequently
-// occur together.
+// occur together. They are ordered shortest to longest.
 //
 // - dictionary entries are non-overlapping, and the _ending_ index of each
-//   entry is stored in an array. Since the index given is the ending
-//   index, the array is called "wends".
+//   entry is stored in an array. A count of words of each length, from
+//   minlen to maxlen, is given in the array called wlencount. From
+//   this small array, the start and end of the N'th word can be
+//   calculated by an efficient, small loop. (A bit of time is traded
+//   to reduce the size of this table indicating lengths)
 //
 // The "data" / "tail" construct is so that the struct's last member is a
 // "flexible array". However, the _only_ member is not permitted to be