Compress word offset table

By storing "count of words by length", the long `wends` table can be
replaced with a short `wlencount` table.  This saves flash storage space.

Extend the range of string lengths that can be in the dictionary.
Originally it was 2 to 9; at one point it was changed to 3 to 9.
Putting the lower bound back at 2 has a positive impact on the French
translation (a number of 2-character entries, such as "ch", "\r\n",
and "%q", are used).  Increasing the maximum length picks up
'mpossible', ' doit être ', and 'CircuitPyth' at the long end.  This
adds a bit of processing time to makeqstrdata.  The specific 2/11
values are again empirical, based on the French translation on the
adafruit_proxlight_trinkey_m0.
Author: Jeff Epler
Date:   2021-08-07 09:17:41 -05:00
parent  063e3946d6
commit  d59a28db97
3 changed files with 43 additions and 34 deletions
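To make the flash saving concrete, here is a rough back-of-the-envelope sketch in Python (the word count is an assumption chosen to resemble a typical build, not a measurement from this commit):

    # Old scheme: `wends` stores one cumulative end offset per dictionary word.
    # New scheme: `wlencount` stores one count per distinct word length.
    n_words = 200                            # assumed dictionary size
    minlen, maxlen = 2, 11                   # length range from this commit

    wends_entries = n_words                  # one uint8_t per word
    wlencount_entries = maxlen - minlen + 1  # one uint8_t per length

    print(wends_entries, wlencount_entries)  # 200 vs 10: ~190 bytes saved

The loop that recovers a word's position from these counts appears in the C changes below.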

py/makeqstrdata.py

@@ -333,12 +333,9 @@ def compute_huffman_coding(translations, compression_filename):
     bits_per_codepoint = 16 if max_ord > 255 else 8
     values_type = "uint16_t" if max_ord > 255 else "uint8_t"
     max_words_len = 160 if max_ord > 255 else 255
-    sum_len = 0
-    while True:
+    while len(words) < max_words:
         # Until the dictionary is filled to capacity, use a heuristic to find
-        # the best "word" (3- to 9-gram) to add to it.
+        # the best "word" (2- to 11-gram) to add to it.
         #
         # The TextSplitter allows us to avoid considering parts of the text
         # that are already covered by a previously chosen word, for example
@@ -369,7 +366,8 @@ def compute_huffman_coding(translations, compression_filename):
         # the Huffman tree bumps up the encoding lengths of all words in the
         # same subtree. In the extreme case when the new word is so frequent
         # that it gets a one-bit encoding, all other words will cost an extra
-        # bit each.
+        # bit each. This is empirically modeled by the constant factor added to
+        # cost, but the specific value used isn't "proven" to be correct.
         #
         # Another source of inaccuracy is that compressed strings end up
         # on byte boundaries, not bit boundaries, so saving 1 bit somewhere
@@ -383,14 +381,14 @@ def compute_huffman_coding(translations, compression_filename):
         # The difference between the two is the estimated net savings, in bits.
         def est_net_savings(s, occ):
             savings = occ * (bit_length(s) - est_len(occ))
-            cost = len(s) * bits_per_codepoint
+            cost = len(s) * bits_per_codepoint + 24
             return savings - cost

         counter = collections.Counter()
         for t in texts:
             for (found, word) in extractor.iter_words(t):
                 if not found:
-                    for substr in iter_substrings(word, minlen=3, maxlen=9):
+                    for substr in iter_substrings(word, minlen=2, maxlen=11):
                         counter[substr] += 1

         # Score the candidates we found. This is a semi-empirical formula that
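To see what the added constant does, here is a toy evaluation (bit_length and est_len below are stand-in stubs, not the real estimators in makeqstrdata):

    BITS_PER_CODEPOINT = 8                # assumed: 8-bit build

    def bit_length(s):                    # stub: ~6 bits per codepoint today
        return 6 * len(s)

    def est_len(occ):                     # stub: fixed estimated word-code length
        return 8

    def est_net_savings(s, occ):
        savings = occ * (bit_length(s) - est_len(occ))
        # +24 charges every candidate a flat overhead for the dictionary
        # entry it would occupy, filtering out rarely-used short words
        cost = len(s) * BITS_PER_CODEPOINT + 24
        return savings - cost

    # A 2-gram seen 5 times: savings = 5 * (12 - 8) = 20 bits.
    # Old cost = 16, net +4 (kept); new cost = 16 + 24 = 40, net -20 (rejected).
    print(est_net_savings("ch", 5))       # -20

So rare short candidates, which barely pay for their own table entry, no longer make the cut.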
@@ -410,16 +408,9 @@ def compute_huffman_coding(translations, compression_filename):
             break
         word = scores[0][0]
-        # If we can successfully add it to the dictionary, do so. Otherwise,
-        # we've filled the dictionary to capacity and are done.
-        if sum_len + len(word) - 2 > max_words_len:
-            break
-        if len(words) == max_words:
-            break
         words.append(word)
-        sum_len += len(word) - 2
+        words.sort(key=len)
         extractor = TextSplitter(words)

     counter = collections.Counter()
     for t in texts:
@@ -469,16 +460,15 @@ def compute_huffman_coding(translations, compression_filename):
         len(translation.encode("utf-8")) for (original, translation) in translations
     )

-    wends = list(len(w) - 2 for w in words)
-    for i in range(1, len(wends)):
-        wends[i] += wends[i - 1]
+    maxlen = len(words[-1])
+    minlen = len(words[0])
+    wlencount = [len([None for w in words if len(w) == l]) for l in range(minlen, maxlen + 1)]

     with open(compression_filename, "w") as f:
+        f.write("typedef {} mchar_t;".format(values_type))
         f.write("const uint8_t lengths[] = {{ {} }};\n".format(", ".join(map(str, lengths))))
         f.write(
-            "const {} values[] = {{ {} }};\n".format(
-                values_type, ", ".join(str(ord(u)) for u in values)
-            )
+            "const mchar_t values[] = {{ {} }};\n".format(", ".join(str(ord(u)) for u in values))
         )
         f.write(
             "#define compress_max_length_bits ({})\n".format(
@@ -486,13 +476,17 @@ def compute_huffman_coding(translations, compression_filename):
             )
         )
         f.write(
-            "const {} words[] = {{ {} }};\n".format(
-                values_type, ", ".join(str(ord(c)) for w in words for c in w)
+            "const mchar_t words[] = {{ {} }};\n".format(
+                ", ".join(str(ord(c)) for w in words for c in w)
             )
         )
-        f.write("const uint8_t wends[] = {{ {} }};\n".format(", ".join(str(p) for p in wends)))
+        f.write(
+            "const uint8_t wlencount[] = {{ {} }};\n".format(", ".join(str(p) for p in wlencount))
+        )
         f.write("#define word_start {}\n".format(word_start))
         f.write("#define word_end {}\n".format(word_end))
+        f.write("#define minlen {}\n".format(minlen))
+        f.write("#define maxlen {}\n".format(maxlen))
     return (values, lengths, words, canonical, extractor)
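Run against a toy word list (illustrative values only, not real build output), the new bookkeeping produces:

    words = sorted(["ch", "%q", "est", "tion", " doit être ", "CircuitPyth"], key=len)

    maxlen = len(words[-1])   # 11
    minlen = len(words[0])    # 2
    wlencount = [len([None for w in words if len(w) == l]) for l in range(minlen, maxlen + 1)]

    print(wlencount)          # [2, 1, 1, 0, 0, 0, 0, 0, 0, 2]

Six words would need six `wends` entries, and the counts table already covers every possible length in ten; with hundreds of words the counts table stays exactly this size.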

supervisor/shared/translate.c

@@ -43,22 +43,34 @@ void serial_write_compressed(const compressed_string_t *compressed) {
     serial_write(decompressed);
 }

+STATIC void get_word(int n, const mchar_t **pos, const mchar_t **end) {
+    int len = minlen;
+    int i = 0;
+    *pos = words;
+    while (wlencount[i] <= n) {
+        n -= wlencount[i];
+        *pos += len * wlencount[i];
+        i++;
+        len++;
+    }
+    *pos += len * n;
+    *end = *pos + len;
+}
+
 STATIC int put_utf8(char *buf, int u) {
     if (u <= 0x7f) {
         *buf = u;
         return 1;
     } else if (word_start <= u && u <= word_end) {
         uint n = (u - word_start);
-        size_t pos = 0;
-        if (n > 0) {
-            pos = wends[n - 1] + (n * 2);
-        }
+        const mchar_t *pos, *end;
+        get_word(n, &pos, &end);
         int ret = 0;
         // note that at present, entries in the words table are
         // guaranteed not to represent words themselves, so this adds
         // at most 1 level of recursive call
-        for (; pos < wends[n] + (n + 1) * 2; pos++) {
-            int len = put_utf8(buf, words[pos]);
+        for (; pos < end; pos++) {
+            int len = put_utf8(buf, *pos);
             buf += len;
             ret += len;
         }
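The same walk, transcribed to Python for exposition (a sketch, not code from the commit), with a check that it recovers each word from the flattened table:

    def get_word(n, flat, wlencount, minlen):
        # Return the n'th dictionary word from the flattened character table.
        length = minlen
        pos = 0
        for count in wlencount:
            if n < count:              # the n'th word is in this length group
                break
            n -= count                 # otherwise skip the whole group...
            pos += length * count      # ...and the characters it occupies
            length += 1
        pos += length * n              # step over same-length predecessors
        return flat[pos:pos + length]

    words = ["ch", "%q", "est", "tion"]    # sorted shortest to longest
    flat = "".join(words)                  # how makeqstrdata flattens them
    wlencount = [2, 1, 1]                  # counts for lengths 2, 3, 4

    assert all(get_word(n, flat, wlencount, 2) == w for n, w in enumerate(words))

Note that the old code needed the `+ (n * 2)` and `(n + 1) * 2` fudge terms because `wends` stored each length minus two; the counts-based walk needs no such adjustment.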

supervisor/shared/translate.h

@@ -50,11 +50,14 @@
 // are computed with a heuristic based on frequent substrings of 2 to
 // 9 code points. These are called "words" but are not, grammatically
 // speaking, words. They're just spans of code points that frequently
-// occur together.
+// occur together. They are ordered shortest to longest.
 //
 // - dictionary entries are non-overlapping, and the _ending_ index of each
-//   entry is stored in an array. Since the index given is the ending
-//   index, the array is called "wends".
+//   entry is stored in an array. A count of words of each length, from
+//   minlen to maxlen, is given in the array called wlencount. From
+//   this small array, the start and end of the N'th word can be
+//   calculated by an efficient, small loop. (A bit of time is traded
+//   to reduce the size of this table indicating lengths)
 //
 // The "data" / "tail" construct is so that the struct's last member is a
 // "flexible array". However, the _only_ member is not permitted to be