140 lines
4.7 KiB
C
Raw Normal View History

/*
* This file is part of the MicroPython project, http://micropython.org/
*
* The MIT License (MIT)
*
2018-07-31 16:53:54 -07:00
* Copyright (c) 2018 Scott Shawcroft for Adafruit Industries
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
2018-07-31 16:53:54 -07:00
#include "supervisor/shared/translate.h"
#include <stdbool.h>
#include <stdint.h>
2018-07-31 16:53:54 -07:00
#include <string.h>
#ifndef NO_QSTR
#include "genhdr/compression.generated.h"
#endif
add bigram compression to makeqstrdata Compress common unicode bigrams by making code points in the range 0x80 - 0xbf (inclusive) represent them. Then, they can be greedily encoded and the substituted code points handled by the existing Huffman compression. Normally code points in the range 0x80-0xbf are not used in Unicode, so we stake our own claim. Using the more arguably correct "Private Use Area" (PUA) would mean that for scripts that only use code points under 256 we would use more memory for the "values" table. bigram means "two letters", and is also sometimes called a "digram". It's nothing to do with "big RAM". For our purposes, a bigram represents two successive unicode code points, so for instance in our build on trinket m0 for english the most frequent are: ['t ', 'e ', 'in', 'd ', ...]. The bigrams are selected based on frequency in the corpus, but the selection is not necessarily optimal, for these reasons I can think of: * Suppose the corpus was just "tea" repeated 100 times. The top bigrams would be "te", and "ea". However, overlap, "te" could never be used. Thus, some bigrams might actually waste space * I _assume_ this has to be why e.g., bigram 0x86 "s " is more frequent than bigram 0x85 " a" in English for Trinket M0, because sequences like "can't add" would get the "t " digram and then be unable to use the " a" digram. * And generally, if a bigram is frequent then so are its constituents. Say that "i" and "n" both encode to just 5 or 6 bits, then the huffman code for "in" had better compress to 10 or fewer bits or it's a net loss! * I checked though! "i" is 5 bits, "n" is 6 bits (lucky guess) but the bigram 0x83 also just 6 bits, so this one is a win of 5 bits for every "it" minus overhead. Yay, this round goes to team compression. * On the other hand, the least frequent bigram 0x9d " n" is 10 bits long and its constituent code points are 4+6 bits so there's no savings, but there is the cost of the table entry. * and somehow 0x9f 'an' is never used at all! With or without accounting for overlaps, there is some optimum number of bigrams. Adding one more bigram uses at least 2 bytes (for the entry in the bigram table; 4 bytes if code points >255 are in the source text) and also needs a slot in the Huffman dictionary, so adding bigrams beyond the optimim number makes compression worse again. If it's an improvement, the fact that it's not guaranteed optimal doesn't seem to matter too much. It just leaves a little more fruit for the next sweep to pick up. Perhaps try adding the most frequent bigram not yet present, until it doesn't improve compression overall. Right now, de_DE is again the "fullest" build on trinket_m0. (It's reclaimed that spot from the ja translation somehow) This change saves 104 bytes there, increasing free space about 6.8%. In the larger (but not critically full) pyportal build it saves 324 bytes. The specific number of bigrams used (32) was chosen as it is the max number that fit within the 0x80..0xbf range. Larger tables would require the use of 16 bit code points in the de_DE build, losing savings overall. (Side note: The most frequent letters in English have been said to be: ETA OIN SHRDLU; but we have UAC EIL MOPRST in our corpus)
2020-09-01 17:12:22 -05:00
#include "py/misc.h"
#include "supervisor/serial.h"
void serial_write_compressed(const compressed_string_t* compressed) {
char decompressed[decompress_length(compressed)];
decompress(compressed, decompressed);
serial_write(decompressed);
}
STATIC int put_utf8(char *buf, int u) {
if(u <= 0x7f) {
*buf = u;
return 1;
} else if(word_start <= u && u <= word_end) {
uint n = (u - word_start);
size_t pos = 0;
if (n > 0) {
pos = wends[n - 1] + (n * 2);
}
int ret = 0;
// note that at present, entries in the words table are
// guaranteed not to represent words themselves, so this adds
// at most 1 level of recursive call
for(; pos < wends[n] + (n + 1) * 2; pos++) {
int len = put_utf8(buf, words[pos]);
buf += len;
ret += len;
}
return ret;
} else if(u <= 0x07ff) {
*buf++ = 0b11000000 | (u >> 6);
*buf = 0b10000000 | (u & 0b00111111);
return 2;
} else { // u <= 0xffff
*buf++ = 0b11100000 | (u >> 12);
*buf++ = 0b10000000 | ((u >> 6) & 0b00111111);
*buf = 0b10000000 | (u & 0b00111111);
return 3;
}
}
uint16_t decompress_length(const compressed_string_t* compressed) {
if (compress_max_length_bits <= 8) {
return 1 + (compressed->data >> (8 - compress_max_length_bits));
} else {
return 1 + ((compressed->data * 256 + compressed->tail[0]) >> (16 - compress_max_length_bits));
}
}
char* decompress(const compressed_string_t* compressed, char* decompressed) {
uint8_t this_byte = compress_max_length_bits / 8;
uint8_t this_bit = 7 - compress_max_length_bits % 8;
uint8_t b = (&compressed->data)[this_byte];
uint16_t length = decompress_length(compressed);
// Stop one early because the last byte is always NULL.
for (uint16_t i = 0; i < length - 1;) {
uint32_t bits = 0;
uint8_t bit_length = 0;
uint32_t max_code = lengths[0];
uint32_t searched_length = lengths[0];
while (true) {
bits <<= 1;
if ((0x80 & b) != 0) {
bits |= 1;
}
b <<= 1;
bit_length += 1;
if (this_bit == 0) {
this_bit = 7;
this_byte += 1;
b = (&compressed->data)[this_byte]; // This may read past the end but its never used.
} else {
this_bit -= 1;
}
if (max_code > 0 && bits < max_code) {
break;
}
max_code = (max_code << 1) + lengths[bit_length];
searched_length += lengths[bit_length];
}
i += put_utf8(decompressed + i, values[searched_length + bits - max_code]);
}
decompressed[length-1] = '\0';
return decompressed;
}
2021-01-08 22:16:58 -05:00
inline
// gcc10 -flto has issues with this being always_inline for debug builds.
#if CIRCUITPY_DEBUG < 1
__attribute__((always_inline))
#endif
const compressed_string_t* translate(const char* original) {
2018-07-31 16:53:54 -07:00
#ifndef NO_QSTR
#define QDEF(id, str)
#define TRANSLATION(id, firstbyte, ...) if (strcmp(original, id) == 0) { static const compressed_string_t v = { .data = firstbyte, .tail = { __VA_ARGS__ } }; return &v; } else
2018-07-31 16:53:54 -07:00
#include "genhdr/qstrdefs.generated.h"
#undef TRANSLATION
#undef QDEF
#endif
return NULL;
2018-07-31 16:53:54 -07:00
}