Jeff Epler 0eee93729a Fix decompression of unicode values above 2047
Two problems: The lead byte for 3-byte sequences was wrong, and one
mid-byte was not even filled in due to a missing "++"!

Apparently this was broken ever since the first "Compress as unicode,
not bytes" commit, but I believed I'd "tested" it by running on the
Pinyin translation.

This rendered at least the Korean and Japanese translations completely
illegible, affecting 5.0 and all later releases.
2020-09-08 20:54:47 -05:00

126 lines
4.5 KiB
C

/*
* This file is part of the MicroPython project, http://micropython.org/
*
* The MIT License (MIT)
*
* Copyright (c) 2018 Scott Shawcroft for Adafruit Industries
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#include "supervisor/shared/translate.h"
#include <stdbool.h>
#include <stdint.h>
#include <string.h>
#ifndef NO_QSTR
#include "genhdr/compression.generated.h"
#endif
#include "py/misc.h"
#include "supervisor/serial.h"
void serial_write_compressed(const compressed_string_t* compressed) {
char decompressed[decompress_length(compressed)];
decompress(compressed, decompressed);
serial_write(decompressed);
}
STATIC int put_utf8(char *buf, int u) {
if(u <= 0x7f) {
*buf = u;
return 1;
} else if(bigram_start <= u && u <= bigram_end) {
int n = (u - 0x80) * 2;
// (note that at present, entries in the bigrams table are
// guaranteed not to represent bigrams themselves, so this adds
// at most 1 level of recursive call
int ret = put_utf8(buf, bigrams[n]);
return ret + put_utf8(buf + ret, bigrams[n+1]);
} else if(u <= 0x07ff) {
*buf++ = 0b11000000 | (u >> 6);
*buf = 0b10000000 | (u & 0b00111111);
return 2;
} else { // u <= 0xffff
*buf++ = 0b11100000 | (u >> 12);
*buf++ = 0b10000000 | ((u >> 6) & 0b00111111);
*buf = 0b10000000 | (u & 0b00111111);
return 3;
}
}
uint16_t decompress_length(const compressed_string_t* compressed) {
if (compress_max_length_bits <= 8) {
return 1 + (compressed->data >> (8 - compress_max_length_bits));
} else {
return 1 + ((compressed->data * 256 + compressed->tail[0]) >> (16 - compress_max_length_bits));
}
}
char* decompress(const compressed_string_t* compressed, char* decompressed) {
uint8_t this_byte = compress_max_length_bits / 8;
uint8_t this_bit = 7 - compress_max_length_bits % 8;
uint8_t b = (&compressed->data)[this_byte];
uint16_t length = decompress_length(compressed);
// Stop one early because the last byte is always NULL.
for (uint16_t i = 0; i < length - 1;) {
uint32_t bits = 0;
uint8_t bit_length = 0;
uint32_t max_code = lengths[0];
uint32_t searched_length = lengths[0];
while (true) {
bits <<= 1;
if ((0x80 & b) != 0) {
bits |= 1;
}
b <<= 1;
bit_length += 1;
if (this_bit == 0) {
this_bit = 7;
this_byte += 1;
b = (&compressed->data)[this_byte]; // This may read past the end but its never used.
} else {
this_bit -= 1;
}
if (max_code > 0 && bits < max_code) {
break;
}
max_code = (max_code << 1) + lengths[bit_length];
searched_length += lengths[bit_length];
}
i += put_utf8(decompressed + i, values[searched_length + bits - max_code]);
}
decompressed[length-1] = '\0';
return decompressed;
}
inline __attribute__((always_inline)) const compressed_string_t* translate(const char* original) {
#ifndef NO_QSTR
#define QDEF(id, str)
#define TRANSLATION(id, firstbyte, ...) if (strcmp(original, id) == 0) { static const compressed_string_t v = { .data = firstbyte, .tail = { __VA_ARGS__ } }; return &v; } else
#include "genhdr/qstrdefs.generated.h"
#undef TRANSLATION
#undef QDEF
#endif
return NULL;
}