From 68c28174d0e0ec3f6b1461aea3a0b6a1b84610bb Mon Sep 17 00:00:00 2001 From: tll <1040424979@qq.com> Date: Sat, 24 Jun 2017 08:38:32 +0800 Subject: [PATCH] py/objstr: Add check for valid UTF-8 when making a str from bytes. This patch adds a function utf8_check() to check for a valid UTF-8 encoded string, and calls it when constructing a str from raw bytes. The feature is selectable at compile time via MICROPY_PY_BUILTINS_STR_UNICODE_CHECK and is enabled if unicode is enabled. It costs about 110 bytes on Thumb-2, 150 bytes on Xtensa and 170 bytes on x86-64. --- py/mpconfig.h | 5 +++++ py/objstr.c | 10 ++++++++++ py/unicode.c | 28 ++++++++++++++++++++++++++++ py/unicode.h | 1 + tests/unicode/unicode.py | 14 ++++++++++++++ 5 files changed, 58 insertions(+) diff --git a/py/mpconfig.h b/py/mpconfig.h index dac8a903c9..38cf4b5605 100644 --- a/py/mpconfig.h +++ b/py/mpconfig.h @@ -691,6 +691,11 @@ typedef double mp_float_t; #define MICROPY_PY_BUILTINS_STR_UNICODE (0) #endif +// Whether to check for valid UTF-8 when converting bytes to str +#ifndef MICROPY_PY_BUILTINS_STR_UNICODE_CHECK +#define MICROPY_PY_BUILTINS_STR_UNICODE_CHECK (MICROPY_PY_BUILTINS_STR_UNICODE) +#endif + // Whether str.center() method provided #ifndef MICROPY_PY_BUILTINS_STR_CENTER #define MICROPY_PY_BUILTINS_STR_CENTER (0) diff --git a/py/objstr.c b/py/objstr.c index 4c287af040..f6214f80c9 100644 --- a/py/objstr.c +++ b/py/objstr.c @@ -161,6 +161,11 @@ mp_obj_t mp_obj_str_make_new(const mp_obj_type_t *type, size_t n_args, size_t n_ if (str_hash == 0) { str_hash = qstr_compute_hash(str_data, str_len); } + #if MICROPY_PY_BUILTINS_STR_UNICODE_CHECK + if (!utf8_check(str_data, str_len)) { + mp_raise_msg(&mp_type_UnicodeError, NULL); + } + #endif mp_obj_str_t *o = MP_OBJ_TO_PTR(mp_obj_new_str_of_type(type, NULL, str_len)); o->data = str_data; o->hash = str_hash; @@ -168,6 +173,11 @@ mp_obj_t mp_obj_str_make_new(const mp_obj_type_t *type, size_t n_args, size_t n_ } else { mp_buffer_info_t bufinfo; mp_get_buffer_raise(args[0], &bufinfo, MP_BUFFER_READ); + #if MICROPY_PY_BUILTINS_STR_UNICODE_CHECK + if (!utf8_check(bufinfo.buf, bufinfo.len)) { + mp_raise_msg(&mp_type_UnicodeError, NULL); + } + #endif return mp_obj_new_str(bufinfo.buf, bufinfo.len, false); } } diff --git a/py/unicode.c b/py/unicode.c index eddb007d5e..140b7ba712 100644 --- a/py/unicode.c +++ b/py/unicode.c @@ -182,3 +182,31 @@ mp_uint_t unichar_xdigit_value(unichar c) { } return n; } + +bool utf8_check(const byte *p, size_t len) { + uint8_t need = 0; + const byte *end = p + len; + for (; p < end; p++) { + byte c = *p; + if (need) { + if (c >= 0x80) { + need--; + } else { + // mismatch + return 0; + } + } else { + if (c >= 0xc0) { + if (c >= 0xf8) { + // mismatch + return 0; + } + need = (0xe5 >> ((c >> 3) & 0x6)) & 3; + } else if (c >= 0x80) { + // mismatch + return 0; + } + } + } + return need == 0; // no pending fragments allowed +} diff --git a/py/unicode.h b/py/unicode.h index 19487a65ae..c1fb517894 100644 --- a/py/unicode.h +++ b/py/unicode.h @@ -30,5 +30,6 @@ #include "py/misc.h" mp_uint_t utf8_ptr_to_index(const byte *s, const byte *ptr); +bool utf8_check(const byte *p, size_t len); #endif // MICROPY_INCLUDED_PY_UNICODE_H diff --git a/tests/unicode/unicode.py b/tests/unicode/unicode.py index 5f29bc1c95..3a35ce8948 100644 --- a/tests/unicode/unicode.py +++ b/tests/unicode/unicode.py @@ -33,3 +33,17 @@ try: int('\u0200') except ValueError: print('ValueError') + +# test invalid UTF-8 string +try: + str(b'ab\xa1', 'utf8') +except UnicodeError: + print('UnicodeError') +try: + str(b'ab\xf8', 'utf8') +except UnicodeError: + print('UnicodeError') +try: + str(bytearray(b'ab\xc0a'), 'utf8') +except UnicodeError: + print('UnicodeError')