py/objstr: Add check for valid UTF-8 when making a str from bytes.
This patch adds a function utf8_check() to check for a valid UTF-8 encoded string, and calls it when constructing a str from raw bytes. The feature is selectable at compile time via MICROPY_PY_BUILTINS_STR_UNICODE_CHECK, which defaults to the value of MICROPY_PY_BUILTINS_STR_UNICODE (i.e. the check is enabled by default whenever unicode support is enabled). It costs about 110 bytes on Thumb-2, 150 bytes on Xtensa and 170 bytes on x86-64.
This commit is contained in:
parent
069fc48bf6
commit
68c28174d0
|
@ -691,6 +691,11 @@ typedef double mp_float_t;
|
||||||
#define MICROPY_PY_BUILTINS_STR_UNICODE (0)
|
#define MICROPY_PY_BUILTINS_STR_UNICODE (0)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
// Whether to check for valid UTF-8 when converting bytes to str
|
||||||
|
#ifndef MICROPY_PY_BUILTINS_STR_UNICODE_CHECK
|
||||||
|
#define MICROPY_PY_BUILTINS_STR_UNICODE_CHECK (MICROPY_PY_BUILTINS_STR_UNICODE)
|
||||||
|
#endif
|
||||||
|
|
||||||
// Whether str.center() method provided
|
// Whether str.center() method provided
|
||||||
#ifndef MICROPY_PY_BUILTINS_STR_CENTER
|
#ifndef MICROPY_PY_BUILTINS_STR_CENTER
|
||||||
#define MICROPY_PY_BUILTINS_STR_CENTER (0)
|
#define MICROPY_PY_BUILTINS_STR_CENTER (0)
|
||||||
|
|
10
py/objstr.c
10
py/objstr.c
|
@ -161,6 +161,11 @@ mp_obj_t mp_obj_str_make_new(const mp_obj_type_t *type, size_t n_args, size_t n_
|
||||||
if (str_hash == 0) {
|
if (str_hash == 0) {
|
||||||
str_hash = qstr_compute_hash(str_data, str_len);
|
str_hash = qstr_compute_hash(str_data, str_len);
|
||||||
}
|
}
|
||||||
|
#if MICROPY_PY_BUILTINS_STR_UNICODE_CHECK
|
||||||
|
if (!utf8_check(str_data, str_len)) {
|
||||||
|
mp_raise_msg(&mp_type_UnicodeError, NULL);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
mp_obj_str_t *o = MP_OBJ_TO_PTR(mp_obj_new_str_of_type(type, NULL, str_len));
|
mp_obj_str_t *o = MP_OBJ_TO_PTR(mp_obj_new_str_of_type(type, NULL, str_len));
|
||||||
o->data = str_data;
|
o->data = str_data;
|
||||||
o->hash = str_hash;
|
o->hash = str_hash;
|
||||||
|
@ -168,6 +173,11 @@ mp_obj_t mp_obj_str_make_new(const mp_obj_type_t *type, size_t n_args, size_t n_
|
||||||
} else {
|
} else {
|
||||||
mp_buffer_info_t bufinfo;
|
mp_buffer_info_t bufinfo;
|
||||||
mp_get_buffer_raise(args[0], &bufinfo, MP_BUFFER_READ);
|
mp_get_buffer_raise(args[0], &bufinfo, MP_BUFFER_READ);
|
||||||
|
#if MICROPY_PY_BUILTINS_STR_UNICODE_CHECK
|
||||||
|
if (!utf8_check(bufinfo.buf, bufinfo.len)) {
|
||||||
|
mp_raise_msg(&mp_type_UnicodeError, NULL);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
return mp_obj_new_str(bufinfo.buf, bufinfo.len, false);
|
return mp_obj_new_str(bufinfo.buf, bufinfo.len, false);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
28
py/unicode.c
28
py/unicode.c
|
@ -182,3 +182,31 @@ mp_uint_t unichar_xdigit_value(unichar c) {
|
||||||
}
|
}
|
||||||
return n;
|
return n;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Check that the len bytes starting at p form a structurally valid UTF-8
// sequence.
//
// Returns true when every byte is either ASCII (0x00-0x7f) or part of a
// complete multi-byte sequence: a lead byte in the range 0xc0-0xf7 followed
// by exactly the number of continuation bytes (10xxxxxx) that the lead byte
// announces.  Returns false on a stray continuation byte, a lead byte
// >= 0xf8, a missing/invalid continuation byte, or a sequence truncated by
// the end of the buffer.  An empty buffer (len == 0) is valid.
//
// Only the byte structure is checked; overlong encodings, surrogate code
// points and values above U+10FFFF are not rejected here.
bool utf8_check(const byte *p, size_t len) {
    uint8_t need = 0; // continuation bytes still expected for current char
    const byte *end = p + len;
    for (; p < end; p++) {
        byte c = *p;
        if (need) {
            // A continuation byte must match 10xxxxxx.  Testing only
            // c >= 0x80 would wrongly accept lead bytes (0xc0 and above)
            // in a continuation position.
            if ((c & 0xc0) != 0x80) {
                return false;
            }
            need--;
        } else if (c >= 0xc0) {
            if (c >= 0xf8) {
                // 5- and 6-byte forms and 0xfe/0xff are never valid
                return false;
            }
            // Compact lookup table packed into 0xe5 (2 bits per entry),
            // indexed by bits 4-5 of the lead byte:
            //   0xc0-0xdf -> 1, 0xe0-0xef -> 2, 0xf0-0xf7 -> 3
            need = (0xe5 >> ((c >> 3) & 0x6)) & 3;
        } else if (c >= 0x80) {
            // continuation byte without a preceding lead byte
            return false;
        }
    }
    return need == 0; // no pending fragments allowed
}
|
||||||
|
|
|
@ -30,5 +30,6 @@
|
||||||
#include "py/misc.h"
|
#include "py/misc.h"
|
||||||
|
|
||||||
mp_uint_t utf8_ptr_to_index(const byte *s, const byte *ptr);
|
mp_uint_t utf8_ptr_to_index(const byte *s, const byte *ptr);
|
||||||
|
bool utf8_check(const byte *p, size_t len);
|
||||||
|
|
||||||
#endif // MICROPY_INCLUDED_PY_UNICODE_H
|
#endif // MICROPY_INCLUDED_PY_UNICODE_H
|
||||||
|
|
|
@ -33,3 +33,17 @@ try:
|
||||||
int('\u0200')
|
int('\u0200')
|
||||||
except ValueError:
|
except ValueError:
|
||||||
print('ValueError')
|
print('ValueError')
|
||||||
|
|
||||||
|
# test invalid UTF-8 string
|
||||||
|
try:
|
||||||
|
str(b'ab\xa1', 'utf8')
|
||||||
|
except UnicodeError:
|
||||||
|
print('UnicodeError')
|
||||||
|
try:
|
||||||
|
str(b'ab\xf8', 'utf8')
|
||||||
|
except UnicodeError:
|
||||||
|
print('UnicodeError')
|
||||||
|
try:
|
||||||
|
str(bytearray(b'ab\xc0a'), 'utf8')
|
||||||
|
except UnicodeError:
|
||||||
|
print('UnicodeError')
|
||||||
|
|
Loading…
Reference in New Issue