From 6c3d8d38bfbe24c3d810f89f546ac947cf0cb66d Mon Sep 17 00:00:00 2001 From: Jim Mussared Date: Wed, 24 Aug 2022 12:22:57 +1000 Subject: [PATCH] py/objstr: Always validate utf-8 for mp_obj_new_str. All uses of this are either tiny strings or not-known-to-be-safe. Update comments for mp_obj_new_str_copy and mp_obj_new_str_of_type. Signed-off-by: Jim Mussared --- py/obj.h | 8 ++++---- py/objstr.c | 11 ++++++----- py/objstr.h | 4 ++-- 3 files changed, 12 insertions(+), 11 deletions(-) diff --git a/py/obj.h b/py/obj.h index 0b0a9692ed..e048cf0c28 100644 --- a/py/obj.h +++ b/py/obj.h @@ -789,11 +789,11 @@ mp_obj_t mp_obj_new_int_from_uint(mp_uint_t value); mp_obj_t mp_obj_new_int_from_str_len(const char **str, size_t len, bool neg, unsigned int base); mp_obj_t mp_obj_new_int_from_ll(long long val); // this must return a multi-precision integer object (or raise an overflow exception) mp_obj_t mp_obj_new_int_from_ull(unsigned long long val); // this must return a multi-precision integer object (or raise an overflow exception) -mp_obj_t mp_obj_new_str(const char *data, size_t len); -mp_obj_t mp_obj_new_str_via_qstr(const char *data, size_t len); -mp_obj_t mp_obj_new_str_from_vstr(vstr_t *vstr); +mp_obj_t mp_obj_new_str(const char *data, size_t len); // will check utf-8 (raises UnicodeError) +mp_obj_t mp_obj_new_str_via_qstr(const char *data, size_t len); // input data must be valid utf-8 +mp_obj_t mp_obj_new_str_from_vstr(vstr_t *vstr); // will check utf-8 (raises UnicodeError) #if MICROPY_PY_BUILTINS_STR_UNICODE && MICROPY_PY_BUILTINS_STR_UNICODE_CHECK -mp_obj_t mp_obj_new_str_from_utf8_vstr(vstr_t *vstr); // only use when vstr is already known to be utf-8 encoded +mp_obj_t mp_obj_new_str_from_utf8_vstr(vstr_t *vstr); // input data must be valid utf-8 #else #define mp_obj_new_str_from_utf8_vstr mp_obj_new_str_from_vstr #endif diff --git a/py/objstr.c b/py/objstr.c index 5a62237516..683d035e44 100644 --- a/py/objstr.c +++ b/py/objstr.c @@ -202,11 +202,7 @@ mp_obj_t mp_obj_str_make_new(const mp_obj_type_t *type, size_t n_args, size_t n_ } else { mp_buffer_info_t bufinfo; mp_get_buffer_raise(args[0], &bufinfo, MP_BUFFER_READ); - #if MICROPY_PY_BUILTINS_STR_UNICODE_CHECK - if (!utf8_check(bufinfo.buf, bufinfo.len)) { - mp_raise_msg(&mp_type_UnicodeError, NULL); - } - #endif + // This will utf-8 check the input. return mp_obj_new_str(bufinfo.buf, bufinfo.len); } } @@ -2268,6 +2264,11 @@ mp_obj_t mp_obj_new_bytes_from_vstr(vstr_t *vstr) { } mp_obj_t mp_obj_new_str(const char *data, size_t len) { + #if MICROPY_PY_BUILTINS_STR_UNICODE && MICROPY_PY_BUILTINS_STR_UNICODE_CHECK + if (!utf8_check((byte *)data, len)) { + mp_raise_msg(&mp_type_UnicodeError, NULL); + } + #endif qstr q = qstr_find_strn(data, len); if (q != MP_QSTRnull) { // qstr with this data already exists diff --git a/py/objstr.h b/py/objstr.h index 78441fedaf..6c6735bf5e 100644 --- a/py/objstr.h +++ b/py/objstr.h @@ -88,8 +88,8 @@ mp_obj_t mp_obj_str_make_new(const mp_obj_type_t *type_in, size_t n_args, size_t void mp_str_print_json(const mp_print_t *print, const byte *str_data, size_t str_len); mp_obj_t mp_obj_str_format(size_t n_args, const mp_obj_t *args, mp_map_t *kwargs); mp_obj_t mp_obj_str_split(size_t n_args, const mp_obj_t *args); -mp_obj_t mp_obj_new_str_copy(const mp_obj_type_t *type, const byte *data, size_t len); -mp_obj_t mp_obj_new_str_of_type(const mp_obj_type_t *type, const byte *data, size_t len); +mp_obj_t mp_obj_new_str_copy(const mp_obj_type_t *type, const byte *data, size_t len); // for type=str, input data must be valid utf-8 +mp_obj_t mp_obj_new_str_of_type(const mp_obj_type_t *type, const byte *data, size_t len); // for type=str, will check utf-8 (raises UnicodeError) mp_obj_t mp_obj_str_binary_op(mp_binary_op_t op, mp_obj_t lhs_in, mp_obj_t rhs_in); mp_int_t mp_obj_str_get_buffer(mp_obj_t self_in, mp_buffer_info_t *bufinfo, mp_uint_t flags);