When reading data from a file into a str, check that it is valid UTF-8

Otherwise, weird stuff can happen down the line when it is print()ed,
especially as it can break the webrepl of CircuitPython.
This commit is contained in:
Jeff Epler 2022-08-12 08:25:27 -05:00
parent 76f03a2bee
commit 06f22cb59c
No known key found for this signature in database
GPG Key ID: D5BF15AB975AB4DE

View File

@ -31,6 +31,7 @@
#include "py/objstr.h" #include "py/objstr.h"
#include "py/stream.h" #include "py/stream.h"
#include "py/runtime.h" #include "py/runtime.h"
#include "py/unicode.h"
#include "supervisor/shared/translate/translate.h" #include "supervisor/shared/translate/translate.h"
// This file defines generic Python stream read/write methods which // This file defines generic Python stream read/write methods which
@ -43,6 +44,13 @@ STATIC mp_obj_t stream_readall(mp_obj_t self_in);
#define STREAM_CONTENT_TYPE(stream) (((stream)->is_text) ? &mp_type_str : &mp_type_bytes) #define STREAM_CONTENT_TYPE(stream) (((stream)->is_text) ? &mp_type_str : &mp_type_bytes)
// Checked variant of mp_obj_new_str_from_vstr().
//
// If `type` is the str type, the vstr's bytes (vstr->buf, vstr->len) are first
// passed to utf8_check(); when that check fails a UnicodeError is raised
// instead of building the object. For any other `type` the bytes are not
// inspected. In the non-raising case the result of
// mp_obj_new_str_from_vstr(type, vstr) is returned unchanged.
static mp_obj_t mp_obj_new_str_from_vstr_check(const mp_obj_type_t *type, vstr_t *vstr) {
    // Only str results are validated.
    if (type == &mp_type_str) {
        if (!utf8_check((void *)vstr->buf, vstr->len)) {
            // NULL is passed as the message argument.
            mp_raise_msg(&mp_type_UnicodeError, NULL);
        }
    }
    return mp_obj_new_str_from_vstr(type, vstr);
}
// Returns error condition in *errcode, if non-zero, return value is number of bytes written // Returns error condition in *errcode, if non-zero, return value is number of bytes written
// before error condition occurred. If *errcode == 0, returns total bytes written (which will // before error condition occurred. If *errcode == 0, returns total bytes written (which will
// be equal to input size). // be equal to input size).
@ -201,8 +209,7 @@ STATIC mp_obj_t stream_read_generic(size_t n_args, const mp_obj_t *args, byte fl
} }
} }
} }
return mp_obj_new_str_from_vstr_check(&mp_type_str, &vstr);
return mp_obj_new_str_from_vstr(&mp_type_str, &vstr);
} }
#endif #endif
@ -223,7 +230,7 @@ STATIC mp_obj_t stream_read_generic(size_t n_args, const mp_obj_t *args, byte fl
mp_raise_OSError(error); mp_raise_OSError(error);
} else { } else {
vstr.len = out_sz; vstr.len = out_sz;
return mp_obj_new_str_from_vstr(STREAM_CONTENT_TYPE(stream_p), &vstr); return mp_obj_new_str_from_vstr_check(STREAM_CONTENT_TYPE(stream_p), &vstr);
} }
} }
@ -364,7 +371,7 @@ STATIC mp_obj_t stream_readall(mp_obj_t self_in) {
} }
vstr.len = total_size; vstr.len = total_size;
return mp_obj_new_str_from_vstr(STREAM_CONTENT_TYPE(stream_p), &vstr); return mp_obj_new_str_from_vstr_check(STREAM_CONTENT_TYPE(stream_p), &vstr);
} }
// Unbuffered, inefficient implementation of readline() for raw I/O files. // Unbuffered, inefficient implementation of readline() for raw I/O files.
@ -417,7 +424,7 @@ STATIC mp_obj_t stream_unbuffered_readline(size_t n_args, const mp_obj_t *args)
} }
} }
return mp_obj_new_str_from_vstr(STREAM_CONTENT_TYPE(stream_p), &vstr); return mp_obj_new_str_from_vstr_check(STREAM_CONTENT_TYPE(stream_p), &vstr);
} }
MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(mp_stream_unbuffered_readline_obj, 1, 2, stream_unbuffered_readline); MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(mp_stream_unbuffered_readline_obj, 1, 2, stream_unbuffered_readline);