diff --git a/py/builtin.c b/py/builtin.c index d4b77d37a8..9986b90d3f 100644 --- a/py/builtin.c +++ b/py/builtin.c @@ -172,13 +172,40 @@ STATIC mp_obj_t mp_builtin_callable(mp_obj_t o_in) { MP_DEFINE_CONST_FUN_OBJ_1(mp_builtin_callable_obj, mp_builtin_callable); STATIC mp_obj_t mp_builtin_chr(mp_obj_t o_in) { - int ord = mp_obj_get_int(o_in); + #if MICROPY_PY_BUILTINS_STR_UNICODE + machine_int_t c = mp_obj_get_int(o_in); + char str[4]; + int len = 0; + if (c < 0x80) { + *str = c; len = 1; + } else if (c < 0x800) { + str[0] = (c >> 6) | 0xC0; + str[1] = (c & 0x3F) | 0x80; + len = 2; + } else if (c < 0x10000) { + str[0] = (c >> 12) | 0xE0; + str[1] = ((c >> 6) & 0x3F) | 0x80; + str[2] = (c & 0x3F) | 0x80; + len = 3; + } else if (c < 0x110000) { + str[0] = (c >> 18) | 0xF0; + str[1] = ((c >> 12) & 0x3F) | 0x80; + str[2] = ((c >> 6) & 0x3F) | 0x80; + str[3] = (c & 0x3F) | 0x80; + len = 4; + } else { + nlr_raise(mp_obj_new_exception_msg(&mp_type_ValueError, "chr() arg not in range(0x110000)")); + } + return mp_obj_new_str(str, len, true); + #else + machine_int_t ord = mp_obj_get_int(o_in); if (0 <= ord && ord <= 0x10ffff) { char str[1] = {ord}; return mp_obj_new_str(str, 1, true); } else { nlr_raise(mp_obj_new_exception_msg(&mp_type_ValueError, "chr() arg not in range(0x110000)")); } + #endif } MP_DEFINE_CONST_FUN_OBJ_1(mp_builtin_chr_obj, mp_builtin_chr); @@ -344,13 +371,32 @@ MP_DEFINE_CONST_FUN_OBJ_1(mp_builtin_oct_obj, mp_builtin_oct); STATIC mp_obj_t mp_builtin_ord(mp_obj_t o_in) { uint len; const char *str = mp_obj_str_get_data(o_in, &len); + #if MICROPY_PY_BUILTINS_STR_UNICODE + uint charlen = unichar_charlen(str, len); + if (charlen == 1) { + if (MP_OBJ_IS_STR(o_in) && UTF8_IS_NONASCII(*str)) { + machine_int_t ord = *str++ & 0x7F; + for (machine_int_t mask = 0x40; ord & mask; mask >>= 1) { + ord &= ~mask; + } + while (UTF8_IS_CONT(*str)) { + ord = (ord << 6) | (*str++ & 0x3F); + } + return mp_obj_new_int(ord); + } else { + return mp_obj_new_int(((const byte*)str)[0]); + } + } else { + nlr_raise(mp_obj_new_exception_msg_varg(&mp_type_TypeError, "ord() expected a character, but string of length %d found", charlen)); + } + #else if (len == 1) { // don't sign extend when converting to ord - // TODO unicode return mp_obj_new_int(((const byte*)str)[0]); } else { nlr_raise(mp_obj_new_exception_msg_varg(&mp_type_TypeError, "ord() expected a character, but string of length %d found", len)); } + #endif } MP_DEFINE_CONST_FUN_OBJ_1(mp_builtin_ord_obj, mp_builtin_ord); diff --git a/py/lexer.c b/py/lexer.c index 5d1113fb30..8732d64362 100644 --- a/py/lexer.c +++ b/py/lexer.c @@ -502,19 +502,32 @@ STATIC void mp_lexer_next_token_into(mp_lexer_t *lex, mp_token_t *tok, bool firs case 'v': c = 0x0b; break; case 'f': c = 0x0c; break; case 'r': c = 0x0d; break; + case 'u': + case 'U': + if (is_bytes) { + // b'\u1234' == b'\\u1234' + vstr_add_char(&lex->vstr, '\\'); + break; + } + // Otherwise fall through. case 'x': { uint num = 0; - if (!get_hex(lex, 2, &num)) { + if (!get_hex(lex, (c == 'x' ? 2 : c == 'u' ? 4 : 8), &num)) { // TODO error message assert(0); } c = num; break; } - case 'N': break; // TODO \N{name} only in strings - case 'u': break; // TODO \uxxxx only in strings - case 'U': break; // TODO \Uxxxxxxxx only in strings + case 'N': + // Supporting '\N{LATIN SMALL LETTER A}' == 'a' would require keeping the + // entire Unicode name table in the core. As of Unicode 6.3.0, that's nearly + // 3MB of text; even gzip-compressed and with minimal structure, it'll take + // roughly half a meg of storage. This form of Unicode escape may be added + // later on, but it's definitely not a priority right now. -- CJA 20140607 + assert(!"Unicode name escapes not supported"); + break; default: if (c >= '0' && c <= '7') { // Octal sequence, 1-3 chars @@ -533,7 +546,13 @@ STATIC void mp_lexer_next_token_into(mp_lexer_t *lex, mp_token_t *tok, bool firs } } if (c != MP_LEXER_CHAR_EOF) { - vstr_add_char(&lex->vstr, c); + if (c < 0x110000 && !is_bytes) { + vstr_add_char(&lex->vstr, c); + } else if (c < 0x100 && is_bytes) { + vstr_add_byte(&lex->vstr, c); + } else { + assert(!"TODO: Throw an error, invalid escape code probably"); + } } } else { vstr_add_char(&lex->vstr, CUR_CHAR(lex)); diff --git a/py/misc.h b/py/misc.h index 3f62e3198f..bf63ce1354 100644 --- a/py/misc.h +++ b/py/misc.h @@ -100,7 +100,9 @@ bool unichar_isupper(unichar c); bool unichar_islower(unichar c); unichar unichar_tolower(unichar c); unichar unichar_toupper(unichar c); -#define unichar_charlen(s, bytelen) (bytelen) +uint unichar_charlen(const char *str, uint len); // TODO this should return machine_uint_t +#define UTF8_IS_NONASCII(ch) ((ch) & 0x80) +#define UTF8_IS_CONT(ch) (((ch) & 0xC0) == 0x80) /** variable string *********************************************/ @@ -164,4 +166,18 @@ int DEBUG_printf(const char *fmt, ...); extern uint mp_verbose_flag; +// This is useful for unicode handling. Some CPU archs has +// special instructions for efficient implentation of this +// function (e.g. CLZ on ARM). +// NOTE: this function is unused at the moment +#ifndef count_lead_ones +static inline uint count_lead_ones(byte val) { + uint c = 0; + for (byte mask = 0x80; val & mask; mask >>= 1) { + c++; + } + return c; +} +#endif + #endif // _INCLUDED_MINILIB_H diff --git a/py/mpconfig.h b/py/mpconfig.h index 5b27b910e7..0dd84d1f81 100644 --- a/py/mpconfig.h +++ b/py/mpconfig.h @@ -249,6 +249,11 @@ typedef double mp_float_t; /*****************************************************************************/ /* Fine control over Python builtins, classes, modules, etc */ +// Whether str object is proper unicode +#ifndef MICROPY_PY_BUILTINS_STR_UNICODE +#define MICROPY_PY_BUILTINS_STR_UNICODE (0) +#endif + // Whether to support bytearray object #ifndef MICROPY_PY_BUILTINS_BYTEARRAY #define MICROPY_PY_BUILTINS_BYTEARRAY (1) diff --git a/py/obj.c b/py/obj.c index 7e39c5a5cc..12d7898428 100644 --- a/py/obj.c +++ b/py/obj.c @@ -357,7 +357,12 @@ uint mp_get_index(const mp_obj_type_t *type, machine_uint_t len, mp_obj_t index, // may return MP_OBJ_NULL mp_obj_t mp_obj_len_maybe(mp_obj_t o_in) { - if (MP_OBJ_IS_STR(o_in) || MP_OBJ_IS_TYPE(o_in, &mp_type_bytes)) { + if ( +#if !MICROPY_PY_BUILTINS_STR_UNICODE + // It's simple - unicode is slow, non-unicode is fast + MP_OBJ_IS_STR(o_in) || +#endif + MP_OBJ_IS_TYPE(o_in, &mp_type_bytes)) { return MP_OBJ_NEW_SMALL_INT((machine_int_t)mp_obj_str_get_len(o_in)); } else { mp_obj_type_t *type = mp_obj_get_type(o_in); diff --git a/py/objstr.c b/py/objstr.c index c84d7c900d..a4749369d7 100644 --- a/py/objstr.c +++ b/py/objstr.c @@ -32,6 +32,7 @@ #include "mpconfig.h" #include "nlr.h" #include "misc.h" +#include "unicode.h" #include "qstr.h" #include "obj.h" #include "runtime0.h" @@ -43,16 +44,7 @@ STATIC mp_obj_t str_modulo_format(mp_obj_t pattern, uint n_args, const mp_obj_t *args, mp_obj_t dict); const mp_obj_t mp_const_empty_bytes; -// use this macro to extract the string hash -#define GET_STR_HASH(str_obj_in, str_hash) uint str_hash; if (MP_OBJ_IS_QSTR(str_obj_in)) { str_hash = qstr_hash(MP_OBJ_QSTR_VALUE(str_obj_in)); } else { str_hash = ((mp_obj_str_t*)str_obj_in)->hash; } - -// use this macro to extract the string length -#define GET_STR_LEN(str_obj_in, str_len) uint str_len; if (MP_OBJ_IS_QSTR(str_obj_in)) { str_len = qstr_len(MP_OBJ_QSTR_VALUE(str_obj_in)); } else { str_len = ((mp_obj_str_t*)str_obj_in)->len; } - -// use this macro to extract the string data and length -#define GET_STR_DATA_LEN(str_obj_in, str_data, str_len) const byte *str_data; uint str_len; if (MP_OBJ_IS_QSTR(str_obj_in)) { str_data = qstr_data(MP_OBJ_QSTR_VALUE(str_obj_in), &str_len); } else { str_len = ((mp_obj_str_t*)str_obj_in)->len; str_data = ((mp_obj_str_t*)str_obj_in)->data; } - -STATIC mp_obj_t mp_obj_new_str_iterator(mp_obj_t str); +mp_obj_t mp_obj_new_str_iterator(mp_obj_t str); STATIC mp_obj_t mp_obj_new_bytes_iterator(mp_obj_t str); STATIC NORETURN void bad_implicit_conversion(mp_obj_t self_in); STATIC NORETURN void arg_type_mixup(); @@ -259,7 +251,7 @@ STATIC const byte *find_subbytes(const byte *haystack, machine_uint_t hlen, cons return NULL; } -STATIC mp_obj_t str_binary_op(int op, mp_obj_t lhs_in, mp_obj_t rhs_in) { +mp_obj_t mp_obj_str_binary_op(int op, mp_obj_t lhs_in, mp_obj_t rhs_in) { GET_STR_DATA_LEN(lhs_in, lhs_data, lhs_len); mp_obj_type_t *lhs_type = mp_obj_get_type(lhs_in); mp_obj_type_t *rhs_type = mp_obj_get_type(rhs_in); @@ -352,11 +344,14 @@ uncomparable: return MP_OBJ_NULL; // op not supported } +#if !MICROPY_PY_BUILTINS_STR_UNICODE +// objstrunicode defines own version const byte *str_index_to_ptr(const mp_obj_type_t *type, const byte *self_data, uint self_len, mp_obj_t index, bool is_slice) { machine_uint_t index_val = mp_get_index(type, self_len, index, is_slice); return self_data + index_val; } +#endif STATIC mp_obj_t str_subscr(mp_obj_t self_in, mp_obj_t index, mp_obj_t value) { mp_obj_type_t *type = mp_obj_get_type(self_in); @@ -571,7 +566,6 @@ STATIC mp_obj_t str_rsplit(uint n_args, const mp_obj_t *args) { return res; } - STATIC mp_obj_t str_finder(uint n_args, const mp_obj_t *args, machine_int_t direction, bool is_index) { const mp_obj_type_t *self_type = mp_obj_get_type(args[0]); assert(2 <= n_args && n_args <= 4); @@ -600,6 +594,11 @@ STATIC mp_obj_t str_finder(uint n_args, const mp_obj_t *args, machine_int_t dire } } else { // found + #if MICROPY_PY_BUILTINS_STR_UNICODE + if (self_type == &mp_type_str) { + return MP_OBJ_NEW_SMALL_INT(utf8_ptr_to_index(haystack, p)); + } + #endif return MP_OBJ_NEW_SMALL_INT(p - haystack); } } @@ -1449,7 +1448,7 @@ STATIC mp_obj_t str_count(uint n_args, const mp_obj_t *args) { // if needle_len is zero then we count each gap between characters as an occurrence if (needle_len == 0) { - return MP_OBJ_NEW_SMALL_INT(unichar_charlen((const char*)start, end - start) + 1); + return MP_OBJ_NEW_SMALL_INT((machine_uint_t)unichar_charlen((const char*)start, end - start) + 1); } // count the occurrences @@ -1610,7 +1609,7 @@ STATIC mp_obj_t str_encode(uint n_args, const mp_obj_t *args) { } #endif -STATIC machine_int_t str_get_buffer(mp_obj_t self_in, mp_buffer_info_t *bufinfo, int flags) { +machine_int_t mp_obj_str_get_buffer(mp_obj_t self_in, mp_buffer_info_t *bufinfo, int flags) { if (flags == MP_BUFFER_READ) { GET_STR_DATA_LEN(self_in, str_data, str_len); bufinfo->buf = (void*)str_data; @@ -1627,38 +1626,45 @@ STATIC machine_int_t str_get_buffer(mp_obj_t self_in, mp_buffer_info_t *bufinfo, } #if MICROPY_CPYTHON_COMPAT -STATIC MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(bytes_decode_obj, 1, 3, bytes_decode); -STATIC MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_encode_obj, 1, 3, str_encode); +MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(bytes_decode_obj, 1, 3, bytes_decode); +MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_encode_obj, 1, 3, str_encode); #endif -STATIC MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_find_obj, 2, 4, str_find); -STATIC MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_rfind_obj, 2, 4, str_rfind); -STATIC MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_index_obj, 2, 4, str_index); -STATIC MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_rindex_obj, 2, 4, str_rindex); -STATIC MP_DEFINE_CONST_FUN_OBJ_2(str_join_obj, str_join); -STATIC MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_split_obj, 1, 3, str_split); -STATIC MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_rsplit_obj, 1, 3, str_rsplit); -STATIC MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_startswith_obj, 2, 3, str_startswith); -STATIC MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_endswith_obj, 2, 3, str_endswith); -STATIC MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_strip_obj, 1, 2, str_strip); -STATIC MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_lstrip_obj, 1, 2, str_lstrip); -STATIC MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_rstrip_obj, 1, 2, str_rstrip); -STATIC MP_DEFINE_CONST_FUN_OBJ_VAR(str_format_obj, 1, mp_obj_str_format); -STATIC MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_replace_obj, 3, 4, str_replace); -STATIC MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_count_obj, 2, 4, str_count); -STATIC MP_DEFINE_CONST_FUN_OBJ_2(str_partition_obj, str_partition); -STATIC MP_DEFINE_CONST_FUN_OBJ_2(str_rpartition_obj, str_rpartition); -STATIC MP_DEFINE_CONST_FUN_OBJ_1(str_lower_obj, str_lower); -STATIC MP_DEFINE_CONST_FUN_OBJ_1(str_upper_obj, str_upper); -STATIC MP_DEFINE_CONST_FUN_OBJ_1(str_isspace_obj, str_isspace); -STATIC MP_DEFINE_CONST_FUN_OBJ_1(str_isalpha_obj, str_isalpha); -STATIC MP_DEFINE_CONST_FUN_OBJ_1(str_isdigit_obj, str_isdigit); -STATIC MP_DEFINE_CONST_FUN_OBJ_1(str_isupper_obj, str_isupper); -STATIC MP_DEFINE_CONST_FUN_OBJ_1(str_islower_obj, str_islower); +MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_find_obj, 2, 4, str_find); +MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_rfind_obj, 2, 4, str_rfind); +MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_index_obj, 2, 4, str_index); +MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_rindex_obj, 2, 4, str_rindex); +MP_DEFINE_CONST_FUN_OBJ_2(str_join_obj, str_join); +MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_split_obj, 1, 3, str_split); +MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_rsplit_obj, 1, 3, str_rsplit); +MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_startswith_obj, 2, 3, str_startswith); +MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_endswith_obj, 2, 3, str_endswith); +MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_strip_obj, 1, 2, str_strip); +MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_lstrip_obj, 1, 2, str_lstrip); +MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_rstrip_obj, 1, 2, str_rstrip); +MP_DEFINE_CONST_FUN_OBJ_VAR(str_format_obj, 1, mp_obj_str_format); +MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_replace_obj, 3, 4, str_replace); +MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_count_obj, 2, 4, str_count); +MP_DEFINE_CONST_FUN_OBJ_2(str_partition_obj, str_partition); +MP_DEFINE_CONST_FUN_OBJ_2(str_rpartition_obj, str_rpartition); +MP_DEFINE_CONST_FUN_OBJ_1(str_lower_obj, str_lower); +MP_DEFINE_CONST_FUN_OBJ_1(str_upper_obj, str_upper); +MP_DEFINE_CONST_FUN_OBJ_1(str_isspace_obj, str_isspace); +MP_DEFINE_CONST_FUN_OBJ_1(str_isalpha_obj, str_isalpha); +MP_DEFINE_CONST_FUN_OBJ_1(str_isdigit_obj, str_isdigit); +MP_DEFINE_CONST_FUN_OBJ_1(str_isupper_obj, str_isupper); +MP_DEFINE_CONST_FUN_OBJ_1(str_islower_obj, str_islower); STATIC const mp_map_elem_t str_locals_dict_table[] = { #if MICROPY_CPYTHON_COMPAT { MP_OBJ_NEW_QSTR(MP_QSTR_decode), (mp_obj_t)&bytes_decode_obj }, + #if !MICROPY_PY_BUILTINS_STR_UNICODE + // If we have separate unicode type, then here we have methods only + // for bytes type, and it should not have encode() methods. Otherwise, + // we have non-compliant-but-practical bytestring type, which shares + // method table with bytes, so they both have encode() and decode() + // methods (which should do type checking at runtime). { MP_OBJ_NEW_QSTR(MP_QSTR_encode), (mp_obj_t)&str_encode_obj }, + #endif #endif { MP_OBJ_NEW_QSTR(MP_QSTR_find), (mp_obj_t)&str_find_obj }, { MP_OBJ_NEW_QSTR(MP_QSTR_rfind), (mp_obj_t)&str_rfind_obj }, @@ -1688,17 +1694,19 @@ STATIC const mp_map_elem_t str_locals_dict_table[] = { STATIC MP_DEFINE_CONST_DICT(str_locals_dict, str_locals_dict_table); +#if !MICROPY_PY_BUILTINS_STR_UNICODE const mp_obj_type_t mp_type_str = { { &mp_type_type }, .name = MP_QSTR_str, .print = str_print, .make_new = str_make_new, - .binary_op = str_binary_op, + .binary_op = mp_obj_str_binary_op, .subscr = str_subscr, .getiter = mp_obj_new_str_iterator, - .buffer_p = { .get_buffer = str_get_buffer }, + .buffer_p = { .get_buffer = mp_obj_str_get_buffer }, .locals_dict = (mp_obj_t)&str_locals_dict, }; +#endif // Reuses most of methods from str const mp_obj_type_t mp_type_bytes = { @@ -1706,10 +1714,10 @@ const mp_obj_type_t mp_type_bytes = { .name = MP_QSTR_bytes, .print = str_print, .make_new = bytes_make_new, - .binary_op = str_binary_op, + .binary_op = mp_obj_str_binary_op, .subscr = str_subscr, .getiter = mp_obj_new_bytes_iterator, - .buffer_p = { .get_buffer = str_get_buffer }, + .buffer_p = { .get_buffer = mp_obj_str_get_buffer }, .locals_dict = (mp_obj_t)&str_locals_dict, }; @@ -1866,6 +1874,7 @@ typedef struct _mp_obj_str_it_t { machine_uint_t cur; } mp_obj_str_it_t; +#if !MICROPY_PY_BUILTINS_STR_UNICODE STATIC mp_obj_t str_it_iternext(mp_obj_t self_in) { mp_obj_str_it_t *self = self_in; GET_STR_DATA_LEN(self->str, str, len); @@ -1885,6 +1894,15 @@ STATIC const mp_obj_type_t mp_type_str_it = { .iternext = str_it_iternext, }; +mp_obj_t mp_obj_new_str_iterator(mp_obj_t str) { + mp_obj_str_it_t *o = m_new_obj(mp_obj_str_it_t); + o->base.type = &mp_type_str_it; + o->str = str; + o->cur = 0; + return o; +} +#endif + STATIC mp_obj_t bytes_it_iternext(mp_obj_t self_in) { mp_obj_str_it_t *self = self_in; GET_STR_DATA_LEN(self->str, str, len); @@ -1904,14 +1922,6 @@ STATIC const mp_obj_type_t mp_type_bytes_it = { .iternext = bytes_it_iternext, }; -mp_obj_t mp_obj_new_str_iterator(mp_obj_t str) { - mp_obj_str_it_t *o = m_new_obj(mp_obj_str_it_t); - o->base.type = &mp_type_str_it; - o->str = str; - o->cur = 0; - return o; -} - mp_obj_t mp_obj_new_bytes_iterator(mp_obj_t str) { mp_obj_str_it_t *o = m_new_obj(mp_obj_str_it_t); o->base.type = &mp_type_bytes_it; diff --git a/py/objstr.h b/py/objstr.h index 5be137d36d..515890c6e1 100644 --- a/py/objstr.h +++ b/py/objstr.h @@ -35,5 +35,53 @@ typedef struct _mp_obj_str_t { #define MP_DEFINE_STR_OBJ(obj_name, str) mp_obj_str_t obj_name = {{&mp_type_str}, 0, sizeof(str) - 1, (const byte*)str}; +// use this macro to extract the string hash +#define GET_STR_HASH(str_obj_in, str_hash) \ + uint str_hash; if (MP_OBJ_IS_QSTR(str_obj_in)) \ + { str_hash = qstr_hash(MP_OBJ_QSTR_VALUE(str_obj_in)); } else { str_hash = ((mp_obj_str_t*)str_obj_in)->hash; } + +// use this macro to extract the string length +#define GET_STR_LEN(str_obj_in, str_len) \ + uint str_len; if (MP_OBJ_IS_QSTR(str_obj_in)) \ + { str_len = qstr_len(MP_OBJ_QSTR_VALUE(str_obj_in)); } else { str_len = ((mp_obj_str_t*)str_obj_in)->len; } + +// use this macro to extract the string data and length +#define GET_STR_DATA_LEN(str_obj_in, str_data, str_len) \ + const byte *str_data; uint str_len; if (MP_OBJ_IS_QSTR(str_obj_in)) \ + { str_data = qstr_data(MP_OBJ_QSTR_VALUE(str_obj_in), &str_len); } \ + else { str_len = ((mp_obj_str_t*)str_obj_in)->len; str_data = ((mp_obj_str_t*)str_obj_in)->data; } + mp_obj_t mp_obj_str_format(uint n_args, const mp_obj_t *args); mp_obj_t mp_obj_new_str_of_type(const mp_obj_type_t *type, const byte* data, uint len); + +mp_obj_t mp_obj_str_binary_op(int op, mp_obj_t lhs_in, mp_obj_t rhs_in); +machine_int_t mp_obj_str_get_buffer(mp_obj_t self_in, mp_buffer_info_t *bufinfo, int flags); + +const byte *str_index_to_ptr(const mp_obj_type_t *type, const byte *self_data, uint self_len, + mp_obj_t index, bool is_slice); + +MP_DECLARE_CONST_FUN_OBJ(str_encode_obj); +MP_DECLARE_CONST_FUN_OBJ(str_find_obj); +MP_DECLARE_CONST_FUN_OBJ(str_rfind_obj); +MP_DECLARE_CONST_FUN_OBJ(str_index_obj); +MP_DECLARE_CONST_FUN_OBJ(str_rindex_obj); +MP_DECLARE_CONST_FUN_OBJ(str_join_obj); +MP_DECLARE_CONST_FUN_OBJ(str_split_obj); +MP_DECLARE_CONST_FUN_OBJ(str_rsplit_obj); +MP_DECLARE_CONST_FUN_OBJ(str_startswith_obj); +MP_DECLARE_CONST_FUN_OBJ(str_endswith_obj); +MP_DECLARE_CONST_FUN_OBJ(str_strip_obj); +MP_DECLARE_CONST_FUN_OBJ(str_lstrip_obj); +MP_DECLARE_CONST_FUN_OBJ(str_rstrip_obj); +MP_DECLARE_CONST_FUN_OBJ(str_format_obj); +MP_DECLARE_CONST_FUN_OBJ(str_replace_obj); +MP_DECLARE_CONST_FUN_OBJ(str_count_obj); +MP_DECLARE_CONST_FUN_OBJ(str_partition_obj); +MP_DECLARE_CONST_FUN_OBJ(str_rpartition_obj); +MP_DECLARE_CONST_FUN_OBJ(str_lower_obj); +MP_DECLARE_CONST_FUN_OBJ(str_upper_obj); +MP_DECLARE_CONST_FUN_OBJ(str_isspace_obj); +MP_DECLARE_CONST_FUN_OBJ(str_isalpha_obj); +MP_DECLARE_CONST_FUN_OBJ(str_isdigit_obj); +MP_DECLARE_CONST_FUN_OBJ(str_isupper_obj); +MP_DECLARE_CONST_FUN_OBJ(str_islower_obj); diff --git a/py/objstrunicode.c b/py/objstrunicode.c new file mode 100644 index 0000000000..35cc73bd84 --- /dev/null +++ b/py/objstrunicode.c @@ -0,0 +1,359 @@ +/* + * This file is part of the Micro Python project, http://micropython.org/ + * + * The MIT License (MIT) + * + * Copyright (c) 2013, 2014 Damien P. George + * Copyright (c) 2014 Paul Sokolovsky + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include +#include +#include + +#include "mpconfig.h" +#include "nlr.h" +#include "misc.h" +#include "qstr.h" +#include "obj.h" +#include "runtime0.h" +#include "runtime.h" +#include "pfenv.h" +#include "objstr.h" +#include "objlist.h" + +#if MICROPY_PY_BUILTINS_STR_UNICODE + +STATIC mp_obj_t mp_obj_new_str_iterator(mp_obj_t str); + +/******************************************************************************/ +/* str */ + +STATIC void uni_print_quoted(void (*print)(void *env, const char *fmt, ...), void *env, const byte *str_data, uint str_len) { + // this escapes characters, but it will be very slow to print (calling print many times) + bool has_single_quote = false; + bool has_double_quote = false; + for (const byte *s = str_data, *top = str_data + str_len; !has_double_quote && s < top; s++) { + if (*s == '\'') { + has_single_quote = true; + } else if (*s == '"') { + has_double_quote = true; + } + } + int quote_char = '\''; + if (has_single_quote && !has_double_quote) { + quote_char = '"'; + } + print(env, "%c", quote_char); + const byte *s = str_data, *top = str_data + str_len; + while (s < top) { + unichar ch; + ch = utf8_get_char(s); + s = utf8_next_char(s); + if (ch == quote_char) { + print(env, "\\%c", quote_char); + } else if (ch == '\\') { + print(env, "\\\\"); + } else if (32 <= ch && ch <= 126) { + print(env, "%c", ch); + } else if (ch == '\n') { + print(env, "\\n"); + } else if (ch == '\r') { + print(env, "\\r"); + } else if (ch == '\t') { + print(env, "\\t"); + } else if (ch < 0x100) { + print(env, "\\x%02x", ch); + } else if (ch < 0x10000) { + print(env, "\\u%04x", ch); + } else { + print(env, "\\U%08x", ch); + } + } + print(env, "%c", quote_char); +} + +STATIC void uni_print(void (*print)(void *env, const char *fmt, ...), void *env, mp_obj_t self_in, mp_print_kind_t kind) { + GET_STR_DATA_LEN(self_in, str_data, str_len); + if (kind == PRINT_STR) { + print(env, "%.*s", str_len, str_data); + } else { + uni_print_quoted(print, env, str_data, str_len); + } +} + +STATIC mp_obj_t uni_unary_op(int op, mp_obj_t self_in) { + GET_STR_DATA_LEN(self_in, str_data, str_len); + switch (op) { + case MP_UNARY_OP_BOOL: + return MP_BOOL(str_len != 0); + case MP_UNARY_OP_LEN: + return MP_OBJ_NEW_SMALL_INT((machine_int_t)unichar_charlen((const char *)str_data, str_len)); + default: + return MP_OBJ_NULL; // op not supported + } +} + +STATIC mp_obj_t str_make_new(mp_obj_t type_in, uint n_args, uint n_kw, const mp_obj_t *args) { +#if MICROPY_CPYTHON_COMPAT + if (n_kw != 0) { + mp_arg_error_unimpl_kw(); + } +#endif + + switch (n_args) { + case 0: + return MP_OBJ_NEW_QSTR(MP_QSTR_); + + case 1: + { + vstr_t *vstr = vstr_new(); + mp_obj_print_helper((void (*)(void*, const char*, ...))vstr_printf, vstr, args[0], PRINT_STR); + mp_obj_t s = mp_obj_new_str(vstr->buf, vstr->len, false); + vstr_free(vstr); + return s; + } + + case 2: + case 3: + { + // TODO: validate 2nd/3rd args + if (!MP_OBJ_IS_TYPE(args[0], &mp_type_bytes)) { + nlr_raise(mp_obj_new_exception_msg(&mp_type_TypeError, "bytes expected")); + } + GET_STR_DATA_LEN(args[0], str_data, str_len); + GET_STR_HASH(args[0], str_hash); + mp_obj_str_t *o = mp_obj_new_str_of_type(&mp_type_str, NULL, str_len); + o->data = str_data; + o->hash = str_hash; + return o; + } + + default: + nlr_raise(mp_obj_new_exception_msg(&mp_type_TypeError, "str takes at most 3 arguments")); + } +} + +// Convert an index into a pointer to its lead byte. Out of bounds indexing will raise IndexError or +// be capped to the first/last character of the string, depending on is_slice. +const byte *str_index_to_ptr(const mp_obj_type_t *type, const byte *self_data, uint self_len, + mp_obj_t index, bool is_slice) { + machine_int_t i; + // Copied from mp_get_index; I don't want bounds checking, just give me + // the integer as-is. (I can't bounds-check without scanning the whole + // string; an out-of-bounds index will be caught in the loops below.) + if (MP_OBJ_IS_SMALL_INT(index)) { + i = MP_OBJ_SMALL_INT_VALUE(index); + } else if (!mp_obj_get_int_maybe(index, &i)) { + nlr_raise(mp_obj_new_exception_msg_varg(&mp_type_TypeError, "string indices must be integers, not %s", mp_obj_get_type_str(index))); + } + const byte *s, *top = self_data + self_len; + if (i < 0) + { + // Negative indexing is performed by counting from the end of the string. + for (s = top - 1; i; --s) { + if (s < self_data) { + if (is_slice) { + return self_data; + } + nlr_raise(mp_obj_new_exception_msg_varg(&mp_type_IndexError, "string index out of range")); + } + if (!UTF8_IS_CONT(*s)) { + ++i; + } + } + ++s; + } else if (!i) { + return self_data; // Shortcut - str[0] is its base pointer + } else { + // Positive indexing, correspondingly, counts from the start of the string. + // It's assumed that negative indexing will generally be used with small + // absolute values (eg str[-1], not str[-1000000]), which means it'll be + // more efficient this way. + for (s = self_data; true; ++s) { + if (s >= top) { + if (is_slice) { + return top; + } + nlr_raise(mp_obj_new_exception_msg_varg(&mp_type_IndexError, "string index out of range")); + } + while (UTF8_IS_CONT(*s)) { + ++s; + } + if (!i--) { + return s; + } + } + } + return s; +} + +STATIC mp_obj_t str_subscr(mp_obj_t self_in, mp_obj_t index, mp_obj_t value) { + mp_obj_type_t *type = mp_obj_get_type(self_in); + GET_STR_DATA_LEN(self_in, self_data, self_len); + if (value == MP_OBJ_SENTINEL) { + // load +#if MICROPY_PY_BUILTINS_SLICE + if (MP_OBJ_IS_TYPE(index, &mp_type_slice)) { + mp_obj_t ostart, ostop, ostep; + mp_obj_slice_get(index, &ostart, &ostop, &ostep); + if (ostep != mp_const_none && ostep != MP_OBJ_NEW_SMALL_INT(1)) { + nlr_raise(mp_obj_new_exception_msg(&mp_type_NotImplementedError, + "only slices with step=1 (aka None) are supported")); + } + + if (type == &mp_type_bytes) { + machine_int_t start = 0, stop = self_len; + if (ostart != mp_const_none) { + start = MP_OBJ_SMALL_INT_VALUE(ostart); + if (start < 0) { + start = self_len + start; + } + } + if (ostop != mp_const_none) { + stop = MP_OBJ_SMALL_INT_VALUE(ostop); + if (stop < 0) { + stop = self_len + stop; + } + } + return mp_obj_new_str_of_type(type, self_data + start, stop - start); + } + const byte *pstart, *pstop; + if (ostart != mp_const_none) { + pstart = str_index_to_ptr(type, self_data, self_len, ostart, true); + } else { + pstart = self_data; + } + if (ostop != mp_const_none) { + // pstop will point just after the stop character. This depends on + // the \0 at the end of the string. + pstop = str_index_to_ptr(type, self_data, self_len, ostop, true); + } else { + pstop = self_data + self_len; + } + if (pstop < pstart) { + return MP_OBJ_NEW_QSTR(MP_QSTR_); + } + return mp_obj_new_str_of_type(type, (const byte *)pstart, pstop - pstart); + } +#endif + if (type == &mp_type_bytes) { + uint index_val = mp_get_index(type, self_len, index, false); + return MP_OBJ_NEW_SMALL_INT((mp_small_int_t)self_data[index_val]); + } + const byte *s = str_index_to_ptr(type, self_data, self_len, index, false); + int len = 1; + if (UTF8_IS_NONASCII(*s)) { + // Count the number of 1 bits (after the first) + for (char mask = 0x40; *s & mask; mask >>= 1) { + ++len; + } + } + return mp_obj_new_str((const char*)s, len, true); // This will create a one-character string + } else { + return MP_OBJ_NULL; // op not supported + } +} + +STATIC const mp_map_elem_t str_locals_dict_table[] = { +#if MICROPY_CPYTHON_COMPAT + { MP_OBJ_NEW_QSTR(MP_QSTR_encode), (mp_obj_t)&str_encode_obj }, +#endif + { MP_OBJ_NEW_QSTR(MP_QSTR_find), (mp_obj_t)&str_find_obj }, + { MP_OBJ_NEW_QSTR(MP_QSTR_rfind), (mp_obj_t)&str_rfind_obj }, + { MP_OBJ_NEW_QSTR(MP_QSTR_index), (mp_obj_t)&str_index_obj }, + { MP_OBJ_NEW_QSTR(MP_QSTR_rindex), (mp_obj_t)&str_rindex_obj }, + { MP_OBJ_NEW_QSTR(MP_QSTR_join), (mp_obj_t)&str_join_obj }, + { MP_OBJ_NEW_QSTR(MP_QSTR_split), (mp_obj_t)&str_split_obj }, + { MP_OBJ_NEW_QSTR(MP_QSTR_rsplit), (mp_obj_t)&str_rsplit_obj }, + { MP_OBJ_NEW_QSTR(MP_QSTR_startswith), (mp_obj_t)&str_startswith_obj }, + { MP_OBJ_NEW_QSTR(MP_QSTR_endswith), (mp_obj_t)&str_endswith_obj }, + { MP_OBJ_NEW_QSTR(MP_QSTR_strip), (mp_obj_t)&str_strip_obj }, + { MP_OBJ_NEW_QSTR(MP_QSTR_lstrip), (mp_obj_t)&str_lstrip_obj }, + { MP_OBJ_NEW_QSTR(MP_QSTR_rstrip), (mp_obj_t)&str_rstrip_obj }, + { MP_OBJ_NEW_QSTR(MP_QSTR_format), (mp_obj_t)&str_format_obj }, + { MP_OBJ_NEW_QSTR(MP_QSTR_replace), (mp_obj_t)&str_replace_obj }, + { MP_OBJ_NEW_QSTR(MP_QSTR_count), (mp_obj_t)&str_count_obj }, + { MP_OBJ_NEW_QSTR(MP_QSTR_partition), (mp_obj_t)&str_partition_obj }, + { MP_OBJ_NEW_QSTR(MP_QSTR_rpartition), (mp_obj_t)&str_rpartition_obj }, + { MP_OBJ_NEW_QSTR(MP_QSTR_lower), (mp_obj_t)&str_lower_obj }, + { MP_OBJ_NEW_QSTR(MP_QSTR_upper), (mp_obj_t)&str_upper_obj }, + { MP_OBJ_NEW_QSTR(MP_QSTR_isspace), (mp_obj_t)&str_isspace_obj }, + { MP_OBJ_NEW_QSTR(MP_QSTR_isalpha), (mp_obj_t)&str_isalpha_obj }, + { MP_OBJ_NEW_QSTR(MP_QSTR_isdigit), (mp_obj_t)&str_isdigit_obj }, + { MP_OBJ_NEW_QSTR(MP_QSTR_isupper), (mp_obj_t)&str_isupper_obj }, + { MP_OBJ_NEW_QSTR(MP_QSTR_islower), (mp_obj_t)&str_islower_obj }, +}; + +STATIC MP_DEFINE_CONST_DICT(str_locals_dict, str_locals_dict_table); + +const mp_obj_type_t mp_type_str = { + { &mp_type_type }, + .name = MP_QSTR_str, + .print = uni_print, + .make_new = str_make_new, + .unary_op = uni_unary_op, + .binary_op = mp_obj_str_binary_op, + .subscr = str_subscr, + .getiter = mp_obj_new_str_iterator, + .buffer_p = { .get_buffer = mp_obj_str_get_buffer }, + .locals_dict = (mp_obj_t)&str_locals_dict, +}; + +/******************************************************************************/ +/* str iterator */ + +typedef struct _mp_obj_str_it_t { + mp_obj_base_t base; + mp_obj_t str; + machine_uint_t cur; +} mp_obj_str_it_t; + +STATIC mp_obj_t str_it_iternext(mp_obj_t self_in) { + mp_obj_str_it_t *self = self_in; + GET_STR_DATA_LEN(self->str, str, len); + if (self->cur < len) { + const byte *cur = str + self->cur; + const byte *end = utf8_next_char(str + self->cur); + mp_obj_t o_out = mp_obj_new_str((const char*)cur, end - cur, true); + self->cur += end - cur; + return o_out; + } else { + return MP_OBJ_STOP_ITERATION; + } +} + +STATIC const mp_obj_type_t mp_type_str_it = { + { &mp_type_type }, + .name = MP_QSTR_iterator, + .getiter = mp_identity, + .iternext = str_it_iternext, +}; + +mp_obj_t mp_obj_new_str_iterator(mp_obj_t str) { + mp_obj_str_it_t *o = m_new_obj(mp_obj_str_it_t); + o->base.type = &mp_type_str_it; + o->str = str; + o->cur = 0; + return o; +} + +#endif // MICROPY_PY_BUILTINS_STR_UNICODE diff --git a/py/py.mk b/py/py.mk index 374bb11d61..549c35d320 100644 --- a/py/py.mk +++ b/py/py.mk @@ -75,6 +75,7 @@ PY_O_BASENAME = \ objset.o \ objslice.o \ objstr.o \ + objstrunicode.o \ objstringio.o \ objtuple.o \ objtype.o \ diff --git a/py/stream.c b/py/stream.c index 07a79248ab..a5a96a8682 100644 --- a/py/stream.c +++ b/py/stream.c @@ -33,6 +33,7 @@ #include "qstr.h" #include "obj.h" #include "objstr.h" +#include "runtime.h" #include "stream.h" #if MICROPY_STREAMS_NON_BLOCK #include @@ -67,6 +68,13 @@ STATIC mp_obj_t stream_read(uint n_args, const mp_obj_t *args) { if (n_args == 1 || ((sz = mp_obj_get_int(args[1])) == -1)) { return stream_readall(args[0]); } + + #if MICROPY_PY_BUILTINS_STR_UNICODE + if (!o->type->stream_p->is_bytes) { + mp_not_implemented("Reading from unicode text streams by character count"); + } + #endif + byte *buf = m_new(byte, sz); int error; machine_int_t out_sz = o->type->stream_p->read(o, buf, sz, &error); diff --git a/py/unicode.c b/py/unicode.c index 88f835131d..a91e08078e 100644 --- a/py/unicode.c +++ b/py/unicode.c @@ -65,14 +65,65 @@ STATIC const uint8_t attr[] = { AT_LO, AT_LO, AT_LO, AT_PR, AT_PR, AT_PR, AT_PR, 0 }; +// TODO: Rename to str_get_char unichar utf8_get_char(const byte *s) { +#if MICROPY_PY_BUILTINS_STR_UNICODE + unichar ord = *s++; + if (!UTF8_IS_NONASCII(ord)) return ord; + ord &= 0x7F; + for (unichar mask = 0x40; ord & mask; mask >>= 1) { + ord &= ~mask; + } + while (UTF8_IS_CONT(*s)) { + ord = (ord << 6) | (*s++ & 0x3F); + } + return ord; +#else return *s; +#endif } +// TODO: Rename to str_next_char const byte *utf8_next_char(const byte *s) { +#if MICROPY_PY_BUILTINS_STR_UNICODE + ++s; + while (UTF8_IS_CONT(*s)) { + ++s; + } + return s; +#else return s + 1; +#endif } +machine_uint_t utf8_ptr_to_index(const char *s, const char *ptr) { + machine_uint_t i = 0; + while (ptr > s) { + if (!UTF8_IS_CONT(*--ptr)) { + i++; + } + } + + return i; +} + +// TODO: Rename to str_charlen; return machine_uint_t +uint unichar_charlen(const char *str, uint len) +{ +#if MICROPY_PY_BUILTINS_STR_UNICODE + uint charlen = 0; + for (const char *top = str + len; str < top; ++str) { + if (!UTF8_IS_CONT(*str)) { + ++charlen; + } + } + return charlen; +#else + return len; +#endif +} + +// Be aware: These unichar_is* functions are actually ASCII-only! bool unichar_isspace(unichar c) { return c < 128 && (attr[c] & FL_SPACE) != 0; } diff --git a/py/unicode.h b/py/unicode.h new file mode 100644 index 0000000000..2468b2fecf --- /dev/null +++ b/py/unicode.h @@ -0,0 +1 @@ +machine_uint_t utf8_ptr_to_index(const byte *s, const byte *ptr); diff --git a/py/vstr.c b/py/vstr.c index f8b7e4dabc..9ccc95d49b 100644 --- a/py/vstr.c +++ b/py/vstr.c @@ -199,12 +199,48 @@ void vstr_add_byte(vstr_t *vstr, byte b) { } void vstr_add_char(vstr_t *vstr, unichar c) { - // TODO UNICODE +#if MICROPY_PY_BUILTINS_STR_UNICODE + // TODO: Can this be simplified and deduplicated? + // Is it worth just calling vstr_add_len(vstr, 4)? + if (c < 0x80) { + byte *buf = (byte*)vstr_add_len(vstr, 1); + if (buf == NULL) { + return; + } + *buf = (byte)c; + } else if (c < 0x800) { + byte *buf = (byte*)vstr_add_len(vstr, 2); + if (buf == NULL) { + return; + } + buf[0] = (c >> 6) | 0xC0; + buf[1] = (c & 0x3F) | 0x80; + } else if (c < 0x10000) { + byte *buf = (byte*)vstr_add_len(vstr, 3); + if (buf == NULL) { + return; + } + buf[0] = (c >> 12) | 0xE0; + buf[1] = ((c >> 6) & 0x3F) | 0x80; + buf[2] = (c & 0x3F) | 0x80; + } else { + assert(c < 0x110000); + byte *buf = (byte*)vstr_add_len(vstr, 4); + if (buf == NULL) { + return; + } + buf[0] = (c >> 18) | 0xF0; + buf[1] = ((c >> 12) & 0x3F) | 0x80; + buf[2] = ((c >> 6) & 0x3F) | 0x80; + buf[3] = (c & 0x3F) | 0x80; + } +#else byte *buf = (byte*)vstr_add_len(vstr, 1); if (buf == NULL) { return; } buf[0] = c; +#endif } void vstr_add_str(vstr_t *vstr, const char *str) { diff --git a/stmhal/mpconfigport.h b/stmhal/mpconfigport.h index 28cd90bb01..f5110d8f0a 100644 --- a/stmhal/mpconfigport.h +++ b/stmhal/mpconfigport.h @@ -44,6 +44,7 @@ */ #define MICROPY_ENABLE_LFN (1) #define MICROPY_LFN_CODE_PAGE (437) /* 1=SFN/ANSI 437=LFN/U.S.(OEM) */ +#define MICROPY_PY_BUILTINS_STR_UNICODE (0) #define MICROPY_PY_BUILTINS_FROZENSET (1) #define MICROPY_PY_SYS_EXIT (1) #define MICROPY_PY_SYS_STDFILES (1) diff --git a/tests/run-tests b/tests/run-tests index 8f5f7d470d..c6bc4020d4 100755 --- a/tests/run-tests +++ b/tests/run-tests @@ -88,10 +88,10 @@ def run_tests(pyb, tests): rm_f(filename_expected) rm_f(filename_mupy) else: - with open(filename_expected, "w") as f: - f.write(str(output_expected, "ascii")) - with open(filename_mupy, "w") as f: - f.write(str(output_mupy, "ascii")) + with open(filename_expected, "wb") as f: + f.write(output_expected) + with open(filename_mupy, "wb") as f: + f.write(output_mupy) print("FAIL ", test_file) failed_tests.append(test_name) diff --git a/tests/unicode/data/utf-8_1.txt b/tests/unicode/data/utf-8_1.txt new file mode 100644 index 0000000000..d84c480d1d --- /dev/null +++ b/tests/unicode/data/utf-8_1.txt @@ -0,0 +1 @@ +Привет diff --git a/tests/unicode/file1.py b/tests/unicode/file1.py new file mode 100644 index 0000000000..554e886743 --- /dev/null +++ b/tests/unicode/file1.py @@ -0,0 +1,4 @@ +f = open("unicode/data/utf-8_1.txt") +l = f.readline() +print(l) +print(len(l)) diff --git a/tests/unicode/unicode.py b/tests/unicode/unicode.py new file mode 100644 index 0000000000..c7e523f06a --- /dev/null +++ b/tests/unicode/unicode.py @@ -0,0 +1,18 @@ +# Test a UTF-8 encoded literal +s = "asdf©qwer" +for i in range(len(s)): + print("s[%d]: %s %X"%(i, s[i], ord(s[i]))) + +# Test all three forms of Unicode escape, and +# all blocks of UTF-8 byte patterns +s = "a\xA9\xFF\u0123\u0800\uFFEE\U0001F44C" +for i in range(-len(s), len(s)): + print("s[%d]: %s %X"%(i, s[i], ord(s[i]))) + print("s[:%d]: %d chars, '%s'"%(i, len(s[:i]), s[:i])) + for j in range(i, len(s)): + print("s[%d:%d]: %d chars, '%s'"%(i, j, len(s[i:j]), s[i:j])) + print("s[%d:]: %d chars, '%s'"%(i, len(s[i:]), s[i:])) + +# Test UTF-8 encode and decode +enc = s.encode() +print(enc, enc.decode() == s) diff --git a/tests/unicode/unicode_index.py b/tests/unicode/unicode_index.py new file mode 100644 index 0000000000..3c31468a41 --- /dev/null +++ b/tests/unicode/unicode_index.py @@ -0,0 +1,6 @@ +print("Привет".find("т")) +print("Привет".find("П")) +print("Привет".rfind("т")) +print("Привет".rfind("П")) +print("Привет".index("т")) +print("Привет".index("П")) diff --git a/tests/unicode/unicode_iter.py b/tests/unicode/unicode_iter.py new file mode 100644 index 0000000000..f08a4aceed --- /dev/null +++ b/tests/unicode/unicode_iter.py @@ -0,0 +1,4 @@ +for c in "Hello": + print(c) +for c in "Привет": + print(c) diff --git a/tests/unicode/unicode_pos.py b/tests/unicode/unicode_pos.py new file mode 100644 index 0000000000..6a5982920a --- /dev/null +++ b/tests/unicode/unicode_pos.py @@ -0,0 +1,5 @@ +# str methods with explicit start/end pos +print("Привет".startswith("П")) +print("Привет".startswith("р", 1)) +print("абвба".find("а", 1)) +print("абвба".find("а", 1, -1)) diff --git a/unix/mpconfigport.h b/unix/mpconfigport.h index 1559bdb359..763b34ba3e 100644 --- a/unix/mpconfigport.h +++ b/unix/mpconfigport.h @@ -41,6 +41,7 @@ #define MICROPY_LONGINT_IMPL (MICROPY_LONGINT_IMPL_MPZ) #define MICROPY_STREAMS_NON_BLOCK (1) #define MICROPY_OPT_COMPUTED_GOTO (1) +#define MICROPY_PY_BUILTINS_STR_UNICODE (0) #define MICROPY_PY_BUILTINS_FROZENSET (1) #define MICROPY_PY_SYS_EXIT (1) #define MICROPY_PY_SYS_PLATFORM "linux"