py/unicode: Clean up utf8 funcs and provide non-utf8 inline versions.
This patch provides inline versions of the utf8 helper functions for the case when unicode is disabled (MICROPY_PY_BUILTINS_STR_UNICODE set to 0). This saves code size. The unichar_charlen function is also renamed to utf8_charlen to match the other utf8 helper functions, and the signature of this function is adjusted for consistency (const char* -> const byte*, mp_uint_t -> size_t).
This commit is contained in:
parent
49e0dd54e6
commit
19aee9438a
@ -121,8 +121,15 @@ typedef uint32_t unichar;
|
|||||||
typedef uint unichar;
|
typedef uint unichar;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#if MICROPY_PY_BUILTINS_STR_UNICODE
|
||||||
unichar utf8_get_char(const byte *s);
|
unichar utf8_get_char(const byte *s);
|
||||||
const byte *utf8_next_char(const byte *s);
|
const byte *utf8_next_char(const byte *s);
|
||||||
|
size_t utf8_charlen(const byte *str, size_t len);
|
||||||
|
#else
|
||||||
|
// Variant used when MICROPY_PY_BUILTINS_STR_UNICODE is disabled: no
// multi-byte decoding is done, the byte that s points at is returned as
// the character value.
static inline unichar utf8_get_char(const byte *s) {
    return s[0];
}
|
||||||
|
// Variant used when MICROPY_PY_BUILTINS_STR_UNICODE is disabled: the next
// character always starts exactly one byte after s.
static inline const byte *utf8_next_char(const byte *s) {
    return &s[1];
}
|
||||||
|
// Variant used when MICROPY_PY_BUILTINS_STR_UNICODE is disabled: the
// character count is simply the byte length, so the data is never read.
static inline size_t utf8_charlen(const byte *str, size_t len) {
    (void)str; // unused in this variant; the cast silences unused-parameter warnings
    return len;
}
|
||||||
|
#endif
|
||||||
|
|
||||||
bool unichar_isspace(unichar c);
|
bool unichar_isspace(unichar c);
|
||||||
bool unichar_isalpha(unichar c);
|
bool unichar_isalpha(unichar c);
|
||||||
@ -135,7 +142,6 @@ bool unichar_islower(unichar c);
|
|||||||
unichar unichar_tolower(unichar c);
|
unichar unichar_tolower(unichar c);
|
||||||
unichar unichar_toupper(unichar c);
|
unichar unichar_toupper(unichar c);
|
||||||
mp_uint_t unichar_xdigit_value(unichar c);
|
mp_uint_t unichar_xdigit_value(unichar c);
|
||||||
mp_uint_t unichar_charlen(const char *str, mp_uint_t len);
|
|
||||||
#define UTF8_IS_NONASCII(ch) ((ch) & 0x80)
|
#define UTF8_IS_NONASCII(ch) ((ch) & 0x80)
|
||||||
#define UTF8_IS_CONT(ch) (((ch) & 0xC0) == 0x80)
|
#define UTF8_IS_CONT(ch) (((ch) & 0xC0) == 0x80)
|
||||||
|
|
||||||
|
@ -346,7 +346,7 @@ STATIC mp_obj_t mp_builtin_ord(mp_obj_t o_in) {
|
|||||||
const char *str = mp_obj_str_get_data(o_in, &len);
|
const char *str = mp_obj_str_get_data(o_in, &len);
|
||||||
#if MICROPY_PY_BUILTINS_STR_UNICODE
|
#if MICROPY_PY_BUILTINS_STR_UNICODE
|
||||||
if (MP_OBJ_IS_STR(o_in)) {
|
if (MP_OBJ_IS_STR(o_in)) {
|
||||||
len = unichar_charlen(str, len);
|
len = utf8_charlen((const byte*)str, len);
|
||||||
if (len == 1) {
|
if (len == 1) {
|
||||||
return mp_obj_new_int(utf8_get_char((const byte*)str));
|
return mp_obj_new_int(utf8_get_char((const byte*)str));
|
||||||
}
|
}
|
||||||
|
@ -1704,7 +1704,7 @@ STATIC mp_obj_t str_count(size_t n_args, const mp_obj_t *args) {
|
|||||||
|
|
||||||
// if needle_len is zero then we count each gap between characters as an occurrence
|
// if needle_len is zero then we count each gap between characters as an occurrence
|
||||||
if (needle_len == 0) {
|
if (needle_len == 0) {
|
||||||
return MP_OBJ_NEW_SMALL_INT(unichar_charlen((const char*)start, end - start) + 1);
|
return MP_OBJ_NEW_SMALL_INT(utf8_charlen(start, end - start) + 1);
|
||||||
}
|
}
|
||||||
|
|
||||||
// count the occurrences
|
// count the occurrences
|
||||||
|
@ -104,7 +104,7 @@ STATIC mp_obj_t uni_unary_op(mp_unary_op_t op, mp_obj_t self_in) {
|
|||||||
case MP_UNARY_OP_BOOL:
|
case MP_UNARY_OP_BOOL:
|
||||||
return mp_obj_new_bool(str_len != 0);
|
return mp_obj_new_bool(str_len != 0);
|
||||||
case MP_UNARY_OP_LEN:
|
case MP_UNARY_OP_LEN:
|
||||||
return MP_OBJ_NEW_SMALL_INT(unichar_charlen((const char *)str_data, str_len));
|
return MP_OBJ_NEW_SMALL_INT(utf8_charlen(str_data, str_len));
|
||||||
default:
|
default:
|
||||||
return MP_OBJ_NULL; // op not supported
|
return MP_OBJ_NULL; // op not supported
|
||||||
}
|
}
|
||||||
|
29
py/unicode.c
29
py/unicode.c
@ -67,9 +67,9 @@ STATIC const uint8_t attr[] = {
|
|||||||
AT_LO, AT_LO, AT_LO, AT_PR, AT_PR, AT_PR, AT_PR, 0
|
AT_LO, AT_LO, AT_LO, AT_PR, AT_PR, AT_PR, AT_PR, 0
|
||||||
};
|
};
|
||||||
|
|
||||||
// TODO: Rename to str_get_char
|
|
||||||
unichar utf8_get_char(const byte *s) {
|
|
||||||
#if MICROPY_PY_BUILTINS_STR_UNICODE
|
#if MICROPY_PY_BUILTINS_STR_UNICODE
|
||||||
|
|
||||||
|
unichar utf8_get_char(const byte *s) {
|
||||||
unichar ord = *s++;
|
unichar ord = *s++;
|
||||||
if (!UTF8_IS_NONASCII(ord)) return ord;
|
if (!UTF8_IS_NONASCII(ord)) return ord;
|
||||||
ord &= 0x7F;
|
ord &= 0x7F;
|
||||||
@ -80,22 +80,14 @@ unichar utf8_get_char(const byte *s) {
|
|||||||
ord = (ord << 6) | (*s++ & 0x3F);
|
ord = (ord << 6) | (*s++ & 0x3F);
|
||||||
}
|
}
|
||||||
return ord;
|
return ord;
|
||||||
#else
|
|
||||||
return *s;
|
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// TODO: Rename to str_next_char
|
|
||||||
const byte *utf8_next_char(const byte *s) {
|
const byte *utf8_next_char(const byte *s) {
|
||||||
#if MICROPY_PY_BUILTINS_STR_UNICODE
|
|
||||||
++s;
|
++s;
|
||||||
while (UTF8_IS_CONT(*s)) {
|
while (UTF8_IS_CONT(*s)) {
|
||||||
++s;
|
++s;
|
||||||
}
|
}
|
||||||
return s;
|
return s;
|
||||||
#else
|
|
||||||
return s + 1;
|
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
|
|
||||||
mp_uint_t utf8_ptr_to_index(const byte *s, const byte *ptr) {
|
mp_uint_t utf8_ptr_to_index(const byte *s, const byte *ptr) {
|
||||||
@ -109,21 +101,18 @@ mp_uint_t utf8_ptr_to_index(const byte *s, const byte *ptr) {
|
|||||||
return i;
|
return i;
|
||||||
}
|
}
|
||||||
|
|
||||||
// TODO: Rename to str_charlen
|
size_t utf8_charlen(const byte *str, size_t len) {
|
||||||
mp_uint_t unichar_charlen(const char *str, mp_uint_t len) {
|
size_t charlen = 0;
|
||||||
#if MICROPY_PY_BUILTINS_STR_UNICODE
|
for (const byte *top = str + len; str < top; ++str) {
|
||||||
mp_uint_t charlen = 0;
|
|
||||||
for (const char *top = str + len; str < top; ++str) {
|
|
||||||
if (!UTF8_IS_CONT(*str)) {
|
if (!UTF8_IS_CONT(*str)) {
|
||||||
++charlen;
|
++charlen;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return charlen;
|
return charlen;
|
||||||
#else
|
|
||||||
return len;
|
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
// Be aware: These unichar_is* functions are actually ASCII-only!
|
// Be aware: These unichar_is* functions are actually ASCII-only!
|
||||||
bool unichar_isspace(unichar c) {
|
bool unichar_isspace(unichar c) {
|
||||||
return c < 128 && (attr[c] & FL_SPACE) != 0;
|
return c < 128 && (attr[c] & FL_SPACE) != 0;
|
||||||
@ -183,6 +172,8 @@ mp_uint_t unichar_xdigit_value(unichar c) {
|
|||||||
return n;
|
return n;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#if MICROPY_PY_BUILTINS_STR_UNICODE
|
||||||
|
|
||||||
bool utf8_check(const byte *p, size_t len) {
|
bool utf8_check(const byte *p, size_t len) {
|
||||||
uint8_t need = 0;
|
uint8_t need = 0;
|
||||||
const byte *end = p + len;
|
const byte *end = p + len;
|
||||||
@ -210,3 +201,5 @@ bool utf8_check(const byte *p, size_t len) {
|
|||||||
}
|
}
|
||||||
return need == 0; // no pending fragments allowed
|
return need == 0; // no pending fragments allowed
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#endif
|
||||||
|
Loading…
x
Reference in New Issue
Block a user