diff --git a/extmod/modure.c b/extmod/modure.c index 8478019663..a368ee8fac 100644 --- a/extmod/modure.c +++ b/extmod/modure.c @@ -178,6 +178,35 @@ STATIC mp_obj_t ure_exec(bool is_anchored, uint n_args, const mp_obj_t *args) { size_t len; subj.begin = mp_obj_str_get_data(args[1], &len); subj.end = subj.begin + len; +#if MICROPY_PY_URE_MATCH_SPAN_START_END + /* Optional pos (args[2]) and endpos (args[3]) narrow the subject span [subj.begin, subj.end). */ if (n_args > 2) { + const mp_obj_type_t *self_type = mp_obj_get_type(args[1]); + mp_int_t str_len = MP_OBJ_SMALL_INT_VALUE(mp_obj_len_maybe(args[1])); + const byte *begin = (const byte *)subj.begin; + + int pos = mp_obj_get_int(args[2]); + /* A start at or beyond str_len returns None; a negative start is clamped to 0. */ if (pos >= str_len) { + return mp_const_none; + } + if (pos < 0) { + pos = 0; + } + const byte *pos_ptr = str_index_to_ptr(self_type, begin, len, MP_OBJ_NEW_SMALL_INT(pos), true); + + const byte *endpos_ptr = (const byte *)subj.end; + if (n_args > 3) { + int endpos = mp_obj_get_int(args[3]); + /* An end at or before the (clamped) start returns None. */ if (endpos <= pos) { + return mp_const_none; + } + // Will cap to length + endpos_ptr = str_index_to_ptr(self_type, begin, len, args[3], true); + } + + subj.begin = (const char *)pos_ptr; + subj.end = (const char *)endpos_ptr; + } +#endif int caps_num = (self->re.sub + 1) * 2; mp_obj_match_t *match = m_new_obj_var(mp_obj_match_t, char*, caps_num); // cast is a workaround for a bug in msvc: it treats const char** as a const pointer instead of a pointer to pointer to const char diff --git a/py/objstr.c b/py/objstr.c index 7236d97727..ebd11d5cc2 100644 --- a/py/objstr.c +++ b/py/objstr.c @@ -408,6 +408,15 @@ mp_obj_t mp_obj_str_binary_op(mp_binary_op_t op, mp_obj_t lhs_in, mp_obj_t rhs_i #if !MICROPY_PY_BUILTINS_STR_UNICODE // objstrunicode defines own version +/* Convert an offset into an index for the build without unicode strings: the offset is returned unchanged. Calls mp_raise_ValueError ("offset out of bounds") when offset > self_len. The type and self_data parameters are not used by this version. */ size_t str_offset_to_index(const mp_obj_type_t *type, const byte *self_data, size_t self_len, + size_t offset) { + if (offset > self_len) { + mp_raise_ValueError(translate("offset out of bounds")); + } + + return offset; +} + const byte *str_index_to_ptr(const mp_obj_type_t *type, const byte *self_data, size_t self_len, mp_obj_t 
index, bool is_slice) { size_t index_val = mp_get_index(type, self_len, index, is_slice); diff --git a/py/objstr.h b/py/objstr.h index 8951304461..0efe62a801 100644 --- a/py/objstr.h +++ b/py/objstr.h @@ -71,6 +71,8 @@ mp_obj_t mp_obj_new_str_of_type(const mp_obj_type_t *type, const byte* data, siz mp_obj_t mp_obj_str_binary_op(mp_binary_op_t op, mp_obj_t lhs_in, mp_obj_t rhs_in); mp_int_t mp_obj_str_get_buffer(mp_obj_t self_in, mp_buffer_info_t *bufinfo, mp_uint_t flags); +/* Convert an offset within self_data (at most self_len) to an index; a definition for the unicode build is added in objstrunicode.c by this same patch. */ size_t str_offset_to_index(const mp_obj_type_t *type, const byte *self_data, size_t self_len, + size_t offset); const byte *str_index_to_ptr(const mp_obj_type_t *type, const byte *self_data, size_t self_len, mp_obj_t index, bool is_slice); const byte *find_subbytes(const byte *haystack, size_t hlen, const byte *needle, size_t nlen, int direction); diff --git a/py/objstrunicode.c b/py/objstrunicode.c index 03106f9873..30000a51e7 100644 --- a/py/objstrunicode.c +++ b/py/objstrunicode.c @@ -112,6 +112,26 @@ STATIC mp_obj_t uni_unary_op(mp_unary_op_t op, mp_obj_t self_in) { } } +/* Convert a byte offset into self_data to an index. Calls mp_raise_ValueError ("offset out of bounds") when offset > self_len. When type is mp_type_bytes the offset is returned as-is; otherwise the result is the count of bytes among the first `offset` bytes for which UTF8_IS_CONT is false. */ size_t str_offset_to_index(const mp_obj_type_t *type, const byte *self_data, size_t self_len, + size_t offset) { + if (offset > self_len) { + mp_raise_ValueError(translate("offset out of bounds")); + } + + if (type == &mp_type_bytes) { + return offset; + } + + size_t index_val = 0; + const byte *s = self_data; + for (size_t i = 0; i < offset; i++, s++) { + /* Count only bytes that UTF8_IS_CONT rejects (presumably the lead byte of each character - confirm against the macro). */ if (!UTF8_IS_CONT(*s)) { + ++index_val; + } + } + return index_val; +} + // Convert an index into a pointer to its lead byte. Out of bounds indexing will raise IndexError or // be capped to the first/last character of the string, depending on is_slice. const byte *str_index_to_ptr(const mp_obj_type_t *type, const byte *self_data, size_t self_len,