extmod/modure: Convert byte offsets to unicode indices when necessary.
And add a test. Fixes issue #9202. Signed-off-by: Jeff Epler <jepler@gmail.com>
This commit is contained in:
parent
719dbbf563
commit
e90b85cc98
@ -33,6 +33,10 @@
|
|||||||
#include "py/objstr.h"
|
#include "py/objstr.h"
|
||||||
#include "py/stackctrl.h"
|
#include "py/stackctrl.h"
|
||||||
|
|
||||||
|
#if MICROPY_PY_BUILTINS_STR_UNICODE
|
||||||
|
#include "py/unicode.h"
|
||||||
|
#endif
|
||||||
|
|
||||||
#if MICROPY_PY_URE
|
#if MICROPY_PY_URE
|
||||||
|
|
||||||
#define re1_5_stack_chk() MP_STACK_CHECK()
|
#define re1_5_stack_chk() MP_STACK_CHECK()
|
||||||
@ -121,6 +125,18 @@ STATIC void match_span_helper(size_t n_args, const mp_obj_t *args, mp_obj_t span
|
|||||||
e = self->caps[no * 2 + 1] - begin;
|
e = self->caps[no * 2 + 1] - begin;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#if MICROPY_PY_BUILTINS_STR_UNICODE
|
||||||
|
if (mp_obj_get_type(self->str) == &mp_type_str) {
|
||||||
|
const byte *begin = (const byte *)mp_obj_str_get_str(self->str);
|
||||||
|
if (s != -1) {
|
||||||
|
s = utf8_ptr_to_index(begin, begin + s);
|
||||||
|
}
|
||||||
|
if (e != -1) {
|
||||||
|
e = utf8_ptr_to_index(begin, begin + e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
span[0] = mp_obj_new_int(s);
|
span[0] = mp_obj_new_int(s);
|
||||||
span[1] = mp_obj_new_int(e);
|
span[1] = mp_obj_new_int(e);
|
||||||
}
|
}
|
||||||
|
32
tests/unicode/unicode_ure.py
Normal file
32
tests/unicode/unicode_ure.py
Normal file
@ -0,0 +1,32 @@
|
|||||||
|
# test match.span() for unicode strings
|
||||||
|
|
||||||
|
# Prefer the ``ure`` module; fall back to ``re`` when it cannot be imported.
# If neither module is available, print SKIP and exit.
try:
    import ure as re
except ImportError:
    try:
        import re
    except ImportError:
        print("SKIP")
        raise SystemExit
|
||||||
|
|
||||||
|
# Probe for the ``span`` attribute on a match object; when it is missing,
# print SKIP and exit instead of running the rest of the test.
try:
    m = re.match(".", "a")
    m.span
except AttributeError:
    print("SKIP")
    raise SystemExit
|
||||||
|
|
||||||
|
|
||||||
|
def print_spans(match):
    """Print ``span``, ``start`` and ``end`` for every group of *match*.

    A ``----`` separator line is printed first, then one line per group
    index starting at 0 (the whole match). Returns ``None``.
    """
    print("----")
    try:
        i = 0
        # Walk group indices upwards; the loop ends when one of the
        # accessors raises IndexError for an index with no such group.
        while True:
            print(match.span(i), match.start(i), match.end(i))
            i += 1
    except IndexError:
        pass
|
||||||
|
|
||||||
|
|
||||||
|
# The subject string contains U+2764, a non-ASCII character that takes more
# than one byte in UTF-8.
m = re.match(r"([0-9]*)(([a-z]*)([0-9]*))", "1234\u2764567")
print_spans(m)
|
Loading…
Reference in New Issue
Block a user