From c71e045165bffbbe085e407008feba3cdfda3298 Mon Sep 17 00:00:00 2001 From: Paul Sokolovsky Date: Fri, 12 Sep 2014 18:48:07 +0300 Subject: [PATCH 1/3] modure: Initial module, using re1.5 (which is based on re1 codebase). https://github.com/pfalcon/re1.5 --- extmod/modure.c | 241 ++++++++++++++++++++++++++++++++++++++++++++ py/builtin.h | 1 + py/builtintables.c | 3 + py/mpconfig.h | 4 + py/py.mk | 7 ++ py/qstrdefs.h | 9 ++ unix/mpconfigport.h | 1 + 7 files changed, 266 insertions(+) create mode 100644 extmod/modure.c diff --git a/extmod/modure.c b/extmod/modure.c new file mode 100644 index 0000000000..b6560599ae --- /dev/null +++ b/extmod/modure.c @@ -0,0 +1,241 @@ +/* + * This file is part of the Micro Python project, http://micropython.org/ + * + * The MIT License (MIT) + * + * Copyright (c) 2014 Paul Sokolovsky + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */
+
+#include <stdio.h>
+#include <assert.h>
+#include <string.h>
+
+#include "mpconfig.h"
+#include "nlr.h"
+#include "misc.h"
+#include "qstr.h"
+#include "obj.h"
+#include "runtime.h"
+#include "binary.h"
+
+#if MICROPY_PY_URE
+
+#include "re1.5/regexp.h"
+
+#define FLAG_DEBUG 0x1000
+
+// Compiled regex object: header followed by the re1.5 bytecode program.
+typedef struct _mp_obj_re_t {
+    mp_obj_base_t base;
+    ByteProg re;
+} mp_obj_re_t;
+
+// Match object: caps[] holds num_matches pointers into the subject string,
+// stored as (start, end) pairs, one pair per capture group.
+typedef struct _mp_obj_match_t {
+    mp_obj_base_t base;
+    int num_matches;
+    mp_obj_t str;
+    const char *caps[0];
+} mp_obj_match_t;
+
+
+STATIC void match_print(void (*print)(void *env, const char *fmt, ...), void *env, mp_obj_t self_in, mp_print_kind_t kind) {
+    mp_obj_match_t *self = self_in;
+    print(env, "<match num=%d>", self->num_matches);
+}
+
+// match.group(no): return the text of capture group `no` (0 = whole match),
+// or None for a group that did not take part in the match.
+// Raises IndexError when `no` is outside the groups of the pattern.
+STATIC mp_obj_t match_group(mp_obj_t self_in, mp_obj_t no_in) {
+    mp_obj_match_t *self = self_in;
+    // Same int conversion helper that mod_re_compile() uses for its flags argument.
+    mp_int_t no = mp_obj_get_int(no_in);
+    if (no < 0 || no >= self->num_matches / 2) {
+        nlr_raise(mp_obj_new_exception_arg1(&mp_type_IndexError, no_in));
+    }
+
+    const char *start = self->caps[no * 2];
+    if (start == NULL) {
+        // re_exec() zeroes the capture slots, so NULL means the group never matched
+        return mp_const_none;
+    }
+    return mp_obj_new_str(start, self->caps[no * 2 + 1] - start, false);
+}
+MP_DEFINE_CONST_FUN_OBJ_2(match_group_obj, match_group);
+
+STATIC const mp_map_elem_t match_locals_dict_table[] = {
+    { MP_OBJ_NEW_QSTR(MP_QSTR_group), (mp_obj_t) &match_group_obj },
+};
+
+STATIC MP_DEFINE_CONST_DICT(match_locals_dict, match_locals_dict_table);
+
+STATIC const mp_obj_type_t match_type = {
+    { &mp_type_type },
+    .name = MP_QSTR_match,
+    .print = match_print,
+    .locals_dict = (mp_obj_t)&match_locals_dict,
+};
+
+STATIC void re_print(void (*print)(void *env, const char *fmt, ...), void *env, mp_obj_t self_in, mp_print_kind_t kind) {
+    mp_obj_re_t *self = self_in;
+    print(env, "<re %p>", self);
+}
+
+// Shared implementation of regex.match()/regex.search(): run the compiled
+// program in args[0] over the string in args[1]. Returns a new match object,
+// or None when the program does not match.
+STATIC mp_obj_t re_exec(bool is_anchored, uint n_args, const mp_obj_t *args) {
+    mp_obj_re_t *self = args[0];
+    Subject subj;
+    mp_uint_t len;
+    subj.begin = mp_obj_str_get_data(args[1], &len);
+    subj.end = subj.begin + len;
+    int caps_num = (self->re.sub + 1) * 2;
+    mp_obj_match_t *match = m_new_obj_var(mp_obj_match_t, char*, caps_num);
+    // The matcher only writes the slots of groups it actually passes through;
+    // clear the rest so match_group() can tell "unmatched" from a real pointer.
+    memset(match->caps, 0, caps_num * sizeof(match->caps[0]));
+    int res = re1_5_recursiveloopprog(&self->re, &subj, match->caps, caps_num, is_anchored);
+    if (res == 0) {
+        m_del_var(mp_obj_match_t, char*, caps_num, match);
+        return mp_const_none;
+    }
+
+    match->base.type = &match_type;
+    match->num_matches = caps_num;
+    match->str = args[1];
+    return match;
+}
+
+STATIC mp_obj_t re_match(uint n_args, const mp_obj_t *args) {
+    return re_exec(true, n_args, args);
+}
+MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(re_match_obj, 2, 4, re_match);
+
+STATIC mp_obj_t re_search(uint n_args, const mp_obj_t *args) {
+    return re_exec(false, n_args, args);
+}
+MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(re_search_obj, 2, 4, re_search);
+
+// regex.split(string[, maxsplit]): return a list of the pieces of `string`
+// between successive non-empty matches. A maxsplit > 0 limits the number of
+// splits; patterns with capture groups are not supported.
+STATIC mp_obj_t re_split(uint n_args, const mp_obj_t *args) {
+    mp_obj_re_t *self = args[0];
+    Subject subj;
+    mp_uint_t len;
+    subj.begin = mp_obj_str_get_data(args[1], &len);
+    subj.end = subj.begin + len;
+    int caps_num = (self->re.sub + 1) * 2;
+
+    int maxsplit = 0;
+    if (n_args > 2) {
+        // Same int conversion helper that mod_re_compile() uses for its flags argument.
+        maxsplit = mp_obj_get_int(args[2]);
+    }
+
+    mp_obj_t retval = mp_obj_new_list(0, NULL);
+    const char *caps[caps_num];
+    while (true) {
+        int res = re1_5_recursiveloopprog(&self->re, &subj, caps, caps_num, false);
+
+        // if we didn't have a match, or had an empty match, it's time to stop
+        if (!res || caps[0] == caps[1]) {
+            break;
+        }
+
+        mp_obj_t s = mp_obj_new_str(subj.begin, caps[0] - subj.begin, false);
+        mp_obj_list_append(retval, s);
+        if (self->re.sub > 0) {
+            mp_not_implemented("Splitting with sub-captures");
+        }
+        // continue searching right after the separator just found
+        subj.begin = caps[1];
+        if (maxsplit > 0 && --maxsplit == 0) {
+            break;
+        }
+    }
+
+    // whatever is left after the last separator is the final piece
+    mp_obj_t s = mp_obj_new_str(subj.begin, subj.end - subj.begin, false);
+    mp_obj_list_append(retval, s);
+    return retval;
+}
+MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(re_split_obj, 2, 3, re_split);
+
+STATIC const mp_map_elem_t re_locals_dict_table[] = {
+    { MP_OBJ_NEW_QSTR(MP_QSTR_match), (mp_obj_t) &re_match_obj },
+    { MP_OBJ_NEW_QSTR(MP_QSTR_search), (mp_obj_t) &re_search_obj },
+    { MP_OBJ_NEW_QSTR(MP_QSTR_split), (mp_obj_t) &re_split_obj },
+};
+
+STATIC MP_DEFINE_CONST_DICT(re_locals_dict, re_locals_dict_table);
+
+STATIC const mp_obj_type_t re_type = {
+    { &mp_type_type },
+    .name = MP_QSTR_ure,
+    .print = re_print,
+    .locals_dict = (mp_obj_t)&re_locals_dict,
+};
+
+// ure.compile(pattern[, flags]): size the bytecode, allocate the regex object
+// with that much trailing space and compile into it. With FLAG_DEBUG set in
+// flags the compiled program is dumped. Raises ValueError on a compile error.
+mp_obj_t mod_re_compile(uint n_args, const mp_obj_t *args) {
+    const char *re_str = mp_obj_str_get_str(args[0]);
+    int size = re1_5_sizecode(re_str);
+    mp_obj_re_t *o = m_new_obj_var(mp_obj_re_t, char, size);
+    o->base.type = &re_type;
+    int flags = 0;
+    if (n_args > 1) {
+        flags = mp_obj_get_int(args[1]);
+    }
+    int error = re1_5_compilecode(&o->re, re_str);
+    if (error != 0) {
+        nlr_raise(mp_obj_new_exception_msg(&mp_type_ValueError, "Error in regex"));
+    }
+    if (flags & FLAG_DEBUG) {
+        re1_5_dumpcode(&o->re);
+    }
+    return o;
+}
+MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(mod_re_compile_obj, 1, 2, mod_re_compile);
+
+// Module-level match()/search(): compile args[0] (without flags), then run it on args[1].
+STATIC mp_obj_t mod_re_exec(bool is_anchored, uint n_args, const mp_obj_t *args) {
+    mp_obj_re_t *self = mod_re_compile(1, args);
+
+    const mp_obj_t args2[] = {self, args[1]};
+    mp_obj_match_t *match = re_exec(is_anchored, 2, args2);
+    return match;
+}
+
+STATIC mp_obj_t mod_re_match(uint n_args, const mp_obj_t *args) {
+    return mod_re_exec(true, n_args, args);
+}
+MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(mod_re_match_obj, 2, 4, mod_re_match);
+
+STATIC mp_obj_t mod_re_search(uint n_args, const mp_obj_t *args) {
+    return mod_re_exec(false, n_args, args);
+}
+MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(mod_re_search_obj, 2, 4, mod_re_search);
+
+STATIC const mp_map_elem_t mp_module_re_globals_table[] = {
+    { MP_OBJ_NEW_QSTR(MP_QSTR___name__), MP_OBJ_NEW_QSTR(MP_QSTR_ure) },
+    { MP_OBJ_NEW_QSTR(MP_QSTR_compile), (mp_obj_t)&mod_re_compile_obj },
+    { MP_OBJ_NEW_QSTR(MP_QSTR_match), (mp_obj_t)&mod_re_match_obj },
+    { MP_OBJ_NEW_QSTR(MP_QSTR_search), (mp_obj_t)&mod_re_search_obj },
+    { MP_OBJ_NEW_QSTR(MP_QSTR_DEBUG), MP_OBJ_NEW_SMALL_INT(FLAG_DEBUG) },
+};
+
+STATIC const mp_obj_dict_t mp_module_re_globals = {
+    .base = 
{&mp_type_dict}, + .map = { + .all_keys_are_qstrs = 1, + .table_is_fixed_array = 1, + .used = MP_ARRAY_SIZE(mp_module_re_globals_table), + .alloc = MP_ARRAY_SIZE(mp_module_re_globals_table), + .table = (mp_map_elem_t*)mp_module_re_globals_table, + }, +}; + +const mp_obj_module_t mp_module_ure = { + .base = { &mp_type_module }, + .name = MP_QSTR_ure, + .globals = (mp_obj_dict_t*)&mp_module_re_globals, +}; + +#endif //MICROPY_PY_URE diff --git a/py/builtin.h b/py/builtin.h index 9c8b2b9be2..526157fbcd 100644 --- a/py/builtin.h +++ b/py/builtin.h @@ -90,3 +90,4 @@ extern struct _dummy_t mp_sys_stderr_obj; extern const mp_obj_module_t mp_module_uctypes; extern const mp_obj_module_t mp_module_zlibd; extern const mp_obj_module_t mp_module_ujson; +extern const mp_obj_module_t mp_module_ure; diff --git a/py/builtintables.c b/py/builtintables.c index 5712a3ef56..1fd60d8177 100644 --- a/py/builtintables.c +++ b/py/builtintables.c @@ -209,6 +209,9 @@ STATIC const mp_map_elem_t mp_builtin_module_table[] = { #if MICROPY_PY_UJSON { MP_OBJ_NEW_QSTR(MP_QSTR_ujson), (mp_obj_t)&mp_module_ujson }, #endif +#if MICROPY_PY_URE + { MP_OBJ_NEW_QSTR(MP_QSTR_ure), (mp_obj_t)&mp_module_ure }, +#endif // extra builtin modules as defined by a port MICROPY_PORT_BUILTIN_MODULES diff --git a/py/mpconfig.h b/py/mpconfig.h index bf868624d9..92b61d0e12 100644 --- a/py/mpconfig.h +++ b/py/mpconfig.h @@ -394,6 +394,10 @@ typedef double mp_float_t; #define MICROPY_PY_UJSON (0) #endif +#ifndef MICROPY_PY_URE +#define MICROPY_PY_URE (0) +#endif + /*****************************************************************************/ /* Hooks for a port to add builtins */ diff --git a/py/py.mk b/py/py.mk index fb59b89729..bd5e69598c 100644 --- a/py/py.mk +++ b/py/py.mk @@ -113,6 +113,10 @@ PY_O_BASENAME = \ ../extmod/moductypes.o \ ../extmod/modzlibd.o \ ../extmod/modujson.o \ + ../extmod/modure.o \ + ../extmod/re1.5/compilecode.o \ + ../extmod/re1.5/dumpcode.o \ + ../extmod/re1.5/recursiveloop.o \ # prepend the 
build destination prefix to the py object files PY_O = $(addprefix $(PY_BUILD)/, $(PY_O_BASENAME)) @@ -166,3 +170,6 @@ $(PY_BUILD)/gc.o: CFLAGS += $(CSUPEROPT) # optimising vm for speed, adds only a small amount to code size but makes a huge difference to speed (20% faster) $(PY_BUILD)/vm.o: CFLAGS += $(CSUPEROPT) +# Additional options for 3rd-party libs + +$(PY_BUILD)/../extmod/re1.5/recursiveloop.o: CFLAGS += "-Dre1_5_fatal(x)=assert(!x)" diff --git a/py/qstrdefs.h b/py/qstrdefs.h index 6a63f0780b..eb99e1b3d7 100644 --- a/py/qstrdefs.h +++ b/py/qstrdefs.h @@ -473,3 +473,12 @@ Q(ujson) Q(dumps) Q(loads) #endif + +#if MICROPY_PY_URE +Q(ure) +Q(compile) +Q(match) +Q(search) +Q(group) +Q(DEBUG) +#endif diff --git a/unix/mpconfigport.h b/unix/mpconfigport.h index 6ca0808628..ceeed74f7e 100644 --- a/unix/mpconfigport.h +++ b/unix/mpconfigport.h @@ -57,6 +57,7 @@ #define MICROPY_PY_UCTYPES (1) #define MICROPY_PY_ZLIBD (1) #define MICROPY_PY_UJSON (1) +#define MICROPY_PY_URE (1) // Define to MICROPY_ERROR_REPORTING_DETAILED to get function, etc. // names in exception messages (may require more RAM). From 5edbadefc1acbcae90d373bd49cd1e13f0f4332b Mon Sep 17 00:00:00 2001 From: Paul Sokolovsky Date: Sat, 11 Oct 2014 14:26:29 +0300 Subject: [PATCH 2/3] modure: Import needed files from re1.5 v0.5. https://github.com/pfalcon/re1.5 --- extmod/re1.5/compilecode.c | 213 +++++++++++++++++++++++++++++++++++ extmod/re1.5/dumpcode.c | 50 ++++++++ extmod/re1.5/recursiveloop.c | 71 ++++++++++++ extmod/re1.5/regexp.h | 138 +++++++++++++++++++++++ 4 files changed, 472 insertions(+) create mode 100644 extmod/re1.5/compilecode.c create mode 100644 extmod/re1.5/dumpcode.c create mode 100644 extmod/re1.5/recursiveloop.c create mode 100644 extmod/re1.5/regexp.h diff --git a/extmod/re1.5/compilecode.c b/extmod/re1.5/compilecode.c new file mode 100644 index 0000000000..5b5d28c2a0 --- /dev/null +++ b/extmod/re1.5/compilecode.c @@ -0,0 +1,213 @@ +// Copyright 2014 Paul Sokolovsky. 
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "regexp.h"
+
+// Open a gap of `num` bytes at offset `at` by moving code[at..*pc) up, and
+// advance *pc past the moved bytes.
+static void insert_code(char *code, int at, int num, int *pc)
+{
+    memmove(code + at + num, code + at, *pc - at);
+    *pc += num;
+}
+
+// Operand for a 2-byte jump instruction located at `at` whose target is `to`
+// (the offset is applied after the opcode and operand have been read).
+#define REL(at, to) ((to) - (at) - 2)
+
+// Return the number of bytecode bytes `re` compiles to, so the caller can
+// allocate ByteProg.insts before calling re1_5_compilecode().
+int re1_5_sizecode(const char *re)
+{
+    int pc = 5 + NON_ANCHORED_PREFIX; // Save 0, Save 1, Match; more bytes for "search" (vs "match") prefix code
+
+    for (; *re; re++) {
+        switch (*re) {
+        case '\\':
+            // Skip to the escaped char; a trailing backslash has none, so it
+            // is counted as a literal instead of stepping past the NUL.
+            if (re[1] != '\0')
+                re++;
+            // fallthrough
+        default:
+            pc += 2;
+            break;
+        case '+':
+            // Skip entire "+?"
+            if (re[1] == '?')
+                re++;
+            // fallthrough
+        case '?':
+            pc += 2;
+            break;
+        case '.':
+        case '^':
+        case '$':
+            pc++;
+            break;
+        case '*':
+            // Skip entire "*?"
+            if (re[1] == '?')
+                re++;
+            // fallthrough
+        case '|':
+        case '(':
+            pc += 4;
+            break;
+        case ')':
+            break;
+        }
+    }
+
+    return pc;
+}
+
+#define EMIT(at, byte) (code[at] = (byte))
+
+// Compile pattern text from `re` up to a closing ')' or the end of the string,
+// appending to prog->insts at prog->bytelen. Returns a pointer to the char
+// where compilation stopped (the ')' or the terminating NUL).
+const char *_compilecode(const char *re, ByteProg *prog)
+{
+    char *code = prog->insts;
+    int pc = prog->bytelen;
+    int start = pc;
+    int term = pc;
+    int alt_label = 0;
+
+    for (; *re && *re != ')'; re++) {
+        switch (*re) {
+        case '\\':
+            // Trailing backslash: nothing to escape, emit it as a literal
+            // rather than reading past the end of the pattern.
+            if (re[1] != '\0')
+                re++;
+            // fallthrough
+        default:
+            term = pc;
+            EMIT(pc++, Char);
+            EMIT(pc++, *re);
+            prog->len++;
+            break;
+        case '.':
+            term = pc;
+            EMIT(pc++, Any);
+            prog->len++;
+            break;
+        case '(': {
+            term = pc;
+            // Keep this group's own index: nested groups bump prog->sub
+            // before the closing Save below gets emitted.
+            int sub = ++prog->sub;
+
+            EMIT(pc++, Save);
+            EMIT(pc++, 2 * sub);
+            prog->len++;
+
+            prog->bytelen = pc;
+            re = _compilecode(re + 1, prog);
+            pc = prog->bytelen;
+
+            EMIT(pc++, Save);
+            EMIT(pc++, 2 * sub + 1);
+            prog->len++;
+
+            if (*re == '\0') {
+                // Unterminated group: step back so the loop's re++ lands on
+                // the NUL instead of running past the end of the pattern.
+                re--;
+            }
+            break;
+        }
+        case '?':
+            insert_code(code, term, 2, &pc);
+            EMIT(term, Split);
+            EMIT(term + 1, REL(term, pc));
+            prog->len++;
+            break;
+        case '*':
+            insert_code(code, term, 2, &pc);
+            EMIT(pc, Jmp);
+            EMIT(pc + 1, REL(pc, term));
+            pc += 2;
+            if (re[1] == '?') {
+                EMIT(term, RSplit);
+                re++;
+            } else {
+                EMIT(term, Split);
+            }
+            EMIT(term + 1, REL(term, pc));
+            prog->len += 2;
+            break;
+        case '+':
+            if (re[1] == '?') {
+                EMIT(pc, Split);
+                re++;
+            } else {
+                EMIT(pc, RSplit);
+            }
+            EMIT(pc + 1, REL(pc, term));
+            pc += 2;
+            prog->len++;
+            break;
+        case '|':
+            // alt_label is the operand slot of the previous alternative's Jmp
+            if (alt_label) {
+                EMIT(alt_label, REL(alt_label, pc) + 1);
+            }
+            insert_code(code, start, 2, &pc);
+            EMIT(pc++, Jmp);
+            alt_label = pc++;
+            EMIT(start, Split);
+            EMIT(start + 1, REL(start, pc));
+            prog->len += 2;
+            break;
+        case '^':
+            EMIT(pc++, Bol);
+            prog->len++;
+            break;
+        case '$':
+            EMIT(pc++, Eol);
+            prog->len++;
+            break;
+        }
+    }
+
+    if (alt_label) {
+        EMIT(alt_label, REL(alt_label, pc) + 1);
+    }
+    prog->bytelen = pc;
+    return re;
+}
+
+// Compile `re` into `prog`, whose insts[] must have room for at least
+// re1_5_sizecode(re) bytes. Always returns 0.
+int re1_5_compilecode(ByteProg *prog, const char *re)
+{
+    prog->len = 0;
+    prog->bytelen = 0;
+    prog->sub = 0;
+
+    // Add code to implement non-anchored operation ("search"),
+    // for anchored operation ("match"), this code will be just skipped.
+    // TODO: Implement search in much more efficient manner
+    prog->insts[prog->bytelen++] = RSplit;
+    prog->insts[prog->bytelen++] = 3;
+    prog->insts[prog->bytelen++] = Any;
+    prog->insts[prog->bytelen++] = Jmp;
+    prog->insts[prog->bytelen++] = -5;
+    prog->len += 3;
+
+    prog->insts[prog->bytelen++] = Save;
+    prog->insts[prog->bytelen++] = 0;
+    prog->len++;
+
+    _compilecode(re, prog);
+
+    prog->insts[prog->bytelen++] = Save;
+    prog->insts[prog->bytelen++] = 1;
+    prog->len++;
+
+    prog->insts[prog->bytelen++] = Match;
+    prog->len++;
+
+    return 0;
+}
+
+// Clear bit 7 of every opcode byte in the program, skipping the operand byte
+// of the instructions that carry one.
+void
+cleanmarks(ByteProg *prog)
+{
+    char *pc = prog->insts;
+    char *end = pc + prog->bytelen;
+    while (pc < end) {
+        *pc &= 0x7f;
+        switch (*pc) {
+        case Jmp:
+        case Split:
+        case RSplit:
+        case Save:
+        case Char:
+            pc++;
+        }
+        pc++;
+    }
+}
+
+#if 0
+int main(int argc, char *argv[])
+{
+    int pc = 0;
+    ByteProg *code = re1_5_compilecode(argv[1]);
+    re1_5_dumpcode(code);
+}
+#endif
diff --git a/extmod/re1.5/dumpcode.c b/extmod/re1.5/dumpcode.c
new file mode 100644
index 0000000000..b91ded03a6
--- /dev/null
+++ b/extmod/re1.5/dumpcode.c
@@ -0,0 +1,50 @@
+// Copyright 2014 Paul Sokolovsky.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "regexp.h"
+
+// Print a listing of prog's bytecode with printf(), one instruction per line
+// prefixed by its byte offset, followed by a byte/instruction count summary.
+void re1_5_dumpcode(ByteProg *prog)
+{
+    int pc = 0;
+    char *code = prog->insts;
+    while (pc < prog->bytelen) {
+        printf("%2d: ", pc);
+        switch(code[pc++]) {
+        default:
+            assert(0);
+//            re1_5_fatal("printprog");
+        case Split:
+            // pc is at the operand here; the printed target is relative to the byte after it
+            printf("split %d (%d)\n", pc + (signed char)code[pc] + 1, (signed char)code[pc]);
+            pc++;
+            break;
+        case RSplit:
+            printf("rsplit %d (%d)\n", pc + (signed char)code[pc] + 1, (signed char)code[pc]);
+            pc++;
+            break;
+        case Jmp:
+            printf("jmp %d (%d)\n", pc + (signed char)code[pc] + 1, (signed char)code[pc]);
+            pc++;
+            break;
+        case Char:
+            printf("char %c\n", code[pc++]);
+            break;
+        case Any:
+            printf("any\n");
+            break;
+        case Match:
+            printf("match\n");
+            break;
+        case Save:
+            printf("save %d\n", (unsigned char)code[pc++]);
+            break;
+        case Bol:
+            printf("assert bol\n");
+            break;
+        case Eol:
+            printf("assert eol\n");
+            break;
+        }
+    }
+    printf("Bytes: %d, insts: %d\n", prog->bytelen, prog->len);
+}
diff --git a/extmod/re1.5/recursiveloop.c b/extmod/re1.5/recursiveloop.c
new file mode 100644
index 0000000000..7b95eb4c95
--- /dev/null
+++ b/extmod/re1.5/recursiveloop.c
@@ -0,0 +1,71 @@
+// Copyright 2007-2009 Russ Cox. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "regexp.h"
+
+// Backtracking matcher: execute the bytecode at `pc` against the subject
+// starting at `sp`. Returns 1 when a Match instruction is reached, 0 when this
+// path fails. Save instructions record `sp` in subp[]; slot numbers >= nsubp
+// are skipped.
+static int
+recursiveloop(char *pc, const char *sp, Subject *input, const char **subp, int nsubp)
+{
+    const char *old;
+    int off;
+
+    for(;;) {
+        if(inst_is_consumer(*pc)) {
+            // If we need to match a character, but there's none left, it's fail
+            if(sp >= input->end)
+                return 0;
+        }
+        switch(*pc++) {
+        case Char:
+            if(*sp != *pc++)
+                return 0;
+            // on a matching char, falls through to Any to consume it
+        case Any:
+            sp++;
+            continue;
+        case Match:
+            return 1;
+        case Jmp:
+            off = (signed char)*pc++;
+            pc = pc + off;
+            continue;
+        case Split:
+            // try the fall-through path first, then the jump target
+            off = (signed char)*pc++;
+            if(recursiveloop(pc, sp, input, subp, nsubp))
+                return 1;
+            pc = pc + off;
+            continue;
+        case RSplit:
+            // try the jump target first, then the fall-through path
+            off = (signed char)*pc++;
+            if(recursiveloop(pc + off, sp, input, subp, nsubp))
+                return 1;
+            continue;
+        case Save:
+            off = (unsigned char)*pc++;
+            if(off >= nsubp) {
+                continue;
+            }
+            // record the position, and restore the old value if the rest fails
+            old = subp[off];
+            subp[off] = sp;
+            if(recursiveloop(pc, sp, input, subp, nsubp))
+                return 1;
+            subp[off] = old;
+            return 0;
+        case Bol:
+            if(sp != input->begin)
+                return 0;
+            continue;
+        case Eol:
+            if(sp != input->end)
+                return 0;
+            continue;
+        }
+        re1_5_fatal("recursiveloop");
+    }
+}
+
+// Entry point: run `prog` over `input` from input->begin. HANDLE_ANCHORED
+// picks the starting instruction according to is_anchored.
+int
+re1_5_recursiveloopprog(ByteProg *prog, Subject *input, const char **subp, int nsubp, int is_anchored)
+{
+    return recursiveloop(HANDLE_ANCHORED(prog->insts, is_anchored), input->begin, input, subp, nsubp);
+}
diff --git a/extmod/re1.5/regexp.h b/extmod/re1.5/regexp.h
new file mode 100644
index 0000000000..78492eb0f1
--- /dev/null
+++ b/extmod/re1.5/regexp.h
@@ -0,0 +1,138 @@
+// Copyright 2007-2009 Russ Cox. All Rights Reserved.
+// Copyright 2014 Paul Sokolovsky.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdarg.h>
+#include <assert.h>
+
+#define nil ((void*)0)
+#define nelem(x) (sizeof(x)/sizeof((x)[0]))
+
+typedef struct Regexp Regexp;
+typedef struct Prog Prog;
+typedef struct ByteProg ByteProg;
+typedef struct Inst Inst;
+typedef struct Subject Subject;
+
+struct Regexp
+{
+    int type;
+    int n;
+    int ch;
+    Regexp *left;
+    Regexp *right;
+};
+
+enum /* Regexp.type */
+{
+    Alt = 1,
+    Cat,
+    Lit,
+    Dot,
+    Paren,
+    Quest,
+    Star,
+    Plus,
+};
+
+Regexp *parse(char*);
+Regexp *reg(int type, Regexp *left, Regexp *right);
+void printre(Regexp*);
+#ifndef re1_5_fatal
+void re1_5_fatal(char*);
+#endif
+void *mal(int);
+
+struct Prog
+{
+    Inst *start;
+    int len;
+};
+
+struct ByteProg
+{
+    int bytelen; // number of bytes used in insts[]
+    int len; // number of instructions
+    int sub; // number of capture groups in the pattern
+    char insts[0];
+};
+
+struct Inst
+{
+    int opcode;
+    int c;
+    int n;
+    Inst *x;
+    Inst *y;
+    int gen; // global state, oooh!
+};
+
+enum /* Inst.opcode */
+{
+    // Instructions which consume input bytes (and thus fail if none left)
+    CONSUMERS = 1,
+    Char = CONSUMERS,
+    Any,
+    ASSERTS = 0x50,
+    Bol = ASSERTS,
+    Eol,
+    // Instructions which take relative offset as arg
+    JUMPS = 0x60,
+    Jmp = JUMPS,
+    Split,
+    RSplit,
+    // Other (special) instructions
+    Save = 0x7e,
+    Match = 0x7f,
+};
+
+#define inst_is_consumer(inst) ((inst) < ASSERTS)
+// "==" binds tighter than "&", so the mask needs its own parentheses
+#define inst_is_jump(inst) (((inst) & 0x70) == JUMPS)
+
+Prog *compile(Regexp*);
+void printprog(Prog*);
+
+extern int gen;
+
+enum {
+    MAXSUB = 20
+};
+
+typedef struct Sub Sub;
+
+struct Sub
+{
+    int ref;
+    int nsub;
+    const char *sub[MAXSUB];
+};
+
+Sub *newsub(int n);
+Sub *incref(Sub*);
+Sub *copy(Sub*);
+Sub *update(Sub*, int, const char*);
+void decref(Sub*);
+
+struct Subject {
+    const char *begin;
+    const char *end;
+};
+
+
+#define NON_ANCHORED_PREFIX 5
+#define HANDLE_ANCHORED(bytecode, is_anchored) ((is_anchored) ? 
(bytecode) + NON_ANCHORED_PREFIX : (bytecode)) + +int re1_5_backtrack(ByteProg*, Subject*, const char**, int, int); +int re1_5_pikevm(ByteProg*, Subject*, const char**, int, int); +int re1_5_recursiveloopprog(ByteProg*, Subject*, const char**, int, int); +int re1_5_recursiveprog(ByteProg*, Subject*, const char**, int, int); +int re1_5_thompsonvm(ByteProg*, Subject*, const char**, int, int); + +int re1_5_sizecode(const char *re); +int re1_5_compilecode(ByteProg *prog, const char *re); +void re1_5_dumpcode(ByteProg *prog); +void cleanmarks(ByteProg *prog); From f7bcce05524783ab8b47171a8634a270ca68d040 Mon Sep 17 00:00:00 2001 From: Paul Sokolovsky Date: Mon, 15 Sep 2014 01:14:28 +0300 Subject: [PATCH 3/3] modure: Basic tests. --- tests/extmod/ure1.py | 37 +++++++++++++++++++++++++++++++++++++ tests/extmod/ure_split.py | 35 +++++++++++++++++++++++++++++++++++ 2 files changed, 72 insertions(+) create mode 100644 tests/extmod/ure1.py create mode 100644 tests/extmod/ure_split.py diff --git a/tests/extmod/ure1.py b/tests/extmod/ure1.py new file mode 100644 index 0000000000..dff099c8cc --- /dev/null +++ b/tests/extmod/ure1.py @@ -0,0 +1,37 @@ +try: + import ure as re +except ImportError: + import re + +r = re.compile(".+") +m = r.match("abc") +print(m.group(0)) +try: + m.group(1) +except IndexError: + print("IndexError") + +r = re.compile("(.+)1") +m = r.match("xyz781") +print(m.group(0)) +print(m.group(1)) +try: + m.group(2) +except IndexError: + print("IndexError") + + +r = re.compile("o+") +m = r.search("foobar") +print(m.group(0)) +try: + m.group(1) +except IndexError: + print("IndexError") + + +m = re.match(".*", "foo") +print(m.group(0)) + +m = re.search("w.r", "hello world") +print(m.group(0)) diff --git a/tests/extmod/ure_split.py b/tests/extmod/ure_split.py new file mode 100644 index 0000000000..0154f3abc2 --- /dev/null +++ b/tests/extmod/ure_split.py @@ -0,0 +1,35 @@ +try: + import ure as re +except ImportError: + import re + +r = re.compile(" ") +s = r.split("a 
b c foobar") +print(s) + +r = re.compile(" +") +s = r.split("a b c foobar") +print(s) + +r = re.compile(" +") +s = r.split("a b c foobar", 1) +print(s) + +r = re.compile(" +") +s = r.split("a b c foobar", 2) +print(s) + +r = re.compile(" *") +s = r.split("a b c foobar") +# TODO - no idea how this is supposed to work, per docs, empty match == stop +# splitting, so CPython code apparently does some dirty magic. +#print(s) + +r = re.compile("x*") +s = r.split("foo") +print(s) + +r = re.compile("[a-f]+") +s = r.split("0a3b9") +# TODO - char classes are not yet supported by re1.5 +#print(s)