2014-10-11 14:26:29 +03:00
|
|
|
// Copyright 2014 Paul Sokolovsky.
|
|
|
|
// Use of this source code is governed by a BSD-style
|
|
|
|
// license that can be found in the LICENSE file.
|
|
|
|
|
2014-10-15 04:43:13 +03:00
|
|
|
#include "re1.5.h"
|
2014-10-11 14:26:29 +03:00
|
|
|
|
2015-11-01 00:37:44 +03:00
|
|
|
// Helper macros for _compilecode(). They deliberately refer to that
// function's locals (`code`, `err`) and its `prog` parameter.
//
// Every macro parameter is parenthesized so that compound expressions
// (e.g. `a + b` or `c ? x : y`) expand with the intended precedence.

// Open a gap of `num` bytes at offset `at` by shifting the bytes in
// [at, pc) upwards, then advance `pc` by `num`. When `code` is NULL only
// `pc` is advanced, nothing is moved.
#define INSERT_CODE(at, num, pc) \
    ((code ? memmove(code + (at) + (num), code + (at), (pc) - (at)) : 0), (pc) += (num))

// Relative offset from the two-byte instruction starting at `at` to the
// absolute offset `to` (measured from the end of that instruction).
#define REL(at, to) ((to) - (at) - 2)

// Store `byte` at offset `at` when a code buffer is present; otherwise
// just evaluate `at` so that side effects such as `PC++` still happen.
// `at` is evaluated exactly once in either case.
#define EMIT(at, byte) (code ? (code[at] = (byte)) : (at))

// Like EMIT, but routed through _emit_checked() so that a value which does
// not fit in a signed byte sets the local `err` flag.
#define EMIT_CHECKED(at, byte) (_emit_checked((at), code, (byte), &err))

// Current write offset into the program being compiled.
#define PC (prog->bytelen)
|
2014-10-11 14:26:29 +03:00
|
|
|
|
2017-07-25 01:09:47 +02:00
|
|
|
// Map the character that follows a backslash to the control character it
// denotes (a, b, f, n, r, t, v). Any other character is returned as is,
// so e.g. an escaped metacharacter stands for itself.
static char unescape(char c) {
    static const char escapes[][2] = {
        { 'a', '\a' },
        { 'b', '\b' },
        { 'f', '\f' },
        { 'n', '\n' },
        { 'r', '\r' },
        { 't', '\t' },
        { 'v', '\v' },
    };
    const unsigned count = sizeof(escapes) / sizeof(escapes[0]);

    for (unsigned i = 0; i < count; i++) {
        if (escapes[i][0] == c) {
            return escapes[i][1];
        }
    }
    return c;
}
|
|
|
|
|
|
|
|
|
2021-04-06 19:10:01 -05:00
|
|
|
// Store `val` at code[at], flagging values that do not survive a round
// trip through int8_t.
//
//   at   - offset into `code` to write to
//   code - destination buffer, or NULL to skip the write
//   val  - value to store
//   err  - set to true when `val` is not representable as int8_t;
//          never cleared here, so earlier failures stay recorded
//
// The range check is performed even when `code` is NULL.
static void _emit_checked(int at, char *code, int val, bool *err) {
    const bool fits = (val == (int8_t)val);

    if (!fits) {
        *err = true;
    }
    if (!code) {
        return;
    }
    code[at] = val;
}
|
|
|
|
|
2015-11-01 00:37:44 +03:00
|
|
|
// Compile the regex text at `re` into bytecode appended to `prog`, stopping
// at the end of the string or at an unmatched ')'. Calls itself recursively
// for each parenthesized group.
//
//   re       - NUL-terminated pattern text to compile
//   prog     - program being built; bytelen, len and sub are updated
//   sizecode - non-zero: only advance the counters, write no bytes
//              (`code` is NULL, so EMIT/INSERT_CODE touch no memory)
//
// Returns a pointer to the character that ended compilation (the NUL
// terminator or a ')'), or NULL on a syntax error or when an emitted
// operand does not fit in a signed byte.
static const char *_compilecode(const char *re, ByteProg *prog, int sizecode)
{
    char *code = sizecode ? NULL : prog->insts;
    bool err = false;
    int start = PC;     // where this (sub)expression begins; '|' inserts its Split here
    int term = PC;      // start of the most recent term, i.e. what ?, * and + apply to
    int alt_label = 0;  // offset of the pending Jmp operand of the previous alternative

    for (; *re && *re != ')'; re++) {
        switch (*re) {
        case '\\':
            re++;
            if (!*re) return NULL; // Trailing backslash
            term = PC;
            // \d \s \w and their upper-case forms are named classes
            if ((*re | 0x20) == 'd' || (*re | 0x20) == 's' || (*re | 0x20) == 'w') {
                EMIT(PC++, NamedClass);
                EMIT(PC++, *re);
            } else {
                EMIT(PC++, Char);
                EMIT(PC++, unescape(*re));
            }
            prog->len++;
            break;
        default:
            term = PC;
            EMIT(PC++, Char);
            EMIT(PC++, *re);
            prog->len++;
            break;
        case '.':
            term = PC;
            EMIT(PC++, Any);
            prog->len++;
            break;
        case '[': {
            int cnt;
            term = PC;
            re++;
            if (*re == '^') {
                EMIT(PC++, ClassNot);
                re++;
            } else {
                EMIT(PC++, Class);
            }
            PC++; // Skip # of pair byte
            prog->len++;
            // Each iteration emits one (low, high) pair; a single character
            // is stored as a range whose two ends are equal.
            for (cnt = 0; *re != ']'; re++, cnt++) {
                if (!*re) return NULL;
                const char *b = re;
                if (*re == '\\') {
                    re += 1;
                    if (!*re) return NULL; // Trailing backslash
                    EMIT(PC++, unescape(*re));
                } else {
                    EMIT(PC++, *re);
                }
                if (re[1] == '-' && re[2] != ']') {
                    re += 2;
                    // Pattern ended right after the '-' (e.g. "[a-"). Without
                    // this check the NUL would be emitted as the upper bound
                    // and the loop's re++ would step past the terminator.
                    if (!*re) return NULL;
                } else {
                    // Not a range: rewind so the same character (including
                    // its backslash, if any) is emitted again as the high end.
                    re = b;
                }
                if (*re == '\\') {
                    re += 1;
                    if (!*re) return NULL; // Trailing backslash
                    EMIT(PC++, unescape(*re));
                } else {
                    EMIT(PC++, *re);
                }
            }
            // Fill in the pair count skipped above
            EMIT_CHECKED(term + 1, cnt);
            break;
        }
        case '(': {
            term = PC;
            int sub = 0;
            // "(?:" introduces a non-capturing group
            int capture = re[1] != '?' || re[2] != ':';

            if (capture) {
                sub = ++prog->sub;
                EMIT(PC++, Save);
                EMIT_CHECKED(PC++, 2 * sub);
                prog->len++;
            } else {
                re += 2;
            }

            re = _compilecode(re + 1, prog, sizecode);
            if (re == NULL || *re != ')') return NULL; // error, or no matching paren

            if (capture) {
                EMIT(PC++, Save);
                EMIT_CHECKED(PC++, 2 * sub + 1);
                prog->len++;
            }

            break;
        }
        case '?':
            if (PC == term) return NULL; // nothing to repeat
            INSERT_CODE(term, 2, PC);
            if (re[1] == '?') {
                // "??" is the non-greedy form
                EMIT(term, RSplit);
                re++;
            } else {
                EMIT(term, Split);
            }
            EMIT_CHECKED(term + 1, REL(term, PC));
            prog->len++;
            term = PC;
            break;
        case '*':
            if (PC == term) return NULL; // nothing to repeat
            INSERT_CODE(term, 2, PC);
            // Jump back to the Split that is placed in front of the term
            EMIT(PC, Jmp);
            EMIT_CHECKED(PC + 1, REL(PC, term));
            PC += 2;
            if (re[1] == '?') {
                // "*?" is the non-greedy form
                EMIT(term, RSplit);
                re++;
            } else {
                EMIT(term, Split);
            }
            EMIT_CHECKED(term + 1, REL(term, PC));
            prog->len += 2;
            term = PC;
            break;
        case '+':
            if (PC == term) return NULL; // nothing to repeat
            // The split comes after the term and branches back to it, so the
            // opcodes are the other way round compared to '?' and '*'.
            if (re[1] == '?') {
                EMIT(PC, Split);
                re++;
            } else {
                EMIT(PC, RSplit);
            }
            EMIT_CHECKED(PC + 1, REL(PC, term));
            PC += 2;
            prog->len++;
            term = PC;
            break;
        case '|':
            if (alt_label) {
                // Patch the Jmp that ended the previous alternative
                EMIT_CHECKED(alt_label, REL(alt_label, PC) + 1);
            }
            INSERT_CODE(start, 2, PC);
            EMIT(PC++, Jmp);
            alt_label = PC++; // operand is filled in once the target is known
            EMIT(start, Split);
            EMIT_CHECKED(start + 1, REL(start, PC));
            prog->len += 2;
            term = PC;
            break;
        case '^':
            EMIT(PC++, Bol);
            prog->len++;
            term = PC;
            break;
        case '$':
            EMIT(PC++, Eol);
            prog->len++;
            term = PC;
            break;
        }
    }

    if (alt_label) {
        // Patch the Jmp that ended the last-but-one alternative
        EMIT_CHECKED(alt_label, REL(alt_label, PC) + 1);
    }
    return err ? NULL : re;
}
|
|
|
|
|
2015-11-01 00:37:44 +03:00
|
|
|
int re1_5_sizecode(const char *re)
|
|
|
|
{
|
|
|
|
ByteProg dummyprog = {
|
|
|
|
// Save 0, Save 1, Match; more bytes for "search" (vs "match") prefix code
|
|
|
|
.bytelen = 5 + NON_ANCHORED_PREFIX
|
|
|
|
};
|
|
|
|
|
|
|
|
if (_compilecode(re, &dummyprog, /*sizecode*/1) == NULL) return -1;
|
|
|
|
|
|
|
|
return dummyprog.bytelen;
|
|
|
|
}
|
|
|
|
|
2014-10-11 14:26:29 +03:00
|
|
|
// Compile the pattern `re` into `prog`, resetting its counters first.
//
// The emitted program is: a 5-byte "search" prologue, Save 0, the code for
// the pattern, Save 1, Match.
//
// Returns 0 on success, or 1 when the pattern fails to compile or is not
// consumed up to its terminating NUL.
int re1_5_compilecode(ByteProg *prog, const char *re)
{
    // Code to implement non-anchored operation ("search"); for anchored
    // operation ("match") this code will be just skipped. It is followed
    // by Save 0.
    // TODO: Implement search in much more efficient manner
    const char prologue[] = {
        RSplit, 3,
        Any,
        Jmp, -5,
        Save, 0,
    };
    const int prologue_bytes = (int)sizeof(prologue);

    prog->sub = 0;
    prog->bytelen = 0;
    for (int i = 0; i < prologue_bytes; i++) {
        prog->insts[prog->bytelen++] = prologue[i];
    }
    // RSplit, Any, Jmp and Save: four instructions so far
    prog->len = 4;

    const char *rest = _compilecode(re, prog, /*sizecode*/0);
    if (rest == NULL || *rest != '\0') {
        return 1;
    }

    prog->insts[prog->bytelen++] = Save;
    prog->insts[prog->bytelen++] = 1;
    prog->insts[prog->bytelen++] = Match;
    // Save 1 and Match
    prog->len += 2;

    return 0;
}
|
|
|
|
|
2021-04-23 12:26:42 -07:00
|
|
|
#if defined(DEBUG_COMPILECODE)
|
|
|
|
#include <assert.h>
|
|
|
|
// Fatal-error hook for the debug build: write the message `x` followed by
// a newline to stderr, then terminate the process via abort().
void re1_5_fatal(char *x)
{
    fprintf(stderr, "%s\n", x);
    abort();
}
|
|
|
|
|
2014-10-11 14:26:29 +03:00
|
|
|
int main(int argc, char *argv[])
|
|
|
|
{
|
2021-04-23 12:26:42 -07:00
|
|
|
char *re_str = argv[1];
|
|
|
|
int size = re1_5_sizecode(re_str);
|
|
|
|
ByteProg *code = malloc(sizeof(ByteProg) + size);
|
|
|
|
int ret = re1_5_compilecode(code, re_str);
|
|
|
|
if (ret == 0) {
|
|
|
|
re1_5_dumpcode(code);
|
|
|
|
}
|
2014-10-11 14:26:29 +03:00
|
|
|
}
|
|
|
|
#endif
|