py/lexer: Simplify and reduce code size for operator tokenising.
By removing the 'E' code from the operator token encoding mini-language the tokenising can be simplified. The 'E' code was only used for the != operator, which is now handled as a special case; the optimisations for the general case more than make up for the addition of this single special case. Furthermore, the . and ... operators can be handled in the same way as !=, which reduces the code size a little further. This simplification also removes a "goto". Changes in code size for this patch are (measured in bytes): bare-arm: -48; minimal x86: -64; unix x86-64: -112; unix nanbox: -64; stmhal: -48; cc3200: -48; esp8266: -76.
This commit is contained in:
parent
e93c4ca181
commit
5010d1958f
76
py/lexer.c
76
py/lexer.c
|
@ -176,7 +176,6 @@ STATIC void indent_pop(mp_lexer_t *lex) {
|
||||||
// some tricky operator encoding:
|
// some tricky operator encoding:
|
||||||
// <op> = begin with <op>, if this opchar matches then begin here
|
// <op> = begin with <op>, if this opchar matches then begin here
|
||||||
// e<op> = end with <op>, if this opchar matches then end
|
// e<op> = end with <op>, if this opchar matches then end
|
||||||
// E<op> = mandatory end with <op>, this opchar must match, then end
|
|
||||||
// c<op> = continue with <op>, if this opchar matches then continue matching
|
// c<op> = continue with <op>, if this opchar matches then continue matching
|
||||||
// this means if the start of two ops are the same then they are equal til the last char
|
// this means if the start of two ops are the same then they are equal til the last char
|
||||||
|
|
||||||
|
@ -193,7 +192,7 @@ STATIC const char *const tok_enc =
|
||||||
"%e=" // % %=
|
"%e=" // % %=
|
||||||
"^e=" // ^ ^=
|
"^e=" // ^ ^=
|
||||||
"=e=" // = ==
|
"=e=" // = ==
|
||||||
"!E="; // !=
|
"!."; // start of special cases: != . ...
|
||||||
|
|
||||||
// TODO static assert that number of tokens is less than 256 so we can safely make this table with byte sized entries
|
// TODO static assert that number of tokens is less than 256 so we can safely make this table with byte sized entries
|
||||||
STATIC const uint8_t tok_enc_kind[] = {
|
STATIC const uint8_t tok_enc_kind[] = {
|
||||||
|
@ -213,7 +212,6 @@ STATIC const uint8_t tok_enc_kind[] = {
|
||||||
MP_TOKEN_OP_PERCENT, MP_TOKEN_DEL_PERCENT_EQUAL,
|
MP_TOKEN_OP_PERCENT, MP_TOKEN_DEL_PERCENT_EQUAL,
|
||||||
MP_TOKEN_OP_CARET, MP_TOKEN_DEL_CARET_EQUAL,
|
MP_TOKEN_OP_CARET, MP_TOKEN_DEL_CARET_EQUAL,
|
||||||
MP_TOKEN_DEL_EQUAL, MP_TOKEN_OP_DBL_EQUAL,
|
MP_TOKEN_DEL_EQUAL, MP_TOKEN_OP_DBL_EQUAL,
|
||||||
MP_TOKEN_OP_NOT_EQUAL,
|
|
||||||
};
|
};
|
||||||
|
|
||||||
// must have the same order as enum in lexer.h
|
// must have the same order as enum in lexer.h
|
||||||
|
@ -603,20 +601,6 @@ void mp_lexer_to_next(mp_lexer_t *lex) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
} else if (is_char(lex, '.')) {
|
|
||||||
// special handling for . and ... operators, because .. is not a valid operator
|
|
||||||
|
|
||||||
// get first char
|
|
||||||
next_char(lex);
|
|
||||||
|
|
||||||
if (is_char_and(lex, '.', '.')) {
|
|
||||||
next_char(lex);
|
|
||||||
next_char(lex);
|
|
||||||
lex->tok_kind = MP_TOKEN_ELLIPSIS;
|
|
||||||
} else {
|
|
||||||
lex->tok_kind = MP_TOKEN_DEL_PERIOD;
|
|
||||||
}
|
|
||||||
|
|
||||||
} else {
|
} else {
|
||||||
// search for encoded delimiter or operator
|
// search for encoded delimiter or operator
|
||||||
|
|
||||||
|
@ -625,9 +609,6 @@ void mp_lexer_to_next(mp_lexer_t *lex) {
|
||||||
for (; *t != 0 && !is_char(lex, *t); t += 1) {
|
for (; *t != 0 && !is_char(lex, *t); t += 1) {
|
||||||
if (*t == 'e' || *t == 'c') {
|
if (*t == 'e' || *t == 'c') {
|
||||||
t += 1;
|
t += 1;
|
||||||
} else if (*t == 'E') {
|
|
||||||
tok_enc_index -= 1;
|
|
||||||
t += 1;
|
|
||||||
}
|
}
|
||||||
tok_enc_index += 1;
|
tok_enc_index += 1;
|
||||||
}
|
}
|
||||||
|
@ -638,55 +619,48 @@ void mp_lexer_to_next(mp_lexer_t *lex) {
|
||||||
// didn't match any delimiter or operator characters
|
// didn't match any delimiter or operator characters
|
||||||
lex->tok_kind = MP_TOKEN_INVALID;
|
lex->tok_kind = MP_TOKEN_INVALID;
|
||||||
|
|
||||||
|
} else if (*t == '!') {
|
||||||
|
// "!=" is a special case because "!" is not a valid operator
|
||||||
|
if (is_char(lex, '=')) {
|
||||||
|
next_char(lex);
|
||||||
|
lex->tok_kind = MP_TOKEN_OP_NOT_EQUAL;
|
||||||
|
} else {
|
||||||
|
lex->tok_kind = MP_TOKEN_INVALID;
|
||||||
|
}
|
||||||
|
|
||||||
|
} else if (*t == '.') {
|
||||||
|
// "." and "..." are special cases because ".." is not a valid operator
|
||||||
|
if (is_char_and(lex, '.', '.')) {
|
||||||
|
next_char(lex);
|
||||||
|
next_char(lex);
|
||||||
|
lex->tok_kind = MP_TOKEN_ELLIPSIS;
|
||||||
|
} else {
|
||||||
|
lex->tok_kind = MP_TOKEN_DEL_PERIOD;
|
||||||
|
}
|
||||||
|
|
||||||
} else {
|
} else {
|
||||||
// matched a delimiter or operator character
|
// matched a delimiter or operator character
|
||||||
|
|
||||||
// get the maximum characters for a valid token
|
// get the maximum characters for a valid token
|
||||||
t += 1;
|
t += 1;
|
||||||
size_t t_index = tok_enc_index;
|
size_t t_index = tok_enc_index;
|
||||||
for (;;) {
|
while (*t == 'c' || *t == 'e') {
|
||||||
for (; *t == 'e'; t += 1) {
|
|
||||||
t += 1;
|
|
||||||
t_index += 1;
|
t_index += 1;
|
||||||
if (is_char(lex, *t)) {
|
if (is_char(lex, t[1])) {
|
||||||
next_char(lex);
|
next_char(lex);
|
||||||
tok_enc_index = t_index;
|
tok_enc_index = t_index;
|
||||||
|
if (*t == 'e') {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
} else if (*t == 'c') {
|
||||||
|
|
||||||
if (*t == 'E') {
|
|
||||||
t += 1;
|
|
||||||
if (is_char(lex, *t)) {
|
|
||||||
next_char(lex);
|
|
||||||
tok_enc_index = t_index;
|
|
||||||
} else {
|
|
||||||
lex->tok_kind = MP_TOKEN_INVALID;
|
|
||||||
goto tok_enc_no_match;
|
|
||||||
}
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (*t == 'c') {
|
|
||||||
t += 1;
|
|
||||||
t_index += 1;
|
|
||||||
if (is_char(lex, *t)) {
|
|
||||||
next_char(lex);
|
|
||||||
tok_enc_index = t_index;
|
|
||||||
t += 1;
|
|
||||||
} else {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
t += 2;
|
||||||
}
|
}
|
||||||
|
|
||||||
// set token kind
|
// set token kind
|
||||||
lex->tok_kind = tok_enc_kind[tok_enc_index];
|
lex->tok_kind = tok_enc_kind[tok_enc_index];
|
||||||
|
|
||||||
tok_enc_no_match:
|
|
||||||
|
|
||||||
// compute bracket level for implicit line joining
|
// compute bracket level for implicit line joining
|
||||||
if (lex->tok_kind == MP_TOKEN_DEL_PAREN_OPEN || lex->tok_kind == MP_TOKEN_DEL_BRACKET_OPEN || lex->tok_kind == MP_TOKEN_DEL_BRACE_OPEN) {
|
if (lex->tok_kind == MP_TOKEN_DEL_PAREN_OPEN || lex->tok_kind == MP_TOKEN_DEL_BRACKET_OPEN || lex->tok_kind == MP_TOKEN_DEL_BRACE_OPEN) {
|
||||||
lex->nested_bracket_level += 1;
|
lex->nested_bracket_level += 1;
|
||||||
|
|
Loading…
Reference in New Issue