py/lexer: Simplify and reduce code size for operator tokenising.

By removing the 'E' code from the operator token encoding mini-language the
tokenising can be simplified.  The 'E' code was only used for the !=
operator which is now handled as a special case; the optimisations for the
general case more than make up for the addition of this single, special
case.  Furthermore, the . and ... operators can be handled in the same way
as != which reduces the code size a little further.

This simplification also removes a "goto".

Changes in code size for this patch are (measured in bytes):

bare-arm:       -48
minimal x86:    -64
unix x86-64:   -112
unix nanbox:    -64
stmhal:         -48
cc3200:         -48
esp8266:        -76
This commit is contained in:
Damien George 2017-03-29 10:55:36 +11:00
parent e93c4ca181
commit 5010d1958f
1 changed file with 28 additions and 54 deletions

View File

@ -176,7 +176,6 @@ STATIC void indent_pop(mp_lexer_t *lex) {
// some tricky operator encoding: // some tricky operator encoding:
// <op> = begin with <op>, if this opchar matches then begin here // <op> = begin with <op>, if this opchar matches then begin here
// e<op> = end with <op>, if this opchar matches then end // e<op> = end with <op>, if this opchar matches then end
// E<op> = mandatory end with <op>, this opchar must match, then end
// c<op> = continue with <op>, if this opchar matches then continue matching // c<op> = continue with <op>, if this opchar matches then continue matching
// this means if the start of two ops are the same then they are equal til the last char // this means if the start of two ops are the same then they are equal til the last char
@ -193,7 +192,7 @@ STATIC const char *const tok_enc =
"%e=" // % %= "%e=" // % %=
"^e=" // ^ ^= "^e=" // ^ ^=
"=e=" // = == "=e=" // = ==
"!E="; // != "!."; // start of special cases: != . ...
// TODO static assert that number of tokens is less than 256 so we can safely make this table with byte sized entries // TODO static assert that number of tokens is less than 256 so we can safely make this table with byte sized entries
STATIC const uint8_t tok_enc_kind[] = { STATIC const uint8_t tok_enc_kind[] = {
@ -213,7 +212,6 @@ STATIC const uint8_t tok_enc_kind[] = {
MP_TOKEN_OP_PERCENT, MP_TOKEN_DEL_PERCENT_EQUAL, MP_TOKEN_OP_PERCENT, MP_TOKEN_DEL_PERCENT_EQUAL,
MP_TOKEN_OP_CARET, MP_TOKEN_DEL_CARET_EQUAL, MP_TOKEN_OP_CARET, MP_TOKEN_DEL_CARET_EQUAL,
MP_TOKEN_DEL_EQUAL, MP_TOKEN_OP_DBL_EQUAL, MP_TOKEN_DEL_EQUAL, MP_TOKEN_OP_DBL_EQUAL,
MP_TOKEN_OP_NOT_EQUAL,
}; };
// must have the same order as enum in lexer.h // must have the same order as enum in lexer.h
@ -603,20 +601,6 @@ void mp_lexer_to_next(mp_lexer_t *lex) {
} }
} }
} else if (is_char(lex, '.')) {
// special handling for . and ... operators, because .. is not a valid operator
// get first char
next_char(lex);
if (is_char_and(lex, '.', '.')) {
next_char(lex);
next_char(lex);
lex->tok_kind = MP_TOKEN_ELLIPSIS;
} else {
lex->tok_kind = MP_TOKEN_DEL_PERIOD;
}
} else { } else {
// search for encoded delimiter or operator // search for encoded delimiter or operator
@ -625,9 +609,6 @@ void mp_lexer_to_next(mp_lexer_t *lex) {
for (; *t != 0 && !is_char(lex, *t); t += 1) { for (; *t != 0 && !is_char(lex, *t); t += 1) {
if (*t == 'e' || *t == 'c') { if (*t == 'e' || *t == 'c') {
t += 1; t += 1;
} else if (*t == 'E') {
tok_enc_index -= 1;
t += 1;
} }
tok_enc_index += 1; tok_enc_index += 1;
} }
@ -638,55 +619,48 @@ void mp_lexer_to_next(mp_lexer_t *lex) {
// didn't match any delimiter or operator characters // didn't match any delimiter or operator characters
lex->tok_kind = MP_TOKEN_INVALID; lex->tok_kind = MP_TOKEN_INVALID;
} else if (*t == '!') {
// "!=" is a special case because "!" is not a valid operator
if (is_char(lex, '=')) {
next_char(lex);
lex->tok_kind = MP_TOKEN_OP_NOT_EQUAL;
} else {
lex->tok_kind = MP_TOKEN_INVALID;
}
} else if (*t == '.') {
// "." and "..." are special cases because ".." is not a valid operator
if (is_char_and(lex, '.', '.')) {
next_char(lex);
next_char(lex);
lex->tok_kind = MP_TOKEN_ELLIPSIS;
} else {
lex->tok_kind = MP_TOKEN_DEL_PERIOD;
}
} else { } else {
// matched a delimiter or operator character // matched a delimiter or operator character
// get the maximum characters for a valid token // get the maximum characters for a valid token
t += 1; t += 1;
size_t t_index = tok_enc_index; size_t t_index = tok_enc_index;
for (;;) { while (*t == 'c' || *t == 'e') {
for (; *t == 'e'; t += 1) { t_index += 1;
t += 1; if (is_char(lex, t[1])) {
t_index += 1; next_char(lex);
if (is_char(lex, *t)) { tok_enc_index = t_index;
next_char(lex); if (*t == 'e') {
tok_enc_index = t_index;
break; break;
} }
} } else if (*t == 'c') {
if (*t == 'E') {
t += 1;
if (is_char(lex, *t)) {
next_char(lex);
tok_enc_index = t_index;
} else {
lex->tok_kind = MP_TOKEN_INVALID;
goto tok_enc_no_match;
}
break;
}
if (*t == 'c') {
t += 1;
t_index += 1;
if (is_char(lex, *t)) {
next_char(lex);
tok_enc_index = t_index;
t += 1;
} else {
break;
}
} else {
break; break;
} }
t += 2;
} }
// set token kind // set token kind
lex->tok_kind = tok_enc_kind[tok_enc_index]; lex->tok_kind = tok_enc_kind[tok_enc_index];
tok_enc_no_match:
// compute bracket level for implicit line joining // compute bracket level for implicit line joining
if (lex->tok_kind == MP_TOKEN_DEL_PAREN_OPEN || lex->tok_kind == MP_TOKEN_DEL_BRACKET_OPEN || lex->tok_kind == MP_TOKEN_DEL_BRACE_OPEN) { if (lex->tok_kind == MP_TOKEN_DEL_PAREN_OPEN || lex->tok_kind == MP_TOKEN_DEL_BRACKET_OPEN || lex->tok_kind == MP_TOKEN_DEL_BRACE_OPEN) {
lex->nested_bracket_level += 1; lex->nested_bracket_level += 1;