py/lexer: Use strcmp to make keyword searching more efficient.

Since the table of keywords is sorted, we can use strcmp to do the search
and stop early as soon as the name compares less-than the current table
entry, because it then cannot match any later entry either.

Because all tokens that are names are subject to this search, this
optimisation will improve the overall speed of the lexer when processing
a script.

The change also slightly decreases code size because we now use
strcmp instead of the custom str_strn_equal function.
This commit is contained in:
Damien George 2017-02-17 11:10:35 +11:00
parent a68c754688
commit ae43679792
2 changed files with 20 additions and 28 deletions

View File

@ -25,6 +25,7 @@
*/ */
#include <stdio.h> #include <stdio.h>
#include <string.h>
#include <assert.h> #include <assert.h>
#include "py/mpstate.h" #include "py/mpstate.h"
@ -39,19 +40,6 @@
// TODO seems that CPython allows NULL byte in the input stream // TODO seems that CPython allows NULL byte in the input stream
// don't know if that's intentional or not, but we don't allow it // don't know if that's intentional or not, but we don't allow it
// TODO replace with a call to a standard function
STATIC bool str_strn_equal(const char *str, const char *strn, mp_uint_t len) {
mp_uint_t i = 0;
while (i < len && *str == *strn) {
++i;
++str;
++strn;
}
return i == len && *str == 0;
}
#define MP_LEXER_EOF ((unichar)MP_READER_EOF) #define MP_LEXER_EOF ((unichar)MP_READER_EOF)
#define CUR_CHAR(lex) ((lex)->chr0) #define CUR_CHAR(lex) ((lex)->chr0)
@ -225,10 +213,12 @@ STATIC const uint8_t tok_enc_kind[] = {
}; };
// must have the same order as enum in lexer.h // must have the same order as enum in lexer.h
// must be sorted according to strcmp
STATIC const char *const tok_kw[] = { STATIC const char *const tok_kw[] = {
"False", "False",
"None", "None",
"True", "True",
"__debug__",
"and", "and",
"as", "as",
"assert", "assert",
@ -263,7 +253,6 @@ STATIC const char *const tok_kw[] = {
"while", "while",
"with", "with",
"yield", "yield",
"__debug__",
}; };
// This is called with CUR_CHAR() before first hex digit, and should return with // This is called with CUR_CHAR() before first hex digit, and should return with
@ -531,16 +520,18 @@ void mp_lexer_to_next(mp_lexer_t *lex) {
// We also check for __debug__ here and convert it to its value. This is // We also check for __debug__ here and convert it to its value. This is
// so the parser gives a syntax error on, eg, x.__debug__. Otherwise, we // so the parser gives a syntax error on, eg, x.__debug__. Otherwise, we
// need to check for this special token in many places in the compiler. // need to check for this special token in many places in the compiler.
// TODO improve speed of these string comparisons const char *s = vstr_null_terminated_str(&lex->vstr);
for (size_t i = 0; i < MP_ARRAY_SIZE(tok_kw); i++) { for (size_t i = 0; i < MP_ARRAY_SIZE(tok_kw); i++) {
if (str_strn_equal(tok_kw[i], lex->vstr.buf, lex->vstr.len)) { int cmp = strcmp(s, tok_kw[i]);
if (i == MP_ARRAY_SIZE(tok_kw) - 1) { if (cmp == 0) {
// tok_kw[MP_ARRAY_SIZE(tok_kw) - 1] == "__debug__"
lex->tok_kind = (MP_STATE_VM(mp_optimise_value) == 0 ? MP_TOKEN_KW_TRUE : MP_TOKEN_KW_FALSE);
} else {
lex->tok_kind = MP_TOKEN_KW_FALSE + i; lex->tok_kind = MP_TOKEN_KW_FALSE + i;
if (lex->tok_kind == MP_TOKEN_KW___DEBUG__) {
lex->tok_kind = (MP_STATE_VM(mp_optimise_value) == 0 ? MP_TOKEN_KW_TRUE : MP_TOKEN_KW_FALSE);
} }
break; break;
} else if (cmp < 0) {
// Table is sorted and comparison was less-than, so stop searching
break;
} }
} }

View File

@ -61,6 +61,7 @@ typedef enum _mp_token_kind_t {
MP_TOKEN_KW_FALSE, // 14 MP_TOKEN_KW_FALSE, // 14
MP_TOKEN_KW_NONE, MP_TOKEN_KW_NONE,
MP_TOKEN_KW_TRUE, MP_TOKEN_KW_TRUE,
MP_TOKEN_KW___DEBUG__,
MP_TOKEN_KW_AND, MP_TOKEN_KW_AND,
MP_TOKEN_KW_AS, MP_TOKEN_KW_AS,
MP_TOKEN_KW_ASSERT, MP_TOKEN_KW_ASSERT,
@ -71,7 +72,7 @@ typedef enum _mp_token_kind_t {
MP_TOKEN_KW_BREAK, MP_TOKEN_KW_BREAK,
MP_TOKEN_KW_CLASS, MP_TOKEN_KW_CLASS,
MP_TOKEN_KW_CONTINUE, MP_TOKEN_KW_CONTINUE,
MP_TOKEN_KW_DEF, // 23 MP_TOKEN_KW_DEF,
MP_TOKEN_KW_DEL, MP_TOKEN_KW_DEL,
MP_TOKEN_KW_ELIF, MP_TOKEN_KW_ELIF,
MP_TOKEN_KW_ELSE, MP_TOKEN_KW_ELSE,
@ -81,7 +82,7 @@ typedef enum _mp_token_kind_t {
MP_TOKEN_KW_FROM, MP_TOKEN_KW_FROM,
MP_TOKEN_KW_GLOBAL, MP_TOKEN_KW_GLOBAL,
MP_TOKEN_KW_IF, MP_TOKEN_KW_IF,
MP_TOKEN_KW_IMPORT, // 33 MP_TOKEN_KW_IMPORT,
MP_TOKEN_KW_IN, MP_TOKEN_KW_IN,
MP_TOKEN_KW_IS, MP_TOKEN_KW_IS,
MP_TOKEN_KW_LAMBDA, MP_TOKEN_KW_LAMBDA,
@ -91,12 +92,12 @@ typedef enum _mp_token_kind_t {
MP_TOKEN_KW_PASS, MP_TOKEN_KW_PASS,
MP_TOKEN_KW_RAISE, MP_TOKEN_KW_RAISE,
MP_TOKEN_KW_RETURN, MP_TOKEN_KW_RETURN,
MP_TOKEN_KW_TRY, // 43 MP_TOKEN_KW_TRY,
MP_TOKEN_KW_WHILE, MP_TOKEN_KW_WHILE,
MP_TOKEN_KW_WITH, MP_TOKEN_KW_WITH,
MP_TOKEN_KW_YIELD, MP_TOKEN_KW_YIELD,
MP_TOKEN_OP_PLUS, // 47 MP_TOKEN_OP_PLUS,
MP_TOKEN_OP_MINUS, MP_TOKEN_OP_MINUS,
MP_TOKEN_OP_STAR, MP_TOKEN_OP_STAR,
MP_TOKEN_OP_DBL_STAR, MP_TOKEN_OP_DBL_STAR,
@ -106,7 +107,7 @@ typedef enum _mp_token_kind_t {
MP_TOKEN_OP_LESS, MP_TOKEN_OP_LESS,
MP_TOKEN_OP_DBL_LESS, MP_TOKEN_OP_DBL_LESS,
MP_TOKEN_OP_MORE, MP_TOKEN_OP_MORE,
MP_TOKEN_OP_DBL_MORE, // 57 MP_TOKEN_OP_DBL_MORE,
MP_TOKEN_OP_AMPERSAND, MP_TOKEN_OP_AMPERSAND,
MP_TOKEN_OP_PIPE, MP_TOKEN_OP_PIPE,
MP_TOKEN_OP_CARET, MP_TOKEN_OP_CARET,
@ -116,7 +117,7 @@ typedef enum _mp_token_kind_t {
MP_TOKEN_OP_DBL_EQUAL, MP_TOKEN_OP_DBL_EQUAL,
MP_TOKEN_OP_NOT_EQUAL, MP_TOKEN_OP_NOT_EQUAL,
MP_TOKEN_DEL_PAREN_OPEN, // 66 MP_TOKEN_DEL_PAREN_OPEN,
MP_TOKEN_DEL_PAREN_CLOSE, MP_TOKEN_DEL_PAREN_CLOSE,
MP_TOKEN_DEL_BRACKET_OPEN, MP_TOKEN_DEL_BRACKET_OPEN,
MP_TOKEN_DEL_BRACKET_CLOSE, MP_TOKEN_DEL_BRACKET_CLOSE,
@ -126,7 +127,7 @@ typedef enum _mp_token_kind_t {
MP_TOKEN_DEL_COLON, MP_TOKEN_DEL_COLON,
MP_TOKEN_DEL_PERIOD, MP_TOKEN_DEL_PERIOD,
MP_TOKEN_DEL_SEMICOLON, MP_TOKEN_DEL_SEMICOLON,
MP_TOKEN_DEL_AT, // 76 MP_TOKEN_DEL_AT,
MP_TOKEN_DEL_EQUAL, MP_TOKEN_DEL_EQUAL,
MP_TOKEN_DEL_PLUS_EQUAL, MP_TOKEN_DEL_PLUS_EQUAL,
MP_TOKEN_DEL_MINUS_EQUAL, MP_TOKEN_DEL_MINUS_EQUAL,
@ -136,7 +137,7 @@ typedef enum _mp_token_kind_t {
MP_TOKEN_DEL_PERCENT_EQUAL, MP_TOKEN_DEL_PERCENT_EQUAL,
MP_TOKEN_DEL_AMPERSAND_EQUAL, MP_TOKEN_DEL_AMPERSAND_EQUAL,
MP_TOKEN_DEL_PIPE_EQUAL, MP_TOKEN_DEL_PIPE_EQUAL,
MP_TOKEN_DEL_CARET_EQUAL, // 86 MP_TOKEN_DEL_CARET_EQUAL,
MP_TOKEN_DEL_DBL_MORE_EQUAL, MP_TOKEN_DEL_DBL_MORE_EQUAL,
MP_TOKEN_DEL_DBL_LESS_EQUAL, MP_TOKEN_DEL_DBL_LESS_EQUAL,
MP_TOKEN_DEL_DBL_STAR_EQUAL, MP_TOKEN_DEL_DBL_STAR_EQUAL,