py/lexer: Use strcmp to make keyword searching more efficient.

Since the table of keywords is sorted, we can use strcmp to do the search and stop part way through the search if the comparison is less-than. Because all tokens that are names are subject to this search, this optimisation will improve the overall speed of the lexer when processing a script. The change also decreases code size by a little bit because we now use strcmp instead of the custom str_strn_equal function.
2017-02-17 11:10:35 +11:00 · 2017-02-17 11:10:35 +11:00 · ae43679792
commit ae43679792
parent a68c754688
2 changed files with 20 additions and 28 deletions
--- a/py/lexer.c
+++ b/py/lexer.c
@ -25,6 +25,7 @@
 */
 #include <stdio.h>
 #include <string.h>
 #include <assert.h>
 #include "py/mpstate.h"
@ -39,19 +40,6 @@
 // TODO: CPython seems to allow a NUL byte in the input stream; it is unclear
 // whether that is intentional, but we do not allow it.
 // Compare a NUL-terminated string against a length-delimited byte span.
 //
 //   str  - NUL-terminated string.
 //   strn - start of the span to compare against; need not be NUL-terminated.
 //   len  - number of bytes in the span starting at strn.
 //
 // Returns true only when str is exactly len bytes long and those bytes are
 // identical to the len bytes at strn.
 //
 // The length of str is checked first so that no byte beyond its terminator
 // is ever read, even when strn contains embedded NUL bytes.
 STATIC bool str_strn_equal(const char *str, const char *strn, mp_uint_t len) {
    return strlen(str) == (size_t)len && memcmp(str, strn, len) == 0;
 }
 #define MP_LEXER_EOF ((unichar)MP_READER_EOF)
 #define CUR_CHAR(lex) ((lex)->chr0)
@ -225,10 +213,12 @@ STATIC const uint8_t tok_enc_kind[] = {
 };
 // must have the same order as enum in lexer.h
 // must be sorted according to strcmp
 STATIC const char *const tok_kw[] = {
    "False",
    "None",
    "True",
    "__debug__",
    "and",
    "as",
    "assert",
@ -263,7 +253,6 @@ STATIC const char *const tok_kw[] = {
    "while",
    "with",
    "yield",
    "__debug__",
 };
 // This is called with CUR_CHAR() before first hex digit, and should return with
@ -531,16 +520,18 @@ void mp_lexer_to_next(mp_lexer_t *lex) {
        // We also check for __debug__ here and convert it to its value.  This is
        // so the parser gives a syntax error on, eg, x.__debug__.  Otherwise, we
        // need to check for this special token in many places in the compiler.
-        // TODO improve speed of these string comparisons
+        const char *s = vstr_null_terminated_str(&lex->vstr);
        for (size_t i = 0; i < MP_ARRAY_SIZE(tok_kw); i++) {
-            if (str_strn_equal(tok_kw[i], lex->vstr.buf, lex->vstr.len)) {
+            int cmp = strcmp(s, tok_kw[i]);
-                if (i == MP_ARRAY_SIZE(tok_kw) - 1) {
+            if (cmp == 0) {
                    // tok_kw[MP_ARRAY_SIZE(tok_kw) - 1] == "__debug__"
                    lex->tok_kind = (MP_STATE_VM(mp_optimise_value) == 0 ? MP_TOKEN_KW_TRUE : MP_TOKEN_KW_FALSE);
                } else {
                lex->tok_kind = MP_TOKEN_KW_FALSE + i;
                if (lex->tok_kind == MP_TOKEN_KW___DEBUG__) {
                    lex->tok_kind = (MP_STATE_VM(mp_optimise_value) == 0 ? MP_TOKEN_KW_TRUE : MP_TOKEN_KW_FALSE);
                }
                break;
            } else if (cmp < 0) {
                // Table is sorted and comparison was less-than, so stop searching
                break;
            }
        }
--- a/py/lexer.h
+++ b/py/lexer.h
@ -61,6 +61,7 @@ typedef enum _mp_token_kind_t {
    MP_TOKEN_KW_FALSE,              // 14
    MP_TOKEN_KW_NONE,
    MP_TOKEN_KW_TRUE,
    MP_TOKEN_KW___DEBUG__,
    MP_TOKEN_KW_AND,
    MP_TOKEN_KW_AS,
    MP_TOKEN_KW_ASSERT,
@ -71,7 +72,7 @@ typedef enum _mp_token_kind_t {
    MP_TOKEN_KW_BREAK,
    MP_TOKEN_KW_CLASS,
    MP_TOKEN_KW_CONTINUE,
-    MP_TOKEN_KW_DEF,                // 23
+    MP_TOKEN_KW_DEF,
    MP_TOKEN_KW_DEL,
    MP_TOKEN_KW_ELIF,
    MP_TOKEN_KW_ELSE,
@ -81,7 +82,7 @@ typedef enum _mp_token_kind_t {
    MP_TOKEN_KW_FROM,
    MP_TOKEN_KW_GLOBAL,
    MP_TOKEN_KW_IF,
-    MP_TOKEN_KW_IMPORT,             // 33
+    MP_TOKEN_KW_IMPORT,
    MP_TOKEN_KW_IN,
    MP_TOKEN_KW_IS,
    MP_TOKEN_KW_LAMBDA,
@ -91,12 +92,12 @@ typedef enum _mp_token_kind_t {
    MP_TOKEN_KW_PASS,
    MP_TOKEN_KW_RAISE,
    MP_TOKEN_KW_RETURN,
-    MP_TOKEN_KW_TRY,                // 43
+    MP_TOKEN_KW_TRY,
    MP_TOKEN_KW_WHILE,
    MP_TOKEN_KW_WITH,
    MP_TOKEN_KW_YIELD,
-    MP_TOKEN_OP_PLUS,               // 47
+    MP_TOKEN_OP_PLUS,
    MP_TOKEN_OP_MINUS,
    MP_TOKEN_OP_STAR,
    MP_TOKEN_OP_DBL_STAR,
@ -106,7 +107,7 @@ typedef enum _mp_token_kind_t {
    MP_TOKEN_OP_LESS,
    MP_TOKEN_OP_DBL_LESS,
    MP_TOKEN_OP_MORE,
-    MP_TOKEN_OP_DBL_MORE,           // 57
+    MP_TOKEN_OP_DBL_MORE,
    MP_TOKEN_OP_AMPERSAND,
    MP_TOKEN_OP_PIPE,
    MP_TOKEN_OP_CARET,
@ -116,7 +117,7 @@ typedef enum _mp_token_kind_t {
    MP_TOKEN_OP_DBL_EQUAL,
    MP_TOKEN_OP_NOT_EQUAL,
-    MP_TOKEN_DEL_PAREN_OPEN,        // 66
+    MP_TOKEN_DEL_PAREN_OPEN,
    MP_TOKEN_DEL_PAREN_CLOSE,
    MP_TOKEN_DEL_BRACKET_OPEN,
    MP_TOKEN_DEL_BRACKET_CLOSE,
@ -126,7 +127,7 @@ typedef enum _mp_token_kind_t {
    MP_TOKEN_DEL_COLON,
    MP_TOKEN_DEL_PERIOD,
    MP_TOKEN_DEL_SEMICOLON,
-    MP_TOKEN_DEL_AT,                // 76
+    MP_TOKEN_DEL_AT,
    MP_TOKEN_DEL_EQUAL,
    MP_TOKEN_DEL_PLUS_EQUAL,
    MP_TOKEN_DEL_MINUS_EQUAL,
@ -136,7 +137,7 @@ typedef enum _mp_token_kind_t {
    MP_TOKEN_DEL_PERCENT_EQUAL,
    MP_TOKEN_DEL_AMPERSAND_EQUAL,
    MP_TOKEN_DEL_PIPE_EQUAL,
-    MP_TOKEN_DEL_CARET_EQUAL,       // 86
+    MP_TOKEN_DEL_CARET_EQUAL,
    MP_TOKEN_DEL_DBL_MORE_EQUAL,
    MP_TOKEN_DEL_DBL_LESS_EQUAL,
    MP_TOKEN_DEL_DBL_STAR_EQUAL,