From 5010d1958feee84fe49f887ff6e4b1d37d211152 Mon Sep 17 00:00:00 2001
From: Damien George <damien.p.george@gmail.com>
Date: Wed, 29 Mar 2017 10:55:36 +1100
Subject: [PATCH] py/lexer: Simplify and reduce code size for operator
 tokenising.

By removing the 'E' code from the operator token encoding mini-language the
tokenising can be simplified.  The 'E' code was only used for the !=
operator which is now handled as a special case; the optimisations for the
general case more than make up for the addition of this single, special
case.  Furthermore, the . and ... operators can be handled in the same way
as != which reduces the code size a little further.

This simplification also removes a "goto".

Changes in code size for this patch are (measured in bytes):

bare-arm:       -48
minimal x86:    -64
unix x86-64:   -112
unix nanbox:    -64
stmhal:         -48
cc3200:         -48
esp8266:        -76
---
 py/lexer.c | 82 +++++++++++++++++++-----------------------------------
 1 file changed, 28 insertions(+), 54 deletions(-)
diff --git a/py/lexer.c b/py/lexer.c
index a91f5c9c8b..05651abecf 100644
--- a/py/lexer.c
+++ b/py/lexer.c
@@ -176,7 +176,6 @@ STATIC void indent_pop(mp_lexer_t *lex) {
 // some tricky operator encoding:
 //     <op>  = begin with <op>, if this opchar matches then begin here
 //     e<op> = end with <op>, if this opchar matches then end
-//     E<op> = mandatory end with <op>, this opchar must match, then end
 //     c<op> = continue with <op>, if this opchar matches then continue matching
 // this means if the start of two ops are the same then they are equal til the last char
 
@@ -193,7 +192,7 @@ STATIC const char *const tok_enc =
     "%e="         // % %=
     "^e="         // ^ ^=
     "=e="         // = ==
-    "!E=";        // !=
+    "!.";         // start of special cases: != . ...
 
 // TODO static assert that number of tokens is less than 256 so we can safely make this table with byte sized entries
 STATIC const uint8_t tok_enc_kind[] = {
@@ -213,7 +212,6 @@ STATIC const uint8_t tok_enc_kind[] = {
     MP_TOKEN_OP_PERCENT, MP_TOKEN_DEL_PERCENT_EQUAL,
     MP_TOKEN_OP_CARET, MP_TOKEN_DEL_CARET_EQUAL,
     MP_TOKEN_DEL_EQUAL, MP_TOKEN_OP_DBL_EQUAL,
-    MP_TOKEN_OP_NOT_EQUAL,
 };
 
 // must have the same order as enum in lexer.h
@@ -603,20 +601,6 @@ void mp_lexer_to_next(mp_lexer_t *lex) {
             }
         }
 
-    } else if (is_char(lex, '.')) {
-        // special handling for . and ... operators, because .. is not a valid operator
-
-        // get first char
-        next_char(lex);
-
-        if (is_char_and(lex, '.', '.')) {
-            next_char(lex);
-            next_char(lex);
-            lex->tok_kind = MP_TOKEN_ELLIPSIS;
-        } else {
-            lex->tok_kind = MP_TOKEN_DEL_PERIOD;
-        }
-
     } else {
         // search for encoded delimiter or operator
 
@@ -625,9 +609,6 @@ void mp_lexer_to_next(mp_lexer_t *lex) {
         for (; *t != 0 && !is_char(lex, *t); t += 1) {
             if (*t == 'e' || *t == 'c') {
                 t += 1;
-            } else if (*t == 'E') {
-                tok_enc_index -= 1;
-                t += 1;
             }
             tok_enc_index += 1;
         }
@@ -638,55 +619,48 @@ void mp_lexer_to_next(mp_lexer_t *lex) {
             // didn't match any delimiter or operator characters
             lex->tok_kind = MP_TOKEN_INVALID;
 
+        } else if (*t == '!') {
+            // "!=" is a special case because "!" is not a valid operator
+            if (is_char(lex, '=')) {
+                next_char(lex);
+                lex->tok_kind = MP_TOKEN_OP_NOT_EQUAL;
+            } else {
+                lex->tok_kind = MP_TOKEN_INVALID;
+            }
+
+        } else if (*t == '.') {
+            // "." and "..." are special cases because ".." is not a valid operator
+            if (is_char_and(lex, '.', '.')) {
+                next_char(lex);
+                next_char(lex);
+                lex->tok_kind = MP_TOKEN_ELLIPSIS;
+            } else {
+                lex->tok_kind = MP_TOKEN_DEL_PERIOD;
+            }
+
         } else {
             // matched a delimiter or operator character
 
             // get the maximum characters for a valid token
             t += 1;
             size_t t_index = tok_enc_index;
-            for (;;) {
-                for (; *t == 'e'; t += 1) {
-                    t += 1;
-                    t_index += 1;
-                    if (is_char(lex, *t)) {
-                        next_char(lex);
-                        tok_enc_index = t_index;
+            while (*t == 'c' || *t == 'e') {
+                t_index += 1;
+                if (is_char(lex, t[1])) {
+                    next_char(lex);
+                    tok_enc_index = t_index;
+                    if (*t == 'e') {
                         break;
                     }
-                }
-
-                if (*t == 'E') {
-                    t += 1;
-                    if (is_char(lex, *t)) {
-                        next_char(lex);
-                        tok_enc_index = t_index;
-                    } else {
-                        lex->tok_kind = MP_TOKEN_INVALID;
-                        goto tok_enc_no_match;
-                    }
-                    break;
-                }
-
-                if (*t == 'c') {
-                    t += 1;
-                    t_index += 1;
-                    if (is_char(lex, *t)) {
-                        next_char(lex);
-                        tok_enc_index = t_index;
-                        t += 1;
-                    } else {
-                        break;
-                    }
-                } else {
+                } else if (*t == 'c') {
                     break;
                 }
+                t += 2;
             }
 
             // set token kind
             lex->tok_kind = tok_enc_kind[tok_enc_index];
 
-            tok_enc_no_match:
-
             // compute bracket level for implicit line joining
             if (lex->tok_kind == MP_TOKEN_DEL_PAREN_OPEN || lex->tok_kind == MP_TOKEN_DEL_BRACKET_OPEN || lex->tok_kind == MP_TOKEN_DEL_BRACE_OPEN) {
                 lex->nested_bracket_level += 1;