py: Do adjacent str/bytes literal concatenation in lexer, not compiler.

It's much more efficient in RAM and code size to do implicit literal string
concatenation in the lexer, as opposed to the compiler.

RAM usage is reduced because the concatenation can be done right away in the
tokeniser by just accumulating the string/bytes literals into the lexer's
vstr.  Prior to this patch adjacent strings/bytes would create a parse tree
(one node per string/bytes) and then in the compiler a whole new chunk of
memory was allocated to store the concatenated string, which used more than
double the memory compared to just accumulating in the lexer.

This patch also significantly reduces code size:

bare-arm: -204
minimal:  -204
unix x64: -328
stmhal:   -208
esp8266:  -284
cc3200:   -224
This commit is contained in:
Damien George 2017-02-17 12:12:40 +11:00
parent 773278ec30
commit 534b7c368d
3 changed files with 203 additions and 222 deletions

View File

@@ -2301,65 +2301,6 @@ STATIC void compile_atom_expr_trailers(compiler_t *comp, mp_parse_node_struct_t
}
}
STATIC void compile_atom_string(compiler_t *comp, mp_parse_node_struct_t *pns) {
    // This parse node is a sequence of adjacent string/bytes literals that
    // must be implicitly concatenated into a single constant object.

    // First pass over the nodes: verify they are all the same kind (all str
    // or all bytes) and sum the total byte length of the result.
    int num_nodes = MP_PARSE_NODE_STRUCT_NUM_NODES(pns);
    size_t total_len = 0;
    int list_kind = MP_PARSE_NODE_NULL;
    for (int idx = 0; idx < num_nodes; idx++) {
        int node_kind;
        if (MP_PARSE_NODE_IS_LEAF(pns->nodes[idx])) {
            // interned literal: kind and length come from the qstr
            node_kind = MP_PARSE_NODE_LEAF_KIND(pns->nodes[idx]);
            assert(node_kind == MP_PARSE_NODE_STRING || node_kind == MP_PARSE_NODE_BYTES);
            total_len += qstr_len(MP_PARSE_NODE_LEAF_ARG(pns->nodes[idx]));
        } else {
            // non-interned literal: data pointer in nodes[0], length in nodes[1]
            assert(MP_PARSE_NODE_IS_STRUCT(pns->nodes[idx]));
            mp_parse_node_struct_t *str_pns = (mp_parse_node_struct_t*)pns->nodes[idx];
            if (MP_PARSE_NODE_STRUCT_KIND(str_pns) == PN_string) {
                node_kind = MP_PARSE_NODE_STRING;
            } else {
                assert(MP_PARSE_NODE_STRUCT_KIND(str_pns) == PN_bytes);
                node_kind = MP_PARSE_NODE_BYTES;
            }
            total_len += str_pns->nodes[1];
        }
        if (idx == 0) {
            list_kind = node_kind;
        } else if (node_kind != list_kind) {
            // mixing str and bytes in an implicit concatenation is an error
            compile_syntax_error(comp, (mp_parse_node_t)pns, "cannot mix bytes and nonbytes literals");
            return;
        }
    }

    // Before the final (emit) pass the real object isn't needed, so just
    // load a placeholder constant.
    if (comp->pass != MP_PASS_EMIT) {
        EMIT_ARG(load_const_obj, mp_const_none);
        return;
    }

    // Second pass: copy each literal's bytes into one contiguous buffer.
    vstr_t vstr;
    vstr_init_len(&vstr, total_len);
    byte *dst = (byte*)vstr.buf;
    for (int idx = 0; idx < num_nodes; idx++) {
        if (MP_PARSE_NODE_IS_LEAF(pns->nodes[idx])) {
            size_t len;
            const byte *data = qstr_data(MP_PARSE_NODE_LEAF_ARG(pns->nodes[idx]), &len);
            memcpy(dst, data, len);
            dst += len;
        } else {
            mp_parse_node_struct_t *str_pns = (mp_parse_node_struct_t*)pns->nodes[idx];
            memcpy(dst, (const char*)str_pns->nodes[0], str_pns->nodes[1]);
            dst += str_pns->nodes[1];
        }
    }

    // Emit the concatenated constant as a str or bytes object.
    EMIT_ARG(load_const_obj, mp_obj_new_str_from_vstr(
        list_kind == MP_PARSE_NODE_STRING ? &mp_type_str : &mp_type_bytes, &vstr));
}
// pns needs to have 2 nodes, first is lhs of comprehension, second is PN_comp_for node
STATIC void compile_comprehension(compiler_t *comp, mp_parse_node_struct_t *pns, scope_kind_t kind) {
assert(MP_PARSE_NODE_STRUCT_NUM_NODES(pns) == 2);

View File

@@ -268,8 +268,7 @@ DEF_RULE_NC(power_dbl_star, and_ident(2), tok(OP_DBL_STAR), rule(factor))
// testlist_comp: (test|star_expr) ( comp_for | (',' (test|star_expr))* [','] )
// trailer: '(' [arglist] ')' | '[' subscriptlist ']' | '.' NAME
DEF_RULE_NC(atom, or(11), tok(NAME), tok(INTEGER), tok(FLOAT_OR_IMAG), rule(atom_string), tok(ELLIPSIS), tok(KW_NONE), tok(KW_TRUE), tok(KW_FALSE), rule(atom_paren), rule(atom_bracket), rule(atom_brace))
DEF_RULE(atom_string, c(atom_string), one_or_more, rule(string_or_bytes))
DEF_RULE_NC(atom, or(12), tok(NAME), tok(INTEGER), tok(FLOAT_OR_IMAG), tok(STRING), tok(BYTES), tok(ELLIPSIS), tok(KW_NONE), tok(KW_TRUE), tok(KW_FALSE), rule(atom_paren), rule(atom_bracket), rule(atom_brace))
DEF_RULE_NC(string_or_bytes, or(2), tok(STRING), tok(BYTES))
DEF_RULE(atom_paren, c(atom_paren), and(3), tok(DEL_PAREN_OPEN), opt_rule(atom_2b), tok(DEL_PAREN_CLOSE))
DEF_RULE_NC(atom_2b, or(2), rule(yield_expr), rule(testlist_comp))

View File

@@ -63,11 +63,9 @@ STATIC bool is_char_or3(mp_lexer_t *lex, byte c1, byte c2, byte c3) {
return lex->chr0 == c1 || lex->chr0 == c2 || lex->chr0 == c3;
}
/*
STATIC bool is_char_following(mp_lexer_t *lex, byte c) {
return lex->chr1 == c;
}
*/
STATIC bool is_char_following_or(mp_lexer_t *lex, byte c1, byte c2) {
return lex->chr1 == c1 || lex->chr1 == c2;
@@ -106,6 +104,13 @@ STATIC bool is_following_odigit(mp_lexer_t *lex) {
return lex->chr1 >= '0' && lex->chr1 <= '7';
}
STATIC bool is_string_or_bytes(mp_lexer_t *lex) {
    // True when the upcoming characters start a string/bytes literal: a bare
    // quote, a one-char prefix (r/u/b) followed by a quote, or a two-char
    // prefix (rb/br) followed by a quote.
    if (is_char_or(lex, '\'', '\"')) {
        return true;
    }
    if (is_char_or3(lex, 'r', 'u', 'b') && is_char_following_or(lex, '\'', '\"')) {
        return true;
    }
    return (is_char_and(lex, 'r', 'b') || is_char_and(lex, 'b', 'r'))
        && is_char_following_following_or(lex, '\'', '\"');
}
// to easily parse utf-8 identifiers we allow any raw byte with high bit set
STATIC bool is_head_of_identifier(mp_lexer_t *lex) {
return is_letter(lex) || lex->chr0 == '_' || lex->chr0 >= 0x80;
@@ -272,99 +277,7 @@ STATIC bool get_hex(mp_lexer_t *lex, mp_uint_t num_digits, mp_uint_t *result) {
return true;
}
void mp_lexer_to_next(mp_lexer_t *lex) {
// start new token text
vstr_reset(&lex->vstr);
// skip white space and comments
bool had_physical_newline = false;
while (!is_end(lex)) {
if (is_physical_newline(lex)) {
had_physical_newline = true;
next_char(lex);
} else if (is_whitespace(lex)) {
next_char(lex);
} else if (is_char(lex, '#')) {
next_char(lex);
while (!is_end(lex) && !is_physical_newline(lex)) {
next_char(lex);
}
// had_physical_newline will be set on next loop
} else if (is_char_and(lex, '\\', '\n')) {
// line-continuation, so don't set had_physical_newline
next_char(lex);
next_char(lex);
} else {
break;
}
}
// set token source information
lex->tok_line = lex->line;
lex->tok_column = lex->column;
if (lex->emit_dent < 0) {
lex->tok_kind = MP_TOKEN_DEDENT;
lex->emit_dent += 1;
} else if (lex->emit_dent > 0) {
lex->tok_kind = MP_TOKEN_INDENT;
lex->emit_dent -= 1;
} else if (had_physical_newline && lex->nested_bracket_level == 0) {
lex->tok_kind = MP_TOKEN_NEWLINE;
mp_uint_t num_spaces = lex->column - 1;
if (num_spaces == indent_top(lex)) {
} else if (num_spaces > indent_top(lex)) {
indent_push(lex, num_spaces);
lex->emit_dent += 1;
} else {
while (num_spaces < indent_top(lex)) {
indent_pop(lex);
lex->emit_dent -= 1;
}
if (num_spaces != indent_top(lex)) {
lex->tok_kind = MP_TOKEN_DEDENT_MISMATCH;
}
}
} else if (is_end(lex)) {
lex->tok_kind = MP_TOKEN_END;
} else if (is_char_or(lex, '\'', '\"')
|| (is_char_or3(lex, 'r', 'u', 'b') && is_char_following_or(lex, '\'', '\"'))
|| ((is_char_and(lex, 'r', 'b') || is_char_and(lex, 'b', 'r')) && is_char_following_following_or(lex, '\'', '\"'))) {
// a string or bytes literal
// parse type codes
bool is_raw = false;
bool is_bytes = false;
if (is_char(lex, 'u')) {
next_char(lex);
} else if (is_char(lex, 'b')) {
is_bytes = true;
next_char(lex);
if (is_char(lex, 'r')) {
is_raw = true;
next_char(lex);
}
} else if (is_char(lex, 'r')) {
is_raw = true;
next_char(lex);
if (is_char(lex, 'b')) {
is_bytes = true;
next_char(lex);
}
}
// set token kind
if (is_bytes) {
lex->tok_kind = MP_TOKEN_BYTES;
} else {
lex->tok_kind = MP_TOKEN_STRING;
}
STATIC void parse_string_literal(mp_lexer_t *lex, bool is_raw) {
// get first quoting character
char quote_char = '\'';
if (is_char(lex, '\"')) {
@@ -373,7 +286,7 @@ void mp_lexer_to_next(mp_lexer_t *lex) {
next_char(lex);
// work out if it's a single or triple quoted literal
mp_uint_t num_quotes;
size_t num_quotes;
if (is_char_and(lex, quote_char, quote_char)) {
// triple quotes
next_char(lex);
@@ -384,8 +297,7 @@ void mp_lexer_to_next(mp_lexer_t *lex) {
num_quotes = 1;
}
// parse the literal
mp_uint_t n_closing = 0;
size_t n_closing = 0;
while (!is_end(lex) && (num_quotes > 1 || !is_char(lex, '\n')) && n_closing < num_quotes) {
if (is_char(lex, quote_char)) {
n_closing += 1;
@@ -415,7 +327,7 @@ void mp_lexer_to_next(mp_lexer_t *lex) {
case 'r': c = 0x0d; break;
case 'u':
case 'U':
if (is_bytes) {
if (lex->tok_kind == MP_TOKEN_BYTES) {
// b'\u1234' == b'\\u1234'
vstr_add_char(&lex->vstr, '\\');
break;
@@ -458,9 +370,9 @@ void mp_lexer_to_next(mp_lexer_t *lex) {
}
if (c != MP_LEXER_EOF) {
if (MICROPY_PY_BUILTINS_STR_UNICODE_DYNAMIC) {
if (c < 0x110000 && !is_bytes) {
if (c < 0x110000 && lex->tok_kind == MP_TOKEN_STRING) {
vstr_add_char(&lex->vstr, c);
} else if (c < 0x100 && is_bytes) {
} else if (c < 0x100 && lex->tok_kind == MP_TOKEN_BYTES) {
vstr_add_byte(&lex->vstr, c);
} else {
// unicode character out of range
@@ -494,6 +406,135 @@ void mp_lexer_to_next(mp_lexer_t *lex) {
// cut off the end quotes from the token text
vstr_cut_tail_bytes(&lex->vstr, n_closing);
}
STATIC bool skip_whitespace(mp_lexer_t *lex, bool stop_at_newline) {
    // Consume whitespace, comments and line continuations.  Returns true if
    // at least one physical newline was consumed (unless stop_at_newline
    // requested halting at a top-level newline instead of consuming it).
    bool saw_newline = false;
    while (!is_end(lex)) {
        if (is_physical_newline(lex)) {
            if (stop_at_newline && lex->nested_bracket_level == 0) {
                break;
            }
            saw_newline = true;
            next_char(lex);
            continue;
        }
        if (is_whitespace(lex)) {
            next_char(lex);
            continue;
        }
        if (is_char(lex, '#')) {
            // comment runs to the end of the line
            next_char(lex);
            while (!is_end(lex) && !is_physical_newline(lex)) {
                next_char(lex);
            }
            // the newline itself is handled on the next iteration
            continue;
        }
        if (is_char_and(lex, '\\', '\n')) {
            // explicit line continuation: consume both characters, but do
            // not count it as a physical newline
            next_char(lex);
            next_char(lex);
            continue;
        }
        // anything else starts a real token
        break;
    }
    return saw_newline;
}
void mp_lexer_to_next(mp_lexer_t *lex) {
// start new token text
vstr_reset(&lex->vstr);
// skip white space and comments
bool had_physical_newline = skip_whitespace(lex, false);
// set token source information
lex->tok_line = lex->line;
lex->tok_column = lex->column;
if (lex->emit_dent < 0) {
lex->tok_kind = MP_TOKEN_DEDENT;
lex->emit_dent += 1;
} else if (lex->emit_dent > 0) {
lex->tok_kind = MP_TOKEN_INDENT;
lex->emit_dent -= 1;
} else if (had_physical_newline && lex->nested_bracket_level == 0) {
lex->tok_kind = MP_TOKEN_NEWLINE;
mp_uint_t num_spaces = lex->column - 1;
if (num_spaces == indent_top(lex)) {
} else if (num_spaces > indent_top(lex)) {
indent_push(lex, num_spaces);
lex->emit_dent += 1;
} else {
while (num_spaces < indent_top(lex)) {
indent_pop(lex);
lex->emit_dent -= 1;
}
if (num_spaces != indent_top(lex)) {
lex->tok_kind = MP_TOKEN_DEDENT_MISMATCH;
}
}
} else if (is_end(lex)) {
lex->tok_kind = MP_TOKEN_END;
} else if (is_string_or_bytes(lex)) {
// a string or bytes literal
// Python requires adjacent string/bytes literals to be automatically
// concatenated. We do it here in the tokeniser to make efficient use of RAM,
// because then the lexer's vstr can be used to accumulate the string literal,
// in contrast to creating a parse tree of strings and then joining them later
// in the compiler. It's also more compact in code size to do it here.
// MP_TOKEN_END is used to indicate that this is the first string token
lex->tok_kind = MP_TOKEN_END;
// Loop to accumulate string/bytes literals
do {
// parse type codes
bool is_raw = false;
mp_token_kind_t kind = MP_TOKEN_STRING;
int n_char = 0;
if (is_char(lex, 'u')) {
n_char = 1;
} else if (is_char(lex, 'b')) {
kind = MP_TOKEN_BYTES;
n_char = 1;
if (is_char_following(lex, 'r')) {
is_raw = true;
n_char = 2;
}
} else if (is_char(lex, 'r')) {
is_raw = true;
n_char = 1;
if (is_char_following(lex, 'b')) {
kind = MP_TOKEN_BYTES;
n_char = 2;
}
}
// Set or check token kind
if (lex->tok_kind == MP_TOKEN_END) {
lex->tok_kind = kind;
} else if (lex->tok_kind != kind) {
// Can't concatenate string with bytes
break;
}
// Skip any type code characters
if (n_char != 0) {
next_char(lex);
if (n_char == 2) {
next_char(lex);
}
}
// Parse the literal
parse_string_literal(lex, is_raw);
// Skip whitespace so we can check if there's another string following
skip_whitespace(lex, true);
} while (is_string_or_bytes(lex));
} else if (is_head_of_identifier(lex)) {
lex->tok_kind = MP_TOKEN_NAME;