diff --git a/py/compile.c b/py/compile.c index 0c43d939e1..f925c8c1ff 100644 --- a/py/compile.c +++ b/py/compile.c @@ -56,6 +56,7 @@ typedef enum { #include "grammar.h" #undef DEF_RULE PN_maximum_number_of, + PN_string, // special node for non-interned string } pn_kind_t; #define EMIT(fun) (comp->emit_method_table->fun(comp->emit)) @@ -177,6 +178,8 @@ STATIC mp_parse_node_t fold_constants(compiler_t *comp, mp_parse_node_t pn, mp_m } break; #endif + case PN_string: + return pn; } // fold arguments @@ -426,6 +429,9 @@ void compile_generic_all_nodes(compiler_t *comp, mp_parse_node_struct_t *pns) { #if MICROPY_EMIT_CPYTHON STATIC bool cpython_c_tuple_is_const(mp_parse_node_t pn) { + if (MP_PARSE_NODE_IS_STRUCT_KIND(pn, PN_string)) { + return true; + } if (!MP_PARSE_NODE_IS_LEAF(pn)) { return false; } @@ -435,9 +441,7 @@ STATIC bool cpython_c_tuple_is_const(mp_parse_node_t pn) { return true; } -STATIC void cpython_c_print_quoted_str(vstr_t *vstr, qstr qstr, bool bytes) { - uint len; - const byte *str = qstr_data(qstr, &len); +STATIC void cpython_c_print_quoted_str(vstr_t *vstr, const char *str, uint len, bool bytes) { bool has_single_quote = false; bool has_double_quote = false; for (int i = 0; i < len; i++) { @@ -476,6 +480,12 @@ STATIC void cpython_c_print_quoted_str(vstr_t *vstr, qstr qstr, bool bytes) { } STATIC void cpython_c_tuple_emit_const(compiler_t *comp, mp_parse_node_t pn, vstr_t *vstr) { + if (MP_PARSE_NODE_IS_STRUCT_KIND(pn, PN_string)) { + mp_parse_node_struct_t *pns = (mp_parse_node_struct_t*)pn; + cpython_c_print_quoted_str(vstr, (const char*)pns->nodes[0], (machine_uint_t)pns->nodes[1], false); + return; + } + assert(MP_PARSE_NODE_IS_LEAF(pn)); if (MP_PARSE_NODE_IS_SMALL_INT(pn)) { vstr_printf(vstr, INT_FMT, MP_PARSE_NODE_LEAF_SMALL_INT(pn)); @@ -487,8 +497,13 @@ STATIC void cpython_c_tuple_emit_const(compiler_t *comp, mp_parse_node_t pn, vst case MP_PARSE_NODE_ID: assert(0); case MP_PARSE_NODE_INTEGER: vstr_printf(vstr, "%s", qstr_str(arg)); break; case MP_PARSE_NODE_DECIMAL: vstr_printf(vstr, "%s", qstr_str(arg)); break; - case MP_PARSE_NODE_STRING: cpython_c_print_quoted_str(vstr, arg, false); break; - case MP_PARSE_NODE_BYTES: cpython_c_print_quoted_str(vstr, arg, true); break; + case MP_PARSE_NODE_STRING: + case MP_PARSE_NODE_BYTES: { + uint len; + const byte *str = qstr_data(arg, &len); + cpython_c_print_quoted_str(vstr, (const char*)str, len, MP_PARSE_NODE_LEAF_KIND(pn) == MP_PARSE_NODE_BYTES); + break; + } case MP_PARSE_NODE_TOKEN: switch (arg) { case MP_TOKEN_KW_FALSE: vstr_printf(vstr, "False"); break; @@ -2058,7 +2073,8 @@ void compile_expr_stmt(compiler_t *comp, mp_parse_node_struct_t *pns) { } else { // for non-REPL, evaluate then discard the expression - if (MP_PARSE_NODE_IS_LEAF(pns->nodes[0]) && !MP_PARSE_NODE_IS_ID(pns->nodes[0])) { + if ((MP_PARSE_NODE_IS_LEAF(pns->nodes[0]) && !MP_PARSE_NODE_IS_ID(pns->nodes[0])) + || MP_PARSE_NODE_IS_STRUCT_KIND(pns->nodes[0], PN_string)) { // do nothing with a lonely constant } else { compile_node(comp, pns->nodes[0]); // just an expression @@ -2498,26 +2514,40 @@ void compile_atom_string(compiler_t *comp, mp_parse_node_struct_t *pns) { int n_bytes = 0; int string_kind = MP_PARSE_NODE_NULL; for (int i = 0; i < n; i++) { - assert(MP_PARSE_NODE_IS_LEAF(pns->nodes[i])); - int pn_kind = MP_PARSE_NODE_LEAF_KIND(pns->nodes[i]); - assert(pn_kind == MP_PARSE_NODE_STRING || pn_kind == MP_PARSE_NODE_BYTES); + int pn_kind; + if (MP_PARSE_NODE_IS_LEAF(pns->nodes[i])) { + pn_kind = MP_PARSE_NODE_LEAF_KIND(pns->nodes[i]); + assert(pn_kind == MP_PARSE_NODE_STRING || pn_kind == MP_PARSE_NODE_BYTES); + n_bytes += qstr_len(MP_PARSE_NODE_LEAF_ARG(pns->nodes[i])); + } else { + assert(MP_PARSE_NODE_IS_STRUCT(pns->nodes[i])); + mp_parse_node_struct_t *pns_string = (mp_parse_node_struct_t*)pns->nodes[i]; + assert(MP_PARSE_NODE_STRUCT_KIND(pns_string) == PN_string); + pn_kind = MP_PARSE_NODE_STRING; + n_bytes += (machine_uint_t)pns_string->nodes[1]; + } if (i == 0) { string_kind = pn_kind; } else if (pn_kind != string_kind) { compile_syntax_error(comp, (mp_parse_node_t)pns, "cannot mix bytes and nonbytes literals"); return; } - n_bytes += qstr_len(MP_PARSE_NODE_LEAF_ARG(pns->nodes[i])); } // concatenate string/bytes byte *q_ptr; byte *s_dest = qstr_build_start(n_bytes, &q_ptr); for (int i = 0; i < n; i++) { - uint s_len; - const byte *s = qstr_data(MP_PARSE_NODE_LEAF_ARG(pns->nodes[i]), &s_len); - memcpy(s_dest, s, s_len); - s_dest += s_len; + if (MP_PARSE_NODE_IS_LEAF(pns->nodes[i])) { + uint s_len; + const byte *s = qstr_data(MP_PARSE_NODE_LEAF_ARG(pns->nodes[i]), &s_len); + memcpy(s_dest, s, s_len); + s_dest += s_len; + } else { + mp_parse_node_struct_t *pns_string = (mp_parse_node_struct_t*)pns->nodes[i]; + memcpy(s_dest, (const char*)pns_string->nodes[0], (machine_uint_t)pns_string->nodes[1]); + s_dest += (machine_uint_t)pns_string->nodes[1]; + } } qstr q = qstr_build_end(q_ptr); @@ -2848,15 +2878,19 @@ void compile_node(compiler_t *comp, mp_parse_node_t pn) { } else { mp_parse_node_struct_t *pns = (mp_parse_node_struct_t*)pn; EMIT_ARG(set_line_number, pns->source_line); - compile_function_t f = compile_function[MP_PARSE_NODE_STRUCT_KIND(pns)]; - if (f == NULL) { - printf("node %u cannot be compiled\n", (uint)MP_PARSE_NODE_STRUCT_KIND(pns)); -#if MICROPY_DEBUG_PRINTERS - mp_parse_node_print(pn, 0); -#endif - compile_syntax_error(comp, pn, "internal compiler error"); + if (MP_PARSE_NODE_STRUCT_KIND(pns) == PN_string) { + EMIT_ARG(load_const_str, qstr_from_strn((const char*)pns->nodes[0], (machine_uint_t)pns->nodes[1]), false); } else { - f(comp, pns); + compile_function_t f = compile_function[MP_PARSE_NODE_STRUCT_KIND(pns)]; + if (f == NULL) { + printf("node %u cannot be compiled\n", (uint)MP_PARSE_NODE_STRUCT_KIND(pns)); +#if MICROPY_DEBUG_PRINTERS + mp_parse_node_print(pn, 0); +#endif + compile_syntax_error(comp, pn, "internal compiler error"); + } else { + f(comp, pns); + } } } } @@ -3033,13 +3067,13 @@ STATIC void check_for_doc_string(compiler_t *comp, mp_parse_node_t pn) { // check the first statement for a doc string if (MP_PARSE_NODE_IS_STRUCT_KIND(pn, PN_expr_stmt)) { mp_parse_node_struct_t* pns = (mp_parse_node_struct_t*)pn; - if (MP_PARSE_NODE_IS_LEAF(pns->nodes[0])) { - int kind = MP_PARSE_NODE_LEAF_KIND(pns->nodes[0]); - if (kind == MP_PARSE_NODE_STRING) { - compile_node(comp, pns->nodes[0]); // a doc string - // store doc string + if ((MP_PARSE_NODE_IS_LEAF(pns->nodes[0]) + && MP_PARSE_NODE_LEAF_KIND(pns->nodes[0]) == MP_PARSE_NODE_STRING) + || MP_PARSE_NODE_IS_STRUCT_KIND(pns->nodes[0], PN_string)) { + // compile the doc string + compile_node(comp, pns->nodes[0]); + // store the doc string EMIT_ARG(store_id, MP_QSTR___doc__); - } } } #endif diff --git a/py/mpconfig.h b/py/mpconfig.h index 853d475c3e..1af27f7679 100644 --- a/py/mpconfig.h +++ b/py/mpconfig.h @@ -66,6 +66,11 @@ #define MICROPY_ALLOC_PARSE_RESULT_INC (16) #endif +// Strings this length or less will be interned by the parser +#ifndef MICROPY_ALLOC_PARSE_INTERN_STRING_LEN +#define MICROPY_ALLOC_PARSE_INTERN_STRING_LEN (10) +#endif + // Initial amount for ids in a scope #ifndef MICROPY_ALLOC_SCOPE_ID_INIT #define MICROPY_ALLOC_SCOPE_ID_INIT (4) diff --git a/py/parse.c b/py/parse.c index 4bc78f14a9..b255ebd35c 100644 --- a/py/parse.c +++ b/py/parse.c @@ -28,6 +28,7 @@ #include #include #include +#include #include "misc.h" #include "mpconfig.h" @@ -70,6 +71,7 @@ enum { #include "grammar.h" #undef DEF_RULE RULE_maximum_number_of, + RULE_string, // special node for non-interned string }; #define or(n) (RULE_ACT_OR | n) @@ -170,26 +172,26 @@ mp_parse_node_t mp_parse_node_new_leaf(machine_int_t kind, machine_int_t arg) { return (mp_parse_node_t)(kind | (arg << 5)); } -uint mp_parse_node_free(mp_parse_node_t pn) { - uint cnt = 0; +void mp_parse_node_free(mp_parse_node_t pn) { if (MP_PARSE_NODE_IS_STRUCT(pn)) { mp_parse_node_struct_t *pns = (mp_parse_node_struct_t *)pn; uint n = MP_PARSE_NODE_STRUCT_NUM_NODES(pns); uint rule_id = MP_PARSE_NODE_STRUCT_KIND(pns); + if (rule_id == RULE_string) { + return; + } bool adjust = ADD_BLANK_NODE(rule_id); if (adjust) { n--; } for (uint i = 0; i < n; i++) { - cnt += mp_parse_node_free(pns->nodes[i]); + mp_parse_node_free(pns->nodes[i]); } if (adjust) { n++; } m_del_var(mp_parse_node_struct_t, mp_parse_node_t, n, pns); - cnt++; } - return cnt; } #if MICROPY_DEBUG_PRINTERS @@ -219,15 +221,20 @@ void mp_parse_node_print(mp_parse_node_t pn, int indent) { default: assert(0); } } else { + // node must be a mp_parse_node_struct_t mp_parse_node_struct_t *pns = (mp_parse_node_struct_t*)pn; - uint n = MP_PARSE_NODE_STRUCT_NUM_NODES(pns); + if (MP_PARSE_NODE_STRUCT_KIND(pns) == RULE_string) { + printf("literal str(%.*s)\n", (int)pns->nodes[1], (char*)pns->nodes[0]); + } else { + uint n = MP_PARSE_NODE_STRUCT_NUM_NODES(pns); #ifdef USE_RULE_NAME - printf("%s(%d) (n=%d)\n", rules[MP_PARSE_NODE_STRUCT_KIND(pns)]->rule_name, MP_PARSE_NODE_STRUCT_KIND(pns), n); + printf("%s(%d) (n=%d)\n", rules[MP_PARSE_NODE_STRUCT_KIND(pns)]->rule_name, MP_PARSE_NODE_STRUCT_KIND(pns), n); #else - printf("rule(%u) (n=%d)\n", (uint)MP_PARSE_NODE_STRUCT_KIND(pns), n); + printf("rule(%u) (n=%d)\n", (uint)MP_PARSE_NODE_STRUCT_KIND(pns), n); #endif - for (uint i = 0; i < n; i++) { - mp_parse_node_print(pns->nodes[i], indent + 2); + for (uint i = 0; i < n; i++) { + mp_parse_node_print(pns->nodes[i], indent + 2); + } } } } @@ -274,6 +281,21 @@ STATIC void push_result_node(parser_t *parser, mp_parse_node_t pn) { parser->result_stack[parser->result_stack_top++] = pn; } +STATIC void push_result_string(parser_t *parser, int src_line, const char *str, uint len) { + mp_parse_node_struct_t *pn = m_new_obj_var_maybe(mp_parse_node_struct_t, mp_parse_node_t, 2); + if (pn == NULL) { + memory_error(parser); + return; + } + pn->source_line = src_line; + pn->kind_num_nodes = RULE_string | (2 << 8); + char *p = m_new(char, len); + memcpy(p, str, len); + pn->nodes[0] = (machine_int_t)p; + pn->nodes[1] = len; + push_result_node(parser, (mp_parse_node_t)pn); +} + STATIC void push_result_token(parser_t *parser, const mp_lexer_t *lex) { const mp_token_t *tok = mp_lexer_cur(lex); mp_parse_node_t pn; @@ -319,7 +341,24 @@ STATIC void push_result_token(parser_t *parser, const mp_lexer_t *lex) { pn = mp_parse_node_new_leaf(MP_PARSE_NODE_INTEGER, qstr_from_strn(str, len)); } } else if (tok->kind == MP_TOKEN_STRING) { - pn = mp_parse_node_new_leaf(MP_PARSE_NODE_STRING, qstr_from_strn(tok->str, tok->len)); + // Don't automatically intern all strings. doc strings (which are usually large) + // will be discarded by the compiler, and so we shouldn't intern them. + qstr qst = MP_QSTR_NULL; + if (tok->len <= MICROPY_ALLOC_PARSE_INTERN_STRING_LEN) { + // intern short strings + qst = qstr_from_strn(tok->str, tok->len); + } else { + // check if this string is already interned + qst = qstr_find_strn((const byte*)tok->str, tok->len); + } + if (qst != MP_QSTR_NULL) { + // qstr exists, make a leaf node + pn = mp_parse_node_new_leaf(MP_PARSE_NODE_STRING, qst); + } else { + // not interned, make a node holding a pointer to the string data + push_result_string(parser, mp_lexer_cur(lex)->src_line, tok->str, tok->len); + return; + } } else if (tok->kind == MP_TOKEN_BYTES) { pn = mp_parse_node_new_leaf(MP_PARSE_NODE_BYTES, qstr_from_strn(tok->str, tok->len)); } else { @@ -516,14 +555,13 @@ mp_parse_node_t mp_parse(mp_lexer_t *lex, mp_parse_input_kind_t input_kind, mp_p } } -#if 0 && !MICROPY_ENABLE_DOC_STRING - // this code discards lonely statement, such as doc strings - // problem is that doc strings have already been interned, so this doesn't really help reduce RAM usage +#if !MICROPY_EMIT_CPYTHON && !MICROPY_ENABLE_DOC_STRING + // this code discards lonely statements, such as doc strings if (input_kind != MP_PARSE_SINGLE_INPUT && rule->rule_id == RULE_expr_stmt && peek_result(&parser, 0) == MP_PARSE_NODE_NULL) { mp_parse_node_t p = peek_result(&parser, 1); - if (MP_PARSE_NODE_IS_LEAF(p) && !MP_PARSE_NODE_IS_ID(p)) { - pop_result(parser); - pop_result(parser); + if ((MP_PARSE_NODE_IS_LEAF(p) && !MP_PARSE_NODE_IS_ID(p)) || MP_PARSE_NODE_IS_STRUCT_KIND(p, RULE_string)) { + pop_result(&parser); + pop_result(&parser); push_result_rule(&parser, rule_src_line, rules[RULE_pass_stmt], 0); break; } diff --git a/py/parse.h b/py/parse.h index 41102e1600..43c74e73b8 100644 --- a/py/parse.h +++ b/py/parse.h @@ -82,7 +82,7 @@ typedef struct _mp_parse_node_struct_t { #define MP_PARSE_NODE_STRUCT_NUM_NODES(pns) ((pns)->kind_num_nodes >> 8) mp_parse_node_t mp_parse_node_new_leaf(machine_int_t kind, machine_int_t arg); -uint mp_parse_node_free(mp_parse_node_t pn); +void mp_parse_node_free(mp_parse_node_t pn); void mp_parse_node_print(mp_parse_node_t pn, int indent);