py: Never intern data of large string/bytes object; add relevant tests.
Previously to this patch all constant string/bytes objects were interned by the compiler, and this lead to crashes when the qstr was too long (noticeable now that qstr length storage defaults to 1 byte). With this patch, long string/bytes objects are never interned, and are referenced directly as constant objects within generated code using load_const_obj.
This commit is contained in:
parent
dab1385177
commit
4c81ba8015
30
py/compile.c
30
py/compile.c
|
@ -46,6 +46,7 @@ typedef enum {
|
|||
#undef DEF_RULE
|
||||
PN_maximum_number_of,
|
||||
PN_string, // special node for non-interned string
|
||||
PN_bytes, // special node for non-interned bytes
|
||||
} pn_kind_t;
|
||||
|
||||
#define EMIT(fun) (comp->emit_method_table->fun(comp->emit))
|
||||
|
@ -172,6 +173,7 @@ STATIC mp_parse_node_t fold_constants(compiler_t *comp, mp_parse_node_t pn, mp_m
|
|||
break;
|
||||
#endif
|
||||
case PN_string:
|
||||
case PN_bytes:
|
||||
return pn;
|
||||
}
|
||||
|
||||
|
@ -427,6 +429,9 @@ STATIC bool cpython_c_tuple_is_const(mp_parse_node_t pn) {
|
|||
if (MP_PARSE_NODE_IS_STRUCT_KIND(pn, PN_string)) {
|
||||
return true;
|
||||
}
|
||||
if (MP_PARSE_NODE_IS_STRUCT_KIND(pn, PN_bytes)) {
|
||||
return true;
|
||||
}
|
||||
if (!MP_PARSE_NODE_IS_LEAF(pn)) {
|
||||
return false;
|
||||
}
|
||||
|
@ -475,9 +480,9 @@ STATIC void cpython_c_print_quoted_str(vstr_t *vstr, const char *str, uint len,
|
|||
}
|
||||
|
||||
STATIC void cpython_c_tuple_emit_const(compiler_t *comp, mp_parse_node_t pn, vstr_t *vstr) {
|
||||
if (MP_PARSE_NODE_IS_STRUCT_KIND(pn, PN_string)) {
|
||||
if (MP_PARSE_NODE_IS_STRUCT_KIND(pn, PN_string) || MP_PARSE_NODE_IS_STRUCT_KIND(pn, PN_bytes)) {
|
||||
mp_parse_node_struct_t *pns = (mp_parse_node_struct_t*)pn;
|
||||
cpython_c_print_quoted_str(vstr, (const char*)pns->nodes[0], (mp_uint_t)pns->nodes[1], false);
|
||||
cpython_c_print_quoted_str(vstr, (const char*)pns->nodes[0], (mp_uint_t)pns->nodes[1], MP_PARSE_NODE_IS_STRUCT_KIND(pn, PN_bytes));
|
||||
return;
|
||||
}
|
||||
|
||||
|
@ -2151,7 +2156,8 @@ STATIC void compile_expr_stmt(compiler_t *comp, mp_parse_node_struct_t *pns) {
|
|||
} else {
|
||||
// for non-REPL, evaluate then discard the expression
|
||||
if ((MP_PARSE_NODE_IS_LEAF(pns->nodes[0]) && !MP_PARSE_NODE_IS_ID(pns->nodes[0]))
|
||||
|| MP_PARSE_NODE_IS_STRUCT_KIND(pns->nodes[0], PN_string)) {
|
||||
|| MP_PARSE_NODE_IS_STRUCT_KIND(pns->nodes[0], PN_string)
|
||||
|| MP_PARSE_NODE_IS_STRUCT_KIND(pns->nodes[0], PN_bytes)) {
|
||||
// do nothing with a lonely constant
|
||||
} else {
|
||||
compile_node(comp, pns->nodes[0]); // just an expression
|
||||
|
@ -2595,8 +2601,12 @@ STATIC void compile_atom_string(compiler_t *comp, mp_parse_node_struct_t *pns) {
|
|||
} else {
|
||||
assert(MP_PARSE_NODE_IS_STRUCT(pns->nodes[i]));
|
||||
mp_parse_node_struct_t *pns_string = (mp_parse_node_struct_t*)pns->nodes[i];
|
||||
assert(MP_PARSE_NODE_STRUCT_KIND(pns_string) == PN_string);
|
||||
if (MP_PARSE_NODE_STRUCT_KIND(pns_string) == PN_string) {
|
||||
pn_kind = MP_PARSE_NODE_STRING;
|
||||
} else {
|
||||
assert(MP_PARSE_NODE_STRUCT_KIND(pns_string) == PN_bytes);
|
||||
pn_kind = MP_PARSE_NODE_BYTES;
|
||||
}
|
||||
n_bytes += (mp_uint_t)pns_string->nodes[1];
|
||||
}
|
||||
if (i == 0) {
|
||||
|
@ -2608,8 +2618,8 @@ STATIC void compile_atom_string(compiler_t *comp, mp_parse_node_struct_t *pns) {
|
|||
}
|
||||
|
||||
// concatenate string/bytes
|
||||
byte *q_ptr;
|
||||
byte *s_dest = qstr_build_start(n_bytes, &q_ptr);
|
||||
byte *s_dest;
|
||||
mp_obj_t obj = mp_obj_str_builder_start(string_kind == MP_PARSE_NODE_STRING ? &mp_type_str : &mp_type_bytes, n_bytes, &s_dest);
|
||||
for (int i = 0; i < n; i++) {
|
||||
if (MP_PARSE_NODE_IS_LEAF(pns->nodes[i])) {
|
||||
mp_uint_t s_len;
|
||||
|
@ -2622,9 +2632,7 @@ STATIC void compile_atom_string(compiler_t *comp, mp_parse_node_struct_t *pns) {
|
|||
s_dest += (mp_uint_t)pns_string->nodes[1];
|
||||
}
|
||||
}
|
||||
qstr q = qstr_build_end(q_ptr);
|
||||
|
||||
EMIT_ARG(load_const_str, q, string_kind == MP_PARSE_NODE_BYTES);
|
||||
EMIT_ARG(load_const_obj, mp_obj_str_builder_end(obj));
|
||||
}
|
||||
|
||||
// pns needs to have 2 nodes, first is lhs of comprehension, second is PN_comp_for node
|
||||
|
@ -2959,7 +2967,9 @@ STATIC void compile_node(compiler_t *comp, mp_parse_node_t pn) {
|
|||
mp_parse_node_struct_t *pns = (mp_parse_node_struct_t*)pn;
|
||||
EMIT_ARG(set_line_number, pns->source_line);
|
||||
if (MP_PARSE_NODE_STRUCT_KIND(pns) == PN_string) {
|
||||
EMIT_ARG(load_const_str, qstr_from_strn((const char*)pns->nodes[0], (mp_uint_t)pns->nodes[1]), false);
|
||||
EMIT_ARG(load_const_obj, mp_obj_new_str((const char*)pns->nodes[0], (mp_uint_t)pns->nodes[1], false));
|
||||
} else if (MP_PARSE_NODE_STRUCT_KIND(pns) == PN_bytes) {
|
||||
EMIT_ARG(load_const_obj, mp_obj_new_bytes((const byte*)pns->nodes[0], (mp_uint_t)pns->nodes[1]));
|
||||
} else {
|
||||
compile_function_t f = compile_function[MP_PARSE_NODE_STRUCT_KIND(pns)];
|
||||
if (f == NULL) {
|
||||
|
|
21
py/parse.c
21
py/parse.c
|
@ -70,6 +70,7 @@ enum {
|
|||
#undef DEF_RULE
|
||||
RULE_maximum_number_of,
|
||||
RULE_string, // special node for non-interned string
|
||||
RULE_bytes, // special node for non-interned bytes
|
||||
};
|
||||
|
||||
#define ident (RULE_ACT_ALLOW_IDENT)
|
||||
|
@ -176,7 +177,7 @@ void mp_parse_node_free(mp_parse_node_t pn) {
|
|||
mp_parse_node_struct_t *pns = (mp_parse_node_struct_t *)pn;
|
||||
mp_uint_t n = MP_PARSE_NODE_STRUCT_NUM_NODES(pns);
|
||||
mp_uint_t rule_id = MP_PARSE_NODE_STRUCT_KIND(pns);
|
||||
if (rule_id == RULE_string) {
|
||||
if (rule_id == RULE_string || rule_id == RULE_bytes) {
|
||||
m_del(char, (char*)pns->nodes[0], (mp_uint_t)pns->nodes[1]);
|
||||
} else {
|
||||
bool adjust = ADD_BLANK_NODE(rules[rule_id]);
|
||||
|
@ -225,6 +226,8 @@ void mp_parse_node_print(mp_parse_node_t pn, mp_uint_t indent) {
|
|||
mp_parse_node_struct_t *pns = (mp_parse_node_struct_t*)pn;
|
||||
if (MP_PARSE_NODE_STRUCT_KIND(pns) == RULE_string) {
|
||||
printf("literal str(%.*s)\n", (int)pns->nodes[1], (char*)pns->nodes[0]);
|
||||
} else if (MP_PARSE_NODE_STRUCT_KIND(pns) == RULE_bytes) {
|
||||
printf("literal bytes(%.*s)\n", (int)pns->nodes[1], (char*)pns->nodes[0]);
|
||||
} else {
|
||||
mp_uint_t n = MP_PARSE_NODE_STRUCT_NUM_NODES(pns);
|
||||
#ifdef USE_RULE_NAME
|
||||
|
@ -281,14 +284,14 @@ STATIC void push_result_node(parser_t *parser, mp_parse_node_t pn) {
|
|||
parser->result_stack[parser->result_stack_top++] = pn;
|
||||
}
|
||||
|
||||
STATIC void push_result_string(parser_t *parser, mp_uint_t src_line, const char *str, mp_uint_t len) {
|
||||
STATIC void push_result_string_bytes(parser_t *parser, mp_uint_t src_line, mp_uint_t rule_kind, const char *str, mp_uint_t len) {
|
||||
mp_parse_node_struct_t *pn = m_new_obj_var_maybe(mp_parse_node_struct_t, mp_parse_node_t, 2);
|
||||
if (pn == NULL) {
|
||||
memory_error(parser);
|
||||
return;
|
||||
}
|
||||
pn->source_line = src_line;
|
||||
pn->kind_num_nodes = RULE_string | (2 << 8);
|
||||
pn->kind_num_nodes = rule_kind | (2 << 8);
|
||||
char *p = m_new(char, len);
|
||||
memcpy(p, str, len);
|
||||
pn->nodes[0] = (mp_int_t)p;
|
||||
|
@ -340,8 +343,8 @@ STATIC void push_result_token(parser_t *parser) {
|
|||
} else {
|
||||
pn = mp_parse_node_new_leaf(MP_PARSE_NODE_INTEGER, qstr_from_strn(str, len));
|
||||
}
|
||||
} else if (lex->tok_kind == MP_TOKEN_STRING) {
|
||||
// Don't automatically intern all strings. doc strings (which are usually large)
|
||||
} else if (lex->tok_kind == MP_TOKEN_STRING || lex->tok_kind == MP_TOKEN_BYTES) {
|
||||
// Don't automatically intern all strings/bytes. doc strings (which are usually large)
|
||||
// will be discarded by the compiler, and so we shouldn't intern them.
|
||||
qstr qst = MP_QSTR_NULL;
|
||||
if (lex->vstr.len <= MICROPY_ALLOC_PARSE_INTERN_STRING_LEN) {
|
||||
|
@ -353,14 +356,12 @@ STATIC void push_result_token(parser_t *parser) {
|
|||
}
|
||||
if (qst != MP_QSTR_NULL) {
|
||||
// qstr exists, make a leaf node
|
||||
pn = mp_parse_node_new_leaf(MP_PARSE_NODE_STRING, qst);
|
||||
pn = mp_parse_node_new_leaf(lex->tok_kind == MP_TOKEN_STRING ? MP_PARSE_NODE_STRING : MP_PARSE_NODE_BYTES, qst);
|
||||
} else {
|
||||
// not interned, make a node holding a pointer to the string data
|
||||
push_result_string(parser, lex->tok_line, lex->vstr.buf, lex->vstr.len);
|
||||
// not interned, make a node holding a pointer to the string/bytes data
|
||||
push_result_string_bytes(parser, lex->tok_line, lex->tok_kind == MP_TOKEN_STRING ? RULE_string : RULE_bytes, lex->vstr.buf, lex->vstr.len);
|
||||
return;
|
||||
}
|
||||
} else if (lex->tok_kind == MP_TOKEN_BYTES) {
|
||||
pn = mp_parse_node_new_leaf(MP_PARSE_NODE_BYTES, qstr_from_strn(lex->vstr.buf, lex->vstr.len));
|
||||
} else {
|
||||
pn = mp_parse_node_new_leaf(MP_PARSE_NODE_TOKEN, lex->tok_kind);
|
||||
}
|
||||
|
|
|
@ -148,6 +148,7 @@ qstr qstr_from_str(const char *str) {
|
|||
}
|
||||
|
||||
qstr qstr_from_strn(const char *str, mp_uint_t len) {
|
||||
assert(len < (1 << (8 * MICROPY_QSTR_BYTES_IN_LEN)));
|
||||
qstr q = qstr_find_strn(str, len);
|
||||
if (q == 0) {
|
||||
mp_uint_t hash = qstr_compute_hash((const byte*)str, len);
|
||||
|
|
|
@ -0,0 +1,2 @@
|
|||
b1 = b"long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes"
|
||||
b2 = b"concatenated bytes" b"concatenated bytes" b"concatenated bytes" b"concatenated bytes" b"concatenated bytes" b"concatenated bytes" b"concatenated bytes" b"concatenated bytes" b"concatenated bytes" b"concatenated bytes" b"concatenated bytes" b"concatenated bytes" b"concatenated bytes" b"concatenated bytes" b"concatenated bytes" b"concatenated bytes"
|
|
@ -0,0 +1,2 @@
|
|||
s1 = "long string long string long string long string long string long string long string long string long string long string long string long string long string long string long string long string long string long string long string long string long string long string long string long string long string long string long string long string long string long string long string long string long string long string long string long string long string long string long string long string long string"
|
||||
s2 = "concatenated string" "concatenated string" "concatenated string" "concatenated string" "concatenated string" "concatenated string" "concatenated string" "concatenated string" "concatenated string" "concatenated string" "concatenated string" "concatenated string" "concatenated string" "concatenated string" "concatenated string" "concatenated string"
|
Loading…
Reference in New Issue