py: Never intern data of large string/bytes objects; add relevant tests.
Prior to this patch, all constant string/bytes objects were interned by the compiler, and this led to crashes when the qstr was too long (noticeable now that qstr length storage defaults to 1 byte). With this patch, long string/bytes objects are never interned, and are referenced directly as constant objects within generated code using load_const_obj.
This commit is contained in:
parent
dab1385177
commit
4c81ba8015
30
py/compile.c
30
py/compile.c
|
@ -46,6 +46,7 @@ typedef enum {
|
||||||
#undef DEF_RULE
|
#undef DEF_RULE
|
||||||
PN_maximum_number_of,
|
PN_maximum_number_of,
|
||||||
PN_string, // special node for non-interned string
|
PN_string, // special node for non-interned string
|
||||||
|
PN_bytes, // special node for non-interned bytes
|
||||||
} pn_kind_t;
|
} pn_kind_t;
|
||||||
|
|
||||||
#define EMIT(fun) (comp->emit_method_table->fun(comp->emit))
|
#define EMIT(fun) (comp->emit_method_table->fun(comp->emit))
|
||||||
|
@ -172,6 +173,7 @@ STATIC mp_parse_node_t fold_constants(compiler_t *comp, mp_parse_node_t pn, mp_m
|
||||||
break;
|
break;
|
||||||
#endif
|
#endif
|
||||||
case PN_string:
|
case PN_string:
|
||||||
|
case PN_bytes:
|
||||||
return pn;
|
return pn;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -427,6 +429,9 @@ STATIC bool cpython_c_tuple_is_const(mp_parse_node_t pn) {
|
||||||
if (MP_PARSE_NODE_IS_STRUCT_KIND(pn, PN_string)) {
|
if (MP_PARSE_NODE_IS_STRUCT_KIND(pn, PN_string)) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
if (MP_PARSE_NODE_IS_STRUCT_KIND(pn, PN_bytes)) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
if (!MP_PARSE_NODE_IS_LEAF(pn)) {
|
if (!MP_PARSE_NODE_IS_LEAF(pn)) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
@ -475,9 +480,9 @@ STATIC void cpython_c_print_quoted_str(vstr_t *vstr, const char *str, uint len,
|
||||||
}
|
}
|
||||||
|
|
||||||
STATIC void cpython_c_tuple_emit_const(compiler_t *comp, mp_parse_node_t pn, vstr_t *vstr) {
|
STATIC void cpython_c_tuple_emit_const(compiler_t *comp, mp_parse_node_t pn, vstr_t *vstr) {
|
||||||
if (MP_PARSE_NODE_IS_STRUCT_KIND(pn, PN_string)) {
|
if (MP_PARSE_NODE_IS_STRUCT_KIND(pn, PN_string) || MP_PARSE_NODE_IS_STRUCT_KIND(pn, PN_bytes)) {
|
||||||
mp_parse_node_struct_t *pns = (mp_parse_node_struct_t*)pn;
|
mp_parse_node_struct_t *pns = (mp_parse_node_struct_t*)pn;
|
||||||
cpython_c_print_quoted_str(vstr, (const char*)pns->nodes[0], (mp_uint_t)pns->nodes[1], false);
|
cpython_c_print_quoted_str(vstr, (const char*)pns->nodes[0], (mp_uint_t)pns->nodes[1], MP_PARSE_NODE_IS_STRUCT_KIND(pn, PN_bytes));
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -2151,7 +2156,8 @@ STATIC void compile_expr_stmt(compiler_t *comp, mp_parse_node_struct_t *pns) {
|
||||||
} else {
|
} else {
|
||||||
// for non-REPL, evaluate then discard the expression
|
// for non-REPL, evaluate then discard the expression
|
||||||
if ((MP_PARSE_NODE_IS_LEAF(pns->nodes[0]) && !MP_PARSE_NODE_IS_ID(pns->nodes[0]))
|
if ((MP_PARSE_NODE_IS_LEAF(pns->nodes[0]) && !MP_PARSE_NODE_IS_ID(pns->nodes[0]))
|
||||||
|| MP_PARSE_NODE_IS_STRUCT_KIND(pns->nodes[0], PN_string)) {
|
|| MP_PARSE_NODE_IS_STRUCT_KIND(pns->nodes[0], PN_string)
|
||||||
|
|| MP_PARSE_NODE_IS_STRUCT_KIND(pns->nodes[0], PN_bytes)) {
|
||||||
// do nothing with a lonely constant
|
// do nothing with a lonely constant
|
||||||
} else {
|
} else {
|
||||||
compile_node(comp, pns->nodes[0]); // just an expression
|
compile_node(comp, pns->nodes[0]); // just an expression
|
||||||
|
@ -2595,8 +2601,12 @@ STATIC void compile_atom_string(compiler_t *comp, mp_parse_node_struct_t *pns) {
|
||||||
} else {
|
} else {
|
||||||
assert(MP_PARSE_NODE_IS_STRUCT(pns->nodes[i]));
|
assert(MP_PARSE_NODE_IS_STRUCT(pns->nodes[i]));
|
||||||
mp_parse_node_struct_t *pns_string = (mp_parse_node_struct_t*)pns->nodes[i];
|
mp_parse_node_struct_t *pns_string = (mp_parse_node_struct_t*)pns->nodes[i];
|
||||||
assert(MP_PARSE_NODE_STRUCT_KIND(pns_string) == PN_string);
|
if (MP_PARSE_NODE_STRUCT_KIND(pns_string) == PN_string) {
|
||||||
pn_kind = MP_PARSE_NODE_STRING;
|
pn_kind = MP_PARSE_NODE_STRING;
|
||||||
|
} else {
|
||||||
|
assert(MP_PARSE_NODE_STRUCT_KIND(pns_string) == PN_bytes);
|
||||||
|
pn_kind = MP_PARSE_NODE_BYTES;
|
||||||
|
}
|
||||||
n_bytes += (mp_uint_t)pns_string->nodes[1];
|
n_bytes += (mp_uint_t)pns_string->nodes[1];
|
||||||
}
|
}
|
||||||
if (i == 0) {
|
if (i == 0) {
|
||||||
|
@ -2608,8 +2618,8 @@ STATIC void compile_atom_string(compiler_t *comp, mp_parse_node_struct_t *pns) {
|
||||||
}
|
}
|
||||||
|
|
||||||
// concatenate string/bytes
|
// concatenate string/bytes
|
||||||
byte *q_ptr;
|
byte *s_dest;
|
||||||
byte *s_dest = qstr_build_start(n_bytes, &q_ptr);
|
mp_obj_t obj = mp_obj_str_builder_start(string_kind == MP_PARSE_NODE_STRING ? &mp_type_str : &mp_type_bytes, n_bytes, &s_dest);
|
||||||
for (int i = 0; i < n; i++) {
|
for (int i = 0; i < n; i++) {
|
||||||
if (MP_PARSE_NODE_IS_LEAF(pns->nodes[i])) {
|
if (MP_PARSE_NODE_IS_LEAF(pns->nodes[i])) {
|
||||||
mp_uint_t s_len;
|
mp_uint_t s_len;
|
||||||
|
@ -2622,9 +2632,7 @@ STATIC void compile_atom_string(compiler_t *comp, mp_parse_node_struct_t *pns) {
|
||||||
s_dest += (mp_uint_t)pns_string->nodes[1];
|
s_dest += (mp_uint_t)pns_string->nodes[1];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
qstr q = qstr_build_end(q_ptr);
|
EMIT_ARG(load_const_obj, mp_obj_str_builder_end(obj));
|
||||||
|
|
||||||
EMIT_ARG(load_const_str, q, string_kind == MP_PARSE_NODE_BYTES);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// pns needs to have 2 nodes, first is lhs of comprehension, second is PN_comp_for node
|
// pns needs to have 2 nodes, first is lhs of comprehension, second is PN_comp_for node
|
||||||
|
@ -2959,7 +2967,9 @@ STATIC void compile_node(compiler_t *comp, mp_parse_node_t pn) {
|
||||||
mp_parse_node_struct_t *pns = (mp_parse_node_struct_t*)pn;
|
mp_parse_node_struct_t *pns = (mp_parse_node_struct_t*)pn;
|
||||||
EMIT_ARG(set_line_number, pns->source_line);
|
EMIT_ARG(set_line_number, pns->source_line);
|
||||||
if (MP_PARSE_NODE_STRUCT_KIND(pns) == PN_string) {
|
if (MP_PARSE_NODE_STRUCT_KIND(pns) == PN_string) {
|
||||||
EMIT_ARG(load_const_str, qstr_from_strn((const char*)pns->nodes[0], (mp_uint_t)pns->nodes[1]), false);
|
EMIT_ARG(load_const_obj, mp_obj_new_str((const char*)pns->nodes[0], (mp_uint_t)pns->nodes[1], false));
|
||||||
|
} else if (MP_PARSE_NODE_STRUCT_KIND(pns) == PN_bytes) {
|
||||||
|
EMIT_ARG(load_const_obj, mp_obj_new_bytes((const byte*)pns->nodes[0], (mp_uint_t)pns->nodes[1]));
|
||||||
} else {
|
} else {
|
||||||
compile_function_t f = compile_function[MP_PARSE_NODE_STRUCT_KIND(pns)];
|
compile_function_t f = compile_function[MP_PARSE_NODE_STRUCT_KIND(pns)];
|
||||||
if (f == NULL) {
|
if (f == NULL) {
|
||||||
|
|
21
py/parse.c
21
py/parse.c
|
@ -70,6 +70,7 @@ enum {
|
||||||
#undef DEF_RULE
|
#undef DEF_RULE
|
||||||
RULE_maximum_number_of,
|
RULE_maximum_number_of,
|
||||||
RULE_string, // special node for non-interned string
|
RULE_string, // special node for non-interned string
|
||||||
|
RULE_bytes, // special node for non-interned bytes
|
||||||
};
|
};
|
||||||
|
|
||||||
#define ident (RULE_ACT_ALLOW_IDENT)
|
#define ident (RULE_ACT_ALLOW_IDENT)
|
||||||
|
@ -176,7 +177,7 @@ void mp_parse_node_free(mp_parse_node_t pn) {
|
||||||
mp_parse_node_struct_t *pns = (mp_parse_node_struct_t *)pn;
|
mp_parse_node_struct_t *pns = (mp_parse_node_struct_t *)pn;
|
||||||
mp_uint_t n = MP_PARSE_NODE_STRUCT_NUM_NODES(pns);
|
mp_uint_t n = MP_PARSE_NODE_STRUCT_NUM_NODES(pns);
|
||||||
mp_uint_t rule_id = MP_PARSE_NODE_STRUCT_KIND(pns);
|
mp_uint_t rule_id = MP_PARSE_NODE_STRUCT_KIND(pns);
|
||||||
if (rule_id == RULE_string) {
|
if (rule_id == RULE_string || rule_id == RULE_bytes) {
|
||||||
m_del(char, (char*)pns->nodes[0], (mp_uint_t)pns->nodes[1]);
|
m_del(char, (char*)pns->nodes[0], (mp_uint_t)pns->nodes[1]);
|
||||||
} else {
|
} else {
|
||||||
bool adjust = ADD_BLANK_NODE(rules[rule_id]);
|
bool adjust = ADD_BLANK_NODE(rules[rule_id]);
|
||||||
|
@ -225,6 +226,8 @@ void mp_parse_node_print(mp_parse_node_t pn, mp_uint_t indent) {
|
||||||
mp_parse_node_struct_t *pns = (mp_parse_node_struct_t*)pn;
|
mp_parse_node_struct_t *pns = (mp_parse_node_struct_t*)pn;
|
||||||
if (MP_PARSE_NODE_STRUCT_KIND(pns) == RULE_string) {
|
if (MP_PARSE_NODE_STRUCT_KIND(pns) == RULE_string) {
|
||||||
printf("literal str(%.*s)\n", (int)pns->nodes[1], (char*)pns->nodes[0]);
|
printf("literal str(%.*s)\n", (int)pns->nodes[1], (char*)pns->nodes[0]);
|
||||||
|
} else if (MP_PARSE_NODE_STRUCT_KIND(pns) == RULE_bytes) {
|
||||||
|
printf("literal bytes(%.*s)\n", (int)pns->nodes[1], (char*)pns->nodes[0]);
|
||||||
} else {
|
} else {
|
||||||
mp_uint_t n = MP_PARSE_NODE_STRUCT_NUM_NODES(pns);
|
mp_uint_t n = MP_PARSE_NODE_STRUCT_NUM_NODES(pns);
|
||||||
#ifdef USE_RULE_NAME
|
#ifdef USE_RULE_NAME
|
||||||
|
@ -281,14 +284,14 @@ STATIC void push_result_node(parser_t *parser, mp_parse_node_t pn) {
|
||||||
parser->result_stack[parser->result_stack_top++] = pn;
|
parser->result_stack[parser->result_stack_top++] = pn;
|
||||||
}
|
}
|
||||||
|
|
||||||
STATIC void push_result_string(parser_t *parser, mp_uint_t src_line, const char *str, mp_uint_t len) {
|
STATIC void push_result_string_bytes(parser_t *parser, mp_uint_t src_line, mp_uint_t rule_kind, const char *str, mp_uint_t len) {
|
||||||
mp_parse_node_struct_t *pn = m_new_obj_var_maybe(mp_parse_node_struct_t, mp_parse_node_t, 2);
|
mp_parse_node_struct_t *pn = m_new_obj_var_maybe(mp_parse_node_struct_t, mp_parse_node_t, 2);
|
||||||
if (pn == NULL) {
|
if (pn == NULL) {
|
||||||
memory_error(parser);
|
memory_error(parser);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
pn->source_line = src_line;
|
pn->source_line = src_line;
|
||||||
pn->kind_num_nodes = RULE_string | (2 << 8);
|
pn->kind_num_nodes = rule_kind | (2 << 8);
|
||||||
char *p = m_new(char, len);
|
char *p = m_new(char, len);
|
||||||
memcpy(p, str, len);
|
memcpy(p, str, len);
|
||||||
pn->nodes[0] = (mp_int_t)p;
|
pn->nodes[0] = (mp_int_t)p;
|
||||||
|
@ -340,8 +343,8 @@ STATIC void push_result_token(parser_t *parser) {
|
||||||
} else {
|
} else {
|
||||||
pn = mp_parse_node_new_leaf(MP_PARSE_NODE_INTEGER, qstr_from_strn(str, len));
|
pn = mp_parse_node_new_leaf(MP_PARSE_NODE_INTEGER, qstr_from_strn(str, len));
|
||||||
}
|
}
|
||||||
} else if (lex->tok_kind == MP_TOKEN_STRING) {
|
} else if (lex->tok_kind == MP_TOKEN_STRING || lex->tok_kind == MP_TOKEN_BYTES) {
|
||||||
// Don't automatically intern all strings. doc strings (which are usually large)
|
// Don't automatically intern all strings/bytes. doc strings (which are usually large)
|
||||||
// will be discarded by the compiler, and so we shouldn't intern them.
|
// will be discarded by the compiler, and so we shouldn't intern them.
|
||||||
qstr qst = MP_QSTR_NULL;
|
qstr qst = MP_QSTR_NULL;
|
||||||
if (lex->vstr.len <= MICROPY_ALLOC_PARSE_INTERN_STRING_LEN) {
|
if (lex->vstr.len <= MICROPY_ALLOC_PARSE_INTERN_STRING_LEN) {
|
||||||
|
@ -353,14 +356,12 @@ STATIC void push_result_token(parser_t *parser) {
|
||||||
}
|
}
|
||||||
if (qst != MP_QSTR_NULL) {
|
if (qst != MP_QSTR_NULL) {
|
||||||
// qstr exists, make a leaf node
|
// qstr exists, make a leaf node
|
||||||
pn = mp_parse_node_new_leaf(MP_PARSE_NODE_STRING, qst);
|
pn = mp_parse_node_new_leaf(lex->tok_kind == MP_TOKEN_STRING ? MP_PARSE_NODE_STRING : MP_PARSE_NODE_BYTES, qst);
|
||||||
} else {
|
} else {
|
||||||
// not interned, make a node holding a pointer to the string data
|
// not interned, make a node holding a pointer to the string/bytes data
|
||||||
push_result_string(parser, lex->tok_line, lex->vstr.buf, lex->vstr.len);
|
push_result_string_bytes(parser, lex->tok_line, lex->tok_kind == MP_TOKEN_STRING ? RULE_string : RULE_bytes, lex->vstr.buf, lex->vstr.len);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
} else if (lex->tok_kind == MP_TOKEN_BYTES) {
|
|
||||||
pn = mp_parse_node_new_leaf(MP_PARSE_NODE_BYTES, qstr_from_strn(lex->vstr.buf, lex->vstr.len));
|
|
||||||
} else {
|
} else {
|
||||||
pn = mp_parse_node_new_leaf(MP_PARSE_NODE_TOKEN, lex->tok_kind);
|
pn = mp_parse_node_new_leaf(MP_PARSE_NODE_TOKEN, lex->tok_kind);
|
||||||
}
|
}
|
||||||
|
|
|
@ -148,6 +148,7 @@ qstr qstr_from_str(const char *str) {
|
||||||
}
|
}
|
||||||
|
|
||||||
qstr qstr_from_strn(const char *str, mp_uint_t len) {
|
qstr qstr_from_strn(const char *str, mp_uint_t len) {
|
||||||
|
assert(len < (1 << (8 * MICROPY_QSTR_BYTES_IN_LEN)));
|
||||||
qstr q = qstr_find_strn(str, len);
|
qstr q = qstr_find_strn(str, len);
|
||||||
if (q == 0) {
|
if (q == 0) {
|
||||||
mp_uint_t hash = qstr_compute_hash((const byte*)str, len);
|
mp_uint_t hash = qstr_compute_hash((const byte*)str, len);
|
||||||
|
|
|
@ -0,0 +1,2 @@
|
||||||
|
b1 = b"long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes long bytes"
|
||||||
|
b2 = b"concatenated bytes" b"concatenated bytes" b"concatenated bytes" b"concatenated bytes" b"concatenated bytes" b"concatenated bytes" b"concatenated bytes" b"concatenated bytes" b"concatenated bytes" b"concatenated bytes" b"concatenated bytes" b"concatenated bytes" b"concatenated bytes" b"concatenated bytes" b"concatenated bytes" b"concatenated bytes"
|
|
@ -0,0 +1,2 @@
|
||||||
|
s1 = "long string long string long string long string long string long string long string long string long string long string long string long string long string long string long string long string long string long string long string long string long string long string long string long string long string long string long string long string long string long string long string long string long string long string long string long string long string long string long string long string long string"
|
||||||
|
s2 = "concatenated string" "concatenated string" "concatenated string" "concatenated string" "concatenated string" "concatenated string" "concatenated string" "concatenated string" "concatenated string" "concatenated string" "concatenated string" "concatenated string" "concatenated string" "concatenated string" "concatenated string" "concatenated string"
|
Loading…
Reference in New Issue