py: Do adjacent str/bytes literal concatenation in lexer, not compiler.

It's much more efficient in RAM and code size to do implicit literal string
concatenation in the lexer, as opposed to the compiler.

RAM usage is reduced because the concatenation can be done right away in the
tokeniser by just accumulating the string/bytes literals into the lexer's
vstr.  Prior to this patch adjacent strings/bytes would create a parse tree
(one node per string/bytes) and then in the compiler a whole new chunk of
memory was allocated to store the concatenated string, which used more than
double the memory compared to just accumulating in the lexer.

This patch also significantly reduces code size:

bare-arm: -204
minimal:  -204
unix x64: -328
stmhal:   -208
esp8266:  -284
cc3200:   -224
This commit is contained in:
Damien George 2017-02-17 12:12:40 +11:00
parent 773278ec30
commit 534b7c368d
3 changed files with 203 additions and 222 deletions

View File

@@ -2301,65 +2301,6 @@ STATIC void compile_atom_expr_trailers(compiler_t *comp, mp_parse_node_struct_t
}
}
STATIC void compile_atom_string(compiler_t *comp, mp_parse_node_struct_t *pns) {
    // This parse node is a sequence of adjacent string/bytes literals that
    // must be implicitly concatenated into a single constant object.

    // First pass over the nodes: verify they are all the same kind (all str
    // or all bytes) and sum the total byte length of the result.
    int num_nodes = MP_PARSE_NODE_STRUCT_NUM_NODES(pns);
    size_t total_len = 0;
    int list_kind = MP_PARSE_NODE_NULL;
    for (int idx = 0; idx < num_nodes; idx++) {
        int node_kind;
        if (MP_PARSE_NODE_IS_LEAF(pns->nodes[idx])) {
            // interned literal: kind and length come from the qstr
            node_kind = MP_PARSE_NODE_LEAF_KIND(pns->nodes[idx]);
            assert(node_kind == MP_PARSE_NODE_STRING || node_kind == MP_PARSE_NODE_BYTES);
            total_len += qstr_len(MP_PARSE_NODE_LEAF_ARG(pns->nodes[idx]));
        } else {
            // non-interned literal: data pointer in nodes[0], length in nodes[1]
            assert(MP_PARSE_NODE_IS_STRUCT(pns->nodes[idx]));
            mp_parse_node_struct_t *str_pns = (mp_parse_node_struct_t*)pns->nodes[idx];
            if (MP_PARSE_NODE_STRUCT_KIND(str_pns) == PN_string) {
                node_kind = MP_PARSE_NODE_STRING;
            } else {
                assert(MP_PARSE_NODE_STRUCT_KIND(str_pns) == PN_bytes);
                node_kind = MP_PARSE_NODE_BYTES;
            }
            total_len += str_pns->nodes[1];
        }
        if (idx == 0) {
            list_kind = node_kind;
        } else if (node_kind != list_kind) {
            // mixing str and bytes in an implicit concatenation is an error
            compile_syntax_error(comp, (mp_parse_node_t)pns, "cannot mix bytes and nonbytes literals");
            return;
        }
    }

    // Before the final (emit) pass the real object isn't needed, so just
    // load a placeholder constant.
    if (comp->pass != MP_PASS_EMIT) {
        EMIT_ARG(load_const_obj, mp_const_none);
        return;
    }

    // Second pass: copy each literal's bytes into one contiguous buffer.
    vstr_t vstr;
    vstr_init_len(&vstr, total_len);
    byte *dst = (byte*)vstr.buf;
    for (int idx = 0; idx < num_nodes; idx++) {
        if (MP_PARSE_NODE_IS_LEAF(pns->nodes[idx])) {
            size_t len;
            const byte *data = qstr_data(MP_PARSE_NODE_LEAF_ARG(pns->nodes[idx]), &len);
            memcpy(dst, data, len);
            dst += len;
        } else {
            mp_parse_node_struct_t *str_pns = (mp_parse_node_struct_t*)pns->nodes[idx];
            memcpy(dst, (const char*)str_pns->nodes[0], str_pns->nodes[1]);
            dst += str_pns->nodes[1];
        }
    }

    // Emit the concatenated constant as a str or bytes object.
    EMIT_ARG(load_const_obj, mp_obj_new_str_from_vstr(
        list_kind == MP_PARSE_NODE_STRING ? &mp_type_str : &mp_type_bytes, &vstr));
}
// pns needs to have 2 nodes, first is lhs of comprehension, second is PN_comp_for node
STATIC void compile_comprehension(compiler_t *comp, mp_parse_node_struct_t *pns, scope_kind_t kind) {
assert(MP_PARSE_NODE_STRUCT_NUM_NODES(pns) == 2);

View File

@@ -268,8 +268,7 @@ DEF_RULE_NC(power_dbl_star, and_ident(2), tok(OP_DBL_STAR), rule(factor))
// testlist_comp: (test|star_expr) ( comp_for | (',' (test|star_expr))* [','] )
// trailer: '(' [arglist] ')' | '[' subscriptlist ']' | '.' NAME
DEF_RULE_NC(atom, or(11), tok(NAME), tok(INTEGER), tok(FLOAT_OR_IMAG), rule(atom_string), tok(ELLIPSIS), tok(KW_NONE), tok(KW_TRUE), tok(KW_FALSE), rule(atom_paren), rule(atom_bracket), rule(atom_brace))
DEF_RULE(atom_string, c(atom_string), one_or_more, rule(string_or_bytes))
DEF_RULE_NC(atom, or(12), tok(NAME), tok(INTEGER), tok(FLOAT_OR_IMAG), tok(STRING), tok(BYTES), tok(ELLIPSIS), tok(KW_NONE), tok(KW_TRUE), tok(KW_FALSE), rule(atom_paren), rule(atom_bracket), rule(atom_brace))
DEF_RULE_NC(string_or_bytes, or(2), tok(STRING), tok(BYTES))
DEF_RULE(atom_paren, c(atom_paren), and(3), tok(DEL_PAREN_OPEN), opt_rule(atom_2b), tok(DEL_PAREN_CLOSE))
DEF_RULE_NC(atom_2b, or(2), rule(yield_expr), rule(testlist_comp))

View File

@@ -63,11 +63,9 @@ STATIC bool is_char_or3(mp_lexer_t *lex, byte c1, byte c2, byte c3) {
return lex->chr0 == c1 || lex->chr0 == c2 || lex->chr0 == c3;
}
/*
STATIC bool is_char_following(mp_lexer_t *lex, byte c) {
return lex->chr1 == c;
}
*/
STATIC bool is_char_following_or(mp_lexer_t *lex, byte c1, byte c2) {
return lex->chr1 == c1 || lex->chr1 == c2;
@@ -106,6 +104,13 @@ STATIC bool is_following_odigit(mp_lexer_t *lex) {
return lex->chr1 >= '0' && lex->chr1 <= '7';
}
STATIC bool is_string_or_bytes(mp_lexer_t *lex) {
    // True when the upcoming characters start a string/bytes literal: a bare
    // quote, a one-char prefix (r/u/b) followed by a quote, or a two-char
    // prefix (rb/br) followed by a quote.
    if (is_char_or(lex, '\'', '\"')) {
        return true;
    }
    if (is_char_or3(lex, 'r', 'u', 'b') && is_char_following_or(lex, '\'', '\"')) {
        return true;
    }
    return (is_char_and(lex, 'r', 'b') || is_char_and(lex, 'b', 'r'))
        && is_char_following_following_or(lex, '\'', '\"');
}
// to easily parse utf-8 identifiers we allow any raw byte with high bit set
STATIC bool is_head_of_identifier(mp_lexer_t *lex) {
return is_letter(lex) || lex->chr0 == '_' || lex->chr0 >= 0x80;
@@ -272,99 +277,7 @@ STATIC bool get_hex(mp_lexer_t *lex, mp_uint_t num_digits, mp_uint_t *result) {
return true;
}
void mp_lexer_to_next(mp_lexer_t *lex) {
// start new token text
vstr_reset(&lex->vstr);
// skip white space and comments
bool had_physical_newline = false;
while (!is_end(lex)) {
if (is_physical_newline(lex)) {
had_physical_newline = true;
next_char(lex);
} else if (is_whitespace(lex)) {
next_char(lex);
} else if (is_char(lex, '#')) {
next_char(lex);
while (!is_end(lex) && !is_physical_newline(lex)) {
next_char(lex);
}
// had_physical_newline will be set on next loop
} else if (is_char_and(lex, '\\', '\n')) {
// line-continuation, so don't set had_physical_newline
next_char(lex);
next_char(lex);
} else {
break;
}
}
// set token source information
lex->tok_line = lex->line;
lex->tok_column = lex->column;
if (lex->emit_dent < 0) {
lex->tok_kind = MP_TOKEN_DEDENT;
lex->emit_dent += 1;
} else if (lex->emit_dent > 0) {
lex->tok_kind = MP_TOKEN_INDENT;
lex->emit_dent -= 1;
} else if (had_physical_newline && lex->nested_bracket_level == 0) {
lex->tok_kind = MP_TOKEN_NEWLINE;
mp_uint_t num_spaces = lex->column - 1;
if (num_spaces == indent_top(lex)) {
} else if (num_spaces > indent_top(lex)) {
indent_push(lex, num_spaces);
lex->emit_dent += 1;
} else {
while (num_spaces < indent_top(lex)) {
indent_pop(lex);
lex->emit_dent -= 1;
}
if (num_spaces != indent_top(lex)) {
lex->tok_kind = MP_TOKEN_DEDENT_MISMATCH;
}
}
} else if (is_end(lex)) {
lex->tok_kind = MP_TOKEN_END;
} else if (is_char_or(lex, '\'', '\"')
|| (is_char_or3(lex, 'r', 'u', 'b') && is_char_following_or(lex, '\'', '\"'))
|| ((is_char_and(lex, 'r', 'b') || is_char_and(lex, 'b', 'r')) && is_char_following_following_or(lex, '\'', '\"'))) {
// a string or bytes literal
// parse type codes
bool is_raw = false;
bool is_bytes = false;
if (is_char(lex, 'u')) {
next_char(lex);
} else if (is_char(lex, 'b')) {
is_bytes = true;
next_char(lex);
if (is_char(lex, 'r')) {
is_raw = true;
next_char(lex);
}
} else if (is_char(lex, 'r')) {
is_raw = true;
next_char(lex);
if (is_char(lex, 'b')) {
is_bytes = true;
next_char(lex);
}
}
// set token kind
if (is_bytes) {
lex->tok_kind = MP_TOKEN_BYTES;
} else {
lex->tok_kind = MP_TOKEN_STRING;
}
STATIC void parse_string_literal(mp_lexer_t *lex, bool is_raw) {
// get first quoting character
char quote_char = '\'';
if (is_char(lex, '\"')) {
@@ -373,7 +286,7 @@ void mp_lexer_to_next(mp_lexer_t *lex) {
next_char(lex);
// work out if it's a single or triple quoted literal
mp_uint_t num_quotes;
size_t num_quotes;
if (is_char_and(lex, quote_char, quote_char)) {
// triple quotes
next_char(lex);
@@ -384,8 +297,7 @@ void mp_lexer_to_next(mp_lexer_t *lex) {
num_quotes = 1;
}
// parse the literal
mp_uint_t n_closing = 0;
size_t n_closing = 0;
while (!is_end(lex) && (num_quotes > 1 || !is_char(lex, '\n')) && n_closing < num_quotes) {
if (is_char(lex, quote_char)) {
n_closing += 1;
@@ -415,7 +327,7 @@ void mp_lexer_to_next(mp_lexer_t *lex) {
case 'r': c = 0x0d; break;
case 'u':
case 'U':
if (is_bytes) {
if (lex->tok_kind == MP_TOKEN_BYTES) {
// b'\u1234' == b'\\u1234'
vstr_add_char(&lex->vstr, '\\');
break;
@@ -458,9 +370,9 @@ void mp_lexer_to_next(mp_lexer_t *lex) {
}
if (c != MP_LEXER_EOF) {
if (MICROPY_PY_BUILTINS_STR_UNICODE_DYNAMIC) {
if (c < 0x110000 && !is_bytes) {
if (c < 0x110000 && lex->tok_kind == MP_TOKEN_STRING) {
vstr_add_char(&lex->vstr, c);
} else if (c < 0x100 && is_bytes) {
} else if (c < 0x100 && lex->tok_kind == MP_TOKEN_BYTES) {
vstr_add_byte(&lex->vstr, c);
} else {
// unicode character out of range
@@ -494,6 +406,135 @@ void mp_lexer_to_next(mp_lexer_t *lex) {
// cut off the end quotes from the token text
vstr_cut_tail_bytes(&lex->vstr, n_closing);
}
STATIC bool skip_whitespace(mp_lexer_t *lex, bool stop_at_newline) {
    // Consume whitespace, comments and line continuations.  Returns true if
    // at least one physical newline was consumed (unless stop_at_newline
    // requested halting at a top-level newline instead of consuming it).
    bool saw_newline = false;
    while (!is_end(lex)) {
        if (is_physical_newline(lex)) {
            if (stop_at_newline && lex->nested_bracket_level == 0) {
                break;
            }
            saw_newline = true;
            next_char(lex);
            continue;
        }
        if (is_whitespace(lex)) {
            next_char(lex);
            continue;
        }
        if (is_char(lex, '#')) {
            // comment runs to the end of the line
            next_char(lex);
            while (!is_end(lex) && !is_physical_newline(lex)) {
                next_char(lex);
            }
            // the newline itself is handled on the next iteration
            continue;
        }
        if (is_char_and(lex, '\\', '\n')) {
            // explicit line continuation: consume both characters, but do
            // not count it as a physical newline
            next_char(lex);
            next_char(lex);
            continue;
        }
        // anything else starts a real token
        break;
    }
    return saw_newline;
}
void mp_lexer_to_next(mp_lexer_t *lex) {
// start new token text
vstr_reset(&lex->vstr);
// skip white space and comments
bool had_physical_newline = skip_whitespace(lex, false);
// set token source information
lex->tok_line = lex->line;
lex->tok_column = lex->column;
if (lex->emit_dent < 0) {
lex->tok_kind = MP_TOKEN_DEDENT;
lex->emit_dent += 1;
} else if (lex->emit_dent > 0) {
lex->tok_kind = MP_TOKEN_INDENT;
lex->emit_dent -= 1;
} else if (had_physical_newline && lex->nested_bracket_level == 0) {
lex->tok_kind = MP_TOKEN_NEWLINE;
mp_uint_t num_spaces = lex->column - 1;
if (num_spaces == indent_top(lex)) {
} else if (num_spaces > indent_top(lex)) {
indent_push(lex, num_spaces);
lex->emit_dent += 1;
} else {
while (num_spaces < indent_top(lex)) {
indent_pop(lex);
lex->emit_dent -= 1;
}
if (num_spaces != indent_top(lex)) {
lex->tok_kind = MP_TOKEN_DEDENT_MISMATCH;
}
}
} else if (is_end(lex)) {
lex->tok_kind = MP_TOKEN_END;
} else if (is_string_or_bytes(lex)) {
// a string or bytes literal
// Python requires adjacent string/bytes literals to be automatically
// concatenated. We do it here in the tokeniser to make efficient use of RAM,
// because then the lexer's vstr can be used to accumulate the string literal,
// in contrast to creating a parse tree of strings and then joining them later
// in the compiler. It's also more compact in code size to do it here.
// MP_TOKEN_END is used to indicate that this is the first string token
lex->tok_kind = MP_TOKEN_END;
// Loop to accumulate string/bytes literals
do {
// parse type codes
bool is_raw = false;
mp_token_kind_t kind = MP_TOKEN_STRING;
int n_char = 0;
if (is_char(lex, 'u')) {
n_char = 1;
} else if (is_char(lex, 'b')) {
kind = MP_TOKEN_BYTES;
n_char = 1;
if (is_char_following(lex, 'r')) {
is_raw = true;
n_char = 2;
}
} else if (is_char(lex, 'r')) {
is_raw = true;
n_char = 1;
if (is_char_following(lex, 'b')) {
kind = MP_TOKEN_BYTES;
n_char = 2;
}
}
// Set or check token kind
if (lex->tok_kind == MP_TOKEN_END) {
lex->tok_kind = kind;
} else if (lex->tok_kind != kind) {
// Can't concatenate string with bytes
break;
}
// Skip any type code characters
if (n_char != 0) {
next_char(lex);
if (n_char == 2) {
next_char(lex);
}
}
// Parse the literal
parse_string_literal(lex, is_raw);
// Skip whitespace so we can check if there's another string following
skip_whitespace(lex, true);
} while (is_string_or_bytes(lex));
} else if (is_head_of_identifier(lex)) {
lex->tok_kind = MP_TOKEN_NAME;