py: Implement partial PEP-498 (f-string) support

This implements (most of) the PEP-498 spec for f-strings, with two
exceptions:

- raw f-strings (`fr` or `rf` prefixes) raise `NotImplementedError`
- one special corner case does not function as specified in the PEP
(more on that in a moment)

This is implemented in the core as a syntax translation, brute-forcing
all f-strings to run through `str.format`. For example, the statement
`x='world'; print(f'hello {x}')` gets translated *at a syntax level*
(injected into the lexer) to `x='world'; print('hello {}'.format(x))`.
While this may lead to weird column results in tracebacks, it seemed
like the fastest, most efficient, and *likely* most RAM-friendly option,
despite being implemented under the hood with a completely separate
`vstr_t`.
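
To sketch the effective rewrite a little further (an illustration under the
assumptions above, not literal lexer output): expression text is collected
into the injected `.format(...)` call, while anything after a `:` inside the
braces, i.e. the format spec, stays in the literal portion.

```python
a, b = 3, 4
# effective rewrite (sketch):
#   f'{a} + {b:>3} = {a+b}'  ->  '{} + {:>3} = {}'.format(a, b, a+b)
assert f'{a} + {b:>3} = {a+b}' == '{} + {:>3} = {}'.format(a, b, a + b)
```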

Since [string concatenation of adjacent literals is implemented in the
lexer](534b7c368d),
two side effects emerge:

- All strings with at least one f-string portion are concatenated into a
single literal which *must* be run through `str.format()` wholesale, and
- Concatenating a plain string that contains interpolation-style braces
with an f-string will cause an `IndexError`/`KeyError`, which is both
different from CPython *and* different from the corner case mentioned in
the PEP (which gives the following example; a concrete sketch of what
happens here instead follows it):

```python
x = 10
y = 'hi'
assert ('a' 'b' f'{x}' '{c}' f'str<{y:^4}>' 'd' 'e') == 'ab10{c}str< hi >de'
```
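
Concretely, under this patch the example above collapses in the lexer into a
single literal plus one injected call, roughly
`'ab{}{c}str<{:^4}>de'.format(x, y)`, so the `'{c}'` portion becomes a named
field of that one format call. A hedged sketch of why that fails at run time
(the snippet is just the translated form, so it behaves the same way under
CPython):

```python
x = 10
y = 'hi'
# effective lexer-level rewrite of the PEP example above (sketch):
#   'a' 'b' f'{x}' '{c}' f'str<{y:^4}>' 'd' 'e'
#   -> 'ab{}{c}str<{:^4}>de'.format(x, y)
# '{c}' is now a named field with no matching keyword argument
try:
    'ab{}{c}str<{:^4}>de'.format(x, y)
except KeyError:
    pass
```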

The above-linked commit detailed a pretty solid case for leaving string
concatenation in the lexer rather than putting it in the parser, and
undoing that decision would likely be disproportionately costly on
resources for the sake of a probably-low-impact corner case. An
alternative, to become compliant with this corner case of the PEP, would
be to revert to string concatenation in the parser *only when an
f-string is part of the concatenation*, though I've done no
investigation into the difficulty or cost of doing so.

A decent set of tests is included. I've manually tested this on the
`unix` port on Linux and on a Feather M4 Express (`atmel-samd`) and
things seem sane.
Authored by Josh Klar on 2019-08-10 21:27:20 -07:00; committed by Jeff Epler
parent 83d5da95b7
commit 3a7a5ba686
4 changed files with 301 additions and 14 deletions


@@ -64,6 +64,10 @@ STATIC bool is_char_or3(mp_lexer_t *lex, byte c1, byte c2, byte c3) {
return lex->chr0 == c1 || lex->chr0 == c2 || lex->chr0 == c3;
}
STATIC bool is_char_or4(mp_lexer_t *lex, byte c1, byte c2, byte c3, byte c4) {
return lex->chr0 == c1 || lex->chr0 == c2 || lex->chr0 == c3 || lex->chr0 == c4;
}
STATIC bool is_char_following(mp_lexer_t *lex, byte c) {
return lex->chr1 == c;
}
@@ -107,7 +111,9 @@ STATIC bool is_following_odigit(mp_lexer_t *lex) {
STATIC bool is_string_or_bytes(mp_lexer_t *lex) {
return is_char_or(lex, '\'', '\"')
|| (is_char_or3(lex, 'r', 'u', 'b') && is_char_following_or(lex, '\'', '\"'))
|| (is_char_or4(lex, 'r', 'u', 'b', 'f') && is_char_following_or(lex, '\'', '\"'))
|| ((is_char_and(lex, 'r', 'f') || is_char_and(lex, 'f', 'r'))
&& is_char_following_following_or(lex, '\'', '\"'))
|| ((is_char_and(lex, 'r', 'b') || is_char_and(lex, 'b', 'r'))
&& is_char_following_following_or(lex, '\'', '\"'));
}
@@ -121,6 +127,37 @@ STATIC bool is_tail_of_identifier(mp_lexer_t *lex) {
return is_head_of_identifier(lex) || is_digit(lex);
}
STATIC void swap_char_banks(mp_lexer_t *lex) {
if (lex->vstr_postfix_processing) {
unichar h0, h1, h2;
h0 = lex->chr0;
h1 = lex->chr1;
h2 = lex->chr2;
lex->chr0 = lex->vstr_postfix.len > 0 ? lex->vstr_postfix.buf[0] : 0;
lex->chr1 = lex->vstr_postfix.len > 1 ? lex->vstr_postfix.buf[1] : 0;
lex->chr2 = lex->vstr_postfix.len > 2 ? lex->vstr_postfix.buf[2] : 0;
lex->chr3 = h0;
lex->chr4 = h1;
lex->chr5 = h2;
lex->vstr_postfix_idx = lex->vstr_postfix.len > 2 ? 3 : lex->vstr_postfix.len;
} else {
// blindly reset to the "backup" bank when done postfix processing
// this restores control to the mp_reader
lex->chr0 = lex->chr3;
lex->chr1 = lex->chr4;
lex->chr2 = lex->chr5;
lex->chr3 = 0;
lex->chr4 = 0;
lex->chr5 = 0;
vstr_reset(&lex->vstr_postfix);
lex->vstr_postfix_idx = 0;
}
}
STATIC void next_char(mp_lexer_t *lex) {
if (lex->chr0 == '\n') {
// a new line
@@ -136,7 +173,16 @@ STATIC void next_char(mp_lexer_t *lex) {
lex->chr0 = lex->chr1;
lex->chr1 = lex->chr2;
lex->chr2 = lex->reader.readbyte(lex->reader.data);
if (lex->vstr_postfix_processing) {
if (lex->vstr_postfix_idx == lex->vstr_postfix.len) {
lex->chr2 = '\0';
} else {
lex->chr2 = lex->vstr_postfix.buf[lex->vstr_postfix_idx++];
}
} else {
lex->chr2 = lex->reader.readbyte(lex->reader.data);
}
if (lex->chr1 == '\r') {
// CR is a new line, converted to LF
@@ -151,6 +197,11 @@ STATIC void next_char(mp_lexer_t *lex) {
if (lex->chr2 == MP_LEXER_EOF && lex->chr1 != MP_LEXER_EOF && lex->chr1 != '\n') {
lex->chr2 = '\n';
}
if (lex->vstr_postfix_processing && lex->chr0 == '\0') {
lex->vstr_postfix_processing = false;
swap_char_banks(lex);
}
}
STATIC void indent_push(mp_lexer_t *lex, size_t indent) {
@@ -270,7 +321,7 @@ STATIC bool get_hex(mp_lexer_t *lex, size_t num_digits, mp_uint_t *result) {
return true;
}
STATIC void parse_string_literal(mp_lexer_t *lex, bool is_raw) {
STATIC void parse_string_literal(mp_lexer_t *lex, bool is_raw, bool is_fstring) {
// get first quoting character
char quote_char = '\'';
if (is_char(lex, '\"')) {
@@ -291,15 +342,67 @@ STATIC void parse_string_literal(mp_lexer_t *lex, bool is_raw) {
}
size_t n_closing = 0;
bool in_expression = false;
bool expression_eat = true;
while (!is_end(lex) && (num_quotes > 1 || !is_char(lex, '\n')) && n_closing < num_quotes) {
if (is_char(lex, quote_char)) {
n_closing += 1;
vstr_add_char(&lex->vstr, CUR_CHAR(lex));
} else {
n_closing = 0;
if (is_fstring && is_char(lex, '{')) {
vstr_add_char(&lex->vstr, CUR_CHAR(lex));
in_expression = !in_expression;
expression_eat = in_expression;
if (lex->vstr_postfix.len == 0) {
vstr_add_str(&lex->vstr_postfix, ".format(");
}
next_char(lex);
continue;
}
if (is_fstring && is_char(lex, '}')) {
vstr_add_char(&lex->vstr, CUR_CHAR(lex));
if (in_expression) {
in_expression = false;
vstr_add_char(&lex->vstr_postfix, ',');
}
next_char(lex);
continue;
}
if (in_expression) {
// throw errors for illegal chars inside f-string expressions
if (is_char(lex, '#')) {
lex->tok_kind = MP_TOKEN_FSTRING_COMMENT;
return;
} else if (is_char(lex, '\\')) {
lex->tok_kind = MP_TOKEN_FSTRING_BACKSLASH;
return;
} else if (is_char(lex, ':')) {
expression_eat = false;
}
unichar c = CUR_CHAR(lex);
if (expression_eat) {
vstr_add_char(&lex->vstr_postfix, c);
} else {
vstr_add_char(&lex->vstr, c);
}
next_char(lex);
continue;
}
if (is_char(lex, '\\')) {
next_char(lex);
unichar c = CUR_CHAR(lex);
if (is_raw) {
// raw strings allow escaping of quotes, but the backslash is also emitted
vstr_add_char(&lex->vstr, '\\');
@@ -430,6 +533,13 @@ STATIC bool skip_whitespace(mp_lexer_t *lex, bool stop_at_newline) {
}
void mp_lexer_to_next(mp_lexer_t *lex) {
if (lex->vstr_postfix.len && !lex->vstr_postfix_processing) {
// end format call injection
vstr_add_char(&lex->vstr_postfix, ')');
lex->vstr_postfix_processing = true;
swap_char_banks(lex);
}
// start new token text
vstr_reset(&lex->vstr);
@@ -485,6 +595,7 @@ void mp_lexer_to_next(mp_lexer_t *lex) {
do {
// parse type codes
bool is_raw = false;
bool is_fstring = false;
mp_token_kind_t kind = MP_TOKEN_STRING;
int n_char = 0;
if (is_char(lex, 'u')) {
@@ -503,6 +614,17 @@ void mp_lexer_to_next(mp_lexer_t *lex) {
kind = MP_TOKEN_BYTES;
n_char = 2;
}
if (is_char_following(lex, 'f')) {
lex->tok_kind = MP_TOKEN_FSTRING_RAW;
break;
}
} else if (is_char(lex, 'f')) {
if (is_char_following(lex, 'r')) {
lex->tok_kind = MP_TOKEN_FSTRING_RAW;
break;
}
n_char = 1;
is_fstring = true;
}
// Set or check token kind
@@ -522,13 +644,12 @@ void mp_lexer_to_next(mp_lexer_t *lex) {
}
// Parse the literal
parse_string_literal(lex, is_raw);
parse_string_literal(lex, is_raw, is_fstring);
// Skip whitespace so we can check if there's another string following
skip_whitespace(lex, true);
} while (is_string_or_bytes(lex));
} else if (is_head_of_identifier(lex)) {
lex->tok_kind = MP_TOKEN_NAME;
@@ -682,6 +803,7 @@ mp_lexer_t *mp_lexer_new(qstr src_name, mp_reader_t reader) {
lex->num_indent_level = 1;
lex->indent_level = m_new(uint16_t, lex->alloc_indent_level);
vstr_init(&lex->vstr, 32);
vstr_init(&lex->vstr_postfix, 0);
// store sentinel for first indentation level
lex->indent_level[0] = 0;


@@ -44,6 +44,12 @@ typedef enum _mp_token_kind_t {
MP_TOKEN_INVALID,
MP_TOKEN_DEDENT_MISMATCH,
MP_TOKEN_LONELY_STRING_OPEN,
MP_TOKEN_FSTRING_BACKSLASH,
MP_TOKEN_FSTRING_COMMENT,
MP_TOKEN_FSTRING_UNCLOSED,
MP_TOKEN_FSTRING_UNOPENED,
MP_TOKEN_FSTRING_EMPTY_EXP,
MP_TOKEN_FSTRING_RAW,
MP_TOKEN_NEWLINE,
MP_TOKEN_INDENT,
@@ -150,6 +156,7 @@ typedef struct _mp_lexer_t {
mp_reader_t reader; // stream source
unichar chr0, chr1, chr2; // current cached characters from source
unichar chr3, chr4, chr5; // current cached characters from alt source
size_t line; // current source line
size_t column; // current source column
@@ -165,6 +172,9 @@ typedef struct _mp_lexer_t {
size_t tok_column; // token source column
mp_token_kind_t tok_kind; // token kind
vstr_t vstr; // token data
vstr_t vstr_postfix; // postfix to apply to string
bool vstr_postfix_processing;
uint16_t vstr_postfix_idx;
} mp_lexer_t;
mp_lexer_t *mp_lexer_new(qstr src_name, mp_reader_t reader);


@@ -924,6 +924,7 @@ mp_parse_tree_t mp_parse(mp_lexer_t *lex, mp_parse_input_kind_t input_kind) {
backtrack = false;
}
for (; i < n; ++i) {
//printf("--> inside for @L924\n");
uint16_t kind = rule_arg[i] & RULE_ARG_KIND_MASK;
if (kind == RULE_ARG_TOK) {
if (lex->tok_kind == (rule_arg[i] & RULE_ARG_ARG_MASK)) {
@@ -1168,15 +1169,43 @@ mp_parse_tree_t mp_parse(mp_lexer_t *lex, mp_parse_input_kind_t input_kind) {
) {
syntax_error:;
mp_obj_t exc;
if (lex->tok_kind == MP_TOKEN_INDENT) {
exc = mp_obj_new_exception_msg(&mp_type_IndentationError,
translate("unexpected indent"));
} else if (lex->tok_kind == MP_TOKEN_DEDENT_MISMATCH) {
exc = mp_obj_new_exception_msg(&mp_type_IndentationError,
translate("unindent does not match any outer indentation level"));
} else {
exc = mp_obj_new_exception_msg(&mp_type_SyntaxError,
translate("invalid syntax"));
switch(lex->tok_kind) {
case MP_TOKEN_INDENT:
exc = mp_obj_new_exception_msg(&mp_type_IndentationError,
translate("unexpected indent"));
break;
case MP_TOKEN_DEDENT_MISMATCH:
exc = mp_obj_new_exception_msg(&mp_type_IndentationError,
translate("unindent does not match any outer indentation level"));
break;
case MP_TOKEN_FSTRING_BACKSLASH:
exc = mp_obj_new_exception_msg(&mp_type_SyntaxError,
translate("f-string expression part cannot include a backslash"));
break;
case MP_TOKEN_FSTRING_COMMENT:
exc = mp_obj_new_exception_msg(&mp_type_SyntaxError,
translate("f-string expression part cannot include a '#'"));
break;
case MP_TOKEN_FSTRING_UNCLOSED:
exc = mp_obj_new_exception_msg(&mp_type_SyntaxError,
translate("f-string: expecting '}'"));
break;
case MP_TOKEN_FSTRING_UNOPENED:
exc = mp_obj_new_exception_msg(&mp_type_SyntaxError,
translate("f-string: single '}' is not allowed"));
break;
case MP_TOKEN_FSTRING_EMPTY_EXP:
exc = mp_obj_new_exception_msg(&mp_type_SyntaxError,
translate("f-string: empty expression not allowed"));
break;
case MP_TOKEN_FSTRING_RAW:
exc = mp_obj_new_exception_msg(&mp_type_NotImplementedError,
translate("raw f-strings are not implemented"));
break;
default:
exc = mp_obj_new_exception_msg(&mp_type_SyntaxError,
translate("invalid syntax"));
break;
}
// add traceback to give info about file name and location
// we don't have a 'block' name, so just pass the NULL qstr to indicate this


@@ -0,0 +1,126 @@
# Tests against https://www.python.org/dev/peps/pep-0498/
assert f'no interpolation' == 'no interpolation'
assert f"no interpolation" == 'no interpolation'
# Quoth the PEP:
# Backslashes may not appear anywhere within expressions. Comments, using the
# '#' character, are not allowed inside an expression
#
# CPython (3.7.4 on Linux) raises a SyntaxError here:
# >>> f'{#}'
# File "<stdin>", line 1
# SyntaxError: f-string expression part cannot include '#'
# >>> f'{\}'
# File "<stdin>", line 1
# SyntaxError: f-string expression part cannot include a backslash
# >>> f'{\\}'
# File "<stdin>", line 1
# SyntaxError: f-string expression part cannot include a backslash
# >>> f'{\#}'
# File "<stdin>", line 1
# SyntaxError: f-string expression part cannot include a backslash
# Backslashes and comments allowed outside expression
assert f"\\" == "\\"
assert f'#' == '#'
## But not inside
try:
    eval("f'{\}'")
except SyntaxError:
    pass
else:
    raise AssertionError('f-string with backslash in expression did not raise SyntaxError')
try:
    eval("f'{#}'")
except SyntaxError:
    pass
else:
    raise AssertionError('f-string with \'#\' in expression did not raise SyntaxError')
# Quoth the PEP:
# While scanning the string for expressions, any doubled braces '{{' or '}}'
# inside literal portions of an f-string are replaced by the corresponding
# single brace. Doubled literal opening braces do not signify the start of an
# expression. A single closing curly brace '}' in the literal portion of a
# string is an error: literal closing curly braces must be doubled '}}' in
# order to represent a single closing brace.
#
# CPython (3.7.4 on Linux) raises a SyntaxError for the last case:
# >>> f'{{}'
# File "<stdin>", line 1
# SyntaxError: f-string: single '}' is not allowed
assert f'{{}}' == '{}'
try:
    eval("f'{{}'")
except ValueError:
    pass
else:
    raise RuntimeError('Expected ValueError for invalid f-string literal bracing')
x = 1
assert f'{x}' == '1'
# Quoth the PEP:
# The expressions that are extracted from the string are evaluated in the
# context where the f-string appeared. This means the expression has full
# access to local and global variables. Any valid Python expression can be
# used, including function and method calls. Because the f-strings are
# evaluated where the string appears in the source code, there is no additional
# expressiveness available with f-strings. There are also no additional
# security concerns: you could have also just written the same expression, not
# inside of an f-string:
def foo():
    return 20
assert f'result={foo()}' == 'result=20'
assert f'result={foo()}' == 'result={}'.format(foo())
assert f'result={foo()}' == 'result={result}'.format(result=foo())
# Quoth the PEP:
# Adjacent f-strings and regular strings are concatenated. Regular strings are
# concatenated at compile time, and f-strings are concatenated at run time. For
# example, the expression:
#
# >>> x = 10
# >>> y = 'hi'
# >>> 'a' 'b' f'{x}' '{c}' f'str<{y:^4}>' 'd' 'e'
#
# yields the value: 'ab10{c}str< hi >de'
#
# Because strings are concatenated at lexer time rather than parser time in
# MicroPython for mostly RAM efficiency reasons (see
# https://github.com/micropython/micropython/commit/534b7c368dc2af7720f3aaed0c936ef46d773957),
# and because f-strings here are implemented as a syntax translation
# (f'{something}' => '{}'.format(something)), this particular functionality is unimplemented,
# and in the above example, the '{c}' portion will trigger a KeyError on String.format()
x = 10
y = 'hi'
assert (f'h' f'i') == 'hi'
assert (f'h' 'i') == 'hi'
assert ('h' f'i') == 'hi'
assert f'{x:^4}' == ' 10 '
assert ('a' 'b' f'{x}' f'str<{y:^4}>' 'd' 'e') == 'ab10str< hi >de'
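# Hedged extra check of the divergence described in the comment above: assuming
# the '{c}' literal really is folded into the single injected .format() call, it
# becomes a named field with no matching argument, so the whole expression
# should raise at run time here (CPython instead yields 'ab10{c}str< hi >de').
try:
    ('a' 'b' f'{x}' '{c}' f'str<{y:^4}>' 'd' 'e')
except (KeyError, IndexError):
    pass
else:
    raise RuntimeError('expected the folded {c} field to raise at run time')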
# Other tests
assert f'{{{4*10}}}' == '{40}'
try:
    eval("fr''")
except NotImplementedError:
    pass
else:
    raise RuntimeError('expected raw f-string to raise NotImplementedError')
try:
    eval("rf''")
except NotImplementedError:
    pass
else:
    raise RuntimeError('expected raw f-string to raise NotImplementedError')