From 91d387de7df9e19bb5b00e6ad4c94790eb3422e3 Mon Sep 17 00:00:00 2001
From: Damien
Date: Wed, 9 Oct 2013 15:09:52 +0100
Subject: [PATCH] Improve indent/dedent error checking and reporting.

---
 py/compile.c |  9 ++++++++-
 py/lexer.c   | 22 ++++++++++++++++------
 py/lexer.h   | 28 +++++++++++++++-------------
 py/main.c    | 12 +++++++-----
 py/parse.c   | 18 +++++++++++++-----
 5 files changed, 59 insertions(+), 30 deletions(-)

diff --git a/py/compile.c b/py/compile.c
index f4a5886b0b..3d5a29a192 100644
--- a/py/compile.c
+++ b/py/compile.c
@@ -2192,7 +2192,14 @@ void compile_node(compiler_t *comp, py_parse_node_t pn) {
             case PY_PARSE_NODE_DECIMAL: EMIT(load_const_dec, arg); break;
             case PY_PARSE_NODE_STRING: EMIT(load_const_str, arg, false); break;
             case PY_PARSE_NODE_BYTES: EMIT(load_const_str, arg, true); break;
-            case PY_PARSE_NODE_TOKEN: EMIT(load_const_tok, arg); break;
+            case PY_PARSE_NODE_TOKEN:
+                if (arg == PY_TOKEN_NEWLINE) {
+                    // this can occur when file_input lets through a NEWLINE (eg if file starts with a newline)
+                    // do nothing
+                } else {
+                    EMIT(load_const_tok, arg);
+                }
+                break;
             default: assert(0);
         }
     } else {
diff --git a/py/lexer.c b/py/lexer.c
index 9c2195ef5b..656dc6d329 100644
--- a/py/lexer.c
+++ b/py/lexer.c
@@ -331,9 +331,7 @@ static void py_lexer_next_token_into(py_lexer_t *lex, py_token_t *tok) {
         tok->kind = PY_TOKEN_INDENT;
         lex->emit_dent -= 1;
 
-    } else if (had_physical_newline && lex->nested_bracket_level == 0
-        && tok != &lex->tok_cur // so that we don't emit a newline if file starts with a comment
-        ) {
+    } else if (had_physical_newline && lex->nested_bracket_level == 0) {
         tok->kind = PY_TOKEN_NEWLINE;
 
         uint num_spaces = lex->column - 1;
@@ -348,12 +346,11 @@ static void py_lexer_next_token_into(py_lexer_t *lex, py_token_t *tok) {
                 lex->emit_dent -= 1;
             }
             if (num_spaces != indent_top(lex)) {
-                //SyntaxError
+                tok->kind = PY_TOKEN_DEDENT_MISMATCH;
             }
         }
 
     } else if (is_end(lex)) {
-        // TODO emit a newline if file does not end in one
        if (indent_top(lex) > 0) {
             tok->kind = PY_TOKEN_NEWLINE;
             lex->emit_dent = 0;
@@ -613,7 +610,15 @@ py_lexer_t *py_lexer_from_str_len(const char *src_name, const char *str, uint le
     }
 
     py_lexer_next_token_into(lex, &lex->tok_cur);
-    py_lexer_next_token_into(lex, &lex->tok_next);
+
+    // check that the first token is in the first column
+    // (done to get equivalence with CPython)
+    if (lex->tok_cur.src_line == 1 && lex->tok_cur.src_column != 1) {
+        lex->tok_next = lex->tok_cur;
+        lex->tok_cur.kind = PY_TOKEN_INDENT;
+    } else {
+        py_lexer_next_token_into(lex, &lex->tok_next);
+    }
 
     return lex;
 }
@@ -675,3 +680,8 @@ bool py_lexer_opt_str(py_lexer_t *lex, const char *str) {
 bool py_lexer_show_error(py_lexer_t *lex, const char *msg) {
     return py_token_show_error(&lex->tok_cur, msg);
 }
+
+bool py_lexer_show_error_pythonic(py_lexer_t *lex, const char *msg) {
+    printf(" File \"%s\", line %d column %d\n%s\n", lex->tok_cur.src_name, lex->tok_cur.src_line, lex->tok_cur.src_column, msg);
+    return false;
+}
diff --git a/py/lexer.h b/py/lexer.h
index 32ab48a084..948901259d 100644
--- a/py/lexer.h
+++ b/py/lexer.h
@@ -12,20 +12,21 @@
 typedef enum _py_token_kind_t {
     PY_TOKEN_END,                   // 0
     PY_TOKEN_INVALID,
+    PY_TOKEN_DEDENT_MISMATCH,
     PY_TOKEN_LONELY_STRING_OPEN,
 
-    PY_TOKEN_NEWLINE,               // 3
-    PY_TOKEN_INDENT,                // 4
-    PY_TOKEN_DEDENT,                // 5
+    PY_TOKEN_NEWLINE,               // 4
+    PY_TOKEN_INDENT,                // 5
+    PY_TOKEN_DEDENT,                // 6
 
-    PY_TOKEN_NAME,                  // 6
+    PY_TOKEN_NAME,                  // 7
     PY_TOKEN_NUMBER,
     PY_TOKEN_STRING,
     PY_TOKEN_BYTES,
     PY_TOKEN_ELLIPSES,
 
-    PY_TOKEN_KW_FALSE,              // 11
+    PY_TOKEN_KW_FALSE,              // 12
     PY_TOKEN_KW_NONE,
     PY_TOKEN_KW_TRUE,
 
     PY_TOKEN_KW_AND,
@@ -34,7 +35,7 @@
     PY_TOKEN_KW_BREAK,
     PY_TOKEN_KW_CLASS,
     PY_TOKEN_KW_CONTINUE,
-    PY_TOKEN_KW_DEF,                // 20
+    PY_TOKEN_KW_DEF,                // 21
     PY_TOKEN_KW_DEL,
     PY_TOKEN_KW_ELIF,
     PY_TOKEN_KW_ELSE,
@@ -44,7 +45,7 @@
     PY_TOKEN_KW_FROM,
     PY_TOKEN_KW_GLOBAL,
     PY_TOKEN_KW_IF,
-    PY_TOKEN_KW_IMPORT,             // 30
+    PY_TOKEN_KW_IMPORT,             // 31
     PY_TOKEN_KW_IN,
     PY_TOKEN_KW_IS,
     PY_TOKEN_KW_LAMBDA,
@@ -54,12 +55,12 @@
     PY_TOKEN_KW_PASS,
     PY_TOKEN_KW_RAISE,
     PY_TOKEN_KW_RETURN,
-    PY_TOKEN_KW_TRY,                // 40
+    PY_TOKEN_KW_TRY,                // 41
     PY_TOKEN_KW_WHILE,
     PY_TOKEN_KW_WITH,
     PY_TOKEN_KW_YIELD,
 
-    PY_TOKEN_OP_PLUS,               // 44
+    PY_TOKEN_OP_PLUS,               // 45
     PY_TOKEN_OP_MINUS,
     PY_TOKEN_OP_STAR,
     PY_TOKEN_OP_DBL_STAR,
@@ -69,7 +70,7 @@
     PY_TOKEN_OP_LESS,
     PY_TOKEN_OP_DBL_LESS,
     PY_TOKEN_OP_MORE,
-    PY_TOKEN_OP_DBL_MORE,           // 54
+    PY_TOKEN_OP_DBL_MORE,           // 55
     PY_TOKEN_OP_AMPERSAND,
     PY_TOKEN_OP_PIPE,
     PY_TOKEN_OP_CARET,
@@ -79,7 +80,7 @@
     PY_TOKEN_OP_DBL_EQUAL,
     PY_TOKEN_OP_NOT_EQUAL,
 
-    PY_TOKEN_DEL_PAREN_OPEN,        // 63
+    PY_TOKEN_DEL_PAREN_OPEN,        // 64
     PY_TOKEN_DEL_PAREN_CLOSE,
     PY_TOKEN_DEL_BRACKET_OPEN,
     PY_TOKEN_DEL_BRACKET_CLOSE,
@@ -89,7 +90,7 @@
     PY_TOKEN_DEL_COLON,
     PY_TOKEN_DEL_PERIOD,
     PY_TOKEN_DEL_SEMICOLON,
-    PY_TOKEN_DEL_AT,                // 73
+    PY_TOKEN_DEL_AT,                // 74
     PY_TOKEN_DEL_EQUAL,
     PY_TOKEN_DEL_PLUS_EQUAL,
     PY_TOKEN_DEL_MINUS_EQUAL,
@@ -99,7 +100,7 @@
     PY_TOKEN_DEL_PERCENT_EQUAL,
     PY_TOKEN_DEL_AMPERSAND_EQUAL,
     PY_TOKEN_DEL_PIPE_EQUAL,
-    PY_TOKEN_DEL_CARET_EQUAL,       // 83
+    PY_TOKEN_DEL_CARET_EQUAL,       // 84
     PY_TOKEN_DEL_DBL_MORE_EQUAL,
     PY_TOKEN_DEL_DBL_LESS_EQUAL,
     PY_TOKEN_DEL_DBL_STAR_EQUAL,
@@ -137,5 +138,6 @@ bool py_lexer_opt_kind(py_lexer_t *lex, py_token_kind_t kind);
 bool py_lexer_opt_str(py_lexer_t *lex, const char *str);
 */
 bool py_lexer_show_error(py_lexer_t *lex, const char *msg);
+bool py_lexer_show_error_pythonic(py_lexer_t *lex, const char *msg);
 
 #endif /* INCLUDED_LEXER_H */
diff --git a/py/main.c b/py/main.c
index 7b17c38a85..cca52c4e73 100644
--- a/py/main.c
+++ b/py/main.c
@@ -31,11 +31,13 @@ int main(int argc, char **argv) {
         }
     } else {
         py_parse_node_t pn = py_parse(lex, 0);
-        //printf("----------------\n");
-        //parse_node_show(pn, 0);
-        //printf("----------------\n");
-        py_compile(pn);
-        //printf("----------------\n");
+        if (pn != PY_PARSE_NODE_NULL) {
+            //printf("----------------\n");
+            parse_node_show(pn, 0);
+            //printf("----------------\n");
+            py_compile(pn);
+            //printf("----------------\n");
+        }
     }
 
     py_lexer_free(lex);
diff --git a/py/parse.c b/py/parse.c
index 5eb4b27dbf..124d00ffeb 100644
--- a/py/parse.c
+++ b/py/parse.c
@@ -545,10 +545,12 @@ py_parse_node_t py_parse(py_lexer_t *lex, int wanted_rule) {
                 assert(0);
         }
     }
+
+    // check we are at the end of the token stream
     if (!py_lexer_is_kind(lex, PY_TOKEN_END)) {
-        py_lexer_show_error(lex, "unexpected token at end:");
-        py_token_show(py_lexer_cur(lex));
+        goto syntax_error;
     }
+
     //printf("--------------\n");
     //result_stack_show(parser);
     assert(parser->result_stack_top == 1);
@@ -557,10 +559,16 @@
     return parser->result_stack[0];
 
 syntax_error:
-    py_lexer_show_error(lex, "syntax error:");
+    if (py_lexer_is_kind(lex, PY_TOKEN_INDENT)) {
+        py_lexer_show_error_pythonic(lex, "IndentationError: unexpected indent");
+    } else if (py_lexer_is_kind(lex, PY_TOKEN_DEDENT_MISMATCH)) {
+        py_lexer_show_error_pythonic(lex, "IndentationError: unindent does not match any outer indentation level");
+    } else {
+        py_lexer_show_error_pythonic(lex, "syntax error:");
 #ifdef USE_RULE_NAME
-    py_lexer_show_error(lex, rule->rule_name);
+        py_lexer_show_error(lex, rule->rule_name);
 #endif
-    py_token_show(py_lexer_cur(lex));
+        py_token_show(py_lexer_cur(lex));
+    }
     return PY_PARSE_NODE_NULL;
 }