py: Support unicode (utf-8 encoded) identifiers in Python source.

Enabled simply by making the identifier lexing code 8-bit clean.
2015-06-09 10:58:07 +00:00 · 2015-06-09 10:58:07 +00:00 · 7ed58cb663
commit 7ed58cb663
parent 6e56bb623c
2 changed files with 32 additions and 6 deletions
--- a/py/lexer.c
+++ b/py/lexer.c
@ -112,12 +112,11 @@ STATIC bool is_following_odigit(mp_lexer_t *lex) {
    return lex->chr1 >= '0' && lex->chr1 <= '7';
 }
-// TODO UNICODE include unicode characters in definition of identifiers
+// to easily parse utf-8 identifiers we allow any raw byte with high bit set
 STATIC bool is_head_of_identifier(mp_lexer_t *lex) {
-    return is_letter(lex) || lex->chr0 == '_';
+    return is_letter(lex) || lex->chr0 == '_' || lex->chr0 >= 0x80;
 }
-// TODO UNICODE include unicode characters in definition of identifiers
 STATIC bool is_tail_of_identifier(mp_lexer_t *lex) {
    return is_head_of_identifier(lex) || is_digit(lex);
 }
@ -523,13 +522,13 @@ STATIC void mp_lexer_next_token_into(mp_lexer_t *lex, bool first_token) {
    } else if (is_head_of_identifier(lex)) {
        lex->tok_kind = MP_TOKEN_NAME;
-        // get first char
+        // get first char (add as byte to remain 8-bit clean and support utf-8)
-        vstr_add_char(&lex->vstr, CUR_CHAR(lex));
+        vstr_add_byte(&lex->vstr, CUR_CHAR(lex));
        next_char(lex);
        // get tail chars
        while (!is_end(lex) && is_tail_of_identifier(lex)) {
-            vstr_add_char(&lex->vstr, CUR_CHAR(lex));
+            vstr_add_byte(&lex->vstr, CUR_CHAR(lex));
            next_char(lex);
        }
--- a/tests/unicode/unicode_id.py
+++ b/tests/unicode/unicode_id.py
@ -0,0 +1,27 @@
 # test unicode in identifiers
 # comment
 # αβγδϵφζ
 # global identifiers
 α = 1
 αβγ = 2
 bβ = 3
 βb = 4
 print(α, αβγ, bβ, βb)
 # function, argument, local identifiers
 def α(β, γ):
    δ = β + γ
    print(β, γ, δ)
 α(1, 2)
 # class, method identifiers
 class φ:
    def __init__(self):
        pass
    def δ(self, ϵ):
        print(ϵ)
 zζzζz = φ()
 if hasattr(zζzζz, "δ"):
    zζzζz.δ(ϵ=123)