Implement octal and hex escapes in strings.

2014-01-22 22:40:02 +02:00 · 2014-01-22 22:40:02 +02:00 · 0b7184dcb8
commit 0b7184dcb8
parent 0914371faf
4 changed files with 74 additions and 6 deletions
--- a/py/lexer.c
+++ b/py/lexer.c
@ -126,6 +126,10 @@ static bool is_following_digit(mp_lexer_t *lex) {
    return unichar_isdigit(lex->chr1);
 }
 // Returns true when the lexer's lookahead character (chr1) is an octal
 // digit, i.e. lies in the range '0'..'7'.
 static bool is_following_odigit(mp_lexer_t *lex) {
    return !(lex->chr1 < '0' || lex->chr1 > '7');
 }
 // TODO UNICODE include unicode characters in definition of identifiers
 static bool is_head_of_identifier(mp_lexer_t *lex) {
    return is_letter(lex) || lex->chr0 == '_';
@ -275,6 +279,32 @@ static const char *tok_kw[] = {
    NULL,
 };
 // Converts a single hex digit character to its numeric value (0..15).
 // The caller must pass a valid hex digit ('0'-'9', 'a'-'f' or 'A'-'F');
 // no validation is performed here.
 static int hex_digit(unichar c) {
    int value = c - '0';
    if (value <= 9) {
        // Decimal digit: the offset from '0' is already the answer.
        return value;
    }
    // Clear the bit that distinguishes lower case from upper case so that
    // 'a'..'f' land on the same offsets as 'A'..'F'.
    value &= ~('a' - 'A');
    // Close the gap between '9' and 'A' in the character set so 'A' maps to 10.
    return value - ('A' - ('9' + 1));
 }
 // Reads exactly num_digits hex digits from the lexer and stores the
 // accumulated value in *result.
 // On entry CUR_CHAR() is expected to be the character just before the first
 // hex digit; on success it is left on the last hex digit consumed.
 // Returns false as soon as a non-hex character is encountered, in which case
 // *result is not written.
 static bool get_hex(mp_lexer_t *lex, int num_digits, uint *result) {
    uint value = 0;
    for (; num_digits != 0; num_digits--) {
        // Step onto the next character before inspecting it.
        next_char(lex);
        unichar ch = CUR_CHAR(lex);
        if (unichar_isxdigit(ch)) {
            // Shift in one more nibble.
            value = (value << 4) + hex_digit(ch);
        } else {
            return false;
        }
    }
    *result = value;
    return true;
 }
 static void mp_lexer_next_token_into(mp_lexer_t *lex, mp_token_t *tok, bool first_token) {
    // skip white space and comments
    bool had_physical_newline = false;
@ -439,12 +469,34 @@ static void mp_lexer_next_token_into(mp_lexer_t *lex, mp_token_t *tok, bool firs
                        case 'v': c = 0x0b; break;
                        case 'f': c = 0x0c; break;
                        case 'r': c = 0x0d; break;
-                        // TODO \ooo octal
+                        case 'x':
-                        case 'x': // TODO \xhh
+                        {
-                        case 'N': // TODO \N{name} only in strings
+                            uint num;
-                        case 'u': // TODO \uxxxx only in strings
+                            if (!get_hex(lex, 2, &num)) {
-                        case 'U': // TODO \Uxxxxxxxx only in strings
+                                // TODO error message
-                        default: break; // TODO error message
+                                assert(0);
                            }
                            c = num;
                            break;
                        }
                        case 'N': break; // TODO \N{name} only in strings
                        case 'u': break; // TODO \uxxxx only in strings
                        case 'U': break; // TODO \Uxxxxxxxx only in strings
                        default:
                            if (c >= '0' && c <= '7') {
                                // Octal sequence, 1-3 chars
                                int digits = 3;
                                int num = c - '0';
                                while (is_following_odigit(lex) && --digits != 0) {
                                    next_char(lex);
                                    num = num * 8 + (CUR_CHAR(lex) - '0');
                                }
                                c = num;
                            } else {
                                // TODO error message
                                assert(0);
                            }
                            break;
                    }
                    if (c != MP_LEXER_CHAR_EOF) {
                        vstr_add_char(&lex->vstr, c);
--- a/py/misc.h
+++ b/py/misc.h
@ -43,6 +43,7 @@ bool unichar_isspace(unichar c);
 bool unichar_isalpha(unichar c);
 bool unichar_isprint(unichar c);
 bool unichar_isdigit(unichar c);
 bool unichar_isxdigit(unichar c);
 /** string ******************************************************/
--- a/py/unicode.c
+++ b/py/unicode.c
@ -62,6 +62,10 @@ bool unichar_isdigit(unichar c) {
    return c < 128 && (attr[c] & FL_DIGIT) != 0;
 }
 // Returns true if c is a hexadecimal digit: a decimal digit (as decided by
 // unichar_isdigit) or a letter in 'a'..'f' / 'A'..'F'.
 bool unichar_isxdigit(unichar c) {
    if (unichar_isdigit(c)) {
        return true;
    }
    if (c >= 'a' && c <= 'f') {
        return true;
    }
    return c >= 'A' && c <= 'F';
 }
 /*
 bool char_is_alpha_or_digit(unichar c) {
    return c < 128 && (attr[c] & (FL_ALPHA | FL_DIGIT)) != 0;
--- a/tests/basics/string-escape.py
+++ b/tests/basics/string-escape.py
@ -0,0 +1,11 @@
 # Single-digit octal escape: "\1" is one character with code 1.
 a = "a\1b"
 print(len(a))
 print(ord(a[1]))
 # Three-digit octal escape: "\123" is a single character.
 print(len("a\123b"))
 # An octal escape takes at most three digits, so the trailing "45" are
 # ordinary characters following the escaped one.
 a = "a\12345b"
 print(len(a))
 print(ord(a[1]))
 # Two-digit hex escape: "\xff" is one character with code 0xff.
 a = "a\xffb"
 print(len(a))
 print(ord(a[1]))