circuitpython/py/unicode.c

/*
 * This file is part of the Micro Python project, http://micropython.org/
 *
 * The MIT License (MIT)
 *
 * Copyright (c) 2013, 2014 Damien P. George
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include <stdint.h>

#include "mpconfig.h"
#include "misc.h"
#include "unicode.h"

// attribute flags
// Each flag occupies its own bit, so a single uint8_t entry of the attr
// table can carry any combination of them.  The unichar_is* predicates in
// this file each test exactly one of these bits:
//   FL_PRINT - unichar_isprint      FL_SPACE - unichar_isspace
//   FL_DIGIT - unichar_isdigit      FL_ALPHA - unichar_isalpha
//   FL_UPPER - unichar_isupper      FL_LOWER - unichar_islower
#define FL_PRINT (0x01)
#define FL_SPACE (0x02)
#define FL_DIGIT (0x04)
#define FL_ALPHA (0x08)
#define FL_UPPER (0x10)
#define FL_LOWER (0x20)

// shorthand character attributes
// Ready-made flag combinations used as entries of the attr table.  Every
// shorthand includes FL_PRINT; AT_UP and AT_LO additionally include FL_ALPHA,
// so cased letters also answer true to unichar_isalpha.
// NOTE(review): AT_AL (alphabetic without case) is defined but the attr table
// in this file never uses it.
#define AT_PR (FL_PRINT)
#define AT_SP (FL_SPACE | FL_PRINT)
#define AT_DI (FL_DIGIT | FL_PRINT)
#define AT_AL (FL_ALPHA | FL_PRINT)
#define AT_UP (FL_UPPER | FL_ALPHA | FL_PRINT)
#define AT_LO (FL_LOWER | FL_ALPHA | FL_PRINT)

// table of attributes for ascii characters
// Indexed by character code; 16 rows of 8 entries cover codes 0x00-0x7F.
// Callers in this file check c < 128 before indexing.  An entry of 0 means
// the character has no attributes at all (not even printable).
STATIC const uint8_t attr[] = {
    // 0x00-0x07: control characters
    0, 0, 0, 0, 0, 0, 0, 0,
    // 0x08-0x0F: 0x09-0x0D (tab, LF, VT, FF, CR) count as space
    0, AT_SP, AT_SP, AT_SP, AT_SP, AT_SP, 0, 0,
    // 0x10-0x17: control characters
    0, 0, 0, 0, 0, 0, 0, 0,
    // 0x18-0x1F: control characters
    0, 0, 0, 0, 0, 0, 0, 0,
    // 0x20-0x27: ' ' is space; '!' to '\'' are printable punctuation
    AT_SP, AT_PR, AT_PR, AT_PR, AT_PR, AT_PR, AT_PR, AT_PR,
    // 0x28-0x2F: '(' to '/'
    AT_PR, AT_PR, AT_PR, AT_PR, AT_PR, AT_PR, AT_PR, AT_PR,
    // 0x30-0x37: '0' to '7'
    AT_DI, AT_DI, AT_DI, AT_DI, AT_DI, AT_DI, AT_DI, AT_DI,
    // 0x38-0x3F: '8', '9', then ':' to '?'
    AT_DI, AT_DI, AT_PR, AT_PR, AT_PR, AT_PR, AT_PR, AT_PR,
    // 0x40-0x47: '@', then 'A' to 'G'
    AT_PR, AT_UP, AT_UP, AT_UP, AT_UP, AT_UP, AT_UP, AT_UP,
    // 0x48-0x4F: 'H' to 'O'
    AT_UP, AT_UP, AT_UP, AT_UP, AT_UP, AT_UP, AT_UP, AT_UP,
    // 0x50-0x57: 'P' to 'W'
    AT_UP, AT_UP, AT_UP, AT_UP, AT_UP, AT_UP, AT_UP, AT_UP,
    // 0x58-0x5F: 'X' to 'Z', then '[' to '_'
    AT_UP, AT_UP, AT_UP, AT_PR, AT_PR, AT_PR, AT_PR, AT_PR,
    // 0x60-0x67: '`', then 'a' to 'g'
    AT_PR, AT_LO, AT_LO, AT_LO, AT_LO, AT_LO, AT_LO, AT_LO,
    // 0x68-0x6F: 'h' to 'o'
    AT_LO, AT_LO, AT_LO, AT_LO, AT_LO, AT_LO, AT_LO, AT_LO,
    // 0x70-0x77: 'p' to 'w'
    AT_LO, AT_LO, AT_LO, AT_LO, AT_LO, AT_LO, AT_LO, AT_LO,
    // 0x78-0x7F: 'x' to 'z', '{' to '~', and DEL (no attributes)
    AT_LO, AT_LO, AT_LO, AT_PR, AT_PR, AT_PR, AT_PR, 0
};

// TODO: Rename to str_get_char
// Return the character that starts at s.
// With MICROPY_PY_BUILTINS_STR_UNICODE enabled the bytes at s are decoded as
// one multi-byte sequence; otherwise the single byte at s is returned as-is.
// No validation is done: after the lead byte, every following byte accepted
// by UTF8_IS_CONT is folded into the result.
unichar utf8_get_char(const byte *s) {
#if MICROPY_PY_BUILTINS_STR_UNICODE
    unichar ch = *s++;
    if (UTF8_IS_NONASCII(ch)) {
        // Strip the length marker from the lead byte: clear bit 7, then each
        // consecutive set bit from bit 6 downwards, leaving only payload bits.
        ch &= 0x7F;
        unichar bit = 0x40;
        while (ch & bit) {
            ch &= ~bit;
            bit >>= 1;
        }
        // Each continuation byte contributes its low 6 bits.
        for (; UTF8_IS_CONT(*s); ++s) {
            ch = (ch << 6) | (*s & 0x3F);
        }
    }
    return ch;
#else
    return *s;
#endif
}

// TODO: Rename to str_next_char
// Return a pointer to the start of the character following the one at s.
// With MICROPY_PY_BUILTINS_STR_UNICODE enabled this steps over the first byte
// and then over every byte accepted by UTF8_IS_CONT; otherwise it simply
// advances by one byte.
const byte *utf8_next_char(const byte *s) {
#if MICROPY_PY_BUILTINS_STR_UNICODE
    // The first byte is always consumed, hence the do/while.
    do {
        ++s;
    } while (UTF8_IS_CONT(*s));
    return s;
#else
    return s + 1;
#endif
}

// Convert a byte pointer inside a string into a character index.
// s is the start of the string and ptr a position within it; the result is
// the number of bytes in [s, ptr) that UTF8_IS_CONT does not accept, i.e. the
// number of characters that begin before ptr.  Returns 0 when ptr <= s.
mp_uint_t utf8_ptr_to_index(const byte *s, const byte *ptr) {
    mp_uint_t index = 0;
    for (const byte *p = s; p < ptr; ++p) {
        if (UTF8_IS_CONT(*p)) {
            // continuation bytes belong to a character already counted
            continue;
        }
        ++index;
    }
    return index;
}

// TODO: Rename to str_charlen
// Return the length, in characters, of the len bytes starting at str.
// With MICROPY_PY_BUILTINS_STR_UNICODE enabled only bytes that UTF8_IS_CONT
// rejects are counted (one per character); otherwise every byte is a
// character and len is returned unchanged.
mp_uint_t unichar_charlen(const char *str, mp_uint_t len) {
#if MICROPY_PY_BUILTINS_STR_UNICODE
    mp_uint_t count = 0;
    for (mp_uint_t i = 0; i < len; i++) {
        if (!UTF8_IS_CONT(str[i])) {
            count++;
        }
    }
    return count;
#else
    return len;
#endif
}

// Be aware: These unichar_is* functions are actually ASCII-only!
// Return true if c is below 128 and its attr entry has FL_SPACE set.
bool unichar_isspace(unichar c) {
    if (c >= 128) {
        // outside the attribute table
        return false;
    }
    return (attr[c] & FL_SPACE) != 0;
}

// Return true if c is below 128 and its attr entry has FL_ALPHA set.
bool unichar_isalpha(unichar c) {
    if (c >= 128) {
        // outside the attribute table
        return false;
    }
    return (attr[c] & FL_ALPHA) != 0;
}

// Return true if c is below 128 and its attr entry has FL_PRINT set.
bool unichar_isprint(unichar c) {
    if (c >= 128) {
        // outside the attribute table
        return false;
    }
    return (attr[c] & FL_PRINT) != 0;
}

// Return true if c is below 128 and its attr entry has FL_DIGIT set.
bool unichar_isdigit(unichar c) {
    if (c >= 128) {
        // outside the attribute table
        return false;
    }
    return (attr[c] & FL_DIGIT) != 0;
}

// Return true if c is a hexadecimal digit: anything unichar_isdigit accepts,
// or a letter in 'a'..'f' or 'A'..'F'.
bool unichar_isxdigit(unichar c) {
    if (unichar_isdigit(c)) {
        return true;
    }
    if ('a' <= c && c <= 'f') {
        return true;
    }
    return 'A' <= c && c <= 'F';
}

/*
bool unichar_is_alpha_or_digit(unichar c) {
    return c < 128 && (attr[c] & (FL_ALPHA | FL_DIGIT)) != 0;
}
*/

// Return true if c is below 128 and its attr entry has FL_UPPER set.
bool unichar_isupper(unichar c) {
    if (c >= 128) {
        // outside the attribute table
        return false;
    }
    return (attr[c] & FL_UPPER) != 0;
}

// Return true if c is below 128 and its attr entry has FL_LOWER set.
bool unichar_islower(unichar c) {
    if (c >= 128) {
        // outside the attribute table
        return false;
    }
    return (attr[c] & FL_LOWER) != 0;
}

// Map a character accepted by unichar_isupper to its lower-case form by
// adding 0x20 (the offset between 'A' and 'a' in ASCII); any other character
// is returned unchanged.
unichar unichar_tolower(unichar c) {
    return unichar_isupper(c) ? c + 0x20 : c;
}

// Map a character accepted by unichar_islower to its upper-case form by
// subtracting 0x20 (the offset between 'a' and 'A' in ASCII); any other
// character is returned unchanged.
unichar unichar_toupper(unichar c) {
    return unichar_islower(c) ? c - 0x20 : c;
}
Add license header to (almost) all files. Blanket wide to all .c and .h files. Some files originating from ST are difficult to deal with (license wise) so it was left out of those. Also merged modpyb.h, modos.h, modstm.h and modtime.h in stmhal/. 2014-05-03 18:27:38 -04:00			`/*`
			`* This file is part of the Micro Python project, http://micropython.org/`
			`*`
			`* The MIT License (MIT)`
			`*`
			`* Copyright (c) 2013, 2014 Damien P. George`
			`*`
			`* Permission is hereby granted, free of charge, to any person obtaining a copy`
			`* of this software and associated documentation files (the "Software"), to deal`
			`* in the Software without restriction, including without limitation the rights`
			`* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell`
			`* copies of the Software, and to permit persons to whom the Software is`
			`* furnished to do so, subject to the following conditions:`
			`*`
			`* The above copyright notice and this permission notice shall be included in`
			`* all copies or substantial portions of the Software.`
			`*`
			`* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR`
			`* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,`
			`* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE`
			`* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER`
			`* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,`
			`* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN`
			`* THE SOFTWARE.`
			`*/`

Initial commit. 2013-10-04 14:53:11 -04:00			`#include <stdint.h>`

Replace global "static" -> "STATIC", to allow "analysis builds". Part 2. 2014-02-12 11:31:30 -05:00			`#include "mpconfig.h"`
py: Include mpconfig.h before all other includes. It defines types used by all other headers. Fixes #691. 2014-06-21 15:43:22 -04:00			`#include "misc.h"`
py: Tidy up a few function declarations. 2014-12-10 17:37:07 -05:00			`#include "unicode.h"`
Initial commit. 2013-10-04 14:53:11 -04:00
			`// attribute flags`
			`#define FL_PRINT (0x01)`
			`#define FL_SPACE (0x02)`
			`#define FL_DIGIT (0x04)`
			`#define FL_ALPHA (0x08)`
			`#define FL_UPPER (0x10)`
			`#define FL_LOWER (0x20)`

			`// shorthand character attributes`
			`#define AT_PR (FL_PRINT)`
			`#define AT_SP (FL_SPACE \| FL_PRINT)`
			`#define AT_DI (FL_DIGIT \| FL_PRINT)`
			`#define AT_AL (FL_ALPHA \| FL_PRINT)`
			`#define AT_UP (FL_UPPER \| FL_ALPHA \| FL_PRINT)`
			`#define AT_LO (FL_LOWER \| FL_ALPHA \| FL_PRINT)`

			`// table of attributes for ascii characters`
Replace global "static" -> "STATIC", to allow "analysis builds". Part 2. 2014-02-12 11:31:30 -05:00			`STATIC const uint8_t attr[] = {`
Initial commit. 2013-10-04 14:53:11 -04:00			`0, 0, 0, 0, 0, 0, 0, 0,`
py: Make form-feed character a space (following C isspace). Eg, in CPython stdlib, email/header.py has a form-feed character. 2014-04-10 06:39:36 -04:00			`0, AT_SP, AT_SP, AT_SP, AT_SP, AT_SP, 0, 0,`
Initial commit. 2013-10-04 14:53:11 -04:00			`0, 0, 0, 0, 0, 0, 0, 0,`
			`0, 0, 0, 0, 0, 0, 0, 0,`
			`AT_SP, AT_PR, AT_PR, AT_PR, AT_PR, AT_PR, AT_PR, AT_PR,`
			`AT_PR, AT_PR, AT_PR, AT_PR, AT_PR, AT_PR, AT_PR, AT_PR,`
			`AT_DI, AT_DI, AT_DI, AT_DI, AT_DI, AT_DI, AT_DI, AT_DI,`
			`AT_DI, AT_DI, AT_PR, AT_PR, AT_PR, AT_PR, AT_PR, AT_PR,`
			`AT_PR, AT_UP, AT_UP, AT_UP, AT_UP, AT_UP, AT_UP, AT_UP,`
			`AT_UP, AT_UP, AT_UP, AT_UP, AT_UP, AT_UP, AT_UP, AT_UP,`
			`AT_UP, AT_UP, AT_UP, AT_UP, AT_UP, AT_UP, AT_UP, AT_UP,`
			`AT_UP, AT_UP, AT_UP, AT_PR, AT_PR, AT_PR, AT_PR, AT_PR,`
			`AT_PR, AT_LO, AT_LO, AT_LO, AT_LO, AT_LO, AT_LO, AT_LO,`
			`AT_LO, AT_LO, AT_LO, AT_LO, AT_LO, AT_LO, AT_LO, AT_LO,`
			`AT_LO, AT_LO, AT_LO, AT_LO, AT_LO, AT_LO, AT_LO, AT_LO,`
			`AT_LO, AT_LO, AT_LO, AT_PR, AT_PR, AT_PR, AT_PR, 0`
			`};`

unicode: Make get_char()/next_char()/charlen() be 8-bit compatible. Based on config define. 2014-06-13 23:39:20 -04:00			`// TODO: Rename to str_get_char`
			`unichar utf8_get_char(const byte *s) {`
			`#if MICROPY_PY_BUILTINS_STR_UNICODE`
py: Implement basic unicode functions. 2014-06-03 15:28:12 -04:00			`unichar ord = *s++;`
			`if (!UTF8_IS_NONASCII(ord)) return ord;`
			`ord &= 0x7F;`
			`for (unichar mask = 0x40; ord & mask; mask >>= 1) {`
			`ord &= ~mask;`
			`}`
			`while (UTF8_IS_CONT(*s)) {`
			`ord = (ord << 6) \| (*s++ & 0x3F);`
			`}`
			`return ord;`
unicode: Make get_char()/next_char()/charlen() be 8-bit compatible. Based on config define. 2014-06-13 23:39:20 -04:00			`#else`
			`return *s;`
			`#endif`
py: Implement basic unicode functions. 2014-06-03 15:28:12 -04:00			`}`

unicode: Make get_char()/next_char()/charlen() be 8-bit compatible. Based on config define. 2014-06-13 23:39:20 -04:00			`// TODO: Rename to str_next_char`
			`const byte utf8_next_char(const byte s) {`
			`#if MICROPY_PY_BUILTINS_STR_UNICODE`
py: Implement basic unicode functions. 2014-06-03 15:28:12 -04:00			`++s;`
			`while (UTF8_IS_CONT(*s)) {`
			`++s;`
			`}`
unicode: Make get_char()/next_char()/charlen() be 8-bit compatible. Based on config define. 2014-06-13 23:39:20 -04:00			`return s;`
			`#else`
			`return s + 1;`
			`#endif`
Initial commit. 2013-10-04 14:53:11 -04:00			`}`

py: Tidy up a few function declarations. 2014-12-10 17:37:07 -05:00			`mp_uint_t utf8_ptr_to_index(const byte s, const byte ptr) {`
Rename machine_(u)int_t to mp_(u)int_t. See discussion in issue #50. 2014-07-03 08:25:24 -04:00			`mp_uint_t i = 0;`
unicode: Add utf8_ptr_to_index(). Useful when we have pointer to char inside string, but need to return char index. (E.g. str.find()). 2014-06-13 20:16:17 -04:00			`while (ptr > s) {`
			`if (!UTF8_IS_CONT(*--ptr)) {`
			`i++;`
			`}`
			`}`

			`return i;`
			`}`

py: Make unichar_charlen() accept/return machine_uint_t. 2014-06-28 16:14:30 -04:00			`// TODO: Rename to str_charlen`
Rename machine_(u)int_t to mp_(u)int_t. See discussion in issue #50. 2014-07-03 08:25:24 -04:00			`mp_uint_t unichar_charlen(const char *str, mp_uint_t len)`
py: Implement basic unicode functions. 2014-06-03 15:28:12 -04:00			`{`
unicode: Make get_char()/next_char()/charlen() be 8-bit compatible. Based on config define. 2014-06-13 23:39:20 -04:00			`#if MICROPY_PY_BUILTINS_STR_UNICODE`
Rename machine_(u)int_t to mp_(u)int_t. See discussion in issue #50. 2014-07-03 08:25:24 -04:00			`mp_uint_t charlen = 0;`
py: Implement basic unicode functions. 2014-06-03 15:28:12 -04:00			`for (const char *top = str + len; str < top; ++str) {`
			`if (!UTF8_IS_CONT(*str)) {`
			`++charlen;`
			`}`
			`}`
			`return charlen;`
unicode: Make get_char()/next_char()/charlen() be 8-bit compatible. Based on config define. 2014-06-13 23:39:20 -04:00			`#else`
			`return len;`
			`#endif`
Initial commit. 2013-10-04 14:53:11 -04:00			`}`

py: Implement basic unicode functions. 2014-06-03 15:28:12 -04:00			`// Be aware: These unichar_is* functions are actually ASCII-only!`
Put unicode functions in unicode.c, and tidy their names. 2013-12-30 13:23:50 -05:00			`bool unichar_isspace(unichar c) {`
Initial commit. 2013-10-04 14:53:11 -04:00			`return c < 128 && (attr[c] & FL_SPACE) != 0;`
			`}`

Put unicode functions in unicode.c, and tidy their names. 2013-12-30 13:23:50 -05:00			`bool unichar_isalpha(unichar c) {`
Initial commit. 2013-10-04 14:53:11 -04:00			`return c < 128 && (attr[c] & FL_ALPHA) != 0;`
			`}`

Put unicode functions in unicode.c, and tidy their names. 2013-12-30 13:23:50 -05:00			`bool unichar_isprint(unichar c) {`
Initial commit. 2013-10-04 14:53:11 -04:00			`return c < 128 && (attr[c] & FL_PRINT) != 0;`
			`}`

Put unicode functions in unicode.c, and tidy their names. 2013-12-30 13:23:50 -05:00			`bool unichar_isdigit(unichar c) {`
Initial commit. 2013-10-04 14:53:11 -04:00			`return c < 128 && (attr[c] & FL_DIGIT) != 0;`
			`}`

Implement octal and hex escapes in strings. 2014-01-22 15:40:02 -05:00			`bool unichar_isxdigit(unichar c) {`
			`return unichar_isdigit(c) \|\| (c >= 'a' && c <= 'f') \|\| (c >= 'A' && c <= 'F');`
			`}`

Initial commit. 2013-10-04 14:53:11 -04:00			`/*`
py: Rename some unichar functions for consistency. 2014-05-11 12:53:11 -04:00			`bool unichar_is_alpha_or_digit(unichar c) {`
Initial commit. 2013-10-04 14:53:11 -04:00			`return c < 128 && (attr[c] & (FL_ALPHA \| FL_DIGIT)) != 0;`
			`}`
objstr: Implement .lower() and .upper(). 2014-05-10 12:47:41 -04:00			`*/`
Initial commit. 2013-10-04 14:53:11 -04:00
py: Rename some unichar functions for consistency. 2014-05-11 12:53:11 -04:00			`bool unichar_isupper(unichar c) {`
Initial commit. 2013-10-04 14:53:11 -04:00			`return c < 128 && (attr[c] & FL_UPPER) != 0;`
			`}`

py: Rename some unichar functions for consistency. 2014-05-11 12:53:11 -04:00			`bool unichar_islower(unichar c) {`
Initial commit. 2013-10-04 14:53:11 -04:00			`return c < 128 && (attr[c] & FL_LOWER) != 0;`
			`}`
objstr: Implement .lower() and .upper(). 2014-05-10 12:47:41 -04:00
			`unichar unichar_tolower(unichar c) {`
py: Rename some unichar functions for consistency. 2014-05-11 12:53:11 -04:00			`if (unichar_isupper(c)) {`
objstr: Implement .lower() and .upper(). 2014-05-10 12:47:41 -04:00			`return c + 0x20;`
			`}`
			`return c;`
			`}`

			`unichar unichar_toupper(unichar c) {`
py: Rename some unichar functions for consistency. 2014-05-11 12:53:11 -04:00			`if (unichar_islower(c)) {`
objstr: Implement .lower() and .upper(). 2014-05-10 12:47:41 -04:00			`return c - 0x20;`
			`}`
			`return c;`
			`}`