circuitpython/tests/unicode/unicode.py
tll 68c28174d0 py/objstr: Add check for valid UTF-8 when making a str from bytes.
This patch adds a function utf8_check() to check for a valid UTF-8 encoded
string, and calls it when constructing a str from raw bytes.  The feature
is selectable at compile time via MICROPY_PY_BUILTINS_STR_UNICODE_CHECK and
is enabled if unicode is enabled.  It costs about 110 bytes on Thumb-2, 150
bytes on Xtensa and 170 bytes on x86-64.
2017-09-06 16:43:09 +10:00

50 lines
1.2 KiB
Python

# Test a UTF-8 encoded literal
s = "asdf©qwer"
for i in range(len(s)):
print("s[%d]: %s %X"%(i, s[i], ord(s[i])))
# Test all three forms of Unicode escape, and
# all blocks of UTF-8 byte patterns
s = "a\xA9\xFF\u0123\u0800\uFFEE\U0001F44C"
for i in range(-len(s), len(s)):
print("s[%d]: %s %X"%(i, s[i], ord(s[i])))
print("s[:%d]: %d chars, '%s'"%(i, len(s[:i]), s[:i]))
for j in range(i, len(s)):
print("s[%d:%d]: %d chars, '%s'"%(i, j, len(s[i:j]), s[i:j]))
print("s[%d:]: %d chars, '%s'"%(i, len(s[i:]), s[i:]))
# Test UTF-8 encode and decode
enc = s.encode()
print(enc, enc.decode() == s)
# printing of unicode chars using repr
# NOTE: for some characters (eg \u10ff) we differ to CPython
print(repr('a\uffff'))
print(repr('a\U0001ffff'))
# test invalid escape code
try:
eval('"\\U00110000"')
except SyntaxError:
print('SyntaxError')
# test unicode string given to int
try:
int('\u0200')
except ValueError:
print('ValueError')
# test invalid UTF-8 string
try:
str(b'ab\xa1', 'utf8')
except UnicodeError:
print('UnicodeError')
try:
str(b'ab\xf8', 'utf8')
except UnicodeError:
print('UnicodeError')
try:
str(bytearray(b'ab\xc0a'), 'utf8')
except UnicodeError:
print('UnicodeError')