# Tests of 'bytes' (immutable byte strings).

load("assert.star", "assert")

# bytes(string) -- UTF-k to UTF-8 transcoding with U+FFFD replacement
# These four values are shared fixtures for the rest of the file.
hello = bytes("hello, 世界")
goodbye = bytes("goodbye")
empty = bytes("")
nonprinting = bytes("\t\n\x7F\u200D")  # TAB, NEWLINE, DEL, ZERO_WIDTH_JOINER

# Slicing the string cuts the last character short; the expected result
# contains two U+FFFD replacement characters in its place.
assert.eq(bytes("hello, 世界"[:-1]), b"hello, 世��")

# bytes(iterable of int) -- construct from numeric byte values
assert.eq(bytes([65, 66, 67]), b"ABC")
assert.eq(bytes((65, 66, 67)), b"ABC")

# F0 9F 98 BF is the UTF-8 encoding of U+1F63F (😿). The literal below must
# contain that character; an empty literal here means it was lost in transit.
assert.eq(bytes([0xf0, 0x9f, 0x98, 0xbf]), b"😿")

# NOTE(review): the '.' before "want" presumably stands in for a literal '('
# in the error message, since the expected text is a pattern -- confirm.
assert.fails(
    lambda: bytes([300]),
    "at index 0, 300 out of range .want value in unsigned 8-bit range",
)
assert.fails(
    lambda: bytes([b"a"]),
    "at index 0, got bytes, want int",
)
assert.fails(lambda: bytes(1), "want string, bytes, or iterable of ints")

# literals
assert.eq(b"hello, 世界", hello)
assert.eq(b"goodbye", goodbye)
assert.eq(b"", empty)
assert.eq(b"\t\n\x7F\u200D", nonprinting)
assert.ne("abc", b"abc")  # a string never equals a bytes value

# Octal, hex, \u and \U escapes against the same characters written literally
# (the last one is U+1F63F).
assert.eq(b"\012\xff\u0400\U0001F63F", b"\n\xffЀ😿")  # see scanner tests for more
assert.eq(rb"\r\n\t", b"\\r\\n\\t")  # raw

# type
assert.eq(type(hello), "bytes")

# len -- counts bytes, so non-ASCII characters contribute more than 1.
assert.eq(len(hello), 13)
assert.eq(len(goodbye), 7)
assert.eq(len(empty), 0)
assert.eq(len(b"A"), 1)
assert.eq(len(b"Ѐ"), 2)
assert.eq(len(b"世"), 3)
assert.eq(len(b"😿"), 4)  # U+1F63F

# truth
assert.true(hello)
assert.true(goodbye)
assert.true(not empty)

# str(bytes) does UTF-8 to UTF-k transcoding.
# TODO(adonovan): specify.
assert.eq(str(hello), "hello, 世界")
assert.eq(
    str(hello[:-1]),
    "hello, 世��",
)  # incomplete UTF-8 encoding => U+FFFD
assert.eq(str(goodbye), "goodbye")
assert.eq(str(empty), "")
assert.eq(str(nonprinting), "\t\n\x7f\u200d")
assert.eq(
    str(b"\xED\xB0\x80"),
    "���",
)  # UTF-8 encoding of unpaired surrogate => U+FFFD x 3

# repr
assert.eq(repr(hello), r'b"hello, 世界"')
assert.eq(
    repr(hello[:-1]),
    r'b"hello, 世\xe7\x95"',
)  # the bytes of the incomplete UTF-8 encoding appear as \x escapes
assert.eq(repr(goodbye), 'b"goodbye"')
assert.eq(repr(empty), 'b""')
assert.eq(repr(nonprinting), 'b"\\t\\n\\x7f\\u200d"')

# equality
assert.eq(hello, hello)
assert.ne(hello, goodbye)
assert.eq(b"goodbye", goodbye)  # a literal equals the value built by bytes(string)

# ordered comparison
assert.lt(b"abc", b"abd")
assert.lt(b"abc", b"abcd")  # a proper prefix sorts first
assert.lt(b"\x7f", b"\x80")  # bytes compare as uint8, not int8

# bytes are dict-hashable
dict = {
    hello: 1,
    goodbye: 2,
}

# An equal bytes literal updates the existing entry instead of adding a new one.
dict[b"goodbye"] = 3
assert.eq(len(dict), 2)
assert.eq(dict[goodbye], 3)

# hash(bytes) is 32-bit FNV-1a.
assert.eq(hash(b""), 0x811c9dc5)
assert.eq(hash(b"a"), 0xe40c292c)
assert.eq(hash(b"ab"), 0x4d2505ca)
assert.eq(hash(b"abc"), 0x1a47e90b)

# indexing -- the result is compared against a 1-byte bytes value.
assert.eq(goodbye[0], b"g")
assert.eq(goodbye[-1], b"e")
assert.fails(lambda: goodbye[100], "out of range")

# slicing
assert.eq(goodbye[:4], b"good")
assert.eq(goodbye[4:], b"bye")
assert.eq(goodbye[::2], b"gobe")
assert.eq(goodbye[3:4], b"d")  # special case: len=1
assert.eq(goodbye[4:4], b"")  # special case: len=0

# bytes in bytes
assert.eq(b"bc" in b"abcd", True)
assert.eq(b"bc" in b"dcab", False)
assert.fails(
    lambda: "bc" in b"dcab",
    "requires bytes or int as left operand, not string",
)

# int in bytes
assert.eq(97 in b"abc", True)  # 97='a'
assert.eq(100 in b"abc", False)  # 100='d'
assert.fails(lambda: 256 in b"abc", "int in bytes: 256 out of range")
assert.fails(lambda: -1 in b"abc", "int in bytes: -1 out of range")

# ord TODO(adonovan): specify
assert.eq(ord(b"a"), 97)
assert.fails(lambda: ord(b"ab"), "ord: bytes has length 2, want 1")
assert.fails(lambda: ord(b""), "ord: bytes has length 0, want 1")

# repeat (bytes * int), in either operand order
assert.eq(goodbye * 3, b"goodbyegoodbyegoodbye")
assert.eq(3 * goodbye, b"goodbyegoodbyegoodbye")

# elems() returns an iterable value over the numeric values of the bytes
# (the assertions below compare list(x.elems()) against lists of ints).
assert.eq(type(hello.elems()), "bytes.elems")
assert.eq(str(hello.elems()), "b\"hello, 世界\".elems()")
assert.eq(
    list(hello.elems()),
    [104, 101, 108, 108, 111, 44, 32, 228, 184, 150, 231, 149, 140],
)
assert.eq(
    bytes([104, 101, 108, 108, 111, 44, 32, 228, 184, 150, 231, 149, 140]),
    hello,
)
assert.eq(list(goodbye.elems()), [103, 111, 111, 100, 98, 121, 101])
assert.eq(list(empty.elems()), [])
assert.eq(bytes(hello.elems()), hello)  # bytes(iterable) is dual to bytes.elems()

# x[i] = ...
# Element assignment is a statement, so it cannot be written as a lambda;
# wrap it in a function that assert.fails can call.
def f():
    b"abc"[1] = b"B"

assert.fails(f, "bytes.*does not support.*assignment")

# TODO(adonovan): the specification is not finalized in many areas:
# - chr, ord functions
# - encoding/decoding bytes to string.
# - methods: find, index, split, etc.
#
# Summary of string operations (put this in spec).
#
# string to number:
# - bytes[i] returns numeric value of ith byte.
#   NOTE(review): the indexing assertions in this file expect
#   goodbye[0] == b"g", i.e. a 1-byte bytes value rather than an int --
#   confirm which behavior is intended before copying this into the spec.
# - ord(string) returns numeric value of sole code point in string.
# - ord(string[i]) is not a useful operation: fails on non-ASCII; see below.
#   Q. Perhaps ord should return the first (not sole) code point? Then it becomes a UTF-8 decoder.
#      Perhaps ord(string, index=int) should apply the index and relax the len=1 check.
# - string.codepoint() iterates over 1-codepoint substrings.
# - string.codepoint_ords() iterates over numeric values of code points in string.
# - string.elems() iterates over 1-element (UTF-k code) substrings.
# - string.elem_ords() iterates over numeric UTF-k code values.
# - string.elem_ords()[i] returns numeric value of ith element (UTF-k code).
# - string.elems()[i] returns substring of a single element (UTF-k code).
# - int(string) parses string as decimal (or other) numeric literal.
#
# number to string:
# - chr(int) returns string, UTF-k encoding of Unicode code point (like Python).
#   Redundant with '%c' % int (which Python2 calls 'unichr').
# - bytes(chr(int)) returns byte string containing UTF-8 encoding of one code point.
# - bytes([int]) returns 1-byte string (with regrettable list allocation).
# - str(int) - format number as decimal.