# Tests of 'bytes' (immutable byte strings).
#
# Each section below exercises one operation on bytes values using the
# assert module loaded from assert.star.

load("assert.star", "assert")

# bytes(string) -- UTF-k to UTF-8 transcoding with U+FFFD replacement
hello = bytes("hello, 世界")
goodbye = bytes("goodbye")
empty = bytes("")
nonprinting = bytes("\t\n\x7F\u200D")  # TAB, NEWLINE, DEL, ZERO_WIDTH_JOINER
assert.eq(bytes("hello, 世界"[:-1]), b"hello, 世��")

# bytes(iterable of int) -- construct from numeric byte values
assert.eq(bytes([65, 66, 67]), b"ABC")
assert.eq(bytes((65, 66, 67)), b"ABC")
assert.eq(bytes([0xf0, 0x9f, 0x98, 0xbf]), b"😿")  # UTF-8 encoding of U+1F63F
assert.fails(
    lambda: bytes([300]),
    "at index 0, 300 out of range .want value in unsigned 8-bit range",
)
assert.fails(
    lambda: bytes([b"a"]),
    "at index 0, got bytes, want int",
)
assert.fails(lambda: bytes(1), "want string, bytes, or iterable of ints")

# literals
assert.eq(b"hello, 世界", hello)
assert.eq(b"goodbye", goodbye)
assert.eq(b"", empty)
assert.eq(b"\t\n\x7F\u200D", nonprinting)
assert.ne("abc", b"abc")
assert.eq(b"\012\xff\u0400\U0001F63F", b"\n\xffЀ😿")  # see scanner tests for more
assert.eq(rb"\r\n\t", b"\\r\\n\\t")  # raw

# type
assert.eq(type(hello), "bytes")

# len
assert.eq(len(hello), 13)
assert.eq(len(goodbye), 7)
assert.eq(len(empty), 0)
assert.eq(len(b"A"), 1)
assert.eq(len(b"Ѐ"), 2)
assert.eq(len(b"世"), 3)
assert.eq(len(b"😿"), 4)

# truth
assert.true(hello)
assert.true(goodbye)
assert.true(not empty)

# str(bytes) does UTF-8 to UTF-k transcoding.
# TODO(adonovan): specify.
assert.eq(str(hello), "hello, 世界")
assert.eq(str(hello[:-1]), "hello, 世��")  # incomplete UTF-8 encoding => U+FFFD
assert.eq(str(goodbye), "goodbye")
assert.eq(str(empty), "")
assert.eq(str(nonprinting), "\t\n\x7f\u200d")
assert.eq(str(b"\xED\xB0\x80"), "���")  # UTF-8 encoding of unpaired surrogate => U+FFFD x 3

# repr
assert.eq(repr(hello), r'b"hello, 世界"')
assert.eq(repr(hello[:-1]), r'b"hello, 世\xe7\x95"')  # (incomplete UTF-8 encoding)
assert.eq(repr(goodbye), 'b"goodbye"')
assert.eq(repr(empty), 'b""')
assert.eq(repr(nonprinting), 'b"\\t\\n\\x7f\\u200d"')

# equality
assert.eq(hello, hello)
assert.ne(hello, goodbye)
assert.eq(b"goodbye", goodbye)

# ordered comparison
assert.lt(b"abc", b"abd")
assert.lt(b"abc", b"abcd")
assert.lt(b"\x7f", b"\x80")  # bytes compare as uint8, not int8

# bytes are dict-hashable
# (The table is not named 'dict', to avoid shadowing the builtin.)
table = {hello: 1, goodbye: 2}
table[b"goodbye"] = 3  # equal bytes values address the same entry
assert.eq(len(table), 2)
assert.eq(table[goodbye], 3)

# hash(bytes) is 32-bit FNV-1a.
assert.eq(hash(b""), 0x811c9dc5)
assert.eq(hash(b"a"), 0xe40c292c)
assert.eq(hash(b"ab"), 0x4d2505ca)
assert.eq(hash(b"abc"), 0x1a47e90b)

# indexing
assert.eq(goodbye[0], b"g")
assert.eq(goodbye[-1], b"e")
assert.fails(lambda: goodbye[100], "out of range")

# slicing
assert.eq(goodbye[:4], b"good")
assert.eq(goodbye[4:], b"bye")
assert.eq(goodbye[::2], b"gobe")
assert.eq(goodbye[3:4], b"d")  # special case: len=1
assert.eq(goodbye[4:4], b"")  # special case: len=0

# bytes in bytes
assert.eq(b"bc" in b"abcd", True)
assert.eq(b"bc" in b"dcab", False)
assert.fails(lambda: "bc" in b"dcab", "requires bytes or int as left operand, not string")

# int in bytes
assert.eq(97 in b"abc", True)  # 97='a'
assert.eq(100 in b"abc", False)  # 100='d'
assert.fails(lambda: 256 in b"abc", "int in bytes: 256 out of range")
assert.fails(lambda: -1 in b"abc", "int in bytes: -1 out of range")

# ord TODO(adonovan): specify
assert.eq(ord(b"a"), 97)
assert.fails(lambda: ord(b"ab"), "ord: bytes has length 2, want 1")
assert.fails(lambda: ord(b""), "ord: bytes has length 0, want 1")

# repeat (bytes * int)
assert.eq(goodbye * 3, b"goodbyegoodbyegoodbye")
assert.eq(3 * goodbye, b"goodbyegoodbyegoodbye")

# elems() returns an iterable value over the numeric (int) values of the bytes.
assert.eq(type(hello.elems()), "bytes.elems")
assert.eq(str(hello.elems()), "b\"hello, 世界\".elems()")
assert.eq(list(hello.elems()), [104, 101, 108, 108, 111, 44, 32, 228, 184, 150, 231, 149, 140])
assert.eq(bytes([104, 101, 108, 108, 111, 44, 32, 228, 184, 150, 231, 149, 140]), hello)
assert.eq(list(goodbye.elems()), [103, 111, 111, 100, 98, 121, 101])
assert.eq(list(empty.elems()), [])
assert.eq(bytes(hello.elems()), hello)  # bytes(iterable) is dual to bytes.elems()

# x[i] = ...
def f():
    # Element assignment on a bytes value is expected to fail (bytes are immutable).
    b"abc"[1] = b"B"

assert.fails(f, "bytes.*does not support.*assignment")

# TODO(adonovan): the specification is not finalized in many areas:
# - chr, ord functions
# - encoding/decoding bytes to string.
# - methods: find, index, split, etc.
#
# Summary of string operations (put this in spec).
#
# string to number:
# - bytes[i] returns numeric value of ith byte.
# - ord(string) returns numeric value of sole code point in string.
# - ord(string[i]) is not a useful operation: fails on non-ASCII; see below.
#   Q. Perhaps ord should return the first (not sole) code point? Then it becomes a UTF-8 decoder.
#      Perhaps ord(string, index=int) should apply the index and relax the len=1 check.
# - string.codepoint() iterates over 1-codepoint substrings.
# - string.codepoint_ords() iterates over numeric values of code points in string.
# - string.elems() iterates over 1-element (UTF-k code) substrings.
# - string.elem_ords() iterates over numeric UTF-k code values.
# - string.elem_ords()[i] returns numeric value of ith element (UTF-k code).
# - string.elems()[i] returns substring of a single element (UTF-k code).
# - int(string) parses string as decimal (or other) numeric literal.
#
# number to string:
# - chr(int) returns string, UTF-k encoding of Unicode code point (like Python).
#   Redundant with '%c' % int (which Python2 calls 'unichr'.)
# - bytes(chr(int)) returns byte string containing UTF-8 encoding of one code point.
# - bytes([int]) returns 1-byte string (with regrettable list allocation).
# - str(int) - format number as decimal.