#!/usr/bin/env python3

# This script is based on
# https://github.com/rust-lang/rust/blob/master/library/core/src/unicode/printable.py
# distributed under https://github.com/rust-lang/rust/blob/master/LICENSE-MIT.

# This script uses the following Unicode tables:
# - UnicodeData.txt


from collections import namedtuple
import csv
import os
import subprocess

NUM_CODEPOINTS = 0x110000

# Code points below CUTOFF go into the "0" tables, those in
# [CUTOFF, 2 * CUTOFF) into the "1" tables, and everything from 2 * CUTOFF
# upwards is emitted as explicit range checks.
CUTOFF = 0x10000

# General categories whose code points are reported by get_escaped().
# Code points without a category are treated as "Cn".
ESCAPED_CLASSES = frozenset("Cc Cf Cs Co Cn Zl Zp Zs".split())

# NOTE(review): the copy of this script that was reviewed had all whitespace
# runs collapsed, so the leading indentation inside the emitted C++ string
# templates below (print_singletons, print_normal, main) was reconstructed.
# Only whitespace of the generated code is affected; confirm against the
# checked-in generated source.


def to_ranges(iter):
    """Group an ascending stream of integers into half-open (start, end) runs.

    Consecutive values are merged into one run. A new run is always started
    at CUTOFF and at 2 * CUTOFF, so no run crosses those boundaries.
    """
    current = None
    for i in iter:
        if current is None or i != current[1] or i in (CUTOFF, 2 * CUTOFF):
            if current is not None:
                yield tuple(current)
            current = [i, i + 1]
        else:
            current[1] += 1
    if current is not None:
        yield tuple(current)


def get_escaped(codepoints):
    """Yield the values of code points whose category is in ESCAPED_CLASSES.

    A missing category (None or empty) counts as "Cn". The ASCII space is
    never yielded even though its category is "Zs".
    """
    space = ord(' ')
    for c in codepoints:
        if (c.class_ or "Cn") in ESCAPED_CLASSES and c.value != space:
            yield c.value


def get_file(f):
    """Open the local copy of URL ``f``, downloading it with curl if absent.

    The local file name is the basename of ``f`` in the current directory.
    The caller owns the returned file object and must close it.

    Raises subprocess.CalledProcessError if the download fails.
    """
    local_name = os.path.basename(f)
    try:
        # newline="" is what the csv module recommends for files it reads.
        return open(local_name, encoding="utf-8", newline="")
    except FileNotFoundError:
        subprocess.run(["curl", "-O", f], check=True)
        return open(local_name, encoding="utf-8", newline="")


Codepoint = namedtuple('Codepoint', 'value class_')


def get_codepoints(f):
    """Yield a Codepoint for every value in range(NUM_CODEPOINTS).

    ``f`` is an iterable of ';'-separated rows: hex code point, name,
    general category (further fields are ignored). Rows must be in ascending
    code point order. Values between a "...First>" row and the following
    "...Last>" row get the category of the First row; all other values not
    listed get class_ None.

    Raises ValueError if a "First>" row is not followed by a "Last>" row.
    """
    r = csv.reader(f, delimiter=";")
    # Start below zero so that code point 0 is still produced (as a gap
    # entry) when the data does not begin with a row for 0000.
    prev_codepoint = -1
    class_first = None
    for row in r:
        codepoint = int(row[0], 16)
        name = row[1]
        class_ = row[2]

        if class_first is not None and not name.endswith("Last>"):
            raise ValueError("Missing Last after First")

        # Fill the gap since the previous row: with the range's category
        # inside a First/Last pair, otherwise with None.
        for c in range(prev_codepoint + 1, codepoint):
            yield Codepoint(c, class_first)

        class_first = class_ if name.endswith("First>") else None

        yield Codepoint(codepoint, class_)
        prev_codepoint = codepoint

    if class_first is not None:
        raise ValueError("Missing Last after First")

    for c in range(prev_codepoint + 1, NUM_CODEPOINTS):
        yield Codepoint(c, None)


def compress_singletons(singletons):
    """Split values into (upper, count) groups and a flat list of low bytes.

    Returns ``(uppers, lowers)``: ``lowers`` holds ``value & 0xff`` for each
    input value in order, and ``uppers`` holds one ``(value >> 8, count)``
    tuple per run of adjacent values sharing the same upper part.
    """
    uppers = []  # (upper, # items in lowers)
    lowers = []

    for i in singletons:
        upper = i >> 8
        if not uppers or uppers[-1][0] != upper:
            uppers.append((upper, 1))
        else:
            uppers[-1] = (upper, uppers[-1][1] + 1)
        lowers.append(i & 0xff)

    return uppers, lowers


def _encode_length(length):
    """Encode a length below 0x8000 as a list of one or two byte values."""
    if length > 0x7f:
        return [0x80 | (length >> 8), length & 0xff]
    return [length & 0x7f]


def compress_normal(normal):
    """Encode (start, count) ranges as alternating gap/run length bytes.

    For each range the entry is the encoded distance from the end of the
    previous range (``truelen``) followed by the encoded ``count``
    (``falselen``). Ranges must be ascending and non-overlapping.
    """
    # lengths 0x00..0x7f are encoded as 00, 01, ..., 7e, 7f
    # lengths 0x80..0x7fff are encoded as 80 80, 80 81, ..., ff fe, ff ff
    compressed = []  # [truelen, (truelenaux), falselen, (falselenaux)]

    prev_start = 0
    for start, count in normal:
        truelen = start - prev_start
        falselen = count
        prev_start = start + count

        assert truelen < 0x8000 and falselen < 0x8000
        compressed.append(_encode_length(truelen) + _encode_length(falselen))

    return compressed


def classify_ranges(ranges):
    """Distribute half-open (start, end) ranges over the output tables.

    Returns ``(singletons0, singletons1, normal0, normal1, extra)``:
    - ranges starting at or above 2 * CUTOFF go to ``extra`` as
      (start, length), whatever their length;
    - ranges of length 1 or 2 are expanded into individual values in
      ``singletons0`` (start below CUTOFF) or ``singletons1`` (CUTOFF bit
      set, stored with that bit cleared);
    - all remaining ranges go to ``normal0`` / ``normal1`` as
      (start, length), again with the CUTOFF bit cleared for ``normal1``.
    """
    singletons0 = []
    singletons1 = []
    normal0 = []
    normal1 = []
    extra = []

    for a, b in ranges:
        length = b - a
        # ">=" rather than ">": a short range starting exactly at
        # 2 * CUTOFF has the CUTOFF bit clear and would otherwise be
        # stored in singletons0 with an upper part that exceeds a byte.
        if a >= 2 * CUTOFF:
            extra.append((a, length))
        elif length in (1, 2):
            if a & CUTOFF:
                singletons1.extend((a + k) & ~CUTOFF for k in range(length))
            else:
                singletons0.extend(a + k for k in range(length))
        elif a & CUTOFF:
            normal1.append((a & ~CUTOFF, length))
        else:
            normal0.append((a, length))

    return singletons0, singletons1, normal0, normal1, extra


def print_singletons(uppers, lowers, uppersname, lowersname):
    """Print the C++ arrays for one compress_singletons() result."""
    print("  static constexpr singleton {}[] = {{".format(uppersname))
    for u, c in uppers:
        print("    {{{:#04x}, {}}},".format(u, c))
    print("  };")
    print("  static constexpr unsigned char {}[] = {{".format(lowersname))
    # Eight values per output line.
    for i in range(0, len(lowers), 8):
        chunk = lowers[i:i+8]
        print("    {}".format(" ".join("{:#04x},".format(b) for b in chunk)))
    print("  };")


def print_normal(normal, normalname):
    """Print the C++ array for one compress_normal() result, one entry per line."""
    print("  static constexpr unsigned char {}[] = {{".format(normalname))
    for v in normal:
        print("    {}".format(" ".join("{:#04x},".format(i) for i in v)))
    print("  };")


def main():
    """Read UnicodeData.txt and print the C++ ``is_printable`` definition."""
    # The generators are lazy, so consume them while the file is still open.
    with get_file("https://www.unicode.org/Public/UNIDATA/UnicodeData.txt") as file:
        ranges = list(to_ranges(get_escaped(get_codepoints(file))))

    singletons0, singletons1, normal0, normal1, extra = classify_ranges(ranges)

    singletons0u, singletons0l = compress_singletons(singletons0)
    singletons1u, singletons1l = compress_singletons(singletons1)
    normal0 = compress_normal(normal0)
    normal1 = compress_normal(normal1)

    print("""\
FMT_FUNC auto is_printable(uint32_t cp) -> bool {\
""")
    print_singletons(singletons0u, singletons0l, 'singletons0', 'singletons0_lower')
    print_singletons(singletons1u, singletons1l, 'singletons1', 'singletons1_lower')
    print_normal(normal0, 'normal0')
    print_normal(normal1, 'normal1')
    print("""\
  auto lower = static_cast<uint16_t>(cp);
  if (cp < 0x10000) {
    return is_printable(lower, singletons0,
                        sizeof(singletons0) / sizeof(*singletons0),
                        singletons0_lower, normal0, sizeof(normal0));
  }
  if (cp < 0x20000) {
    return is_printable(lower, singletons1,
                        sizeof(singletons1) / sizeof(*singletons1),
                        singletons1_lower, normal1, sizeof(normal1));
  }\
""")
    for a, b in extra:
        print("  if (0x{:x} <= cp && cp < 0x{:x}) return false;".format(a, a + b))
    print("""\
  return cp < 0x{:x};
}}\
""".format(NUM_CODEPOINTS))


if __name__ == '__main__':
    main()