xref: /aosp_15_r20/external/fmtlib/support/printable.py (revision 5c90c05cd622c0a81b57953a4d343e0e489f2e08)
1#!/usr/bin/env python3
2
3# This script is based on
4# https://github.com/rust-lang/rust/blob/master/library/core/src/unicode/printable.py
5# distributed under https://github.com/rust-lang/rust/blob/master/LICENSE-MIT.
6
7# This script uses the following Unicode tables:
8# - UnicodeData.txt
9
10
11from collections import namedtuple
12import csv
13import os
14import subprocess
15
# Exclusive upper bound of the code point space handled by this script
# (code points run from 0 to 0x10FFFF inclusive).
NUM_CODEPOINTS = 0x110000
17
def to_ranges(iter):
    """Collapse a stream of integers into half-open ``(start, end)`` ranges.

    A value extends the current range only when it equals the range's
    current end.  The values 0x10000 and 0x20000 always open a new range,
    so no yielded range ever crosses one of those boundaries.

    Yields:
        ``(start, end)`` tuples where ``end`` is one past the last member.
    """
    start = end = None
    for value in iter:
        extends_run = (
            start is not None
            and value == end
            and value not in (0x10000, 0x20000)
        )
        if extends_run:
            end += 1
            continue
        # Flush the finished range (if any) before opening a new one.
        if start is not None:
            yield (start, end)
        start, end = value, value + 1
    if start is not None:
        yield (start, end)
29
def get_escaped(codepoints):
    """Yield the integer values of the code points that must be escaped.

    A code point is escaped when its class is one of the control, format,
    surrogate, private-use, unassigned or separator categories listed
    below.  A missing or empty class counts as "Cn".  The ASCII space is
    never yielded even though its class is in the list.

    Args:
        codepoints: iterable of objects with ``value`` and ``class_``
            attributes.
    """
    escaped_classes = ("Cc", "Cf", "Cs", "Co", "Cn", "Zl", "Zp", "Zs")
    space = ord(' ')
    for cp in codepoints:
        category = cp.class_ or "Cn"
        if category in escaped_classes and cp.value != space:
            yield cp.value
34
def get_file(f):
    """Open the local copy of the file at URL *f*, downloading it if needed.

    The local copy is looked up in the current working directory under the
    last path component of *f*.  When it does not exist, ``curl`` is invoked
    to fetch it there first.

    Args:
        f: URL of the data file.

    Returns:
        An open text file object; the caller is responsible for closing it.

    Raises:
        subprocess.CalledProcessError: if ``curl`` exits with a non-zero
            status (including HTTP errors, thanks to ``--fail``).
    """
    local_name = os.path.basename(f)
    try:
        # newline="" is what the csv module asks for on files given to
        # csv.reader; an explicit encoding avoids depending on the locale.
        return open(local_name, encoding="utf-8", newline="")
    except FileNotFoundError:
        # Without --fail, curl saves the server's error page under the
        # target name and exits 0, so check=True would never trigger and
        # the bogus file would be reused on every later run.
        subprocess.run(["curl", "--fail", "-O", f], check=True)
        return open(local_name, encoding="utf-8", newline="")
41
# One record per code point: its integer value and the class string taken
# from the third field of its UnicodeData.txt row (None when the code point
# has no row and is not inside a First/Last range).
Codepoint = namedtuple('Codepoint', 'value class_')


def get_codepoints(f):
    """Yield a ``Codepoint`` for every code point from 0 to NUM_CODEPOINTS-1.

    *f* is an iterable of ``;``-separated lines whose first three fields
    are the hexadecimal code point, its name and its class.  Code points
    that have no row of their own are filled in:

    * between a ``...First>`` row and the following ``...Last>`` row they
      receive the class of the ``First`` row;
    * everywhere else (including before the first row and after the last
      one) they receive ``None``.

    Raises:
        ValueError: if a ``...First>`` row is not immediately followed by a
            ``...Last>`` row.
    """
    reader = csv.reader(f, delimiter=";")
    # Start one below zero so that code point 0 is also gap-filled when the
    # data does not begin at U+0000 (or is empty).
    prev_codepoint = -1
    class_first = None
    for row in reader:
        if not row:
            # csv.reader returns [] for a blank line; nothing to parse.
            continue

        codepoint = int(row[0], 16)
        name = row[1]
        class_ = row[2]

        if class_first is not None and not name.endswith("Last>"):
            raise ValueError("Missing Last after First")

        # Fill the gap since the previous row.  class_first is None unless
        # the previous row opened a First/Last range.
        for c in range(prev_codepoint + 1, codepoint):
            yield Codepoint(c, class_first)

        class_first = class_ if name.endswith("First>") else None

        yield Codepoint(codepoint, class_)
        prev_codepoint = codepoint

    if class_first is not None:
        raise ValueError("Missing Last after First")

    # Everything after the last row is unassigned.
    for c in range(prev_codepoint + 1, NUM_CODEPOINTS):
        yield Codepoint(c, None)
72
def compress_singletons(singletons):
    """Split each value into a high part and a low byte, run-length coding
    the high parts.

    Consecutive values sharing the same ``value >> 8`` are merged into one
    ``(upper, count)`` entry; every value contributes its ``value & 0xff``
    to the flat list of low bytes.

    Returns:
        ``(uppers, lowers)`` where ``uppers`` is a list of
        ``(upper, number_of_lowers)`` tuples and ``lowers`` is the list of
        low bytes in input order.
    """
    runs = []  # mutable [upper, count] pairs, frozen into tuples on return
    low_bytes = []

    for value in singletons:
        high, low = divmod(value, 0x100)
        if runs and runs[-1][0] == high:
            runs[-1][1] += 1
        else:
            runs.append([high, 1])
        low_bytes.append(low)

    return [tuple(run) for run in runs], low_bytes
88
def compress_normal(normal):
    """Encode ``(start, count)`` ranges as alternating gap/run lengths.

    For each range the result holds one entry: the distance from the end of
    the previous range to ``start``, followed by ``count``.  Each length is
    stored as
      - one byte for 0x00..0x7f, or
      - two bytes for 0x80..0x7fff: ``0x80 | high byte`` then the low byte.

    Returns:
        A list of entries, each a list of 2 to 4 integers.
    """

    def encode(length):
        # Two-byte form sets the top bit of the first byte as a marker.
        if length > 0x7f:
            return [0x80 | (length >> 8), length & 0xff]
        return [length & 0x7f]

    compressed = []
    cursor = 0  # one past the end of the previous range
    for start, count in normal:
        gap = start - cursor
        cursor = start + count

        assert gap < 0x8000 and count < 0x8000
        compressed.append(encode(gap) + encode(count))

    return compressed
116
def print_singletons(uppers, lowers, uppersname, lowersname):
    """Print the two C++ arrays that hold a compressed singleton table.

    Args:
        uppers: iterable of ``(upper, count)`` pairs, emitted as
            ``{0x.., count}`` initializers of a ``singleton`` array.
        lowers: list of low bytes, emitted eight per line as an
            ``unsigned char`` array.
        uppersname: C++ identifier for the ``uppers`` array.
        lowersname: C++ identifier for the ``lowers`` array.
    """
    print(f"  static constexpr singleton {uppersname}[] = {{")
    for upper, count in uppers:
        print(f"    {{{upper:#04x}, {count}}},")
    print("  };")

    print(f"  static constexpr unsigned char {lowersname}[] = {{")
    offset = 0
    while offset < len(lowers):
        chunk = lowers[offset:offset + 8]
        print("    " + " ".join(f"{byte:#04x}," for byte in chunk))
        offset += 8
    print("  };")
126
def print_normal(normal, normalname):
    """Print a compressed range table as a C++ ``unsigned char`` array.

    Args:
        normal: iterable of entries, each an iterable of integers; every
            entry is printed on its own line as ``0x..,`` items.
        normalname: C++ identifier for the array.
    """
    print(f"  static constexpr unsigned char {normalname}[] = {{")
    for entry in normal:
        cells = [format(byte, "#04x") + "," for byte in entry]
        print("    " + " ".join(cells))
    print("  };")
132
def main():
    """Generate the C++ ``is_printable(uint32_t)`` function on stdout.

    Reads UnicodeData.txt (downloading it when no local copy exists),
    collects the ranges of escaped code points and sorts them into:

    * ``singletons0`` / ``normal0`` for code points below 0x10000,
    * ``singletons1`` / ``normal1`` for 0x10000..0x1ffff (stored without
      the 0x10000 bit),
    * ``extra`` for ranges starting at 0x20000 or above, which are emitted
      as explicit range checks.

    Ranges of one or two code points go to the singleton tables, longer
    ones to the "normal" tables.
    """
    CUTOFF = 0x10000
    singletons0 = []
    singletons1 = []
    normal0 = []
    normal1 = []
    extra = []

    # get_codepoints() is lazy, so the file must stay open until the loop
    # has consumed it; the with-block closes it afterwards.
    with get_file("https://www.unicode.org/Public/UNIDATA/UnicodeData.txt") as data:
        codepoints = get_codepoints(data)

        for a, b in to_ranges(get_escaped(codepoints)):
            length = b - a
            if a >= 2 * CUTOFF:
                # ">=" (not ">"): a short range starting exactly at 0x20000
                # must not reach the singleton tables, whose upper part is
                # printed as a single byte.
                extra.append((a, length))
            elif length <= 2:
                # to_ranges() never lets a range cross 0x10000, so both
                # members of a two-element range share the same table.
                target = singletons1 if a & CUTOFF else singletons0
                target.extend(c & ~CUTOFF for c in range(a, b))
            elif a & CUTOFF:
                normal1.append((a & ~CUTOFF, length))
            else:
                normal0.append((a, length))

    singletons0u, singletons0l = compress_singletons(singletons0)
    singletons1u, singletons1l = compress_singletons(singletons1)
    normal0 = compress_normal(normal0)
    normal1 = compress_normal(normal1)

    print("""\
FMT_FUNC auto is_printable(uint32_t cp) -> bool {\
""")
    print_singletons(singletons0u, singletons0l, 'singletons0', 'singletons0_lower')
    print_singletons(singletons1u, singletons1l, 'singletons1', 'singletons1_lower')
    print_normal(normal0, 'normal0')
    print_normal(normal1, 'normal1')
    print("""\
  auto lower = static_cast<uint16_t>(cp);
  if (cp < 0x10000) {
    return is_printable(lower, singletons0,
                        sizeof(singletons0) / sizeof(*singletons0),
                        singletons0_lower, normal0, sizeof(normal0));
  }
  if (cp < 0x20000) {
    return is_printable(lower, singletons1,
                        sizeof(singletons1) / sizeof(*singletons1),
                        singletons1_lower, normal1, sizeof(normal1));
  }\
""")
    for a, b in extra:
        # extra holds (start, length) pairs.
        print("  if (0x{:x} <= cp && cp < 0x{:x}) return false;".format(a, a + b))
    print("""\
  return cp < 0x{:x};
}}\
""".format(NUM_CODEPOINTS))
199
# Generate the table only when run as a script, not when imported.
if __name__ == "__main__":
    main()
202