xref: /aosp_15_r20/external/fonttools/MetaTools/buildUCD.py (revision e1fe3e4ad2793916b15cccdc4a7da52a7e1dd0e9)
#!/usr/bin/env python3
"""
Tools to parse data files from the Unicode Character Database.
"""
5*e1fe3e4aSElliott Hughes
6*e1fe3e4aSElliott Hughes
7*e1fe3e4aSElliott Hughestry:
8*e1fe3e4aSElliott Hughes    from urllib.request import urlopen
9*e1fe3e4aSElliott Hughesexcept ImportError:
10*e1fe3e4aSElliott Hughes    from urllib2 import urlopen
11*e1fe3e4aSElliott Hughesfrom contextlib import closing, contextmanager
12*e1fe3e4aSElliott Hughesimport re
13*e1fe3e4aSElliott Hughesfrom codecs import iterdecode
14*e1fe3e4aSElliott Hughesimport logging
15*e1fe3e4aSElliott Hughesimport os
16*e1fe3e4aSElliott Hughesfrom io import open
17*e1fe3e4aSElliott Hughesfrom os.path import abspath, dirname, join as pjoin, pardir, sep
18*e1fe3e4aSElliott Hughes
19*e1fe3e4aSElliott Hughes
try:  # pragma: no cover
    unicode
except NameError:
    # No built-in 'unicode' name is available: alias it to 'str' so the
    # isinstance() check in parse_range_properties keeps working.
    unicode = str


# Base URL; a data file name is appended to it to build the download URL.
UNIDATA_URL = "https://unicode.org/Public/UNIDATA/"
# License URL written into the header of every generated module.
UNIDATA_LICENSE_URL = "http://unicode.org/copyright.html#License"

# by default save output files to ../Lib/fontTools/unicodedata/
UNIDATA_PATH = (
    pjoin(abspath(dirname(__file__)), pardir, "Lib", "fontTools", "unicodedata") + sep
)

# Encoding declaration written as the first line of every generated module.
SRC_ENCODING = "# -*- coding: utf-8 -*-\n"

# Notice written into the header of every generated module.
NOTICE = "# NOTE: This file was auto-generated with MetaTools/buildUCD.py.\n"

# Upper bound used when filling and merging code point ranges.
MAX_UNICODE = 0x10FFFF

# getLogger() is called without a name, so this is the root logger.
log = logging.getLogger()
41*e1fe3e4aSElliott Hughes
42*e1fe3e4aSElliott Hughes
@contextmanager
def open_unidata_file(filename):
    """Context manager yielding the decoded lines of a remote UCD file.

    'filename' is appended to UNIDATA_URL to build the address to fetch.
    The value yielded is an iterator that decodes the response as UTF-8;
    the response is closed when the 'with' block is left.
    """
    response = urlopen(UNIDATA_URL + filename)
    with closing(response):
        yield iterdecode(response, encoding="utf-8")
49*e1fe3e4aSElliott Hughes
50*e1fe3e4aSElliott Hughes
def parse_unidata_header(infile):
    """Read the top header of data files, until the first line
    that does not start with '#'.

    'infile' is an iterator of text lines. The first line that does not
    start with '#' is consumed from the iterator and discarded, so the
    caller resumes reading from the line that follows it.

    If the input is empty, or ends while still inside the header, the
    header lines read so far are returned (an empty string when there are
    none) instead of letting StopIteration escape to the caller.

    Returns the header lines concatenated into a single string.
    """
    header = []
    for line in infile:
        if not line.startswith("#"):
            # end of header: this line has been consumed and is dropped
            break
        header.append(line)
    return "".join(header)
61*e1fe3e4aSElliott Hughes
62*e1fe3e4aSElliott Hughes
def parse_range_properties(infile, default=None, is_set=False):
    """Parse a Unicode data file containing a column with one character or
    a range of characters, and another column containing a property value
    separated by a semicolon. Comments after '#' are ignored.

    If the ranges defined in the data file are not continuous, assign the
    'default' property to the unassigned codepoints.

    When 'is_set' is true, each explicitly defined value is split on
    whitespace and stored as a set of strings; the 'default' value is
    stored unchanged.

    Adjacent ranges holding equal values are merged, so the result covers
    0..MAX_UNICODE without gaps or overlaps.

    Return a list of (start, end, property_name) tuples.

    Raises AssertionError if the ranges in the data file overlap, are
    inverted, or extend beyond MAX_UNICODE.
    """
    line_regex = re.compile(
        r"^"
        r"([0-9A-F]{4,6})"  # first character code
        r"(?:\.\.([0-9A-F]{4,6}))?"  # optional second character code
        r"\s*;\s*"
        r"([^#]+)"
    )  # everything up to the potential comment

    ranges = []
    for line in infile:
        match = line_regex.match(line)
        if not match:
            # blank lines, comment-only lines and anything else unparsable
            continue

        first, last, data = match.groups()
        if last is None:
            # single code point: a range of length one
            last = first

        ranges.append((int(first, 16), int(last, 16), str(data.rstrip())))

    ranges.sort()

    if isinstance(default, unicode):
        default = str(default)

    # fill the gaps between explicitly defined ranges
    full_ranges = []
    last_end = -1
    for start, end, value in ranges:
        assert last_end < start, "overlapping ranges at %04X" % start
        assert start <= end, "inverted range %04X..%04X" % (start, end)
        assert end <= MAX_UNICODE, "range end %04X beyond MAX_UNICODE" % end
        if start - last_end > 1:
            full_ranges.append((last_end + 1, start - 1, default))
        if is_set:
            value = set(value.split())
        full_ranges.append((start, end, value))
        last_end = end
    if last_end < MAX_UNICODE:
        full_ranges.append((last_end + 1, MAX_UNICODE, default))

    # reduce total number of ranges by combining continuous ones; this is
    # safe because 'full_ranges' is contiguous by construction
    merged_ranges = []
    for start, end, value in full_ranges:
        if merged_ranges and merged_ranges[-1][2] == value:
            merged_start = merged_ranges[-1][0]
            merged_ranges[-1] = (merged_start, end, value)
        else:
            merged_ranges.append((start, end, value))

    # make sure that the ranges cover the full unicode repertoire
    assert merged_ranges[0][0] == 0
    for (_, current_end, _), (next_start, _, _) in zip(
        merged_ranges, merged_ranges[1:]
    ):
        assert current_end + 1 == next_start
    assert merged_ranges[-1][1] == MAX_UNICODE

    return merged_ranges
134*e1fe3e4aSElliott Hughes
135*e1fe3e4aSElliott Hughes
def parse_semicolon_separated_data(infile):
    """Split every data line of a semicolon-separated Unicode file
    (e.g. "PropertyValueAliases.txt") into its fields.

    Anything from the first '#' onwards is treated as a comment and
    dropped; lines that are empty after that are skipped. Lines may have
    different numbers of fields.

    Returns a list with one list of stripped field strings per data line.
    """
    rows = []
    for raw_line in infile:
        content = raw_line.partition("#")[0].strip()  # drop the comment
        if content:
            rows.append([str(part.strip()) for part in content.split(";")])
    return rows
151*e1fe3e4aSElliott Hughes
152*e1fe3e4aSElliott Hughes
153*e1fe3e4aSElliott Hughesdef _set_repr(value):
154*e1fe3e4aSElliott Hughes    return (
155*e1fe3e4aSElliott Hughes        "None"
156*e1fe3e4aSElliott Hughes        if value is None
157*e1fe3e4aSElliott Hughes        else "{{{}}}".format(", ".join(repr(v) for v in sorted(value)))
158*e1fe3e4aSElliott Hughes    )
159*e1fe3e4aSElliott Hughes
160*e1fe3e4aSElliott Hughes
def build_ranges(
    filename, local_ucd=None, output_path=None, default=None, is_set=False, aliases=None
):
    """Read the 'filename' UCD data file, parse its property ranges and
    write them out as a Python module holding a 'RANGES' list (range start
    code points) and a parallel 'VALUES' list (property values).

    The file is read from the 'local_ucd' directory when that is given,
    otherwise it is downloaded from UNIDATA_URL. The module is written to
    'output_path', or to '<filename without extension>.py' under
    UNIDATA_PATH when no path is given.

    'default' and 'is_set' are passed on to parse_range_properties.

    'aliases' is an optional mapping of short property codes to lists of
    long names, with the preferred name first. When provided (and 'is_set'
    is false) the values are written as short codes with the long name in
    the trailing comment, and a 'NAMES' dict mapping each short code to
    its preferred long name is appended to the module.
    """
    if not output_path:
        output_path = UNIDATA_PATH + os.path.splitext(filename)[0] + ".py"

    if local_ucd:
        log.info("loading '%s' from local directory '%s'", filename, local_ucd)
        source = open(pjoin(local_ucd, filename), "r", encoding="utf-8")
    else:
        log.info("downloading '%s' from '%s'", filename, UNIDATA_URL)
        source = open_unidata_file(filename)

    with source as infile:
        header = parse_unidata_header(infile)
        ranges = parse_range_properties(infile, default=default, is_set=is_set)

    if aliases:
        # normalized preferred long name -> short code
        short_code_for = {
            normalize(names[0]): code for code, names in aliases.items()
        }
        column_width = 6  # 4-letter tags plus two quotes for repr
    else:
        column_width = min(56, max(len(repr(value)) for _, _, value in ranges))

    with open(output_path, "w", encoding="utf-8") as out:
        emit = out.write

        emit(SRC_ENCODING)
        emit("#\n")
        emit(NOTICE)
        emit("# Source: {}{}\n".format(UNIDATA_URL, filename))
        emit("# License: {}\n".format(UNIDATA_LICENSE_URL))
        emit("#\n")
        emit(header + "\n\n")

        emit("RANGES = [\n")
        for start, end, value in ranges:
            shown = _set_repr(value) if is_set else value
            emit("    0x{:0>4X},  # .. 0x{:0>4X} ; {}\n".format(start, end, shown))
        emit("]\n")

        emit("\n")
        emit("VALUES = [\n")
        for start, end, value in ranges:
            comment = "# {:0>4X}..{:0>4X}".format(start, end)
            if is_set:
                literal = "{},".format(_set_repr(value))
            elif aliases:
                # long name goes in the comment, short code in the list
                comment += " ; {}".format(value)
                literal = "{!r},".format(short_code_for[normalize(value)])
            else:
                literal = "{!r},".format(value)
            emit("    {}  {}\n".format(literal.ljust(column_width + 1), comment))
        emit("]\n")

        if aliases:
            emit("\n")
            emit("NAMES = {\n")
            for code, names in sorted(aliases.items()):
                # only the first (preferred) alias is written
                emit("    {!r}: {!r},\n".format(code, names[0]))
            emit("}\n")

    log.info("saved new file: '%s'", os.path.normpath(output_path))
242*e1fe3e4aSElliott Hughes
243*e1fe3e4aSElliott Hughes
_normalize_re = re.compile(r"[-_ ]+")


def normalize(string):
    """Return 'string' lowercased with every space, '-' and '_' removed,
    for loose matching of property names.
    """
    stripped = _normalize_re.sub("", string)
    return stripped.lower()
250*e1fe3e4aSElliott Hughes
251*e1fe3e4aSElliott Hughes
def parse_property_value_aliases(property_tag, local_ucd=None):
    """Read 'PropertyValueAliases.txt' and collect the value aliases of
    one property.

    The file is read from the 'local_ucd' directory when that is given,
    otherwise it is downloaded from UNIDATA_URL.

    Only rows whose first field equals 'property_tag' are used. Returns a
    dictionary keyed by the second field of each such row (the short value
    code), mapping to the list of the remaining fields (the name aliases).
    """
    filename = "PropertyValueAliases.txt"
    if local_ucd:
        log.info("loading '%s' from local directory '%s'", filename, local_ucd)
        source = open(pjoin(local_ucd, filename), "r", encoding="utf-8")
    else:
        log.info("downloading '%s' from '%s'", filename, UNIDATA_URL)
        source = open_unidata_file(filename)

    with source as infile:
        # the header is read only to move past it; its text is not needed
        parse_unidata_header(infile)
        rows = parse_semicolon_separated_data(infile)

    aliases = {}
    for row in rows:
        if row[0] == property_tag:
            aliases[row[1]] = row[2:]
    return aliases
275*e1fe3e4aSElliott Hughes
276*e1fe3e4aSElliott Hughes
def main():
    """Command-line entry point: regenerate the Blocks, Scripts and
    ScriptExtensions modules.

    Accepts '--ucd-path' to read the data files from a local folder and
    '-q/--quiet' to log at WARNING instead of INFO level.
    """
    import argparse

    parser = argparse.ArgumentParser(
        description="Generate fontTools.unicodedata from UCD data files"
    )
    parser.add_argument(
        "--ucd-path", help="Path to local folder containing UCD data files"
    )
    parser.add_argument("-q", "--quiet", action="store_true")
    options = parser.parse_args()

    logging.basicConfig(
        level="WARNING" if options.quiet else "INFO", format="%(message)s"
    )

    ucd_path = options.ucd_path

    build_ranges("Blocks.txt", local_ucd=ucd_path, default="No_Block")

    # Scripts are written as short codes, so the aliases are needed first
    script_aliases = parse_property_value_aliases("sc", ucd_path)
    build_ranges(
        "Scripts.txt",
        local_ucd=ucd_path,
        default="Unknown",
        aliases=script_aliases,
    )

    build_ranges("ScriptExtensions.txt", local_ucd=ucd_path, is_set=True)
302*e1fe3e4aSElliott Hughes
303*e1fe3e4aSElliott Hughes
# Run main() when executed as a script and use its return value as the
# process exit status.
if __name__ == "__main__":
    import sys

    exit_status = main()
    sys.exit(exit_status)
308