#!/usr/bin/env python3
"""
Tools to parse data files from the Unicode Character Database.

Running this module as a script regenerates the 'Blocks', 'Scripts' and
'ScriptExtensions' modules of 'fontTools.unicodedata' from the corresponding
UCD text files (downloaded from unicode.org, or read from a local folder
given with '--ucd-path').
"""


try:
    from urllib.request import urlopen
except ImportError:
    from urllib2 import urlopen
from contextlib import closing, contextmanager
import re
from codecs import iterdecode
import logging
import os
from io import open
from os.path import abspath, dirname, join as pjoin, pardir, sep


try:  # pragma: no cover
    unicode
except NameError:
    unicode = str


UNIDATA_URL = "https://unicode.org/Public/UNIDATA/"
UNIDATA_LICENSE_URL = "http://unicode.org/copyright.html#License"

# by default save output files to ../Lib/fontTools/unicodedata/
UNIDATA_PATH = (
    pjoin(abspath(dirname(__file__)), pardir, "Lib", "fontTools", "unicodedata") + sep
)

SRC_ENCODING = "# -*- coding: utf-8 -*-\n"

NOTICE = "# NOTE: This file was auto-generated with MetaTools/buildUCD.py.\n"

MAX_UNICODE = 0x10FFFF

log = logging.getLogger()

# Matches a data line of the form "XXXX ; value" or "XXXX..YYYY ; value".
_RANGE_LINE_RE = re.compile(
    r"^"
    r"([0-9A-F]{4,6})"  # first character code
    r"(?:\.\.([0-9A-F]{4,6}))?"  # optional second character code
    r"\s*;\s*"
    r"([^#]+)"  # everything up to the potential comment
)


@contextmanager
def open_unidata_file(filename):
    """Open a text file from https://unicode.org/Public/UNIDATA/

    Yields an iterator over the UTF-8 decoded lines of the remote file; the
    HTTP response is closed when the 'with' block exits.
    """
    url = UNIDATA_URL + filename
    with closing(urlopen(url)) as response:
        yield iterdecode(response, encoding="utf-8")


def _open_ucd_file(filename, local_ucd=None):
    """Return a context manager yielding the lines of UCD file 'filename'.

    The file is read from the 'local_ucd' directory when that is given,
    otherwise it is downloaded from UNIDATA_URL.
    """
    if local_ucd:
        log.info("loading '%s' from local directory '%s'", filename, local_ucd)
        return open(pjoin(local_ucd, filename), "r", encoding="utf-8")
    log.info("downloading '%s' from '%s'", filename, UNIDATA_URL)
    return open_unidata_file(filename)


def parse_unidata_header(infile):
    """Read the top header of data files, until the first line
    that does not start with '#'.

    'infile' must be an iterator over lines. The first non-comment line is
    consumed from it and discarded. If the input ends before such a line is
    found (empty or comment-only file), the header read so far is returned.

    Return the header lines joined into a single string.
    """
    header = []
    # A 'for' loop (rather than bare 'next' calls) ends cleanly on exhausted
    # input instead of leaking StopIteration to the caller.
    for line in infile:
        if not line.startswith("#"):
            break
        header.append(line)
    return "".join(header)


def parse_range_properties(infile, default=None, is_set=False):
    """Parse a Unicode data file containing a column with one character or
    a range of characters, and another column containing a property value
    separated by a semicolon. Comments after '#' are ignored.

    If the ranges defined in the data file are not continuous, assign the
    'default' property to the unassigned codepoints.

    If 'is_set' is true, each explicit property value is split on whitespace
    and stored as a set of strings ('default' is left untouched).

    Return a list of (start, end, property_name) tuples, sorted, contiguous
    and covering the whole 0..MAX_UNICODE range, where adjacent ranges with
    equal values have been merged.
    """
    ranges = []
    for line in infile:
        match = _RANGE_LINE_RE.match(line)
        if not match:
            continue

        first, last, data = match.groups()
        if last is None:
            last = first

        first = int(first, 16)
        last = int(last, 16)
        data = str(data.rstrip())

        ranges.append((first, last, data))

    ranges.sort()

    if isinstance(default, unicode):
        default = str(default)

    # fill the gaps between explicitly defined ranges
    last_end = -1
    full_ranges = []
    for start, end, value in ranges:
        assert last_end < start, "overlapping ranges at 0x%04X" % start
        assert start <= end, "reversed range at 0x%04X" % start
        assert end <= MAX_UNICODE, "range end 0x%04X out of bounds" % end
        if start - last_end > 1:
            full_ranges.append((last_end + 1, start - 1, default))
        if is_set:
            value = set(value.split())
        full_ranges.append((start, end, value))
        last_end = end
    if last_end != MAX_UNICODE:
        full_ranges.append((last_end + 1, MAX_UNICODE, default))

    # reduce total number of ranges by combining continuous ones;
    # 'full_ranges' is never empty and has no gaps, so a run of equal values
    # always ends right before the start of the next differing range
    run_start, _, run_value = full_ranges[0]
    merged_ranges = []
    for start, _, value in full_ranges[1:]:
        if value != run_value:
            merged_ranges.append((run_start, start - 1, run_value))
            run_start, run_value = start, value
    merged_ranges.append((run_start, MAX_UNICODE, run_value))

    # make sure that the ranges cover the full unicode repertoire
    assert merged_ranges[0][0] == 0
    for (cs, ce, cv), (ns, ne, nv) in zip(merged_ranges, merged_ranges[1:]):
        assert ce + 1 == ns
    assert merged_ranges[-1][1] == MAX_UNICODE

    return merged_ranges


def parse_semicolon_separated_data(infile):
    """Parse a Unicode data file where each line contains a list of values
    separated by a semicolon (e.g. "PropertyValueAliases.txt").
    The number of the values on different lines may be different.
    Comments after '#' and blank lines are skipped.

    Returns a list of lists each containing the values as strings.
    """
    data = []
    for line in infile:
        line = line.split("#", 1)[0].strip()  # remove the comment
        if not line:
            continue
        fields = [str(field.strip()) for field in line.split(";")]
        data.append(fields)
    return data


def _set_repr(value):
    """Return a deterministic set-literal repr of 'value' (items sorted),
    or the string "None" if 'value' is None.
    """
    return (
        "None"
        if value is None
        else "{{{}}}".format(", ".join(repr(v) for v in sorted(value)))
    )


def build_ranges(
    filename, local_ucd=None, output_path=None, default=None, is_set=False, aliases=None
):
    """Fetch 'filename' UCD data file from Unicode official website, parse
    the property ranges and values and write them as two Python lists
    to 'fontTools.unicodedata.<filename>.py'.

    'aliases' is an optional mapping of property codes (short names) to long
    name aliases (list of strings, with the first item being the preferred
    alias). When this is provided, the property values are written using the
    short notation, and an additional 'NAMES' dict with the aliases is
    written to the output module.

    To load the data file from a local directory, you can use the
    'local_ucd' argument.

    'output_path' overrides the default destination (UNIDATA_PATH plus the
    module name derived from 'filename'). 'default' and 'is_set' are passed
    on to 'parse_range_properties'.
    """
    modname = os.path.splitext(filename)[0] + ".py"
    if not output_path:
        output_path = UNIDATA_PATH + modname

    with _open_ucd_file(filename, local_ucd) as f:
        header = parse_unidata_header(f)
        ranges = parse_range_properties(f, default=default, is_set=is_set)

    if aliases:
        # map the normalized preferred long name back to its short code
        reversed_aliases = {normalize(v[0]): k for k, v in aliases.items()}
        max_value_length = 6  # 4-letter tags plus two quotes for repr
    else:
        max_value_length = min(56, max(len(repr(v)) for _, _, v in ranges))

    with open(output_path, "w", encoding="utf-8") as f:
        f.write(SRC_ENCODING)
        f.write("#\n")
        f.write(NOTICE)
        f.write("# Source: {}{}\n".format(UNIDATA_URL, filename))
        f.write("# License: {}\n".format(UNIDATA_LICENSE_URL))
        f.write("#\n")
        f.write(header + "\n\n")

        # RANGES holds only the first code point of each range; the end of
        # the range and its value go in the trailing comment
        f.write("RANGES = [\n")
        for first, last, value in ranges:
            f.write(
                "    0x{:0>4X},  # .. 0x{:0>4X} ; {}\n".format(
                    first, last, _set_repr(value) if is_set else value
                )
            )
        f.write("]\n")

        f.write("\n")
        f.write("VALUES = [\n")
        for first, last, value in ranges:
            comment = "# {:0>4X}..{:0>4X}".format(first, last)
            if is_set:
                value_repr = "{},".format(_set_repr(value))
            else:
                if aliases:
                    # append long name to comment and use the short code
                    comment += " ; {}".format(value)
                    value = reversed_aliases[normalize(value)]
                value_repr = "{!r},".format(value)
            f.write(
                "    {} {}\n".format(value_repr.ljust(max_value_length + 1), comment)
            )
        f.write("]\n")

        if aliases:
            f.write("\n")
            f.write("NAMES = {\n")
            for value, names in sorted(aliases.items()):
                # we only write the first preferred alias
                f.write("    {!r}: {!r},\n".format(value, names[0]))
            f.write("}\n")

    log.info("saved new file: '%s'", os.path.normpath(output_path))


_normalize_re = re.compile(r"[-_ ]+")


def normalize(string):
    """Remove case, strip space, '-' and '_' for loose matching."""
    return _normalize_re.sub("", string).lower()


def parse_property_value_aliases(property_tag, local_ucd=None):
    """Fetch the current 'PropertyValueAliases.txt' from the Unicode website,
    parse the values for the specified 'property_tag' and return a dictionary
    of name aliases (list of strings) keyed by short value codes (strings).

    To load the data file from a local directory, you can use the
    'local_ucd' argument.
    """
    filename = "PropertyValueAliases.txt"

    with _open_ucd_file(filename, local_ucd) as f:
        # the header text itself is not needed here; it is only skipped
        parse_unidata_header(f)
        data = parse_semicolon_separated_data(f)

    # each record is: property tag ; short code ; long name [; more aliases]
    aliases = {item[1]: item[2:] for item in data if item[0] == property_tag}

    return aliases


def main():
    """Command-line entry point: regenerate the fontTools.unicodedata
    'Blocks', 'Scripts' and 'ScriptExtensions' modules.
    """
    import argparse

    parser = argparse.ArgumentParser(
        description="Generate fontTools.unicodedata from UCD data files"
    )
    parser.add_argument(
        "--ucd-path", help="Path to local folder containing UCD data files"
    )
    parser.add_argument("-q", "--quiet", action="store_true")
    options = parser.parse_args()

    level = "WARNING" if options.quiet else "INFO"
    logging.basicConfig(level=level, format="%(message)s")

    build_ranges("Blocks.txt", local_ucd=options.ucd_path, default="No_Block")

    script_aliases = parse_property_value_aliases("sc", options.ucd_path)
    build_ranges(
        "Scripts.txt",
        local_ucd=options.ucd_path,
        default="Unknown",
        aliases=script_aliases,
    )
    build_ranges("ScriptExtensions.txt", local_ucd=options.ucd_path, is_set=True)


if __name__ == "__main__":
    import sys

    sys.exit(main())