# Copyright 2008 The RE2 Authors.  All Rights Reserved.
# Use of this source code is governed by a BSD-style
# license that can be found in the LICENSE file.

"""Parser for Unicode data files (as distributed by unicode.org)."""

import os
import re

try:
    import urllib2
    _urlopen = urllib2.urlopen
except ImportError:  # Python 3: urllib2 was folded into urllib.request.
    import urllib.request
    _urlopen = urllib.request.urlopen

try:
    _STRING_TYPES = (basestring,)  # Python 2: covers both str and unicode.
except NameError:
    _STRING_TYPES = (str,)

# Directory or URL where Unicode tables reside.
_UNICODE_DIR = "https://www.unicode.org/Public/11.0.0/ucd"

# Largest valid Unicode code value.
_RUNE_MAX = 0x10FFFF

# A code point is written as four to six hexadecimal digits and nothing else.
_CODE_POINT_RE = re.compile(r"[0-9A-Fa-f]{4,6}\Z")

# Description of one end of a range that is split across two table lines.
_CONTINUE_RE = re.compile("<(.*), (First|Last)>")


class Error(Exception):
    """Unicode error base class."""


class InputError(Error):
    """Unicode input error class.  Raised on invalid input."""


def _UInt(s):
    """Converts string to Unicode code point ('263A' => 0x263a).

    Only strings made of four to six hexadecimal digits are accepted;
    prefixes, signs, underscores and surrounding white space that int()
    would tolerate are rejected.

    Args:
      s: string to convert

    Returns:
      Unicode code point

    Raises:
      InputError: the string is not a valid Unicode value.
    """
    if _CODE_POINT_RE.match(s) is None:
        raise InputError("invalid Unicode value %s" % (s,))
    v = int(s, 16)
    if v > _RUNE_MAX:
        raise InputError("invalid Unicode value %s" % (s,))
    return v


def _URange(s):
    """Converts string to Unicode range.

      '0001..0003' => [1, 2, 3].
      '0001' => [1].

    Args:
      s: string to convert

    Returns:
      list of the code points in the range

    Raises:
      InputError: the string is not a valid Unicode range.  In the
        'lo..hi' form, lo must be strictly less than hi.
    """
    parts = s.split("..")
    if len(parts) == 1:
        return [_UInt(parts[0])]
    if len(parts) == 2:
        lo = _UInt(parts[0])
        hi = _UInt(parts[1])
        if lo < hi:
            # list() keeps the return type a list on Python 3 as well.
            return list(range(lo, hi + 1))
    raise InputError("invalid Unicode range %s" % (s,))


def _UStr(v):
    """Converts Unicode code point to hex string.

      0x263a => '0x263A'.

    Args:
      v: code point to convert

    Returns:
      Unicode string

    Raises:
      InputError: the argument is not a valid Unicode value.
    """
    if v < 0 or v > _RUNE_MAX:
        raise InputError("invalid Unicode value %s" % (v,))
    return "0x%04X" % (v,)


def _ParseContinue(s):
    """Parses a Unicode continuation field.

    These are of the form '<Name, First>' or '<Name, Last>'.
    Instead of giving an explicit range in a single table entry,
    some Unicode tables use two entries, one for the first
    code value in the range and one for the last.
    The first entry's description is '<Name, First>' instead of 'Name'
    and the second is '<Name, Last>'.

      '<Name, First>' => ('Name', 'First')
      '<Name, Last>' => ('Name', 'Last')
      'Anything else' => ('Anything else', None)

    Args:
      s: continuation field string

    Returns:
      pair: name and ('First', 'Last', or None)
    """
    match = _CONTINUE_RE.match(s)
    if match is not None:
        return match.groups()
    return (s, None)


def _ProcessLines(fil, filename, nfields, doline):
    """Parses every line of an open table and feeds the entries to doline.

    Args:
      fil: iterable yielding the lines of the table (text or bytes).
      filename: value used to label error messages.
      nfields: the number of expected fields per line.
      doline: the function to call for each table entry.

    Raises:
      InputError: a line is malformed, or the table ends after a
        'First' line without the matching 'Last' line.
    """
    first = None        # first code in multiline range
    expect_last = None  # tag expected for "Last" line in multiline range
    for lineno, line in enumerate(fil, 1):
        try:
            # URL responses and binary files yield bytes on Python 3.
            # On Python 2 bytes is str, so the line is left untouched.
            if isinstance(line, bytes) and not isinstance(line, str):
                line = line.decode("utf-8")

            # Chop # comments and white space; ignore empty lines.
            line = line.partition("#")[0].strip()
            if not line:
                continue

            # Split fields on ";", chop more white space.
            # Must have the expected number of fields.
            fields = [s.strip() for s in line.split(";")]
            if len(fields) != nfields:
                raise InputError("wrong number of fields %d %d - %s" %
                                 (len(fields), nfields, line))

            # The Unicode text files have two different ways
            # to list a Unicode range.  Either the first field is
            # itself a range (0000..FFFF), or the range is split
            # across two lines, with the second field noting
            # the continuation.
            codes = _URange(fields[0])
            (name, cont) = _ParseContinue(fields[1])

            if expect_last is not None:
                # If the last line gave the First code in a range,
                # this one had better give the Last one.
                if (len(codes) != 1 or codes[0] <= first or
                        cont != "Last" or name != expect_last):
                    raise InputError("expected Last line for %s" %
                                     (expect_last,))
                codes = list(range(first, codes[0] + 1))
                first = None
                expect_last = None
                fields[0] = "%04X..%04X" % (codes[0], codes[-1])
                fields[1] = name
            elif cont == "First":
                # Otherwise, if this is the First code in a range,
                # remember it and go to the next line.
                if len(codes) != 1:
                    raise InputError("bad First line: range given")
                expect_last = name
                first = codes[0]
                continue

            doline(codes, fields)

        except Exception as e:
            # Report where the failure happened, then let it propagate.
            print("%s:%d: %s" % (filename, lineno, e))
            raise

    if expect_last is not None:
        raise InputError("expected Last line for %s; got EOF" %
                         (expect_last,))


def ReadUnicodeTable(filename, nfields, doline):
    """Generic Unicode table text file reader.

    The reader takes care of stripping out comments and also
    parsing the two different ways that the Unicode tables specify
    code ranges (using the .. notation and splitting the range across
    multiple lines).

    Each non-comment line in the table is expected to have the given
    number of fields.  The first field is known to be the Unicode value
    and the second field its description.

    The reader calls doline(codes, fields) for each entry in the table.
    If parsing a line fails or doline raises an exception, the reader
    prints that exception, prefixed with the file name and line number,
    and re-raises it; the rest of the file is not processed.

    A file or URL opened here is closed before returning.  A file-like
    object supplied by the caller is left open.

    Arguments:
      filename: the Unicode data file to read (a local path or an
        http:// or https:// URL), or a file-like object.
      nfields: the number of expected fields per line in that file.
      doline: the function to call for each table entry.

    Raises:
      InputError: nfields is invalid (must be >= 2), or the table is
        malformed.
    """
    if nfields < 2:
        raise InputError("invalid number of fields %d" % (nfields,))

    opened_here = isinstance(filename, _STRING_TYPES)
    if opened_here:
        if filename.startswith(("http://", "https://")):
            fil = _urlopen(filename)
        else:
            # Binary mode: lines are decoded in _ProcessLines, so the
            # result does not depend on the locale's default encoding.
            fil = open(filename, "rb")
    else:
        fil = filename

    try:
        _ProcessLines(fil, filename, nfields, doline)
    finally:
        if opened_here:
            fil.close()


def CaseGroups(unicode_dir=_UNICODE_DIR):
    """Returns the Unicode code groups equivalent under case folding.

    Each group is a sorted list of code points,
    and the list of groups is sorted by first code point
    in the group.  Only the "C" and "S" entries of CaseFolding.txt
    are used.

    Args:
      unicode_dir: Unicode data directory

    Returns:
      pair (togroup, groups): togroup is a dict mapping each folded
      code point to its group, and groups is the sorted list of groups.
    """
    # Dict mapping lowercase code point to fold-equivalent group.
    togroup = {}

    def DoLine(codes, fields):
        """Process single CaseFolding.txt line, updating togroup."""
        (_, foldtype, lower, _) = fields
        if foldtype not in ("C", "S"):
            return
        lower = _UInt(lower)
        togroup.setdefault(lower, [lower]).extend(codes)

    ReadUnicodeTable(unicode_dir+"/CaseFolding.txt", 4, DoLine)

    for group in togroup.values():
        group.sort()
    # sorted() rather than .sort(): dict.values() is a view on Python 3.
    groups = sorted(togroup.values())
    return togroup, groups


def Scripts(unicode_dir=_UNICODE_DIR):
    """Returns dict mapping script names to code lists.

    Args:
      unicode_dir: Unicode data directory

    Returns:
      dict mapping script names to code lists
    """
    scripts = {}

    def DoLine(codes, fields):
        """Process single Scripts.txt line, updating scripts."""
        (_, name) = fields
        scripts.setdefault(name, []).extend(codes)

    ReadUnicodeTable(unicode_dir+"/Scripts.txt", 2, DoLine)
    return scripts


def Categories(unicode_dir=_UNICODE_DIR):
    """Returns dict mapping category names to code lists.

    Args:
      unicode_dir: Unicode data directory

    Returns:
      dict mapping category names to code lists
    """
    categories = {}

    def DoLine(codes, fields):
        """Process single UnicodeData.txt line, updating categories."""
        category = fields[2]
        categories.setdefault(category, []).extend(codes)
        # Add codes from Lu into L, etc.
        if len(category) > 1:
            short = category[0]
            categories.setdefault(short, []).extend(codes)

    ReadUnicodeTable(unicode_dir+"/UnicodeData.txt", 15, DoLine)
    return categories