1#!/usr/bin/python 2# coding=utf-8 3# 4# Copyright 2008 The RE2 Authors. All Rights Reserved. 5# Use of this source code is governed by a BSD-style 6# license that can be found in the LICENSE file. 7 8# See unicode_casefold.h for description of case folding tables. 9 10"""Generate C++ table for Unicode case folding.""" 11 12import sys 13import unicode 14 15_header = """ 16// GENERATED BY make_unicode_casefold.py; DO NOT EDIT. 17// make_unicode_casefold.py >unicode_casefold.cc 18 19#include "re2/unicode_casefold.h" 20 21namespace re2 { 22 23""" 24 25_trailer = """ 26 27} // namespace re2 28 29""" 30 31def _Delta(a, b): 32 """Compute the delta for b - a. Even/odd and odd/even 33 are handled specially, as described above.""" 34 if a+1 == b: 35 if a%2 == 0: 36 return 'EvenOdd' 37 else: 38 return 'OddEven' 39 if a == b+1: 40 if a%2 == 0: 41 return 'OddEven' 42 else: 43 return 'EvenOdd' 44 return b - a 45 46def _AddDelta(a, delta): 47 """Return a + delta, handling EvenOdd and OddEven specially.""" 48 if type(delta) == int: 49 return a+delta 50 if delta == 'EvenOdd': 51 if a%2 == 0: 52 return a+1 53 else: 54 return a-1 55 if delta == 'OddEven': 56 if a%2 == 1: 57 return a+1 58 else: 59 return a-1 60 print >>sys.stderr, "Bad Delta: ", delta 61 raise "Bad Delta" 62 63def _MakeRanges(pairs): 64 """Turn a list like [(65,97), (66, 98), ..., (90,122)] 65 into [(65, 90, +32)].""" 66 ranges = [] 67 last = -100 68 69 def evenodd(last, a, b, r): 70 if a != last+1 or b != _AddDelta(a, r[2]): 71 return False 72 r[1] = a 73 return True 74 75 def evenoddpair(last, a, b, r): 76 if a != last+2: 77 return False 78 delta = r[2] 79 d = delta 80 if type(delta) is not str: 81 return False 82 if delta.endswith('Skip'): 83 d = delta[:-4] 84 else: 85 delta = d + 'Skip' 86 if b != _AddDelta(a, d): 87 return False 88 r[1] = a 89 r[2] = delta 90 return True 91 92 for a, b in pairs: 93 if ranges and evenodd(last, a, b, ranges[-1]): 94 pass 95 elif ranges and evenoddpair(last, a, b, ranges[-1]): 96 pass 97 else: 98 ranges.append([a, a, _Delta(a, b)]) 99 last = a 100 return ranges 101 102# The maximum size of a case-folding group. 103# Case folding is implemented in parse.cc by a recursive process 104# with a recursion depth equal to the size of the largest 105# case-folding group, so it is important that this bound be small. 106# The current tables have no group bigger than 4. 107# If there are ever groups bigger than 10 or so, it will be 108# time to rework the code in parse.cc. 109MaxCasefoldGroup = 4 110 111def main(): 112 lowergroups, casegroups = unicode.CaseGroups() 113 foldpairs = [] 114 seen = {} 115 for c in casegroups: 116 if len(c) > MaxCasefoldGroup: 117 raise unicode.Error("casefold group too long: %s" % (c,)) 118 for i in range(len(c)): 119 if c[i-1] in seen: 120 raise unicode.Error("bad casegroups %d -> %d" % (c[i-1], c[i])) 121 seen[c[i-1]] = True 122 foldpairs.append([c[i-1], c[i]]) 123 124 lowerpairs = [] 125 for lower, group in lowergroups.iteritems(): 126 for g in group: 127 if g != lower: 128 lowerpairs.append([g, lower]) 129 130 def printpairs(name, foldpairs): 131 foldpairs.sort() 132 foldranges = _MakeRanges(foldpairs) 133 print "// %d groups, %d pairs, %d ranges" % (len(casegroups), len(foldpairs), len(foldranges)) 134 print "const CaseFold unicode_%s[] = {" % (name,) 135 for lo, hi, delta in foldranges: 136 print "\t{ %d, %d, %s }," % (lo, hi, delta) 137 print "};" 138 print "const int num_unicode_%s = %d;" % (name, len(foldranges),) 139 print "" 140 141 print _header 142 printpairs("casefold", foldpairs) 143 printpairs("tolower", lowerpairs) 144 print _trailer 145 146if __name__ == '__main__': 147 main() 148