#!/usr/bin/python3
# Copyright 2008 The RE2 Authors.  All Rights Reserved.
# Use of this source code is governed by a BSD-style
# license that can be found in the LICENSE file.

"""Generate C++ tables for Unicode Script and Category groups."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import sys

# Text emitted before the generated tables.
_header = """
// GENERATED BY make_unicode_groups.py; DO NOT EDIT.
// make_unicode_groups.py >unicode_groups.cc

#include "re2/unicode_groups.h"

namespace re2 {

"""

# Text emitted after the generated tables.
_trailer = """

} // namespace re2

"""

# Running totals of 16-bit and 32-bit ranges emitted so far.  PrintGroup
# updates them; main() reports them in a comment in the generated file.
n16 = 0
n32 = 0


def MakeRanges(codes):
    """Turn a list like [1,2,3,7,8,9] into a range list [[1,3], [7,9]].

    Args:
      codes: iterable of integers.  It does not need to be sorted or free
        of duplicates; it is normalised before ranges are built.

    Returns:
      A list of [lo, hi] lists (both ends inclusive), in ascending order,
      each covering one maximal run of consecutive integers.  An empty
      input yields an empty list.
    """
    ranges = []
    for c in sorted(set(codes)):
        # Extend the current range when c directly follows its upper end;
        # checking `ranges` first avoids needing a sentinel "previous" value.
        if ranges and c == ranges[-1][1] + 1:
            ranges[-1][1] = c
        else:
            ranges.append([c, c])
    return ranges


def PrintRanges(type, name, ranges):
    """Print the ranges as an array of type named name.

    Args:
      type: element type name to use in the emitted array declaration.
      name: identifier of the emitted array.
      ranges: iterable of (lo, hi) pairs, one initializer line each.
    """
    print("static const %s %s[] = {" % (type, name))
    for lo, hi in ranges:
        print("\t{ %d, %d }," % (lo, hi))
    print("};")


# def PrintCodes(type, name, codes):
#     """Print the codes as an array of type named name."""
#     print("static %s %s[] = {" % (type, name))
#     for c in codes:
#         print("\t%d," % (c,))
#     print("};")


def PrintGroup(name, codes):
    """Print the data structures for the group of codes.

    Emits (via PrintRanges) a `<name>_range16` array for the codes below
    65536 and a `<name>_range32` array for the rest, skipping whichever
    would be empty, and adds the number of ranges emitted to the
    module-level counters n16 and n32.

    Args:
      name: group name; used both as the quoted name in the literal and as
        the prefix of the emitted array identifiers.
      codes: iterable of integer code points belonging to the group.

    Returns:
      A UGroup literal for the group, as a string.  A missing array is
      written as the pair "0, 0" (pointer, length).
    """

    # See unicode_groups.h for a description of the data structure.

    # Split codes into 16-bit ranges and 32-bit ranges.
    range16 = MakeRanges([c for c in codes if c < 65536])
    range32 = MakeRanges([c for c in codes if c >= 65536])

    # Pull singleton ranges out of range16.
    # code16 = [lo for lo, hi in range16 if lo == hi]
    # range16 = [[lo, hi] for lo, hi in range16 if lo != hi]

    global n16
    global n32
    n16 += len(range16)
    n32 += len(range32)

    ugroup = "{ \"%s\", +1" % (name,)
    # if len(code16) > 0:
    #     PrintCodes("uint16_t", name+"_code16", code16)
    #     ugroup += ", %s_code16, %d" % (name, len(code16))
    # else:
    #     ugroup += ", 0, 0"
    if range16:
        PrintRanges("URange16", name+"_range16", range16)
        ugroup += ", %s_range16, %d" % (name, len(range16))
    else:
        ugroup += ", 0, 0"
    if range32:
        PrintRanges("URange32", name+"_range32", range32)
        ugroup += ", %s_range32, %d" % (name, len(range32))
    else:
        ugroup += ", 0, 0"
    ugroup += " }"
    return ugroup


def main():
    """Write the complete generated C++ source to standard output.

    Prints the header, one set of range arrays per category and per script
    (each in sorted name order), a comment with the range totals, the
    `unicode_groups` table built from the sorted UGroup literals, the
    `num_unicode_groups` count, and the trailer.
    """
    # Project-local module that supplies the category and script tables.
    # Imported here, where it is used, so the helpers above can be imported
    # on their own.
    import unicode

    categories = unicode.Categories()
    scripts = unicode.Scripts()
    print(_header)
    ugroups = []
    for name in sorted(categories):
        ugroups.append(PrintGroup(name, categories[name]))
    for name in sorted(scripts):
        ugroups.append(PrintGroup(name, scripts[name]))
    print("// %d 16-bit ranges, %d 32-bit ranges" % (n16, n32))
    print("const UGroup unicode_groups[] = {")
    # Sort the literals themselves, so categories and scripts end up in one
    # table ordered by the text of each entry.
    ugroups.sort()
    for ug in ugroups:
        print("\t%s," % (ug,))
    print("};")
    print("const int num_unicode_groups = %d;" % (len(ugroups),))
    print(_trailer)


if __name__ == '__main__':
    main()