1#!/usr/bin/python3
2# Copyright 2008 The RE2 Authors.  All Rights Reserved.
3# Use of this source code is governed by a BSD-style
4# license that can be found in the LICENSE file.
5
6"""Generate C++ tables for Unicode Script and Category groups."""
7
8from __future__ import absolute_import
9from __future__ import division
10from __future__ import print_function
11
12import sys
13import unicode
14
# Text printed at the top of the generated C++ file: the "generated, do not
# edit" notice, the include of the groups header, and the opening of
# namespace re2.
_header = """
// GENERATED BY make_unicode_groups.py; DO NOT EDIT.
// make_unicode_groups.py >unicode_groups.cc

#include "re2/unicode_groups.h"

namespace re2 {

"""

# Text printed at the end of the generated C++ file; closes namespace re2.
_trailer = """

}  // namespace re2

"""

# Running totals of 16-bit and 32-bit ranges over all groups. PrintGroup
# adds to them and main reports them in a summary comment.
n16 = 0
n32 = 0
33
def MakeRanges(codes):
  """Collapse runs of consecutive integers into inclusive [lo, hi] ranges.

  For example, [1,2,3,7,8,9] becomes [[1,3], [7,9]].

  A value extends the current range only when it is exactly one greater
  than the value before it, so the input is expected to be in ascending
  order; anything else (a gap, a repeat, a smaller value) starts a new
  range.

  Args:
    codes: iterable of integers.

  Returns:
    A list of two-element lists [lo, hi]; empty when codes is empty.
  """
  ranges = []
  for c in codes:
    # Compare against the end of the most recent range rather than a magic
    # "previous value" sentinel, so no input value can be mistaken for a
    # continuation before any range exists.
    if ranges and c == ranges[-1][1] + 1:
      ranges[-1][1] = c
    else:
      ranges.append([c, c])
  return ranges
45
def PrintRanges(type, name, ranges):
  """Write a C++ static const array definition for ranges to stdout.

  Args:
    type: C++ element type name used in the declaration.
    name: identifier of the emitted array.
    ranges: iterable of (lo, hi) integer pairs; each pair becomes one
      tab-indented "{ lo, hi }," row.
  """
  rows = ["static const %s %s[] = {" % (type, name)]
  rows.extend("\t{ %d, %d }," % (lo, hi) for lo, hi in ranges)
  rows.append("};")
  # One print of the joined rows produces the same text as printing each
  # row separately: every row ends with exactly one newline.
  print("\n".join(rows))
52
53# def PrintCodes(type, name, codes):
54#   """Print the codes as an array of type named name."""
55#   print("static %s %s[] = {" % (type, name))
56#   for c in codes:
57#     print("\t%d," % (c,))
58#   print("};")
59
def PrintGroup(name, codes):
  """Print the range tables for one group and return its UGroup literal.

  The codes are split at 65536: values below it go into a URange16 table
  named <name>_range16, the rest into a URange32 table named
  <name>_range32. A table is printed only when it has at least one range;
  a missing table is written as "0, 0" in the literal. The module-level
  counters n16 and n32 are increased by the number of ranges in each half.

  See unicode_groups.h for a description of the data structure.

  Args:
    name: group name, used as the string in the literal and as the prefix
      of the table identifiers.
    codes: the group's code points; iterated twice.

  Returns:
    The text of a C++ UGroup initializer, e.g.
    '{ "Name", +1, Name_range16, 3, 0, 0 }'.
  """
  global n16, n32

  low = MakeRanges([c for c in codes if c < 65536])
  high = MakeRanges([c for c in codes if c >= 65536])
  n16 += len(low)
  n32 += len(high)

  fields = ["\"%s\"" % (name,), "+1"]
  halves = (
      ("URange16", "_range16", low),
      ("URange32", "_range32", high),
  )
  for ctype, suffix, table in halves:
    if table:
      PrintRanges(ctype, name + suffix, table)
      fields.append("%s%s" % (name, suffix))
      fields.append("%d" % (len(table),))
    else:
      # No ranges in this half: null pointer and zero length.
      fields.extend(["0", "0"])
  return "{ " + ", ".join(fields) + " }"
97
def main():
  """Write the whole generated C++ source to standard output.

  Output order: the header; the range tables of every category, then of
  every script (each set in sorted name order); a comment with the total
  range counts; the unicode_groups array holding the UGroup literals
  sorted as strings; the num_unicode_groups constant; the trailer.
  """
  tables = (unicode.Categories(), unicode.Scripts())
  print(_header)

  literals = []
  for table in tables:
    literals.extend(PrintGroup(key, table[key]) for key in sorted(table))

  # n16/n32 are complete only after every PrintGroup call above has run.
  print("// %d 16-bit ranges, %d 32-bit ranges" % (n16, n32))
  print("const UGroup unicode_groups[] = {")
  for literal in sorted(literals):
    print("\t%s," % (literal,))
  print("};")
  print("const int num_unicode_groups = %d;" % (len(literals),))
  print(_trailer)
115
# Generate the tables only when run as a script, not when imported.
if __name__ == '__main__':
  main()
118