xref: /aosp_15_r20/external/regex-re2/re2/make_unicode_casefold.py (revision ccdc9c3e24c519bfa4832a66aa2e83a52c19f295)
1#!/usr/bin/python
2# coding=utf-8
3#
4# Copyright 2008 The RE2 Authors.  All Rights Reserved.
5# Use of this source code is governed by a BSD-style
6# license that can be found in the LICENSE file.
7
8# See unicode_casefold.h for description of case folding tables.
9
10"""Generate C++ table for Unicode case folding."""
11
12import sys
13import unicode
14
15_header = """
16// GENERATED BY make_unicode_casefold.py; DO NOT EDIT.
17// make_unicode_casefold.py >unicode_casefold.cc
18
19#include "re2/unicode_casefold.h"
20
21namespace re2 {
22
23"""
24
25_trailer = """
26
27} // namespace re2
28
29"""
30
31def _Delta(a, b):
32  """Compute the delta for b - a.  Even/odd and odd/even
33     are handled specially, as described above."""
34  if a+1 == b:
35    if a%2 == 0:
36      return 'EvenOdd'
37    else:
38      return 'OddEven'
39  if a == b+1:
40    if a%2 == 0:
41      return 'OddEven'
42    else:
43      return 'EvenOdd'
44  return b - a
45
46def _AddDelta(a, delta):
47  """Return a + delta, handling EvenOdd and OddEven specially."""
48  if type(delta) == int:
49    return a+delta
50  if delta == 'EvenOdd':
51    if a%2 == 0:
52      return a+1
53    else:
54      return a-1
55  if delta == 'OddEven':
56    if a%2 == 1:
57      return a+1
58    else:
59      return a-1
60  print >>sys.stderr, "Bad Delta: ", delta
61  raise "Bad Delta"
62
63def _MakeRanges(pairs):
64  """Turn a list like [(65,97), (66, 98), ..., (90,122)]
65     into [(65, 90, +32)]."""
66  ranges = []
67  last = -100
68
69  def evenodd(last, a, b, r):
70    if a != last+1 or b != _AddDelta(a, r[2]):
71      return False
72    r[1] = a
73    return True
74
75  def evenoddpair(last, a, b, r):
76    if a != last+2:
77      return False
78    delta = r[2]
79    d = delta
80    if type(delta) is not str:
81      return False
82    if delta.endswith('Skip'):
83      d = delta[:-4]
84    else:
85      delta = d + 'Skip'
86    if b != _AddDelta(a, d):
87      return False
88    r[1] = a
89    r[2] = delta
90    return True
91
92  for a, b in pairs:
93    if ranges and evenodd(last, a, b, ranges[-1]):
94      pass
95    elif ranges and evenoddpair(last, a, b, ranges[-1]):
96      pass
97    else:
98      ranges.append([a, a, _Delta(a, b)])
99    last = a
100  return ranges
101
102# The maximum size of a case-folding group.
103# Case folding is implemented in parse.cc by a recursive process
104# with a recursion depth equal to the size of the largest
105# case-folding group, so it is important that this bound be small.
106# The current tables have no group bigger than 4.
107# If there are ever groups bigger than 10 or so, it will be
108# time to rework the code in parse.cc.
109MaxCasefoldGroup = 4
110
111def main():
112  lowergroups, casegroups = unicode.CaseGroups()
113  foldpairs = []
114  seen = {}
115  for c in casegroups:
116    if len(c) > MaxCasefoldGroup:
117      raise unicode.Error("casefold group too long: %s" % (c,))
118    for i in range(len(c)):
119      if c[i-1] in seen:
120        raise unicode.Error("bad casegroups %d -> %d" % (c[i-1], c[i]))
121      seen[c[i-1]] = True
122      foldpairs.append([c[i-1], c[i]])
123
124  lowerpairs = []
125  for lower, group in lowergroups.iteritems():
126    for g in group:
127      if g != lower:
128        lowerpairs.append([g, lower])
129
130  def printpairs(name, foldpairs):
131    foldpairs.sort()
132    foldranges = _MakeRanges(foldpairs)
133    print "// %d groups, %d pairs, %d ranges" % (len(casegroups), len(foldpairs), len(foldranges))
134    print "const CaseFold unicode_%s[] = {" % (name,)
135    for lo, hi, delta in foldranges:
136      print "\t{ %d, %d, %s }," % (lo, hi, delta)
137    print "};"
138    print "const int num_unicode_%s = %d;" % (name, len(foldranges),)
139    print ""
140
141  print _header
142  printpairs("casefold", foldpairs)
143  printpairs("tolower", lowerpairs)
144  print _trailer
145
146if __name__ == '__main__':
147  main()
148