xref: /aosp_15_r20/external/regex-re2/re2/unicode.py (revision ccdc9c3e24c519bfa4832a66aa2e83a52c19f295)
1*ccdc9c3eSSadaf Ebrahimi# Copyright 2008 The RE2 Authors.  All Rights Reserved.
2*ccdc9c3eSSadaf Ebrahimi# Use of this source code is governed by a BSD-style
3*ccdc9c3eSSadaf Ebrahimi# license that can be found in the LICENSE file.
4*ccdc9c3eSSadaf Ebrahimi
5*ccdc9c3eSSadaf Ebrahimi"""Parser for Unicode data files (as distributed by unicode.org)."""
6*ccdc9c3eSSadaf Ebrahimi
7*ccdc9c3eSSadaf Ebrahimiimport os
8*ccdc9c3eSSadaf Ebrahimiimport re
9*ccdc9c3eSSadaf Ebrahimiimport urllib2
10*ccdc9c3eSSadaf Ebrahimi
# Default location of the Unicode data files; may be a local
# directory or an https:// URL.
_UNICODE_DIR = "https://www.unicode.org/Public/11.0.0/ucd"

# Upper bound (inclusive) on code point values accepted by this module.
_RUNE_MAX = 0x10FFFF
16*ccdc9c3eSSadaf Ebrahimi
17*ccdc9c3eSSadaf Ebrahimi
class Error(Exception):
  """Common base class for the exceptions defined in this module."""
  pass
20*ccdc9c3eSSadaf Ebrahimi
21*ccdc9c3eSSadaf Ebrahimi
class InputError(Error):
  """Raised when input (a code value, a range, a table line) is invalid."""
  pass
24*ccdc9c3eSSadaf Ebrahimi
25*ccdc9c3eSSadaf Ebrahimi
def _UInt(s):
  """Converts string to Unicode code point ('263A' => 0x263a).

  The string must consist solely of 4 to 6 hexadecimal digits.
  Spellings that int(s, 16) would tolerate on its own -- a '0x' prefix,
  a leading sign, surrounding white space, digit separators -- are
  rejected.

  Args:
    s: string to convert

  Returns:
    Unicode code point

  Raises:
    InputError: the string is not a valid Unicode value.
  """

  # Validate the spelling before converting: int() is more lenient than
  # the data-file format, so '0x41' or '+041' would otherwise slip through.
  # \Z (rather than $) also rejects a trailing newline.
  if re.match(r"[0-9A-Fa-f]{4,6}\Z", s) is None:
    raise InputError("invalid Unicode value %s" % (s,))
  v = int(s, 16)
  # Six hex digits can still exceed the largest valid code point.
  if v > _RUNE_MAX:
    raise InputError("invalid Unicode value %s" % (s,))
  return v
46*ccdc9c3eSSadaf Ebrahimi
47*ccdc9c3eSSadaf Ebrahimi
def _URange(s):
  """Converts string to Unicode range.

    '0001..0003' => [1, 2, 3].
    '0001' => [1].

  Args:
    s: string to convert; either a single code value or 'lo..hi'
      with lo strictly less than hi

  Returns:
    list of the code points in the range, in ascending order

  Raises:
    InputError: the string is not a valid Unicode range.
  """
  parts = s.split("..")
  if len(parts) == 1:
    return [_UInt(parts[0])]
  if len(parts) == 2:
    lo = _UInt(parts[0])
    hi = _UInt(parts[1])
    if lo < hi:
      # Materialize the range so the result is a real list, as documented,
      # even on interpreters where range() is lazy.
      return list(range(lo, hi + 1))
  # Reached for three or more '..'-separated parts, or when lo >= hi.
  raise InputError("invalid Unicode range %s" % (s,))
72*ccdc9c3eSSadaf Ebrahimi
73*ccdc9c3eSSadaf Ebrahimi
def _UStr(v):
  """Formats a Unicode code point as an upper-case hex literal.

    0x263a => '0x263A'.

  Args:
    v: code point to format

  Returns:
    string made of '0x' followed by at least four upper-case hex digits

  Raises:
    InputError: v is negative or larger than _RUNE_MAX.
  """
  out_of_range = v < 0 or v > _RUNE_MAX
  if out_of_range:
    message = "invalid Unicode value %s" % (v,)
    raise InputError(message)
  text = "0x%04X" % (v,)
  return text
91*ccdc9c3eSSadaf Ebrahimi
92*ccdc9c3eSSadaf Ebrahimi
def _ParseContinue(s):
  """Splits a description field into its name and range marker.

  Some Unicode tables describe a range with two entries rather than
  one: the entry for the first code value is described as
  '<Name, First>' and the entry for the last code value as
  '<Name, Last>'.  Ordinary entries carry a plain description.

    '<Name, First>' => ('Name', 'First')
    '<Name, Last>' => ('Name', 'Last')
    'Anything else' => ('Anything else', None)

  Args:
    s: description field string

  Returns:
    pair: the name, and 'First', 'Last', or None when s carries no marker
  """

  m = re.match("<(.*), (First|Last)>", s)
  if m is None:
    # Not a range marker: the whole field is the name.
    return (s, None)
  return (m.group(1), m.group(2))
118*ccdc9c3eSSadaf Ebrahimi
119*ccdc9c3eSSadaf Ebrahimi
def ReadUnicodeTable(filename, nfields, doline):
  """Generic Unicode table text file reader.

  The reader takes care of stripping out comments and also
  parsing the two different ways that the Unicode tables specify
  code ranges (using the .. notation and splitting the range across
  multiple lines).

  Each non-comment line in the table is expected to have the given
  number of fields.  The first field is known to be the Unicode value
  and the second field its description.

  The reader calls doline(codes, fields) for each entry in the table,
  where codes is the sequence of code points the entry covers and
  fields is the list of white-space-stripped field strings.  A range
  split across a First/Last pair of lines produces a single call, with
  fields[0] rewritten as 'XXXX..YYYY' and fields[1] set to the plain name.

  If a line is malformed or doline raises an exception, the reader
  prints that exception, prefixed with the file name and line number,
  and re-raises it immediately; the rest of the file is not processed.

  Arguments:
    filename: the Unicode data file to read (a path, or a URL starting
      with "https://"), or a file-like object yielding text lines.
      A stream opened by the reader is closed before returning; a
      file-like object passed in by the caller is left open.
    nfields: the number of expected fields per line in that file.
    doline: the function to call for each table entry.

  Raises:
    InputError: nfields is invalid (must be >= 2), a line is malformed,
      or the file ends in the middle of a First/Last range.
  """

  if nfields < 2:
    raise InputError("invalid number of fields %d" % (nfields,))

  # Only a stream opened here is closed here; anything else belongs
  # to the caller.
  opened = isinstance(filename, str)
  if opened:
    if filename.startswith("https://"):
      fil = urllib2.urlopen(filename)
    else:
      fil = open(filename, "r")
  else:
    fil = filename

  try:
    first = None        # first code in multiline range
    expect_last = None  # tag expected for "Last" line in multiline range
    for lineno, line in enumerate(fil, 1):
      try:
        # Chop # comments and white space; ignore empty lines.
        line = line.split("#", 1)[0].strip()
        if not line:
          continue

        # Split fields on ";", chop more white space.
        # Must have the expected number of fields.
        fields = [s.strip() for s in line.split(";")]
        if len(fields) != nfields:
          raise InputError("wrong number of fields %d %d - %s" %
                           (len(fields), nfields, line))

        # The Unicode text files have two different ways
        # to list a Unicode range.  Either the first field is
        # itself a range (0000..FFFF), or the range is split
        # across two lines, with the second field noting
        # the continuation.
        codes = _URange(fields[0])
        (name, cont) = _ParseContinue(fields[1])

        if expect_last is not None:
          # If the last line gave the First code in a range,
          # this one had better give the Last one.
          if (len(codes) != 1 or codes[0] <= first or
              cont != "Last" or name != expect_last):
            raise InputError("expected Last line for %s" %
                             (expect_last,))
          codes = list(range(first, codes[0] + 1))
          first = None
          expect_last = None
          # Present the pair to doline as if it were a single
          # 'lo..hi' entry with an ordinary description.
          fields[0] = "%04X..%04X" % (codes[0], codes[-1])
          fields[1] = name
        elif cont == "First":
          # Otherwise, if this is the First code in a range,
          # remember it and go to the next line.
          if len(codes) != 1:
            raise InputError("bad First line: range given")
          expect_last = name
          first = codes[0]
          continue

        doline(codes, fields)

      except Exception as e:
        # Report where the failure happened, then let it propagate.
        print("%s:%d: %s" % (filename, lineno, e))
        raise

    if expect_last is not None:
      raise InputError("expected Last line for %s; got EOF" %
                       (expect_last,))
  finally:
    if opened:
      fil.close()
217*ccdc9c3eSSadaf Ebrahimi
218*ccdc9c3eSSadaf Ebrahimi
def CaseGroups(unicode_dir=_UNICODE_DIR):
  """Returns the Unicode code groups equivalent under case folding.

  Each group is a sorted list of code points,
  and the list of groups is sorted by first code point
  in the group.

  Args:
    unicode_dir: Unicode data directory

  Returns:
    pair (togroup, groups): togroup is a dict mapping each folded
    (lowercase) code point to its group, and groups is the sorted list
    of those same group lists.
  """

  # Dict mapping lowercase code point to fold-equivalent group.
  togroup = {}

  def DoLine(codes, fields):
    """Process single CaseFolding.txt line, updating togroup."""
    (_, foldtype, lower, _) = fields
    # Only entries whose status field is C or S are used; every other
    # status is skipped.
    if foldtype not in ("C", "S"):
      return
    lower = _UInt(lower)
    # A group always contains the folded code point itself.
    togroup.setdefault(lower, [lower]).extend(codes)

  ReadUnicodeTable(unicode_dir+"/CaseFolding.txt", 4, DoLine)

  for g in togroup.values():
    g.sort()
  # sorted() builds the list of groups without relying on values()
  # returning a list that can be sorted in place.
  groups = sorted(togroup.values())
  return togroup, groups
251*ccdc9c3eSSadaf Ebrahimi
252*ccdc9c3eSSadaf Ebrahimi
def Scripts(unicode_dir=_UNICODE_DIR):
  """Builds the table of code points belonging to each script.

  Args:
    unicode_dir: Unicode data directory

  Returns:
    dict mapping script names to code lists
  """

  by_script = {}

  def DoLine(codes, fields):
    """Record the codes of one Scripts.txt entry under its script."""
    _unused, script = fields
    if script not in by_script:
      by_script[script] = []
    by_script[script].extend(codes)

  table_path = unicode_dir+"/Scripts.txt"
  ReadUnicodeTable(table_path, 2, DoLine)
  return by_script
272*ccdc9c3eSSadaf Ebrahimi
273*ccdc9c3eSSadaf Ebrahimi
def Categories(unicode_dir=_UNICODE_DIR):
  """Builds the table of code points belonging to each category.

  Codes are filed under the category named in the table and, when that
  name is longer than one character, also under its first character
  (so codes from Lu also appear in L, etc.).

  Args:
    unicode_dir: Unicode data directory

  Returns:
    dict mapping category names to code lists
  """

  by_category = {}

  def DoLine(codes, fields):
    """Record the codes of one UnicodeData.txt entry."""
    full = fields[2]
    keys = [full]
    if len(full) > 1:
      # Also file the codes under the one-letter prefix.
      keys.append(full[0])
    for key in keys:
      by_category.setdefault(key, []).extend(codes)

  table_path = unicode_dir+"/UnicodeData.txt"
  ReadUnicodeTable(table_path, 15, DoLine)
  return by_category
297*ccdc9c3eSSadaf Ebrahimi
298