#! /usr/bin/python

#                   PCRE2 UNICODE PROPERTY SUPPORT
#                   ------------------------------
#
# This script generates the pcre2_ucd.c file from Unicode data files. This is
# the compressed Unicode property data used by PCRE2. The script was created in
# December 2021 as part of the Unicode data generation refactoring. It is
# basically a re-working of the MultiStage2.py script that was submitted to the
# PCRE project by Peter Kankowski in 2008 as part of a previous upgrading of
# Unicode property support. A number of extensions have since been added. The
# main difference in the 2021 upgrade (apart from comments and layout) is that
# the data tables (e.g. the list of script names) are now listed in or
# generated by a separate Python module that is shared with the other Generate
# scripts.
#
# This script must be run in the "maint" directory. It requires the following
# Unicode data tables: BidiMirroring.txt, CaseFolding.txt,
# DerivedBidiClass.txt, DerivedCoreProperties.txt, DerivedGeneralCategory.txt,
# GraphemeBreakProperty.txt, PropList.txt, PropertyAliases.txt,
# PropertyValueAliases.txt, ScriptExtensions.txt, Scripts.txt, and
# emoji-data.txt. These must be in the Unicode.tables subdirectory.
#
# The emoji-data.txt file is found in the "emoji" subdirectory even though it
# is technically part of a different (but coordinated) standard, as shown
# in files associated with Unicode Technical Standard #51 ("Unicode Emoji"),
# for example:
#
# http://unicode.org/Public/emoji/13.0/ReadMe.txt
#
# DerivedBidiClass.txt and DerivedGeneralCategory.txt are in the "extracted"
# subdirectory of the Unicode database (UCD) on the Unicode web site;
# GraphemeBreakProperty.txt is in the "auxiliary" subdirectory. The other files
# are in the top-level UCD directory.
#
# -----------------------------------------------------------------------------
# Minor modifications made to the original script:
#  Added #! line at start
#  Removed tabs
#  Made it work with Python 2.4 by rewriting two statements that needed 2.5
#  Consequent code tidy
#  Adjusted data file names to take from the Unicode.tables directory
#  Adjusted global table names by prefixing _pcre_.
#  Commented out stuff relating to the casefolding table, which isn't used;
#    removed completely in 2012.
#  Corrected size calculation
#  Added #ifndef SUPPORT_UCP to use dummy tables when no UCP support is needed.
#  Update for PCRE2: name changes, and SUPPORT_UCP is abolished.
#
# Major modifications made to the original script:
#  Added code to add a grapheme break property field to records.
#
#  Added code to search for sets of more than two characters that must match
#  each other caselessly. A new table is output containing these sets, and
#  offsets into the table are added to the main output records. This new
#  code scans CaseFolding.txt instead of UnicodeData.txt, which is no longer
#  used.
#
#  Update for Python3:
#    . Processed with 2to3, but that didn't fix everything
#    . Changed string.strip to str.strip
#    . Added encoding='utf-8' to the open() call
#    . Inserted 'int' before blocksize/ELEMS_PER_LINE because an int is
#        required and the result of the division is a float
#
#  Added code to scan the emoji-data.txt file to find the Extended Pictographic
#  property, which is used by PCRE2 as a grapheme breaking property. This was
#  done when updating to Unicode 11.0.0 (July 2018).
#
#  Added code to add a Script Extensions field to records. This has increased
#  their size from 8 to 12 bytes, only 10 of which are currently used.
#
#  Added code to add a bidi class field to records by scanning the
#  DerivedBidiClass.txt and PropList.txt files. This uses one of the two spare
#  bytes, so now 11 out of 12 are in use.
#
# 01-March-2010:     Updated list of scripts for Unicode 5.2.0
# 30-April-2011:     Updated list of scripts for Unicode 6.0.0
#     July-2012:     Updated list of scripts for Unicode 6.1.0
# 20-August-2012:    Added scan of GraphemeBreakProperty.txt and added a new
#                      field in the record to hold the value. Luckily, the
#                      structure had a hole in it, so the resulting table is
#                      not much bigger than before.
# 18-September-2012: Added code for multiple caseless sets. This uses the
#                      final hole in the structure.
# 30-September-2012: Added RegionalIndicator break property from Unicode 6.2.0
# 13-May-2014:       Updated for PCRE2
# 03-June-2014:      Updated for Python 3
# 20-June-2014:      Updated for Unicode 7.0.0
# 12-August-2014:    Updated to put Unicode version into the file
# 19-June-2015:      Updated for Unicode 8.0.0
# 02-July-2017:      Updated for Unicode 10.0.0
# 03-July-2018:      Updated for Unicode 11.0.0
# 07-July-2018:      Added code to scan emoji-data.txt for the Extended
#                      Pictographic property.
# 01-October-2018:   Added the 'Unknown' script name
# 03-October-2018:   Added new field for Script Extensions
# 27-July-2019:      Updated for Unicode 12.1.0
# 10-March-2020:     Updated for Unicode 13.0.0
# PCRE2-10.39:       Updated for Unicode 14.0.0
# 05-December-2021:  Added code to scan DerivedBidiClass.txt for bidi class,
#                      and also PropList.txt for the Bidi_Control property
# 19-December-2021:  Reworked script extensions lists to be bit maps instead
#                      of zero-terminated lists of script numbers.
# ----------------------------------------------------------------------------
#
# Changes to the refactored script:
#
# 26-December-2021:  Refactoring completed
# 10-January-2022:   Addition of general Boolean property support
# 12-January-2022:   Merge scriptx and bidiclass fields
# 14-January-2022:   Enlarge Boolean property offset to 12 bits
# 28-January-2023:   Remove ASCII "other case" from non-ASCII characters that
#                      are present in caseless sets.
#
# ----------------------------------------------------------------------------
#
#
# The main tables generated by this script are used by macros defined in
# pcre2_internal.h. They look up Unicode character properties using short
# sequences of code that contain no branches, which makes for greater speed.
#
# Conceptually, there is a table of records (of type ucd_record), one for each
# Unicode character. Each record contains the script number, script extension
# value, character type, grapheme break type, offset to caseless matching set,
# offset to the character's other case, the bidi class, and offset to a bitmap
# of Boolean properties.
#
# A real table covering all Unicode characters would be far too big. It can be
# efficiently compressed by observing that many characters have the same
# record, and many blocks of characters (taking 128 characters in a block) have
# the same set of records as other blocks. This leads to a 2-stage lookup
# process.
#
#
# This script constructs seven tables. The ucd_caseless_sets table contains
# lists of characters that all match each other caselessly. Each list is
# in order, and is terminated by NOTACHAR (0xffffffff), which is larger than
# any valid character. The first list is empty; this is used for characters
# that are not part of any list.
#
# The ucd_digit_sets table contains the code points of the '9' characters in
# each set of 10 decimal digits in Unicode. This is used to ensure that digits
# in script runs all come from the same set. The first element in the vector
# contains the number of subsequent elements, which are in ascending order.
#
# Scripts are partitioned into two groups. Scripts that appear in at least one
# character's script extension list come first, followed by "Unknown" and then
# all the rest. This sorting is done automatically in the GenerateCommon.py
# script. A script's number is its index in the script_names list.
#
# The ucd_script_sets table contains bitmaps that represent lists of scripts
# for Script Extensions properties. Each bitmap consists of a fixed number of
# unsigned 32-bit numbers, enough to allocate a bit for every script that is
# used in any character's extension list, that is, enough for every script
# whose number is less than ucp_Unknown. A character's script extension value
# in its ucd record is an offset into the ucd_script_sets vector. The first
# bitmap has no bits set; characters that have no script extensions have zero
# as their script extensions value so that they use this map.
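#
# For illustration only (this sketch is not part of the generated output):
# given a script number n and a script extension value k (a word offset into
# the vector), membership can be tested with arithmetic like this:
#
#   word = ucd_script_sets[k + n // 32]   # pick the 32-bit word for script n
#   present = (word >> (n % 32)) & 1      # test that script's bit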
#
# The ucd_boolprop_sets table contains bitmaps that represent lists of Boolean
# properties. Each bitmap consists of a fixed number of unsigned 32-bit
# numbers, enough to allocate a bit for each supported Boolean property.
#
# The ucd_records table contains one instance of every unique character record
# that is required. The ucd_stage1 table is indexed by a character's block
# number, which is the character's code point divided by 128, since 128 is the
# size of each block. The result of a lookup in ucd_stage1 is a "virtual" block
# number.
#
# The ucd_stage2 table is a table of "virtual" blocks; each block is indexed by
# the offset of a character within its own block, and the result is the index
# number of the required record in the ucd_records vector.
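#
# As an illustration (the real lookup is done by the C macros defined in
# pcre2_internal.h), the two-stage lookup corresponds to this Python sketch,
# using the block size of 128 described above:
#
#   def lookup_record(c):
#     vblock = ucd_stage1[c // 128]                  # virtual block number
#     index = ucd_stage2[vblock * 128 + (c % 128)]   # index into ucd_records
#     return ucd_records[index]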
#
# The following examples are correct for the Unicode 14.0.0 database. Future
# updates may change the actual lookup values.
#
# Example: lowercase "a" (U+0061) is in block 0
#          lookup 0 in stage1 table yields 0
#          lookup 97 (0x61) in the first table in stage2 yields 35
#          record 35 is { 0, 5, 12, 0, -32, 18432, 44 }
#             0 = ucp_Latin   => Latin script
#             5 = ucp_Ll      => Lower case letter
#            12 = ucp_gbOther => Grapheme break property "Other"
#             0               => Not part of a caseless set
#           -32 (-0x20)       => Other case is U+0041
#         18432 = 0x4800      => Combined Bidi class + script extension values
#            44               => Offset to Boolean properties
#
# The top 5 bits of the sixth field are the Bidi class, with the rest being the
# script extension value, giving:
#
#             9 = ucp_bidiL   => Bidi class left-to-right
#             0               => No special script extension property
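#
# For illustration (not part of the generated output), unpacking the sixth
# field is plain arithmetic; for the value 18432 (0x4800) above:
#
#   bidi_class = 18432 >> 11     # top 5 bits: 9 = ucp_bidiL
#   scriptx    = 18432 & 0x7ff   # low 11 bits: 0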
#
# Almost all lowercase Latin characters resolve to the same record. One or two
# are different because they are part of a multi-character caseless set (for
# example, k, K and the Kelvin symbol are such a set).
#
# Example: hiragana letter A (U+3042) is in block 96 (0x60)
#          lookup 96 in stage1 table yields 93
#          lookup 66 (0x42) in table 93 in stage2 yields 819
#          record 819 is { 20, 7, 12, 0, 0, 18432, 82 }
#            20 = ucp_Hiragana => Hiragana script
#             7 = ucp_Lo       => Other letter
#            12 = ucp_gbOther  => Grapheme break property "Other"
#             0                => Not part of a caseless set
#             0                => No other case
#         18432 = 0x4800       => Combined Bidi class + script extension values
#            82                => Offset to Boolean properties
#
# The top 5 bits of the sixth field are the Bidi class, with the rest being the
# script extension value, giving:
#
#             9 = ucp_bidiL   => Bidi class left-to-right
#             0               => No special script extension property
#
# Example: vedic tone karshana (U+1CD0) is in block 57 (0x39)
#          lookup 57 in stage1 table yields 55
#          lookup 80 (0x50) in table 55 in stage2 yields 621
#          record 621 is { 84, 12, 3, 0, 0, 26762, 96 }
#            84 = ucp_Inherited => Script inherited from predecessor
#            12 = ucp_Mn        => Non-spacing mark
#             3 = ucp_gbExtend  => Grapheme break property "Extend"
#             0                 => Not part of a caseless set
#             0                 => No other case
#         26762 = 0x688A        => Combined Bidi class + script extension values
#            96                 => Offset to Boolean properties
#
# The top 5 bits of the sixth field are the Bidi class, with the rest being the
# script extension value, giving:
#
#            13 = ucp_bidiNSM   => Bidi class non-spacing mark
#           138                 => Script Extension list offset = 138
#
# At offset 138 in the ucd_script_sets vector we find a bitmap with bits 1, 8,
# 18, and 47 set. This means that this character is expected to be used with
# any of those scripts, which are Bengali, Devanagari, Kannada, and Grantha.
#
#  Philip Hazel, last updated 14 January 2022.
##############################################################################


# Import standard modules

import re
import string
import sys

# Import common data lists and functions

from GenerateCommon import \
  bidi_classes, \
  bool_properties, \
  bool_propsfiles, \
  bool_props_list_item_size, \
  break_properties, \
  category_names, \
  general_category_names, \
  script_abbrevs, \
  script_list_item_size, \
  script_names, \
  open_output

# Some general parameters

MAX_UNICODE = 0x110000
NOTACHAR = 0xffffffff


# ---------------------------------------------------------------------------
#                         DEFINE FUNCTIONS
# ---------------------------------------------------------------------------


# Parse a line of Scripts.txt, GraphemeBreakProperty.txt or
# DerivedGeneralCategory.txt

def make_get_names(enum):
  return lambda chardata: enum.index(chardata[1])


# Parse a line of DerivedBidiClass.txt

def get_bidi(chardata):
  if len(chardata[1]) > 3:
    return bidi_classes_long.index(chardata[1])
  else:
    return bidi_classes_short.index(chardata[1])


# Parse a line of CaseFolding.txt

def get_other_case(chardata):
  if chardata[1] == 'C' or chardata[1] == 'S':
    return int(chardata[2], 16) - int(chardata[0], 16)
  return None
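
# For example (illustrative only): the CaseFolding.txt line
#   "0041; C; 0061; # LATIN CAPITAL LETTER A"
# yields int("0061", 16) - int("0041", 16) = 32, the signed offset from the
# character to its other case.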


# Parse a line of ScriptExtensions.txt

def get_script_extension(chardata):
  global last_script_extension

  offset = len(script_lists) * script_list_item_size
  if last_script_extension == chardata[1]:
    return offset - script_list_item_size

  last_script_extension = chardata[1]
  script_lists.append(tuple(script_abbrevs.index(abbrev) for abbrev in last_script_extension.split(' ')))
  return offset


# Read a whole table into memory, setting/checking the Unicode version

def read_table(file_name, get_value, default_value):
  global unicode_version

  f = re.match(r'^[^/]+/([^.]+)\.txt$', file_name)
  file_base = f.group(1)
  version_pat = r"^# " + re.escape(file_base) + r"-(\d+\.\d+\.\d+)\.txt$"
  file = open(file_name, 'r', encoding='utf-8')
  f = re.match(version_pat, file.readline())
  version = f.group(1)
  if unicode_version == "":
    unicode_version = version
  elif unicode_version != version:
    print("WARNING: Unicode version differs in %s" % file_name, file=sys.stderr)

  table = [default_value] * MAX_UNICODE
  for line in file:
    if file_base == 'DerivedBidiClass':
      line = re.sub(r'# @missing: ', '', line)

    line = re.sub(r'#.*', '', line)
    chardata = list(map(str.strip, line.split(';')))
    if len(chardata) <= 1:
      continue
    value = get_value(chardata)
    if value is None:
      continue
    m = re.match(r'([0-9a-fA-F]+)(\.\.([0-9a-fA-F]+))?$', chardata[0])
    char = int(m.group(1), 16)
    if m.group(3) is None:
      last = char
    else:
      last = int(m.group(3), 16)
    for i in range(char, last + 1):
      table[i] = value

  file.close()
  return table


# Get the smallest possible C language type for the values in a table

def get_type_size(table):
  type_size = [("uint8_t", 1), ("uint16_t", 2), ("uint32_t", 4),
    ("signed char", 1), ("int16_t", 2), ("int32_t", 4)]
  limits = [(0, 255), (0, 65535), (0, 4294967295), (-128, 127),
    (-32768, 32767), (-2147483648, 2147483647)]
  minval = min(table)
  maxval = max(table)
  for num, (minlimit, maxlimit) in enumerate(limits):
    if minlimit <= minval and maxval <= maxlimit:
      return type_size[num]
  raise OverflowError("Too large to fit into C types")
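
# For example (illustrative only), get_type_size([0, 300]) returns
# ("uint16_t", 2), and get_type_size([-1, 100]) returns ("signed char", 1).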


# Get the total size of a list of tables

def get_tables_size(*tables):
  total_size = 0
  for table in tables:
    type, size = get_type_size(table)
    total_size += size * len(table)
  return total_size


# Compress a table into the two stages

def compress_table(table, block_size):
  blocks = {} # Dictionary for finding identical blocks
  stage1 = [] # Stage 1 table contains block numbers (indices into stage 2 table)
  stage2 = [] # Stage 2 table contains the blocks with property values
  table = tuple(table)
  for i in range(0, len(table), block_size):
    block = table[i:i+block_size]
    start = blocks.get(block)
    if start is None:
      # Allocate a new block, using integer division so that block numbers
      # stay ints
      start = len(stage2) // block_size
      stage2 += block
      blocks[block] = start
    stage1.append(start)
  return stage1, stage2
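
# For example (illustrative only), with block_size = 2 the table
# [1, 1, 2, 2, 1, 1] compresses to stage1 = [0, 1, 0] (the third block is a
# repeat of the first) and stage2 = [1, 1, 2, 2].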


# Output a table

def write_table(table, table_name, block_size = None):
  type, size = get_type_size(table)
  ELEMS_PER_LINE = 16

  s = "const %s %s[] = { /* %d bytes" % (type, table_name, size * len(table))
  if block_size:
    s += ", block = %d" % block_size
  f.write(s + " */\n")
  table = tuple(table)
  if block_size is None:
    fmt = "%3d," * ELEMS_PER_LINE + " /* U+%04X */\n"
    mult = MAX_UNICODE / len(table)
    for i in range(0, len(table), ELEMS_PER_LINE):
      f.write(fmt % (table[i:i+ELEMS_PER_LINE] + (int(i * mult),)))
  else:
    if block_size > ELEMS_PER_LINE:
      el = ELEMS_PER_LINE
    else:
      el = block_size
    fmt = "%3d," * el + "\n"
    if block_size > ELEMS_PER_LINE:
      fmt = fmt * (block_size // ELEMS_PER_LINE)
    for i in range(0, len(table), block_size):
      f.write(("\n/* block %d */\n" + fmt) % ((i // block_size,) + table[i:i+block_size]))
  f.write("};\n\n")


# Extract the unique combinations of properties into records

def combine_tables(*tables):
  records = {}
  index = []
  for t in zip(*tables):
    i = records.get(t)
    if i is None:
      i = records[t] = len(records)
    index.append(i)
  return index, records
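
# For example (illustrative only), combine_tables([0, 0, 1], [5, 5, 7])
# returns index = [0, 0, 1] and records = {(0, 5): 0, (1, 7): 1}: two unique
# records shared by three characters.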


# Create a record struct

def get_record_size_struct(records):
  size = 0
  structure = 'typedef struct {\n'
  for i in range(len(records[0])):
    record_slice = [record[i] for record in records]
    slice_type, slice_size = get_type_size(record_slice)
    # Add padding: round up to the nearest multiple of slice_size
    size = (size + slice_size - 1) & -slice_size
    size += slice_size
    structure += '%s property_%d;\n' % (slice_type, i)

  # Round up to the first item of the next structure in the array
  record_slice = [record[0] for record in records]
  slice_type, slice_size = get_type_size(record_slice)
  size = (size + slice_size - 1) & -slice_size

  structure += '} ucd_record;\n*/\n'
  return size, structure
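
# The rounding expression works because slice_size is a power of two: for
# example (illustrative only), with size = 3 and slice_size = 2,
# (3 + 2 - 1) & -2 = 4 & ...11110 = 4, i.e. 3 rounded up to a multiple of 2.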


# Write records

def write_records(records, record_size):
  f.write('const ucd_record PRIV(ucd_records)[] = { ' + \
    '/* %d bytes, record size %d */\n' % (len(records) * record_size, record_size))
  records = list(zip(list(records.keys()), list(records.values())))
  records.sort(key = lambda x: x[1])
  for i, record in enumerate(records):
    f.write(('  {' + '%6d, ' * len(record[0]) + '}, /* %3d */\n') % (record[0] + (i,)))
  f.write('};\n\n')


# Write the bitsets, one per line, for a list of sets of bit numbers

def write_bitsets(sets, item_size):
  for d in sets:
    bitwords = [0] * item_size
    for idx in d:
      bitwords[idx // 32] |= 1 << (idx & 31)
    s = " "
    for x in bitwords:
      f.write("%s" % s)
      s = ", "
      f.write("0x%08xu" % x)
    f.write(",\n")
  f.write("};\n\n")
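
# For example (illustrative only), with item_size = 2 the set {1, 47} sets
# bit 1 of word 0 and bit 15 of word 1 (47 // 32 = 1, 47 & 31 = 15), so the
# line written is " 0x00000002u, 0x00008000u,".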


# ---------------------------------------------------------------------------
# This bit of code must have been useful when the original script was being
# developed. Retain it just in case it is ever needed again.

# def test_record_size():
#   tests = [ \
#     ( [(3,), (6,), (6,), (1,)], 1 ), \
#     ( [(300,), (600,), (600,), (100,)], 2 ), \
#     ( [(25, 3), (6, 6), (34, 6), (68, 1)], 2 ), \
#     ( [(300, 3), (6, 6), (340, 6), (690, 1)], 4 ), \
#     ( [(3, 300), (6, 6), (6, 340), (1, 690)], 4 ), \
#     ( [(300, 300), (6, 6), (6, 340), (1, 690)], 4 ), \
#     ( [(3, 100000), (6, 6), (6, 123456), (1, 690)], 8 ), \
#     ( [(100000, 300), (6, 6), (123456, 6), (1, 690)], 8 ), \
#   ]
#   for test in tests:
#     size, struct = get_record_size_struct(test[0])
#     assert(size == test[1])
# test_record_size()
# ---------------------------------------------------------------------------



# ---------------------------------------------------------------------------
#                       MAIN CODE FOR CREATING TABLES
# ---------------------------------------------------------------------------

unicode_version = ""

# Some of the tables imported from GenerateCommon.py have alternate comment
# strings for use by GenerateUcpHeader. The comments are not wanted here, so
# remove them.

bidi_classes_short = bidi_classes[::2]
bidi_classes_long = bidi_classes[1::2]
break_properties = break_properties[::2]
category_names = category_names[::2]

# Create the various tables from Unicode data files

script = read_table('Unicode.tables/Scripts.txt', make_get_names(script_names), script_names.index('Unknown'))
category = read_table('Unicode.tables/DerivedGeneralCategory.txt', make_get_names(category_names), category_names.index('Cn'))
break_props = read_table('Unicode.tables/GraphemeBreakProperty.txt', make_get_names(break_properties), break_properties.index('Other'))
other_case = read_table('Unicode.tables/CaseFolding.txt', get_other_case, 0)
bidi_class = read_table('Unicode.tables/DerivedBidiClass.txt', get_bidi, bidi_classes_short.index('L'))

# The grapheme breaking rules were changed for Unicode 11.0.0 (June 2018). Now
# we need to find the Extended_Pictographic property for emoji characters. This
# can be set as an additional grapheme break property, because the default for
# all the emojis is "Other". We scan the emoji-data.txt file and modify the
# break_props table.

file = open('Unicode.tables/emoji-data.txt', 'r', encoding='utf-8')
for line in file:
  line = re.sub(r'#.*', '', line)
  chardata = list(map(str.strip, line.split(';')))
  if len(chardata) <= 1:
    continue
  if chardata[1] != "Extended_Pictographic":
    continue
  m = re.match(r'([0-9a-fA-F]+)(\.\.([0-9a-fA-F]+))?$', chardata[0])
  char = int(m.group(1), 16)
  if m.group(3) is None:
    last = char
  else:
    last = int(m.group(3), 16)
  for i in range(char, last + 1):
    if break_props[i] != break_properties.index('Other'):
      print("WARNING: Emoji 0x%x has break property %s, not 'Other'" %
        (i, break_properties[break_props[i]]), file=sys.stderr)
    break_props[i] = break_properties.index('Extended_Pictographic')
file.close()

# Handle script extensions. The get_script_extension() function maintains a
# list of unique script lists (written out later as bitmaps), returning the
# offset in that list. Initialize the list with an empty set, which is used
# for characters that have no script extensions.

script_lists = [[]]
last_script_extension = ""
scriptx_bidi_class = read_table('Unicode.tables/ScriptExtensions.txt', get_script_extension, 0)
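
# Merge the bidi class into the same field as the script extension offset:
# the offset occupies the low 11 bits, and the bidi class is shifted into the
# 5 bits above it. For example, ucp_bidiL (9 in the current tables) gives
# 9 << 11 = 18432 = 0x4800, the value seen in the worked examples in the
# header comment of this script.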
for idx in range(len(scriptx_bidi_class)):
  scriptx_bidi_class[idx] = scriptx_bidi_class[idx] | (bidi_class[idx] << 11)
bidi_class = None

# Find the Boolean properties of each character. This next bit of magic creates
# a list of empty lists. Using [[]] * MAX_UNICODE gives a list of references to
# the *same* list, which is not what we want.

bprops = [[] for _ in range(MAX_UNICODE)]

# Collect the properties from the various files

for filename in bool_propsfiles:
  try:
    file = open('Unicode.tables/' + filename, 'r', encoding='utf-8')
  except IOError:
    print(f"** Couldn't open Unicode.tables/{filename}\n")
    sys.exit(1)

  for line in file:
    line = re.sub(r'#.*', '', line)
    data = list(map(str.strip, line.split(';')))
    if len(data) <= 1:
      continue

    try:
      ix = bool_properties.index(data[1])
    except ValueError:
      continue

    m = re.match(r'([0-9a-fA-F]+)(\.\.([0-9a-fA-F]+))?$', data[0])
    char = int(m.group(1), 16)
    if m.group(3) is None:
      last = char
    else:
      last = int(m.group(3), 16)

    for i in range(char, last + 1):
      bprops[i].append(ix)

  file.close()

# The ASCII property isn't listed in any files, but it is easy enough to add
# it manually.

ix = bool_properties.index("ASCII")
for i in range(128):
  bprops[i].append(ix)

# The Bidi_Mirrored property isn't listed in any property files. We have to
# deduce it from the file that lists the mirrored characters.

ix = bool_properties.index("Bidi_Mirrored")

try:
  file = open('Unicode.tables/BidiMirroring.txt', 'r', encoding='utf-8')
except IOError:
  print("** Couldn't open Unicode.tables/BidiMirroring.txt\n")
  sys.exit(1)

for line in file:
  line = re.sub(r'#.*', '', line)
  data = list(map(str.strip, line.split(';')))
  if len(data) <= 1:
    continue
  c = int(data[0], 16)
  bprops[c].append(ix)

file.close()

# Scan each character's Boolean property list and create a list of unique
# lists; at the same time, record each character's offset into that list in
# the bool_props vector.

bool_props = [0] * MAX_UNICODE
bool_props_lists = [[]]

for c in range(MAX_UNICODE):
  s = set(bprops[c])
  for i in range(len(bool_props_lists)):
    if s == set(bool_props_lists[i]):
      break
  else:
    bool_props_lists.append(bprops[c])
    i += 1

  bool_props[c] = i * bool_props_list_item_size

# This block of code was added by PH in September 2012. It scans the other_case
# table to find sets of more than two characters that must all match each other
# caselessly. Later in this script a table of these sets is written out.
# However, we have to do this work here in order to compute the offsets in the
# table that are inserted into the main table.

# The CaseFolding.txt file lists pairs, but the common logic for reading data
# sets only one value, so first we go through the table and set "return"
# offsets for those that are not already set.

for c in range(MAX_UNICODE):
  if other_case[c] != 0 and other_case[c + other_case[c]] == 0:
    other_case[c + other_case[c]] = -other_case[c]
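
# For example (illustrative only), CaseFolding.txt maps both K (U+004B) and
# the Kelvin sign (U+212A) to k (U+006B). After reading the file,
# other_case[0x4B] is 0x20 and other_case[0x212A] is 0x6B - 0x212A; the loop
# above then sets other_case[0x6B] to -0x20, pointing back at K.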

# Now scan again and create equivalence sets.

caseless_sets = []

for c in range(MAX_UNICODE):
  o = c + other_case[c]

  # Trigger when this character's other case does not point back here. We
  # now have three characters that are case-equivalent.

  if other_case[o] != -other_case[c]:
    t = o + other_case[o]

    # Scan the existing sets to see if any of the three characters are already
    # part of a set. If so, unite the existing set with the new set.

    appended = 0
    for s in caseless_sets:
      found = 0
      for x in s:
        if x == c or x == o or x == t:
          found = 1

      # Add new characters to an existing set. Each character must be tested
      # individually; a single "found" flag shared across the loop would
      # wrongly skip characters once one of them had been matched.

      if found:
        for y in [c, o, t]:
          if y not in s:
            s.append(y)
        appended = 1

    # If we have not added to an existing set, create a new one.

    if not appended:
      caseless_sets.append([c, o, t])

# End of loop looking for caseless sets.

# Now scan the sets and set appropriate offsets for the characters.

caseless_offsets = [0] * MAX_UNICODE

offset = 1
for s in caseless_sets:
  for x in s:
    caseless_offsets[x] = offset
  offset += len(s) + 1
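
# For example (illustrative only), the first caseless set gets offset 1; a
# three-character set occupies four slots (three characters plus a NOTACHAR
# terminator), so the next set starts at offset 5.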

# End of the block of code for creating offsets for caseless matching sets.

# Scan the caseless sets, and for any non-ASCII character that has an ASCII
# character as its "base" other case, remove the other case. This makes it
# easier to handle those characters when the PCRE2 option for not mixing ASCII
# and non-ASCII is enabled. In principle one should perhaps scan for a
# non-ASCII alternative, but in practice these don't exist.

for s in caseless_sets:
  for x in s:
    if x > 127 and x + other_case[x] < 128:
      other_case[x] = 0

# Combine all the tables

table, records = combine_tables(script, category, break_props,
  caseless_offsets, other_case, scriptx_bidi_class, bool_props)

# Find the record size and create a string definition of the structure for
# outputting as a comment.

record_size, record_struct = get_record_size_struct(list(records.keys()))

# Find the optimum block size for the two-stage table

min_size = sys.maxsize
for block_size in [2 ** i for i in range(5, 10)]:
  size = len(records) * record_size
  stage1, stage2 = compress_table(table, block_size)
  size += get_tables_size(stage1, stage2)
  #print("/* block size {:3d} => {:5d} bytes */".format(block_size, size))
  if size < min_size:
    min_size = size
    min_stage1, min_stage2 = stage1, stage2
    min_block_size = block_size


# ---------------------------------------------------------------------------
#                   MAIN CODE FOR WRITING THE OUTPUT FILE
# ---------------------------------------------------------------------------

# Open the output file (no return on failure). This call also writes standard
# header boilerplate.

f = open_output("pcre2_ucd.c")

# Output this file's heading text

f.write("""\
/* This file contains tables of Unicode properties that are extracted from
Unicode data files. See the comments at the start of maint/GenerateUcd.py for
details.

As well as being part of the PCRE2 library, this file is #included by the
pcre2test program, which redefines the PRIV macro to change table names from
_pcre2_xxx to xxxx, thereby avoiding name clashes with the library. At present,
just one of these tables is actually needed. When compiling the library, some
headers are needed. */

#ifndef PCRE2_PCRE2TEST
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif
#include "pcre2_internal.h"
#endif /* PCRE2_PCRE2TEST */

/* The tables herein are needed only when UCP support is built, and in PCRE2
that happens automatically with UTF support. This module should not be
referenced otherwise, so it should not matter whether it is compiled or not.
However a comment was received about space saving - maybe the guy linked all
the modules rather than using a library - so we include a condition to cut out
the tables when not needed. But don't leave a totally empty module because some
compilers barf at that. Instead, just supply some small dummy tables. */

#ifndef SUPPORT_UNICODE
const ucd_record PRIV(ucd_records)[] = {{0,0,0,0,0,0,0}};
const uint16_t PRIV(ucd_stage1)[] = {0};
const uint16_t PRIV(ucd_stage2)[] = {0};
const uint32_t PRIV(ucd_caseless_sets)[] = {0};
#else
\n""")

# --- Output some variable heading stuff ---

f.write("/* Total size: %d bytes, block size: %d. */\n\n" % (min_size, min_block_size))
f.write('const char *PRIV(unicode_version) = "{}";\n\n'.format(unicode_version))

f.write("""\
/* When recompiling tables with a new Unicode version, please check the types
in this structure definition with those in pcre2_internal.h (the actual field
names will be different).
\n""")

f.write(record_struct)

f.write("""
/* If the 32-bit library is run in non-32-bit mode, character values greater
than 0x10ffff may be encountered. For these we set up a special record. */

#if PCRE2_CODE_UNIT_WIDTH == 32
const ucd_record PRIV(dummy_ucd_record)[] = {{
  ucp_Unknown,    /* script */
  ucp_Cn,         /* type unassigned */
  ucp_gbOther,    /* grapheme break property */
  0,              /* case set */
  0,              /* other case */
  0 | (ucp_bidiL << UCD_BIDICLASS_SHIFT), /* script extension and bidi class */
  0,              /* bool properties offset */
  }};
#endif
\n""")

# --- Output the table of caseless character sets ---

f.write("""\
/* This table contains lists of characters that are caseless sets of
more than one character. Each list is terminated by NOTACHAR. */

const uint32_t PRIV(ucd_caseless_sets)[] = {
  NOTACHAR,
""")

for s in caseless_sets:
  s = sorted(s)
  for x in s:
    f.write('  0x%04x,' % x)
  f.write('  NOTACHAR,\n')
f.write('};\n\n')

# --- Other tables are not needed by pcre2test ---

f.write("""\
/* When #included in pcre2test, we don't need the table of digit sets, nor
the large main UCD tables. */

#ifndef PCRE2_PCRE2TEST
\n""")

# --- Read Scripts.txt again for the sets of 10 digits. ---

digitsets = []
file = open('Unicode.tables/Scripts.txt', 'r', encoding='utf-8')

for line in file:
  m = re.match(r'([0-9a-fA-F]+)\.\.([0-9a-fA-F]+)\s+;\s+\S+\s+#\s+Nd\s+', line)
  if m is None:
    continue
  first = int(m.group(1), 16)
  last  = int(m.group(2), 16)
  if ((last - first + 1) % 10) != 0:
    print("ERROR: %04x..%04x does not contain a multiple of 10 characters" % (first, last),
      file=sys.stderr)
  while first < last:
    digitsets.append(first + 9)
    first += 10
file.close()
digitsets.sort()

f.write("""\
/* This table lists the code points for the '9' characters in each set of
decimal digits. It is used to ensure that all the digits in a script run come
from the same set. */

const uint32_t PRIV(ucd_digit_sets)[] = {
""")

f.write("  %d,  /* Number of subsequent values */" % len(digitsets))
count = 8
for d in digitsets:
  if count == 8:
    f.write("\n ")
    count = 0
  f.write(" 0x%05x," % d)
  count += 1
f.write("\n};\n\n")

f.write("""\
/* This vector is a list of script bitsets for the Script Extension property.
The number of 32-bit words in each bitset is #defined in pcre2_ucp.h as
ucd_script_sets_item_size. */

const uint32_t PRIV(ucd_script_sets)[] = {
""")
write_bitsets(script_lists, script_list_item_size)

f.write("""\
/* This vector is a list of bitsets for Boolean properties. The number of
32-bit words in each bitset is #defined as ucd_boolprop_sets_item_size in
pcre2_ucp.h. */

const uint32_t PRIV(ucd_boolprop_sets)[] = {
""")
write_bitsets(bool_props_lists, bool_props_list_item_size)


# Output the main UCD tables.

f.write("""\
/* These are the main two-stage UCD tables. The fields in each record are:
script (8 bits), character type (8 bits), grapheme break property (8 bits),
offset to multichar other cases or zero (8 bits), offset to other case or zero
(32 bits, signed), bidi class (5 bits) and script extension (11 bits) packed
into a 16-bit field, and offset in binary properties table (16 bits). */
\n""")

write_records(records, record_size)
write_table(min_stage1, 'PRIV(ucd_stage1)')
write_table(min_stage2, 'PRIV(ucd_stage2)', min_block_size)

f.write("#if UCD_BLOCK_SIZE != %d\n" % min_block_size)
f.write("""\
#error Please correct UCD_BLOCK_SIZE in pcre2_internal.h
#endif
#endif  /* SUPPORT_UNICODE */

#endif  /* PCRE2_PCRE2TEST */

/* End of pcre2_ucd.c */
""")

f.close()

# End