#! /usr/bin/python

# PCRE2 UNICODE PROPERTY SUPPORT
# ------------------------------
#
# This file auto-generates unicode property tests and their expected output.
# It is recommended to re-run this generator after the unicode files are
# updated. The names of the generated files are `testinput26` and `testoutput26`
#
# Usage: an optional single argument names the directory the two files are
# written to; with no argument they are written to the current directory.
# The script reads Unicode.tables/Scripts.txt and
# Unicode.tables/ScriptExtensions.txt relative to the current directory.

import re
import sys

from GenerateCommon import \
    script_names, \
    script_abbrevs


def write_both(text):
    """Write `text` to both the generated test input and expected output."""
    input_file.write(text)
    output_file.write(text)


def to_string_char(ch_idx):
    """Return the subject-line spelling of the code point `ch_idx`.

    Code points 32..127 are returned as the literal character. Everything
    else is written as a \\x{...} hex escape; values below 16 get a leading
    zero so that the escape always has at least two hex digits.
    """
    if ch_idx < 128:
        if ch_idx < 16:
            return "\\x{0%x}" % ch_idx
        if ch_idx >= 32:
            return chr(ch_idx)
    return "\\x{%x}" % ch_idx


def _write_test(prop, ch_idx, expect_match=True):
    """Emit one pattern/subject pair and its expected result.

    The pattern and the subject line go to both files; the result line
    (" 0: <subject>" or "No match") goes to the expected output only.
    """
    subject = to_string_char(ch_idx)
    write_both("/^\\p{%s}/utf\n" % prop)
    # NOTE(review): the leading indent of the subject line is copied from the
    # reviewed text, in which runs of whitespace had been collapsed. Confirm
    # its width against the checked-in testinput26 before regenerating.
    write_both(" %s\n" % subject)
    if expect_match:
        output_file.write(" 0: %s\n" % subject)
    else:
        output_file.write("No match\n")
    write_both("\n")


# ---------------------------------------------------------------------------
#                      UNICODE SCRIPT EXTENSION TESTS
# ---------------------------------------------------------------------------

def gen_script_tests():
    """Generate the Script / Script_Extensions tests for every known script.

    Each script gets a five-slot record:
      [0] low end of the first Scripts.txt range listed for the script
      [1] high end of the last Scripts.txt range listed for the script
      [2] lowest code point of any ScriptExtensions.txt range naming it
      [3] highest code point of any ScriptExtensions.txt range naming it
      [4] first code point found in such a range whose base script differs,
          i.e. a character that belongs to the script by extension only
    Slots 2..4 stay None when no extension data mentions the script.
    """
    script_data = [None] * len(script_names)
    # Base script name of every code point, None where Scripts.txt is silent.
    char_data = [None] * 0x110000

    # Matches "XXXX ; Name #" and "XXXX..YYYY ; Name #" data lines; comment
    # and blank lines do not match and are skipped.
    property_re = re.compile("^([0-9A-F]{4,6})(?:\\.\\.([0-9A-F]{4,6}))? +; ([A-Za-z_ ]+) #")
    prev_name = ""
    script_idx = -1

    with open("Unicode.tables/Scripts.txt") as f:
        for line in f:
            match_obj = property_re.match(line)
            if match_obj is None:
                continue

            name = match_obj.group(3)
            # Consecutive lines usually share a script, so only look the
            # index up again when the name changes.
            if name != prev_name:
                script_idx = script_names.index(name)
                prev_name = name

            low = int(match_obj.group(1), 16)
            high = low
            if match_obj.group(2) is not None:
                high = int(match_obj.group(2), 16)

            for code_point in range(low, high + 1):
                char_data[code_point] = name

            if script_data[script_idx] is None:
                script_data[script_idx] = [low, None, None, None, None]
            # Overwritten on every line, so this ends up as the high end of
            # the last range listed for the script.
            script_data[script_idx][1] = high

    extended_script_indices = {}

    with open("Unicode.tables/ScriptExtensions.txt") as f:
        for line in f:
            match_obj = property_re.match(line)
            if match_obj is None:
                continue

            low = int(match_obj.group(1), 16)
            high = low
            if match_obj.group(2) is not None:
                high = int(match_obj.group(2), 16)

            # The third field is a space separated list of abbreviations.
            for abbrev in match_obj.group(3).split(" "):
                idx = extended_script_indices.get(abbrev)
                first_seen = idx is None
                if first_seen:
                    idx = script_abbrevs.index(abbrev)
                    extended_script_indices[abbrev] = idx

                rec = script_data[idx]
                if rec is None:
                    # Without base data there is no record to extend; report
                    # it instead of failing with a TypeError.
                    print("No Scripts.txt data found for %s" % script_names[idx])
                    continue

                if first_seen:
                    rec[2] = low
                    rec[3] = high
                else:
                    if rec[2] > low:
                        rec[2] = low
                    if rec[3] < high:
                        rec[3] = high

                if rec[4] is None:
                    name = script_names[idx]
                    # A separate loop variable keeps the script index intact.
                    for code_point in range(low, high + 1):
                        if char_data[code_point] != name:
                            rec[4] = code_point
                            break

    # Alternate between the short and the long property name so that both
    # spellings are exercised.
    long_property_name = False

    for idx, rec in enumerate(script_data):
        script_name = script_names[idx]

        if script_name == "Unknown":
            continue

        if rec is None:
            # No tests can be generated without at least one base range.
            print("No Scripts.txt data found for %s" % script_name)
            continue

        script_abbrev = script_abbrevs[idx]

        write_both("# Base script check\n")
        _write_test("sc=%s" % script_name, rec[0])
        _write_test("Script=%s" % script_abbrev, rec[1])

        if rec[2] is not None:
            property_name = "scx"
            if long_property_name:
                property_name = "Script_Extensions"

            write_both("# Script extension check\n")
            _write_test(script_name, rec[2])
            _write_test("%s=%s" % (property_name, script_abbrev), rec[3])

            long_property_name = not long_property_name

            if rec[4] is not None:
                # Matches by extension, but must not match the base script.
                write_both("# Script extension only character\n")
                _write_test(script_name, rec[4])
                _write_test("sc=%s" % script_name, rec[4], expect_match=False)
            else:
                print("External character has not found for %s" % script_name)

        # The code point just past the highest recorded one is used as the
        # non-matching subject.
        # NOTE(review): this relies on rec[1] being the script's highest base
        # code point, i.e. on Scripts.txt listing each script's ranges in
        # ascending order - confirm.
        high = rec[1]
        if rec[3] is not None and rec[3] > rec[1]:
            high = rec[3]
        write_both("# Character not in script\n")
        _write_test(script_name, high + 1, expect_match=False)


output_directory = ""

if len(sys.argv) > 2:
    print('** Too many arguments: just give a directory name')
    sys.exit(1)
if len(sys.argv) == 2:
    output_directory = sys.argv[1]
    if not output_directory.endswith("/"):
        output_directory += "/"

try:
    input_file = open(output_directory + "testinput26", "w")
    output_file = open(output_directory + "testoutput26", "w")
except IOError:
    print("** Couldn't open output files")
    sys.exit(1)

# Close (and thereby flush) both files even if generation fails part way.
try:
    write_both("# These tests are generated by maint/GenerateTest26.py, do not edit.\n\n")
    write_both("# Unicode Script Extension tests.\n\n")

    gen_script_tests()

    write_both("# End of testinput26\n")
finally:
    input_file.close()
    output_file.close()