#! /usr/bin/env python3
# This script generates Lib/re/_casefix.py.

import collections
import re
import sys
import unicodedata


def update_file(file, content):
    """Write *content* to *file* unless the file already holds exactly that.

    Returns False when the existing file content equals *content* (nothing
    is written), and True after the file has been (re)written.
    """
    try:
        with open(file, 'r', encoding='utf-8') as fobj:
            if fobj.read() == content:
                return False
    except (OSError, ValueError):
        # Missing, unreadable or undecodable file (UnicodeDecodeError is a
        # ValueError): fall through and rewrite it.
        pass
    with open(file, 'w', encoding='utf-8') as fobj:
        fobj.write(content)
    return True


re_casefix_template = """\
# Auto-generated by Tools/scripts/generate_re_casefix.py.

# Maps the code of lowercased character to codes of different lowercased
# characters which have the same uppercase.
_EXTRA_CASES = {
%s
}
"""


def uname(i):
    """Return the Unicode name of code point *i*, or 'U+XXXX' if unnamed."""
    return unicodedata.name(chr(i), r'U+%04X' % i)


class hexint(int):
    """An int whose repr is '0x'-prefixed hex, zero-padded to 4 digits."""

    def __repr__(self):
        return '%#06x' % self


def alpha(i):
    """Return a printable form of code point *i*.

    Alphabetic characters are returned as is; anything else is returned as
    its ascii() escape with the surrounding quotes stripped.
    """
    c = chr(i)
    return c if c.isalpha() else ascii(c)[1:-1]


def _equivalent_lower_codes(chars):
    """Return sorted lists of lowercase codes that share one uppercase.

    *chars* is an iterable of single characters.  Only groups that still
    contain more than one distinct code after lowercasing are returned.
    """
    # Find sets of characters which have the same uppercase.
    by_upper = collections.defaultdict(list)
    for c in chars:
        by_upper[c.upper()].append(c)

    # List of codes of lowercased characters which have the same uppercase.
    groups = []
    for same_upper in by_upper.values():
        if len(same_upper) > 1:
            codes = {ord(c.lower()) for c in same_upper}
            if len(codes) > 1:
                groups.append(sorted(codes))
    return groups


def _find_bad_codes(groups):
    """Return the codes of every group containing a code above 0xffff.

    For each such group, the code of the uppercase form of its first code
    above 0xffff is included as well when that form is a single character.
    """
    bad_codes = []
    for t in groups:
        for i in t:
            if i > 0xffff:
                bad_codes.extend(t)
                try:
                    bad_codes.append(ord(chr(i).upper()))
                except (ValueError, TypeError):
                    # ord() rejects an uppercase form that is longer than
                    # one character; there is no single code to report.
                    pass
                break
    return bad_codes


def _format_items(groups):
    """Return the source lines of the _EXTRA_CASES entries for *groups*.

    Every code is mapped to the other codes of its group.  Each entry is
    rendered as a comment line with the character names followed by the
    dict item itself.
    """
    mapping = {i: tuple(j for j in t if i != j)
               for t in groups
               for i in t}

    items = []
    for i, t in sorted(mapping.items()):
        items.append('    # %s: %s' % (
            uname(i),
            ', '.join(map(uname, t)),
        ))
        items.append("    %r: %r, # '%s': '%s'" % (
            hexint(i),
            tuple(map(hexint, t)),
            alpha(i),
            ''.join(map(alpha, t)),
        ))
    return items


def main(outfile='Lib/re/_casefix.py'):
    """Generate the case-fix table and write it to *outfile*.

    Scans every code point up to sys.maxunicode.  If any group of equivalent
    lowercase codes contains a code above 0xffff, the affected characters
    are listed on stderr and the script exits with status 1 without writing
    *outfile*.
    """
    groups = _equivalent_lower_codes(map(chr, range(sys.maxunicode + 1)))

    bad_codes = _find_bad_codes(groups)
    if bad_codes:
        print('Case-insensitive matching may not work correctly for character:',
              file=sys.stderr)
        for i in sorted(bad_codes):
            print("  '%s' (U+%04x, %s)" % (alpha(i), i, uname(i)),
                  file=sys.stderr)
        sys.exit(1)

    update_file(outfile, re_casefix_template % '\n'.join(_format_items(groups)))


if __name__ == '__main__':
    main(*sys.argv[1:])