1#! /usr/bin/env python3
2# This script generates Lib/re/_casefix.py.
3
4import collections
5import re
6import sys
7import unicodedata
8
def update_file(file, content):
    """Write *content* to *file* unless the file already holds exactly that text.

    The file is read and written as UTF-8.  If the existing file cannot be
    opened, read or decoded (OSError / ValueError), it is treated as
    different from *content* and is overwritten.

    Returns True when the file was (re)written, False when it was left alone.
    """
    try:
        with open(file, encoding='utf-8') as existing:
            unchanged = existing.read() == content
    except (OSError, ValueError):
        # Missing, unreadable or undecodable file: fall through and rewrite.
        unchanged = False

    if unchanged:
        return False

    with open(file, mode='w', encoding='utf-8') as target:
        target.write(content)
    return True
19
# Text of the generated module.  main() fills the single %s placeholder with
# the formatted entries of the _EXTRA_CASES dict, joined by newlines.
re_casefix_template = """\
# Auto-generated by Tools/scripts/generate_re_casefix.py.

# Maps the code of lowercased character to codes of different lowercased
# characters which have the same uppercase.
_EXTRA_CASES = {
%s
}
"""
29
def uname(i):
    """Return the Unicode name of code point *i*.

    Code points without a name in the Unicode database are rendered as
    ``U+XXXX`` (uppercase hex, zero-padded to at least four digits).
    """
    char = chr(i)
    fallback = 'U+%04X' % i
    return unicodedata.name(char, fallback)
32
class hexint(int):
    """An int whose repr is ``0x``-prefixed lowercase hex, zero-padded to six characters."""

    def __repr__(self):
        # '#' adds the 0x prefix; width 6 with zero fill yields e.g. 0x0041.
        return format(int(self), '#06x')
36
def alpha(i):
    """Return a printable form of code point *i*.

    Alphabetic characters are returned as-is; anything else is returned as
    its ascii() representation with the surrounding quotes stripped.
    """
    char = chr(i)
    if char.isalpha():
        return char
    quoted = ascii(char)
    return quoted[1:-1]
40
41
def main(outfile='Lib/re/_casefix.py'):
    """Compute the ``_EXTRA_CASES`` table and write it to *outfile*.

    For every group of characters sharing the same uppercase form, the
    lowercase code points of the group are collected; each such code is then
    mapped to the other lowercase codes of its group.  The rendered table is
    written through update_file(), so an up-to-date file is left untouched.

    If any group contains a code point above 0xffff, the affected codes are
    reported on stderr and the process exits with status 1.
    """
    # Bucket every code point under its uppercase form.
    by_upper = collections.defaultdict(list)
    for code in range(sys.maxunicode + 1):
        char = chr(code)
        by_upper[char.upper()].append(char)

    # For each bucket with several members, collect the distinct lowercase
    # codes; keep the (sorted) group only if it has more than one code.
    lower_code_groups = []
    for chars in by_upper.values():
        if len(chars) <= 1:
            continue
        lower_codes = {ord(char.lower()) for char in chars}
        if len(lower_codes) > 1:
            lower_code_groups.append(sorted(lower_codes))

    # Groups touching code points above 0xffff are reported as errors.
    unsupported = []
    for group in lower_code_groups:
        astral = next((code for code in group if code > 0xffff), None)
        if astral is None:
            continue
        unsupported.extend(group)
        try:
            # ord() raises TypeError when the uppercase form is not a
            # single character; such forms are simply not reported.
            unsupported.append(ord(chr(astral).upper()))
        except (ValueError, TypeError):
            pass
    if unsupported:
        print('Case-insensitive matching may not work correctly for character:',
              file=sys.stderr)
        for code in sorted(unsupported):
            print("  '%s' (U+%04x, %s)" % (alpha(code), code, uname(code)),
                  file=sys.stderr)
        sys.exit(1)

    # Map each lowercase code to the other codes of its group.
    extra_cases = {}
    for group in lower_code_groups:
        for code in group:
            extra_cases[code] = tuple(other for other in group
                                      if other != code)

    # Render two lines per entry: a comment with the character names, then
    # the dict item itself followed by the characters as a trailing comment.
    lines = []
    for code in sorted(extra_cases):
        others = extra_cases[code]
        code_name = uname(code)
        other_names = ', '.join(uname(other) for other in others)
        lines.append('    # %s: %s' % (code_name, other_names))
        lines.append("    %r: %r, # '%s': '%s'" % (
            hexint(code),
            tuple(hexint(other) for other in others),
            alpha(code),
            ''.join(alpha(other) for other in others),
        ))

    update_file(outfile, re_casefix_template % '\n'.join(lines))
91
92
if __name__ == '__main__':
    # Command-line arguments are forwarded positionally to main(), so an
    # optional first argument overrides the default output path.  ``sys`` is
    # already imported at module level, so no local re-import is needed.
    main(*sys.argv[1:])
96