xref: /aosp_15_r20/frameworks/ex/common/tools/make-iana-tld-pattern.py (revision 51f0e3d5878722200615b7170faeca299e15103d)
1*51f0e3d5SAndroid Build Coastguard Worker#!/usr/bin/env python
2*51f0e3d5SAndroid Build Coastguard Worker
3*51f0e3d5SAndroid Build Coastguard Workerfrom urllib2 import urlopen
4*51f0e3d5SAndroid Build Coastguard Workerfrom datetime import date
5*51f0e3d5SAndroid Build Coastguard Worker
# Location of the IANA top-level-domain list.  The script downloads this URL
# and also substitutes it for the {url} placeholder in the templates below.
URL='http://data.iana.org/TLD/tlds-alpha-by-domain.txt'

# Java text printed ahead of the TOP_LEVEL_DOMAIN_STR pattern.  makePattern()
# fills the {gen_date} and {url} placeholders through str.format().
TLD_PREFIX = r"""
    /**
     *  Regular expression to match all IANA top-level domains.
     *  List accurate as of {gen_date}.  List taken from:
     *  {url}
     *  This pattern is auto-generated by frameworks/ex/common/tools/make-iana-tld-pattern.py
     */
    public static final String TOP_LEVEL_DOMAIN_STR =
"""
# Appended after the last bucket of the plain pattern: closing quote of the
# Java string literal followed by the statement terminator.
TLD_SUFFIX = '";'

# Java text printed ahead of the TOP_LEVEL_DOMAIN_STR_FOR_WEB_URL pattern.
# Same placeholders as TLD_PREFIX; it additionally emits the first string
# fragment, "(?:", so the buckets that follow are joined on with '+'.
URL_PREFIX = r"""
    /**
     *  Regular expression to match all IANA top-level domains for WEB_URL.
     *  List accurate as of {gen_date}.  List taken from:
     *  {url}
     *  This pattern is auto-generated by frameworks/ex/common/tools/make-iana-tld-pattern.py
     */
    public static final String TOP_LEVEL_DOMAIN_STR_FOR_WEB_URL =
        "(?:"
"""

# Statement terminator for the WEB_URL variant; makePattern() itself supplies
# the closing quote in that mode.
URL_SUFFIX = ';'
# Indentation placed at the start of every emitted pattern line.
TAB = '        '
32*51f0e3d5SAndroid Build Coastguard Worker
class BucketOutput:
    """Text accumulator for one bucket of the generated Java pattern.

    The buffer starts with TAB and grows through ``+=``.  ``lineLength``
    tracks how many characters have been appended to the current output line
    so that addPipe() can decide when to start a new Java string fragment.
    """

    def __init__(self):
        self.buffer = TAB
        self.lineLength = len(TAB)

    def __iadd__(self, other):
        """Append ``other`` to the buffer and count it toward the line length."""
        self.buffer = self.buffer + other
        self.lineLength = self.lineLength + len(other)
        return self

    def addPipe(self):
        """Append the '|' alternation separator, wrapping the line first if needed.

        Once more than 90 characters are on the current line, the open string
        literal is closed and a new ``+ "`` fragment is started on the next
        line.
        """
        if self.lineLength > 90:
            # Written straight to the buffer: the '+ "' continuation prefix is
            # deliberately left out of lineLength, which restarts at len(TAB).
            self.buffer = ''.join((self.buffer, '"\n', TAB, '+ "'))
            self.lineLength = len(TAB)

        self.__iadd__('|')

    def value(self):
        """Return everything accumulated so far."""
        return self.buffer
54*51f0e3d5SAndroid Build Coastguard Worker
class Bucket:
    """All top-level domains that share one leading character.

    Two-character entries are stored as just their second character in
    ``letters``; every other entry is stored whole in ``words``.
    """

    def __init__(self, baseLetter):
        self.base = baseLetter
        self.words = []
        self.letters = []

    def dump(self, isWebUrl=False, isFirst=False, isLast=False):
        """Render this bucket as one Java string fragment of the pattern.

        isWebUrl -- use non-capturing ``(?:`` groups and the WEB_URL opener.
        isFirst  -- this is the first bucket, so no leading '|' is emitted.
        isLast   -- leave the string literal open (no closing quote/newline).

        Returns '' when the bucket holds nothing.  Sorts ``words`` and
        ``letters`` in place as a side effect.
        """
        if not self.words and not self.letters:
            return ''

        self.words.sort()
        self.letters.sort()

        output = BucketOutput()
        hasWords = len(self.words) != 0

        # Opening of the fragment: only the very first bucket omits the '|'.
        if not isFirst:
            output += '+ "|'
        elif isWebUrl:
            output += '+ "'
        else:
            output += '"('

        # A group is only opened when there are whole words to alternate over.
        if hasWords:
            output += '(?:' if isWebUrl else '('

        for position, word in enumerate(self.words):
            if position:
                output.addPipe()
            # Each '-' is preceded by two backslash characters in the output.
            output += word.replace('-', '\\\\-')

        if hasWords and self.letters:
            output.addPipe()

        # Two-letter domains: a single one is spelled out, several become a
        # character class after the shared first letter.
        if len(self.letters) == 1:
            output += '%c%c' % (self.base, self.letters[0])
        elif self.letters:
            output += '%c[' % self.base
            output += ''.join(self.letters)
            output += ']'

        if hasWords:
            output += ')'

        if not isLast:
            output += '"\n'

        return output.value()

    def add(self, line):
        """File one entry into the bucket, ignoring blanks and '#' comments."""
        if not line or line.startswith('#'):
            return

        if len(line) == 2:
            self.letters.append(line[1:2])
            return

        self.words.append(line)
126*51f0e3d5SAndroid Build Coastguard Worker
def getBucket(buckets, line):
    """Return the Bucket keyed by the first character of ``line``.

    A new Bucket is created and stored in ``buckets`` when that key has no
    entry yet (or maps to None).
    """
    key = line[0]
    existing = buckets.get(key)
    if existing is not None:
        return existing

    created = Bucket(key)
    buckets[key] = created
    return created
136*51f0e3d5SAndroid Build Coastguard Worker
def makePattern(prefix, suffix, buckets, isWebUrl=False):
    """Print one complete Java pattern declaration to stdout.

    prefix   -- template text with {gen_date} and {url} placeholders; they
                are filled with today's date and the module-level URL.
    suffix   -- text appended after the final closing parenthesis.
    buckets  -- dict mapping a first letter to its Bucket; buckets for
                missing letters are created (empty) by getBucket().
    isWebUrl -- selects the WEB_URL flavour of each bucket and of the
                closing text.

    Buckets are emitted for 'a' through 'z' in order; 'a' is flagged as the
    first fragment and 'z' as the last.
    """
    pieces = [prefix.format(gen_date=date.today(), url=URL)]

    pieces.append(getBucket(buckets, 'a').dump(isFirst=True, isWebUrl=isWebUrl))

    # 'b' through 'y'; the upper bound of range() is exclusive and 'z' is
    # handled separately below because it must be flagged as last.
    for code in range(ord('b'), ord('z')):
        pieces.append(getBucket(buckets, chr(code)).dump(isWebUrl=isWebUrl))

    pieces.append(getBucket(buckets, 'z').dump(isLast=True, isWebUrl=isWebUrl))

    # The WEB_URL flavour closes two groups and its own string literal here;
    # the plain flavour closes one group and leaves the quote to the suffix.
    pieces.append('))"' if isWebUrl else ')')
    pieces.append(suffix)

    # Parenthesised single-argument form prints identically under the
    # Python 2 print statement and the Python 3 print function.
    print(''.join(pieces))
155*51f0e3d5SAndroid Build Coastguard Worker
def escapeForJava(text):
    """Return ``text`` with each non-ASCII character as a Java unicode escape.

    ASCII characters are copied unchanged.  Every other character becomes a
    backslash, the letter u and four lowercase hex digits, which is the only
    form a Java string literal accepts.  Code points above 0xFFFF are written
    as a UTF-16 surrogate pair (two such escapes).
    """
    pieces = []
    for ch in text:
        code = ord(ch)
        if code < 0x80:
            pieces.append(str(ch))
        elif code > 0xFFFF:
            # Java escapes name UTF-16 code units, so split into surrogates.
            code -= 0x10000
            pieces.append('\\u%04x\\u%04x'
                          % (0xD800 + (code >> 10), 0xDC00 + (code & 0x3FF)))
        else:
            pieces.append('\\u%04x' % code)
    return ''.join(pieces)


def main():
    """Download the IANA TLD list and print both Java pattern declarations.

    Each line of the list is lower-cased, stripped and filed into the bucket
    for its first character.  Entries starting with 'xn--' are additionally
    punycode-decoded and filed, in escaped form, into the same bucket as the
    'xn--' names themselves.

    NOTE: relies on Python 2 (``urllib2`` and ``str.decode``).
    """
    stream = urlopen(URL)
    try:
        domains = stream.readlines()
    finally:
        # Release the connection even if the read fails part-way.
        stream.close()

    buckets = {}

    for domain in domains:
        domain = domain.strip().lower()

        # Blank lines and the '#' header carry no domain.
        if not domain or domain.startswith('#'):
            continue

        getBucket(buckets, domain[0]).add(domain)

        if domain.startswith('xn--'):
            decoded = domain[4:].decode('punycode')
            # repr() would render Latin-1 characters as \xNN, which is not a
            # valid escape in Java source; always emit \uXXXX instead.
            getBucket(buckets, 'xn--').add(escapeForJava(decoded))

    makePattern(TLD_PREFIX, TLD_SUFFIX, buckets, isWebUrl=False)
    makePattern(URL_PREFIX, URL_SUFFIX, buckets, isWebUrl=True)


if __name__ == "__main__":
    main()
177