# xref: /aosp_15_r20/prebuilts/build-tools/common/py3-stdlib/urllib/robotparser.py (revision cda5da8d549138a6648c5ee6d7a49cf8f4a657be)
""" robotparser.py

    Copyright (C) 2000  Bastian Kleineidam

    You can choose between two licenses when using this package:
    1) GNU GPLv2
    2) PSF license for Python 2.2

    The robots.txt Exclusion Protocol is implemented as specified in
    http://www.robotstxt.org/norobots-rfc.txt
"""
12*cda5da8dSAndroid Build Coastguard Worker
import collections
import urllib.error
import urllib.parse
import urllib.request
16*cda5da8dSAndroid Build Coastguard Worker
# Only the parser class is public API; RuleLine and Entry are helpers.
__all__ = ["RobotFileParser"]
18*cda5da8dSAndroid Build Coastguard Worker
# Parsed value of a "Request-rate: <requests>/<seconds>" line.
RequestRate = collections.namedtuple("RequestRate", "requests seconds")
20*cda5da8dSAndroid Build Coastguard Worker
21*cda5da8dSAndroid Build Coastguard Worker
class RobotFileParser:
    """Read, parse and answer questions about a single robots.txt file.

    Point the parser at a URL (constructor argument or set_url()), load
    the rules with read() or parse(), then query them with can_fetch(),
    crawl_delay(), request_rate() and site_maps().
    """

    def __init__(self, url=''):
        """Create a parser for the robots.txt file located at *url*."""
        self.entries = []          # entries that name specific user agents
        self.sitemaps = []         # values of every "Sitemap:" line seen
        self.default_entry = None  # first entry that names the "*" agent
        self.disallow_all = False  # set by read() on HTTP 401/403
        self.allow_all = False     # set by read() on any other HTTP 4xx
        self.set_url(url)
        self.last_checked = 0      # 0 means "nothing parsed yet"

    def mtime(self):
        """Returns the time the robots.txt file was last fetched.

        This is useful for long-running web spiders that need to
        check for new robots.txt files periodically.

        """
        return self.last_checked

    def modified(self):
        """Sets the time the robots.txt file was last fetched to the
        current time.

        """
        import time
        self.last_checked = time.time()

    def set_url(self, url):
        """Sets the URL referring to a robots.txt file."""
        self.url = url
        # Elements 1 and 2 of the parse result are netloc and path.
        self.host, self.path = urllib.parse.urlparse(url)[1:3]

    def read(self):
        """Reads the robots.txt URL and feeds it to the parser.

        An HTTP 401 or 403 answer marks every URL as disallowed, any other
        4xx answer marks every URL as allowed; other HTTP errors change
        nothing.  Exceptions other than HTTPError raised by urlopen()
        propagate to the caller.
        """
        try:
            f = urllib.request.urlopen(self.url)
        except urllib.error.HTTPError as err:
            try:
                if err.code in (401, 403):
                    self.disallow_all = True
                elif 400 <= err.code < 500:
                    self.allow_all = True
            finally:
                # HTTPError doubles as the response object; release it.
                err.close()
        else:
            # Close the response even if reading it fails.
            with f:
                raw = f.read()
            self.parse(raw.decode("utf-8").splitlines())

    def _add_entry(self, entry):
        """Store *entry* as the default entry or as an agent-specific one."""
        if "*" in entry.useragents:
            # the default entry is considered last
            if self.default_entry is None:
                # the first default entry wins
                self.default_entry = entry
        else:
            self.entries.append(entry)

    def parse(self, lines):
        """Parse the input lines from a robots.txt file.

        We allow that a user-agent: line is not preceded by
        one or more blank lines.

        *lines* is an iterable of strings, one per line of the file.
        Parsed entries are added to the ones already held, and the
        last-fetched time is updated.
        """
        # states:
        #   0: start state
        #   1: saw user-agent line
        #   2: saw an allow, disallow, crawl-delay or request-rate line
        state = 0
        entry = Entry()

        self.modified()
        for line in lines:
            if not line:
                # A blank line ends the current record: keep it if it has
                # rules (state 2), drop it if it only named agents (state 1).
                if state == 1:
                    entry = Entry()
                    state = 0
                elif state == 2:
                    self._add_entry(entry)
                    entry = Entry()
                    state = 0
            # remove optional comment and strip line
            line = line.partition('#')[0].strip()
            if not line:
                continue
            key, colon, value = line.partition(':')
            if not colon:
                # not a "field: value" line; ignore it
                continue
            key = key.strip().lower()
            value = urllib.parse.unquote(value.strip())
            if key == "user-agent":
                if state == 2:
                    # a new record starts without a separating blank line
                    self._add_entry(entry)
                    entry = Entry()
                entry.useragents.append(value)
                state = 1
            elif key == "disallow":
                if state != 0:
                    entry.rulelines.append(RuleLine(value, False))
                    state = 2
            elif key == "allow":
                if state != 0:
                    entry.rulelines.append(RuleLine(value, True))
                    state = 2
            elif key == "crawl-delay":
                if state != 0:
                    # Validate before converting so that a malformed value
                    # is skipped instead of raising.  isdecimal() is used
                    # rather than isdigit() because the latter also accepts
                    # characters such as superscript digits, which int()
                    # rejects with ValueError.
                    if value.strip().isdecimal():
                        entry.delay = int(value)
                    state = 2
            elif key == "request-rate":
                if state != 0:
                    numbers = value.split('/')
                    # check if all values are sane (see crawl-delay above
                    # for why isdecimal() is the right test)
                    if (len(numbers) == 2
                            and numbers[0].strip().isdecimal()
                            and numbers[1].strip().isdecimal()):
                        entry.req_rate = RequestRate(int(numbers[0]),
                                                     int(numbers[1]))
                    state = 2
            elif key == "sitemap":
                # According to http://www.sitemaps.org/protocol.html
                # "This directive is independent of the user-agent line,
                #  so it doesn't matter where you place it in your file."
                # Therefore we do not change the state of the parser.
                self.sitemaps.append(value)
        if state == 2:
            # the file ended without a trailing blank line
            self._add_entry(entry)

    def can_fetch(self, useragent, url):
        """using the parsed robots.txt decide if useragent can fetch url"""
        if self.disallow_all:
            return False
        if self.allow_all:
            return True
        # Until the robots.txt file has been read or found not
        # to exist, we must assume that no url is allowable.
        # This prevents false positives when a user erroneously
        # calls can_fetch() before calling read().
        if not self.last_checked:
            return False
        # search for given user agent matches
        # the first match counts
        parsed_url = urllib.parse.urlparse(urllib.parse.unquote(url))
        # Drop scheme and host, then re-quote so that the value is
        # comparable with the quoted paths stored in the rule lines.
        url = urllib.parse.urlunparse(('', '', parsed_url.path,
                                       parsed_url.params, parsed_url.query,
                                       parsed_url.fragment))
        url = urllib.parse.quote(url)
        if not url:
            url = "/"
        for entry in self.entries:
            if entry.applies_to(useragent):
                return entry.allowance(url)
        # try the default entry last
        if self.default_entry:
            return self.default_entry.allowance(url)
        # agent not found ==> access granted
        return True

    def crawl_delay(self, useragent):
        """Return the Crawl-delay value that applies to *useragent*.

        Returns None when nothing has been parsed yet, when no entry
        applies, or when the applicable entry has no valid Crawl-delay.
        """
        if not self.mtime():
            return None
        for entry in self.entries:
            if entry.applies_to(useragent):
                return entry.delay
        if self.default_entry:
            return self.default_entry.delay
        return None

    def request_rate(self, useragent):
        """Return the Request-rate that applies to *useragent*.

        The value is a RequestRate(requests, seconds) named tuple, or None
        when nothing has been parsed yet, when no entry applies, or when
        the applicable entry has no valid Request-rate.
        """
        if not self.mtime():
            return None
        for entry in self.entries:
            if entry.applies_to(useragent):
                return entry.req_rate
        if self.default_entry:
            return self.default_entry.req_rate
        return None

    def site_maps(self):
        """Return the list of Sitemap values seen, or None if there are none."""
        if not self.sitemaps:
            return None
        return self.sitemaps

    def __str__(self):
        """Return the stored entries as text, the default entry last."""
        entries = self.entries
        if self.default_entry is not None:
            entries = entries + [self.default_entry]
        return '\n\n'.join(map(str, entries))
213*cda5da8dSAndroid Build Coastguard Worker
214*cda5da8dSAndroid Build Coastguard Worker
class RuleLine:
    """A rule line is a single "Allow:" (allowance==True) or "Disallow:"
       (allowance==False) followed by a path.

       The path is stored percent-quoted, except for the bare wildcard
       "*", which is kept as is and matches every filename."""
    def __init__(self, path, allowance):
        if path == '' and not allowance:
            # an empty value means allow all
            allowance = True
        path = urllib.parse.urlunparse(urllib.parse.urlparse(path))
        # quote() would turn the bare wildcard into "%2A", which made the
        # "*" test in applies_to() unreachable; keep it unquoted instead.
        self.path = path if path == "*" else urllib.parse.quote(path)
        self.allowance = allowance

    def applies_to(self, filename):
        """Return True if this rule covers *filename* (prefix match)."""
        return self.path == "*" or filename.startswith(self.path)

    def __str__(self):
        return ("Allow" if self.allowance else "Disallow") + ": " + self.path
231*cda5da8dSAndroid Build Coastguard Worker
232*cda5da8dSAndroid Build Coastguard Worker
class Entry:
    """An entry has one or more user-agents and zero or more rulelines"""
    def __init__(self):
        self.useragents = []   # agent names as written in the file
        self.rulelines = []    # rule objects, in file order
        self.delay = None      # Crawl-delay value, if one was set
        self.req_rate = None   # Request-rate value, if one was set

    def __str__(self):
        """Render the entry in robots.txt syntax."""
        ret = [f"User-agent: {agent}" for agent in self.useragents]
        if self.delay is not None:
            ret.append(f"Crawl-delay: {self.delay}")
        if self.req_rate is not None:
            rate = self.req_rate
            ret.append(f"Request-rate: {rate.requests}/{rate.seconds}")
        ret.extend(map(str, self.rulelines))
        return '\n'.join(ret)

    def applies_to(self, useragent):
        """check if this entry applies to the specified agent"""
        # split the name token and make it lower case
        useragent = useragent.split("/")[0].lower()
        # "*" is the catch-all agent; any other name matches when it occurs
        # (case-insensitively) anywhere inside the name token.
        return any(agent == '*' or agent.lower() in useragent
                   for agent in self.useragents)

    def allowance(self, filename):
        """Return whether *filename* may be fetched under this entry.

        Preconditions:
        - our agent applies to this entry
        - filename is quoted the same way as the rule paths; can_fetch()
          re-quotes the URL with urllib.parse.quote() before calling

        The first rule line that applies decides; if none applies the
        filename is allowed."""
        for line in self.rulelines:
            if line.applies_to(filename):
                return line.allowance
        return True
274