""" robotparser.py

    Copyright (C) 2000  Bastian Kleineidam

    You can choose between two licenses when using this package:
    1) GNU GPLv2
    2) PSF license for Python 2.2

    The robots.txt Exclusion Protocol is implemented as specified in
    http://www.robotstxt.org/norobots-rfc.txt
"""

import collections
import time
import urllib.error
import urllib.parse
import urllib.request

__all__ = ["RobotFileParser"]

RequestRate = collections.namedtuple("RequestRate", "requests seconds")


def _to_int(text):
    """Return *text* as an int, or None if it is not made of decimal digits.

    str.isdecimal() is used rather than str.isdigit(): isdigit() also
    accepts characters such as superscript digits, which int() rejects
    with ValueError.
    """
    text = text.strip()
    return int(text) if text.isdecimal() else None


class RobotFileParser:
    """ This class provides a set of methods to read, parse and answer
    questions about a single robots.txt file.

    """

    def __init__(self, url=''):
        self.entries = []          # entries for specifically named agents
        self.sitemaps = []         # values of all "Sitemap:" lines
        self.default_entry = None  # first entry that lists "*" as an agent
        self.disallow_all = False
        self.allow_all = False
        self.set_url(url)
        # 0 means "nothing parsed yet"; see can_fetch().
        self.last_checked = 0

    def mtime(self):
        """Returns the time the robots.txt file was last fetched.

        This is useful for long-running web spiders that need to
        check for new robots.txt files periodically.

        """
        return self.last_checked

    def modified(self):
        """Sets the time the robots.txt file was last fetched to the
        current time.

        """
        self.last_checked = time.time()

    def set_url(self, url):
        """Sets the URL referring to a robots.txt file."""
        self.url = url
        self.host, self.path = urllib.parse.urlparse(url)[1:3]

    def read(self):
        """Reads the robots.txt URL and feeds it to the parser.

        A 401 or 403 response sets disallow_all; any other 4xx response
        sets allow_all.  Other HTTP errors leave the parser state
        untouched.  Exceptions other than HTTPError propagate to the
        caller.
        """
        try:
            response = urllib.request.urlopen(self.url)
        except urllib.error.HTTPError as err:
            try:
                if err.code in (401, 403):
                    self.disallow_all = True
                elif 400 <= err.code < 500:
                    self.allow_all = True
            finally:
                # HTTPError wraps the response; release it explicitly.
                err.close()
        else:
            try:
                raw = response.read()
            finally:
                response.close()
            self.parse(raw.decode("utf-8").splitlines())

    def _add_entry(self, entry):
        """File *entry* as the default entry or as an agent-specific one."""
        if "*" in entry.useragents:
            # the default entry is considered last
            if self.default_entry is None:
                # the first default entry wins
                self.default_entry = entry
        else:
            self.entries.append(entry)

    def parse(self, lines):
        """Parse the input lines from a robots.txt file.

        We allow that a user-agent: line is not preceded by
        one or more blank lines.
        """
        # states:
        #   0: start state
        #   1: saw user-agent line
        #   2: saw an allow or disallow line
        state = 0
        entry = Entry()

        self.modified()
        for line in lines:
            if not line:
                # A blank line terminates the current record.
                if state == 1:
                    entry = Entry()
                    state = 0
                elif state == 2:
                    self._add_entry(entry)
                    entry = Entry()
                    state = 0
            # remove optional comment and strip line
            i = line.find('#')
            if i >= 0:
                line = line[:i]
            line = line.strip()
            if not line:
                continue
            key, sep, value = line.partition(':')
            if not sep:
                # not a "field: value" line; ignore it
                continue
            key = key.strip().lower()
            value = urllib.parse.unquote(value.strip())
            if key == "user-agent":
                if state == 2:
                    # A new user-agent line after rules starts a new record.
                    self._add_entry(entry)
                    entry = Entry()
                entry.useragents.append(value)
                state = 1
            elif key == "disallow":
                if state != 0:
                    entry.rulelines.append(RuleLine(value, False))
                    state = 2
            elif key == "allow":
                if state != 0:
                    entry.rulelines.append(RuleLine(value, True))
                    state = 2
            elif key == "crawl-delay":
                if state != 0:
                    # Malformed values are ignored instead of crashing
                    # the whole parse.
                    delay = _to_int(value)
                    if delay is not None:
                        entry.delay = delay
                    state = 2
            elif key == "request-rate":
                if state != 0:
                    numbers = value.split('/')
                    # check if all values are sane
                    if len(numbers) == 2:
                        requests = _to_int(numbers[0])
                        seconds = _to_int(numbers[1])
                        if requests is not None and seconds is not None:
                            entry.req_rate = RequestRate(requests, seconds)
                    state = 2
            elif key == "sitemap":
                # According to http://www.sitemaps.org/protocol.html
                # "This directive is independent of the user-agent line,
                #  so it doesn't matter where you place it in your file."
                # Therefore we do not change the state of the parser.
                self.sitemaps.append(value)
        if state == 2:
            self._add_entry(entry)

    def can_fetch(self, useragent, url):
        """using the parsed robots.txt decide if useragent can fetch url"""
        if self.disallow_all:
            return False
        if self.allow_all:
            return True
        # Until the robots.txt file has been read or found not
        # to exist, we must assume that no url is allowable.
        # This prevents false positives when a user erroneously
        # calls can_fetch() before calling read().
        if not self.last_checked:
            return False
        # search for given user agent matches
        # the first match counts
        parsed_url = urllib.parse.urlparse(urllib.parse.unquote(url))
        # Drop scheme and host: rules are matched against the path part only.
        url = urllib.parse.urlunparse(('', '', parsed_url.path,
            parsed_url.params, parsed_url.query, parsed_url.fragment))
        url = urllib.parse.quote(url)
        if not url:
            url = "/"
        for entry in self.entries:
            if entry.applies_to(useragent):
                return entry.allowance(url)
        # try the default entry last
        if self.default_entry:
            return self.default_entry.allowance(url)
        # agent not found ==> access granted
        return True

    def crawl_delay(self, useragent):
        """Return the Crawl-delay value for *useragent*.

        Returns None if nothing has been parsed yet, if no entry applies
        to the agent, or if the applicable entry has no valid
        Crawl-delay line.
        """
        if not self.mtime():
            return None
        for entry in self.entries:
            if entry.applies_to(useragent):
                return entry.delay
        if self.default_entry:
            return self.default_entry.delay
        return None

    def request_rate(self, useragent):
        """Return the Request-rate for *useragent* as a RequestRate tuple.

        Returns None if nothing has been parsed yet, if no entry applies
        to the agent, or if the applicable entry has no valid
        Request-rate line.
        """
        if not self.mtime():
            return None
        for entry in self.entries:
            if entry.applies_to(useragent):
                return entry.req_rate
        if self.default_entry:
            return self.default_entry.req_rate
        return None

    def site_maps(self):
        """Return the list of Sitemap values, or None if there are none."""
        if not self.sitemaps:
            return None
        return self.sitemaps

    def __str__(self):
        entries = self.entries
        if self.default_entry is not None:
            # The default entry is consulted last, so it is printed last.
            entries = entries + [self.default_entry]
        return '\n\n'.join(map(str, entries))


class RuleLine:
    """A rule line is a single "Allow:" (allowance==True) or "Disallow:"
       (allowance==False) followed by a path."""
    def __init__(self, path, allowance):
        if path == '' and not allowance:
            # an empty value means allow all
            allowance = True
        path = urllib.parse.urlunparse(urllib.parse.urlparse(path))
        self.path = urllib.parse.quote(path)
        self.allowance = allowance

    def applies_to(self, filename):
        """Return True if this rule is "*" or a prefix of *filename*."""
        return self.path == "*" or filename.startswith(self.path)

    def __str__(self):
        return ("Allow" if self.allowance else "Disallow") + ": " + self.path


class Entry:
    """An entry has one or more user-agents and zero or more rulelines"""
    def __init__(self):
        self.useragents = []
        self.rulelines = []
        self.delay = None     # int from a valid "Crawl-delay:" line
        self.req_rate = None  # RequestRate from a valid "Request-rate:" line

    def __str__(self):
        ret = []
        for agent in self.useragents:
            ret.append(f"User-agent: {agent}")
        if self.delay is not None:
            ret.append(f"Crawl-delay: {self.delay}")
        if self.req_rate is not None:
            rate = self.req_rate
            ret.append(f"Request-rate: {rate.requests}/{rate.seconds}")
        ret.extend(map(str, self.rulelines))
        return '\n'.join(ret)

    def applies_to(self, useragent):
        """check if this entry applies to the specified agent"""
        # split the name token and make it lower case
        useragent = useragent.split("/")[0].lower()
        for agent in self.useragents:
            if agent == '*':
                # we have the catch-all agent
                return True
            agent = agent.lower()
            if agent in useragent:
                return True
        return False

    def allowance(self, filename):
        """Return whether *filename* may be fetched under this entry.

        Preconditions:
        - our agent applies to this entry
        - filename is percent-quoted the same way rule paths are

        The first rule line that applies decides; if none applies,
        access is allowed.
        """
        for line in self.rulelines:
            if line.applies_to(filename):
                return line.allowance
        return True