import io
import os
import threading
import unittest
import urllib.request
import urllib.robotparser
from test import support
from test.support import socket_helper
from test.support import threading_helper
from http.server import BaseHTTPRequestHandler, HTTPServer


class BaseRobotTest:
    robots_txt = ''
    agent = 'test_robotparser'
    good = []
    bad = []
    site_maps = None

    def setUp(self):
        lines = io.StringIO(self.robots_txt).readlines()
        self.parser = urllib.robotparser.RobotFileParser()
        self.parser.parse(lines)

    def get_agent_and_url(self, url):
        if isinstance(url, tuple):
            agent, url = url
            return agent, url
        return self.agent, url

    def test_good_urls(self):
        for url in self.good:
            agent, url = self.get_agent_and_url(url)
            with self.subTest(url=url, agent=agent):
                self.assertTrue(self.parser.can_fetch(agent, url))

    def test_bad_urls(self):
        for url in self.bad:
            agent, url = self.get_agent_and_url(url)
            with self.subTest(url=url, agent=agent):
                self.assertFalse(self.parser.can_fetch(agent, url))

    def test_site_maps(self):
        self.assertEqual(self.parser.site_maps(), self.site_maps)


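# The test classes below feed an in-memory robots.txt to the parser via
# BaseRobotTest.setUp() above.  For reference, stand-alone use of
# RobotFileParser typically looks roughly like the sketch below (the URL
# and agent name are illustrative only, not part of this test suite):
#
#     parser = urllib.robotparser.RobotFileParser()
#     parser.set_url('http://www.example.com/robots.txt')
#     parser.read()
#     parser.can_fetch('ExampleBot', 'http://www.example.com/some/page')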
class UserAgentWildcardTest(BaseRobotTest, unittest.TestCase):
    robots_txt = """\
User-agent: *
Disallow: /cyberworld/map/ # This is an infinite virtual URL space
Disallow: /tmp/ # these will soon disappear
Disallow: /foo.html
    """
    good = ['/', '/test.html']
    bad = ['/cyberworld/map/index.html', '/tmp/xxx', '/foo.html']


class CrawlDelayAndCustomAgentTest(BaseRobotTest, unittest.TestCase):
    robots_txt = """\
# robots.txt for http://www.example.com/

User-agent: *
Crawl-delay: 1
Request-rate: 3/15
Disallow: /cyberworld/map/ # This is an infinite virtual URL space

# Cybermapper knows where to go.
User-agent: cybermapper
Disallow:
    """
    good = ['/', '/test.html', ('cybermapper', '/cyberworld/map/index.html')]
    bad = ['/cyberworld/map/index.html']


class SitemapTest(BaseRobotTest, unittest.TestCase):
    robots_txt = """\
# robots.txt for http://www.example.com/

User-agent: *
Sitemap: http://www.gstatic.com/s2/sitemaps/profiles-sitemap.xml
Sitemap: http://www.google.com/hostednews/sitemap_index.xml
Request-rate: 3/15
Disallow: /cyberworld/map/ # This is an infinite virtual URL space

    """
    good = ['/', '/test.html']
    bad = ['/cyberworld/map/index.html']
    site_maps = ['http://www.gstatic.com/s2/sitemaps/profiles-sitemap.xml',
                 'http://www.google.com/hostednews/sitemap_index.xml']


class RejectAllRobotsTest(BaseRobotTest, unittest.TestCase):
    robots_txt = """\
# go away
User-agent: *
Disallow: /
    """
    good = []
    bad = ['/cyberworld/map/index.html', '/', '/tmp/']


class BaseRequestRateTest(BaseRobotTest):
    request_rate = None
    crawl_delay = None

    def test_request_rate(self):
        parser = self.parser
        for url in self.good + self.bad:
            agent, url = self.get_agent_and_url(url)
            with self.subTest(url=url, agent=agent):
                self.assertEqual(parser.crawl_delay(agent), self.crawl_delay)

                parsed_request_rate = parser.request_rate(agent)
                self.assertEqual(parsed_request_rate, self.request_rate)
                if self.request_rate is not None:
                    self.assertIsInstance(
                        parsed_request_rate,
                        urllib.robotparser.RequestRate
                    )
                    self.assertEqual(
                        parsed_request_rate.requests,
                        self.request_rate.requests
                    )
                    self.assertEqual(
                        parsed_request_rate.seconds,
                        self.request_rate.seconds
                    )


class EmptyFileTest(BaseRequestRateTest, unittest.TestCase):
    robots_txt = ''
    good = ['/foo']


class CrawlDelayAndRequestRateTest(BaseRequestRateTest, unittest.TestCase):
    robots_txt = """\
User-agent: figtree
Crawl-delay: 3
Request-rate: 9/30
Disallow: /tmp
Disallow: /a%3cd.html
Disallow: /a%2fb.html
Disallow: /%7ejoe/index.html
    """
    agent = 'figtree'
    request_rate = urllib.robotparser.RequestRate(9, 30)
    crawl_delay = 3
    good = [('figtree', '/foo.html')]
    bad = ['/tmp', '/tmp.html', '/tmp/a.html', '/a%3cd.html', '/a%3Cd.html',
           '/a%2fb.html', '/~joe/index.html']


class DifferentAgentTest(CrawlDelayAndRequestRateTest):
    agent = 'FigTree Robot libwww-perl/5.04'


class InvalidRequestRateTest(BaseRobotTest, unittest.TestCase):
    robots_txt = """\
User-agent: *
Disallow: /tmp/
Disallow: /a%3Cd.html
Disallow: /a/b.html
Disallow: /%7ejoe/index.html
Crawl-delay: 3
Request-rate: 9/banana
    """
    good = ['/tmp']
    bad = ['/tmp/', '/tmp/a.html', '/a%3cd.html', '/a%3Cd.html', '/a/b.html',
           '/%7Ejoe/index.html']
    crawl_delay = 3


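# Note that the parser ignores Crawl-delay and Request-rate values it
# cannot parse as numbers ("9/banana" above, "pears" below), so
# crawl_delay() and request_rate() return None for such directives
# instead of raising; the invalid-value tests therefore only check URL
# access.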
class InvalidCrawlDelayTest(BaseRobotTest, unittest.TestCase):
    # From bug report #523041
    robots_txt = """\
User-Agent: *
Disallow: /.
Crawl-delay: pears
    """
    good = ['/foo.html']
    # bug report says "/" should be denied, but that is not in the RFC
    bad = []


class AnotherInvalidRequestRateTest(BaseRobotTest, unittest.TestCase):
    # also test that Allow and Disallow work well with each other
    robots_txt = """\
User-agent: Googlebot
Allow: /folder1/myfile.html
Disallow: /folder1/
Request-rate: whale/banana
    """
    agent = 'Googlebot'
    good = ['/folder1/myfile.html']
    bad = ['/folder1/anotherfile.html']


class UserAgentOrderingTest(BaseRobotTest, unittest.TestCase):
    # User-agent entries should be matched in the order they appear;
    # note that this robots.txt is incorrect because "Googlebot" is a
    # substring of "Googlebot-Mobile"
    robots_txt = """\
User-agent: Googlebot
Disallow: /

User-agent: Googlebot-Mobile
Allow: /
    """
    agent = 'Googlebot'
    bad = ['/something.jpg']


class UserAgentGoogleMobileTest(UserAgentOrderingTest):
    agent = 'Googlebot-Mobile'


class GoogleURLOrderingTest(BaseRobotTest, unittest.TestCase):
    # Google also got the order wrong.  You need to specify the URLs
    # from more specific to more general
    robots_txt = """\
User-agent: Googlebot
Allow: /folder1/myfile.html
Disallow: /folder1/
    """
    agent = 'googlebot'
    good = ['/folder1/myfile.html']
    bad = ['/folder1/anotherfile.html']


class DisallowQueryStringTest(BaseRobotTest, unittest.TestCase):
    # see issue #6325 for details
    robots_txt = """\
User-agent: *
Disallow: /some/path?name=value
    """
    good = ['/some/path']
    bad = ['/some/path?name=value']


class UseFirstUserAgentWildcardTest(BaseRobotTest, unittest.TestCase):
    # obey first * entry (#4108)
    robots_txt = """\
User-agent: *
Disallow: /some/path

User-agent: *
Disallow: /another/path
    """
    good = ['/another/path']
    bad = ['/some/path']


class EmptyQueryStringTest(BaseRobotTest, unittest.TestCase):
    # normalize the URL first (#17403)
    robots_txt = """\
User-agent: *
Allow: /some/path?
Disallow: /another/path?
    """
    good = ['/some/path?']
    bad = ['/another/path?']


class DefaultEntryTest(BaseRequestRateTest, unittest.TestCase):
    robots_txt = """\
User-agent: *
Crawl-delay: 1
Request-rate: 3/15
Disallow: /cyberworld/map/
    """
    request_rate = urllib.robotparser.RequestRate(3, 15)
    crawl_delay = 1
    good = ['/', '/test.html']
    bad = ['/cyberworld/map/index.html']


class StringFormattingTest(BaseRobotTest, unittest.TestCase):
    robots_txt = """\
User-agent: *
Crawl-delay: 1
Request-rate: 3/15
Disallow: /cyberworld/map/ # This is an infinite virtual URL space

# Cybermapper knows where to go.
User-agent: cybermapper
Disallow: /some/path
    """

    expected_output = """\
User-agent: cybermapper
Disallow: /some/path

User-agent: *
Crawl-delay: 1
Request-rate: 3/15
Disallow: /cyberworld/map/\
"""

    def test_string_formatting(self):
        self.assertEqual(str(self.parser), self.expected_output)


class RobotHandler(BaseHTTPRequestHandler):

    def do_GET(self):
        self.send_error(403, "Forbidden access")

    def log_message(self, format, *args):
        pass


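# RobotHandler above answers every request, including the robots.txt
# fetch, with a 403.  RobotFileParser.read() treats 401/403 responses as
# "disallow everything", which is why the test below expects can_fetch()
# to return False.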
@unittest.skipUnless(
    support.has_socket_support,
    "Socket server requires working socket."
)
class PasswordProtectedSiteTestCase(unittest.TestCase):

    def setUp(self):
        # clear _opener global variable
        self.addCleanup(urllib.request.urlcleanup)

        self.server = HTTPServer((socket_helper.HOST, 0), RobotHandler)

        self.t = threading.Thread(
            name='HTTPServer serving',
            target=self.server.serve_forever,
            # Short poll interval to make the test finish quickly.
            # Time between requests is short enough that we won't wake
            # up spuriously too many times.
            kwargs={'poll_interval': 0.01})
        self.t.daemon = True  # In case this function raises.
        self.t.start()

    def tearDown(self):
        self.server.shutdown()
        self.t.join()
        self.server.server_close()

    @threading_helper.reap_threads
    def testPasswordProtectedSite(self):
        addr = self.server.server_address
        url = 'http://' + socket_helper.HOST + ':' + str(addr[1])
        robots_url = url + "/robots.txt"
        parser = urllib.robotparser.RobotFileParser()
        parser.set_url(url)
        parser.read()
        self.assertFalse(parser.can_fetch("*", robots_url))


@support.requires_working_socket()
class NetworkTestCase(unittest.TestCase):

    base_url = 'http://www.pythontest.net/'
    robots_txt = '{}elsewhere/robots.txt'.format(base_url)

    @classmethod
    def setUpClass(cls):
        support.requires('network')
        with socket_helper.transient_internet(cls.base_url):
            cls.parser = urllib.robotparser.RobotFileParser(cls.robots_txt)
            cls.parser.read()

    def url(self, path):
        return '{}{}{}'.format(
            self.base_url, path, '/' if not os.path.splitext(path)[1] else ''
        )

    def test_basic(self):
        self.assertFalse(self.parser.disallow_all)
        self.assertFalse(self.parser.allow_all)
        self.assertGreater(self.parser.mtime(), 0)
        self.assertFalse(self.parser.crawl_delay('*'))
        self.assertFalse(self.parser.request_rate('*'))

    def test_can_fetch(self):
        self.assertTrue(self.parser.can_fetch('*', self.url('elsewhere')))
        self.assertFalse(self.parser.can_fetch('Nutch', self.base_url))
        self.assertFalse(self.parser.can_fetch('Nutch', self.url('brian')))
        self.assertFalse(self.parser.can_fetch('Nutch', self.url('webstats')))
        self.assertFalse(self.parser.can_fetch('*', self.url('webstats')))
        self.assertTrue(self.parser.can_fetch('*', self.base_url))

    def test_read_404(self):
        parser = urllib.robotparser.RobotFileParser(self.url('i-robot.txt'))
        parser.read()
        self.assertTrue(parser.allow_all)
        self.assertFalse(parser.disallow_all)
        self.assertEqual(parser.mtime(), 0)
        self.assertIsNone(parser.crawl_delay('*'))
        self.assertIsNone(parser.request_rate('*'))


if __name__ == '__main__':
    unittest.main()