"""An extensible library for opening URLs using a variety of protocols

The simplest way to use this module is to call the urlopen function,
which accepts a string containing a URL or a Request object (described
below).  It opens the URL and returns the results as a file-like
object; the returned object has some extra methods described below.

The OpenerDirector manages a collection of Handler objects that do
all the actual work.  Each Handler implements a particular protocol or
option.  The OpenerDirector is a composite object that invokes the
Handlers needed to open the requested URL.  For example, the
HTTPHandler performs HTTP GET and POST requests and deals with
non-error returns.  The HTTPRedirectHandler automatically deals with
HTTP 301, 302, 303, 307, and 308 redirect errors, and the
HTTPDigestAuthHandler deals with digest authentication.

urlopen(url, data=None) -- Basic usage is the same as original
urllib.  pass the url and optionally data to post to an HTTP URL, and
get a file-like object back.  One difference is that you can also pass
a Request instance instead of URL.  Raises a URLError (subclass of
OSError); for HTTP errors, raises an HTTPError, which can also be
treated as a valid response.

build_opener -- Function that creates a new OpenerDirector instance.
Will install the default handlers.  Accepts one or more Handlers as
arguments, either instances or Handler classes that it will
instantiate.  If one of the arguments is a subclass of the default
handler, the argument will be installed instead of the default.

install_opener -- Installs a new opener as the default opener.

objects of interest:

OpenerDirector -- Sets up the User Agent as the Python-urllib client and manages
the Handler classes, while dealing with requests and responses.

Request -- An object that encapsulates the state of a request.  The
state can be as simple as the URL.  It can also include extra HTTP
headers, e.g. a User-Agent.

BaseHandler -- The base class that all protocol and error handlers derive from.

internals:
BaseHandler and parent
_call_chain conventions

Example usage:

import urllib.request

# set up authentication info
authinfo = urllib.request.HTTPBasicAuthHandler()
authinfo.add_password(realm='PDQ Application',
                      uri='https://mahler:8092/site-updates.py',
                      user='klem',
                      passwd='geheim$parole')

proxy_support = urllib.request.ProxyHandler({"http" : "http://ahad-haam:3128"})

# build a new opener that adds authentication and caching FTP handlers
opener = urllib.request.build_opener(proxy_support, authinfo,
                                     urllib.request.CacheFTPHandler)

# install it
urllib.request.install_opener(opener)

f = urllib.request.urlopen('https://www.python.org/')
"""

# XXX issues:
# If an authentication error handler that tries to perform
# authentication for some reason but fails, how should the error be
# signalled?  The client needs to know the HTTP error code.  But if
# the handler knows that the problem was, e.g., that it didn't know
# the hash algorithm requested in the challenge, it would be good to
# pass that information along to the client, too.
# ftp errors aren't handled cleanly
# check digest against correct (i.e. non-apache) implementation
# Possible extensions:
# complex proxies XXX not sure what exactly was meant by this
# abstract factory for opener

import base64
import bisect
import email
import hashlib
import http.client
import io
import os
import posixpath
import re
import socket
import string
import sys
import time
import tempfile
import contextlib
import warnings


from urllib.error import URLError, HTTPError, ContentTooShortError
from urllib.parse import (
    urlparse, urlsplit, urljoin, unwrap, quote, unquote,
    _splittype, _splithost, _splitport, _splituser, _splitpasswd,
    _splitattr, _splitquery, _splitvalue, _splittag, _to_bytes,
    unquote_to_bytes, urlunparse)
from urllib.response import addinfourl, addclosehook

# check for SSL
try:
    import ssl
except ImportError:
    _have_ssl = False
else:
    _have_ssl = True

__all__ = [
    # Classes
    'Request', 'OpenerDirector', 'BaseHandler', 'HTTPDefaultErrorHandler',
    'HTTPRedirectHandler', 'HTTPCookieProcessor', 'ProxyHandler',
    'HTTPPasswordMgr', 'HTTPPasswordMgrWithDefaultRealm',
    'HTTPPasswordMgrWithPriorAuth', 'AbstractBasicAuthHandler',
    'HTTPBasicAuthHandler', 'ProxyBasicAuthHandler', 'AbstractDigestAuthHandler',
    'HTTPDigestAuthHandler', 'ProxyDigestAuthHandler', 'HTTPHandler',
    'FileHandler', 'FTPHandler', 'CacheFTPHandler', 'DataHandler',
    'UnknownHandler', 'HTTPErrorProcessor',
    # Functions
    'urlopen', 'install_opener', 'build_opener',
    'pathname2url', 'url2pathname', 'getproxies',
    # Legacy interface
    'urlretrieve', 'urlcleanup', 'URLopener', 'FancyURLopener',
]

# used in User-Agent header sent
__version__ = '%d.%d' % sys.version_info[:2]

# The module-global default opener.  Created lazily by urlopen() and
# replaced explicitly by install_opener().
_opener = None
def urlopen(url, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT,
            *, cafile=None, capath=None, cadefault=False, context=None):
    '''Open the URL url, which can be either a string or a Request
    object.

    *data* must be an object specifying additional data to be sent to
    the server, or None if no such data is needed.  See Request for
    details.

    urllib.request module uses HTTP/1.1 and includes a "Connection:close"
    header in its HTTP requests.

    The optional *timeout* parameter specifies a timeout in seconds for
    blocking operations like the connection attempt (if not specified, the
    global default timeout setting will be used).  This only works for HTTP,
    HTTPS and FTP connections.

    If *context* is specified, it must be a ssl.SSLContext instance describing
    the various SSL options.  See HTTPSConnection for more details.

    The optional *cafile* and *capath* parameters specify a set of trusted CA
    certificates for HTTPS requests.  cafile should point to a single file
    containing a bundle of CA certificates, whereas capath should point to a
    directory of hashed certificate files.  More information can be found in
    ssl.SSLContext.load_verify_locations().

    The *cadefault* parameter is ignored.


    This function always returns an object which can work as a
    context manager and has the properties url, headers, and status.
    See urllib.response.addinfourl for more detail on these properties.

    For HTTP and HTTPS URLs, this function returns a http.client.HTTPResponse
    object slightly modified.  In addition to the three new methods above, the
    msg attribute contains the same information as the reason attribute ---
    the reason phrase returned by the server --- instead of the response
    headers as it is specified in the documentation for HTTPResponse.

    For FTP, file, and data URLs and requests explicitly handled by legacy
    URLopener and FancyURLopener classes, this function returns a
    urllib.response.addinfourl object.

    Note that None may be returned if no handler handles the request (though
    the default installed global OpenerDirector uses UnknownHandler to ensure
    this never happens).

    In addition, if proxy settings are detected (for example, when a *_proxy
    environment variable like http_proxy is set), ProxyHandler is default
    installed and makes sure the requests are handled through the proxy.

    '''
    global _opener
    if cafile or capath or cadefault:
        import warnings
        warnings.warn("cafile, capath and cadefault are deprecated, use a "
                      "custom context instead.", DeprecationWarning, 2)
        if context is not None:
            raise ValueError(
                "You can't pass both context and any of cafile, capath, and "
                "cadefault"
            )
        if not _have_ssl:
            raise ValueError('SSL support not available')
        context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH,
                                             cafile=cafile,
                                             capath=capath)
        # send ALPN extension to indicate HTTP/1.1 protocol
        context.set_alpn_protocols(['http/1.1'])
        https_handler = HTTPSHandler(context=context)
        opener = build_opener(https_handler)
    elif context:
        https_handler = HTTPSHandler(context=context)
        opener = build_opener(https_handler)
    elif _opener is None:
        # Lazily build and cache the default opener on first use.
        _opener = opener = build_opener()
    else:
        opener = _opener
    return opener.open(url, data, timeout)

def install_opener(opener):
    """Install *opener* as the module-wide default OpenerDirector.

    Subsequent calls to urlopen() will use it instead of building one.
    """
    global _opener
    _opener = opener

# Paths of NamedTemporaryFiles created by urlretrieve(); deleted by
# urlcleanup().
_url_tempfiles = []
def urlretrieve(url, filename=None, reporthook=None, data=None):
    """
    Retrieve a URL into a temporary location on disk.

    Requires a URL argument.  If a filename is passed, it is used as
    the temporary file location.  The reporthook argument should be
    a callable that accepts a block number, a read size, and the
    total file size of the URL target.  The data argument should be
    valid URL encoded data.
    If a filename is passed and the URL points to a local resource,
    the result is a copy from local file to new file.

    Returns a tuple containing the path to the newly created
    data file as well as the resulting HTTPMessage object.
    """
    url_type, path = _splittype(url)

    with contextlib.closing(urlopen(url, data)) as fp:
        headers = fp.info()

        # Just return the local path and the "headers" for file://
        # URLs. No sense in performing a copy unless requested.
        if url_type == "file" and not filename:
            return os.path.normpath(path), headers

        # Handle temporary file setup.
        if filename:
            tfp = open(filename, 'wb')
        else:
            tfp = tempfile.NamedTemporaryFile(delete=False)
            filename = tfp.name
            _url_tempfiles.append(filename)

        with tfp:
            result = filename, headers
            bs = 1024*8
            size = -1
            read = 0
            blocknum = 0
            if "content-length" in headers:
                size = int(headers["Content-Length"])

            # Initial callback before any data is read (block 0).
            if reporthook:
                reporthook(blocknum, bs, size)

            while True:
                block = fp.read(bs)
                if not block:
                    break
                read += len(block)
                tfp.write(block)
                blocknum += 1
                if reporthook:
                    reporthook(blocknum, bs, size)

    if size >= 0 and read < size:
        raise ContentTooShortError(
            "retrieval incomplete: got only %i out of %i bytes"
            % (read, size), result)

    return result

def urlcleanup():
    """Clean up temporary files from urlretrieve calls."""
    for temp_file in _url_tempfiles:
        try:
            os.unlink(temp_file)
        except OSError:
            # Best effort: a file may already be gone or be unremovable.
            pass

    del _url_tempfiles[:]
    global _opener
    if _opener:
        _opener = None

# copied from cookielib.py
_cut_port_re = re.compile(r":\d+$", re.ASCII)
def request_host(request):
    """Return request-host, as defined by RFC 2965.

    Variation from RFC: returned value is lowercased, for convenient
    comparison.

    """
    url = request.full_url
    host = urlparse(url)[1]
    if host == "":
        # Fall back to the Host header when the URL has no netloc.
        host = request.get_header("Host", "")

    # remove port, if present
    host = _cut_port_re.sub("", host, 1)
    return host.lower()

class Request:
    """Encapsulates the state of a single request: URL, optional body
    (data), headers, origin host and HTTP method."""

    def __init__(self, url, data=None, headers={},
                 origin_req_host=None, unverifiable=False,
                 method=None):
        # NOTE: the default headers dict is shared across calls; it is
        # only iterated here, never mutated.
        self.full_url = url
        self.headers = {}
        self.unredirected_hdrs = {}
        self._data = None
        self.data = data
        self._tunnel_host = None
        for key, value in headers.items():
            self.add_header(key, value)
        if origin_req_host is None:
            origin_req_host = request_host(self)
        self.origin_req_host = origin_req_host
        self.unverifiable = unverifiable
        if method:
            self.method = method

    @property
    def full_url(self):
        if self.fragment:
            return '{}#{}'.format(self._full_url, self.fragment)
        return self._full_url

    @full_url.setter
    def full_url(self, url):
        # unwrap('<URL:type://host/path>') --> 'type://host/path'
        self._full_url = unwrap(url)
        self._full_url, self.fragment = _splittag(self._full_url)
        self._parse()

    @full_url.deleter
    def full_url(self):
        self._full_url = None
        self.fragment = None
        self.selector = ''

    @property
    def data(self):
        return self._data

    @data.setter
    def data(self, data):
        if data != self._data:
            self._data = data
            # issue 16464
            # if we change data we need to remove content-length header
            # (cause it's most probably calculated for previous value)
            if self.has_header("Content-length"):
                self.remove_header("Content-length")

    @data.deleter
    def data(self):
        self.data = None

    def _parse(self):
        """Split full_url into type (scheme), host and selector.

        Raises ValueError when the URL carries no scheme.
        """
        self.type, rest = _splittype(self._full_url)
        if self.type is None:
            raise ValueError("unknown url type: %r" % self.full_url)
        self.host, self.selector = _splithost(rest)
        if self.host:
            self.host = unquote(self.host)

    def get_method(self):
        """Return a string indicating the HTTP request method."""
        default_method = "POST" if self.data is not None else "GET"
        return getattr(self, 'method', default_method)

    def get_full_url(self):
        return self.full_url

    def set_proxy(self, host, type):
        # For https, tunnel through the proxy (CONNECT) instead of
        # rewriting the request's type/selector.
        if self.type == 'https' and not self._tunnel_host:
            self._tunnel_host = self.host
        else:
            self.type= type
            self.selector = self.full_url
        self.host = host

    def has_proxy(self):
        return self.selector == self.full_url

    def add_header(self, key, val):
        # useful for something like authentication
        self.headers[key.capitalize()] = val

    def add_unredirected_header(self, key, val):
        # will not be added to a redirected request
        self.unredirected_hdrs[key.capitalize()] = val

    def has_header(self, header_name):
        return (header_name in self.headers or
                header_name in self.unredirected_hdrs)

    def get_header(self, header_name, default=None):
        return self.headers.get(
            header_name,
            self.unredirected_hdrs.get(header_name, default))

    def remove_header(self, header_name):
        self.headers.pop(header_name, None)
        self.unredirected_hdrs.pop(header_name, None)

    def header_items(self):
        # Regular headers take precedence over unredirected ones.
        hdrs = {**self.unredirected_hdrs, **self.headers}
        return list(hdrs.items())

class OpenerDirector:
    """Manages a chain of handlers and dispatches requests to them."""

    def __init__(self):
        client_version = "Python-urllib/%s" % __version__
        self.addheaders = [('User-agent', client_version)]
        # self.handlers is retained only for backward compatibility
        self.handlers = []
        # manage the individual handlers
        self.handle_open = {}
        self.handle_error = {}
        self.process_response = {}
        self.process_request = {}

    def add_handler(self, handler):
        # Register the handler for every "<protocol>_<condition>" method
        # it defines (e.g. http_open, http_error_302, https_response).
        if not hasattr(handler, "add_parent"):
            raise TypeError("expected BaseHandler instance, got %r" %
                            type(handler))

        added = False
        for meth in dir(handler):
            if meth in ["redirect_request", "do_open", "proxy_open"]:
                # oops, coincidental match
                continue

            i = meth.find("_")
            protocol = meth[:i]
            condition = meth[i+1:]

            if condition.startswith("error"):
                j = condition.find("_") + i + 1
                kind = meth[j+1:]
                try:
                    # numeric HTTP error codes are stored as ints
                    kind = int(kind)
                except ValueError:
                    pass
                lookup = self.handle_error.get(protocol, {})
                self.handle_error[protocol] = lookup
            elif condition == "open":
                kind = protocol
                lookup = self.handle_open
            elif condition == "response":
                kind = protocol
                lookup = self.process_response
            elif condition == "request":
                kind = protocol
                lookup = self.process_request
            else:
                continue

            handlers = lookup.setdefault(kind, [])
            if handlers:
                # keep the chain sorted by handler_order (BaseHandler.__lt__)
                bisect.insort(handlers, handler)
            else:
                handlers.append(handler)
            added = True

        if added:
            bisect.insort(self.handlers, handler)
            handler.add_parent(self)

    def close(self):
        # Only exists for backwards compatibility.
        pass

    def _call_chain(self, chain, kind, meth_name, *args):
        # Handlers raise an exception if no one else should try to handle
        # the request, or return None if they can't but another handler
        # could.  Otherwise, they return the response.
        handlers = chain.get(kind, ())
        for handler in handlers:
            func = getattr(handler, meth_name)
            result = func(*args)
            if result is not None:
                return result

    def open(self, fullurl, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT):
        """Open *fullurl* (a URL string or Request): pre-process the
        request, dispatch it via _open(), then post-process the response.
        """
        # accept a URL or a Request object
        if isinstance(fullurl, str):
            req = Request(fullurl, data)
        else:
            req = fullurl
            if data is not None:
                req.data = data

        req.timeout = timeout
        protocol = req.type

        # pre-process request
        meth_name = protocol+"_request"
        for processor in self.process_request.get(protocol, []):
            meth = getattr(processor, meth_name)
            req = meth(req)

        sys.audit('urllib.Request', req.full_url, req.data, req.headers, req.get_method())
        response = self._open(req, data)

        # post-process response
        meth_name = protocol+"_response"
        for processor in self.process_response.get(protocol, []):
            meth = getattr(processor, meth_name)
            response = meth(req, response)

        return response

    def _open(self, req, data=None):
        # default_open handlers get first crack, then the protocol's own
        # handlers, finally unknown_open as a catch-all.
        result = self._call_chain(self.handle_open, 'default',
                                  'default_open', req)
        if result:
            return result

        protocol = req.type
        result = self._call_chain(self.handle_open, protocol, protocol +
                                  '_open', req)
        if result:
            return result

        return self._call_chain(self.handle_open, 'unknown',
                                'unknown_open', req)

    def error(self, proto, *args):
        if proto in ('http', 'https'):
            # XXX http[s] protocols are special-cased
            dict = self.handle_error['http'] # https is not different than http
            # args is (request, response, code, msg, hdrs), so args[2]
            # is the numeric HTTP status code.
            proto = args[2]  # YUCK!
            meth_name = 'http_error_%s' % proto
            http_err = 1
            orig_args = args
        else:
            dict = self.handle_error
            meth_name = proto + '_error'
            http_err = 0
        args = (dict, proto, meth_name) + args
        result = self._call_chain(*args)
        if result:
            return result

        if http_err:
            # no code-specific handler succeeded; fall back to the default
            args = (dict, 'default', 'http_error_default') + orig_args
            return self._call_chain(*args)

# XXX probably also want an abstract factory that knows when it makes
# sense to skip a superclass in favor of a subclass and when it might
# make sense to include both

def build_opener(*handlers):
    """Create an opener object from a list of handlers.

    The opener will use several default handlers, including support
    for HTTP, FTP and when applicable HTTPS.

    If any of the handlers passed as arguments are subclasses of the
    default handlers, the default handlers will not be used.
    """
    opener = OpenerDirector()
    default_classes = [ProxyHandler, UnknownHandler, HTTPHandler,
                       HTTPDefaultErrorHandler, HTTPRedirectHandler,
                       FTPHandler, FileHandler, HTTPErrorProcessor,
                       DataHandler]
    if hasattr(http.client, "HTTPSConnection"):
        default_classes.append(HTTPSHandler)
    skip = set()
    for klass in default_classes:
        for check in handlers:
            if isinstance(check, type):
                if issubclass(check, klass):
                    skip.add(klass)
            elif isinstance(check, klass):
                skip.add(klass)
    for klass in skip:
        default_classes.remove(klass)

    for klass in default_classes:
        opener.add_handler(klass())

    for h in handlers:
        if isinstance(h, type):
            h = h()
        opener.add_handler(h)
    return opener

class BaseHandler:
    # Handlers with a lower handler_order run earlier in the chain.
    handler_order = 500

    def add_parent(self, parent):
        self.parent = parent

    def close(self):
        # Only exists for backwards compatibility
        pass

    def __lt__(self, other):
        if not hasattr(other, "handler_order"):
            # Try to preserve the old behavior
            # of having custom classes inserted after default ones (works
            # only for custom user classes which are not aware of
            # handler_order).
            return True
        return self.handler_order < other.handler_order


class HTTPErrorProcessor(BaseHandler):
    """Process HTTP error responses."""
    handler_order = 1000  # after all other processing

    def http_response(self, request, response):
        code, msg, hdrs = response.code, response.msg, response.info()

        # According to RFC 2616, "2xx" code indicates that the client's
        # request was successfully received, understood, and accepted.
        if not (200 <= code < 300):
            response = self.parent.error(
                'http', request, response, code, msg, hdrs)

        return response

    https_response = http_response

class HTTPDefaultErrorHandler(BaseHandler):
    def http_error_default(self, req, fp, code, msg, hdrs):
        raise HTTPError(req.full_url, code, msg, hdrs, fp)

class HTTPRedirectHandler(BaseHandler):
    # maximum number of redirections to any single URL
    # this is needed because of the state that cookies introduce
    max_repeats = 4
    # maximum total number of redirections (regardless of URL) before
    # assuming we're in a loop
    max_redirections = 10

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        """Return a Request or None in response to a redirect.

        This is called by the http_error_30x methods when a
        redirection response is received.  If a redirection should
        take place, return a new Request to allow http_error_30x to
        perform the redirect.  Otherwise, raise HTTPError if no-one
        else should try to handle this url.  Return None if you can't
        but another Handler might.
        """
        m = req.get_method()
        if (not (code in (301, 302, 303, 307, 308) and m in ("GET", "HEAD")
            or code in (301, 302, 303) and m == "POST")):
            raise HTTPError(req.full_url, code, msg, headers, fp)

        # Strictly (according to RFC 2616), 301 or 302 in response to
        # a POST MUST NOT cause a redirection without confirmation
        # from the user (of urllib.request, in this case).  In practice,
        # essentially all clients do redirect in this case, so we do
        # the same.

        # Be conciliant with URIs containing a space.  This is mainly
        # redundant with the more complete encoding done in http_error_302(),
        # but it is kept for compatibility with other callers.
        newurl = newurl.replace(' ', '%20')

        # Drop body-describing headers: the redirected request is a GET
        # with no body.
        CONTENT_HEADERS = ("content-length", "content-type")
        newheaders = {k: v for k, v in req.headers.items()
                      if k.lower() not in CONTENT_HEADERS}
        return Request(newurl,
                       headers=newheaders,
                       origin_req_host=req.origin_req_host,
                       unverifiable=True)

    # Implementation note: To avoid the server sending us into an
    # infinite loop, the request object needs to track what URLs we
    # have already seen.  Do this by adding a handler-specific
    # attribute to the Request object.
    def http_error_302(self, req, fp, code, msg, headers):
        """Follow a redirect: build a new Request for the Location/URI
        target and re-open it via the parent opener."""
        # Some servers (incorrectly) return multiple Location headers
        # (so probably same goes for URI).  Use first header.
        if "location" in headers:
            newurl = headers["location"]
        elif "uri" in headers:
            newurl = headers["uri"]
        else:
            return

        # fix a possible malformed URL
        urlparts = urlparse(newurl)

        # For security reasons we don't allow redirection to anything other
        # than http, https or ftp.
        if urlparts.scheme not in ('http', 'https', 'ftp', ''):
            raise HTTPError(
                newurl, code,
                "%s - Redirection to url '%s' is not allowed" % (msg, newurl),
                headers, fp)

        if not urlparts.path and urlparts.netloc:
            # normalise an empty path to "/"
            urlparts = list(urlparts)
            urlparts[2] = "/"
            newurl = urlunparse(urlparts)

        # http.client.parse_headers() decodes as ISO-8859-1.  Recover the
        # original bytes and percent-encode non-ASCII bytes, and any special
        # characters such as the space.
        newurl = quote(
            newurl, encoding="iso-8859-1", safe=string.punctuation)
        newurl = urljoin(req.full_url, newurl)

        # XXX Probably want to forget about the state of the current
        # request, although that might interact poorly with other
        # handlers that also use handler-specific request attributes
        new = self.redirect_request(req, fp, code, msg, headers, newurl)
        if new is None:
            return

        # loop detection
        # .redirect_dict has a key url if url was previously visited.
        if hasattr(req, 'redirect_dict'):
            visited = new.redirect_dict = req.redirect_dict
            if (visited.get(newurl, 0) >= self.max_repeats or
                len(visited) >= self.max_redirections):
                raise HTTPError(req.full_url, code,
                                self.inf_msg + msg, headers, fp)
        else:
            visited = new.redirect_dict = req.redirect_dict = {}
        visited[newurl] = visited.get(newurl, 0) + 1

        # Don't close the fp until we are sure that we won't use it
        # with HTTPError.
        fp.read()
        fp.close()

        return self.parent.open(new, timeout=req.timeout)

    http_error_301 = http_error_303 = http_error_307 = http_error_308 = http_error_302

    inf_msg = "The HTTP server returned a redirect error that would " \
              "lead to an infinite loop.\n" \
              "The last 30x error message was:\n"


def _parse_proxy(proxy):
    """Return (scheme, user, password, host/port) given a URL or an authority.

    If a URL is supplied, it must have an authority (host:port) component.
    According to RFC 3986, having an authority component means the URL must
    have two slashes after the scheme.
    """
    scheme, r_scheme = _splittype(proxy)
    if not r_scheme.startswith("/"):
        # authority
        scheme = None
        authority = proxy
    else:
        # URL
        if not r_scheme.startswith("//"):
            raise ValueError("proxy URL with no authority: %r" % proxy)
        # We have an authority, so for RFC 3986-compliant URLs (by ss 3.
        # and 3.3.), path is empty or starts with '/'
        if '@' in r_scheme:
            host_separator = r_scheme.find('@')
            end = r_scheme.find("/", host_separator)
        else:
            end = r_scheme.find("/", 2)
        if end == -1:
            end = None
        authority = r_scheme[2:end]
    userinfo, hostport = _splituser(authority)
    if userinfo is not None:
        user, password = _splitpasswd(userinfo)
    else:
        user = password = None
    return scheme, user, password, hostport

class ProxyHandler(BaseHandler):
    # Proxies must be in front
    handler_order = 100

    def __init__(self, proxies=None):
        if proxies is None:
            proxies = getproxies()
        assert hasattr(proxies, 'keys'), "proxies must be a mapping"
        self.proxies = proxies
        # Dynamically create a "<scheme>_open" method per configured
        # proxy so add_handler() registers us for that scheme.
        for type, url in proxies.items():
            type = type.lower()
            setattr(self, '%s_open' % type,
                    lambda r, proxy=url, type=type, meth=self.proxy_open:
                        meth(r, proxy, type))

    def proxy_open(self, req, proxy, type):
        """Rewrite *req* so it is sent through *proxy*, adding
        Proxy-authorization credentials when the proxy URL carries them."""
        orig_type = req.type
        proxy_type, user, password, hostport = _parse_proxy(proxy)
        if proxy_type is None:
            proxy_type = orig_type

        if req.host and proxy_bypass(req.host):
            return None

        if user and password:
            user_pass = '%s:%s' % (unquote(user),
                                   unquote(password))
            creds = base64.b64encode(user_pass.encode()).decode("ascii")
            req.add_header('Proxy-authorization', 'Basic ' + creds)
        hostport = unquote(hostport)
        req.set_proxy(hostport, proxy_type)
        if orig_type == proxy_type or orig_type == 'https':
            # let other handlers take care of it
            return None
        else:
            # need to start over, because the other handlers don't
            # grok the proxy's URL type
            # e.g. if we have a constructor arg proxies like so:
            # {'http': 'ftp://proxy.example.com'}, we may end up turning
            # a request for http://acme.example.com/a into one for
            # ftp://proxy.example.com/a
            return self.parent.open(req, timeout=req.timeout)

class HTTPPasswordMgr:
    """Maps (realm, URI prefix) pairs to (user, password) credentials."""

    def __init__(self):
        self.passwd = {}

    def add_password(self, realm, uri, user, passwd):
        # uri could be a single URI or a sequence
        if isinstance(uri, str):
            uri = [uri]
        if realm not in self.passwd:
            self.passwd[realm] = {}
        # Store both with and without the scheme's default port so
        # lookups match either spelling of the authority.
        for default_port in True, False:
            reduced_uri = tuple(
                self.reduce_uri(u, default_port) for u in uri)
            self.passwd[realm][reduced_uri] = (user, passwd)

    def find_user_password(self, realm, authuri):
        domains = self.passwd.get(realm, {})
        for default_port in True, False:
            reduced_authuri = self.reduce_uri(authuri, default_port)
            for uris, authinfo in domains.items():
                for uri in uris:
                    if self.is_suburi(uri, reduced_authuri):
                        return authinfo
        return None, None

    def reduce_uri(self, uri, default_port=True):
        """Accept authority or URI and extract only the authority and path."""
        # note HTTP URLs do not have a userinfo component
        parts = urlsplit(uri)
        if parts[1]:
            # URI
            scheme = parts[0]
            authority = parts[1]
            path = parts[2] or '/'
        else:
            # host or host:port
            scheme = None
            authority = uri
            path = '/'
        host, port = _splitport(authority)
        if default_port and port is None and scheme is not None:
            dport = {"http": 80,
                     "https": 443,
                     }.get(scheme)
            if dport is not None:
                authority = "%s:%d" % (host, dport)
        return authority, path

    def is_suburi(self, base, test):
        """Check if test is below base in a URI tree

        Both args must be URIs in reduced form.
        """
        if base == test:
            return True
        if base[0] != test[0]:
            return False
        prefix = base[1]
        if prefix[-1:] != '/':
            prefix += '/'
        return test[1].startswith(prefix)


class HTTPPasswordMgrWithDefaultRealm(HTTPPasswordMgr):
    """Password manager that falls back to the None (default) realm
    when no realm-specific credentials match."""

    def find_user_password(self, realm, authuri):
        user, password = HTTPPasswordMgr.find_user_password(self, realm,
                                                            authuri)
        if user is not None:
            return user, password
        return HTTPPasswordMgr.find_user_password(self, None, authuri)


class HTTPPasswordMgrWithPriorAuth(HTTPPasswordMgrWithDefaultRealm):
    """Password manager that additionally tracks which URIs may be sent
    credentials pre-emptively (prior auth)."""

    def __init__(self, *args, **kwargs):
        self.authenticated = {}
        super().__init__(*args, **kwargs)

    def add_password(self, realm, uri, user, passwd, is_authenticated=False):
        self.update_authenticated(uri, is_authenticated)
        # Add a default for prior auth requests
        if realm is not None:
            super().add_password(None, uri, user, passwd)
        super().add_password(realm, uri, user, passwd)

    def update_authenticated(self, uri, is_authenticated=False):
        # uri could be a single URI or a sequence
        if isinstance(uri, str):
            uri = [uri]

        for default_port in True, False:
            for u in uri:
                reduced_uri = self.reduce_uri(u, default_port)
                self.authenticated[reduced_uri] = is_authenticated

    def is_authenticated(self, authuri):
        # Returns None (falsy) implicitly when no stored URI matches.
        for default_port in True, False:
            reduced_authuri = self.reduce_uri(authuri, default_port)
            for uri in self.authenticated:
                if self.is_suburi(uri, reduced_authuri):
                    return self.authenticated[uri]


class AbstractBasicAuthHandler:

    # XXX this allows for multiple auth-schemes, but will stupidly pick
    # the last one with a realm specified.
    # allow for double- and single-quoted realm values
    # (single quotes are a violation of the RFC, but appear in the wild)
    rx = re.compile('(?:^|,)'       # start of the string or ','
                    '[ \t]*'        # optional whitespaces
                    '([^ \t,]+)'    # scheme like "Basic"
                    '[ \t]+'        # mandatory whitespaces
                    # realm=xxx
                    # realm='xxx'
                    # realm="xxx"
                    'realm=(["\']?)([^"\']*)\\2',
                    re.I)

    # XXX could pre-emptively send auth info already accepted (RFC 2617,
    # end of section 2, and section 1.2 immediately after "credentials"
    # production).

    def __init__(self, password_mgr=None):
        if password_mgr is None:
            password_mgr = HTTPPasswordMgr()
        self.passwd = password_mgr
        self.add_password = self.passwd.add_password

    def _parse_realm(self, header):
        """Yield (scheme, realm) pairs parsed from one challenge header."""
        # parse WWW-Authenticate header: accept multiple challenges per header
        found_challenge = False
        for mo in AbstractBasicAuthHandler.rx.finditer(header):
            scheme, quote, realm = mo.groups()
            if quote not in ['"', "'"]:
                warnings.warn("Basic Auth Realm was unquoted",
                              UserWarning, 3)

            yield (scheme, realm)

            found_challenge = True

        if not found_challenge:
            # Fall back to reporting just the scheme with no realm.
            if header:
                scheme = header.split()[0]
            else:
                scheme = ''
            yield (scheme, None)

    def http_error_auth_reqed(self, authreq, host, req, headers):
        # host may be an authority (without userinfo) or a URL with an
        # authority
        headers = headers.get_all(authreq)
        if not headers:
            # no header found
            return

        unsupported = None
        for header in headers:
            for scheme, realm in self._parse_realm(header):
                if scheme.lower() != 'basic':
                    unsupported = scheme
                    continue

                if realm is not None:
                    # Use the first matching Basic challenge.
                    # Ignore following challenges even if they use the Basic
                    # scheme.
                    return self.retry_http_basic_auth(host, req, realm)

        if unsupported is not None:
            raise ValueError("AbstractBasicAuthHandler does not "
                             "support the following scheme: %r"
                             % (scheme,))

    def retry_http_basic_auth(self, host, req, realm):
        user, pw = self.passwd.find_user_password(realm, host)
        if pw is not None:
            raw = "%s:%s" % (user, pw)
            auth = "Basic " + base64.b64encode(raw.encode()).decode("ascii")
            if req.get_header(self.auth_header, None) == auth:
                # Same credentials already failed once; give up.
                return None
            req.add_unredirected_header(self.auth_header, auth)
            return self.parent.open(req, timeout=req.timeout)
        else:
            return None

    def http_request(self, req):
        if (not hasattr(self.passwd, 'is_authenticated') or
                not self.passwd.is_authenticated(req.full_url)):
            return req

        if not req.has_header('Authorization'):
            user, passwd = self.passwd.find_user_password(None, req.full_url)
            credentials = '{0}:{1}'.format(user, passwd).encode()
            auth_str = base64.standard_b64encode(credentials).decode()
            req.add_unredirected_header('Authorization',
                                        'Basic {}'.format(auth_str.strip()))
        return req

    def http_response(self, req, response):
        # Record whether this URL accepted our credentials so later
        # requests can authenticate pre-emptively.
        if hasattr(self.passwd, 'is_authenticated'):
            if 200 <= response.code < 300:
                self.passwd.update_authenticated(req.full_url, True)
            else:
                self.passwd.update_authenticated(req.full_url, False)
        return response

    https_request = http_request
    https_response = http_response



class HTTPBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):

    auth_header = 'Authorization'

    def http_error_401(self, req, fp, code, msg, headers):
        url = req.full_url
        response = self.http_error_auth_reqed('www-authenticate',
                                              url, req, headers)
        return response


class ProxyBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):

    auth_header = 'Proxy-authorization'
'Proxy-authorization'

    def http_error_407(self, req, fp, code, msg, headers):
        # http_error_auth_reqed requires that there is no userinfo component in
        # authority.  Assume there isn't one, since urllib.request does not (and
        # should not, RFC 3986 s. 3.2.1) support requests for URLs containing
        # userinfo.
        authority = req.host
        response = self.http_error_auth_reqed('proxy-authenticate',
                                              authority, req, headers)
        return response


# Return n random bytes.
_randombytes = os.urandom


class AbstractDigestAuthHandler:
    # Digest authentication is specified in RFC 2617.

    # XXX The client does not inspect the Authentication-Info header
    # in a successful response.

    # XXX It should be possible to test this implementation against
    # a mock server that just generates a static set of challenges.

    # XXX qop="auth-int" supports is shaky

    def __init__(self, passwd=None):
        if passwd is None:
            passwd = HTTPPasswordMgr()
        self.passwd = passwd
        self.add_password = self.passwd.add_password
        # retried counts consecutive failed challenges for one request;
        # nonce_count/last_nonce implement the RFC 2617 nonce-count (nc).
        self.retried = 0
        self.nonce_count = 0
        self.last_nonce = None

    def reset_retry_count(self):
        # Called by the concrete handlers after a request completes.
        self.retried = 0

    def http_error_auth_reqed(self, auth_header, host, req, headers):
        # Entry point for 401/407: re-issue the request with a Digest
        # Authorization header, giving up after a handful of retries.
        authreq = headers.get(auth_header, None)
        if self.retried > 5:
            # Don't fail endlessly - if we failed once, we'll probably
            # fail a second time. Hm. Unless the Password Manager is
            # prompting for the information. Crap. This isn't great
            # but it's better than the current 'repeat until recursion
            # depth exceeded' approach <wink>
            raise HTTPError(req.full_url, 401, "digest auth failed",
                            headers, None)
        else:
            self.retried += 1
        if authreq:
            scheme = authreq.split()[0]
            if scheme.lower() == 'digest':
                return self.retry_http_digest_auth(req, authreq)
            elif scheme.lower() != 'basic':
                # 'basic' is silently ignored here so the Basic handler
                # (ordered after this one) gets a chance at it.
                raise ValueError("AbstractDigestAuthHandler does not support"
                                 " the following scheme: '%s'" % scheme)

    def retry_http_digest_auth(self, req, auth):
        # auth is the full challenge, e.g. 'Digest realm="...", nonce="..."'.
        token, challenge = auth.split(' ', 1)
        chal = parse_keqv_list(filter(None, parse_http_list(challenge)))
        auth = self.get_authorization(req, chal)
        if auth:
            auth_val = 'Digest %s' % auth
            if req.headers.get(self.auth_header, None) == auth_val:
                # The identical header already failed once - don't loop.
                return None
            req.add_unredirected_header(self.auth_header, auth_val)
            resp = self.parent.open(req, timeout=req.timeout)
            return resp

    def get_cnonce(self, nonce):
        # The cnonce-value is an opaque
        # quoted string value provided by the client and used by both client
        # and server to avoid chosen plaintext attacks, to provide mutual
        # authentication, and to provide some message integrity protection.
        # This isn't a fabulous effort, but it's probably Good Enough.
1141 s = "%s:%s:%s:" % (self.nonce_count, nonce, time.ctime()) 1142 b = s.encode("ascii") + _randombytes(8) 1143 dig = hashlib.sha1(b).hexdigest() 1144 return dig[:16] 1145 1146 def get_authorization(self, req, chal): 1147 try: 1148 realm = chal['realm'] 1149 nonce = chal['nonce'] 1150 qop = chal.get('qop') 1151 algorithm = chal.get('algorithm', 'MD5') 1152 # mod_digest doesn't send an opaque, even though it isn't 1153 # supposed to be optional 1154 opaque = chal.get('opaque', None) 1155 except KeyError: 1156 return None 1157 1158 H, KD = self.get_algorithm_impls(algorithm) 1159 if H is None: 1160 return None 1161 1162 user, pw = self.passwd.find_user_password(realm, req.full_url) 1163 if user is None: 1164 return None 1165 1166 # XXX not implemented yet 1167 if req.data is not None: 1168 entdig = self.get_entity_digest(req.data, chal) 1169 else: 1170 entdig = None 1171 1172 A1 = "%s:%s:%s" % (user, realm, pw) 1173 A2 = "%s:%s" % (req.get_method(), 1174 # XXX selector: what about proxies and full urls 1175 req.selector) 1176 # NOTE: As per RFC 2617, when server sends "auth,auth-int", the client could use either `auth` 1177 # or `auth-int` to the response back. we use `auth` to send the response back. 1178 if qop is None: 1179 respdig = KD(H(A1), "%s:%s" % (nonce, H(A2))) 1180 elif 'auth' in qop.split(','): 1181 if nonce == self.last_nonce: 1182 self.nonce_count += 1 1183 else: 1184 self.nonce_count = 1 1185 self.last_nonce = nonce 1186 ncvalue = '%08x' % self.nonce_count 1187 cnonce = self.get_cnonce(nonce) 1188 noncebit = "%s:%s:%s:%s:%s" % (nonce, ncvalue, cnonce, 'auth', H(A2)) 1189 respdig = KD(H(A1), noncebit) 1190 else: 1191 # XXX handle auth-int. 1192 raise URLError("qop '%s' is not supported." % qop) 1193 1194 # XXX should the partial digests be encoded too? 
1195 1196 base = 'username="%s", realm="%s", nonce="%s", uri="%s", ' \ 1197 'response="%s"' % (user, realm, nonce, req.selector, 1198 respdig) 1199 if opaque: 1200 base += ', opaque="%s"' % opaque 1201 if entdig: 1202 base += ', digest="%s"' % entdig 1203 base += ', algorithm="%s"' % algorithm 1204 if qop: 1205 base += ', qop=auth, nc=%s, cnonce="%s"' % (ncvalue, cnonce) 1206 return base 1207 1208 def get_algorithm_impls(self, algorithm): 1209 # lambdas assume digest modules are imported at the top level 1210 if algorithm == 'MD5': 1211 H = lambda x: hashlib.md5(x.encode("ascii")).hexdigest() 1212 elif algorithm == 'SHA': 1213 H = lambda x: hashlib.sha1(x.encode("ascii")).hexdigest() 1214 # XXX MD5-sess 1215 else: 1216 raise ValueError("Unsupported digest authentication " 1217 "algorithm %r" % algorithm) 1218 KD = lambda s, d: H("%s:%s" % (s, d)) 1219 return H, KD 1220 1221 def get_entity_digest(self, data, chal): 1222 # XXX not implemented yet 1223 return None 1224 1225 1226class HTTPDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler): 1227 """An authentication protocol defined by RFC 2069 1228 1229 Digest authentication improves on basic authentication because it 1230 does not transmit passwords in the clear. 
    """

    auth_header = 'Authorization'
    handler_order = 490  # before Basic auth

    def http_error_401(self, req, fp, code, msg, headers):
        # Pass the netloc (authority) of the URL as host.
        host = urlparse(req.full_url)[1]
        retry = self.http_error_auth_reqed('www-authenticate',
                                           host, req, headers)
        self.reset_retry_count()
        return retry


class ProxyDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):

    auth_header = 'Proxy-Authorization'
    handler_order = 490  # before Basic auth

    def http_error_407(self, req, fp, code, msg, headers):
        host = req.host
        retry = self.http_error_auth_reqed('proxy-authenticate',
                                           host, req, headers)
        self.reset_retry_count()
        return retry

class AbstractHTTPHandler(BaseHandler):
    # Shared machinery for HTTPHandler and HTTPSHandler: request
    # normalization (do_request_) and the network exchange (do_open).

    def __init__(self, debuglevel=0):
        self._debuglevel = debuglevel

    def set_http_debuglevel(self, level):
        self._debuglevel = level

    def _get_content_length(self, request):
        # Delegate body sizing to http.client (returns None when the
        # length cannot be determined up front).
        return http.client.HTTPConnection._get_content_length(
            request.data,
            request.get_method())

    def do_request_(self, request):
        """Fill in default headers (Content-type, Content-length or
        Transfer-encoding, Host, parent.addheaders) before sending."""
        host = request.host
        if not host:
            raise URLError('no host given')

        if request.data is not None:  # POST
            data = request.data
            if isinstance(data, str):
                msg = "POST data should be bytes, an iterable of bytes, " \
                      "or a file object. It cannot be of type str."
                raise TypeError(msg)
            if not request.has_header('Content-type'):
                request.add_unredirected_header(
                    'Content-type',
                    'application/x-www-form-urlencoded')
            if (not request.has_header('Content-length')
                    and not request.has_header('Transfer-encoding')):
                content_length = self._get_content_length(request)
                if content_length is not None:
                    request.add_unredirected_header(
                            'Content-length', str(content_length))
                else:
                    # Unknown body size: fall back to chunked encoding.
                    request.add_unredirected_header(
                            'Transfer-encoding', 'chunked')

        sel_host = host
        if request.has_proxy():
            # When proxying, Host must name the origin server taken from
            # the request selector, not the proxy itself.
            scheme, sel = _splittype(request.selector)
            sel_host, sel_path = _splithost(sel)
        if not request.has_header('Host'):
            request.add_unredirected_header('Host', sel_host)
        for name, value in self.parent.addheaders:
            name = name.capitalize()
            if not request.has_header(name):
                request.add_unredirected_header(name, value)

        return request

    def do_open(self, http_class, req, **http_conn_args):
        """Return an HTTPResponse object for the request, using http_class.

        http_class must implement the HTTPConnection API from http.client.
        """
        host = req.host
        if not host:
            raise URLError('no host given')

        # will parse host:port
        h = http_class(host, timeout=req.timeout, **http_conn_args)
        h.set_debuglevel(self._debuglevel)

        # Unredirected headers take precedence over normal ones: only
        # headers not already present are copied over.
        headers = dict(req.unredirected_hdrs)
        headers.update({k: v for k, v in req.headers.items()
                        if k not in headers})

        # TODO(jhylton): Should this be redesigned to handle
        # persistent connections?

        # We want to make an HTTP/1.1 request, but the addinfourl
        # class isn't prepared to deal with a persistent connection.
        # It will try to read all remaining data from the socket,
        # which will block while the server waits for the next request.
        # So make sure the connection gets closed after the (only)
        # request.
        headers["Connection"] = "close"
        headers = {name.title(): val for name, val in headers.items()}

        if req._tunnel_host:
            # CONNECT tunnelling (e.g. https through a proxy).
            tunnel_headers = {}
            proxy_auth_hdr = "Proxy-Authorization"
            if proxy_auth_hdr in headers:
                tunnel_headers[proxy_auth_hdr] = headers[proxy_auth_hdr]
                # Proxy-Authorization should not be sent to origin
                # server.
                del headers[proxy_auth_hdr]
            h.set_tunnel(req._tunnel_host, headers=tunnel_headers)

        try:
            try:
                h.request(req.get_method(), req.selector, req.data, headers,
                          encode_chunked=req.has_header('Transfer-encoding'))
            except OSError as err:  # timeout error
                raise URLError(err)
            r = h.getresponse()
        except:
            h.close()
            raise

        # If the server does not send us a 'Connection: close' header,
        # HTTPConnection assumes the socket should be left open.  Manually
        # mark the socket to be closed when this response object goes away.
        if h.sock:
            h.sock.close()
            h.sock = None

        r.url = req.get_full_url()
        # This line replaces the .msg attribute of the HTTPResponse
        # with .headers, because urllib clients expect the response to
        # have the reason in .msg.  It would be good to mark this
        # attribute as deprecated and get clients to use info() or
        # .headers instead.
1370 r.msg = r.reason 1371 return r 1372 1373 1374class HTTPHandler(AbstractHTTPHandler): 1375 1376 def http_open(self, req): 1377 return self.do_open(http.client.HTTPConnection, req) 1378 1379 http_request = AbstractHTTPHandler.do_request_ 1380 1381if hasattr(http.client, 'HTTPSConnection'): 1382 1383 class HTTPSHandler(AbstractHTTPHandler): 1384 1385 def __init__(self, debuglevel=0, context=None, check_hostname=None): 1386 AbstractHTTPHandler.__init__(self, debuglevel) 1387 self._context = context 1388 self._check_hostname = check_hostname 1389 1390 def https_open(self, req): 1391 return self.do_open(http.client.HTTPSConnection, req, 1392 context=self._context, check_hostname=self._check_hostname) 1393 1394 https_request = AbstractHTTPHandler.do_request_ 1395 1396 __all__.append('HTTPSHandler') 1397 1398class HTTPCookieProcessor(BaseHandler): 1399 def __init__(self, cookiejar=None): 1400 import http.cookiejar 1401 if cookiejar is None: 1402 cookiejar = http.cookiejar.CookieJar() 1403 self.cookiejar = cookiejar 1404 1405 def http_request(self, request): 1406 self.cookiejar.add_cookie_header(request) 1407 return request 1408 1409 def http_response(self, request, response): 1410 self.cookiejar.extract_cookies(response, request) 1411 return response 1412 1413 https_request = http_request 1414 https_response = http_response 1415 1416class UnknownHandler(BaseHandler): 1417 def unknown_open(self, req): 1418 type = req.type 1419 raise URLError('unknown url type: %s' % type) 1420 1421def parse_keqv_list(l): 1422 """Parse list of key=value strings where keys are not duplicated.""" 1423 parsed = {} 1424 for elt in l: 1425 k, v = elt.split('=', 1) 1426 if v[0] == '"' and v[-1] == '"': 1427 v = v[1:-1] 1428 parsed[k] = v 1429 return parsed 1430 1431def parse_http_list(s): 1432 """Parse lists as described by RFC 2068 Section 2. 1433 1434 In particular, parse comma-separated lists where the elements of 1435 the list may include quoted-strings. 
A quoted-string could 1436 contain a comma. A non-quoted string could have quotes in the 1437 middle. Neither commas nor quotes count if they are escaped. 1438 Only double-quotes count, not single-quotes. 1439 """ 1440 res = [] 1441 part = '' 1442 1443 escape = quote = False 1444 for cur in s: 1445 if escape: 1446 part += cur 1447 escape = False 1448 continue 1449 if quote: 1450 if cur == '\\': 1451 escape = True 1452 continue 1453 elif cur == '"': 1454 quote = False 1455 part += cur 1456 continue 1457 1458 if cur == ',': 1459 res.append(part) 1460 part = '' 1461 continue 1462 1463 if cur == '"': 1464 quote = True 1465 1466 part += cur 1467 1468 # append last part 1469 if part: 1470 res.append(part) 1471 1472 return [part.strip() for part in res] 1473 1474class FileHandler(BaseHandler): 1475 # Use local file or FTP depending on form of URL 1476 def file_open(self, req): 1477 url = req.selector 1478 if url[:2] == '//' and url[2:3] != '/' and (req.host and 1479 req.host != 'localhost'): 1480 if not req.host in self.get_names(): 1481 raise URLError("file:// scheme is supported only on localhost") 1482 else: 1483 return self.open_local_file(req) 1484 1485 # names for the localhost 1486 names = None 1487 def get_names(self): 1488 if FileHandler.names is None: 1489 try: 1490 FileHandler.names = tuple( 1491 socket.gethostbyname_ex('localhost')[2] + 1492 socket.gethostbyname_ex(socket.gethostname())[2]) 1493 except socket.gaierror: 1494 FileHandler.names = (socket.gethostbyname('localhost'),) 1495 return FileHandler.names 1496 1497 # not entirely sure what the rules are here 1498 def open_local_file(self, req): 1499 import email.utils 1500 import mimetypes 1501 host = req.host 1502 filename = req.selector 1503 localfile = url2pathname(filename) 1504 try: 1505 stats = os.stat(localfile) 1506 size = stats.st_size 1507 modified = email.utils.formatdate(stats.st_mtime, usegmt=True) 1508 mtype = mimetypes.guess_type(filename)[0] 1509 headers = email.message_from_string( 1510 
'Content-type: %s\nContent-length: %d\nLast-modified: %s\n' % 1511 (mtype or 'text/plain', size, modified)) 1512 if host: 1513 host, port = _splitport(host) 1514 if not host or \ 1515 (not port and _safe_gethostbyname(host) in self.get_names()): 1516 if host: 1517 origurl = 'file://' + host + filename 1518 else: 1519 origurl = 'file://' + filename 1520 return addinfourl(open(localfile, 'rb'), headers, origurl) 1521 except OSError as exp: 1522 raise URLError(exp) 1523 raise URLError('file not on local host') 1524 1525def _safe_gethostbyname(host): 1526 try: 1527 return socket.gethostbyname(host) 1528 except socket.gaierror: 1529 return None 1530 1531class FTPHandler(BaseHandler): 1532 def ftp_open(self, req): 1533 import ftplib 1534 import mimetypes 1535 host = req.host 1536 if not host: 1537 raise URLError('ftp error: no host given') 1538 host, port = _splitport(host) 1539 if port is None: 1540 port = ftplib.FTP_PORT 1541 else: 1542 port = int(port) 1543 1544 # username/password handling 1545 user, host = _splituser(host) 1546 if user: 1547 user, passwd = _splitpasswd(user) 1548 else: 1549 passwd = None 1550 host = unquote(host) 1551 user = user or '' 1552 passwd = passwd or '' 1553 1554 try: 1555 host = socket.gethostbyname(host) 1556 except OSError as msg: 1557 raise URLError(msg) 1558 path, attrs = _splitattr(req.selector) 1559 dirs = path.split('/') 1560 dirs = list(map(unquote, dirs)) 1561 dirs, file = dirs[:-1], dirs[-1] 1562 if dirs and not dirs[0]: 1563 dirs = dirs[1:] 1564 try: 1565 fw = self.connect_ftp(user, passwd, host, port, dirs, req.timeout) 1566 type = file and 'I' or 'D' 1567 for attr in attrs: 1568 attr, value = _splitvalue(attr) 1569 if attr.lower() == 'type' and \ 1570 value in ('a', 'A', 'i', 'I', 'd', 'D'): 1571 type = value.upper() 1572 fp, retrlen = fw.retrfile(file, type) 1573 headers = "" 1574 mtype = mimetypes.guess_type(req.full_url)[0] 1575 if mtype: 1576 headers += "Content-type: %s\n" % mtype 1577 if retrlen is not None and retrlen 
>= 0: 1578 headers += "Content-length: %d\n" % retrlen 1579 headers = email.message_from_string(headers) 1580 return addinfourl(fp, headers, req.full_url) 1581 except ftplib.all_errors as exp: 1582 raise URLError(exp) from exp 1583 1584 def connect_ftp(self, user, passwd, host, port, dirs, timeout): 1585 return ftpwrapper(user, passwd, host, port, dirs, timeout, 1586 persistent=False) 1587 1588class CacheFTPHandler(FTPHandler): 1589 # XXX would be nice to have pluggable cache strategies 1590 # XXX this stuff is definitely not thread safe 1591 def __init__(self): 1592 self.cache = {} 1593 self.timeout = {} 1594 self.soonest = 0 1595 self.delay = 60 1596 self.max_conns = 16 1597 1598 def setTimeout(self, t): 1599 self.delay = t 1600 1601 def setMaxConns(self, m): 1602 self.max_conns = m 1603 1604 def connect_ftp(self, user, passwd, host, port, dirs, timeout): 1605 key = user, host, port, '/'.join(dirs), timeout 1606 if key in self.cache: 1607 self.timeout[key] = time.time() + self.delay 1608 else: 1609 self.cache[key] = ftpwrapper(user, passwd, host, port, 1610 dirs, timeout) 1611 self.timeout[key] = time.time() + self.delay 1612 self.check_cache() 1613 return self.cache[key] 1614 1615 def check_cache(self): 1616 # first check for old ones 1617 t = time.time() 1618 if self.soonest <= t: 1619 for k, v in list(self.timeout.items()): 1620 if v < t: 1621 self.cache[k].close() 1622 del self.cache[k] 1623 del self.timeout[k] 1624 self.soonest = min(list(self.timeout.values())) 1625 1626 # then check the size 1627 if len(self.cache) == self.max_conns: 1628 for k, v in list(self.timeout.items()): 1629 if v == self.soonest: 1630 del self.cache[k] 1631 del self.timeout[k] 1632 break 1633 self.soonest = min(list(self.timeout.values())) 1634 1635 def clear_cache(self): 1636 for conn in self.cache.values(): 1637 conn.close() 1638 self.cache.clear() 1639 self.timeout.clear() 1640 1641class DataHandler(BaseHandler): 1642 def data_open(self, req): 1643 # data URLs as specified in 
RFC 2397. 1644 # 1645 # ignores POSTed data 1646 # 1647 # syntax: 1648 # dataurl := "data:" [ mediatype ] [ ";base64" ] "," data 1649 # mediatype := [ type "/" subtype ] *( ";" parameter ) 1650 # data := *urlchar 1651 # parameter := attribute "=" value 1652 url = req.full_url 1653 1654 scheme, data = url.split(":",1) 1655 mediatype, data = data.split(",",1) 1656 1657 # even base64 encoded data URLs might be quoted so unquote in any case: 1658 data = unquote_to_bytes(data) 1659 if mediatype.endswith(";base64"): 1660 data = base64.decodebytes(data) 1661 mediatype = mediatype[:-7] 1662 1663 if not mediatype: 1664 mediatype = "text/plain;charset=US-ASCII" 1665 1666 headers = email.message_from_string("Content-type: %s\nContent-length: %d\n" % 1667 (mediatype, len(data))) 1668 1669 return addinfourl(io.BytesIO(data), headers, url) 1670 1671 1672# Code move from the old urllib module 1673 1674MAXFTPCACHE = 10 # Trim the ftp cache beyond this size 1675 1676# Helper for non-unix systems 1677if os.name == 'nt': 1678 from nturl2path import url2pathname, pathname2url 1679else: 1680 def url2pathname(pathname): 1681 """OS-specific conversion from a relative URL of the 'file' scheme 1682 to a file system path; not recommended for general use.""" 1683 return unquote(pathname) 1684 1685 def pathname2url(pathname): 1686 """OS-specific conversion from a file system path to a relative URL 1687 of the 'file' scheme; not recommended for general use.""" 1688 return quote(pathname) 1689 1690 1691ftpcache = {} 1692 1693 1694class URLopener: 1695 """Class to open URLs. 1696 This is a class rather than just a subroutine because we may need 1697 more than one set of global protocol-specific options. 
1698 Note -- this is a base class for those who don't want the 1699 automatic handling of errors type 302 (relocated) and 401 1700 (authorization needed).""" 1701 1702 __tempfiles = None 1703 1704 version = "Python-urllib/%s" % __version__ 1705 1706 # Constructor 1707 def __init__(self, proxies=None, **x509): 1708 msg = "%(class)s style of invoking requests is deprecated. " \ 1709 "Use newer urlopen functions/methods" % {'class': self.__class__.__name__} 1710 warnings.warn(msg, DeprecationWarning, stacklevel=3) 1711 if proxies is None: 1712 proxies = getproxies() 1713 assert hasattr(proxies, 'keys'), "proxies must be a mapping" 1714 self.proxies = proxies 1715 self.key_file = x509.get('key_file') 1716 self.cert_file = x509.get('cert_file') 1717 self.addheaders = [('User-Agent', self.version), ('Accept', '*/*')] 1718 self.__tempfiles = [] 1719 self.__unlink = os.unlink # See cleanup() 1720 self.tempcache = None 1721 # Undocumented feature: if you assign {} to tempcache, 1722 # it is used to cache files retrieved with 1723 # self.retrieve(). This is not enabled by default 1724 # since it does not work for changing documents (and I 1725 # haven't got the logic to check expiration headers 1726 # yet). 1727 self.ftpcache = ftpcache 1728 # Undocumented feature: you can use a different 1729 # ftp cache by assigning to the .ftpcache member; 1730 # in case you want logically independent URL openers 1731 # XXX This is not threadsafe. Bah. 1732 1733 def __del__(self): 1734 self.close() 1735 1736 def close(self): 1737 self.cleanup() 1738 1739 def cleanup(self): 1740 # This code sometimes runs when the rest of this module 1741 # has already been deleted, so it can't use any globals 1742 # or import anything. 
1743 if self.__tempfiles: 1744 for file in self.__tempfiles: 1745 try: 1746 self.__unlink(file) 1747 except OSError: 1748 pass 1749 del self.__tempfiles[:] 1750 if self.tempcache: 1751 self.tempcache.clear() 1752 1753 def addheader(self, *args): 1754 """Add a header to be used by the HTTP interface only 1755 e.g. u.addheader('Accept', 'sound/basic')""" 1756 self.addheaders.append(args) 1757 1758 # External interface 1759 def open(self, fullurl, data=None): 1760 """Use URLopener().open(file) instead of open(file, 'r').""" 1761 fullurl = unwrap(_to_bytes(fullurl)) 1762 fullurl = quote(fullurl, safe="%/:=&?~#+!$,;'@()*[]|") 1763 if self.tempcache and fullurl in self.tempcache: 1764 filename, headers = self.tempcache[fullurl] 1765 fp = open(filename, 'rb') 1766 return addinfourl(fp, headers, fullurl) 1767 urltype, url = _splittype(fullurl) 1768 if not urltype: 1769 urltype = 'file' 1770 if urltype in self.proxies: 1771 proxy = self.proxies[urltype] 1772 urltype, proxyhost = _splittype(proxy) 1773 host, selector = _splithost(proxyhost) 1774 url = (host, fullurl) # Signal special case to open_*() 1775 else: 1776 proxy = None 1777 name = 'open_' + urltype 1778 self.type = urltype 1779 name = name.replace('-', '_') 1780 if not hasattr(self, name) or name == 'open_local_file': 1781 if proxy: 1782 return self.open_unknown_proxy(proxy, fullurl, data) 1783 else: 1784 return self.open_unknown(fullurl, data) 1785 try: 1786 if data is None: 1787 return getattr(self, name)(url) 1788 else: 1789 return getattr(self, name)(url, data) 1790 except (HTTPError, URLError): 1791 raise 1792 except OSError as msg: 1793 raise OSError('socket error', msg) from msg 1794 1795 def open_unknown(self, fullurl, data=None): 1796 """Overridable interface to open unknown URL type.""" 1797 type, url = _splittype(fullurl) 1798 raise OSError('url error', 'unknown url type', type) 1799 1800 def open_unknown_proxy(self, proxy, fullurl, data=None): 1801 """Overridable interface to open unknown URL type.""" 
        type, url = _splittype(fullurl)
        raise OSError('url error', 'invalid proxy for %s' % type, proxy)

    # External interface
    def retrieve(self, url, filename=None, reporthook=None, data=None):
        """retrieve(url) returns (filename, headers) for a local object
        or (tempfilename, headers) for a remote object."""
        url = unwrap(_to_bytes(url))
        if self.tempcache and url in self.tempcache:
            return self.tempcache[url]
        type, url1 = _splittype(url)
        # Plain local files are answered straight from the filesystem
        # without copying into a temporary file.
        if filename is None and (not type or type == 'file'):
            try:
                fp = self.open_local_file(url1)
                hdrs = fp.info()
                fp.close()
                return url2pathname(_splithost(url1)[1]), hdrs
            except OSError:
                pass
        fp = self.open(url, data)
        try:
            headers = fp.info()
            if filename:
                tfp = open(filename, 'wb')
            else:
                # No target given: make a temp file whose suffix is taken
                # from the URL path so the file type stays recognizable.
                garbage, path = _splittype(url)
                garbage, path = _splithost(path or "")
                path, garbage = _splitquery(path or "")
                path, garbage = _splitattr(path or "")
                suffix = os.path.splitext(path)[1]
                (fd, filename) = tempfile.mkstemp(suffix)
                self.__tempfiles.append(filename)
                tfp = os.fdopen(fd, 'wb')
            try:
                result = filename, headers
                if self.tempcache is not None:
                    self.tempcache[url] = result
                bs = 1024*8
                size = -1
                read = 0
                blocknum = 0
                if "content-length" in headers:
                    size = int(headers["Content-Length"])
                if reporthook:
                    reporthook(blocknum, bs, size)
                # Copy in 8 KiB blocks, reporting progress per block.
                while 1:
                    block = fp.read(bs)
                    if not block:
                        break
                    read += len(block)
                    tfp.write(block)
                    blocknum += 1
                    if reporthook:
                        reporthook(blocknum, bs, size)
            finally:
                tfp.close()
        finally:
            fp.close()

        # raise exception if actual size does not match content-length header
        if size >= 0 and read < size:
            raise ContentTooShortError(
                "retrieval incomplete: got only %i out of %i bytes"
                % (read, size), result)

        return result

    #
Each method named open_<type> knows how to open that type of URL 1870 1871 def _open_generic_http(self, connection_factory, url, data): 1872 """Make an HTTP connection using connection_class. 1873 1874 This is an internal method that should be called from 1875 open_http() or open_https(). 1876 1877 Arguments: 1878 - connection_factory should take a host name and return an 1879 HTTPConnection instance. 1880 - url is the url to retrieval or a host, relative-path pair. 1881 - data is payload for a POST request or None. 1882 """ 1883 1884 user_passwd = None 1885 proxy_passwd= None 1886 if isinstance(url, str): 1887 host, selector = _splithost(url) 1888 if host: 1889 user_passwd, host = _splituser(host) 1890 host = unquote(host) 1891 realhost = host 1892 else: 1893 host, selector = url 1894 # check whether the proxy contains authorization information 1895 proxy_passwd, host = _splituser(host) 1896 # now we proceed with the url we want to obtain 1897 urltype, rest = _splittype(selector) 1898 url = rest 1899 user_passwd = None 1900 if urltype.lower() != 'http': 1901 realhost = None 1902 else: 1903 realhost, rest = _splithost(rest) 1904 if realhost: 1905 user_passwd, realhost = _splituser(realhost) 1906 if user_passwd: 1907 selector = "%s://%s%s" % (urltype, realhost, rest) 1908 if proxy_bypass(realhost): 1909 host = realhost 1910 1911 if not host: raise OSError('http error', 'no host given') 1912 1913 if proxy_passwd: 1914 proxy_passwd = unquote(proxy_passwd) 1915 proxy_auth = base64.b64encode(proxy_passwd.encode()).decode('ascii') 1916 else: 1917 proxy_auth = None 1918 1919 if user_passwd: 1920 user_passwd = unquote(user_passwd) 1921 auth = base64.b64encode(user_passwd.encode()).decode('ascii') 1922 else: 1923 auth = None 1924 http_conn = connection_factory(host) 1925 headers = {} 1926 if proxy_auth: 1927 headers["Proxy-Authorization"] = "Basic %s" % proxy_auth 1928 if auth: 1929 headers["Authorization"] = "Basic %s" % auth 1930 if realhost: 1931 headers["Host"] = 
realhost

        # Add Connection:close as we don't support persistent connections yet.
        # This helps in closing the socket and avoiding ResourceWarning

        headers["Connection"] = "close"

        # User-supplied headers (self.addheaders) are applied last, so they
        # override anything computed above.
        for header, value in self.addheaders:
            headers[header] = value

        if data is not None:
            # Legacy API: POST bodies are always sent form-encoded.
            headers["Content-Type"] = "application/x-www-form-urlencoded"
            http_conn.request("POST", selector, data, headers)
        else:
            http_conn.request("GET", selector, headers=headers)

        try:
            response = http_conn.getresponse()
        except http.client.BadStatusLine:
            # something went wrong with the HTTP status line
            raise URLError("http protocol error: bad status line")

        # According to RFC 2616, "2xx" code indicates that the client's
        # request was successfully received, understood, and accepted.
        if 200 <= response.status < 300:
            return addinfourl(response, response.msg, "http:" + url,
                              response.status)
        else:
            # Non-2xx (including redirects) is routed through the
            # http_error dispatcher below.
            return self.http_error(
                url, response.fp,
                response.status, response.reason, response.msg, data)

    def open_http(self, url, data=None):
        """Use HTTP protocol."""
        return self._open_generic_http(http.client.HTTPConnection, url, data)

    def http_error(self, url, fp, errcode, errmsg, headers, data=None):
        """Handle http errors.

        Derived class can override this, or provide specific handlers
        named http_error_DDD where DDD is the 3-digit error code."""
        # First check if there's a specific handler for this error
        name = 'http_error_%d' % errcode
        if hasattr(self, name):
            method = getattr(self, name)
            if data is None:
                result = method(url, fp, errcode, errmsg, headers)
            else:
                result = method(url, fp, errcode, errmsg, headers, data)
            # A falsy result means the specific handler declined to handle
            # the error; fall through to the default handler.
            if result: return result
        return self.http_error_default(url, fp, errcode, errmsg, headers)

    def http_error_default(self, url, fp, errcode, errmsg, headers):
        """Default error handler: close the connection and raise OSError."""
        fp.close()
        # HTTPError subclasses OSError, so the docstring's contract holds
        # for callers that catch OSError.
        raise HTTPError(url, errcode, errmsg, headers, None)

    if _have_ssl:
        # These two methods only exist when the ssl module was importable
        # at module load time.
        def _https_connection(self, host):
            return http.client.HTTPSConnection(host,
                                           key_file=self.key_file,
                                           cert_file=self.cert_file)

        def open_https(self, url, data=None):
            """Use HTTPS protocol."""
            return self._open_generic_http(self._https_connection, url, data)

    def open_file(self, url):
        """Use local file or FTP depending on form of URL."""
        if not isinstance(url, str):
            raise URLError('file error: proxy support for file protocol currently not implemented')
        # Reject file://host/... for any authority other than localhost.
        if url[:2] == '//' and url[2:3] != '/' and url[2:12].lower() != 'localhost/':
            raise ValueError("file:// scheme is supported only on localhost")
        else:
            return self.open_local_file(url)

    def open_local_file(self, url):
        """Use local file."""
        import email.utils
        import mimetypes
        host, file = _splithost(url)
        localname = url2pathname(file)
        try:
            stats = os.stat(localname)
        except OSError as e:
            raise URLError(e.strerror, e.filename)
        size = stats.st_size
        modified = email.utils.formatdate(stats.st_mtime, usegmt=True)
        mtype = mimetypes.guess_type(url)[0]
        # Synthesize HTTP-style response headers for the local file.
        headers = email.message_from_string(
            'Content-Type: %s\nContent-Length: %d\nLast-modified: %s\n' %
            (mtype or 'text/plain', size, modified))
        if not host:
            urlfile = file
            if file[:1] == '/':
                urlfile = 'file://' + file
            return addinfourl(open(localname, 'rb'), headers, urlfile)
        host, port = _splitport(host)
        # An explicit host part is only acceptable when it resolves to
        # this machine (and no port is given).
        if (not port
           and socket.gethostbyname(host) in ((localhost(),) + thishost())):
            urlfile = file
            if file[:1] == '/':
                urlfile = 'file://' + file
            elif file[:2] == './':
                raise ValueError("local file url may start with / or file:. Unknown url of type: %s" % url)
            return addinfourl(open(localname, 'rb'), headers, urlfile)
        raise URLError('local file error: not on local host')

    def open_ftp(self, url):
        """Use FTP protocol."""
        if not isinstance(url, str):
            raise URLError('ftp error: proxy support for ftp protocol currently not implemented')
        import mimetypes
        host, path = _splithost(url)
        if not host: raise URLError('ftp error: no host given')
        host, port = _splitport(host)
        user, host = _splituser(host)
        if user: user, passwd = _splitpasswd(user)
        else: passwd = None
        host = unquote(host)
        user = unquote(user or '')
        passwd = unquote(passwd or '')
        host = socket.gethostbyname(host)
        if not port:
            import ftplib
            port = ftplib.FTP_PORT
        else:
            port = int(port)
        path, attrs = _splitattr(path)
        path = unquote(path)
        dirs = path.split('/')
        dirs, file = dirs[:-1], dirs[-1]
        # Normalize directory components: drop the empty leading component
        # of an absolute path, remembering absoluteness via a '/' entry.
        if dirs and not dirs[0]: dirs = dirs[1:]
        if dirs and not dirs[0]: dirs[0] = '/'
        # Open connections are cached per (user, host, port, directory).
        key = user, host, port, '/'.join(dirs)
        # XXX thread unsafe!
        if len(self.ftpcache) > MAXFTPCACHE:
            # Prune the cache, rather arbitrarily
            for k in list(self.ftpcache):
                if k != key:
                    v = self.ftpcache[k]
                    del self.ftpcache[k]
                    v.close()
        try:
            if key not in self.ftpcache:
                self.ftpcache[key] = \
                    ftpwrapper(user, passwd, host, port, dirs)
            # Default transfer type: 'D' (directory listing) when no file
            # name is present, binary 'I' otherwise; a ";type=x" URL
            # attribute overrides this below.
            if not file: type = 'D'
            else: type = 'I'
            for attr in attrs:
                attr, value = _splitvalue(attr)
                if attr.lower() == 'type' and \
                   value in ('a', 'A', 'i', 'I', 'd', 'D'):
                    type = value.upper()
            (fp, retrlen) = self.ftpcache[key].retrfile(file, type)
            mtype = mimetypes.guess_type("ftp:" + url)[0]
            headers = ""
            if mtype:
                headers += "Content-Type: %s\n" % mtype
            if retrlen is not None and retrlen >= 0:
                headers += "Content-Length: %d\n" % retrlen
            headers = email.message_from_string(headers)
            return addinfourl(fp, headers, "ftp:" + url)
        except ftperrors() as exp:
            raise URLError(f'ftp error: {exp}') from exp

    def open_data(self, url, data=None):
        """Use "data" URL."""
        if not isinstance(url, str):
            raise URLError('data error: proxy support for data protocol currently not implemented')
        # ignore POSTed data
        #
        # syntax of data URLs:
        # dataurl   := "data:" [ mediatype ] [ ";base64" ] "," data
        # mediatype := [ type "/" subtype ] *( ";" parameter )
        # data      := *urlchar
        # parameter := attribute "=" value
        try:
            [type, data] = url.split(',', 1)
        except ValueError:
            raise OSError('data error', 'bad data URL')
        if not type:
            type = 'text/plain;charset=US-ASCII'
        # If the last ';'-separated field has no '=', it is the transfer
        # encoding (e.g. ";base64"), not a mediatype parameter.
        semi = type.rfind(';')
        if semi >= 0 and '=' not in type[semi:]:
            encoding = type[semi+1:]
            type = type[:semi]
        else:
            encoding = ''
        # Build a synthetic RFC 822-style response: headers, blank line,
        # then the decoded body.
        msg = []
        msg.append('Date: %s'%time.strftime('%a, %d %b %Y %H:%M:%S GMT',
                                            time.gmtime(time.time())))
        msg.append('Content-type: %s' % type)
        if encoding == 'base64':
            # XXX is this encoding/decoding ok?
            data = base64.decodebytes(data.encode('ascii')).decode('latin-1')
        else:
            data = unquote(data)
        msg.append('Content-Length: %d' % len(data))
        msg.append('')
        msg.append(data)
        msg = '\n'.join(msg)
        headers = email.message_from_string(msg)
        f = io.StringIO(msg)
        #f.fileno = None     # needed for addinfourl
        return addinfourl(f, headers, url)


class FancyURLopener(URLopener):
    """Derived class with handlers for errors we can handle (perhaps)."""

    def __init__(self, *args, **kwargs):
        URLopener.__init__(self, *args, **kwargs)
        self.auth_cache = {}  # maps "realm@host" -> (user, password)
        self.tries = 0        # redirects followed for the current request
        self.maxtries = 10    # redirect limit before giving up

    def http_error_default(self, url, fp, errcode, errmsg, headers):
        """Default error handling -- don't raise an exception."""
        # Unlike the base class, hand the error response back to the
        # caller as a regular addinfourl result.
        return addinfourl(fp, headers, "http:" + url, errcode)

    def http_error_302(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 302 -- relocated (temporarily)."""
        self.tries += 1
        try:
            # Treat a redirect loop as a synthetic 500 once maxtries is
            # reached; the counter accumulates across the recursive chain
            # (redirect_internal -> self.open -> http_error_302).
            if self.maxtries and self.tries >= self.maxtries:
                if hasattr(self, "http_error_500"):
                    meth = self.http_error_500
                else:
                    meth = self.http_error_default
                return meth(url, fp, 500,
                            "Internal Server Error: Redirect Recursion",
                            headers)
            result = self.redirect_internal(url, fp, errcode, errmsg,
                                            headers, data)
            return result
        finally:
            self.tries = 0

    def redirect_internal(self, url, fp, errcode, errmsg, headers, data):
        # Prefer the standard Location header, falling back to the
        # obsolete URI header; no target means give up (returns None).
        if 'location' in headers:
            newurl = headers['location']
        elif 'uri' in headers:
            newurl = headers['uri']
        else:
            return
        fp.close()

        # In case the server sent a relative URL, join with original:
        newurl = urljoin(self.type + ":" + url, newurl)

        urlparts = urlparse(newurl)

        # For security reasons, we don't allow redirection to anything other
        # than http, https and ftp.

        # We are using newer HTTPError with older redirect_internal method
        # This older method will get deprecated in 3.3

        if urlparts.scheme not in ('http', 'https', 'ftp', ''):
            raise HTTPError(newurl, errcode,
                            errmsg +
                            " Redirection to url '%s' is not allowed." % newurl,
                            headers, fp)

        return self.open(newurl)

    def http_error_301(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 301 -- also relocated (permanently)."""
        return self.http_error_302(url, fp, errcode, errmsg, headers, data)

    def http_error_303(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 303 -- also relocated (essentially identical to 302)."""
        return self.http_error_302(url, fp, errcode, errmsg, headers, data)

    def http_error_307(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 307 -- relocated, but turn POST into error."""
        # 307 must not change the request method; this legacy opener
        # cannot safely replay a POST body, so only GETs are redirected.
        if data is None:
            return self.http_error_302(url, fp, errcode, errmsg, headers, data)
        else:
            return self.http_error_default(url, fp, errcode, errmsg, headers)

    def http_error_308(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 308 -- relocated, but turn POST into error."""
        # Same method-preserving rule as 307, but permanent (like 301).
        if data is None:
            return self.http_error_301(url, fp, errcode, errmsg, headers, data)
        else:
            return self.http_error_default(url, fp, errcode, errmsg, headers)

    def http_error_401(self, url, fp, errcode, errmsg, headers, data=None,
                       retry=False):
        """Error 401 -- authentication required.
        This function supports Basic authentication only."""
        # Each failed precondition delegates to the base class default,
        # which raises HTTPError, so control never reaches the code below.
        if 'www-authenticate' not in headers:
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        stuff = headers['www-authenticate']
        match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff)
        if not match:
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        scheme, realm = match.groups()
        if scheme.lower() != 'basic':
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        if not retry:
            URLopener.http_error_default(self, url, fp, errcode, errmsg,
                                         headers)
        # self.type is the scheme of the current request ('http'/'https'),
        # selecting retry_http_basic_auth or retry_https_basic_auth.
        name = 'retry_' + self.type + '_basic_auth'
        if data is None:
            return getattr(self,name)(url, realm)
        else:
            return getattr(self,name)(url, realm, data)

    def http_error_407(self, url, fp, errcode, errmsg, headers, data=None,
                       retry=False):
        """Error 407 -- proxy authentication required.
        This function supports Basic authentication only."""
        # Mirrors http_error_401, but keyed on Proxy-Authenticate and
        # dispatching to the retry_proxy_* helpers.
        if 'proxy-authenticate' not in headers:
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        stuff = headers['proxy-authenticate']
        match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff)
        if not match:
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        scheme, realm = match.groups()
        if scheme.lower() != 'basic':
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        if not retry:
            URLopener.http_error_default(self, url, fp, errcode, errmsg,
                                         headers)
        name = 'retry_proxy_' + self.type + '_basic_auth'
        if data is None:
            return getattr(self,name)(url, realm)
        else:
            return getattr(self,name)(url, realm, data)

    def retry_proxy_http_basic_auth(self, url, realm, data=None):
        host, selector = _splithost(url)
        newurl = 'http://' + host + selector
        proxy = self.proxies['http']
        urltype, proxyhost = _splittype(proxy)
        proxyhost, proxyselector = _splithost(proxyhost)
        # Strip credentials already embedded in the proxy URL; a non-zero
        # index also tells get_user_passwd to drop the stale cache entry.
        i = proxyhost.find('@') + 1
        proxyhost = proxyhost[i:]
        user, passwd = self.get_user_passwd(proxyhost, realm, i)
        if not (user or passwd): return None
        proxyhost = "%s:%s@%s" % (quote(user, safe=''),
                                  quote(passwd, safe=''), proxyhost)
        # Install the authenticated proxy URL, then retry the request.
        self.proxies['http'] = 'http://' + proxyhost + proxyselector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def retry_proxy_https_basic_auth(self, url, realm, data=None):
        host, selector = _splithost(url)
        newurl = 'https://' + host + selector
        proxy = self.proxies['https']
        urltype, proxyhost = _splittype(proxy)
        proxyhost, proxyselector = _splithost(proxyhost)
        i = proxyhost.find('@') + 1
        proxyhost = proxyhost[i:]
        user, passwd = self.get_user_passwd(proxyhost, realm, i)
        if not (user or passwd): return None
        proxyhost = "%s:%s@%s" % (quote(user, safe=''),
                                  quote(passwd, safe=''), proxyhost)
        self.proxies['https'] = 'https://' + proxyhost + proxyselector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def retry_http_basic_auth(self, url, realm, data=None):
        host, selector = _splithost(url)
        # Drop credentials already present in the URL; a non-zero index
        # also clears the rejected cached entry in get_user_passwd.
        i = host.find('@') + 1
        host = host[i:]
        user, passwd = self.get_user_passwd(host, realm, i)
        if not (user or passwd): return None
        host = "%s:%s@%s" % (quote(user, safe=''),
                             quote(passwd, safe=''), host)
        newurl = 'http://' + host + selector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def retry_https_basic_auth(self, url, realm, data=None):
        host, selector = _splithost(url)
        i = host.find('@') + 1
        host = host[i:]
        user, passwd = self.get_user_passwd(host, realm, i)
        if not (user or passwd): return None
        host = "%s:%s@%s" % (quote(user, safe=''),
                             quote(passwd, safe=''), host)
        newurl = 'https://' + host + selector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def get_user_passwd(self, host, realm, clear_cache=0):
        key = realm + '@' + host.lower()
        if key in self.auth_cache:
            if clear_cache:
                # Previous credentials were rejected; forget them and
                # prompt again.
                del self.auth_cache[key]
            else:
                return self.auth_cache[key]
        user, passwd = self.prompt_user_passwd(host, realm)
        if user or passwd: self.auth_cache[key] = (user, passwd)
        return user, passwd

    def prompt_user_passwd(self, host, realm):
        """Override this in a GUI environment!"""
        import getpass
        try:
            user = input("Enter username for %s at %s: " % (realm, host))
            passwd = getpass.getpass("Enter password for %s in %s at %s: " %
                (user, realm, host))
            return user, passwd
        except KeyboardInterrupt:
            print()
            return None, None


# Utility functions

# Each helper below resolves its value once and caches it in a module
# global for the life of the process.

_localhost = None
def localhost():
    """Return the IP address of the magic hostname 'localhost'."""
    global _localhost
    if _localhost is None:
        _localhost = socket.gethostbyname('localhost')
    return _localhost

_thishost = None
def thishost():
    """Return the IP addresses of the current host."""
    global _thishost
    if _thishost is None:
        try:
            _thishost = tuple(socket.gethostbyname_ex(socket.gethostname())[2])
        except socket.gaierror:
            # The machine's own hostname doesn't resolve; fall back to
            # whatever 'localhost' maps to.
            _thishost = tuple(socket.gethostbyname_ex('localhost')[2])
    return _thishost

_ftperrors = None
def ftperrors():
    """Return the set of errors raised by the FTP class."""
    global _ftperrors
    if _ftperrors is None:
        import ftplib
        _ftperrors = ftplib.all_errors
    return _ftperrors

_noheaders = None
def noheaders():
    """Return an empty email Message object."""
    global _noheaders
    if _noheaders is None:
        _noheaders = email.message_from_string("")
    return _noheaders


# Utility classes

class ftpwrapper:
    """Class used by open_ftp() for cache of open FTP connections."""

    def __init__(self, user, passwd, host, port, dirs, timeout=None,
                 persistent=True):
        self.user = user
        self.passwd = passwd
        self.host = host
        self.port = port
        self.dirs = dirs
        self.timeout = timeout
        # refcount counts file objects handed out by retrfile(); the
        # connection is only truly closed once it drops back to zero.
        self.refcount = 0
        self.keepalive = persistent
        try:
            self.init()
        except:
            # Connection/login failed half-way: release what we grabbed,
            # then re-raise for the caller.
            self.close()
            raise

    def init(self):
        import ftplib
        self.busy = 0
        self.ftp = ftplib.FTP()
        self.ftp.connect(self.host, self.port, self.timeout)
        self.ftp.login(self.user, self.passwd)
        _target = '/'.join(self.dirs)
        self.ftp.cwd(_target)

    def retrfile(self, file, type):
        import ftplib
        self.endtransfer()
        if type in ('d', 'D'): cmd = 'TYPE A'; isdir = 1
        else: cmd = 'TYPE ' + type; isdir = 0
        try:
            self.ftp.voidcmd(cmd)
        except ftplib.all_errors:
            # The cached connection may have dropped; reconnect once.
            self.init()
            self.ftp.voidcmd(cmd)
        conn = None
        if file and not isdir:
            # Try to retrieve as a file
            try:
                cmd = 'RETR ' + file
                conn, retrlen = self.ftp.ntransfercmd(cmd)
            except ftplib.error_perm as reason:
                # 550 means "not a plain file" (fall through to a
                # directory listing); anything else is fatal.
                if str(reason)[:3] != '550':
                    raise URLError(f'ftp error: {reason}') from reason
        if not conn:
            # Set transfer mode to ASCII!
            self.ftp.voidcmd('TYPE A')
            # Try a directory listing. Verify that directory exists.
            if file:
                pwd = self.ftp.pwd()
                try:
                    try:
                        self.ftp.cwd(file)
                    except ftplib.error_perm as reason:
                        raise URLError('ftp error: %r' % reason) from reason
                finally:
                    # Always restore the working directory.
                    self.ftp.cwd(pwd)
                cmd = 'LIST ' + file
            else:
                cmd = 'LIST'
            conn, retrlen = self.ftp.ntransfercmd(cmd)
        self.busy = 1

        # file_close runs when the returned object is closed, decrementing
        # refcount and possibly closing the whole connection.
        ftpobj = addclosehook(conn.makefile('rb'), self.file_close)
        self.refcount += 1
        conn.close()
        # Pass back both a suitably decorated object and a retrieval length
        return (ftpobj, retrlen)

    def endtransfer(self):
        if not self.busy:
            return
        self.busy = 0
        try:
            # Drain the end-of-transfer response; best-effort only.
            self.ftp.voidresp()
        except ftperrors():
            pass

    def close(self):
        self.keepalive = False
        if self.refcount <= 0:
            self.real_close()

    def file_close(self):
        self.endtransfer()
        self.refcount -= 1
        if self.refcount <= 0 and not self.keepalive:
            self.real_close()

    def real_close(self):
        self.endtransfer()
        try:
            self.ftp.close()
        except ftperrors():
            pass

# Proxy handling
def getproxies_environment():
    """Return a dictionary of scheme -> proxy server URL mappings.

    Scan the environment for variables named <scheme>_proxy;
    this seems to be the standard convention.  If you need a
    different way, you can pass a proxies dictionary to the
    [Fancy]URLopener constructor.

    """
    proxies = {}
    # in order to prefer lowercase variables, process environment in
    # two passes: first matches any, second pass matches lowercase only
    for name, value in os.environ.items():
        name = name.lower()
        if value and name[-6:] == '_proxy':
            proxies[name[:-6]] = value
    # CVE-2016-1000110 - If we are running as CGI script, forget HTTP_PROXY
    # (non-all-lowercase) as it may be set from the web server by a "Proxy:"
    # header from the client
    # If "proxy" is lowercase, it will still be used thanks to the next block
    if 'REQUEST_METHOD' in os.environ:
        proxies.pop('http', None)
    for name, value in os.environ.items():
        if name[-6:] == '_proxy':
            name = name.lower()
            if value:
                proxies[name[:-6]] = value
            else:
                # An explicitly empty <scheme>_proxy disables that proxy.
                proxies.pop(name[:-6], None)
    return proxies

def proxy_bypass_environment(host, proxies=None):
    """Test if proxies should not be used for a particular host.

    Checks the proxy dict for the value of no_proxy, which should
    be a list of comma separated DNS suffixes, or '*' for all hosts.

    """
    if proxies is None:
        proxies = getproxies_environment()
    # don't bypass, if no_proxy isn't specified
    try:
        no_proxy = proxies['no']
    except KeyError:
        return False
    # '*' is special case for always bypass
    if no_proxy == '*':
        return True
    host = host.lower()
    # strip port off host
    hostonly, port = _splitport(host)
    # check if the host ends with any of the DNS suffixes
    for name in no_proxy.split(','):
        name = name.strip()
        if name:
            name = name.lstrip('.')  # ignore leading dots
            name = name.lower()
            # Match both with and without the port suffix.
            if hostonly == name or host == name:
                return True
            name = '.' + name
            if hostonly.endswith(name) or host.endswith(name):
                return True
    # otherwise, don't bypass
    return False


# This code tests an OSX specific data structure but is testable on all
# platforms
def _proxy_bypass_macosx_sysconf(host, proxy_settings):
    """
    Return True iff this host shouldn't be accessed using a proxy

    This function uses the MacOSX framework SystemConfiguration
    to fetch the proxy information.

    proxy_settings come from _scproxy._get_proxy_settings or get mocked ie:
    { 'exclude_simple': bool,
      'exceptions': ['foo.bar', '*.bar.com', '127.0.0.1', '10.1', '10.0/16']
    }
    """
    from fnmatch import fnmatch

    hostonly, port = _splitport(host)

    def ip2num(ipAddr):
        # Pack a (possibly partial) dotted quad into a 32-bit integer,
        # padding missing trailing octets with zeros.
        parts = ipAddr.split('.')
        parts = list(map(int, parts))
        if len(parts) != 4:
            parts = (parts + [0, 0, 0, 0])[:4]
        return (parts[0] << 24) | (parts[1] << 16) | (parts[2] << 8) | parts[3]

    # Check for simple host names:
    if '.' not in host:
        if proxy_settings['exclude_simple']:
            return True

    hostIP = None

    for value in proxy_settings.get('exceptions', ()):
        # Items in the list are strings like these: *.local, 169.254/16
        if not value: continue

        m = re.match(r"(\d+(?:\.\d+)*)(/\d+)?", value)
        if m is not None:
            # Numeric entry: compare the resolved host address against
            # the exception network; resolve lazily, at most once.
            if hostIP is None:
                try:
                    hostIP = socket.gethostbyname(hostonly)
                    hostIP = ip2num(hostIP)
                except OSError:
                    continue

            base = ip2num(m.group(1))
            mask = m.group(2)
            if mask is None:
                # No explicit prefix: cover exactly the octets given
                # (e.g. "10.1" -> /16).
                mask = 8 * (m.group(1).count('.') + 1)
            else:
                mask = int(mask[1:])

            if mask < 0 or mask > 32:
                # System libraries ignore invalid prefix lengths
                continue

            mask = 32 - mask

            if (hostIP >> mask) == (base >> mask):
                return True

        elif fnmatch(host, value):
            return True

    return False


if sys.platform == 'darwin':
    from _scproxy import _get_proxy_settings, _get_proxies

    def proxy_bypass_macosx_sysconf(host):
        proxy_settings = _get_proxy_settings()
        return _proxy_bypass_macosx_sysconf(host, proxy_settings)

    def getproxies_macosx_sysconf():
        """Return a dictionary of scheme -> proxy server URL mappings.

        This function uses the MacOSX framework SystemConfiguration
        to fetch the proxy information.
        """
        return _get_proxies()


    def proxy_bypass(host):
        """Return True, if host should be bypassed.

        Checks proxy settings gathered from the environment, if specified,
        or from the MacOSX framework SystemConfiguration.

        """
        proxies = getproxies_environment()
        if proxies:
            return proxy_bypass_environment(host, proxies)
        else:
            return proxy_bypass_macosx_sysconf(host)

    def getproxies():
        return getproxies_environment() or getproxies_macosx_sysconf()


elif os.name == 'nt':
    def getproxies_registry():
        """Return a dictionary of scheme -> proxy server URL mappings.

        Win32 uses the registry to store proxies.

        """
        proxies = {}
        try:
            import winreg
        except ImportError:
            # Std module, so should be around - but you never know!
            return proxies
        try:
            internetSettings = winreg.OpenKey(winreg.HKEY_CURRENT_USER,
                r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
            proxyEnable = winreg.QueryValueEx(internetSettings,
                                              'ProxyEnable')[0]
            if proxyEnable:
                # Returned as Unicode but problems if not converted to ASCII
                proxyServer = str(winreg.QueryValueEx(internetSettings,
                                                      'ProxyServer')[0])
                if '=' not in proxyServer and ';' not in proxyServer:
                    # Use one setting for all protocols.
                    proxyServer = 'http={0};https={0};ftp={0}'.format(proxyServer)
                for p in proxyServer.split(';'):
                    protocol, address = p.split('=', 1)
                    # See if address has a type:// prefix
                    if not re.match('(?:[^/:]+)://', address):
                        # Add type:// prefix to address without specifying type
                        if protocol in ('http', 'https', 'ftp'):
                            # The default proxy type of Windows is HTTP
                            address = 'http://' + address
                        elif protocol == 'socks':
                            address = 'socks://' + address
                    proxies[protocol] = address
                # Use SOCKS proxy for HTTP(S) protocols
                if proxies.get('socks'):
                    # The default SOCKS proxy type of Windows is SOCKS4
                    address = re.sub(r'^socks://', 'socks4://', proxies['socks'])
                    proxies['http'] = proxies.get('http') or address
                    proxies['https'] = proxies.get('https') or address
            internetSettings.Close()
        except (OSError, ValueError, TypeError):
            # Either registry key not found etc, or the value in an
            # unexpected format.
            # proxies already set up to be empty so nothing to do
            pass
        return proxies

    def getproxies():
        """Return a dictionary of scheme -> proxy server URL mappings.

        Returns settings gathered from the environment, if specified,
        or the registry.

        """
        return getproxies_environment() or getproxies_registry()

    def proxy_bypass_registry(host):
        # Returns 1 when the registry's ProxyOverride list says this host
        # should bypass the proxy, 0 otherwise.
        try:
            import winreg
        except ImportError:
            # Std modules, so should be around - but you never know!
            return 0
        try:
            internetSettings = winreg.OpenKey(winreg.HKEY_CURRENT_USER,
                r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
            proxyEnable = winreg.QueryValueEx(internetSettings,
                                              'ProxyEnable')[0]
            proxyOverride = str(winreg.QueryValueEx(internetSettings,
                                                    'ProxyOverride')[0])
            # ^^^^ Returned as Unicode but problems if not converted to ASCII
        except OSError:
            return 0
        if not proxyEnable or not proxyOverride:
            return 0
        # try to make a host list from name and IP address.
        rawHost, port = _splitport(host)
        host = [rawHost]
        try:
            addr = socket.gethostbyname(rawHost)
            if addr != rawHost:
                host.append(addr)
        except OSError:
            pass
        try:
            fqdn = socket.getfqdn(rawHost)
            if fqdn != rawHost:
                host.append(fqdn)
        except OSError:
            pass
        # make a check value list from the registry entry: replace the
        # '<local>' string by the localhost entry and the corresponding
        # canonical entry.
        proxyOverride = proxyOverride.split(';')
        # now check if we match one of the registry values.
        for test in proxyOverride:
            if test == '<local>':
                # '<local>' matches any dot-less (intranet) host name.
                if '.' not in rawHost:
                    return 1
            # Translate the glob pattern into a regular expression.
            test = test.replace(".", r"\.")     # mask dots
            test = test.replace("*", r".*")     # change glob sequence
            test = test.replace("?", r".")      # change glob char
            for val in host:
                if re.match(test, val, re.I):
                    return 1
        return 0

    def proxy_bypass(host):
        """Return True, if host should be bypassed.

        Checks proxy settings gathered from the environment, if specified,
        or the registry.

        """
        proxies = getproxies_environment()
        if proxies:
            return proxy_bypass_environment(host, proxies)
        else:
            return proxy_bypass_registry(host)

else:
    # By default use environment variables
    getproxies = getproxies_environment
    proxy_bypass = proxy_bypass_environment