1 """Open an arbitrary URL.
3 See the following document for more info on URLs:
4 "Names and Addresses, URIs, URLs, URNs, URCs", at
5 http://www.w3.org/pub/WWW/Addressing/Overview.html
7 See also the HTTP spec (from which the error codes are derived):
8 "HTTP - Hypertext Transfer Protocol", at
9 http://www.w3.org/pub/WWW/Protocols/
11 Related standards and specs:
12 - RFC1808: the "relative URL" spec. (authoritative status)
13 - RFC1738 - the "URL standard". (authoritative status)
14 - RFC1630 - the "URI spec". (informational status)
16 The object returned by URLopener().open(file) will differ per
17 protocol. All you know is that is has methods read(), readline(),
18 readlines(), fileno(), close() and info(). The read*(), fileno()
19 and close() methods work like those of open files.
20 The info() method returns a mimetools.Message object which can be
21 used to query various info about the object, if available.
(mimetools.Message objects are queried with the getheader() method.)
"""
# Public, documented API of this module; the split*() helpers are
# historically public as well, hence their presence here.
__all__ = ["urlopen", "URLopener", "FancyURLopener", "urlretrieve",
           "urlcleanup", "quote", "quote_plus", "unquote", "unquote_plus",
           "urlencode", "url2pathname", "pathname2url", "splittag",
           "localhost", "thishost", "ftperrors", "basejoin", "unwrap",
           "splittype", "splithost", "splituser", "splitpasswd", "splitport",
           "splitnport", "splitquery", "splitattr", "splitvalue",
           "splitgophertype", "getproxies"]

__version__ = '1.15' # XXX This version is not always updated :-(

MAXFTPCACHE = 10 # Trim the ftp cache beyond this size
# Helper for non-unix systems
# NOTE(review): the surrounding 'if os.name == ...:' / 'elif' / 'else:'
# framing lines are not visible in this excerpt; only the per-platform
# import bodies and the generic fallback definitions remain.
    from macurl2path import url2pathname, pathname2url
    from nturl2path import url2pathname, pathname2url
elif os.name == 'riscos':
    from rourl2path import url2pathname, pathname2url
    # Generic fallback: the URL path *is* the file path, modulo %-quoting.
    def url2pathname(pathname):
        return unquote(pathname)
    def pathname2url(pathname):
        return quote(pathname)
58 # This really consists of two pieces:
59 # (1) a class which handles opening of all sorts of URLs
60 # (plus assorted utilities etc.)
61 # (2) a set of functions for parsing URLs
62 # XXX Should these be separated out into different modules?
65 # Shortcut for basic usage
def urlopen(url, data=None):
    """urlopen(url [, data]) -> open file-like object"""
    # Uses a shared module-level opener; *data*, when given, is passed on
    # to open() for handlers that support POST-style requests.
    # NOTE(review): the lazy-creation guard around the shared opener
    # appears truncated in this excerpt.
        _urlopener = FancyURLopener()
        return _urlopener.open(url)
        return _urlopener.open(url, data)

def urlretrieve(url, filename=None, reporthook=None, data=None):
    # Copy a network object to a local file via the shared opener.
        _urlopener = FancyURLopener()
    return _urlopener.retrieve(url, filename, reporthook, data)
    """Class to open URLs.
    This is a class rather than just a subroutine because we may need
    more than one set of global protocol-specific options.
    Note -- this is a base class for those who don't want the
    automatic handling of errors type 302 (relocated) and 401
    (authorization needed)."""

    # Default User-agent version string advertised to servers.
    version = "Python-urllib/%s" % __version__

    def __init__(self, proxies=None, **x509):
        # proxies: mapping of scheme -> proxy URL; when omitted, the
        # environment/platform settings from getproxies() are used.
            proxies = getproxies()
        assert hasattr(proxies, 'has_key'), "proxies must be a mapping"
        self.proxies = proxies
        # Optional client certificate material for HTTPS (via **x509).
        self.key_file = x509.get('key_file')
        self.cert_file = x509.get('cert_file')
        self.addheaders = [('User-agent', 'Servus/0.2')]
        self.__tempfiles = []
        self.__unlink = os.unlink # See cleanup()
        self.tempcache = None
        # Undocumented feature: if you assign {} to tempcache,
        # it is used to cache files retrieved with
        # self.retrieve(). This is not enabled by default
        # since it does not work for changing documents (and I
        # haven't got the logic to check expiration headers
        self.ftpcache = ftpcache
        # Undocumented feature: you can use a different
        # ftp cache by assigning to the .ftpcache member;
        # in case you want logically independent URL openers
        # XXX This is not threadsafe. Bah.

        # (cleanup / __del__ fragment) — removes temp files created by
        # retrieve().  NOTE(review): the method headers and the try/except
        # around unlink appear to be missing from this excerpt.
        # This code sometimes runs when the rest of this module
        # has already been deleted, so it can't use any globals
        # or import anything.
        for file in self.__tempfiles:
        del self.__tempfiles[:]
            self.tempcache.clear()
143 def addheader(self, *args):
144 """Add a header to be used by the HTTP interface only
145 e.g. u.addheader('Accept', 'sound/basic')"""
146 self.addheaders.append(args)
    def open(self, fullurl, data=None):
        """Use URLopener().open(file) instead of open(file, 'r').

        Dispatches on the URL scheme to an open_<scheme>() method,
        routing through a configured proxy when one matches the scheme.
        """
        fullurl = unwrap(toBytes(fullurl))
        # Serve from the (optional) retrieve() cache when enabled.
        if self.tempcache and self.tempcache.has_key(fullurl):
            filename, headers = self.tempcache[fullurl]
            fp = open(filename, 'rb')
            return addinfourl(fp, headers, fullurl)
        urltype, url = splittype(fullurl)
        if self.proxies.has_key(urltype):
            proxy = self.proxies[urltype]
            urltype, proxyhost = splittype(proxy)
            host, selector = splithost(proxyhost)
            url = (host, fullurl) # Signal special case to open_*()
        name = 'open_' + urltype
        # Scheme names may contain '-', which is illegal in a method name.
        name = '_'.join(name.split('-'))
        if not hasattr(self, name):
                return self.open_unknown_proxy(proxy, fullurl, data)
                return self.open_unknown(fullurl, data)
            return getattr(self, name)(url)
            return getattr(self, name)(url, data)
        except socket.error, msg:
            # Surface socket problems as IOError, keeping the traceback.
            raise IOError, ('socket error', msg), sys.exc_info()[2]
184 def open_unknown(self, fullurl, data=None):
185 """Overridable interface to open unknown URL type."""
186 type, url = splittype(fullurl)
187 raise IOError, ('url error', 'unknown url type', type)
189 def open_unknown_proxy(self, proxy, fullurl, data=None):
190 """Overridable interface to open unknown URL type."""
191 type, url = splittype(fullurl)
192 raise IOError, ('url error', 'invalid proxy for %s' % type, proxy)
    def retrieve(self, url, filename=None, reporthook=None, data=None):
        """retrieve(url) returns (filename, None) for a local object
        or (tempfilename, headers) for a remote object."""
        url = unwrap(toBytes(url))
        if self.tempcache and self.tempcache.has_key(url):
            return self.tempcache[url]
        type, url1 = splittype(url)
        # Local files need no copying: hand back the filesystem path.
        if not filename and (not type or type == 'file'):
                fp = self.open_local_file(url1)
                return url2pathname(splithost(url1)[1]), hdrs
        fp = self.open(url, data)
        # Derive a temp-file suffix from the path component of the URL.
            garbage, path = splittype(url)
            garbage, path = splithost(path or "")
            path, garbage = splitquery(path or "")
            path, garbage = splitattr(path or "")
            suffix = os.path.splitext(path)[1]
            filename = tempfile.mktemp(suffix)
        self.__tempfiles.append(filename)
        result = filename, headers
        if self.tempcache is not None:
            self.tempcache[url] = result
        tfp = open(filename, 'wb')
        # Copy in blocks, calling reporthook(blocknum, blocksize, totalsize)
        # before the first block and after each subsequent one.
        if headers.has_key("content-length"):
            size = int(headers["Content-Length"])
            reporthook(0, bs, size)
            reporthook(1, bs, size)
            blocknum = blocknum + 1
            reporthook(blocknum, bs, size)
    # Each method named open_<type> knows how to open that type of URL

    def open_http(self, url, data=None):
        """Use HTTP protocol.

        *url* is either a selector string, or a (host, fullurl) tuple when
        the request is routed through a proxy (see open()).
        """
        if type(url) is types.StringType:
            host, selector = splithost(url)
                user_passwd, host = splituser(host)
            # Proxy case: strip the scheme and re-derive the real host so a
            # Host: header and per-host authentication can be produced.
            urltype, rest = splittype(selector)
            if urltype.lower() != 'http':
                realhost, rest = splithost(rest)
                    user_passwd, realhost = splituser(realhost)
                    selector = "%s://%s%s" % (urltype, realhost, rest)
                if proxy_bypass(realhost):
            #print "proxy via http:", host, selector
        if not host: raise IOError, ('http error', 'no host given')
            auth = base64.encodestring(user_passwd).strip()
        h = httplib.HTTP(host)
            # POST branch: form-encoded body of the given length.
            h.putrequest('POST', selector)
            h.putheader('Content-type', 'application/x-www-form-urlencoded')
            h.putheader('Content-length', '%d' % len(data))
            h.putrequest('GET', selector)
        if auth: h.putheader('Authorization', 'Basic %s' % auth)
        if realhost: h.putheader('Host', realhost)
        for args in self.addheaders: apply(h.putheader, args)
        errcode, errmsg, headers = h.getreply()
            return addinfourl(fp, headers, "http:" + url)
            return self.http_error(url, fp, errcode, errmsg, headers)
            return self.http_error(url, fp, errcode, errmsg, headers, data)
    def http_error(self, url, fp, errcode, errmsg, headers, data=None):
        """Handle http errors.
        Derived class can override this, or provide specific handlers
        named http_error_DDD where DDD is the 3-digit error code."""
        # First check if there's a specific handler for this error
        name = 'http_error_%d' % errcode
        if hasattr(self, name):
            method = getattr(self, name)
                result = method(url, fp, errcode, errmsg, headers)
                result = method(url, fp, errcode, errmsg, headers, data)
            # A falsy result means the handler declined; fall through.
            if result: return result
        return self.http_error_default(url, fp, errcode, errmsg, headers)

    def http_error_default(self, url, fp, errcode, errmsg, headers):
        """Default error handler: close the connection and raise IOError."""
        raise IOError, ('http error', errcode, errmsg, headers)
    if hasattr(socket, "ssl"):
        # Only defined when the interpreter was built with SSL support.
        def open_https(self, url, data=None):
            """Use HTTPS protocol."""
            if type(url) is types.StringType:
                host, selector = splithost(url)
                    user_passwd, host = splituser(host)
                urltype, rest = splittype(selector)
                if urltype.lower() != 'https':
                    realhost, rest = splithost(rest)
                        user_passwd, realhost = splituser(realhost)
                        selector = "%s://%s%s" % (urltype, realhost, rest)
                #print "proxy via https:", host, selector
            if not host: raise IOError, ('https error', 'no host given')
                auth = base64.encodestring(user_passwd).strip()
            h = httplib.HTTPS(host, 0,
                              key_file=self.key_file,
                              cert_file=self.cert_file)
                h.putrequest('POST', selector)
                h.putheader('Content-type',
                            'application/x-www-form-urlencoded')
                h.putheader('Content-length', '%d' % len(data))
                h.putrequest('GET', selector)
            # NOTE(review): open_http calls
            # h.putheader('Authorization', 'Basic %s' % auth); here the
            # header name and value are fused into one argument — almost
            # certainly a bug (header emitted without a separate value).
            if auth: h.putheader('Authorization: Basic %s' % auth)
            if realhost: h.putheader('Host', realhost)
            for args in self.addheaders: apply(h.putheader, args)
            errcode, errmsg, headers = h.getreply()
                return addinfourl(fp, headers, "https:" + url)
                return self.http_error(url, fp, errcode, errmsg, headers)
                return self.http_error(url, fp, errcode, errmsg, headers,
    def open_gopher(self, url):
        """Use Gopher protocol."""
        host, selector = splithost(url)
        if not host: raise IOError, ('gopher error', 'no host given')
        type, selector = splitgophertype(selector)
        selector, query = splitquery(selector)
        selector = unquote(selector)
            query = unquote(query)
            fp = gopherlib.send_query(selector, query, host)
            fp = gopherlib.send_selector(selector, host)
        # Gopher has no headers; noheaders() supplies an empty Message.
        return addinfourl(fp, noheaders(), "gopher:" + url)

    def open_file(self, url):
        """Use local file or FTP depending on form of URL."""
        # 'file://host/...' (exactly two slashes) names a remote host.
        if url[:2] == '//' and url[2:3] != '/':
            return self.open_ftp(url)
            return self.open_local_file(url)
    def open_local_file(self, url):
        """Use local file."""
        import mimetypes, mimetools, rfc822, StringIO
        host, file = splithost(url)
        localname = url2pathname(file)
            stats = os.stat(localname)
            raise IOError(e.errno, e.strerror, e.filename)
        size = stats[stat.ST_SIZE]
        modified = rfc822.formatdate(stats[stat.ST_MTIME])
        mtype = mimetypes.guess_type(url)[0]
        # Synthesize HTTP-style headers describing the local file.
        headers = mimetools.Message(StringIO.StringIO(
            'Content-Type: %s\nContent-Length: %d\nLast-modified: %s\n' %
            (mtype or 'text/plain', size, modified)))
            urlfile = 'file://' + file
            return addinfourl(open(localname, 'rb'),
        host, port = splitport(host)
        # Serve the file only when the named host resolves to this machine.
            and socket.gethostbyname(host) in (localhost(), thishost()):
            urlfile = 'file://' + file
            return addinfourl(open(localname, 'rb'),
        raise IOError, ('local file error', 'not on local host')
    def open_ftp(self, url):
        """Use FTP protocol."""
        import mimetypes, mimetools, StringIO
        host, path = splithost(url)
        if not host: raise IOError, ('ftp error', 'no host given')
        host, port = splitport(host)
        user, host = splituser(host)
        if user: user, passwd = splitpasswd(user)
        # Unquote credentials *after* splitting on the URL delimiters.
        user = unquote(user or '')
        passwd = unquote(passwd or '')
        host = socket.gethostbyname(host)
            port = ftplib.FTP_PORT
        path, attrs = splitattr(path)
        dirs = path.split('/')
        dirs, file = dirs[:-1], dirs[-1]
        if dirs and not dirs[0]: dirs = dirs[1:]
        if dirs and not dirs[0]: dirs[0] = '/'
        # Connections are cached per (user, host, port, path) key.
        key = user, host, port, '/'.join(dirs)
        if len(self.ftpcache) > MAXFTPCACHE:
            # Prune the cache, rather arbitrarily
            for k in self.ftpcache.keys():
        if not self.ftpcache.has_key(key):
            self.ftpcache[key] = \
                      ftpwrapper(user, passwd, host, port, dirs)
        # Transfer type: 'D' = directory listing; ';type=x' attr overrides.
        if not file: type = 'D'
            attr, value = splitvalue(attr)
            if attr.lower() == 'type' and \
               value in ('a', 'A', 'i', 'I', 'd', 'D'):
        (fp, retrlen) = self.ftpcache[key].retrfile(file, type)
        mtype = mimetypes.guess_type("ftp:" + url)[0]
            headers += "Content-Type: %s\n" % mtype
        if retrlen is not None and retrlen >= 0:
            headers += "Content-Length: %d\n" % retrlen
        headers = mimetools.Message(StringIO.StringIO(headers))
        return addinfourl(fp, headers, "ftp:" + url)
        except ftperrors(), msg:
            raise IOError, ('ftp error', msg), sys.exc_info()[2]
    def open_data(self, url, data=None):
        """Use "data" URL.

        *data* is accepted for signature compatibility with the other
        open_* handlers but is not used here.
        """
        # syntax of data URLs:
        # dataurl := "data:" [ mediatype ] [ ";base64" ] "," data
        # mediatype := [ type "/" subtype ] *( ";" parameter )
        # parameter := attribute "=" value
        import StringIO, mimetools, time
            [type, data] = url.split(',', 1)
            raise IOError, ('data error', 'bad data URL')
            type = 'text/plain;charset=US-ASCII'
        # The last ';' field names the transfer encoding unless it is a
        # 'key=value' media-type parameter.
        semi = type.rfind(';')
        if semi >= 0 and '=' not in type[semi:]:
            encoding = type[semi+1:]
        msg.append('Date: %s'%time.strftime('%a, %d %b %Y %T GMT',
                                            time.gmtime(time.time())))
        msg.append('Content-type: %s' % type)
        if encoding == 'base64':
            data = base64.decodestring(data)
        msg.append('Content-length: %d' % len(data))
        f = StringIO.StringIO(msg)
        headers = mimetools.Message(f, 0)
        f.fileno = None # needed for addinfourl
        return addinfourl(f, headers, url)
class FancyURLopener(URLopener):
    """Derived class with handlers for errors we can handle (perhaps)."""

    def __init__(self, *args):
        apply(URLopener.__init__, (self,) + args)

    def http_error_default(self, url, fp, errcode, errmsg, headers):
        """Default error handling -- don't raise an exception."""
        return addinfourl(fp, headers, "http:" + url)

    def http_error_302(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 302 -- relocated (temporarily)."""
        # Guard against redirect loops: after self.maxtries redirects,
        # report a synthetic 500 instead of following another one.
        if self.maxtries and self.tries >= self.maxtries:
            if hasattr(self, "http_error_500"):
                meth = self.http_error_500
                meth = self.http_error_default
            return meth(url, fp, 500,
                        "Internal Server Error: Redirect Recursion", headers)
        result = self.redirect_internal(url, fp, errcode, errmsg, headers,

    def redirect_internal(self, url, fp, errcode, errmsg, headers, data):
        # The new location may arrive in either Location: or URI:.
        if headers.has_key('location'):
            newurl = headers['location']
        elif headers.has_key('uri'):
            newurl = headers['uri']
        # In case the server sent a relative URL, join with original:
        newurl = basejoin(self.type + ":" + url, newurl)
            return self.open(newurl)
            return self.open(newurl, data)

    def http_error_301(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 301 -- also relocated (permanently)."""
        return self.http_error_302(url, fp, errcode, errmsg, headers, data)

    def http_error_401(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 401 -- authentication required.
        See this URL for a description of the basic authentication scheme:
        http://www.ics.uci.edu/pub/ietf/http/draft-ietf-http-v10-spec-00.txt"""
        if not headers.has_key('www-authenticate'):
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        stuff = headers['www-authenticate']
        # Only 'Basic realm="..."' challenges are handled here.
        match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff)
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        scheme, realm = match.groups()
        if scheme.lower() != 'basic':
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        name = 'retry_' + self.type + '_basic_auth'
            return getattr(self,name)(url, realm)
            return getattr(self,name)(url, realm, data)

    def retry_http_basic_auth(self, url, realm, data=None):
        # Re-issue the request with credentials embedded in the netloc.
        host, selector = splithost(url)
        i = host.find('@') + 1
        user, passwd = self.get_user_passwd(host, realm, i)
        if not (user or passwd): return None
        host = quote(user, safe='') + ':' + quote(passwd, safe='') + '@' + host
        newurl = 'http://' + host + selector
            return self.open(newurl)
            return self.open(newurl, data)

    def retry_https_basic_auth(self, url, realm, data=None):
        host, selector = splithost(url)
        i = host.find('@') + 1
        user, passwd = self.get_user_passwd(host, realm, i)
        if not (user or passwd): return None
        host = quote(user, safe='') + ':' + quote(passwd, safe='') + '@' + host
        newurl = '//' + host + selector
        return self.open_https(newurl, data)

    def get_user_passwd(self, host, realm, clear_cache = 0):
        # Cached per 'realm@host'; clear_cache forces a fresh prompt.
        key = realm + '@' + host.lower()
        if self.auth_cache.has_key(key):
                del self.auth_cache[key]
                return self.auth_cache[key]
        user, passwd = self.prompt_user_passwd(host, realm)
        if user or passwd: self.auth_cache[key] = (user, passwd)

    def prompt_user_passwd(self, host, realm):
        """Override this in a GUI environment!"""
        # Console fallback; KeyboardInterrupt aborts the prompt.
        user = raw_input("Enter username for %s at %s: " % (realm,
        passwd = getpass.getpass("Enter password for %s in %s at %s: " %
        except KeyboardInterrupt:
    """Return the IP address of the magic hostname 'localhost'."""
    # NOTE(review): the 'def localhost():' / 'def thishost():' /
    # 'def ftperrors():' / 'def noheaders():' headers and their memoizing
    # guards are missing from this excerpt; only the docstrings and the
    # cached-value assignments remain.
        _localhost = socket.gethostbyname('localhost')

    """Return the IP address of the current host."""
        _thishost = socket.gethostbyname(socket.gethostname())

    """Return the set of errors raised by the FTP class."""
        _ftperrors = ftplib.all_errors

    """Return an empty mimetools.Message object."""
        _noheaders = mimetools.Message(StringIO.StringIO(), 0)
        _noheaders.fp.close() # Recycle file descriptor
    """Class used by open_ftp() for cache of open FTP connections."""

    # NOTE(review): the 'class ftpwrapper:' header, the attribute
    # assignments of __init__, and several try/else framing lines are
    # missing from this excerpt.  The connect/login lines below look like
    # they belong to a separate init() method whose 'def' was lost.
    def __init__(self, user, passwd, host, port, dirs):
        self.ftp = ftplib.FTP()
        self.ftp.connect(self.host, self.port)
        self.ftp.login(self.user, self.passwd)
        for dir in self.dirs:

    def retrfile(self, file, type):
        # Directory listings go over ASCII; files use the given TYPE.
        if type in ('d', 'D'): cmd = 'TYPE A'; isdir = 1
        else: cmd = 'TYPE ' + type; isdir = 0
            self.ftp.voidcmd(cmd)
        except ftplib.all_errors:
            self.ftp.voidcmd(cmd)
        if file and not isdir:
            # Use nlst to see if the file exists at all
            except ftplib.error_perm, reason:
                raise IOError, ('ftp error', reason), sys.exc_info()[2]
            # Restore the transfer mode!
            self.ftp.voidcmd(cmd)
        # Try to retrieve as a file
            conn = self.ftp.ntransfercmd(cmd)
        except ftplib.error_perm, reason:
            # 550 means "not a plain file"; fall back to a listing below.
            if str(reason)[:3] != '550':
                raise IOError, ('ftp error', reason), sys.exc_info()[2]
            # Set transfer mode to ASCII!
            self.ftp.voidcmd('TYPE A')
            # Try a directory listing
            if file: cmd = 'LIST ' + file
            conn = self.ftp.ntransfercmd(cmd)
        # Pass back both a suitably decorated object and a retrieval length
        return (addclosehook(conn[0].makefile('rb'),
                             self.endtransfer), conn[1])

    def endtransfer(self):
    """Base class for addinfo and addclosehook."""

    # NOTE(review): the 'class addbase:' header and parts of __repr__ /
    # close() are missing from this excerpt.
    def __init__(self, fp):
        # Delegate the file protocol to the wrapped fp where available.
        self.read = self.fp.read
        self.readline = self.fp.readline
        if hasattr(self.fp, "readlines"): self.readlines = self.fp.readlines
        if hasattr(self.fp, "fileno"): self.fileno = self.fp.fileno

        return '<%s at %s whose fp = %s>' % (self.__class__.__name__,
                                             `id(self)`, `self.fp`)
        self.readlines = None
        if self.fp: self.fp.close()

class addclosehook(addbase):
    """Class to add a close hook to an open file."""

    def __init__(self, fp, closehook, *hookargs):
        addbase.__init__(self, fp)
        self.closehook = closehook
        self.hookargs = hookargs

            # Invoke the hook exactly once, then disarm it.
            apply(self.closehook, self.hookargs)
            self.closehook = None

class addinfo(addbase):
    """class to add an info() method to an open file."""

    def __init__(self, fp, headers):
        addbase.__init__(self, fp)
        self.headers = headers

class addinfourl(addbase):
    """class to add info() and geturl() methods to an open file."""

    def __init__(self, fp, headers, url):
        addbase.__init__(self, fp)
        self.headers = headers
def basejoin(base, url):
    """Utility to combine a URL with a base URL to form a new URL.

    Resolves relative references per the RFC 1808 rules implemented here.
    """
    type, path = splittype(url)
    # if url is complete (i.e., it contains a type), return it
    host, path = splithost(path)
    type, basepath = splittype(base) # inherit type from base
    # if url contains host, just inherit type
        if type: return type + '://' + host + path
            # no type inherited, so url must have started with //
    host, basepath = splithost(basepath) # inherit host
    basepath, basetag = splittag(basepath) # remove extraneous cruft
    basepath, basequery = splitquery(basepath) # idem
    # non-absolute path name
    if path[:1] in ('#', '?'):
        # path is just a tag or query, attach to basepath
    # else replace last component
    i = basepath.rfind('/')
        # basepath not absolute
            # host present, make absolute
        # else keep non-absolute
        # remove last file component
        basepath = basepath[:i+1]
    # Interpret ../ (important because of symlinks)
    while basepath and path[:3] == '../':
        i = basepath[:-1].rfind('/')
            basepath = basepath[:i+1]
    path = basepath + path
    if host and path and path[0] != '/':
    if type and host: return type + '://' + host + path
    elif type: return type + ':' + path
    elif host: return '//' + host + path # don't know what this means
889 # Utilities to parse URLs (most of these return None for missing parts):
890 # unwrap('<URL:type://host/path>') --> 'type://host/path'
891 # splittype('type:opaquestring') --> 'type', 'opaquestring'
892 # splithost('//host[:port]/path') --> 'host[:port]', '/path'
893 # splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'
894 # splitpasswd('user:passwd') -> 'user', 'passwd'
895 # splitport('host:port') --> 'host', 'port'
896 # splitquery('/path?query') --> '/path', 'query'
897 # splittag('/path#tag') --> '/path', 'tag'
898 # splitattr('/path;attr1=value1;attr2=value2;...') ->
899 # '/path', ['attr1=value1', 'attr2=value2', ...]
900 # splitvalue('attr=value') --> 'attr', 'value'
901 # splitgophertype('/Xselector') --> 'X', 'selector'
902 # unquote('abc%20def') -> 'abc def'
903 # quote('abc def') -> 'abc%20def')
    """toBytes(u"URL") --> 'URL'."""
    # NOTE(review): the 'def toBytes(url):' and 'def unwrap(url):' headers
    # and their try/except framing are missing from this excerpt.
    # Most URL schemes require ASCII. If that changes, the conversion
    if type(url) is types.UnicodeType:
            url = url.encode("ASCII")
            raise UnicodeError("URL " + repr(url) +
                               " contains non-ASCII characters")

    """unwrap('<URL:type://host/path>') --> 'type://host/path'."""
    if url[:1] == '<' and url[-1:] == '>':
        url = url[1:-1].strip()
    if url[:4] == 'URL:': url = url[4:].strip()
    """splittype('type:opaquestring') --> 'type', 'opaquestring'."""
    # NOTE(review): the 'def' headers, 'global' statements, lazy
    # 'import re' lines, and fall-through returns of these helpers are
    # missing from this excerpt.  Each compiles its regex once and caches
    # it in a module-level _*prog variable.
    if _typeprog is None:
        _typeprog = re.compile('^([^/:]+):')
    match = _typeprog.match(url)
        scheme = match.group(1)
        return scheme.lower(), url[len(scheme) + 1:]

    """splithost('//host[:port]/path') --> 'host[:port]', '/path'."""
    if _hostprog is None:
        _hostprog = re.compile('^//([^/]*)(.*)$')
    match = _hostprog.match(url)
    if match: return match.group(1, 2)

    """splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'."""
    if _userprog is None:
        _userprog = re.compile('^([^@]*)@(.*)$')
    match = _userprog.match(host)
    if match: return map(unquote, match.group(1, 2))

def splitpasswd(user):
    """splitpasswd('user:passwd') -> 'user', 'passwd'."""
    if _passwdprog is None:
        _passwdprog = re.compile('^([^:]*):(.*)$')
    match = _passwdprog.match(user)
    if match: return match.group(1, 2)
# splittag('/path#tag') --> '/path', 'tag'
    """splitport('host:port') --> 'host', 'port'."""
    # NOTE(review): 'def' headers and fall-through returns are missing
    # from this excerpt (same lazy-compiled-regex pattern as above).
    if _portprog is None:
        _portprog = re.compile('^(.*):([0-9]+)$')
    match = _portprog.match(host)
    if match: return match.group(1, 2)

def splitnport(host, defport=-1):
    """Split host and port, returning numeric port.
    Return given default port if no ':' found; defaults to -1.
    Return numerical port if a valid number are found after ':'.
    Return None if ':' but not a valid number."""
    if _nportprog is None:
        _nportprog = re.compile('^(.*):(.*)$')
    match = _nportprog.match(host)
        host, port = match.group(1, 2)
            if not port: raise ValueError, "no digits"
    return host, defport

def splitquery(url):
    """splitquery('/path?query') --> '/path', 'query'."""
    if _queryprog is None:
        _queryprog = re.compile('^(.*)\?([^?]*)$')
    match = _queryprog.match(url)
    if match: return match.group(1, 2)
    """splittag('/path#tag') --> '/path', 'tag'."""
    # NOTE(review): 'def splittag(url):', 'def splitattr(url):' headers and
    # fall-through returns are missing from this excerpt.
    if _tagprog is None:
        _tagprog = re.compile('^(.*)#([^#]*)$')
    match = _tagprog.match(url)
    if match: return match.group(1, 2)

    """splitattr('/path;attr1=value1;attr2=value2;...') ->
        '/path', ['attr1=value1', 'attr2=value2', ...]."""
    words = url.split(';')
    return words[0], words[1:]

def splitvalue(attr):
    """splitvalue('attr=value') --> 'attr', 'value'."""
    if _valueprog is None:
        _valueprog = re.compile('^([^=]*)=(.*)$')
    match = _valueprog.match(attr)
    if match: return match.group(1, 2)
def splitgophertype(selector):
    """splitgophertype('/Xselector') --> 'X', 'selector'."""
    # A gopher selector encodes its item type as the single character
    # immediately following a leading slash; both must be present.
    has_type = selector.startswith('/') and len(selector) > 1
    if not has_type:
        return None, selector
    return selector[1], selector[2:]
    """unquote('abc%20def') -> 'abc def'."""
    # NOTE(review): unquote()'s header, its '%'-splitting loop, the join
    # and return, and parts of the _fast_quote machinery are missing from
    # this excerpt.
    myappend = res.append
            myappend(mychr(myatoi(item[:2], 16))
            myappend('%' + item)
            myappend('%' + item)

def unquote_plus(s):
    """unquote('%7e/abc+def') -> '~/abc def'"""
    # replace '+' with ' '
    s = ' '.join(s.split('+'))

# Characters that never need quoting, plus '/' for the fast path.
always_safe = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
               'abcdefghijklmnopqrstuvwxyz'
_fast_safe_test = always_safe + '/'

    if _fast_safe is None:
        for c in _fast_safe_test:
    for i in range(len(res)):
        if not _fast_safe.has_key(c):
            res[i] = '%%%02X' % ord(c)
def quote(s, safe = '/'):
    """quote('abc def') -> 'abc%20def'

    Each part of a URL, e.g. the path info, the query, etc., has a
    different set of reserved characters that must be quoted.

    RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax lists
    the following reserved characters.

    reserved = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |

    Each of these characters is reserved in some component of a URL,
    but not necessarily in all of them.

    By default, the quote function is intended for quoting the path
    section of a URL. Thus, it will not encode '/'. This character
    is reserved, but in typical usage the quote function is being
    called on a path where the existing slash characters are used as
    reserved characters.
    """
    safe = always_safe + safe
    # Fast path when the default safe set is in effect.
    if _fast_safe_test == safe:
        return _fast_quote(s)
    for i in range(len(res)):
            res[i] = '%%%02X' % ord(c)

def quote_plus(s, safe = ''):
    """Quote the query fragment of a URL; replacing ' ' with '+'"""
        for i in range(len(l)):
            l[i] = quote(l[i], safe)
        return quote(s, safe)
def urlencode(query,doseq=0):
    """Encode a sequence of two-element tuples or dictionary into a URL query string.

    If any values in the query arg are sequences and doseq is true, each
    sequence element is converted to a separate parameter.

    If the query arg is a sequence of two-element tuples, the order of the
    parameters in the output will match the order of parameters in the
    """

    if hasattr(query,"items"):
        # mapping objects
        query = query.items()
        # it's a bother at times that strings and string-like objects are
        # non-sequence items should not work with len()
        # non-empty strings will fail this
        if len(query) and type(query[0]) != types.TupleType:
        # zero-length sequences of all types will get here and succeed,
        # but that's a minor nit - since the original implementation
        # allowed empty dicts that type of behavior probably should be
        # preserved for consistency
            ty,va,tb = sys.exc_info()
            raise TypeError, "not a valid non-string sequence or mapping object", tb

        # preserve old behavior
            k = quote_plus(str(k))
            v = quote_plus(str(v))
            l.append(k + '=' + v)
            k = quote_plus(str(k))
            if type(v) == types.StringType:
                l.append(k + '=' + v)
            elif type(v) == types.UnicodeType:
                # is there a reasonable way to convert to ASCII?
                # encode generates a string, but "replace" or "ignore"
                # lose information and "strict" can raise UnicodeError
                v = quote_plus(v.encode("ASCII","replace"))
                l.append(k + '=' + v)
                # is this a sufficient test for sequence-ness?
                    v = quote_plus(str(v))
                    l.append(k + '=' + v)
                # loop over the sequence
                    l.append(k + '=' + quote_plus(str(elt)))
def getproxies_environment():
    """Return a dictionary of scheme -> proxy server URL mappings.

    Scan the environment for variables named <scheme>_proxy;
    this seems to be the standard convention. If you need a
    different way, you can pass a proxies dictionary to the
    [Fancy]URLopener constructor.
    """
    # NOTE(review): the 'proxies = {}' initializer and the final
    # 'return proxies' appear to be missing from this excerpt.
    for name, value in os.environ.items():
        if value and name[-6:] == '_proxy':
            proxies[name[:-6]] = value
if os.name == 'mac':
    # Mac-specific proxy discovery via Internet Config; only the HTTP
    # proxy is handled.  NOTE(review): the 'def' header, the Internet
    # Config setup, and the returns are missing from this excerpt.
        """Return a dictionary of scheme -> proxy server URL mappings.

        By convention the mac uses Internet Config to store
        proxies. An HTTP proxy, for instance, is stored under
        if config.has_key('UseHTTPProxy') and config['UseHTTPProxy']:
                value = config['HTTPProxyHost']
                proxies['http'] = 'http://%s' % value
        # FTP: XXXX To be done.
        # Gopher: XXXX To be done.

    # Bypass check; the body (missing here) is a constant result.
    def proxy_bypass(x):
elif os.name == 'nt':
    def getproxies_registry():
        """Return a dictionary of scheme -> proxy server URL mappings.

        Win32 uses the registry to store proxies.
        """
        # NOTE(review): the 'proxies = {}' initializer, the _winreg import
        # guard, try: lines and several else: branches are missing from
        # this excerpt.
            # Std module, so should be around - but you never know!
            internetSettings = _winreg.OpenKey(_winreg.HKEY_CURRENT_USER,
                r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
            proxyEnable = _winreg.QueryValueEx(internetSettings,
            # Returned as Unicode but problems if not converted to ASCII
                proxyServer = str(_winreg.QueryValueEx(internetSettings,
                if '=' in proxyServer:
                    # Per-protocol settings
                    for p in proxyServer.split(';'):
                        protocol, address = p.split('=', 1)
                        # See if address has a type:// prefix
                        if not re.match('^([^/:]+)://', address):
                            address = '%s://%s' % (protocol, address)
                        proxies[protocol] = address
                    # Use one setting for all protocols
                    if proxyServer[:5] == 'http:':
                        proxies['http'] = proxyServer
                        proxies['http'] = 'http://%s' % proxyServer
                        proxies['ftp'] = 'ftp://%s' % proxyServer
            internetSettings.Close()
        except (WindowsError, ValueError, TypeError):
            # Either registry key not found etc, or the value in an
            # unexpected format.
            # proxies already set up to be empty so nothing to do

        """Return a dictionary of scheme -> proxy server URL mappings.

        Returns settings gathered from the environment, if specified,
        """
        return getproxies_environment() or getproxies_registry()

    def proxy_bypass(host):
        # Consults the registry ProxyOverride list (glob patterns, with
        # '<local>' expanded to the local host's names/addresses).
        # Std modules, so should be around - but you never know!
            internetSettings = _winreg.OpenKey(_winreg.HKEY_CURRENT_USER,
                r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
            proxyEnable = _winreg.QueryValueEx(internetSettings,
            proxyOverride = str(_winreg.QueryValueEx(internetSettings,
                                                     'ProxyOverride')[0])
            # ^^^^ Returned as Unicode but problems if not converted to ASCII
        except WindowsError:
        if not proxyEnable or not proxyOverride:
        # try to make a host list from name and IP address.
            addr = socket.gethostbyname(host[0])
        except socket.error:
        # make a check value list from the registry entry: replace the
        # '<local>' string by the localhost entry and the corresponding
        proxyOverride = proxyOverride.split(';')
        while i < len(proxyOverride):
            if proxyOverride[i] == '<local>':
                proxyOverride[i:i+1] = ['localhost',
                                        socket.gethostname(),
                                        socket.gethostbyname(
                                            socket.gethostname())]
        # print proxyOverride
        # now check if we match one of the registry values.
        for test in proxyOverride:
            test = test.replace(".", r"\.") # mask dots
            test = test.replace("*", r".*") # change glob sequence
            test = test.replace("?", r".") # change glob char
            # print "%s <--> %s" %( test, val )
            if re.match(test, val, re.I):
    # By default use environment variables
    getproxies = getproxies_environment

    # Default bypass check; the constant body is missing from this excerpt.
    def proxy_bypass(host):

# Test and time quote() and unquote()
# NOTE(review): the self-test harness below is heavily truncated; the
# test()/main() headers and much of their bodies are missing.
    for i in range(256): s = s + chr(i)
    print round(t1 - t0, 3), 'sec'

def reporthook(blocknum, blocksize, totalsize):
    # Report during remote transfers
    print "Block number: %d, Block size: %d, Total size: %d" % (
        blocknum, blocksize, totalsize)

        'file://localhost/etc/passwd',
        'ftp://ftp.python.org/pub/python/README',
##      'gopher://gopher.micro.umn.edu/1/',
        'http://www.python.org/index.html',
    if hasattr(URLopener, "open_https"):
        args.append('https://synergy.as.cmu.edu/~geek/')
        print '-'*10, url, '-'*10
        fn, h = urlretrieve(url, None, reporthook)
        for k in h.keys(): print k + ':', h[k]
        # Strip carriage returns before printing retrieved text.
        table = string.maketrans("", "")
        data = data.translate(table, "\r")
        opts, args = getopt.getopt(sys.argv[1:], "th")
    except getopt.error, msg:
        print "Use -h for help"
        print "Usage: python urllib.py [-t] [url ...]"
        print "-t runs self-test;",
        print "otherwise, contents of urls are printed"
        print "Use -h for help"
        print urlopen(url).read(),

# Run test program when run as a script
if __name__ == '__main__':