| """Open an arbitrary URL. |
| |
| See the following document for more info on URLs: |
| "Names and Addresses, URIs, URLs, URNs, URCs", at |
| http://www.w3.org/pub/WWW/Addressing/Overview.html |
| |
| See also the HTTP spec (from which the error codes are derived): |
| "HTTP - Hypertext Transfer Protocol", at |
| http://www.w3.org/pub/WWW/Protocols/ |
| |
| Related standards and specs: |
| - RFC1808: the "relative URL" spec. (authoritative status) |
| - RFC1738 - the "URL standard". (authoritative status) |
| - RFC1630 - the "URI spec". (informational status) |
| |
All code but that related to URL parsing has been removed (since it is not
compatible with Google App Engine) from this fork of the original file,
| obtained from: |
| http://svn.python.org/view/*checkout*/python/tags/r252/Lib/urllib.py?content-type=text%2Fplain&rev=60915 |
| """ |
| |
| import string |
| import sys |
| from urlparse import urljoin as basejoin |
| |
# Names exported by "from <module> import *".  toBytes() is defined in this
# file but is not listed here.
__all__ = ["quote", "quote_plus", "unquote", "unquote_plus",
           "urlencode", "splittag",
           "basejoin", "unwrap",
           "splittype", "splithost", "splituser", "splitpasswd", "splitport",
           "splitnport", "splitquery", "splitattr", "splitvalue",
           "splitgophertype",]

# Version string carried by this module.
__version__ = '1.17' # XXX This version is not always updated :-(
| |
| |
| # Utilities to parse URLs (most of these return None for missing parts): |
| # unwrap('<URL:type://host/path>') --> 'type://host/path' |
| # splittype('type:opaquestring') --> 'type', 'opaquestring' |
| # splithost('//host[:port]/path') --> 'host[:port]', '/path' |
| # splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]' |
| # splitpasswd('user:passwd') -> 'user', 'passwd' |
| # splitport('host:port') --> 'host', 'port' |
| # splitquery('/path?query') --> '/path', 'query' |
| # splittag('/path#tag') --> '/path', 'tag' |
| # splitattr('/path;attr1=value1;attr2=value2;...') -> |
| # '/path', ['attr1=value1', 'attr2=value2', ...] |
| # splitvalue('attr=value') --> 'attr', 'value' |
| # splitgophertype('/Xselector') --> 'X', 'selector' |
| # unquote('abc%20def') -> 'abc def' |
# quote('abc def') -> 'abc%20def'
| |
| try: |
| unicode |
| except NameError: |
| def _is_unicode(x): |
| return 0 |
| else: |
| def _is_unicode(x): |
| return isinstance(x, unicode) |
| |
def toBytes(url):
    """toBytes(u"URL") --> 'URL'.

    A unicode *url* is encoded to ASCII; anything else is returned
    untouched.  Raises UnicodeError when a unicode *url* contains
    characters that cannot be encoded as ASCII.
    """
    if not _is_unicode(url):
        return url
    # Most URL schemes require ASCII. If that changes, the conversion
    # can be relaxed
    try:
        return url.encode("ASCII")
    except UnicodeError:
        raise UnicodeError("URL " + repr(url) +
                           " contains non-ASCII characters")
| |
def unwrap(url):
    """unwrap('<URL:type://host/path>') --> 'type://host/path'.

    Strips surrounding whitespace, then an enclosing '<...>' pair if
    present, then a leading 'URL:' prefix if present.
    """
    text = url.strip()
    bracketed = text[:1] == '<' and text[-1:] == '>'
    if bracketed:
        text = text[1:-1].strip()
    if text[:4] == 'URL:':
        text = text[4:].strip()
    return text
| |
_typeprog = None
def splittype(url):
    """splittype('type:opaquestring') --> 'type', 'opaquestring'.

    The scheme is returned lower-cased.  When *url* has no leading
    'scheme:' part the result is (None, url).
    """
    global _typeprog
    if _typeprog is None:
        # Compile lazily so importing this module does not pull in re.
        import re
        _typeprog = re.compile('^([^/:]+):')

    found = _typeprog.match(url)
    if not found:
        return None, url
    scheme = found.group(1)
    # The match is anchored at 0 and ends just past the ':' separator.
    remainder = url[found.end():]
    return scheme.lower(), remainder
| |
_hostprog = None
def splithost(url):
    """splithost('//host[:port]/path') --> 'host[:port]', '/path'.

    When *url* does not start with '//' the result is (None, url).
    """
    global _hostprog
    if _hostprog is None:
        # Compile lazily so importing this module does not pull in re.
        import re
        _hostprog = re.compile('^//([^/?]*)(.*)$')

    found = _hostprog.match(url)
    if found is None:
        return None, url
    # Both groups always take part in a match, so this is (host, rest).
    return found.groups()
| |
_userprog = None
def splituser(host):
    """splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'.

    The split happens at the last '@'.  On a match both parts are passed
    through unquote() and returned as a two-item list; without an '@'
    the result is the tuple (None, host).
    """
    global _userprog
    if _userprog is None:
        # Compile lazily so importing this module does not pull in re.
        import re
        _userprog = re.compile('^(.*)@(.*)$')

    found = _userprog.match(host)
    if not found:
        return None, host
    return [unquote(part) for part in found.group(1, 2)]
| |
_passwdprog = None
def splitpasswd(user):
    """splitpasswd('user:passwd') -> 'user', 'passwd'.

    The split happens at the first ':'.  Without a ':' the result is
    (user, None).
    """
    global _passwdprog
    if _passwdprog is None:
        # Compile lazily so importing this module does not pull in re.
        import re
        _passwdprog = re.compile('^([^:]*):(.*)$')

    found = _passwdprog.match(user)
    if found is None:
        return user, None
    return found.groups()
| |
# splitport('host:port') --> 'host', 'port'
_portprog = None
def splitport(host):
    """splitport('host:port') --> 'host', 'port'.

    The port is returned as a string and is only recognised when it is
    made up of one or more digits after the last ':'.  Otherwise the
    result is (host, None).
    """
    global _portprog
    if _portprog is None:
        # Compile lazily so importing this module does not pull in re.
        import re
        _portprog = re.compile('^(.*):([0-9]+)$')

    found = _portprog.match(host)
    if found is None:
        return host, None
    return found.groups()
| |
_nportprog = None
def splitnport(host, defport=-1):
    """Split host and port, returning a numeric port.

    Return the given default port if no ':' is found; it defaults to -1.
    Return the numerical port if a valid number is found after the last ':'.
    Return None as the port if there is a ':' but no valid number after it.
    """
    global _nportprog
    if _nportprog is None:
        # Compile lazily so importing this module does not pull in re.
        import re
        _nportprog = re.compile('^(.*):(.*)$')

    match = _nportprog.match(host)
    if not match:
        return host, defport
    host, port = match.group(1, 2)
    try:
        # int('') raises ValueError as well, so an empty port needs no
        # separate check.
        nport = int(port)
    except ValueError:
        nport = None
    return host, nport
| |
_queryprog = None
def splitquery(url):
    """splitquery('/path?query') --> '/path', 'query'.

    The split happens at the last '?'.  Without a '?' the result is
    (url, None).
    """
    global _queryprog
    if _queryprog is None:
        # Compile lazily so importing this module does not pull in re.
        import re
        # Raw string: '\?' is a regex escape, not a string escape.
        _queryprog = re.compile(r'^(.*)\?([^?]*)$')

    match = _queryprog.match(url)
    if match:
        return match.group(1, 2)
    return url, None
| |
_tagprog = None
def splittag(url):
    """splittag('/path#tag') --> '/path', 'tag'.

    The split happens at the last '#'.  Without a '#' the result is
    (url, None).
    """
    global _tagprog
    if _tagprog is None:
        # Compile lazily so importing this module does not pull in re.
        import re
        _tagprog = re.compile('^(.*)#([^#]*)$')

    found = _tagprog.match(url)
    if found is None:
        return url, None
    return found.groups()
| |
def splitattr(url):
    """splitattr('/path;attr1=value1;attr2=value2;...') ->
    '/path', ['attr1=value1', 'attr2=value2', ...].

    The attribute list is empty when *url* contains no ';'.
    """
    pieces = url.split(';')
    # split() always yields at least one item: the part before any ';'.
    path = pieces.pop(0)
    return path, pieces
| |
_valueprog = None
def splitvalue(attr):
    """splitvalue('attr=value') --> 'attr', 'value'.

    The split happens at the first '='.  Without an '=' the result is
    (attr, None).
    """
    global _valueprog
    if _valueprog is None:
        # Compile lazily so importing this module does not pull in re.
        import re
        _valueprog = re.compile('^([^=]*)=(.*)$')

    found = _valueprog.match(attr)
    if found is None:
        return attr, None
    return found.groups()
| |
def splitgophertype(selector):
    """splitgophertype('/Xselector') --> 'X', 'selector'.

    The type is the single character after a leading '/'.  When the
    selector does not start with '/' or has nothing after it, the result
    is (None, selector).
    """
    has_type = selector[:1] == '/' and selector[1:2]
    if not has_type:
        return None, selector
    gopher_type = selector[1]
    rest = selector[2:]
    return gopher_type, rest
| |
# Lookup table from every two-digit hex string, in any mix of upper and
# lower case ('3a', '3A', 'aB', ...), to the character with that code.
_hexdig = '0123456789ABCDEFabcdef'
_hextochr = dict((a + b, chr(int(a + b, 16)))
                 for a in _hexdig for b in _hexdig)

def unquote(s):
    """unquote('abc%20def') -> 'abc def'.

    Every '%XX' escape, where XX are two hexadecimal digits in either
    case, is replaced by the character with that code.  A '%' that is
    not followed by two hex digits is left in the output unchanged.
    """
    pieces = s.split('%')
    # Text before the first '%' never needs decoding.
    decoded = [pieces[0]]
    for item in pieces[1:]:
        try:
            decoded.append(_hextochr[item[:2]] + item[2:])
        except KeyError:
            # Not a valid escape: restore the '%' that split() removed.
            decoded.append('%' + item)
        except UnicodeDecodeError:
            # Joining a non-ASCII byte with unicode text failed; build
            # the character with unichr() instead.
            decoded.append(unichr(int(item[:2], 16)) + item[2:])
    return "".join(decoded)
| |
def unquote_plus(s):
    """unquote_plus('%7e/abc+def') -> '~/abc def'

    Like unquote(), but every '+' is first turned into a space.  The
    replacement happens before decoding, so an escaped plus ('%2B')
    comes out as a literal '+'.
    """
    return unquote(s.replace('+', ' '))
| |
# Characters that quote() never escapes, whatever 'safe' is passed.
always_safe = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
               'abcdefghijklmnopqrstuvwxyz'
               '0123456789' '_.-')
# Cache of per-'safe' translation tables, keyed by (safe, always_safe).
_safemaps = {}

def quote(s, safe='/'):
    """quote('abc def') -> 'abc%20def'

    Replace every character of s that is neither in always_safe nor in
    safe with its '%XX' escape (upper-case hex digits).

    Each part of a URL (path, query, ...) has its own set of reserved
    characters that must be quoted.  RFC 2396 Uniform Resource
    Identifiers (URI): Generic Syntax lists these reserved characters:

    reserved    = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
                  "$" | ","

    Each of them is reserved in some component of a URL, but not
    necessarily in all of them.

    The default safe='/' suits the path section of a URL: slashes that
    are already there act as separators and are left alone.

    The translation table only covers character codes 0-255; any other
    character in s raises KeyError.
    """
    cachekey = (safe, always_safe)
    safe_map = _safemaps.get(cachekey)
    if safe_map is None:
        # First call with this 'safe' value: build and remember the table.
        unreserved = safe + always_safe
        safe_map = {}
        for code in range(256):
            char = chr(code)
            if char in unreserved:
                safe_map[char] = char
            else:
                safe_map[char] = '%%%02X' % code
        _safemaps[cachekey] = safe_map
    return ''.join([safe_map[char] for char in s])
| |
def quote_plus(s, safe=''):
    """Quote the query fragment of a URL; replacing ' ' with '+'

    Works like quote(), except that nothing is safe by default (not
    even '/') and each space becomes '+' instead of '%20'.
    """
    if ' ' not in s:
        return quote(s, safe)
    # Let spaces through quote() untouched, then swap them for '+'.
    quoted = quote(s, safe + ' ')
    return quoted.replace(' ', '+')
| |
def urlencode(query, doseq=0):
    """Encode a sequence of two-element tuples or dictionary into a URL query string.

    If any values in the query arg are sequences and doseq is true, each
    sequence element is converted to a separate parameter.

    If the query arg is a sequence of two-element tuples, the order of the
    parameters in the output will match the order of parameters in the
    input.

    Keys and values are converted with str() and escaped with
    quote_plus(); the resulting 'key=value' pairs are joined with '&'.

    Raises TypeError if query is neither a mapping nor a sequence whose
    first item is a tuple (a string, for example).
    """
    if hasattr(query, "items"):
        # mapping objects
        query = query.items()
    else:
        # it's a bother at times that strings and string-like objects are
        # sequences...
        try:
            # non-sequence items should not work with len()
            # non-empty strings will fail this
            if len(query) and not isinstance(query[0], tuple):
                raise TypeError
            # zero-length sequences of all types will get here and succeed,
            # but that's a minor nit - since the original implementation
            # allowed empty dicts that type of behavior probably should be
            # preserved for consistency
        except TypeError:
            raise TypeError("not a valid non-string sequence or "
                            "mapping object")

    pairs = []
    for k, v in query:
        k = quote_plus(str(k))
        if not doseq:
            # preserve old behavior: the value is always a single string
            values = [str(v)]
        elif isinstance(v, str):
            values = [v]
        elif _is_unicode(v):
            # is there a reasonable way to convert to ASCII?
            # encode generates a string, but "replace" or "ignore"
            # lose information and "strict" can raise UnicodeError
            values = [v.encode("ASCII", "replace")]
        else:
            try:
                # is this a sufficient test for sequence-ness?
                len(v)
            except TypeError:
                # not a sequence
                values = [str(v)]
            else:
                # one parameter per element of the sequence
                values = [str(elt) for elt in v]
        for value in values:
            pairs.append(k + '=' + quote_plus(value))
    return '&'.join(pairs)