source: vendor/python/2.5/Lib/urllib.py@ 3225

Last change on this file since 3225 was 3225, checked in by bird, 19 years ago

Python 2.5

File size: 53.2 KB
Line 
1"""Open an arbitrary URL.
2
3See the following document for more info on URLs:
4"Names and Addresses, URIs, URLs, URNs, URCs", at
5http://www.w3.org/pub/WWW/Addressing/Overview.html
6
7See also the HTTP spec (from which the error codes are derived):
8"HTTP - Hypertext Transfer Protocol", at
9http://www.w3.org/pub/WWW/Protocols/
10
11Related standards and specs:
12- RFC1808: the "relative URL" spec. (authoritative status)
13- RFC1738 - the "URL standard". (authoritative status)
14- RFC1630 - the "URI spec". (informational status)
15
16The object returned by URLopener().open(file) will differ per
17protocol. All you know is that it has methods read(), readline(),
18readlines(), fileno(), close() and info(). The read*(), fileno()
19and close() methods work like those of open files.
20The info() method returns a mimetools.Message object which can be
21used to query various info about the object, if available.
22(mimetools.Message objects are queried with the getheader() method.)
23"""
24
25import string
26import socket
27import os
28import time
29import sys
30from urlparse import urljoin as basejoin
31
32__all__ = ["urlopen", "URLopener", "FancyURLopener", "urlretrieve",
33 "urlcleanup", "quote", "quote_plus", "unquote", "unquote_plus",
34 "urlencode", "url2pathname", "pathname2url", "splittag",
35 "localhost", "thishost", "ftperrors", "basejoin", "unwrap",
36 "splittype", "splithost", "splituser", "splitpasswd", "splitport",
37 "splitnport", "splitquery", "splitattr", "splitvalue",
38 "splitgophertype", "getproxies"]
39
40__version__ = '1.17' # XXX This version is not always updated :-(
41
42MAXFTPCACHE = 10 # Trim the ftp cache beyond this size
43
44# Helper for non-unix systems
# Pick the platform-specific URL <-> path converters; platforms without a
# dedicated module fall back to plain percent (un)quoting.
if os.name == 'mac':
    from macurl2path import url2pathname, pathname2url
elif os.name == 'nt':
    from nturl2path import url2pathname, pathname2url
elif os.name == 'riscos':
    from rourl2path import url2pathname, pathname2url
else:
    def url2pathname(pathname):
        """Convert the path part of a 'file' URL to a file system path.

        OS-specific; on this platform it only undoes percent-quoting.
        Not recommended for general use."""
        return unquote(pathname)

    def pathname2url(pathname):
        """Convert a file system path to the path part of a 'file' URL.

        OS-specific; on this platform it only applies percent-quoting.
        Not recommended for general use."""
        return quote(pathname)
61
62# This really consists of two pieces:
63# (1) a class which handles opening of all sorts of URLs
64# (plus assorted utilities etc.)
65# (2) a set of functions for parsing URLs
66# XXX Should these be separated out into different modules?
67
68
69# Shortcut for basic usage
# Shared opener, created lazily by urlopen()/urlretrieve().
_urlopener = None


def urlopen(url, data=None, proxies=None):
    """urlopen(url [, data [, proxies]]) -> open file-like object

    When proxies is given, a fresh FancyURLopener using that mapping is
    created for this call only; otherwise the shared module-level opener
    is used (and created on first use).  data, when not None, is passed
    on to the opener's open() method.
    """
    global _urlopener
    if proxies is not None:
        opener = FancyURLopener(proxies=proxies)
    else:
        if not _urlopener:
            _urlopener = FancyURLopener()
        opener = _urlopener
    if data is None:
        return opener.open(url)
    return opener.open(url, data)
def urlretrieve(url, filename=None, reporthook=None, data=None):
    """Retrieve url through the shared module-level opener.

    The opener is created on first use; all arguments are forwarded
    unchanged to its retrieve() method, whose result is returned.
    """
    global _urlopener
    opener = _urlopener
    if not opener:
        opener = _urlopener = FancyURLopener()
    return opener.retrieve(url, filename, reporthook, data)
def urlcleanup():
    """Run cleanup() on the shared opener, if one has been created."""
    opener = _urlopener
    if opener:
        opener.cleanup()
93
94# exception raised when downloaded size does not match content-length
class ContentTooShortError(IOError):
    """Raised when the downloaded size does not match Content-Length.

    message -- text handed to IOError.
    content -- stored on the instance as .content; URLopener.retrieve()
               passes its (filename, headers) result here.
    """

    def __init__(self, message, content):
        self.content = content
        IOError.__init__(self, message)
99
100ftpcache = {}
101class URLopener:
102 """Class to open URLs.
103 This is a class rather than just a subroutine because we may need
104 more than one set of global protocol-specific options.
105 Note -- this is a base class for those who don't want the
106 automatic handling of errors type 302 (relocated) and 401
107 (authorization needed)."""
108
109 __tempfiles = None
110
111 version = "Python-urllib/%s" % __version__
112
113 # Constructor
114 def __init__(self, proxies=None, **x509):
115 if proxies is None:
116 proxies = getproxies()
117 assert hasattr(proxies, 'has_key'), "proxies must be a mapping"
118 self.proxies = proxies
119 self.key_file = x509.get('key_file')
120 self.cert_file = x509.get('cert_file')
121 self.addheaders = [('User-Agent', self.version)]
122 self.__tempfiles = []
123 self.__unlink = os.unlink # See cleanup()
124 self.tempcache = None
125 # Undocumented feature: if you assign {} to tempcache,
126 # it is used to cache files retrieved with
127 # self.retrieve(). This is not enabled by default
128 # since it does not work for changing documents (and I
129 # haven't got the logic to check expiration headers
130 # yet).
131 self.ftpcache = ftpcache
132 # Undocumented feature: you can use a different
133 # ftp cache by assigning to the .ftpcache member;
134 # in case you want logically independent URL openers
135 # XXX This is not threadsafe. Bah.
136
137 def __del__(self):
138 self.close()
139
140 def close(self):
141 self.cleanup()
142
143 def cleanup(self):
144 # This code sometimes runs when the rest of this module
145 # has already been deleted, so it can't use any globals
146 # or import anything.
147 if self.__tempfiles:
148 for file in self.__tempfiles:
149 try:
150 self.__unlink(file)
151 except OSError:
152 pass
153 del self.__tempfiles[:]
154 if self.tempcache:
155 self.tempcache.clear()
156
157 def addheader(self, *args):
158 """Add a header to be used by the HTTP interface only
159 e.g. u.addheader('Accept', 'sound/basic')"""
160 self.addheaders.append(args)
161
162 # External interface
163 def open(self, fullurl, data=None):
164 """Use URLopener().open(file) instead of open(file, 'r')."""
165 fullurl = unwrap(toBytes(fullurl))
166 if self.tempcache and fullurl in self.tempcache:
167 filename, headers = self.tempcache[fullurl]
168 fp = open(filename, 'rb')
169 return addinfourl(fp, headers, fullurl)
170 urltype, url = splittype(fullurl)
171 if not urltype:
172 urltype = 'file'
173 if urltype in self.proxies:
174 proxy = self.proxies[urltype]
175 urltype, proxyhost = splittype(proxy)
176 host, selector = splithost(proxyhost)
177 url = (host, fullurl) # Signal special case to open_*()
178 else:
179 proxy = None
180 name = 'open_' + urltype
181 self.type = urltype
182 name = name.replace('-', '_')
183 if not hasattr(self, name):
184 if proxy:
185 return self.open_unknown_proxy(proxy, fullurl, data)
186 else:
187 return self.open_unknown(fullurl, data)
188 try:
189 if data is None:
190 return getattr(self, name)(url)
191 else:
192 return getattr(self, name)(url, data)
193 except socket.error, msg:
194 raise IOError, ('socket error', msg), sys.exc_info()[2]
195
196 def open_unknown(self, fullurl, data=None):
197 """Overridable interface to open unknown URL type."""
198 type, url = splittype(fullurl)
199 raise IOError, ('url error', 'unknown url type', type)
200
201 def open_unknown_proxy(self, proxy, fullurl, data=None):
202 """Overridable interface to open unknown URL type."""
203 type, url = splittype(fullurl)
204 raise IOError, ('url error', 'invalid proxy for %s' % type, proxy)
205
206 # External interface
207 def retrieve(self, url, filename=None, reporthook=None, data=None):
208 """retrieve(url) returns (filename, headers) for a local object
209 or (tempfilename, headers) for a remote object."""
210 url = unwrap(toBytes(url))
211 if self.tempcache and url in self.tempcache:
212 return self.tempcache[url]
213 type, url1 = splittype(url)
214 if filename is None and (not type or type == 'file'):
215 try:
216 fp = self.open_local_file(url1)
217 hdrs = fp.info()
218 del fp
219 return url2pathname(splithost(url1)[1]), hdrs
220 except IOError, msg:
221 pass
222 fp = self.open(url, data)
223 headers = fp.info()
224 if filename:
225 tfp = open(filename, 'wb')
226 else:
227 import tempfile
228 garbage, path = splittype(url)
229 garbage, path = splithost(path or "")
230 path, garbage = splitquery(path or "")
231 path, garbage = splitattr(path or "")
232 suffix = os.path.splitext(path)[1]
233 (fd, filename) = tempfile.mkstemp(suffix)
234 self.__tempfiles.append(filename)
235 tfp = os.fdopen(fd, 'wb')
236 result = filename, headers
237 if self.tempcache is not None:
238 self.tempcache[url] = result
239 bs = 1024*8
240 size = -1
241 read = 0
242 blocknum = 0
243 if reporthook:
244 if "content-length" in headers:
245 size = int(headers["Content-Length"])
246 reporthook(blocknum, bs, size)
247 while 1:
248 block = fp.read(bs)
249 if block == "":
250 break
251 read += len(block)
252 tfp.write(block)
253 blocknum += 1
254 if reporthook:
255 reporthook(blocknum, bs, size)
256 fp.close()
257 tfp.close()
258 del fp
259 del tfp
260
261 # raise exception if actual size does not match content-length header
262 if size >= 0 and read < size:
263 raise ContentTooShortError("retrieval incomplete: got only %i out "
264 "of %i bytes" % (read, size), result)
265
266 return result
267
268 # Each method named open_<type> knows how to open that type of URL
269
270 def open_http(self, url, data=None):
271 """Use HTTP protocol."""
272 import httplib
273 user_passwd = None
274 proxy_passwd= None
275 if isinstance(url, str):
276 host, selector = splithost(url)
277 if host:
278 user_passwd, host = splituser(host)
279 host = unquote(host)
280 realhost = host
281 else:
282 host, selector = url
283 # check whether the proxy contains authorization information
284 proxy_passwd, host = splituser(host)
285 # now we proceed with the url we want to obtain
286 urltype, rest = splittype(selector)
287 url = rest
288 user_passwd = None
289 if urltype.lower() != 'http':
290 realhost = None
291 else:
292 realhost, rest = splithost(rest)
293 if realhost:
294 user_passwd, realhost = splituser(realhost)
295 if user_passwd:
296 selector = "%s://%s%s" % (urltype, realhost, rest)
297 if proxy_bypass(realhost):
298 host = realhost
299
300 #print "proxy via http:", host, selector
301 if not host: raise IOError, ('http error', 'no host given')
302
303 if proxy_passwd:
304 import base64
305 proxy_auth = base64.encodestring(proxy_passwd).strip()
306 else:
307 proxy_auth = None
308
309 if user_passwd:
310 import base64
311 auth = base64.encodestring(user_passwd).strip()
312 else:
313 auth = None
314 h = httplib.HTTP(host)
315 if data is not None:
316 h.putrequest('POST', selector)
317 h.putheader('Content-Type', 'application/x-www-form-urlencoded')
318 h.putheader('Content-Length', '%d' % len(data))
319 else:
320 h.putrequest('GET', selector)
321 if proxy_auth: h.putheader('Proxy-Authorization', 'Basic %s' % proxy_auth)
322 if auth: h.putheader('Authorization', 'Basic %s' % auth)
323 if realhost: h.putheader('Host', realhost)
324 for args in self.addheaders: h.putheader(*args)
325 h.endheaders()
326 if data is not None:
327 h.send(data)
328 errcode, errmsg, headers = h.getreply()
329 fp = h.getfile()
330 if errcode == 200:
331 return addinfourl(fp, headers, "http:" + url)
332 else:
333 if data is None:
334 return self.http_error(url, fp, errcode, errmsg, headers)
335 else:
336 return self.http_error(url, fp, errcode, errmsg, headers, data)
337
338 def http_error(self, url, fp, errcode, errmsg, headers, data=None):
339 """Handle http errors.
340 Derived class can override this, or provide specific handlers
341 named http_error_DDD where DDD is the 3-digit error code."""
342 # First check if there's a specific handler for this error
343 name = 'http_error_%d' % errcode
344 if hasattr(self, name):
345 method = getattr(self, name)
346 if data is None:
347 result = method(url, fp, errcode, errmsg, headers)
348 else:
349 result = method(url, fp, errcode, errmsg, headers, data)
350 if result: return result
351 return self.http_error_default(url, fp, errcode, errmsg, headers)
352
353 def http_error_default(self, url, fp, errcode, errmsg, headers):
354 """Default error handler: close the connection and raise IOError."""
355 void = fp.read()
356 fp.close()
357 raise IOError, ('http error', errcode, errmsg, headers)
358
359 if hasattr(socket, "ssl"):
360 def open_https(self, url, data=None):
361 """Use HTTPS protocol."""
362 import httplib
363 user_passwd = None
364 proxy_passwd = None
365 if isinstance(url, str):
366 host, selector = splithost(url)
367 if host:
368 user_passwd, host = splituser(host)
369 host = unquote(host)
370 realhost = host
371 else:
372 host, selector = url
373 # here, we determine, whether the proxy contains authorization information
374 proxy_passwd, host = splituser(host)
375 urltype, rest = splittype(selector)
376 url = rest
377 user_passwd = None
378 if urltype.lower() != 'https':
379 realhost = None
380 else:
381 realhost, rest = splithost(rest)
382 if realhost:
383 user_passwd, realhost = splituser(realhost)
384 if user_passwd:
385 selector = "%s://%s%s" % (urltype, realhost, rest)
386 #print "proxy via https:", host, selector
387 if not host: raise IOError, ('https error', 'no host given')
388 if proxy_passwd:
389 import base64
390 proxy_auth = base64.encodestring(proxy_passwd).strip()
391 else:
392 proxy_auth = None
393 if user_passwd:
394 import base64
395 auth = base64.encodestring(user_passwd).strip()
396 else:
397 auth = None
398 h = httplib.HTTPS(host, 0,
399 key_file=self.key_file,
400 cert_file=self.cert_file)
401 if data is not None:
402 h.putrequest('POST', selector)
403 h.putheader('Content-Type',
404 'application/x-www-form-urlencoded')
405 h.putheader('Content-Length', '%d' % len(data))
406 else:
407 h.putrequest('GET', selector)
408 if proxy_auth: h.putheader('Proxy-Authorization: Basic %s' % proxy_auth)
409 if auth: h.putheader('Authorization: Basic %s' % auth)
410 if realhost: h.putheader('Host', realhost)
411 for args in self.addheaders: h.putheader(*args)
412 h.endheaders()
413 if data is not None:
414 h.send(data)
415 errcode, errmsg, headers = h.getreply()
416 fp = h.getfile()
417 if errcode == 200:
418 return addinfourl(fp, headers, "https:" + url)
419 else:
420 if data is None:
421 return self.http_error(url, fp, errcode, errmsg, headers)
422 else:
423 return self.http_error(url, fp, errcode, errmsg, headers,
424 data)
425
426 def open_gopher(self, url):
427 """Use Gopher protocol."""
428 if not isinstance(url, str):
429 raise IOError, ('gopher error', 'proxy support for gopher protocol currently not implemented')
430 import gopherlib
431 host, selector = splithost(url)
432 if not host: raise IOError, ('gopher error', 'no host given')
433 host = unquote(host)
434 type, selector = splitgophertype(selector)
435 selector, query = splitquery(selector)
436 selector = unquote(selector)
437 if query:
438 query = unquote(query)
439 fp = gopherlib.send_query(selector, query, host)
440 else:
441 fp = gopherlib.send_selector(selector, host)
442 return addinfourl(fp, noheaders(), "gopher:" + url)
443
444 def open_file(self, url):
445 """Use local file or FTP depending on form of URL."""
446 if not isinstance(url, str):
447 raise IOError, ('file error', 'proxy support for file protocol currently not implemented')
448 if url[:2] == '//' and url[2:3] != '/' and url[2:12].lower() != 'localhost/':
449 return self.open_ftp(url)
450 else:
451 return self.open_local_file(url)
452
453 def open_local_file(self, url):
454 """Use local file."""
455 import mimetypes, mimetools, email.Utils
456 try:
457 from cStringIO import StringIO
458 except ImportError:
459 from StringIO import StringIO
460 host, file = splithost(url)
461 localname = url2pathname(file)
462 try:
463 stats = os.stat(localname)
464 except OSError, e:
465 raise IOError(e.errno, e.strerror, e.filename)
466 size = stats.st_size
467 modified = email.Utils.formatdate(stats.st_mtime, usegmt=True)
468 mtype = mimetypes.guess_type(url)[0]
469 headers = mimetools.Message(StringIO(
470 'Content-Type: %s\nContent-Length: %d\nLast-modified: %s\n' %
471 (mtype or 'text/plain', size, modified)))
472 if not host:
473 urlfile = file
474 if file[:1] == '/':
475 urlfile = 'file://' + file
476 return addinfourl(open(localname, 'rb'),
477 headers, urlfile)
478 host, port = splitport(host)
479 if not port \
480 and socket.gethostbyname(host) in (localhost(), thishost()):
481 urlfile = file
482 if file[:1] == '/':
483 urlfile = 'file://' + file
484 return addinfourl(open(localname, 'rb'),
485 headers, urlfile)
486 raise IOError, ('local file error', 'not on local host')
487
488 def open_ftp(self, url):
489 """Use FTP protocol."""
490 if not isinstance(url, str):
491 raise IOError, ('ftp error', 'proxy support for ftp protocol currently not implemented')
492 import mimetypes, mimetools
493 try:
494 from cStringIO import StringIO
495 except ImportError:
496 from StringIO import StringIO
497 host, path = splithost(url)
498 if not host: raise IOError, ('ftp error', 'no host given')
499 host, port = splitport(host)
500 user, host = splituser(host)
501 if user: user, passwd = splitpasswd(user)
502 else: passwd = None
503 host = unquote(host)
504 user = unquote(user or '')
505 passwd = unquote(passwd or '')
506 host = socket.gethostbyname(host)
507 if not port:
508 import ftplib
509 port = ftplib.FTP_PORT
510 else:
511 port = int(port)
512 path, attrs = splitattr(path)
513 path = unquote(path)
514 dirs = path.split('/')
515 dirs, file = dirs[:-1], dirs[-1]
516 if dirs and not dirs[0]: dirs = dirs[1:]
517 if dirs and not dirs[0]: dirs[0] = '/'
518 key = user, host, port, '/'.join(dirs)