Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Blame
Revision Log

urllib.py@ 3225

Visit:

Last change on this file since 3225 was 3225, checked in by bird, 19 years ago
Python 2.5
File size: 53.2 KB

Line
1	"""Open an arbitrary URL.
2
3	See the following document for more info on URLs:
4	"Names and Addresses, URIs, URLs, URNs, URCs", at
5	http://www.w3.org/pub/WWW/Addressing/Overview.html
6
7	See also the HTTP spec (from which the error codes are derived):
8	"HTTP - Hypertext Transfer Protocol", at
9	http://www.w3.org/pub/WWW/Protocols/
10
11	Related standards and specs:
12	- RFC1808: the "relative URL" spec. (authoritative status)
13	- RFC1738 - the "URL standard". (authoritative status)
14	- RFC1630 - the "URI spec". (informational status)
15
16	The object returned by URLopener().open(file) will differ per
17	protocol. All you know is that it has methods read(), readline(),
18	readlines(), fileno(), close() and info(). The read*(), fileno()
19	and close() methods work like those of open files.
20	The info() method returns a mimetools.Message object which can be
21	used to query various info about the object, if available.
22	(mimetools.Message objects are queried with the getheader() method.)
23	"""
24
25	import string
26	import socket
27	import os
28	import time
29	import sys
30	from urlparse import urljoin as basejoin
31
# Public names of this module (what "from urllib import *" exposes).
__all__ = ["urlopen", "URLopener", "FancyURLopener", "urlretrieve",
           "urlcleanup", "quote", "quote_plus", "unquote", "unquote_plus",
           "urlencode", "url2pathname", "pathname2url", "splittag",
           "localhost", "thishost", "ftperrors", "basejoin", "unwrap",
           "splittype", "splithost", "splituser", "splitpasswd", "splitport",
           "splitnport", "splitquery", "splitattr", "splitvalue",
           "splitgophertype", "getproxies"]

# Interpolated into URLopener.version, which URLopener.__init__ installs
# as the default User-Agent header.
__version__ = '1.17'    # XXX This version is not always updated :-(

MAXFTPCACHE = 10        # Trim the ftp cache beyond this size
43
44	# Helper for non-unix systems
if os.name == 'mac':
    from macurl2path import url2pathname, pathname2url
elif os.name == 'nt':
    from nturl2path import url2pathname, pathname2url
elif os.name == 'riscos':
    from rourl2path import url2pathname, pathname2url
else:
    # No platform-specific module is selected: the conversion is plain
    # unquoting (URL -> path) and quoting (path -> URL).
    def url2pathname(pathname):
        """OS-specific conversion from a relative URL of the 'file' scheme
        to a file system path; not recommended for general use."""
        return unquote(pathname)

    def pathname2url(pathname):
        """OS-specific conversion from a file system path to a relative URL
        of the 'file' scheme; not recommended for general use."""
        return quote(pathname)
61
62	# This really consists of two pieces:
63	# (1) a class which handles opening of all sorts of URLs
64	# (plus assorted utilities etc.)
65	# (2) a set of functions for parsing URLs
66	# XXX Should these be separated out into different modules?
67
68
69	# Shortcut for basic usage
_urlopener = None
def urlopen(url, data=None, proxies=None):
    """urlopen(url [, data [, proxies]]) -> open file-like object

    url     -- the URL to open.
    data    -- optional; when not None it is passed to the opener's
               open() method together with the URL.
    proxies -- optional proxy mapping.  When given, a fresh
               FancyURLopener is built for this call only and is not
               remembered; otherwise the module-wide opener is created
               on first use and reused afterwards.
    """
    global _urlopener
    if proxies is not None:
        # One-off opener: never stored in the module-level slot.
        opener = FancyURLopener(proxies=proxies)
    else:
        if not _urlopener:
            _urlopener = FancyURLopener()
        opener = _urlopener
    if data is not None:
        return opener.open(url, data)
    return opener.open(url)
def urlretrieve(url, filename=None, reporthook=None, data=None):
    """Retrieve *url* through the module-wide opener.

    The shared FancyURLopener is created on first use.  All arguments
    are forwarded unchanged to its retrieve() method, whose result is
    returned.
    """
    global _urlopener
    opener = _urlopener
    if not opener:
        opener = _urlopener = FancyURLopener()
    return opener.retrieve(url, filename, reporthook, data)
def urlcleanup():
    """Run cleanup() on the module-wide opener, if one has been created."""
    opener = _urlopener
    if opener:
        opener.cleanup()
93
94	# exception raised when downloaded size does not match content-length
class ContentTooShortError(IOError):
    """Raised when fewer bytes arrive than Content-Length announced.

    message -- description, handed on to IOError.
    content -- object supplied by the raiser; kept unchanged on the
               instance as the ``content`` attribute.
    """
    def __init__(self, message, content):
        self.content = content
        IOError.__init__(self, message)
99
# Module-level ftp cache; URLopener.__init__ installs this same dict as
# the default .ftpcache of every opener instance.
ftpcache = {}
101	class URLopener:
102	"""Class to open URLs.
103	This is a class rather than just a subroutine because we may need
104	more than one set of global protocol-specific options.
105	Note -- this is a base class for those who don't want the
106	automatic handling of errors type 302 (relocated) and 401
107	(authorization needed)."""
108
109	__tempfiles = None
110
111	version = "Python-urllib/%s" % __version__
112
113	# Constructor
def __init__(self, proxies=None, **x509):
    """Set up an opener.

    proxies -- mapping consulted by open() with the URL type as key;
               when None, the result of getproxies() is used.
    x509    -- optional 'key_file' and 'cert_file' keyword arguments,
               stored on the instance (missing ones become None).
    """
    if proxies is None:
        proxies = getproxies()
    assert hasattr(proxies, 'has_key'), "proxies must be a mapping"
    self.proxies = proxies
    self.key_file, self.cert_file = x509.get('key_file'), x509.get('cert_file')
    self.addheaders = [('User-Agent', self.version)]
    # Bound here so cleanup() can still unlink without touching globals.
    self.__unlink = os.unlink
    self.__tempfiles = []
    # Undocumented feature: if you assign {} to tempcache,
    # it is used to cache files retrieved with
    # self.retrieve().  This is not enabled by default
    # since it does not work for changing documents (and I
    # haven't got the logic to check expiration headers
    # yet).
    self.tempcache = None
    # Undocumented feature: you can use a different
    # ftp cache by assigning to the .ftpcache member;
    # in case you want logically independent URL openers
    # XXX This is not threadsafe.  Bah.
    self.ftpcache = ftpcache
136
137	def __del__(self):
138	self.close()
139
def close(self):
    """Close the opener; this simply runs cleanup()."""
    self.cleanup()
142
def cleanup(self):
    """Unlink the temporary files recorded on this opener and clear
    its tempcache.  OSError from an individual unlink is ignored."""
    # This code sometimes runs when the rest of this module
    # has already been deleted, so it can't use any globals
    # or import anything -- only instance attributes are used.
    pending = self.__tempfiles
    if pending:
        remove = self.__unlink
        for path in pending:
            try:
                remove(path)
            except OSError:
                pass
        del pending[:]
    cache = self.tempcache
    if cache:
        cache.clear()
156
def addheader(self, *args):
    """Add a header to be used by the HTTP interface only.

    The positional arguments are appended to self.addheaders as a
    single tuple, e.g. u.addheader('Accept', 'sound/basic').
    """
    header = args
    self.addheaders.append(header)
161
162	# External interface
163	def open(self, fullurl, data=None):
164	"""Use URLopener().open(file) instead of open(file, 'r')."""
165	fullurl = unwrap(toBytes(fullurl))
166	if self.tempcache and fullurl in self.tempcache:
167	filename, headers = self.tempcache[fullurl]
168	fp = open(filename, 'rb')
169	return addinfourl(fp, headers, fullurl)
170	urltype, url = splittype(fullurl)
171	if not urltype:
172	urltype = 'file'
173	if urltype in self.proxies:
174	proxy = self.proxies[urltype]
175	urltype, proxyhost = splittype(proxy)
176	host, selector = splithost(proxyhost)
177	url = (host, fullurl) # Signal special case to open_*()
178	else:
179	proxy = None
180	name = 'open_' + urltype
181	self.type = urltype
182	name = name.replace('-', '_')
183	if not hasattr(self, name):
184	if proxy:
185	return self.open_unknown_proxy(proxy, fullurl, data)
186	else:
187	return self.open_unknown(fullurl, data)
188	try:
189	if data is None:
190	return getattr(self, name)(url)
191	else:
192	return getattr(self, name)(url, data)
193	except socket.error, msg:
194	raise IOError, ('socket error', msg), sys.exc_info()[2]
195
def open_unknown(self, fullurl, data=None):
    """Overridable interface to open unknown URL type.

    This default raises IOError('url error', 'unknown url type', type).
    """
    scheme, _rest = splittype(fullurl)
    raise IOError('url error', 'unknown url type', scheme)
200
def open_unknown_proxy(self, proxy, fullurl, data=None):
    """Overridable interface for a proxy whose URL type has no handler.

    This default raises
    IOError('url error', 'invalid proxy for <type>', proxy).
    """
    scheme, _rest = splittype(fullurl)
    raise IOError('url error', 'invalid proxy for %s' % scheme, proxy)
205
206	# External interface
def retrieve(self, url, filename=None, reporthook=None, data=None):
    """retrieve(url) returns (filename, headers) for a local object
    or (tempfilename, headers) for a remote object.

    url        -- URL to fetch.
    filename   -- destination path; when omitted, a local 'file' URL is
                  returned in place and anything else is copied to a
                  temporary file that cleanup() removes later.
    reporthook -- optional callable invoked as
                  reporthook(blocknum, blocksize, totalsize) once before
                  the first read and once after every block; totalsize
                  is -1 when no Content-Length header is present.
    data       -- passed through to open().

    Raises ContentTooShortError when a Content-Length header is present
    and fewer bytes than announced were read.
    """
    url = unwrap(toBytes(url))
    if self.tempcache and url in self.tempcache:
        return self.tempcache[url]
    scheme, url1 = splittype(url)
    if filename is None and (not scheme or scheme == 'file'):
        try:
            fp = self.open_local_file(url1)
            hdrs = fp.info()
            # Close explicitly instead of relying on garbage collection.
            fp.close()
            return url2pathname(splithost(url1)[1]), hdrs
        except IOError:
            # Not usable as a local file; fall through to open().
            pass
    fp = self.open(url, data)
    try:
        headers = fp.info()
        if filename:
            tfp = open(filename, 'wb')
        else:
            import tempfile
            garbage, path = splittype(url)
            garbage, path = splithost(path or "")
            path, garbage = splitquery(path or "")
            path, garbage = splitattr(path or "")
            suffix = os.path.splitext(path)[1]
            (fd, filename) = tempfile.mkstemp(suffix)
            self.__tempfiles.append(filename)
            tfp = os.fdopen(fd, 'wb')
        try:
            result = filename, headers
            if self.tempcache is not None:
                self.tempcache[url] = result
            bs = 1024*8
            size = -1
            read = 0
            blocknum = 0
            # Parse the announced size whether or not a reporthook was
            # given, so the short-download check below always applies.
            if "content-length" in headers:
                size = int(headers["Content-Length"])
            if reporthook:
                reporthook(blocknum, bs, size)
            while 1:
                block = fp.read(bs)
                if block == "":
                    break
                read += len(block)
                tfp.write(block)
                blocknum += 1
                if reporthook:
                    reporthook(blocknum, bs, size)
        finally:
            # Release the output file even when a read/write fails.
            tfp.close()
    finally:
        fp.close()

    # raise exception if actual size does not match content-length header
    if size >= 0 and read < size:
        raise ContentTooShortError("retrieval incomplete: got only %i out "
                                   "of %i bytes" % (read, size), result)

    return result
267
268	# Each method named open_<type> knows how to open that type of URL
269
def open_http(self, url, data=None):
    """Use HTTP protocol.

    url  -- either a string (direct request) or the (proxyhost, fullurl)
            tuple that open() builds when a proxy is configured.
    data -- when not None the request is a POST with this body sent as
            application/x-www-form-urlencoded; otherwise a GET.

    Returns an addinfourl object on status 200; any other status is
    handed to http_error().  Raises IOError when no host can be
    determined.
    """
    import httplib
    import base64
    user_passwd = None
    proxy_passwd = None
    if isinstance(url, str):
        host, selector = splithost(url)
        if host:
            user_passwd, host = splituser(host)
            host = unquote(host)
        realhost = host
    else:
        host, selector = url
        # check whether the proxy contains authorization information
        proxy_passwd, host = splituser(host)
        # now we proceed with the url we want to obtain
        urltype, rest = splittype(selector)
        url = rest
        user_passwd = None
        if urltype.lower() != 'http':
            realhost = None
        else:
            realhost, rest = splithost(rest)
            if realhost:
                user_passwd, realhost = splituser(realhost)
            if user_passwd:
                # Rebuild the selector without the user:passwd part.
                selector = "%s://%s%s" % (urltype, realhost, rest)
            if proxy_bypass(realhost):
                host = realhost

    if not host:
        raise IOError('http error', 'no host given')

    # b64encode never inserts line breaks, unlike encodestring, which
    # wraps long input and would put a newline inside the header value.
    if proxy_passwd:
        proxy_auth = base64.b64encode(proxy_passwd)
    else:
        proxy_auth = None

    if user_passwd:
        auth = base64.b64encode(user_passwd)
    else:
        auth = None
    h = httplib.HTTP(host)
    if data is not None:
        h.putrequest('POST', selector)
        h.putheader('Content-Type', 'application/x-www-form-urlencoded')
        h.putheader('Content-Length', '%d' % len(data))
    else:
        h.putrequest('GET', selector)
    if proxy_auth:
        h.putheader('Proxy-Authorization', 'Basic %s' % proxy_auth)
    if auth:
        h.putheader('Authorization', 'Basic %s' % auth)
    if realhost:
        h.putheader('Host', realhost)
    for args in self.addheaders:
        h.putheader(*args)
    h.endheaders()
    if data is not None:
        h.send(data)
    errcode, errmsg, headers = h.getreply()
    fp = h.getfile()
    if errcode == 200:
        return addinfourl(fp, headers, "http:" + url)
    if data is None:
        return self.http_error(url, fp, errcode, errmsg, headers)
    return self.http_error(url, fp, errcode, errmsg, headers, data)
337
def http_error(self, url, fp, errcode, errmsg, headers, data=None):
    """Handle http errors.

    Derived class can override this, or provide specific handlers
    named http_error_DDD where DDD is the 3-digit error code.  A
    specific handler receives *data* only when it is not None; if the
    handler is missing or returns a false value, http_error_default()
    is used instead.
    """
    name = 'http_error_%d' % errcode
    if hasattr(self, name):
        args = (url, fp, errcode, errmsg, headers)
        if data is not None:
            args += (data,)
        outcome = getattr(self, name)(*args)
        if outcome:
            return outcome
    return self.http_error_default(url, fp, errcode, errmsg, headers)
352
def http_error_default(self, url, fp, errcode, errmsg, headers):
    """Default error handler: close the connection and raise IOError.

    The response body is read and discarded before *fp* is closed.
    """
    fp.read()
    fp.close()
    raise IOError('http error', errcode, errmsg, headers)
358
if hasattr(socket, "ssl"):
    def open_https(self, url, data=None):
        """Use HTTPS protocol.

        url  -- either a string (direct request) or the
                (proxyhost, fullurl) tuple that open() builds when a
                proxy is configured.
        data -- when not None the request is a POST with this body sent
                as application/x-www-form-urlencoded; otherwise a GET.

        The connection is made with self.key_file and self.cert_file.
        Returns an addinfourl object on status 200; any other status is
        handed to http_error().  Raises IOError when no host can be
        determined.
        """
        import httplib
        import base64
        user_passwd = None
        proxy_passwd = None
        if isinstance(url, str):
            host, selector = splithost(url)
            if host:
                user_passwd, host = splituser(host)
                host = unquote(host)
            realhost = host
        else:
            host, selector = url
            # here, we determine, whether the proxy contains authorization information
            proxy_passwd, host = splituser(host)
            urltype, rest = splittype(selector)
            url = rest
            user_passwd = None
            if urltype.lower() != 'https':
                realhost = None
            else:
                realhost, rest = splithost(rest)
                if realhost:
                    user_passwd, realhost = splituser(realhost)
                if user_passwd:
                    # Rebuild the selector without the user:passwd part.
                    selector = "%s://%s%s" % (urltype, realhost, rest)
        if not host:
            raise IOError('https error', 'no host given')
        # b64encode never inserts line breaks, unlike encodestring, which
        # wraps long input and would put a newline inside the header value.
        if proxy_passwd:
            proxy_auth = base64.b64encode(proxy_passwd)
        else:
            proxy_auth = None
        if user_passwd:
            auth = base64.b64encode(user_passwd)
        else:
            auth = None
        h = httplib.HTTPS(host, 0,
                          key_file=self.key_file,
                          cert_file=self.cert_file)
        if data is not None:
            h.putrequest('POST', selector)
            h.putheader('Content-Type',
                        'application/x-www-form-urlencoded')
            h.putheader('Content-Length', '%d' % len(data))
        else:
            h.putrequest('GET', selector)
        # putheader() takes the header name and its value as separate
        # arguments, exactly as open_http() passes them.
        if proxy_auth:
            h.putheader('Proxy-Authorization', 'Basic %s' % proxy_auth)
        if auth:
            h.putheader('Authorization', 'Basic %s' % auth)
        if realhost:
            h.putheader('Host', realhost)
        for args in self.addheaders:
            h.putheader(*args)
        h.endheaders()
        if data is not None:
            h.send(data)
        errcode, errmsg, headers = h.getreply()
        fp = h.getfile()
        if errcode == 200:
            return addinfourl(fp, headers, "https:" + url)
        if data is None:
            return self.http_error(url, fp, errcode, errmsg, headers)
        return self.http_error(url, fp, errcode, errmsg, headers,
                               data)
425
def open_gopher(self, url):
    """Use Gopher protocol.

    *url* must be a string; the (proxyhost, fullurl) tuple form is
    rejected with IOError because gopher proxying is not implemented.
    Also raises IOError when the URL has no host.
    """
    if not isinstance(url, str):
        raise IOError('gopher error', 'proxy support for gopher protocol currently not implemented')
    import gopherlib
    host, selector = splithost(url)
    if not host:
        raise IOError('gopher error', 'no host given')
    host = unquote(host)
    gtype, selector = splitgophertype(selector)
    selector, query = splitquery(selector)
    selector = unquote(selector)
    if not query:
        fp = gopherlib.send_selector(selector, host)
    else:
        query = unquote(query)
        fp = gopherlib.send_query(selector, query, host)
    return addinfourl(fp, noheaders(), "gopher:" + url)
443
def open_file(self, url):
    """Use local file or FTP depending on form of URL.

    A URL that starts with '//' followed by a host part other than
    empty or 'localhost' (any case) goes to open_ftp(); everything
    else goes to open_local_file().  The proxy tuple form of *url* is
    rejected with IOError.
    """
    if not isinstance(url, str):
        raise IOError('file error', 'proxy support for file protocol currently not implemented')
    names_other_host = (url[:2] == '//'
                        and url[2:3] != '/'
                        and url[2:12].lower() != 'localhost/')
    if names_other_host:
        return self.open_ftp(url)
    return self.open_local_file(url)
452
453	def open_local_file(self, url):
454	"""Use local file."""
455	import mimetypes, mimetools, email.Utils
456	try:
457	from cStringIO import StringIO
458	except ImportError:
459	from StringIO import StringIO
460	host, file = splithost(url)
461	localname = url2pathname(file)
462	try:
463	stats = os.stat(localname)
464	except OSError, e:
465	raise IOError(e.errno, e.strerror, e.filename)
466	size = stats.st_size
467	modified = email.Utils.formatdate(stats.st_mtime, usegmt=True)
468	mtype = mimetypes.guess_type(url)[0]
469	headers = mimetools.Message(StringIO(
470	'Content-Type: %s\nContent-Length: %d\nLast-modified: %s\n' %
471	(mtype or 'text/plain', size, modified)))
472	if not host:
473	urlfile = file
474	if file[:1] == '/':
475	urlfile = 'file://' + file
476	return addinfourl(open(localname, 'rb'),
477	headers, urlfile)
478	host, port = splitport(host)
479	if not port \
480	and socket.gethostbyname(host) in (localhost(), thishost()):
481	urlfile = file
482	if file[:1] == '/':
483	urlfile = 'file://' + file
484	return addinfourl(open(localname, 'rb'),
485	headers, urlfile)
486	raise IOError, ('local file error', 'not on local host')
487
488	def open_ftp(self, url):
489	"""Use FTP protocol."""
490	if not isinstance(url, str):
491	raise IOError, ('ftp error', 'proxy support for ftp protocol currently not implemented')
492	import mimetypes, mimetools
493	try:
494	from cStringIO import StringIO
495	except ImportError:
496	from StringIO import StringIO
497	host, path = splithost(url)
498	if not host: raise IOError, ('ftp error', 'no host given')
499	host, port = splitport(host)
500	user, host = splituser(host)
501	if user: user, passwd = splitpasswd(user)
502	else: passwd = None
503	host = unquote(host)
504	user = unquote(user or '')
505	passwd = unquote(passwd or '')
506	host = socket.gethostbyname(host)
507	if not port:
508	import ftplib
509	port = ftplib.FTP_PORT
510	else:
511	port = int(port)
512	path, attrs = splitattr(path)
513	path = unquote(path)
514	dirs = path.split('/')
515	dirs, file = dirs[:-1], dirs[-1]
516	if dirs and not dirs[0]: dirs = dirs[1:]
517	if dirs and not dirs[0]: dirs[0] = '/'
518	key = user, host, port, '/'.join(dirs)