Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Blame
Revision Log

header.py@ 3951

Visit:

Last change on this file since 3951 was 3225, checked in by bird, 19 years ago
Python 2.5
File size: 21.2 KB

Line
1	# Copyright (C) 2002-2006 Python Software Foundation
2	# Author: Ben Gertzfield, Barry Warsaw
3	# Contact: [email protected]
4
5	"""Header encoding and decoding functionality."""
6
7	__all__ = [
8	'Header',
9	'decode_header',
10	'make_header',
11	]
12
13	import re
14	import binascii
15
16	import email.quoprimime
17	import email.base64mime
18
19	from email.errors import HeaderParseError
20	from email.charset import Charset
21
# String constants shared by the header encoder/decoder below.
NL = '\n'
SPACE = ' '
USPACE = u' '
# Width used when expanding a hard tab in continuation whitespace.
SPACE8 = ' ' * 8
UEMPTYSTRING = u''

# Default maximum line length used by Header when maxlinelen is not given.
MAXLINELEN = 76

USASCII = Charset('us-ascii')
UTF8 = Charset('utf-8')

# Match encoded-word strings in the form =?charset?q?Hello_World?=
ecre = re.compile(r'''
  =\?                   # literal =?
  (?P<charset>[^?]*?)   # non-greedy up to the next ? is the charset
  \?                    # literal ?
  (?P<encoding>[qb])    # either a "q" or a "b", case insensitive
  \?                    # literal ?
  (?P<encoded>.*?)      # non-greedy up to the next ?= is the encoded string
  \?=                   # literal ?=
  ''', re.VERBOSE | re.IGNORECASE)

# Field name regexp, including trailing colon, but not separating whitespace,
# according to RFC 2822.  Character range is from exclamation mark (\041) to
# tilde (\176).  For use with .match()
fcre = re.compile(r'[\041-\176]+:$')



# Helpers
_max_append = email.quoprimime._max_append
54
55
56
57
def decode_header(header):
    """Decode a message header value without converting charset.

    The result is a list of (decoded_string, charset) pairs, one per decoded
    part of the header.  Parts that were not encoded carry a charset of None;
    encoded parts carry the lower-cased character set name found in the
    encoded word.

    An email.Errors.HeaderParseError may be raised when certain decoding error
    occurs (e.g. a base64 decoding exception).
    """
    header = str(header)
    # Fast path: the value contains no encoded word at all.
    if ecre.search(header) is None:
        return [(header, None)]
    decoded = []
    for line in header.splitlines():
        # A physical line without any encoded word is passed through as-is.
        if ecre.search(line) is None:
            decoded.append((line, None))
            continue
        # ecre has three groups, so the split result is laid out as
        # [text, charset, encoding, payload, text, charset, ...]; walk it
        # four items at a time.
        fields = ecre.split(line)
        for base in range(0, len(fields), 4):
            plain = fields[base].strip()
            if plain:
                # Glue onto a preceding unencoded part, if there is one.
                if decoded and decoded[-1][1] is None:
                    decoded[-1] = (decoded[-1][0] + SPACE + plain, None)
                else:
                    decoded.append((plain, None))
            word = fields[base + 1:base + 4]
            if not word:
                # Trailing text only; no encoded word follows it.
                continue
            charset, encoding = [item.lower() for item in word[:2]]
            encoded = word[2]
            if encoding == 'q':
                dec = email.quoprimime.header_decode(encoded)
            elif encoding == 'b':
                try:
                    dec = email.base64mime.decode(encoded)
                except binascii.Error:
                    # Turn this into a higher level exception.  The lower
                    # level exception is discarded here.
                    raise HeaderParseError
            else:
                dec = None
            if dec is None:
                dec = encoded

            # Adjacent encoded words in the same charset are merged.
            if decoded and decoded[-1][1] == charset:
                previous, previous_charset = decoded[-1]
                decoded[-1] = (previous + dec, previous_charset)
            else:
                decoded.append((dec, charset))
    return decoded
112
113
114
115
def make_header(decoded_seq, maxlinelen=None, header_name=None,
                continuation_ws=' '):
    """Build a Header from (decoded_string, charset) pairs.

    decoded_seq is a sequence of pairs in the format produced by
    decode_header(): each pair is (decoded_string, charset), where charset is
    either None, the string name of a character set, or a Charset instance.

    Optional maxlinelen, header_name, and continuation_ws are handed to the
    Header constructor unchanged.  The populated Header instance is returned.
    """
    result = Header(maxlinelen=maxlinelen, header_name=header_name,
                    continuation_ws=continuation_ws)
    for text, cs in decoded_seq:
        # A charset of None is forwarded untouched to append(); a plain
        # charset name is wrapped in a Charset instance first.
        already_usable = cs is None or isinstance(cs, Charset)
        if not already_usable:
            cs = Charset(cs)
        result.append(text, cs)
    return result
136
137
138
139
class Header:
    """A header value built from chunks, each tagged with a character set.

    Chunks are accumulated with append() and rendered, encoded and
    line-wrapped, by encode() (also used for str()).
    """

    def __init__(self, s=None, charset=None,
                 maxlinelen=None, header_name=None,
                 continuation_ws=' ', errors='strict'):
        """Create a MIME-compliant header that can contain many character sets.

        Optional s is the initial header value.  If None, the initial header
        value is not set.  You can later append to the header with .append()
        method calls.  s may be a byte string or a Unicode string, but see the
        .append() documentation for semantics.

        Optional charset serves two purposes: it has the same meaning as the
        charset argument to the .append() method.  It also sets the default
        character set for all subsequent .append() calls that omit the charset
        argument.  If charset is not provided in the constructor, the us-ascii
        charset is used both as s's initial charset and as the default for
        subsequent .append() calls.

        The maximum line length can be specified explicit via maxlinelen.  For
        splitting the first line to a shorter value (to account for the field
        header which isn't included in s, e.g. `Subject') pass in the name of
        the field in header_name.  The default maxlinelen is 76.

        continuation_ws must be RFC 2822 compliant folding whitespace (usually
        either a space or a hard tab) which will be prepended to continuation
        lines.

        errors is passed through to the .append() call.
        """
        if charset is None:
            charset = USASCII
        if not isinstance(charset, Charset):
            charset = Charset(charset)
        self._charset = charset
        self._continuation_ws = continuation_ws
        # A hard tab in the continuation whitespace counts as 8 columns.
        cws_expanded_len = len(continuation_ws.replace('\t', SPACE8))
        # BAW: I believe `chunks' and `maxlinelen' should be non-public.
        self._chunks = []
        if s is not None:
            self.append(s, charset, errors)
        if maxlinelen is None:
            maxlinelen = MAXLINELEN
        if header_name is None:
            # We don't know anything about the field header so the first line
            # is the same length as subsequent lines.
            self._firstlinelen = maxlinelen
        else:
            # The first line should be shorter to take into account the field
            # header.  Also subtract off 2 extra for the colon and space.
            self._firstlinelen = maxlinelen - len(header_name) - 2
        # Second and subsequent lines should subtract off the length in
        # columns of the continuation whitespace prefix.
        self._maxlinelen = maxlinelen - cws_expanded_len

    def __str__(self):
        """A synonym for self.encode()."""
        return self.encode()

    def __unicode__(self):
        """Helper for the built-in unicode function."""
        uchunks = []
        lastcs = None
        for s, charset in self._chunks:
            # We must preserve spaces between encoded and non-encoded word
            # boundaries, which means for us we need to add a space when we go
            # from a charset to None/us-ascii, or from None/us-ascii to a
            # charset.  Only do this for the second and subsequent chunks.
            nextcs = charset
            if uchunks:
                if lastcs not in (None, 'us-ascii'):
                    if nextcs in (None, 'us-ascii'):
                        uchunks.append(USPACE)
                        nextcs = None
                elif nextcs not in (None, 'us-ascii'):
                    uchunks.append(USPACE)
            lastcs = nextcs
            uchunks.append(unicode(s, str(charset)))
        return UEMPTYSTRING.join(uchunks)

    # Rich comparison operators for equality only.  BAW: does it make sense to
    # have or explicitly disable <, <=, >, >= operators?
    def __eq__(self, other):
        """Compare the encoded form of this header against other."""
        # other may be a Header or a string.  Both are fine so coerce
        # ourselves to a string, swap the args and do another comparison.
        return other == self.encode()

    def __ne__(self, other):
        """Return the negation of __eq__."""
        return not self == other

    def append(self, s, charset=None, errors='strict'):
        """Append a string to the MIME header.

        Optional charset, if given, should be a Charset instance or the name
        of a character set (which will be converted to a Charset instance).  A
        value of None (the default) means that the charset given in the
        constructor is used.

        s may be a byte string or a Unicode string.  If it is a byte string
        (i.e. isinstance(s, str) is true), then charset is the encoding of
        that byte string, and a UnicodeError will be raised if the string
        cannot be decoded with that charset.  If s is a Unicode string, then
        charset is a hint specifying the character set of the characters in
        the string.  In this case, when producing an RFC 2822 compliant header
        using RFC 2047 rules, the Unicode string will be encoded using the
        following charsets in order: us-ascii, the charset hint, utf-8.  The
        first character set not to provoke a UnicodeError is used.

        Optional `errors' is passed as the third argument to any unicode() or
        ustr.encode() call.

        An AssertionError is raised if a Unicode string cannot be encoded
        with any of the three candidate charsets.
        """
        if charset is None:
            charset = self._charset
        elif not isinstance(charset, Charset):
            charset = Charset(charset)
        # If the charset is our faux 8bit charset, leave the string unchanged
        if charset != '8bit':
            # We need to test that the string can be converted to unicode and
            # back to a byte string, given the input and output codecs of the
            # charset.
            if isinstance(s, str):
                # Possibly raise UnicodeError if the byte string can't be
                # converted to a unicode with the input codec of the charset.
                incodec = charset.input_codec or 'us-ascii'
                ustr = unicode(s, incodec, errors)
                # Now make sure that the unicode could be converted back to a
                # byte string with the output codec, which may be different
                # than the input codec.  Still, use the original byte string.
                outcodec = charset.output_codec or 'us-ascii'
                ustr.encode(outcodec, errors)
            elif isinstance(s, unicode):
                # Now we have to be sure the unicode string can be converted
                # to a byte string with a reasonable output codec.  We want to
                # use the byte string in the chunk.
                for charset in USASCII, charset, UTF8:
                    try:
                        outcodec = charset.output_codec or 'us-ascii'
                        s = s.encode(outcodec, errors)
                        break
                    except UnicodeError:
                        pass
                else:
                    # Raised explicitly (rather than via an assert statement)
                    # so the failure is still reported when running with -O.
                    raise AssertionError('utf-8 conversion failed')
        self._chunks.append((s, charset))

    def _split(self, s, charset, maxlinelen, splitchars):
        """Split s into a list of (string, charset) pairs.

        maxlinelen is the room available for the first piece; the remainder
        is split recursively against self._maxlinelen.
        """
        # Split up a header safely for use with encode_chunks.
        splittable = charset.to_splittable(s)
        encoded = charset.from_splittable(splittable, True)
        elen = charset.encoded_header_len(encoded)
        # If the line's encoded length fits, just return it
        if elen <= maxlinelen:
            return [(encoded, charset)]
        # If we have undetermined raw 8bit characters sitting in a byte
        # string, we really don't know what the right thing to do is.  We
        # can't really split it because it might be multibyte data which we
        # could break if we split it between pairs.  The least harm seems to
        # be to not split the header at all, but that means they could go out
        # longer than maxlinelen.
        if charset == '8bit':
            return [(s, charset)]
        # BAW: I'm not sure what the right test here is.  What we're trying to
        # do is be faithful to RFC 2822's recommendation that ($2.2.3):
        #
        # "Note: Though structured field bodies are defined in such a way that
        #  folding can take place between many of the lexical tokens (and even
        #  within some of the lexical tokens), folding SHOULD be limited to
        #  placing the CRLF at higher-level syntactic breaks."
        #
        # For now, I can only imagine doing this when the charset is us-ascii,
        # although it's possible that other charsets may also benefit from the
        # higher-level syntactic breaks.
        elif charset == 'us-ascii':
            return self._split_ascii(s, charset, maxlinelen, splitchars)
        # BAW: should we use encoded?
        elif elen == len(s):
            # We can split on _maxlinelen boundaries because we know that the
            # encoding won't change the size of the string
            splitpnt = maxlinelen
            first = charset.from_splittable(splittable[:splitpnt], False)
            last = charset.from_splittable(splittable[splitpnt:], False)
        else:
            # Binary search for split point
            first, last = _binsplit(splittable, charset, maxlinelen)
        # first is of the proper length so just wrap it in the appropriate
        # chrome.  last must be recursively split.
        fsplittable = charset.to_splittable(first)
        fencoded = charset.from_splittable(fsplittable, True)
        chunk = [(fencoded, charset)]
        return chunk + self._split(last, charset, self._maxlinelen, splitchars)

    def _split_ascii(self, s, charset, firstlen, splitchars):
        """Split an ASCII string and pair every resulting line with charset."""
        chunks = _split_ascii(s, firstlen, self._maxlinelen,
                              self._continuation_ws, splitchars)
        # Always hand back a real list so _split() has one return type.
        return list(zip(chunks, [charset]*len(chunks)))

    def _encode_chunks(self, newchunks, maxlinelen):
        # MIME-encode a header with many different charsets and/or encodings.
        #
        # Given a list of pairs (string, charset), return a MIME-encoded
        # string suitable for use in a header field.  Each pair may have
        # different charsets and/or encodings, and the resulting header will
        # accurately reflect each setting.
        #
        # Each encoding can be email.Utils.QP (quoted-printable, for
        # ASCII-like character sets like iso-8859-1), email.Utils.BASE64
        # (Base64, for non-ASCII like character sets like KOI8-R and
        # iso-2022-jp), or None (no encoding).
        #
        # Each pair will be represented on a separate line; the resulting
        # string will be in the format:
        #
        # =?charset1?q?Mar=EDa_Gonz=E1lez_Alonso?=\n
        #  =?charset2?b?SvxyZ2VuIEL2aW5n?="
        chunks = []
        for header, charset in newchunks:
            # Empty pieces contribute nothing to the output.
            if not header:
                continue
            if charset is None or charset.header_encoding is None:
                s = header
            else:
                s = charset.header_encode(header)
            # Don't add more folding whitespace than necessary
            if chunks and chunks[-1].endswith(' '):
                extra = ''
            else:
                extra = ' '
            _max_append(chunks, s, maxlinelen, extra)
        joiner = NL + self._continuation_ws
        return joiner.join(chunks)

    def encode(self, splitchars=';, '):
        """Encode a message header into an RFC-compliant format.

        There are many issues involved in converting a given string for use in
        an email header.  Only certain character sets are readable in most
        email clients, and as header strings can only contain a subset of
        7-bit ASCII, care must be taken to properly convert and encode (with
        Base64 or quoted-printable) header strings.  In addition, there is a
        75-character length limit on any given encoded header field, so
        line-wrapping must be performed, even with double-byte character sets.

        This method will do its best to convert the string to the correct
        character set used in email, and encode and line wrap it safely with
        the appropriate scheme for that character set.

        If the given charset is not known or an error occurs during
        conversion, this function will return the header untouched.

        Optional splitchars is a string containing characters to split long
        ASCII lines on, in rough support of RFC 2822's `highest level
        syntactic breaks'.  This doesn't affect RFC 2047 encoded lines.
        """
        newchunks = []
        maxlinelen = self._firstlinelen
        lastlen = 0
        for s, charset in self._chunks:
            # The first bit of the next chunk should be just long enough to
            # fill the next line.  Don't forget the space separating the
            # encoded words.
            targetlen = maxlinelen - lastlen - 1
            if targetlen < charset.encoded_header_len(''):
                # Stick it on the next line
                targetlen = maxlinelen
            newchunks += self._split(s, charset, targetlen, splitchars)
            lastchunk, lastcharset = newchunks[-1]
            lastlen = lastcharset.encoded_header_len(lastchunk)
        return self._encode_chunks(newchunks, maxlinelen)
407
408
409
410
def _split_ascii(s, firstlen, restlen, continuation_ws, splitchars):
    """Fold an ASCII header string into a list of lines.

    s is processed one physical line at a time.  firstlen is the length
    limit for the first output line, restlen the limit for every later one.
    continuation_ws is only used for its width (a tab counts as 8 columns);
    it is not inserted into the returned lines.  splitchars is tried in
    order and the first character that occurs in a too-long line is used as
    the break character.

    Returns the list of lines, without continuation whitespace.  When the
    break character is ';' or ',' it is kept at the end of each broken line;
    any other break character is replaced by a single space.
    """
    lines = []
    maxlen = firstlen
    for line in s.splitlines():
        # Ignore any leading whitespace (i.e. continuation whitespace) already
        # on the line, since we'll be adding our own.
        line = line.lstrip()
        if len(line) < maxlen:
            lines.append(line)
            maxlen = restlen
            continue
        # Attempt to split the line at the highest-level syntactic break
        # possible.  Note that we don't have a lot of smarts about field
        # syntax; we just try to break on semi-colons, then commas, then
        # whitespace.
        for ch in splitchars:
            if ch in line:
                break
        else:
            # There's nothing useful to split the line on, not even spaces, so
            # just append this line unchanged
            lines.append(line)
            maxlen = restlen
            continue
        # Now split the line on the character plus trailing whitespace.  The
        # character is escaped so that regex metacharacters in splitchars
        # (e.g. '.', '+', '(') are matched literally instead of raising
        # re.error or matching arbitrary text.
        cre = re.compile(r'%s\s*' % re.escape(ch))
        if ch in ';,':
            eol = ch
        else:
            eol = ''
        joiner = eol + ' '
        joinlen = len(joiner)
        wslen = len(continuation_ws.replace('\t', SPACE8))
        this = []
        linelen = 0
        for part in cre.split(line):
            curlen = linelen + max(0, len(this)-1) * joinlen
            partlen = len(part)
            onfirstline = not lines
            # We don't want to split after the field name, if we're on the
            # first line and the field name is present in the header string.
            if ch == ' ' and onfirstline and \
                   len(this) == 1 and fcre.match(this[0]):
                this.append(part)
                linelen += partlen
            elif curlen + partlen > maxlen:
                if this:
                    lines.append(joiner.join(this) + eol)
                # If this part is longer than maxlen and we aren't already
                # splitting on whitespace, try to recursively split this line
                # on whitespace.
                if partlen > maxlen and ch != ' ':
                    subl = _split_ascii(part, maxlen, restlen,
                                        continuation_ws, ' ')
                    lines.extend(subl[:-1])
                    this = [subl[-1]]
                else:
                    this = [part]
                linelen = wslen + len(this[-1])
                maxlen = restlen
            else:
                this.append(part)
                linelen += partlen
        # Put any left over parts on a line by themselves
        if this:
            lines.append(joiner.join(this))
    return lines
478
479
480
481
def _binsplit(splittable, charset, maxlinelen):
    """Cut splittable at the longest prefix whose encoded form fits.

    A prefix "fits" when charset.encoded_header_len() of its output-encoded
    form is at most maxlinelen.  Returns a (first, last) pair produced by
    charset.from_splittable(..., False) for the prefix and for the rest.
    first is the empty conversion when not even one element fits.
    """
    def fits(count):
        # True when the first `count` elements encode within the limit.
        candidate = charset.from_splittable(splittable[:count], True)
        return charset.encoded_header_len(candidate) <= maxlinelen

    # Binary search for the largest fitting prefix length.  Throughout the
    # loop every length <= lo is known (or, for 0, assumed) to fit and no
    # length > hi can fit.
    lo, hi = 0, len(splittable)
    while lo < hi:
        # Round the midpoint up so that lo < mid <= hi and the range always
        # shrinks.
        mid = (lo + hi + 1) // 2
        if fits(mid):
            lo = mid
        else:
            hi = mid - 1
    # lo == hi here: the longest prefix that fits.
    head = charset.from_splittable(splittable[:lo], False)
    tail = charset.from_splittable(splittable[lo:], False)
    return head, tail

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: trunk/essentials/dev-lang/python/Lib/email/header.py@ 3951

Download in other formats: