Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Blame
Revision Log

rfc822.py@ 3298

Visit:

Last change on this file since 3298 was 3225, checked in by bird, 19 years ago
Python 2.5
File size: 32.3 KB

Line
1	"""RFC 2822 message manipulation.
2
3	Note: This is only a very rough sketch of a full RFC-822 parser; in particular
4	the tokenizing of addresses does not adhere to all the quoting rules.
5
6	Note: RFC 2822 is a long awaited update to RFC 822. This module should
7	conform to RFC 2822, and is thus mis-named (it's not worth renaming it). Some
8	effort at RFC 2822 updates have been made, but a thorough audit has not been
9	performed. Consider any RFC 2822 non-conformance to be a bug.
10
11	RFC 2822: http://www.faqs.org/rfcs/rfc2822.html
12	RFC 822 : http://www.faqs.org/rfcs/rfc822.html (obsolete)
13
14	Directions for use:
15
16	To create a Message object: first open a file, e.g.:
17
18	fp = open(file, 'r')
19
20	You can use any other legal way of getting an open file object, e.g. use
21	sys.stdin or call os.popen(). Then pass the open file object to the Message()
22	constructor:
23
24	m = Message(fp)
25
26	This class can work with any input object that supports a readline method. If
27	the input object has seek and tell capability, the rewindbody method will
28	work; also illegal lines will be pushed back onto the input stream. If the
29	input object lacks seek but has an `unread' method that can push back a line
30	of input, Message will use that to push back illegal lines. Thus this class
31	can be used to parse messages coming from a buffered stream.
32
33	The optional `seekable' argument is provided as a workaround for certain stdio
34	libraries in which tell() discards buffered data before discovering that the
35	lseek() system call doesn't work. For maximum portability, you should set the
36	seekable argument to zero to prevent that initial tell() call when passing in
37	an unseekable object such as a file object created from a socket object. If
38	it is 1 on entry -- which it is by default -- the tell() method of the open
39	file object is called once; if this raises an exception, seekable is reset to
40	0. For other nonzero values of seekable, this test is not made.
41
42	To get the text of a particular header there are several methods:
43
44	str = m.getheader(name)
45	str = m.getrawheader(name)
46
47	where name is the name of the header, e.g. 'Subject'. The difference is that
48	getheader() strips the leading and trailing whitespace, while getrawheader()
49	doesn't. Both functions retain embedded whitespace (including newlines)
50	exactly as they are specified in the header, and leave the case of the text
51	unchanged.
52
53	For addresses and address lists there are functions
54
55	realname, mailaddress = m.getaddr(name)
56	list = m.getaddrlist(name)
57
58	where the latter returns a list of (realname, mailaddr) tuples.
59
60	There is also a method
61
62	time = m.getdate(name)
63
64	which parses a Date-like field and returns a time-compatible tuple,
65	i.e. a tuple such as returned by time.localtime() or accepted by
66	time.mktime().
67
68	See the class definition for lower level access methods.
69
70	There are also some utility functions here.
71	"""
72	# Cleanup and extensions by Eric S. Raymond <esr@thyrsus.com>
73
74	import time
75
76	__all__ = ["Message","AddressList","parsedate","parsedate_tz","mktime_tz"]
77
_blanklines = ('\r\n', '\n')    # Optimization for islast()


class Message:
    """Represents a single RFC 2822-compliant message.

    The headers are read from a file-like object when the instance is
    created.  Afterwards they are available in two forms:

    self.headers -- the raw header lines, in file order, uninterpreted.
    self.dict    -- maps each lower-cased header name to its stripped value;
                    when a header occurs more than once the last one wins.

    The instance also supports a read/write mapping interface keyed by
    (case-insensitive) header name.
    """

    def __init__(self, fp, seekable=1):
        """Initialize the class instance and read the headers.

        `fp' is any object with a readline() method.  If `seekable' is 1
        (the default) fp.tell() is called once as a probe, and seekable is
        reset to 0 if that raises AttributeError or IOError.  Any other
        value of `seekable' is taken at face value.
        """
        if seekable == 1:
            # Exercise tell() to make sure it works
            # (and then assume seek() works, too)
            try:
                fp.tell()
            except (AttributeError, IOError):
                seekable = 0
        self.fp = fp
        self.seekable = seekable
        self.startofheaders = None
        self.startofbody = None

        if self.seekable:
            try:
                self.startofheaders = self.fp.tell()
            except IOError:
                self.seekable = 0

        self.readheaders()

        if self.seekable:
            try:
                self.startofbody = self.fp.tell()
            except IOError:
                self.seekable = 0

    def rewindbody(self):
        """Rewind the file to the start of the body (if seekable).

        Raises IOError if the underlying file is not seekable.
        """
        if not self.seekable:
            # Call form of raise: valid in both Python 2 and Python 3.
            raise IOError("unseekable file")
        self.fp.seek(self.startofbody)

    def readheaders(self):
        """Read header lines.

        Read header lines up to the entirely blank line that terminates them.
        The (normally blank) line that ends the headers is skipped, but not
        included in the returned list.  If a non-header line ends the headers,
        (which is an error), an attempt is made to backspace over it; it is
        never included in the returned list.

        The variable self.status is set to the empty string if all went well,
        otherwise it is an error message.  The variable self.headers is a
        completely uninterpreted list of lines contained in the header (so
        printing them will reproduce the header exactly as it appears in the
        file).
        """
        self.dict = {}
        self.unixfrom = ''
        self.headers = lst = []
        self.status = ''
        headerseen = ""
        firstline = 1
        startofline = unread = tell = None
        # Pick the push-back strategy once: an explicit unread() method is
        # preferred, otherwise remember line offsets so we can seek back.
        if hasattr(self.fp, 'unread'):
            unread = self.fp.unread
        elif self.seekable:
            tell = self.fp.tell
        while 1:
            if tell:
                try:
                    startofline = tell()
                except IOError:
                    startofline = tell = None
                    self.seekable = 0
            line = self.fp.readline()
            if not line:
                self.status = 'EOF in headers'
                break
            # Skip unix From name time lines
            if firstline and line.startswith('From '):
                self.unixfrom = self.unixfrom + line
                continue
            firstline = 0
            if headerseen and line[0] in ' \t':
                # It's a continuation line.
                lst.append(line)
                x = (self.dict[headerseen] + "\n " + line.strip())
                self.dict[headerseen] = x.strip()
                continue
            elif self.iscomment(line):
                # It's a comment.  Ignore it.
                continue
            elif self.islast(line):
                # Note! No pushback here!  The delimiter line gets eaten.
                break
            headerseen = self.isheader(line)
            if headerseen:
                # It's a legal header line, save it.
                lst.append(line)
                self.dict[headerseen] = line[len(headerseen)+1:].strip()
                continue
            else:
                # It's not a header line; throw it back and stop here.
                if not self.dict:
                    self.status = 'No headers'
                else:
                    self.status = 'Non-header line where header expected'
                # Try to undo the read.
                if unread:
                    unread(line)
                elif tell:
                    self.fp.seek(startofline)
                else:
                    self.status = self.status + '; bad seek'
                break

    def isheader(self, line):
        """Determine whether a given line is a legal header.

        This method should return the header name, suitably canonicalized.
        You may override this method in order to use Message parsing on tagged
        data in RFC 2822-like formats with special header formats.

        This implementation returns the lower-cased text before the first
        colon, or None if there is no colon or the line starts with one.
        """
        i = line.find(':')
        if i > 0:
            return line[:i].lower()
        return None

    def islast(self, line):
        """Determine whether a line is a legal end of RFC 2822 headers.

        You may override this method if your application wants to bend the
        rules, e.g. to strip trailing whitespace, or to recognize MH template
        separators ('--------').  For convenience (e.g. for code reading from
        sockets) a line consisting of \\r\\n also matches.
        """
        return line in _blanklines

    def iscomment(self, line):
        """Determine whether a line should be skipped entirely.

        You may override this method in order to use Message parsing on tagged
        data in RFC 2822-like formats that support embedded comments or
        free-text data.  This implementation never skips a line.
        """
        return False

    def getallmatchingheaders(self, name):
        """Find all header lines matching a given header name.

        Look through the list of headers and find all lines matching a given
        header name (and their continuation lines).  A list of the lines is
        returned, without interpretation.  If the header does not occur, an
        empty list is returned.  If the header occurs multiple times, all
        occurrences are returned.  Case is not important in the header name.
        """
        name = name.lower() + ':'
        n = len(name)
        lst = []
        hit = 0
        for line in self.headers:
            if line[:n].lower() == name:
                hit = 1
            elif not line[:1].isspace():
                # A new, different header starts here; continuation lines
                # (leading whitespace) leave `hit' as it was.
                hit = 0
            if hit:
                lst.append(line)
        return lst

    def getfirstmatchingheader(self, name):
        """Get the first header line matching name.

        This is similar to getallmatchingheaders, but it returns only the
        first matching header (and its continuation lines).
        """
        name = name.lower() + ':'
        n = len(name)
        lst = []
        hit = 0
        for line in self.headers:
            if hit:
                if not line[:1].isspace():
                    break
            elif line[:n].lower() == name:
                hit = 1
            if hit:
                lst.append(line)
        return lst

    def getrawheader(self, name):
        """A higher-level interface to getfirstmatchingheader().

        Return a string containing the literal text of the header but with the
        keyword stripped.  All leading, trailing and embedded whitespace is
        kept in the string, however.  Return None if the header does not
        occur.
        """
        lst = self.getfirstmatchingheader(name)
        if not lst:
            return None
        # Drop "<name>:" from the first line; continuation lines are kept.
        lst[0] = lst[0][len(name) + 1:]
        return ''.join(lst)

    def getheader(self, name, default=None):
        """Get the header value for a name.

        This is the normal interface: it returns a stripped version of the
        header value for a given header name, or `default' (None unless
        given) if it doesn't exist.  This uses the dictionary version which
        finds the last such header.
        """
        return self.dict.get(name.lower(), default)
    get = getheader

    def getheaders(self, name):
        """Get all values for a header.

        This returns a list of values for headers given more than once; each
        value in the result list is stripped in the same way as the result of
        getheader().  If the header is not given, return an empty list.
        """
        result = []
        current = ''
        have_header = 0
        for s in self.getallmatchingheaders(name):
            if s[0].isspace():
                # Continuation line: fold it onto the value being built.
                if current:
                    current = "%s\n %s" % (current, s.strip())
                else:
                    current = s.strip()
            else:
                if have_header:
                    result.append(current)
                current = s[s.find(":") + 1:].strip()
                have_header = 1
        if have_header:
            result.append(current)
        return result

    def getaddr(self, name):
        """Get a single address from a header, as a tuple.

        An example return value:
        ('Guido van Rossum', 'guido@python.org')

        Returns (None, None) if the header yields no address.
        """
        # New, by Ben Escoto
        alist = self.getaddrlist(name)
        if alist:
            return alist[0]
        else:
            return (None, None)

    def getaddrlist(self, name):
        """Get a list of addresses from a header.

        Retrieves a list of addresses from a header, where each address is a
        tuple as returned by getaddr().  Scans all named headers, so it works
        properly with multiple To: or Cc: headers for example.
        """
        raw = []
        for h in self.getallmatchingheaders(name):
            if h[0] in ' \t':
                raw.append(h)
            else:
                if raw:
                    # Separate the addresses of successive headers.
                    raw.append(', ')
                i = h.find(':')
                if i > 0:
                    addr = h[i+1:]
                raw.append(addr)
        alladdrs = ''.join(raw)
        a = AddressList(alladdrs)
        return a.addresslist

    def getdate(self, name):
        """Retrieve a date field from a header.

        Retrieves a date field from the named header, returning a tuple
        compatible with time.mktime().  Returns None if the header is
        missing.
        """
        try:
            data = self[name]
        except KeyError:
            return None
        return parsedate(data)

    def getdate_tz(self, name):
        """Retrieve a date field from a header as a 10-tuple.

        The first 9 elements make up a tuple compatible with time.mktime(),
        and the 10th is the offset of the poster's time zone from GMT/UTC.
        Returns None if the header is missing.
        """
        try:
            data = self[name]
        except KeyError:
            return None
        return parsedate_tz(data)

    # Access as a dictionary (only finds last header of each type):

    def __len__(self):
        """Get the number of headers in a message."""
        return len(self.dict)

    def __getitem__(self, name):
        """Get a specific header, as from a dictionary."""
        return self.dict[name.lower()]

    def __setitem__(self, name, value):
        """Set the value of a header.

        Note: This is not a perfect inversion of __getitem__, because any
        changed headers get stuck at the end of the raw-headers list rather
        than where the altered header was.
        """
        del self[name]  # Won't fail if it doesn't exist
        self.dict[name.lower()] = value
        text = name + ": " + value
        for line in text.split("\n"):
            self.headers.append(line + "\n")

    def __delitem__(self, name):
        """Delete all occurrences of a specific header, if it is present."""
        name = name.lower()
        if name not in self.dict:
            return
        del self.dict[name]
        name = name + ':'
        n = len(name)
        lst = []
        hit = 0
        for i, line in enumerate(self.headers):
            if line[:n].lower() == name:
                hit = 1
            elif not line[:1].isspace():
                hit = 0
            if hit:
                lst.append(i)
        # Delete from the end so the remaining indices stay valid.
        for i in reversed(lst):
            del self.headers[i]

    def setdefault(self, name, default=""):
        """Return the value of header `name', adding it if it is missing.

        If the header is absent it is appended to the raw header lines and
        stored in the dictionary with the value `default', which is then
        returned.
        """
        lowername = name.lower()
        if lowername in self.dict:
            return self.dict[lowername]
        else:
            text = name + ": " + default
            for line in text.split("\n"):
                self.headers.append(line + "\n")
            self.dict[lowername] = default
            return default

    def has_key(self, name):
        """Determine whether a message contains the named header."""
        return name.lower() in self.dict

    def __contains__(self, name):
        """Determine whether a message contains the named header."""
        return name.lower() in self.dict

    def __iter__(self):
        """Iterate over the (lower-cased) header names."""
        return iter(self.dict)

    def keys(self):
        """Get all of a message's header field names."""
        return self.dict.keys()

    def values(self):
        """Get all of a message's header field values."""
        return self.dict.values()

    def items(self):
        """Get all of a message's headers.

        Returns a list of name, value tuples.
        """
        return self.dict.items()

    def __str__(self):
        """Return the raw header lines joined into one string."""
        return ''.join(self.headers)
459
460
461	# Utility functions
462	# -----------------
463
464	# XXX Should fix unquote() and quote() to be really conformant.
465	# XXX The inverses of the parse functions may also be useful.
466
467
def unquote(s):
    """Remove quotes from a string.

    A string wrapped in double quotes has them removed and the escapes
    \\\\ and \\" undone.  A string wrapped in angle brackets just loses the
    brackets.  Anything else, including strings shorter than two
    characters, is returned unchanged.
    """
    if len(s) > 1:
        if s.startswith('"') and s.endswith('"'):
            return s[1:-1].replace('\\\\', '\\').replace('\\"', '"')
        if s.startswith('<') and s.endswith('>'):
            return s[1:-1]
    return s
476
477
def quote(s):
    """Escape a string for use inside an RFC 2822 quoted-string.

    Backslashes and double quotes are preceded by a backslash.  Despite
    the function's name, no surrounding quote characters are added; the
    caller supplies those.
    """
    return s.replace('\\', '\\\\').replace('"', '\\"')
481
482
def parseaddr(address):
    """Parse an address into a (realname, mailaddr) tuple.

    Only the first address found in `address' is returned.  If nothing
    can be parsed the result is (None, None).
    """
    a = AddressList(address)
    lst = a.addresslist
    if not lst:
        return (None, None)
    return lst[0]
490
491
class AddrlistClass:
    """Address parser class by Ben Escoto.

    To understand what this class does, it helps to have a copy of
    RFC 2822 in front of you.

    http://www.faqs.org/rfcs/rfc2822.html

    The parser keeps a cursor (self.pos) into the header text (self.field);
    every get*() method starts parsing at the cursor and advances it.

    Note: this class interface is deprecated and may be removed in the future.
    Use rfc822.AddressList instead.
    """

    def __init__(self, field):
        """Initialize a new instance.

        `field' is an unparsed address header field, containing one or more
        addresses.
        """
        self.specials = '()<>@,:;.\"[]'
        self.pos = 0
        self.LWS = ' \t'
        self.CR = '\r\n'
        self.atomends = self.specials + self.LWS + self.CR
        # Note that RFC 2822 now specifies `.' as obs-phrase, meaning that it
        # is obsolete syntax.  RFC 2822 requires that we recognize obsolete
        # syntax, so allow dots in phrases.
        self.phraseends = self.atomends.replace('.', '')
        self.field = field
        self.commentlist = []

    def gotonext(self):
        """Parse up to the start of the next address.

        Skips whitespace and line breaks; any comments met on the way are
        collected in self.commentlist.
        """
        while self.pos < len(self.field):
            if self.field[self.pos] in self.LWS + '\n\r':
                self.pos = self.pos + 1
            elif self.field[self.pos] == '(':
                self.commentlist.append(self.getcomment())
            else:
                break

    def getaddrlist(self):
        """Parse all addresses.

        Returns a list containing all of the addresses.  Parsing stops at
        the first position where getaddress() returns an empty list.
        """
        result = []
        ad = self.getaddress()
        while ad:
            result += ad
            ad = self.getaddress()
        return result

    def getaddress(self):
        """Parse the next address.

        Returns a list of (realname, mailaddr) tuples: one entry for a
        single address, several for a group, none if nothing was found.
        """
        self.commentlist = []
        self.gotonext()

        oldpos = self.pos
        # NOTE: this is an alias of the list, not a copy, so the "restore"
        # in the addr-spec branch below does not discard comments collected
        # by getphraselist().  Kept as is to preserve parsing results.
        oldcl = self.commentlist
        plist = self.getphraselist()

        self.gotonext()
        returnlist = []

        if self.pos >= len(self.field):
            # Bad email address technically, no domain.
            if plist:
                returnlist = [(' '.join(self.commentlist), plist[0])]

        elif self.field[self.pos] in '.@':
            # email address is just an addrspec
            # this isn't very efficient since we start over
            self.pos = oldpos
            self.commentlist = oldcl
            addrspec = self.getaddrspec()
            returnlist = [(' '.join(self.commentlist), addrspec)]

        elif self.field[self.pos] == ':':
            # address is a group
            returnlist = []

            fieldlen = len(self.field)
            self.pos += 1
            while self.pos < len(self.field):
                self.gotonext()
                if self.pos < fieldlen and self.field[self.pos] == ';':
                    self.pos += 1
                    break
                returnlist = returnlist + self.getaddress()

        elif self.field[self.pos] == '<':
            # Address is a phrase then a route addr
            routeaddr = self.getrouteaddr()

            if self.commentlist:
                returnlist = [(' '.join(plist) + ' (' +
                               ' '.join(self.commentlist) + ')', routeaddr)]
            else:
                returnlist = [(' '.join(plist), routeaddr)]

        else:
            if plist:
                returnlist = [(' '.join(self.commentlist), plist[0])]
            elif self.field[self.pos] in self.specials:
                # Step over a stray special so the parser keeps moving.
                self.pos += 1

        self.gotonext()
        if self.pos < len(self.field) and self.field[self.pos] == ',':
            self.pos += 1
        return returnlist

    def getrouteaddr(self):
        """Parse a route address (Return-path value).

        This method just skips all the route stuff and returns the addrspec.
        Returns None if the cursor is not on a `<'.
        """
        if self.field[self.pos] != '<':
            return

        expectroute = 0
        self.pos += 1
        self.gotonext()
        adlist = ""
        while self.pos < len(self.field):
            if expectroute:
                self.getdomain()
                expectroute = 0
            elif self.field[self.pos] == '>':
                self.pos += 1
                break
            elif self.field[self.pos] == '@':
                self.pos += 1
                expectroute = 1
            elif self.field[self.pos] == ':':
                self.pos += 1
            else:
                adlist = self.getaddrspec()
                # Step over the character that ended the addr-spec
                # (normally the closing `>').
                self.pos += 1
                break
            self.gotonext()

        return adlist

    def getaddrspec(self):
        """Parse an RFC 2822 addr-spec."""
        aslist = []

        self.gotonext()
        while self.pos < len(self.field):
            if self.field[self.pos] == '.':
                aslist.append('.')
                self.pos += 1
            elif self.field[self.pos] == '"':
                aslist.append('"%s"' % self.getquote())
            elif self.field[self.pos] in self.atomends:
                break
            else:
                aslist.append(self.getatom())
            self.gotonext()

        if self.pos >= len(self.field) or self.field[self.pos] != '@':
            # Local part only; there is no domain to append.
            return ''.join(aslist)

        aslist.append('@')
        self.pos += 1
        self.gotonext()
        return ''.join(aslist) + self.getdomain()

    def getdomain(self):
        """Get the complete domain name from an address."""
        sdlist = []
        while self.pos < len(self.field):
            if self.field[self.pos] in self.LWS:
                self.pos += 1
            elif self.field[self.pos] == '(':
                self.commentlist.append(self.getcomment())
            elif self.field[self.pos] == '[':
                sdlist.append(self.getdomainliteral())
            elif self.field[self.pos] == '.':
                self.pos += 1
                sdlist.append('.')
            elif self.field[self.pos] in self.atomends:
                break
            else:
                sdlist.append(self.getatom())
        return ''.join(sdlist)

    def getdelimited(self, beginchar, endchars, allowcomments = 1):
        """Parse a header fragment delimited by special characters.

        `beginchar' is the start character for the fragment.  If self is not
        looking at an instance of `beginchar' then getdelimited returns the
        empty string.

        `endchars' is a sequence of allowable end-delimiting characters.
        Parsing stops when one of these is encountered.

        If `allowcomments' is non-zero, embedded RFC 2822 comments are allowed
        within the parsed fragment.

        A backslash makes the following character literal; the backslash
        itself is dropped from the result.
        """
        if self.field[self.pos] != beginchar:
            return ''

        slist = ['']
        quote = 0
        self.pos += 1
        while self.pos < len(self.field):
            if quote == 1:
                slist.append(self.field[self.pos])
                quote = 0
            elif self.field[self.pos] in endchars:
                self.pos += 1
                break
            elif allowcomments and self.field[self.pos] == '(':
                slist.append(self.getcomment())
                continue        # have already advanced pos from getcomment
            elif self.field[self.pos] == '\\':
                quote = 1
            else:
                slist.append(self.field[self.pos])
            self.pos += 1

        return ''.join(slist)

    def getquote(self):
        """Get a quote-delimited fragment from self's field."""
        return self.getdelimited('"', '"\r', 0)

    def getcomment(self):
        """Get a parenthesis-delimited fragment from self's field."""
        return self.getdelimited('(', ')\r', 1)

    def getdomainliteral(self):
        """Parse an RFC 2822 domain-literal."""
        return '[%s]' % self.getdelimited('[', ']\r', 0)

    def getatom(self, atomends=None):
        """Parse an RFC 2822 atom.

        Optional atomends specifies a different set of end token delimiters
        (the default is to use self.atomends).  This is used e.g. in
        getphraselist() since phrase endings must not include the `.' (which
        is legal in phrases)."""
        if atomends is None:
            atomends = self.atomends

        # Scan forward to the first delimiter (or the end of the field) and
        # return everything in between.
        start = self.pos
        while self.pos < len(self.field):
            if self.field[self.pos] in atomends:
                break
            self.pos += 1

        return self.field[start:self.pos]

    def getphraselist(self):
        """Parse a sequence of RFC 2822 phrases.

        A phrase is a sequence of words, which are in turn either RFC 2822
        atoms or quoted-strings.  Phrases are canonicalized by squeezing all
        runs of continuous whitespace into one space.

        Returns the list of words; comments met between them are appended
        to self.commentlist.
        """
        plist = []

        while self.pos < len(self.field):
            if self.field[self.pos] in self.LWS:
                self.pos += 1
            elif self.field[self.pos] == '"':
                plist.append(self.getquote())
            elif self.field[self.pos] == '(':
                self.commentlist.append(self.getcomment())
            elif self.field[self.pos] in self.phraseends:
                break
            else:
                plist.append(self.getatom(self.phraseends))

        return plist
765
class AddressList(AddrlistClass):
    """An AddressList encapsulates a list of parsed RFC 2822 addresses.

    The parsed (realname, mailaddr) tuples are kept in self.addresslist.
    The +, -, += and -= operators treat two address lists as sets.
    """

    def __init__(self, field):
        """Parse `field'; a false value (None, '') gives an empty list."""
        AddrlistClass.__init__(self, field)
        if field:
            self.addresslist = self.getaddrlist()
        else:
            self.addresslist = []

    def __len__(self):
        """Return the number of addresses."""
        return len(self.addresslist)

    def __str__(self):
        """Return the addresses in canonical form, comma-separated."""
        return ", ".join(map(dump_address_pair, self.addresslist))

    def __add__(self, other):
        """Set union: a new list with other's addresses not already here."""
        newaddr = AddressList(None)
        newaddr.addresslist = self.addresslist[:]
        for x in other.addresslist:
            if x not in self.addresslist:
                newaddr.addresslist.append(x)
        return newaddr

    def __iadd__(self, other):
        """Set union, in-place."""
        for x in other.addresslist:
            if x not in self.addresslist:
                self.addresslist.append(x)
        return self

    def __sub__(self, other):
        """Set difference: a new list with addresses absent from other."""
        newaddr = AddressList(None)
        for x in self.addresslist:
            if x not in other.addresslist:
                newaddr.addresslist.append(x)
        return newaddr

    def __isub__(self, other):
        """Set difference, in-place."""
        for x in other.addresslist:
            if x in self.addresslist:
                self.addresslist.remove(x)
        return self

    def __getitem__(self, index):
        """Make indexing, slices, and 'in' work."""
        return self.addresslist[index]
815
def dump_address_pair(pair):
    """Dump a (name, address) pair in a canonicalized form.

    With a non-empty name the result is '"name" <address>'; otherwise it
    is just the address.  Backslashes and double quotes in the name are
    backslash-escaped so that the quoted part stays well-formed.
    """
    if pair[0]:
        name = pair[0].replace('\\', '\\\\').replace('"', '\\"')
        return '"' + name + '" <' + pair[1] + '>'
    else:
        return pair[1]
822
823	# Parse a date field
824
_monthnames = ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul',
               'aug', 'sep', 'oct', 'nov', 'dec',
               'january', 'february', 'march', 'april', 'may', 'june', 'july',
               'august', 'september', 'october', 'november', 'december']
_daynames = ['mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun']

# The timezone table does not include the military time zones defined
# in RFC822, other than Z.  According to RFC1123, the description in
# RFC822 gets the signs wrong, so we can't rely on any such time
# zones.  RFC1123 recommends that numeric timezone indicators be used
# instead of timezone names.

_timezones = {'UT':0, 'UTC':0, 'GMT':0, 'Z':0,
              'AST': -400, 'ADT': -300,  # Atlantic (used in Canada)
              'EST': -500, 'EDT': -400,  # Eastern
              'CST': -600, 'CDT': -500,  # Central
              'MST': -700, 'MDT': -600,  # Mountain
              'PST': -800, 'PDT': -700   # Pacific
              }


def parsedate_tz(data):
    """Convert a date string to a 10-tuple.

    The result is (year, month, day, hour, minute, second, 0, 1, 0,
    tzoffset): the weekday, yearday and DST slots are filled with the
    constants 0, 1 and 0, and tzoffset is the zone's offset from UTC in
    seconds, or None if the zone is missing or not recognized.  Zone
    names are looked up in _timezones; of the military zones only Z is
    known.

    Returns None if `data' is empty or cannot be parsed.
    """
    if not data:
        return None
    data = data.split()
    if not data:
        # Whitespace-only input: there is no first word to look at.
        return None
    if data[0][-1] in (',', '.') or data[0].lower() in _daynames:
        # There's a dayname here. Skip it
        del data[0]
    if len(data) == 3:  # RFC 850 date, deprecated
        stuff = data[0].split('-')
        if len(stuff) == 3:
            data = stuff + data[1:]
    if len(data) == 4:
        # The zone may be glued to the time ("10:00:00+0100").
        s = data[3]
        i = s.find('+')
        if i > 0:
            data[3:] = [s[:i], s[i+1:]]
        else:
            data.append('')  # Dummy tz
    if len(data) < 5:
        return None
    data = data[:5]
    [dd, mm, yy, tm, tz] = data
    mm = mm.lower()
    if mm not in _monthnames:
        # Perhaps day and month are the other way round.
        dd, mm = mm, dd.lower()
        if mm not in _monthnames:
            return None
    mm = _monthnames.index(mm) + 1
    if mm > 12:
        # Full month names occupy the second half of the table.
        mm = mm - 12
    if dd[-1] == ',':
        dd = dd[:-1]
    i = yy.find(':')
    if i > 0:
        # Year and time are swapped.
        yy, tm = tm, yy
    if yy[-1] == ',':
        yy = yy[:-1]
    if not yy[0].isdigit():
        # Year and zone are swapped.
        yy, tz = tz, yy
    if tm[-1] == ',':
        tm = tm[:-1]
    tm = tm.split(':')
    if len(tm) == 2:
        [thh, tmm] = tm
        tss = '0'
    elif len(tm) == 3:
        [thh, tmm, tss] = tm
    else:
        return None
    try:
        yy = int(yy)
        dd = int(dd)
        thh = int(thh)
        tmm = int(tmm)
        tss = int(tss)
    except ValueError:
        return None
    tzoffset = None
    tz = tz.upper()
    if tz in _timezones:
        tzoffset = _timezones[tz]
    else:
        try:
            tzoffset = int(tz)
        except ValueError:
            pass
    # Convert a timezone offset into seconds ; -0500 -> -18000
    if tzoffset:
        if tzoffset < 0:
            tzsign = -1
            tzoffset = -tzoffset
        else:
            tzsign = 1
        # tzoffset is in +/-HHMM form: hundreds are hours, the rest minutes.
        tzoffset = tzsign * ((tzoffset // 100) * 3600 + (tzoffset % 100) * 60)
    return (yy, mm, dd, thh, tmm, tss, 0, 1, 0, tzoffset)
924
925
def parsedate(data):
    """Convert a time string to a time tuple.

    Returns the first nine elements of the parsedate_tz() result (that
    is, without the timezone offset), or None if parsedate_tz() returned
    None.
    """
    t = parsedate_tz(data)
    if t is None:
        return t
    return t[:9]
932
933
def mktime_tz(data):
    """Turn a 10-tuple as returned by parsedate_tz() into a UTC timestamp.

    `data[9]' is the zone offset from UTC in seconds.  If it is None the
    date fields are interpreted as local time; otherwise they are
    interpreted in the given zone.  The result is a float number of
    seconds since the epoch.
    """
    if data[9] is None:
        # No zone info, so localtime is better assumption than GMT
        return time.mktime(data[:8] + (-1,))
    # Treat the fields as UTC and then undo the zone offset.  This does
    # not go through the local zone, so the answer does not depend on the
    # machine's timezone rules.
    import calendar
    return float(calendar.timegm(data[:9]) - data[9])
942
def formatdate(timeval=None):
    """Returns time format preferred for Internet standards.

    Sun, 06 Nov 1994 08:49:37 GMT  ; RFC 822, updated by RFC 1123

    `timeval' is a number of seconds since the epoch; if it is None the
    current time is used.  The result is always expressed in GMT.

    According to RFC 1123, day and month names must always be in
    English.  If not for that, this code could use strftime().  It
    can't because strftime() honors the locale and could generate
    non-English names.
    """
    if timeval is None:
        timeval = time.time()
    timeval = time.gmtime(timeval)
    # timeval[6] is the weekday (Monday == 0), timeval[1] the month (1-12).
    dayname = ("Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun")[timeval[6]]
    monthname = ("Jan", "Feb", "Mar", "Apr", "May", "Jun",
                 "Jul", "Aug", "Sep", "Oct", "Nov", "Dec")[timeval[1] - 1]
    return "%s, %02d %s %04d %02d:%02d:%02d GMT" % (
        dayname, timeval[2], monthname,
        timeval[0], timeval[3], timeval[4], timeval[5])
962
963
964	# When used as script, run a small test program.
965	# The first command line argument must be a filename containing one
966	# message in RFC-822 format.
967
if __name__ == '__main__':
    import os
    import sys

    # Only fall back to the default mailbox (which needs $HOME) when no
    # file name was given on the command line.
    if sys.argv[1:]:
        path = sys.argv[1]
    else:
        path = os.path.join(os.environ['HOME'], 'Mail/inbox/1')

    # sys.stdout.write() is used instead of print so that the script runs
    # unchanged under both Python 2 and Python 3.
    out = sys.stdout.write

    f = open(path, 'r')
    try:
        m = Message(f)
        out('From: %s\n' % (m.getaddr('from'),))
        out('To: %s\n' % (m.getaddrlist('to'),))
        out('Subject: %s\n' % (m.getheader('subject'),))
        out('Date: %s\n' % (m.getheader('date'),))
        date = m.getdate_tz('date')
        if date:
            # Only touch the tuple once we know the Date header parsed.
            tz = date[-1]
            pieces = ['ParsedDate:',
                      time.asctime(time.localtime(mktime_tz(date)))]
            if tz is not None:
                # tz is an offset in seconds; show it as +HHMM[ .SS].
                hhmm, ss = divmod(tz, 60)
                hh, mm = divmod(hhmm, 60)
                pieces.append("%+03d%02d" % (hh, mm))
                if ss:
                    pieces.append(".%02d" % ss)
            out(' '.join(pieces) + '\n')
        else:
            out('ParsedDate: None\n')
        m.rewindbody()
        n = 0
        while f.readline():
            n += 1
        out('Lines: %s\n' % (n,))
        out('-' * 70 + '\n')
        out('len = %s\n' % (len(m),))
        if 'Date' in m:
            out('Date = %s\n' % (m['Date'],))
        if 'X-Nonsense' in m:
            pass
        out('keys = %s\n' % (m.keys(),))
        out('values = %s\n' % (m.values(),))
        out('items = %s\n' % (m.items(),))
    finally:
        f.close()

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: trunk/essentials/dev-lang/python/Lib/rfc822.py@ 3298

Download in other formats: