source: trunk/essentials/dev-lang/python/Lib/codecs.py@ 3226

Last change on this file since 3226 was 3225, checked in by bird, 19 years ago

Python 2.5

File size: 31.9 KB
Line 
1""" codecs -- Python Codec Registry, API and helpers.
2
3
4Written by Marc-Andre Lemburg ([email protected]).
5
6(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
7
8"""#"
9
10import __builtin__, sys
11
12### Registry and builtin stateless codec functions
13
try:
    # The C implementation supplies the codec registry (register, lookup),
    # all builtin stateless codec functions and the error-handling
    # machinery (register_error, lookup_error).
    from _codecs import *
except ImportError, why:
    raise SystemError('Failed to load the builtin codecs: %s' % why)

# Public API of this module.
__all__ = ["register", "lookup", "open", "EncodedFile", "BOM", "BOM_BE",
           "BOM_LE", "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE",
           "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_LE", "BOM_UTF16_BE",
           "BOM_UTF32", "BOM_UTF32_LE", "BOM_UTF32_BE",
           "strict_errors", "ignore_errors", "replace_errors",
           "xmlcharrefreplace_errors",
           "register_error", "lookup_error"]

### Constants

#
# Byte Order Mark (BOM = ZERO WIDTH NO-BREAK SPACE = U+FEFF)
# and its possible byte string values
# for UTF8/UTF16/UTF32 output and little/big endian machines
#

# UTF-8
BOM_UTF8 = '\xef\xbb\xbf'

# UTF-16, little endian
BOM_LE = BOM_UTF16_LE = '\xff\xfe'

# UTF-16, big endian
BOM_BE = BOM_UTF16_BE = '\xfe\xff'

# UTF-32, little endian
BOM_UTF32_LE = '\xff\xfe\x00\x00'

# UTF-32, big endian
BOM_UTF32_BE = '\x00\x00\xfe\xff'

if sys.byteorder == 'little':

    # UTF-16, native endianness
    BOM = BOM_UTF16 = BOM_UTF16_LE

    # UTF-32, native endianness
    BOM_UTF32 = BOM_UTF32_LE

else:

    # UTF-16, native endianness
    BOM = BOM_UTF16 = BOM_UTF16_BE

    # UTF-32, native endianness
    BOM_UTF32 = BOM_UTF32_BE

# Old broken names (don't use in new code); kept only for backwards
# compatibility with code written against earlier Python releases.
BOM32_LE = BOM_UTF16_LE
BOM32_BE = BOM_UTF16_BE
BOM64_LE = BOM_UTF32_LE
BOM64_BE = BOM_UTF32_BE
71
72
73### Codec base classes (defining the API)
74
class CodecInfo(tuple):

    """ Codec details as returned by the codec registry: a 4-tuple
        (encode, decode, streamreader, streamwriter) which additionally
        carries the codec name and the incremental codec factories as
        attributes.
    """

    def __new__(cls, encode, decode, streamreader=None, streamwriter=None,
                incrementalencoder=None, incrementaldecoder=None, name=None):
        info = tuple.__new__(cls, (encode, decode, streamreader, streamwriter))
        # Expose every entry as an attribute as well, so both tuple
        # unpacking and attribute access work for callers.
        info.name = name
        info.encode = encode
        info.decode = decode
        info.incrementalencoder = incrementalencoder
        info.incrementaldecoder = incrementaldecoder
        info.streamwriter = streamwriter
        info.streamreader = streamreader
        return info

    def __repr__(self):
        return "<%s.%s object for encoding %s at 0x%x>" % \
               (self.__class__.__module__, self.__class__.__name__,
                self.name, id(self))
91
class Codec:

    """ Defines the interface for stateless encoders/decoders.

        The .encode()/.decode() methods may use different error
        handling schemes, selected via the errors argument. These
        string values are predefined:

         'strict' - raise a ValueError error (or a subclass)
         'ignore' - ignore the character and continue with the next
         'replace' - replace with a suitable replacement character;
                     Python will use the official U+FFFD REPLACEMENT
                     CHARACTER for the builtin Unicode codecs on
                     decoding and '?' on encoding.
         'xmlcharrefreplace' - Replace with the appropriate XML
                               character reference (only for encoding).
         'backslashreplace'  - Replace with backslashed escape sequences
                               (only for encoding).

        Additional values may be registered via register_error.

    """
    def encode(self, input, errors='strict'):

        """ Encodes the object input and returns a tuple (output
            object, length consumed).

            errors selects the error handling scheme; it defaults to
            'strict' handling.

            Implementations must not store state in the Codec instance;
            use StreamCodec for codecs which have to keep state in order
            to make encoding/decoding efficient.

            Zero length input must be accepted and yield an empty
            object of the output object type.

        """
        raise NotImplementedError

    def decode(self, input, errors='strict'):

        """ Decodes the object input and returns a tuple (output
            object, length consumed).

            input must be an object which provides the bf_getreadbuf
            buffer slot; Python strings, buffer objects and memory
            mapped files are examples of such objects.

            errors selects the error handling scheme; it defaults to
            'strict' handling.

            Implementations must not store state in the Codec instance;
            use StreamCodec for codecs which have to keep state in order
            to make encoding/decoding efficient.

            Zero length input must be accepted and yield an empty
            object of the output object type.

        """
        raise NotImplementedError
155
class IncrementalEncoder(object):
    """
    An IncrementalEncoder encodes an input in multiple steps. The input can
    be fed to the encode() method piece by piece; the encoder keeps track
    of the state of the encoding process between those calls.
    """
    def __init__(self, errors='strict'):
        """
        Create an IncrementalEncoder instance.

        errors selects the error handling scheme; see the module
        docstring for the predefined values.
        """
        self.errors = errors
        # Not used by this base class itself; BufferedIncrementalEncoder
        # re-initialises it for its own buffering.
        self.buffer = ""

    def encode(self, input, final=False):
        """
        Encode input and return the resulting object.
        """
        raise NotImplementedError

    def reset(self):
        """
        Reset the encoder to its initial state.
        """
183
class BufferedIncrementalEncoder(IncrementalEncoder):
    """
    Base class for incremental encoders which must keep some of the
    unencoded input in a buffer between calls to encode().
    """
    def __init__(self, errors='strict'):
        IncrementalEncoder.__init__(self, errors)
        # unencoded input that is kept between calls to encode()
        self.buffer = ""

    def _buffer_encode(self, input, errors, final):
        # Subclasses must override this method: encode input and
        # return an (output object, length consumed) tuple.
        raise NotImplementedError

    def encode(self, input, final=False):
        """
        Encode input (prepending any input buffered from earlier calls)
        and return the resulting object.
        """
        pending = self.buffer + input
        result, consumed = self._buffer_encode(pending, self.errors, final)
        # whatever was not consumed is kept for the next call
        self.buffer = pending[consumed:]
        return result

    def reset(self):
        IncrementalEncoder.reset(self)
        self.buffer = ""
210
class IncrementalDecoder(object):
    """
    An IncrementalDecoder decodes an input in multiple steps. The input can
    be fed to the decode() method piece by piece; the decoder keeps track
    of the state of the decoding process between those calls.
    """
    def __init__(self, errors='strict'):
        """
        Create an IncrementalDecoder instance.

        errors selects the error handling scheme; see the module
        docstring for the predefined values.
        """
        self.errors = errors

    def decode(self, input, final=False):
        """
        Decode input and return the resulting object.
        """
        raise NotImplementedError

    def reset(self):
        """
        Reset the decoder to its initial state.
        """
237
class BufferedIncrementalDecoder(IncrementalDecoder):
    """
    Base class for incremental decoders which must be able to handle
    incomplete byte sequences spanning calls to decode().
    """
    def __init__(self, errors='strict'):
        IncrementalDecoder.__init__(self, errors)
        # undecoded input that is kept between calls to decode()
        self.buffer = ""

    def _buffer_decode(self, input, errors, final):
        # Subclasses must override this method: decode input and
        # return an (output object, length consumed) tuple.
        raise NotImplementedError

    def decode(self, input, final=False):
        """
        Decode input (prepending any bytes buffered from earlier calls)
        and return the resulting object.
        """
        pending = self.buffer + input
        result, consumed = self._buffer_decode(pending, self.errors, final)
        # whatever was not consumed is kept for the next call
        self.buffer = pending[consumed:]
        return result

    def reset(self):
        IncrementalDecoder.reset(self)
        self.buffer = ""
264
265#
266# The StreamWriter and StreamReader class provide generic working
267# interfaces which can be used to implement new encoding submodules
268# very easily. See encodings/utf_8.py for an example on how this is
269# done.
270#
271
class StreamWriter(Codec):

    def __init__(self, stream, errors='strict'):

        """ Creates a StreamWriter instance.

            stream must be a file-like object open for writing
            (binary) data.

            The StreamWriter may use different error handling schemes,
            selected via the errors keyword argument. These parameters
            are predefined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character
             'xmlcharrefreplace' - Replace with the appropriate XML
                                   character reference.
             'backslashreplace'  - Replace with backslashed escape
                                   sequences (only for encoding).

            Additional parameter values may be registered via
            register_error.
        """
        self.stream = stream
        self.errors = errors

    def write(self, object):

        """ Writes the object's contents encoded to self.stream.
        """
        encoded, _ = self.encode(object, self.errors)
        self.stream.write(encoded)

    def writelines(self, list):

        """ Writes the concatenated list of strings to the stream
            using .write().
        """
        self.write(''.join(list))

    def reset(self):

        """ Flushes and resets the codec buffers used for keeping state.

            Calling this method should ensure that the data on the
            output is put into a clean state, that allows appending
            of new fresh data without having to rescan the whole
            stream to recover state.

        """
        pass

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)
331
332###
333
class StreamReader(Codec):

    """ Generic stream reader: decodes data read from an underlying
        byte stream, maintaining byte and character buffers so that
        partial multi-byte sequences and partially consumed lines
        survive between calls.
    """

    def __init__(self, stream, errors='strict'):

        """ Creates a StreamReader instance.

            stream must be a file-like object open for reading
            (binary) data.

            The StreamReader may use different error handling
            schemes by providing the errors keyword argument. These
            parameters are predefined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character;

            The set of allowed parameter values can be extended via
            register_error.
        """
        self.stream = stream
        self.errors = errors
        # Undecoded bytes left over from a previous read (e.g. a partial
        # multi-byte sequence).
        self.bytebuffer = ""
        # For str->str decoding this will stay a str
        # For str->unicode decoding the first read will promote it to unicode
        self.charbuffer = ""
        # Cache of already-split lines (list of strings) built by
        # readline(); mutually exclusive with charbuffer being used.
        self.linebuffer = None

    def decode(self, input, errors='strict'):
        # Must be provided by the concrete codec subclass.
        raise NotImplementedError

    def read(self, size=-1, chars=-1, firstline=False):

        """ Decodes data from the stream self.stream and returns the
            resulting object.

            chars indicates the number of characters to read from the
            stream. read() will never return more than chars
            characters, but it might return less, if there are not enough
            characters available.

            size indicates the approximate maximum number of bytes to
            read from the stream for decoding purposes. The decoder
            can modify this setting as appropriate. The default value
            -1 indicates to read and decode as much as possible. size
            is intended to prevent having to decode huge files in one
            step.

            If firstline is true, and a UnicodeDecodeError happens
            after the first line terminator in the input only the first line
            will be returned, the rest of the input will be kept until the
            next call to read().

            The method should use a greedy read strategy meaning that
            it should read as much data as is allowed within the
            definition of the encoding and the given size, e.g. if
            optional encoding endings or state markers are available
            on the stream, these should be read too.
        """
        # If we have lines cached, first merge them back into characters
        if self.linebuffer:
            self.charbuffer = "".join(self.linebuffer)
            self.linebuffer = None

        # read until we get the required number of characters (if available)
        while True:
            # can the request can be satisfied from the character buffer?
            if chars < 0:
                if size < 0:
                    if self.charbuffer:
                        break
                elif len(self.charbuffer) >= size:
                    break
            else:
                if len(self.charbuffer) >= chars:
                    break
            # we need more data
            if size < 0:
                newdata = self.stream.read()
            else:
                newdata = self.stream.read(size)
            # decode bytes (those remaining from the last call included)
            data = self.bytebuffer + newdata
            try:
                newchars, decodedbytes = self.decode(data, self.errors)
            except UnicodeDecodeError, exc:
                if firstline:
                    # Retry with only the bytes before the error; if that
                    # still yields no more than one line, the error is in
                    # the first line and must propagate.
                    newchars, decodedbytes = self.decode(data[:exc.start], self.errors)
                    lines = newchars.splitlines(True)
                    if len(lines)<=1:
                        raise
                else:
                    raise
            # keep undecoded bytes until the next call
            self.bytebuffer = data[decodedbytes:]
            # put new characters in the character buffer
            self.charbuffer += newchars
            # there was no data available
            if not newdata:
                break
        if chars < 0:
            # Return everything we've got
            result = self.charbuffer
            self.charbuffer = ""
        else:
            # Return the first chars characters
            result = self.charbuffer[:chars]
            self.charbuffer = self.charbuffer[chars:]
        return result

    def readline(self, size=None, keepends=True):

        """ Read one line from the input stream and return the
            decoded data.

            size, if given, is passed as size argument to the
            read() method.

        """
        # If we have lines cached from an earlier read, return
        # them unconditionally
        if self.linebuffer:
            line = self.linebuffer[0]
            del self.linebuffer[0]
            if len(self.linebuffer) == 1:
                # revert to charbuffer mode; we might need more data
                # next time
                self.charbuffer = self.linebuffer[0]
                self.linebuffer = None
            if not keepends:
                line = line.splitlines(False)[0]
            return line

        # 72 is an arbitrary starting chunk size; it is doubled below
        # (up to 8000) until a full line has been accumulated.
        readsize = size or 72
        line = ""
        # If size is given, we call read() only once
        while True:
            data = self.read(readsize, firstline=True)
            if data:
                # If we're at a "\r" read one extra character (which might
                # be a "\n") to get a proper line ending. If the stream is
                # temporarily exhausted we return the wrong line ending.
                if data.endswith("\r"):
                    data += self.read(size=1, chars=1)

            line += data
            lines = line.splitlines(True)
            if lines:
                if len(lines) > 1:
                    # More than one line result; the first line is a full line
                    # to return
                    line = lines[0]
                    del lines[0]
                    if len(lines) > 1:
                        # cache the remaining lines
                        lines[-1] += self.charbuffer
                        self.linebuffer = lines
                        self.charbuffer = None
                    else:
                        # only one remaining line, put it back into charbuffer
                        self.charbuffer = lines[0] + self.charbuffer
                    if not keepends:
                        line = line.splitlines(False)[0]
                    break
                line0withend = lines[0]
                line0withoutend = lines[0].splitlines(False)[0]
                if line0withend != line0withoutend: # We really have a line end
                    # Put the rest back together and keep it until the next call
                    self.charbuffer = "".join(lines[1:]) + self.charbuffer
                    if keepends:
                        line = line0withend
                    else:
                        line = line0withoutend
                    break
            # we didn't get anything or this was our only try
            if not data or size is not None:
                if line and not keepends:
                    line = line.splitlines(False)[0]
                break
            if readsize<8000:
                readsize *= 2
        return line

    def readlines(self, sizehint=None, keepends=True):

        """ Read all lines available on the input stream
            and return them as list of lines.

            Line breaks are implemented using the codec's decoder
            method and are included in the list entries.

            sizehint, if given, is ignored since there is no efficient
            way to finding the true end-of-line.

        """
        data = self.read()
        return data.splitlines(keepends)

    def reset(self):

        """ Resets the codec buffers used for keeping state.

            Note that no stream repositioning should take place.
            This method is primarily intended to be able to recover
            from decoding errors.

        """
        self.bytebuffer = ""
        # NOTE(review): __init__ initialises charbuffer to "" while this
        # resets it to u"" — presumably harmless for str->unicode codecs
        # but inconsistent for str->str ones; confirm before relying on
        # the buffer's type after reset().
        self.charbuffer = u""
        self.linebuffer = None

    def seek(self, offset, whence=0):
        """ Set the input stream's current position.

            Resets the codec buffers used for keeping state.
        """
        self.reset()
        self.stream.seek(offset, whence)

    def next(self):

        """ Return the next decoded line from the input stream."""
        line = self.readline()
        if line:
            return line
        # An empty line means EOF here: readline() always includes the
        # line terminator for non-final lines.
        raise StopIteration

    def __iter__(self):
        return self

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)
570
571###
572
class StreamReaderWriter:

    """ StreamReaderWriter instances allow wrapping streams which
        work in both read and write modes.

        The design is such that one can use the factory functions
        returned by the codec.lookup() function to construct the
        instance.

    """
    # Optional attributes set by the file wrappers below
    encoding = 'unknown'

    def __init__(self, stream, Reader, Writer, errors='strict'):

        """ Creates a StreamReaderWriter instance.

            stream must be a Stream-like object.

            Reader and Writer must be factory functions or classes
            providing the StreamReader resp. StreamWriter interface.

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.

        """
        self.stream = stream
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)
        self.errors = errors

    def read(self, size=-1):

        """ Delegate to the wrapped reader. """
        return self.reader.read(size)

    def readline(self, size=None):

        """ Delegate to the wrapped reader. """
        return self.reader.readline(size)

    def readlines(self, sizehint=None):

        """ Delegate to the wrapped reader. """
        return self.reader.readlines(sizehint)

    def next(self):

        """ Return the next decoded line from the input stream."""
        return self.reader.next()

    def __iter__(self):
        return self

    def write(self, data):

        """ Delegate to the wrapped writer. """
        return self.writer.write(data)

    def writelines(self, list):

        """ Delegate to the wrapped writer. """
        return self.writer.writelines(list)

    def reset(self):

        """ Reset both the reader's and the writer's codec state. """
        self.reader.reset()
        self.writer.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)
643
644###
645
class StreamRecoder:

    """ StreamRecoder instances translate data between a frontend and
        a backend encoding.

        They use the complete set of APIs returned by the
        codecs.lookup() function to implement their task.

        Data written to the stream is first decoded into an
        intermediate format (which is dependent on the given codec
        combination) and then written to the stream using an instance
        of the provided Writer class.

        In the other direction, data is read from the stream using a
        Reader instance and then returned to the caller as encoded
        data.

    """
    # Optional attributes set by the file wrappers below
    data_encoding = 'unknown'
    file_encoding = 'unknown'

    def __init__(self, stream, encode, decode, Reader, Writer,
                 errors='strict'):

        """ Creates a StreamRecoder instance which implements a two-way
            conversion: encode and decode work on the frontend (the
            input to .read() and output of .write()) while
            Reader and Writer work on the backend (reading and
            writing to the stream).

            You can use these objects to do transparent direct
            recodings from e.g. latin-1 to utf-8 and back.

            stream must be a file-like object.

            encode and decode must adhere to the Codec interface;
            Reader and Writer must be factory functions or classes
            providing the StreamReader resp. StreamWriter interface.

            encode and decode are needed for the frontend translation,
            Reader and Writer for the backend translation. Unicode is
            used as intermediate encoding.

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.

        """
        self.stream = stream
        self.encode = encode
        self.decode = decode
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)
        self.errors = errors

    def read(self, size=-1):

        """ Read backend data and return it recoded to the frontend
            encoding.
        """
        raw = self.reader.read(size)
        recoded, _ = self.encode(raw, self.errors)
        return recoded

    def readline(self, size=None):

        """ Read one line and return it recoded to the frontend
            encoding.
        """
        if size is None:
            raw = self.reader.readline()
        else:
            raw = self.reader.readline(size)
        recoded, _ = self.encode(raw, self.errors)
        return recoded

    def readlines(self, sizehint=None):

        """ Read all remaining data and return it as a list of recoded
            lines. sizehint is ignored.
        """
        recoded, _ = self.encode(self.reader.read(), self.errors)
        return recoded.splitlines(1)

    def next(self):

        """ Return the next decoded line from the input stream."""
        recoded, _ = self.encode(self.reader.next(), self.errors)
        return recoded

    def __iter__(self):
        return self

    def write(self, data):

        """ Decode data from the frontend encoding and write the
            result to the backend stream.
        """
        decoded, _ = self.decode(data, self.errors)
        return self.writer.write(decoded)

    def writelines(self, list):

        """ Decode the concatenated list of strings and write the
            result to the backend stream.
        """
        joined = ''.join(list)
        decoded, _ = self.decode(joined, self.errors)
        return self.writer.write(decoded)

    def reset(self):

        """ Reset both the reader's and the writer's codec state. """
        self.reader.reset()
        self.writer.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)
753
754### Shortcuts
755
def open(filename, mode='rb', encoding=None, errors='strict', buffering=1):

    """ Open an encoded file using the given mode and return
        a wrapped version providing transparent encoding/decoding.

        Note: The wrapped version will only accept the object format
        defined by the codecs, i.e. Unicode objects for most builtin
        codecs. Output is also codec dependent and will usually be
        Unicode as well.

        Files are always opened in binary mode, even if no binary mode
        was specified. This is done to avoid data loss due to encodings
        using 8-bit values. The default file mode is 'rb' meaning to
        open the file in binary read mode.

        encoding specifies the encoding which is to be used for the
        file.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        buffering has the same meaning as for the builtin open() API.
        It defaults to line buffered.

        The returned wrapped file object provides an extra attribute
        .encoding which allows querying the used encoding. This
        attribute is only available if an encoding was specified as
        parameter.

    """
    if encoding is not None and \
       'b' not in mode:
        # Force opening of the file in binary mode
        mode = mode + 'b'
    file = __builtin__.open(filename, mode, buffering)
    if encoding is None:
        return file
    try:
        info = lookup(encoding)
        srw = StreamReaderWriter(file, info.streamreader, info.streamwriter,
                                 errors)
        # Add attributes to simplify introspection
        srw.encoding = encoding
        return srw
    except:
        # Don't leak the already-opened file object if the codec lookup
        # (e.g. an unknown encoding name) or wrapper construction fails.
        file.close()
        raise
799
def EncodedFile(file, data_encoding, file_encoding=None, errors='strict'):

    """ Return a wrapped version of file which provides transparent
        encoding translation.

        Strings written to the wrapped file are interpreted according
        to the given data_encoding and then written to the original
        file as string using file_encoding. The intermediate encoding
        will usually be Unicode but depends on the specified codecs.

        Strings are read from the file using file_encoding and then
        passed back to the caller as string using data_encoding.

        If file_encoding is not given, it defaults to data_encoding.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        The returned wrapped file object provides two extra attributes
        .data_encoding and .file_encoding which reflect the given
        parameters of the same name. The attributes can be used for
        introspection by Python programs.

    """
    if file_encoding is None:
        file_encoding = data_encoding
    # The frontend (caller-facing) side must use the data_encoding codec
    # while the backend (stream-facing) side must use the file_encoding
    # codec. The previous code looked up only data_encoding and used it
    # for both sides, silently ignoring file_encoding for the stream
    # reader/writer.
    data_info = lookup(data_encoding)
    file_info = lookup(file_encoding)
    sr = StreamRecoder(file, data_info.encode, data_info.decode,
                       file_info.streamreader, file_info.streamwriter, errors)
    # Add attributes to simplify introspection
    sr.data_encoding = data_encoding
    sr.file_encoding = file_encoding
    return sr
834
835### Helpers for codec lookup
836
def getencoder(encoding):

    """ Look up the codec for the given encoding and return
        its encoder function.

        Raises a LookupError in case the encoding cannot be found.

    """
    return lookup(encoding).encode
846
def getdecoder(encoding):

    """ Look up the codec for the given encoding and return
        its decoder function.

        Raises a LookupError in case the encoding cannot be found.

    """
    return lookup(encoding).decode
856
def getincrementalencoder(encoding):

    """ Look up the codec for the given encoding and return
        its IncrementalEncoder class or factory function.

        Raises a LookupError in case the encoding cannot be found
        or the codec doesn't provide an incremental encoder.

    """
    factory = lookup(encoding).incrementalencoder
    if factory is None:
        raise LookupError(encoding)
    return factory
870
def getincrementaldecoder(encoding):

    """ Look up the codec for the given encoding and return
        its IncrementalDecoder class or factory function.

        Raises a LookupError in case the encoding cannot be found
        or the codec doesn't provide an incremental decoder.

    """
    factory = lookup(encoding).incrementaldecoder
    if factory is None:
        raise LookupError(encoding)
    return factory
884
def getreader(encoding):

    """ Look up the codec for the given encoding and return
        its StreamReader class or factory function.

        Raises a LookupError in case the encoding cannot be found.

    """
    return lookup(encoding).streamreader
894
def getwriter(encoding):

    """ Look up the codec for the given encoding and return
        its StreamWriter class or factory function.

        Raises a LookupError in case the encoding cannot be found.

    """
    return lookup(encoding).streamwriter
904
def iterencode(iterator, encoding, errors='strict', **kwargs):
    """
    Encoding iterator.

    Encodes the strings yielded by iterator using an IncrementalEncoder
    and yields the non-empty encoded chunks.

    errors and kwargs are passed through to the IncrementalEncoder
    constructor.
    """
    encoder = getincrementalencoder(encoding)(errors, **kwargs)
    for chunk in iterator:
        encoded = encoder.encode(chunk)
        if encoded:
            yield encoded
    # Flush whatever the encoder still has buffered.
    tail = encoder.encode("", True)
    if tail:
        yield tail
922
def iterdecode(iterator, encoding, errors='strict', **kwargs):
    """
    Decoding iterator.

    Decodes the strings yielded by iterator using an IncrementalDecoder
    and yields the non-empty decoded chunks.

    errors and kwargs are passed through to the IncrementalDecoder
    constructor.
    """
    decoder = getincrementaldecoder(encoding)(errors, **kwargs)
    for chunk in iterator:
        decoded = decoder.decode(chunk)
        if decoded:
            yield decoded
    # Flush whatever the decoder still has buffered.
    tail = decoder.decode("", True)
    if tail:
        yield tail
940
941### Helpers for charmap-based codecs
942
def make_identity_dict(rng):

    """ make_identity_dict(rng) -> dict

        Return a dictionary where elements of the rng sequence are
        mapped to themselves.

    """
    # A single pass over rng works for arbitrary iterables, including
    # one-shot iterators.
    return dict((element, element) for element in rng)
955
def make_encoding_map(decoding_map):

    r""" Creates an encoding map from a decoding map.

        If a target mapping in the decoding map occurs multiple
        times, then that target is mapped to None (undefined mapping),
        causing an exception when encountered by the charmap codec
        during translation.

        One example where this happens is cp875.py which decodes
        multiple character to \u001a.

    """
    encoding_map = {}
    for key, value in decoding_map.items():
        if value not in encoding_map:
            encoding_map[value] = key
        else:
            # Ambiguous target: mark the mapping as undefined.
            encoding_map[value] = None
    return encoding_map
976
### error handlers

# Bind the standard error handler callbacks (registered by _codecs) to
# module-level names so they can be referenced directly, e.g. as
# codecs.strict_errors.
try:
    strict_errors = lookup_error("strict")
    ignore_errors = lookup_error("ignore")
    replace_errors = lookup_error("replace")
    xmlcharrefreplace_errors = lookup_error("xmlcharrefreplace")
    backslashreplace_errors = lookup_error("backslashreplace")
except LookupError:
    # In --disable-unicode builds, these error handlers are missing
    strict_errors = None
    ignore_errors = None
    replace_errors = None
    xmlcharrefreplace_errors = None
    backslashreplace_errors = None
992
# Tell modulefinder that using codecs probably needs the encodings
# package
# The import is never executed at runtime (_false is always 0); it only
# exists so that static import scanners pick up the dependency.
_false = 0
if _false:
    import encodings
998
### Tests

if __name__ == '__main__':

    # Simple smoke test: recode the standard streams between Latin-1
    # and UTF-8 using EncodedFile.

    # Make stdout translate Latin-1 output into UTF-8 output
    sys.stdout = EncodedFile(sys.stdout, 'latin-1', 'utf-8')

    # Have stdin translate Latin-1 input into UTF-8 input
    sys.stdin = EncodedFile(sys.stdin, 'utf-8', 'latin-1')
Note: See TracBrowser for help on using the repository browser.