source: trunk/essentials/dev-lang/python/Lib/test/test_unicode.py@ 3951

Last change on this file since 3951 was 3225, checked in by bird, 19 years ago

Python 2.5

File size: 34.1 KB
Line 
1# -*- coding: iso-8859-1 -*-
2""" Test script for the Unicode implementation.
3
4Written by Marc-Andre Lemburg (mal@lemburg.com).
5
6(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
7
8"""#"
9import unittest, sys, string, codecs, new
10from test import test_support, string_tests
11
12# Error handling (bad decoder return)
def search_function(encoding):
    """Codec search hook serving two deliberately broken test codecs.

    "test.unicode1" maps to an encoder/decoder pair that returns the bare
    integer 42 (not a tuple); "test.unicode2" maps to a pair that returns
    the tuple (42, 42) (no unicode/string in it).  Every other encoding
    name yields None.
    """
    def returning(value):
        # Build a codec function that ignores its arguments and
        # always hands back `value`.
        def codec(input, errors="strict"):
            return value
        return codec

    if encoding == "test.unicode1":
        # not a tuple
        return (returning(42), returning(42), None, None)
    if encoding == "test.unicode2":
        # no unicode
        return (returning((42, 42)), returning((42, 42)), None, None)
    return None


codecs.register(search_function)
29
30class UnicodeTest(
31 string_tests.CommonTest,
32 string_tests.MixinStrUnicodeUserStringTest,
33 string_tests.MixinStrUnicodeTest,
34 ):
35 type2test = unicode
36
37 def checkequalnofix(self, result, object, methodname, *args):
38 method = getattr(object, methodname)
39 realresult = method(*args)
40 self.assertEqual(realresult, result)
41 self.assert_(type(realresult) is type(result))
42
43 # if the original is returned make sure that
44 # this doesn't happen with subclasses
45 if realresult is object:
46 class usub(unicode):
47 def __repr__(self):
48 return 'usub(%r)' % unicode.__repr__(self)
49 object = usub(object)
50 method = getattr(object, methodname)
51 realresult = method(*args)
52 self.assertEqual(realresult, result)
53 self.assert_(object is not realresult)
54
55 def test_literals(self):
56 self.assertEqual(u'\xff', u'\u00ff')
57 self.assertEqual(u'\uffff', u'\U0000ffff')
58 self.assertRaises(UnicodeError, eval, 'u\'\\Ufffffffe\'')
59 self.assertRaises(UnicodeError, eval, 'u\'\\Uffffffff\'')
60 self.assertRaises(UnicodeError, eval, 'u\'\\U%08x\'' % 0x110000)
61
62 def test_repr(self):
63 if not sys.platform.startswith('java'):
64 # Test basic sanity of repr()
65 self.assertEqual(repr(u'abc'), "u'abc'")
66 self.assertEqual(repr(u'ab\\c'), "u'ab\\\\c'")
67 self.assertEqual(repr(u'ab\\'), "u'ab\\\\'")
68 self.assertEqual(repr(u'\\c'), "u'\\\\c'")
69 self.assertEqual(repr(u'\\'), "u'\\\\'")
70 self.assertEqual(repr(u'\n'), "u'\\n'")
71 self.assertEqual(repr(u'\r'), "u'\\r'")
72 self.assertEqual(repr(u'\t'), "u'\\t'")
73 self.assertEqual(repr(u'\b'), "u'\\x08'")
74 self.assertEqual(repr(u"'\""), """u'\\'"'""")
75 self.assertEqual(repr(u"'\""), """u'\\'"'""")
76 self.assertEqual(repr(u"'"), '''u"'"''')
77 self.assertEqual(repr(u'"'), """u'"'""")
78 latin1repr = (
79 "u'\\x00\\x01\\x02\\x03\\x04\\x05\\x06\\x07\\x08\\t\\n\\x0b\\x0c\\r"
80 "\\x0e\\x0f\\x10\\x11\\x12\\x13\\x14\\x15\\x16\\x17\\x18\\x19\\x1a"
81 "\\x1b\\x1c\\x1d\\x1e\\x1f !\"#$%&\\'()*+,-./0123456789:;<=>?@ABCDEFGHI"
82 "JKLMNOPQRSTUVWXYZ[\\\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\\x7f"
83 "\\x80\\x81\\x82\\x83\\x84\\x85\\x86\\x87\\x88\\x89\\x8a\\x8b\\x8c\\x8d"
84 "\\x8e\\x8f\\x90\\x91\\x92\\x93\\x94\\x95\\x96\\x97\\x98\\x99\\x9a\\x9b"
85 "\\x9c\\x9d\\x9e\\x9f\\xa0\\xa1\\xa2\\xa3\\xa4\\xa5\\xa6\\xa7\\xa8\\xa9"
86 "\\xaa\\xab\\xac\\xad\\xae\\xaf\\xb0\\xb1\\xb2\\xb3\\xb4\\xb5\\xb6\\xb7"
87 "\\xb8\\xb9\\xba\\xbb\\xbc\\xbd\\xbe\\xbf\\xc0\\xc1\\xc2\\xc3\\xc4\\xc5"
88 "\\xc6\\xc7\\xc8\\xc9\\xca\\xcb\\xcc\\xcd\\xce\\xcf\\xd0\\xd1\\xd2\\xd3"
89 "\\xd4\\xd5\\xd6\\xd7\\xd8\\xd9\\xda\\xdb\\xdc\\xdd\\xde\\xdf\\xe0\\xe1"
90 "\\xe2\\xe3\\xe4\\xe5\\xe6\\xe7\\xe8\\xe9\\xea\\xeb\\xec\\xed\\xee\\xef"
91 "\\xf0\\xf1\\xf2\\xf3\\xf4\\xf5\\xf6\\xf7\\xf8\\xf9\\xfa\\xfb\\xfc\\xfd"
92 "\\xfe\\xff'")
93 testrepr = repr(u''.join(map(unichr, xrange(256))))
94 self.assertEqual(testrepr, latin1repr)
95 # Test repr works on wide unicode escapes without overflow.
96 self.assertEqual(repr(u"\U00010000" * 39 + u"\uffff" * 4096),
97 repr(u"\U00010000" * 39 + u"\uffff" * 4096))
98
99
100 def test_count(self):
101 string_tests.CommonTest.test_count(self)
102 # check mixed argument types
103 self.checkequalnofix(3, 'aaa', 'count', u'a')
104 self.checkequalnofix(0, 'aaa', 'count', u'b')
105 self.checkequalnofix(3, u'aaa', 'count', 'a')
106 self.checkequalnofix(0, u'aaa', 'count', 'b')
107 self.checkequalnofix(0, u'aaa', 'count', 'b')
108 self.checkequalnofix(1, u'aaa', 'count', 'a', -1)
109 self.checkequalnofix(3, u'aaa', 'count', 'a', -10)
110 self.checkequalnofix(2, u'aaa', 'count', 'a', 0, -1)
111 self.checkequalnofix(0, u'aaa', 'count', 'a', 0, -10)
112
113 def test_find(self):
114 self.checkequalnofix(0, u'abcdefghiabc', 'find', u'abc')
115 self.checkequalnofix(9, u'abcdefghiabc', 'find', u'abc', 1)
116 self.checkequalnofix(-1, u'abcdefghiabc', 'find', u'def', 4)
117
118 self.assertRaises(TypeError, u'hello'.find)
119 self.assertRaises(TypeError, u'hello'.find, 42)
120
121 def test_rfind(self):
122 string_tests.CommonTest.test_rfind(self)
123 # check mixed argument types
124 self.checkequalnofix(9, 'abcdefghiabc', 'rfind', u'abc')
125 self.checkequalnofix(12, 'abcdefghiabc', 'rfind', u'')
126 self.checkequalnofix(12, u'abcdefghiabc', 'rfind', '')
127
128 def test_index(self):
129 string_tests.CommonTest.test_index(self)
130 # check mixed argument types
131 for (t1, t2) in ((str, unicode), (unicode, str)):
132 self.checkequalnofix(0, t1('abcdefghiabc'), 'index', t2(''))
133 self.checkequalnofix(3, t1('abcdefghiabc'), 'index', t2('def'))
134 self.checkequalnofix(0, t1('abcdefghiabc'), 'index', t2('abc'))
135 self.checkequalnofix(9, t1('abcdefghiabc'), 'index', t2('abc'), 1)
136 self.assertRaises(ValueError, t1('abcdefghiabc').index, t2('hib'))
137 self.assertRaises(ValueError, t1('abcdefghiab').index, t2('abc'), 1)
138 self.assertRaises(ValueError, t1('abcdefghi').index, t2('ghi'), 8)
139 self.assertRaises(ValueError, t1('abcdefghi').index, t2('ghi'), -1)
140
141 def test_rindex(self):
142 string_tests.CommonTest.test_rindex(self)
143 # check mixed argument types
144 for (t1, t2) in ((str, unicode), (unicode, str)):
145 self.checkequalnofix(12, t1('abcdefghiabc'), 'rindex', t2(''))
146 self.checkequalnofix(3, t1('abcdefghiabc'), 'rindex', t2('def'))
147 self.checkequalnofix(9, t1('abcdefghiabc'), 'rindex', t2('abc'))
148 self.checkequalnofix(0, t1('abcdefghiabc'), 'rindex', t2('abc'), 0, -1)
149
150 self.assertRaises(ValueError, t1('abcdefghiabc').rindex, t2('hib'))
151 self.assertRaises(ValueError, t1('defghiabc').rindex, t2('def'), 1)
152 self.assertRaises(ValueError, t1('defghiabc').rindex, t2('abc'), 0, -1)
153 self.assertRaises(ValueError, t1('abcdefghi').rindex, t2('ghi'), 0, 8)
154 self.assertRaises(ValueError, t1('abcdefghi').rindex, t2('ghi'), 0, -1)
155
156 def test_translate(self):
157 self.checkequalnofix(u'bbbc', u'abababc', 'translate', {ord('a'):None})
158 self.checkequalnofix(u'iiic', u'abababc', 'translate', {ord('a'):None, ord('b'):ord('i')})
159 self.checkequalnofix(u'iiix', u'abababc', 'translate', {ord('a'):None, ord('b'):ord('i'), ord('c'):u'x'})
160 self.checkequalnofix(u'<i><i><i>c', u'abababc', 'translate', {ord('a'):None, ord('b'):u'<i>'})
161 self.checkequalnofix(u'c', u'abababc', 'translate', {ord('a'):None, ord('b'):u''})
162 self.checkequalnofix(u'xyyx', u'xzx', 'translate', {ord('z'):u'yy'})
163
164 self.assertRaises(TypeError, u'hello'.translate)
165 self.assertRaises(TypeError, u'abababc'.translate, {ord('a'):''})
166
167 def test_split(self):
168 string_tests.CommonTest.test_split(self)
169
170 # Mixed arguments
171 self.checkequalnofix([u'a', u'b', u'c', u'd'], u'a//b//c//d', 'split', '//')
172 self.checkequalnofix([u'a', u'b', u'c', u'd'], 'a//b//c//d', 'split', u'//')
173 self.checkequalnofix([u'endcase ', u''], u'endcase test', 'split', 'test')
174
175 def test_join(self):
176 string_tests.MixinStrUnicodeUserStringTest.test_join(self)
177
178 # mixed arguments
179 self.checkequalnofix(u'a b c d', u' ', 'join', ['a', 'b', u'c', u'd'])
180 self.checkequalnofix(u'abcd', u'', 'join', (u'a', u'b', u'c', u'd'))
181 self.checkequalnofix(u'w x y z', u' ', 'join', string_tests.Sequence('wxyz'))
182 self.checkequalnofix(u'a b c d', ' ', 'join', [u'a', u'b', u'c', u'd'])
183 self.checkequalnofix(u'a b c d', ' ', 'join', ['a', 'b', u'c', u'd'])
184 self.checkequalnofix(u'abcd', '', 'join', (u'a', u'b', u'c', u'd'))
185 self.checkequalnofix(u'w x y z', ' ', 'join', string_tests.Sequence(u'wxyz'))
186
187 def test_strip(self):
188 string_tests.CommonTest.test_strip(self)
189 self.assertRaises(UnicodeError, u"hello".strip, "\xff")
190
191 def test_replace(self):
192 string_tests.CommonTest.test_replace(self)
193
194 # method call forwarded from str implementation because of unicode argument
195 self.checkequalnofix(u'one@two!three!', 'one!two!three!', 'replace', u'!', u'@', 1)
196 self.assertRaises(TypeError, 'replace'.replace, u"r", 42)
197
198 def test_comparison(self):
199 # Comparisons:
200 self.assertEqual(u'abc', 'abc')
201 self.assertEqual('abc', u'abc')
202 self.assertEqual(u'abc', u'abc')
203 self.assert_(u'abcd' > 'abc')
204 self.assert_('abcd' > u'abc')
205 self.assert_(u'abcd' > u'abc')
206 self.assert_(u'abc' < 'abcd')
207 self.assert_('abc' < u'abcd')
208 self.assert_(u'abc' < u'abcd')
209
210 if 0:
211 # Move these tests to a Unicode collation module test...
212 # Testing UTF-16 code point order comparisons...
213
214 # No surrogates, no fixup required.
215 self.assert_(u'\u0061' < u'\u20ac')
216 # Non surrogate below surrogate value, no fixup required
217 self.assert_(u'\u0061' < u'\ud800\udc02')
218
219 # Non surrogate above surrogate value, fixup required
220 def test_lecmp(s, s2):
221 self.assert_(s < s2)
222
223 def test_fixup(s):
224 s2 = u'\ud800\udc01'
225 test_lecmp(s, s2)
226 s2 = u'\ud900\udc01'
227 test_lecmp(s, s2)
228 s2 = u'\uda00\udc01'
229 test_lecmp(s, s2)
230 s2 = u'\udb00\udc01'
231 test_lecmp(s, s2)
232 s2 = u'\ud800\udd01'
233 test_lecmp(s, s2)
234 s2 = u'\ud900\udd01'
235 test_lecmp(s, s2)
236 s2 = u'\uda00\udd01'
237 test_lecmp(s, s2)
238 s2 = u'\udb00\udd01'
239 test_lecmp(s, s2)
240 s2 = u'\ud800\ude01'
241 test_lecmp(s, s2)
242 s2 = u'\ud900\ude01'
243 test_lecmp(s, s2)
244 s2 = u'\uda00\ude01'
245 test_lecmp(s, s2)
246 s2 = u'\udb00\ude01'
247 test_lecmp(s, s2)
248 s2 = u'\ud800\udfff'
249 test_lecmp(s, s2)
250 s2 = u'\ud900\udfff'
251 test_lecmp(s, s2)
252 s2 = u'\uda00\udfff'
253 test_lecmp(s, s2)
254 s2 = u'\udb00\udfff'
255 test_lecmp(s, s2)
256
257 test_fixup(u'\ue000')
258 test_fixup(u'\uff61')
259
260 # Surrogates on both sides, no fixup required
261 self.assert_(u'\ud800\udc02' < u'\ud84d\udc56')
262
263 def test_islower(self):
264 string_tests.MixinStrUnicodeUserStringTest.test_islower(self)
265 self.checkequalnofix(False, u'\u1FFc', 'islower')
266
267 def test_isupper(self):
268 string_tests.MixinStrUnicodeUserStringTest.test_isupper(self)
269 if not sys.platform.startswith('java'):
270 self.checkequalnofix(False, u'\u1FFc', 'isupper')
271
272 def test_istitle(self):
273 string_tests.MixinStrUnicodeUserStringTest.test_title(self)
274 self.checkequalnofix(True, u'\u1FFc', 'istitle')
275 self.checkequalnofix(True, u'Greek \u1FFcitlecases ...', 'istitle')
276
277 def test_isspace(self):
278 string_tests.MixinStrUnicodeUserStringTest.test_isspace(self)
279 self.checkequalnofix(True, u'\u2000', 'isspace')
280 self.checkequalnofix(True, u'\u200a', 'isspace')
281 self.checkequalnofix(False, u'\u2014', 'isspace')
282
283 def test_isalpha(self):
284 string_tests.MixinStrUnicodeUserStringTest.test_isalpha(self)
285 self.checkequalnofix(True, u'\u1FFc', 'isalpha')
286
287 def test_isdecimal(self):
288 self.checkequalnofix(False, u'', 'isdecimal')
289 self.checkequalnofix(False, u'a', 'isdecimal')
290 self.checkequalnofix(True, u'0', 'isdecimal')
291 self.checkequalnofix(False, u'\u2460', 'isdecimal') # CIRCLED DIGIT ONE
292 self.checkequalnofix(False, u'\xbc', 'isdecimal') # VULGAR FRACTION ONE QUARTER
293 self.checkequalnofix(True, u'\u0660', 'isdecimal') # ARABIC-INDIC DIGIT ZERO
294 self.checkequalnofix(True, u'0123456789', 'isdecimal')
295 self.checkequalnofix(False, u'0123456789a', 'isdecimal')
296
297 self.checkraises(TypeError, 'abc', 'isdecimal', 42)
298
299 def test_isdigit(self):
300 string_tests.MixinStrUnicodeUserStringTest.test_isdigit(self)
301 self.checkequalnofix(True, u'\u2460', 'isdigit')
302 self.checkequalnofix(False, u'\xbc', 'isdigit')
303 self.checkequalnofix(True, u'\u0660', 'isdigit')
304
305 def test_isnumeric(self):
306 self.checkequalnofix(False, u'', 'isnumeric')
307 self.checkequalnofix(False, u'a', 'isnumeric')
308 self.checkequalnofix(True, u'0', 'isnumeric')
309 self.checkequalnofix(True, u'\u2460', 'isnumeric')
310 self.checkequalnofix(True, u'\xbc', 'isnumeric')
311 self.checkequalnofix(True, u'\u0660', 'isnumeric')
312 self.checkequalnofix(True, u'0123456789', 'isnumeric')
313 self.checkequalnofix(False, u'0123456789a', 'isnumeric')
314
315 self.assertRaises(TypeError, u"abc".isnumeric, 42)
316
317 def test_contains(self):
318 # Testing Unicode contains method
319 self.assert_('a' in u'abdb')
320 self.assert_('a' in u'bdab')
321 self.assert_('a' in u'bdaba')
322 self.assert_('a' in u'bdba')
323 self.assert_('a' in u'bdba')
324 self.assert_(u'a' in u'bdba')
325 self.assert_(u'a' not in u'bdb')
326 self.assert_(u'a' not in 'bdb')
327 self.assert_(u'a' in 'bdba')
328 self.assert_(u'a' in ('a',1,None))
329 self.assert_(u'a' in (1,None,'a'))
330 self.assert_(u'a' in (1,None,u'a'))
331 self.assert_('a' in ('a',1,None))
332 self.assert_('a' in (1,None,'a'))
333 self.assert_('a' in (1,None,u'a'))
334 self.assert_('a' not in ('x',1,u'y'))
335 self.assert_('a' not in ('x',1,None))
336 self.assert_(u'abcd' not in u'abcxxxx')
337 self.assert_(u'ab' in u'abcd')
338 self.assert_('ab' in u'abc')
339 self.assert_(u'ab' in 'abc')
340 self.assert_(u'ab' in (1,None,u'ab'))
341 self.assert_(u'' in u'abc')
342 self.assert_('' in u'abc')
343
344 # If the following fails either
345 # the contains operator does not propagate UnicodeErrors or
346 # someone has changed the default encoding
347 self.assertRaises(UnicodeError, 'g\xe2teau'.__contains__, u'\xe2')
348
349 self.assert_(u'' in '')
350 self.assert_('' in u'')
351 self.assert_(u'' in u'')
352 self.assert_(u'' in 'abc')
353 self.assert_('' in u'abc')
354 self.assert_(u'' in u'abc')
355 self.assert_(u'\0' not in 'abc')
356 self.assert_('\0' not in u'abc')
357 self.assert_(u'\0' not in u'abc')
358 self.assert_(u'\0' in '\0abc')
359 self.assert_('\0' in u'\0abc')
360 self.assert_(u'\0' in u'\0abc')
361 self.assert_(u'\0' in 'abc\0')
362 self.assert_('\0' in u'abc\0')
363 self.assert_(u'\0' in u'abc\0')
364 self.assert_(u'a' in '\0abc')
365 self.assert_('a' in u'\0abc')
366 self.assert_(u'a' in u'\0abc')
367 self.assert_(u'asdf' in 'asdf')
368 self.assert_('asdf' in u'asdf')
369 self.assert_(u'asdf' in u'asdf')
370 self.assert_(u'asdf' not in 'asd')
371 self.assert_('asdf' not in u'asd')
372 self.assert_(u'asdf' not in u'asd')
373 self.assert_(u'asdf' not in '')
374 self.assert_('asdf' not in u'')
375 self.assert_(u'asdf' not in u'')
376
377 self.assertRaises(TypeError, u"abc".__contains__)
378
379 def test_formatting(self):
380 string_tests.MixinStrUnicodeUserStringTest.test_formatting(self)
381 # Testing Unicode formatting strings...
382 self.assertEqual(u"%s, %s" % (u"abc", "abc"), u'abc, abc')
383 self.assertEqual(u"%s, %s, %i, %f, %5.2f" % (u"abc", "abc", 1, 2, 3), u'abc, abc, 1, 2.000000, 3.00')
384 self.assertEqual(u"%s, %s, %i, %f, %5.2f" % (u"abc", "abc", 1, -2, 3), u'abc, abc, 1, -2.000000, 3.00')
385 self.assertEqual(u"%s, %s, %i, %f, %5.2f" % (u"abc", "abc", -1, -2, 3.5), u'abc, abc, -1, -2.000000, 3.50')
386 self.assertEqual(u"%s, %s, %i, %f, %5.2f" % (u"abc", "abc", -1, -2, 3.57), u'abc, abc, -1, -2.000000, 3.57')
387 self.assertEqual(u"%s, %s, %i, %f, %5.2f" % (u"abc", "abc", -1, -2, 1003.57), u'abc, abc, -1, -2.000000, 1003.57')
388 if not sys.platform.startswith('java'):
389 self.assertEqual(u"%r, %r" % (u"abc", "abc"), u"u'abc', 'abc'")
390 self.assertEqual(u"%(x)s, %(y)s" % {'x':u"abc", 'y':"def"}, u'abc, def')
391 self.assertEqual(u"%(x)s, %(\xfc)s" % {'x':u"abc", u'\xfc':"def"}, u'abc, def')
392
393 self.assertEqual(u'%c' % 0x1234, u'\u1234')
394 self.assertRaises(OverflowError, u"%c".__mod__, (sys.maxunicode+1,))
395
396 # formatting jobs delegated from the string implementation:
397 self.assertEqual('...%(foo)s...' % {'foo':u"abc"}, u'...abc...')
398 self.assertEqual('...%(foo)s...' % {'foo':"abc"}, '...abc...')
399 self.assertEqual('...%(foo)s...' % {u'foo':"abc"}, '...abc...')
400 self.assertEqual('...%(foo)s...' % {u'foo':u"abc"}, u'...abc...')
401 self.assertEqual('...%(foo)s...' % {u'foo':u"abc",'def':123}, u'...abc...')
402 self.assertEqual('...%(foo)s...' % {u'foo':u"abc",u'def':123}, u'...abc...')
403 self.assertEqual('...%s...%s...%s...%s...' % (1,2,3,u"abc"), u'...1...2...3...abc...')
404 self.assertEqual('...%%...%%s...%s...%s...%s...%s...' % (1,2,3,u"abc"), u'...%...%s...1...2...3...abc...')
405 self.assertEqual('...%s...' % u"abc", u'...abc...')
406 self.assertEqual('%*s' % (5,u'abc',), u' abc')
407 self.assertEqual('%*s' % (-5,u'abc',), u'abc ')
408 self.assertEqual('%*.*s' % (5,2,u'abc',), u' ab')
409 self.assertEqual('%*.*s' % (5,3,u'abc',), u' abc')
410 self.assertEqual('%i %*.*s' % (10, 5,3,u'abc',), u'10 abc')
411 self.assertEqual('%i%s %*.*s' % (10, 3, 5, 3, u'abc',), u'103 abc')
412 self.assertEqual('%c' % u'a', u'a')
413 class Wrapper:
414 def __str__(self):
415 return u'\u1234'
416 self.assertEqual('%s' % Wrapper(), u'\u1234')
417
418 @test_support.run_with_locale('LC_ALL', 'de_DE', 'fr_FR')
419 def test_format_float(self):
420 # should not format with a comma, but always with C locale
421 self.assertEqual(u'1.0', u'%.1f' % 1.0)
422
423 def test_constructor(self):
424 # unicode(obj) tests (this maps to PyObject_Unicode() at C level)
425
426 self.assertEqual(
427 unicode(u'unicode remains unicode'),
428 u'unicode remains unicode'
429 )
430
431 class UnicodeSubclass(unicode):
432 pass
433
434 self.assertEqual(
435 unicode(UnicodeSubclass('unicode subclass becomes unicode')),
436 u'unicode subclass becomes unicode'
437 )
438
439 self.assertEqual(
440 unicode('strings are converted to unicode'),
441 u'strings are converted to unicode'
442 )
443
444 class UnicodeCompat:
445 def __init__(self, x):
446 self.x = x
447 def __unicode__(self):
448 return self.x
449
450 self.assertEqual(
451 unicode(UnicodeCompat('__unicode__ compatible objects are recognized')),
452 u'__unicode__ compatible objects are recognized')
453
454 class StringCompat:
455 def __init__(self, x):
456 self.x = x
457 def __str__(self):
458 return self.x
459
460 self.assertEqual(
461 unicode(StringCompat('__str__ compatible objects are recognized')),
462 u'__str__ compatible objects are recognized'
463 )
464
465 # unicode(obj) is compatible to str():
466
467 o = StringCompat('unicode(obj) is compatible to str()')
468 self.assertEqual(unicode(o), u'unicode(obj) is compatible to str()')
469 self.assertEqual(str(o), 'unicode(obj) is compatible to str()')
470
471 # %-formatting and .__unicode__()
472 self.assertEqual(u'%s' %
473 UnicodeCompat(u"u'%s' % obj uses obj.__unicode__()"),
474 u"u'%s' % obj uses obj.__unicode__()")
475 self.assertEqual(u'%s' %
476 UnicodeCompat(u"u'%s' % obj falls back to obj.__str__()"),
477 u"u'%s' % obj falls back to obj.__str__()")
478
479 for obj in (123, 123.45, 123L):
480 self.assertEqual(unicode(obj), unicode(str(obj)))
481
482 # unicode(obj, encoding, error) tests (this maps to
483 # PyUnicode_FromEncodedObject() at C level)
484
485 if not sys.platform.startswith('java'):
486 self.assertRaises(
487 TypeError,
488 unicode,
489 u'decoding unicode is not supported',
490 'utf-8',
491 'strict'
492 )
493
494 self.assertEqual(
495 unicode('strings are decoded to unicode', 'utf-8', 'strict'),
496 u'strings are decoded to unicode'
497 )
498
499 if not sys.platform.startswith('java'):
500 self.assertEqual(
501 unicode(
502 buffer('character buffers are decoded to unicode'),
503 'utf-8',
504 'strict'
505 ),
506 u'character buffers are decoded to unicode'
507 )
508
509 self.assertRaises(TypeError, unicode, 42, 42, 42)
510
511 def test_codecs_utf7(self):
512 utfTests = [
513 (u'A\u2262\u0391.', 'A+ImIDkQ.'), # RFC2152 example
514 (u'Hi Mom -\u263a-!', 'Hi Mom -+Jjo--!'), # RFC2152 example
515 (u'\u65E5\u672C\u8A9E', '+ZeVnLIqe-'), # RFC2152 example
516 (u'Item 3 is \u00a31.', 'Item 3 is +AKM-1.'), # RFC2152 example
517 (u'+', '+-'),
518 (u'+-', '+--'),
519 (u'+?', '+-?'),
520 (u'\?', '+AFw?'),
521 (u'+?', '+-?'),
522 (ur'\\?', '+AFwAXA?'),
523 (ur'\\\?', '+AFwAXABc?'),
524 (ur'++--', '+-+---')
525 ]
526
527 for (x, y) in utfTests:
528 self.assertEqual(x.encode('utf-7'), y)
529
530 # surrogates not supported
531 self.assertRaises(UnicodeError, unicode, '+3ADYAA-', 'utf-7')
532
533 self.assertEqual(unicode('+3ADYAA-', 'utf-7', 'replace'), u'\ufffd')
534
535 def test_codecs_utf8(self):
536 self.assertEqual(u''.encode('utf-8'), '')
537 self.assertEqual(u'\u20ac'.encode('utf-8'), '\xe2\x82\xac')
538 self.assertEqual(u'\ud800\udc02'.encode('utf-8'), '\xf0\x90\x80\x82')
539 self.assertEqual(u'\ud84d\udc56'.encode('utf-8'), '\xf0\xa3\x91\x96')
540 self.assertEqual(u'\ud800'.encode('utf-8'), '\xed\xa0\x80')
541 self.assertEqual(u'\udc00'.encode('utf-8'), '\xed\xb0\x80')
542 self.assertEqual(
543 (u'\ud800\udc02'*1000).encode('utf-8'),
544 '\xf0\x90\x80\x82'*1000
545 )
546 self.assertEqual(
547 u'\u6b63\u78ba\u306b\u8a00\u3046\u3068\u7ffb\u8a33\u306f'
548 u'\u3055\u308c\u3066\u3044\u307e\u305b\u3093\u3002\u4e00'
549 u'\u90e8\u306f\u30c9\u30a4\u30c4\u8a9e\u3067\u3059\u304c'
550 u'\u3001\u3042\u3068\u306f\u3067\u305f\u3089\u3081\u3067'
551 u'\u3059\u3002\u5b9f\u969b\u306b\u306f\u300cWenn ist das'
552 u' Nunstuck git und'.encode('utf-8'),
553 '\xe6\xad\xa3\xe7\xa2\xba\xe3\x81\xab\xe8\xa8\x80\xe3\x81'
554 '\x86\xe3\x81\xa8\xe7\xbf\xbb\xe8\xa8\xb3\xe3\x81\xaf\xe3'
555 '\x81\x95\xe3\x82\x8c\xe3\x81\xa6\xe3\x81\x84\xe3\x81\xbe'
556 '\xe3\x81\x9b\xe3\x82\x93\xe3\x80\x82\xe4\xb8\x80\xe9\x83'
557 '\xa8\xe3\x81\xaf\xe3\x83\x89\xe3\x82\xa4\xe3\x83\x84\xe8'
558 '\xaa\x9e\xe3\x81\xa7\xe3\x81\x99\xe3\x81\x8c\xe3\x80\x81'
559 '\xe3\x81\x82\xe3\x81\xa8\xe3\x81\xaf\xe3\x81\xa7\xe3\x81'
560 '\x9f\xe3\x82\x89\xe3\x82\x81\xe3\x81\xa7\xe3\x81\x99\xe3'
561 '\x80\x82\xe5\xae\x9f\xe9\x9a\x9b\xe3\x81\xab\xe3\x81\xaf'
562 '\xe3\x80\x8cWenn ist das Nunstuck git und'
563 )
564
565 # UTF-8 specific decoding tests
566 self.assertEqual(unicode('\xf0\xa3\x91\x96', 'utf-8'), u'\U00023456' )
567 self.assertEqual(unicode('\xf0\x90\x80\x82', 'utf-8'), u'\U00010002' )
568 self.assertEqual(unicode('\xe2\x82\xac', 'utf-8'), u'\u20ac' )
569
570 # Other possible utf-8 test cases:
571 # * strict decoding testing for all of the
572 # UTF8_ERROR cases in PyUnicode_DecodeUTF8
573
574 def test_codecs_idna(self):
575 # Test whether trailing dot is preserved
576 self.assertEqual(u"www.python.org.".encode("idna"), "www.python.org.")
577
578 def test_codecs_errors(self):
579 # Error handling (encoding)
580 self.assertRaises(UnicodeError, u'Andr\202 x'.encode, 'ascii')
581 self.assertRaises(UnicodeError, u'Andr\202 x'.encode, 'ascii','strict')
582 self.assertEqual(u'Andr\202 x'.encode('ascii','ignore'), "Andr x")
583 self.assertEqual(u'Andr\202 x'.encode('ascii','replace'), "Andr? x")
584
585 # Error handling (decoding)
586 self.assertRaises(UnicodeError, unicode, 'Andr\202 x', 'ascii')
587 self.assertRaises(UnicodeError, unicode, 'Andr\202 x', 'ascii','strict')
588 self.assertEqual(unicode('Andr\202 x','ascii','ignore'), u"Andr x")
589 self.assertEqual(unicode('Andr\202 x','ascii','replace'), u'Andr\uFFFD x')
590
591 # Error handling (unknown character names)
592 self.assertEqual("\\N{foo}xx".decode("unicode-escape", "ignore"), u"xx")
593
594 # Error handling (truncated escape sequence)
595 self.assertRaises(UnicodeError, "\\".decode, "unicode-escape")
596
597 self.assertRaises(TypeError, "hello".decode, "test.unicode1")
598 self.assertRaises(TypeError, unicode, "hello", "test.unicode2")
599 self.assertRaises(TypeError, u"hello".encode, "test.unicode1")
600 self.assertRaises(TypeError, u"hello".encode, "test.unicode2")
601 # executes PyUnicode_Encode()
602 import imp
603 self.assertRaises(
604 ImportError,
605 imp.find_module,
606 "non-existing module",
607 [u"non-existing dir"]
608 )
609
610 # Error handling (wrong arguments)
611 self.assertRaises(TypeError, u"hello".encode, 42, 42, 42)
612
613 # Error handling (PyUnicode_EncodeDecimal())
614 self.assertRaises(UnicodeError, int, u"\u0200")
615
616 def test_codecs(self):
617 # Encoding
618 self.assertEqual(u'hello'.encode('ascii'), 'hello')
619 self.assertEqual(u'hello'.encode('utf-7'), 'hello')
620 self.assertEqual(u'hello'.encode('utf-8'), 'hello')
621 self.assertEqual(u'hello'.encode('utf8'), 'hello')
622 self.assertEqual(u'hello'.encode('utf-16-le'), 'h\000e\000l\000l\000o\000')
623 self.assertEqual(u'hello'.encode('utf-16-be'), '\000h\000e\000l\000l\000o')
624 self.assertEqual(u'hello'.encode('latin-1'), 'hello')
625
626 # Roundtrip safety for BMP (just the first 1024 chars)
627 for c in xrange(1024):
628 u = unichr(c)
629 for encoding in ('utf-7', 'utf-8', 'utf-16', 'utf-16-le',
630 'utf-16-be', 'raw_unicode_escape',
631 'unicode_escape', 'unicode_internal'):
632 self.assertEqual(unicode(u.encode(encoding),encoding), u)
633
634 # Roundtrip safety for BMP (just the first 256 chars)
635 for c in xrange(256):
636 u = unichr(c)
637 for encoding in ('latin-1',):
638 self.assertEqual(unicode(u.encode(encoding),encoding), u)
639
640 # Roundtrip safety for BMP (just the first 128 chars)
641 for c in xrange(128):
642 u = unichr(c)
643 for encoding in ('ascii',):
644 self.assertEqual(unicode(u.encode(encoding),encoding), u)
645
646 # Roundtrip safety for non-BMP (just a few chars)
647 u = u'\U00010001\U00020002\U00030003\U00040004\U00050005'
648 for encoding in ('utf-8', 'utf-16', 'utf-16-le', 'utf-16-be',
649 #'raw_unicode_escape',
650 'unicode_escape', 'unicode_internal'):
651 self.assertEqual(unicode(u.encode(encoding),encoding), u)
652
653 # UTF-8 must be roundtrip safe for all UCS-2 code points
654 # This excludes surrogates: in the full range, there would be
655 # a surrogate pair (\udbff\udc00), which gets converted back
656 # to a non-BMP character (\U0010fc00)
657 u = u''.join(map(unichr, range(0,0xd800)+range(0xe000,0x10000)))
658 for encoding in ('utf-8',):
659 self.assertEqual(unicode(u.encode(encoding),encoding), u)
660
661 def test_codecs_charmap(self):
662 # 0-127
663 s = ''.join(map(chr, xrange(128)))
664 for encoding in (
665 'cp037', 'cp1026',
666 'cp437', 'cp500', 'cp737', 'cp775', 'cp850',
667 'cp852', 'cp855', 'cp860', 'cp861', 'cp862',
668 'cp863', 'cp865', 'cp866',
669 'iso8859_10', 'iso8859_13', 'iso8859_14', 'iso8859_15',
670 'iso8859_2', 'iso8859_3', 'iso8859_4', 'iso8859_5', 'iso8859_6',
671 'iso8859_7', 'iso8859_9', 'koi8_r', 'latin_1',
672 'mac_cyrillic', 'mac_latin2',
673
674 'cp1250', 'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255',
675 'cp1256', 'cp1257', 'cp1258',
676 'cp856', 'cp857', 'cp864', 'cp869', 'cp874',
677
678 'mac_greek', 'mac_iceland','mac_roman', 'mac_turkish',
679 'cp1006', 'iso8859_8',
680
681 ### These have undefined mappings:
682 #'cp424',
683
684 ### These fail the round-trip:
685 #'cp875'
686
687 ):
688 self.assertEqual(unicode(s, encoding).encode(encoding), s)
689
690 # 128-255
691 s = ''.join(map(chr, xrange(128, 256)))
692 for encoding in (
693 'cp037', 'cp1026',
694 'cp437', 'cp500', 'cp737', 'cp775', 'cp850',
695 'cp852', 'cp855', 'cp860', 'cp861', 'cp862',
696 'cp863', 'cp865', 'cp866',
697 'iso8859_10', 'iso8859_13', 'iso8859_14', 'iso8859_15',
698 'iso8859_2', 'iso8859_4', 'iso8859_5',
699 'iso8859_9', 'koi8_r', 'latin_1',
700 'mac_cyrillic', 'mac_latin2',
701
702 ### These have undefined mappings:
703 #'cp1250', 'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255',
704 #'cp1256', 'cp1257', 'cp1258',
705 #'cp424', 'cp856', 'cp857', 'cp864', 'cp869', 'cp874',
706 #'iso8859_3', 'iso8859_6', 'iso8859_7',
707 #'mac_greek', 'mac_iceland','mac_roman', 'mac_turkish',
708
709 ### These fail the round-trip:
710 #'cp1006', 'cp875', 'iso8859_8',
711
712 ):
713 self.assertEqual(unicode(s, encoding).encode(encoding), s)
714
715 def test_concatenation(self):
716 self.assertEqual((u"abc" u"def"), u"abcdef")
717 self.assertEqual(("abc" u"def"), u"abcdef")
718 self.assertEqual((u"abc" "def"), u"abcdef")
719 self.assertEqual((u"abc" u"def" "ghi"), u"abcdefghi")
720 self.assertEqual(("abc" "def" u"ghi"), u"abcdefghi")
721
722 def test_printing(self):
723 class BitBucket:
724 def write(self, text):
725 pass
726
727 out = BitBucket()
728 print >>out, u'abc'
729 print >>out, u'abc', u'def'
730 print >>out, u'abc', 'def'
731 print >>out, 'abc', u'def'
732 print >>out, u'abc\n'
733 print >>out, u'abc\n',
734 print >>out, u'abc\n',
735 print >>out, u'def\n'
736 print >>out, u'def\n'
737
738 def test_ucs4(self):
739 if sys.maxunicode == 0xFFFF:
740 return
741 x = u'\U00100000'
742 y = x.encode("raw-unicode-escape").decode("raw-unicode-escape")
743 self.assertEqual(x, y)
744
745 def test_conversion(self):
746 # Make sure __unicode__() works properly
747 class Foo0:
748 def __str__(self):
749 return "foo"
750
751 class Foo1:
752 def __unicode__(self):
753 return u"foo"
754
755 class Foo2(object):
756 def __unicode__(self):
757 return u"foo"
758
759 class Foo3(object):
760 def __unicode__(self):
761 return "foo"
762
763 class Foo4(str):
764 def __unicode__(self):
765 return "foo"
766
767 class Foo5(unicode):
768 def __unicode__(self):
769 return "foo"
770
771 class Foo6(str):
772 def __str__(self):
773 return "foos"
774
775 def __unicode__(self):
776 return u"foou"
777
778 class Foo7(unicode):
779 def __str__(self):
780 return "foos"
781 def __unicode__(self):
782 return u"foou"
783
784 class Foo8(unicode):
785 def __new__(cls, content=""):
786 return unicode.__new__(cls, 2*content)
787 def __unicode__(self):
788 return self
789
790 class Foo9(unicode):
791 def __str__(self):
792 return "string"
793 def __unicode__(self):
794 return "not unicode"
795
796 self.assertEqual(unicode(Foo0()), u"foo")
797 self.assertEqual(unicode(Foo1()), u"foo")
798 self.assertEqual(unicode(Foo2()), u"foo")
799 self.assertEqual(unicode(Foo3()), u"foo")
800 self.assertEqual(unicode(Foo4("bar")), u"foo")
801 self.assertEqual(unicode(Foo5("bar")), u"foo")
802 self.assertEqual(unicode(Foo6("bar")), u"foou")
803 self.assertEqual(unicode(Foo7("bar")), u"foou")
804 self.assertEqual(unicode(Foo8("foo")), u"foofoo")
805 self.assertEqual(str(Foo9("foo")), "string")
806 self.assertEqual(unicode(Foo9("foo")), u"not unicode")
807
808 def test_unicode_repr(self):
809 class s1:
810 def __repr__(self):
811 return '\\n'
812
813 class s2:
814 def __repr__(self):
815 return u'\\n'
816
817 self.assertEqual(repr(s1()), '\\n')
818 self.assertEqual(repr(s2()), '\\n')
819
820
821
822
823
def test_main():
    """Run the UnicodeTest case class through test_support.run_unittest."""
    test_classes = (UnicodeTest,)
    test_support.run_unittest(*test_classes)


if __name__ == "__main__":
    test_main()
Note: See TracBrowser for help on using the repository browser.