source: trunk/essentials/dev-lang/python/Modules/cjkcodecs/_codecs_cn.c

Last change on this file was 3225, checked in by bird, 19 years ago

Python 2.5

File size: 6.9 KB
Line 
1/*
2 * _codecs_cn.c: Codecs collection for Mainland Chinese encodings
3 *
4 * Written by Hye-Shik Chang <[email protected]>
5 */
6
7#include "cjkcodecs.h"
8#include "mappings_cn.h"
9
/*
 * AIX predefines `hz` as 100.  Remove that definition so it cannot
 * clash with the identifiers generated for the hz codec in this file.
 */
#if defined(_AIX)
#undef hz
#endif
17
/* GBK and GB2312 map a few code points differently; they are listed below:
 *
 *          gb2312                          gbk
 * A1A4     U+30FB KATAKANA MIDDLE DOT      U+00B7 MIDDLE DOT
 * A1AA     U+2015 HORIZONTAL BAR           U+2014 EM DASH
 * A844     undefined                       U+2015 HORIZONTAL BAR
 */
25
/*
 * GBK_DECODE(dc1, dc2, assi)
 *
 * Decode the two-byte GBK sequence (dc1, dc2) and store the resulting
 * code point in `assi`.  The three code points on which GBK differs from
 * GB2312 are handled explicitly; everything else is looked up first in
 * the gb2312 table (with the high bit of both bytes cleared) and then in
 * the gbkext table (with the raw bytes).
 *
 * The expansion is an unterminated if / else-if chain: the caller MUST
 * append its own `else ...;` clause, which runs when no mapping is found.
 * The arguments are evaluated more than once, so they must be free of
 * side effects.  Each use is parenthesised so that an argument containing
 * an operator of lower precedence than `^` or `==` is still evaluated
 * as a unit.
 */
#define GBK_DECODE(dc1, dc2, assi) \
    if ((dc1) == 0xa1 && (dc2) == 0xaa) (assi) = 0x2014; \
    else if ((dc1) == 0xa8 && (dc2) == 0x44) (assi) = 0x2015; \
    else if ((dc1) == 0xa1 && (dc2) == 0xa4) (assi) = 0x00b7; \
    else TRYMAP_DEC(gb2312, assi, (dc1) ^ 0x80, (dc2) ^ 0x80); \
    else TRYMAP_DEC(gbkext, assi, (dc1), (dc2));
32
/*
 * GBK_ENCODE(uni, dbcs)
 *
 * Encode the code point `uni` to a GBK double-byte code stored in `dbcs`.
 * U+2014, U+2015 and U+00B7 get their GBK-specific codes directly; every
 * other code point except U+30FB is looked up in the gbcommon table.
 * U+30FB is skipped because it only has a GB2312 mapping (A1A4), and in
 * GBK that code belongs to U+00B7.
 *
 * The expansion is an unterminated if / else-if chain ending in an empty
 * statement: the caller MUST append its own `else ...;` clause, which runs
 * when the code point cannot be encoded.  `uni` is evaluated several
 * times, so it must be free of side effects.
 */
#define GBK_ENCODE(uni, dbcs) \
    if ((uni) == 0x2014) { (dbcs) = 0xA1AA; } \
    else if ((uni) == 0x2015) { (dbcs) = 0xA844; } \
    else if ((uni) == 0x00B7) { (dbcs) = 0xA1A4; } \
    else if (!((uni) == 0x30FB) && TRYMAP_ENC_COND(gbcommon, dbcs, uni));
38
39/*
40 * GB2312 codec
41 */
42
43ENCODER(gb2312)
44{
45 while (inleft > 0) {
46 Py_UNICODE c = IN1;
47 DBCHAR code;
48
49 if (c < 0x80) {
50 WRITE1((unsigned char)c)
51 NEXT(1, 1)
52 continue;
53 }
54 UCS4INVALID(c)
55
56 REQUIRE_OUTBUF(2)
57 TRYMAP_ENC(gbcommon, code, c);
58 else return 1;
59
60 if (code & 0x8000) /* MSB set: GBK */
61 return 1;
62
63 OUT1((code >> 8) | 0x80)
64 OUT2((code & 0xFF) | 0x80)
65 NEXT(1, 2)
66 }
67
68 return 0;
69}
70
71DECODER(gb2312)
72{
73 while (inleft > 0) {
74 unsigned char c = **inbuf;
75
76 REQUIRE_OUTBUF(1)
77
78 if (c < 0x80) {
79 OUT1(c)
80 NEXT(1, 1)
81 continue;
82 }
83
84 REQUIRE_INBUF(2)
85 TRYMAP_DEC(gb2312, **outbuf, c ^ 0x80, IN2 ^ 0x80) {
86 NEXT(2, 1)
87 }
88 else return 2;
89 }
90
91 return 0;
92}
93
94
95/*
96 * GBK codec
97 */
98
99ENCODER(gbk)
100{
101 while (inleft > 0) {
102 Py_UNICODE c = IN1;
103 DBCHAR code;
104
105 if (c < 0x80) {
106 WRITE1((unsigned char)c)
107 NEXT(1, 1)
108 continue;
109 }
110 UCS4INVALID(c)
111
112 REQUIRE_OUTBUF(2)
113
114 GBK_ENCODE(c, code)
115 else return 1;
116
117 OUT1((code >> 8) | 0x80)
118 if (code & 0x8000)
119 OUT2((code & 0xFF)) /* MSB set: GBK */
120 else
121 OUT2((code & 0xFF) | 0x80) /* MSB unset: GB2312 */
122 NEXT(1, 2)
123 }
124
125 return 0;
126}
127
128DECODER(gbk)
129{
130 while (inleft > 0) {
131 unsigned char c = IN1;
132
133 REQUIRE_OUTBUF(1)
134
135 if (c < 0x80) {
136 OUT1(c)
137 NEXT(1, 1)
138 continue;
139 }
140
141 REQUIRE_INBUF(2)
142
143 GBK_DECODE(c, IN2, **outbuf)
144 else return 2;
145
146 NEXT(2, 1)
147 }
148
149 return 0;
150}
151
152
153/*
154 * GB18030 codec
155 */
156
157ENCODER(gb18030)
158{
159 while (inleft > 0) {
160 ucs4_t c = IN1;
161 DBCHAR code;
162
163 if (c < 0x80) {
164 WRITE1(c)
165 NEXT(1, 1)
166 continue;
167 }
168
169 DECODE_SURROGATE(c)
170 if (c > 0x10FFFF)
171#if Py_UNICODE_SIZE == 2
172 return 2; /* surrogates pair */
173#else
174 return 1;
175#endif
176 else if (c >= 0x10000) {
177 ucs4_t tc = c - 0x10000;
178
179 REQUIRE_OUTBUF(4)
180
181 OUT4((unsigned char)(tc % 10) + 0x30)
182 tc /= 10;
183 OUT3((unsigned char)(tc % 126) + 0x81)
184 tc /= 126;
185 OUT2((unsigned char)(tc % 10) + 0x30)
186 tc /= 10;
187 OUT1((unsigned char)(tc + 0x90))
188
189#if Py_UNICODE_SIZE == 2
190 NEXT(2, 4) /* surrogates pair */
191#else
192 NEXT(1, 4)
193#endif
194 continue;
195 }
196
197 REQUIRE_OUTBUF(2)
198
199 GBK_ENCODE(c, code)
200 else {
201 const struct _gb18030_to_unibmp_ranges *utrrange;
202
203 REQUIRE_OUTBUF(4)
204
205 for (utrrange = gb18030_to_unibmp_ranges;
206 utrrange->first != 0;
207 utrrange++)
208 if (utrrange->first <= c &&
209 c <= utrrange->last) {
210 Py_UNICODE tc;
211
212 tc = c - utrrange->first +
213 utrrange->base;
214
215 OUT4((unsigned char)(tc % 10) + 0x30)
216 tc /= 10;
217 OUT3((unsigned char)(tc % 126) + 0x81)
218 tc /= 126;
219 OUT2((unsigned char)(tc % 10) + 0x30)
220 tc /= 10;
221 OUT1((unsigned char)tc + 0x81)
222
223 NEXT(1, 4)
224 break;
225 }
226
227 if (utrrange->first == 0)
228 return 1;
229 continue;
230 }
231
232 OUT1((code >> 8) | 0x80)
233 if (code & 0x8000)
234 OUT2((code & 0xFF)) /* MSB set: GBK or GB18030ext */
235 else
236 OUT2((code & 0xFF) | 0x80) /* MSB unset: GB2312 */
237
238 NEXT(1, 2)
239 }
240
241 return 0;
242}
243
244DECODER(gb18030)
245{
246 while (inleft > 0) {
247 unsigned char c = IN1, c2;
248
249 REQUIRE_OUTBUF(1)
250
251 if (c < 0x80) {
252 OUT1(c)
253 NEXT(1, 1)
254 continue;
255 }
256
257 REQUIRE_INBUF(2)
258
259 c2 = IN2;
260 if (c2 >= 0x30 && c2 <= 0x39) { /* 4 bytes seq */
261 const struct _gb18030_to_unibmp_ranges *utr;
262 unsigned char c3, c4;
263 ucs4_t lseq;
264
265 REQUIRE_INBUF(4)
266 c3 = IN3;
267 c4 = IN4;
268 if (c < 0x81 || c3 < 0x81 || c4 < 0x30 || c4 > 0x39)
269 return 4;
270 c -= 0x81; c2 -= 0x30;
271 c3 -= 0x81; c4 -= 0x30;
272
273 if (c < 4) { /* U+0080 - U+FFFF */
274 lseq = ((ucs4_t)c * 10 + c2) * 1260 +
275 (ucs4_t)c3 * 10 + c4;
276 if (lseq < 39420) {
277 for (utr = gb18030_to_unibmp_ranges;
278 lseq >= (utr + 1)->base;
279 utr++) ;
280 OUT1(utr->first - utr->base + lseq)
281 NEXT(4, 1)
282 continue;
283 }
284 }
285 else if (c >= 15) { /* U+10000 - U+10FFFF */
286 lseq = 0x10000 + (((ucs4_t)c-15) * 10 + c2)
287 * 1260 + (ucs4_t)c3 * 10 + c4;
288 if (lseq <= 0x10FFFF) {
289 WRITEUCS4(lseq);
290 NEXT_IN(4)
291 continue;
292 }
293 }
294 return 4;
295 }
296
297 GBK_DECODE(c, c2, **outbuf)
298 else TRYMAP_DEC(gb18030ext, **outbuf, c, c2);
299 else return 2;
300
301 NEXT(2, 1)
302 }
303
304 return 0;
305}
306
307
308/*
309 * HZ codec
310 */
311
312ENCODER_INIT(hz)
313{
314 state->i = 0;
315 return 0;
316}
317
318ENCODER_RESET(hz)
319{
320 if (state->i != 0) {
321 WRITE2('~', '}')
322 state->i = 0;
323 NEXT_OUT(2)
324 }
325 return 0;
326}
327
328ENCODER(hz)
329{
330 while (inleft > 0) {
331 Py_UNICODE c = IN1;
332 DBCHAR code;
333
334 if (c < 0x80) {
335 if (state->i == 0) {
336 WRITE1((unsigned char)c)
337 NEXT(1, 1)
338 }
339 else {
340 WRITE3('~', '}', (unsigned char)c)
341 NEXT(1, 3)
342 state->i = 0;
343 }
344 continue;
345 }
346
347 UCS4INVALID(c)
348
349 TRYMAP_ENC(gbcommon, code, c);
350 else return 1;
351
352 if (code & 0x8000) /* MSB set: GBK */
353 return 1;
354
355 if (state->i == 0) {
356 WRITE4('~', '{', code >> 8, code & 0xff)
357 NEXT(1, 4)
358 state->i = 1;
359 }
360 else {
361 WRITE2(code >> 8, code & 0xff)
362 NEXT(1, 2)
363 }
364 }
365
366 return 0;
367}
368
369DECODER_INIT(hz)
370{
371 state->i = 0;
372 return 0;
373}
374
375DECODER_RESET(hz)
376{
377 state->i = 0;
378 return 0;
379}
380
381DECODER(hz)
382{
383 while (inleft > 0) {
384 unsigned char c = IN1;
385
386 if (c == '~') {
387 unsigned char c2 = IN2;
388
389 REQUIRE_INBUF(2)
390 if (c2 == '~') {
391 WRITE1('~')
392 NEXT(2, 1)
393 continue;
394 }
395 else if (c2 == '{' && state->i == 0)
396 state->i = 1; /* set GB */
397 else if (c2 == '}' && state->i == 1)
398 state->i = 0; /* set ASCII */
399 else if (c2 == '\n')
400 ; /* line-continuation */
401 else
402 return 2;
403 NEXT(2, 0);
404 continue;
405 }
406
407 if (c & 0x80)
408 return 1;
409
410 if (state->i == 0) { /* ASCII mode */
411 WRITE1(c)
412 NEXT(1, 1)
413 }
414 else { /* GB mode */
415 REQUIRE_INBUF(2)
416 REQUIRE_OUTBUF(1)
417 TRYMAP_DEC(gb2312, **outbuf, c, IN2) {
418 NEXT(2, 1)
419 }
420 else
421 return 2;
422 }
423 }
424
425 return 0;
426}
427
428
429BEGIN_MAPPINGS_LIST
430 MAPPING_DECONLY(gb2312)
431 MAPPING_DECONLY(gbkext)
432 MAPPING_ENCONLY(gbcommon)
433 MAPPING_ENCDEC(gb18030ext)
434END_MAPPINGS_LIST
435
436BEGIN_CODECS_LIST
437 CODEC_STATELESS(gb2312)
438 CODEC_STATELESS(gbk)
439 CODEC_STATELESS(gb18030)
440 CODEC_STATEFUL(hz)
441END_CODECS_LIST
442
443I_AM_A_MODULE_FOR(cn)
Note: See TracBrowser for help on using the repository browser.