source: trunk/src/corelib/codecs/qutfcodec.cpp@ 467

Last change on this file since 467 was 2, checked in by Dmitry A. Kuminov, 16 years ago

Initially imported qt-all-opensource-src-4.5.1 from Trolltech.

File size: 17.7 KB
Line 
1/****************************************************************************
2**
3** Copyright (C) 2009 Nokia Corporation and/or its subsidiary(-ies).
4** Contact: Qt Software Information (qt-info@nokia.com)
5**
6** This file is part of the QtCore module of the Qt Toolkit.
7**
8** $QT_BEGIN_LICENSE:LGPL$
9** Commercial Usage
10** Licensees holding valid Qt Commercial licenses may use this file in
11** accordance with the Qt Commercial License Agreement provided with the
12** Software or, alternatively, in accordance with the terms contained in
13** a written agreement between you and Nokia.
14**
15** GNU Lesser General Public License Usage
16** Alternatively, this file may be used under the terms of the GNU Lesser
17** General Public License version 2.1 as published by the Free Software
18** Foundation and appearing in the file LICENSE.LGPL included in the
19** packaging of this file. Please review the following information to
20** ensure the GNU Lesser General Public License version 2.1 requirements
21** will be met: http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html.
22**
23** In addition, as a special exception, Nokia gives you certain
24** additional rights. These rights are described in the Nokia Qt LGPL
25** Exception version 1.0, included in the file LGPL_EXCEPTION.txt in this
26** package.
27**
28** GNU General Public License Usage
29** Alternatively, this file may be used under the terms of the GNU
30** General Public License version 3.0 as published by the Free Software
31** Foundation and appearing in the file LICENSE.GPL included in the
32** packaging of this file. Please review the following information to
33** ensure the GNU General Public License version 3.0 requirements will be
34** met: http://www.gnu.org/copyleft/gpl.html.
35**
36** If you are unsure which license is appropriate for your use, please
37** contact the sales department at qt-sales@nokia.com.
38** $QT_END_LICENSE$
39**
40****************************************************************************/
41
42#include "qutfcodec_p.h"
43#include "qlist.h"
44#include "qendian.h"
45#include "qchar.h"
46
47#ifndef QT_NO_TEXTCODEC
48
49QT_BEGIN_NAMESPACE
50
51QUtf8Codec::~QUtf8Codec()
52{
53}
54
55QByteArray QUtf8Codec::convertFromUnicode(const QChar *uc, int len, ConverterState *state) const
56{
57 uchar replacement = '?';
58 int rlen = 3*len;
59 int surrogate_high = -1;
60 if (state) {
61 if (state->flags & ConvertInvalidToNull)
62 replacement = 0;
63 if (!(state->flags & IgnoreHeader))
64 rlen += 3;
65 if (state->remainingChars)
66 surrogate_high = state->state_data[0];
67 }
68
69 QByteArray rstr;
70 rstr.resize(rlen);
71 uchar* cursor = (uchar*)rstr.data();
72 const QChar *ch = uc;
73 int invalid = 0;
74 if (state && !(state->flags & IgnoreHeader)) {
75 *cursor++ = 0xef;
76 *cursor++ = 0xbb;
77 *cursor++ = 0xbf;
78 }
79
80 const QChar *end = ch + len;
81 while (ch < end) {
82 uint u = ch->unicode();
83 if (surrogate_high >= 0) {
84 if (u >= 0xdc00 && u < 0xe000) {
85 u = (surrogate_high - 0xd800)*0x400 + (u - 0xdc00) + 0x10000;
86 surrogate_high = -1;
87 } else {
88 // high surrogate without low
89 *cursor = replacement;
90 ++ch;
91 ++invalid;
92 surrogate_high = -1;
93 continue;
94 }
95 } else if (u >= 0xdc00 && u < 0xe000) {
96 // low surrogate without high
97 *cursor = replacement;
98 ++ch;
99 ++invalid;
100 continue;
101 } else if (u >= 0xd800 && u < 0xdc00) {
102 surrogate_high = u;
103 ++ch;
104 continue;
105 }
106
107 if (u < 0x80) {
108 *cursor++ = (uchar)u;
109 } else {
110 if (u < 0x0800) {
111 *cursor++ = 0xc0 | ((uchar) (u >> 6));
112 } else {
113 if (u > 0xffff) {
114 // see QString::fromUtf8() and QString::utf8() for explanations
115 if (u > 0x10fe00 && u < 0x10ff00) {
116 *cursor++ = (u - 0x10fe00);
117 ++ch;
118 continue;
119 } else {
120 *cursor++ = 0xf0 | ((uchar) (u >> 18));
121 *cursor++ = 0x80 | (((uchar) (u >> 12)) & 0x3f);
122 }
123 } else {
124 *cursor++ = 0xe0 | (((uchar) (u >> 12)) & 0x3f);
125 }
126 *cursor++ = 0x80 | (((uchar) (u >> 6)) & 0x3f);
127 }
128 *cursor++ = 0x80 | ((uchar) (u&0x3f));
129 }
130 ++ch;
131 }
132
133 rstr.resize(cursor - (const uchar*)rstr.constData());
134 if (state) {
135 state->invalidChars += invalid;
136 state->flags |= IgnoreHeader;
137 state->remainingChars = 0;
138 if (surrogate_high >= 0) {
139 state->remainingChars = 1;
140 state->state_data[0] = surrogate_high;
141 }
142 }
143 return rstr;
144}
145
146void QUtf8Codec::convertToUnicode(QString *target, const char *chars, int len, ConverterState *state) const
147{
148 bool headerdone = false;
149 QChar replacement = QChar::ReplacementCharacter;
150 int need = 0;
151 int error = -1;
152 uint uc = 0;
153 uint min_uc = 0;
154 if (state) {
155 if (state->flags & IgnoreHeader)
156 headerdone = true;
157 if (state->flags & ConvertInvalidToNull)
158 replacement = QChar::Null;
159 need = state->remainingChars;
160 if (need) {
161 uc = state->state_data[0];
162 min_uc = state->state_data[1];
163 }
164 }
165 if (!headerdone && len > 3
166 && (uchar)chars[0] == 0xef && (uchar)chars[1] == 0xbb && (uchar)chars[2] == 0xbf) {
167 // starts with a byte order mark
168 chars += 3;
169 len -= 3;
170 headerdone = true;
171 }
172
173 int originalLength = target->length();
174 QString &result = *target;
175 result.resize(originalLength + len + 1); // worst case
176 QChar *qch = result.data() + originalLength;
177 uchar ch;
178 int invalid = 0;
179
180 for (int i=0; i<len; i++) {
181 ch = chars[i];
182 if (need) {
183 if ((ch&0xc0) == 0x80) {
184 uc = (uc << 6) | (ch & 0x3f);
185 need--;
186 if (!need) {
187 if (uc > 0xffff && uc < 0x110000) {
188 // surrogate pair
189 uc -= 0x10000;
190 unsigned short high = uc/0x400 + 0xd800;
191 unsigned short low = uc%0x400 + 0xdc00;
192
193 // resize if necessary
194 long where = qch - result.unicode();
195 if (where + 2 >= result.length()) {
196 result.resize(where + 2);
197 qch = result.data() + where;
198 }
199
200 *qch++ = QChar(high);
201 *qch++ = QChar(low);
202 } else if ((uc < min_uc) || (uc >= 0xd800 && uc <= 0xdfff) || (uc >= 0xfffe)) {
203 // error
204 *qch++ = replacement;
205 ++invalid;
206 } else {
207 *qch++ = uc;
208 }
209 }
210 } else {
211 // error
212 i = error;
213 *qch++ = replacement;
214 ++invalid;
215 need = 0;
216 }
217 } else {
218 if (ch < 128) {
219 *qch++ = QLatin1Char(ch);
220 } else if ((ch & 0xe0) == 0xc0) {
221 uc = ch & 0x1f;
222 need = 1;
223 error = i;
224 min_uc = 0x80;
225 } else if ((ch & 0xf0) == 0xe0) {
226 uc = ch & 0x0f;
227 need = 2;
228 error = i;
229 min_uc = 0x800;
230 } else if ((ch&0xf8) == 0xf0) {
231 uc = ch & 0x07;
232 need = 3;
233 error = i;
234 min_uc = 0x10000;
235 } else {
236 // error
237 *qch++ = replacement;
238 ++invalid;
239 }
240 }
241 }
242 if (!state && need > 0) {
243 // unterminated UTF sequence
244 for (int i = error; i < len; ++i) {
245 *qch++ = replacement;
246 ++invalid;
247 }
248 }
249 result.truncate(qch - result.unicode());
250 if (state) {
251 state->invalidChars += invalid;
252 state->remainingChars = need;
253 if (headerdone)
254 state->flags |= IgnoreHeader;
255 state->state_data[0] = need ? uc : 0;
256 state->state_data[1] = need ? min_uc : 0;
257 }
258}
259
260QString QUtf8Codec::convertToUnicode(const char *chars, int len, ConverterState *state) const
261{
262 QString result;
263 convertToUnicode(&result, chars, len, state);
264 return result;
265}
266
267QByteArray QUtf8Codec::name() const
268{
269 return "UTF-8";
270}
271
272int QUtf8Codec::mibEnum() const
273{
274 return 106;
275}
276
// Indices into ConverterState::state_data as used by the UTF-16 and UTF-32
// decoders in this file.
enum {
    Endian = 0, // slot that stores the byte order determined so far
    Data = 1    // slot that stores input bytes of an incomplete code unit
};
278
279QUtf16Codec::~QUtf16Codec()
280{
281}
282
283QByteArray QUtf16Codec::convertFromUnicode(const QChar *uc, int len, ConverterState *state) const
284{
285 Endianness endian = e;
286 int length = 2*len;
287 if (!state || (!(state->flags & IgnoreHeader))) {
288 length += 2;
289 }
290 if (e == Detect) {
291 endian = (QSysInfo::ByteOrder == QSysInfo::BigEndian) ? BE : LE;
292 }
293
294 QByteArray d;
295 d.resize(length);
296 char *data = d.data();
297 if (!state || !(state->flags & IgnoreHeader)) {
298 QChar bom(QChar::ByteOrderMark);
299 if (endian == BE) {
300 data[0] = bom.row();
301 data[1] = bom.cell();
302 } else {
303 data[0] = bom.cell();
304 data[1] = bom.row();
305 }
306 data += 2;
307 }
308 if (endian == BE) {
309 for (int i = 0; i < len; ++i) {
310 *(data++) = uc[i].row();
311 *(data++) = uc[i].cell();
312 }
313 } else {
314 for (int i = 0; i < len; ++i) {
315 *(data++) = uc[i].cell();
316 *(data++) = uc[i].row();
317 }
318 }
319
320 if (state) {
321 state->remainingChars = 0;
322 state->flags |= IgnoreHeader;
323 }
324 return d;
325}
326
327QString QUtf16Codec::convertToUnicode(const char *chars, int len, ConverterState *state) const
328{
329 Endianness endian = e;
330 bool half = false;
331 uchar buf = 0;
332 bool headerdone = false;
333 if (state) {
334 headerdone = state->flags & IgnoreHeader;
335 if (endian == Detect)
336 endian = (Endianness)state->state_data[Endian];
337 if (state->remainingChars) {
338 half = true;
339 buf = state->state_data[Data];
340 }
341 }
342 if (headerdone && endian == Detect)
343 endian = (QSysInfo::ByteOrder == QSysInfo::BigEndian) ? BE : LE;
344
345 QString result;
346 result.resize(len); // worst case
347 QChar *qch = (QChar *)result.unicode();
348 while (len--) {
349 if (half) {
350 QChar ch;
351 if (endian == LE) {
352 ch.setRow(*chars++);
353 ch.setCell(buf);
354 } else {
355 ch.setRow(buf);
356 ch.setCell(*chars++);
357 }
358 if (!headerdone) {
359 if (endian == Detect) {
360 if (ch == QChar::ByteOrderSwapped && endian != BE) {
361 endian = LE;
362 } else if (ch == QChar::ByteOrderMark && endian != LE) {
363 // ignore BOM
364 endian = BE;
365 } else {
366 if (QSysInfo::ByteOrder == QSysInfo::BigEndian) {
367 endian = BE;
368 } else {
369 endian = LE;
370 ch = QChar((ch.unicode() >> 8) | ((ch.unicode() & 0xff) << 8));
371 }
372 *qch++ = ch;
373 }
374 } else if (ch != QChar::ByteOrderMark) {
375 *qch++ = ch;
376 }
377 headerdone = true;
378 } else {
379 *qch++ = ch;
380 }
381 half = false;
382 } else {
383 buf = *chars++;
384 half = true;
385 }
386 }
387 result.truncate(qch - result.unicode());
388
389 if (state) {
390 if (endian != Detect)
391 state->flags |= IgnoreHeader;
392 state->state_data[Endian] = endian;
393 if (half) {
394 state->remainingChars = 1;
395 state->state_data[Data] = buf;
396 } else {
397 state->remainingChars = 0;
398 state->state_data[Data] = 0;
399 }
400 }
401 return result;
402}
403
404int QUtf16Codec::mibEnum() const
405{
406 return 1015;
407}
408
409QByteArray QUtf16Codec::name() const
410{
411 return "UTF-16";
412}
413
414QList<QByteArray> QUtf16Codec::aliases() const
415{
416 QList<QByteArray> list;
417 list << "ISO-10646-UCS-2";
418 return list;
419}
420
421int QUtf16BECodec::mibEnum() const
422{
423 return 1013;
424}
425
426QByteArray QUtf16BECodec::name() const
427{
428 return "UTF-16BE";
429}
430
431QList<QByteArray> QUtf16BECodec::aliases() const
432{
433 QList<QByteArray> list;
434 return list;
435}
436
437int QUtf16LECodec::mibEnum() const
438{
439 return 1014;
440}
441
442QByteArray QUtf16LECodec::name() const
443{
444 return "UTF-16LE";
445}
446
447QList<QByteArray> QUtf16LECodec::aliases() const
448{
449 QList<QByteArray> list;
450 return list;
451}
452
453QUtf32Codec::~QUtf32Codec()
454{
455}
456
457QByteArray QUtf32Codec::convertFromUnicode(const QChar *uc, int len, ConverterState *state) const
458{
459 Endianness endian = e;
460 int length = 4*len;
461 if (!state || (!(state->flags & IgnoreHeader))) {
462 length += 4;
463 }
464 if (e == Detect) {
465 endian = (QSysInfo::ByteOrder == QSysInfo::BigEndian) ? BE : LE;
466 }
467
468 QByteArray d;
469 d.resize(length);
470 char *data = d.data();
471 if (!state || !(state->flags & IgnoreHeader)) {
472 if (endian == BE) {
473 data[0] = 0;
474 data[1] = 0;
475 data[2] = (char)0xfe;
476 data[3] = (char)0xff;
477 } else {
478 data[0] = (char)0xff;
479 data[1] = (char)0xfe;
480 data[2] = 0;
481 data[3] = 0;
482 }
483 data += 2;
484 }
485 if (endian == BE) {
486 for (int i = 0; i < len; ++i) {
487 uint cp = uc[i].unicode();
488 if (uc[i].isHighSurrogate() && i < len - 1)
489 cp = QChar::surrogateToUcs4(cp, uc[++i].unicode());
490 *(data++) = cp >> 24;
491 *(data++) = (cp >> 16) & 0xff;
492 *(data++) = (cp >> 8) & 0xff;
493 *(data++) = cp & 0xff;
494 }
495 } else {
496 for (int i = 0; i < len; ++i) {
497 uint cp = uc[i].unicode();
498 if (uc[i].isHighSurrogate() && i < len - 1)
499 cp = QChar::surrogateToUcs4(cp, uc[++i].unicode());
500 *(data++) = cp & 0xff;
501 *(data++) = (cp >> 8) & 0xff;
502 *(data++) = (cp >> 16) & 0xff;
503 *(data++) = cp >> 24;
504 }
505 }
506
507 if (state) {
508 state->remainingChars = 0;
509 state->flags |= IgnoreHeader;
510 }
511 return d;
512}
513
514QString QUtf32Codec::convertToUnicode(const char *chars, int len, ConverterState *state) const
515{
516 Endianness endian = e;
517 uchar tuple[4];
518 int num = 0;
519 bool headerdone = false;
520 if (state) {
521 headerdone = state->flags & IgnoreHeader;
522 if (endian == Detect) {
523 endian = (Endianness)state->state_data[Endian];
524 }
525 num = state->remainingChars;
526 memcpy(tuple, &state->state_data[Data], 4);
527 }
528 if (headerdone && endian == Detect)
529 endian = (QSysInfo::ByteOrder == QSysInfo::BigEndian) ? BE : LE;
530
531 QString result;
532 result.resize((num + len) >> 2 << 1); // worst case
533 QChar *qch = (QChar *)result.unicode();
534
535 const char *end = chars + len;
536 while (chars < end) {
537 tuple[num++] = *chars++;
538 if (num == 4) {
539 if (!headerdone) {
540 if (endian == Detect) {
541 if (endian == Detect) {
542 if (tuple[0] == 0xff && tuple[1] == 0xfe && tuple[2] == 0 && tuple[3] == 0 && endian != BE) {
543 endian = LE;
544 num = 0;
545 continue;
546 } else if (tuple[0] == 0 && tuple[1] == 0 && tuple[2] == 0xfe && tuple[3] == 0xff && endian != LE) {
547 endian = BE;
548 num = 0;
549 continue;
550 } else if (QSysInfo::ByteOrder == QSysInfo::BigEndian) {
551 endian = BE;
552 } else {
553 endian = LE;
554 }
555 }
556 } else if (((endian == BE) ? qFromBigEndian<quint32>(tuple) : qFromLittleEndian<quint32>(tuple)) == QChar::ByteOrderMark) {
557 num = 0;
558 continue;
559 }
560 }
561 uint code = (endian == BE) ? qFromBigEndian<quint32>(tuple) : qFromLittleEndian<quint32>(tuple);
562 if (code >= 0x10000) {
563 *qch++ = QChar::highSurrogate(code);
564 *qch++ = QChar::lowSurrogate(code);
565 } else {
566 *qch++ = code;
567 }
568 num = 0;
569 }
570 }
571 result.truncate(qch - result.unicode());
572
573 if (state) {
574 if (endian != Detect)
575 state->flags |= IgnoreHeader;
576 state->state_data[Endian] = endian;
577 state->remainingChars = num;
578 memcpy(&state->state_data[Data], tuple, 4);
579 }
580 return result;
581}
582
583int QUtf32Codec::mibEnum() const
584{
585 return 1017;
586}
587
588QByteArray QUtf32Codec::name() const
589{
590 return "UTF-32";
591}
592
593QList<QByteArray> QUtf32Codec::aliases() const
594{
595 QList<QByteArray> list;
596 return list;
597}
598
599int QUtf32BECodec::mibEnum() const
600{
601 return 1018;
602}
603
604QByteArray QUtf32BECodec::name() const
605{
606 return "UTF-32BE";
607}
608
609QList<QByteArray> QUtf32BECodec::aliases() const
610{
611 QList<QByteArray> list;
612 return list;
613}
614
615int QUtf32LECodec::mibEnum() const
616{
617 return 1019;
618}
619
620QByteArray QUtf32LECodec::name() const
621{
622 return "UTF-32LE";
623}
624
625QList<QByteArray> QUtf32LECodec::aliases() const
626{
627 QList<QByteArray> list;
628 return list;
629}
630
631
632QT_END_NAMESPACE
633
634#endif //QT_NO_TEXTCODEC
Note: See TracBrowser for help on using the repository browser.