source: trunk/src/corelib/codecs/qutfcodec.cpp

Last change on this file was 846, checked in by Dmitry A. Kuminov, 14 years ago

trunk: Merged in qt 4.7.2 sources from branches/vendor/nokia/qt.

File size: 19.7 KB
Line 
1/****************************************************************************
2**
3** Copyright (C) 2011 Nokia Corporation and/or its subsidiary(-ies).
4** All rights reserved.
5** Contact: Nokia Corporation (qt-info@nokia.com)
6**
7** This file is part of the QtCore module of the Qt Toolkit.
8**
9** $QT_BEGIN_LICENSE:LGPL$
10** Commercial Usage
11** Licensees holding valid Qt Commercial licenses may use this file in
12** accordance with the Qt Commercial License Agreement provided with the
13** Software or, alternatively, in accordance with the terms contained in
14** a written agreement between you and Nokia.
15**
16** GNU Lesser General Public License Usage
17** Alternatively, this file may be used under the terms of the GNU Lesser
18** General Public License version 2.1 as published by the Free Software
19** Foundation and appearing in the file LICENSE.LGPL included in the
20** packaging of this file. Please review the following information to
21** ensure the GNU Lesser General Public License version 2.1 requirements
22** will be met: http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html.
23**
24** In addition, as a special exception, Nokia gives you certain additional
25** rights. These rights are described in the Nokia Qt LGPL Exception
26** version 1.1, included in the file LGPL_EXCEPTION.txt in this package.
27**
28** GNU General Public License Usage
29** Alternatively, this file may be used under the terms of the GNU
30** General Public License version 3.0 as published by the Free Software
31** Foundation and appearing in the file LICENSE.GPL included in the
32** packaging of this file. Please review the following information to
33** ensure the GNU General Public License version 3.0 requirements will be
34** met: http://www.gnu.org/copyleft/gpl.html.
35**
36** If you have questions regarding the use of this file, please contact
37** Nokia at qt-info@nokia.com.
38** $QT_END_LICENSE$
39**
40****************************************************************************/
41
42#include "qutfcodec_p.h"
43#include "qlist.h"
44#include "qendian.h"
45#include "qchar.h"
46
47QT_BEGIN_NAMESPACE
48
// Indices into QTextCodec::ConverterState::state_data used by the UTF-16 and
// UTF-32 decoders in this file: one slot keeps the byte order, the other the
// input bytes buffered between calls.
enum {
    Endian = 0,
    Data = 1
};
50
// Returns true if ucs4 is one of the Unicode "non-characters".
//
// Non-characters may be used internally by an application, but are not
// allowed to be used for text interchange. They are the last two entries of
// each Unicode plane (U+FFFE, U+FFFF, U+1FFFE, U+1FFFF, etc.) as well as the
// 32 entries between U+FDD0 and U+FDEF (inclusive).
static inline bool isUnicodeNonCharacter(uint ucs4)
{
    // The subtraction is unsigned, so values below U+FDD0 wrap around to a
    // huge number and a single comparison covers the whole U+FDD0..U+FDEF
    // range.
    return (ucs4 & 0xfffe) == 0xfffe
            || (ucs4 - 0xfdd0U) < 32;
}
63
64QByteArray QUtf8::convertFromUnicode(const QChar *uc, int len, QTextCodec::ConverterState *state)
65{
66 uchar replacement = '?';
67 int rlen = 3*len;
68 int surrogate_high = -1;
69 if (state) {
70 if (state->flags & QTextCodec::ConvertInvalidToNull)
71 replacement = 0;
72 if (!(state->flags & QTextCodec::IgnoreHeader))
73 rlen += 3;
74 if (state->remainingChars)
75 surrogate_high = state->state_data[0];
76 }
77
78 QByteArray rstr;
79 rstr.resize(rlen);
80 uchar* cursor = (uchar*)rstr.data();
81 const QChar *ch = uc;
82 int invalid = 0;
83 if (state && !(state->flags & QTextCodec::IgnoreHeader)) {
84 *cursor++ = 0xef;
85 *cursor++ = 0xbb;
86 *cursor++ = 0xbf;
87 }
88
89 const QChar *end = ch + len;
90 while (ch < end) {
91 uint u = ch->unicode();
92 if (surrogate_high >= 0) {
93 if (u >= 0xdc00 && u < 0xe000) {
94 u = (surrogate_high - 0xd800)*0x400 + (u - 0xdc00) + 0x10000;
95 surrogate_high = -1;
96 } else {
97 // high surrogate without low
98 *cursor = replacement;
99 ++ch;
100 ++invalid;
101 surrogate_high = -1;
102 continue;
103 }
104 } else if (u >= 0xdc00 && u < 0xe000) {
105 // low surrogate without high
106 *cursor = replacement;
107 ++ch;
108 ++invalid;
109 continue;
110 } else if (u >= 0xd800 && u < 0xdc00) {
111 surrogate_high = u;
112 ++ch;
113 continue;
114 }
115
116 if (u < 0x80) {
117 *cursor++ = (uchar)u;
118 } else {
119 if (u < 0x0800) {
120 *cursor++ = 0xc0 | ((uchar) (u >> 6));
121 } else {
122 // is it one of the Unicode non-characters?
123 if (isUnicodeNonCharacter(u)) {
124 *cursor++ = replacement;
125 ++ch;
126 ++invalid;
127 continue;
128 }
129
130 if (u > 0xffff) {
131 *cursor++ = 0xf0 | ((uchar) (u >> 18));
132 *cursor++ = 0x80 | (((uchar) (u >> 12)) & 0x3f);
133 } else {
134 *cursor++ = 0xe0 | (((uchar) (u >> 12)) & 0x3f);
135 }
136 *cursor++ = 0x80 | (((uchar) (u >> 6)) & 0x3f);
137 }
138 *cursor++ = 0x80 | ((uchar) (u&0x3f));
139 }
140 ++ch;
141 }
142
143 rstr.resize(cursor - (const uchar*)rstr.constData());
144 if (state) {
145 state->invalidChars += invalid;
146 state->flags |= QTextCodec::IgnoreHeader;
147 state->remainingChars = 0;
148 if (surrogate_high >= 0) {
149 state->remainingChars = 1;
150 state->state_data[0] = surrogate_high;
151 }
152 }
153 return rstr;
154}
155
156QString QUtf8::convertToUnicode(const char *chars, int len, QTextCodec::ConverterState *state)
157{
158 bool headerdone = false;
159 ushort replacement = QChar::ReplacementCharacter;
160 int need = 0;
161 int error = -1;
162 uint uc = 0;
163 uint min_uc = 0;
164 if (state) {
165 if (state->flags & QTextCodec::IgnoreHeader)
166 headerdone = true;
167 if (state->flags & QTextCodec::ConvertInvalidToNull)
168 replacement = QChar::Null;
169 need = state->remainingChars;
170 if (need) {
171 uc = state->state_data[0];
172 min_uc = state->state_data[1];
173 }
174 }
175 if (!headerdone && len > 3
176 && (uchar)chars[0] == 0xef && (uchar)chars[1] == 0xbb && (uchar)chars[2] == 0xbf) {
177 // starts with a byte order mark
178 chars += 3;
179 len -= 3;
180 headerdone = true;
181 }
182
183 QString result(need + len + 1, Qt::Uninitialized); // worst case
184 ushort *qch = (ushort *)result.unicode();
185 uchar ch;
186 int invalid = 0;
187
188 for (int i = 0; i < len; ++i) {
189 ch = chars[i];
190 if (need) {
191 if ((ch&0xc0) == 0x80) {
192 uc = (uc << 6) | (ch & 0x3f);
193 --need;
194 if (!need) {
195 // utf-8 bom composes into 0xfeff code point
196 bool nonCharacter;
197 if (!headerdone && uc == 0xfeff) {
198 // don't do anything, just skip the BOM
199 } else if (!(nonCharacter = isUnicodeNonCharacter(uc)) && uc > 0xffff && uc < 0x110000) {
200 // surrogate pair
201 Q_ASSERT((qch - (ushort*)result.unicode()) + 2 < result.length());
202 *qch++ = QChar::highSurrogate(uc);
203 *qch++ = QChar::lowSurrogate(uc);
204 } else if ((uc < min_uc) || (uc >= 0xd800 && uc <= 0xdfff) || nonCharacter || uc >= 0x110000) {
205 // error: overlong sequence, UTF16 surrogate or non-character
206 *qch++ = replacement;
207 ++invalid;
208 } else {
209 *qch++ = uc;
210 }
211 headerdone = true;
212 }
213 } else {
214 // error
215 i = error;
216 *qch++ = replacement;
217 ++invalid;
218 need = 0;
219 headerdone = true;
220 }
221 } else {
222 if (ch < 128) {
223 *qch++ = ushort(ch);
224 headerdone = true;
225 } else if ((ch & 0xe0) == 0xc0) {
226 uc = ch & 0x1f;
227 need = 1;
228 error = i;
229 min_uc = 0x80;
230 headerdone = true;
231 } else if ((ch & 0xf0) == 0xe0) {
232 uc = ch & 0x0f;
233 need = 2;
234 error = i;
235 min_uc = 0x800;
236 } else if ((ch&0xf8) == 0xf0) {
237 uc = ch & 0x07;
238 need = 3;
239 error = i;
240 min_uc = 0x10000;
241 headerdone = true;
242 } else {
243 // error
244 *qch++ = replacement;
245 ++invalid;
246 headerdone = true;
247 }
248 }
249 }
250 if (!state && need > 0) {
251 // unterminated UTF sequence
252 for (int i = error; i < len; ++i) {
253 *qch++ = replacement;
254 ++invalid;
255 }
256 }
257 result.truncate(qch - (ushort *)result.unicode());
258 if (state) {
259 state->invalidChars += invalid;
260 state->remainingChars = need;
261 if (headerdone)
262 state->flags |= QTextCodec::IgnoreHeader;
263 state->state_data[0] = need ? uc : 0;
264 state->state_data[1] = need ? min_uc : 0;
265 }
266 return result;
267}
268
269QByteArray QUtf16::convertFromUnicode(const QChar *uc, int len, QTextCodec::ConverterState *state, DataEndianness e)
270{
271 DataEndianness endian = e;
272 int length = 2*len;
273 if (!state || (!(state->flags & QTextCodec::IgnoreHeader))) {
274 length += 2;
275 }
276 if (e == DetectEndianness) {
277 endian = (QSysInfo::ByteOrder == QSysInfo::BigEndian) ? BigEndianness : LittleEndianness;
278 }
279
280 QByteArray d;
281 d.resize(length);
282 char *data = d.data();
283 if (!state || !(state->flags & QTextCodec::IgnoreHeader)) {
284 QChar bom(QChar::ByteOrderMark);
285 if (endian == BigEndianness) {
286 data[0] = bom.row();
287 data[1] = bom.cell();
288 } else {
289 data[0] = bom.cell();
290 data[1] = bom.row();
291 }
292 data += 2;
293 }
294 if (endian == BigEndianness) {
295 for (int i = 0; i < len; ++i) {
296 *(data++) = uc[i].row();
297 *(data++) = uc[i].cell();
298 }
299 } else {
300 for (int i = 0; i < len; ++i) {
301 *(data++) = uc[i].cell();
302 *(data++) = uc[i].row();
303 }
304 }
305
306 if (state) {
307 state->remainingChars = 0;
308 state->flags |= QTextCodec::IgnoreHeader;
309 }
310 return d;
311}
312
313QString QUtf16::convertToUnicode(const char *chars, int len, QTextCodec::ConverterState *state, DataEndianness e)
314{
315 DataEndianness endian = e;
316 bool half = false;
317 uchar buf = 0;
318 bool headerdone = false;
319 if (state) {
320 headerdone = state->flags & QTextCodec::IgnoreHeader;
321 if (endian == DetectEndianness)
322 endian = (DataEndianness)state->state_data[Endian];
323 if (state->remainingChars) {
324 half = true;
325 buf = state->state_data[Data];
326 }
327 }
328 if (headerdone && endian == DetectEndianness)
329 endian = (QSysInfo::ByteOrder == QSysInfo::BigEndian) ? BigEndianness : LittleEndianness;
330
331 QString result(len, Qt::Uninitialized); // worst case
332 QChar *qch = (QChar *)result.unicode();
333 while (len--) {
334 if (half) {
335 QChar ch;
336 if (endian == LittleEndianness) {
337 ch.setRow(*chars++);
338 ch.setCell(buf);
339 } else {
340 ch.setRow(buf);
341 ch.setCell(*chars++);
342 }
343 if (!headerdone) {
344 headerdone = true;
345 if (endian == DetectEndianness) {
346 if (ch == QChar::ByteOrderSwapped) {
347 endian = LittleEndianness;
348 } else if (ch == QChar::ByteOrderMark) {
349 endian = BigEndianness;
350 } else {
351 if (QSysInfo::ByteOrder == QSysInfo::BigEndian) {
352 endian = BigEndianness;
353 } else {
354 endian = LittleEndianness;
355 ch = QChar((ch.unicode() >> 8) | ((ch.unicode() & 0xff) << 8));
356 }
357 *qch++ = ch;
358 }
359 } else if (ch != QChar::ByteOrderMark) {
360 *qch++ = ch;
361 }
362 } else {
363 *qch++ = ch;
364 }
365 half = false;
366 } else {
367 buf = *chars++;
368 half = true;
369 }
370 }
371 result.truncate(qch - result.unicode());
372
373 if (state) {
374 if (headerdone)
375 state->flags |= QTextCodec::IgnoreHeader;
376 state->state_data[Endian] = endian;
377 if (half) {
378 state->remainingChars = 1;
379 state->state_data[Data] = buf;
380 } else {
381 state->remainingChars = 0;
382 state->state_data[Data] = 0;
383 }
384 }
385 return result;
386}
387
388QByteArray QUtf32::convertFromUnicode(const QChar *uc, int len, QTextCodec::ConverterState *state, DataEndianness e)
389{
390 DataEndianness endian = e;
391 int length = 4*len;
392 if (!state || (!(state->flags & QTextCodec::IgnoreHeader))) {
393 length += 4;
394 }
395 if (e == DetectEndianness) {
396 endian = (QSysInfo::ByteOrder == QSysInfo::BigEndian) ? BigEndianness : LittleEndianness;
397 }
398
399 QByteArray d(length, Qt::Uninitialized);
400 char *data = d.data();
401 if (!state || !(state->flags & QTextCodec::IgnoreHeader)) {
402 if (endian == BigEndianness) {
403 data[0] = 0;
404 data[1] = 0;
405 data[2] = (char)0xfe;
406 data[3] = (char)0xff;
407 } else {
408 data[0] = (char)0xff;
409 data[1] = (char)0xfe;
410 data[2] = 0;
411 data[3] = 0;
412 }
413 data += 4;
414 }
415 if (endian == BigEndianness) {
416 for (int i = 0; i < len; ++i) {
417 uint cp = uc[i].unicode();
418 if (uc[i].isHighSurrogate() && i < len - 1)
419 cp = QChar::surrogateToUcs4(cp, uc[++i].unicode());
420 *(data++) = cp >> 24;
421 *(data++) = (cp >> 16) & 0xff;
422 *(data++) = (cp >> 8) & 0xff;
423 *(data++) = cp & 0xff;
424 }
425 } else {
426 for (int i = 0; i < len; ++i) {
427 uint cp = uc[i].unicode();
428 if (uc[i].isHighSurrogate() && i < len - 1)
429 cp = QChar::surrogateToUcs4(cp, uc[++i].unicode());
430 *(data++) = cp & 0xff;
431 *(data++) = (cp >> 8) & 0xff;
432 *(data++) = (cp >> 16) & 0xff;
433 *(data++) = cp >> 24;
434 }
435 }
436
437 if (state) {
438 state->remainingChars = 0;
439 state->flags |= QTextCodec::IgnoreHeader;
440 }
441 return d;
442}
443
444QString QUtf32::convertToUnicode(const char *chars, int len, QTextCodec::ConverterState *state, DataEndianness e)
445{
446 DataEndianness endian = e;
447 uchar tuple[4];
448 int num = 0;
449 bool headerdone = false;
450 if (state) {
451 headerdone = state->flags & QTextCodec::IgnoreHeader;
452 if (endian == DetectEndianness) {
453 endian = (DataEndianness)state->state_data[Endian];
454 }
455 num = state->remainingChars;
456 memcpy(tuple, &state->state_data[Data], 4);
457 }
458 if (headerdone && endian == DetectEndianness)
459 endian = (QSysInfo::ByteOrder == QSysInfo::BigEndian) ? BigEndianness : LittleEndianness;
460
461 QString result;
462 result.resize((num + len) >> 2 << 1); // worst case
463 QChar *qch = (QChar *)result.unicode();
464
465 const char *end = chars + len;
466 while (chars < end) {
467 tuple[num++] = *chars++;
468 if (num == 4) {
469 if (!headerdone) {
470 if (endian == DetectEndianness) {
471 if (endian == DetectEndianness) {
472 if (tuple[0] == 0xff && tuple[1] == 0xfe && tuple[2] == 0 && tuple[3] == 0 && endian != BigEndianness) {
473 endian = LittleEndianness;
474 num = 0;
475 continue;
476 } else if (tuple[0] == 0 && tuple[1] == 0 && tuple[2] == 0xfe && tuple[3] == 0xff && endian != LittleEndianness) {
477 endian = BigEndianness;
478 num = 0;
479 continue;
480 } else if (QSysInfo::ByteOrder == QSysInfo::BigEndian) {
481 endian = BigEndianness;
482 } else {
483 endian = LittleEndianness;
484 }
485 }
486 } else if (((endian == BigEndianness) ? qFromBigEndian<quint32>(tuple) : qFromLittleEndian<quint32>(tuple)) == QChar::ByteOrderMark) {
487 num = 0;
488 continue;
489 }
490 }
491 uint code = (endian == BigEndianness) ? qFromBigEndian<quint32>(tuple) : qFromLittleEndian<quint32>(tuple);
492 if (code >= 0x10000) {
493 *qch++ = QChar::highSurrogate(code);
494 *qch++ = QChar::lowSurrogate(code);
495 } else {
496 *qch++ = code;
497 }
498 num = 0;
499 }
500 }
501 result.truncate(qch - result.unicode());
502
503 if (state) {
504 if (headerdone)
505 state->flags |= QTextCodec::IgnoreHeader;
506 state->state_data[Endian] = endian;
507 state->remainingChars = num;
508 memcpy(&state->state_data[Data], tuple, 4);
509 }
510 return result;
511}
512
513
514#ifndef QT_NO_TEXTCODEC
515
516QUtf8Codec::~QUtf8Codec()
517{
518}
519
520QByteArray QUtf8Codec::convertFromUnicode(const QChar *uc, int len, ConverterState *state) const
521{
522 return QUtf8::convertFromUnicode(uc, len, state);
523}
524
525void QUtf8Codec::convertToUnicode(QString *target, const char *chars, int len, ConverterState *state) const
526{
527 *target += QUtf8::convertToUnicode(chars, len, state);
528}
529
530QString QUtf8Codec::convertToUnicode(const char *chars, int len, ConverterState *state) const
531{
532 return QUtf8::convertToUnicode(chars, len, state);
533}
534
535QByteArray QUtf8Codec::name() const
536{
537 return "UTF-8";
538}
539
540int QUtf8Codec::mibEnum() const
541{
542 return 106;
543}
544
545QUtf16Codec::~QUtf16Codec()
546{
547}
548
549QByteArray QUtf16Codec::convertFromUnicode(const QChar *uc, int len, ConverterState *state) const
550{
551 return QUtf16::convertFromUnicode(uc, len, state, e);
552}
553
554QString QUtf16Codec::convertToUnicode(const char *chars, int len, ConverterState *state) const
555{
556 return QUtf16::convertToUnicode(chars, len, state, e);
557}
558
559int QUtf16Codec::mibEnum() const
560{
561 return 1015;
562}
563
564QByteArray QUtf16Codec::name() const
565{
566 return "UTF-16";
567}
568
569QList<QByteArray> QUtf16Codec::aliases() const
570{
571 return QList<QByteArray>();
572}
573
574int QUtf16BECodec::mibEnum() const
575{
576 return 1013;
577}
578
579QByteArray QUtf16BECodec::name() const
580{
581 return "UTF-16BE";
582}
583
584QList<QByteArray> QUtf16BECodec::aliases() const
585{
586 QList<QByteArray> list;
587 return list;
588}
589
590int QUtf16LECodec::mibEnum() const
591{
592 return 1014;
593}
594
595QByteArray QUtf16LECodec::name() const
596{
597 return "UTF-16LE";
598}
599
600QList<QByteArray> QUtf16LECodec::aliases() const
601{
602 QList<QByteArray> list;
603 return list;
604}
605
606QUtf32Codec::~QUtf32Codec()
607{
608}
609
610QByteArray QUtf32Codec::convertFromUnicode(const QChar *uc, int len, ConverterState *state) const
611{
612 return QUtf32::convertFromUnicode(uc, len, state, e);
613}
614
615QString QUtf32Codec::convertToUnicode(const char *chars, int len, ConverterState *state) const
616{
617 return QUtf32::convertToUnicode(chars, len, state, e);
618}
619
620int QUtf32Codec::mibEnum() const
621{
622 return 1017;
623}
624
625QByteArray QUtf32Codec::name() const
626{
627 return "UTF-32";
628}
629
630QList<QByteArray> QUtf32Codec::aliases() const
631{
632 QList<QByteArray> list;
633 return list;
634}
635
636int QUtf32BECodec::mibEnum() const
637{
638 return 1018;
639}
640
641QByteArray QUtf32BECodec::name() const
642{
643 return "UTF-32BE";
644}
645
646QList<QByteArray> QUtf32BECodec::aliases() const
647{
648 QList<QByteArray> list;
649 return list;
650}
651
652int QUtf32LECodec::mibEnum() const
653{
654 return 1019;
655}
656
657QByteArray QUtf32LECodec::name() const
658{
659 return "UTF-32LE";
660}
661
662QList<QByteArray> QUtf32LECodec::aliases() const
663{
664 QList<QByteArray> list;
665 return list;
666}
667
668#endif //QT_NO_TEXTCODEC
669
670QT_END_NAMESPACE
Note: See TracBrowser for help on using the repository browser.