source: trunk/src/corelib/codecs/qutfcodec.cpp@ 807

Last change on this file since 807 was 769, checked in by Dmitry A. Kuminov, 15 years ago

trunk: Merged in qt 4.6.3 sources from branches/vendor/nokia/qt.

File size: 19.2 KB
Line 
1/****************************************************************************
2**
3** Copyright (C) 2010 Nokia Corporation and/or its subsidiary(-ies).
4** All rights reserved.
5** Contact: Nokia Corporation (qt-info@nokia.com)
6**
7** This file is part of the QtCore module of the Qt Toolkit.
8**
9** $QT_BEGIN_LICENSE:LGPL$
10** Commercial Usage
11** Licensees holding valid Qt Commercial licenses may use this file in
12** accordance with the Qt Commercial License Agreement provided with the
13** Software or, alternatively, in accordance with the terms contained in
14** a written agreement between you and Nokia.
15**
16** GNU Lesser General Public License Usage
17** Alternatively, this file may be used under the terms of the GNU Lesser
18** General Public License version 2.1 as published by the Free Software
19** Foundation and appearing in the file LICENSE.LGPL included in the
20** packaging of this file. Please review the following information to
21** ensure the GNU Lesser General Public License version 2.1 requirements
22** will be met: http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html.
23**
24** In addition, as a special exception, Nokia gives you certain additional
25** rights. These rights are described in the Nokia Qt LGPL Exception
26** version 1.1, included in the file LGPL_EXCEPTION.txt in this package.
27**
28** GNU General Public License Usage
29** Alternatively, this file may be used under the terms of the GNU
30** General Public License version 3.0 as published by the Free Software
31** Foundation and appearing in the file LICENSE.GPL included in the
32** packaging of this file. Please review the following information to
33** ensure the GNU General Public License version 3.0 requirements will be
34** met: http://www.gnu.org/copyleft/gpl.html.
35**
36** If you have questions regarding the use of this file, please contact
37** Nokia at qt-info@nokia.com.
38** $QT_END_LICENSE$
39**
40****************************************************************************/
41
42#include "qutfcodec_p.h"
43#include "qlist.h"
44#include "qendian.h"
45#include "qchar.h"
46
47QT_BEGIN_NAMESPACE
48
// Indices into QTextCodec::ConverterState::state_data used by the UTF-16 and
// UTF-32 decoders in this file: slot Endian holds the byte order in effect,
// slot Data holds the bytes buffered from an incomplete code unit.
enum {
    Endian = 0,
    Data = 1
};
50
51QByteArray QUtf8::convertFromUnicode(const QChar *uc, int len, QTextCodec::ConverterState *state)
52{
53 uchar replacement = '?';
54 int rlen = 3*len;
55 int surrogate_high = -1;
56 if (state) {
57 if (state->flags & QTextCodec::ConvertInvalidToNull)
58 replacement = 0;
59 if (!(state->flags & QTextCodec::IgnoreHeader))
60 rlen += 3;
61 if (state->remainingChars)
62 surrogate_high = state->state_data[0];
63 }
64
65 QByteArray rstr;
66 rstr.resize(rlen);
67 uchar* cursor = (uchar*)rstr.data();
68 const QChar *ch = uc;
69 int invalid = 0;
70 if (state && !(state->flags & QTextCodec::IgnoreHeader)) {
71 *cursor++ = 0xef;
72 *cursor++ = 0xbb;
73 *cursor++ = 0xbf;
74 }
75
76 const QChar *end = ch + len;
77 while (ch < end) {
78 uint u = ch->unicode();
79 if (surrogate_high >= 0) {
80 if (u >= 0xdc00 && u < 0xe000) {
81 u = (surrogate_high - 0xd800)*0x400 + (u - 0xdc00) + 0x10000;
82 surrogate_high = -1;
83 } else {
84 // high surrogate without low
85 *cursor = replacement;
86 ++ch;
87 ++invalid;
88 surrogate_high = -1;
89 continue;
90 }
91 } else if (u >= 0xdc00 && u < 0xe000) {
92 // low surrogate without high
93 *cursor = replacement;
94 ++ch;
95 ++invalid;
96 continue;
97 } else if (u >= 0xd800 && u < 0xdc00) {
98 surrogate_high = u;
99 ++ch;
100 continue;
101 }
102
103 if (u < 0x80) {
104 *cursor++ = (uchar)u;
105 } else {
106 if (u < 0x0800) {
107 *cursor++ = 0xc0 | ((uchar) (u >> 6));
108 } else {
109 if (u > 0xffff) {
110 // see QString::fromUtf8() and QString::utf8() for explanations
111 if (u > 0x10fe00 && u < 0x10ff00) {
112 *cursor++ = (u - 0x10fe00);
113 ++ch;
114 continue;
115 } else {
116 *cursor++ = 0xf0 | ((uchar) (u >> 18));
117 *cursor++ = 0x80 | (((uchar) (u >> 12)) & 0x3f);
118 }
119 } else {
120 *cursor++ = 0xe0 | (((uchar) (u >> 12)) & 0x3f);
121 }
122 *cursor++ = 0x80 | (((uchar) (u >> 6)) & 0x3f);
123 }
124 *cursor++ = 0x80 | ((uchar) (u&0x3f));
125 }
126 ++ch;
127 }
128
129 rstr.resize(cursor - (const uchar*)rstr.constData());
130 if (state) {
131 state->invalidChars += invalid;
132 state->flags |= QTextCodec::IgnoreHeader;
133 state->remainingChars = 0;
134 if (surrogate_high >= 0) {
135 state->remainingChars = 1;
136 state->state_data[0] = surrogate_high;
137 }
138 }
139 return rstr;
140}
141
142QString QUtf8::convertToUnicode(const char *chars, int len, QTextCodec::ConverterState *state)
143{
144 bool headerdone = false;
145 ushort replacement = QChar::ReplacementCharacter;
146 int need = 0;
147 int error = -1;
148 uint uc = 0;
149 uint min_uc = 0;
150 if (state) {
151 if (state->flags & QTextCodec::IgnoreHeader)
152 headerdone = true;
153 if (state->flags & QTextCodec::ConvertInvalidToNull)
154 replacement = QChar::Null;
155 need = state->remainingChars;
156 if (need) {
157 uc = state->state_data[0];
158 min_uc = state->state_data[1];
159 }
160 }
161 if (!headerdone && len > 3
162 && (uchar)chars[0] == 0xef && (uchar)chars[1] == 0xbb && (uchar)chars[2] == 0xbf) {
163 // starts with a byte order mark
164 chars += 3;
165 len -= 3;
166 headerdone = true;
167 }
168
169 QString result(need + len + 1, Qt::Uninitialized); // worst case
170 ushort *qch = (ushort *)result.unicode();
171 uchar ch;
172 int invalid = 0;
173
174 for (int i = 0; i < len; ++i) {
175 ch = chars[i];
176 if (need) {
177 if ((ch&0xc0) == 0x80) {
178 uc = (uc << 6) | (ch & 0x3f);
179 --need;
180 if (!need) {
181 // utf-8 bom composes into 0xfeff code point
182 if (!headerdone && uc == 0xfeff) {
183 // dont do anything, just skip the BOM
184 } else if (uc > 0xffff && uc < 0x110000) {
185 // surrogate pair
186 Q_ASSERT((qch - (ushort*)result.unicode()) + 2 < result.length());
187 *qch++ = QChar::highSurrogate(uc);
188 *qch++ = QChar::lowSurrogate(uc);
189 } else if ((uc < min_uc) || (uc >= 0xd800 && uc <= 0xdfff) || (uc >= 0xfffe)) {
190 // error: overlong sequence, UTF16 surrogate or BOM
191 *qch++ = replacement;
192 ++invalid;
193 } else {
194 *qch++ = uc;
195 }
196 headerdone = true;
197 }
198 } else {
199 // error
200 i = error;
201 *qch++ = replacement;
202 ++invalid;
203 need = 0;
204 headerdone = true;
205 }
206 } else {
207 if (ch < 128) {
208 *qch++ = ushort(ch);
209 headerdone = true;
210 } else if ((ch & 0xe0) == 0xc0) {
211 uc = ch & 0x1f;
212 need = 1;
213 error = i;
214 min_uc = 0x80;
215 headerdone = true;
216 } else if ((ch & 0xf0) == 0xe0) {
217 uc = ch & 0x0f;
218 need = 2;
219 error = i;
220 min_uc = 0x800;
221 } else if ((ch&0xf8) == 0xf0) {
222 uc = ch & 0x07;
223 need = 3;
224 error = i;
225 min_uc = 0x10000;
226 headerdone = true;
227 } else {
228 // error
229 *qch++ = replacement;
230 ++invalid;
231 headerdone = true;
232 }
233 }
234 }
235 if (!state && need > 0) {
236 // unterminated UTF sequence
237 for (int i = error; i < len; ++i) {
238 *qch++ = replacement;
239 ++invalid;
240 }
241 }
242 result.truncate(qch - (ushort *)result.unicode());
243 if (state) {
244 state->invalidChars += invalid;
245 state->remainingChars = need;
246 if (headerdone)
247 state->flags |= QTextCodec::IgnoreHeader;
248 state->state_data[0] = need ? uc : 0;
249 state->state_data[1] = need ? min_uc : 0;
250 }
251 return result;
252}
253
254QByteArray QUtf16::convertFromUnicode(const QChar *uc, int len, QTextCodec::ConverterState *state, DataEndianness e)
255{
256 DataEndianness endian = e;
257 int length = 2*len;
258 if (!state || (!(state->flags & QTextCodec::IgnoreHeader))) {
259 length += 2;
260 }
261 if (e == DetectEndianness) {
262 endian = (QSysInfo::ByteOrder == QSysInfo::BigEndian) ? BigEndianness : LittleEndianness;
263 }
264
265 QByteArray d;
266 d.resize(length);
267 char *data = d.data();
268 if (!state || !(state->flags & QTextCodec::IgnoreHeader)) {
269 QChar bom(QChar::ByteOrderMark);
270 if (endian == BigEndianness) {
271 data[0] = bom.row();
272 data[1] = bom.cell();
273 } else {
274 data[0] = bom.cell();
275 data[1] = bom.row();
276 }
277 data += 2;
278 }
279 if (endian == BigEndianness) {
280 for (int i = 0; i < len; ++i) {
281 *(data++) = uc[i].row();
282 *(data++) = uc[i].cell();
283 }
284 } else {
285 for (int i = 0; i < len; ++i) {
286 *(data++) = uc[i].cell();
287 *(data++) = uc[i].row();
288 }
289 }
290
291 if (state) {
292 state->remainingChars = 0;
293 state->flags |= QTextCodec::IgnoreHeader;
294 }
295 return d;
296}
297
298QString QUtf16::convertToUnicode(const char *chars, int len, QTextCodec::ConverterState *state, DataEndianness e)
299{
300 DataEndianness endian = e;
301 bool half = false;
302 uchar buf = 0;
303 bool headerdone = false;
304 if (state) {
305 headerdone = state->flags & QTextCodec::IgnoreHeader;
306 if (endian == DetectEndianness)
307 endian = (DataEndianness)state->state_data[Endian];
308 if (state->remainingChars) {
309 half = true;
310 buf = state->state_data[Data];
311 }
312 }
313 if (headerdone && endian == DetectEndianness)
314 endian = (QSysInfo::ByteOrder == QSysInfo::BigEndian) ? BigEndianness : LittleEndianness;
315
316 QString result(len, Qt::Uninitialized); // worst case
317 QChar *qch = (QChar *)result.unicode();
318 while (len--) {
319 if (half) {
320 QChar ch;
321 if (endian == LittleEndianness) {
322 ch.setRow(*chars++);
323 ch.setCell(buf);
324 } else {
325 ch.setRow(buf);
326 ch.setCell(*chars++);
327 }
328 if (!headerdone) {
329 headerdone = true;
330 if (endian == DetectEndianness) {
331 if (ch == QChar::ByteOrderSwapped) {
332 endian = LittleEndianness;
333 } else if (ch == QChar::ByteOrderMark) {
334 endian = BigEndianness;
335 } else {
336 if (QSysInfo::ByteOrder == QSysInfo::BigEndian) {
337 endian = BigEndianness;
338 } else {
339 endian = LittleEndianness;
340 ch = QChar((ch.unicode() >> 8) | ((ch.unicode() & 0xff) << 8));
341 }
342 *qch++ = ch;
343 }
344 } else if (ch != QChar::ByteOrderMark) {
345 *qch++ = ch;
346 }
347 } else {
348 *qch++ = ch;
349 }
350 half = false;
351 } else {
352 buf = *chars++;
353 half = true;
354 }
355 }
356 result.truncate(qch - result.unicode());
357
358 if (state) {
359 if (headerdone)
360 state->flags |= QTextCodec::IgnoreHeader;
361 state->state_data[Endian] = endian;
362 if (half) {
363 state->remainingChars = 1;
364 state->state_data[Data] = buf;
365 } else {
366 state->remainingChars = 0;
367 state->state_data[Data] = 0;
368 }
369 }
370 return result;
371}
372
373QByteArray QUtf32::convertFromUnicode(const QChar *uc, int len, QTextCodec::ConverterState *state, DataEndianness e)
374{
375 DataEndianness endian = e;
376 int length = 4*len;
377 if (!state || (!(state->flags & QTextCodec::IgnoreHeader))) {
378 length += 4;
379 }
380 if (e == DetectEndianness) {
381 endian = (QSysInfo::ByteOrder == QSysInfo::BigEndian) ? BigEndianness : LittleEndianness;
382 }
383
384 QByteArray d(length, Qt::Uninitialized);
385 char *data = d.data();
386 if (!state || !(state->flags & QTextCodec::IgnoreHeader)) {
387 if (endian == BigEndianness) {
388 data[0] = 0;
389 data[1] = 0;
390 data[2] = (char)0xfe;
391 data[3] = (char)0xff;
392 } else {
393 data[0] = (char)0xff;
394 data[1] = (char)0xfe;
395 data[2] = 0;
396 data[3] = 0;
397 }
398 data += 4;
399 }
400 if (endian == BigEndianness) {
401 for (int i = 0; i < len; ++i) {
402 uint cp = uc[i].unicode();
403 if (uc[i].isHighSurrogate() && i < len - 1)
404 cp = QChar::surrogateToUcs4(cp, uc[++i].unicode());
405 *(data++) = cp >> 24;
406 *(data++) = (cp >> 16) & 0xff;
407 *(data++) = (cp >> 8) & 0xff;
408 *(data++) = cp & 0xff;
409 }
410 } else {
411 for (int i = 0; i < len; ++i) {
412 uint cp = uc[i].unicode();
413 if (uc[i].isHighSurrogate() && i < len - 1)
414 cp = QChar::surrogateToUcs4(cp, uc[++i].unicode());
415 *(data++) = cp & 0xff;
416 *(data++) = (cp >> 8) & 0xff;
417 *(data++) = (cp >> 16) & 0xff;
418 *(data++) = cp >> 24;
419 }
420 }
421
422 if (state) {
423 state->remainingChars = 0;
424 state->flags |= QTextCodec::IgnoreHeader;
425 }
426 return d;
427}
428
429QString QUtf32::convertToUnicode(const char *chars, int len, QTextCodec::ConverterState *state, DataEndianness e)
430{
431 DataEndianness endian = e;
432 uchar tuple[4];
433 int num = 0;
434 bool headerdone = false;
435 if (state) {
436 headerdone = state->flags & QTextCodec::IgnoreHeader;
437 if (endian == DetectEndianness) {
438 endian = (DataEndianness)state->state_data[Endian];
439 }
440 num = state->remainingChars;
441 memcpy(tuple, &state->state_data[Data], 4);
442 }
443 if (headerdone && endian == DetectEndianness)
444 endian = (QSysInfo::ByteOrder == QSysInfo::BigEndian) ? BigEndianness : LittleEndianness;
445
446 QString result;
447 result.resize((num + len) >> 2 << 1); // worst case
448 QChar *qch = (QChar *)result.unicode();
449
450 const char *end = chars + len;
451 while (chars < end) {
452 tuple[num++] = *chars++;
453 if (num == 4) {
454 if (!headerdone) {
455 if (endian == DetectEndianness) {
456 if (endian == DetectEndianness) {
457 if (tuple[0] == 0xff && tuple[1] == 0xfe && tuple[2] == 0 && tuple[3] == 0 && endian != BigEndianness) {
458 endian = LittleEndianness;
459 num = 0;
460 continue;
461 } else if (tuple[0] == 0 && tuple[1] == 0 && tuple[2] == 0xfe && tuple[3] == 0xff && endian != LittleEndianness) {
462 endian = BigEndianness;
463 num = 0;
464 continue;
465 } else if (QSysInfo::ByteOrder == QSysInfo::BigEndian) {
466 endian = BigEndianness;
467 } else {
468 endian = LittleEndianness;
469 }
470 }
471 } else if (((endian == BigEndianness) ? qFromBigEndian<quint32>(tuple) : qFromLittleEndian<quint32>(tuple)) == QChar::ByteOrderMark) {
472 num = 0;
473 continue;
474 }
475 }
476 uint code = (endian == BigEndianness) ? qFromBigEndian<quint32>(tuple) : qFromLittleEndian<quint32>(tuple);
477 if (code >= 0x10000) {
478 *qch++ = QChar::highSurrogate(code);
479 *qch++ = QChar::lowSurrogate(code);
480 } else {
481 *qch++ = code;
482 }
483 num = 0;
484 }
485 }
486 result.truncate(qch - result.unicode());
487
488 if (state) {
489 if (headerdone)
490 state->flags |= QTextCodec::IgnoreHeader;
491 state->state_data[Endian] = endian;
492 state->remainingChars = num;
493 memcpy(&state->state_data[Data], tuple, 4);
494 }
495 return result;
496}
497
498
499#ifndef QT_NO_TEXTCODEC
500
501QUtf8Codec::~QUtf8Codec()
502{
503}
504
505QByteArray QUtf8Codec::convertFromUnicode(const QChar *uc, int len, ConverterState *state) const
506{
507 return QUtf8::convertFromUnicode(uc, len, state);
508}
509
510void QUtf8Codec::convertToUnicode(QString *target, const char *chars, int len, ConverterState *state) const
511{
512 *target += QUtf8::convertToUnicode(chars, len, state);
513}
514
515QString QUtf8Codec::convertToUnicode(const char *chars, int len, ConverterState *state) const
516{
517 return QUtf8::convertToUnicode(chars, len, state);
518}
519
520QByteArray QUtf8Codec::name() const
521{
522 return "UTF-8";
523}
524
525int QUtf8Codec::mibEnum() const
526{
527 return 106;
528}
529
530QUtf16Codec::~QUtf16Codec()
531{
532}
533
534QByteArray QUtf16Codec::convertFromUnicode(const QChar *uc, int len, ConverterState *state) const
535{
536 return QUtf16::convertFromUnicode(uc, len, state, e);
537}
538
539QString QUtf16Codec::convertToUnicode(const char *chars, int len, ConverterState *state) const
540{
541 return QUtf16::convertToUnicode(chars, len, state, e);
542}
543
544int QUtf16Codec::mibEnum() const
545{
546 return 1015;
547}
548
549QByteArray QUtf16Codec::name() const
550{
551 return "UTF-16";
552}
553
554QList<QByteArray> QUtf16Codec::aliases() const
555{
556 return QList<QByteArray>();
557}
558
559int QUtf16BECodec::mibEnum() const
560{
561 return 1013;
562}
563
564QByteArray QUtf16BECodec::name() const
565{
566 return "UTF-16BE";
567}
568
569QList<QByteArray> QUtf16BECodec::aliases() const
570{
571 QList<QByteArray> list;
572 return list;
573}
574
575int QUtf16LECodec::mibEnum() const
576{
577 return 1014;
578}
579
580QByteArray QUtf16LECodec::name() const
581{
582 return "UTF-16LE";
583}
584
585QList<QByteArray> QUtf16LECodec::aliases() const
586{
587 QList<QByteArray> list;
588 return list;
589}
590
591QUtf32Codec::~QUtf32Codec()
592{
593}
594
595QByteArray QUtf32Codec::convertFromUnicode(const QChar *uc, int len, ConverterState *state) const
596{
597 return QUtf32::convertFromUnicode(uc, len, state, e);
598}
599
600QString QUtf32Codec::convertToUnicode(const char *chars, int len, ConverterState *state) const
601{
602 return QUtf32::convertToUnicode(chars, len, state, e);
603}
604
605int QUtf32Codec::mibEnum() const
606{
607 return 1017;
608}
609
610QByteArray QUtf32Codec::name() const
611{
612 return "UTF-32";
613}
614
615QList<QByteArray> QUtf32Codec::aliases() const
616{
617 QList<QByteArray> list;
618 return list;
619}
620
621int QUtf32BECodec::mibEnum() const
622{
623 return 1018;
624}
625
626QByteArray QUtf32BECodec::name() const
627{
628 return "UTF-32BE";
629}
630
631QList<QByteArray> QUtf32BECodec::aliases() const
632{
633 QList<QByteArray> list;
634 return list;
635}
636
637int QUtf32LECodec::mibEnum() const
638{
639 return 1019;
640}
641
642QByteArray QUtf32LECodec::name() const
643{
644 return "UTF-32LE";
645}
646
647QList<QByteArray> QUtf32LECodec::aliases() const
648{
649 QList<QByteArray> list;
650 return list;
651}
652
653#endif //QT_NO_TEXTCODEC
654
655QT_END_NAMESPACE
Note: See TracBrowser for help on using the repository browser.