source: trunk/src/corelib/codecs/qutfcodec.cpp@ 730

Last change on this file since 730 was 651, checked in by Dmitry A. Kuminov, 15 years ago

trunk: Merged in qt 4.6.2 sources.

File size: 19.3 KB
Line 
1/****************************************************************************
2**
3** Copyright (C) 2010 Nokia Corporation and/or its subsidiary(-ies).
4** All rights reserved.
5** Contact: Nokia Corporation ([email protected])
6**
7** This file is part of the QtCore module of the Qt Toolkit.
8**
9** $QT_BEGIN_LICENSE:LGPL$
10** Commercial Usage
11** Licensees holding valid Qt Commercial licenses may use this file in
12** accordance with the Qt Commercial License Agreement provided with the
13** Software or, alternatively, in accordance with the terms contained in
14** a written agreement between you and Nokia.
15**
16** GNU Lesser General Public License Usage
17** Alternatively, this file may be used under the terms of the GNU Lesser
18** General Public License version 2.1 as published by the Free Software
19** Foundation and appearing in the file LICENSE.LGPL included in the
20** packaging of this file. Please review the following information to
21** ensure the GNU Lesser General Public License version 2.1 requirements
22** will be met: http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html.
23**
24** In addition, as a special exception, Nokia gives you certain additional
25** rights. These rights are described in the Nokia Qt LGPL Exception
26** version 1.1, included in the file LGPL_EXCEPTION.txt in this package.
27**
28** GNU General Public License Usage
29** Alternatively, this file may be used under the terms of the GNU
30** General Public License version 3.0 as published by the Free Software
31** Foundation and appearing in the file LICENSE.GPL included in the
32** packaging of this file. Please review the following information to
33** ensure the GNU General Public License version 3.0 requirements will be
34** met: http://www.gnu.org/copyleft/gpl.html.
35**
36** If you have questions regarding the use of this file, please contact
37** Nokia at [email protected].
38** $QT_END_LICENSE$
39**
40****************************************************************************/
41
42#include "qutfcodec_p.h"
43#include "qlist.h"
44#include "qendian.h"
45#include "qchar.h"
46
47QT_BEGIN_NAMESPACE
48
// Indices into QTextCodec::ConverterState::state_data used by the UTF-16 and
// UTF-32 decoders in this file: slot Endian keeps the byte order in effect,
// slot Data keeps the bytes of a code unit that was split between two calls.
enum {
    Endian = 0,
    Data = 1
};
50
51QByteArray QUtf8::convertFromUnicode(const QChar *uc, int len, QTextCodec::ConverterState *state)
52{
53 uchar replacement = '?';
54 int rlen = 3*len;
55 int surrogate_high = -1;
56 if (state) {
57 if (state->flags & QTextCodec::ConvertInvalidToNull)
58 replacement = 0;
59 if (!(state->flags & QTextCodec::IgnoreHeader))
60 rlen += 3;
61 if (state->remainingChars)
62 surrogate_high = state->state_data[0];
63 }
64
65 QByteArray rstr;
66 rstr.resize(rlen);
67 uchar* cursor = (uchar*)rstr.data();
68 const QChar *ch = uc;
69 int invalid = 0;
70 if (state && !(state->flags & QTextCodec::IgnoreHeader)) {
71 *cursor++ = 0xef;
72 *cursor++ = 0xbb;
73 *cursor++ = 0xbf;
74 }
75
76 const QChar *end = ch + len;
77 while (ch < end) {
78 uint u = ch->unicode();
79 if (surrogate_high >= 0) {
80 if (u >= 0xdc00 && u < 0xe000) {
81 u = (surrogate_high - 0xd800)*0x400 + (u - 0xdc00) + 0x10000;
82 surrogate_high = -1;
83 } else {
84 // high surrogate without low
85 *cursor = replacement;
86 ++ch;
87 ++invalid;
88 surrogate_high = -1;
89 continue;
90 }
91 } else if (u >= 0xdc00 && u < 0xe000) {
92 // low surrogate without high
93 *cursor = replacement;
94 ++ch;
95 ++invalid;
96 continue;
97 } else if (u >= 0xd800 && u < 0xdc00) {
98 surrogate_high = u;
99 ++ch;
100 continue;
101 }
102
103 if (u < 0x80) {
104 *cursor++ = (uchar)u;
105 } else {
106 if (u < 0x0800) {
107 *cursor++ = 0xc0 | ((uchar) (u >> 6));
108 } else {
109 if (u > 0xffff) {
110 // see QString::fromUtf8() and QString::utf8() for explanations
111 if (u > 0x10fe00 && u < 0x10ff00) {
112 *cursor++ = (u - 0x10fe00);
113 ++ch;
114 continue;
115 } else {
116 *cursor++ = 0xf0 | ((uchar) (u >> 18));
117 *cursor++ = 0x80 | (((uchar) (u >> 12)) & 0x3f);
118 }
119 } else {
120 *cursor++ = 0xe0 | (((uchar) (u >> 12)) & 0x3f);
121 }
122 *cursor++ = 0x80 | (((uchar) (u >> 6)) & 0x3f);
123 }
124 *cursor++ = 0x80 | ((uchar) (u&0x3f));
125 }
126 ++ch;
127 }
128
129 rstr.resize(cursor - (const uchar*)rstr.constData());
130 if (state) {
131 state->invalidChars += invalid;
132 state->flags |= QTextCodec::IgnoreHeader;
133 state->remainingChars = 0;
134 if (surrogate_high >= 0) {
135 state->remainingChars = 1;
136 state->state_data[0] = surrogate_high;
137 }
138 }
139 return rstr;
140}
141
142QString QUtf8::convertToUnicode(const char *chars, int len, QTextCodec::ConverterState *state)
143{
144 bool headerdone = false;
145 ushort replacement = QChar::ReplacementCharacter;
146 int need = 0;
147 int error = -1;
148 uint uc = 0;
149 uint min_uc = 0;
150 if (state) {
151 if (state->flags & QTextCodec::IgnoreHeader)
152 headerdone = true;
153 if (state->flags & QTextCodec::ConvertInvalidToNull)
154 replacement = QChar::Null;
155 need = state->remainingChars;
156 if (need) {
157 uc = state->state_data[0];
158 min_uc = state->state_data[1];
159 }
160 }
161 if (!headerdone && len > 3
162 && (uchar)chars[0] == 0xef && (uchar)chars[1] == 0xbb && (uchar)chars[2] == 0xbf) {
163 // starts with a byte order mark
164 chars += 3;
165 len -= 3;
166 headerdone = true;
167 }
168
169 QString result(need + len + 1, Qt::Uninitialized); // worst case
170 ushort *qch = (ushort *)result.unicode();
171 uchar ch;
172 int invalid = 0;
173
174 for (int i = 0; i < len; ++i) {
175 ch = chars[i];
176 if (need) {
177 if ((ch&0xc0) == 0x80) {
178 uc = (uc << 6) | (ch & 0x3f);
179 --need;
180 if (!need) {
181 // utf-8 bom composes into 0xfeff code point
182 if (!headerdone && uc == 0xfeff) {
183 // dont do anything, just skip the BOM
184 } else if (uc > 0xffff && uc < 0x110000) {
185 // surrogate pair
186 Q_ASSERT((qch - (ushort*)result.unicode()) + 2 < result.length());
187 *qch++ = QChar::highSurrogate(uc);
188 *qch++ = QChar::lowSurrogate(uc);
189 } else if ((uc < min_uc) || (uc >= 0xd800 && uc <= 0xdfff) || (uc >= 0xfffe)) {
190 // error: overlong sequence, UTF16 surrogate or BOM
191 *qch++ = replacement;
192 ++invalid;
193 } else {
194 *qch++ = uc;
195 }
196 headerdone = true;
197 }
198 } else {
199 // error
200 i = error;
201 *qch++ = replacement;
202 ++invalid;
203 need = 0;
204 headerdone = true;
205 }
206 } else {
207 if (ch < 128) {
208 *qch++ = ushort(ch);
209 headerdone = true;
210 } else if ((ch & 0xe0) == 0xc0) {
211 uc = ch & 0x1f;
212 need = 1;
213 error = i;
214 min_uc = 0x80;
215 headerdone = true;
216 } else if ((ch & 0xf0) == 0xe0) {
217 uc = ch & 0x0f;
218 need = 2;
219 error = i;
220 min_uc = 0x800;
221 } else if ((ch&0xf8) == 0xf0) {
222 uc = ch & 0x07;
223 need = 3;
224 error = i;
225 min_uc = 0x10000;
226 headerdone = true;
227 } else {
228 // error
229 *qch++ = replacement;
230 ++invalid;
231 headerdone = true;
232 }
233 }
234 }
235 if (!state && need > 0) {
236 // unterminated UTF sequence
237 for (int i = error; i < len; ++i) {
238 *qch++ = replacement;
239 ++invalid;
240 }
241 }
242 result.truncate(qch - (ushort *)result.unicode());
243 if (state) {
244 state->invalidChars += invalid;
245 state->remainingChars = need;
246 if (headerdone)
247 state->flags |= QTextCodec::IgnoreHeader;
248 state->state_data[0] = need ? uc : 0;
249 state->state_data[1] = need ? min_uc : 0;
250 }
251 return result;
252}
253
254QByteArray QUtf16::convertFromUnicode(const QChar *uc, int len, QTextCodec::ConverterState *state, DataEndianness e)
255{
256 DataEndianness endian = e;
257 int length = 2*len;
258 if (!state || (!(state->flags & QTextCodec::IgnoreHeader))) {
259 length += 2;
260 }
261 if (e == DetectEndianness) {
262 endian = (QSysInfo::ByteOrder == QSysInfo::BigEndian) ? BigEndianness : LittleEndianness;
263 }
264
265 QByteArray d;
266 d.resize(length);
267 char *data = d.data();
268 if (!state || !(state->flags & QTextCodec::IgnoreHeader)) {
269 QChar bom(QChar::ByteOrderMark);
270 if (endian == BigEndianness) {
271 data[0] = bom.row();
272 data[1] = bom.cell();
273 } else {
274 data[0] = bom.cell();
275 data[1] = bom.row();
276 }
277 data += 2;
278 }
279 if (endian == BigEndianness) {
280 for (int i = 0; i < len; ++i) {
281 *(data++) = uc[i].row();
282 *(data++) = uc[i].cell();
283 }
284 } else {
285 for (int i = 0; i < len; ++i) {
286 *(data++) = uc[i].cell();
287 *(data++) = uc[i].row();
288 }
289 }
290
291 if (state) {
292 state->remainingChars = 0;
293 state->flags |= QTextCodec::IgnoreHeader;
294 }
295 return d;
296}
297
298QString QUtf16::convertToUnicode(const char *chars, int len, QTextCodec::ConverterState *state, DataEndianness e)
299{
300 DataEndianness endian = e;
301 bool half = false;
302 uchar buf = 0;
303 bool headerdone = false;
304 if (state) {
305 headerdone = state->flags & QTextCodec::IgnoreHeader;
306 if (endian == DetectEndianness)
307 endian = (DataEndianness)state->state_data[Endian];
308 if (state->remainingChars) {
309 half = true;
310 buf = state->state_data[Data];
311 }
312 }
313 if (headerdone && endian == DetectEndianness)
314 endian = (QSysInfo::ByteOrder == QSysInfo::BigEndian) ? BigEndianness : LittleEndianness;
315
316 QString result(len, Qt::Uninitialized); // worst case
317 QChar *qch = (QChar *)result.unicode();
318 while (len--) {
319 if (half) {
320 QChar ch;
321 if (endian == LittleEndianness) {
322 ch.setRow(*chars++);
323 ch.setCell(buf);
324 } else {
325 ch.setRow(buf);
326 ch.setCell(*chars++);
327 }
328 if (!headerdone) {
329 if (endian == DetectEndianness) {
330 if (ch == QChar::ByteOrderSwapped && endian != BigEndianness) {
331 endian = LittleEndianness;
332 } else if (ch == QChar::ByteOrderMark && endian != LittleEndianness) {
333 // ignore BOM
334 endian = BigEndianness;
335 } else {
336 if (QSysInfo::ByteOrder == QSysInfo::BigEndian) {
337 endian = BigEndianness;
338 } else {
339 endian = LittleEndianness;
340 ch = QChar((ch.unicode() >> 8) | ((ch.unicode() & 0xff) << 8));
341 }
342 *qch++ = ch;
343 }
344 } else if (ch != QChar::ByteOrderMark) {
345 *qch++ = ch;
346 }
347 headerdone = true;
348 } else {
349 *qch++ = ch;
350 }
351 half = false;
352 } else {
353 buf = *chars++;
354 half = true;
355 }
356 }
357 result.truncate(qch - result.unicode());
358
359 if (state) {
360 if (headerdone)
361 state->flags |= QTextCodec::IgnoreHeader;
362 state->state_data[Endian] = endian;
363 if (half) {
364 state->remainingChars = 1;
365 state->state_data[Data] = buf;
366 } else {
367 state->remainingChars = 0;
368 state->state_data[Data] = 0;
369 }
370 }
371 return result;
372}
373
374QByteArray QUtf32::convertFromUnicode(const QChar *uc, int len, QTextCodec::ConverterState *state, DataEndianness e)
375{
376 DataEndianness endian = e;
377 int length = 4*len;
378 if (!state || (!(state->flags & QTextCodec::IgnoreHeader))) {
379 length += 4;
380 }
381 if (e == DetectEndianness) {
382 endian = (QSysInfo::ByteOrder == QSysInfo::BigEndian) ? BigEndianness : LittleEndianness;
383 }
384
385 QByteArray d(length, Qt::Uninitialized);
386 char *data = d.data();
387 if (!state || !(state->flags & QTextCodec::IgnoreHeader)) {
388 if (endian == BigEndianness) {
389 data[0] = 0;
390 data[1] = 0;
391 data[2] = (char)0xfe;
392 data[3] = (char)0xff;
393 } else {
394 data[0] = (char)0xff;
395 data[1] = (char)0xfe;
396 data[2] = 0;
397 data[3] = 0;
398 }
399 data += 4;
400 }
401 if (endian == BigEndianness) {
402 for (int i = 0; i < len; ++i) {
403 uint cp = uc[i].unicode();
404 if (uc[i].isHighSurrogate() && i < len - 1)
405 cp = QChar::surrogateToUcs4(cp, uc[++i].unicode());
406 *(data++) = cp >> 24;
407 *(data++) = (cp >> 16) & 0xff;
408 *(data++) = (cp >> 8) & 0xff;
409 *(data++) = cp & 0xff;
410 }
411 } else {
412 for (int i = 0; i < len; ++i) {
413 uint cp = uc[i].unicode();
414 if (uc[i].isHighSurrogate() && i < len - 1)
415 cp = QChar::surrogateToUcs4(cp, uc[++i].unicode());
416 *(data++) = cp & 0xff;
417 *(data++) = (cp >> 8) & 0xff;
418 *(data++) = (cp >> 16) & 0xff;
419 *(data++) = cp >> 24;
420 }
421 }
422
423 if (state) {
424 state->remainingChars = 0;
425 state->flags |= QTextCodec::IgnoreHeader;
426 }
427 return d;
428}
429
430QString QUtf32::convertToUnicode(const char *chars, int len, QTextCodec::ConverterState *state, DataEndianness e)
431{
432 DataEndianness endian = e;
433 uchar tuple[4];
434 int num = 0;
435 bool headerdone = false;
436 if (state) {
437 headerdone = state->flags & QTextCodec::IgnoreHeader;
438 if (endian == DetectEndianness) {
439 endian = (DataEndianness)state->state_data[Endian];
440 }
441 num = state->remainingChars;
442 memcpy(tuple, &state->state_data[Data], 4);
443 }
444 if (headerdone && endian == DetectEndianness)
445 endian = (QSysInfo::ByteOrder == QSysInfo::BigEndian) ? BigEndianness : LittleEndianness;
446
447 QString result;
448 result.resize((num + len) >> 2 << 1); // worst case
449 QChar *qch = (QChar *)result.unicode();
450
451 const char *end = chars + len;
452 while (chars < end) {
453 tuple[num++] = *chars++;
454 if (num == 4) {
455 if (!headerdone) {
456 if (endian == DetectEndianness) {
457 if (endian == DetectEndianness) {
458 if (tuple[0] == 0xff && tuple[1] == 0xfe && tuple[2] == 0 && tuple[3] == 0 && endian != BigEndianness) {
459 endian = LittleEndianness;
460 num = 0;
461 continue;
462 } else if (tuple[0] == 0 && tuple[1] == 0 && tuple[2] == 0xfe && tuple[3] == 0xff && endian != LittleEndianness) {
463 endian = BigEndianness;
464 num = 0;
465 continue;
466 } else if (QSysInfo::ByteOrder == QSysInfo::BigEndian) {
467 endian = BigEndianness;
468 } else {
469 endian = LittleEndianness;
470 }
471 }
472 } else if (((endian == BigEndianness) ? qFromBigEndian<quint32>(tuple) : qFromLittleEndian<quint32>(tuple)) == QChar::ByteOrderMark) {
473 num = 0;
474 continue;
475 }
476 }
477 uint code = (endian == BigEndianness) ? qFromBigEndian<quint32>(tuple) : qFromLittleEndian<quint32>(tuple);
478 if (code >= 0x10000) {
479 *qch++ = QChar::highSurrogate(code);
480 *qch++ = QChar::lowSurrogate(code);
481 } else {
482 *qch++ = code;
483 }
484 num = 0;
485 }
486 }
487 result.truncate(qch - result.unicode());
488
489 if (state) {
490 if (headerdone)
491 state->flags |= QTextCodec::IgnoreHeader;
492 state->state_data[Endian] = endian;
493 state->remainingChars = num;
494 memcpy(&state->state_data[Data], tuple, 4);
495 }
496 return result;
497}
498
499
500#ifndef QT_NO_TEXTCODEC
501
502QUtf8Codec::~QUtf8Codec()
503{
504}
505
506QByteArray QUtf8Codec::convertFromUnicode(const QChar *uc, int len, ConverterState *state) const
507{
508 return QUtf8::convertFromUnicode(uc, len, state);
509}
510
511void QUtf8Codec::convertToUnicode(QString *target, const char *chars, int len, ConverterState *state) const
512{
513 *target += QUtf8::convertToUnicode(chars, len, state);
514}
515
516QString QUtf8Codec::convertToUnicode(const char *chars, int len, ConverterState *state) const
517{
518 return QUtf8::convertToUnicode(chars, len, state);
519}
520
521QByteArray QUtf8Codec::name() const
522{
523 return "UTF-8";
524}
525
526int QUtf8Codec::mibEnum() const
527{
528 return 106;
529}
530
531QUtf16Codec::~QUtf16Codec()
532{
533}
534
535QByteArray QUtf16Codec::convertFromUnicode(const QChar *uc, int len, ConverterState *state) const
536{
537 return QUtf16::convertFromUnicode(uc, len, state, e);
538}
539
540QString QUtf16Codec::convertToUnicode(const char *chars, int len, ConverterState *state) const
541{
542 return QUtf16::convertToUnicode(chars, len, state, e);
543}
544
545int QUtf16Codec::mibEnum() const
546{
547 return 1015;
548}
549
550QByteArray QUtf16Codec::name() const
551{
552 return "UTF-16";
553}
554
555QList<QByteArray> QUtf16Codec::aliases() const
556{
557 return QList<QByteArray>();
558}
559
560int QUtf16BECodec::mibEnum() const
561{
562 return 1013;
563}
564
565QByteArray QUtf16BECodec::name() const
566{
567 return "UTF-16BE";
568}
569
570QList<QByteArray> QUtf16BECodec::aliases() const
571{
572 QList<QByteArray> list;
573 return list;
574}
575
576int QUtf16LECodec::mibEnum() const
577{
578 return 1014;
579}
580
581QByteArray QUtf16LECodec::name() const
582{
583 return "UTF-16LE";
584}
585
586QList<QByteArray> QUtf16LECodec::aliases() const
587{
588 QList<QByteArray> list;
589 return list;
590}
591
592QUtf32Codec::~QUtf32Codec()
593{
594}
595
596QByteArray QUtf32Codec::convertFromUnicode(const QChar *uc, int len, ConverterState *state) const
597{
598 return QUtf32::convertFromUnicode(uc, len, state, e);
599}
600
601QString QUtf32Codec::convertToUnicode(const char *chars, int len, ConverterState *state) const
602{
603 return QUtf32::convertToUnicode(chars, len, state, e);
604}
605
606int QUtf32Codec::mibEnum() const
607{
608 return 1017;
609}
610
611QByteArray QUtf32Codec::name() const
612{
613 return "UTF-32";
614}
615
616QList<QByteArray> QUtf32Codec::aliases() const
617{
618 QList<QByteArray> list;
619 return list;
620}
621
622int QUtf32BECodec::mibEnum() const
623{
624 return 1018;
625}
626
627QByteArray QUtf32BECodec::name() const
628{
629 return "UTF-32BE";
630}
631
632QList<QByteArray> QUtf32BECodec::aliases() const
633{
634 QList<QByteArray> list;
635 return list;
636}
637
638int QUtf32LECodec::mibEnum() const
639{
640 return 1019;
641}
642
643QByteArray QUtf32LECodec::name() const
644{
645 return "UTF-32LE";
646}
647
648QList<QByteArray> QUtf32LECodec::aliases() const
649{
650 QList<QByteArray> list;
651 return list;
652}
653
654#endif //QT_NO_TEXTCODEC
655
656QT_END_NAMESPACE
Note: See TracBrowser for help on using the repository browser.