source: trunk/src/corelib/codecs/qtextcodec.cpp@ 846

Last change on this file since 846 was 846, checked in by Dmitry A. Kuminov, 14 years ago

trunk: Merged in qt 4.7.2 sources from branches/vendor/nokia/qt.

File size: 61.0 KB
Line 
1/****************************************************************************
2**
3** Copyright (C) 2011 Nokia Corporation and/or its subsidiary(-ies).
4** All rights reserved.
5** Contact: Nokia Corporation ([email protected])
6**
7** This file is part of the QtCore module of the Qt Toolkit.
8**
9** $QT_BEGIN_LICENSE:LGPL$
10** Commercial Usage
11** Licensees holding valid Qt Commercial licenses may use this file in
12** accordance with the Qt Commercial License Agreement provided with the
13** Software or, alternatively, in accordance with the terms contained in
14** a written agreement between you and Nokia.
15**
16** GNU Lesser General Public License Usage
17** Alternatively, this file may be used under the terms of the GNU Lesser
18** General Public License version 2.1 as published by the Free Software
19** Foundation and appearing in the file LICENSE.LGPL included in the
20** packaging of this file. Please review the following information to
21** ensure the GNU Lesser General Public License version 2.1 requirements
22** will be met: http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html.
23**
24** In addition, as a special exception, Nokia gives you certain additional
25** rights. These rights are described in the Nokia Qt LGPL Exception
26** version 1.1, included in the file LGPL_EXCEPTION.txt in this package.
27**
28** GNU General Public License Usage
29** Alternatively, this file may be used under the terms of the GNU
30** General Public License version 3.0 as published by the Free Software
31** Foundation and appearing in the file LICENSE.GPL included in the
32** packaging of this file. Please review the following information to
33** ensure the GNU General Public License version 3.0 requirements will be
34** met: http://www.gnu.org/copyleft/gpl.html.
35**
36** If you have questions regarding the use of this file, please contact
37** Nokia at [email protected].
38** $QT_END_LICENSE$
39**
40****************************************************************************/
41
42#include "qplatformdefs.h"
43#include "qtextcodec.h"
44#include "qtextcodec_p.h"
45
46#ifndef QT_NO_TEXTCODEC
47
48#include "qlist.h"
49#include "qfile.h"
50#ifndef QT_NO_LIBRARY
51# include "qcoreapplication.h"
52# include "qtextcodecplugin.h"
53# include "private/qfactoryloader_p.h"
54#endif
55#include "qstringlist.h"
56
57#ifdef Q_OS_UNIX
58# include "qiconvcodec_p.h"
59#endif
60
61#if defined(Q_OS_OS2)
62# include <unidef.h>
63# include <uconv.h>
64# include "qvector.h"
65#endif
66
67#include "qutfcodec_p.h"
68#include "qsimplecodec_p.h"
69#include "qlatincodec_p.h"
70#ifndef QT_NO_CODECS
71# include "qtsciicodec_p.h"
72# include "qisciicodec_p.h"
73#ifndef Q_OS_SYMBIAN
74# if defined(QT_NO_ICONV) && !defined(QT_BOOTSTRAPPED)
75// no iconv(3) support, must build all codecs into the library
76# include "../../plugins/codecs/cn/qgb18030codec.h"
77# include "../../plugins/codecs/jp/qeucjpcodec.h"
78# include "../../plugins/codecs/jp/qjiscodec.h"
79# include "../../plugins/codecs/jp/qsjiscodec.h"
80# include "../../plugins/codecs/kr/qeuckrcodec.h"
81# include "../../plugins/codecs/tw/qbig5codec.h"
82# endif // QT_NO_ICONV
83# if defined(Q_WS_X11) && !defined(QT_BOOTSTRAPPED)
84# include "qfontlaocodec_p.h"
85# include "../../plugins/codecs/jp/qfontjpcodec.h"
86# endif
87#endif // QT_NO_SYMBIAN
88#endif // QT_NO_CODECS
89#include "qlocale.h"
90#include "qmutex.h"
91#include "qhash.h"
92
93#include <stdlib.h>
94#include <ctype.h>
95#include <locale.h>
96#if defined (_XOPEN_UNIX) && !defined(Q_OS_QNX) && !defined(Q_OS_OSF)
97#include <langinfo.h>
98#endif
99
100#if defined(Q_OS_WINCE)
101# define QT_NO_SETLOCALE
102#endif
103
104#ifdef Q_OS_SYMBIAN
105#include "qtextcodec_symbian.cpp"
106#endif
107
108
109// enabling this is not exception safe!
110// #define Q_DEBUG_TEXTCODEC
111
112QT_BEGIN_NAMESPACE
113
114#if !defined(QT_NO_LIBRARY) && !defined(QT_NO_TEXTCODECPLUGIN)
115Q_GLOBAL_STATIC_WITH_ARGS(QFactoryLoader, loader,
116 (QTextCodecFactoryInterface_iid, QLatin1String("/codecs")))
117#endif
118
119//Cache for QTextCodec::codecForName and codecForMib.
120typedef QHash<QByteArray, QTextCodec *> QTextCodecCache;
121Q_GLOBAL_STATIC(QTextCodecCache, qTextCodecCache)
122
123
// Lower-cases the ASCII letters 'A'..'Z'; every other value is returned
// unchanged. The C locale is not consulted.
// (The former `register` storage class was dropped: it is ill-formed
// since C++17 and never affected behavior.)
static char qtolower(char c)
{
    if (c >= 'A' && c <= 'Z')
        return c + 0x20;
    return c;
}

// Returns true for the ASCII characters 0-9, A-Z and a-z only.
static bool qisalnum(char c)
{
    // Setting bit 0x20 maps 'A'..'Z' onto 'a'..'z', so one range test
    // covers both cases.
    const int folded = c | 0x20;
    return (c >= '0' && c <= '9') || (folded >= 'a' && folded <= 'z');
}
128
129static bool nameMatch(const QByteArray &name, const QByteArray &test)
130{
131 // if they're the same, return a perfect score
132 if (qstricmp(name, test) == 0)
133 return true;
134
135 const char *n = name.constData();
136 const char *h = test.constData();
137
138 // if the letters and numbers are the same, we have a match
139 while (*n != '\0') {
140 if (qisalnum(*n)) {
141 for (;;) {
142 if (*h == '\0')
143 return false;
144 if (qisalnum(*h))
145 break;
146 ++h;
147 }
148 if (qtolower(*n) != qtolower(*h))
149 return false;
150 ++h;
151 }
152 ++n;
153 }
154 while (*h && !qisalnum(*h))
155 ++h;
156 return (*h == '\0');
157}
158
159
160static QTextCodec *createForName(const QByteArray &name)
161{
162#if !defined(QT_NO_LIBRARY) && !defined(QT_NO_TEXTCODECPLUGIN)
163 QFactoryLoader *l = loader();
164 QStringList keys = l->keys();
165 for (int i = 0; i < keys.size(); ++i) {
166 if (nameMatch(name, keys.at(i).toLatin1())) {
167 QString realName = keys.at(i);
168 if (QTextCodecFactoryInterface *factory
169 = qobject_cast<QTextCodecFactoryInterface*>(l->instance(realName))) {
170 return factory->create(realName);
171 }
172 }
173 }
174#else
175 Q_UNUSED(name);
176#endif
177 return 0;
178}
179
180static QTextCodec *createForMib(int mib)
181{
182#ifndef QT_NO_TEXTCODECPLUGIN
183 QString name = QLatin1String("MIB: ") + QString::number(mib);
184 if (QTextCodecFactoryInterface *factory
185 = qobject_cast<QTextCodecFactoryInterface*>(loader()->instance(name)))
186 return factory->create(name);
187#else
188 Q_UNUSED(mib);
189#endif
190 return 0;
191}
192
193static QList<QTextCodec*> *all = 0;
194#ifdef Q_DEBUG_TEXTCODEC
195static bool destroying_is_ok = false;
196#endif
197
198static QTextCodec *localeMapper = 0;
199QTextCodec *QTextCodec::cftr = 0;
200
201
202class QTextCodecCleanup
203{
204public:
205 ~QTextCodecCleanup();
206};
207
208/*
209 Deletes all the created codecs. This destructor is called just
210 before exiting to delete any QTextCodec objects that may be lying
211 around.
212*/
213QTextCodecCleanup::~QTextCodecCleanup()
214{
215 if (!all)
216 return;
217
218#ifdef Q_DEBUG_TEXTCODEC
219 destroying_is_ok = true;
220#endif
221
222 for (QList<QTextCodec *>::const_iterator it = all->constBegin()
223 ; it != all->constEnd(); ++it) {
224 delete *it;
225 }
226 delete all;
227 all = 0;
228 localeMapper = 0;
229
230#ifdef Q_DEBUG_TEXTCODEC
231 destroying_is_ok = false;
232#endif
233}
234
235Q_GLOBAL_STATIC(QTextCodecCleanup, createQTextCodecCleanup)
236
237bool QTextCodec::validCodecs()
238{
239#ifdef Q_OS_SYMBIAN
240 // If we don't have a trap handler, we're outside of the main() function,
241 // ie. in global constructors or destructors. Don't use codecs in this
242 // case as it would lead to crashes because we don't have a cleanup stack on Symbian
243 return (User::TrapHandler() != NULL);
244#else
245 return true;
246#endif
247}
248
249
250#if defined(Q_OS_WIN32) || defined(Q_OS_WINCE)
251class QWindowsLocalCodec: public QTextCodec
252{
253public:
254 QWindowsLocalCodec();
255 ~QWindowsLocalCodec();
256
257 QString convertToUnicode(const char *, int, ConverterState *) const;
258 QByteArray convertFromUnicode(const QChar *, int, ConverterState *) const;
259 QString convertToUnicodeCharByChar(const char *chars, int length, ConverterState *state) const;
260
261 QByteArray name() const;
262 int mibEnum() const;
263
264};
265
266QWindowsLocalCodec::QWindowsLocalCodec()
267{
268}
269
270QWindowsLocalCodec::~QWindowsLocalCodec()
271{
272}
273
274QString QWindowsLocalCodec::convertToUnicode(const char *chars, int length, ConverterState *state) const
275{
276 const char *mb = chars;
277 int mblen = length;
278
279 if (!mb || !mblen)
280 return QString();
281
282 const int wclen_auto = 4096;
283 wchar_t wc_auto[wclen_auto];
284 int wclen = wclen_auto;
285 wchar_t *wc = wc_auto;
286 int len;
287 QString sp;
288 bool prepend = false;
289 char state_data = 0;
290 int remainingChars = 0;
291
292 //save the current state information
293 if (state) {
294 state_data = (char)state->state_data[0];
295 remainingChars = state->remainingChars;
296 }
297
298 //convert the pending charcter (if available)
299 if (state && remainingChars) {
300 char prev[3] = {0};
301 prev[0] = state_data;
302 prev[1] = mb[0];
303 remainingChars = 0;
304 len = MultiByteToWideChar(CP_ACP, MB_PRECOMPOSED,
305 prev, 2, wc, wclen);
306 if (len) {
307 prepend = true;
308 sp.append(QChar(wc[0]));
309 mb++;
310 mblen--;
311 wc[0] = 0;
312 }
313 }
314
315 while (!(len=MultiByteToWideChar(CP_ACP, MB_PRECOMPOSED|MB_ERR_INVALID_CHARS,
316 mb, mblen, wc, wclen))) {
317 int r = GetLastError();
318 if (r == ERROR_INSUFFICIENT_BUFFER) {
319 if (wc != wc_auto) {
320 qWarning("MultiByteToWideChar: Size changed");
321 break;
322 } else {
323 wclen = MultiByteToWideChar(CP_ACP, MB_PRECOMPOSED,
324 mb, mblen, 0, 0);
325 wc = new wchar_t[wclen];
326 // and try again...
327 }
328 } else if (r == ERROR_NO_UNICODE_TRANSLATION) {
329 //find the last non NULL character
330 while (mblen > 1 && !(mb[mblen-1]))
331 mblen--;
332 //check whether, we hit an invalid character in the middle
333 if ((mblen <= 1) || (remainingChars && state_data))
334 return convertToUnicodeCharByChar(chars, length, state);
335 //Remove the last character and try again...
336 state_data = mb[mblen-1];
337 remainingChars = 1;
338 mblen--;
339 } else {
340 // Fail.
341 qWarning("MultiByteToWideChar: Cannot convert multibyte text");
342 break;
343 }
344 }
345 if (len <= 0)
346 return QString();
347 if (wc[len-1] == 0) // len - 1: we don't want terminator
348 --len;
349
350 //save the new state information
351 if (state) {
352 state->state_data[0] = (char)state_data;
353 state->remainingChars = remainingChars;
354 }
355 QString s((QChar*)wc, len);
356 if (wc != wc_auto)
357 delete [] wc;
358 if (prepend) {
359 return sp+s;
360 }
361 return s;
362}
363
364QString QWindowsLocalCodec::convertToUnicodeCharByChar(const char *chars, int length, ConverterState *state) const
365{
366 if (!chars || !length)
367 return QString();
368
369 int copyLocation = 0;
370 int extra = 2;
371 if (state && state->remainingChars) {
372 copyLocation = state->remainingChars;
373 extra += copyLocation;
374 }
375 int newLength = length + extra;
376 char *mbcs = new char[newLength];
377 //ensure that we have a NULL terminated string
378 mbcs[newLength-1] = 0;
379 mbcs[newLength-2] = 0;
380 memcpy(&(mbcs[copyLocation]), chars, length);
381 if (copyLocation) {
382 //copy the last character from the state
383 mbcs[0] = (char)state->state_data[0];
384 state->remainingChars = 0;
385 }
386 const char *mb = mbcs;
387#ifndef Q_OS_WINCE
388 const char *next = 0;
389 QString s;
390 while((next = CharNextExA(CP_ACP, mb, 0)) != mb) {
391 wchar_t wc[2] ={0};
392 int charlength = next - mb;
393 int len = MultiByteToWideChar(CP_ACP, MB_PRECOMPOSED|MB_ERR_INVALID_CHARS, mb, charlength, wc, 2);
394 if (len>0) {
395 s.append(QChar(wc[0]));
396 } else {
397 int r = GetLastError();
398 //check if the character being dropped is the last character
399 if (r == ERROR_NO_UNICODE_TRANSLATION && mb == (mbcs+newLength -3) && state) {
400 state->remainingChars = 1;
401 state->state_data[0] = (char)*mb;
402 }
403 }
404 mb = next;
405 }
406#else
407 QString s;
408 int size = mbstowcs(NULL, mb, length);
409 if (size < 0) {
410 Q_ASSERT("Error in CE TextCodec");
411 return QString();
412 }
413 wchar_t* ws = new wchar_t[size + 2];
414 ws[size +1] = 0;
415 ws[size] = 0;
416 size = mbstowcs(ws, mb, length);
417 for (int i=0; i< size; i++)
418 s.append(QChar(ws[i]));
419 delete [] ws;
420#endif
421 delete mbcs;
422 return s;
423}
424
425QByteArray QWindowsLocalCodec::convertFromUnicode(const QChar *ch, int uclen, ConverterState *) const
426{
427 if (!ch)
428 return QByteArray();
429 if (uclen == 0)
430 return QByteArray("");
431 BOOL used_def;
432 QByteArray mb(4096, 0);
433 int len;
434 while (!(len=WideCharToMultiByte(CP_ACP, 0, (const wchar_t*)ch, uclen,
435 mb.data(), mb.size()-1, 0, &used_def)))
436 {
437 int r = GetLastError();
438 if (r == ERROR_INSUFFICIENT_BUFFER) {
439 mb.resize(1+WideCharToMultiByte(CP_ACP, 0,
440 (const wchar_t*)ch, uclen,
441 0, 0, 0, &used_def));
442 // and try again...
443 } else {
444#ifndef QT_NO_DEBUG
445 // Fail.
446 qWarning("WideCharToMultiByte: Cannot convert multibyte text (error %d): %s (UTF-8)",
447 r, QString(ch, uclen).toLocal8Bit().data());
448#endif
449 break;
450 }
451 }
452 mb.resize(len);
453 return mb;
454}
455
456
457QByteArray QWindowsLocalCodec::name() const
458{
459 return "System";
460}
461
462int QWindowsLocalCodec::mibEnum() const
463{
464 return 0;
465}
466
467#elif defined(Q_OS_OS2)
468
469class QOs2LocalCodec: public QTextCodec
470{
471public:
472 QOs2LocalCodec();
473 ~QOs2LocalCodec();
474
475 QString convertToUnicode(const char *, int, ConverterState *) const;
476 QByteArray convertFromUnicode(const QChar *, int, ConverterState *) const;
477
478 QByteArray name() const;
479 int mibEnum() const;
480
481private:
482 UconvObject uoSubYes;
483 UconvObject uoSubNo;
484};
485
486QOs2LocalCodec::QOs2LocalCodec() : uoSubYes(0), uoSubNo(0)
487{
488 // create the conversion object for the process code page that performs
489 // substitution of invalid characters with '?'
490 UniCreateUconvObject((UniChar *)L"@sub=yes,subchar=\\x3F,subuni=\\x003F",
491 &uoSubYes);
492 Q_ASSERT(uoSubYes);
493
494 // same as above but doesn't perform substitution
495 UniCreateUconvObject((UniChar *)L"@sub=no", &uoSubNo);
496 Q_ASSERT(uoSubNo);
497}
498
499QOs2LocalCodec::~QOs2LocalCodec()
500{
501 UniFreeUconvObject(uoSubNo);
502 UniFreeUconvObject(uoSubYes);
503}
504
505static void qOs2LocalCodecStateFree(QTextCodec::ConverterState *state)
506{
507 delete reinterpret_cast<char *>(state->d);
508}
509
510QString QOs2LocalCodec::convertToUnicode(const char *chars, int length,
511 ConverterState *state) const
512{
513 QString res;
514
515 if (!chars)
516 return res;
517 if (!length)
518 return QLatin1String("");
519
520 UconvObject uo = uoSubYes;
521 if (state && (state->flags & ConvertInvalidToNull))
522 uo = uoSubNo;
523
524 int remainingChars = 0;
525 char *remainingBuffer = 0;
526
527 if (state) {
528 // stateful conversion
529 remainingBuffer = reinterpret_cast<char *>(state->d);
530 if (remainingBuffer) {
531 // restore state
532 remainingChars = state->remainingChars;
533 } else {
534 // first time, add the destructor for state->d
535 state->flags |= FreeFunction;
536 QTextCodecUnalignedPointer::encode(state->state_data,
537 qOs2LocalCodecStateFree);
538 }
539 }
540
541 const char *mbPtr = chars;
542 size_t mbLeft = length;
543
544 QByteArray mbExtra;
545 if (remainingChars) {
546 // we have to prepend the remaining bytes from the previous conversion
547 mbLeft += remainingChars;
548 mbExtra.resize(mbLeft);
549 mbPtr = mbExtra.data();
550
551 memcpy(mbExtra.data(), remainingBuffer, remainingChars);
552 memcpy(mbExtra.data() + remainingChars, chars, length);
553
554 remainingBuffer = 0;
555 remainingChars = 0;
556 }
557
558 size_t ucLen = mbLeft;
559 QString ucBuf(ucLen, QLatin1Char('\0'));
560 UniChar *ucPtr = reinterpret_cast<UniChar *>(ucBuf.data());
561 size_t ucLeft = ucLen;
562
563 size_t nonIdent = 0;
564 int rc;
565
566 while (mbLeft) {
567 rc = UniUconvToUcs(uo, (void**)&mbPtr, &mbLeft, &ucPtr, &ucLeft,
568 &nonIdent);
569 if (rc == ULS_BUFFERFULL) {
570 size_t ucDone = ucLen - ucLeft;
571 size_t mbDone = length - mbLeft;
572 // assume that mbLeft/ucLeft is an approximation of mbDone/ucDone
573 ucLen = ucDone + (mbLeft * ucDone) / mbDone;
574 ucBuf.resize(ucLen);
575 ucPtr = reinterpret_cast<UniChar *>(ucBuf.data() + ucDone);
576 } else if (rc == ULS_ILLEGALSEQUENCE && state) {
577 // conversion stopped because the remaining inBytesLeft make up
578 // an incomplete multi-byte sequence; save them for later
579 remainingBuffer = new char[mbLeft];
580 memcpy(remainingBuffer, mbPtr, mbLeft);
581 remainingChars = mbLeft;
582 break;
583 } else if (rc != ULS_SUCCESS) {
584 // just fail on an unexpected error (will return what we've got)
585 qWarning("QOs2LocalCodec::convertToUnicode: UniUconvToUcs failed "
586 "with %d", rc);
587 break;
588 }
589 }
590
591 ucBuf.resize(ucLen - ucLeft);
592 res = ucBuf;
593
594 if (state) {
595 // update the state
596 state->invalidChars = nonIdent;
597 state->remainingChars = remainingChars;
598 state->d = remainingBuffer;
599 }
600
601 return res;
602}
603
604QByteArray QOs2LocalCodec::convertFromUnicode(const QChar *uchars, int length,
605 ConverterState *state) const
606{
607 QByteArray res;
608
609 if (!uchars)
610 return res;
611 if (!length)
612 return QByteArray("");
613
614 UconvObject uo = uoSubYes;
615 if (state && (state->flags & ConvertInvalidToNull))
616 uo = uoSubNo;
617
618 const UniChar *ucPtr = reinterpret_cast<const UniChar *>(uchars);
619 size_t ucLeft = length;
620
621 QVector<QChar> ucExtra;
622 if (state && state->remainingChars) {
623 // we have one surrogate char to be prepended
624 Q_ASSERT(state->remainingChars == 1);
625 ucLeft += 1;
626 ucExtra.resize(ucLeft);
627 ucPtr = reinterpret_cast<const UniChar *>(ucExtra.data());
628
629 ucExtra[0] = state->state_data[0];
630 memcpy(ucExtra.data() + 1, uchars, length * sizeof(QChar));
631
632 state->remainingChars = 0;
633 }
634
635 // be optimistic (imply that one byte is necessary per every Unicode char)
636 size_t mbLen = length;
637 QByteArray mbBuf(mbLen, '\0');
638 char *mbPtr = mbBuf.data();
639 size_t mbLeft = mbLen;
640
641 size_t nonIdent = 0;
642 int rc;
643
644 while (ucLeft) {
645 rc = UniUconvFromUcs(uo, const_cast<UniChar **>(&ucPtr), &ucLeft,
646 (void**)&mbPtr, &mbLeft, &nonIdent);
647 if (rc == ULS_BUFFERFULL) {
648 size_t mbDone = mbLen - mbLeft;
649 size_t ucDone = length - ucLeft;
650 size_t newLen = mbLen;
651 if (ucDone) {
652 // assume that ucLeft/mbLeft is an approximation of ucDone/mbDone
653 newLen = mbDone + (ucLeft * mbDone) / ucDone;
654 }
655 if (newLen == mbLen) {
656 // could not process a single Unicode char, double the size
657 mbLen *= 2;
658 } else {
659 mbLen = newLen;
660 }
661 mbBuf.resize(mbLen);
662 mbPtr = mbBuf.data() + mbDone;
663 mbLeft = mbLen - mbDone;
664 } else if (rc == ULS_ILLEGALSEQUENCE && state) {
665 // buffer ends in a surrogate
666 Q_ASSERT(ucLeft == 2);
667 state->state_data[0] = *ucPtr;
668 state->remainingChars = 1;
669 break;
670 } else if (rc != ULS_SUCCESS) {
671 // just fail on an unexpected error (will return what we've got)
672 qWarning("QOs2LocalCodec::convertFromUnicode: UniUconvFromUcs failed "
673 "with %d", rc);
674 break;
675 }
676 }
677
678 mbBuf.resize(mbLen - mbLeft);
679 res = mbBuf;
680
681 if (state) {
682 // update the state
683 state->invalidChars = nonIdent;
684 }
685
686 return res;
687}
688
689QByteArray QOs2LocalCodec::name() const
690{
691 return "System";
692}
693
694int QOs2LocalCodec::mibEnum() const
695{
696 return 0;
697}
698
699#else
700
/*
    Locale-name tables used to guess a codec from the locale name.
    Each table is terminated by a 0 entry.
    (locale names mostly copied from XFree86)
*/
static const char * const iso8859_2locales[] = {
    "croatian", "cs", "cs_CS", "cs_CZ", "cz", "cz_CZ",
    "czech", "hr", "hr_HR", "hu", "hu_HU", "hungarian",
    "pl", "pl_PL", "polish", "ro", "ro_RO", "rumanian",
    "serbocroatian", "sh", "sh_SP", "sh_YU", "sk", "sk_SK",
    "sl", "sl_CS", "sl_SI", "slovak", "slovene", "sr_SP",
    0
};

static const char * const iso8859_3locales[] = {
    "eo",
    0
};

static const char * const iso8859_4locales[] = {
    "ee", "ee_EE",
    0
};

static const char * const iso8859_5locales[] = {
    "mk", "mk_MK", "sp", "sp_YU",
    0
};

static const char * const cp_1251locales[] = {
    "be", "be_BY", "bg", "bg_BG", "bulgarian",
    0
};

static const char * const pt_154locales[] = {
    "ba_RU", "ky", "ky_KG", "kk", "kk_KZ",
    0
};

static const char * const iso8859_6locales[] = {
    "ar_AA", "ar_SA", "arabic",
    0
};

static const char * const iso8859_7locales[] = {
    "el", "el_GR", "greek",
    0
};

static const char * const iso8859_8locales[] = {
    "hebrew", "he", "he_IL", "iw", "iw_IL",
    0
};

static const char * const iso8859_9locales[] = {
    "tr", "tr_TR", "turkish",
    0
};

static const char * const iso8859_13locales[] = {
    "lt", "lt_LT", "lv", "lv_LV",
    0
};

static const char * const iso8859_15locales[] = {
    "et", "et_EE",
    // Euro countries
    "br_FR", "ca_ES", "de", "de_AT", "de_BE", "de_DE",
    "de_LU", "en_IE", "es", "es_ES", "eu_ES", "fi",
    "fi_FI", "finnish", "fr", "fr_FR", "fr_BE", "fr_LU",
    "french", "ga_IE", "gl_ES", "it", "it_IT", "oc_FR",
    "nl", "nl_BE", "nl_NL", "pt", "pt_PT", "sv_FI",
    "wa_BE",
    0
};

static const char * const koi8_ulocales[] = {
    "uk", "uk_UA", "ru_UA", "ukrainian",
    0
};

static const char * const tis_620locales[] = {
    "th", "th_TH", "thai",
    0
};

// Disabled table, kept for reference:
// static const char * const tcvnlocales[] = {
//     "vi", "vi_VN", 0 };
755
756static bool try_locale_list(const char * const locale[], const QByteArray &lang)
757{
758 int i;
759 for(i=0; locale[i] && lang != locale[i]; i++)
760 ;
761 return locale[i] != 0;
762}
763
764// For the probably_koi8_locales we have to look. the standard says
765// these are 8859-5, but almost all Russian users use KOI8-R and
766// incorrectly set $LANG to ru_RU. We'll check tolower() to see what
767// it thinks ru_RU means.
768
769// If you read the history, it seems that many Russians blame ISO and
770// Perestroika for the confusion.
771//
772// The real bug is that some programs break if the user specifies
773// ru_RU.KOI8-R.
774
775static const char * const probably_koi8_rlocales[] = {
776 "ru", "ru_SU", "ru_RU", "russian", 0 };
777
778static QTextCodec * ru_RU_hack(const char * i) {
779#if defined(Q_OS_OS2)
780 // @todo temporary hack. the proper one is to use the current process'
781 // code page if LANG or its codepage part is missing
782 return QTextCodec::codecForName("cp866");
783#else
784 QTextCodec * ru_RU_codec = 0;
785
786#if !defined(QT_NO_SETLOCALE)
787 QByteArray origlocale(setlocale(LC_CTYPE, i));
788#else
789 QByteArray origlocale(i);
790#endif
791 // unicode koi8r latin5 name
792 // 0x044E 0xC0 0xEE CYRILLIC SMALL LETTER YU
793 // 0x042E 0xE0 0xCE CYRILLIC CAPITAL LETTER YU
794 int latin5 = tolower(0xCE);
795 int koi8r = tolower(0xE0);
796 if (koi8r == 0xC0 && latin5 != 0xEE) {
797 ru_RU_codec = QTextCodec::codecForName("KOI8-R");
798 } else if (koi8r != 0xC0 && latin5 == 0xEE) {
799 ru_RU_codec = QTextCodec::codecForName("ISO 8859-5");
800 } else {
801 // something else again... let's assume... *throws dice*
802 ru_RU_codec = QTextCodec::codecForName("KOI8-R");
803 qWarning("QTextCodec: Using KOI8-R, probe failed (%02x %02x %s)",
804 koi8r, latin5, i);
805 }
806#if !defined(QT_NO_SETLOCALE)
807 setlocale(LC_CTYPE, origlocale);
808#endif
809
810 return ru_RU_codec;
811#endif // defined(Q_OS_OS2)
812}
813
814#endif
815
816#if !defined(Q_OS_WIN32) && !defined(Q_OS_WINCE) && !defined(Q_OS_OS2)
817static QTextCodec *checkForCodec(const QByteArray &name) {
818 QTextCodec *c = QTextCodec::codecForName(name);
819 if (!c) {
820 const int index = name.indexOf('@');
821 if (index != -1) {
822 c = QTextCodec::codecForName(name.left(index));
823 }
824 }
825 return c;
826}
827#endif
828
/* the next two functions are implicitly thread safe,
   as they are only called by setup() which uses a mutex.
*/
832static void setupLocaleMapper()
833{
834#ifdef Q_OS_SYMBIAN
835 localeMapper = QSymbianTextCodec::localeMapper;
836 if (localeMapper)
837 return;
838#endif
839
840#if defined(Q_OS_WIN32) || defined(Q_OS_WINCE)
841 localeMapper = QTextCodec::codecForName("System");
842#elif defined(Q_OS_OS2)
843 localeMapper = QTextCodec::codecForName("System");
844#else
845
846#ifndef QT_NO_ICONV
847 localeMapper = QTextCodec::codecForName("System");
848#endif
849
850#if defined (_XOPEN_UNIX) && !defined(Q_OS_QNX) && !defined(Q_OS_OSF)
851 if (!localeMapper) {
852 char *charset = nl_langinfo (CODESET);
853 if (charset)
854 localeMapper = QTextCodec::codecForName(charset);
855 }
856#endif
857
858 if (!localeMapper) {
859 // Very poorly defined and followed standards causes lots of
860 // code to try to get all the cases... This logic is
861 // duplicated in QIconvCodec, so if you change it here, change
862 // it there too.
863
864 // Try to determine locale codeset from locale name assigned to
865 // LC_CTYPE category.
866
867 // First part is getting that locale name. First try setlocale() which
868 // definitely knows it, but since we cannot fully trust it, get ready
869 // to fall back to environment variables.
870#if !defined(QT_NO_SETLOCALE)
871 const QByteArray ctype = setlocale(LC_CTYPE, 0);
872#else
873 const QByteArray ctype;
874#endif
875
876 // Get the first nonempty value from $LC_ALL, $LC_CTYPE, and $LANG
877 // environment variables.
878 QByteArray lang = qgetenv("LC_ALL");
879 if (lang.isEmpty() || lang == "C") {
880 lang = qgetenv("LC_CTYPE");
881 }
882 if (lang.isEmpty() || lang == "C") {
883 lang = qgetenv("LANG");
884 }
885
886 // Now try these in order:
887 // 1. CODESET from ctype if it contains a .CODESET part (e.g. en_US.ISO8859-15)
888 // 2. CODESET from lang if it contains a .CODESET part
889 // 3. ctype (maybe the locale is named "ISO-8859-1" or something)
890 // 4. locale (ditto)
891 // 5. check for "@euro"
892 // 6. guess locale from ctype unless ctype is "C"
893 // 7. guess locale from lang
894
895 // 1. CODESET from ctype if it contains a .CODESET part (e.g. en_US.ISO8859-15)
896 int indexOfDot = ctype.indexOf('.');
897 if (indexOfDot != -1)
898 localeMapper = checkForCodec( ctype.mid(indexOfDot + 1) );
899
900 // 2. CODESET from lang if it contains a .CODESET part
901 if (!localeMapper) {
902 indexOfDot = lang.indexOf('.');
903 if (indexOfDot != -1)
904 localeMapper = checkForCodec( lang.mid(indexOfDot + 1) );
905 }
906
907 // 3. ctype (maybe the locale is named "ISO-8859-1" or something)
908 if (!localeMapper && !ctype.isEmpty() && ctype != "C")
909 localeMapper = checkForCodec(ctype);
910
911 // 4. locale (ditto)
912 if (!localeMapper && !lang.isEmpty())
913 localeMapper = checkForCodec(lang);
914
915 // 5. "@euro"
916 if ((!localeMapper && ctype.contains("@euro")) || lang.contains("@euro"))
917 localeMapper = checkForCodec("ISO 8859-15");
918
919 // 6. guess locale from ctype unless ctype is "C"
920 // 7. guess locale from lang
921 const QByteArray &try_by_name = (!ctype.isEmpty() && ctype != "C") ? lang : ctype;
922
923 // Now do the guessing.
924 if (!lang.isEmpty() && !localeMapper && !try_by_name.isEmpty()) {
925 if (try_locale_list(iso8859_15locales, lang))
926 localeMapper = QTextCodec::codecForName("ISO 8859-15");
927 else if (try_locale_list(iso8859_2locales, lang))
928 localeMapper = QTextCodec::codecForName("ISO 8859-2");
929 else if (try_locale_list(iso8859_3locales, lang))
930 localeMapper = QTextCodec::codecForName("ISO 8859-3");
931 else if (try_locale_list(iso8859_4locales, lang))
932 localeMapper = QTextCodec::codecForName("ISO 8859-4");
933 else if (try_locale_list(iso8859_5locales, lang))
934 localeMapper = QTextCodec::codecForName("ISO 8859-5");
935 else if (try_locale_list(iso8859_6locales, lang))
936 localeMapper = QTextCodec::codecForName("ISO 8859-6");
937 else if (try_locale_list(iso8859_7locales, lang))
938 localeMapper = QTextCodec::codecForName("ISO 8859-7");
939 else if (try_locale_list(iso8859_8locales, lang))
940 localeMapper = QTextCodec::codecForName("ISO 8859-8-I");
941 else if (try_locale_list(iso8859_9locales, lang))
942 localeMapper = QTextCodec::codecForName("ISO 8859-9");
943 else if (try_locale_list(iso8859_13locales, lang))
944 localeMapper = QTextCodec::codecForName("ISO 8859-13");
945 else if (try_locale_list(tis_620locales, lang))
946 localeMapper = QTextCodec::codecForName("ISO 8859-11");
947 else if (try_locale_list(koi8_ulocales, lang))
948 localeMapper = QTextCodec::codecForName("KOI8-U");
949 else if (try_locale_list(cp_1251locales, lang))
950 localeMapper = QTextCodec::codecForName("CP 1251");
951 else if (try_locale_list(pt_154locales, lang))
952 localeMapper = QTextCodec::codecForName("PT 154");
953 else if (try_locale_list(probably_koi8_rlocales, lang))
954 localeMapper = ru_RU_hack(lang);
955 }
956
957 }
958
959 // If everything failed, we default to 8859-1
960 // We could perhaps default to 8859-15.
961 if (!localeMapper)
962 localeMapper = QTextCodec::codecForName("ISO 8859-1");
963#endif
964}
965
966#ifndef QT_NO_THREAD
967Q_GLOBAL_STATIC_WITH_ARGS(QMutex, textCodecsMutex, (QMutex::Recursive));
968#endif
969
970// textCodecsMutex need to be locked to enter this function
971static void setup()
972{
973 if (all)
974 return;
975
976#ifdef Q_OS_SYMBIAN
977 // If we don't have a trap handler, we're outside of the main() function,
978 // ie. in global constructors or destructors. Don't create codecs in this
979 // case as it would lead to crashes because of a missing cleanup stack on Symbian
980 if (User::TrapHandler() == NULL)
981 return;
982#endif
983
984#ifdef Q_DEBUG_TEXTCODEC
985 if (destroying_is_ok)
986 qWarning("QTextCodec: Creating new codec during codec cleanup");
987#endif
988 all = new QList<QTextCodec*>;
989 // create the cleanup object to cleanup all codecs on exit
990 (void) createQTextCodecCleanup();
991
992#ifndef QT_NO_CODECS
993 (void)new QTsciiCodec;
994 for (int i = 0; i < 9; ++i)
995 (void)new QIsciiCodec(i);
996
997 for (int i = 0; i < QSimpleTextCodec::numSimpleCodecs; ++i)
998 (void)new QSimpleTextCodec(i);
999
1000#ifdef Q_OS_SYMBIAN
1001 localeMapper = QSymbianTextCodec::init();
1002#endif
1003
1004# if defined(Q_WS_X11) && !defined(QT_BOOTSTRAPPED)
1005 // no font codecs when bootstrapping
1006 (void)new QFontLaoCodec;
1007# if defined(QT_NO_ICONV)
1008 // no iconv(3) support, must build all codecs into the library
1009 (void)new QFontGb2312Codec;
1010 (void)new QFontGbkCodec;
1011 (void)new QFontGb18030_0Codec;
1012 (void)new QFontJis0208Codec;
1013 (void)new QFontJis0201Codec;
1014 (void)new QFontKsc5601Codec;
1015 (void)new QFontBig5hkscsCodec;
1016 (void)new QFontBig5Codec;
1017# endif // QT_NO_ICONV && !QT_BOOTSTRAPPED
1018# endif // Q_WS_X11
1019
1020
1021#ifndef Q_OS_SYMBIAN
1022# if defined(QT_NO_ICONV) && !defined(QT_BOOTSTRAPPED)
1023 // no asian codecs when bootstrapping, sorry
1024 (void)new QGb18030Codec;
1025 (void)new QGbkCodec;
1026 (void)new QGb2312Codec;
1027 (void)new QEucJpCodec;
1028 (void)new QJisCodec;
1029 (void)new QSjisCodec;
1030 (void)new QEucKrCodec;
1031 (void)new QCP949Codec;
1032 (void)new QBig5Codec;
1033 (void)new QBig5hkscsCodec;
1034# endif // QT_NO_ICONV && !QT_BOOTSTRAPPED
1035#endif //Q_OS_SYMBIAN
1036#endif // QT_NO_CODECS
1037
1038#if defined(Q_OS_WIN32) || defined(Q_OS_WINCE)
1039 (void) new QWindowsLocalCodec;
1040#endif // Q_OS_WIN32
1041
1042#if defined(Q_OS_OS2)
1043 (void) new QOs2LocalCodec;
1044#endif // Q_OS_OS2
1045
1046 (void)new QUtf16Codec;
1047 (void)new QUtf16BECodec;
1048 (void)new QUtf16LECodec;
1049 (void)new QUtf32Codec;
1050 (void)new QUtf32BECodec;
1051 (void)new QUtf32LECodec;
1052#ifndef Q_OS_SYMBIAN
1053 (void)new QLatin15Codec;
1054#endif
1055 (void)new QLatin1Codec;
1056 (void)new QUtf8Codec;
1057
1058#ifndef Q_OS_SYMBIAN
1059#if defined(Q_OS_UNIX) && !defined(QT_NO_ICONV) && !defined(QT_BOOTSTRAPPED)
1060 // QIconvCodec depends on the UTF-16 codec, so it needs to be created last
1061 (void) new QIconvCodec();
1062#endif
1063#endif
1064
1065 if (!localeMapper)
1066 setupLocaleMapper();
1067}
1068
1069/*!
1070 \enum QTextCodec::ConversionFlag
1071
1072 \value DefaultConversion No flag is set.
1073 \value ConvertInvalidToNull If this flag is set, each invalid input
1074 character is output as a null character.
1075 \value IgnoreHeader Ignore any Unicode byte-order mark and don't generate any.
1076
1077 \omitvalue FreeFunction
1078*/
1079
1080/*!
1081 \fn QTextCodec::ConverterState::ConverterState(ConversionFlags flags)
1082
1083 Constructs a ConverterState object initialized with the given \a flags.
1084*/
1085
1086/*!
1087 Destroys the ConverterState object.
1088*/
1089QTextCodec::ConverterState::~ConverterState()
1090{
1091 if (flags & FreeFunction)
1092 (QTextCodecUnalignedPointer::decode(state_data))(this);
1093 else if (d)
1094 qFree(d);
1095}
1096
1097static bool codecForLocaleSet = false;
1098void qt_resetCodecForLocale()
1099{
1100#ifndef QT_NO_THREAD
1101 QMutexLocker locker(textCodecsMutex());
1102#endif
1103 // if QTextCodec::codecForLocale() was called, we assume that the user has
1104 // explicitly set the codec he wants for the locale and don't attempt to
1105 // autodetect it again
1106 if (!codecForLocaleSet)
1107 setupLocaleMapper();
1108}
1109
1110/*!
1111 \class QTextCodec
1112 \brief The QTextCodec class provides conversions between text encodings.
1113 \reentrant
1114 \ingroup i18n
1115
1116 Qt uses Unicode to store, draw and manipulate strings. In many
1117 situations you may wish to deal with data that uses a different
1118 encoding. For example, most Japanese documents are still stored
1119 in Shift-JIS or ISO 2022-JP, while Russian users often have their
1120 documents in KOI8-R or Windows-1251.
1121
1122 Qt provides a set of QTextCodec classes to help with converting
1123 non-Unicode formats to and from Unicode. You can also create your
1124 own codec classes.
1125
1126 The supported encodings are:
1127
1128 \list
1129 \o Apple Roman
1130 \o \l{Big5 Text Codec}{Big5}
1131 \o \l{Big5-HKSCS Text Codec}{Big5-HKSCS}
1132 \o CP949
1133 \o \l{EUC-JP Text Codec}{EUC-JP}
1134 \o \l{EUC-KR Text Codec}{EUC-KR}
1135 \o \l{GBK Text Codec}{GB18030-0}
1136 \o IBM 850
1137 \o IBM 866
1138 \o IBM 874
1139 \o \l{ISO 2022-JP (JIS) Text Codec}{ISO 2022-JP}
1140 \o ISO 8859-1 to 10
1141 \o ISO 8859-13 to 16
1142 \o Iscii-Bng, Dev, Gjr, Knd, Mlm, Ori, Pnj, Tlg, and Tml
1143 \o JIS X 0201
1144 \o JIS X 0208
1145 \o KOI8-R
1146 \o KOI8-U
1147 \o MuleLao-1
1148 \o ROMAN8
1149 \o \l{Shift-JIS Text Codec}{Shift-JIS}
1150 \o TIS-620
1151 \o \l{TSCII Text Codec}{TSCII}
1152 \o UTF-8
1153 \o UTF-16
1154 \o UTF-16BE
1155 \o UTF-16LE
1156 \o UTF-32
1157 \o UTF-32BE
1158 \o UTF-32LE
1159 \o Windows-1250 to 1258
1160 \o WINSAMI2
1161 \endlist
1162
1163 QTextCodecs can be used as follows to convert some locally encoded
1164 string to Unicode. Suppose you have some string encoded in Russian
1165 KOI8-R encoding, and want to convert it to Unicode. The simple way
1166 to do it is like this:
1167
1168 \snippet doc/src/snippets/code/src_corelib_codecs_qtextcodec.cpp 0
1169
1170 After this, \c string holds the text converted to Unicode.
1171 Converting a string from Unicode to the local encoding is just as
1172 easy:
1173
1174 \snippet doc/src/snippets/code/src_corelib_codecs_qtextcodec.cpp 1
1175
1176 To read or write files in various encodings, use QTextStream and
1177 its \l{QTextStream::setCodec()}{setCodec()} function. See the
1178 \l{tools/codecs}{Codecs} example for an application of QTextCodec
1179 to file I/O.
1180
1181 Some care must be taken when trying to convert the data in chunks,
1182 for example, when receiving it over a network. In such cases it is
1183 possible that a multi-byte character will be split over two
1184 chunks. At best this might result in the loss of a character and
1185 at worst cause the entire conversion to fail.
1186
1187 The approach to use in these situations is to create a QTextDecoder
1188 object for the codec and use this QTextDecoder for the whole
1189 decoding process, as shown below:
1190
1191 \snippet doc/src/snippets/code/src_corelib_codecs_qtextcodec.cpp 2
1192
1193 The QTextDecoder object maintains state between chunks and therefore
1194 works correctly even if a multi-byte character is split between
1195 chunks.
1196
1197 \section1 Creating Your Own Codec Class
1198
1199 Support for new text encodings can be added to Qt by creating
1200 QTextCodec subclasses.
1201
1202 The pure virtual functions describe the encoder to the system and
1203 the coder is used as required in the different text file formats
1204 supported by QTextStream, and under X11, for the locale-specific
1205 character input and output.
1206
1207 To add support for another encoding to Qt, make a subclass of
1208 QTextCodec and implement the functions listed in the table below.
1209
1210 \table
1211 \header \o Function \o Description
1212
1213 \row \o name()
1214 \o Returns the official name for the encoding. If the
1215 encoding is listed in the
1216 \l{IANA character-sets encoding file}, the name
1217 should be the preferred MIME name for the encoding.
1218
1219 \row \o aliases()
1220 \o Returns a list of alternative names for the encoding.
1221 QTextCodec provides a default implementation that returns
1222 an empty list. For example, "ISO-8859-1" has "latin1",
1223 "CP819", "IBM819", and "iso-ir-100" as aliases.
1224
1225 \row \o mibEnum()
1226 \o Return the MIB enum for the encoding if it is listed in
1227 the \l{IANA character-sets encoding file}.
1228
1229 \row \o convertToUnicode()
1230 \o Converts an 8-bit character string to Unicode.
1231
1232 \row \o convertFromUnicode()
1233 \o Converts a Unicode string to an 8-bit character string.
1234 \endtable
1235
1236 You may find it more convenient to make your codec class
1237 available as a plugin; see \l{How to Create Qt Plugins} for
1238 details.
1239
1240 \sa QTextStream, QTextDecoder, QTextEncoder, {Codecs Example}
1241*/
1242
1243/*!
1244 Constructs a QTextCodec, and gives it the highest precedence. The
1245 QTextCodec should always be constructed on the heap (i.e. with \c
1246 new). Qt takes ownership and will delete it when the application
1247 terminates.
1248*/
1249QTextCodec::QTextCodec()
1250{
1251#ifndef QT_NO_THREAD
1252 QMutexLocker locker(textCodecsMutex());
1253#endif
1254 setup();
1255 all->prepend(this);
1256}
1257
1258
1259/*!
1260 \nonreentrant
1261
1262 Destroys the QTextCodec. Note that you should not delete codecs
1263 yourself: once created they become Qt's responsibility.
1264*/
1265QTextCodec::~QTextCodec()
1266{
1267#ifdef Q_DEBUG_TEXTCODEC
1268 if (!destroying_is_ok)
1269 qWarning("QTextCodec::~QTextCodec: Called by application");
1270#endif
1271 if (all) {
1272#ifndef QT_NO_THREAD
1273 QMutexLocker locker(textCodecsMutex());
1274#endif
1275 all->removeAll(this);
1276 QTextCodecCache *cache = qTextCodecCache();
1277 if (cache)
1278 cache->clear();
1279 }
1280}
1281
1282/*!
1283 \fn QTextCodec *QTextCodec::codecForName(const char *name)
1284
1285 Searches all installed QTextCodec objects and returns the one
1286 which best matches \a name; the match is case-insensitive. Returns
1287 0 if no codec matching the name \a name could be found.
1288*/
1289
1290/*!
1291 Searches all installed QTextCodec objects and returns the one
1292 which best matches \a name; the match is case-insensitive. Returns
1293 0 if no codec matching the name \a name could be found.
1294*/
1295QTextCodec *QTextCodec::codecForName(const QByteArray &name)
1296{
1297 if (name.isEmpty())
1298 return 0;
1299
1300#ifndef QT_NO_THREAD
1301 QMutexLocker locker(textCodecsMutex());
1302#endif
1303 setup();
1304
1305 if (!validCodecs())
1306 return 0;
1307
1308 QTextCodecCache *cache = qTextCodecCache();
1309 QTextCodec *codec;
1310 if (cache) {
1311 codec = cache->value(name);
1312 if (codec)
1313 return codec;
1314 }
1315
1316 for (int i = 0; i < all->size(); ++i) {
1317 QTextCodec *cursor = all->at(i);
1318 if (nameMatch(cursor->name(), name)) {
1319 if (cache)
1320 cache->insert(name, cursor);
1321 return cursor;
1322 }
1323 QList<QByteArray> aliases = cursor->aliases();
1324 for (int y = 0; y < aliases.size(); ++y)
1325 if (nameMatch(aliases.at(y), name)) {
1326 if (cache)
1327 cache->insert(name, cursor);
1328 return cursor;
1329 }
1330 }
1331
1332 codec = createForName(name);
1333 if (codec && cache)
1334 cache->insert(name, codec);
1335 return codec;
1336}
1337
1338
1339/*!
1340 Returns the QTextCodec which matches the \link
1341 QTextCodec::mibEnum() MIBenum\endlink \a mib.
1342*/
1343QTextCodec* QTextCodec::codecForMib(int mib)
1344{
1345#ifndef QT_NO_THREAD
1346 QMutexLocker locker(textCodecsMutex());
1347#endif
1348 setup();
1349
1350 if (!validCodecs())
1351 return 0;
1352
1353 QByteArray key = "MIB: " + QByteArray::number(mib);
1354 QTextCodecCache *cache = qTextCodecCache();
1355 QTextCodec *codec;
1356 if (cache)
1357 codec = cache->value(key);
1358
1359 QList<QTextCodec*>::ConstIterator i;
1360 for (int i = 0; i < all->size(); ++i) {
1361 QTextCodec *cursor = all->at(i);
1362 if (cursor->mibEnum() == mib) {
1363 if (cache)
1364 cache->insert(key, cursor);
1365 return cursor;
1366 }
1367 }
1368
1369 codec = createForMib(mib);
1370
1371 // Qt 3 used 1000 (mib for UCS2) as its identifier for the utf16 codec. Map
1372 // this correctly for compatibility.
1373 if (!codec && mib == 1000)
1374 return codecForMib(1015);
1375
1376 if (codec && cache)
1377 cache->insert(key, codec);
1378 return codec;
1379}
1380
1381/*!
1382 Returns the list of all available codecs, by name. Call
1383 QTextCodec::codecForName() to obtain the QTextCodec for the name.
1384
1385 The list may contain many mentions of the same codec
1386 if the codec has aliases.
1387
1388 \sa availableMibs(), name(), aliases()
1389*/
1390QList<QByteArray> QTextCodec::availableCodecs()
1391{
1392#ifndef QT_NO_THREAD
1393 QMutexLocker locker(textCodecsMutex());
1394#endif
1395 setup();
1396
1397 QList<QByteArray> codecs;
1398
1399 if (!validCodecs())
1400 return codecs;
1401
1402 for (int i = 0; i < all->size(); ++i) {
1403 codecs += all->at(i)->name();
1404 codecs += all->at(i)->aliases();
1405 }
1406
1407#ifndef QT_NO_THREAD
1408 locker.unlock();
1409#endif
1410
1411#if !defined(QT_NO_LIBRARY) && !defined(QT_NO_TEXTCODECPLUGIN)
1412 QFactoryLoader *l = loader();
1413 QStringList keys = l->keys();
1414 for (int i = 0; i < keys.size(); ++i) {
1415 if (!keys.at(i).startsWith(QLatin1String("MIB: "))) {
1416 QByteArray name = keys.at(i).toLatin1();
1417 if (!codecs.contains(name))
1418 codecs += name;
1419 }
1420 }
1421#endif
1422
1423 return codecs;
1424}
1425
1426/*!
1427 Returns the list of MIBs for all available codecs. Call
1428 QTextCodec::codecForMib() to obtain the QTextCodec for the MIB.
1429
1430 \sa availableCodecs(), mibEnum()
1431*/
1432QList<int> QTextCodec::availableMibs()
1433{
1434#ifndef QT_NO_THREAD
1435 QMutexLocker locker(textCodecsMutex());
1436#endif
1437 setup();
1438
1439 QList<int> codecs;
1440
1441 if (!validCodecs())
1442 return codecs;
1443
1444 for (int i = 0; i < all->size(); ++i)
1445 codecs += all->at(i)->mibEnum();
1446
1447#ifndef QT_NO_THREAD
1448 locker.unlock();
1449#endif
1450
1451#if !defined(QT_NO_LIBRARY) && !defined(QT_NO_TEXTCODECPLUGIN)
1452 QFactoryLoader *l = loader();
1453 QStringList keys = l->keys();
1454 for (int i = 0; i < keys.size(); ++i) {
1455 if (keys.at(i).startsWith(QLatin1String("MIB: "))) {
1456 int mib = keys.at(i).mid(5).toInt();
1457 if (!codecs.contains(mib))
1458 codecs += mib;
1459 }
1460 }
1461#endif
1462
1463 return codecs;
1464}
1465
1466/*!
1467 Set the codec to \a c; this will be returned by
1468 codecForLocale(). If \a c is a null pointer, the codec is reset to
1469 the default.
1470
1471 This might be needed for some applications that want to use their
1472 own mechanism for setting the locale.
1473
1474 \sa codecForLocale()
1475*/
1476void QTextCodec::setCodecForLocale(QTextCodec *c)
1477{
1478#ifndef QT_NO_THREAD
1479 QMutexLocker locker(textCodecsMutex());
1480#endif
1481 codecForLocaleSet = true;
1482 localeMapper = c;
1483 if (!localeMapper)
1484 setupLocaleMapper();
1485}
1486
1487/*!
1488 Returns a pointer to the codec most suitable for this locale.
1489
1490 On Windows, the codec will be based on a system locale. On Unix
1491 systems, starting with Qt 4.2, the codec will be using the \e
1492 iconv library. Note that in both cases the codec's name will be
1493 "System".
1494*/
1495
1496QTextCodec* QTextCodec::codecForLocale()
1497{
1498 if (!validCodecs())
1499 return 0;
1500
1501 if (localeMapper)
1502 return localeMapper;
1503
1504#ifndef QT_NO_THREAD
1505 QMutexLocker locker(textCodecsMutex());
1506#endif
1507 setup();
1508
1509 return localeMapper;
1510}
1511
1512
1513/*!
1514 \fn QByteArray QTextCodec::name() const
1515
1516 QTextCodec subclasses must reimplement this function. It returns
1517 the name of the encoding supported by the subclass.
1518
1519 If the codec is registered as a character set in the
1520 \l{IANA character-sets encoding file} this method should
1521 return the preferred mime name for the codec if defined,
1522 otherwise its name.
1523*/
1524
1525/*!
1526 \fn int QTextCodec::mibEnum() const
1527
1528 Subclasses of QTextCodec must reimplement this function. It
1529 returns the MIBenum (see \l{IANA character-sets encoding file}
1530 for more information). It is important that each QTextCodec
1531 subclass returns the correct unique value for this function.
1532*/
1533
1534/*!
1535 Subclasses can return a number of aliases for the codec in question.
1536
1537 Standard aliases for codecs can be found in the
1538 \l{IANA character-sets encoding file}.
1539*/
1540QList<QByteArray> QTextCodec::aliases() const
1541{
1542 return QList<QByteArray>();
1543}
1544
1545/*!
1546 \fn QString QTextCodec::convertToUnicode(const char *chars, int len,
1547 ConverterState *state) const
1548
1549 QTextCodec subclasses must reimplement this function.
1550
1551 Converts the first \a len characters of \a chars from the
1552 encoding of the subclass to Unicode, and returns the result in a
1553 QString.
1554
1555 \a state can be 0, in which case the conversion is stateless and
1556 default conversion rules should be used. If state is not 0, the
1557 codec should save the state after the conversion in \a state, and
1558 adjust the remainingChars and invalidChars members of the struct.
1559*/
1560
1561/*!
1562 \fn QByteArray QTextCodec::convertFromUnicode(const QChar *input, int number,
1563 ConverterState *state) const
1564
1565 QTextCodec subclasses must reimplement this function.
1566
1567 Converts the first \a number of characters from the \a input array
1568 from Unicode to the encoding of the subclass, and returns the result
1569 in a QByteArray.
1570
1571 \a state can be 0 in which case the conversion is stateless and
1572 default conversion rules should be used. If state is not 0, the
1573 codec should save the state after the conversion in \a state, and
1574 adjust the remainingChars and invalidChars members of the struct.
1575*/
1576
1577/*!
1578 Creates a QTextDecoder which stores enough state to decode chunks
1579 of \c{char *} data to create chunks of Unicode data.
1580
1581 The caller is responsible for deleting the returned object.
1582*/
1583QTextDecoder* QTextCodec::makeDecoder() const
1584{
1585 return new QTextDecoder(this);
1586}
1587
1588/*!
1589 Creates a QTextDecoder with the specified \a flags to decode chunks
1590 of \c{char *} data to create chunks of Unicode data.
1591
1592 The caller is responsible for deleting the returned object.
1593
1594 \since 4.7
1595*/
1596QTextDecoder* QTextCodec::makeDecoder(QTextCodec::ConversionFlags flags) const
1597{
1598 return new QTextDecoder(this, flags);
1599}
1600
1601
1602/*!
1603 Creates a QTextEncoder which stores enough state to encode chunks
1604 of Unicode data as \c{char *} data.
1605
1606 The caller is responsible for deleting the returned object.
1607*/
1608QTextEncoder* QTextCodec::makeEncoder() const
1609{
1610 return new QTextEncoder(this);
1611}
1612
1613/*!
1614 Creates a QTextEncoder with the specified \a flags to encode chunks
1615 of Unicode data as \c{char *} data.
1616
1617 The caller is responsible for deleting the returned object.
1618
1619 \since 4.7
1620*/
1621QTextEncoder* QTextCodec::makeEncoder(QTextCodec::ConversionFlags flags) const
1622{
1623 return new QTextEncoder(this, flags);
1624}
1625
1626/*!
1627 \fn QByteArray QTextCodec::fromUnicode(const QChar *input, int number,
1628 ConverterState *state) const
1629
1630 Converts the first \a number of characters from the \a input array
1631 from Unicode to the encoding of this codec, and returns the result
1632 in a QByteArray.
1633
1634 The \a state of the convertor used is updated.
1635*/
1636
1637/*!
1638 Converts \a str from Unicode to the encoding of this codec, and
1639 returns the result in a QByteArray.
1640*/
1641QByteArray QTextCodec::fromUnicode(const QString& str) const
1642{
1643 return convertFromUnicode(str.constData(), str.length(), 0);
1644}
1645
1646/*!
1647 \fn QString QTextCodec::toUnicode(const char *input, int size,
1648 ConverterState *state) const
1649
1650 Converts the first \a size characters from the \a input from the
1651 encoding of this codec to Unicode, and returns the result in a
1652 QString.
1653
1654 The \a state of the convertor used is updated.
1655*/
1656
1657/*!
1658 Converts \a a from the encoding of this codec to Unicode, and
1659 returns the result in a QString.
1660*/
1661QString QTextCodec::toUnicode(const QByteArray& a) const
1662{
1663 return convertToUnicode(a.constData(), a.length(), 0);
1664}
1665
1666/*!
1667 Returns true if the Unicode character \a ch can be fully encoded
1668 with this codec; otherwise returns false.
1669*/
1670bool QTextCodec::canEncode(QChar ch) const
1671{
1672 ConverterState state;
1673 state.flags = ConvertInvalidToNull;
1674 convertFromUnicode(&ch, 1, &state);
1675 return (state.invalidChars == 0);
1676}
1677
1678/*!
1679 \overload
1680
1681 \a s contains the string being tested for encode-ability.
1682*/
1683bool QTextCodec::canEncode(const QString& s) const
1684{
1685 ConverterState state;
1686 state.flags = ConvertInvalidToNull;
1687 convertFromUnicode(s.constData(), s.length(), &state);
1688 return (state.invalidChars == 0);
1689}
1690
1691#ifdef QT3_SUPPORT
1692/*!
1693 Returns a string representing the current language and
1694 sublanguage, e.g. "pt" for Portuguese, or "pt_br" for Portuguese/Brazil.
1695
1696 \sa QLocale
1697*/
1698const char *QTextCodec::locale()
1699{
1700 static char locale[6];
1701 QByteArray l = QLocale::system().name().toLatin1();
1702 int len = qMin(l.length(), 5);
1703 memcpy(locale, l.constData(), len);
1704 locale[len] = '\0';
1705
1706 return locale;
1707}
1708
1709/*!
1710 \overload
1711*/
1712
1713QByteArray QTextCodec::fromUnicode(const QString& uc, int& lenInOut) const
1714{
1715 QByteArray result = convertFromUnicode(uc.constData(), lenInOut, 0);
1716 lenInOut = result.length();
1717 return result;
1718}
1719
1720/*!
1721 \overload
1722
1723 \a a contains the source characters; \a len contains the number of
1724 characters in \a a to use.
1725*/
1726QString QTextCodec::toUnicode(const QByteArray& a, int len) const
1727{
1728 len = qMin(a.size(), len);
1729 return convertToUnicode(a.constData(), len, 0);
1730}
1731#endif
1732
1733/*!
1734 \overload
1735
1736 \a chars contains the source characters.
1737*/
1738QString QTextCodec::toUnicode(const char *chars) const
1739{
1740 int len = qstrlen(chars);
1741 return convertToUnicode(chars, len, 0);
1742}
1743
1744
1745/*!
1746 \class QTextEncoder
1747 \brief The QTextEncoder class provides a state-based encoder.
1748 \reentrant
1749 \ingroup i18n
1750
1751 A text encoder converts text from Unicode into an encoded text format
1752 using a specific codec.
1753
1754 The encoder converts Unicode into another format, remembering any
1755 state that is required between calls.
1756
1757 \sa QTextCodec::makeEncoder(), QTextDecoder
1758*/
1759
1760/*!
1761 \fn QTextEncoder::QTextEncoder(const QTextCodec *codec)
1762
1763 Constructs a text encoder for the given \a codec.
1764*/
1765
1766/*!
1767 Constructs a text encoder for the given \a codec and conversion \a flags.
1768
1769 \since 4.7
1770*/
1771QTextEncoder::QTextEncoder(const QTextCodec *codec, QTextCodec::ConversionFlags flags)
1772 : c(codec), state()
1773{
1774 state.flags = flags;
1775}
1776
1777/*!
1778 Destroys the encoder.
1779*/
1780QTextEncoder::~QTextEncoder()
1781{
1782}
1783
1784/*! \internal
1785 \since 4.5
1786 Determines whether the encoder encountered a failure while encoding the input. If
1787 an error was encountered, the produced result is undefined, and gets converted according
1788 to the conversion flags.
1789 */
1790bool QTextEncoder::hasFailure() const
1791{
1792 return state.invalidChars != 0;
1793}
1794
1795/*!
1796 Converts the Unicode string \a str into an encoded QByteArray.
1797*/
1798QByteArray QTextEncoder::fromUnicode(const QString& str)
1799{
1800 QByteArray result = c->fromUnicode(str.constData(), str.length(), &state);
1801 return result;
1802}
1803
1804/*!
1805 \overload
1806
1807 Converts \a len characters (not bytes) from \a uc, and returns the
1808 result in a QByteArray.
1809*/
1810QByteArray QTextEncoder::fromUnicode(const QChar *uc, int len)
1811{
1812 QByteArray result = c->fromUnicode(uc, len, &state);
1813 return result;
1814}
1815
1816#ifdef QT3_SUPPORT
1817/*!
1818 \overload
1819
1820 Converts \a lenInOut characters (not bytes) from \a uc, and returns the
1821 result in a QByteArray. On return, \a lenInOut is set to the length
1822 (in bytes) of the returned QByteArray.
1823*/
1824QByteArray QTextEncoder::fromUnicode(const QString& uc, int& lenInOut)
1825{
1826 QByteArray result = c->fromUnicode(uc.constData(), lenInOut, &state);
1827 lenInOut = result.length();
1828 return result;
1829}
1830#endif
1831
1832/*!
1833 \class QTextDecoder
1834 \brief The QTextDecoder class provides a state-based decoder.
1835 \reentrant
1836 \ingroup i18n
1837
1838 A text decoder converts text from an encoded text format into Unicode
1839 using a specific codec.
1840
1841 The decoder converts text in this format into Unicode, remembering any
1842 state that is required between calls.
1843
1844 \sa QTextCodec::makeDecoder(), QTextEncoder
1845*/
1846
1847/*!
1848 \fn QTextDecoder::QTextDecoder(const QTextCodec *codec)
1849
1850 Constructs a text decoder for the given \a codec.
1851*/
1852
1853/*!
1854 Constructs a text decoder for the given \a codec and conversion \a flags.
1855
1856 \since 4.7
1857*/
1858
1859QTextDecoder::QTextDecoder(const QTextCodec *codec, QTextCodec::ConversionFlags flags)
1860 : c(codec), state()
1861{
1862 state.flags = flags;
1863}
1864
1865/*!
1866 Destroys the decoder.
1867*/
1868QTextDecoder::~QTextDecoder()
1869{
1870}
1871
1872/*!
1873 \fn QString QTextDecoder::toUnicode(const char *chars, int len)
1874
1875 Converts the first \a len bytes in \a chars to Unicode, returning
1876 the result.
1877
1878 If not all characters are used (e.g. if only part of a multi-byte
1879 encoding is at the end of the characters), the decoder remembers
1880 enough state to continue with the next call to this function.
1881*/
1882QString QTextDecoder::toUnicode(const char *chars, int len)
1883{
1884 return c->toUnicode(chars, len, &state);
1885}
1886
1887
1888/*! \overload
1889
1890 The converted string is returned in \a target.
1891 */
1892void QTextDecoder::toUnicode(QString *target, const char *chars, int len)
1893{
1894 Q_ASSERT(target);
1895 switch (c->mibEnum()) {
1896 case 106: // utf8
1897 static_cast<const QUtf8Codec*>(c)->convertToUnicode(target, chars, len, &state);
1898 break;
1899 case 4: { // latin1
1900 target->resize(len);
1901 ushort *data = (ushort*)target->data();
1902 for (int i = len; i >=0; --i)
1903 data[i] = (uchar) chars[i];
1904 } break;
1905 default:
1906 *target = c->toUnicode(chars, len, &state);
1907 }
1908}
1909
1910
1911/*!
1912 \overload
1913
1914 Converts the bytes in the byte array specified by \a ba to Unicode
1915 and returns the result.
1916*/
1917QString QTextDecoder::toUnicode(const QByteArray &ba)
1918{
1919 return c->toUnicode(ba.constData(), ba.length(), &state);
1920}
1921
1922
1923/*!
1924 \fn QTextCodec* QTextCodec::codecForTr()
1925
1926 Returns the codec used by QObject::tr() on its argument. If this
1927 function returns 0 (the default), tr() assumes Latin-1.
1928
1929 \sa setCodecForTr()
1930*/
1931
1932/*!
1933 \fn void QTextCodec::setCodecForTr(QTextCodec *c)
1934 \nonreentrant
1935
1936 Sets the codec used by QObject::tr() on its argument to \a c. If
1937 \a c is 0 (the default), tr() assumes Latin-1.
1938
1939 If the literal quoted text in the program is not in the Latin-1
1940 encoding, this function can be used to set the appropriate
1941 encoding. For example, software developed by Korean programmers
1942 might use eucKR for all the text in the program, in which case the
1943 main() function might look like this:
1944
1945 \snippet doc/src/snippets/code/src_corelib_codecs_qtextcodec.cpp 3
1946
1947 Note that this is not the way to select the encoding that the \e
1948 user has chosen. For example, to convert an application containing
1949 literal English strings to Korean, all that is needed is for the
1950 English strings to be passed through tr() and for translation
1951 files to be loaded. For details of internationalization, see
1952 \l{Internationalization with Qt}.
1953
1954 \sa codecForTr(), setCodecForCStrings()
1955*/
1956
1957
1958/*!
1959 \fn QTextCodec* QTextCodec::codecForCStrings()
1960
1961 Returns the codec used by QString to convert to and from \c{const
1962 char *} and QByteArrays. If this function returns 0 (the default),
1963 QString assumes Latin-1.
1964
1965 \sa setCodecForCStrings()
1966*/
1967
1968/*!
1969 \fn void QTextCodec::setCodecForCStrings(QTextCodec *codec)
1970 \nonreentrant
1971
1972 Sets the codec used by QString to convert to and from \c{const
1973 char *} and QByteArrays. If the \a codec is 0 (the default),
1974 QString assumes Latin-1.
1975
1976 \warning Some codecs do not preserve the characters in the ASCII
1977 range (0x00 to 0x7F). For example, the Japanese Shift-JIS
1978 encoding maps the backslash character (0x5C) to the Yen
1979 character. To avoid undesirable side-effects, we recommend
1980 avoiding such codecs with setCodecForCStrings().
1981
1982 \sa codecForCStrings(), setCodecForTr()
1983*/
1984
1985/*!
1986 \since 4.4
1987
1988 Tries to detect the encoding of the provided snippet of HTML in
1989 the given byte array, \a ba, by checking the BOM (Byte Order Mark)
1990 and the content-type meta header and returns a QTextCodec instance
1991 that is capable of decoding the html to unicode. If the codec
1992 cannot be detected from the content provided, \a defaultCodec is
1993 returned.
1994
1995 \sa codecForUtfText()
1996*/
1997QTextCodec *QTextCodec::codecForHtml(const QByteArray &ba, QTextCodec *defaultCodec)
1998{
1999 // determine charset
2000 int pos;
2001 QTextCodec *c = 0;
2002
2003 c = QTextCodec::codecForUtfText(ba, c);
2004 if (!c) {
2005 QByteArray header = ba.left(512).toLower();
2006 if ((pos = header.indexOf("http-equiv=")) != -1) {
2007 if ((pos = header.lastIndexOf("meta ", pos)) != -1) {
2008 pos = header.indexOf("charset=", pos) + int(strlen("charset="));
2009 if (pos != -1) {
2010 int pos2 = header.indexOf('\"', pos+1);
2011 QByteArray cs = header.mid(pos, pos2-pos);
2012 // qDebug("found charset: %s", cs.data());
2013 c = QTextCodec::codecForName(cs);
2014 }
2015 }
2016 }
2017 }
2018 if (!c)
2019 c = defaultCodec;
2020
2021 return c;
2022}
2023
2024/*!
2025 \overload
2026
2027 Tries to detect the encoding of the provided snippet of HTML in
2028 the given byte array, \a ba, by checking the BOM (Byte Order Mark)
2029 and the content-type meta header and returns a QTextCodec instance
2030 that is capable of decoding the html to unicode. If the codec cannot
2031 be detected, this overload returns a Latin-1 QTextCodec.
2032*/
2033QTextCodec *QTextCodec::codecForHtml(const QByteArray &ba)
2034{
2035 return codecForHtml(ba, QTextCodec::codecForMib(/*Latin 1*/ 4));
2036}
2037
2038/*!
2039 \since 4.6
2040
2041 Tries to detect the encoding of the provided snippet \a ba by
2042 using the BOM (Byte Order Mark) and returns a QTextCodec instance
2043 that is capable of decoding the text to unicode. If the codec
2044 cannot be detected from the content provided, \a defaultCodec is
2045 returned.
2046
2047 \sa codecForHtml()
2048*/
2049QTextCodec *QTextCodec::codecForUtfText(const QByteArray &ba, QTextCodec *defaultCodec)
2050{
2051 const int arraySize = ba.size();
2052
2053 if (arraySize > 3) {
2054 if ((uchar)ba[0] == 0x00
2055 && (uchar)ba[1] == 0x00
2056 && (uchar)ba[2] == 0xFE
2057 && (uchar)ba[3] == 0xFF)
2058 return QTextCodec::codecForMib(1018); // utf-32 be
2059 else if ((uchar)ba[0] == 0xFF
2060 && (uchar)ba[1] == 0xFE
2061 && (uchar)ba[2] == 0x00
2062 && (uchar)ba[3] == 0x00)
2063 return QTextCodec::codecForMib(1019); // utf-32 le
2064 }
2065
2066 if (arraySize < 2)
2067 return defaultCodec;
2068 if ((uchar)ba[0] == 0xfe && (uchar)ba[1] == 0xff)
2069 return QTextCodec::codecForMib(1013); // utf16 be
2070 else if ((uchar)ba[0] == 0xff && (uchar)ba[1] == 0xfe)
2071 return QTextCodec::codecForMib(1014); // utf16 le
2072
2073 if (arraySize < 3)
2074 return defaultCodec;
2075 if ((uchar)ba[0] == 0xef
2076 && (uchar)ba[1] == 0xbb
2077 && (uchar)ba[2] == 0xbf)
2078 return QTextCodec::codecForMib(106); // utf-8
2079
2080 return defaultCodec;
2081}
2082
2083/*!
2084 \overload
2085
2086 Tries to detect the encoding of the provided snippet \a ba by
2087 using the BOM (Byte Order Mark) and returns a QTextCodec instance
2088 that is capable of decoding the text to unicode. If the codec
2089 cannot be detected, this overload returns a Latin-1 QTextCodec.
2090
2091 \sa codecForHtml()
2092*/
2093QTextCodec *QTextCodec::codecForUtfText(const QByteArray &ba)
2094{
2095 return codecForUtfText(ba, QTextCodec::codecForMib(/*Latin 1*/ 4));
2096}
2097
2098
2099/*! \internal
2100 \since 4.3
2101 Determines whether the decoder encountered a failure while decoding the input. If
2102 an error was encountered, the produced result is undefined, and gets converted according
2103 to the conversion flags.
2104 */
2105bool QTextDecoder::hasFailure() const
2106{
2107 return state.invalidChars != 0;
2108}
2109
2110/*!
2111 \fn QTextCodec *QTextCodec::codecForContent(const char *str, int size)
2112
2113 This functionality is no longer provided by Qt. This
2114 compatibility function always returns a null pointer.
2115*/
2116
2117/*!
2118 \fn QTextCodec *QTextCodec::codecForName(const char *hint, int accuracy)
2119
2120 Use the codecForName(const QByteArray &) overload instead.
2121*/
2122
2123/*!
2124 \fn QTextCodec *QTextCodec::codecForIndex(int i)
2125
2126 Use availableCodecs() or availableMibs() instead and iterate
2127 through the resulting list.
2128*/
2129
2130
2131/*!
2132 \fn QByteArray QTextCodec::mimeName() const
2133
2134 Use name() instead.
2135*/
2136
2137QT_END_NAMESPACE
2138
2139#endif // QT_NO_TEXTCODEC
Note: See TracBrowser for help on using the repository browser.