source: trunk/src/corelib/codecs/qtextcodec.cpp@ 478

Last change on this file since 478 was 359, checked in by Dmitry A. Kuminov, 16 years ago

corelib/codecs: Fixed a nasty typo that caused a LIBC panic ("Tried to free block twice") when using QTextStream.

File size: 53.9 KB
Line 
1/****************************************************************************
2**
3** Copyright (C) 2009 Nokia Corporation and/or its subsidiary(-ies).
4** Contact: Qt Software Information ([email protected])
5**
6** This file is part of the QtCore module of the Qt Toolkit.
7**
8** $QT_BEGIN_LICENSE:LGPL$
9** Commercial Usage
10** Licensees holding valid Qt Commercial licenses may use this file in
11** accordance with the Qt Commercial License Agreement provided with the
12** Software or, alternatively, in accordance with the terms contained in
13** a written agreement between you and Nokia.
14**
15** GNU Lesser General Public License Usage
16** Alternatively, this file may be used under the terms of the GNU Lesser
17** General Public License version 2.1 as published by the Free Software
18** Foundation and appearing in the file LICENSE.LGPL included in the
19** packaging of this file. Please review the following information to
20** ensure the GNU Lesser General Public License version 2.1 requirements
21** will be met: http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html.
22**
23** In addition, as a special exception, Nokia gives you certain
24** additional rights. These rights are described in the Nokia Qt LGPL
25** Exception version 1.0, included in the file LGPL_EXCEPTION.txt in this
26** package.
27**
28** GNU General Public License Usage
29** Alternatively, this file may be used under the terms of the GNU
30** General Public License version 3.0 as published by the Free Software
31** Foundation and appearing in the file LICENSE.GPL included in the
32** packaging of this file. Please review the following information to
33** ensure the GNU General Public License version 3.0 requirements will be
34** met: http://www.gnu.org/copyleft/gpl.html.
35**
36** If you are unsure which license is appropriate for your use, please
37** contact the sales department at [email protected].
38** $QT_END_LICENSE$
39**
40****************************************************************************/
41
42#include "qplatformdefs.h"
43#include "qtextcodec.h"
44#include "qtextcodec_p.h"
45
46#ifndef QT_NO_TEXTCODEC
47
48#include "qlist.h"
49#include "qfile.h"
50#ifndef QT_NO_LIBRARY
51# include "qcoreapplication.h"
52# include "qtextcodecplugin.h"
53# include "private/qfactoryloader_p.h"
54#endif
55#include "qstringlist.h"
56
57#ifdef Q_OS_UNIX
58# include "qiconvcodec_p.h"
59#endif
60
61#if defined(Q_OS_OS2)
62# include <unidef.h>
63# include <uconv.h>
64# include "qvector.h"
65#endif
66
67#include "qutfcodec_p.h"
68#include "qsimplecodec_p.h"
69#include "qlatincodec_p.h"
70#ifndef QT_NO_CODECS
71# include "qtsciicodec_p.h"
72# include "qisciicodec_p.h"
73# if defined(QT_NO_ICONV) && !defined(QT_BOOTSTRAPPED)
74// no iconv(3) support, must build all codecs into the library
75# include "../../plugins/codecs/cn/qgb18030codec.h"
76# include "../../plugins/codecs/jp/qeucjpcodec.h"
77# include "../../plugins/codecs/jp/qjiscodec.h"
78# include "../../plugins/codecs/jp/qsjiscodec.h"
79# include "../../plugins/codecs/kr/qeuckrcodec.h"
80# include "../../plugins/codecs/tw/qbig5codec.h"
81# endif // QT_NO_ICONV
82# if defined(Q_WS_X11) && !defined(QT_BOOTSTRAPPED)
83# include "qfontlaocodec_p.h"
84# include "../../plugins/codecs/jp/qfontjpcodec.h"
85# endif
86#endif // QT_NO_CODECS
87#include "qlocale.h"
88#include "private/qmutexpool_p.h"
89
90#include <stdlib.h>
91#include <ctype.h>
92#include <locale.h>
93#if defined (_XOPEN_UNIX) && !defined(Q_OS_QNX6) && !defined(Q_OS_OSF)
94#include <langinfo.h>
95#endif
96
97#if defined(Q_OS_WINCE)
98# define QT_NO_SETLOCALE
99#endif
100
101QT_BEGIN_NAMESPACE
102
103#ifndef QT_NO_TEXTCODECPLUGIN
104Q_GLOBAL_STATIC_WITH_ARGS(QFactoryLoader, loader,
105 (QTextCodecFactoryInterface_iid, QLatin1String("/codecs")))
106#endif
107
108
109static bool nameMatch(const QByteArray &name, const QByteArray &test)
110{
111 // if they're the same, return a perfect score
112 if (qstricmp(name, test) == 0)
113 return true;
114
115 const char *n = name.constData();
116 const char *h = test.constData();
117
118 // if the letters and numbers are the same, we have a match
119 while (*n != '\0') {
120 if (isalnum((uchar)*n)) {
121 for (;;) {
122 if (*h == '\0')
123 return false;
124 if (isalnum((uchar)*h))
125 break;
126 ++h;
127 }
128 if (tolower((uchar)*n) != tolower((uchar)*h))
129 return false;
130 ++h;
131 }
132 ++n;
133 }
134 while (*h && !isalnum((uchar)*h))
135 ++h;
136 return (*h == '\0');
137}
138
139
140static QTextCodec *createForName(const QByteArray &name)
141{
142#ifndef QT_NO_TEXTCODECPLUGIN
143 QFactoryLoader *l = loader();
144 QStringList keys = l->keys();
145 for (int i = 0; i < keys.size(); ++i) {
146 if (nameMatch(name, keys.at(i).toLatin1())) {
147 QString realName = keys.at(i);
148 if (QTextCodecFactoryInterface *factory
149 = qobject_cast<QTextCodecFactoryInterface*>(l->instance(realName))) {
150 return factory->create(realName);
151 }
152 }
153 }
154#else
155 Q_UNUSED(name);
156#endif
157 return 0;
158}
159
160static QTextCodec *createForMib(int mib)
161{
162#ifndef QT_NO_TEXTCODECPLUGIN
163 QString name = QLatin1String("MIB: ") + QString::number(mib);
164 if (QTextCodecFactoryInterface *factory
165 = qobject_cast<QTextCodecFactoryInterface*>(loader()->instance(name)))
166 return factory->create(name);
167#else
168 Q_UNUSED(mib);
169#endif
170 return 0;
171}
172
// List of codec objects. It is allocated by setup() and its contents are
// deleted (and the pointer reset to 0) by ~QTextCodecCleanup().
static QList<QTextCodec*> *all = 0;
// True only while ~QTextCodecCleanup() is deleting the listed codecs.
static bool destroying_is_ok = false;

// Codec for the local 8-bit encoding, chosen by setupLocaleMapper().
static QTextCodec *localeMapper = 0;
// NOTE(review): presumably the codec used for tr() -- not referenced in this
// part of the file, confirm against the rest of the class.
QTextCodec *QTextCodec::cftr = 0;
178
179
// Helper whose only job is its destructor, which deletes the codecs held in
// the file-scope list 'all'.
class QTextCodecCleanup
{
public:
    ~QTextCodecCleanup();
};
185
186/*
187 Deletes all the created codecs. This destructor is called just
188 before exiting to delete any QTextCodec objects that may be lying
189 around.
190*/
191QTextCodecCleanup::~QTextCodecCleanup()
192{
193 if (!all)
194 return;
195
196 destroying_is_ok = true;
197
198 while (all->size())
199 delete all->takeFirst();
200 delete all;
201 all = 0;
202 localeMapper = 0;
203
204 destroying_is_ok = false;
205}
206
207Q_GLOBAL_STATIC(QTextCodecCleanup, createQTextCodecCleanup)
208
209#if defined(Q_OS_WIN32) || defined(Q_OS_WINCE)
// Codec for the Windows ANSI code page (CP_ACP); it reports the name
// "System" and MIB enum 0.
class QWindowsLocalCodec: public QTextCodec
{
public:
    QWindowsLocalCodec();
    ~QWindowsLocalCodec();

    QString convertToUnicode(const char *, int, ConverterState *) const;
    QByteArray convertFromUnicode(const QChar *, int, ConverterState *) const;
    // Fallback used by convertToUnicode() when converting the whole buffer at
    // once fails; converts one multi-byte character at a time.
    QString convertToUnicodeCharByChar(const char *chars, int length, ConverterState *state) const;

    QByteArray name() const;
    int mibEnum() const;

};
224
// Nothing to set up: the codec has no members of its own.
QWindowsLocalCodec::QWindowsLocalCodec()
{
}

// Nothing to release.
QWindowsLocalCodec::~QWindowsLocalCodec()
{
}
232
233QString QWindowsLocalCodec::convertToUnicode(const char *chars, int length, ConverterState *state) const
234{
235 const char *mb = chars;
236 int mblen = length;
237
238 if (!mb || !mblen)
239 return QString();
240
241 const int wclen_auto = 4096;
242 WCHAR wc_auto[wclen_auto];
243 int wclen = wclen_auto;
244 WCHAR *wc = wc_auto;
245 int len;
246 QString sp;
247 bool prepend = false;
248 char state_data = 0;
249 int remainingChars = 0;
250
251 //save the current state information
252 if (state) {
253 state_data = (char)state->state_data[0];
254 remainingChars = state->remainingChars;
255 }
256
257 //convert the pending charcter (if available)
258 if (state && remainingChars) {
259 char prev[3] = {0};
260 prev[0] = state_data;
261 prev[1] = mb[0];
262 remainingChars = 0;
263 len = MultiByteToWideChar(CP_ACP, MB_PRECOMPOSED,
264 prev, 2, wc, wclen);
265 if (len) {
266 prepend = true;
267 sp.append(QChar(wc[0]));
268 mb++;
269 mblen--;
270 wc[0] = 0;
271 }
272 }
273
274 while (!(len=MultiByteToWideChar(CP_ACP, MB_PRECOMPOSED|MB_ERR_INVALID_CHARS,
275 mb, mblen, wc, wclen))) {
276 int r = GetLastError();
277 if (r == ERROR_INSUFFICIENT_BUFFER) {
278 if (wc != wc_auto) {
279 qWarning("MultiByteToWideChar: Size changed");
280 break;
281 } else {
282 wclen = MultiByteToWideChar(CP_ACP, MB_PRECOMPOSED,
283 mb, mblen, 0, 0);
284 wc = new WCHAR[wclen];
285 // and try again...
286 }
287 } else if (r == ERROR_NO_UNICODE_TRANSLATION) {
288 //find the last non NULL character
289 while (mblen > 1 && !(mb[mblen-1]))
290 mblen--;
291 //check whether, we hit an invalid character in the middle
292 if ((mblen <= 1) || (remainingChars && state_data))
293 return convertToUnicodeCharByChar(chars, length, state);
294 //Remove the last character and try again...
295 state_data = mb[mblen-1];
296 remainingChars = 1;
297 mblen--;
298 } else {
299 // Fail.
300 qWarning("MultiByteToWideChar: Cannot convert multibyte text");
301 break;
302 }
303 }
304 if (len <= 0)
305 return QString();
306 if (wc[len-1] == 0) // len - 1: we don't want terminator
307 --len;
308
309 //save the new state information
310 if (state) {
311 state->state_data[0] = (char)state_data;
312 state->remainingChars = remainingChars;
313 }
314 QString s((QChar*)wc, len);
315 if (wc != wc_auto)
316 delete [] wc;
317 if (prepend) {
318 return sp+s;
319 }
320 return s;
321}
322
323QString QWindowsLocalCodec::convertToUnicodeCharByChar(const char *chars, int length, ConverterState *state) const
324{
325 if (!chars || !length)
326 return QString();
327
328 int copyLocation = 0;
329 int extra = 2;
330 if (state && state->remainingChars) {
331 copyLocation = state->remainingChars;
332 extra += copyLocation;
333 }
334 int newLength = length + extra;
335 char *mbcs = new char[newLength];
336 //ensure that we have a NULL terminated string
337 mbcs[newLength-1] = 0;
338 mbcs[newLength-2] = 0;
339 memcpy(&(mbcs[copyLocation]), chars, length);
340 if (copyLocation) {
341 //copy the last character from the state
342 mbcs[0] = (char)state->state_data[0];
343 state->remainingChars = 0;
344 }
345 const char *mb = mbcs;
346#ifndef Q_OS_WINCE
347 const char *next = 0;
348 QString s;
349 while((next = CharNextExA(CP_ACP, mb, 0)) != mb) {
350 WCHAR wc[2] ={0};
351 int charlength = next - mb;
352 int len = MultiByteToWideChar(CP_ACP, MB_PRECOMPOSED|MB_ERR_INVALID_CHARS, mb, charlength, wc, 2);
353 if (len>0) {
354 s.append(QChar(wc[0]));
355 } else {
356 int r = GetLastError();
357 //check if the character being dropped is the last character
358 if (r == ERROR_NO_UNICODE_TRANSLATION && mb == (mbcs+newLength -3) && state) {
359 state->remainingChars = 1;
360 state->state_data[0] = (char)*mb;
361 }
362 }
363 mb = next;
364 }
365#else
366 QString s;
367 int size = mbstowcs(NULL, mb, length);
368 if (size < 0) {
369 Q_ASSERT("Error in CE TextCodec");
370 return QString();
371 }
372 wchar_t* ws = new wchar_t[size + 2];
373 ws[size +1] = 0;
374 ws[size] = 0;
375 size = mbstowcs(ws, mb, length);
376 for (int i=0; i< size; i++)
377 s.append(QChar(ws[i]));
378 delete [] ws;
379#endif
380 delete mbcs;
381 return s;
382}
383
// Converts len UTF-16 code units starting at uc by delegating to
// qt_winQString2MB(); the converter state is ignored.
QByteArray QWindowsLocalCodec::convertFromUnicode(const QChar *uc, int len, ConverterState *) const
{
    return qt_winQString2MB(uc, len);
}


// The codec is registered under the name "System".
QByteArray QWindowsLocalCodec::name() const
{
    return "System";
}

// The codec reports MIB enum 0.
int QWindowsLocalCodec::mibEnum() const
{
    return 0;
}
399
400#elif defined(Q_OS_OS2)
401
// Codec for the OS/2 process code page, implemented on top of the Unicode
// conversion (Uconv) API; it reports the name "System" and MIB enum 0.
class QOs2LocalCodec: public QTextCodec
{
public:
    QOs2LocalCodec();
    ~QOs2LocalCodec();

    QString convertToUnicode(const char *, int, ConverterState *) const;
    QByteArray convertFromUnicode(const QChar *, int, ConverterState *) const;

    QByteArray name() const;
    int mibEnum() const;

private:
    // conversion object that replaces invalid characters with '?'
    UconvObject uoSubYes;
    // conversion object that performs no substitution
    UconvObject uoSubNo;
};
418
// Creates the two conversion objects used by the codec. The return codes of
// UniCreateUconvObject() are not checked; failures are only caught by the
// assertions below.
QOs2LocalCodec::QOs2LocalCodec() : uoSubYes(0), uoSubNo(0)
{
    // create the conversion object for the process code page that performs
    // substitution of invalid characters with '?'
    UniCreateUconvObject((UniChar *)L"@sub=yes,subchar=\\x3F,subuni=\\x003F",
                         &uoSubYes);
    Q_ASSERT(uoSubYes);

    // same as above but doesn't perform substitution
    UniCreateUconvObject((UniChar *)L"@sub=no", &uoSubNo);
    Q_ASSERT(uoSubNo);
}

// Releases both conversion objects created by the constructor.
QOs2LocalCodec::~QOs2LocalCodec()
{
    UniFreeUconvObject(uoSubNo);
    UniFreeUconvObject(uoSubYes);
}
437
438static void qOs2LocalCodecStateFree(QTextCodec::ConverterState *state)
439{
440 delete reinterpret_cast<char *>(state->d);
441}
442
443QString QOs2LocalCodec::convertToUnicode(const char *chars, int length,
444 ConverterState *state) const
445{
446 QString res;
447
448 if (!chars)
449 return res;
450 if (!length)
451 return QLatin1String("");
452
453 UconvObject uo = uoSubYes;
454 if (state && (state->flags & ConvertInvalidToNull))
455 uo = uoSubNo;
456
457 int remainingChars = 0;
458 char *remainingBuffer = 0;
459
460 if (state) {
461 // stateful conversion
462 remainingBuffer = reinterpret_cast<char *>(state->d);
463 if (remainingBuffer) {
464 // restore state
465 remainingChars = state->remainingChars;
466 } else {
467 // first time, add the destructor for state->d
468 state->flags |= FreeFunction;
469 QTextCodecUnalignedPointer::encode(state->state_data,
470 qOs2LocalCodecStateFree);
471 }
472 }
473
474 const char *mbPtr = chars;
475 size_t mbLeft = length;
476
477 QByteArray mbExtra;
478 if (remainingChars) {
479 // we have to prepend the remaining bytes from the previous conversion
480 mbLeft += remainingChars;
481 mbExtra.resize(mbLeft);
482 mbPtr = mbExtra.data();
483
484 memcpy(mbExtra.data(), remainingBuffer, remainingChars);
485 memcpy(mbExtra.data() + remainingChars, chars, length);
486
487 remainingBuffer = 0;
488 remainingChars = 0;
489 }
490
491 size_t ucLen = mbLeft;
492 QString ucBuf(ucLen, QLatin1Char('\0'));
493 UniChar *ucPtr = reinterpret_cast<UniChar *>(ucBuf.data());
494 size_t ucLeft = ucLen;
495
496 size_t nonIdent = 0;
497 int rc;
498
499 while (mbLeft) {
500 rc = UniUconvToUcs(uo, (void**)&mbPtr, &mbLeft, &ucPtr, &ucLeft,
501 &nonIdent);
502 if (rc == ULS_BUFFERFULL) {
503 size_t ucDone = ucLen - ucLeft;
504 size_t mbDone = length - mbLeft;
505 // assume that mbLeft/ucLeft is an approximation of mbDone/ucDone
506 ucLen = ucDone + (mbLeft * ucDone) / mbDone;
507 ucBuf.resize(ucLen);
508 ucPtr = reinterpret_cast<UniChar *>(ucBuf.data() + ucDone);
509 } else if (rc == ULS_ILLEGALSEQUENCE && state) {
510 // conversion stopped because the remaining inBytesLeft make up
511 // an incomplete multi-byte sequence; save them for later
512 remainingBuffer = new char[mbLeft];
513 memcpy(remainingBuffer, mbPtr, mbLeft);
514 remainingChars = mbLeft;
515 break;
516 } else if (rc != ULS_SUCCESS) {
517 // just fail on an unexpected error (will return what we've got)
518 qWarning("QOs2LocalCodec::convertToUnicode: UniUconvToUcs failed "
519 "with %d", rc);
520 break;
521 }
522 }
523
524 ucBuf.resize(ucLen - ucLeft);
525 res = ucBuf;
526
527 if (state) {
528 // update the state
529 state->invalidChars = nonIdent;
530 state->remainingChars = remainingChars;
531 state->d = remainingBuffer;
532 }
533
534 return res;
535}
536
537QByteArray QOs2LocalCodec::convertFromUnicode(const QChar *uchars, int length,
538 ConverterState *state) const
539{
540 QByteArray res;
541
542 if (!uchars)
543 return res;
544 if (!length)
545 return QByteArray("");
546
547 UconvObject uo = uoSubYes;
548 if (state && (state->flags & ConvertInvalidToNull))
549 uo = uoSubNo;
550
551 const UniChar *ucPtr = reinterpret_cast<const UniChar *>(uchars);
552 size_t ucLeft = length;
553
554 QVector<QChar> ucExtra;
555 if (state && state->remainingChars) {
556 // we have one surrogate char to be prepended
557 Q_ASSERT(state->remainingChars == 1);
558 ucLeft += 1;
559 ucExtra.resize(ucLeft);
560 ucPtr = reinterpret_cast<const UniChar *>(ucExtra.data());
561
562 ucExtra[0] = state->state_data[0];
563 memcpy(ucExtra.data() + 1, uchars, length * sizeof(QChar));
564
565 state->remainingChars = 0;
566 }
567
568 // be optimistic (imply that one byte is necessary per every Unicode char)
569 size_t mbLen = length;
570 QByteArray mbBuf(mbLen, '\0');
571 char *mbPtr = mbBuf.data();
572 size_t mbLeft = mbLen;
573
574 size_t nonIdent = 0;
575 int rc;
576
577 while (ucLeft) {
578 rc = UniUconvFromUcs(uo, const_cast<UniChar **>(&ucPtr), &ucLeft,
579 (void**)&mbPtr, &mbLeft, &nonIdent);
580 if (rc == ULS_BUFFERFULL) {
581 size_t mbDone = mbLen - mbLeft;
582 size_t ucDone = length - ucLeft;
583 size_t newLen = mbLen;
584 if (ucDone) {
585 // assume that ucLeft/mbLeft is an approximation of ucDone/mbDone
586 newLen = mbDone + (ucLeft * mbDone) / ucDone;
587 }
588 if (newLen == mbLen) {
589 // could not process a single Unicode char, double the size
590 mbLen *= 2;
591 } else {
592 mbLen = newLen;
593 }
594 mbBuf.resize(mbLen);
595 mbPtr = mbBuf.data() + mbDone;
596 mbLeft = mbLen - mbDone;
597 } else if (rc == ULS_ILLEGALSEQUENCE && state) {
598 // buffer ends in a surrogate
599 Q_ASSERT(ucLeft == 2);
600 state->state_data[0] = *ucPtr;
601 state->remainingChars = 1;
602 break;
603 } else if (rc != ULS_SUCCESS) {
604 // just fail on an unexpected error (will return what we've got)
605 qWarning("QOs2LocalCodec::convertFromUnicode: UniUconvFromUcs failed "
606 "with %d", rc);
607 break;
608 }
609 }
610
611 mbBuf.resize(mbLen - mbLeft);
612 res = mbBuf;
613
614 if (state) {
615 // update the state
616 state->invalidChars = nonIdent;
617 }
618
619 return res;
620}
621
// The codec is registered under the name "System".
QByteArray QOs2LocalCodec::name() const
{
    return "System";
}

// The codec reports MIB enum 0.
int QOs2LocalCodec::mibEnum() const
{
    return 0;
}
631
632#else
633
/* locale names mostly copied from XFree86 */

// Each table below is a 0-terminated list of locale names. The tables are
// searched with try_locale_list() by setupLocaleMapper() in order to guess
// the codec when the locale name carries no usable codeset.

// locales mapped to "ISO 8859-2"
static const char * const iso8859_2locales[] = {
    "croatian", "cs", "cs_CS", "cs_CZ","cz", "cz_CZ", "czech", "hr",
    "hr_HR", "hu", "hu_HU", "hungarian", "pl", "pl_PL", "polish", "ro",
    "ro_RO", "rumanian", "serbocroatian", "sh", "sh_SP", "sh_YU", "sk",
    "sk_SK", "sl", "sl_CS", "sl_SI", "slovak", "slovene", "sr_SP", 0 };

// locales mapped to "ISO 8859-3"
static const char * const iso8859_3locales[] = {
    "eo", 0 };

// locales mapped to "ISO 8859-4"
static const char * const iso8859_4locales[] = {
    "ee", "ee_EE", 0 };

// locales mapped to "ISO 8859-5"
static const char * const iso8859_5locales[] = {
    "mk", "mk_MK", "sp", "sp_YU", 0 };

// locales mapped to "CP 1251"
static const char * const cp_1251locales[] = {
    "be", "be_BY", "bg", "bg_BG", "bulgarian", 0 };

// locales mapped to "PT 154"
static const char * const pt_154locales[] = {
    "ba_RU", "ky", "ky_KG", "kk", "kk_KZ", 0 };

// locales mapped to "ISO 8859-6"
static const char * const iso8859_6locales[] = {
    "ar_AA", "ar_SA", "arabic", 0 };

// locales mapped to "ISO 8859-7"
static const char * const iso8859_7locales[] = {
    "el", "el_GR", "greek", 0 };

// locales mapped to "ISO 8859-8-I"
static const char * const iso8859_8locales[] = {
    "hebrew", "he", "he_IL", "iw", "iw_IL", 0 };

// locales mapped to "ISO 8859-9"
static const char * const iso8859_9locales[] = {
    "tr", "tr_TR", "turkish", 0 };

// locales mapped to "ISO 8859-13"
static const char * const iso8859_13locales[] = {
    "lt", "lt_LT", "lv", "lv_LV", 0 };

// locales mapped to "ISO 8859-15"
static const char * const iso8859_15locales[] = {
    "et", "et_EE",
    // Euro countries
    "br_FR", "ca_ES", "de", "de_AT", "de_BE", "de_DE", "de_LU", "en_IE",
    "es", "es_ES", "eu_ES", "fi", "fi_FI", "finnish", "fr", "fr_FR",
    "fr_BE", "fr_LU", "french", "ga_IE", "gl_ES", "it", "it_IT", "oc_FR",
    "nl", "nl_BE", "nl_NL", "pt", "pt_PT", "sv_FI", "wa_BE",
    0 };

// locales mapped to "KOI8-U"
static const char * const koi8_ulocales[] = {
    "uk", "uk_UA", "ru_UA", "ukrainian", 0 };

// locales mapped to "ISO 8859-11" (looked up under that name by
// setupLocaleMapper(), despite the table's name)
static const char * const tis_620locales[] = {
    "th", "th_TH", "thai", 0 };

// static const char * const tcvnlocales[] = {
//     "vi", "vi_VN", 0 };
688
/*
    Searches the 0-terminated table \a locale for the locale name \a lang.

    Returns true as soon as an entry equal to \a lang is found. Note that an
    empty entry ("") also ends the search with a true result, since only the
    terminating 0 counts as "not found".
*/
static bool try_locale_list(const char * const locale[], const char * lang)
{
    for (const char * const *entry = locale; *entry; ++entry) {
        if (**entry == '\0' || strcmp(*entry, lang) == 0)
            return true;
    }
    return false;
}
696
// For the probably_koi8_rlocales we have to look. The standard says
// these are 8859-5, but almost all Russian users use KOI8-R and
// incorrectly set $LANG to ru_RU. We'll check tolower() to see what
// it thinks ru_RU means.

// If you read the history, it seems that many Russians blame ISO and
// Perestroika for the confusion.
//
// The real bug is that some programs break if the user specifies
// ru_RU.KOI8-R.
707
708static const char * const probably_koi8_rlocales[] = {
709 "ru", "ru_SU", "ru_RU", "russian", 0 };
710
711static QTextCodec * ru_RU_hack(const char * i) {
712#if defined(Q_OS_OS2)
713 // @todo temporary hack. the proper one is to use the current process'
714 // code page if LANG or its codepage part is missing
715 return QTextCodec::codecForName("cp866");
716#else
717 QTextCodec * ru_RU_codec = 0;
718
719#if !defined(QT_NO_SETLOCALE)
720 QByteArray origlocale(setlocale(LC_CTYPE, i));
721#else
722 QByteArray origlocale(i);
723#endif
724 // unicode koi8r latin5 name
725 // 0x044E 0xC0 0xEE CYRILLIC SMALL LETTER YU
726 // 0x042E 0xE0 0xCE CYRILLIC CAPITAL LETTER YU
727 int latin5 = tolower(0xCE);
728 int koi8r = tolower(0xE0);
729 if (koi8r == 0xC0 && latin5 != 0xEE) {
730 ru_RU_codec = QTextCodec::codecForName("KOI8-R");
731 } else if (koi8r != 0xC0 && latin5 == 0xEE) {
732 ru_RU_codec = QTextCodec::codecForName("ISO 8859-5");
733 } else {
734 // something else again... let's assume... *throws dice*
735 ru_RU_codec = QTextCodec::codecForName("KOI8-R");
736 qWarning("QTextCodec: Using KOI8-R, probe failed (%02x %02x %s)",
737 koi8r, latin5, i);
738 }
739#if !defined(QT_NO_SETLOCALE)
740 setlocale(LC_CTYPE, origlocale);
741#endif
742
743 return ru_RU_codec;
744#endif // defined(Q_OS_OS2)
745}
746
747#endif
748
749#if !defined(Q_OS_WIN32) && !defined(Q_OS_WINCE) && !defined(Q_OS_OS2)
750static QTextCodec *checkForCodec(const char *name) {
751 QTextCodec *c = QTextCodec::codecForName(name);
752 if (!c) {
753 const char *at = strchr(name, '@');
754 if (at) {
755 QByteArray n(name, at - name);
756 c = QTextCodec::codecForName(n.data());
757 }
758 }
759 return c;
760}
761#endif
762
/* The next two functions are implicitly thread-safe,
   as they are only called by setup(), which uses a mutex.
*/
766static void setupLocaleMapper()
767{
768#if defined(Q_OS_WIN32) || defined(Q_OS_WINCE)
769 localeMapper = QTextCodec::codecForName("System");
770#elif defined(Q_OS_OS2)
771 localeMapper = QTextCodec::codecForName("System");
772#else
773
774#ifndef QT_NO_ICONV
775 localeMapper = QTextCodec::codecForName("System");
776#endif
777
778#if defined (_XOPEN_UNIX) && !defined(Q_OS_QNX6) && !defined(Q_OS_OSF)
779 if (!localeMapper) {
780 char *charset = nl_langinfo (CODESET);
781 if (charset)
782 localeMapper = QTextCodec::codecForName(charset);
783 }
784#endif
785
786 if (!localeMapper) {
787 // Very poorly defined and followed standards causes lots of
788 // code to try to get all the cases... This logic is
789 // duplicated in QIconvCodec, so if you change it here, change
790 // it there too.
791
792 // Try to determine locale codeset from locale name assigned to
793 // LC_CTYPE category.
794
795 // First part is getting that locale name. First try setlocale() which
796 // definitely knows it, but since we cannot fully trust it, get ready
797 // to fall back to environment variables.
798#if !defined(QT_NO_SETLOCALE)
799 char * ctype = qstrdup(setlocale(LC_CTYPE, 0));
800#else
801 char * ctype = qstrdup("");
802#endif
803
804 // Get the first nonempty value from $LC_ALL, $LC_CTYPE, and $LANG
805 // environment variables.
806 char * lang = qstrdup(qgetenv("LC_ALL").constData());
807 if (!lang || lang[0] == 0 || strcmp(lang, "C") == 0) {
808 if (lang) delete [] lang;
809 lang = qstrdup(qgetenv("LC_CTYPE").constData());
810 }
811 if (!lang || lang[0] == 0 || strcmp(lang, "C") == 0) {
812 if (lang) delete [] lang;
813 lang = qstrdup(qgetenv("LANG").constData());
814 }
815
816 // Now try these in order:
817 // 1. CODESET from ctype if it contains a .CODESET part (e.g. en_US.ISO8859-15)
818 // 2. CODESET from lang if it contains a .CODESET part
819 // 3. ctype (maybe the locale is named "ISO-8859-1" or something)
820 // 4. locale (ditto)
821 // 5. check for "@euro"
822 // 6. guess locale from ctype unless ctype is "C"
823 // 7. guess locale from lang
824
825 // 1. CODESET from ctype if it contains a .CODESET part (e.g. en_US.ISO8859-15)
826 char * codeset = ctype ? strchr(ctype, '.') : 0;
827 if (codeset && *codeset == '.')
828 localeMapper = checkForCodec(codeset + 1);
829
830 // 2. CODESET from lang if it contains a .CODESET part
831 codeset = lang ? strchr(lang, '.') : 0;
832 if (!localeMapper && codeset && *codeset == '.')
833 localeMapper = checkForCodec(codeset + 1);
834
835 // 3. ctype (maybe the locale is named "ISO-8859-1" or something)
836 if (!localeMapper && ctype && *ctype != 0 && strcmp (ctype, "C") != 0)
837 localeMapper = checkForCodec(ctype);
838
839 // 4. locale (ditto)
840 if (!localeMapper && lang && *lang != 0)
841 localeMapper = checkForCodec(lang);
842
843 // 5. "@euro"
844 if ((!localeMapper && ctype && strstr(ctype, "@euro")) || (lang && strstr(lang, "@euro")))
845 localeMapper = checkForCodec("ISO 8859-15");
846
847 // 6. guess locale from ctype unless ctype is "C"
848 // 7. guess locale from lang
849 char * try_by_name = ctype;
850 if (ctype && *ctype != 0 && strcmp (ctype, "C") != 0)
851 try_by_name = lang;
852
853 // Now do the guessing.
854 if (lang && *lang && !localeMapper && try_by_name && *try_by_name) {
855 if (try_locale_list(iso8859_15locales, lang))
856 localeMapper = QTextCodec::codecForName("ISO 8859-15");
857 else if (try_locale_list(iso8859_2locales, lang))
858 localeMapper = QTextCodec::codecForName("ISO 8859-2");
859 else if (try_locale_list(iso8859_3locales, lang))
860 localeMapper = QTextCodec::codecForName("ISO 8859-3");
861 else if (try_locale_list(iso8859_4locales, lang))
862 localeMapper = QTextCodec::codecForName("ISO 8859-4");
863 else if (try_locale_list(iso8859_5locales, lang))
864 localeMapper = QTextCodec::codecForName("ISO 8859-5");
865 else if (try_locale_list(iso8859_6locales, lang))
866 localeMapper = QTextCodec::codecForName("ISO 8859-6");
867 else if (try_locale_list(iso8859_7locales, lang))
868 localeMapper = QTextCodec::codecForName("ISO 8859-7");
869 else if (try_locale_list(iso8859_8locales, lang))
870 localeMapper = QTextCodec::codecForName("ISO 8859-8-I");
871 else if (try_locale_list(iso8859_9locales, lang))
872 localeMapper = QTextCodec::codecForName("ISO 8859-9");
873 else if (try_locale_list(iso8859_13locales, lang))
874 localeMapper = QTextCodec::codecForName("ISO 8859-13");
875 else if (try_locale_list(tis_620locales, lang))
876 localeMapper = QTextCodec::codecForName("ISO 8859-11");
877 else if (try_locale_list(koi8_ulocales, lang))
878 localeMapper = QTextCodec::codecForName("KOI8-U");
879 else if (try_locale_list(cp_1251locales, lang))
880 localeMapper = QTextCodec::codecForName("CP 1251");
881 else if (try_locale_list(pt_154locales, lang))
882 localeMapper = QTextCodec::codecForName("PT 154");
883 else if (try_locale_list(probably_koi8_rlocales, lang))
884 localeMapper = ru_RU_hack(lang);
885 }
886
887 delete [] ctype;
888 delete [] lang;
889 }
890
891 // If everything failed, we default to 8859-1
892 // We could perhaps default to 8859-15.
893 if (!localeMapper)
894 localeMapper = QTextCodec::codecForName("ISO 8859-1");
895#endif
896}
897
898
/*
    One-time initialization of the codec machinery. The first call allocates
    the codec list 'all', instantiates the cleanup object, creates the
    built-in codecs selected by the build configuration and finally
    determines the locale codec. Later calls return immediately because
    'all' is already set. Unless QT_NO_THREAD is defined, the whole function
    runs under a mutex obtained from the global mutex pool.
*/
static void setup()
{
#ifndef QT_NO_THREAD
    QMutexLocker locker(QMutexPool::globalInstanceGet(&all));
#endif

    // already initialized
    if (all)
        return;

    // destroying_is_ok is only true while ~QTextCodecCleanup() is running
    if (destroying_is_ok)
        qWarning("QTextCodec: Creating new codec during codec cleanup");
    all = new QList<QTextCodec*>;
    // create the cleanup object to cleanup all codecs on exit
    (void) createQTextCodecCleanup();

    // NOTE(review): the pointers returned by 'new' below are discarded on
    // purpose; presumably the QTextCodec constructor adds each codec to
    // 'all' -- the constructor is not visible here, confirm.
#ifndef QT_NO_CODECS
#  if defined(Q_WS_X11) && !defined(QT_BOOTSTRAPPED)
    // no font codecs when bootstrapping
    (void)new QFontLaoCodec;
#    if defined(QT_NO_ICONV)
    // no iconv(3) support, must build all codecs into the library
    (void)new QFontGb2312Codec;
    (void)new QFontGbkCodec;
    (void)new QFontGb18030_0Codec;
    (void)new QFontJis0208Codec;
    (void)new QFontJis0201Codec;
    (void)new QFontKsc5601Codec;
    (void)new QFontBig5hkscsCodec;
    (void)new QFontBig5Codec;
#    endif // QT_NO_ICONV
#  endif // Q_WS_X11 && !QT_BOOTSTRAPPED

    (void)new QTsciiCodec;

    // one codec object per index 0..8
    for (int i = 0; i < 9; ++i)
        (void)new QIsciiCodec(i);


#  if defined(QT_NO_ICONV) && !defined(QT_BOOTSTRAPPED)
    // no asian codecs when bootstrapping, sorry
    (void)new QGb18030Codec;
    (void)new QGbkCodec;
    (void)new QGb2312Codec;
    (void)new QEucJpCodec;
    (void)new QJisCodec;
    (void)new QSjisCodec;
    (void)new QEucKrCodec;
    (void)new QBig5Codec;
    (void)new QBig5hkscsCodec;
#  endif // QT_NO_ICONV && !QT_BOOTSTRAPPED
#endif // QT_NO_CODECS

    // platform codec for the local 8-bit encoding (named "System")
#if defined(Q_OS_WIN32) || defined(Q_OS_WINCE)
    (void) new QWindowsLocalCodec;
#endif // Q_OS_WIN32

#if defined(Q_OS_OS2)
    (void) new QOs2LocalCodec;
#endif // Q_OS_OS2

    (void)new QUtf16Codec;
    (void)new QUtf16BECodec;
    (void)new QUtf16LECodec;
    (void)new QUtf32Codec;
    (void)new QUtf32BECodec;
    (void)new QUtf32LECodec;
    (void)new QLatin15Codec;
    (void)new QLatin1Codec;
    (void)new QUtf8Codec;

    // one codec object per entry known to QSimpleTextCodec
    for (int i = 0; i < QSimpleTextCodec::numSimpleCodecs; ++i)
        (void)new QSimpleTextCodec(i);

#if defined(Q_OS_UNIX) && !defined(QT_NO_ICONV) && !defined(QT_BOOTSTRAPPED)
    // QIconvCodec depends on the UTF-16 codec, so it needs to be created last
    (void) new QIconvCodec();
#endif

    // determine the locale codec unless one is already set
    if (!localeMapper)
        setupLocaleMapper();
}
980
981QTextCodec::ConverterState::~ConverterState()
982{
983 if (flags & FreeFunction)
984 (QTextCodecUnalignedPointer::decode(state_data))(this);
985 else if (d)
986 qFree(d);
987}
988
989static bool codecForLocaleSet = false;
990void qt_resetCodecForLocale()
991{
992 // if QTextCodec::codecForLocale() was called, we assume that the user has
993 // explicitly set the codec he wants for the locale and don't attempt to
994 // autodetect it again
995 if (!codecForLocaleSet)
996 setupLocaleMapper();
997}
998
999/*!
1000 \class QTextCodec
1001 \brief The QTextCodec class provides conversions between text encodings.
1002 \reentrant
1003 \ingroup i18n
1004
1005 Qt uses Unicode to store, draw and manipulate strings. In many
1006 situations you may wish to deal with data that uses a different
1007 encoding. For example, most Japanese documents are still stored
1008 in Shift-JIS or ISO 2022-JP, while Russian users often have their
1009 documents in KOI8-R or Windows-1251.
1010
1011 Qt provides a set of QTextCodec classes to help with converting
1012 non-Unicode formats to and from Unicode. You can also create your
1013 own codec classes.
1014
1015 The supported encodings are:
1016
1017 \list
1018 \o Apple Roman
1019 \o \l{Big5 Text Codec}{Big5}
1020 \o \l{Big5-HKSCS Text Codec}{Big5-HKSCS}
1021 \o CP949
1022 \o \l{EUC-JP Text Codec}{EUC-JP}
1023 \o \l{EUC-KR Text Codec}{EUC-KR}
1024 \o \l{GBK Text Codec}{GB18030-0}
1025 \o IBM 850
1026 \o IBM 866
1027 \o IBM 874
1028 \o \l{ISO 2022-JP (JIS) Text Codec}{ISO 2022-JP}
1029 \o ISO 8859-1 to 10
1030 \o ISO 8859-13 to 16
1031 \o Iscii-Bng, Dev, Gjr, Knd, Mlm, Ori, Pnj, Tlg, and Tml
1032 \o JIS X 0201
1033 \o JIS X 0208
1034 \o KOI8-R
1035 \o KOI8-U
1036 \o MuleLao-1
1037 \o ROMAN8
1038 \o \l{Shift-JIS Text Codec}{Shift-JIS}
1039 \o TIS-620
1040 \o \l{TSCII Text Codec}{TSCII}
1041 \o UTF-8
1042 \o UTF-16
1043 \o UTF-16BE
1044 \o UTF-16LE
1045 \o UTF-32
1046 \o UTF-32BE
1047 \o UTF-32LE
1048 \o Windows-1250 to 1258
1049 \o WINSAMI2
1050 \endlist
1051
1052 QTextCodecs can be used as follows to convert some locally encoded
1053 string to Unicode. Suppose you have some string encoded in Russian
1054 KOI8-R encoding, and want to convert it to Unicode. The simple way
1055 to do it is like this:
1056
1057 \snippet doc/src/snippets/code/src_corelib_codecs_qtextcodec.cpp 0
1058
1059 After this, \c string holds the text converted to Unicode.
1060 Converting a string from Unicode to the local encoding is just as
1061 easy:
1062
1063 \snippet doc/src/snippets/code/src_corelib_codecs_qtextcodec.cpp 1
1064
1065 To read or write files in various encodings, use QTextStream and
1066 its \l{QTextStream::setCodec()}{setCodec()} function. See the
1067 \l{tools/codecs}{Codecs} example for an application of QTextCodec
1068 to file I/O.
1069
1070 Some care must be taken when trying to convert the data in chunks,
1071 for example, when receiving it over a network. In such cases it is
1072 possible that a multi-byte character will be split over two
1073 chunks. At best this might result in the loss of a character and
1074 at worst cause the entire conversion to fail.
1075
1076 The approach to use in these situations is to create a QTextDecoder
1077 object for the codec and use this QTextDecoder for the whole
1078 decoding process, as shown below:
1079
1080 \snippet doc/src/snippets/code/src_corelib_codecs_qtextcodec.cpp 2
1081
1082 The QTextDecoder object maintains state between chunks and therefore
1083 works correctly even if a multi-byte character is split between
1084 chunks.
1085
1086 \section1 Creating Your Own Codec Class
1087
1088 Support for new text encodings can be added to Qt by creating
1089 QTextCodec subclasses.
1090
1091 The pure virtual functions describe the encoder to the system and
1092 the coder is used as required in the different text file formats
1093 supported by QTextStream, and under X11, for the locale-specific
1094 character input and output.
1095
1096 To add support for another encoding to Qt, make a subclass of
1097 QTextCodec and implement the functions listed in the table below.
1098
1099 \table
1100 \header \o Function \o Description
1101
1102 \row \o name()
1103 \o Returns the official name for the encoding. If the
1104 encoding is listed in the
1105 \l{IANA character-sets encoding file}, the name
1106 should be the preferred MIME name for the encoding.
1107
1108 \row \o aliases()
1109 \o Returns a list of alternative names for the encoding.
1110 QTextCodec provides a default implementation that returns
1111 an empty list. For example, "ISO-8859-1" has "latin1",
1112 "CP819", "IBM819", and "iso-ir-100" as aliases.
1113
1114 \row \o mibEnum()
1115 \o Returns the MIB enum for the encoding if it is listed in
1116 the \l{IANA character-sets encoding file}.
1117
1118 \row \o convertToUnicode()
1119 \o Converts an 8-bit character string to Unicode.
1120
1121 \row \o convertFromUnicode()
1122 \o Converts a Unicode string to an 8-bit character string.
1123 \endtable
1124
1125 You may find it more convenient to make your codec class
1126 available as a plugin; see \l{How to Create Qt Plugins} for
1127 details.
1128
1129 \sa QTextStream, QTextDecoder, QTextEncoder, {Codecs Example}
1130*/
1131
1132/*!
1133 \enum QTextCodec::ConversionFlag
1134
1135 \value DefaultConversion No flag is set.
1136 \value ConvertInvalidToNull If this flag is set, each invalid input
1137 character is output as a null character.
1138 \value IgnoreHeader Ignore any Unicode byte-order mark and don't generate any.
1139
1140 \omitvalue FreeFunction
1141*/
1142
1143/*!
1144 \fn QTextCodec::ConverterState::ConverterState(ConversionFlags flags)
1145
1146 Constructs a ConverterState object initialized with the given \a flags.
1147*/
1148
1149/*!
1150 \fn QTextCodec::ConverterState::~ConverterState()
1151
1152 Destroys the ConverterState object.
1153*/
1154
1155/*!
1156 \nonreentrant
1157
1158 Constructs a QTextCodec, and gives it the highest precedence. The
1159 QTextCodec should always be constructed on the heap (i.e. with \c
1160 new). Qt takes ownership and will delete it when the application
1161 terminates.
1162*/
1163QTextCodec::QTextCodec()
1164{
1165 setup();
1166 all->prepend(this);
1167}
1168
1169
1170/*!
1171 \nonreentrant
1172
1173 Destroys the QTextCodec. Note that you should not delete codecs
1174 yourself: once created they become Qt's responsibility.
1175*/
1176QTextCodec::~QTextCodec()
1177{
1178 if (!destroying_is_ok)
1179 qWarning("QTextCodec::~QTextCodec: Called by application");
1180 if (all)
1181 all->removeAll(this);
1182}
1183
1184/*!
1185 \fn QTextCodec *QTextCodec::codecForName(const char *name)
1186
1187 Searches all installed QTextCodec objects and returns the one
1188 which best matches \a name; the match is case-insensitive. Returns
1189 0 if no codec matching the name \a name could be found.
1190*/
1191
1192/*!
1193 Searches all installed QTextCodec objects and returns the one
1194 which best matches \a name; the match is case-insensitive. Returns
1195 0 if no codec matching the name \a name could be found.
1196*/
1197QTextCodec *QTextCodec::codecForName(const QByteArray &name)
1198{
1199 if (name.isEmpty())
1200 return 0;
1201
1202 setup();
1203
1204 for (int i = 0; i < all->size(); ++i) {
1205 QTextCodec *cursor = all->at(i);
1206 if (nameMatch(cursor->name(), name))
1207 return cursor;
1208 QList<QByteArray> aliases = cursor->aliases();
1209 for (int i = 0; i < aliases.size(); ++i)
1210 if (nameMatch(aliases.at(i), name))
1211 return cursor;
1212 }
1213
1214 return createForName(name);
1215}
1216
1217
1218/*!
1219 Returns the QTextCodec which matches the \link
1220 QTextCodec::mibEnum() MIBenum\endlink \a mib.
1221*/
1222QTextCodec* QTextCodec::codecForMib(int mib)
1223{
1224 setup();
1225
1226 // Qt 3 used 1000 (mib for UCS2) as its identifier for the utf16 codec. Map
1227 // this correctly for compatibility.
1228 if (mib == 1000)
1229 mib = 1015;
1230
1231 QList<QTextCodec*>::ConstIterator i;
1232 for (int i = 0; i < all->size(); ++i) {
1233 QTextCodec *cursor = all->at(i);
1234 if (cursor->mibEnum() == mib)
1235 return cursor;
1236 }
1237
1238 return createForMib(mib);
1239}
1240
1241/*!
1242 Returns the list of all available codecs, by name. Call
1243 QTextCodec::codecForName() to obtain the QTextCodec for the name.
1244
1245 The list may contain many mentions of the same codec
1246 if the codec has aliases.
1247
1248 \sa availableMibs(), name(), aliases()
1249*/
1250QList<QByteArray> QTextCodec::availableCodecs()
1251{
1252 setup();
1253
1254 QList<QByteArray> codecs;
1255 for (int i = 0; i < all->size(); ++i) {
1256 codecs += all->at(i)->name();
1257 codecs += all->at(i)->aliases();
1258 }
1259#ifndef QT_NO_TEXTCODECPLUGIN
1260 QFactoryLoader *l = loader();
1261 QStringList keys = l->keys();
1262 for (int i = 0; i < keys.size(); ++i) {
1263 if (!keys.at(i).startsWith(QLatin1String("MIB: "))) {
1264 QByteArray name = keys.at(i).toLatin1();
1265 if (!codecs.contains(name))
1266 codecs += name;
1267 }
1268 }
1269#endif
1270
1271 return codecs;
1272}
1273
1274/*!
1275 Returns the list of MIBs for all available codecs. Call
1276 QTextCodec::codecForMib() to obtain the QTextCodec for the MIB.
1277
1278 \sa availableCodecs(), mibEnum()
1279*/
1280QList<int> QTextCodec::availableMibs()
1281{
1282 setup();
1283
1284 QList<int> codecs;
1285 for (int i = 0; i < all->size(); ++i)
1286 codecs += all->at(i)->mibEnum();
1287#ifndef QT_NO_TEXTCODECPLUGIN
1288 QFactoryLoader *l = loader();
1289 QStringList keys = l->keys();
1290 for (int i = 0; i < keys.size(); ++i) {
1291 if (keys.at(i).startsWith(QLatin1String("MIB: "))) {
1292 int mib = keys.at(i).mid(5).toInt();
1293 if (!codecs.contains(mib))
1294 codecs += mib;
1295 }
1296 }
1297#endif
1298
1299 return codecs;
1300}
1301
1302/*!
1303 Set the codec to \a c; this will be returned by
1304 codecForLocale(). If \a c is a null pointer, the codec is reset to
1305 the default.
1306
1307 This might be needed for some applications that want to use their
1308 own mechanism for setting the locale.
1309
1310 Setting this codec is not supported on DOS based Windows.
1311
1312 \sa codecForLocale()
1313*/
1314void QTextCodec::setCodecForLocale(QTextCodec *c)
1315{
1316#ifdef Q_WS_WIN
1317 if (QSysInfo::WindowsVersion& QSysInfo::WV_DOS_based)
1318 return;
1319#endif
1320 codecForLocaleSet = true;
1321 localeMapper = c;
1322 if (!localeMapper)
1323 setupLocaleMapper();
1324}
1325
1326/*!
1327 Returns a pointer to the codec most suitable for this locale.
1328
1329 On Windows, the codec will be based on a system locale. On Unix
1330 systems, starting with Qt 4.2, the codec will be using the \e
1331 iconv library. Note that in both cases the codec's name will be
1332 "System".
1333*/
1334
1335QTextCodec* QTextCodec::codecForLocale()
1336{
1337 if (localeMapper)
1338 return localeMapper;
1339
1340 setup();
1341
1342 return localeMapper;
1343}
1344
1345
1346/*!
1347 \fn QByteArray QTextCodec::name() const
1348
1349 QTextCodec subclasses must reimplement this function. It returns
1350 the name of the encoding supported by the subclass.
1351
1352 If the codec is registered as a character set in the
1353 \l{IANA character-sets encoding file} this method should
1354 return the preferred mime name for the codec if defined,
1355 otherwise its name.
1356*/
1357
1358/*!
1359 \fn int QTextCodec::mibEnum() const
1360
1361 Subclasses of QTextCodec must reimplement this function. It
1362 returns the MIBenum (see \l{IANA character-sets encoding file}
1363 for more information). It is important that each QTextCodec
1364 subclass returns the correct unique value for this function.
1365*/
1366
1367/*!
1368 Subclasses can return a number of aliases for the codec in question.
1369
1370 Standard aliases for codecs can be found in the
1371 \l{IANA character-sets encoding file}.
1372*/
1373QList<QByteArray> QTextCodec::aliases() const
1374{
1375 return QList<QByteArray>();
1376}
1377
1378/*!
1379 \fn QString QTextCodec::convertToUnicode(const char *chars, int len,
1380 ConverterState *state) const
1381
1382 QTextCodec subclasses must reimplement this function.
1383
1384 Converts the first \a len characters of \a chars from the
1385 encoding of the subclass to Unicode, and returns the result in a
1386 QString.
1387
1388 \a state can be 0, in which case the conversion is stateless and
1389 default conversion rules should be used. If state is not 0, the
1390 codec should save the state after the conversion in \a state, and
1391 adjust the remainingChars and invalidChars members of the struct.
1392*/
1393
1394/*!
1395 \fn QByteArray QTextCodec::convertFromUnicode(const QChar *input, int number,
1396 ConverterState *state) const
1397
1398 QTextCodec subclasses must reimplement this function.
1399
1400 Converts the first \a number of characters from the \a input array
1401 from Unicode to the encoding of the subclass, and returns the result
1402 in a QByteArray.
1403
1404 \a state can be 0 in which case the conversion is stateless and
1405 default conversion rules should be used. If state is not 0, the
1406 codec should save the state after the conversion in \a state, and
1407 adjust the remainingChars and invalidChars members of the struct.
1408*/
1409
1410/*!
1411 Creates a QTextDecoder which stores enough state to decode chunks
1412 of \c{char *} data to create chunks of Unicode data.
1413
1414 The caller is responsible for deleting the returned object.
1415*/
1416QTextDecoder* QTextCodec::makeDecoder() const
1417{
1418 return new QTextDecoder(this);
1419}
1420
1421
1422/*!
1423 Creates a QTextEncoder which stores enough state to encode chunks
1424 of Unicode data as \c{char *} data.
1425
1426 The caller is responsible for deleting the returned object.
1427*/
1428QTextEncoder* QTextCodec::makeEncoder() const
1429{
1430 return new QTextEncoder(this);
1431}
1432
1433/*!
1434 \fn QByteArray QTextCodec::fromUnicode(const QChar *input, int number,
1435 ConverterState *state) const
1436
1437 Converts the first \a number of characters from the \a input array
1438 from Unicode to the encoding of this codec, and returns the result
1439 in a QByteArray.
1440
1441 The \a state of the convertor used is updated.
1442*/
1443
1444/*!
1445 Converts \a str from Unicode to the encoding of this codec, and
1446 returns the result in a QByteArray.
1447*/
1448QByteArray QTextCodec::fromUnicode(const QString& str) const
1449{
1450 return convertFromUnicode(str.constData(), str.length(), 0);
1451}
1452
1453/*!
1454 \fn QString QTextCodec::toUnicode(const char *input, int size,
1455 ConverterState *state) const
1456
1457 Converts the first \a size characters from the \a input from the
1458 encoding of this codec to Unicode, and returns the result in a
1459 QString.
1460
1461 The \a state of the convertor used is updated.
1462*/
1463
1464/*!
1465 Converts \a a from the encoding of this codec to Unicode, and
1466 returns the result in a QString.
1467*/
1468QString QTextCodec::toUnicode(const QByteArray& a) const
1469{
1470 return convertToUnicode(a.constData(), a.length(), 0);
1471}
1472
1473/*!
1474 Returns true if the Unicode character \a ch can be fully encoded
1475 with this codec; otherwise returns false.
1476*/
1477bool QTextCodec::canEncode(QChar ch) const
1478{
1479 ConverterState state;
1480 state.flags = ConvertInvalidToNull;
1481 convertFromUnicode(&ch, 1, &state);
1482 return (state.invalidChars == 0);
1483}
1484
1485/*!
1486 \overload
1487
1488 \a s contains the string being tested for encode-ability.
1489*/
1490bool QTextCodec::canEncode(const QString& s) const
1491{
1492 ConverterState state;
1493 state.flags = ConvertInvalidToNull;
1494 convertFromUnicode(s.constData(), s.length(), &state);
1495 return (state.invalidChars == 0);
1496}
1497
1498#ifdef QT3_SUPPORT
1499/*!
1500 Returns a string representing the current language and
1501 sublanguage, e.g. "pt" for Portuguese, or "pt_br" for Portuguese/Brazil.
1502
1503 \sa QLocale
1504*/
1505const char *QTextCodec::locale()
1506{
1507 static char locale[6];
1508 QByteArray l = QLocale::system().name().toLatin1();
1509 int len = qMin(l.length(), 5);
1510 memcpy(locale, l.constData(), len);
1511 locale[len] = '\0';
1512
1513 return locale;
1514}
1515
1516/*!
1517 \overload
1518*/
1519
1520QByteArray QTextCodec::fromUnicode(const QString& uc, int& lenInOut) const
1521{
1522 QByteArray result = convertFromUnicode(uc.constData(), lenInOut, 0);
1523 lenInOut = result.length();
1524 return result;
1525}
1526
1527/*!
1528 \overload
1529
1530 \a a contains the source characters; \a len contains the number of
1531 characters in \a a to use.
1532*/
1533QString QTextCodec::toUnicode(const QByteArray& a, int len) const
1534{
1535 len = qMin(a.size(), len);
1536 return convertToUnicode(a.constData(), len, 0);
1537}
1538#endif
1539
1540/*!
1541 \overload
1542
1543 \a chars contains the source characters.
1544*/
1545QString QTextCodec::toUnicode(const char *chars) const
1546{
1547 int len = qstrlen(chars);
1548 return convertToUnicode(chars, len, 0);
1549}
1550
1551
1552/*!
1553 \class QTextEncoder
1554 \brief The QTextEncoder class provides a state-based encoder.
1555 \reentrant
1556 \ingroup i18n
1557
1558 A text encoder converts text from Unicode into an encoded text format
1559 using a specific codec.
1560
1561 The encoder converts Unicode into another format, remembering any
1562 state that is required between calls.
1563
1564 \sa QTextCodec::makeEncoder(), QTextDecoder
1565*/
1566
1567/*!
1568 \fn QTextEncoder::QTextEncoder(const QTextCodec *codec)
1569
1570 Constructs a text encoder for the given \a codec.
1571*/
1572
1573/*!
1574 Destroys the encoder.
1575*/
1576QTextEncoder::~QTextEncoder()
1577{
1578}
1579
1580/*! \internal
1581 \since 4.5
1582 Determines whether the encoder encountered a failure while encoding the input. If
1583 an error was encountered, the produced result is undefined, and gets converted according
1584 to the conversion flags.
1585 */
1586bool QTextEncoder::hasFailure() const
1587{
1588 return state.invalidChars != 0;
1589}
1590
1591/*!
1592 Converts the Unicode string \a str into an encoded QByteArray.
1593*/
1594QByteArray QTextEncoder::fromUnicode(const QString& str)
1595{
1596 QByteArray result = c->fromUnicode(str.constData(), str.length(), &state);
1597 return result;
1598}
1599
1600/*!
1601 \overload
1602
1603 Converts \a len characters (not bytes) from \a uc, and returns the
1604 result in a QByteArray.
1605*/
1606QByteArray QTextEncoder::fromUnicode(const QChar *uc, int len)
1607{
1608 QByteArray result = c->fromUnicode(uc, len, &state);
1609 return result;
1610}
1611
1612#ifdef QT3_SUPPORT
1613/*!
1614 \overload
1615
1616 Converts \a lenInOut characters (not bytes) from \a uc, and returns the
1617 result in a QByteArray. On return, \a lenInOut is set to the length of
1618 the resulting QByteArray.
1619*/
1620QByteArray QTextEncoder::fromUnicode(const QString& uc, int& lenInOut)
1621{
1622 QByteArray result = c->fromUnicode(uc.constData(), lenInOut, &state);
1623 lenInOut = result.length();
1624 return result;
1625}
1626#endif
1627
1628/*!
1629 \class QTextDecoder
1630 \brief The QTextDecoder class provides a state-based decoder.
1631 \reentrant
1632 \ingroup i18n
1633
1634 A text decoder converts text from an encoded text format into Unicode
1635 using a specific codec.
1636
1637 The decoder converts text in this format into Unicode, remembering any
1638 state that is required between calls.
1639
1640 \sa QTextCodec::makeDecoder(), QTextEncoder
1641*/
1642
1643/*!
1644 \fn QTextDecoder::QTextDecoder(const QTextCodec *codec)
1645
1646 Constructs a text decoder for the given \a codec.
1647*/
1648
1649/*!
1650 Destroys the decoder.
1651*/
1652QTextDecoder::~QTextDecoder()
1653{
1654}
1655
1656/*!
1657 \fn QString QTextDecoder::toUnicode(const char *chars, int len)
1658
1659 Converts the first \a len bytes in \a chars to Unicode, returning
1660 the result.
1661
1662 If not all characters are used (e.g. if only part of a multi-byte
1663 encoding is at the end of the characters), the decoder remembers
1664 enough state to continue with the next call to this function.
1665*/
1666QString QTextDecoder::toUnicode(const char *chars, int len)
1667{
1668 return c->toUnicode(chars, len, &state);
1669}
1670
1671
1672/*! \overload
1673
1674 The converted string is returned in \a target.
1675 */
1676void QTextDecoder::toUnicode(QString *target, const char *chars, int len)
1677{
1678 Q_ASSERT(target);
1679 switch (c->mibEnum()) {
1680 case 106: // utf8
1681 static_cast<const QUtf8Codec*>(c)->convertToUnicode(target, chars, len, &state);
1682 break;
1683 case 4: { // latin1
1684 target->resize(len);
1685 ushort *data = (ushort*)target->data();
1686 for (int i = len; i >=0; --i)
1687 data[i] = (uchar) chars[i];
1688 } break;
1689 default:
1690 *target = c->toUnicode(chars, len, &state);
1691 }
1692}
1693
1694
1695/*!
1696 \overload
1697
1698 Converts the bytes in the byte array specified by \a ba to Unicode
1699 and returns the result.
1700*/
1701QString QTextDecoder::toUnicode(const QByteArray &ba)
1702{
1703 return c->toUnicode(ba.constData(), ba.length(), &state);
1704}
1705
1706
1707/*!
1708 \fn QTextCodec* QTextCodec::codecForTr()
1709
1710 Returns the codec used by QObject::tr() on its argument. If this
1711 function returns 0 (the default), tr() assumes Latin-1.
1712
1713 \sa setCodecForTr()
1714*/
1715
1716/*!
1717 \fn void QTextCodec::setCodecForTr(QTextCodec *c)
1718 \nonreentrant
1719
1720 Sets the codec used by QObject::tr() on its argument to \a c. If
1721 \a c is 0 (the default), tr() assumes Latin-1.
1722
1723 If the literal quoted text in the program is not in the Latin-1
1724 encoding, this function can be used to set the appropriate
1725 encoding. For example, software developed by Korean programmers
1726 might use eucKR for all the text in the program, in which case the
1727 main() function might look like this:
1728
1729 \snippet doc/src/snippets/code/src_corelib_codecs_qtextcodec.cpp 3
1730
1731 Note that this is not the way to select the encoding that the \e
1732 user has chosen. For example, to convert an application containing
1733 literal English strings to Korean, all that is needed is for the
1734 English strings to be passed through tr() and for translation
1735 files to be loaded. For details of internationalization, see
1736 \l{Internationalization with Qt}.
1737
1738 \sa codecForTr(), setCodecForCStrings()
1739*/
1740
1741
1742/*!
1743 \fn QTextCodec* QTextCodec::codecForCStrings()
1744
1745 Returns the codec used by QString to convert to and from \c{const
1746 char *} and QByteArrays. If this function returns 0 (the default),
1747 QString assumes Latin-1.
1748
1749 \sa setCodecForCStrings()
1750*/
1751
1752/*!
1753 \fn void QTextCodec::setCodecForCStrings(QTextCodec *codec)
1754 \nonreentrant
1755
1756 Sets the codec used by QString to convert to and from \c{const
1757 char *} and QByteArrays. If the \a codec is 0 (the default),
1758 QString assumes Latin-1.
1759
1760 \warning Some codecs do not preserve the characters in the ASCII
1761 range (0x00 to 0x7F). For example, the Japanese Shift-JIS
1762 encoding maps the backslash character (0x5C) to the Yen
1763 character. To avoid undesirable side-effects, we recommend
1764 avoiding such codecs with setCodecForCStrings().
1765
1766 \sa codecForCStrings(), setCodecForTr()
1767*/
1768
1769/*!
1770 \since 4.4
1771
1772 Tries to detect the encoding of the provided snippet of HTML in the given byte array, \a ba,
1773 and returns a QTextCodec instance that is capable of decoding the html to unicode.
1774 If the codec cannot be detected from the content provided, \a defaultCodec is returned.
1775*/
1776QTextCodec *QTextCodec::codecForHtml(const QByteArray &ba, QTextCodec *defaultCodec)
1777{
1778 // determine charset
1779 int pos;
1780 QTextCodec *c = 0;
1781
1782 if (ba.size() > 1 && (((uchar)ba[0] == 0xfe && (uchar)ba[1] == 0xff)
1783 || ((uchar)ba[0] == 0xff && (uchar)ba[1] == 0xfe))) {
1784 c = QTextCodec::codecForMib(1015); // utf16
1785 } else if (ba.size() > 2
1786 && (uchar)ba[0] == 0xef
1787 && (uchar)ba[1] == 0xbb
1788 && (uchar)ba[2] == 0xbf) {
1789 c = QTextCodec::codecForMib(106); // utf-8
1790 } else {
1791 QByteArray header = ba.left(512).toLower();
1792 if ((pos = header.indexOf("http-equiv=")) != -1) {
1793 pos = header.indexOf("charset=", pos) + int(strlen("charset="));
1794 if (pos != -1) {
1795 int pos2 = header.indexOf('\"', pos+1);
1796 QByteArray cs = header.mid(pos, pos2-pos);
1797 // qDebug("found charset: %s", cs.data());
1798 c = QTextCodec::codecForName(cs);
1799 }
1800 }
1801 }
1802 if (!c)
1803 c = defaultCodec;
1804
1805 return c;
1806}
1807
1808/*!
1809 \overload
1810
1811 If the codec cannot be detected, this overload returns a Latin-1 QTextCodec.
1812*/
1813QTextCodec *QTextCodec::codecForHtml(const QByteArray &ba)
1814{
1815 return codecForHtml(ba, QTextCodec::codecForMib(/*Latin 1*/ 4));
1816}
1817
1818
1819/*! \internal
1820 \since 4.3
1821 Determines whether the decoder encountered a failure while decoding the input. If
1822 an error was encountered, the produced result is undefined, and gets converted according
1823 to the conversion flags.
1824 */
1825bool QTextDecoder::hasFailure() const
1826{
1827 return state.invalidChars != 0;
1828}
1829
1830/*!
1831 \fn QTextCodec *QTextCodec::codecForContent(const char *str, int size)
1832
1833 This functionality is no longer provided by Qt. This
1834 compatibility function always returns a null pointer.
1835*/
1836
1837/*!
1838 \fn QTextCodec *QTextCodec::codecForName(const char *hint, int accuracy)
1839
1840 Use the codecForName(const QByteArray &) overload instead.
1841*/
1842
1843/*!
1844 \fn QTextCodec *QTextCodec::codecForIndex(int i)
1845
1846 Use availableCodecs() or availableMibs() instead and iterate
1847 through the resulting list.
1848*/
1849
1850
1851/*!
1852 \fn QByteArray QTextCodec::mimeName() const
1853
1854 Use name() instead.
1855*/
1856
1857QT_END_NAMESPACE
1858
1859#endif // QT_NO_TEXTCODEC
Note: See TracBrowser for help on using the repository browser.