source: trunk/src/corelib/codecs/qtextcodec.cpp@ 751

Last change on this file since 751 was 651, checked in by Dmitry A. Kuminov, 15 years ago

trunk: Merged in qt 4.6.2 sources.

File size: 55.8 KB
Line 
1/****************************************************************************
2**
3** Copyright (C) 2010 Nokia Corporation and/or its subsidiary(-ies).
4** All rights reserved.
5** Contact: Nokia Corporation ([email protected])
6**
7** This file is part of the QtCore module of the Qt Toolkit.
8**
9** $QT_BEGIN_LICENSE:LGPL$
10** Commercial Usage
11** Licensees holding valid Qt Commercial licenses may use this file in
12** accordance with the Qt Commercial License Agreement provided with the
13** Software or, alternatively, in accordance with the terms contained in
14** a written agreement between you and Nokia.
15**
16** GNU Lesser General Public License Usage
17** Alternatively, this file may be used under the terms of the GNU Lesser
18** General Public License version 2.1 as published by the Free Software
19** Foundation and appearing in the file LICENSE.LGPL included in the
20** packaging of this file. Please review the following information to
21** ensure the GNU Lesser General Public License version 2.1 requirements
22** will be met: http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html.
23**
24** In addition, as a special exception, Nokia gives you certain additional
25** rights. These rights are described in the Nokia Qt LGPL Exception
26** version 1.1, included in the file LGPL_EXCEPTION.txt in this package.
27**
28** GNU General Public License Usage
29** Alternatively, this file may be used under the terms of the GNU
30** General Public License version 3.0 as published by the Free Software
31** Foundation and appearing in the file LICENSE.GPL included in the
32** packaging of this file. Please review the following information to
33** ensure the GNU General Public License version 3.0 requirements will be
34** met: http://www.gnu.org/copyleft/gpl.html.
35**
36** If you have questions regarding the use of this file, please contact
37** Nokia at [email protected].
38** $QT_END_LICENSE$
39**
40****************************************************************************/
41
42#include "qplatformdefs.h"
43#include "qtextcodec.h"
44#include "qtextcodec_p.h"
45
46#ifndef QT_NO_TEXTCODEC
47
48#include "qlist.h"
49#include "qfile.h"
50#ifndef QT_NO_LIBRARY
51# include "qcoreapplication.h"
52# include "qtextcodecplugin.h"
53# include "private/qfactoryloader_p.h"
54#endif
55#include "qstringlist.h"
56
57#ifdef Q_OS_UNIX
58# include "qiconvcodec_p.h"
59#endif
60
61#if defined(Q_OS_OS2)
62# include <unidef.h>
63# include <uconv.h>
64# include "qvector.h"
65#endif
66
67#include "qutfcodec_p.h"
68#include "qsimplecodec_p.h"
69#include "qlatincodec_p.h"
70#ifndef QT_NO_CODECS
71# include "qtsciicodec_p.h"
72# include "qisciicodec_p.h"
73# if defined(QT_NO_ICONV) && !defined(QT_BOOTSTRAPPED)
74// no iconv(3) support, must build all codecs into the library
75# include "../../plugins/codecs/cn/qgb18030codec.h"
76# include "../../plugins/codecs/jp/qeucjpcodec.h"
77# include "../../plugins/codecs/jp/qjiscodec.h"
78# include "../../plugins/codecs/jp/qsjiscodec.h"
79# include "../../plugins/codecs/kr/qeuckrcodec.h"
80# include "../../plugins/codecs/tw/qbig5codec.h"
81# endif // QT_NO_ICONV
82# if defined(Q_WS_X11) && !defined(QT_BOOTSTRAPPED)
83# include "qfontlaocodec_p.h"
84# include "../../plugins/codecs/jp/qfontjpcodec.h"
85# endif
86#endif // QT_NO_CODECS
87#include "qlocale.h"
88#include "private/qmutexpool_p.h"
89
90#include <stdlib.h>
91#include <ctype.h>
92#include <locale.h>
93#if defined (_XOPEN_UNIX) && !defined(Q_OS_QNX) && !defined(Q_OS_OSF)
94#include <langinfo.h>
95#endif
96
97#if defined(Q_OS_WINCE)
98# define QT_NO_SETLOCALE
99#endif
100
101// enabling this is not exception safe!
102// #define Q_DEBUG_TEXTCODEC
103
104QT_BEGIN_NAMESPACE
105
// Global-static plugin loader for text codec factories. It is created with
// the QTextCodecFactoryInterface interface id and the "/codecs" suffix, and
// only exists when codec plugins are enabled (QT_NO_TEXTCODECPLUGIN unset).
106#ifndef QT_NO_TEXTCODECPLUGIN
107Q_GLOBAL_STATIC_WITH_ARGS(QFactoryLoader, loader,
108 (QTextCodecFactoryInterface_iid, QLatin1String("/codecs")))
109#endif
110
/*
    Returns the ASCII lower-case form of \a c. Only 'A'..'Z' are mapped;
    every other byte is returned unchanged, independent of the C locale.

    The former `register` storage class was dropped: it is deprecated since
    C++11 and ill-formed in C++17, and it never affected the result.
*/
static char qtolower(char c)
{
    // 0x20 is the distance between upper- and lower-case ASCII letters
    if (c >= 'A' && c <= 'Z')
        return c + 0x20;
    return c;
}
/*
    Returns true if \a c is an ASCII digit or an ASCII letter, independent
    of the C locale.

    The former `register` storage class was dropped: it is deprecated since
    C++11 and ill-formed in C++17, and it never affected the result.
*/
static bool qisalnum(char c)
{
    if (c >= '0' && c <= '9')
        return true;
    // OR-ing in 0x20 folds 'A'..'Z' onto 'a'..'z', so one range test covers both cases
    const int folded = c | 0x20;
    return folded >= 'a' && folded <= 'z';
}
115
116static bool nameMatch(const QByteArray &name, const QByteArray &test)
117{
118 // if they're the same, return a perfect score
119 if (qstricmp(name, test) == 0)
120 return true;
121
122 const char *n = name.constData();
123 const char *h = test.constData();
124
125 // if the letters and numbers are the same, we have a match
126 while (*n != '\0') {
127 if (qisalnum(*n)) {
128 for (;;) {
129 if (*h == '\0')
130 return false;
131 if (qisalnum(*h))
132 break;
133 ++h;
134 }
135 if (qtolower(*n) != qtolower(*h))
136 return false;
137 ++h;
138 }
139 ++n;
140 }
141 while (*h && !qisalnum(*h))
142 ++h;
143 return (*h == '\0');
144}
145
146
147static QTextCodec *createForName(const QByteArray &name)
148{
149#ifndef QT_NO_TEXTCODECPLUGIN
150 QFactoryLoader *l = loader();
151 QStringList keys = l->keys();
152 for (int i = 0; i < keys.size(); ++i) {
153 if (nameMatch(name, keys.at(i).toLatin1())) {
154 QString realName = keys.at(i);
155 if (QTextCodecFactoryInterface *factory
156 = qobject_cast<QTextCodecFactoryInterface*>(l->instance(realName))) {
157 return factory->create(realName);
158 }
159 }
160 }
161#else
162 Q_UNUSED(name);
163#endif
164 return 0;
165}
166
167static QTextCodec *createForMib(int mib)
168{
169#ifndef QT_NO_TEXTCODECPLUGIN
170 QString name = QLatin1String("MIB: ") + QString::number(mib);
171 if (QTextCodecFactoryInterface *factory
172 = qobject_cast<QTextCodecFactoryInterface*>(loader()->instance(name)))
173 return factory->create(name);
174#else
175 Q_UNUSED(mib);
176#endif
177 return 0;
178}
179
// List of codec objects; 0 until setup() allocates it, and reset to 0 by
// QTextCodecCleanup's destructor after the codecs have been deleted.
180static QList<QTextCodec*> *all = 0;
181#ifdef Q_DEBUG_TEXTCODEC
// Debug aid: true only while QTextCodecCleanup's destructor is running.
182static bool destroying_is_ok = false;
183#endif
184
// Codec chosen for the locale by setupLocaleMapper(); 0 while undetermined.
185static QTextCodec *localeMapper = 0;
// NOTE(review): presumably the codec used for tr(); it is not referenced in
// this part of the file -- confirm against the rest of the class.
186QTextCodec *QTextCodec::cftr = 0;
187
188
189class QTextCodecCleanup
190{
191public:
192 ~QTextCodecCleanup();
193};
194
195/*
196 Deletes all the created codecs. This destructor is called just
197 before exiting to delete any QTextCodec objects that may be lying
198 around.
199*/
200QTextCodecCleanup::~QTextCodecCleanup()
201{
202 if (!all)
203 return;
204
205#ifdef Q_DEBUG_TEXTCODEC
206 destroying_is_ok = true;
207#endif
208
209 for (QList<QTextCodec *>::const_iterator it = all->constBegin()
210 ; it != all->constEnd(); ++it) {
211 delete *it;
212 }
213 delete all;
214 all = 0;
215 localeMapper = 0;
216
217#ifdef Q_DEBUG_TEXTCODEC
218 destroying_is_ok = false;
219#endif
220}
221
222Q_GLOBAL_STATIC(QTextCodecCleanup, createQTextCodecCleanup)
223
224#if defined(Q_OS_WIN32) || defined(Q_OS_WINCE)
225class QWindowsLocalCodec: public QTextCodec
226{
227public:
228 QWindowsLocalCodec();
229 ~QWindowsLocalCodec();
230
231 QString convertToUnicode(const char *, int, ConverterState *) const;
232 QByteArray convertFromUnicode(const QChar *, int, ConverterState *) const;
233 QString convertToUnicodeCharByChar(const char *chars, int length, ConverterState *state) const;
234
235 QByteArray name() const;
236 int mibEnum() const;
237
238};
239
240QWindowsLocalCodec::QWindowsLocalCodec()
241{
242}
243
244QWindowsLocalCodec::~QWindowsLocalCodec()
245{
246}
247
248QString QWindowsLocalCodec::convertToUnicode(const char *chars, int length, ConverterState *state) const
249{
250 const char *mb = chars;
251 int mblen = length;
252
253 if (!mb || !mblen)
254 return QString();
255
256 const int wclen_auto = 4096;
257 wchar_t wc_auto[wclen_auto];
258 int wclen = wclen_auto;
259 wchar_t *wc = wc_auto;
260 int len;
261 QString sp;
262 bool prepend = false;
263 char state_data = 0;
264 int remainingChars = 0;
265
266 //save the current state information
267 if (state) {
268 state_data = (char)state->state_data[0];
269 remainingChars = state->remainingChars;
270 }
271
272 //convert the pending charcter (if available)
273 if (state && remainingChars) {
274 char prev[3] = {0};
275 prev[0] = state_data;
276 prev[1] = mb[0];
277 remainingChars = 0;
278 len = MultiByteToWideChar(CP_ACP, MB_PRECOMPOSED,
279 prev, 2, wc, wclen);
280 if (len) {
281 prepend = true;
282 sp.append(QChar(wc[0]));
283 mb++;
284 mblen--;
285 wc[0] = 0;
286 }
287 }
288
289 while (!(len=MultiByteToWideChar(CP_ACP, MB_PRECOMPOSED|MB_ERR_INVALID_CHARS,
290 mb, mblen, wc, wclen))) {
291 int r = GetLastError();
292 if (r == ERROR_INSUFFICIENT_BUFFER) {
293 if (wc != wc_auto) {
294 qWarning("MultiByteToWideChar: Size changed");
295 break;
296 } else {
297 wclen = MultiByteToWideChar(CP_ACP, MB_PRECOMPOSED,
298 mb, mblen, 0, 0);
299 wc = new wchar_t[wclen];
300 // and try again...
301 }
302 } else if (r == ERROR_NO_UNICODE_TRANSLATION) {
303 //find the last non NULL character
304 while (mblen > 1 && !(mb[mblen-1]))
305 mblen--;
306 //check whether, we hit an invalid character in the middle
307 if ((mblen <= 1) || (remainingChars && state_data))
308 return convertToUnicodeCharByChar(chars, length, state);
309 //Remove the last character and try again...
310 state_data = mb[mblen-1];
311 remainingChars = 1;
312 mblen--;
313 } else {
314 // Fail.
315 qWarning("MultiByteToWideChar: Cannot convert multibyte text");
316 break;
317 }
318 }
319 if (len <= 0)
320 return QString();
321 if (wc[len-1] == 0) // len - 1: we don't want terminator
322 --len;
323
324 //save the new state information
325 if (state) {
326 state->state_data[0] = (char)state_data;
327 state->remainingChars = remainingChars;
328 }
329 QString s((QChar*)wc, len);
330 if (wc != wc_auto)
331 delete [] wc;
332 if (prepend) {
333 return sp+s;
334 }
335 return s;
336}
337
338QString QWindowsLocalCodec::convertToUnicodeCharByChar(const char *chars, int length, ConverterState *state) const
339{
340 if (!chars || !length)
341 return QString();
342
343 int copyLocation = 0;
344 int extra = 2;
345 if (state && state->remainingChars) {
346 copyLocation = state->remainingChars;
347 extra += copyLocation;
348 }
349 int newLength = length + extra;
350 char *mbcs = new char[newLength];
351 //ensure that we have a NULL terminated string
352 mbcs[newLength-1] = 0;
353 mbcs[newLength-2] = 0;
354 memcpy(&(mbcs[copyLocation]), chars, length);
355 if (copyLocation) {
356 //copy the last character from the state
357 mbcs[0] = (char)state->state_data[0];
358 state->remainingChars = 0;
359 }
360 const char *mb = mbcs;
361#ifndef Q_OS_WINCE
362 const char *next = 0;
363 QString s;
364 while((next = CharNextExA(CP_ACP, mb, 0)) != mb) {
365 wchar_t wc[2] ={0};
366 int charlength = next - mb;
367 int len = MultiByteToWideChar(CP_ACP, MB_PRECOMPOSED|MB_ERR_INVALID_CHARS, mb, charlength, wc, 2);
368 if (len>0) {
369 s.append(QChar(wc[0]));
370 } else {
371 int r = GetLastError();
372 //check if the character being dropped is the last character
373 if (r == ERROR_NO_UNICODE_TRANSLATION && mb == (mbcs+newLength -3) && state) {
374 state->remainingChars = 1;
375 state->state_data[0] = (char)*mb;
376 }
377 }
378 mb = next;
379 }
380#else
381 QString s;
382 int size = mbstowcs(NULL, mb, length);
383 if (size < 0) {
384 Q_ASSERT("Error in CE TextCodec");
385 return QString();
386 }
387 wchar_t* ws = new wchar_t[size + 2];
388 ws[size +1] = 0;
389 ws[size] = 0;
390 size = mbstowcs(ws, mb, length);
391 for (int i=0; i< size; i++)
392 s.append(QChar(ws[i]));
393 delete [] ws;
394#endif
395 delete mbcs;
396 return s;
397}
398
399QByteArray QWindowsLocalCodec::convertFromUnicode(const QChar *uc, int len, ConverterState *) const
400{
401 return qt_winQString2MB(uc, len);
402}
403
404
405QByteArray QWindowsLocalCodec::name() const
406{
407 return "System";
408}
409
410int QWindowsLocalCodec::mibEnum() const
411{
412 return 0;
413}
414
415#elif defined(Q_OS_OS2)
416
417class QOs2LocalCodec: public QTextCodec
418{
419public:
420 QOs2LocalCodec();
421 ~QOs2LocalCodec();
422
423 QString convertToUnicode(const char *, int, ConverterState *) const;
424 QByteArray convertFromUnicode(const QChar *, int, ConverterState *) const;
425
426 QByteArray name() const;
427 int mibEnum() const;
428
429private:
430 UconvObject uoSubYes;
431 UconvObject uoSubNo;
432};
433
434QOs2LocalCodec::QOs2LocalCodec() : uoSubYes(0), uoSubNo(0)
435{
436 // create the conversion object for the process code page that performs
437 // substitution of invalid characters with '?'
438 UniCreateUconvObject((UniChar *)L"@sub=yes,subchar=\\x3F,subuni=\\x003F",
439 &uoSubYes);
440 Q_ASSERT(uoSubYes);
441
442 // same as above but doesn't perform substitution
443 UniCreateUconvObject((UniChar *)L"@sub=no", &uoSubNo);
444 Q_ASSERT(uoSubNo);
445}
446
447QOs2LocalCodec::~QOs2LocalCodec()
448{
449 UniFreeUconvObject(uoSubNo);
450 UniFreeUconvObject(uoSubYes);
451}
452
453static void qOs2LocalCodecStateFree(QTextCodec::ConverterState *state)
454{
455 delete reinterpret_cast<char *>(state->d);
456}
457
458QString QOs2LocalCodec::convertToUnicode(const char *chars, int length,
459 ConverterState *state) const
460{
461 QString res;
462
463 if (!chars)
464 return res;
465 if (!length)
466 return QLatin1String("");
467
468 UconvObject uo = uoSubYes;
469 if (state && (state->flags & ConvertInvalidToNull))
470 uo = uoSubNo;
471
472 int remainingChars = 0;
473 char *remainingBuffer = 0;
474
475 if (state) {
476 // stateful conversion
477 remainingBuffer = reinterpret_cast<char *>(state->d);
478 if (remainingBuffer) {
479 // restore state
480 remainingChars = state->remainingChars;
481 } else {
482 // first time, add the destructor for state->d
483 state->flags |= FreeFunction;
484 QTextCodecUnalignedPointer::encode(state->state_data,
485 qOs2LocalCodecStateFree);
486 }
487 }
488
489 const char *mbPtr = chars;
490 size_t mbLeft = length;
491
492 QByteArray mbExtra;
493 if (remainingChars) {
494 // we have to prepend the remaining bytes from the previous conversion
495 mbLeft += remainingChars;
496 mbExtra.resize(mbLeft);
497 mbPtr = mbExtra.data();
498
499 memcpy(mbExtra.data(), remainingBuffer, remainingChars);
500 memcpy(mbExtra.data() + remainingChars, chars, length);
501
502 remainingBuffer = 0;
503 remainingChars = 0;
504 }
505
506 size_t ucLen = mbLeft;
507 QString ucBuf(ucLen, QLatin1Char('\0'));
508 UniChar *ucPtr = reinterpret_cast<UniChar *>(ucBuf.data());
509 size_t ucLeft = ucLen;
510
511 size_t nonIdent = 0;
512 int rc;
513
514 while (mbLeft) {
515 rc = UniUconvToUcs(uo, (void**)&mbPtr, &mbLeft, &ucPtr, &ucLeft,
516 &nonIdent);
517 if (rc == ULS_BUFFERFULL) {
518 size_t ucDone = ucLen - ucLeft;
519 size_t mbDone = length - mbLeft;
520 // assume that mbLeft/ucLeft is an approximation of mbDone/ucDone
521 ucLen = ucDone + (mbLeft * ucDone) / mbDone;
522 ucBuf.resize(ucLen);
523 ucPtr = reinterpret_cast<UniChar *>(ucBuf.data() + ucDone);
524 } else if (rc == ULS_ILLEGALSEQUENCE && state) {
525 // conversion stopped because the remaining inBytesLeft make up
526 // an incomplete multi-byte sequence; save them for later
527 remainingBuffer = new char[mbLeft];
528 memcpy(remainingBuffer, mbPtr, mbLeft);
529 remainingChars = mbLeft;
530 break;
531 } else if (rc != ULS_SUCCESS) {
532 // just fail on an unexpected error (will return what we've got)
533 qWarning("QOs2LocalCodec::convertToUnicode: UniUconvToUcs failed "
534 "with %d", rc);
535 break;
536 }
537 }
538
539 ucBuf.resize(ucLen - ucLeft);
540 res = ucBuf;
541
542 if (state) {
543 // update the state
544 state->invalidChars = nonIdent;
545 state->remainingChars = remainingChars;
546 state->d = remainingBuffer;
547 }
548
549 return res;
550}
551
552QByteArray QOs2LocalCodec::convertFromUnicode(const QChar *uchars, int length,
553 ConverterState *state) const
554{
555 QByteArray res;
556
557 if (!uchars)
558 return res;
559 if (!length)
560 return QByteArray("");
561
562 UconvObject uo = uoSubYes;
563 if (state && (state->flags & ConvertInvalidToNull))
564 uo = uoSubNo;
565
566 const UniChar *ucPtr = reinterpret_cast<const UniChar *>(uchars);
567 size_t ucLeft = length;
568
569 QVector<QChar> ucExtra;
570 if (state && state->remainingChars) {
571 // we have one surrogate char to be prepended
572 Q_ASSERT(state->remainingChars == 1);
573 ucLeft += 1;
574 ucExtra.resize(ucLeft);
575 ucPtr = reinterpret_cast<const UniChar *>(ucExtra.data());
576
577 ucExtra[0] = state->state_data[0];
578 memcpy(ucExtra.data() + 1, uchars, length * sizeof(QChar));
579
580 state->remainingChars = 0;
581 }
582
583 // be optimistic (imply that one byte is necessary per every Unicode char)
584 size_t mbLen = length;
585 QByteArray mbBuf(mbLen, '\0');
586 char *mbPtr = mbBuf.data();
587 size_t mbLeft = mbLen;
588
589 size_t nonIdent = 0;
590 int rc;
591
592 while (ucLeft) {
593 rc = UniUconvFromUcs(uo, const_cast<UniChar **>(&ucPtr), &ucLeft,
594 (void**)&mbPtr, &mbLeft, &nonIdent);
595 if (rc == ULS_BUFFERFULL) {
596 size_t mbDone = mbLen - mbLeft;
597 size_t ucDone = length - ucLeft;
598 size_t newLen = mbLen;
599 if (ucDone) {
600 // assume that ucLeft/mbLeft is an approximation of ucDone/mbDone
601 newLen = mbDone + (ucLeft * mbDone) / ucDone;
602 }
603 if (newLen == mbLen) {
604 // could not process a single Unicode char, double the size
605 mbLen *= 2;
606 } else {
607 mbLen = newLen;
608 }
609 mbBuf.resize(mbLen);
610 mbPtr = mbBuf.data() + mbDone;
611 mbLeft = mbLen - mbDone;
612 } else if (rc == ULS_ILLEGALSEQUENCE && state) {
613 // buffer ends in a surrogate
614 Q_ASSERT(ucLeft == 2);
615 state->state_data[0] = *ucPtr;
616 state->remainingChars = 1;
617 break;
618 } else if (rc != ULS_SUCCESS) {
619 // just fail on an unexpected error (will return what we've got)
620 qWarning("QOs2LocalCodec::convertFromUnicode: UniUconvFromUcs failed "
621 "with %d", rc);
622 break;
623 }
624 }
625
626 mbBuf.resize(mbLen - mbLeft);
627 res = mbBuf;
628
629 if (state) {
630 // update the state
631 state->invalidChars = nonIdent;
632 }
633
634 return res;
635}
636
637QByteArray QOs2LocalCodec::name() const
638{
639 return "System";
640}
641
642int QOs2LocalCodec::mibEnum() const
643{
644 return 0;
645}
646
647#else
648
/*
    Tables mapping locale names to an 8-bit encoding, one table per
    encoding. Every table is terminated by a 0 entry.

    Locale names mostly copied from XFree86.
*/

// ISO 8859-2
static const char * const iso8859_2locales[] = {
    "croatian", "cs", "cs_CS", "cs_CZ", "cz", "cz_CZ",
    "czech", "hr", "hr_HR", "hu", "hu_HU", "hungarian",
    "pl", "pl_PL", "polish", "ro", "ro_RO", "rumanian",
    "serbocroatian", "sh", "sh_SP", "sh_YU", "sk", "sk_SK",
    "sl", "sl_CS", "sl_SI", "slovak", "slovene", "sr_SP",
    0
};

// ISO 8859-3
static const char * const iso8859_3locales[] = {
    "eo",
    0
};

// ISO 8859-4
static const char * const iso8859_4locales[] = {
    "ee", "ee_EE",
    0
};

// ISO 8859-5
static const char * const iso8859_5locales[] = {
    "mk", "mk_MK", "sp", "sp_YU",
    0
};

// CP 1251
static const char * const cp_1251locales[] = {
    "be", "be_BY", "bg", "bg_BG", "bulgarian",
    0
};

// PT 154
static const char * const pt_154locales[] = {
    "ba_RU", "ky", "ky_KG", "kk", "kk_KZ",
    0
};

// ISO 8859-6
static const char * const iso8859_6locales[] = {
    "ar_AA", "ar_SA", "arabic",
    0
};

// ISO 8859-7
static const char * const iso8859_7locales[] = {
    "el", "el_GR", "greek",
    0
};

// ISO 8859-8
static const char * const iso8859_8locales[] = {
    "hebrew", "he", "he_IL", "iw", "iw_IL",
    0
};

// ISO 8859-9
static const char * const iso8859_9locales[] = {
    "tr", "tr_TR", "turkish",
    0
};

// ISO 8859-13
static const char * const iso8859_13locales[] = {
    "lt", "lt_LT", "lv", "lv_LV",
    0
};

// ISO 8859-15
static const char * const iso8859_15locales[] = {
    "et", "et_EE",
    // Euro countries
    "br_FR", "ca_ES", "de", "de_AT", "de_BE", "de_DE", "de_LU", "en_IE",
    "es", "es_ES", "eu_ES", "fi", "fi_FI", "finnish", "fr", "fr_FR",
    "fr_BE", "fr_LU", "french", "ga_IE", "gl_ES", "it", "it_IT", "oc_FR",
    "nl", "nl_BE", "nl_NL", "pt", "pt_PT", "sv_FI", "wa_BE",
    0
};

// KOI8-U
static const char * const koi8_ulocales[] = {
    "uk", "uk_UA", "ru_UA", "ukrainian",
    0
};

// TIS-620
static const char * const tis_620locales[] = {
    "th", "th_TH", "thai",
    0
};
700
701// static const char * const tcvnlocales[] = {
702// "vi", "vi_VN", 0 };
703
704static bool try_locale_list(const char * const locale[], const QByteArray &lang)
705{
706 int i;
707 for(i=0; locale[i] && lang != locale[i]; i++)
708 ;
709 return locale[i] != 0;
710}
711
712// For the probably_koi8_locales we have to look. the standard says
713// these are 8859-5, but almost all Russian users use KOI8-R and
714// incorrectly set $LANG to ru_RU. We'll check tolower() to see what
715// it thinks ru_RU means.
716
717// If you read the history, it seems that many Russians blame ISO and
718// Perestroika for the confusion.
719//
720// The real bug is that some programs break if the user specifies
721// ru_RU.KOI8-R.
722
723static const char * const probably_koi8_rlocales[] = {
724 "ru", "ru_SU", "ru_RU", "russian", 0 };
725
726static QTextCodec * ru_RU_hack(const char * i) {
727#if defined(Q_OS_OS2)
728 // @todo temporary hack. the proper one is to use the current process'
729 // code page if LANG or its codepage part is missing
730 return QTextCodec::codecForName("cp866");
731#else
732 QTextCodec * ru_RU_codec = 0;
733
734#if !defined(QT_NO_SETLOCALE)
735 QByteArray origlocale(setlocale(LC_CTYPE, i));
736#else
737 QByteArray origlocale(i);
738#endif
739 // unicode koi8r latin5 name
740 // 0x044E 0xC0 0xEE CYRILLIC SMALL LETTER YU
741 // 0x042E 0xE0 0xCE CYRILLIC CAPITAL LETTER YU
742 int latin5 = tolower(0xCE);
743 int koi8r = tolower(0xE0);
744 if (koi8r == 0xC0 && latin5 != 0xEE) {
745 ru_RU_codec = QTextCodec::codecForName("KOI8-R");
746 } else if (koi8r != 0xC0 && latin5 == 0xEE) {
747 ru_RU_codec = QTextCodec::codecForName("ISO 8859-5");
748 } else {
749 // something else again... let's assume... *throws dice*
750 ru_RU_codec = QTextCodec::codecForName("KOI8-R");
751 qWarning("QTextCodec: Using KOI8-R, probe failed (%02x %02x %s)",
752 koi8r, latin5, i);
753 }
754#if !defined(QT_NO_SETLOCALE)
755 setlocale(LC_CTYPE, origlocale);
756#endif
757
758 return ru_RU_codec;
759#endif // defined(Q_OS_OS2)
760}
761
762#endif
763
764#if !defined(Q_OS_WIN32) && !defined(Q_OS_WINCE) && !defined(Q_OS_OS2)
765static QTextCodec *checkForCodec(const QByteArray &name) {
766 QTextCodec *c = QTextCodec::codecForName(name);
767 if (!c) {
768 const int index = name.indexOf('@');
769 if (index != -1) {
770 c = QTextCodec::codecForName(name.left(index));
771 }
772 }
773 return c;
774}
775#endif
776
777/* the next two functions are implicitely thread safe,
778 as they are only called by setup() which uses a mutex.
779*/
780static void setupLocaleMapper()
781{
782#if defined(Q_OS_WIN32) || defined(Q_OS_WINCE)
783 localeMapper = QTextCodec::codecForName("System");
784#elif defined(Q_OS_OS2)
785 localeMapper = QTextCodec::codecForName("System");
786#else
787
788#ifndef QT_NO_ICONV
789 localeMapper = QTextCodec::codecForName("System");
790#endif
791
792#if defined (_XOPEN_UNIX) && !defined(Q_OS_QNX) && !defined(Q_OS_OSF)
793 if (!localeMapper) {
794 char *charset = nl_langinfo (CODESET);
795 if (charset)
796 localeMapper = QTextCodec::codecForName(charset);
797 }
798#endif
799
800 if (!localeMapper) {
801 // Very poorly defined and followed standards causes lots of
802 // code to try to get all the cases... This logic is
803 // duplicated in QIconvCodec, so if you change it here, change
804 // it there too.
805
806 // Try to determine locale codeset from locale name assigned to
807 // LC_CTYPE category.
808
809 // First part is getting that locale name. First try setlocale() which
810 // definitely knows it, but since we cannot fully trust it, get ready
811 // to fall back to environment variables.
812#if !defined(QT_NO_SETLOCALE)
813 const QByteArray ctype = setlocale(LC_CTYPE, 0);
814#else
815 const QByteArray ctype;
816#endif
817
818 // Get the first nonempty value from $LC_ALL, $LC_CTYPE, and $LANG
819 // environment variables.
820 QByteArray lang = qgetenv("LC_ALL");
821 if (lang.isEmpty() || lang == "C") {
822 lang = qgetenv("LC_CTYPE");
823 }
824 if (lang.isEmpty() || lang == "C") {
825 lang = qgetenv("LANG");
826 }
827
828 // Now try these in order:
829 // 1. CODESET from ctype if it contains a .CODESET part (e.g. en_US.ISO8859-15)
830 // 2. CODESET from lang if it contains a .CODESET part
831 // 3. ctype (maybe the locale is named "ISO-8859-1" or something)
832 // 4. locale (ditto)
833 // 5. check for "@euro"
834 // 6. guess locale from ctype unless ctype is "C"
835 // 7. guess locale from lang
836
837 // 1. CODESET from ctype if it contains a .CODESET part (e.g. en_US.ISO8859-15)
838 int indexOfDot = ctype.indexOf('.');
839 if (indexOfDot != -1)
840 localeMapper = checkForCodec( ctype.mid(indexOfDot + 1) );
841
842 // 2. CODESET from lang if it contains a .CODESET part
843 if (!localeMapper) {
844 indexOfDot = lang.indexOf('.');
845 if (indexOfDot != -1)
846 localeMapper = checkForCodec( lang.mid(indexOfDot + 1) );
847 }
848
849 // 3. ctype (maybe the locale is named "ISO-8859-1" or something)
850 if (!localeMapper && !ctype.isEmpty() && ctype != "C")
851 localeMapper = checkForCodec(ctype);
852
853 // 4. locale (ditto)
854 if (!localeMapper && !lang.isEmpty())
855 localeMapper = checkForCodec(lang);
856
857 // 5. "@euro"
858 if ((!localeMapper && ctype.contains("@euro")) || lang.contains("@euro"))
859 localeMapper = checkForCodec("ISO 8859-15");
860
861 // 6. guess locale from ctype unless ctype is "C"
862 // 7. guess locale from lang
863 const QByteArray &try_by_name = (!ctype.isEmpty() && ctype != "C") ? lang : ctype;
864
865 // Now do the guessing.
866 if (!lang.isEmpty() && !localeMapper && !try_by_name.isEmpty()) {
867 if (try_locale_list(iso8859_15locales, lang))
868 localeMapper = QTextCodec::codecForName("ISO 8859-15");
869 else if (try_locale_list(iso8859_2locales, lang))
870 localeMapper = QTextCodec::codecForName("ISO 8859-2");
871 else if (try_locale_list(iso8859_3locales, lang))
872 localeMapper = QTextCodec::codecForName("ISO 8859-3");
873 else if (try_locale_list(iso8859_4locales, lang))
874 localeMapper = QTextCodec::codecForName("ISO 8859-4");
875 else if (try_locale_list(iso8859_5locales, lang))
876 localeMapper = QTextCodec::codecForName("ISO 8859-5");
877 else if (try_locale_list(iso8859_6locales, lang))
878 localeMapper = QTextCodec::codecForName("ISO 8859-6");
879 else if (try_locale_list(iso8859_7locales, lang))
880 localeMapper = QTextCodec::codecForName("ISO 8859-7");
881 else if (try_locale_list(iso8859_8locales, lang))
882 localeMapper = QTextCodec::codecForName("ISO 8859-8-I");
883 else if (try_locale_list(iso8859_9locales, lang))
884 localeMapper = QTextCodec::codecForName("ISO 8859-9");
885 else if (try_locale_list(iso8859_13locales, lang))
886 localeMapper = QTextCodec::codecForName("ISO 8859-13");
887 else if (try_locale_list(tis_620locales, lang))
888 localeMapper = QTextCodec::codecForName("ISO 8859-11");
889 else if (try_locale_list(koi8_ulocales, lang))
890 localeMapper = QTextCodec::codecForName("KOI8-U");
891 else if (try_locale_list(cp_1251locales, lang))
892 localeMapper = QTextCodec::codecForName("CP 1251");
893 else if (try_locale_list(pt_154locales, lang))
894 localeMapper = QTextCodec::codecForName("PT 154");
895 else if (try_locale_list(probably_koi8_rlocales, lang))
896 localeMapper = ru_RU_hack(lang);
897 }
898
899 }
900
901 // If everything failed, we default to 8859-1
902 // We could perhaps default to 8859-15.
903 if (!localeMapper)
904 localeMapper = QTextCodec::codecForName("ISO 8859-1");
905#endif
906}
907
908
909static void setup()
910{
911#ifndef QT_NO_THREAD
912 QMutexLocker locker(QMutexPool::globalInstanceGet(&all));
913#endif
914
915 if (all)
916 return;
917
918#ifdef Q_DEBUG_TEXTCODEC
919 if (destroying_is_ok)
920 qWarning("QTextCodec: Creating new codec during codec cleanup");
921#endif
922 all = new QList<QTextCodec*>;
923 // create the cleanup object to cleanup all codecs on exit
924 (void) createQTextCodecCleanup();
925
926#ifndef QT_NO_CODECS
927# if defined(Q_WS_X11) && !defined(QT_BOOTSTRAPPED)
928 // no font codecs when bootstrapping
929 (void)new QFontLaoCodec;
930# if defined(QT_NO_ICONV)
931 // no iconv(3) support, must build all codecs into the library
932 (void)new QFontGb2312Codec;
933 (void)new QFontGbkCodec;
934 (void)new QFontGb18030_0Codec;
935 (void)new QFontJis0208Codec;
936 (void)new QFontJis0201Codec;
937 (void)new QFontKsc5601Codec;
938 (void)new QFontBig5hkscsCodec;
939 (void)new QFontBig5Codec;
940# endif // QT_NO_ICONV && !QT_BOOTSTRAPPED
941# endif // Q_WS_X11
942
943 (void)new QTsciiCodec;
944
945 for (int i = 0; i < 9; ++i)
946 (void)new QIsciiCodec(i);
947
948
949# if defined(QT_NO_ICONV) && !defined(QT_BOOTSTRAPPED)
950 // no asian codecs when bootstrapping, sorry
951 (void)new QGb18030Codec;
952 (void)new QGbkCodec;
953 (void)new QGb2312Codec;
954 (void)new QEucJpCodec;
955 (void)new QJisCodec;
956 (void)new QSjisCodec;
957 (void)new QEucKrCodec;
958 (void)new QCP949Codec;
959 (void)new QBig5Codec;
960 (void)new QBig5hkscsCodec;
961# endif // QT_NO_ICONV && !QT_BOOTSTRAPPED
962#endif // QT_NO_CODECS
963
964#if defined(Q_OS_WIN32) || defined(Q_OS_WINCE)
965 (void) new QWindowsLocalCodec;
966#endif // Q_OS_WIN32
967
968#if defined(Q_OS_OS2)
969 (void) new QOs2LocalCodec;
970#endif // Q_OS_OS2
971
972 (void)new QUtf16Codec;
973 (void)new QUtf16BECodec;
974 (void)new QUtf16LECodec;
975 (void)new QUtf32Codec;
976 (void)new QUtf32BECodec;
977 (void)new QUtf32LECodec;
978 (void)new QLatin15Codec;
979 (void)new QLatin1Codec;
980 (void)new QUtf8Codec;
981
982 for (int i = 0; i < QSimpleTextCodec::numSimpleCodecs; ++i)
983 (void)new QSimpleTextCodec(i);
984
985#if defined(Q_OS_UNIX) && !defined(QT_NO_ICONV) && !defined(QT_BOOTSTRAPPED)
986 // QIconvCodec depends on the UTF-16 codec, so it needs to be created last
987 (void) new QIconvCodec();
988#endif
989
990 if (!localeMapper)
991 setupLocaleMapper();
992}
993
994/*!
995 \enum QTextCodec::ConversionFlag
996
997 \value DefaultConversion No flag is set.
998 \value ConvertInvalidToNull If this flag is set, each invalid input
999 character is output as a null character.
1000 \value IgnoreHeader Ignore any Unicode byte-order mark and don't generate any.
1001
1002 \omitvalue FreeFunction
1003*/
1004
1005/*!
1006 \fn QTextCodec::ConverterState::ConverterState(ConversionFlags flags)
1007
1008 Constructs a ConverterState object initialized with the given \a flags.
1009*/
1010
1011/*!
1012 Destroys the ConverterState object.
1013*/
1014QTextCodec::ConverterState::~ConverterState()
1015{
1016 if (flags & FreeFunction)
1017 (QTextCodecUnalignedPointer::decode(state_data))(this);
1018 else if (d)
1019 qFree(d);
1020}
1021
1022static bool codecForLocaleSet = false;
1023void qt_resetCodecForLocale()
1024{
1025 // if QTextCodec::codecForLocale() was called, we assume that the user has
1026 // explicitly set the codec he wants for the locale and don't attempt to
1027 // autodetect it again
1028 if (!codecForLocaleSet)
1029 setupLocaleMapper();
1030}
1031
1032/*!
1033 \class QTextCodec
1034 \brief The QTextCodec class provides conversions between text encodings.
1035 \reentrant
1036 \ingroup i18n
1037
1038 Qt uses Unicode to store, draw and manipulate strings. In many
1039 situations you may wish to deal with data that uses a different
1040 encoding. For example, most Japanese documents are still stored
1041 in Shift-JIS or ISO 2022-JP, while Russian users often have their
1042 documents in KOI8-R or Windows-1251.
1043
1044 Qt provides a set of QTextCodec classes to help with converting
1045 non-Unicode formats to and from Unicode. You can also create your
1046 own codec classes.
1047
1048 The supported encodings are:
1049
1050 \list
1051 \o Apple Roman
1052 \o \l{Big5 Text Codec}{Big5}
1053 \o \l{Big5-HKSCS Text Codec}{Big5-HKSCS}
1054 \o CP949
1055 \o \l{EUC-JP Text Codec}{EUC-JP}
1056 \o \l{EUC-KR Text Codec}{EUC-KR}
1057 \o \l{GBK Text Codec}{GB18030-0}
1058 \o IBM 850
1059 \o IBM 866
1060 \o IBM 874
1061 \o \l{ISO 2022-JP (JIS) Text Codec}{ISO 2022-JP}
1062 \o ISO 8859-1 to 10
1063 \o ISO 8859-13 to 16
1064 \o Iscii-Bng, Dev, Gjr, Knd, Mlm, Ori, Pnj, Tlg, and Tml
1065 \o JIS X 0201
1066 \o JIS X 0208
1067 \o KOI8-R
1068 \o KOI8-U
1069 \o MuleLao-1
1070 \o ROMAN8
1071 \o \l{Shift-JIS Text Codec}{Shift-JIS}
1072 \o TIS-620
1073 \o \l{TSCII Text Codec}{TSCII}
1074 \o UTF-8
1075 \o UTF-16
1076 \o UTF-16BE
1077 \o UTF-16LE
1078 \o UTF-32
1079 \o UTF-32BE
1080 \o UTF-32LE
1081 \o Windows-1250 to 1258
1082 \o WINSAMI2
1083 \endlist
1084
1085 QTextCodecs can be used as follows to convert some locally encoded
1086 string to Unicode. Suppose you have some string encoded in Russian
1087 KOI8-R encoding, and want to convert it to Unicode. The simple way
1088 to do it is like this:
1089
1090 \snippet doc/src/snippets/code/src_corelib_codecs_qtextcodec.cpp 0
1091
1092 After this, \c string holds the text converted to Unicode.
1093 Converting a string from Unicode to the local encoding is just as
1094 easy:
1095
1096 \snippet doc/src/snippets/code/src_corelib_codecs_qtextcodec.cpp 1
1097
1098 To read or write files in various encodings, use QTextStream and
1099 its \l{QTextStream::setCodec()}{setCodec()} function. See the
1100 \l{tools/codecs}{Codecs} example for an application of QTextCodec
1101 to file I/O.
1102
1103 Some care must be taken when trying to convert the data in chunks,
1104 for example, when receiving it over a network. In such cases it is
1105 possible that a multi-byte character will be split over two
1106 chunks. At best this might result in the loss of a character and
1107 at worst cause the entire conversion to fail.
1108
1109 The approach to use in these situations is to create a QTextDecoder
1110 object for the codec and use this QTextDecoder for the whole
1111 decoding process, as shown below:
1112
1113 \snippet doc/src/snippets/code/src_corelib_codecs_qtextcodec.cpp 2
1114
1115 The QTextDecoder object maintains state between chunks and therefore
1116 works correctly even if a multi-byte character is split between
1117 chunks.
1118
1119 \section1 Creating Your Own Codec Class
1120
1121 Support for new text encodings can be added to Qt by creating
1122 QTextCodec subclasses.
1123
1124 The pure virtual functions describe the encoder to the system and
1125 the coder is used as required in the different text file formats
1126 supported by QTextStream, and under X11, for the locale-specific
1127 character input and output.
1128
1129 To add support for another encoding to Qt, make a subclass of
1130 QTextCodec and implement the functions listed in the table below.
1131
1132 \table
1133 \header \o Function \o Description
1134
1135 \row \o name()
1136 \o Returns the official name for the encoding. If the
1137 encoding is listed in the
1138 \l{IANA character-sets encoding file}, the name
1139 should be the preferred MIME name for the encoding.
1140
1141 \row \o aliases()
1142 \o Returns a list of alternative names for the encoding.
1143 QTextCodec provides a default implementation that returns
1144 an empty list. For example, "ISO-8859-1" has "latin1",
1145 "CP819", "IBM819", and "iso-ir-100" as aliases.
1146
1147 \row \o mibEnum()
         \o Returns the MIB enum for the encoding if it is listed in
1149 the \l{IANA character-sets encoding file}.
1150
1151 \row \o convertToUnicode()
1152 \o Converts an 8-bit character string to Unicode.
1153
1154 \row \o convertFromUnicode()
1155 \o Converts a Unicode string to an 8-bit character string.
1156 \endtable
1157
1158 You may find it more convenient to make your codec class
1159 available as a plugin; see \l{How to Create Qt Plugins} for
1160 details.
1161
1162 \sa QTextStream, QTextDecoder, QTextEncoder, {Codecs Example}
1163*/
1164
1165/*!
1166 \nonreentrant
1167
1168 Constructs a QTextCodec, and gives it the highest precedence. The
1169 QTextCodec should always be constructed on the heap (i.e. with \c
1170 new). Qt takes ownership and will delete it when the application
1171 terminates.
1172*/
1173QTextCodec::QTextCodec()
1174{
1175 setup();
1176 all->prepend(this);
1177}
1178
1179
1180/*!
1181 \nonreentrant
1182
1183 Destroys the QTextCodec. Note that you should not delete codecs
1184 yourself: once created they become Qt's responsibility.
1185*/
1186QTextCodec::~QTextCodec()
1187{
1188#ifdef Q_DEBUG_TEXTCODEC
1189 if (!destroying_is_ok)
1190 qWarning("QTextCodec::~QTextCodec: Called by application");
1191#endif
1192 if (all)
1193 all->removeAll(this);
1194}
1195
1196/*!
1197 \fn QTextCodec *QTextCodec::codecForName(const char *name)
1198
1199 Searches all installed QTextCodec objects and returns the one
1200 which best matches \a name; the match is case-insensitive. Returns
1201 0 if no codec matching the name \a name could be found.
1202*/
1203
1204/*!
1205 Searches all installed QTextCodec objects and returns the one
1206 which best matches \a name; the match is case-insensitive. Returns
1207 0 if no codec matching the name \a name could be found.
1208*/
1209QTextCodec *QTextCodec::codecForName(const QByteArray &name)
1210{
1211 if (name.isEmpty())
1212 return 0;
1213
1214 setup();
1215
1216 for (int i = 0; i < all->size(); ++i) {
1217 QTextCodec *cursor = all->at(i);
1218 if (nameMatch(cursor->name(), name))
1219 return cursor;
1220 QList<QByteArray> aliases = cursor->aliases();
1221 for (int i = 0; i < aliases.size(); ++i)
1222 if (nameMatch(aliases.at(i), name))
1223 return cursor;
1224 }
1225
1226 return createForName(name);
1227}
1228
1229
1230/*!
1231 Returns the QTextCodec which matches the \link
1232 QTextCodec::mibEnum() MIBenum\endlink \a mib.
1233*/
1234QTextCodec* QTextCodec::codecForMib(int mib)
1235{
1236 setup();
1237
1238 // Qt 3 used 1000 (mib for UCS2) as its identifier for the utf16 codec. Map
1239 // this correctly for compatibility.
1240 if (mib == 1000)
1241 mib = 1015;
1242
1243 QList<QTextCodec*>::ConstIterator i;
1244 for (int i = 0; i < all->size(); ++i) {
1245 QTextCodec *cursor = all->at(i);
1246 if (cursor->mibEnum() == mib)
1247 return cursor;
1248 }
1249
1250 return createForMib(mib);
1251}
1252
1253/*!
1254 Returns the list of all available codecs, by name. Call
1255 QTextCodec::codecForName() to obtain the QTextCodec for the name.
1256
1257 The list may contain many mentions of the same codec
1258 if the codec has aliases.
1259
1260 \sa availableMibs(), name(), aliases()
1261*/
1262QList<QByteArray> QTextCodec::availableCodecs()
1263{
1264 setup();
1265
1266 QList<QByteArray> codecs;
1267 for (int i = 0; i < all->size(); ++i) {
1268 codecs += all->at(i)->name();
1269 codecs += all->at(i)->aliases();
1270 }
1271#ifndef QT_NO_TEXTCODECPLUGIN
1272 QFactoryLoader *l = loader();
1273 QStringList keys = l->keys();
1274 for (int i = 0; i < keys.size(); ++i) {
1275 if (!keys.at(i).startsWith(QLatin1String("MIB: "))) {
1276 QByteArray name = keys.at(i).toLatin1();
1277 if (!codecs.contains(name))
1278 codecs += name;
1279 }
1280 }
1281#endif
1282
1283 return codecs;
1284}
1285
1286/*!
1287 Returns the list of MIBs for all available codecs. Call
1288 QTextCodec::codecForMib() to obtain the QTextCodec for the MIB.
1289
1290 \sa availableCodecs(), mibEnum()
1291*/
1292QList<int> QTextCodec::availableMibs()
1293{
1294 setup();
1295
1296 QList<int> codecs;
1297 for (int i = 0; i < all->size(); ++i)
1298 codecs += all->at(i)->mibEnum();
1299#ifndef QT_NO_TEXTCODECPLUGIN
1300 QFactoryLoader *l = loader();
1301 QStringList keys = l->keys();
1302 for (int i = 0; i < keys.size(); ++i) {
1303 if (keys.at(i).startsWith(QLatin1String("MIB: "))) {
1304 int mib = keys.at(i).mid(5).toInt();
1305 if (!codecs.contains(mib))
1306 codecs += mib;
1307 }
1308 }
1309#endif
1310
1311 return codecs;
1312}
1313
1314/*!
1315 Set the codec to \a c; this will be returned by
1316 codecForLocale(). If \a c is a null pointer, the codec is reset to
1317 the default.
1318
1319 This might be needed for some applications that want to use their
1320 own mechanism for setting the locale.
1321
1322 \sa codecForLocale()
1323*/
1324void QTextCodec::setCodecForLocale(QTextCodec *c)
1325{
1326 codecForLocaleSet = true;
1327 localeMapper = c;
1328 if (!localeMapper)
1329 setupLocaleMapper();
1330}
1331
1332/*!
1333 Returns a pointer to the codec most suitable for this locale.
1334
1335 On Windows, the codec will be based on a system locale. On Unix
1336 systems, starting with Qt 4.2, the codec will be using the \e
1337 iconv library. Note that in both cases the codec's name will be
1338 "System".
1339*/
1340
1341QTextCodec* QTextCodec::codecForLocale()
1342{
1343 if (localeMapper)
1344 return localeMapper;
1345
1346 setup();
1347
1348 return localeMapper;
1349}
1350
1351
1352/*!
1353 \fn QByteArray QTextCodec::name() const
1354
1355 QTextCodec subclasses must reimplement this function. It returns
1356 the name of the encoding supported by the subclass.
1357
1358 If the codec is registered as a character set in the
1359 \l{IANA character-sets encoding file} this method should
1360 return the preferred mime name for the codec if defined,
1361 otherwise its name.
1362*/
1363
1364/*!
1365 \fn int QTextCodec::mibEnum() const
1366
1367 Subclasses of QTextCodec must reimplement this function. It
1368 returns the MIBenum (see \l{IANA character-sets encoding file}
1369 for more information). It is important that each QTextCodec
1370 subclass returns the correct unique value for this function.
1371*/
1372
1373/*!
1374 Subclasses can return a number of aliases for the codec in question.
1375
1376 Standard aliases for codecs can be found in the
1377 \l{IANA character-sets encoding file}.
1378*/
1379QList<QByteArray> QTextCodec::aliases() const
1380{
1381 return QList<QByteArray>();
1382}
1383
1384/*!
1385 \fn QString QTextCodec::convertToUnicode(const char *chars, int len,
1386 ConverterState *state) const
1387
1388 QTextCodec subclasses must reimplement this function.
1389
1390 Converts the first \a len characters of \a chars from the
1391 encoding of the subclass to Unicode, and returns the result in a
1392 QString.
1393
1394 \a state can be 0, in which case the conversion is stateless and
1395 default conversion rules should be used. If state is not 0, the
1396 codec should save the state after the conversion in \a state, and
1397 adjust the remainingChars and invalidChars members of the struct.
1398*/
1399
1400/*!
1401 \fn QByteArray QTextCodec::convertFromUnicode(const QChar *input, int number,
1402 ConverterState *state) const
1403
1404 QTextCodec subclasses must reimplement this function.
1405
1406 Converts the first \a number of characters from the \a input array
1407 from Unicode to the encoding of the subclass, and returns the result
1408 in a QByteArray.
1409
1410 \a state can be 0 in which case the conversion is stateless and
1411 default conversion rules should be used. If state is not 0, the
1412 codec should save the state after the conversion in \a state, and
1413 adjust the remainingChars and invalidChars members of the struct.
1414*/
1415
1416/*!
1417 Creates a QTextDecoder which stores enough state to decode chunks
1418 of \c{char *} data to create chunks of Unicode data.
1419
1420 The caller is responsible for deleting the returned object.
1421*/
1422QTextDecoder* QTextCodec::makeDecoder() const
1423{
1424 return new QTextDecoder(this);
1425}
1426
1427
1428/*!
1429 Creates a QTextEncoder which stores enough state to encode chunks
1430 of Unicode data as \c{char *} data.
1431
1432 The caller is responsible for deleting the returned object.
1433*/
1434QTextEncoder* QTextCodec::makeEncoder() const
1435{
1436 return new QTextEncoder(this);
1437}
1438
1439/*!
1440 \fn QByteArray QTextCodec::fromUnicode(const QChar *input, int number,
1441 ConverterState *state) const
1442
1443 Converts the first \a number of characters from the \a input array
1444 from Unicode to the encoding of this codec, and returns the result
1445 in a QByteArray.
1446
1447 The \a state of the convertor used is updated.
1448*/
1449
1450/*!
1451 Converts \a str from Unicode to the encoding of this codec, and
1452 returns the result in a QByteArray.
1453*/
1454QByteArray QTextCodec::fromUnicode(const QString& str) const
1455{
1456 return convertFromUnicode(str.constData(), str.length(), 0);
1457}
1458
1459/*!
1460 \fn QString QTextCodec::toUnicode(const char *input, int size,
1461 ConverterState *state) const
1462
1463 Converts the first \a size characters from the \a input from the
1464 encoding of this codec to Unicode, and returns the result in a
1465 QString.
1466
1467 The \a state of the convertor used is updated.
1468*/
1469
1470/*!
1471 Converts \a a from the encoding of this codec to Unicode, and
1472 returns the result in a QString.
1473*/
1474QString QTextCodec::toUnicode(const QByteArray& a) const
1475{
1476 return convertToUnicode(a.constData(), a.length(), 0);
1477}
1478
1479/*!
1480 Returns true if the Unicode character \a ch can be fully encoded
1481 with this codec; otherwise returns false.
1482*/
1483bool QTextCodec::canEncode(QChar ch) const
1484{
1485 ConverterState state;
1486 state.flags = ConvertInvalidToNull;
1487 convertFromUnicode(&ch, 1, &state);
1488 return (state.invalidChars == 0);
1489}
1490
1491/*!
1492 \overload
1493
1494 \a s contains the string being tested for encode-ability.
1495*/
1496bool QTextCodec::canEncode(const QString& s) const
1497{
1498 ConverterState state;
1499 state.flags = ConvertInvalidToNull;
1500 convertFromUnicode(s.constData(), s.length(), &state);
1501 return (state.invalidChars == 0);
1502}
1503
1504#ifdef QT3_SUPPORT
1505/*!
1506 Returns a string representing the current language and
1507 sublanguage, e.g. "pt" for Portuguese, or "pt_br" for Portuguese/Brazil.
1508
1509 \sa QLocale
1510*/
1511const char *QTextCodec::locale()
1512{
1513 static char locale[6];
1514 QByteArray l = QLocale::system().name().toLatin1();
1515 int len = qMin(l.length(), 5);
1516 memcpy(locale, l.constData(), len);
1517 locale[len] = '\0';
1518
1519 return locale;
1520}
1521
1522/*!
1523 \overload
1524*/
1525
1526QByteArray QTextCodec::fromUnicode(const QString& uc, int& lenInOut) const
1527{
1528 QByteArray result = convertFromUnicode(uc.constData(), lenInOut, 0);
1529 lenInOut = result.length();
1530 return result;
1531}
1532
1533/*!
1534 \overload
1535
1536 \a a contains the source characters; \a len contains the number of
1537 characters in \a a to use.
1538*/
1539QString QTextCodec::toUnicode(const QByteArray& a, int len) const
1540{
1541 len = qMin(a.size(), len);
1542 return convertToUnicode(a.constData(), len, 0);
1543}
1544#endif
1545
1546/*!
1547 \overload
1548
1549 \a chars contains the source characters.
1550*/
1551QString QTextCodec::toUnicode(const char *chars) const
1552{
1553 int len = qstrlen(chars);
1554 return convertToUnicode(chars, len, 0);
1555}
1556
1557
1558/*!
1559 \class QTextEncoder
1560 \brief The QTextEncoder class provides a state-based encoder.
1561 \reentrant
1562 \ingroup i18n
1563
1564 A text encoder converts text from Unicode into an encoded text format
1565 using a specific codec.
1566
1567 The encoder converts Unicode into another format, remembering any
1568 state that is required between calls.
1569
1570 \sa QTextCodec::makeEncoder(), QTextDecoder
1571*/
1572
1573/*!
1574 \fn QTextEncoder::QTextEncoder(const QTextCodec *codec)
1575
1576 Constructs a text encoder for the given \a codec.
1577*/
1578
1579/*!
1580 Destroys the encoder.
1581*/
1582QTextEncoder::~QTextEncoder()
1583{
1584}
1585
1586/*! \internal
1587 \since 4.5
    Determines whether the encoder encountered a failure while encoding the input. If
    an error was encountered, the produced result is undefined, and gets converted according
    to the conversion flags.
1591 */
1592bool QTextEncoder::hasFailure() const
1593{
1594 return state.invalidChars != 0;
1595}
1596
1597/*!
1598 Converts the Unicode string \a str into an encoded QByteArray.
1599*/
1600QByteArray QTextEncoder::fromUnicode(const QString& str)
1601{
1602 QByteArray result = c->fromUnicode(str.constData(), str.length(), &state);
1603 return result;
1604}
1605
1606/*!
1607 \overload
1608
1609 Converts \a len characters (not bytes) from \a uc, and returns the
1610 result in a QByteArray.
1611*/
1612QByteArray QTextEncoder::fromUnicode(const QChar *uc, int len)
1613{
1614 QByteArray result = c->fromUnicode(uc, len, &state);
1615 return result;
1616}
1617
1618#ifdef QT3_SUPPORT
1619/*!
1620 \overload
1621
    Converts \a lenInOut characters (not bytes) from \a uc, and returns the
    result in a QByteArray. On return, \a lenInOut is set to the length of
    the resulting QByteArray.
1625*/
1626QByteArray QTextEncoder::fromUnicode(const QString& uc, int& lenInOut)
1627{
1628 QByteArray result = c->fromUnicode(uc.constData(), lenInOut, &state);
1629 lenInOut = result.length();
1630 return result;
1631}
1632#endif
1633
1634/*!
1635 \class QTextDecoder
1636 \brief The QTextDecoder class provides a state-based decoder.
1637 \reentrant
1638 \ingroup i18n
1639
1640 A text decoder converts text from an encoded text format into Unicode
1641 using a specific codec.
1642
1643 The decoder converts text in this format into Unicode, remembering any
1644 state that is required between calls.
1645
1646 \sa QTextCodec::makeDecoder(), QTextEncoder
1647*/
1648
1649/*!
1650 \fn QTextDecoder::QTextDecoder(const QTextCodec *codec)
1651
1652 Constructs a text decoder for the given \a codec.
1653*/
1654
1655/*!
1656 Destroys the decoder.
1657*/
1658QTextDecoder::~QTextDecoder()
1659{
1660}
1661
1662/*!
1663 \fn QString QTextDecoder::toUnicode(const char *chars, int len)
1664
1665 Converts the first \a len bytes in \a chars to Unicode, returning
1666 the result.
1667
1668 If not all characters are used (e.g. if only part of a multi-byte
1669 encoding is at the end of the characters), the decoder remembers
1670 enough state to continue with the next call to this function.
1671*/
1672QString QTextDecoder::toUnicode(const char *chars, int len)
1673{
1674 return c->toUnicode(chars, len, &state);
1675}
1676
1677
1678/*! \overload
1679
1680 The converted string is returned in \a target.
1681 */
1682void QTextDecoder::toUnicode(QString *target, const char *chars, int len)
1683{
1684 Q_ASSERT(target);
1685 switch (c->mibEnum()) {
1686 case 106: // utf8
1687 static_cast<const QUtf8Codec*>(c)->convertToUnicode(target, chars, len, &state);
1688 break;
1689 case 4: { // latin1
1690 target->resize(len);
1691 ushort *data = (ushort*)target->data();
1692 for (int i = len; i >=0; --i)
1693 data[i] = (uchar) chars[i];
1694 } break;
1695 default:
1696 *target = c->toUnicode(chars, len, &state);
1697 }
1698}
1699
1700
1701/*!
1702 \overload
1703
1704 Converts the bytes in the byte array specified by \a ba to Unicode
1705 and returns the result.
1706*/
1707QString QTextDecoder::toUnicode(const QByteArray &ba)
1708{
1709 return c->toUnicode(ba.constData(), ba.length(), &state);
1710}
1711
1712
1713/*!
1714 \fn QTextCodec* QTextCodec::codecForTr()
1715
1716 Returns the codec used by QObject::tr() on its argument. If this
1717 function returns 0 (the default), tr() assumes Latin-1.
1718
1719 \sa setCodecForTr()
1720*/
1721
1722/*!
1723 \fn void QTextCodec::setCodecForTr(QTextCodec *c)
1724 \nonreentrant
1725
1726 Sets the codec used by QObject::tr() on its argument to \a c. If
1727 \a c is 0 (the default), tr() assumes Latin-1.
1728
1729 If the literal quoted text in the program is not in the Latin-1
1730 encoding, this function can be used to set the appropriate
1731 encoding. For example, software developed by Korean programmers
1732 might use eucKR for all the text in the program, in which case the
1733 main() function might look like this:
1734
1735 \snippet doc/src/snippets/code/src_corelib_codecs_qtextcodec.cpp 3
1736
1737 Note that this is not the way to select the encoding that the \e
1738 user has chosen. For example, to convert an application containing
1739 literal English strings to Korean, all that is needed is for the
1740 English strings to be passed through tr() and for translation
1741 files to be loaded. For details of internationalization, see
1742 \l{Internationalization with Qt}.
1743
1744 \sa codecForTr(), setCodecForCStrings()
1745*/
1746
1747
1748/*!
1749 \fn QTextCodec* QTextCodec::codecForCStrings()
1750
1751 Returns the codec used by QString to convert to and from \c{const
1752 char *} and QByteArrays. If this function returns 0 (the default),
1753 QString assumes Latin-1.
1754
1755 \sa setCodecForCStrings()
1756*/
1757
1758/*!
1759 \fn void QTextCodec::setCodecForCStrings(QTextCodec *codec)
1760 \nonreentrant
1761
1762 Sets the codec used by QString to convert to and from \c{const
1763 char *} and QByteArrays. If the \a codec is 0 (the default),
1764 QString assumes Latin-1.
1765
    \warning Some codecs do not preserve the characters in the ASCII
    range (0x00 to 0x7F). For example, the Japanese Shift-JIS
    encoding maps the backslash character (0x5C) to the Yen
    character. To avoid undesirable side-effects, we recommend
    avoiding such codecs with setCodecForCStrings().
1771
1772 \sa codecForCStrings(), setCodecForTr()
1773*/
1774
1775/*!
1776 \since 4.4
1777
1778 Tries to detect the encoding of the provided snippet of HTML in
1779 the given byte array, \a ba, by checking the BOM (Byte Order Mark)
1780 and the content-type meta header and returns a QTextCodec instance
1781 that is capable of decoding the html to unicode. If the codec
1782 cannot be detected from the content provided, \a defaultCodec is
1783 returned.
1784
1785 \sa codecForUtfText()
1786*/
1787QTextCodec *QTextCodec::codecForHtml(const QByteArray &ba, QTextCodec *defaultCodec)
1788{
1789 // determine charset
1790 int pos;
1791 QTextCodec *c = 0;
1792
1793 c = QTextCodec::codecForUtfText(ba, c);
1794 if (!c) {
1795 QByteArray header = ba.left(512).toLower();
1796 if ((pos = header.indexOf("http-equiv=")) != -1) {
1797 if ((pos = header.lastIndexOf("meta ", pos)) != -1) {
1798 pos = header.indexOf("charset=", pos) + int(strlen("charset="));
1799 if (pos != -1) {
1800 int pos2 = header.indexOf('\"', pos+1);
1801 QByteArray cs = header.mid(pos, pos2-pos);
1802 // qDebug("found charset: %s", cs.data());
1803 c = QTextCodec::codecForName(cs);
1804 }
1805 }
1806 }
1807 }
1808 if (!c)
1809 c = defaultCodec;
1810
1811 return c;
1812}
1813
1814/*!
1815 \overload
1816
1817 Tries to detect the encoding of the provided snippet of HTML in
1818 the given byte array, \a ba, by checking the BOM (Byte Order Mark)
1819 and the content-type meta header and returns a QTextCodec instance
1820 that is capable of decoding the html to unicode. If the codec cannot
1821 be detected, this overload returns a Latin-1 QTextCodec.
1822*/
1823QTextCodec *QTextCodec::codecForHtml(const QByteArray &ba)
1824{
1825 return codecForHtml(ba, QTextCodec::codecForMib(/*Latin 1*/ 4));
1826}
1827
1828/*!
1829 \since 4.6
1830
1831 Tries to detect the encoding of the provided snippet \a ba by
1832 using the BOM (Byte Order Mark) and returns a QTextCodec instance
1833 that is capable of decoding the text to unicode. If the codec
1834 cannot be detected from the content provided, \a defaultCodec is
1835 returned.
1836
1837 \sa codecForHtml()
1838*/
1839QTextCodec *QTextCodec::codecForUtfText(const QByteArray &ba, QTextCodec *defaultCodec)
1840{
1841 const int arraySize = ba.size();
1842
1843 if (arraySize > 3) {
1844 if ((uchar)ba[0] == 0x00
1845 && (uchar)ba[1] == 0x00
1846 && (uchar)ba[2] == 0xFE
1847 && (uchar)ba[3] == 0xFF)
1848 return QTextCodec::codecForMib(1018); // utf-32 be
1849 else if ((uchar)ba[0] == 0xFF
1850 && (uchar)ba[1] == 0xFE
1851 && (uchar)ba[2] == 0x00
1852 && (uchar)ba[3] == 0x00)
1853 return QTextCodec::codecForMib(1019); // utf-32 le
1854 }
1855
1856 if (arraySize < 2)
1857 return defaultCodec;
1858 if ((uchar)ba[0] == 0xfe && (uchar)ba[1] == 0xff)
1859 return QTextCodec::codecForMib(1013); // utf16 be
1860 else if ((uchar)ba[0] == 0xff && (uchar)ba[1] == 0xfe)
1861 return QTextCodec::codecForMib(1014); // utf16 le
1862
1863 if (arraySize < 3)
1864 return defaultCodec;
1865 if ((uchar)ba[0] == 0xef
1866 && (uchar)ba[1] == 0xbb
1867 && (uchar)ba[2] == 0xbf)
1868 return QTextCodec::codecForMib(106); // utf-8
1869
1870 return defaultCodec;
1871}
1872
1873/*!
1874 \overload
1875
1876 Tries to detect the encoding of the provided snippet \a ba by
1877 using the BOM (Byte Order Mark) and returns a QTextCodec instance
1878 that is capable of decoding the text to unicode. If the codec
1879 cannot be detected, this overload returns a Latin-1 QTextCodec.
1880
1881 \sa codecForHtml()
1882*/
1883QTextCodec *QTextCodec::codecForUtfText(const QByteArray &ba)
1884{
1885 return codecForUtfText(ba, QTextCodec::codecForMib(/*Latin 1*/ 4));
1886}
1887
1888
1889/*! \internal
1890 \since 4.3
    Determines whether the decoder encountered a failure while decoding the input. If
    an error was encountered, the produced result is undefined, and gets converted according
    to the conversion flags.
1894 */
1895bool QTextDecoder::hasFailure() const
1896{
1897 return state.invalidChars != 0;
1898}
1899
1900/*!
1901 \fn QTextCodec *QTextCodec::codecForContent(const char *str, int size)
1902
1903 This functionality is no longer provided by Qt. This
1904 compatibility function always returns a null pointer.
1905*/
1906
1907/*!
1908 \fn QTextCodec *QTextCodec::codecForName(const char *hint, int accuracy)
1909
1910 Use the codecForName(const QByteArray &) overload instead.
1911*/
1912
1913/*!
1914 \fn QTextCodec *QTextCodec::codecForIndex(int i)
1915
1916 Use availableCodecs() or availableMibs() instead and iterate
1917 through the resulting list.
1918*/
1919
1920
1921/*!
1922 \fn QByteArray QTextCodec::mimeName() const
1923
1924 Use name() instead.
1925*/
1926
1927QT_END_NAMESPACE
1928
1929#endif // QT_NO_TEXTCODEC
Note: See TracBrowser for help on using the repository browser.