source: trunk/src/corelib/codecs/qtextcodec.cpp@ 168

Last change on this file since 168 was 135, checked in by Dmitry A. Kuminov, 16 years ago

core: Added a hack to use cp866 on OS/2 if LANG is ru_RU.

File size: 46.6 KB
Line 
1/****************************************************************************
2**
3** Copyright (C) 2009 Nokia Corporation and/or its subsidiary(-ies).
4** Contact: Qt Software Information ([email protected])
5**
6** This file is part of the QtCore module of the Qt Toolkit.
7**
8** $QT_BEGIN_LICENSE:LGPL$
9** Commercial Usage
10** Licensees holding valid Qt Commercial licenses may use this file in
11** accordance with the Qt Commercial License Agreement provided with the
12** Software or, alternatively, in accordance with the terms contained in
13** a written agreement between you and Nokia.
14**
15** GNU Lesser General Public License Usage
16** Alternatively, this file may be used under the terms of the GNU Lesser
17** General Public License version 2.1 as published by the Free Software
18** Foundation and appearing in the file LICENSE.LGPL included in the
19** packaging of this file. Please review the following information to
20** ensure the GNU Lesser General Public License version 2.1 requirements
21** will be met: http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html.
22**
23** In addition, as a special exception, Nokia gives you certain
24** additional rights. These rights are described in the Nokia Qt LGPL
25** Exception version 1.0, included in the file LGPL_EXCEPTION.txt in this
26** package.
27**
28** GNU General Public License Usage
29** Alternatively, this file may be used under the terms of the GNU
30** General Public License version 3.0 as published by the Free Software
31** Foundation and appearing in the file LICENSE.GPL included in the
32** packaging of this file. Please review the following information to
33** ensure the GNU General Public License version 3.0 requirements will be
34** met: http://www.gnu.org/copyleft/gpl.html.
35**
36** If you are unsure which license is appropriate for your use, please
37** contact the sales department at [email protected].
38** $QT_END_LICENSE$
39**
40****************************************************************************/
41
42#include "qplatformdefs.h"
43#include "qtextcodec.h"
44#include "qtextcodec_p.h"
45
46#ifndef QT_NO_TEXTCODEC
47
48#include "qlist.h"
49#include "qfile.h"
50#ifndef QT_NO_LIBRARY
51# include "qcoreapplication.h"
52# include "qtextcodecplugin.h"
53# include "private/qfactoryloader_p.h"
54#endif
55#include "qstringlist.h"
56
57#ifdef Q_OS_UNIX
58# include "qiconvcodec_p.h"
59#endif
60
61#include "qutfcodec_p.h"
62#include "qsimplecodec_p.h"
63#include "qlatincodec_p.h"
64#ifndef QT_NO_CODECS
65# include "qtsciicodec_p.h"
66# include "qisciicodec_p.h"
67# if defined(QT_NO_ICONV) && !defined(QT_BOOTSTRAPPED)
68// no iconv(3) support, must build all codecs into the library
69# include "../../plugins/codecs/cn/qgb18030codec.h"
70# include "../../plugins/codecs/jp/qeucjpcodec.h"
71# include "../../plugins/codecs/jp/qjiscodec.h"
72# include "../../plugins/codecs/jp/qsjiscodec.h"
73# include "../../plugins/codecs/kr/qeuckrcodec.h"
74# include "../../plugins/codecs/tw/qbig5codec.h"
75# endif // QT_NO_ICONV
76# if defined(Q_WS_X11) && !defined(QT_BOOTSTRAPPED)
77# include "qfontlaocodec_p.h"
78# include "../../plugins/codecs/jp/qfontjpcodec.h"
79# endif
80#endif // QT_NO_CODECS
81#include "qlocale.h"
82#include "private/qmutexpool_p.h"
83
84#include <stdlib.h>
85#include <ctype.h>
86#include <locale.h>
87#if defined (_XOPEN_UNIX) && !defined(Q_OS_QNX6) && !defined(Q_OS_OSF)
88#include <langinfo.h>
89#endif
90
91#if defined(Q_OS_WINCE)
92# define QT_NO_SETLOCALE
93#endif
94
95QT_BEGIN_NAMESPACE
96
97#ifndef QT_NO_TEXTCODECPLUGIN
98Q_GLOBAL_STATIC_WITH_ARGS(QFactoryLoader, loader,
99 (QTextCodecFactoryInterface_iid, QLatin1String("/codecs")))
100#endif
101
102
103static bool nameMatch(const QByteArray &name, const QByteArray &test)
104{
105 // if they're the same, return a perfect score
106 if (qstricmp(name, test) == 0)
107 return true;
108
109 const char *n = name.constData();
110 const char *h = test.constData();
111
112 // if the letters and numbers are the same, we have a match
113 while (*n != '\0') {
114 if (isalnum((uchar)*n)) {
115 for (;;) {
116 if (*h == '\0')
117 return false;
118 if (isalnum((uchar)*h))
119 break;
120 ++h;
121 }
122 if (tolower((uchar)*n) != tolower((uchar)*h))
123 return false;
124 ++h;
125 }
126 ++n;
127 }
128 while (*h && !isalnum((uchar)*h))
129 ++h;
130 return (*h == '\0');
131}
132
133
134static QTextCodec *createForName(const QByteArray &name)
135{
136#ifndef QT_NO_TEXTCODECPLUGIN
137 QFactoryLoader *l = loader();
138 QStringList keys = l->keys();
139 for (int i = 0; i < keys.size(); ++i) {
140 if (nameMatch(name, keys.at(i).toLatin1())) {
141 QString realName = keys.at(i);
142 if (QTextCodecFactoryInterface *factory
143 = qobject_cast<QTextCodecFactoryInterface*>(l->instance(realName))) {
144 return factory->create(realName);
145 }
146 }
147 }
148#else
149 Q_UNUSED(name);
150#endif
151 return 0;
152}
153
154static QTextCodec *createForMib(int mib)
155{
156#ifndef QT_NO_TEXTCODECPLUGIN
157 QString name = QLatin1String("MIB: ") + QString::number(mib);
158 if (QTextCodecFactoryInterface *factory
159 = qobject_cast<QTextCodecFactoryInterface*>(loader()->instance(name)))
160 return factory->create(name);
161#else
162 Q_UNUSED(mib);
163#endif
164 return 0;
165}
166
167static QList<QTextCodec*> *all = 0;
168static bool destroying_is_ok = false;
169
170static QTextCodec *localeMapper = 0;
171QTextCodec *QTextCodec::cftr = 0;
172
173
174class QTextCodecCleanup
175{
176public:
177 ~QTextCodecCleanup();
178};
179
180/*
181 Deletes all the created codecs. This destructor is called just
182 before exiting to delete any QTextCodec objects that may be lying
183 around.
184*/
185QTextCodecCleanup::~QTextCodecCleanup()
186{
187 if (!all)
188 return;
189
190 destroying_is_ok = true;
191
192 while (all->size())
193 delete all->takeFirst();
194 delete all;
195 all = 0;
196 localeMapper = 0;
197
198 destroying_is_ok = false;
199}
200
201Q_GLOBAL_STATIC(QTextCodecCleanup, createQTextCodecCleanup)
202
203#if defined(Q_OS_WIN32) || defined(Q_OS_WINCE)
204class QWindowsLocalCodec: public QTextCodec
205{
206public:
207 QWindowsLocalCodec();
208 ~QWindowsLocalCodec();
209
210 QString convertToUnicode(const char *, int, ConverterState *) const;
211 QByteArray convertFromUnicode(const QChar *, int, ConverterState *) const;
212 QString convertToUnicodeCharByChar(const char *chars, int length, ConverterState *state) const;
213
214 QByteArray name() const;
215 int mibEnum() const;
216
217};
218
219QWindowsLocalCodec::QWindowsLocalCodec()
220{
221}
222
223QWindowsLocalCodec::~QWindowsLocalCodec()
224{
225}
226
227QString QWindowsLocalCodec::convertToUnicode(const char *chars, int length, ConverterState *state) const
228{
229 const char *mb = chars;
230 int mblen = length;
231
232 if (!mb || !mblen)
233 return QString();
234
235 const int wclen_auto = 4096;
236 WCHAR wc_auto[wclen_auto];
237 int wclen = wclen_auto;
238 WCHAR *wc = wc_auto;
239 int len;
240 QString sp;
241 bool prepend = false;
242 char state_data = 0;
243 int remainingChars = 0;
244
245 //save the current state information
246 if (state) {
247 state_data = (char)state->state_data[0];
248 remainingChars = state->remainingChars;
249 }
250
251 //convert the pending charcter (if available)
252 if (state && remainingChars) {
253 char prev[3] = {0};
254 prev[0] = state_data;
255 prev[1] = mb[0];
256 remainingChars = 0;
257 len = MultiByteToWideChar(CP_ACP, MB_PRECOMPOSED,
258 prev, 2, wc, wclen);
259 if (len) {
260 prepend = true;
261 sp.append(QChar(wc[0]));
262 mb++;
263 mblen--;
264 wc[0] = 0;
265 }
266 }
267
268 while (!(len=MultiByteToWideChar(CP_ACP, MB_PRECOMPOSED|MB_ERR_INVALID_CHARS,
269 mb, mblen, wc, wclen))) {
270 int r = GetLastError();
271 if (r == ERROR_INSUFFICIENT_BUFFER) {
272 if (wc != wc_auto) {
273 qWarning("MultiByteToWideChar: Size changed");
274 break;
275 } else {
276 wclen = MultiByteToWideChar(CP_ACP, MB_PRECOMPOSED,
277 mb, mblen, 0, 0);
278 wc = new WCHAR[wclen];
279 // and try again...
280 }
281 } else if (r == ERROR_NO_UNICODE_TRANSLATION) {
282 //find the last non NULL character
283 while (mblen > 1 && !(mb[mblen-1]))
284 mblen--;
285 //check whether, we hit an invalid character in the middle
286 if ((mblen <= 1) || (remainingChars && state_data))
287 return convertToUnicodeCharByChar(chars, length, state);
288 //Remove the last character and try again...
289 state_data = mb[mblen-1];
290 remainingChars = 1;
291 mblen--;
292 } else {
293 // Fail.
294 qWarning("MultiByteToWideChar: Cannot convert multibyte text");
295 break;
296 }
297 }
298 if (len <= 0)
299 return QString();
300 if (wc[len-1] == 0) // len - 1: we don't want terminator
301 --len;
302
303 //save the new state information
304 if (state) {
305 state->state_data[0] = (char)state_data;
306 state->remainingChars = remainingChars;
307 }
308 QString s((QChar*)wc, len);
309 if (wc != wc_auto)
310 delete [] wc;
311 if (prepend) {
312 return sp+s;
313 }
314 return s;
315}
316
317QString QWindowsLocalCodec::convertToUnicodeCharByChar(const char *chars, int length, ConverterState *state) const
318{
319 if (!chars || !length)
320 return QString();
321
322 int copyLocation = 0;
323 int extra = 2;
324 if (state && state->remainingChars) {
325 copyLocation = state->remainingChars;
326 extra += copyLocation;
327 }
328 int newLength = length + extra;
329 char *mbcs = new char[newLength];
330 //ensure that we have a NULL terminated string
331 mbcs[newLength-1] = 0;
332 mbcs[newLength-2] = 0;
333 memcpy(&(mbcs[copyLocation]), chars, length);
334 if (copyLocation) {
335 //copy the last character from the state
336 mbcs[0] = (char)state->state_data[0];
337 state->remainingChars = 0;
338 }
339 const char *mb = mbcs;
340#ifndef Q_OS_WINCE
341 const char *next = 0;
342 QString s;
343 while((next = CharNextExA(CP_ACP, mb, 0)) != mb) {
344 WCHAR wc[2] ={0};
345 int charlength = next - mb;
346 int len = MultiByteToWideChar(CP_ACP, MB_PRECOMPOSED|MB_ERR_INVALID_CHARS, mb, charlength, wc, 2);
347 if (len>0) {
348 s.append(QChar(wc[0]));
349 } else {
350 int r = GetLastError();
351 //check if the character being dropped is the last character
352 if (r == ERROR_NO_UNICODE_TRANSLATION && mb == (mbcs+newLength -3) && state) {
353 state->remainingChars = 1;
354 state->state_data[0] = (char)*mb;
355 }
356 }
357 mb = next;
358 }
359#else
360 QString s;
361 int size = mbstowcs(NULL, mb, length);
362 if (size < 0) {
363 Q_ASSERT("Error in CE TextCodec");
364 return QString();
365 }
366 wchar_t* ws = new wchar_t[size + 2];
367 ws[size +1] = 0;
368 ws[size] = 0;
369 size = mbstowcs(ws, mb, length);
370 for (int i=0; i< size; i++)
371 s.append(QChar(ws[i]));
372 delete [] ws;
373#endif
374 delete mbcs;
375 return s;
376}
377
378QByteArray QWindowsLocalCodec::convertFromUnicode(const QChar *uc, int len, ConverterState *) const
379{
380 return qt_winQString2MB(uc, len);
381}
382
383
384QByteArray QWindowsLocalCodec::name() const
385{
386 return "System";
387}
388
389int QWindowsLocalCodec::mibEnum() const
390{
391 return 0;
392}
393
394#else
395
/*
    Locale-name tables used to guess a codec from a locale name.
    Each table is terminated by a null pointer.
    Locale names mostly copied from XFree86.
*/
static const char * const iso8859_2locales[] = {
    "croatian", "cs", "cs_CS", "cs_CZ", "cz", "cz_CZ", "czech",
    "hr", "hr_HR",
    "hu", "hu_HU", "hungarian",
    "pl", "pl_PL", "polish",
    "ro", "ro_RO", "rumanian",
    "serbocroatian", "sh", "sh_SP", "sh_YU",
    "sk", "sk_SK",
    "sl", "sl_CS", "sl_SI", "slovak", "slovene",
    "sr_SP",
    0
};

static const char * const iso8859_3locales[] = { "eo", 0 };

static const char * const iso8859_4locales[] = { "ee", "ee_EE", 0 };

static const char * const iso8859_5locales[] = {
    "mk", "mk_MK",
    "sp", "sp_YU",
    0
};

static const char * const cp_1251locales[] = {
    "be", "be_BY",
    "bg", "bg_BG", "bulgarian",
    0
};

static const char * const pt_154locales[] = {
    "ba_RU",
    "ky", "ky_KG",
    "kk", "kk_KZ",
    0
};

static const char * const iso8859_6locales[] = { "ar_AA", "ar_SA", "arabic", 0 };

static const char * const iso8859_7locales[] = { "el", "el_GR", "greek", 0 };

static const char * const iso8859_8locales[] = {
    "hebrew",
    "he", "he_IL",
    "iw", "iw_IL",
    0
};

static const char * const iso8859_9locales[] = { "tr", "tr_TR", "turkish", 0 };

static const char * const iso8859_13locales[] = {
    "lt", "lt_LT",
    "lv", "lv_LV",
    0
};

static const char * const iso8859_15locales[] = {
    "et", "et_EE",
    // Euro countries
    "br_FR", "ca_ES",
    "de", "de_AT", "de_BE", "de_DE", "de_LU",
    "en_IE",
    "es", "es_ES", "eu_ES",
    "fi", "fi_FI", "finnish",
    "fr", "fr_FR", "fr_BE", "fr_LU", "french",
    "ga_IE", "gl_ES",
    "it", "it_IT",
    "oc_FR",
    "nl", "nl_BE", "nl_NL",
    "pt", "pt_PT",
    "sv_FI", "wa_BE",
    0
};

static const char * const koi8_ulocales[] = {
    "uk", "uk_UA", "ru_UA", "ukrainian",
    0
};

static const char * const tis_620locales[] = { "th", "th_TH", "thai", 0 };

// Disabled table:
// static const char * const tcvnlocales[] = {
//     "vi", "vi_VN", 0 };
450
/*
    Returns true if \a lang is one of the names in the null-terminated
    table \a locale. The scan also stops, and reports true, at an empty
    string entry.
*/
static bool try_locale_list(const char * const locale[], const char * lang)
{
    for (const char * const *entry = locale; *entry; ++entry) {
        if (**entry == '\0' || strcmp(*entry, lang) == 0)
            return true;
    }
    return false;
}
458
// The probably_koi8_rlocales need a closer look. According to the
// standard these locales use ISO 8859-5, but almost all Russian users
// use KOI8-R and incorrectly set $LANG to ru_RU. ru_RU_hack() therefore
// asks tolower() what it thinks ru_RU means.
//
// If you read the history, it seems that many Russians blame ISO and
// Perestroika for the confusion.
//
// The real bug is that some programs break if the user specifies
// ru_RU.KOI8-R.

static const char * const probably_koi8_rlocales[] = {
    "ru",
    "ru_SU",
    "ru_RU",
    "russian",
    0
};
472
473static QTextCodec * ru_RU_hack(const char * i) {
474#if defined(Q_OS_OS2)
475 // @todo temporary hack. the proper one is to use the current process'
476 // code page if LANG or its codepage part is missing
477 return QTextCodec::codecForName("cp866");
478#else
479 QTextCodec * ru_RU_codec = 0;
480
481#if !defined(QT_NO_SETLOCALE)
482 QByteArray origlocale(setlocale(LC_CTYPE, i));
483#else
484 QByteArray origlocale(i);
485#endif
486 // unicode koi8r latin5 name
487 // 0x044E 0xC0 0xEE CYRILLIC SMALL LETTER YU
488 // 0x042E 0xE0 0xCE CYRILLIC CAPITAL LETTER YU
489 int latin5 = tolower(0xCE);
490 int koi8r = tolower(0xE0);
491 if (koi8r == 0xC0 && latin5 != 0xEE) {
492 ru_RU_codec = QTextCodec::codecForName("KOI8-R");
493 } else if (koi8r != 0xC0 && latin5 == 0xEE) {
494 ru_RU_codec = QTextCodec::codecForName("ISO 8859-5");
495 } else {
496 // something else again... let's assume... *throws dice*
497 ru_RU_codec = QTextCodec::codecForName("KOI8-R");
498 qWarning("QTextCodec: Using KOI8-R, probe failed (%02x %02x %s)",
499 koi8r, latin5, i);
500 }
501#if !defined(QT_NO_SETLOCALE)
502 setlocale(LC_CTYPE, origlocale);
503#endif
504
505 return ru_RU_codec;
506#endif // defined(Q_OS_OS2)
507}
508
509#endif
510
511#if !defined(Q_OS_WIN32) && !defined(Q_OS_WINCE)
512static QTextCodec *checkForCodec(const char *name) {
513 QTextCodec *c = QTextCodec::codecForName(name);
514 if (!c) {
515 const char *at = strchr(name, '@');
516 if (at) {
517 QByteArray n(name, at - name);
518 c = QTextCodec::codecForName(n.data());
519 }
520 }
521 return c;
522}
523#endif
524
/* The next two functions are implicitly thread-safe,
   as they are only called by setup(), which uses a mutex.
*/
528static void setupLocaleMapper()
529{
530#if defined(Q_OS_WIN32) || defined(Q_OS_WINCE)
531 localeMapper = QTextCodec::codecForName("System");
532#else
533
534#ifndef QT_NO_ICONV
535 localeMapper = QTextCodec::codecForName("System");
536#endif
537
538#if defined (_XOPEN_UNIX) && !defined(Q_OS_QNX6) && !defined(Q_OS_OSF)
539 if (!localeMapper) {
540 char *charset = nl_langinfo (CODESET);
541 if (charset)
542 localeMapper = QTextCodec::codecForName(charset);
543 }
544#endif
545
546 if (!localeMapper) {
547 // Very poorly defined and followed standards causes lots of
548 // code to try to get all the cases... This logic is
549 // duplicated in QIconvCodec, so if you change it here, change
550 // it there too.
551
552 // Try to determine locale codeset from locale name assigned to
553 // LC_CTYPE category.
554
555 // First part is getting that locale name. First try setlocale() which
556 // definitely knows it, but since we cannot fully trust it, get ready
557 // to fall back to environment variables.
558#if !defined(QT_NO_SETLOCALE)
559 char * ctype = qstrdup(setlocale(LC_CTYPE, 0));
560#else
561 char * ctype = qstrdup("");
562#endif
563
564 // Get the first nonempty value from $LC_ALL, $LC_CTYPE, and $LANG
565 // environment variables.
566 char * lang = qstrdup(qgetenv("LC_ALL").constData());
567 if (!lang || lang[0] == 0 || strcmp(lang, "C") == 0) {
568 if (lang) delete [] lang;
569 lang = qstrdup(qgetenv("LC_CTYPE").constData());
570 }
571 if (!lang || lang[0] == 0 || strcmp(lang, "C") == 0) {
572 if (lang) delete [] lang;
573 lang = qstrdup(qgetenv("LANG").constData());
574 }
575
576 // Now try these in order:
577 // 1. CODESET from ctype if it contains a .CODESET part (e.g. en_US.ISO8859-15)
578 // 2. CODESET from lang if it contains a .CODESET part
579 // 3. ctype (maybe the locale is named "ISO-8859-1" or something)
580 // 4. locale (ditto)
581 // 5. check for "@euro"
582 // 6. guess locale from ctype unless ctype is "C"
583 // 7. guess locale from lang
584
585 // 1. CODESET from ctype if it contains a .CODESET part (e.g. en_US.ISO8859-15)
586 char * codeset = ctype ? strchr(ctype, '.') : 0;
587 if (codeset && *codeset == '.')
588 localeMapper = checkForCodec(codeset + 1);
589
590 // 2. CODESET from lang if it contains a .CODESET part
591 codeset = lang ? strchr(lang, '.') : 0;
592 if (!localeMapper && codeset && *codeset == '.')
593 localeMapper = checkForCodec(codeset + 1);
594
595 // 3. ctype (maybe the locale is named "ISO-8859-1" or something)
596 if (!localeMapper && ctype && *ctype != 0 && strcmp (ctype, "C") != 0)
597 localeMapper = checkForCodec(ctype);
598
599 // 4. locale (ditto)
600 if (!localeMapper && lang && *lang != 0)
601 localeMapper = checkForCodec(lang);
602
603 // 5. "@euro"
604 if ((!localeMapper && ctype && strstr(ctype, "@euro")) || (lang && strstr(lang, "@euro")))
605 localeMapper = checkForCodec("ISO 8859-15");
606
607 // 6. guess locale from ctype unless ctype is "C"
608 // 7. guess locale from lang
609 char * try_by_name = ctype;
610 if (ctype && *ctype != 0 && strcmp (ctype, "C") != 0)
611 try_by_name = lang;
612
613 // Now do the guessing.
614 if (lang && *lang && !localeMapper && try_by_name && *try_by_name) {
615 if (try_locale_list(iso8859_15locales, lang))
616 localeMapper = QTextCodec::codecForName("ISO 8859-15");
617 else if (try_locale_list(iso8859_2locales, lang))
618 localeMapper = QTextCodec::codecForName("ISO 8859-2");
619 else if (try_locale_list(iso8859_3locales, lang))
620 localeMapper = QTextCodec::codecForName("ISO 8859-3");
621 else if (try_locale_list(iso8859_4locales, lang))
622 localeMapper = QTextCodec::codecForName("ISO 8859-4");
623 else if (try_locale_list(iso8859_5locales, lang))
624 localeMapper = QTextCodec::codecForName("ISO 8859-5");
625 else if (try_locale_list(iso8859_6locales, lang))
626 localeMapper = QTextCodec::codecForName("ISO 8859-6");
627 else if (try_locale_list(iso8859_7locales, lang))
628 localeMapper = QTextCodec::codecForName("ISO 8859-7");
629 else if (try_locale_list(iso8859_8locales, lang))
630 localeMapper = QTextCodec::codecForName("ISO 8859-8-I");
631 else if (try_locale_list(iso8859_9locales, lang))
632 localeMapper = QTextCodec::codecForName("ISO 8859-9");
633 else if (try_locale_list(iso8859_13locales, lang))
634 localeMapper = QTextCodec::codecForName("ISO 8859-13");
635 else if (try_locale_list(tis_620locales, lang))
636 localeMapper = QTextCodec::codecForName("ISO 8859-11");
637 else if (try_locale_list(koi8_ulocales, lang))
638 localeMapper = QTextCodec::codecForName("KOI8-U");
639 else if (try_locale_list(cp_1251locales, lang))
640 localeMapper = QTextCodec::codecForName("CP 1251");
641 else if (try_locale_list(pt_154locales, lang))
642 localeMapper = QTextCodec::codecForName("PT 154");
643 else if (try_locale_list(probably_koi8_rlocales, lang))
644 localeMapper = ru_RU_hack(lang);
645 }
646
647 delete [] ctype;
648 delete [] lang;
649 }
650
651 // If everything failed, we default to 8859-1
652 // We could perhaps default to 8859-15.
653 if (!localeMapper)
654 localeMapper = QTextCodec::codecForName("ISO 8859-1");
655#endif
656}
657
658
659static void setup()
660{
661#ifndef QT_NO_THREAD
662 QMutexLocker locker(QMutexPool::globalInstanceGet(&all));
663#endif
664
665 if (all)
666 return;
667
668 if (destroying_is_ok)
669 qWarning("QTextCodec: Creating new codec during codec cleanup");
670 all = new QList<QTextCodec*>;
671 // create the cleanup object to cleanup all codecs on exit
672 (void) createQTextCodecCleanup();
673
674#ifndef QT_NO_CODECS
675# if defined(Q_WS_X11) && !defined(QT_BOOTSTRAPPED)
676 // no font codecs when bootstrapping
677 (void)new QFontLaoCodec;
678# if defined(QT_NO_ICONV)
679 // no iconv(3) support, must build all codecs into the library
680 (void)new QFontGb2312Codec;
681 (void)new QFontGbkCodec;
682 (void)new QFontGb18030_0Codec;
683 (void)new QFontJis0208Codec;
684 (void)new QFontJis0201Codec;
685 (void)new QFontKsc5601Codec;
686 (void)new QFontBig5hkscsCodec;
687 (void)new QFontBig5Codec;
688# endif // QT_NO_ICONV && !QT_BOOTSTRAPPED
689# endif // Q_WS_X11
690
691 (void)new QTsciiCodec;
692
693 for (int i = 0; i < 9; ++i)
694 (void)new QIsciiCodec(i);
695
696
697# if defined(QT_NO_ICONV) && !defined(QT_BOOTSTRAPPED)
698 // no asian codecs when bootstrapping, sorry
699 (void)new QGb18030Codec;
700 (void)new QGbkCodec;
701 (void)new QGb2312Codec;
702 (void)new QEucJpCodec;
703 (void)new QJisCodec;
704 (void)new QSjisCodec;
705 (void)new QEucKrCodec;
706 (void)new QBig5Codec;
707 (void)new QBig5hkscsCodec;
708# endif // QT_NO_ICONV && !QT_BOOTSTRAPPED
709#endif // QT_NO_CODECS
710
711#if defined(Q_OS_WIN32) || defined(Q_OS_WINCE)
712 (void) new QWindowsLocalCodec;
713#endif // Q_OS_WIN32
714
715 (void)new QUtf16Codec;
716 (void)new QUtf16BECodec;
717 (void)new QUtf16LECodec;
718 (void)new QUtf32Codec;
719 (void)new QUtf32BECodec;
720 (void)new QUtf32LECodec;
721 (void)new QLatin15Codec;
722 (void)new QLatin1Codec;
723 (void)new QUtf8Codec;
724
725 for (int i = 0; i < QSimpleTextCodec::numSimpleCodecs; ++i)
726 (void)new QSimpleTextCodec(i);
727
728#if defined(Q_OS_UNIX) && !defined(QT_NO_ICONV) && !defined(QT_BOOTSTRAPPED)
729 // QIconvCodec depends on the UTF-16 codec, so it needs to be created last
730 (void) new QIconvCodec();
731#endif
732
733 if (!localeMapper)
734 setupLocaleMapper();
735}
736
737QTextCodec::ConverterState::~ConverterState()
738{
739 if (flags & FreeFunction)
740 (QTextCodecUnalignedPointer::decode(state_data))(this);
741 else if (d)
742 qFree(d);
743}
744
745/*!
746 \class QTextCodec
747 \brief The QTextCodec class provides conversions between text encodings.
748 \reentrant
749 \ingroup i18n
750
751 Qt uses Unicode to store, draw and manipulate strings. In many
752 situations you may wish to deal with data that uses a different
753 encoding. For example, most Japanese documents are still stored
754 in Shift-JIS or ISO 2022-JP, while Russian users often have their
755 documents in KOI8-R or Windows-1251.
756
757 Qt provides a set of QTextCodec classes to help with converting
758 non-Unicode formats to and from Unicode. You can also create your
759 own codec classes.
760
761 The supported encodings are:
762
763 \list
764 \o Apple Roman
765 \o \l{Big5 Text Codec}{Big5}
766 \o \l{Big5-HKSCS Text Codec}{Big5-HKSCS}
767 \o CP949
768 \o \l{EUC-JP Text Codec}{EUC-JP}
769 \o \l{EUC-KR Text Codec}{EUC-KR}
770 \o \l{GBK Text Codec}{GB18030-0}
771 \o IBM 850
772 \o IBM 866
773 \o IBM 874
774 \o \l{ISO 2022-JP (JIS) Text Codec}{ISO 2022-JP}
775 \o ISO 8859-1 to 10
776 \o ISO 8859-13 to 16
777 \o Iscii-Bng, Dev, Gjr, Knd, Mlm, Ori, Pnj, Tlg, and Tml
778 \o JIS X 0201
779 \o JIS X 0208
780 \o KOI8-R
781 \o KOI8-U
782 \o MuleLao-1
783 \o ROMAN8
784 \o \l{Shift-JIS Text Codec}{Shift-JIS}
785 \o TIS-620
786 \o \l{TSCII Text Codec}{TSCII}
787 \o UTF-8
788 \o UTF-16
789 \o UTF-16BE
790 \o UTF-16LE
791 \o UTF-32
792 \o UTF-32BE
793 \o UTF-32LE
794 \o Windows-1250 to 1258
795 \o WINSAMI2
796 \endlist
797
798 QTextCodecs can be used as follows to convert some locally encoded
799 string to Unicode. Suppose you have some string encoded in Russian
800 KOI8-R encoding, and want to convert it to Unicode. The simple way
801 to do it is like this:
802
803 \snippet doc/src/snippets/code/src_corelib_codecs_qtextcodec.cpp 0
804
805 After this, \c string holds the text converted to Unicode.
806 Converting a string from Unicode to the local encoding is just as
807 easy:
808
809 \snippet doc/src/snippets/code/src_corelib_codecs_qtextcodec.cpp 1
810
811 To read or write files in various encodings, use QTextStream and
812 its \l{QTextStream::setCodec()}{setCodec()} function. See the
813 \l{tools/codecs}{Codecs} example for an application of QTextCodec
814 to file I/O.
815
816 Some care must be taken when trying to convert the data in chunks,
817 for example, when receiving it over a network. In such cases it is
818 possible that a multi-byte character will be split over two
819 chunks. At best this might result in the loss of a character and
820 at worst cause the entire conversion to fail.
821
822 The approach to use in these situations is to create a QTextDecoder
823 object for the codec and use this QTextDecoder for the whole
824 decoding process, as shown below:
825
826 \snippet doc/src/snippets/code/src_corelib_codecs_qtextcodec.cpp 2
827
828 The QTextDecoder object maintains state between chunks and therefore
829 works correctly even if a multi-byte character is split between
830 chunks.
831
832 \section1 Creating Your Own Codec Class
833
834 Support for new text encodings can be added to Qt by creating
835 QTextCodec subclasses.
836
837 The pure virtual functions describe the encoder to the system and
838 the coder is used as required in the different text file formats
839 supported by QTextStream, and under X11, for the locale-specific
840 character input and output.
841
842 To add support for another encoding to Qt, make a subclass of
843 QTextCodec and implement the functions listed in the table below.
844
845 \table
846 \header \o Function \o Description
847
848 \row \o name()
849 \o Returns the official name for the encoding. If the
850 encoding is listed in the
851 \l{IANA character-sets encoding file}, the name
852 should be the preferred MIME name for the encoding.
853
854 \row \o aliases()
855 \o Returns a list of alternative names for the encoding.
856 QTextCodec provides a default implementation that returns
857 an empty list. For example, "ISO-8859-1" has "latin1",
858 "CP819", "IBM819", and "iso-ir-100" as aliases.
859
860 \row \o mibEnum()
861 \o Return the MIB enum for the encoding if it is listed in
862 the \l{IANA character-sets encoding file}.
863
864 \row \o convertToUnicode()
865 \o Converts an 8-bit character string to Unicode.
866
867 \row \o convertFromUnicode()
868 \o Converts a Unicode string to an 8-bit character string.
869 \endtable
870
871 You may find it more convenient to make your codec class
872 available as a plugin; see \l{How to Create Qt Plugins} for
873 details.
874
875 \sa QTextStream, QTextDecoder, QTextEncoder, {Codecs Example}
876*/
877
878/*!
879 \enum QTextCodec::ConversionFlag
880
881 \value DefaultConversion No flag is set.
882 \value ConvertInvalidToNull If this flag is set, each invalid input
883 character is output as a null character.
884 \value IgnoreHeader Ignore any Unicode byte-order mark and don't generate any.
885
886 \omitvalue FreeFunction
887*/
888
889/*!
890 \fn QTextCodec::ConverterState::ConverterState(ConversionFlags flags)
891
892 Constructs a ConverterState object initialized with the given \a flags.
893*/
894
895/*!
896 \fn QTextCodec::ConverterState::~ConverterState()
897
898 Destroys the ConverterState object.
899*/
900
901/*!
902 \nonreentrant
903
904 Constructs a QTextCodec, and gives it the highest precedence. The
905 QTextCodec should always be constructed on the heap (i.e. with \c
906 new). Qt takes ownership and will delete it when the application
907 terminates.
908*/
909QTextCodec::QTextCodec()
910{
911 setup();
912 all->prepend(this);
913}
914
915
916/*!
917 \nonreentrant
918
919 Destroys the QTextCodec. Note that you should not delete codecs
920 yourself: once created they become Qt's responsibility.
921*/
922QTextCodec::~QTextCodec()
923{
924 if (!destroying_is_ok)
925 qWarning("QTextCodec::~QTextCodec: Called by application");
926 if (all)
927 all->removeAll(this);
928}
929
930/*!
931 \fn QTextCodec *QTextCodec::codecForName(const char *name)
932
933 Searches all installed QTextCodec objects and returns the one
934 which best matches \a name; the match is case-insensitive. Returns
935 0 if no codec matching the name \a name could be found.
936*/
937
938/*!
939 Searches all installed QTextCodec objects and returns the one
940 which best matches \a name; the match is case-insensitive. Returns
941 0 if no codec matching the name \a name could be found.
942*/
943QTextCodec *QTextCodec::codecForName(const QByteArray &name)
944{
945 if (name.isEmpty())
946 return 0;
947
948 setup();
949
950 for (int i = 0; i < all->size(); ++i) {
951 QTextCodec *cursor = all->at(i);
952 if (nameMatch(cursor->name(), name))
953 return cursor;
954 QList<QByteArray> aliases = cursor->aliases();
955 for (int i = 0; i < aliases.size(); ++i)
956 if (nameMatch(aliases.at(i), name))
957 return cursor;
958 }
959
960 return createForName(name);
961}
962
963
964/*!
965 Returns the QTextCodec which matches the \link
966 QTextCodec::mibEnum() MIBenum\endlink \a mib.
967*/
968QTextCodec* QTextCodec::codecForMib(int mib)
969{
970 setup();
971
972 // Qt 3 used 1000 (mib for UCS2) as its identifier for the utf16 codec. Map
973 // this correctly for compatibility.
974 if (mib == 1000)
975 mib = 1015;
976
977 QList<QTextCodec*>::ConstIterator i;
978 for (int i = 0; i < all->size(); ++i) {
979 QTextCodec *cursor = all->at(i);
980 if (cursor->mibEnum() == mib)
981 return cursor;
982 }
983
984 return createForMib(mib);
985}
986
987/*!
988 Returns the list of all available codecs, by name. Call
989 QTextCodec::codecForName() to obtain the QTextCodec for the name.
990
991 The list may contain many mentions of the same codec
992 if the codec has aliases.
993
994 \sa availableMibs(), name(), aliases()
995*/
996QList<QByteArray> QTextCodec::availableCodecs()
997{
998 setup();
999
1000 QList<QByteArray> codecs;
1001 for (int i = 0; i < all->size(); ++i) {
1002 codecs += all->at(i)->name();
1003 codecs += all->at(i)->aliases();
1004 }
1005#ifndef QT_NO_TEXTCODECPLUGIN
1006 QFactoryLoader *l = loader();
1007 QStringList keys = l->keys();
1008 for (int i = 0; i < keys.size(); ++i) {
1009 if (!keys.at(i).startsWith(QLatin1String("MIB: "))) {
1010 QByteArray name = keys.at(i).toLatin1();
1011 if (!codecs.contains(name))
1012 codecs += name;
1013 }
1014 }
1015#endif
1016
1017 return codecs;
1018}
1019
1020/*!
1021 Returns the list of MIBs for all available codecs. Call
1022 QTextCodec::codecForMib() to obtain the QTextCodec for the MIB.
1023
1024 \sa availableCodecs(), mibEnum()
1025*/
1026QList<int> QTextCodec::availableMibs()
1027{
1028 setup();
1029
1030 QList<int> codecs;
1031 for (int i = 0; i < all->size(); ++i)
1032 codecs += all->at(i)->mibEnum();
1033#ifndef QT_NO_TEXTCODECPLUGIN
1034 QFactoryLoader *l = loader();
1035 QStringList keys = l->keys();
1036 for (int i = 0; i < keys.size(); ++i) {
1037 if (keys.at(i).startsWith(QLatin1String("MIB: "))) {
1038 int mib = keys.at(i).mid(5).toInt();
1039 if (!codecs.contains(mib))
1040 codecs += mib;
1041 }
1042 }
1043#endif
1044
1045 return codecs;
1046}
1047
1048/*!
1049 Set the codec to \a c; this will be returned by
1050 codecForLocale(). If \a c is a null pointer, the codec is reset to
1051 the default.
1052
1053 This might be needed for some applications that want to use their
1054 own mechanism for setting the locale.
1055
1056 Setting this codec is not supported on DOS based Windows.
1057
1058 \sa codecForLocale()
1059*/
1060void QTextCodec::setCodecForLocale(QTextCodec *c)
1061{
1062#ifdef Q_WS_WIN
1063 if (QSysInfo::WindowsVersion& QSysInfo::WV_DOS_based)
1064 return;
1065#endif
1066 localeMapper = c;
1067 if (!localeMapper)
1068 setupLocaleMapper();
1069}
1070
1071/*!
1072 Returns a pointer to the codec most suitable for this locale.
1073
1074 On Windows, the codec will be based on a system locale. On Unix
1075 systems, starting with Qt 4.2, the codec will be using the \e
1076 iconv library. Note that in both cases the codec's name will be
1077 "System".
1078*/
1079
1080QTextCodec* QTextCodec::codecForLocale()
1081{
1082 if (localeMapper)
1083 return localeMapper;
1084
1085 setup();
1086
1087 return localeMapper;
1088}
1089
1090
1091/*!
1092 \fn QByteArray QTextCodec::name() const
1093
1094 QTextCodec subclasses must reimplement this function. It returns
1095 the name of the encoding supported by the subclass.
1096
1097 If the codec is registered as a character set in the
1098 \l{IANA character-sets encoding file} this method should
1099 return the preferred mime name for the codec if defined,
1100 otherwise its name.
1101*/
1102
1103/*!
1104 \fn int QTextCodec::mibEnum() const
1105
1106 Subclasses of QTextCodec must reimplement this function. It
1107 returns the MIBenum (see \l{IANA character-sets encoding file}
1108 for more information). It is important that each QTextCodec
1109 subclass returns the correct unique value for this function.
1110*/
1111
1112/*!
1113 Subclasses can return a number of aliases for the codec in question.
1114
1115 Standard aliases for codecs can be found in the
1116 \l{IANA character-sets encoding file}.
1117*/
1118QList<QByteArray> QTextCodec::aliases() const
1119{
1120 return QList<QByteArray>();
1121}
1122
1123/*!
1124 \fn QString QTextCodec::convertToUnicode(const char *chars, int len,
1125 ConverterState *state) const
1126
1127 QTextCodec subclasses must reimplement this function.
1128
1129 Converts the first \a len characters of \a chars from the
1130 encoding of the subclass to Unicode, and returns the result in a
1131 QString.
1132
1133 \a state can be 0, in which case the conversion is stateless and
1134 default conversion rules should be used. If state is not 0, the
1135 codec should save the state after the conversion in \a state, and
1136 adjust the remainingChars and invalidChars members of the struct.
1137*/
1138
1139/*!
1140 \fn QByteArray QTextCodec::convertFromUnicode(const QChar *input, int number,
1141 ConverterState *state) const
1142
1143 QTextCodec subclasses must reimplement this function.
1144
1145 Converts the first \a number of characters from the \a input array
1146 from Unicode to the encoding of the subclass, and returns the result
1147 in a QByteArray.
1148
1149 \a state can be 0 in which case the conversion is stateless and
1150 default conversion rules should be used. If state is not 0, the
1151 codec should save the state after the conversion in \a state, and
1152 adjust the remainingChars and invalidChars members of the struct.
1153*/
1154
1155/*!
1156 Creates a QTextDecoder which stores enough state to decode chunks
1157 of \c{char *} data to create chunks of Unicode data.
1158
1159 The caller is responsible for deleting the returned object.
1160*/
1161QTextDecoder* QTextCodec::makeDecoder() const
1162{
1163 return new QTextDecoder(this);
1164}
1165
1166
1167/*!
1168 Creates a QTextEncoder which stores enough state to encode chunks
1169 of Unicode data as \c{char *} data.
1170
1171 The caller is responsible for deleting the returned object.
1172*/
1173QTextEncoder* QTextCodec::makeEncoder() const
1174{
1175 return new QTextEncoder(this);
1176}
1177
1178/*!
1179 \fn QByteArray QTextCodec::fromUnicode(const QChar *input, int number,
1180 ConverterState *state) const
1181
1182 Converts the first \a number of characters from the \a input array
1183 from Unicode to the encoding of this codec, and returns the result
1184 in a QByteArray.
1185
    The \a state of the converter used is updated.
1187*/
1188
1189/*!
1190 Converts \a str from Unicode to the encoding of this codec, and
1191 returns the result in a QByteArray.
1192*/
1193QByteArray QTextCodec::fromUnicode(const QString& str) const
1194{
1195 return convertFromUnicode(str.constData(), str.length(), 0);
1196}
1197
1198/*!
1199 \fn QString QTextCodec::toUnicode(const char *input, int size,
1200 ConverterState *state) const
1201
1202 Converts the first \a size characters from the \a input from the
1203 encoding of this codec to Unicode, and returns the result in a
1204 QString.
1205
    The \a state of the converter used is updated.
1207*/
1208
1209/*!
1210 Converts \a a from the encoding of this codec to Unicode, and
1211 returns the result in a QString.
1212*/
1213QString QTextCodec::toUnicode(const QByteArray& a) const
1214{
1215 return convertToUnicode(a.constData(), a.length(), 0);
1216}
1217
1218/*!
1219 Returns true if the Unicode character \a ch can be fully encoded
1220 with this codec; otherwise returns false.
1221*/
1222bool QTextCodec::canEncode(QChar ch) const
1223{
1224 ConverterState state;
1225 state.flags = ConvertInvalidToNull;
1226 convertFromUnicode(&ch, 1, &state);
1227 return (state.invalidChars == 0);
1228}
1229
1230/*!
1231 \overload
1232
1233 \a s contains the string being tested for encode-ability.
1234*/
1235bool QTextCodec::canEncode(const QString& s) const
1236{
1237 ConverterState state;
1238 state.flags = ConvertInvalidToNull;
1239 convertFromUnicode(s.constData(), s.length(), &state);
1240 return (state.invalidChars == 0);
1241}
1242
1243#ifdef QT3_SUPPORT
1244/*!
1245 Returns a string representing the current language and
1246 sublanguage, e.g. "pt" for Portuguese, or "pt_br" for Portuguese/Brazil.
1247
1248 \sa QLocale
1249*/
1250const char *QTextCodec::locale()
1251{
1252 static char locale[6];
1253 QByteArray l = QLocale::system().name().toLatin1();
1254 int len = qMin(l.length(), 5);
1255 memcpy(locale, l.constData(), len);
1256 locale[len] = '\0';
1257
1258 return locale;
1259}
1260
1261/*!
1262 \overload
1263*/
1264
1265QByteArray QTextCodec::fromUnicode(const QString& uc, int& lenInOut) const
1266{
1267 QByteArray result = convertFromUnicode(uc.constData(), lenInOut, 0);
1268 lenInOut = result.length();
1269 return result;
1270}
1271
1272/*!
1273 \overload
1274
1275 \a a contains the source characters; \a len contains the number of
1276 characters in \a a to use.
1277*/
1278QString QTextCodec::toUnicode(const QByteArray& a, int len) const
1279{
1280 len = qMin(a.size(), len);
1281 return convertToUnicode(a.constData(), len, 0);
1282}
1283#endif
1284
1285/*!
1286 \overload
1287
1288 \a chars contains the source characters.
1289*/
1290QString QTextCodec::toUnicode(const char *chars) const
1291{
1292 int len = qstrlen(chars);
1293 return convertToUnicode(chars, len, 0);
1294}
1295
1296
1297/*!
1298 \class QTextEncoder
1299 \brief The QTextEncoder class provides a state-based encoder.
1300 \reentrant
1301 \ingroup i18n
1302
1303 A text encoder converts text from Unicode into an encoded text format
1304 using a specific codec.
1305
1306 The encoder converts Unicode into another format, remembering any
1307 state that is required between calls.
1308
1309 \sa QTextCodec::makeEncoder(), QTextDecoder
1310*/
1311
1312/*!
1313 \fn QTextEncoder::QTextEncoder(const QTextCodec *codec)
1314
1315 Constructs a text encoder for the given \a codec.
1316*/
1317
1318/*!
1319 Destroys the encoder.
1320*/
1321QTextEncoder::~QTextEncoder()
1322{
1323}
1324
1325/*! \internal
1326 \since 4.5
    Determines whether the encoder encountered a failure while encoding the input. If
    an error was encountered, the produced result is undefined, and gets converted according
    to the conversion flags.
1330 */
1331bool QTextEncoder::hasFailure() const
1332{
1333 return state.invalidChars != 0;
1334}
1335
1336/*!
1337 Converts the Unicode string \a str into an encoded QByteArray.
1338*/
1339QByteArray QTextEncoder::fromUnicode(const QString& str)
1340{
1341 QByteArray result = c->fromUnicode(str.constData(), str.length(), &state);
1342 return result;
1343}
1344
1345/*!
1346 \overload
1347
1348 Converts \a len characters (not bytes) from \a uc, and returns the
1349 result in a QByteArray.
1350*/
1351QByteArray QTextEncoder::fromUnicode(const QChar *uc, int len)
1352{
1353 QByteArray result = c->fromUnicode(uc, len, &state);
1354 return result;
1355}
1356
1357#ifdef QT3_SUPPORT
1358/*!
1359 \overload
1360
1361 Converts \a lenInOut characters (not bytes) from \a uc, and returns the
1362 result in a QByteArray. The number of characters read is returned in
1363 the \a lenInOut parameter.
1364*/
1365QByteArray QTextEncoder::fromUnicode(const QString& uc, int& lenInOut)
1366{
1367 QByteArray result = c->fromUnicode(uc.constData(), lenInOut, &state);
1368 lenInOut = result.length();
1369 return result;
1370}
1371#endif
1372
1373/*!
1374 \class QTextDecoder
1375 \brief The QTextDecoder class provides a state-based decoder.
1376 \reentrant
1377 \ingroup i18n
1378
1379 A text decoder converts text from an encoded text format into Unicode
1380 using a specific codec.
1381
1382 The decoder converts text in this format into Unicode, remembering any
1383 state that is required between calls.
1384
1385 \sa QTextCodec::makeDecoder(), QTextEncoder
1386*/
1387
1388/*!
1389 \fn QTextDecoder::QTextDecoder(const QTextCodec *codec)
1390
1391 Constructs a text decoder for the given \a codec.
1392*/
1393
1394/*!
1395 Destroys the decoder.
1396*/
1397QTextDecoder::~QTextDecoder()
1398{
1399}
1400
1401/*!
1402 \fn QString QTextDecoder::toUnicode(const char *chars, int len)
1403
1404 Converts the first \a len bytes in \a chars to Unicode, returning
1405 the result.
1406
1407 If not all characters are used (e.g. if only part of a multi-byte
1408 encoding is at the end of the characters), the decoder remembers
1409 enough state to continue with the next call to this function.
1410*/
1411QString QTextDecoder::toUnicode(const char *chars, int len)
1412{
1413 return c->toUnicode(chars, len, &state);
1414}
1415
1416
1417/*! \overload
1418
1419 The converted string is returned in \a target.
1420 */
1421void QTextDecoder::toUnicode(QString *target, const char *chars, int len)
1422{
1423 Q_ASSERT(target);
1424 switch (c->mibEnum()) {
1425 case 106: // utf8
1426 static_cast<const QUtf8Codec*>(c)->convertToUnicode(target, chars, len, &state);
1427 break;
1428 case 4: { // latin1
1429 target->resize(len);
1430 ushort *data = (ushort*)target->data();
1431 for (int i = len; i >=0; --i)
1432 data[i] = (uchar) chars[i];
1433 } break;
1434 default:
1435 *target = c->toUnicode(chars, len, &state);
1436 }
1437}
1438
1439
1440/*!
1441 \overload
1442
1443 Converts the bytes in the byte array specified by \a ba to Unicode
1444 and returns the result.
1445*/
1446QString QTextDecoder::toUnicode(const QByteArray &ba)
1447{
1448 return c->toUnicode(ba.constData(), ba.length(), &state);
1449}
1450
1451
1452/*!
1453 \fn QTextCodec* QTextCodec::codecForTr()
1454
1455 Returns the codec used by QObject::tr() on its argument. If this
1456 function returns 0 (the default), tr() assumes Latin-1.
1457
1458 \sa setCodecForTr()
1459*/
1460
1461/*!
1462 \fn void QTextCodec::setCodecForTr(QTextCodec *c)
1463 \nonreentrant
1464
1465 Sets the codec used by QObject::tr() on its argument to \a c. If
1466 \a c is 0 (the default), tr() assumes Latin-1.
1467
1468 If the literal quoted text in the program is not in the Latin-1
1469 encoding, this function can be used to set the appropriate
1470 encoding. For example, software developed by Korean programmers
1471 might use eucKR for all the text in the program, in which case the
1472 main() function might look like this:
1473
1474 \snippet doc/src/snippets/code/src_corelib_codecs_qtextcodec.cpp 3
1475
1476 Note that this is not the way to select the encoding that the \e
1477 user has chosen. For example, to convert an application containing
1478 literal English strings to Korean, all that is needed is for the
1479 English strings to be passed through tr() and for translation
1480 files to be loaded. For details of internationalization, see
1481 \l{Internationalization with Qt}.
1482
1483 \sa codecForTr(), setCodecForCStrings()
1484*/
1485
1486
1487/*!
1488 \fn QTextCodec* QTextCodec::codecForCStrings()
1489
1490 Returns the codec used by QString to convert to and from \c{const
1491 char *} and QByteArrays. If this function returns 0 (the default),
1492 QString assumes Latin-1.
1493
1494 \sa setCodecForCStrings()
1495*/
1496
1497/*!
1498 \fn void QTextCodec::setCodecForCStrings(QTextCodec *codec)
1499 \nonreentrant
1500
1501 Sets the codec used by QString to convert to and from \c{const
1502 char *} and QByteArrays. If the \a codec is 0 (the default),
1503 QString assumes Latin-1.
1504
    \warning Some codecs do not preserve the characters in the ASCII
    range (0x00 to 0x7F). For example, the Japanese Shift-JIS
    encoding maps the backslash character (0x5C) to the Yen
    character. To avoid undesirable side-effects, we recommend
    avoiding such codecs with setCodecForCStrings().
1510
1511 \sa codecForCStrings(), setCodecForTr()
1512*/
1513
1514/*!
1515 \since 4.4
1516
1517 Tries to detect the encoding of the provided snippet of HTML in the given byte array, \a ba,
1518 and returns a QTextCodec instance that is capable of decoding the html to unicode.
1519 If the codec cannot be detected from the content provided, \a defaultCodec is returned.
1520*/
1521QTextCodec *QTextCodec::codecForHtml(const QByteArray &ba, QTextCodec *defaultCodec)
1522{
1523 // determine charset
1524 int pos;
1525 QTextCodec *c = 0;
1526
1527 if (ba.size() > 1 && (((uchar)ba[0] == 0xfe && (uchar)ba[1] == 0xff)
1528 || ((uchar)ba[0] == 0xff && (uchar)ba[1] == 0xfe))) {
1529 c = QTextCodec::codecForMib(1015); // utf16
1530 } else if (ba.size() > 2
1531 && (uchar)ba[0] == 0xef
1532 && (uchar)ba[1] == 0xbb
1533 && (uchar)ba[2] == 0xbf) {
1534 c = QTextCodec::codecForMib(106); // utf-8
1535 } else {
1536 QByteArray header = ba.left(512).toLower();
1537 if ((pos = header.indexOf("http-equiv=")) != -1) {
1538 pos = header.indexOf("charset=", pos) + int(strlen("charset="));
1539 if (pos != -1) {
1540 int pos2 = header.indexOf('\"', pos+1);
1541 QByteArray cs = header.mid(pos, pos2-pos);
1542 // qDebug("found charset: %s", cs.data());
1543 c = QTextCodec::codecForName(cs);
1544 }
1545 }
1546 }
1547 if (!c)
1548 c = defaultCodec;
1549
1550 return c;
1551}
1552
1553/*!
1554 \overload
1555
1556 If the codec cannot be detected, this overload returns a Latin-1 QTextCodec.
1557*/
1558QTextCodec *QTextCodec::codecForHtml(const QByteArray &ba)
1559{
1560 return codecForHtml(ba, QTextCodec::codecForMib(/*Latin 1*/ 4));
1561}
1562
1563
1564/*! \internal
1565 \since 4.3
    Determines whether the decoder encountered a failure while decoding the input. If
    an error was encountered, the produced result is undefined, and gets converted according
    to the conversion flags.
1569 */
1570bool QTextDecoder::hasFailure() const
1571{
1572 return state.invalidChars != 0;
1573}
1574
1575/*!
1576 \fn QTextCodec *QTextCodec::codecForContent(const char *str, int size)
1577
1578 This functionality is no longer provided by Qt. This
1579 compatibility function always returns a null pointer.
1580*/
1581
1582/*!
1583 \fn QTextCodec *QTextCodec::codecForName(const char *hint, int accuracy)
1584
1585 Use the codecForName(const QByteArray &) overload instead.
1586*/
1587
1588/*!
1589 \fn QTextCodec *QTextCodec::codecForIndex(int i)
1590
1591 Use availableCodecs() or availableMibs() instead and iterate
1592 through the resulting list.
1593*/
1594
1595
1596/*!
1597 \fn QByteArray QTextCodec::mimeName() const
1598
1599 Use name() instead.
1600*/
1601
1602QT_END_NAMESPACE
1603
1604#endif // QT_NO_TEXTCODEC
Note: See TracBrowser for help on using the repository browser.