Context Navigation

source: trunk/src/corelib/codecs/qtextcodec.cpp@ 751

Last change on this file since 751 was 651, checked in by Dmitry A. Kuminov, 15 years ago
trunk: Merged in qt 4.6.2 sources.
File size: 55.8 KB

Line
1	/****************************************************************************
2	**
3	** Copyright (C) 2010 Nokia Corporation and/or its subsidiary(-ies).
4	** All rights reserved.
5	** Contact: Nokia Corporation ([email protected])
6	**
7	** This file is part of the QtCore module of the Qt Toolkit.
8	**
9	** $QT_BEGIN_LICENSE:LGPL$
10	** Commercial Usage
11	** Licensees holding valid Qt Commercial licenses may use this file in
12	** accordance with the Qt Commercial License Agreement provided with the
13	** Software or, alternatively, in accordance with the terms contained in
14	** a written agreement between you and Nokia.
15	**
16	** GNU Lesser General Public License Usage
17	** Alternatively, this file may be used under the terms of the GNU Lesser
18	** General Public License version 2.1 as published by the Free Software
19	** Foundation and appearing in the file LICENSE.LGPL included in the
20	** packaging of this file. Please review the following information to
21	** ensure the GNU Lesser General Public License version 2.1 requirements
22	** will be met: http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html.
23	**
24	** In addition, as a special exception, Nokia gives you certain additional
25	** rights. These rights are described in the Nokia Qt LGPL Exception
26	** version 1.1, included in the file LGPL_EXCEPTION.txt in this package.
27	**
28	** GNU General Public License Usage
29	** Alternatively, this file may be used under the terms of the GNU
30	** General Public License version 3.0 as published by the Free Software
31	** Foundation and appearing in the file LICENSE.GPL included in the
32	** packaging of this file. Please review the following information to
33	** ensure the GNU General Public License version 3.0 requirements will be
34	** met: http://www.gnu.org/copyleft/gpl.html.
35	**
36	** If you have questions regarding the use of this file, please contact
37	** Nokia at [email protected].
38	** $QT_END_LICENSE$
39	**
40	****************************************************************************/
41
42	#include "qplatformdefs.h"
43	#include "qtextcodec.h"
44	#include "qtextcodec_p.h"
45
46	#ifndef QT_NO_TEXTCODEC
47
48	#include "qlist.h"
49	#include "qfile.h"
50	#ifndef QT_NO_LIBRARY
51	# include "qcoreapplication.h"
52	# include "qtextcodecplugin.h"
53	# include "private/qfactoryloader_p.h"
54	#endif
55	#include "qstringlist.h"
56
57	#ifdef Q_OS_UNIX
58	# include "qiconvcodec_p.h"
59	#endif
60
61	#if defined(Q_OS_OS2)
62	# include <unidef.h>
63	# include <uconv.h>
64	# include "qvector.h"
65	#endif
66
67	#include "qutfcodec_p.h"
68	#include "qsimplecodec_p.h"
69	#include "qlatincodec_p.h"
70	#ifndef QT_NO_CODECS
71	# include "qtsciicodec_p.h"
72	# include "qisciicodec_p.h"
73	# if defined(QT_NO_ICONV) && !defined(QT_BOOTSTRAPPED)
74	// no iconv(3) support, must build all codecs into the library
75	# include "../../plugins/codecs/cn/qgb18030codec.h"
76	# include "../../plugins/codecs/jp/qeucjpcodec.h"
77	# include "../../plugins/codecs/jp/qjiscodec.h"
78	# include "../../plugins/codecs/jp/qsjiscodec.h"
79	# include "../../plugins/codecs/kr/qeuckrcodec.h"
80	# include "../../plugins/codecs/tw/qbig5codec.h"
81	# endif // QT_NO_ICONV
82	# if defined(Q_WS_X11) && !defined(QT_BOOTSTRAPPED)
83	# include "qfontlaocodec_p.h"
84	# include "../../plugins/codecs/jp/qfontjpcodec.h"
85	# endif
86	#endif // QT_NO_CODECS
87	#include "qlocale.h"
88	#include "private/qmutexpool_p.h"
89
90	#include <stdlib.h>
91	#include <ctype.h>
92	#include <locale.h>
93	#if defined (_XOPEN_UNIX) && !defined(Q_OS_QNX) && !defined(Q_OS_OSF)
94	#include <langinfo.h>
95	#endif
96
97	#if defined(Q_OS_WINCE)
98	# define QT_NO_SETLOCALE
99	#endif
100
101	// enabling this is not exception safe!
102	// #define Q_DEBUG_TEXTCODEC
103
104	QT_BEGIN_NAMESPACE
105
106	#ifndef QT_NO_TEXTCODECPLUGIN
107	Q_GLOBAL_STATIC_WITH_ARGS(QFactoryLoader, loader,
108	(QTextCodecFactoryInterface_iid, QLatin1String("/codecs")))
109	#endif
110
/*
    Locale-independent ASCII lower-casing.

    Maps 'A'..'Z' to 'a'..'z' and returns every other character unchanged.
    (The obsolete `register` specifier was dropped; it is ill-formed in C++17.)
*/
static char qtolower(char c)
{
    if (c >= 'A' && c <= 'Z')
        return c + 0x20;
    return c;
}
/*
    Locale-independent test for ASCII letters and digits.

    Returns true for '0'..'9', 'a'..'z' and 'A'..'Z'. OR-ing with 0x20 folds
    upper-case letters onto lower-case ones before the range check.
*/
static bool qisalnum(char c)
{
    return (c >= '0' && c <= '9') || ((c | 0x20) >= 'a' && (c | 0x20) <= 'z');
}
115
116	static bool nameMatch(const QByteArray &name, const QByteArray &test)
117	{
118	// if they're the same, return a perfect score
119	if (qstricmp(name, test) == 0)
120	return true;
121
122	const char *n = name.constData();
123	const char *h = test.constData();
124
125	// if the letters and numbers are the same, we have a match
126	while (*n != '\0') {
127	if (qisalnum(*n)) {
128	for (;;) {
129	if (*h == '\0')
130	return false;
131	if (qisalnum(*h))
132	break;
133	++h;
134	}
135	if (qtolower(n) != qtolower(h))
136	return false;
137	++h;
138	}
139	++n;
140	}
141	while (h && !qisalnum(h))
142	++h;
143	return (*h == '\0');
144	}
145
146
147	static QTextCodec *createForName(const QByteArray &name)
148	{
149	#ifndef QT_NO_TEXTCODECPLUGIN
150	QFactoryLoader *l = loader();
151	QStringList keys = l->keys();
152	for (int i = 0; i < keys.size(); ++i) {
153	if (nameMatch(name, keys.at(i).toLatin1())) {
154	QString realName = keys.at(i);
155	if (QTextCodecFactoryInterface *factory
156	= qobject_cast<QTextCodecFactoryInterface*>(l->instance(realName))) {
157	return factory->create(realName);
158	}
159	}
160	}
161	#else
162	Q_UNUSED(name);
163	#endif
164	return 0;
165	}
166
167	static QTextCodec *createForMib(int mib)
168	{
169	#ifndef QT_NO_TEXTCODECPLUGIN
170	QString name = QLatin1String("MIB: ") + QString::number(mib);
171	if (QTextCodecFactoryInterface *factory
172	= qobject_cast<QTextCodecFactoryInterface*>(loader()->instance(name)))
173	return factory->create(name);
174	#else
175	Q_UNUSED(mib);
176	#endif
177	return 0;
178	}
179
180	static QList<QTextCodec> all = 0;
181	#ifdef Q_DEBUG_TEXTCODEC
182	static bool destroying_is_ok = false;
183	#endif
184
185	static QTextCodec *localeMapper = 0;
186	QTextCodec *QTextCodec::cftr = 0;
187
188
// Helper type that exists only for its destructor, which deletes the
// codecs stored in `all`.
class QTextCodecCleanup
{
public:
    ~QTextCodecCleanup();
};
194
195	/*
196	Deletes all the created codecs. This destructor is called just
197	before exiting to delete any QTextCodec objects that may be lying
198	around.
199	*/
200	QTextCodecCleanup::~QTextCodecCleanup()
201	{
202	if (!all)
203	return;
204
205	#ifdef Q_DEBUG_TEXTCODEC
206	destroying_is_ok = true;
207	#endif
208
209	for (QList<QTextCodec *>::const_iterator it = all->constBegin()
210	; it != all->constEnd(); ++it) {
211	delete *it;
212	}
213	delete all;
214	all = 0;
215	localeMapper = 0;
216
217	#ifdef Q_DEBUG_TEXTCODEC
218	destroying_is_ok = false;
219	#endif
220	}
221
222	Q_GLOBAL_STATIC(QTextCodecCleanup, createQTextCodecCleanup)
223
224	#if defined(Q_OS_WIN32) \|\| defined(Q_OS_WINCE)
225	class QWindowsLocalCodec: public QTextCodec
226	{
227	public:
228	QWindowsLocalCodec();
229	~QWindowsLocalCodec();
230
231	QString convertToUnicode(const char , int, ConverterState ) const;
232	QByteArray convertFromUnicode(const QChar , int, ConverterState ) const;
233	QString convertToUnicodeCharByChar(const char chars, int length, ConverterState state) const;
234
235	QByteArray name() const;
236	int mibEnum() const;
237
238	};
239
240	QWindowsLocalCodec::QWindowsLocalCodec()
241	{
242	}
243
244	QWindowsLocalCodec::~QWindowsLocalCodec()
245	{
246	}
247
248	QString QWindowsLocalCodec::convertToUnicode(const char chars, int length, ConverterState state) const
249	{
250	const char *mb = chars;
251	int mblen = length;
252
253	if (!mb \|\| !mblen)
254	return QString();
255
256	const int wclen_auto = 4096;
257	wchar_t wc_auto[wclen_auto];
258	int wclen = wclen_auto;
259	wchar_t *wc = wc_auto;
260	int len;
261	QString sp;
262	bool prepend = false;
263	char state_data = 0;
264	int remainingChars = 0;
265
266	//save the current state information
267	if (state) {
268	state_data = (char)state->state_data[0];
269	remainingChars = state->remainingChars;
270	}
271
272	//convert the pending charcter (if available)
273	if (state && remainingChars) {
274	char prev[3] = {0};
275	prev[0] = state_data;
276	prev[1] = mb[0];
277	remainingChars = 0;
278	len = MultiByteToWideChar(CP_ACP, MB_PRECOMPOSED,
279	prev, 2, wc, wclen);
280	if (len) {
281	prepend = true;
282	sp.append(QChar(wc[0]));
283	mb++;
284	mblen--;
285	wc[0] = 0;
286	}
287	}
288
289	while (!(len=MultiByteToWideChar(CP_ACP, MB_PRECOMPOSED\|MB_ERR_INVALID_CHARS,
290	mb, mblen, wc, wclen))) {
291	int r = GetLastError();
292	if (r == ERROR_INSUFFICIENT_BUFFER) {
293	if (wc != wc_auto) {
294	qWarning("MultiByteToWideChar: Size changed");
295	break;
296	} else {
297	wclen = MultiByteToWideChar(CP_ACP, MB_PRECOMPOSED,
298	mb, mblen, 0, 0);
299	wc = new wchar_t[wclen];
300	// and try again...
301	}
302	} else if (r == ERROR_NO_UNICODE_TRANSLATION) {
303	//find the last non NULL character
304	while (mblen > 1 && !(mb[mblen-1]))
305	mblen--;
306	//check whether, we hit an invalid character in the middle
307	if ((mblen <= 1) \|\| (remainingChars && state_data))
308	return convertToUnicodeCharByChar(chars, length, state);
309	//Remove the last character and try again...
310	state_data = mb[mblen-1];
311	remainingChars = 1;
312	mblen--;
313	} else {
314	// Fail.
315	qWarning("MultiByteToWideChar: Cannot convert multibyte text");
316	break;
317	}
318	}
319	if (len <= 0)
320	return QString();
321	if (wc[len-1] == 0) // len - 1: we don't want terminator
322	--len;
323
324	//save the new state information
325	if (state) {
326	state->state_data[0] = (char)state_data;
327	state->remainingChars = remainingChars;
328	}
329	QString s((QChar*)wc, len);
330	if (wc != wc_auto)
331	delete [] wc;
332	if (prepend) {
333	return sp+s;
334	}
335	return s;
336	}
337
338	QString QWindowsLocalCodec::convertToUnicodeCharByChar(const char chars, int length, ConverterState state) const
339	{
340	if (!chars \|\| !length)
341	return QString();
342
343	int copyLocation = 0;
344	int extra = 2;
345	if (state && state->remainingChars) {
346	copyLocation = state->remainingChars;
347	extra += copyLocation;
348	}
349	int newLength = length + extra;
350	char *mbcs = new char[newLength];
351	//ensure that we have a NULL terminated string
352	mbcs[newLength-1] = 0;
353	mbcs[newLength-2] = 0;
354	memcpy(&(mbcs[copyLocation]), chars, length);
355	if (copyLocation) {
356	//copy the last character from the state
357	mbcs[0] = (char)state->state_data[0];
358	state->remainingChars = 0;
359	}
360	const char *mb = mbcs;
361	#ifndef Q_OS_WINCE
362	const char *next = 0;
363	QString s;
364	while((next = CharNextExA(CP_ACP, mb, 0)) != mb) {
365	wchar_t wc[2] ={0};
366	int charlength = next - mb;
367	int len = MultiByteToWideChar(CP_ACP, MB_PRECOMPOSED\|MB_ERR_INVALID_CHARS, mb, charlength, wc, 2);
368	if (len>0) {
369	s.append(QChar(wc[0]));
370	} else {
371	int r = GetLastError();
372	//check if the character being dropped is the last character
373	if (r == ERROR_NO_UNICODE_TRANSLATION && mb == (mbcs+newLength -3) && state) {
374	state->remainingChars = 1;
375	state->state_data[0] = (char)*mb;
376	}
377	}
378	mb = next;
379	}
380	#else
381	QString s;
382	int size = mbstowcs(NULL, mb, length);
383	if (size < 0) {
384	Q_ASSERT("Error in CE TextCodec");
385	return QString();
386	}
387	wchar_t* ws = new wchar_t[size + 2];
388	ws[size +1] = 0;
389	ws[size] = 0;
390	size = mbstowcs(ws, mb, length);
391	for (int i=0; i< size; i++)
392	s.append(QChar(ws[i]));
393	delete [] ws;
394	#endif
395	delete mbcs;
396	return s;
397	}
398
399	QByteArray QWindowsLocalCodec::convertFromUnicode(const QChar uc, int len, ConverterState ) const
400	{
401	return qt_winQString2MB(uc, len);
402	}
403
404
405	QByteArray QWindowsLocalCodec::name() const
406	{
407	return "System";
408	}
409
410	int QWindowsLocalCodec::mibEnum() const
411	{
412	return 0;
413	}
414
415	#elif defined(Q_OS_OS2)
416
417	class QOs2LocalCodec: public QTextCodec
418	{
419	public:
420	QOs2LocalCodec();
421	~QOs2LocalCodec();
422
423	QString convertToUnicode(const char , int, ConverterState ) const;
424	QByteArray convertFromUnicode(const QChar , int, ConverterState ) const;
425
426	QByteArray name() const;
427	int mibEnum() const;
428
429	private:
430	UconvObject uoSubYes;
431	UconvObject uoSubNo;
432	};
433
434	QOs2LocalCodec::QOs2LocalCodec() : uoSubYes(0), uoSubNo(0)
435	{
436	// create the conversion object for the process code page that performs
437	// substitution of invalid characters with '?'
438	UniCreateUconvObject((UniChar *)L"@sub=yes,subchar=\\x3F,subuni=\\x003F",
439	&uoSubYes);
440	Q_ASSERT(uoSubYes);
441
442	// same as above but doesn't perform substitution
443	UniCreateUconvObject((UniChar *)L"@sub=no", &uoSubNo);
444	Q_ASSERT(uoSubNo);
445	}
446
447	QOs2LocalCodec::~QOs2LocalCodec()
448	{
449	UniFreeUconvObject(uoSubNo);
450	UniFreeUconvObject(uoSubYes);
451	}
452
453	static void qOs2LocalCodecStateFree(QTextCodec::ConverterState *state)
454	{
455	delete reinterpret_cast<char *>(state->d);
456	}
457
458	QString QOs2LocalCodec::convertToUnicode(const char *chars, int length,
459	ConverterState *state) const
460	{
461	QString res;
462
463	if (!chars)
464	return res;
465	if (!length)
466	return QLatin1String("");
467
468	UconvObject uo = uoSubYes;
469	if (state && (state->flags & ConvertInvalidToNull))
470	uo = uoSubNo;
471
472	int remainingChars = 0;
473	char *remainingBuffer = 0;
474
475	if (state) {
476	// stateful conversion
477	remainingBuffer = reinterpret_cast<char *>(state->d);
478	if (remainingBuffer) {
479	// restore state
480	remainingChars = state->remainingChars;
481	} else {
482	// first time, add the destructor for state->d
483	state->flags \|= FreeFunction;
484	QTextCodecUnalignedPointer::encode(state->state_data,
485	qOs2LocalCodecStateFree);
486	}
487	}
488
489	const char *mbPtr = chars;
490	size_t mbLeft = length;
491
492	QByteArray mbExtra;
493	if (remainingChars) {
494	// we have to prepend the remaining bytes from the previous conversion
495	mbLeft += remainingChars;
496	mbExtra.resize(mbLeft);
497	mbPtr = mbExtra.data();
498
499	memcpy(mbExtra.data(), remainingBuffer, remainingChars);
500	memcpy(mbExtra.data() + remainingChars, chars, length);
501
502	remainingBuffer = 0;
503	remainingChars = 0;
504	}
505
506	size_t ucLen = mbLeft;
507	QString ucBuf(ucLen, QLatin1Char('\0'));
508	UniChar ucPtr = reinterpret_cast<UniChar >(ucBuf.data());
509	size_t ucLeft = ucLen;
510
511	size_t nonIdent = 0;
512	int rc;
513
514	while (mbLeft) {
515	rc = UniUconvToUcs(uo, (void**)&mbPtr, &mbLeft, &ucPtr, &ucLeft,
516	&nonIdent);
517	if (rc == ULS_BUFFERFULL) {
518	size_t ucDone = ucLen - ucLeft;
519	size_t mbDone = length - mbLeft;
520	// assume that mbLeft/ucLeft is an approximation of mbDone/ucDone
521	ucLen = ucDone + (mbLeft * ucDone) / mbDone;
522	ucBuf.resize(ucLen);
523	ucPtr = reinterpret_cast<UniChar *>(ucBuf.data() + ucDone);
524	} else if (rc == ULS_ILLEGALSEQUENCE && state) {
525	// conversion stopped because the remaining inBytesLeft make up
526	// an incomplete multi-byte sequence; save them for later
527	remainingBuffer = new char[mbLeft];
528	memcpy(remainingBuffer, mbPtr, mbLeft);
529	remainingChars = mbLeft;
530	break;
531	} else if (rc != ULS_SUCCESS) {
532	// just fail on an unexpected error (will return what we've got)
533	qWarning("QOs2LocalCodec::convertToUnicode: UniUconvToUcs failed "
534	"with %d", rc);
535	break;
536	}
537	}
538
539	ucBuf.resize(ucLen - ucLeft);
540	res = ucBuf;
541
542	if (state) {
543	// update the state
544	state->invalidChars = nonIdent;
545	state->remainingChars = remainingChars;
546	state->d = remainingBuffer;
547	}
548
549	return res;
550	}
551
552	QByteArray QOs2LocalCodec::convertFromUnicode(const QChar *uchars, int length,
553	ConverterState *state) const
554	{
555	QByteArray res;
556
557	if (!uchars)
558	return res;
559	if (!length)
560	return QByteArray("");
561
562	UconvObject uo = uoSubYes;
563	if (state && (state->flags & ConvertInvalidToNull))
564	uo = uoSubNo;
565
566	const UniChar ucPtr = reinterpret_cast<const UniChar >(uchars);
567	size_t ucLeft = length;
568
569	QVector<QChar> ucExtra;
570	if (state && state->remainingChars) {
571	// we have one surrogate char to be prepended
572	Q_ASSERT(state->remainingChars == 1);
573	ucLeft += 1;
574	ucExtra.resize(ucLeft);
575	ucPtr = reinterpret_cast<const UniChar *>(ucExtra.data());
576
577	ucExtra[0] = state->state_data[0];
578	memcpy(ucExtra.data() + 1, uchars, length * sizeof(QChar));
579
580	state->remainingChars = 0;
581	}
582
583	// be optimistic (imply that one byte is necessary per every Unicode char)
584	size_t mbLen = length;
585	QByteArray mbBuf(mbLen, '\0');
586	char *mbPtr = mbBuf.data();
587	size_t mbLeft = mbLen;
588
589	size_t nonIdent = 0;
590	int rc;
591
592	while (ucLeft) {
593	rc = UniUconvFromUcs(uo, const_cast<UniChar **>(&ucPtr), &ucLeft,
594	(void**)&mbPtr, &mbLeft, &nonIdent);
595	if (rc == ULS_BUFFERFULL) {
596	size_t mbDone = mbLen - mbLeft;
597	size_t ucDone = length - ucLeft;
598	size_t newLen = mbLen;
599	if (ucDone) {
600	// assume that ucLeft/mbLeft is an approximation of ucDone/mbDone
601	newLen = mbDone + (ucLeft * mbDone) / ucDone;
602	}
603	if (newLen == mbLen) {
604	// could not process a single Unicode char, double the size
605	mbLen *= 2;
606	} else {
607	mbLen = newLen;
608	}
609	mbBuf.resize(mbLen);
610	mbPtr = mbBuf.data() + mbDone;
611	mbLeft = mbLen - mbDone;
612	} else if (rc == ULS_ILLEGALSEQUENCE && state) {
613	// buffer ends in a surrogate
614	Q_ASSERT(ucLeft == 2);
615	state->state_data[0] = *ucPtr;
616	state->remainingChars = 1;
617	break;
618	} else if (rc != ULS_SUCCESS) {
619	// just fail on an unexpected error (will return what we've got)
620	qWarning("QOs2LocalCodec::convertFromUnicode: UniUconvFromUcs failed "
621	"with %d", rc);
622	break;
623	}
624	}
625
626	mbBuf.resize(mbLen - mbLeft);
627	res = mbBuf;
628
629	if (state) {
630	// update the state
631	state->invalidChars = nonIdent;
632	}
633
634	return res;
635	}
636
637	QByteArray QOs2LocalCodec::name() const
638	{
639	return "System";
640	}
641
642	int QOs2LocalCodec::mibEnum() const
643	{
644	return 0;
645	}
646
647	#else
648
/*
    Locale name -> codec guessing tables (locale names mostly copied from
    XFree86). Each table is a null-terminated list of locale names that is
    searched with try_locale_list(); the table name tells which codec a match
    selects.
*/
static const char * const iso8859_2locales[] = {
    "croatian", "cs", "cs_CS", "cs_CZ","cz", "cz_CZ", "czech", "hr",
    "hr_HR", "hu", "hu_HU", "hungarian", "pl", "pl_PL", "polish", "ro",
    "ro_RO", "rumanian", "serbocroatian", "sh", "sh_SP", "sh_YU", "sk",
    "sk_SK", "sl", "sl_CS", "sl_SI", "slovak", "slovene", "sr_SP", 0 };

static const char * const iso8859_3locales[] = {
    "eo", 0 };

static const char * const iso8859_4locales[] = {
    "ee", "ee_EE", 0 };

static const char * const iso8859_5locales[] = {
    "mk", "mk_MK", "sp", "sp_YU", 0 };

static const char * const cp_1251locales[] = {
    "be", "be_BY", "bg", "bg_BG", "bulgarian", 0 };

static const char * const pt_154locales[] = {
    "ba_RU", "ky", "ky_KG", "kk", "kk_KZ", 0 };

static const char * const iso8859_6locales[] = {
    "ar_AA", "ar_SA", "arabic", 0 };

static const char * const iso8859_7locales[] = {
    "el", "el_GR", "greek", 0 };

static const char * const iso8859_8locales[] = {
    "hebrew", "he", "he_IL", "iw", "iw_IL", 0 };

static const char * const iso8859_9locales[] = {
    "tr", "tr_TR", "turkish", 0 };

static const char * const iso8859_13locales[] = {
    "lt", "lt_LT", "lv", "lv_LV", 0 };

static const char * const iso8859_15locales[] = {
    "et", "et_EE",
    // Euro countries
    "br_FR", "ca_ES", "de", "de_AT", "de_BE", "de_DE", "de_LU", "en_IE",
    "es", "es_ES", "eu_ES", "fi", "fi_FI", "finnish", "fr", "fr_FR",
    "fr_BE", "fr_LU", "french", "ga_IE", "gl_ES", "it", "it_IT", "oc_FR",
    "nl", "nl_BE", "nl_NL", "pt", "pt_PT", "sv_FI", "wa_BE",
    0 };

static const char * const koi8_ulocales[] = {
    "uk", "uk_UA", "ru_UA", "ukrainian", 0 };

static const char * const tis_620locales[] = {
    "th", "th_TH", "thai", 0 };

// static const char * const tcvnlocales[] = {
//     "vi", "vi_VN", 0 };
703
704	static bool try_locale_list(const char * const locale[], const QByteArray &lang)
705	{
706	int i;
707	for(i=0; locale[i] && lang != locale[i]; i++)
708	;
709	return locale[i] != 0;
710	}
711
712	// For the probably_koi8_locales we have to look. the standard says
713	// these are 8859-5, but almost all Russian users use KOI8-R and
714	// incorrectly set $LANG to ru_RU. We'll check tolower() to see what
715	// it thinks ru_RU means.
716
717	// If you read the history, it seems that many Russians blame ISO and
718	// Perestroika for the confusion.
719	//
720	// The real bug is that some programs break if the user specifies
721	// ru_RU.KOI8-R.
722
723	static const char * const probably_koi8_rlocales[] = {
724	"ru", "ru_SU", "ru_RU", "russian", 0 };
725
726	static QTextCodec * ru_RU_hack(const char * i) {
727	#if defined(Q_OS_OS2)
728	// @todo temporary hack. the proper one is to use the current process'
729	// code page if LANG or its codepage part is missing
730	return QTextCodec::codecForName("cp866");
731	#else
732	QTextCodec * ru_RU_codec = 0;
733
734	#if !defined(QT_NO_SETLOCALE)
735	QByteArray origlocale(setlocale(LC_CTYPE, i));
736	#else
737	QByteArray origlocale(i);
738	#endif
739	// unicode koi8r latin5 name
740	// 0x044E 0xC0 0xEE CYRILLIC SMALL LETTER YU
741	// 0x042E 0xE0 0xCE CYRILLIC CAPITAL LETTER YU
742	int latin5 = tolower(0xCE);
743	int koi8r = tolower(0xE0);
744	if (koi8r == 0xC0 && latin5 != 0xEE) {
745	ru_RU_codec = QTextCodec::codecForName("KOI8-R");
746	} else if (koi8r != 0xC0 && latin5 == 0xEE) {
747	ru_RU_codec = QTextCodec::codecForName("ISO 8859-5");
748	} else {
749	// something else again... let's assume... throws dice
750	ru_RU_codec = QTextCodec::codecForName("KOI8-R");
751	qWarning("QTextCodec: Using KOI8-R, probe failed (%02x %02x %s)",
752	koi8r, latin5, i);
753	}
754	#if !defined(QT_NO_SETLOCALE)
755	setlocale(LC_CTYPE, origlocale);
756	#endif
757
758	return ru_RU_codec;
759	#endif // defined(Q_OS_OS2)
760	}
761
762	#endif
763
764	#if !defined(Q_OS_WIN32) && !defined(Q_OS_WINCE) && !defined(Q_OS_OS2)
765	static QTextCodec *checkForCodec(const QByteArray &name) {
766	QTextCodec *c = QTextCodec::codecForName(name);
767	if (!c) {
768	const int index = name.indexOf('@');
769	if (index != -1) {
770	c = QTextCodec::codecForName(name.left(index));
771	}
772	}
773	return c;
774	}
775	#endif
776
777	/* the next two functions are implicitely thread safe,
778	as they are only called by setup() which uses a mutex.
779	*/
780	static void setupLocaleMapper()
781	{
782	#if defined(Q_OS_WIN32) \|\| defined(Q_OS_WINCE)
783	localeMapper = QTextCodec::codecForName("System");
784	#elif defined(Q_OS_OS2)
785	localeMapper = QTextCodec::codecForName("System");
786	#else
787
788	#ifndef QT_NO_ICONV
789	localeMapper = QTextCodec::codecForName("System");
790	#endif
791
792	#if defined (_XOPEN_UNIX) && !defined(Q_OS_QNX) && !defined(Q_OS_OSF)
793	if (!localeMapper) {
794	char *charset = nl_langinfo (CODESET);
795	if (charset)
796	localeMapper = QTextCodec::codecForName(charset);
797	}
798	#endif
799
800	if (!localeMapper) {
801	// Very poorly defined and followed standards causes lots of
802	// code to try to get all the cases... This logic is
803	// duplicated in QIconvCodec, so if you change it here, change
804	// it there too.
805
806	// Try to determine locale codeset from locale name assigned to
807	// LC_CTYPE category.
808
809	// First part is getting that locale name. First try setlocale() which
810	// definitely knows it, but since we cannot fully trust it, get ready
811	// to fall back to environment variables.
812	#if !defined(QT_NO_SETLOCALE)
813	const QByteArray ctype = setlocale(LC_CTYPE, 0);
814	#else
815	const QByteArray ctype;
816	#endif
817
818	// Get the first nonempty value from $LC_ALL, $LC_CTYPE, and $LANG
819	// environment variables.
820	QByteArray lang = qgetenv("LC_ALL");
821	if (lang.isEmpty() \|\| lang == "C") {
822	lang = qgetenv("LC_CTYPE");
823	}
824	if (lang.isEmpty() \|\| lang == "C") {
825	lang = qgetenv("LANG");
826	}
827
828	// Now try these in order:
829	// 1. CODESET from ctype if it contains a .CODESET part (e.g. en_US.ISO8859-15)
830	// 2. CODESET from lang if it contains a .CODESET part
831	// 3. ctype (maybe the locale is named "ISO-8859-1" or something)
832	// 4. locale (ditto)
833	// 5. check for "@euro"
834	// 6. guess locale from ctype unless ctype is "C"
835	// 7. guess locale from lang
836
837	// 1. CODESET from ctype if it contains a .CODESET part (e.g. en_US.ISO8859-15)
838	int indexOfDot = ctype.indexOf('.');
839	if (indexOfDot != -1)
840	localeMapper = checkForCodec( ctype.mid(indexOfDot + 1) );
841
842	// 2. CODESET from lang if it contains a .CODESET part
843	if (!localeMapper) {
844	indexOfDot = lang.indexOf('.');
845	if (indexOfDot != -1)
846	localeMapper = checkForCodec( lang.mid(indexOfDot + 1) );
847	}
848
849	// 3. ctype (maybe the locale is named "ISO-8859-1" or something)
850	if (!localeMapper && !ctype.isEmpty() && ctype != "C")
851	localeMapper = checkForCodec(ctype);
852
853	// 4. locale (ditto)
854	if (!localeMapper && !lang.isEmpty())
855	localeMapper = checkForCodec(lang);
856
857	// 5. "@euro"
858	if ((!localeMapper && ctype.contains("@euro")) \|\| lang.contains("@euro"))
859	localeMapper = checkForCodec("ISO 8859-15");
860
861	// 6. guess locale from ctype unless ctype is "C"
862	// 7. guess locale from lang
863	const QByteArray &try_by_name = (!ctype.isEmpty() && ctype != "C") ? lang : ctype;
864
865	// Now do the guessing.
866	if (!lang.isEmpty() && !localeMapper && !try_by_name.isEmpty()) {
867	if (try_locale_list(iso8859_15locales, lang))
868	localeMapper = QTextCodec::codecForName("ISO 8859-15");
869	else if (try_locale_list(iso8859_2locales, lang))
870	localeMapper = QTextCodec::codecForName("ISO 8859-2");
871	else if (try_locale_list(iso8859_3locales, lang))
872	localeMapper = QTextCodec::codecForName("ISO 8859-3");
873	else if (try_locale_list(iso8859_4locales, lang))
874	localeMapper = QTextCodec::codecForName("ISO 8859-4");
875	else if (try_locale_list(iso8859_5locales, lang))
876	localeMapper = QTextCodec::codecForName("ISO 8859-5");
877	else if (try_locale_list(iso8859_6locales, lang))
878	localeMapper = QTextCodec::codecForName("ISO 8859-6");
879	else if (try_locale_list(iso8859_7locales, lang))
880	localeMapper = QTextCodec::codecForName("ISO 8859-7");
881	else if (try_locale_list(iso8859_8locales, lang))
882	localeMapper = QTextCodec::codecForName("ISO 8859-8-I");
883	else if (try_locale_list(iso8859_9locales, lang))
884	localeMapper = QTextCodec::codecForName("ISO 8859-9");
885	else if (try_locale_list(iso8859_13locales, lang))
886	localeMapper = QTextCodec::codecForName("ISO 8859-13");
887	else if (try_locale_list(tis_620locales, lang))
888	localeMapper = QTextCodec::codecForName("ISO 8859-11");
889	else if (try_locale_list(koi8_ulocales, lang))
890	localeMapper = QTextCodec::codecForName("KOI8-U");
891	else if (try_locale_list(cp_1251locales, lang))
892	localeMapper = QTextCodec::codecForName("CP 1251");
893	else if (try_locale_list(pt_154locales, lang))
894	localeMapper = QTextCodec::codecForName("PT 154");
895	else if (try_locale_list(probably_koi8_rlocales, lang))
896	localeMapper = ru_RU_hack(lang);
897	}
898
899	}
900
901	// If everything failed, we default to 8859-1
902	// We could perhaps default to 8859-15.
903	if (!localeMapper)
904	localeMapper = QTextCodec::codecForName("ISO 8859-1");
905	#endif
906	}
907
908
909	static void setup()
910	{
911	#ifndef QT_NO_THREAD
912	QMutexLocker locker(QMutexPool::globalInstanceGet(&all));
913	#endif
914
915	if (all)
916	return;
917
918	#ifdef Q_DEBUG_TEXTCODEC
919	if (destroying_is_ok)
920	qWarning("QTextCodec: Creating new codec during codec cleanup");
921	#endif
922	all = new QList<QTextCodec*>;
923	// create the cleanup object to cleanup all codecs on exit
924	(void) createQTextCodecCleanup();
925
926	#ifndef QT_NO_CODECS
927	# if defined(Q_WS_X11) && !defined(QT_BOOTSTRAPPED)
928	// no font codecs when bootstrapping
929	(void)new QFontLaoCodec;
930	# if defined(QT_NO_ICONV)
931	// no iconv(3) support, must build all codecs into the library
932	(void)new QFontGb2312Codec;
933	(void)new QFontGbkCodec;
934	(void)new QFontGb18030_0Codec;
935	(void)new QFontJis0208Codec;
936	(void)new QFontJis0201Codec;
937	(void)new QFontKsc5601Codec;
938	(void)new QFontBig5hkscsCodec;
939	(void)new QFontBig5Codec;
940	# endif // QT_NO_ICONV && !QT_BOOTSTRAPPED
941	# endif // Q_WS_X11
942
943	(void)new QTsciiCodec;
944
945	for (int i = 0; i < 9; ++i)
946	(void)new QIsciiCodec(i);
947
948
949	# if defined(QT_NO_ICONV) && !defined(QT_BOOTSTRAPPED)
950	// no asian codecs when bootstrapping, sorry
951	(void)new QGb18030Codec;
952	(void)new QGbkCodec;
953	(void)new QGb2312Codec;
954	(void)new QEucJpCodec;
955	(void)new QJisCodec;
956	(void)new QSjisCodec;
957	(void)new QEucKrCodec;
958	(void)new QCP949Codec;
959	(void)new QBig5Codec;
960	(void)new QBig5hkscsCodec;
961	# endif // QT_NO_ICONV && !QT_BOOTSTRAPPED
962	#endif // QT_NO_CODECS
963
964	#if defined(Q_OS_WIN32) \|\| defined(Q_OS_WINCE)
965	(void) new QWindowsLocalCodec;
966	#endif // Q_OS_WIN32
967
968	#if defined(Q_OS_OS2)
969	(void) new QOs2LocalCodec;
970	#endif // Q_OS_OS2
971
972	(void)new QUtf16Codec;
973	(void)new QUtf16BECodec;
974	(void)new QUtf16LECodec;
975	(void)new QUtf32Codec;
976	(void)new QUtf32BECodec;
977	(void)new QUtf32LECodec;
978	(void)new QLatin15Codec;
979	(void)new QLatin1Codec;
980	(void)new QUtf8Codec;
981
982	for (int i = 0; i < QSimpleTextCodec::numSimpleCodecs; ++i)
983	(void)new QSimpleTextCodec(i);
984
985	#if defined(Q_OS_UNIX) && !defined(QT_NO_ICONV) && !defined(QT_BOOTSTRAPPED)
986	// QIconvCodec depends on the UTF-16 codec, so it needs to be created last
987	(void) new QIconvCodec();
988	#endif
989
990	if (!localeMapper)
991	setupLocaleMapper();
992	}
993
994	/*!
995	\enum QTextCodec::ConversionFlag
996
997	\value DefaultConversion No flag is set.
998	\value ConvertInvalidToNull If this flag is set, each invalid input
999	character is output as a null character.
1000	\value IgnoreHeader Ignore any Unicode byte-order mark and don't generate any.
1001
1002	\omitvalue FreeFunction
1003	*/
1004
1005	/*!
1006	\fn QTextCodec::ConverterState::ConverterState(ConversionFlags flags)
1007
1008	Constructs a ConverterState object initialized with the given \a flags.
1009	*/
1010
1011	/*!
1012	Destroys the ConverterState object.
1013	*/
1014	QTextCodec::ConverterState::~ConverterState()
1015	{
1016	if (flags & FreeFunction)
1017	(QTextCodecUnalignedPointer::decode(state_data))(this);
1018	else if (d)
1019	qFree(d);
1020	}
1021
1022	static bool codecForLocaleSet = false;
1023	void qt_resetCodecForLocale()
1024	{
1025	// if QTextCodec::codecForLocale() was called, we assume that the user has
1026	// explicitly set the codec he wants for the locale and don't attempt to
1027	// autodetect it again
1028	if (!codecForLocaleSet)
1029	setupLocaleMapper();
1030	}
1031
1032	/*!
1033	\class QTextCodec
1034	\brief The QTextCodec class provides conversions between text encodings.
1035	\reentrant
1036	\ingroup i18n
1037
1038	Qt uses Unicode to store, draw and manipulate strings. In many
1039	situations you may wish to deal with data that uses a different
1040	encoding. For example, most Japanese documents are still stored
1041	in Shift-JIS or ISO 2022-JP, while Russian users often have their
1042	documents in KOI8-R or Windows-1251.
1043
1044	Qt provides a set of QTextCodec classes to help with converting
1045	non-Unicode formats to and from Unicode. You can also create your
1046	own codec classes.
1047
1048	The supported encodings are:
1049
1050	\list
1051	\o Apple Roman
1052	\o \l{Big5 Text Codec}{Big5}
1053	\o \l{Big5-HKSCS Text Codec}{Big5-HKSCS}
1054	\o CP949
1055	\o \l{EUC-JP Text Codec}{EUC-JP}
1056	\o \l{EUC-KR Text Codec}{EUC-KR}
1057	\o \l{GBK Text Codec}{GB18030-0}
1058	\o IBM 850
1059	\o IBM 866
1060	\o IBM 874
1061	\o \l{ISO 2022-JP (JIS) Text Codec}{ISO 2022-JP}
1062	\o ISO 8859-1 to 10
1063	\o ISO 8859-13 to 16
1064	\o Iscii-Bng, Dev, Gjr, Knd, Mlm, Ori, Pnj, Tlg, and Tml
1065	\o JIS X 0201
1066	\o JIS X 0208
1067	\o KOI8-R
1068	\o KOI8-U
1069	\o MuleLao-1
1070	\o ROMAN8
1071	\o \l{Shift-JIS Text Codec}{Shift-JIS}
1072	\o TIS-620
1073	\o \l{TSCII Text Codec}{TSCII}
1074	\o UTF-8
1075	\o UTF-16
1076	\o UTF-16BE
1077	\o UTF-16LE
1078	\o UTF-32
1079	\o UTF-32BE
1080	\o UTF-32LE
1081	\o Windows-1250 to 1258
1082	\o WINSAMI2
1083	\endlist
1084
1085	QTextCodecs can be used as follows to convert some locally encoded
1086	string to Unicode. Suppose you have some string encoded in Russian
1087	KOI8-R encoding, and want to convert it to Unicode. The simple way
1088	to do it is like this:
1089
1090	\snippet doc/src/snippets/code/src_corelib_codecs_qtextcodec.cpp 0
1091
1092	After this, \c string holds the text converted to Unicode.
1093	Converting a string from Unicode to the local encoding is just as
1094	easy:
1095
1096	\snippet doc/src/snippets/code/src_corelib_codecs_qtextcodec.cpp 1
1097
1098	To read or write files in various encodings, use QTextStream and
1099	its \l{QTextStream::setCodec()}{setCodec()} function. See the
1100	\l{tools/codecs}{Codecs} example for an application of QTextCodec
1101	to file I/O.
1102
1103	Some care must be taken when trying to convert the data in chunks,
1104	for example, when receiving it over a network. In such cases it is
1105	possible that a multi-byte character will be split over two
1106	chunks. At best this might result in the loss of a character and
1107	at worst cause the entire conversion to fail.
1108
1109	The approach to use in these situations is to create a QTextDecoder
1110	object for the codec and use this QTextDecoder for the whole
1111	decoding process, as shown below:
1112
1113	\snippet doc/src/snippets/code/src_corelib_codecs_qtextcodec.cpp 2
1114
1115	The QTextDecoder object maintains state between chunks and therefore
1116	works correctly even if a multi-byte character is split between
1117	chunks.
1118
1119	\section1 Creating Your Own Codec Class
1120
1121	Support for new text encodings can be added to Qt by creating
1122	QTextCodec subclasses.
1123
1124	The pure virtual functions describe the encoder to the system and
1125	the coder is used as required in the different text file formats
1126	supported by QTextStream, and under X11, for the locale-specific
1127	character input and output.
1128
1129	To add support for another encoding to Qt, make a subclass of
1130	QTextCodec and implement the functions listed in the table below.
1131
1132	\table
1133	\header \o Function \o Description
1134
1135	\row \o name()
1136	\o Returns the official name for the encoding. If the
1137	encoding is listed in the
1138	\l{IANA character-sets encoding file}, the name
1139	should be the preferred MIME name for the encoding.
1140
1141	\row \o aliases()
1142	\o Returns a list of alternative names for the encoding.
1143	QTextCodec provides a default implementation that returns
1144	an empty list. For example, "ISO-8859-1" has "latin1",
1145	"CP819", "IBM819", and "iso-ir-100" as aliases.
1146
1147	\row \o mibEnum()
1148	\o Returns the MIB enum for the encoding if it is listed in
1149	the \l{IANA character-sets encoding file}.
1150
1151	\row \o convertToUnicode()
1152	\o Converts an 8-bit character string to Unicode.
1153
1154	\row \o convertFromUnicode()
1155	\o Converts a Unicode string to an 8-bit character string.
1156	\endtable
1157
1158	You may find it more convenient to make your codec class
1159	available as a plugin; see \l{How to Create Qt Plugins} for
1160	details.
1161
1162	\sa QTextStream, QTextDecoder, QTextEncoder, {Codecs Example}
1163	*/
1164
1165	/*!
1166	\nonreentrant
1167
1168	Constructs a QTextCodec, and gives it the highest precedence. The
1169	QTextCodec should always be constructed on the heap (i.e. with \c
1170	new). Qt takes ownership and will delete it when the application
1171	terminates.
1172	*/
1173	QTextCodec::QTextCodec()
1174	{
1175	setup();
1176	all->prepend(this);
1177	}
1178
1179
1180	/*!
1181	\nonreentrant
1182
1183	Destroys the QTextCodec. Note that you should not delete codecs
1184	yourself: once created they become Qt's responsibility.
1185	*/
1186	QTextCodec::~QTextCodec()
1187	{
1188	#ifdef Q_DEBUG_TEXTCODEC
1189	if (!destroying_is_ok)
1190	qWarning("QTextCodec::~QTextCodec: Called by application");
1191	#endif
1192	if (all)
1193	all->removeAll(this);
1194	}
1195
1196	/*!
1197	\fn QTextCodec *QTextCodec::codecForName(const char *name)
1198
1199	Searches all installed QTextCodec objects and returns the one
1200	which best matches \a name; the match is case-insensitive. Returns
1201	0 if no codec matching the name \a name could be found.
1202	*/
1203
1204	/*!
1205	Searches all installed QTextCodec objects and returns the one
1206	which best matches \a name; the match is case-insensitive. Returns
1207	0 if no codec matching the name \a name could be found.
1208	*/
1209	QTextCodec *QTextCodec::codecForName(const QByteArray &name)
1210	{
1211	if (name.isEmpty())
1212	return 0;
1213
1214	setup();
1215
1216	for (int i = 0; i < all->size(); ++i) {
1217	QTextCodec *cursor = all->at(i);
1218	if (nameMatch(cursor->name(), name))
1219	return cursor;
1220	QList<QByteArray> aliases = cursor->aliases();
1221	for (int i = 0; i < aliases.size(); ++i)
1222	if (nameMatch(aliases.at(i), name))
1223	return cursor;
1224	}
1225
1226	return createForName(name);
1227	}
1228
1229
1230	/*!
1231	Returns the QTextCodec which matches the \link
1232	QTextCodec::mibEnum() MIBenum\endlink \a mib.
1233	*/
1234	QTextCodec* QTextCodec::codecForMib(int mib)
1235	{
1236	setup();
1237
1238	// Qt 3 used 1000 (mib for UCS2) as its identifier for the utf16 codec. Map
1239	// this correctly for compatibility.
1240	if (mib == 1000)
1241	mib = 1015;
1242
1243	QList<QTextCodec*>::ConstIterator i;
1244	for (int i = 0; i < all->size(); ++i) {
1245	QTextCodec *cursor = all->at(i);
1246	if (cursor->mibEnum() == mib)
1247	return cursor;
1248	}
1249
1250	return createForMib(mib);
1251	}
1252
1253	/*!
1254	Returns the list of all available codecs, by name. Call
1255	QTextCodec::codecForName() to obtain the QTextCodec for the name.
1256
1257	The list may contain many mentions of the same codec
1258	if the codec has aliases.
1259
1260	\sa availableMibs(), name(), aliases()
1261	*/
1262	QList<QByteArray> QTextCodec::availableCodecs()
1263	{
1264	setup();
1265
1266	QList<QByteArray> codecs;
1267	for (int i = 0; i < all->size(); ++i) {
1268	codecs += all->at(i)->name();
1269	codecs += all->at(i)->aliases();
1270	}
1271	#ifndef QT_NO_TEXTCODECPLUGIN
1272	QFactoryLoader *l = loader();
1273	QStringList keys = l->keys();
1274	for (int i = 0; i < keys.size(); ++i) {
1275	if (!keys.at(i).startsWith(QLatin1String("MIB: "))) {
1276	QByteArray name = keys.at(i).toLatin1();
1277	if (!codecs.contains(name))
1278	codecs += name;
1279	}
1280	}
1281	#endif
1282
1283	return codecs;
1284	}
1285
1286	/*!
1287	Returns the list of MIBs for all available codecs. Call
1288	QTextCodec::codecForMib() to obtain the QTextCodec for the MIB.
1289
1290	\sa availableCodecs(), mibEnum()
1291	*/
1292	QList<int> QTextCodec::availableMibs()
1293	{
1294	setup();
1295
1296	QList<int> codecs;
1297	for (int i = 0; i < all->size(); ++i)
1298	codecs += all->at(i)->mibEnum();
1299	#ifndef QT_NO_TEXTCODECPLUGIN
1300	QFactoryLoader *l = loader();
1301	QStringList keys = l->keys();
1302	for (int i = 0; i < keys.size(); ++i) {
1303	if (keys.at(i).startsWith(QLatin1String("MIB: "))) {
1304	int mib = keys.at(i).mid(5).toInt();
1305	if (!codecs.contains(mib))
1306	codecs += mib;
1307	}
1308	}
1309	#endif
1310
1311	return codecs;
1312	}
1313
1314	/*!
1315	Set the codec to \a c; this will be returned by
1316	codecForLocale(). If \a c is a null pointer, the codec is reset to
1317	the default.
1318
1319	This might be needed for some applications that want to use their
1320	own mechanism for setting the locale.
1321
1322	\sa codecForLocale()
1323	*/
1324	void QTextCodec::setCodecForLocale(QTextCodec *c)
1325	{
1326	codecForLocaleSet = true;
1327	localeMapper = c;
1328	if (!localeMapper)
1329	setupLocaleMapper();
1330	}
1331
1332	/*!
1333	Returns a pointer to the codec most suitable for this locale.
1334
1335	On Windows, the codec will be based on a system locale. On Unix
1336	systems, starting with Qt 4.2, the codec will be using the \e
1337	iconv library. Note that in both cases the codec's name will be
1338	"System".
1339	*/
1340
1341	QTextCodec* QTextCodec::codecForLocale()
1342	{
1343	if (localeMapper)
1344	return localeMapper;
1345
1346	setup();
1347
1348	return localeMapper;
1349	}
1350
1351
1352	/*!
1353	\fn QByteArray QTextCodec::name() const
1354
1355	QTextCodec subclasses must reimplement this function. It returns
1356	the name of the encoding supported by the subclass.
1357
1358	If the codec is registered as a character set in the
1359	\l{IANA character-sets encoding file} this method should
1360	return the preferred mime name for the codec if defined,
1361	otherwise its name.
1362	*/
1363
1364	/*!
1365	\fn int QTextCodec::mibEnum() const
1366
1367	Subclasses of QTextCodec must reimplement this function. It
1368	returns the MIBenum (see \l{IANA character-sets encoding file}
1369	for more information). It is important that each QTextCodec
1370	subclass returns the correct unique value for this function.
1371	*/
1372
1373	/*!
1374	Subclasses can return a number of aliases for the codec in question.
1375
1376	Standard aliases for codecs can be found in the
1377	\l{IANA character-sets encoding file}.
1378	*/
1379	QList<QByteArray> QTextCodec::aliases() const
1380	{
1381	return QList<QByteArray>();
1382	}
1383
1384	/*!
1385	\fn QString QTextCodec::convertToUnicode(const char *chars, int len,
1386	ConverterState *state) const
1387
1388	QTextCodec subclasses must reimplement this function.
1389
1390	Converts the first \a len characters of \a chars from the
1391	encoding of the subclass to Unicode, and returns the result in a
1392	QString.
1393
1394	\a state can be 0, in which case the conversion is stateless and
1395	default conversion rules should be used. If state is not 0, the
1396	codec should save the state after the conversion in \a state, and
1397	adjust the remainingChars and invalidChars members of the struct.
1398	*/
1399
1400	/*!
1401	\fn QByteArray QTextCodec::convertFromUnicode(const QChar *input, int number,
1402	ConverterState *state) const
1403
1404	QTextCodec subclasses must reimplement this function.
1405
1406	Converts the first \a number of characters from the \a input array
1407	from Unicode to the encoding of the subclass, and returns the result
1408	in a QByteArray.
1409
1410	\a state can be 0 in which case the conversion is stateless and
1411	default conversion rules should be used. If state is not 0, the
1412	codec should save the state after the conversion in \a state, and
1413	adjust the remainingChars and invalidChars members of the struct.
1414	*/
1415
1416	/*!
1417	Creates a QTextDecoder which stores enough state to decode chunks
1418	of \c{char *} data to create chunks of Unicode data.
1419
1420	The caller is responsible for deleting the returned object.
1421	*/
1422	QTextDecoder* QTextCodec::makeDecoder() const
1423	{
1424	return new QTextDecoder(this);
1425	}
1426
1427
1428	/*!
1429	Creates a QTextEncoder which stores enough state to encode chunks
1430	of Unicode data as \c{char *} data.
1431
1432	The caller is responsible for deleting the returned object.
1433	*/
1434	QTextEncoder* QTextCodec::makeEncoder() const
1435	{
1436	return new QTextEncoder(this);
1437	}
1438
1439	/*!
1440	\fn QByteArray QTextCodec::fromUnicode(const QChar *input, int number,
1441	ConverterState *state) const
1442
1443	Converts the first \a number of characters from the \a input array
1444	from Unicode to the encoding of this codec, and returns the result
1445	in a QByteArray.
1446
1447	The \a state of the converter used is updated.
1448	*/
1449
1450	/*!
1451	Converts \a str from Unicode to the encoding of this codec, and
1452	returns the result in a QByteArray.
1453	*/
1454	QByteArray QTextCodec::fromUnicode(const QString& str) const
1455	{
1456	return convertFromUnicode(str.constData(), str.length(), 0);
1457	}
1458
1459	/*!
1460	\fn QString QTextCodec::toUnicode(const char *input, int size,
1461	ConverterState *state) const
1462
1463	Converts the first \a size characters from the \a input from the
1464	encoding of this codec to Unicode, and returns the result in a
1465	QString.
1466
1467	The \a state of the converter used is updated.
1468	*/
1469
1470	/*!
1471	Converts \a a from the encoding of this codec to Unicode, and
1472	returns the result in a QString.
1473	*/
1474	QString QTextCodec::toUnicode(const QByteArray& a) const
1475	{
1476	return convertToUnicode(a.constData(), a.length(), 0);
1477	}
1478
1479	/*!
1480	Returns true if the Unicode character \a ch can be fully encoded
1481	with this codec; otherwise returns false.
1482	*/
1483	bool QTextCodec::canEncode(QChar ch) const
1484	{
1485	ConverterState state;
1486	state.flags = ConvertInvalidToNull;
1487	convertFromUnicode(&ch, 1, &state);
1488	return (state.invalidChars == 0);
1489	}
1490
1491	/*!
1492	\overload
1493
1494	\a s contains the string being tested for encode-ability.
1495	*/
1496	bool QTextCodec::canEncode(const QString& s) const
1497	{
1498	ConverterState state;
1499	state.flags = ConvertInvalidToNull;
1500	convertFromUnicode(s.constData(), s.length(), &state);
1501	return (state.invalidChars == 0);
1502	}
1503
1504	#ifdef QT3_SUPPORT
1505	/*!
1506	Returns a string representing the current language and
1507	sublanguage, e.g. "pt" for Portuguese, or "pt_br" for Portuguese/Brazil.
1508
1509	\sa QLocale
1510	*/
1511	const char *QTextCodec::locale()
1512	{
1513	static char locale[6];
1514	QByteArray l = QLocale::system().name().toLatin1();
1515	int len = qMin(l.length(), 5);
1516	memcpy(locale, l.constData(), len);
1517	locale[len] = '\0';
1518
1519	return locale;
1520	}
1521
1522	/*!
1523	\overload
1524	*/
1525
1526	QByteArray QTextCodec::fromUnicode(const QString& uc, int& lenInOut) const
1527	{
1528	QByteArray result = convertFromUnicode(uc.constData(), lenInOut, 0);
1529	lenInOut = result.length();
1530	return result;
1531	}
1532
1533	/*!
1534	\overload
1535
1536	\a a contains the source characters; \a len contains the number of
1537	characters in \a a to use.
1538	*/
1539	QString QTextCodec::toUnicode(const QByteArray& a, int len) const
1540	{
1541	len = qMin(a.size(), len);
1542	return convertToUnicode(a.constData(), len, 0);
1543	}
1544	#endif
1545
1546	/*!
1547	\overload
1548
1549	\a chars contains the source characters.
1550	*/
1551	QString QTextCodec::toUnicode(const char *chars) const
1552	{
1553	int len = qstrlen(chars);
1554	return convertToUnicode(chars, len, 0);
1555	}
1556
1557
1558	/*!
1559	\class QTextEncoder
1560	\brief The QTextEncoder class provides a state-based encoder.
1561	\reentrant
1562	\ingroup i18n
1563
1564	A text encoder converts text from Unicode into an encoded text format
1565	using a specific codec.
1566
1567	The encoder converts Unicode into another format, remembering any
1568	state that is required between calls.
1569
1570	\sa QTextCodec::makeEncoder(), QTextDecoder
1571	*/
1572
1573	/*!
1574	\fn QTextEncoder::QTextEncoder(const QTextCodec *codec)
1575
1576	Constructs a text encoder for the given \a codec.
1577	*/
1578
1579	/*!
1580	Destroys the encoder.
1581	*/
1582	QTextEncoder::~QTextEncoder()
1583	{
1584	}
1585
1586	/*! \internal
1587	\since 4.5
1588	Determines whether the encoder encountered a failure while encoding the input. If
1589	an error was encountered, the produced result is undefined, and is converted according
1590	to the conversion flags.
1591	*/
1592	bool QTextEncoder::hasFailure() const
1593	{
1594	return state.invalidChars != 0;
1595	}
1596
1597	/*!
1598	Converts the Unicode string \a str into an encoded QByteArray.
1599	*/
1600	QByteArray QTextEncoder::fromUnicode(const QString& str)
1601	{
1602	QByteArray result = c->fromUnicode(str.constData(), str.length(), &state);
1603	return result;
1604	}
1605
1606	/*!
1607	\overload
1608
1609	Converts \a len characters (not bytes) from \a uc, and returns the
1610	result in a QByteArray.
1611	*/
1612	QByteArray QTextEncoder::fromUnicode(const QChar *uc, int len)
1613	{
1614	QByteArray result = c->fromUnicode(uc, len, &state);
1615	return result;
1616	}
1617
1618	#ifdef QT3_SUPPORT
1619	/*!
1620	\overload
1621
1622	Converts \a lenInOut characters (not bytes) from \a uc, and returns the
1623	result in a QByteArray. On return, \a lenInOut is set to the length of
1624	the resulting QByteArray.
1625	*/
1626	QByteArray QTextEncoder::fromUnicode(const QString& uc, int& lenInOut)
1627	{
1628	QByteArray result = c->fromUnicode(uc.constData(), lenInOut, &state);
1629	lenInOut = result.length();
1630	return result;
1631	}
1632	#endif
1633
1634	/*!
1635	\class QTextDecoder
1636	\brief The QTextDecoder class provides a state-based decoder.
1637	\reentrant
1638	\ingroup i18n
1639
1640	A text decoder converts text from an encoded text format into Unicode
1641	using a specific codec.
1642
1643	The decoder converts text in this format into Unicode, remembering any
1644	state that is required between calls.
1645
1646	\sa QTextCodec::makeDecoder(), QTextEncoder
1647	*/
1648
1649	/*!
1650	\fn QTextDecoder::QTextDecoder(const QTextCodec *codec)
1651
1652	Constructs a text decoder for the given \a codec.
1653	*/
1654
1655	/*!
1656	Destroys the decoder.
1657	*/
1658	QTextDecoder::~QTextDecoder()
1659	{
1660	}
1661
1662	/*!
1663	\fn QString QTextDecoder::toUnicode(const char *chars, int len)
1664
1665	Converts the first \a len bytes in \a chars to Unicode, returning
1666	the result.
1667
1668	If not all characters are used (e.g. if only part of a multi-byte
1669	encoding is at the end of the characters), the decoder remembers
1670	enough state to continue with the next call to this function.
1671	*/
1672	QString QTextDecoder::toUnicode(const char *chars, int len)
1673	{
1674	return c->toUnicode(chars, len, &state);
1675	}
1676
1677
1678	/*! \overload
1679
1680	The converted string is returned in \a target.
1681	*/
1682	void QTextDecoder::toUnicode(QString target, const char chars, int len)
1683	{
1684	Q_ASSERT(target);
1685	switch (c->mibEnum()) {
1686	case 106: // utf8
1687	static_cast<const QUtf8Codec*>(c)->convertToUnicode(target, chars, len, &state);
1688	break;
1689	case 4: { // latin1
1690	target->resize(len);
1691	ushort data = (ushort)target->data();
1692	for (int i = len; i >=0; --i)
1693	data[i] = (uchar) chars[i];
1694	} break;
1695	default:
1696	*target = c->toUnicode(chars, len, &state);
1697	}
1698	}
1699
1700
1701	/*!
1702	\overload
1703
1704	Converts the bytes in the byte array specified by \a ba to Unicode
1705	and returns the result.
1706	*/
1707	QString QTextDecoder::toUnicode(const QByteArray &ba)
1708	{
1709	return c->toUnicode(ba.constData(), ba.length(), &state);
1710	}
1711
1712
1713	/*!
1714	\fn QTextCodec* QTextCodec::codecForTr()
1715
1716	Returns the codec used by QObject::tr() on its argument. If this
1717	function returns 0 (the default), tr() assumes Latin-1.
1718
1719	\sa setCodecForTr()
1720	*/
1721
1722	/*!
1723	\fn void QTextCodec::setCodecForTr(QTextCodec *c)
1724	\nonreentrant
1725
1726	Sets the codec used by QObject::tr() on its argument to \a c. If
1727	\a c is 0 (the default), tr() assumes Latin-1.
1728
1729	If the literal quoted text in the program is not in the Latin-1
1730	encoding, this function can be used to set the appropriate
1731	encoding. For example, software developed by Korean programmers
1732	might use eucKR for all the text in the program, in which case the
1733	main() function might look like this:
1734
1735	\snippet doc/src/snippets/code/src_corelib_codecs_qtextcodec.cpp 3
1736
1737	Note that this is not the way to select the encoding that the \e
1738	user has chosen. For example, to convert an application containing
1739	literal English strings to Korean, all that is needed is for the
1740	English strings to be passed through tr() and for translation
1741	files to be loaded. For details of internationalization, see
1742	\l{Internationalization with Qt}.
1743
1744	\sa codecForTr(), setCodecForCStrings()
1745	*/
1746
1747
1748	/*!
1749	\fn QTextCodec* QTextCodec::codecForCStrings()
1750
1751	Returns the codec used by QString to convert to and from \c{const
1752	char *} and QByteArrays. If this function returns 0 (the default),
1753	QString assumes Latin-1.
1754
1755	\sa setCodecForCStrings()
1756	*/
1757
1758	/*!
1759	\fn void QTextCodec::setCodecForCStrings(QTextCodec *codec)
1760	\nonreentrant
1761
1762	Sets the codec used by QString to convert to and from \c{const
1763	char *} and QByteArrays. If the \a codec is 0 (the default),
1764	QString assumes Latin-1.
1765
1766	\warning Some codecs do not preserve the characters in the ASCII
1767	range (0x00 to 0x7F). For example, the Japanese Shift-JIS
1768	encoding maps the backslash character (0x5C) to the Yen
1769	character. To avoid undesirable side-effects, we recommend
1770	avoiding such codecs with setCodecForCStrings().
1771
1772	\sa codecForCStrings(), setCodecForTr()
1773	*/
1774
1775	/*!
1776	\since 4.4
1777
1778	Tries to detect the encoding of the provided snippet of HTML in
1779	the given byte array, \a ba, by checking the BOM (Byte Order Mark)
1780	and the content-type meta header and returns a QTextCodec instance
1781	that is capable of decoding the html to unicode. If the codec
1782	cannot be detected from the content provided, \a defaultCodec is
1783	returned.
1784
1785	\sa codecForUtfText()
1786	*/
1787	QTextCodec QTextCodec::codecForHtml(const QByteArray &ba, QTextCodec defaultCodec)
1788	{
1789	// determine charset
1790	int pos;
1791	QTextCodec *c = 0;
1792
1793	c = QTextCodec::codecForUtfText(ba, c);
1794	if (!c) {
1795	QByteArray header = ba.left(512).toLower();
1796	if ((pos = header.indexOf("http-equiv=")) != -1) {
1797	if ((pos = header.lastIndexOf("meta ", pos)) != -1) {
1798	pos = header.indexOf("charset=", pos) + int(strlen("charset="));
1799	if (pos != -1) {
1800	int pos2 = header.indexOf('\"', pos+1);
1801	QByteArray cs = header.mid(pos, pos2-pos);
1802	// qDebug("found charset: %s", cs.data());
1803	c = QTextCodec::codecForName(cs);
1804	}
1805	}
1806	}
1807	}
1808	if (!c)
1809	c = defaultCodec;
1810
1811	return c;
1812	}
1813
1814	/*!
1815	\overload
1816
1817	Tries to detect the encoding of the provided snippet of HTML in
1818	the given byte array, \a ba, by checking the BOM (Byte Order Mark)
1819	and the content-type meta header and returns a QTextCodec instance
1820	that is capable of decoding the html to unicode. If the codec cannot
1821	be detected, this overload returns a Latin-1 QTextCodec.
1822	*/
1823	QTextCodec *QTextCodec::codecForHtml(const QByteArray &ba)
1824	{
1825	return codecForHtml(ba, QTextCodec::codecForMib(/Latin 1/ 4));
1826	}
1827
1828	/*!
1829	\since 4.6
1830
1831	Tries to detect the encoding of the provided snippet \a ba by
1832	using the BOM (Byte Order Mark) and returns a QTextCodec instance
1833	that is capable of decoding the text to unicode. If the codec
1834	cannot be detected from the content provided, \a defaultCodec is
1835	returned.
1836
1837	\sa codecForHtml()
1838	*/
1839	QTextCodec QTextCodec::codecForUtfText(const QByteArray &ba, QTextCodec defaultCodec)
1840	{
1841	const int arraySize = ba.size();
1842
1843	if (arraySize > 3) {
1844	if ((uchar)ba[0] == 0x00
1845	&& (uchar)ba[1] == 0x00
1846	&& (uchar)ba[2] == 0xFE
1847	&& (uchar)ba[3] == 0xFF)
1848	return QTextCodec::codecForMib(1018); // utf-32 be
1849	else if ((uchar)ba[0] == 0xFF
1850	&& (uchar)ba[1] == 0xFE
1851	&& (uchar)ba[2] == 0x00
1852	&& (uchar)ba[3] == 0x00)
1853	return QTextCodec::codecForMib(1019); // utf-32 le
1854	}
1855
1856	if (arraySize < 2)
1857	return defaultCodec;
1858	if ((uchar)ba[0] == 0xfe && (uchar)ba[1] == 0xff)
1859	return QTextCodec::codecForMib(1013); // utf16 be
1860	else if ((uchar)ba[0] == 0xff && (uchar)ba[1] == 0xfe)
1861	return QTextCodec::codecForMib(1014); // utf16 le
1862
1863	if (arraySize < 3)
1864	return defaultCodec;
1865	if ((uchar)ba[0] == 0xef
1866	&& (uchar)ba[1] == 0xbb
1867	&& (uchar)ba[2] == 0xbf)
1868	return QTextCodec::codecForMib(106); // utf-8
1869
1870	return defaultCodec;
1871	}
1872
1873	/*!
1874	\overload
1875
1876	Tries to detect the encoding of the provided snippet \a ba by
1877	using the BOM (Byte Order Mark) and returns a QTextCodec instance
1878	that is capable of decoding the text to unicode. If the codec
1879	cannot be detected, this overload returns a Latin-1 QTextCodec.
1880
1881	\sa codecForHtml()
1882	*/
1883	QTextCodec *QTextCodec::codecForUtfText(const QByteArray &ba)
1884	{
1885	return codecForUtfText(ba, QTextCodec::codecForMib(/Latin 1/ 4));
1886	}
1887
1888
1889	/*! \internal
1890	\since 4.3
1891	Determines whether the decoder encountered a failure while decoding the input. If
1892	an error was encountered, the produced result is undefined, and is converted according
1893	to the conversion flags.
1894	*/
1895	bool QTextDecoder::hasFailure() const
1896	{
1897	return state.invalidChars != 0;
1898	}
1899
1900	/*!
1901	\fn QTextCodec *QTextCodec::codecForContent(const char *str, int size)
1902
1903	This functionality is no longer provided by Qt. This
1904	compatibility function always returns a null pointer.
1905	*/
1906
1907	/*!
1908	\fn QTextCodec *QTextCodec::codecForName(const char *hint, int accuracy)
1909
1910	Use the codecForName(const QByteArray &) overload instead.
1911	*/
1912
1913	/*!
1914	\fn QTextCodec *QTextCodec::codecForIndex(int i)
1915
1916	Use availableCodecs() or availableMibs() instead and iterate
1917	through the resulting list.
1918	*/
1919
1920
1921	/*!
1922	\fn QByteArray QTextCodec::mimeName() const
1923
1924	Use name() instead.
1925	*/
1926
1927	QT_END_NAMESPACE
1928
1929	#endif // QT_NO_TEXTCODEC

Note: See TracBrowser for help on using the repository browser.

Download in other formats: