Context Navigation

source: trunk/src/corelib/codecs/qtextcodec.cpp@ 478

Last change on this file since 478 was 359, checked in by Dmitry A. Kuminov, 16 years ago
corelib/codecs: Fixed a nasty typo that caused LIBC panic (Tried to free block twice) when using QTextStream.
File size: 53.9 KB

Line
1	/****************************************************************************
2	**
3	** Copyright (C) 2009 Nokia Corporation and/or its subsidiary(-ies).
4	** Contact: Qt Software Information (qt-info@nokia.com)
5	**
6	** This file is part of the QtCore module of the Qt Toolkit.
7	**
8	** $QT_BEGIN_LICENSE:LGPL$
9	** Commercial Usage
10	** Licensees holding valid Qt Commercial licenses may use this file in
11	** accordance with the Qt Commercial License Agreement provided with the
12	** Software or, alternatively, in accordance with the terms contained in
13	** a written agreement between you and Nokia.
14	**
15	** GNU Lesser General Public License Usage
16	** Alternatively, this file may be used under the terms of the GNU Lesser
17	** General Public License version 2.1 as published by the Free Software
18	** Foundation and appearing in the file LICENSE.LGPL included in the
19	** packaging of this file. Please review the following information to
20	** ensure the GNU Lesser General Public License version 2.1 requirements
21	** will be met: http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html.
22	**
23	** In addition, as a special exception, Nokia gives you certain
24	** additional rights. These rights are described in the Nokia Qt LGPL
25	** Exception version 1.0, included in the file LGPL_EXCEPTION.txt in this
26	** package.
27	**
28	** GNU General Public License Usage
29	** Alternatively, this file may be used under the terms of the GNU
30	** General Public License version 3.0 as published by the Free Software
31	** Foundation and appearing in the file LICENSE.GPL included in the
32	** packaging of this file. Please review the following information to
33	** ensure the GNU General Public License version 3.0 requirements will be
34	** met: http://www.gnu.org/copyleft/gpl.html.
35	**
36	** If you are unsure which license is appropriate for your use, please
37	** contact the sales department at qt-sales@nokia.com.
38	** $QT_END_LICENSE$
39	**
40	****************************************************************************/
41
42	#include "qplatformdefs.h"
43	#include "qtextcodec.h"
44	#include "qtextcodec_p.h"
45
46	#ifndef QT_NO_TEXTCODEC
47
48	#include "qlist.h"
49	#include "qfile.h"
50	#ifndef QT_NO_LIBRARY
51	# include "qcoreapplication.h"
52	# include "qtextcodecplugin.h"
53	# include "private/qfactoryloader_p.h"
54	#endif
55	#include "qstringlist.h"
56
57	#ifdef Q_OS_UNIX
58	# include "qiconvcodec_p.h"
59	#endif
60
61	#if defined(Q_OS_OS2)
62	# include <unidef.h>
63	# include <uconv.h>
64	# include "qvector.h"
65	#endif
66
67	#include "qutfcodec_p.h"
68	#include "qsimplecodec_p.h"
69	#include "qlatincodec_p.h"
70	#ifndef QT_NO_CODECS
71	# include "qtsciicodec_p.h"
72	# include "qisciicodec_p.h"
73	# if defined(QT_NO_ICONV) && !defined(QT_BOOTSTRAPPED)
74	// no iconv(3) support, must build all codecs into the library
75	# include "../../plugins/codecs/cn/qgb18030codec.h"
76	# include "../../plugins/codecs/jp/qeucjpcodec.h"
77	# include "../../plugins/codecs/jp/qjiscodec.h"
78	# include "../../plugins/codecs/jp/qsjiscodec.h"
79	# include "../../plugins/codecs/kr/qeuckrcodec.h"
80	# include "../../plugins/codecs/tw/qbig5codec.h"
81	# endif // QT_NO_ICONV
82	# if defined(Q_WS_X11) && !defined(QT_BOOTSTRAPPED)
83	# include "qfontlaocodec_p.h"
84	# include "../../plugins/codecs/jp/qfontjpcodec.h"
85	# endif
86	#endif // QT_NO_CODECS
87	#include "qlocale.h"
88	#include "private/qmutexpool_p.h"
89
90	#include <stdlib.h>
91	#include <ctype.h>
92	#include <locale.h>
93	#if defined (_XOPEN_UNIX) && !defined(Q_OS_QNX6) && !defined(Q_OS_OSF)
94	#include <langinfo.h>
95	#endif
96
97	#if defined(Q_OS_WINCE)
98	# define QT_NO_SETLOCALE
99	#endif
100
101	QT_BEGIN_NAMESPACE
102
103	#ifndef QT_NO_TEXTCODECPLUGIN
104	Q_GLOBAL_STATIC_WITH_ARGS(QFactoryLoader, loader,
105	(QTextCodecFactoryInterface_iid, QLatin1String("/codecs")))
106	#endif
107
108
109	static bool nameMatch(const QByteArray &name, const QByteArray &test)
110	{
111	// if they're the same, return a perfect score
112	if (qstricmp(name, test) == 0)
113	return true;
114
115	const char *n = name.constData();
116	const char *h = test.constData();
117
118	// if the letters and numbers are the same, we have a match
119	while (*n != '\0') {
120	if (isalnum((uchar)*n)) {
121	for (;;) {
122	if (*h == '\0')
123	return false;
124	if (isalnum((uchar)*h))
125	break;
126	++h;
127	}
128	if (tolower((uchar)n) != tolower((uchar)h))
129	return false;
130	++h;
131	}
132	++n;
133	}
134	while (h && !isalnum((uchar)h))
135	++h;
136	return (*h == '\0');
137	}
138
139
140	static QTextCodec *createForName(const QByteArray &name)
141	{
142	#ifndef QT_NO_TEXTCODECPLUGIN
143	QFactoryLoader *l = loader();
144	QStringList keys = l->keys();
145	for (int i = 0; i < keys.size(); ++i) {
146	if (nameMatch(name, keys.at(i).toLatin1())) {
147	QString realName = keys.at(i);
148	if (QTextCodecFactoryInterface *factory
149	= qobject_cast<QTextCodecFactoryInterface*>(l->instance(realName))) {
150	return factory->create(realName);
151	}
152	}
153	}
154	#else
155	Q_UNUSED(name);
156	#endif
157	return 0;
158	}
159
160	static QTextCodec *createForMib(int mib)
161	{
162	#ifndef QT_NO_TEXTCODECPLUGIN
163	QString name = QLatin1String("MIB: ") + QString::number(mib);
164	if (QTextCodecFactoryInterface *factory
165	= qobject_cast<QTextCodecFactoryInterface*>(loader()->instance(name)))
166	return factory->create(name);
167	#else
168	Q_UNUSED(mib);
169	#endif
170	return 0;
171	}
172
173	static QList<QTextCodec> all = 0;
174	static bool destroying_is_ok = false;
175
176	static QTextCodec *localeMapper = 0;
177	QTextCodec *QTextCodec::cftr = 0;
178
179
180	class QTextCodecCleanup
181	{
182	public:
183	~QTextCodecCleanup();
184	};
185
186	/*
187	Deletes all the created codecs. This destructor is called just
188	before exiting to delete any QTextCodec objects that may be lying
189	around.
190	*/
191	QTextCodecCleanup::~QTextCodecCleanup()
192	{
193	if (!all)
194	return;
195
196	destroying_is_ok = true;
197
198	while (all->size())
199	delete all->takeFirst();
200	delete all;
201	all = 0;
202	localeMapper = 0;
203
204	destroying_is_ok = false;
205	}
206
207	Q_GLOBAL_STATIC(QTextCodecCleanup, createQTextCodecCleanup)
208
209	#if defined(Q_OS_WIN32) \|\| defined(Q_OS_WINCE)
210	class QWindowsLocalCodec: public QTextCodec
211	{
212	public:
213	QWindowsLocalCodec();
214	~QWindowsLocalCodec();
215
216	QString convertToUnicode(const char , int, ConverterState ) const;
217	QByteArray convertFromUnicode(const QChar , int, ConverterState ) const;
218	QString convertToUnicodeCharByChar(const char chars, int length, ConverterState state) const;
219
220	QByteArray name() const;
221	int mibEnum() const;
222
223	};
224
225	QWindowsLocalCodec::QWindowsLocalCodec()
226	{
227	}
228
229	QWindowsLocalCodec::~QWindowsLocalCodec()
230	{
231	}
232
233	QString QWindowsLocalCodec::convertToUnicode(const char chars, int length, ConverterState state) const
234	{
235	const char *mb = chars;
236	int mblen = length;
237
238	if (!mb \|\| !mblen)
239	return QString();
240
241	const int wclen_auto = 4096;
242	WCHAR wc_auto[wclen_auto];
243	int wclen = wclen_auto;
244	WCHAR *wc = wc_auto;
245	int len;
246	QString sp;
247	bool prepend = false;
248	char state_data = 0;
249	int remainingChars = 0;
250
251	//save the current state information
252	if (state) {
253	state_data = (char)state->state_data[0];
254	remainingChars = state->remainingChars;
255	}
256
257	//convert the pending charcter (if available)
258	if (state && remainingChars) {
259	char prev[3] = {0};
260	prev[0] = state_data;
261	prev[1] = mb[0];
262	remainingChars = 0;
263	len = MultiByteToWideChar(CP_ACP, MB_PRECOMPOSED,
264	prev, 2, wc, wclen);
265	if (len) {
266	prepend = true;
267	sp.append(QChar(wc[0]));
268	mb++;
269	mblen--;
270	wc[0] = 0;
271	}
272	}
273
274	while (!(len=MultiByteToWideChar(CP_ACP, MB_PRECOMPOSED\|MB_ERR_INVALID_CHARS,
275	mb, mblen, wc, wclen))) {
276	int r = GetLastError();
277	if (r == ERROR_INSUFFICIENT_BUFFER) {
278	if (wc != wc_auto) {
279	qWarning("MultiByteToWideChar: Size changed");
280	break;
281	} else {
282	wclen = MultiByteToWideChar(CP_ACP, MB_PRECOMPOSED,
283	mb, mblen, 0, 0);
284	wc = new WCHAR[wclen];
285	// and try again...
286	}
287	} else if (r == ERROR_NO_UNICODE_TRANSLATION) {
288	//find the last non NULL character
289	while (mblen > 1 && !(mb[mblen-1]))
290	mblen--;
291	//check whether, we hit an invalid character in the middle
292	if ((mblen <= 1) \|\| (remainingChars && state_data))
293	return convertToUnicodeCharByChar(chars, length, state);
294	//Remove the last character and try again...
295	state_data = mb[mblen-1];
296	remainingChars = 1;
297	mblen--;
298	} else {
299	// Fail.
300	qWarning("MultiByteToWideChar: Cannot convert multibyte text");
301	break;
302	}
303	}
304	if (len <= 0)
305	return QString();
306	if (wc[len-1] == 0) // len - 1: we don't want terminator
307	--len;
308
309	//save the new state information
310	if (state) {
311	state->state_data[0] = (char)state_data;
312	state->remainingChars = remainingChars;
313	}
314	QString s((QChar*)wc, len);
315	if (wc != wc_auto)
316	delete [] wc;
317	if (prepend) {
318	return sp+s;
319	}
320	return s;
321	}
322
323	QString QWindowsLocalCodec::convertToUnicodeCharByChar(const char chars, int length, ConverterState state) const
324	{
325	if (!chars \|\| !length)
326	return QString();
327
328	int copyLocation = 0;
329	int extra = 2;
330	if (state && state->remainingChars) {
331	copyLocation = state->remainingChars;
332	extra += copyLocation;
333	}
334	int newLength = length + extra;
335	char *mbcs = new char[newLength];
336	//ensure that we have a NULL terminated string
337	mbcs[newLength-1] = 0;
338	mbcs[newLength-2] = 0;
339	memcpy(&(mbcs[copyLocation]), chars, length);
340	if (copyLocation) {
341	//copy the last character from the state
342	mbcs[0] = (char)state->state_data[0];
343	state->remainingChars = 0;
344	}
345	const char *mb = mbcs;
346	#ifndef Q_OS_WINCE
347	const char *next = 0;
348	QString s;
349	while((next = CharNextExA(CP_ACP, mb, 0)) != mb) {
350	WCHAR wc[2] ={0};
351	int charlength = next - mb;
352	int len = MultiByteToWideChar(CP_ACP, MB_PRECOMPOSED\|MB_ERR_INVALID_CHARS, mb, charlength, wc, 2);
353	if (len>0) {
354	s.append(QChar(wc[0]));
355	} else {
356	int r = GetLastError();
357	//check if the character being dropped is the last character
358	if (r == ERROR_NO_UNICODE_TRANSLATION && mb == (mbcs+newLength -3) && state) {
359	state->remainingChars = 1;
360	state->state_data[0] = (char)*mb;
361	}
362	}
363	mb = next;
364	}
365	#else
366	QString s;
367	int size = mbstowcs(NULL, mb, length);
368	if (size < 0) {
369	Q_ASSERT("Error in CE TextCodec");
370	return QString();
371	}
372	wchar_t* ws = new wchar_t[size + 2];
373	ws[size +1] = 0;
374	ws[size] = 0;
375	size = mbstowcs(ws, mb, length);
376	for (int i=0; i< size; i++)
377	s.append(QChar(ws[i]));
378	delete [] ws;
379	#endif
380	delete mbcs;
381	return s;
382	}
383
384	QByteArray QWindowsLocalCodec::convertFromUnicode(const QChar uc, int len, ConverterState ) const
385	{
386	return qt_winQString2MB(uc, len);
387	}
388
389
390	QByteArray QWindowsLocalCodec::name() const
391	{
392	return "System";
393	}
394
395	int QWindowsLocalCodec::mibEnum() const
396	{
397	return 0;
398	}
399
400	#elif defined(Q_OS_OS2)
401
402	class QOs2LocalCodec: public QTextCodec
403	{
404	public:
405	QOs2LocalCodec();
406	~QOs2LocalCodec();
407
408	QString convertToUnicode(const char , int, ConverterState ) const;
409	QByteArray convertFromUnicode(const QChar , int, ConverterState ) const;
410
411	QByteArray name() const;
412	int mibEnum() const;
413
414	private:
415	UconvObject uoSubYes;
416	UconvObject uoSubNo;
417	};
418
419	QOs2LocalCodec::QOs2LocalCodec() : uoSubYes(0), uoSubNo(0)
420	{
421	// create the conversion object for the process code page that performs
422	// substitution of invalid characters with '?'
423	UniCreateUconvObject((UniChar *)L"@sub=yes,subchar=\\x3F,subuni=\\x003F",
424	&uoSubYes);
425	Q_ASSERT(uoSubYes);
426
427	// same as above but doesn't perform substitution
428	UniCreateUconvObject((UniChar *)L"@sub=no", &uoSubNo);
429	Q_ASSERT(uoSubNo);
430	}
431
432	QOs2LocalCodec::~QOs2LocalCodec()
433	{
434	UniFreeUconvObject(uoSubNo);
435	UniFreeUconvObject(uoSubYes);
436	}
437
438	static void qOs2LocalCodecStateFree(QTextCodec::ConverterState *state)
439	{
440	delete reinterpret_cast<char *>(state->d);
441	}
442
443	QString QOs2LocalCodec::convertToUnicode(const char *chars, int length,
444	ConverterState *state) const
445	{
446	QString res;
447
448	if (!chars)
449	return res;
450	if (!length)
451	return QLatin1String("");
452
453	UconvObject uo = uoSubYes;
454	if (state && (state->flags & ConvertInvalidToNull))
455	uo = uoSubNo;
456
457	int remainingChars = 0;
458	char *remainingBuffer = 0;
459
460	if (state) {
461	// stateful conversion
462	remainingBuffer = reinterpret_cast<char *>(state->d);
463	if (remainingBuffer) {
464	// restore state
465	remainingChars = state->remainingChars;
466	} else {
467	// first time, add the destructor for state->d
468	state->flags \|= FreeFunction;
469	QTextCodecUnalignedPointer::encode(state->state_data,
470	qOs2LocalCodecStateFree);
471	}
472	}
473
474	const char *mbPtr = chars;
475	size_t mbLeft = length;
476
477	QByteArray mbExtra;
478	if (remainingChars) {
479	// we have to prepend the remaining bytes from the previous conversion
480	mbLeft += remainingChars;
481	mbExtra.resize(mbLeft);
482	mbPtr = mbExtra.data();
483
484	memcpy(mbExtra.data(), remainingBuffer, remainingChars);
485	memcpy(mbExtra.data() + remainingChars, chars, length);
486
487	remainingBuffer = 0;
488	remainingChars = 0;
489	}
490
491	size_t ucLen = mbLeft;
492	QString ucBuf(ucLen, QLatin1Char('\0'));
493	UniChar ucPtr = reinterpret_cast<UniChar >(ucBuf.data());
494	size_t ucLeft = ucLen;
495
496	size_t nonIdent = 0;
497	int rc;
498
499	while (mbLeft) {
500	rc = UniUconvToUcs(uo, (void**)&mbPtr, &mbLeft, &ucPtr, &ucLeft,
501	&nonIdent);
502	if (rc == ULS_BUFFERFULL) {
503	size_t ucDone = ucLen - ucLeft;
504	size_t mbDone = length - mbLeft;
505	// assume that mbLeft/ucLeft is an approximation of mbDone/ucDone
506	ucLen = ucDone + (mbLeft * ucDone) / mbDone;
507	ucBuf.resize(ucLen);
508	ucPtr = reinterpret_cast<UniChar *>(ucBuf.data() + ucDone);
509	} else if (rc == ULS_ILLEGALSEQUENCE && state) {
510	// conversion stopped because the remaining inBytesLeft make up
511	// an incomplete multi-byte sequence; save them for later
512	remainingBuffer = new char[mbLeft];
513	memcpy(remainingBuffer, mbPtr, mbLeft);
514	remainingChars = mbLeft;
515	break;
516	} else if (rc != ULS_SUCCESS) {
517	// just fail on an unexpected error (will return what we've got)
518	qWarning("QOs2LocalCodec::convertToUnicode: UniUconvToUcs failed "
519	"with %d", rc);
520	break;
521	}
522	}
523
524	ucBuf.resize(ucLen - ucLeft);
525	res = ucBuf;
526
527	if (state) {
528	// update the state
529	state->invalidChars = nonIdent;
530	state->remainingChars = remainingChars;
531	state->d = remainingBuffer;
532	}
533
534	return res;
535	}
536
537	QByteArray QOs2LocalCodec::convertFromUnicode(const QChar *uchars, int length,
538	ConverterState *state) const
539	{
540	QByteArray res;
541
542	if (!uchars)
543	return res;
544	if (!length)
545	return QByteArray("");
546
547	UconvObject uo = uoSubYes;
548	if (state && (state->flags & ConvertInvalidToNull))
549	uo = uoSubNo;
550
551	const UniChar ucPtr = reinterpret_cast<const UniChar >(uchars);
552	size_t ucLeft = length;
553
554	QVector<QChar> ucExtra;
555	if (state && state->remainingChars) {
556	// we have one surrogate char to be prepended
557	Q_ASSERT(state->remainingChars == 1);
558	ucLeft += 1;
559	ucExtra.resize(ucLeft);
560	ucPtr = reinterpret_cast<const UniChar *>(ucExtra.data());
561
562	ucExtra[0] = state->state_data[0];
563	memcpy(ucExtra.data() + 1, uchars, length * sizeof(QChar));
564
565	state->remainingChars = 0;
566	}
567
568	// be optimistic (imply that one byte is necessary per every Unicode char)
569	size_t mbLen = length;
570	QByteArray mbBuf(mbLen, '\0');
571	char *mbPtr = mbBuf.data();
572	size_t mbLeft = mbLen;
573
574	size_t nonIdent = 0;
575	int rc;
576
577	while (ucLeft) {
578	rc = UniUconvFromUcs(uo, const_cast<UniChar **>(&ucPtr), &ucLeft,
579	(void**)&mbPtr, &mbLeft, &nonIdent);
580	if (rc == ULS_BUFFERFULL) {
581	size_t mbDone = mbLen - mbLeft;
582	size_t ucDone = length - ucLeft;
583	size_t newLen = mbLen;
584	if (ucDone) {
585	// assume that ucLeft/mbLeft is an approximation of ucDone/mbDone
586	newLen = mbDone + (ucLeft * mbDone) / ucDone;
587	}
588	if (newLen == mbLen) {
589	// could not process a single Unicode char, double the size
590	mbLen *= 2;
591	} else {
592	mbLen = newLen;
593	}
594	mbBuf.resize(mbLen);
595	mbPtr = mbBuf.data() + mbDone;
596	mbLeft = mbLen - mbDone;
597	} else if (rc == ULS_ILLEGALSEQUENCE && state) {
598	// buffer ends in a surrogate
599	Q_ASSERT(ucLeft == 2);
600	state->state_data[0] = *ucPtr;
601	state->remainingChars = 1;
602	break;
603	} else if (rc != ULS_SUCCESS) {
604	// just fail on an unexpected error (will return what we've got)
605	qWarning("QOs2LocalCodec::convertFromUnicode: UniUconvFromUcs failed "
606	"with %d", rc);
607	break;
608	}
609	}
610
611	mbBuf.resize(mbLen - mbLeft);
612	res = mbBuf;
613
614	if (state) {
615	// update the state
616	state->invalidChars = nonIdent;
617	}
618
619	return res;
620	}
621
622	QByteArray QOs2LocalCodec::name() const
623	{
624	return "System";
625	}
626
627	int QOs2LocalCodec::mibEnum() const
628	{
629	return 0;
630	}
631
632	#else
633
634	/* locale names mostly copied from XFree86 */
635	static const char * const iso8859_2locales[] = {
636	"croatian", "cs", "cs_CS", "cs_CZ","cz", "cz_CZ", "czech", "hr",
637	"hr_HR", "hu", "hu_HU", "hungarian", "pl", "pl_PL", "polish", "ro",
638	"ro_RO", "rumanian", "serbocroatian", "sh", "sh_SP", "sh_YU", "sk",
639	"sk_SK", "sl", "sl_CS", "sl_SI", "slovak", "slovene", "sr_SP", 0 };
640
641	static const char * const iso8859_3locales[] = {
642	"eo", 0 };
643
644	static const char * const iso8859_4locales[] = {
645	"ee", "ee_EE", 0 };
646
647	static const char * const iso8859_5locales[] = {
648	"mk", "mk_MK", "sp", "sp_YU", 0 };
649
650	static const char * const cp_1251locales[] = {
651	"be", "be_BY", "bg", "bg_BG", "bulgarian", 0 };
652
653	static const char * const pt_154locales[] = {
654	"ba_RU", "ky", "ky_KG", "kk", "kk_KZ", 0 };
655
656	static const char * const iso8859_6locales[] = {
657	"ar_AA", "ar_SA", "arabic", 0 };
658
659	static const char * const iso8859_7locales[] = {
660	"el", "el_GR", "greek", 0 };
661
662	static const char * const iso8859_8locales[] = {
663	"hebrew", "he", "he_IL", "iw", "iw_IL", 0 };
664
665	static const char * const iso8859_9locales[] = {
666	"tr", "tr_TR", "turkish", 0 };
667
668	static const char * const iso8859_13locales[] = {
669	"lt", "lt_LT", "lv", "lv_LV", 0 };
670
671	static const char * const iso8859_15locales[] = {
672	"et", "et_EE",
673	// Euro countries
674	"br_FR", "ca_ES", "de", "de_AT", "de_BE", "de_DE", "de_LU", "en_IE",
675	"es", "es_ES", "eu_ES", "fi", "fi_FI", "finnish", "fr", "fr_FR",
676	"fr_BE", "fr_LU", "french", "ga_IE", "gl_ES", "it", "it_IT", "oc_FR",
677	"nl", "nl_BE", "nl_NL", "pt", "pt_PT", "sv_FI", "wa_BE",
678	0 };
679
680	static const char * const koi8_ulocales[] = {
681	"uk", "uk_UA", "ru_UA", "ukrainian", 0 };
682
683	static const char * const tis_620locales[] = {
684	"th", "th_TH", "thai", 0 };
685
686	// static const char * const tcvnlocales[] = {
687	// "vi", "vi_VN", 0 };
688
/*
    Searches the 0-terminated array \a locale for the string \a lang.

    The scan stops at the first entry that equals \a lang, at the first
    empty string, or at the terminating 0; the result is true unless the
    scan ran into the terminating 0.
*/
static bool try_locale_list(const char * const locale[], const char * lang)
{
    const char * const *entry = locale;
    while (*entry != 0 && **entry != '\0' && strcmp(*entry, lang) != 0)
        ++entry;
    return *entry != 0;
}
696
697	// For the probably_koi8_rlocales we have to look. The standard says
698	// these are 8859-5, but almost all Russian users use KOI8-R and
699	// incorrectly set $LANG to ru_RU. We'll check tolower() to see what
700	// it thinks ru_RU means.
701
702	// If you read the history, it seems that many Russians blame ISO and
703	// Perestroika for the confusion.
704	//
705	// The real bug is that some programs break if the user specifies
706	// ru_RU.KOI8-R.
707
708	static const char * const probably_koi8_rlocales[] = {
709	"ru", "ru_SU", "ru_RU", "russian", 0 };
710
711	static QTextCodec * ru_RU_hack(const char * i) {
712	#if defined(Q_OS_OS2)
713	// @todo temporary hack. the proper one is to use the current process'
714	// code page if LANG or its codepage part is missing
715	return QTextCodec::codecForName("cp866");
716	#else
717	QTextCodec * ru_RU_codec = 0;
718
719	#if !defined(QT_NO_SETLOCALE)
720	QByteArray origlocale(setlocale(LC_CTYPE, i));
721	#else
722	QByteArray origlocale(i);
723	#endif
724	// unicode koi8r latin5 name
725	// 0x044E 0xC0 0xEE CYRILLIC SMALL LETTER YU
726	// 0x042E 0xE0 0xCE CYRILLIC CAPITAL LETTER YU
727	int latin5 = tolower(0xCE);
728	int koi8r = tolower(0xE0);
729	if (koi8r == 0xC0 && latin5 != 0xEE) {
730	ru_RU_codec = QTextCodec::codecForName("KOI8-R");
731	} else if (koi8r != 0xC0 && latin5 == 0xEE) {
732	ru_RU_codec = QTextCodec::codecForName("ISO 8859-5");
733	} else {
734	// something else again... let's assume... throws dice
735	ru_RU_codec = QTextCodec::codecForName("KOI8-R");
736	qWarning("QTextCodec: Using KOI8-R, probe failed (%02x %02x %s)",
737	koi8r, latin5, i);
738	}
739	#if !defined(QT_NO_SETLOCALE)
740	setlocale(LC_CTYPE, origlocale);
741	#endif
742
743	return ru_RU_codec;
744	#endif // defined(Q_OS_OS2)
745	}
746
747	#endif
748
749	#if !defined(Q_OS_WIN32) && !defined(Q_OS_WINCE) && !defined(Q_OS_OS2)
750	static QTextCodec checkForCodec(const char name) {
751	QTextCodec *c = QTextCodec::codecForName(name);
752	if (!c) {
753	const char *at = strchr(name, '@');
754	if (at) {
755	QByteArray n(name, at - name);
756	c = QTextCodec::codecForName(n.data());
757	}
758	}
759	return c;
760	}
761	#endif
762
763	/* the next two functions are implicitely thread safe,
764	as they are only called by setup() which uses a mutex.
765	*/
766	static void setupLocaleMapper()
767	{
768	#if defined(Q_OS_WIN32) \|\| defined(Q_OS_WINCE)
769	localeMapper = QTextCodec::codecForName("System");
770	#elif defined(Q_OS_OS2)
771	localeMapper = QTextCodec::codecForName("System");
772	#else
773
774	#ifndef QT_NO_ICONV
775	localeMapper = QTextCodec::codecForName("System");
776	#endif
777
778	#if defined (_XOPEN_UNIX) && !defined(Q_OS_QNX6) && !defined(Q_OS_OSF)
779	if (!localeMapper) {
780	char *charset = nl_langinfo (CODESET);
781	if (charset)
782	localeMapper = QTextCodec::codecForName(charset);
783	}
784	#endif
785
786	if (!localeMapper) {
787	// Very poorly defined and followed standards causes lots of
788	// code to try to get all the cases... This logic is
789	// duplicated in QIconvCodec, so if you change it here, change
790	// it there too.
791
792	// Try to determine locale codeset from locale name assigned to
793	// LC_CTYPE category.
794
795	// First part is getting that locale name. First try setlocale() which
796	// definitely knows it, but since we cannot fully trust it, get ready
797	// to fall back to environment variables.
798	#if !defined(QT_NO_SETLOCALE)
799	char * ctype = qstrdup(setlocale(LC_CTYPE, 0));
800	#else
801	char * ctype = qstrdup("");
802	#endif
803
804	// Get the first nonempty value from $LC_ALL, $LC_CTYPE, and $LANG
805	// environment variables.
806	char * lang = qstrdup(qgetenv("LC_ALL").constData());
807	if (!lang \|\| lang[0] == 0 \|\| strcmp(lang, "C") == 0) {
808	if (lang) delete [] lang;
809	lang = qstrdup(qgetenv("LC_CTYPE").constData());
810	}
811	if (!lang \|\| lang[0] == 0 \|\| strcmp(lang, "C") == 0) {
812	if (lang) delete [] lang;
813	lang = qstrdup(qgetenv("LANG").constData());
814	}
815
816	// Now try these in order:
817	// 1. CODESET from ctype if it contains a .CODESET part (e.g. en_US.ISO8859-15)
818	// 2. CODESET from lang if it contains a .CODESET part
819	// 3. ctype (maybe the locale is named "ISO-8859-1" or something)
820	// 4. locale (ditto)
821	// 5. check for "@euro"
822	// 6. guess locale from ctype unless ctype is "C"
823	// 7. guess locale from lang
824
825	// 1. CODESET from ctype if it contains a .CODESET part (e.g. en_US.ISO8859-15)
826	char * codeset = ctype ? strchr(ctype, '.') : 0;
827	if (codeset && *codeset == '.')
828	localeMapper = checkForCodec(codeset + 1);
829
830	// 2. CODESET from lang if it contains a .CODESET part
831	codeset = lang ? strchr(lang, '.') : 0;
832	if (!localeMapper && codeset && *codeset == '.')
833	localeMapper = checkForCodec(codeset + 1);
834
835	// 3. ctype (maybe the locale is named "ISO-8859-1" or something)
836	if (!localeMapper && ctype && *ctype != 0 && strcmp (ctype, "C") != 0)
837	localeMapper = checkForCodec(ctype);
838
839	// 4. locale (ditto)
840	if (!localeMapper && lang && *lang != 0)
841	localeMapper = checkForCodec(lang);
842
843	// 5. "@euro"
844	if ((!localeMapper && ctype && strstr(ctype, "@euro")) \|\| (lang && strstr(lang, "@euro")))
845	localeMapper = checkForCodec("ISO 8859-15");
846
847	// 6. guess locale from ctype unless ctype is "C"
848	// 7. guess locale from lang
849	char * try_by_name = ctype;
850	if (ctype && *ctype != 0 && strcmp (ctype, "C") != 0)
851	try_by_name = lang;
852
853	// Now do the guessing.
854	if (lang && lang && !localeMapper && try_by_name && try_by_name) {
855	if (try_locale_list(iso8859_15locales, lang))
856	localeMapper = QTextCodec::codecForName("ISO 8859-15");
857	else if (try_locale_list(iso8859_2locales, lang))
858	localeMapper = QTextCodec::codecForName("ISO 8859-2");
859	else if (try_locale_list(iso8859_3locales, lang))
860	localeMapper = QTextCodec::codecForName("ISO 8859-3");
861	else if (try_locale_list(iso8859_4locales, lang))
862	localeMapper = QTextCodec::codecForName("ISO 8859-4");
863	else if (try_locale_list(iso8859_5locales, lang))
864	localeMapper = QTextCodec::codecForName("ISO 8859-5");
865	else if (try_locale_list(iso8859_6locales, lang))
866	localeMapper = QTextCodec::codecForName("ISO 8859-6");
867	else if (try_locale_list(iso8859_7locales, lang))
868	localeMapper = QTextCodec::codecForName("ISO 8859-7");
869	else if (try_locale_list(iso8859_8locales, lang))
870	localeMapper = QTextCodec::codecForName("ISO 8859-8-I");
871	else if (try_locale_list(iso8859_9locales, lang))
872	localeMapper = QTextCodec::codecForName("ISO 8859-9");
873	else if (try_locale_list(iso8859_13locales, lang))
874	localeMapper = QTextCodec::codecForName("ISO 8859-13");
875	else if (try_locale_list(tis_620locales, lang))
876	localeMapper = QTextCodec::codecForName("ISO 8859-11");
877	else if (try_locale_list(koi8_ulocales, lang))
878	localeMapper = QTextCodec::codecForName("KOI8-U");
879	else if (try_locale_list(cp_1251locales, lang))
880	localeMapper = QTextCodec::codecForName("CP 1251");
881	else if (try_locale_list(pt_154locales, lang))
882	localeMapper = QTextCodec::codecForName("PT 154");
883	else if (try_locale_list(probably_koi8_rlocales, lang))
884	localeMapper = ru_RU_hack(lang);
885	}
886
887	delete [] ctype;
888	delete [] lang;
889	}
890
891	// If everything failed, we default to 8859-1
892	// We could perhaps default to 8859-15.
893	if (!localeMapper)
894	localeMapper = QTextCodec::codecForName("ISO 8859-1");
895	#endif
896	}
897
898
899	static void setup()
900	{
901	#ifndef QT_NO_THREAD
902	QMutexLocker locker(QMutexPool::globalInstanceGet(&all));
903	#endif
904
905	if (all)
906	return;
907
908	if (destroying_is_ok)
909	qWarning("QTextCodec: Creating new codec during codec cleanup");
910	all = new QList<QTextCodec*>;
911	// create the cleanup object to cleanup all codecs on exit
912	(void) createQTextCodecCleanup();
913
914	#ifndef QT_NO_CODECS
915	# if defined(Q_WS_X11) && !defined(QT_BOOTSTRAPPED)
916	// no font codecs when bootstrapping
917	(void)new QFontLaoCodec;
918	# if defined(QT_NO_ICONV)
919	// no iconv(3) support, must build all codecs into the library
920	(void)new QFontGb2312Codec;
921	(void)new QFontGbkCodec;
922	(void)new QFontGb18030_0Codec;
923	(void)new QFontJis0208Codec;
924	(void)new QFontJis0201Codec;
925	(void)new QFontKsc5601Codec;
926	(void)new QFontBig5hkscsCodec;
927	(void)new QFontBig5Codec;
928	# endif // QT_NO_ICONV && !QT_BOOTSTRAPPED
929	# endif // Q_WS_X11
930
931	(void)new QTsciiCodec;
932
933	for (int i = 0; i < 9; ++i)
934	(void)new QIsciiCodec(i);
935
936
937	# if defined(QT_NO_ICONV) && !defined(QT_BOOTSTRAPPED)
938	// no asian codecs when bootstrapping, sorry
939	(void)new QGb18030Codec;
940	(void)new QGbkCodec;
941	(void)new QGb2312Codec;
942	(void)new QEucJpCodec;
943	(void)new QJisCodec;
944	(void)new QSjisCodec;
945	(void)new QEucKrCodec;
946	(void)new QBig5Codec;
947	(void)new QBig5hkscsCodec;
948	# endif // QT_NO_ICONV && !QT_BOOTSTRAPPED
949	#endif // QT_NO_CODECS
950
951	#if defined(Q_OS_WIN32) \|\| defined(Q_OS_WINCE)
952	(void) new QWindowsLocalCodec;
953	#endif // Q_OS_WIN32
954
955	#if defined(Q_OS_OS2)
956	(void) new QOs2LocalCodec;
957	#endif // Q_OS_OS2
958
959	(void)new QUtf16Codec;
960	(void)new QUtf16BECodec;
961	(void)new QUtf16LECodec;
962	(void)new QUtf32Codec;
963	(void)new QUtf32BECodec;
964	(void)new QUtf32LECodec;
965	(void)new QLatin15Codec;
966	(void)new QLatin1Codec;
967	(void)new QUtf8Codec;
968
969	for (int i = 0; i < QSimpleTextCodec::numSimpleCodecs; ++i)
970	(void)new QSimpleTextCodec(i);
971
972	#if defined(Q_OS_UNIX) && !defined(QT_NO_ICONV) && !defined(QT_BOOTSTRAPPED)
973	// QIconvCodec depends on the UTF-16 codec, so it needs to be created last
974	(void) new QIconvCodec();
975	#endif
976
977	if (!localeMapper)
978	setupLocaleMapper();
979	}
980
981	QTextCodec::ConverterState::~ConverterState()
982	{
983	if (flags & FreeFunction)
984	(QTextCodecUnalignedPointer::decode(state_data))(this);
985	else if (d)
986	qFree(d);
987	}
988
989	static bool codecForLocaleSet = false;
990	void qt_resetCodecForLocale()
991	{
992	// if QTextCodec::codecForLocale() was called, we assume that the user has
993	// explicitly set the codec he wants for the locale and don't attempt to
994	// autodetect it again
995	if (!codecForLocaleSet)
996	setupLocaleMapper();
997	}
998
999	/*!
1000	\class QTextCodec
1001	\brief The QTextCodec class provides conversions between text encodings.
1002	\reentrant
1003	\ingroup i18n
1004
1005	Qt uses Unicode to store, draw and manipulate strings. In many
1006	situations you may wish to deal with data that uses a different
1007	encoding. For example, most Japanese documents are still stored
1008	in Shift-JIS or ISO 2022-JP, while Russian users often have their
1009	documents in KOI8-R or Windows-1251.
1010
1011	Qt provides a set of QTextCodec classes to help with converting
1012	non-Unicode formats to and from Unicode. You can also create your
1013	own codec classes.
1014
1015	The supported encodings are:
1016
1017	\list
1018	\o Apple Roman
1019	\o \l{Big5 Text Codec}{Big5}
1020	\o \l{Big5-HKSCS Text Codec}{Big5-HKSCS}
1021	\o CP949
1022	\o \l{EUC-JP Text Codec}{EUC-JP}
1023	\o \l{EUC-KR Text Codec}{EUC-KR}
1024	\o \l{GBK Text Codec}{GB18030-0}
1025	\o IBM 850
1026	\o IBM 866
1027	\o IBM 874
1028	\o \l{ISO 2022-JP (JIS) Text Codec}{ISO 2022-JP}
1029	\o ISO 8859-1 to 10
1030	\o ISO 8859-13 to 16
1031	\o Iscii-Bng, Dev, Gjr, Knd, Mlm, Ori, Pnj, Tlg, and Tml
1032	\o JIS X 0201
1033	\o JIS X 0208
1034	\o KOI8-R
1035	\o KOI8-U
1036	\o MuleLao-1
1037	\o ROMAN8
1038	\o \l{Shift-JIS Text Codec}{Shift-JIS}
1039	\o TIS-620
1040	\o \l{TSCII Text Codec}{TSCII}
1041	\o UTF-8
1042	\o UTF-16
1043	\o UTF-16BE
1044	\o UTF-16LE
1045	\o UTF-32
1046	\o UTF-32BE
1047	\o UTF-32LE
1048	\o Windows-1250 to 1258
1049	\o WINSAMI2
1050	\endlist
1051
1052	QTextCodecs can be used as follows to convert some locally encoded
1053	string to Unicode. Suppose you have some string encoded in Russian
1054	KOI8-R encoding, and want to convert it to Unicode. The simple way
1055	to do it is like this:
1056
1057	\snippet doc/src/snippets/code/src_corelib_codecs_qtextcodec.cpp 0
1058
1059	After this, \c string holds the text converted to Unicode.
1060	Converting a string from Unicode to the local encoding is just as
1061	easy:
1062
1063	\snippet doc/src/snippets/code/src_corelib_codecs_qtextcodec.cpp 1
1064
1065	To read or write files in various encodings, use QTextStream and
1066	its \l{QTextStream::setCodec()}{setCodec()} function. See the
1067	\l{tools/codecs}{Codecs} example for an application of QTextCodec
1068	to file I/O.
1069
1070	Some care must be taken when trying to convert the data in chunks,
1071	for example, when receiving it over a network. In such cases it is
1072	possible that a multi-byte character will be split over two
1073	chunks. At best this might result in the loss of a character and
1074	at worst cause the entire conversion to fail.
1075
1076	The approach to use in these situations is to create a QTextDecoder
1077	object for the codec and use this QTextDecoder for the whole
1078	decoding process, as shown below:
1079
1080	\snippet doc/src/snippets/code/src_corelib_codecs_qtextcodec.cpp 2
1081
1082	The QTextDecoder object maintains state between chunks and therefore
1083	works correctly even if a multi-byte character is split between
1084	chunks.
1085
1086	\section1 Creating Your Own Codec Class
1087
1088	Support for new text encodings can be added to Qt by creating
1089	QTextCodec subclasses.
1090
1091	The pure virtual functions describe the encoder to the system and
1092	the coder is used as required in the different text file formats
1093	supported by QTextStream, and under X11, for the locale-specific
1094	character input and output.
1095
1096	To add support for another encoding to Qt, make a subclass of
1097	QTextCodec and implement the functions listed in the table below.
1098
1099	\table
1100	\header \o Function \o Description
1101
1102	\row \o name()
1103	\o Returns the official name for the encoding. If the
1104	encoding is listed in the
1105	\l{IANA character-sets encoding file}, the name
1106	should be the preferred MIME name for the encoding.
1107
1108	\row \o aliases()
1109	\o Returns a list of alternative names for the encoding.
1110	QTextCodec provides a default implementation that returns
1111	an empty list. For example, "ISO-8859-1" has "latin1",
1112	"CP819", "IBM819", and "iso-ir-100" as aliases.
1113
1114	\row \o mibEnum()
1115	\o Returns the MIB enum for the encoding if it is listed in
1116	the \l{IANA character-sets encoding file}.
1117
1118	\row \o convertToUnicode()
1119	\o Converts an 8-bit character string to Unicode.
1120
1121	\row \o convertFromUnicode()
1122	\o Converts a Unicode string to an 8-bit character string.
1123	\endtable
1124
1125	You may find it more convenient to make your codec class
1126	available as a plugin; see \l{How to Create Qt Plugins} for
1127	details.
1128
1129	\sa QTextStream, QTextDecoder, QTextEncoder, {Codecs Example}
1130	*/
1131
1132	/*!
1133	\enum QTextCodec::ConversionFlag
1134
1135	\value DefaultConversion No flag is set.
1136	\value ConvertInvalidToNull If this flag is set, each invalid input
1137	character is output as a null character.
1138	\value IgnoreHeader Ignore any Unicode byte-order mark and don't generate any.
1139
1140	\omitvalue FreeFunction
1141	*/
1142
1143	/*!
1144	\fn QTextCodec::ConverterState::ConverterState(ConversionFlags flags)
1145
1146	Constructs a ConverterState object initialized with the given \a flags.
1147	*/
1148
1149	/*!
1150	\fn QTextCodec::ConverterState::~ConverterState()
1151
1152	Destroys the ConverterState object.
1153	*/
1154
1155	/*!
1156	\nonreentrant
1157
1158	Constructs a QTextCodec, and gives it the highest precedence. The
1159	QTextCodec should always be constructed on the heap (i.e. with \c
1160	new). Qt takes ownership and will delete it when the application
1161	terminates.
1162	*/
1163	QTextCodec::QTextCodec()
1164	{
1165	setup();
1166	all->prepend(this);
1167	}
1168
1169
1170	/*!
1171	\nonreentrant
1172
1173	Destroys the QTextCodec. Note that you should not delete codecs
1174	yourself: once created they become Qt's responsibility.
1175	*/
1176	QTextCodec::~QTextCodec()
1177	{
1178	if (!destroying_is_ok)
1179	qWarning("QTextCodec::~QTextCodec: Called by application");
1180	if (all)
1181	all->removeAll(this);
1182	}
1183
1184	/*!
1185	\fn QTextCodec *QTextCodec::codecForName(const char *name)
1186
1187	Searches all installed QTextCodec objects and returns the one
1188	which best matches \a name; the match is case-insensitive. Returns
1189	0 if no codec matching the name \a name could be found.
1190	*/
1191
1192	/*!
1193	Searches all installed QTextCodec objects and returns the one
1194	which best matches \a name; the match is case-insensitive. Returns
1195	0 if no codec matching the name \a name could be found.
1196	*/
1197	QTextCodec *QTextCodec::codecForName(const QByteArray &name)
1198	{
1199	if (name.isEmpty())
1200	return 0;
1201
1202	setup();
1203
1204	for (int i = 0; i < all->size(); ++i) {
1205	QTextCodec *cursor = all->at(i);
1206	if (nameMatch(cursor->name(), name))
1207	return cursor;
1208	QList<QByteArray> aliases = cursor->aliases();
1209	for (int i = 0; i < aliases.size(); ++i)
1210	if (nameMatch(aliases.at(i), name))
1211	return cursor;
1212	}
1213
1214	return createForName(name);
1215	}
1216
1217
1218	/*!
1219	Returns the QTextCodec which matches the \link
1220	QTextCodec::mibEnum() MIBenum\endlink \a mib.
1221	*/
1222	QTextCodec* QTextCodec::codecForMib(int mib)
1223	{
1224	setup();
1225
1226	// Qt 3 used 1000 (mib for UCS2) as its identifier for the utf16 codec. Map
1227	// this correctly for compatibility.
1228	if (mib == 1000)
1229	mib = 1015;
1230
1231	QList<QTextCodec*>::ConstIterator i;
1232	for (int i = 0; i < all->size(); ++i) {
1233	QTextCodec *cursor = all->at(i);
1234	if (cursor->mibEnum() == mib)
1235	return cursor;
1236	}
1237
1238	return createForMib(mib);
1239	}
1240
1241	/*!
1242	Returns the list of all available codecs, by name. Call
1243	QTextCodec::codecForName() to obtain the QTextCodec for the name.
1244
1245	The list may contain many mentions of the same codec
1246	if the codec has aliases.
1247
1248	\sa availableMibs(), name(), aliases()
1249	*/
1250	QList<QByteArray> QTextCodec::availableCodecs()
1251	{
1252	setup();
1253
1254	QList<QByteArray> codecs;
1255	for (int i = 0; i < all->size(); ++i) {
1256	codecs += all->at(i)->name();
1257	codecs += all->at(i)->aliases();
1258	}
1259	#ifndef QT_NO_TEXTCODECPLUGIN
1260	QFactoryLoader *l = loader();
1261	QStringList keys = l->keys();
1262	for (int i = 0; i < keys.size(); ++i) {
1263	if (!keys.at(i).startsWith(QLatin1String("MIB: "))) {
1264	QByteArray name = keys.at(i).toLatin1();
1265	if (!codecs.contains(name))
1266	codecs += name;
1267	}
1268	}
1269	#endif
1270
1271	return codecs;
1272	}
1273
1274	/*!
1275	Returns the list of MIBs for all available codecs. Call
1276	QTextCodec::codecForMib() to obtain the QTextCodec for the MIB.
1277
1278	\sa availableCodecs(), mibEnum()
1279	*/
1280	QList<int> QTextCodec::availableMibs()
1281	{
1282	setup();
1283
1284	QList<int> codecs;
1285	for (int i = 0; i < all->size(); ++i)
1286	codecs += all->at(i)->mibEnum();
1287	#ifndef QT_NO_TEXTCODECPLUGIN
1288	QFactoryLoader *l = loader();
1289	QStringList keys = l->keys();
1290	for (int i = 0; i < keys.size(); ++i) {
1291	if (keys.at(i).startsWith(QLatin1String("MIB: "))) {
1292	int mib = keys.at(i).mid(5).toInt();
1293	if (!codecs.contains(mib))
1294	codecs += mib;
1295	}
1296	}
1297	#endif
1298
1299	return codecs;
1300	}
1301
1302	/*!
1303	Set the codec to \a c; this will be returned by
1304	codecForLocale(). If \a c is a null pointer, the codec is reset to
1305	the default.
1306
1307	This might be needed for some applications that want to use their
1308	own mechanism for setting the locale.
1309
1310	Setting this codec is not supported on DOS based Windows.
1311
1312	\sa codecForLocale()
1313	*/
1314	void QTextCodec::setCodecForLocale(QTextCodec *c)
1315	{
1316	#ifdef Q_WS_WIN
1317	if (QSysInfo::WindowsVersion& QSysInfo::WV_DOS_based)
1318	return;
1319	#endif
1320	codecForLocaleSet = true;
1321	localeMapper = c;
1322	if (!localeMapper)
1323	setupLocaleMapper();
1324	}
1325
1326	/*!
1327	Returns a pointer to the codec most suitable for this locale.
1328
1329	On Windows, the codec will be based on a system locale. On Unix
1330	systems, starting with Qt 4.2, the codec will be using the \e
1331	iconv library. Note that in both cases the codec's name will be
1332	"System".
1333	*/
1334
1335	QTextCodec* QTextCodec::codecForLocale()
1336	{
1337	if (localeMapper)
1338	return localeMapper;
1339
1340	setup();
1341
1342	return localeMapper;
1343	}
1344
1345
1346	/*!
1347	\fn QByteArray QTextCodec::name() const
1348
1349	QTextCodec subclasses must reimplement this function. It returns
1350	the name of the encoding supported by the subclass.
1351
1352	If the codec is registered as a character set in the
1353	\l{IANA character-sets encoding file} this method should
1354	return the preferred mime name for the codec if defined,
1355	otherwise its name.
1356	*/
1357
1358	/*!
1359	\fn int QTextCodec::mibEnum() const
1360
1361	Subclasses of QTextCodec must reimplement this function. It
1362	returns the MIBenum (see \l{IANA character-sets encoding file}
1363	for more information). It is important that each QTextCodec
1364	subclass returns the correct unique value for this function.
1365	*/
1366
1367	/*!
1368	Subclasses can return a number of aliases for the codec in question.
1369
1370	Standard aliases for codecs can be found in the
1371	\l{IANA character-sets encoding file}.
1372	*/
1373	QList<QByteArray> QTextCodec::aliases() const
1374	{
1375	return QList<QByteArray>();
1376	}
1377
1378	/*!
1379	\fn QString QTextCodec::convertToUnicode(const char *chars, int len,
1380	ConverterState *state) const
1381
1382	QTextCodec subclasses must reimplement this function.
1383
1384	Converts the first \a len characters of \a chars from the
1385	encoding of the subclass to Unicode, and returns the result in a
1386	QString.
1387
1388	\a state can be 0, in which case the conversion is stateless and
1389	default conversion rules should be used. If state is not 0, the
1390	codec should save the state after the conversion in \a state, and
1391	adjust the remainingChars and invalidChars members of the struct.
1392	*/
1393
1394	/*!
1395	\fn QByteArray QTextCodec::convertFromUnicode(const QChar *input, int number,
1396	ConverterState *state) const
1397
1398	QTextCodec subclasses must reimplement this function.
1399
1400	Converts the first \a number of characters from the \a input array
1401	from Unicode to the encoding of the subclass, and returns the result
1402	in a QByteArray.
1403
1404	\a state can be 0 in which case the conversion is stateless and
1405	default conversion rules should be used. If state is not 0, the
1406	codec should save the state after the conversion in \a state, and
1407	adjust the remainingChars and invalidChars members of the struct.
1408	*/
1409
1410	/*!
1411	Creates a QTextDecoder which stores enough state to decode chunks
1412	of \c{char *} data to create chunks of Unicode data.
1413
1414	The caller is responsible for deleting the returned object.
1415	*/
1416	QTextDecoder* QTextCodec::makeDecoder() const
1417	{
1418	return new QTextDecoder(this);
1419	}
1420
1421
1422	/*!
1423	Creates a QTextEncoder which stores enough state to encode chunks
1424	of Unicode data as \c{char *} data.
1425
1426	The caller is responsible for deleting the returned object.
1427	*/
1428	QTextEncoder* QTextCodec::makeEncoder() const
1429	{
1430	return new QTextEncoder(this);
1431	}
1432
1433	/*!
1434	\fn QByteArray QTextCodec::fromUnicode(const QChar *input, int number,
1435	ConverterState *state) const
1436
1437	Converts the first \a number of characters from the \a input array
1438	from Unicode to the encoding of this codec, and returns the result
1439	in a QByteArray.
1440
1441	The \a state of the converter used is updated.
1442	*/
1443
1444	/*!
1445	Converts \a str from Unicode to the encoding of this codec, and
1446	returns the result in a QByteArray.
1447	*/
1448	QByteArray QTextCodec::fromUnicode(const QString& str) const
1449	{
1450	return convertFromUnicode(str.constData(), str.length(), 0);
1451	}
1452
1453	/*!
1454	\fn QString QTextCodec::toUnicode(const char *input, int size,
1455	ConverterState *state) const
1456
1457	Converts the first \a size characters from the \a input from the
1458	encoding of this codec to Unicode, and returns the result in a
1459	QString.
1460
1461	The \a state of the converter used is updated.
1462	*/
1463
1464	/*!
1465	Converts \a a from the encoding of this codec to Unicode, and
1466	returns the result in a QString.
1467	*/
1468	QString QTextCodec::toUnicode(const QByteArray& a) const
1469	{
1470	return convertToUnicode(a.constData(), a.length(), 0);
1471	}
1472
1473	/*!
1474	Returns true if the Unicode character \a ch can be fully encoded
1475	with this codec; otherwise returns false.
1476	*/
1477	bool QTextCodec::canEncode(QChar ch) const
1478	{
1479	ConverterState state;
1480	state.flags = ConvertInvalidToNull;
1481	convertFromUnicode(&ch, 1, &state);
1482	return (state.invalidChars == 0);
1483	}
1484
1485	/*!
1486	\overload
1487
1488	\a s contains the string being tested for encode-ability.
1489	*/
1490	bool QTextCodec::canEncode(const QString& s) const
1491	{
1492	ConverterState state;
1493	state.flags = ConvertInvalidToNull;
1494	convertFromUnicode(s.constData(), s.length(), &state);
1495	return (state.invalidChars == 0);
1496	}
1497
1498	#ifdef QT3_SUPPORT
1499	/*!
1500	Returns a string representing the current language and
1501	sublanguage, e.g. "pt" for Portuguese, or "pt_br" for Portuguese/Brazil.
1502
1503	\sa QLocale
1504	*/
1505	const char *QTextCodec::locale()
1506	{
1507	static char locale[6];
1508	QByteArray l = QLocale::system().name().toLatin1();
1509	int len = qMin(l.length(), 5);
1510	memcpy(locale, l.constData(), len);
1511	locale[len] = '\0';
1512
1513	return locale;
1514	}
1515
1516	/*!
1517	\overload
1518	*/
1519
1520	QByteArray QTextCodec::fromUnicode(const QString& uc, int& lenInOut) const
1521	{
1522	QByteArray result = convertFromUnicode(uc.constData(), lenInOut, 0);
1523	lenInOut = result.length();
1524	return result;
1525	}
1526
1527	/*!
1528	\overload
1529
1530	\a a contains the source characters; \a len contains the number of
1531	characters in \a a to use.
1532	*/
1533	QString QTextCodec::toUnicode(const QByteArray& a, int len) const
1534	{
1535	len = qMin(a.size(), len);
1536	return convertToUnicode(a.constData(), len, 0);
1537	}
1538	#endif
1539
1540	/*!
1541	\overload
1542
1543	\a chars contains the source characters.
1544	*/
1545	QString QTextCodec::toUnicode(const char *chars) const
1546	{
1547	int len = qstrlen(chars);
1548	return convertToUnicode(chars, len, 0);
1549	}
1550
1551
1552	/*!
1553	\class QTextEncoder
1554	\brief The QTextEncoder class provides a state-based encoder.
1555	\reentrant
1556	\ingroup i18n
1557
1558	A text encoder converts text from Unicode into an encoded text format
1559	using a specific codec.
1560
1561	The encoder converts Unicode into another format, remembering any
1562	state that is required between calls.
1563
1564	\sa QTextCodec::makeEncoder(), QTextDecoder
1565	*/
1566
1567	/*!
1568	\fn QTextEncoder::QTextEncoder(const QTextCodec *codec)
1569
1570	Constructs a text encoder for the given \a codec.
1571	*/
1572
1573	/*!
1574	Destroys the encoder.
1575	*/
1576	QTextEncoder::~QTextEncoder()
1577	{
1578	}
1579
1580	/*! \internal
1581	\since 4.5
1582	Determines whether the eecoder encountered a failure while decoding the input. If
1583	an error was encountered, the produced result is undefined, and gets converted as according
1584	to the conversion flags.
1585	*/
1586	bool QTextEncoder::hasFailure() const
1587	{
1588	return state.invalidChars != 0;
1589	}
1590
1591	/*!
1592	Converts the Unicode string \a str into an encoded QByteArray.
1593	*/
1594	QByteArray QTextEncoder::fromUnicode(const QString& str)
1595	{
1596	QByteArray result = c->fromUnicode(str.constData(), str.length(), &state);
1597	return result;
1598	}
1599
1600	/*!
1601	\overload
1602
1603	Converts \a len characters (not bytes) from \a uc, and returns the
1604	result in a QByteArray.
1605	*/
1606	QByteArray QTextEncoder::fromUnicode(const QChar *uc, int len)
1607	{
1608	QByteArray result = c->fromUnicode(uc, len, &state);
1609	return result;
1610	}
1611
1612	#ifdef QT3_SUPPORT
1613	/*!
1614	\overload
1615
1616	Converts \a lenInOut characters (not bytes) from \a uc, and returns the
1617	result in a QByteArray. The number of characters read is returned in
1618	the \a lenInOut parameter.
1619	*/
1620	QByteArray QTextEncoder::fromUnicode(const QString& uc, int& lenInOut)
1621	{
1622	QByteArray result = c->fromUnicode(uc.constData(), lenInOut, &state);
1623	lenInOut = result.length();
1624	return result;
1625	}
1626	#endif
1627
1628	/*!
1629	\class QTextDecoder
1630	\brief The QTextDecoder class provides a state-based decoder.
1631	\reentrant
1632	\ingroup i18n
1633
1634	A text decoder converts text from an encoded text format into Unicode
1635	using a specific codec.
1636
1637	The decoder converts text in this format into Unicode, remembering any
1638	state that is required between calls.
1639
1640	\sa QTextCodec::makeDecoder(), QTextEncoder
1641	*/
1642
1643	/*!
1644	\fn QTextDecoder::QTextDecoder(const QTextCodec *codec)
1645
1646	Constructs a text decoder for the given \a codec.
1647	*/
1648
1649	/*!
1650	Destroys the decoder.
1651	*/
1652	QTextDecoder::~QTextDecoder()
1653	{
1654	}
1655
1656	/*!
1657	\fn QString QTextDecoder::toUnicode(const char *chars, int len)
1658
1659	Converts the first \a len bytes in \a chars to Unicode, returning
1660	the result.
1661
1662	If not all characters are used (e.g. if only part of a multi-byte
1663	encoding is at the end of the characters), the decoder remembers
1664	enough state to continue with the next call to this function.
1665	*/
1666	QString QTextDecoder::toUnicode(const char *chars, int len)
1667	{
1668	return c->toUnicode(chars, len, &state);
1669	}
1670
1671
1672	/*! \overload
1673
1674	The converted string is returned in \a target.
1675	*/
1676	void QTextDecoder::toUnicode(QString target, const char chars, int len)
1677	{
1678	Q_ASSERT(target);
1679	switch (c->mibEnum()) {
1680	case 106: // utf8
1681	static_cast<const QUtf8Codec*>(c)->convertToUnicode(target, chars, len, &state);
1682	break;
1683	case 4: { // latin1
1684	target->resize(len);
1685	ushort data = (ushort)target->data();
1686	for (int i = len; i >=0; --i)
1687	data[i] = (uchar) chars[i];
1688	} break;
1689	default:
1690	*target = c->toUnicode(chars, len, &state);
1691	}
1692	}
1693
1694
1695	/*!
1696	\overload
1697
1698	Converts the bytes in the byte array specified by \a ba to Unicode
1699	and returns the result.
1700	*/
1701	QString QTextDecoder::toUnicode(const QByteArray &ba)
1702	{
1703	return c->toUnicode(ba.constData(), ba.length(), &state);
1704	}
1705
1706
1707	/*!
1708	\fn QTextCodec* QTextCodec::codecForTr()
1709
1710	Returns the codec used by QObject::tr() on its argument. If this
1711	function returns 0 (the default), tr() assumes Latin-1.
1712
1713	\sa setCodecForTr()
1714	*/
1715
1716	/*!
1717	\fn void QTextCodec::setCodecForTr(QTextCodec *c)
1718	\nonreentrant
1719
1720	Sets the codec used by QObject::tr() on its argument to \a c. If
1721	\a c is 0 (the default), tr() assumes Latin-1.
1722
1723	If the literal quoted text in the program is not in the Latin-1
1724	encoding, this function can be used to set the appropriate
1725	encoding. For example, software developed by Korean programmers
1726	might use eucKR for all the text in the program, in which case the
1727	main() function might look like this:
1728
1729	\snippet doc/src/snippets/code/src_corelib_codecs_qtextcodec.cpp 3
1730
1731	Note that this is not the way to select the encoding that the \e
1732	user has chosen. For example, to convert an application containing
1733	literal English strings to Korean, all that is needed is for the
1734	English strings to be passed through tr() and for translation
1735	files to be loaded. For details of internationalization, see
1736	\l{Internationalization with Qt}.
1737
1738	\sa codecForTr(), setCodecForCStrings()
1739	*/
1740
1741
1742	/*!
1743	\fn QTextCodec* QTextCodec::codecForCStrings()
1744
1745	Returns the codec used by QString to convert to and from \c{const
1746	char *} and QByteArrays. If this function returns 0 (the default),
1747	QString assumes Latin-1.
1748
1749	\sa setCodecForCStrings()
1750	*/
1751
1752	/*!
1753	\fn void QTextCodec::setCodecForCStrings(QTextCodec *codec)
1754	\nonreentrant
1755
1756	Sets the codec used by QString to convert to and from \c{const
1757	char *} and QByteArrays. If the \a codec is 0 (the default),
1758	QString assumes Latin-1.
1759
1760	\warning Some codecs do not preserve the characters in the ASCII
1761	range (0x00 to 0x7F). For example, the Japanese Shift-JIS
1762	encoding maps the backslash character (0x5C) to the Yen
1763	character. To avoid undesirable side-effects, we recommend
1764	avoiding such codecs with setCodecForCStrings().
1765
1766	\sa codecForCStrings(), setCodecForTr()
1767	*/
1768
1769	/*!
1770	\since 4.4
1771
1772	Tries to detect the encoding of the provided snippet of HTML in the given byte array, \a ba,
1773	and returns a QTextCodec instance that is capable of decoding the html to unicode.
1774	If the codec cannot be detected from the content provided, \a defaultCodec is returned.
1775	*/
1776	QTextCodec QTextCodec::codecForHtml(const QByteArray &ba, QTextCodec defaultCodec)
1777	{
1778	// determine charset
1779	int pos;
1780	QTextCodec *c = 0;
1781
1782	if (ba.size() > 1 && (((uchar)ba[0] == 0xfe && (uchar)ba[1] == 0xff)
1783	\|\| ((uchar)ba[0] == 0xff && (uchar)ba[1] == 0xfe))) {
1784	c = QTextCodec::codecForMib(1015); // utf16
1785	} else if (ba.size() > 2
1786	&& (uchar)ba[0] == 0xef
1787	&& (uchar)ba[1] == 0xbb
1788	&& (uchar)ba[2] == 0xbf) {
1789	c = QTextCodec::codecForMib(106); // utf-8
1790	} else {
1791	QByteArray header = ba.left(512).toLower();
1792	if ((pos = header.indexOf("http-equiv=")) != -1) {
1793	pos = header.indexOf("charset=", pos) + int(strlen("charset="));
1794	if (pos != -1) {
1795	int pos2 = header.indexOf('\"', pos+1);
1796	QByteArray cs = header.mid(pos, pos2-pos);
1797	// qDebug("found charset: %s", cs.data());
1798	c = QTextCodec::codecForName(cs);
1799	}
1800	}
1801	}
1802	if (!c)
1803	c = defaultCodec;
1804
1805	return c;
1806	}
1807
1808	/*!
1809	\overload
1810
1811	If the codec cannot be detected, this overload returns a Latin-1 QTextCodec.
1812	*/
1813	QTextCodec *QTextCodec::codecForHtml(const QByteArray &ba)
1814	{
1815	return codecForHtml(ba, QTextCodec::codecForMib(/Latin 1/ 4));
1816	}
1817
1818
1819	/*! \internal
1820	\since 4.3
1821	Determines whether the decoder encountered a failure while decoding the input. If
1822	an error was encountered, the produced result is undefined, and gets converted as according
1823	to the conversion flags.
1824	*/
1825	bool QTextDecoder::hasFailure() const
1826	{
1827	return state.invalidChars != 0;
1828	}
1829
1830	/*!
1831	\fn QTextCodec *QTextCodec::codecForContent(const char *str, int size)
1832
1833	This functionality is no longer provided by Qt. This
1834	compatibility function always returns a null pointer.
1835	*/
1836
1837	/*!
1838	\fn QTextCodec *QTextCodec::codecForName(const char *hint, int accuracy)
1839
1840	Use the codecForName(const QByteArray &) overload instead.
1841	*/
1842
1843	/*!
1844	\fn QTextCodec *QTextCodec::codecForIndex(int i)
1845
1846	Use availableCodecs() or availableMibs() instead and iterate
1847	through the resulting list.
1848	*/
1849
1850
1851	/*!
1852	\fn QByteArray QTextCodec::mimeName() const
1853
1854	Use name() instead.
1855	*/
1856
1857	QT_END_NAMESPACE
1858
1859	#endif // QT_NO_TEXTCODEC

Note: See TracBrowser for help on using the repository browser.

Download in other formats: