Context Navigation

← Previous Revision
Next Revision →
Blame
Revision Log

qutfcodec.cpp

Last change on this file was 846, checked in by Dmitry A. Kuminov, 14 years ago
trunk: Merged in qt 4.7.2 sources from branches/vendor/nokia/qt.
File size: 19.7 KB

Line
1	/****************************************************************************
2	**
3	** Copyright (C) 2011 Nokia Corporation and/or its subsidiary(-ies).
4	** All rights reserved.
5	** Contact: Nokia Corporation ([email protected])
6	**
7	** This file is part of the QtCore module of the Qt Toolkit.
8	**
9	** $QT_BEGIN_LICENSE:LGPL$
10	** Commercial Usage
11	** Licensees holding valid Qt Commercial licenses may use this file in
12	** accordance with the Qt Commercial License Agreement provided with the
13	** Software or, alternatively, in accordance with the terms contained in
14	** a written agreement between you and Nokia.
15	**
16	** GNU Lesser General Public License Usage
17	** Alternatively, this file may be used under the terms of the GNU Lesser
18	** General Public License version 2.1 as published by the Free Software
19	** Foundation and appearing in the file LICENSE.LGPL included in the
20	** packaging of this file. Please review the following information to
21	** ensure the GNU Lesser General Public License version 2.1 requirements
22	** will be met: http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html.
23	**
24	** In addition, as a special exception, Nokia gives you certain additional
25	** rights. These rights are described in the Nokia Qt LGPL Exception
26	** version 1.1, included in the file LGPL_EXCEPTION.txt in this package.
27	**
28	** GNU General Public License Usage
29	** Alternatively, this file may be used under the terms of the GNU
30	** General Public License version 3.0 as published by the Free Software
31	** Foundation and appearing in the file LICENSE.GPL included in the
32	** packaging of this file. Please review the following information to
33	** ensure the GNU General Public License version 3.0 requirements will be
34	** met: http://www.gnu.org/copyleft/gpl.html.
35	**
36	** If you have questions regarding the use of this file, please contact
37	** Nokia at [email protected].
38	** $QT_END_LICENSE$
39	**
40	****************************************************************************/
41
42	#include "qutfcodec_p.h"
43	#include "qlist.h"
44	#include "qendian.h"
45	#include "qchar.h"
46
47	QT_BEGIN_NAMESPACE
48
// Indices into ConverterState::state_data used by the UTF-16 and UTF-32
// decoders in this file to carry information from one call to the next.
enum {
    Endian = 0, // byte order in effect, stored as a DataEndianness value
    Data = 1    // buffered bytes of a code unit that was split across calls
};
50
// Returns true if \a ucs4 is one of the Unicode "non-characters".
//
// Non-characters may be used internally by applications but are not allowed
// in text interchange. They are:
//  - the last two code points of every Unicode plane (U+FFFE, U+FFFF,
//    U+1FFFE, U+1FFFF, etc.), and
//  - the 32 code points from U+FDD0 to U+FDEF (inclusive).
static inline bool isUnicodeNonCharacter(uint ucs4)
{
    // The subtraction is unsigned: values below 0xfdd0 wrap around to a huge
    // number, so a single comparison covers both ends of the range.
    return (ucs4 & 0xfffe) == 0xfffe
            || (ucs4 - 0xfdd0U) < 32;
}
63
64	QByteArray QUtf8::convertFromUnicode(const QChar uc, int len, QTextCodec::ConverterState state)
65	{
66	uchar replacement = '?';
67	int rlen = 3*len;
68	int surrogate_high = -1;
69	if (state) {
70	if (state->flags & QTextCodec::ConvertInvalidToNull)
71	replacement = 0;
72	if (!(state->flags & QTextCodec::IgnoreHeader))
73	rlen += 3;
74	if (state->remainingChars)
75	surrogate_high = state->state_data[0];
76	}
77
78	QByteArray rstr;
79	rstr.resize(rlen);
80	uchar* cursor = (uchar*)rstr.data();
81	const QChar *ch = uc;
82	int invalid = 0;
83	if (state && !(state->flags & QTextCodec::IgnoreHeader)) {
84	*cursor++ = 0xef;
85	*cursor++ = 0xbb;
86	*cursor++ = 0xbf;
87	}
88
89	const QChar *end = ch + len;
90	while (ch < end) {
91	uint u = ch->unicode();
92	if (surrogate_high >= 0) {
93	if (u >= 0xdc00 && u < 0xe000) {
94	u = (surrogate_high - 0xd800)*0x400 + (u - 0xdc00) + 0x10000;
95	surrogate_high = -1;
96	} else {
97	// high surrogate without low
98	*cursor = replacement;
99	++ch;
100	++invalid;
101	surrogate_high = -1;
102	continue;
103	}
104	} else if (u >= 0xdc00 && u < 0xe000) {
105	// low surrogate without high
106	*cursor = replacement;
107	++ch;
108	++invalid;
109	continue;
110	} else if (u >= 0xd800 && u < 0xdc00) {
111	surrogate_high = u;
112	++ch;
113	continue;
114	}
115
116	if (u < 0x80) {
117	*cursor++ = (uchar)u;
118	} else {
119	if (u < 0x0800) {
120	*cursor++ = 0xc0 \| ((uchar) (u >> 6));
121	} else {
122	// is it one of the Unicode non-characters?
123	if (isUnicodeNonCharacter(u)) {
124	*cursor++ = replacement;
125	++ch;
126	++invalid;
127	continue;
128	}
129
130	if (u > 0xffff) {
131	*cursor++ = 0xf0 \| ((uchar) (u >> 18));
132	*cursor++ = 0x80 \| (((uchar) (u >> 12)) & 0x3f);
133	} else {
134	*cursor++ = 0xe0 \| (((uchar) (u >> 12)) & 0x3f);
135	}
136	*cursor++ = 0x80 \| (((uchar) (u >> 6)) & 0x3f);
137	}
138	*cursor++ = 0x80 \| ((uchar) (u&0x3f));
139	}
140	++ch;
141	}
142
143	rstr.resize(cursor - (const uchar*)rstr.constData());
144	if (state) {
145	state->invalidChars += invalid;
146	state->flags \|= QTextCodec::IgnoreHeader;
147	state->remainingChars = 0;
148	if (surrogate_high >= 0) {
149	state->remainingChars = 1;
150	state->state_data[0] = surrogate_high;
151	}
152	}
153	return rstr;
154	}
155
156	QString QUtf8::convertToUnicode(const char chars, int len, QTextCodec::ConverterState state)
157	{
158	bool headerdone = false;
159	ushort replacement = QChar::ReplacementCharacter;
160	int need = 0;
161	int error = -1;
162	uint uc = 0;
163	uint min_uc = 0;
164	if (state) {
165	if (state->flags & QTextCodec::IgnoreHeader)
166	headerdone = true;
167	if (state->flags & QTextCodec::ConvertInvalidToNull)
168	replacement = QChar::Null;
169	need = state->remainingChars;
170	if (need) {
171	uc = state->state_data[0];
172	min_uc = state->state_data[1];
173	}
174	}
175	if (!headerdone && len > 3
176	&& (uchar)chars[0] == 0xef && (uchar)chars[1] == 0xbb && (uchar)chars[2] == 0xbf) {
177	// starts with a byte order mark
178	chars += 3;
179	len -= 3;
180	headerdone = true;
181	}
182
183	QString result(need + len + 1, Qt::Uninitialized); // worst case
184	ushort qch = (ushort )result.unicode();
185	uchar ch;
186	int invalid = 0;
187
188	for (int i = 0; i < len; ++i) {
189	ch = chars[i];
190	if (need) {
191	if ((ch&0xc0) == 0x80) {
192	uc = (uc << 6) \| (ch & 0x3f);
193	--need;
194	if (!need) {
195	// utf-8 bom composes into 0xfeff code point
196	bool nonCharacter;
197	if (!headerdone && uc == 0xfeff) {
198	// don't do anything, just skip the BOM
199	} else if (!(nonCharacter = isUnicodeNonCharacter(uc)) && uc > 0xffff && uc < 0x110000) {
200	// surrogate pair
201	Q_ASSERT((qch - (ushort*)result.unicode()) + 2 < result.length());
202	*qch++ = QChar::highSurrogate(uc);
203	*qch++ = QChar::lowSurrogate(uc);
204	} else if ((uc < min_uc) \|\| (uc >= 0xd800 && uc <= 0xdfff) \|\| nonCharacter \|\| uc >= 0x110000) {
205	// error: overlong sequence, UTF16 surrogate or non-character
206	*qch++ = replacement;
207	++invalid;
208	} else {
209	*qch++ = uc;
210	}
211	headerdone = true;
212	}
213	} else {
214	// error
215	i = error;
216	*qch++ = replacement;
217	++invalid;
218	need = 0;
219	headerdone = true;
220	}
221	} else {
222	if (ch < 128) {
223	*qch++ = ushort(ch);
224	headerdone = true;
225	} else if ((ch & 0xe0) == 0xc0) {
226	uc = ch & 0x1f;
227	need = 1;
228	error = i;
229	min_uc = 0x80;
230	headerdone = true;
231	} else if ((ch & 0xf0) == 0xe0) {
232	uc = ch & 0x0f;
233	need = 2;
234	error = i;
235	min_uc = 0x800;
236	} else if ((ch&0xf8) == 0xf0) {
237	uc = ch & 0x07;
238	need = 3;
239	error = i;
240	min_uc = 0x10000;
241	headerdone = true;
242	} else {
243	// error
244	*qch++ = replacement;
245	++invalid;
246	headerdone = true;
247	}
248	}
249	}
250	if (!state && need > 0) {
251	// unterminated UTF sequence
252	for (int i = error; i < len; ++i) {
253	*qch++ = replacement;
254	++invalid;
255	}
256	}
257	result.truncate(qch - (ushort *)result.unicode());
258	if (state) {
259	state->invalidChars += invalid;
260	state->remainingChars = need;
261	if (headerdone)
262	state->flags \|= QTextCodec::IgnoreHeader;
263	state->state_data[0] = need ? uc : 0;
264	state->state_data[1] = need ? min_uc : 0;
265	}
266	return result;
267	}
268
269	QByteArray QUtf16::convertFromUnicode(const QChar uc, int len, QTextCodec::ConverterState state, DataEndianness e)
270	{
271	DataEndianness endian = e;
272	int length = 2*len;
273	if (!state \|\| (!(state->flags & QTextCodec::IgnoreHeader))) {
274	length += 2;
275	}
276	if (e == DetectEndianness) {
277	endian = (QSysInfo::ByteOrder == QSysInfo::BigEndian) ? BigEndianness : LittleEndianness;
278	}
279
280	QByteArray d;
281	d.resize(length);
282	char *data = d.data();
283	if (!state \|\| !(state->flags & QTextCodec::IgnoreHeader)) {
284	QChar bom(QChar::ByteOrderMark);
285	if (endian == BigEndianness) {
286	data[0] = bom.row();
287	data[1] = bom.cell();
288	} else {
289	data[0] = bom.cell();
290	data[1] = bom.row();
291	}
292	data += 2;
293	}
294	if (endian == BigEndianness) {
295	for (int i = 0; i < len; ++i) {
296	*(data++) = uc[i].row();
297	*(data++) = uc[i].cell();
298	}
299	} else {
300	for (int i = 0; i < len; ++i) {
301	*(data++) = uc[i].cell();
302	*(data++) = uc[i].row();
303	}
304	}
305
306	if (state) {
307	state->remainingChars = 0;
308	state->flags \|= QTextCodec::IgnoreHeader;
309	}
310	return d;
311	}
312
313	QString QUtf16::convertToUnicode(const char chars, int len, QTextCodec::ConverterState state, DataEndianness e)
314	{
315	DataEndianness endian = e;
316	bool half = false;
317	uchar buf = 0;
318	bool headerdone = false;
319	if (state) {
320	headerdone = state->flags & QTextCodec::IgnoreHeader;
321	if (endian == DetectEndianness)
322	endian = (DataEndianness)state->state_data[Endian];
323	if (state->remainingChars) {
324	half = true;
325	buf = state->state_data[Data];
326	}
327	}
328	if (headerdone && endian == DetectEndianness)
329	endian = (QSysInfo::ByteOrder == QSysInfo::BigEndian) ? BigEndianness : LittleEndianness;
330
331	QString result(len, Qt::Uninitialized); // worst case
332	QChar qch = (QChar )result.unicode();
333	while (len--) {
334	if (half) {
335	QChar ch;
336	if (endian == LittleEndianness) {
337	ch.setRow(*chars++);
338	ch.setCell(buf);
339	} else {
340	ch.setRow(buf);
341	ch.setCell(*chars++);
342	}
343	if (!headerdone) {
344	headerdone = true;
345	if (endian == DetectEndianness) {
346	if (ch == QChar::ByteOrderSwapped) {
347	endian = LittleEndianness;
348	} else if (ch == QChar::ByteOrderMark) {
349	endian = BigEndianness;
350	} else {
351	if (QSysInfo::ByteOrder == QSysInfo::BigEndian) {
352	endian = BigEndianness;
353	} else {
354	endian = LittleEndianness;
355	ch = QChar((ch.unicode() >> 8) \| ((ch.unicode() & 0xff) << 8));
356	}
357	*qch++ = ch;
358	}
359	} else if (ch != QChar::ByteOrderMark) {
360	*qch++ = ch;
361	}
362	} else {
363	*qch++ = ch;
364	}
365	half = false;
366	} else {
367	buf = *chars++;
368	half = true;
369	}
370	}
371	result.truncate(qch - result.unicode());
372
373	if (state) {
374	if (headerdone)
375	state->flags \|= QTextCodec::IgnoreHeader;
376	state->state_data[Endian] = endian;
377	if (half) {
378	state->remainingChars = 1;
379	state->state_data[Data] = buf;
380	} else {
381	state->remainingChars = 0;
382	state->state_data[Data] = 0;
383	}
384	}
385	return result;
386	}
387
388	QByteArray QUtf32::convertFromUnicode(const QChar uc, int len, QTextCodec::ConverterState state, DataEndianness e)
389	{
390	DataEndianness endian = e;
391	int length = 4*len;
392	if (!state \|\| (!(state->flags & QTextCodec::IgnoreHeader))) {
393	length += 4;
394	}
395	if (e == DetectEndianness) {
396	endian = (QSysInfo::ByteOrder == QSysInfo::BigEndian) ? BigEndianness : LittleEndianness;
397	}
398
399	QByteArray d(length, Qt::Uninitialized);
400	char *data = d.data();
401	if (!state \|\| !(state->flags & QTextCodec::IgnoreHeader)) {
402	if (endian == BigEndianness) {
403	data[0] = 0;
404	data[1] = 0;
405	data[2] = (char)0xfe;
406	data[3] = (char)0xff;
407	} else {
408	data[0] = (char)0xff;
409	data[1] = (char)0xfe;
410	data[2] = 0;
411	data[3] = 0;
412	}
413	data += 4;
414	}
415	if (endian == BigEndianness) {
416	for (int i = 0; i < len; ++i) {
417	uint cp = uc[i].unicode();
418	if (uc[i].isHighSurrogate() && i < len - 1)
419	cp = QChar::surrogateToUcs4(cp, uc[++i].unicode());
420	*(data++) = cp >> 24;
421	*(data++) = (cp >> 16) & 0xff;
422	*(data++) = (cp >> 8) & 0xff;
423	*(data++) = cp & 0xff;
424	}
425	} else {
426	for (int i = 0; i < len; ++i) {
427	uint cp = uc[i].unicode();
428	if (uc[i].isHighSurrogate() && i < len - 1)
429	cp = QChar::surrogateToUcs4(cp, uc[++i].unicode());
430	*(data++) = cp & 0xff;
431	*(data++) = (cp >> 8) & 0xff;
432	*(data++) = (cp >> 16) & 0xff;
433	*(data++) = cp >> 24;
434	}
435	}
436
437	if (state) {
438	state->remainingChars = 0;
439	state->flags \|= QTextCodec::IgnoreHeader;
440	}
441	return d;
442	}
443
444	QString QUtf32::convertToUnicode(const char chars, int len, QTextCodec::ConverterState state, DataEndianness e)
445	{
446	DataEndianness endian = e;
447	uchar tuple[4];
448	int num = 0;
449	bool headerdone = false;
450	if (state) {
451	headerdone = state->flags & QTextCodec::IgnoreHeader;
452	if (endian == DetectEndianness) {
453	endian = (DataEndianness)state->state_data[Endian];
454	}
455	num = state->remainingChars;
456	memcpy(tuple, &state->state_data[Data], 4);
457	}
458	if (headerdone && endian == DetectEndianness)
459	endian = (QSysInfo::ByteOrder == QSysInfo::BigEndian) ? BigEndianness : LittleEndianness;
460
461	QString result;
462	result.resize((num + len) >> 2 << 1); // worst case
463	QChar qch = (QChar )result.unicode();
464
465	const char *end = chars + len;
466	while (chars < end) {
467	tuple[num++] = *chars++;
468	if (num == 4) {
469	if (!headerdone) {
470	if (endian == DetectEndianness) {
471	if (endian == DetectEndianness) {
472	if (tuple[0] == 0xff && tuple[1] == 0xfe && tuple[2] == 0 && tuple[3] == 0 && endian != BigEndianness) {
473	endian = LittleEndianness;
474	num = 0;
475	continue;
476	} else if (tuple[0] == 0 && tuple[1] == 0 && tuple[2] == 0xfe && tuple[3] == 0xff && endian != LittleEndianness) {
477	endian = BigEndianness;
478	num = 0;
479	continue;
480	} else if (QSysInfo::ByteOrder == QSysInfo::BigEndian) {
481	endian = BigEndianness;
482	} else {
483	endian = LittleEndianness;
484	}
485	}
486	} else if (((endian == BigEndianness) ? qFromBigEndian<quint32>(tuple) : qFromLittleEndian<quint32>(tuple)) == QChar::ByteOrderMark) {
487	num = 0;
488	continue;
489	}
490	}
491	uint code = (endian == BigEndianness) ? qFromBigEndian<quint32>(tuple) : qFromLittleEndian<quint32>(tuple);
492	if (code >= 0x10000) {
493	*qch++ = QChar::highSurrogate(code);
494	*qch++ = QChar::lowSurrogate(code);
495	} else {
496	*qch++ = code;
497	}
498	num = 0;
499	}
500	}
501	result.truncate(qch - result.unicode());
502
503	if (state) {
504	if (headerdone)
505	state->flags \|= QTextCodec::IgnoreHeader;
506	state->state_data[Endian] = endian;
507	state->remainingChars = num;
508	memcpy(&state->state_data[Data], tuple, 4);
509	}
510	return result;
511	}
512
513
514	#ifndef QT_NO_TEXTCODEC
515
516	QUtf8Codec::~QUtf8Codec()
517	{
518	}
519
520	QByteArray QUtf8Codec::convertFromUnicode(const QChar uc, int len, ConverterState state) const
521	{
522	return QUtf8::convertFromUnicode(uc, len, state);
523	}
524
525	void QUtf8Codec::convertToUnicode(QString target, const char chars, int len, ConverterState *state) const
526	{
527	*target += QUtf8::convertToUnicode(chars, len, state);
528	}
529
530	QString QUtf8Codec::convertToUnicode(const char chars, int len, ConverterState state) const
531	{
532	return QUtf8::convertToUnicode(chars, len, state);
533	}
534
535	QByteArray QUtf8Codec::name() const
536	{
537	return "UTF-8";
538	}
539
540	int QUtf8Codec::mibEnum() const
541	{
542	return 106;
543	}
544
545	QUtf16Codec::~QUtf16Codec()
546	{
547	}
548
549	QByteArray QUtf16Codec::convertFromUnicode(const QChar uc, int len, ConverterState state) const
550	{
551	return QUtf16::convertFromUnicode(uc, len, state, e);
552	}
553
554	QString QUtf16Codec::convertToUnicode(const char chars, int len, ConverterState state) const
555	{
556	return QUtf16::convertToUnicode(chars, len, state, e);
557	}
558
559	int QUtf16Codec::mibEnum() const
560	{
561	return 1015;
562	}
563
564	QByteArray QUtf16Codec::name() const
565	{
566	return "UTF-16";
567	}
568
569	QList<QByteArray> QUtf16Codec::aliases() const
570	{
571	return QList<QByteArray>();
572	}
573
574	int QUtf16BECodec::mibEnum() const
575	{
576	return 1013;
577	}
578
579	QByteArray QUtf16BECodec::name() const
580	{
581	return "UTF-16BE";
582	}
583
584	QList<QByteArray> QUtf16BECodec::aliases() const
585	{
586	QList<QByteArray> list;
587	return list;
588	}
589
590	int QUtf16LECodec::mibEnum() const
591	{
592	return 1014;
593	}
594
595	QByteArray QUtf16LECodec::name() const
596	{
597	return "UTF-16LE";
598	}
599
600	QList<QByteArray> QUtf16LECodec::aliases() const
601	{
602	QList<QByteArray> list;
603	return list;
604	}
605
606	QUtf32Codec::~QUtf32Codec()
607	{
608	}
609
610	QByteArray QUtf32Codec::convertFromUnicode(const QChar uc, int len, ConverterState state) const
611	{
612	return QUtf32::convertFromUnicode(uc, len, state, e);
613	}
614
615	QString QUtf32Codec::convertToUnicode(const char chars, int len, ConverterState state) const
616	{
617	return QUtf32::convertToUnicode(chars, len, state, e);
618	}
619
620	int QUtf32Codec::mibEnum() const
621	{
622	return 1017;
623	}
624
625	QByteArray QUtf32Codec::name() const
626	{
627	return "UTF-32";
628	}
629
630	QList<QByteArray> QUtf32Codec::aliases() const
631	{
632	QList<QByteArray> list;
633	return list;
634	}
635
636	int QUtf32BECodec::mibEnum() const
637	{
638	return 1018;
639	}
640
641	QByteArray QUtf32BECodec::name() const
642	{
643	return "UTF-32BE";
644	}
645
646	QList<QByteArray> QUtf32BECodec::aliases() const
647	{
648	QList<QByteArray> list;
649	return list;
650	}
651
652	int QUtf32LECodec::mibEnum() const
653	{
654	return 1019;
655	}
656
657	QByteArray QUtf32LECodec::name() const
658	{
659	return "UTF-32LE";
660	}
661
662	QList<QByteArray> QUtf32LECodec::aliases() const
663	{
664	QList<QByteArray> list;
665	return list;
666	}
667
668	#endif //QT_NO_TEXTCODEC
669
670	QT_END_NAMESPACE

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: trunk/src/corelib/codecs/qutfcodec.cpp

Download in other formats: