Context Navigation

source: trunk/src/corelib/codecs/qutfcodec.cpp@ 730

Last change on this file since 730 was 651, checked in by Dmitry A. Kuminov, 15 years ago
trunk: Merged in qt 4.6.2 sources.
File size: 19.3 KB

Line
1	/****************************************************************************
2	**
3	** Copyright (C) 2010 Nokia Corporation and/or its subsidiary(-ies).
4	** All rights reserved.
5	** Contact: Nokia Corporation (qt-info@nokia.com)
6	**
7	** This file is part of the QtCore module of the Qt Toolkit.
8	**
9	** $QT_BEGIN_LICENSE:LGPL$
10	** Commercial Usage
11	** Licensees holding valid Qt Commercial licenses may use this file in
12	** accordance with the Qt Commercial License Agreement provided with the
13	** Software or, alternatively, in accordance with the terms contained in
14	** a written agreement between you and Nokia.
15	**
16	** GNU Lesser General Public License Usage
17	** Alternatively, this file may be used under the terms of the GNU Lesser
18	** General Public License version 2.1 as published by the Free Software
19	** Foundation and appearing in the file LICENSE.LGPL included in the
20	** packaging of this file. Please review the following information to
21	** ensure the GNU Lesser General Public License version 2.1 requirements
22	** will be met: http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html.
23	**
24	** In addition, as a special exception, Nokia gives you certain additional
25	** rights. These rights are described in the Nokia Qt LGPL Exception
26	** version 1.1, included in the file LGPL_EXCEPTION.txt in this package.
27	**
28	** GNU General Public License Usage
29	** Alternatively, this file may be used under the terms of the GNU
30	** General Public License version 3.0 as published by the Free Software
31	** Foundation and appearing in the file LICENSE.GPL included in the
32	** packaging of this file. Please review the following information to
33	** ensure the GNU General Public License version 3.0 requirements will be
34	** met: http://www.gnu.org/copyleft/gpl.html.
35	**
36	** If you have questions regarding the use of this file, please contact
37	** Nokia at qt-info@nokia.com.
38	** $QT_END_LICENSE$
39	**
40	****************************************************************************/
41
42	#include "qutfcodec_p.h"
43	#include "qlist.h"
44	#include "qendian.h"
45	#include "qchar.h"
46
47	QT_BEGIN_NAMESPACE
48
// Indices into QTextCodec::ConverterState::state_data used by the UTF-16 and
// UTF-32 decoders in this file.
enum {
    Endian = 0, // slot that stores the byte order between calls
    Data = 1    // slot that stores the bytes of an incomplete code unit
};
50
51	QByteArray QUtf8::convertFromUnicode(const QChar uc, int len, QTextCodec::ConverterState state)
52	{
53	uchar replacement = '?';
54	int rlen = 3*len;
55	int surrogate_high = -1;
56	if (state) {
57	if (state->flags & QTextCodec::ConvertInvalidToNull)
58	replacement = 0;
59	if (!(state->flags & QTextCodec::IgnoreHeader))
60	rlen += 3;
61	if (state->remainingChars)
62	surrogate_high = state->state_data[0];
63	}
64
65	QByteArray rstr;
66	rstr.resize(rlen);
67	uchar* cursor = (uchar*)rstr.data();
68	const QChar *ch = uc;
69	int invalid = 0;
70	if (state && !(state->flags & QTextCodec::IgnoreHeader)) {
71	*cursor++ = 0xef;
72	*cursor++ = 0xbb;
73	*cursor++ = 0xbf;
74	}
75
76	const QChar *end = ch + len;
77	while (ch < end) {
78	uint u = ch->unicode();
79	if (surrogate_high >= 0) {
80	if (u >= 0xdc00 && u < 0xe000) {
81	u = (surrogate_high - 0xd800)*0x400 + (u - 0xdc00) + 0x10000;
82	surrogate_high = -1;
83	} else {
84	// high surrogate without low
85	*cursor = replacement;
86	++ch;
87	++invalid;
88	surrogate_high = -1;
89	continue;
90	}
91	} else if (u >= 0xdc00 && u < 0xe000) {
92	// low surrogate without high
93	*cursor = replacement;
94	++ch;
95	++invalid;
96	continue;
97	} else if (u >= 0xd800 && u < 0xdc00) {
98	surrogate_high = u;
99	++ch;
100	continue;
101	}
102
103	if (u < 0x80) {
104	*cursor++ = (uchar)u;
105	} else {
106	if (u < 0x0800) {
107	*cursor++ = 0xc0 \| ((uchar) (u >> 6));
108	} else {
109	if (u > 0xffff) {
110	// see QString::fromUtf8() and QString::utf8() for explanations
111	if (u > 0x10fe00 && u < 0x10ff00) {
112	*cursor++ = (u - 0x10fe00);
113	++ch;
114	continue;
115	} else {
116	*cursor++ = 0xf0 \| ((uchar) (u >> 18));
117	*cursor++ = 0x80 \| (((uchar) (u >> 12)) & 0x3f);
118	}
119	} else {
120	*cursor++ = 0xe0 \| (((uchar) (u >> 12)) & 0x3f);
121	}
122	*cursor++ = 0x80 \| (((uchar) (u >> 6)) & 0x3f);
123	}
124	*cursor++ = 0x80 \| ((uchar) (u&0x3f));
125	}
126	++ch;
127	}
128
129	rstr.resize(cursor - (const uchar*)rstr.constData());
130	if (state) {
131	state->invalidChars += invalid;
132	state->flags \|= QTextCodec::IgnoreHeader;
133	state->remainingChars = 0;
134	if (surrogate_high >= 0) {
135	state->remainingChars = 1;
136	state->state_data[0] = surrogate_high;
137	}
138	}
139	return rstr;
140	}
141
142	QString QUtf8::convertToUnicode(const char chars, int len, QTextCodec::ConverterState state)
143	{
144	bool headerdone = false;
145	ushort replacement = QChar::ReplacementCharacter;
146	int need = 0;
147	int error = -1;
148	uint uc = 0;
149	uint min_uc = 0;
150	if (state) {
151	if (state->flags & QTextCodec::IgnoreHeader)
152	headerdone = true;
153	if (state->flags & QTextCodec::ConvertInvalidToNull)
154	replacement = QChar::Null;
155	need = state->remainingChars;
156	if (need) {
157	uc = state->state_data[0];
158	min_uc = state->state_data[1];
159	}
160	}
161	if (!headerdone && len > 3
162	&& (uchar)chars[0] == 0xef && (uchar)chars[1] == 0xbb && (uchar)chars[2] == 0xbf) {
163	// starts with a byte order mark
164	chars += 3;
165	len -= 3;
166	headerdone = true;
167	}
168
169	QString result(need + len + 1, Qt::Uninitialized); // worst case
170	ushort qch = (ushort )result.unicode();
171	uchar ch;
172	int invalid = 0;
173
174	for (int i = 0; i < len; ++i) {
175	ch = chars[i];
176	if (need) {
177	if ((ch&0xc0) == 0x80) {
178	uc = (uc << 6) \| (ch & 0x3f);
179	--need;
180	if (!need) {
181	// utf-8 bom composes into 0xfeff code point
182	if (!headerdone && uc == 0xfeff) {
183	// dont do anything, just skip the BOM
184	} else if (uc > 0xffff && uc < 0x110000) {
185	// surrogate pair
186	Q_ASSERT((qch - (ushort*)result.unicode()) + 2 < result.length());
187	*qch++ = QChar::highSurrogate(uc);
188	*qch++ = QChar::lowSurrogate(uc);
189	} else if ((uc < min_uc) \|\| (uc >= 0xd800 && uc <= 0xdfff) \|\| (uc >= 0xfffe)) {
190	// error: overlong sequence, UTF16 surrogate or BOM
191	*qch++ = replacement;
192	++invalid;
193	} else {
194	*qch++ = uc;
195	}
196	headerdone = true;
197	}
198	} else {
199	// error
200	i = error;
201	*qch++ = replacement;
202	++invalid;
203	need = 0;
204	headerdone = true;
205	}
206	} else {
207	if (ch < 128) {
208	*qch++ = ushort(ch);
209	headerdone = true;
210	} else if ((ch & 0xe0) == 0xc0) {
211	uc = ch & 0x1f;
212	need = 1;
213	error = i;
214	min_uc = 0x80;
215	headerdone = true;
216	} else if ((ch & 0xf0) == 0xe0) {
217	uc = ch & 0x0f;
218	need = 2;
219	error = i;
220	min_uc = 0x800;
221	} else if ((ch&0xf8) == 0xf0) {
222	uc = ch & 0x07;
223	need = 3;
224	error = i;
225	min_uc = 0x10000;
226	headerdone = true;
227	} else {
228	// error
229	*qch++ = replacement;
230	++invalid;
231	headerdone = true;
232	}
233	}
234	}
235	if (!state && need > 0) {
236	// unterminated UTF sequence
237	for (int i = error; i < len; ++i) {
238	*qch++ = replacement;
239	++invalid;
240	}
241	}
242	result.truncate(qch - (ushort *)result.unicode());
243	if (state) {
244	state->invalidChars += invalid;
245	state->remainingChars = need;
246	if (headerdone)
247	state->flags \|= QTextCodec::IgnoreHeader;
248	state->state_data[0] = need ? uc : 0;
249	state->state_data[1] = need ? min_uc : 0;
250	}
251	return result;
252	}
253
254	QByteArray QUtf16::convertFromUnicode(const QChar uc, int len, QTextCodec::ConverterState state, DataEndianness e)
255	{
256	DataEndianness endian = e;
257	int length = 2*len;
258	if (!state \|\| (!(state->flags & QTextCodec::IgnoreHeader))) {
259	length += 2;
260	}
261	if (e == DetectEndianness) {
262	endian = (QSysInfo::ByteOrder == QSysInfo::BigEndian) ? BigEndianness : LittleEndianness;
263	}
264
265	QByteArray d;
266	d.resize(length);
267	char *data = d.data();
268	if (!state \|\| !(state->flags & QTextCodec::IgnoreHeader)) {
269	QChar bom(QChar::ByteOrderMark);
270	if (endian == BigEndianness) {
271	data[0] = bom.row();
272	data[1] = bom.cell();
273	} else {
274	data[0] = bom.cell();
275	data[1] = bom.row();
276	}
277	data += 2;
278	}
279	if (endian == BigEndianness) {
280	for (int i = 0; i < len; ++i) {
281	*(data++) = uc[i].row();
282	*(data++) = uc[i].cell();
283	}
284	} else {
285	for (int i = 0; i < len; ++i) {
286	*(data++) = uc[i].cell();
287	*(data++) = uc[i].row();
288	}
289	}
290
291	if (state) {
292	state->remainingChars = 0;
293	state->flags \|= QTextCodec::IgnoreHeader;
294	}
295	return d;
296	}
297
298	QString QUtf16::convertToUnicode(const char chars, int len, QTextCodec::ConverterState state, DataEndianness e)
299	{
300	DataEndianness endian = e;
301	bool half = false;
302	uchar buf = 0;
303	bool headerdone = false;
304	if (state) {
305	headerdone = state->flags & QTextCodec::IgnoreHeader;
306	if (endian == DetectEndianness)
307	endian = (DataEndianness)state->state_data[Endian];
308	if (state->remainingChars) {
309	half = true;
310	buf = state->state_data[Data];
311	}
312	}
313	if (headerdone && endian == DetectEndianness)
314	endian = (QSysInfo::ByteOrder == QSysInfo::BigEndian) ? BigEndianness : LittleEndianness;
315
316	QString result(len, Qt::Uninitialized); // worst case
317	QChar qch = (QChar )result.unicode();
318	while (len--) {
319	if (half) {
320	QChar ch;
321	if (endian == LittleEndianness) {
322	ch.setRow(*chars++);
323	ch.setCell(buf);
324	} else {
325	ch.setRow(buf);
326	ch.setCell(*chars++);
327	}
328	if (!headerdone) {
329	if (endian == DetectEndianness) {
330	if (ch == QChar::ByteOrderSwapped && endian != BigEndianness) {
331	endian = LittleEndianness;
332	} else if (ch == QChar::ByteOrderMark && endian != LittleEndianness) {
333	// ignore BOM
334	endian = BigEndianness;
335	} else {
336	if (QSysInfo::ByteOrder == QSysInfo::BigEndian) {
337	endian = BigEndianness;
338	} else {
339	endian = LittleEndianness;
340	ch = QChar((ch.unicode() >> 8) \| ((ch.unicode() & 0xff) << 8));
341	}
342	*qch++ = ch;
343	}
344	} else if (ch != QChar::ByteOrderMark) {
345	*qch++ = ch;
346	}
347	headerdone = true;
348	} else {
349	*qch++ = ch;
350	}
351	half = false;
352	} else {
353	buf = *chars++;
354	half = true;
355	}
356	}
357	result.truncate(qch - result.unicode());
358
359	if (state) {
360	if (headerdone)
361	state->flags \|= QTextCodec::IgnoreHeader;
362	state->state_data[Endian] = endian;
363	if (half) {
364	state->remainingChars = 1;
365	state->state_data[Data] = buf;
366	} else {
367	state->remainingChars = 0;
368	state->state_data[Data] = 0;
369	}
370	}
371	return result;
372	}
373
374	QByteArray QUtf32::convertFromUnicode(const QChar uc, int len, QTextCodec::ConverterState state, DataEndianness e)
375	{
376	DataEndianness endian = e;
377	int length = 4*len;
378	if (!state \|\| (!(state->flags & QTextCodec::IgnoreHeader))) {
379	length += 4;
380	}
381	if (e == DetectEndianness) {
382	endian = (QSysInfo::ByteOrder == QSysInfo::BigEndian) ? BigEndianness : LittleEndianness;
383	}
384
385	QByteArray d(length, Qt::Uninitialized);
386	char *data = d.data();
387	if (!state \|\| !(state->flags & QTextCodec::IgnoreHeader)) {
388	if (endian == BigEndianness) {
389	data[0] = 0;
390	data[1] = 0;
391	data[2] = (char)0xfe;
392	data[3] = (char)0xff;
393	} else {
394	data[0] = (char)0xff;
395	data[1] = (char)0xfe;
396	data[2] = 0;
397	data[3] = 0;
398	}
399	data += 4;
400	}
401	if (endian == BigEndianness) {
402	for (int i = 0; i < len; ++i) {
403	uint cp = uc[i].unicode();
404	if (uc[i].isHighSurrogate() && i < len - 1)
405	cp = QChar::surrogateToUcs4(cp, uc[++i].unicode());
406	*(data++) = cp >> 24;
407	*(data++) = (cp >> 16) & 0xff;
408	*(data++) = (cp >> 8) & 0xff;
409	*(data++) = cp & 0xff;
410	}
411	} else {
412	for (int i = 0; i < len; ++i) {
413	uint cp = uc[i].unicode();
414	if (uc[i].isHighSurrogate() && i < len - 1)
415	cp = QChar::surrogateToUcs4(cp, uc[++i].unicode());
416	*(data++) = cp & 0xff;
417	*(data++) = (cp >> 8) & 0xff;
418	*(data++) = (cp >> 16) & 0xff;
419	*(data++) = cp >> 24;
420	}
421	}
422
423	if (state) {
424	state->remainingChars = 0;
425	state->flags \|= QTextCodec::IgnoreHeader;
426	}
427	return d;
428	}
429
430	QString QUtf32::convertToUnicode(const char chars, int len, QTextCodec::ConverterState state, DataEndianness e)
431	{
432	DataEndianness endian = e;
433	uchar tuple[4];
434	int num = 0;
435	bool headerdone = false;
436	if (state) {
437	headerdone = state->flags & QTextCodec::IgnoreHeader;
438	if (endian == DetectEndianness) {
439	endian = (DataEndianness)state->state_data[Endian];
440	}
441	num = state->remainingChars;
442	memcpy(tuple, &state->state_data[Data], 4);
443	}
444	if (headerdone && endian == DetectEndianness)
445	endian = (QSysInfo::ByteOrder == QSysInfo::BigEndian) ? BigEndianness : LittleEndianness;
446
447	QString result;
448	result.resize((num + len) >> 2 << 1); // worst case
449	QChar qch = (QChar )result.unicode();
450
451	const char *end = chars + len;
452	while (chars < end) {
453	tuple[num++] = *chars++;
454	if (num == 4) {
455	if (!headerdone) {
456	if (endian == DetectEndianness) {
457	if (endian == DetectEndianness) {
458	if (tuple[0] == 0xff && tuple[1] == 0xfe && tuple[2] == 0 && tuple[3] == 0 && endian != BigEndianness) {
459	endian = LittleEndianness;
460	num = 0;
461	continue;
462	} else if (tuple[0] == 0 && tuple[1] == 0 && tuple[2] == 0xfe && tuple[3] == 0xff && endian != LittleEndianness) {
463	endian = BigEndianness;
464	num = 0;
465	continue;
466	} else if (QSysInfo::ByteOrder == QSysInfo::BigEndian) {
467	endian = BigEndianness;
468	} else {
469	endian = LittleEndianness;
470	}
471	}
472	} else if (((endian == BigEndianness) ? qFromBigEndian<quint32>(tuple) : qFromLittleEndian<quint32>(tuple)) == QChar::ByteOrderMark) {
473	num = 0;
474	continue;
475	}
476	}
477	uint code = (endian == BigEndianness) ? qFromBigEndian<quint32>(tuple) : qFromLittleEndian<quint32>(tuple);
478	if (code >= 0x10000) {
479	*qch++ = QChar::highSurrogate(code);
480	*qch++ = QChar::lowSurrogate(code);
481	} else {
482	*qch++ = code;
483	}
484	num = 0;
485	}
486	}
487	result.truncate(qch - result.unicode());
488
489	if (state) {
490	if (headerdone)
491	state->flags \|= QTextCodec::IgnoreHeader;
492	state->state_data[Endian] = endian;
493	state->remainingChars = num;
494	memcpy(&state->state_data[Data], tuple, 4);
495	}
496	return result;
497	}
498
499
500	#ifndef QT_NO_TEXTCODEC
501
502	QUtf8Codec::~QUtf8Codec()
503	{
504	}
505
506	QByteArray QUtf8Codec::convertFromUnicode(const QChar uc, int len, ConverterState state) const
507	{
508	return QUtf8::convertFromUnicode(uc, len, state);
509	}
510
511	void QUtf8Codec::convertToUnicode(QString target, const char chars, int len, ConverterState *state) const
512	{
513	*target += QUtf8::convertToUnicode(chars, len, state);
514	}
515
516	QString QUtf8Codec::convertToUnicode(const char chars, int len, ConverterState state) const
517	{
518	return QUtf8::convertToUnicode(chars, len, state);
519	}
520
521	QByteArray QUtf8Codec::name() const
522	{
523	return "UTF-8";
524	}
525
526	int QUtf8Codec::mibEnum() const
527	{
528	return 106;
529	}
530
531	QUtf16Codec::~QUtf16Codec()
532	{
533	}
534
535	QByteArray QUtf16Codec::convertFromUnicode(const QChar uc, int len, ConverterState state) const
536	{
537	return QUtf16::convertFromUnicode(uc, len, state, e);
538	}
539
540	QString QUtf16Codec::convertToUnicode(const char chars, int len, ConverterState state) const
541	{
542	return QUtf16::convertToUnicode(chars, len, state, e);
543	}
544
545	int QUtf16Codec::mibEnum() const
546	{
547	return 1015;
548	}
549
550	QByteArray QUtf16Codec::name() const
551	{
552	return "UTF-16";
553	}
554
555	QList<QByteArray> QUtf16Codec::aliases() const
556	{
557	return QList<QByteArray>();
558	}
559
560	int QUtf16BECodec::mibEnum() const
561	{
562	return 1013;
563	}
564
565	QByteArray QUtf16BECodec::name() const
566	{
567	return "UTF-16BE";
568	}
569
570	QList<QByteArray> QUtf16BECodec::aliases() const
571	{
572	QList<QByteArray> list;
573	return list;
574	}
575
576	int QUtf16LECodec::mibEnum() const
577	{
578	return 1014;
579	}
580
581	QByteArray QUtf16LECodec::name() const
582	{
583	return "UTF-16LE";
584	}
585
586	QList<QByteArray> QUtf16LECodec::aliases() const
587	{
588	QList<QByteArray> list;
589	return list;
590	}
591
592	QUtf32Codec::~QUtf32Codec()
593	{
594	}
595
596	QByteArray QUtf32Codec::convertFromUnicode(const QChar uc, int len, ConverterState state) const
597	{
598	return QUtf32::convertFromUnicode(uc, len, state, e);
599	}
600
601	QString QUtf32Codec::convertToUnicode(const char chars, int len, ConverterState state) const
602	{
603	return QUtf32::convertToUnicode(chars, len, state, e);
604	}
605
606	int QUtf32Codec::mibEnum() const
607	{
608	return 1017;
609	}
610
611	QByteArray QUtf32Codec::name() const
612	{
613	return "UTF-32";
614	}
615
616	QList<QByteArray> QUtf32Codec::aliases() const
617	{
618	QList<QByteArray> list;
619	return list;
620	}
621
622	int QUtf32BECodec::mibEnum() const
623	{
624	return 1018;
625	}
626
627	QByteArray QUtf32BECodec::name() const
628	{
629	return "UTF-32BE";
630	}
631
632	QList<QByteArray> QUtf32BECodec::aliases() const
633	{
634	QList<QByteArray> list;
635	return list;
636	}
637
638	int QUtf32LECodec::mibEnum() const
639	{
640	return 1019;
641	}
642
643	QByteArray QUtf32LECodec::name() const
644	{
645	return "UTF-32LE";
646	}
647
648	QList<QByteArray> QUtf32LECodec::aliases() const
649	{
650	QList<QByteArray> list;
651	return list;
652	}
653
654	#endif //QT_NO_TEXTCODEC
655
656	QT_END_NAMESPACE

Note: See TracBrowser for help on using the repository browser.

Download in other formats: