Context Navigation

charcnv.c@ 536

Visit:

Last change on this file since 536 was 341, checked in by Herwig Bauernfeind, 16 years ago
divers fix for Ticket #68 in 3.2
File size: 49.5 KB

Line
1	/*
2	Unix SMB/CIFS implementation.
3	Character set conversion Extensions
4	Copyright (C) Igor Vergeichik <[email protected]> 2001
5	Copyright (C) Andrew Tridgell 2001
6	Copyright (C) Simo Sorce 2001
7	Copyright (C) Martin Pool 2003
8
9	This program is free software; you can redistribute it and/or modify
10	it under the terms of the GNU General Public License as published by
11	the Free Software Foundation; either version 3 of the License, or
12	(at your option) any later version.
13
14	This program is distributed in the hope that it will be useful,
15	but WITHOUT ANY WARRANTY; without even the implied warranty of
16	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17	GNU General Public License for more details.
18
19	You should have received a copy of the GNU General Public License
20	along with this program. If not, see <http://www.gnu.org/licenses/>.
21
22	*/
23	#include "includes.h"
24
/**
 * Character substituted for input that cannot be converted.
 *
 * Hard-coded for now; we can parameterize this if someone complains.... JRA.
 *
 * @returns the substitute character, currently '_'
 **/
char lp_failed_convert_char(void)
{
	const char substitute = '_';

	return substitute;
}
31
32	/**
33	* @file
34	*
35	* @brief Character-set conversion routines built on our iconv.
36	*
37	* @note Samba's internal character set (at least in the 3.0 series)
38	* is always the same as the one for the Unix filesystem. It is
39	* <b>not</b> necessarily UTF-8 and may be different on machines that
40	* need i18n filenames to be compatible with Unix software. It does
41	* have to be a superset of ASCII. All multibyte sequences must start
42	* with a byte with the high bit set.
43	*
44	* @sa lib/iconv.c
45	*/
46
47
48	static smb_iconv_t conv_handles[NUM_CHARSETS][NUM_CHARSETS];
49	static bool conv_silent; /* Should we do a debug if the conversion fails ? */
50	static bool initialized;
51
52	/**
53	* Return the name of a charset to give to iconv().
54	**/
55	static const char *charset_name(charset_t ch)
56	{
57	const char *ret = NULL;
58	if (ch == CH_UTF16LE) ret = "UTF-16LE";
59	else if (ch == CH_UTF16BE) ret = "UTF-16BE";
60	else if (ch == CH_UNIX) ret = lp_unix_charset();
61	else if (ch == CH_DOS) ret = lp_dos_charset();
62	else if (ch == CH_DISPLAY) ret = lp_display_charset();
63	else if (ch == CH_UTF8) ret = "UTF8";
64
65	#if defined(HAVE_NL_LANGINFO) && defined(CODESET)
66	if (ret && !strcmp(ret, "LOCALE")) {
67	const char *ln = NULL;
68
69	#ifdef HAVE_SETLOCALE
70	setlocale(LC_ALL, "");
71	#endif
72	ln = nl_langinfo(CODESET);
73	if (ln) {
74	/* Check whether the charset name is supported
75	by iconv */
76	smb_iconv_t handle = smb_iconv_open(ln,"UCS-2LE");
77	if (handle == (smb_iconv_t) -1) {
78	DEBUG(5,("Locale charset '%s' unsupported, using ASCII instead\n", ln));
79	ln = NULL;
80	} else {
81	DEBUG(5,("Substituting charset '%s' for LOCALE\n", ln));
82	smb_iconv_close(handle);
83	}
84	}
85	ret = ln;
86	}
87	#endif
88
89	if (!ret \|\| !*ret) ret = "ASCII";
90	return ret;
91	}
92
93	void lazy_initialize_conv(void)
94	{
95	if (!initialized) {
96	load_case_tables();
97	init_iconv();
98	initialized = true;
99	}
100	}
101
102	/**
103	* Destroy global objects allocated by init_iconv()
104	**/
105	void gfree_charcnv(void)
106	{
107	int c1, c2;
108
109	for (c1=0;c1<NUM_CHARSETS;c1++) {
110	for (c2=0;c2<NUM_CHARSETS;c2++) {
111	if ( conv_handles[c1][c2] ) {
112	smb_iconv_close( conv_handles[c1][c2] );
113	conv_handles[c1][c2] = 0;
114	}
115	}
116	}
117	initialized = false;
118	}
119
120	/**
121	* Initialize iconv conversion descriptors.
122	*
123	* This is called the first time it is needed, and also called again
124	* every time the configuration is reloaded, because the charset or
125	* codepage might have changed.
126	**/
127	void init_iconv(void)
128	{
129	int c1, c2;
130	bool did_reload = False;
131
132	/* so that charset_name() works we need to get the UNIX<->UCS2 going
133	first */
134	if (!conv_handles[CH_UNIX][CH_UTF16LE])
135	conv_handles[CH_UNIX][CH_UTF16LE] = smb_iconv_open(charset_name(CH_UTF16LE), "ASCII");
136
137	if (!conv_handles[CH_UTF16LE][CH_UNIX])
138	conv_handles[CH_UTF16LE][CH_UNIX] = smb_iconv_open("ASCII", charset_name(CH_UTF16LE));
139
140	for (c1=0;c1<NUM_CHARSETS;c1++) {
141	for (c2=0;c2<NUM_CHARSETS;c2++) {
142	const char *n1 = charset_name((charset_t)c1);
143	const char *n2 = charset_name((charset_t)c2);
144	if (conv_handles[c1][c2] &&
145	strcmp(n1, conv_handles[c1][c2]->from_name) == 0 &&
146	strcmp(n2, conv_handles[c1][c2]->to_name) == 0)
147	continue;
148
149	did_reload = True;
150
151	if (conv_handles[c1][c2])
152	smb_iconv_close(conv_handles[c1][c2]);
153
154	conv_handles[c1][c2] = smb_iconv_open(n2,n1);
155	if (conv_handles[c1][c2] == (smb_iconv_t)-1) {
156	DEBUG(0,("init_iconv: Conversion from %s to %s not supported\n",
157	charset_name((charset_t)c1), charset_name((charset_t)c2)));
158	if (c1 != CH_UTF16LE && c1 != CH_UTF16BE) {
159	n1 = "ASCII";
160	}
161	if (c2 != CH_UTF16LE && c2 != CH_UTF16BE) {
162	n2 = "ASCII";
163	}
164	DEBUG(0,("init_iconv: Attempting to replace with conversion from %s to %s\n",
165	n1, n2 ));
166	conv_handles[c1][c2] = smb_iconv_open(n2,n1);
167	if (!conv_handles[c1][c2]) {
168	DEBUG(0,("init_iconv: Conversion from %s to %s failed", n1, n2));
169	smb_panic("init_iconv: conv_handle initialization failed");
170	}
171	}
172	}
173	}
174
175	if (did_reload) {
176	/* XXX: Does this really get called every time the dos
177	* codepage changes? */
178	/* XXX: Is the did_reload test too strict? */
179	conv_silent = True;
180	init_valid_table();
181	conv_silent = False;
182	}
183	}
184
185	/**
186	* Convert string from one encoding to another, making error checking etc
187	* Slow path version - uses (slow) iconv.
188	*
189	* @param src pointer to source string (multibyte or singlebyte)
190	* @param srclen length of the source string in bytes
191	* @param dest pointer to destination string (multibyte or singlebyte)
192	* @param destlen maximal length allowed for string
193	* @param allow_bad_conv determines if a "best effort" conversion is acceptable (never returns errors)
194	* @returns the number of bytes occupied in the destination
195	*
196	* Ensure the srclen contains the terminating zero.
197	*
198	**/
199
200	static size_t convert_string_internal(charset_t from, charset_t to,
201	void const *src, size_t srclen,
202	void *dest, size_t destlen, bool allow_bad_conv)
203	{
204	size_t i_len, o_len;
205	size_t retval;
206	const char* inbuf = (const char*)src;
207	char* outbuf = (char*)dest;
208	smb_iconv_t descriptor;
209
210	lazy_initialize_conv();
211
212	descriptor = conv_handles[from][to];
213
214	if (srclen == (size_t)-1) {
215	if (from == CH_UTF16LE \|\| from == CH_UTF16BE) {
216	srclen = (strlen_w((const smb_ucs2_t )src)+1) 2;
217	} else {
218	srclen = strlen((const char *)src)+1;
219	}
220	}
221
222
223	if (descriptor == (smb_iconv_t)-1 \|\| descriptor == (smb_iconv_t)0) {
224	if (!conv_silent)
225	DEBUG(0,("convert_string_internal: Conversion not supported.\n"));
226	return (size_t)-1;
227	}
228
229	i_len=srclen;
230	o_len=destlen;
231
232	again:
233
234	retval = smb_iconv(descriptor, &inbuf, &i_len, &outbuf, &o_len);
235	if(retval==(size_t)-1) {
236	const char *reason="unknown error";
237	switch(errno) {
238	case EINVAL:
239	reason="Incomplete multibyte sequence";
240	if (!conv_silent)
241	DEBUG(3,("convert_string_internal: Conversion error: %s(%s)\n",reason,inbuf));
242	if (allow_bad_conv)
243	goto use_as_is;
244	return (size_t)-1;
245	case E2BIG:
246	reason="No more room";
247	if (!conv_silent) {
248	if (from == CH_UNIX) {
249	DEBUG(3,("E2BIG: convert_string(%s,%s): srclen=%u destlen=%u - '%s'\n",
250	charset_name(from), charset_name(to),
251	(unsigned int)srclen, (unsigned int)destlen, (const char *)src));
252	} else {
253	DEBUG(3,("E2BIG: convert_string(%s,%s): srclen=%u destlen=%u\n",
254	charset_name(from), charset_name(to),
255	(unsigned int)srclen, (unsigned int)destlen));
256	}
257	}
258	break;
259	case EILSEQ:
260	reason="Illegal multibyte sequence";
261	if (!conv_silent)
262	DEBUG(3,("convert_string_internal: Conversion error: %s(%s)\n",reason,inbuf));
263	if (allow_bad_conv)
264	goto use_as_is;
265
266	return (size_t)-1;
267	default:
268	if (!conv_silent)
269	DEBUG(0,("convert_string_internal: Conversion error: %s(%s)\n",reason,inbuf));
270	return (size_t)-1;
271	}
272	/* smb_panic(reason); */
273	}
274	return destlen-o_len;
275
276	use_as_is:
277
278	/*
279	* Conversion not supported. This is actually an error, but there are so
280	* many misconfigured iconv systems and smb.conf's out there we can't just
281	* fail. Do a very bad conversion instead.... JRA.
282	*/
283
284	{
285	if (o_len == 0 \|\| i_len == 0)
286	return destlen - o_len;
287
288	if (((from == CH_UTF16LE)\|\|(from == CH_UTF16BE)) &&
289	((to != CH_UTF16LE)\|\|(to != CH_UTF16BE))) {
290	/* Can't convert from utf16 any endian to multibyte.
291	Replace with the default fail char.
292	*/
293	if (i_len < 2)
294	return destlen - o_len;
295	if (i_len >= 2) {
296	*outbuf = lp_failed_convert_char();
297
298	outbuf++;
299	o_len--;
300
301	inbuf += 2;
302	i_len -= 2;
303	}
304
305	if (o_len == 0 \|\| i_len == 0)
306	return destlen - o_len;
307
308	/* Keep trying with the next char... */
309	goto again;
310
311	} else if (from != CH_UTF16LE && from != CH_UTF16BE && to == CH_UTF16LE) {
312	/* Can't convert to UTF16LE - just widen by adding the
313	default fail char then zero.
314	*/
315	if (o_len < 2)
316	return destlen - o_len;
317
318	outbuf[0] = lp_failed_convert_char();
319	outbuf[1] = '\0';
320
321	inbuf++;
322	i_len--;
323
324	outbuf += 2;
325	o_len -= 2;
326
327	if (o_len == 0 \|\| i_len == 0)
328	return destlen - o_len;
329
330	/* Keep trying with the next char... */
331	goto again;
332
333	} else if (from != CH_UTF16LE && from != CH_UTF16BE &&
334	to != CH_UTF16LE && to != CH_UTF16BE) {
335	/* Failed multibyte to multibyte. Just copy the default fail char and
336	try again. */
337	outbuf[0] = lp_failed_convert_char();
338
339	inbuf++;
340	i_len--;
341
342	outbuf++;
343	o_len--;
344
345	if (o_len == 0 \|\| i_len == 0)
346	return destlen - o_len;
347
348	/* Keep trying with the next char... */
349	goto again;
350
351	} else {
352	/* Keep compiler happy.... */
353	return destlen - o_len;
354	}
355	}
356	}
357
358	/**
359	* Convert string from one encoding to another, making error checking etc
360	* Fast path version - handles ASCII first.
361	*
362	* @param src pointer to source string (multibyte or singlebyte)
363	* @param srclen length of the source string in bytes, or -1 for nul terminated.
364	* @param dest pointer to destination string (multibyte or singlebyte)
365	* @param destlen maximal length allowed for string - NEVER -1.
366	* @param allow_bad_conv determines if a "best effort" conversion is acceptable (never returns errors)
367	* @returns the number of bytes occupied in the destination
368	*
369	* Ensure the srclen contains the terminating zero.
370	*
371	* This function has been hand-tuned to provide a fast path.
372	* Don't change unless you really know what you are doing. JRA.
373	**/
374
375	size_t convert_string(charset_t from, charset_t to,
376	void const *src, size_t srclen,
377	void *dest, size_t destlen, bool allow_bad_conv)
378	{
379	/*
380	* NB. We deliberately don't do a strlen here if srclen == -1.
381	* This is very expensive over millions of calls and is taken
382	* care of in the slow path in convert_string_internal. JRA.
383	*/
384
385	#ifdef DEVELOPER
386	SMB_ASSERT(destlen != (size_t)-1);
387	#endif
388
389	if (srclen == 0)
390	return 0;
391
392	if (from != CH_UTF16LE && from != CH_UTF16BE && to != CH_UTF16LE && to != CH_UTF16BE) {
393	const unsigned char p = (const unsigned char )src;
394	unsigned char q = (unsigned char )dest;
395	size_t slen = srclen;
396	size_t dlen = destlen;
397	unsigned char lastp = '\0';
398	size_t retval = 0;
399
400	/* If all characters are ascii, fast path here. */
401	while (slen && dlen) {
402	if ((lastp = *p) <= 0x7f) {
403	q++ = p++;
404	if (slen != (size_t)-1) {
405	slen--;
406	}
407	dlen--;
408	retval++;
409	if (!lastp)
410	break;
411	} else {
412	#ifdef BROKEN_UNICODE_COMPOSE_CHARACTERS
413	goto general_case;
414	#else
415	size_t ret = convert_string_internal(from, to, p, slen, q, dlen, allow_bad_conv);
416	if (ret == (size_t)-1) {
417	return ret;
418	}
419	return retval + ret;
420	#endif
421	}
422	}
423	if (!dlen) {
424	/* Even if we fast path we should note if we ran out of room. */
425	if (((slen != (size_t)-1) && slen) \|\|
426	((slen == (size_t)-1) && lastp)) {
427	errno = E2BIG;
428	}
429	}
430	return retval;
431
432	} else if (from == CH_UTF16LE && to != CH_UTF16LE) {
433	const unsigned char p = (const unsigned char )src;
434	unsigned char q = (unsigned char )dest;
435	size_t retval = 0;
436	size_t slen = srclen;
437	size_t dlen = destlen;
438	unsigned char lastp = '\0';
439
440	/* If all characters are ascii, fast path here. */
441	while (((slen == (size_t)-1) \|\| (slen >= 2)) && dlen) {
442	if (((lastp = *p) <= 0x7f) && (p[1] == 0)) {
443	q++ = p;
444	if (slen != (size_t)-1) {
445	slen -= 2;
446	}
447	p += 2;
448	dlen--;
449	retval++;
450	if (!lastp)
451	break;
452	} else {
453	#ifdef BROKEN_UNICODE_COMPOSE_CHARACTERS
454	goto general_case;
455	#else
456	size_t ret = convert_string_internal(from, to, p, slen, q, dlen, allow_bad_conv);
457	if (ret == (size_t)-1) {
458	return ret;
459	}
460	return retval + ret;
461	#endif
462	}
463	}
464	if (!dlen) {
465	/* Even if we fast path we should note if we ran out of room. */
466	if (((slen != (size_t)-1) && slen) \|\|
467	((slen == (size_t)-1) && lastp)) {
468	errno = E2BIG;
469	}
470	}
471	return retval;
472
473	} else if (from != CH_UTF16LE && from != CH_UTF16BE && to == CH_UTF16LE) {
474	const unsigned char p = (const unsigned char )src;
475	unsigned char q = (unsigned char )dest;
476	size_t retval = 0;
477	size_t slen = srclen;
478	size_t dlen = destlen;
479	unsigned char lastp = '\0';
480
481	/* If all characters are ascii, fast path here. */
482	while (slen && (dlen >= 2)) {
483	if ((lastp = *p) <= 0x7F) {
484	q++ = p++;
485	*q++ = '\0';
486	if (slen != (size_t)-1) {
487	slen--;
488	}
489	dlen -= 2;
490	retval += 2;
491	if (!lastp)
492	break;
493	} else {
494	#ifdef BROKEN_UNICODE_COMPOSE_CHARACTERS
495	goto general_case;
496	#else
497	size_t ret = convert_string_internal(from, to, p, slen, q, dlen, allow_bad_conv);
498	if (ret == (size_t)-1) {
499	return ret;
500	}
501	return retval + ret;
502	#endif
503	}
504	}
505	if (!dlen) {
506	/* Even if we fast path we should note if we ran out of room. */
507	if (((slen != (size_t)-1) && slen) \|\|
508	((slen == (size_t)-1) && lastp)) {
509	errno = E2BIG;
510	}
511	}
512	return retval;
513	}
514
515	#ifdef BROKEN_UNICODE_COMPOSE_CHARACTERS
516	general_case:
517	#endif
518	return convert_string_internal(from, to, src, srclen, dest, destlen, allow_bad_conv);
519	}
520
521	/**
522	* Convert between character sets, allocating a new buffer for the result.
523	*
524	* @param ctx TALLOC_CTX to use to allocate with. If NULL use malloc.
525	* (this is a bad interface and needs fixing. JRA).
526	* @param srclen length of source buffer.
527	* @param dest always set at least to NULL
528	* @param converted_size set to the size of the allocated buffer on return
529	* true
530	* @note -1 is not accepted for srclen.
531	*
532	* @return True if new buffer was correctly allocated, and string was
533	* converted.
534	*
535	* Ensure the srclen contains the terminating zero.
536	*
537	* I hate the goto's in this function. It's embarrassing.....
538	* There has to be a cleaner way to do this. JRA.
539	**/
540
541	bool convert_string_allocate(TALLOC_CTX *ctx, charset_t from, charset_t to,
542	void const src, size_t srclen, void dst,
543	size_t *converted_size, bool allow_bad_conv)
544	{
545	size_t i_len, o_len, destlen = (srclen * 3) / 2;
546	size_t retval;
547	const char inbuf = (const char )src;
548	char outbuf = NULL, ob = NULL;
549	smb_iconv_t descriptor;
550	void dest = (void )dst;
551
552	*dest = NULL;
553
554	if (!converted_size) {
555	errno = EINVAL;
556	return false;
557	}
558
559	if (src == NULL \|\| srclen == (size_t)-1) {
560	errno = EINVAL;
561	return false;
562	}
563	if (srclen == 0) {
564	ob = ((ctx != NULL) ? talloc_strdup(ctx, "") : SMB_STRDUP(""));
565	if (ob == NULL) {
566	errno = ENOMEM;
567	return false;
568	}
569	*dest = ob;
570	*converted_size = 0;
571	return true;
572	}
573
574	lazy_initialize_conv();
575
576	descriptor = conv_handles[from][to];
577
578	if (descriptor == (smb_iconv_t)-1 \|\| descriptor == (smb_iconv_t)0) {
579	if (!conv_silent)
580	DEBUG(0,("convert_string_allocate: Conversion not supported.\n"));
581	errno = EOPNOTSUPP;
582	return false;
583	}
584
585	convert:
586
587	/* +2 is for ucs2 null termination. */
588	if ((destlen*2)+2 < destlen) {
589	/* wrapped ! abort. */
590	if (!conv_silent)
591	DEBUG(0, ("convert_string_allocate: destlen wrapped !\n"));
592	if (!ctx)
593	SAFE_FREE(outbuf);
594	errno = EOPNOTSUPP;
595	return false;
596	} else {
597	destlen = destlen * 2;
598	}
599
600	/* +2 is for ucs2 null termination. */
601	if (ctx) {
602	ob = (char *)TALLOC_REALLOC(ctx, ob, destlen + 2);
603	} else {
604	ob = (char *)SMB_REALLOC(ob, destlen + 2);
605	}
606
607	if (!ob) {
608	DEBUG(0, ("convert_string_allocate: realloc failed!\n"));
609	errno = ENOMEM;
610	return false;
611	}
612	outbuf = ob;
613	i_len = srclen;
614	o_len = destlen;
615
616	again:
617
618
619	retval = smb_iconv(descriptor,
620	&inbuf, &i_len,
621	&outbuf, &o_len);
622	if(retval == (size_t)-1) {
623	const char *reason="unknown error";
624	switch(errno) {
625	case EINVAL:
626	reason="Incomplete multibyte sequence";
627	if (!conv_silent)
628	DEBUG(3,("convert_string_allocate: Conversion error: %s(%s)\n",reason,inbuf));
629	if (allow_bad_conv)
630	goto use_as_is;
631	break;
632	case E2BIG:
633	goto convert;
634	case EILSEQ:
635	reason="Illegal multibyte sequence";
636	if (!conv_silent)
637	DEBUG(3,("convert_string_allocate: Conversion error: %s(%s)\n",reason,inbuf));
638	if (allow_bad_conv)
639	goto use_as_is;
640	break;
641	}
642	if (!conv_silent)
643	DEBUG(0,("Conversion error: %s(%s)\n",reason,inbuf));
644	/* smb_panic(reason); */
645	if (ctx) {
646	TALLOC_FREE(ob);
647	} else {
648	SAFE_FREE(ob);
649	}
650	return false;
651	}
652
653	out:
654
655	destlen = destlen - o_len;
656	/* Don't shrink unless we're reclaiming a lot of
657	* space. This is in the hot codepath and these
658	* reallocs cost. JRA.
659	*/
660	if (o_len > 1024) {
661	/* We're shrinking here so we know the +2 is safe from wrap. */
662	if (ctx) {
663	ob = (char *)TALLOC_REALLOC(ctx,ob,destlen + 2);
664	} else {
665	ob = (char *)SMB_REALLOC(ob,destlen + 2);
666	}
667	}
668
669	if (destlen && !ob) {
670	DEBUG(0, ("convert_string_allocate: out of memory!\n"));
671	errno = ENOMEM;
672	return false;
673	}
674
675	*dest = ob;
676
677	/* Must ucs2 null terminate in the extra space we allocated. */
678	ob[destlen] = '\0';
679	ob[destlen+1] = '\0';
680
681	*converted_size = destlen;
682	return true;
683
684	use_as_is:
685
686	/*
687	* Conversion not supported. This is actually an error, but there are so
688	* many misconfigured iconv systems and smb.conf's out there we can't just
689	* fail. Do a very bad conversion instead.... JRA.
690	*/
691
692	{
693	if (o_len == 0 \|\| i_len == 0)
694	goto out;
695
696	if (((from == CH_UTF16LE)\|\|(from == CH_UTF16BE)) &&
697	((to != CH_UTF16LE)\|\|(to != CH_UTF16BE))) {
698	/* Can't convert from utf16 any endian to multibyte.
699	Replace with the default fail char.
700	*/
701
702	if (i_len < 2)
703	goto out;
704
705	if (i_len >= 2) {
706	*outbuf = lp_failed_convert_char();
707
708	outbuf++;
709	o_len--;
710
711	inbuf += 2;
712	i_len -= 2;
713	}
714