source: branches/samba-3.2.x/source/lib/charcnv.c@ 536

Last change on this file since 536 was 341, checked in by Herwig Bauernfeind, 16 years ago

divers fix for Ticket #68 in 3.2

File size: 49.5 KB
Line 
1/*
2 Unix SMB/CIFS implementation.
3 Character set conversion Extensions
4 Copyright (C) Igor Vergeichik <[email protected]> 2001
5 Copyright (C) Andrew Tridgell 2001
6 Copyright (C) Simo Sorce 2001
7 Copyright (C) Martin Pool 2003
8
9 This program is free software; you can redistribute it and/or modify
10 it under the terms of the GNU General Public License as published by
11 the Free Software Foundation; either version 3 of the License, or
12 (at your option) any later version.
13
14 This program is distributed in the hope that it will be useful,
15 but WITHOUT ANY WARRANTY; without even the implied warranty of
16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 GNU General Public License for more details.
18
19 You should have received a copy of the GNU General Public License
20 along with this program. If not, see <http://www.gnu.org/licenses/>.
21
22*/
23#include "includes.h"
24
/*
 * Character substituted for input that cannot be converted.
 * We can parameterize this if someone complains.... JRA.
 */
char lp_failed_convert_char(void)
{
	const char fail_char = '_';

	return fail_char;
}
31
32/**
33 * @file
34 *
35 * @brief Character-set conversion routines built on our iconv.
36 *
37 * @note Samba's internal character set (at least in the 3.0 series)
38 * is always the same as the one for the Unix filesystem. It is
39 * <b>not</b> necessarily UTF-8 and may be different on machines that
40 * need i18n filenames to be compatible with Unix software. It does
41 * have to be a superset of ASCII. All multibyte sequences must start
42 * with a byte with the high bit set.
43 *
44 * @sa lib/iconv.c
45 */
46
47
/* Cached conversion descriptors, looked up as conv_handles[from][to].
 * A slot may hold 0 (never opened) or (smb_iconv_t)-1 (open failed). */
48static smb_iconv_t conv_handles[NUM_CHARSETS][NUM_CHARSETS];
/* When true, conversion failures are not reported through DEBUG(). */
49static bool conv_silent; /* Should we do a debug if the conversion fails ? */
/* Set by lazy_initialize_conv() after init_iconv() has run; cleared by
 * gfree_charcnv(). */
50static bool initialized;
51
52/**
53 * Return the name of a charset to give to iconv().
54 **/
55static const char *charset_name(charset_t ch)
56{
57 const char *ret = NULL;
58 if (ch == CH_UTF16LE) ret = "UTF-16LE";
59 else if (ch == CH_UTF16BE) ret = "UTF-16BE";
60 else if (ch == CH_UNIX) ret = lp_unix_charset();
61 else if (ch == CH_DOS) ret = lp_dos_charset();
62 else if (ch == CH_DISPLAY) ret = lp_display_charset();
63 else if (ch == CH_UTF8) ret = "UTF8";
64
65#if defined(HAVE_NL_LANGINFO) && defined(CODESET)
66 if (ret && !strcmp(ret, "LOCALE")) {
67 const char *ln = NULL;
68
69#ifdef HAVE_SETLOCALE
70 setlocale(LC_ALL, "");
71#endif
72 ln = nl_langinfo(CODESET);
73 if (ln) {
74 /* Check whether the charset name is supported
75 by iconv */
76 smb_iconv_t handle = smb_iconv_open(ln,"UCS-2LE");
77 if (handle == (smb_iconv_t) -1) {
78 DEBUG(5,("Locale charset '%s' unsupported, using ASCII instead\n", ln));
79 ln = NULL;
80 } else {
81 DEBUG(5,("Substituting charset '%s' for LOCALE\n", ln));
82 smb_iconv_close(handle);
83 }
84 }
85 ret = ln;
86 }
87#endif
88
89 if (!ret || !*ret) ret = "ASCII";
90 return ret;
91}
92
93void lazy_initialize_conv(void)
94{
95 if (!initialized) {
96 load_case_tables();
97 init_iconv();
98 initialized = true;
99 }
100}
101
102/**
103 * Destroy global objects allocated by init_iconv()
104 **/
105void gfree_charcnv(void)
106{
107 int c1, c2;
108
109 for (c1=0;c1<NUM_CHARSETS;c1++) {
110 for (c2=0;c2<NUM_CHARSETS;c2++) {
111 if ( conv_handles[c1][c2] ) {
112 smb_iconv_close( conv_handles[c1][c2] );
113 conv_handles[c1][c2] = 0;
114 }
115 }
116 }
117 initialized = false;
118}
119
120/**
121 * Initialize iconv conversion descriptors.
122 *
123 * This is called the first time it is needed, and also called again
124 * every time the configuration is reloaded, because the charset or
125 * codepage might have changed.
126 **/
127void init_iconv(void)
128{
129 int c1, c2;
130 bool did_reload = False;
131
132 /* so that charset_name() works we need to get the UNIX<->UCS2 going
133 first */
134 if (!conv_handles[CH_UNIX][CH_UTF16LE])
135 conv_handles[CH_UNIX][CH_UTF16LE] = smb_iconv_open(charset_name(CH_UTF16LE), "ASCII");
136
137 if (!conv_handles[CH_UTF16LE][CH_UNIX])
138 conv_handles[CH_UTF16LE][CH_UNIX] = smb_iconv_open("ASCII", charset_name(CH_UTF16LE));
139
140 for (c1=0;c1<NUM_CHARSETS;c1++) {
141 for (c2=0;c2<NUM_CHARSETS;c2++) {
142 const char *n1 = charset_name((charset_t)c1);
143 const char *n2 = charset_name((charset_t)c2);
144 if (conv_handles[c1][c2] &&
145 strcmp(n1, conv_handles[c1][c2]->from_name) == 0 &&
146 strcmp(n2, conv_handles[c1][c2]->to_name) == 0)
147 continue;
148
149 did_reload = True;
150
151 if (conv_handles[c1][c2])
152 smb_iconv_close(conv_handles[c1][c2]);
153
154 conv_handles[c1][c2] = smb_iconv_open(n2,n1);
155 if (conv_handles[c1][c2] == (smb_iconv_t)-1) {
156 DEBUG(0,("init_iconv: Conversion from %s to %s not supported\n",
157 charset_name((charset_t)c1), charset_name((charset_t)c2)));
158 if (c1 != CH_UTF16LE && c1 != CH_UTF16BE) {
159 n1 = "ASCII";
160 }
161 if (c2 != CH_UTF16LE && c2 != CH_UTF16BE) {
162 n2 = "ASCII";
163 }
164 DEBUG(0,("init_iconv: Attempting to replace with conversion from %s to %s\n",
165 n1, n2 ));
166 conv_handles[c1][c2] = smb_iconv_open(n2,n1);
167 if (!conv_handles[c1][c2]) {
168 DEBUG(0,("init_iconv: Conversion from %s to %s failed", n1, n2));
169 smb_panic("init_iconv: conv_handle initialization failed");
170 }
171 }
172 }
173 }
174
175 if (did_reload) {
176 /* XXX: Does this really get called every time the dos
177 * codepage changes? */
178 /* XXX: Is the did_reload test too strict? */
179 conv_silent = True;
180 init_valid_table();
181 conv_silent = False;
182 }
183}
184
185/**
186 * Convert string from one encoding to another, making error checking etc
187 * Slow path version - uses (slow) iconv.
188 *
189 * @param src pointer to source string (multibyte or singlebyte)
190 * @param srclen length of the source string in bytes
191 * @param dest pointer to destination string (multibyte or singlebyte)
192 * @param destlen maximal length allowed for string
193 * @param allow_bad_conv determines if a "best effort" conversion is acceptable (never returns errors)
194 * @returns the number of bytes occupied in the destination
195 *
196 * Ensure the srclen contains the terminating zero.
197 *
198 **/
199
200static size_t convert_string_internal(charset_t from, charset_t to,
201 void const *src, size_t srclen,
202 void *dest, size_t destlen, bool allow_bad_conv)
203{
204 size_t i_len, o_len;
205 size_t retval;
206 const char* inbuf = (const char*)src;
207 char* outbuf = (char*)dest;
208 smb_iconv_t descriptor;
209
210 lazy_initialize_conv();
211
212 descriptor = conv_handles[from][to];
213
214 if (srclen == (size_t)-1) {
215 if (from == CH_UTF16LE || from == CH_UTF16BE) {
216 srclen = (strlen_w((const smb_ucs2_t *)src)+1) * 2;
217 } else {
218 srclen = strlen((const char *)src)+1;
219 }
220 }
221
222
223 if (descriptor == (smb_iconv_t)-1 || descriptor == (smb_iconv_t)0) {
224 if (!conv_silent)
225 DEBUG(0,("convert_string_internal: Conversion not supported.\n"));
226 return (size_t)-1;
227 }
228
229 i_len=srclen;
230 o_len=destlen;
231
232 again:
233
234 retval = smb_iconv(descriptor, &inbuf, &i_len, &outbuf, &o_len);
235 if(retval==(size_t)-1) {
236 const char *reason="unknown error";
237 switch(errno) {
238 case EINVAL:
239 reason="Incomplete multibyte sequence";
240 if (!conv_silent)
241 DEBUG(3,("convert_string_internal: Conversion error: %s(%s)\n",reason,inbuf));
242 if (allow_bad_conv)
243 goto use_as_is;
244 return (size_t)-1;
245 case E2BIG:
246 reason="No more room";
247 if (!conv_silent) {
248 if (from == CH_UNIX) {
249 DEBUG(3,("E2BIG: convert_string(%s,%s): srclen=%u destlen=%u - '%s'\n",
250 charset_name(from), charset_name(to),
251 (unsigned int)srclen, (unsigned int)destlen, (const char *)src));
252 } else {
253 DEBUG(3,("E2BIG: convert_string(%s,%s): srclen=%u destlen=%u\n",
254 charset_name(from), charset_name(to),
255 (unsigned int)srclen, (unsigned int)destlen));
256 }
257 }
258 break;
259 case EILSEQ:
260 reason="Illegal multibyte sequence";
261 if (!conv_silent)
262 DEBUG(3,("convert_string_internal: Conversion error: %s(%s)\n",reason,inbuf));
263 if (allow_bad_conv)
264 goto use_as_is;
265
266 return (size_t)-1;
267 default:
268 if (!conv_silent)
269 DEBUG(0,("convert_string_internal: Conversion error: %s(%s)\n",reason,inbuf));
270 return (size_t)-1;
271 }
272 /* smb_panic(reason); */
273 }
274 return destlen-o_len;
275
276 use_as_is:
277
278 /*
279 * Conversion not supported. This is actually an error, but there are so
280 * many misconfigured iconv systems and smb.conf's out there we can't just
281 * fail. Do a very bad conversion instead.... JRA.
282 */
283
284 {
285 if (o_len == 0 || i_len == 0)
286 return destlen - o_len;
287
288 if (((from == CH_UTF16LE)||(from == CH_UTF16BE)) &&
289 ((to != CH_UTF16LE)||(to != CH_UTF16BE))) {
290 /* Can't convert from utf16 any endian to multibyte.
291 Replace with the default fail char.
292 */
293 if (i_len < 2)
294 return destlen - o_len;
295 if (i_len >= 2) {
296 *outbuf = lp_failed_convert_char();
297
298 outbuf++;
299 o_len--;
300
301 inbuf += 2;
302 i_len -= 2;
303 }
304
305 if (o_len == 0 || i_len == 0)
306 return destlen - o_len;
307
308 /* Keep trying with the next char... */
309 goto again;
310
311 } else if (from != CH_UTF16LE && from != CH_UTF16BE && to == CH_UTF16LE) {
312 /* Can't convert to UTF16LE - just widen by adding the
313 default fail char then zero.
314 */
315 if (o_len < 2)
316 return destlen - o_len;
317
318 outbuf[0] = lp_failed_convert_char();
319 outbuf[1] = '\0';
320
321 inbuf++;
322 i_len--;
323
324 outbuf += 2;
325 o_len -= 2;
326
327 if (o_len == 0 || i_len == 0)
328 return destlen - o_len;
329
330 /* Keep trying with the next char... */
331 goto again;
332
333 } else if (from != CH_UTF16LE && from != CH_UTF16BE &&
334 to != CH_UTF16LE && to != CH_UTF16BE) {
335 /* Failed multibyte to multibyte. Just copy the default fail char and
336 try again. */
337 outbuf[0] = lp_failed_convert_char();
338
339 inbuf++;
340 i_len--;
341
342 outbuf++;
343 o_len--;
344
345 if (o_len == 0 || i_len == 0)
346 return destlen - o_len;
347
348 /* Keep trying with the next char... */
349 goto again;
350
351 } else {
352 /* Keep compiler happy.... */
353 return destlen - o_len;
354 }
355 }
356}
357
358/**
359 * Convert string from one encoding to another, making error checking etc
360 * Fast path version - handles ASCII first.
361 *
362 * @param src pointer to source string (multibyte or singlebyte)
363 * @param srclen length of the source string in bytes, or -1 for nul terminated.
364 * @param dest pointer to destination string (multibyte or singlebyte)
365 * @param destlen maximal length allowed for string - *NEVER* -1.
366 * @param allow_bad_conv determines if a "best effort" conversion is acceptable (never returns errors)
367 * @returns the number of bytes occupied in the destination
368 *
369 * Ensure the srclen contains the terminating zero.
370 *
371 * This function has been hand-tuned to provide a fast path.
372 * Don't change unless you really know what you are doing. JRA.
373 **/
374
375size_t convert_string(charset_t from, charset_t to,
376 void const *src, size_t srclen,
377 void *dest, size_t destlen, bool allow_bad_conv)
378{
379 /*
380 * NB. We deliberately don't do a strlen here if srclen == -1.
381 * This is very expensive over millions of calls and is taken
382 * care of in the slow path in convert_string_internal. JRA.
383 */
384
385#ifdef DEVELOPER
386 SMB_ASSERT(destlen != (size_t)-1);
387#endif
388
389 if (srclen == 0)
390 return 0;
391
392 if (from != CH_UTF16LE && from != CH_UTF16BE && to != CH_UTF16LE && to != CH_UTF16BE) {
393 const unsigned char *p = (const unsigned char *)src;
394 unsigned char *q = (unsigned char *)dest;
395 size_t slen = srclen;
396 size_t dlen = destlen;
397 unsigned char lastp = '\0';
398 size_t retval = 0;
399
400 /* If all characters are ascii, fast path here. */
401 while (slen && dlen) {
402 if ((lastp = *p) <= 0x7f) {
403 *q++ = *p++;
404 if (slen != (size_t)-1) {
405 slen--;
406 }
407 dlen--;
408 retval++;
409 if (!lastp)
410 break;
411 } else {
412#ifdef BROKEN_UNICODE_COMPOSE_CHARACTERS
413 goto general_case;
414#else
415 size_t ret = convert_string_internal(from, to, p, slen, q, dlen, allow_bad_conv);
416 if (ret == (size_t)-1) {
417 return ret;
418 }
419 return retval + ret;
420#endif
421 }
422 }
423 if (!dlen) {
424 /* Even if we fast path we should note if we ran out of room. */
425 if (((slen != (size_t)-1) && slen) ||
426 ((slen == (size_t)-1) && lastp)) {
427 errno = E2BIG;
428 }
429 }
430 return retval;
431
432 } else if (from == CH_UTF16LE && to != CH_UTF16LE) {
433 const unsigned char *p = (const unsigned char *)src;
434 unsigned char *q = (unsigned char *)dest;
435 size_t retval = 0;
436 size_t slen = srclen;
437 size_t dlen = destlen;
438 unsigned char lastp = '\0';
439
440 /* If all characters are ascii, fast path here. */
441 while (((slen == (size_t)-1) || (slen >= 2)) && dlen) {
442 if (((lastp = *p) <= 0x7f) && (p[1] == 0)) {
443 *q++ = *p;
444 if (slen != (size_t)-1) {
445 slen -= 2;
446 }
447 p += 2;
448 dlen--;
449 retval++;
450 if (!lastp)
451 break;
452 } else {
453#ifdef BROKEN_UNICODE_COMPOSE_CHARACTERS
454 goto general_case;
455#else
456 size_t ret = convert_string_internal(from, to, p, slen, q, dlen, allow_bad_conv);
457 if (ret == (size_t)-1) {
458 return ret;
459 }
460 return retval + ret;
461#endif
462 }
463 }
464 if (!dlen) {
465 /* Even if we fast path we should note if we ran out of room. */
466 if (((slen != (size_t)-1) && slen) ||
467 ((slen == (size_t)-1) && lastp)) {
468 errno = E2BIG;
469 }
470 }
471 return retval;
472
473 } else if (from != CH_UTF16LE && from != CH_UTF16BE && to == CH_UTF16LE) {
474 const unsigned char *p = (const unsigned char *)src;
475 unsigned char *q = (unsigned char *)dest;
476 size_t retval = 0;
477 size_t slen = srclen;
478 size_t dlen = destlen;
479 unsigned char lastp = '\0';
480
481 /* If all characters are ascii, fast path here. */
482 while (slen && (dlen >= 2)) {
483 if ((lastp = *p) <= 0x7F) {
484 *q++ = *p++;
485 *q++ = '\0';
486 if (slen != (size_t)-1) {
487 slen--;
488 }
489 dlen -= 2;
490 retval += 2;
491 if (!lastp)
492 break;
493 } else {
494#ifdef BROKEN_UNICODE_COMPOSE_CHARACTERS
495 goto general_case;
496#else
497 size_t ret = convert_string_internal(from, to, p, slen, q, dlen, allow_bad_conv);
498 if (ret == (size_t)-1) {
499 return ret;
500 }
501 return retval + ret;
502#endif
503 }
504 }
505 if (!dlen) {
506 /* Even if we fast path we should note if we ran out of room. */
507 if (((slen != (size_t)-1) && slen) ||
508 ((slen == (size_t)-1) && lastp)) {
509 errno = E2BIG;
510 }
511 }
512 return retval;
513 }
514
515#ifdef BROKEN_UNICODE_COMPOSE_CHARACTERS
516 general_case:
517#endif
518 return convert_string_internal(from, to, src, srclen, dest, destlen, allow_bad_conv);
519}
520
521/**
522 * Convert between character sets, allocating a new buffer for the result.
523 *
524 * @param ctx TALLOC_CTX to use to allocate with. If NULL use malloc.
525 * (this is a bad interface and needs fixing. JRA).
526 * @param srclen length of source buffer.
527 * @param dest always set at least to NULL
528 * @param converted_size set to the number of bytes converted into the new
529 * buffer (not counting the terminator) when the function returns true
530 * @note -1 is not accepted for srclen.
531 *
532 * @return True if new buffer was correctly allocated, and string was
533 * converted.
534 *
535 * Ensure the srclen contains the terminating zero.
536 *
537 * I hate the goto's in this function. It's embarrassing.....
538 * There has to be a cleaner way to do this. JRA.
539 **/
540
541bool convert_string_allocate(TALLOC_CTX *ctx, charset_t from, charset_t to,
542 void const *src, size_t srclen, void *dst,
543 size_t *converted_size, bool allow_bad_conv)
544{
545 size_t i_len, o_len, destlen = (srclen * 3) / 2;
546 size_t retval;
547 const char *inbuf = (const char *)src;
548 char *outbuf = NULL, *ob = NULL;
549 smb_iconv_t descriptor;
550 void **dest = (void **)dst;
551
552 *dest = NULL;
553
554 if (!converted_size) {
555 errno = EINVAL;
556 return false;
557 }
558
559 if (src == NULL || srclen == (size_t)-1) {
560 errno = EINVAL;
561 return false;
562 }
563 if (srclen == 0) {
564 ob = ((ctx != NULL) ? talloc_strdup(ctx, "") : SMB_STRDUP(""));
565 if (ob == NULL) {
566 errno = ENOMEM;
567 return false;
568 }
569 *dest = ob;
570 *converted_size = 0;
571 return true;
572 }
573
574 lazy_initialize_conv();
575
576 descriptor = conv_handles[from][to];
577
578 if (descriptor == (smb_iconv_t)-1 || descriptor == (smb_iconv_t)0) {
579 if (!conv_silent)
580 DEBUG(0,("convert_string_allocate: Conversion not supported.\n"));
581 errno = EOPNOTSUPP;
582 return false;
583 }
584
585 convert:
586
587 /* +2 is for ucs2 null termination. */
588 if ((destlen*2)+2 < destlen) {
589 /* wrapped ! abort. */
590 if (!conv_silent)
591 DEBUG(0, ("convert_string_allocate: destlen wrapped !\n"));
592 if (!ctx)
593 SAFE_FREE(outbuf);
594 errno = EOPNOTSUPP;
595 return false;
596 } else {
597 destlen = destlen * 2;
598 }
599
600 /* +2 is for ucs2 null termination. */
601 if (ctx) {
602 ob = (char *)TALLOC_REALLOC(ctx, ob, destlen + 2);
603 } else {
604 ob = (char *)SMB_REALLOC(ob, destlen + 2);
605 }
606
607 if (!ob) {
608 DEBUG(0, ("convert_string_allocate: realloc failed!\n"));
609 errno = ENOMEM;
610 return false;
611 }
612 outbuf = ob;
613 i_len = srclen;
614 o_len = destlen;
615
616 again:
617
618
619 retval = smb_iconv(descriptor,
620 &inbuf, &i_len,
621 &outbuf, &o_len);
622 if(retval == (size_t)-1) {
623 const char *reason="unknown error";
624 switch(errno) {
625 case EINVAL:
626 reason="Incomplete multibyte sequence";
627 if (!conv_silent)
628 DEBUG(3,("convert_string_allocate: Conversion error: %s(%s)\n",reason,inbuf));
629 if (allow_bad_conv)
630 goto use_as_is;
631 break;
632 case E2BIG:
633 goto convert;
634 case EILSEQ:
635 reason="Illegal multibyte sequence";
636 if (!conv_silent)
637 DEBUG(3,("convert_string_allocate: Conversion error: %s(%s)\n",reason,inbuf));
638 if (allow_bad_conv)
639 goto use_as_is;
640 break;
641 }
642 if (!conv_silent)
643 DEBUG(0,("Conversion error: %s(%s)\n",reason,inbuf));
644 /* smb_panic(reason); */
645 if (ctx) {
646 TALLOC_FREE(ob);
647 } else {
648 SAFE_FREE(ob);
649 }
650 return false;
651 }
652
653 out:
654
655 destlen = destlen - o_len;
656 /* Don't shrink unless we're reclaiming a lot of
657 * space. This is in the hot codepath and these
658 * reallocs *cost*. JRA.
659 */
660 if (o_len > 1024) {
661 /* We're shrinking here so we know the +2 is safe from wrap. */
662 if (ctx) {
663 ob = (char *)TALLOC_REALLOC(ctx,ob,destlen + 2);
664 } else {
665 ob = (char *)SMB_REALLOC(ob,destlen + 2);
666 }
667 }
668
669 if (destlen && !ob) {
670 DEBUG(0, ("convert_string_allocate: out of memory!\n"));
671 errno = ENOMEM;
672 return false;
673 }
674
675 *dest = ob;
676
677 /* Must ucs2 null terminate in the extra space we allocated. */
678 ob[destlen] = '\0';
679 ob[destlen+1] = '\0';
680
681 *converted_size = destlen;
682 return true;
683
684 use_as_is:
685
686 /*
687 * Conversion not supported. This is actually an error, but there are so
688 * many misconfigured iconv systems and smb.conf's out there we can't just
689 * fail. Do a very bad conversion instead.... JRA.
690 */
691
692 {
693 if (o_len == 0 || i_len == 0)
694 goto out;
695
696 if (((from == CH_UTF16LE)||(from == CH_UTF16BE)) &&
697 ((to != CH_UTF16LE)||(to != CH_UTF16BE))) {
698 /* Can't convert from utf16 any endian to multibyte.
699 Replace with the default fail char.
700 */
701
702 if (i_len < 2)
703 goto out;
704
705 if (i_len >= 2) {
706 *outbuf = lp_failed_convert_char();
707
708 outbuf++;
709 o_len--;
710
711 inbuf += 2;
712 i_len -= 2;
713 }
714