Line data Source code
1 : /*
2 : Unix SMB/CIFS implementation.
3 : Character set conversion Extensions
4 : Copyright (C) Igor Vergeichik <iverg@mail.ru> 2001
5 : Copyright (C) Andrew Tridgell 2001-2011
6 : Copyright (C) Andrew Bartlett 2011
7 : Copyright (C) Simo Sorce 2001
8 : Copyright (C) Martin Pool 2003
9 :
10 : This program is free software; you can redistribute it and/or modify
11 : it under the terms of the GNU General Public License as published by
12 : the Free Software Foundation; either version 3 of the License, or
13 : (at your option) any later version.
14 :
15 : This program is distributed in the hope that it will be useful,
16 : but WITHOUT ANY WARRANTY; without even the implied warranty of
17 : MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 : GNU General Public License for more details.
19 :
20 : You should have received a copy of the GNU General Public License
21 : along with this program. If not, see <http://www.gnu.org/licenses/>.
22 :
23 : */
24 : #include "replace.h"
25 : #include "system/iconv.h"
26 : #include "charset.h"
27 : #include "lib/util/debug.h"
28 : #include "lib/util/fault.h"
29 :
30 : /**
31 : * @file
32 : *
33 : * @brief Character-set conversion routines built on our iconv.
34 : *
35 : * @note Samba's internal character set (at least in the 3.0 series)
36 : * is always the same as the one for the Unix filesystem. It is
37 : * <b>not</b> necessarily UTF-8 and may be different on machines that
38 : * need i18n filenames to be compatible with Unix software. It does
39 : * have to be a superset of ASCII. All multibyte sequences must start
40 : * with a byte with the high bit set.
41 : *
42 : * @sa lib/iconv.c
43 : */
44 :
45 :
46 : /**
47 : * Convert string from one encoding to another, making error checking etc
48 : * Slow path version - uses (slow) iconv.
49 : *
50 : * @param src pointer to source string (multibyte or singlebyte)
51 : * @param srclen length of the source string in bytes
52 : * @param dest pointer to destination string (multibyte or singlebyte)
53 : * @param destlen maximal length allowed for string
54 : * @param converted size is the number of bytes occupied in the destination
55 : *
56 : * @returns false and sets errno on fail, true on success.
57 : *
58 : * Ensure the srclen contains the terminating zero.
59 : *
60 : **/
61 :
62 43860 : static bool convert_string_internal(struct smb_iconv_handle *ic,
63 : charset_t from, charset_t to,
64 : void const *src, size_t srclen,
65 : void *dest, size_t destlen, size_t *converted_size)
66 : {
67 4376 : size_t i_len, o_len;
68 4376 : size_t retval;
69 43860 : const char* inbuf = (const char*)src;
70 43860 : char* outbuf = (char*)dest;
71 4376 : smb_iconv_t descriptor;
72 :
73 43860 : descriptor = get_conv_handle(ic, from, to);
74 :
75 43860 : if (srclen == (size_t)-1) {
76 95 : if (from == CH_UTF16LE || from == CH_UTF16BE) {
77 30 : srclen = (strlen_w((const smb_ucs2_t *)src)+1) * 2;
78 : } else {
79 65 : srclen = strlen((const char *)src)+1;
80 : }
81 : }
82 :
83 :
84 43860 : if (descriptor == (smb_iconv_t)-1 || descriptor == (smb_iconv_t)0) {
85 0 : errno = EINVAL;
86 0 : return false;
87 : }
88 :
89 43860 : i_len=srclen;
90 43860 : o_len=destlen;
91 :
92 43860 : retval = smb_iconv(descriptor, &inbuf, &i_len, &outbuf, &o_len);
93 43860 : *converted_size = destlen-o_len;
94 :
95 43860 : return (retval != (size_t)-1);
96 : }
97 :
98 : /**
99 : * Convert string from one encoding to another, making error checking etc
100 : * Fast path version - handles ASCII first.
101 : *
102 : * @param src pointer to source string (multibyte or singlebyte)
103 : * @param srclen length of the source string in bytes, or -1 for nul terminated.
104 : * @param dest pointer to destination string (multibyte or singlebyte)
105 : * @param destlen maximal length allowed for string - *NEVER* -1.
106 : * @param converted size is the number of bytes occupied in the destination
107 : *
108 : * @returns false and sets errno on fail, true on success.
109 : *
110 : * Ensure the srclen contains the terminating zero.
111 : *
112 : * This function has been hand-tuned to provide a fast path.
113 : * Don't change unless you really know what you are doing. JRA.
114 : **/
115 :
116 15440596 : bool convert_string_error_handle(struct smb_iconv_handle *ic,
117 : charset_t from, charset_t to,
118 : void const *src, size_t srclen,
119 : void *dest, size_t destlen,
120 : size_t *converted_size)
121 : {
122 : /*
123 : * NB. We deliberately don't do a strlen here if srclen == -1.
124 : * This is very expensive over millions of calls and is taken
125 : * care of in the slow path in convert_string_internal. JRA.
126 : */
127 :
128 : #ifdef DEVELOPER
129 15440596 : SMB_ASSERT(destlen != (size_t)-1);
130 : #endif
131 :
132 15440596 : if (srclen == 0) {
133 1695681 : *converted_size = 0;
134 1695681 : return true;
135 : }
136 :
137 13744915 : if (from != CH_UTF16LE && from != CH_UTF16BE && to != CH_UTF16LE && to != CH_UTF16BE) {
138 7097217 : const unsigned char *p = (const unsigned char *)src;
139 7097217 : unsigned char *q = (unsigned char *)dest;
140 7097217 : size_t slen = srclen;
141 7097217 : size_t dlen = destlen;
142 7097217 : unsigned char lastp = '\0';
143 7097217 : size_t retval = 0;
144 :
145 : /* If all characters are ascii, fast path here. */
146 320785034 : while (slen && dlen) {
147 313714037 : if ((lastp = *p) <= 0x7f) {
148 313712040 : *q++ = *p++;
149 313712040 : if (slen != (size_t)-1) {
150 312869191 : slen--;
151 : }
152 313712040 : dlen--;
153 313712040 : retval++;
154 313712040 : if (!lastp)
155 167946 : break;
156 : } else {
157 : #ifdef BROKEN_UNICODE_COMPOSE_CHARACTERS
158 : goto general_case;
159 : #else
160 1997 : bool ret = convert_string_internal(ic, from, to, p, slen, q, dlen, converted_size);
161 1997 : *converted_size += retval;
162 1997 : return ret;
163 : #endif
164 : }
165 : }
166 :
167 7241007 : *converted_size = retval;
168 :
169 7241007 : if (!dlen) {
170 : /* Even if we fast path we should note if we ran out of room. */
171 588435 : if (((slen != (size_t)-1) && slen) ||
172 9642 : ((slen == (size_t)-1) && lastp)) {
173 4329 : errno = E2BIG;
174 4329 : return false;
175 : }
176 : }
177 7091050 : return true;
178 6501911 : } else if (from == CH_UTF16LE && to != CH_UTF16LE) {
179 65 : const unsigned char *p = (const unsigned char *)src;
180 65 : unsigned char *q = (unsigned char *)dest;
181 65 : size_t retval = 0;
182 65 : size_t slen = srclen;
183 65 : size_t dlen = destlen;
184 65 : unsigned char lastp = '\0';
185 : #ifndef BROKEN_UNICODE_COMPOSE_CHARACTERS
186 65 : bool ret;
187 : #endif
188 :
189 65 : if (slen == (size_t)-1) {
190 1930 : while (dlen &&
191 1936 : ((lastp = *p) <= 0x7f) && (p[1] == 0)) {
192 1906 : *q++ = *p;
193 1906 : p += 2;
194 1906 : dlen--;
195 1906 : retval++;
196 1906 : if (!lastp)
197 0 : break;
198 : }
199 38 : if (lastp != 0) goto slow_path;
200 : } else {
201 1022 : while (slen >= 2 && dlen &&
202 516 : (*p <= 0x7f) && (p[1] == 0)) {
203 489 : *q++ = *p;
204 489 : slen -= 2;
205 489 : p += 2;
206 489 : dlen--;
207 489 : retval++;
208 : }
209 27 : if (slen != 0) goto slow_path;
210 : }
211 :
212 10 : *converted_size = retval;
213 :
214 10 : if (!dlen) {
215 : /* Even if we fast path we should note if we ran out of room. */
216 0 : if (((slen != (size_t)-1) && slen) ||
217 0 : ((slen == (size_t)-1) && lastp)) {
218 0 : errno = E2BIG;
219 0 : return false;
220 : }
221 : }
222 10 : return true;
223 :
224 55 : slow_path:
225 : /* come here when we hit a character we can't deal
226 : * with in the fast path
227 : */
228 : #ifdef BROKEN_UNICODE_COMPOSE_CHARACTERS
229 : goto general_case;
230 : #else
231 55 : ret = convert_string_internal(ic, from, to, p, slen, q, dlen, converted_size);
232 55 : *converted_size += retval;
233 55 : return ret;
234 : #endif
235 :
236 6501846 : } else if (from != CH_UTF16LE && from != CH_UTF16BE && to == CH_UTF16LE) {
237 6404894 : const unsigned char *p = (const unsigned char *)src;
238 6404894 : unsigned char *q = (unsigned char *)dest;
239 6404894 : size_t retval = 0;
240 6404894 : size_t slen = srclen;
241 6404894 : size_t dlen = destlen;
242 6404894 : unsigned char lastp = '\0';
243 :
244 : /* If all characters are ascii, fast path here. */
245 342303738 : while (slen && (dlen >= 1)) {
246 336356702 : if (dlen >=2 && (lastp = *p) <= 0x7F) {
247 336353088 : *q++ = *p++;
248 336353088 : *q++ = '\0';
249 336353088 : if (slen != (size_t)-1) {
250 333992769 : slen--;
251 : }
252 336353088 : dlen -= 2;
253 336353088 : retval += 2;
254 336353088 : if (!lastp)
255 501614 : break;
256 : } else {
257 : #ifdef BROKEN_UNICODE_COMPOSE_CHARACTERS
258 : goto general_case;
259 : #else
260 3614 : bool ret = convert_string_internal(ic, from, to, p, slen, q, dlen, converted_size);
261 3614 : *converted_size += retval;
262 3614 : return ret;
263 : #endif
264 : }
265 : }
266 :
267 6460038 : *converted_size = retval;
268 :
269 6460038 : if (!dlen) {
270 : /* Even if we fast path we should note if we ran out of room. */
271 1017187 : if (((slen != (size_t)-1) && slen) ||
272 41299 : ((slen == (size_t)-1) && lastp)) {
273 4 : errno = E2BIG;
274 4 : return false;
275 : }
276 : }
277 6401310 : return true;
278 : }
279 :
280 : #ifdef BROKEN_UNICODE_COMPOSE_CHARACTERS
281 : general_case:
282 : #endif
283 38194 : return convert_string_internal(ic, from, to, src, srclen, dest, destlen, converted_size);
284 : }
285 :
286 15348102 : bool convert_string_handle(struct smb_iconv_handle *ic,
287 : charset_t from, charset_t to,
288 : void const *src, size_t srclen,
289 : void *dest, size_t destlen,
290 : size_t *converted_size)
291 : {
292 15348102 : bool ret = convert_string_error_handle(ic, from, to, src, srclen, dest, destlen, converted_size);
293 :
294 15348102 : if(ret==false) {
295 100 : const char *reason="unknown error";
296 100 : switch(errno) {
297 40 : case EINVAL:
298 40 : reason="Incomplete multibyte sequence";
299 40 : DBG_NOTICE("Conversion error: %s\n",
300 : reason);
301 40 : break;
302 20 : case E2BIG:
303 : {
304 20 : reason="No more room";
305 20 : if (from == CH_UNIX) {
306 20 : DBG_NOTICE("E2BIG: convert_string(%s,%s): srclen=%u destlen=%u error: %s\n",
307 : charset_name(ic, from), charset_name(ic, to),
308 : (unsigned int)srclen, (unsigned int)destlen, reason);
309 : } else {
310 0 : DBG_NOTICE("E2BIG: convert_string(%s,%s): srclen=%u destlen=%u error: %s\n",
311 : charset_name(ic, from), charset_name(ic, to),
312 : (unsigned int)srclen, (unsigned int)destlen, reason);
313 : }
314 20 : break;
315 : }
316 40 : case EILSEQ:
317 40 : reason="Illegal multibyte sequence";
318 40 : DBG_NOTICE("convert_string_internal: Conversion error: %s\n",
319 : reason);
320 40 : break;
321 0 : default:
322 0 : DBG_ERR("convert_string_internal: Conversion error: %s\n",
323 : reason);
324 0 : break;
325 : }
326 : /* smb_panic(reason); */
327 : }
328 15348102 : return ret;
329 : }
330 :
331 :
332 : /**
333 : * Convert between character sets, allocating a new buffer using talloc for the result.
334 : *
335 : * @param srclen length of source buffer.
336 : * @param dest always set at least to NULL
337 : * @param converted_size set to the number of bytes occupied by the string in
338 : * the destination on success.
339 : * @note -1 is not accepted for srclen.
340 : *
341 : * @return true if new buffer was correctly allocated, and string was
342 : * converted.
343 : *
344 : * Ensure the srclen contains the terminating zero.
345 : */
346 142210991 : bool convert_string_talloc_handle(TALLOC_CTX *ctx, struct smb_iconv_handle *ic,
347 : charset_t from, charset_t to,
348 : void const *src, size_t srclen, void *dst,
349 : size_t *converted_size)
350 :
351 : {
352 2525048 : size_t i_len, o_len, destlen;
353 2525048 : size_t retval;
354 142210991 : const char *inbuf = NULL;
355 142210991 : char *outbuf = NULL, *ob = NULL;
356 2525048 : smb_iconv_t descriptor;
357 142210991 : void **dest = dst;
358 :
359 142210991 : *dest = NULL;
360 142210991 : if (converted_size != NULL) {
361 142205379 : *converted_size = 0;
362 : }
363 :
364 142210991 : if (src == NULL || srclen == (size_t)-1) {
365 2 : errno = EINVAL;
366 2 : return false;
367 : }
368 :
369 142210989 : if (srclen == 0) {
370 : /* We really should treat this as an error, but
371 : there are too many callers that need this to
372 : return a NULL terminated string in the correct
373 : character set. */
374 35026 : if (to == CH_UTF16LE|| to == CH_UTF16BE || to == CH_UTF16MUNGED) {
375 4717 : destlen = 2;
376 : } else {
377 30309 : destlen = 1;
378 : }
379 35026 : ob = talloc_zero_array(ctx, char, destlen);
380 35026 : if (ob == NULL) {
381 0 : DBG_ERR("Could not talloc destination buffer.\n");
382 0 : errno = ENOMEM;
383 0 : return false;
384 : }
385 35026 : if (converted_size != NULL) {
386 34896 : *converted_size = destlen;
387 : }
388 35026 : *dest = ob;
389 35026 : return true;
390 : }
391 :
392 142175963 : descriptor = get_conv_handle(ic, from, to);
393 :
394 142175963 : if (descriptor == (smb_iconv_t)-1 || descriptor == (smb_iconv_t)0) {
395 0 : DEBUG(0,("convert_string_talloc: Conversion not supported.\n"));
396 0 : errno = EOPNOTSUPP;
397 0 : return false;
398 : }
399 :
400 142175963 : if (srclen >= (SIZE_MAX - 2) / 3) {
401 0 : DBG_ERR("convert_string_talloc: "
402 : "srclen is %zu, destlen would wrap!\n",
403 : srclen);
404 0 : errno = EOPNOTSUPP;
405 0 : return false;
406 : }
407 142175963 : destlen = srclen * 3;
408 :
409 : /* +2 is for ucs2 null termination. */
410 142175963 : ob = talloc_realloc(ctx, ob, char, destlen + 2);
411 :
412 142175963 : if (!ob) {
413 0 : DEBUG(0, ("convert_string_talloc: realloc failed!\n"));
414 0 : errno = ENOMEM;
415 0 : return false;
416 : }
417 142175963 : outbuf = ob;
418 142175963 : i_len = srclen;
419 142175963 : o_len = destlen;
420 142175963 : inbuf = (const char *)src;
421 :
422 142175963 : retval = smb_iconv(descriptor,
423 : &inbuf, &i_len,
424 : &outbuf, &o_len);
425 142175963 : if(retval == (size_t)-1) {
426 83 : const char *reason="unknown error";
427 83 : switch(errno) {
428 48 : case EINVAL:
429 48 : reason="Incomplete multibyte sequence";
430 48 : DBG_NOTICE("Conversion error: %s\n",
431 : reason);
432 40 : break;
433 0 : case E2BIG:
434 0 : reason = "output buffer is too small";
435 0 : DBG_ERR("Conversion error: %s\n",
436 : reason);
437 0 : break;
438 35 : case EILSEQ:
439 35 : reason="Illegal multibyte sequence";
440 35 : DBG_NOTICE("Conversion error: %s\n",
441 : reason);
442 12 : break;
443 0 : default:
444 0 : DBG_ERR("Conversion error: %s\n",
445 : reason);
446 0 : break;
447 : }
448 : /* smb_panic(reason); */
449 83 : TALLOC_FREE(ob);
450 83 : return false;
451 : }
452 :
453 142175880 : destlen = destlen - o_len;
454 : /* Don't shrink unless we're reclaiming a lot of
455 : * space. This is in the hot codepath and these
456 : * reallocs *cost*. JRA.
457 : */
458 142175880 : if (o_len > 1024) {
459 : /* We're shrinking here so we know the +2 is safe from wrap. */
460 532822 : ob = talloc_realloc(ctx,ob, char, destlen + 2);
461 : }
462 :
463 142175880 : if (destlen && !ob) {
464 0 : DEBUG(0, ("convert_string_talloc: out of memory!\n"));
465 0 : errno = ENOMEM;
466 0 : return false;
467 : }
468 :
469 142175880 : *dest = ob;
470 :
471 : /* Must ucs2 null terminate in the extra space we allocated. */
472 142175880 : ob[destlen] = '\0';
473 142175880 : ob[destlen+1] = '\0';
474 :
475 : /* Ensure we can never return a *converted_size of zero. */
476 142175880 : if (destlen == 0) {
477 : /* As we're now returning false on a bad smb_iconv call,
478 : this should never happen. But be safe anyway. */
479 0 : if (to == CH_UTF16LE|| to == CH_UTF16BE || to == CH_UTF16MUNGED) {
480 0 : destlen = 2;
481 : } else {
482 2524860 : destlen = 1;
483 : }
484 : }
485 :
486 142175880 : if (converted_size != NULL) {
487 142170398 : *converted_size = destlen;
488 : }
489 139651020 : return true;
490 : }
491 :
492 : /**
493 : * Convert string from one encoding to another, with error checking.
494 : * This version produces more logging information than
495 : * convert_string_error(), but is otherwise functionally identical.
496 : *
497 : * @param src pointer to source string (multibyte or singlebyte)
498 : * @param srclen length of the source string in bytes
499 : * @param dest pointer to destination string (multibyte or singlebyte)
500 : * @param destlen maximal length allowed for string
501 : * @param converted_size the number of bytes occupied in the destination
502 : *
503 : * @returns true on success, false on fail.
504 : **/
505 15348102 : _PUBLIC_ bool convert_string(charset_t from, charset_t to,
506 : void const *src, size_t srclen,
507 : void *dest, size_t destlen,
508 : size_t *converted_size)
509 : {
510 15348102 : return convert_string_handle(get_iconv_handle(), from, to,
511 : src, srclen,
512 : dest, destlen, converted_size);
513 : }
514 :
515 : /**
516 : * Convert string from one encoding to another, with error checking.
517 : * This version is less verbose than convert_string().
518 : *
519 : * @param src pointer to source string (multibyte or singlebyte)
520 : * @param srclen length of the source string in bytes
521 : * @param dest pointer to destination string (multibyte or singlebyte)
522 : * @param destlen maximal length allowed for string
523 : * @param converted_size the number of bytes occupied in the destination
524 : *
525 : * @returns true on success, false on fail.
526 : **/
527 92383 : _PUBLIC_ bool convert_string_error(charset_t from, charset_t to,
528 : void const *src, size_t srclen,
529 : void *dest, size_t destlen,
530 : size_t *converted_size)
531 : {
532 92383 : return convert_string_error_handle(get_iconv_handle(), from, to,
533 : src, srclen,
534 : dest, destlen, converted_size);
535 : }
536 :
537 : /**
538 : * Convert between character sets, allocating a new buffer using talloc for the result.
539 : *
540 : * @param srclen length of source buffer.
541 : * @param dest always set at least to NULL
542 : * @param converted_size Size in bytes of the converted string
543 : * @note -1 is not accepted for srclen.
544 : *
545 : * @returns boolean indication whether the conversion succeeded
546 : **/
547 :
548 142210774 : _PUBLIC_ bool convert_string_talloc(TALLOC_CTX *ctx,
549 : charset_t from, charset_t to,
550 : void const *src, size_t srclen,
551 : void *dest, size_t *converted_size)
552 : {
553 142210774 : return convert_string_talloc_handle(ctx, get_iconv_handle(),
554 : from, to, src, srclen, dest,
555 : converted_size);
556 : }
|