Line data Source code
1 : /*
2 : Unix SMB/CIFS implementation.
3 : Samba utility functions
4 : Copyright (C) Andrew Tridgell 1992-2001
5 : Copyright (C) Simo Sorce 2001
6 : Copyright (C) Andrew Bartlett 2011
7 : Copyright (C) Jeremy Allison 1992-2007
8 : Copyright (C) Martin Pool 2003
9 : Copyright (C) James Peach 2006
10 :
11 : This program is free software; you can redistribute it and/or modify
12 : it under the terms of the GNU General Public License as published by
13 : the Free Software Foundation; either version 3 of the License, or
14 : (at your option) any later version.
15 :
16 : This program is distributed in the hope that it will be useful,
17 : but WITHOUT ANY WARRANTY; without even the implied warranty of
18 : MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 : GNU General Public License for more details.
20 :
21 : You should have received a copy of the GNU General Public License
22 : along with this program. If not, see <http://www.gnu.org/licenses/>.
23 : */
24 :
25 : #include "replace.h"
26 : #include "system/locale.h"
27 : #include "charset.h"
28 : #include "lib/util/fault.h"
29 :
30 : #ifdef strcasecmp
31 : #undef strcasecmp
32 : #endif
33 : #ifdef strncasecmp
34 : #undef strncasecmp
35 : #endif
36 :
37 :
38 : /**
39 : Case insensitive string comparison, handle specified for testing
40 : **/
41 348985496 : _PUBLIC_ int strcasecmp_m_handle(struct smb_iconv_handle *iconv_handle,
42 : const char *s1, const char *s2)
43 : {
44 348985496 : codepoint_t c1=0, c2=0;
45 348985496 : codepoint_t u1=0, u2=0;
46 348985496 : codepoint_t l1=0, l2=0;
47 2222691 : size_t size1, size2;
48 :
49 : /* handle null ptr comparisons to simplify the use in qsort */
50 348985496 : if (s1 == s2) return 0;
51 348984821 : if (s1 == NULL) return -1;
52 348984819 : if (s2 == NULL) return 1;
53 :
54 1165670835 : while (*s1 && *s2) {
55 1148051710 : c1 = next_codepoint_handle(iconv_handle, s1, &size1);
56 1148051710 : c2 = next_codepoint_handle(iconv_handle, s2, &size2);
57 :
58 1148051710 : if (c1 == INVALID_CODEPOINT ||
59 3961961 : c2 == INVALID_CODEPOINT) {
60 9 : return strcasecmp(s1, s2);
61 : }
62 :
63 1148051701 : s1 += size1;
64 1148051701 : s2 += size2;
65 :
66 1148051701 : if (c1 == c2) {
67 815074569 : continue;
68 : }
69 :
70 332977132 : u1 = toupper_m(c1);
71 332977132 : u2 = toupper_m(c2);
72 332977132 : if (u1 == u2) {
73 1611449 : continue;
74 : }
75 :
76 331365683 : l1 = tolower_m(c1);
77 331365683 : l2 = tolower_m(c2);
78 331365683 : if (l1 == l2) {
79 0 : continue;
80 : }
81 :
82 331365683 : return l1 - l2;
83 : }
84 :
85 17619125 : return *s1 - *s2;
86 : }
87 :
88 : /**
89 : Case insensitive string comparison
90 : **/
91 348985478 : _PUBLIC_ int strcasecmp_m(const char *s1, const char *s2)
92 : {
93 348985478 : struct smb_iconv_handle *iconv_handle = get_iconv_handle();
94 348985478 : return strcasecmp_m_handle(iconv_handle, s1, s2);
95 : }
96 :
97 : /**
98 : Case insensitive string comparison, length limited, handle specified for
99 : testing
100 : **/
101 7563203 : _PUBLIC_ int strncasecmp_m_handle(struct smb_iconv_handle *iconv_handle,
102 : const char *s1, const char *s2, size_t n)
103 : {
104 7563203 : codepoint_t c1=0, c2=0;
105 7563203 : codepoint_t u1=0, u2=0;
106 7563203 : codepoint_t l1=0, l2=0;
107 8512 : size_t size1, size2;
108 :
109 : /* handle null ptr comparisons to simplify the use in qsort */
110 7563203 : if (s1 == s2) return 0;
111 7562907 : if (s1 == NULL) return -1;
112 7562906 : if (s2 == NULL) return 1;
113 :
114 19025317 : while (*s1 && *s2 && n) {
115 18182139 : n--;
116 :
117 18182139 : c1 = next_codepoint_handle(iconv_handle, s1, &size1);
118 18182139 : c2 = next_codepoint_handle(iconv_handle, s2, &size2);
119 :
120 18182139 : if (c1 == INVALID_CODEPOINT ||
121 25188 : c2 == INVALID_CODEPOINT) {
122 : /*
123 : * n was specified in characters,
124 : * now we must convert it to bytes.
125 : * As bytes are the smallest
126 : * character unit, the following
127 : * increment and strncasecmp is always
128 : * safe.
129 : *
130 : * The source string was already known
131 : * to be n characters long, so we are
132 : * guaranteed to be able to look at the
133 : * (n remaining + size1) bytes from the
134 : * s1 position).
135 : */
136 1 : n += size1;
137 1 : return strncasecmp(s1, s2, n);
138 : }
139 :
140 18182138 : s1 += size1;
141 18182138 : s2 += size2;
142 :
143 18182138 : if (c1 == c2) {
144 11441822 : continue;
145 : }
146 :
147 6740316 : u1 = toupper_m(c1);
148 6740316 : u2 = toupper_m(c2);
149 6740316 : if (u1 == u2) {
150 20590 : continue;
151 : }
152 :
153 6719726 : l1 = tolower_m(c1);
154 6719726 : l2 = tolower_m(c2);
155 6719726 : if (l1 == l2) {
156 0 : continue;
157 : }
158 :
159 6719726 : return l1 - l2;
160 : }
161 :
162 843178 : if (n == 0) {
163 834167 : return 0;
164 : }
165 :
166 6763 : return *s1 - *s2;
167 : }
168 :
169 : /**
170 : Case insensitive string comparison, length limited
171 : **/
172 7563191 : _PUBLIC_ int strncasecmp_m(const char *s1, const char *s2, size_t n)
173 : {
174 7563191 : struct smb_iconv_handle *iconv_handle = get_iconv_handle();
175 7563191 : return strncasecmp_m_handle(iconv_handle, s1, s2, n);
176 : }
177 :
178 : /**
179 : * Compare 2 strings.
180 : *
181 : * @note The comparison is case-insensitive.
182 : **/
183 95776 : _PUBLIC_ bool strequal_m(const char *s1, const char *s2)
184 : {
185 95776 : return strcasecmp_m(s1,s2) == 0;
186 : }
187 :
/**
 Test two strings for equality (case sensitive).

 Two identical pointers (including two NULLs) are equal; a NULL pointer
 never equals a non-NULL string. Otherwise the result is that of strcmp().
**/
_PUBLIC_ bool strcsequal(const char *s1,const char *s2)
{
	if (s1 == s2) {
		return true;
	}

	if (s1 == NULL || s2 == NULL) {
		return false;
	}

	return strcmp(s1, s2) == 0;
}
200 :
201 : /**
202 : * Calculate the number of units (8 or 16-bit, depending on the
203 : * destination charset) that would be needed to convert the input
204 : * string, which is expected to be in src_charset encoding, to the
205 : * destination charset (which should be a unicode charset).
206 : */
207 41422289 : _PUBLIC_ size_t strlen_m_ext_handle(struct smb_iconv_handle *ic,
208 : const char *s, charset_t src_charset, charset_t dst_charset)
209 : {
210 41422289 : size_t count = 0;
211 :
212 : #ifdef DEVELOPER
213 41422289 : switch (dst_charset) {
214 0 : case CH_DOS:
215 : case CH_UNIX:
216 0 : smb_panic("cannot call strlen_m_ext() with a variable dest charset (must be UTF16* or UTF8)");
217 40570909 : default:
218 41422289 : break;
219 : }
220 :
221 41422289 : switch (src_charset) {
222 0 : case CH_UTF16LE:
223 : case CH_UTF16BE:
224 0 : smb_panic("cannot call strlen_m_ext() with a UTF16 src charset (must be DOS, UNIX, DISPLAY or UTF8)");
225 40570909 : default:
226 41422289 : break;
227 : }
228 : #endif
229 41422289 : if (!s) {
230 66154 : return 0;
231 : }
232 :
233 1221336343 : while (*s && !(((uint8_t)*s) & 0x80)) {
234 1179983976 : s++;
235 1179983976 : count++;
236 : }
237 :
238 41352367 : if (!*s) {
239 40492894 : return count;
240 : }
241 :
242 575596 : while (*s) {
243 3536 : size_t c_size;
244 563707 : codepoint_t c = next_codepoint_handle_ext(ic, s, strnlen(s, 5),
245 : src_charset, &c_size);
246 563707 : s += c_size;
247 :
248 563707 : switch (dst_charset) {
249 555682 : case CH_UTF16LE:
250 : case CH_UTF16BE:
251 : case CH_UTF16MUNGED:
252 555682 : if (c < 0x10000) {
253 : /* Unicode char fits into 16 bits. */
254 492815 : count += 1;
255 : } else {
256 : /* Double-width unicode char - 32 bits. */
257 62867 : count += 2;
258 : }
259 553391 : break;
260 8025 : case CH_UTF8:
261 : /*
262 : * this only checks ranges, and does not
263 : * check for invalid codepoints
264 : */
265 8025 : if (c < 0x80) {
266 6116 : count += 1;
267 1909 : } else if (c < 0x800) {
268 871 : count += 2;
269 1038 : } else if (c < 0x10000) {
270 1038 : count += 3;
271 : } else {
272 0 : count += 4;
273 : }
274 6780 : break;
275 0 : default:
276 : /*
277 : * non-unicode encoding:
278 : * assume that each codepoint fits into
279 : * one unit in the destination encoding.
280 : */
281 0 : count += 1;
282 : }
283 : }
284 :
285 11861 : return count;
286 : }
287 :
288 : /**
289 : * Calculate the number of units (8 or 16-bit, depending on the
290 : * destination charset) that would be needed to convert the input
291 : * string, which is expected to be in src_charset encoding, to the
292 : * destination charset (which should be a unicode charset).
293 : */
294 41422277 : _PUBLIC_ size_t strlen_m_ext(const char *s, charset_t src_charset, charset_t dst_charset)
295 : {
296 41422277 : struct smb_iconv_handle *ic = get_iconv_handle();
297 41422277 : return strlen_m_ext_handle(ic, s, src_charset, dst_charset);
298 : }
299 :
300 25342401 : _PUBLIC_ size_t strlen_m_ext_term(const char *s, const charset_t src_charset,
301 : const charset_t dst_charset)
302 : {
303 25342401 : if (!s) {
304 94428 : return 0;
305 : }
306 25247617 : return strlen_m_ext(s, src_charset, dst_charset) + 1;
307 : }
308 :
309 931606 : _PUBLIC_ size_t strlen_m_ext_term_null(const char *s,
310 : const charset_t src_charset,
311 : const charset_t dst_charset)
312 : {
313 1952 : size_t len;
314 931606 : if (!s) {
315 972 : return 0;
316 : }
317 930633 : len = strlen_m_ext(s, src_charset, dst_charset);
318 930633 : if (len == 0) {
319 608580 : return 0;
320 : }
321 :
322 321824 : return len+1;
323 : }
324 :
325 : /**
326 : * Calculate the number of 16-bit units that would be needed to convert
327 : * the input string, which is expected to be in CH_UNIX encoding, to UTF16.
328 : *
329 : * This will be the same as the number of bytes in a string for single
330 : * byte strings, but will be different for multibyte.
331 : */
332 15244021 : _PUBLIC_ size_t strlen_m(const char *s)
333 : {
334 15244021 : return strlen_m_ext(s, CH_UNIX, CH_UTF16LE);
335 : }
336 :
337 : /**
338 : Work out the number of multibyte chars in a string, including the NULL
339 : terminator.
340 : **/
341 2242666 : _PUBLIC_ size_t strlen_m_term(const char *s)
342 : {
343 2242666 : return strlen_m_ext_term(s, CH_UNIX, CH_UTF16LE);
344 : }
345 :
346 : /*
347 : * Weird helper routine for the winreg pipe: If nothing is around, return 0,
348 : * if a string is there, include the terminator.
349 : */
350 :
351 931606 : _PUBLIC_ size_t strlen_m_term_null(const char *s)
352 : {
353 931606 : return strlen_m_ext_term_null(s, CH_UNIX, CH_UTF16LE);
354 : }
355 :
356 : /**
357 : Strchr and strrchr_m are a bit complex on general multi-byte strings.
358 : **/
359 318738536 : _PUBLIC_ char *strchr_m(const char *src, char c)
360 : {
361 2044091 : const char *s;
362 318738536 : struct smb_iconv_handle *ic = get_iconv_handle();
363 318738536 : if (src == NULL) {
364 0 : return NULL;
365 : }
366 : /* characters below 0x3F are guaranteed to not appear in
367 : non-initial position in multi-byte charsets */
368 318738536 : if ((c & 0xC0) == 0) {
369 93925449 : return strchr(src, c);
370 : }
371 :
372 : /* this is quite a common operation, so we want it to be
373 : fast. We optimise for the ascii case, knowing that all our
374 : supported multi-byte character sets are ascii-compatible
375 : (ie. they match for the first 128 chars) */
376 :
377 1563637772 : for (s = src; *s && !(((unsigned char)s[0]) & 0x80); s++) {
378 1338889043 : if (*s == c)
379 64358 : return discard_const_p(char, s);
380 : }
381 :
382 224748729 : if (!*s)
383 223532916 : return NULL;
384 :
385 : #ifdef BROKEN_UNICODE_COMPOSE_CHARACTERS
386 : /* With compose characters we must restart from the beginning. JRA. */
387 : s = src;
388 : #endif
389 :
390 4 : while (*s) {
391 3 : size_t size;
392 3 : codepoint_t c2 = next_codepoint_handle(ic, s, &size);
393 3 : if (c2 == c) {
394 0 : return discard_const_p(char, s);
395 : }
396 3 : s += size;
397 : }
398 :
399 0 : return NULL;
400 : }
401 :
402 : /**
403 : * Multibyte-character version of strrchr
404 : */
405 7844288 : _PUBLIC_ char *strrchr_m(const char *s, char c)
406 : {
407 38382 : struct smb_iconv_handle *ic;
408 7844288 : char *ret = NULL;
409 :
410 7844288 : if (s == NULL) {
411 0 : return NULL;
412 : }
413 :
414 : /* characters below 0x3F are guaranteed to not appear in
415 : non-initial position in multi-byte charsets */
416 7844288 : if ((c & 0xC0) == 0) {
417 7792513 : return strrchr(s, c);
418 : }
419 :
420 : /* this is quite a common operation, so we want it to be
421 : fast. We optimise for the ascii case, knowing that all our
422 : supported multi-byte character sets are ascii-compatible
423 : (ie. they match for the first 128 chars). Also, in Samba
424 : we only search for ascii characters in 'c' and that
425 : in all mb character sets with a compound character
426 : containing c, if 'c' is not a match at position
427 : p, then p[-1] > 0x7f. JRA. */
428 :
429 : {
430 51775 : size_t len = strlen(s);
431 51775 : const char *cp = s;
432 51775 : bool got_mb = false;
433 :
434 51775 : if (len == 0)
435 106 : return NULL;
436 51669 : cp += (len - 1);
437 1694 : do {
438 345268 : if (c == *cp) {
439 : /* Could be a match. Part of a multibyte ? */
440 33992 : if ((cp > s) &&
441 32124 : (((unsigned char)cp[-1]) & 0x80)) {
442 : /* Yep - go slow :-( */
443 0 : got_mb = true;
444 0 : break;
445 : }
446 : /* No - we have a match ! */
447 33811 : return discard_const_p(char , cp);
448 : }
449 311276 : } while (cp-- != s);
450 17642 : if (!got_mb)
451 17642 : return NULL;
452 : }
453 :
454 0 : ic = get_iconv_handle();
455 :
456 0 : while (*s) {
457 0 : size_t size;
458 0 : codepoint_t c2 = next_codepoint_handle(ic, s, &size);
459 0 : if (c2 == c) {
460 0 : ret = discard_const_p(char, s);
461 : }
462 0 : s += size;
463 : }
464 :
465 0 : return ret;
466 : }
467 :
468 : /**
469 : return True if any (multi-byte) character is lower case
470 : */
471 35 : _PUBLIC_ bool strhaslower_handle(struct smb_iconv_handle *ic,
472 : const char *string)
473 : {
474 963 : while (*string) {
475 950 : size_t c_size;
476 950 : codepoint_t s;
477 950 : codepoint_t t;
478 :
479 950 : s = next_codepoint_handle(ic, string, &c_size);
480 950 : string += c_size;
481 :
482 950 : t = toupper_m(s);
483 :
484 950 : if (s != t) {
485 22 : return true; /* that means it has lower case chars */
486 : }
487 : }
488 :
489 0 : return false;
490 : }
491 :
492 17 : _PUBLIC_ bool strhaslower(const char *string)
493 : {
494 17 : struct smb_iconv_handle *ic = get_iconv_handle();
495 17 : return strhaslower_handle(ic, string);
496 : }
497 :
498 : /**
499 : return True if any (multi-byte) character is upper case
500 : */
501 35 : _PUBLIC_ bool strhasupper_handle(struct smb_iconv_handle *ic,
502 : const char *string)
503 : {
504 954 : while (*string) {
505 941 : size_t c_size;
506 941 : codepoint_t s;
507 941 : codepoint_t t;
508 :
509 941 : s = next_codepoint_handle(ic, string, &c_size);
510 941 : string += c_size;
511 :
512 941 : t = tolower_m(s);
513 :
514 941 : if (s != t) {
515 22 : return true; /* that means it has upper case chars */
516 : }
517 : }
518 :
519 0 : return false;
520 : }
521 :
522 17 : _PUBLIC_ bool strhasupper(const char *string)
523 : {
524 17 : struct smb_iconv_handle *ic = get_iconv_handle();
525 17 : return strhasupper_handle(ic, string);
526 : }
527 :
528 : /***********************************************************************
529 : strstr_m - We convert via ucs2 for now.
530 : ***********************************************************************/
531 :
532 2438496 : char *strstr_m(const char *src, const char *findstr)
533 : {
534 2438496 : TALLOC_CTX *mem_ctx = NULL;
535 9793 : smb_ucs2_t *p;
536 9793 : smb_ucs2_t *src_w, *find_w;
537 9793 : const char *s;
538 9793 : char *s2;
539 2438496 : char *retp = NULL;
540 2438496 : size_t converted_size, findstr_len = 0;
541 :
542 : /* for correctness */
543 2438496 : if (!findstr[0]) {
544 0 : return discard_const_p(char, src);
545 : }
546 :
547 : /* Samba does single character findstr calls a *lot*. */
548 2438494 : if (findstr[1] == '\0')
549 106523 : return strchr_m(src, *findstr);
550 :
551 : /* We optimise for the ascii case, knowing that all our
552 : supported multi-byte character sets are ascii-compatible
553 : (ie. they match for the first 128 chars) */
554 :
555 46956373 : for (s = src; *s && !(((unsigned char)s[0]) & 0x80); s++) {
556 45430773 : if (*s == *findstr) {
557 2419944 : if (!findstr_len)
558 1426967 : findstr_len = strlen(findstr);
559 :
560 2419944 : if (strncmp(s, findstr, findstr_len) == 0) {
561 806371 : return discard_const_p(char, s);
562 : }
563 : }
564 : }
565 :
566 1525600 : if (!*s)
567 1521466 : return NULL;
568 :
569 : #if 1 /* def BROKEN_UNICODE_COMPOSE_CHARACTERS */
570 : /* 'make check' fails unless we do this */
571 :
572 : /* With compose characters we must restart from the beginning. JRA. */
573 9 : s = src;
574 : #endif
575 :
576 : /*
577 : * Use get_iconv_handle() just as a non-NULL talloc ctx. In
578 : * case we leak memory, this should then be more obvious in
579 : * the talloc report.
580 : */
581 9 : mem_ctx = talloc_new(get_iconv_handle());
582 9 : if (mem_ctx == NULL) {
583 0 : return NULL;
584 : }
585 :
586 9 : if (!push_ucs2_talloc(mem_ctx, &src_w, src, &converted_size)) {
587 0 : goto done;
588 : }
589 :
590 9 : if (!push_ucs2_talloc(mem_ctx, &find_w, findstr, &converted_size)) {
591 3 : goto done;
592 : }
593 :
594 6 : p = strstr_w(src_w, find_w);
595 :
596 6 : if (!p) {
597 3 : goto done;
598 : }
599 :
600 3 : *p = 0;
601 3 : if (!pull_ucs2_talloc(mem_ctx, &s2, src_w, &converted_size)) {
602 0 : goto done;
603 : }
604 3 : retp = discard_const_p(char, (s+strlen(s2)));
605 9 : done:
606 9 : TALLOC_FREE(mem_ctx);
607 9 : return retp;
608 : }
|