Line data Source code
1 : /*
2 : * Copyright (c) 2004 Kungliga Tekniska Högskolan
3 : * (Royal Institute of Technology, Stockholm, Sweden).
4 : * All rights reserved.
5 : *
6 : * Redistribution and use in source and binary forms, with or without
7 : * modification, are permitted provided that the following conditions
8 : * are met:
9 : *
10 : * 1. Redistributions of source code must retain the above copyright
11 : * notice, this list of conditions and the following disclaimer.
12 : *
13 : * 2. Redistributions in binary form must reproduce the above copyright
14 : * notice, this list of conditions and the following disclaimer in the
15 : * documentation and/or other materials provided with the distribution.
16 : *
17 : * 3. Neither the name of the Institute nor the names of its contributors
18 : * may be used to endorse or promote products derived from this software
19 : * without specific prior written permission.
20 : *
21 : * THIS SOFTWARE IS PROVIDED BY THE INSTITUTE AND CONTRIBUTORS ``AS IS'' AND
22 : * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 : * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 : * ARE DISCLAIMED. IN NO EVENT SHALL THE INSTITUTE OR CONTRIBUTORS BE LIABLE
25 : * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26 : * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
27 : * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28 : * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29 : * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30 : * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31 : * SUCH DAMAGE.
32 : */
33 :
34 : #ifdef HAVE_CONFIG_H
35 : #include <config.h>
36 : #endif
37 : #include "windlocl.h"
38 :
39 : #include <assert.h>
40 : #include <stdlib.h>
41 : #include <errno.h>
42 : #include <stdio.h>
43 :
44 : #include "roken.h"
45 :
46 : #include "normalize_table.h"
47 :
48 : static int
49 792350 : translation_cmp(const void *key, const void *data)
50 : {
51 792350 : const struct translation *t1 = (const struct translation *)key;
52 792350 : const struct translation *t2 = (const struct translation *)data;
53 :
54 792350 : return t1->key - t2->key;
55 : }
56 :
/*
 * Constants used by the arithmetic Hangul decomposition/composition in
 * this file.  Each `*_base' is the start of a code point range and the
 * matching `*_count' is the number of values in it.  `t_base' itself
 * corresponds to index 0, which the code treats as "no third part".
 * `n_count' is the number of (v, t) combinations per leading part.
 */
enum {
    s_base  = 0xAC00,
    s_count = 11172,
    l_base  = 0x1100,
    l_count = 19,
    v_base  = 0x1161,
    v_count = 21,
    t_base  = 0x11A7,
    t_count = 28,
    n_count = v_count * t_count
};
66 :
67 : static int
68 60950 : hangul_decomp(const uint32_t *in, size_t in_len,
69 : uint32_t *out, size_t *out_len)
70 : {
71 60950 : uint32_t u = *in;
72 0 : unsigned s_index;
73 0 : unsigned l, v, t;
74 0 : unsigned o;
75 :
76 60950 : if (u < s_base || u >= s_base + s_count)
77 60950 : return 0;
78 0 : s_index = u - s_base;
79 0 : l = l_base + s_index / n_count;
80 0 : v = v_base + (s_index % n_count) / t_count;
81 0 : t = t_base + s_index % t_count;
82 0 : o = 2;
83 0 : if (t != t_base)
84 0 : ++o;
85 0 : if (*out_len < o)
86 0 : return WIND_ERR_OVERRUN;
87 0 : out[0] = l;
88 0 : out[1] = v;
89 0 : if (t != t_base)
90 0 : out[2] = t;
91 0 : *out_len = o;
92 0 : return 1;
93 : }
94 :
95 : static uint32_t
96 57240 : hangul_composition(const uint32_t *in, size_t in_len)
97 : {
98 57240 : if (in_len < 2)
99 0 : return 0;
100 57240 : if (in[0] >= l_base && in[0] < l_base + l_count) {
101 0 : unsigned l_index = in[0] - l_base;
102 0 : unsigned v_index;
103 :
104 0 : if (in[1] < v_base || in[1] >= v_base + v_count)
105 0 : return 0;
106 0 : v_index = in[1] - v_base;
107 0 : return (l_index * v_count + v_index) * t_count + s_base;
108 57240 : } else if (in[0] >= s_base && in[0] < s_base + s_count) {
109 0 : unsigned s_index = in[0] - s_base;
110 0 : unsigned t_index;
111 :
112 0 : if (s_index % t_count != 0)
113 0 : return 0;
114 0 : if (in[1] < t_base || in[1] >= t_base + t_count)
115 0 : return 0;
116 0 : t_index = in[1] - t_base;
117 0 : return in[0] + t_index;
118 : }
119 57240 : return 0;
120 : }
121 :
122 : static int
123 3710 : compat_decomp(const uint32_t *in, size_t in_len,
124 : uint32_t *out, size_t *out_len)
125 : {
126 0 : unsigned i;
127 3710 : unsigned o = 0;
128 :
129 64660 : for (i = 0; i < in_len; ++i) {
130 60950 : struct translation ts = {in[i], 0, 0};
131 60950 : size_t sub_len = *out_len - o;
132 0 : int ret;
133 :
134 60950 : ret = hangul_decomp(in + i, in_len - i,
135 60950 : out + o, &sub_len);
136 60950 : if (ret) {
137 0 : if (ret == WIND_ERR_OVERRUN)
138 0 : return ret;
139 0 : o += sub_len;
140 : } else {
141 60950 : void *s = bsearch(&ts,
142 : _wind_normalize_table,
143 : _wind_normalize_table_size,
144 : sizeof(_wind_normalize_table[0]),
145 : translation_cmp);
146 60950 : if (s != NULL) {
147 0 : const struct translation *t = (const struct translation *)s;
148 :
149 0 : ret = compat_decomp(_wind_normalize_val_table + t->val_offset,
150 0 : t->val_len,
151 0 : out + o, &sub_len);
152 0 : if (ret)
153 0 : return ret;
154 0 : o += sub_len;
155 : } else {
156 60950 : if (o >= *out_len)
157 0 : return WIND_ERR_OVERRUN;
158 60950 : out[o++] = in[i];
159 :
160 : }
161 : }
162 : }
163 3710 : *out_len = o;
164 3710 : return 0;
165 : }
166 :
/* Exchange the two code points that `a' and `b' point to. */
static void
swap_char(uint32_t * a, uint32_t * b)
{
    const uint32_t saved = *b;

    *b = *a;
    *a = saved;
}
175 :
/*
 * Unicode 5.2.0 D109 Canonical Ordering for a sequence of code points
 * that all have Canonical_Combining_Class > 0.
 *
 * Sorts a[0..len-1] in place by ascending _wind_combining_class() using
 * an insertion sort.  Neighbours are only exchanged when the later one
 * has a strictly smaller class, so code points of equal class keep
 * their relative order.
 */
static void
canonical_reorder_sequence(uint32_t * a, size_t len)
{
    size_t sorted;

    if (len < 2)
        return;

    /* a[0..sorted-1] is ordered; sink a[sorted] into its place. */
    for (sorted = 1; sorted < len; sorted++) {
        size_t pos = sorted;

        while (pos > 0 &&
               _wind_combining_class(a[pos]) <
               _wind_combining_class(a[pos - 1])) {
            swap_char(&a[pos], &a[pos - 1]);
            pos--;
        }
    }
}
194 :
/*
 * Put `tmp' into canonical order in place: every maximal run of code
 * points whose _wind_combining_class() is non-zero is handed to
 * canonical_reorder_sequence(); code points of class 0 are left where
 * they are.
 */
static void
canonical_reorder(uint32_t *tmp, size_t tmp_len)
{
    size_t pos = 0;

    while (pos < tmp_len) {
        size_t end;

        if (_wind_combining_class(tmp[pos]) == 0) {
            ++pos;
            continue;
        }

        /* Find the end of the run of non-zero combining classes. */
        end = pos + 1;
        while (end < tmp_len && _wind_combining_class(tmp[end]))
            ++end;

        canonical_reorder_sequence(&tmp[pos], end - pos);

        /* tmp[end], if it exists, has class 0 and needs no inspection. */
        pos = end + 1;
    }
}
213 :
214 : static uint32_t
215 57240 : find_composition(const uint32_t *in, unsigned in_len)
216 : {
217 57240 : unsigned short canon_index = 0;
218 0 : uint32_t cur;
219 57240 : unsigned n = 0;
220 :
221 57240 : cur = hangul_composition(in, in_len);
222 57240 : if (cur)
223 0 : return cur;
224 :
225 0 : do {
226 435130 : const struct canon_node *c = &_wind_canon_table[canon_index];
227 0 : unsigned i;
228 :
229 435130 : if (n % 5 == 0) {
230 108650 : if (in_len-- == 0)
231 0 : return c->val;
232 108650 : cur = *in++;
233 : }
234 :
235 435130 : i = cur >> 16;
236 435130 : if (i < c->next_start || i >= c->next_end)
237 57240 : canon_index = 0;
238 : else
239 377890 : canon_index =
240 377890 : _wind_canon_next_table[c->next_offset + i - c->next_start];
241 435130 : if (canon_index != 0) {
242 377890 : cur = (cur << 4) & 0xFFFFF;
243 377890 : ++n;
244 : }
245 435130 : } while (canon_index != 0);
246 57240 : return 0;
247 : }
248 :
249 : static int
250 3710 : combine(const uint32_t *in, size_t in_len,
251 : uint32_t *out, size_t *out_len)
252 : {
253 0 : unsigned i;
254 0 : int ostarter;
255 3710 : unsigned o = 0;
256 0 : int old_cc;
257 :
258 64660 : for (i = 0; i < in_len;) {
259 60950 : while (i < in_len && _wind_combining_class(in[i]) != 0) {
260 0 : out[o++] = in[i++];
261 : }
262 60950 : if (i < in_len) {
263 60950 : if (o >= *out_len)
264 0 : return WIND_ERR_OVERRUN;
265 60950 : ostarter = o;
266 60950 : out[o++] = in[i++];
267 60950 : old_cc = -1;
268 :
269 60950 : while (i < in_len) {
270 0 : uint32_t comb;
271 0 : uint32_t v[2];
272 0 : int cc;
273 :
274 57240 : v[0] = out[ostarter];
275 57240 : v[1] = in[i];
276 :
277 57240 : cc = _wind_combining_class(in[i]);
278 57240 : if (old_cc != cc && (comb = find_composition(v, 2))) {
279 0 : out[ostarter] = comb;
280 57240 : } else if (cc == 0) {
281 57240 : break;
282 : } else {
283 0 : if (o >= *out_len)
284 0 : return WIND_ERR_OVERRUN;
285 0 : out[o++] = in[i];
286 0 : old_cc = cc;
287 : }
288 0 : ++i;
289 : }
290 : }
291 : }
292 3710 : *out_len = o;
293 3710 : return 0;
294 : }
295 :
296 : int
297 3710 : _wind_stringprep_normalize(const uint32_t *in, size_t in_len,
298 : uint32_t *out, size_t *out_len)
299 : {
300 0 : size_t tmp_len;
301 0 : uint32_t *tmp;
302 0 : int ret;
303 :
304 3710 : if (in_len == 0) {
305 0 : *out_len = 0;
306 0 : return 0;
307 : }
308 :
309 3710 : tmp_len = in_len * 4;
310 3710 : if (tmp_len < MAX_LENGTH_CANON)
311 530 : tmp_len = MAX_LENGTH_CANON;
312 3710 : tmp = malloc(tmp_len * sizeof(uint32_t));
313 3710 : if (tmp == NULL)
314 0 : return ENOMEM;
315 :
316 3710 : ret = compat_decomp(in, in_len, tmp, &tmp_len);
317 3710 : if (ret) {
318 0 : free(tmp);
319 0 : return ret;
320 : }
321 3710 : canonical_reorder(tmp, tmp_len);
322 3710 : ret = combine(tmp, tmp_len, out, out_len);
323 3710 : free(tmp);
324 3710 : return ret;
325 : }
|