1ae771770SStanislav Sedov /*
2ae771770SStanislav Sedov * Copyright (c) 2004 Kungliga Tekniska Högskolan
3ae771770SStanislav Sedov * (Royal Institute of Technology, Stockholm, Sweden).
4ae771770SStanislav Sedov * All rights reserved.
5ae771770SStanislav Sedov *
6ae771770SStanislav Sedov * Redistribution and use in source and binary forms, with or without
7ae771770SStanislav Sedov * modification, are permitted provided that the following conditions
8ae771770SStanislav Sedov * are met:
9ae771770SStanislav Sedov *
10ae771770SStanislav Sedov * 1. Redistributions of source code must retain the above copyright
11ae771770SStanislav Sedov * notice, this list of conditions and the following disclaimer.
12ae771770SStanislav Sedov *
13ae771770SStanislav Sedov * 2. Redistributions in binary form must reproduce the above copyright
14ae771770SStanislav Sedov * notice, this list of conditions and the following disclaimer in the
15ae771770SStanislav Sedov * documentation and/or other materials provided with the distribution.
16ae771770SStanislav Sedov *
17ae771770SStanislav Sedov * 3. Neither the name of the Institute nor the names of its contributors
18ae771770SStanislav Sedov * may be used to endorse or promote products derived from this software
19ae771770SStanislav Sedov * without specific prior written permission.
20ae771770SStanislav Sedov *
21ae771770SStanislav Sedov * THIS SOFTWARE IS PROVIDED BY THE INSTITUTE AND CONTRIBUTORS ``AS IS'' AND
22ae771770SStanislav Sedov * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23ae771770SStanislav Sedov * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24ae771770SStanislav Sedov * ARE DISCLAIMED. IN NO EVENT SHALL THE INSTITUTE OR CONTRIBUTORS BE LIABLE
25ae771770SStanislav Sedov * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26ae771770SStanislav Sedov * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
27ae771770SStanislav Sedov * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28ae771770SStanislav Sedov * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29ae771770SStanislav Sedov * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30ae771770SStanislav Sedov * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31ae771770SStanislav Sedov * SUCH DAMAGE.
32ae771770SStanislav Sedov */
33ae771770SStanislav Sedov
34ae771770SStanislav Sedov #ifdef HAVE_CONFIG_H
35ae771770SStanislav Sedov #include <config.h>
36ae771770SStanislav Sedov #endif
37ae771770SStanislav Sedov #include "windlocl.h"
38ae771770SStanislav Sedov
39ae771770SStanislav Sedov #include <assert.h>
40ae771770SStanislav Sedov #include <stdlib.h>
41ae771770SStanislav Sedov #include <errno.h>
42ae771770SStanislav Sedov #include <stdio.h>
43ae771770SStanislav Sedov
44ae771770SStanislav Sedov #include "roken.h"
45ae771770SStanislav Sedov
46ae771770SStanislav Sedov #include "normalize_table.h"
47ae771770SStanislav Sedov
48ae771770SStanislav Sedov static int
translation_cmp(const void * key,const void * data)49ae771770SStanislav Sedov translation_cmp(const void *key, const void *data)
50ae771770SStanislav Sedov {
51ae771770SStanislav Sedov const struct translation *t1 = (const struct translation *)key;
52ae771770SStanislav Sedov const struct translation *t2 = (const struct translation *)data;
53ae771770SStanislav Sedov
54ae771770SStanislav Sedov return t1->key - t2->key;
55ae771770SStanislav Sedov }
56ae771770SStanislav Sedov
57ae771770SStanislav Sedov enum { s_base = 0xAC00};
58ae771770SStanislav Sedov enum { s_count = 11172};
59ae771770SStanislav Sedov enum { l_base = 0x1100};
60ae771770SStanislav Sedov enum { l_count = 19};
61ae771770SStanislav Sedov enum { v_base = 0x1161};
62ae771770SStanislav Sedov enum { v_count = 21};
63ae771770SStanislav Sedov enum { t_base = 0x11A7};
64ae771770SStanislav Sedov enum { t_count = 28};
65ae771770SStanislav Sedov enum { n_count = v_count * t_count};
66ae771770SStanislav Sedov
67ae771770SStanislav Sedov static int
hangul_decomp(const uint32_t * in,size_t in_len,uint32_t * out,size_t * out_len)68ae771770SStanislav Sedov hangul_decomp(const uint32_t *in, size_t in_len,
69ae771770SStanislav Sedov uint32_t *out, size_t *out_len)
70ae771770SStanislav Sedov {
71ae771770SStanislav Sedov uint32_t u = *in;
72ae771770SStanislav Sedov unsigned s_index;
73ae771770SStanislav Sedov unsigned l, v, t;
74ae771770SStanislav Sedov unsigned o;
75ae771770SStanislav Sedov
76ae771770SStanislav Sedov if (u < s_base || u >= s_base + s_count)
77ae771770SStanislav Sedov return 0;
78ae771770SStanislav Sedov s_index = u - s_base;
79ae771770SStanislav Sedov l = l_base + s_index / n_count;
80ae771770SStanislav Sedov v = v_base + (s_index % n_count) / t_count;
81ae771770SStanislav Sedov t = t_base + s_index % t_count;
82ae771770SStanislav Sedov o = 2;
83ae771770SStanislav Sedov if (t != t_base)
84ae771770SStanislav Sedov ++o;
85ae771770SStanislav Sedov if (*out_len < o)
86ae771770SStanislav Sedov return WIND_ERR_OVERRUN;
87ae771770SStanislav Sedov out[0] = l;
88ae771770SStanislav Sedov out[1] = v;
89ae771770SStanislav Sedov if (t != t_base)
90ae771770SStanislav Sedov out[2] = t;
91ae771770SStanislav Sedov *out_len = o;
92ae771770SStanislav Sedov return 1;
93ae771770SStanislav Sedov }
94ae771770SStanislav Sedov
95ae771770SStanislav Sedov static uint32_t
hangul_composition(const uint32_t * in,size_t in_len)96ae771770SStanislav Sedov hangul_composition(const uint32_t *in, size_t in_len)
97ae771770SStanislav Sedov {
98ae771770SStanislav Sedov if (in_len < 2)
99ae771770SStanislav Sedov return 0;
100ae771770SStanislav Sedov if (in[0] >= l_base && in[0] < l_base + l_count) {
101ae771770SStanislav Sedov unsigned l_index = in[0] - l_base;
102ae771770SStanislav Sedov unsigned v_index;
103ae771770SStanislav Sedov
104ae771770SStanislav Sedov if (in[1] < v_base || in[1] >= v_base + v_count)
105ae771770SStanislav Sedov return 0;
106ae771770SStanislav Sedov v_index = in[1] - v_base;
107ae771770SStanislav Sedov return (l_index * v_count + v_index) * t_count + s_base;
108ae771770SStanislav Sedov } else if (in[0] >= s_base && in[0] < s_base + s_count) {
109ae771770SStanislav Sedov unsigned s_index = in[0] - s_base;
110ae771770SStanislav Sedov unsigned t_index;
111ae771770SStanislav Sedov
112ae771770SStanislav Sedov if (s_index % t_count != 0)
113ae771770SStanislav Sedov return 0;
114ae771770SStanislav Sedov if (in[1] < t_base || in[1] >= t_base + t_count)
115ae771770SStanislav Sedov return 0;
116ae771770SStanislav Sedov t_index = in[1] - t_base;
117ae771770SStanislav Sedov return in[0] + t_index;
118ae771770SStanislav Sedov }
119ae771770SStanislav Sedov return 0;
120ae771770SStanislav Sedov }
121ae771770SStanislav Sedov
122ae771770SStanislav Sedov static int
compat_decomp(const uint32_t * in,size_t in_len,uint32_t * out,size_t * out_len)123ae771770SStanislav Sedov compat_decomp(const uint32_t *in, size_t in_len,
124ae771770SStanislav Sedov uint32_t *out, size_t *out_len)
125ae771770SStanislav Sedov {
126ae771770SStanislav Sedov unsigned i;
127ae771770SStanislav Sedov unsigned o = 0;
128ae771770SStanislav Sedov
129ae771770SStanislav Sedov for (i = 0; i < in_len; ++i) {
130ae771770SStanislav Sedov struct translation ts = {in[i]};
131ae771770SStanislav Sedov size_t sub_len = *out_len - o;
132ae771770SStanislav Sedov int ret;
133ae771770SStanislav Sedov
134ae771770SStanislav Sedov ret = hangul_decomp(in + i, in_len - i,
135ae771770SStanislav Sedov out + o, &sub_len);
136ae771770SStanislav Sedov if (ret) {
137ae771770SStanislav Sedov if (ret == WIND_ERR_OVERRUN)
138ae771770SStanislav Sedov return ret;
139ae771770SStanislav Sedov o += sub_len;
140ae771770SStanislav Sedov } else {
141ae771770SStanislav Sedov void *s = bsearch(&ts,
142ae771770SStanislav Sedov _wind_normalize_table,
143ae771770SStanislav Sedov _wind_normalize_table_size,
144ae771770SStanislav Sedov sizeof(_wind_normalize_table[0]),
145ae771770SStanislav Sedov translation_cmp);
146ae771770SStanislav Sedov if (s != NULL) {
147ae771770SStanislav Sedov const struct translation *t = (const struct translation *)s;
148ae771770SStanislav Sedov
149ae771770SStanislav Sedov ret = compat_decomp(_wind_normalize_val_table + t->val_offset,
150ae771770SStanislav Sedov t->val_len,
151ae771770SStanislav Sedov out + o, &sub_len);
152ae771770SStanislav Sedov if (ret)
153ae771770SStanislav Sedov return ret;
154ae771770SStanislav Sedov o += sub_len;
155ae771770SStanislav Sedov } else {
156ae771770SStanislav Sedov if (o >= *out_len)
157ae771770SStanislav Sedov return WIND_ERR_OVERRUN;
158ae771770SStanislav Sedov out[o++] = in[i];
159ae771770SStanislav Sedov
160ae771770SStanislav Sedov }
161ae771770SStanislav Sedov }
162ae771770SStanislav Sedov }
163ae771770SStanislav Sedov *out_len = o;
164ae771770SStanislav Sedov return 0;
165ae771770SStanislav Sedov }
166ae771770SStanislav Sedov
167ae771770SStanislav Sedov static void
swap_char(uint32_t * a,uint32_t * b)168ae771770SStanislav Sedov swap_char(uint32_t * a, uint32_t * b)
169ae771770SStanislav Sedov {
170ae771770SStanislav Sedov uint32_t t;
171ae771770SStanislav Sedov t = *a;
172ae771770SStanislav Sedov *a = *b;
173ae771770SStanislav Sedov *b = t;
174ae771770SStanislav Sedov }
175ae771770SStanislav Sedov
176ae771770SStanislav Sedov /* Unicode 5.2.0 D109 Canonical Ordering for a sequence of code points
177ae771770SStanislav Sedov * that all have Canonical_Combining_Class > 0 */
178ae771770SStanislav Sedov static void
canonical_reorder_sequence(uint32_t * a,size_t len)179ae771770SStanislav Sedov canonical_reorder_sequence(uint32_t * a, size_t len)
180ae771770SStanislav Sedov {
181ae771770SStanislav Sedov size_t i, j;
182ae771770SStanislav Sedov
183ae771770SStanislav Sedov if (len <= 1)
184ae771770SStanislav Sedov return;
185ae771770SStanislav Sedov
186ae771770SStanislav Sedov for (i = 1; i < len; i++) {
187ae771770SStanislav Sedov for (j = i;
188ae771770SStanislav Sedov j > 0 &&
189ae771770SStanislav Sedov _wind_combining_class(a[j]) < _wind_combining_class(a[j-1]);
190ae771770SStanislav Sedov j--)
191ae771770SStanislav Sedov swap_char(&a[j], &a[j-1]);
192ae771770SStanislav Sedov }
193ae771770SStanislav Sedov }
194ae771770SStanislav Sedov
195ae771770SStanislav Sedov static void
canonical_reorder(uint32_t * tmp,size_t tmp_len)196ae771770SStanislav Sedov canonical_reorder(uint32_t *tmp, size_t tmp_len)
197ae771770SStanislav Sedov {
198ae771770SStanislav Sedov size_t i;
199ae771770SStanislav Sedov
200ae771770SStanislav Sedov for (i = 0; i < tmp_len; ++i) {
201ae771770SStanislav Sedov int cc = _wind_combining_class(tmp[i]);
202ae771770SStanislav Sedov if (cc) {
203ae771770SStanislav Sedov size_t j;
204ae771770SStanislav Sedov for (j = i + 1;
205ae771770SStanislav Sedov j < tmp_len && _wind_combining_class(tmp[j]);
206ae771770SStanislav Sedov ++j)
207ae771770SStanislav Sedov ;
208ae771770SStanislav Sedov canonical_reorder_sequence(&tmp[i], j - i);
209ae771770SStanislav Sedov i = j;
210ae771770SStanislav Sedov }
211ae771770SStanislav Sedov }
212ae771770SStanislav Sedov }
213ae771770SStanislav Sedov
214ae771770SStanislav Sedov static uint32_t
find_composition(const uint32_t * in,unsigned in_len)215ae771770SStanislav Sedov find_composition(const uint32_t *in, unsigned in_len)
216ae771770SStanislav Sedov {
217ae771770SStanislav Sedov unsigned short canon_index = 0;
218ae771770SStanislav Sedov uint32_t cur;
219ae771770SStanislav Sedov unsigned n = 0;
220ae771770SStanislav Sedov
221ae771770SStanislav Sedov cur = hangul_composition(in, in_len);
222ae771770SStanislav Sedov if (cur)
223ae771770SStanislav Sedov return cur;
224ae771770SStanislav Sedov
225ae771770SStanislav Sedov do {
226ae771770SStanislav Sedov const struct canon_node *c = &_wind_canon_table[canon_index];
227ae771770SStanislav Sedov unsigned i;
228ae771770SStanislav Sedov
229ae771770SStanislav Sedov if (n % 5 == 0) {
230ae771770SStanislav Sedov if (in_len-- == 0)
231ae771770SStanislav Sedov return c->val;
232*ed549cb0SCy Schubert cur = *in++;
233ae771770SStanislav Sedov }
234ae771770SStanislav Sedov
235ae771770SStanislav Sedov i = cur >> 16;
236ae771770SStanislav Sedov if (i < c->next_start || i >= c->next_end)
237ae771770SStanislav Sedov canon_index = 0;
238ae771770SStanislav Sedov else
239ae771770SStanislav Sedov canon_index =
240ae771770SStanislav Sedov _wind_canon_next_table[c->next_offset + i - c->next_start];
241ae771770SStanislav Sedov if (canon_index != 0) {
242ae771770SStanislav Sedov cur = (cur << 4) & 0xFFFFF;
243ae771770SStanislav Sedov ++n;
244ae771770SStanislav Sedov }
245ae771770SStanislav Sedov } while (canon_index != 0);
246ae771770SStanislav Sedov return 0;
247ae771770SStanislav Sedov }
248ae771770SStanislav Sedov
249ae771770SStanislav Sedov static int
combine(const uint32_t * in,size_t in_len,uint32_t * out,size_t * out_len)250ae771770SStanislav Sedov combine(const uint32_t *in, size_t in_len,
251ae771770SStanislav Sedov uint32_t *out, size_t *out_len)
252ae771770SStanislav Sedov {
253ae771770SStanislav Sedov unsigned i;
254ae771770SStanislav Sedov int ostarter;
255ae771770SStanislav Sedov unsigned o = 0;
256ae771770SStanislav Sedov int old_cc;
257ae771770SStanislav Sedov
258ae771770SStanislav Sedov for (i = 0; i < in_len;) {
259ae771770SStanislav Sedov while (i < in_len && _wind_combining_class(in[i]) != 0) {
260ae771770SStanislav Sedov out[o++] = in[i++];
261ae771770SStanislav Sedov }
262ae771770SStanislav Sedov if (i < in_len) {
263ae771770SStanislav Sedov if (o >= *out_len)
264ae771770SStanislav Sedov return WIND_ERR_OVERRUN;
265ae771770SStanislav Sedov ostarter = o;
266ae771770SStanislav Sedov out[o++] = in[i++];
267ae771770SStanislav Sedov old_cc = -1;
268ae771770SStanislav Sedov
269ae771770SStanislav Sedov while (i < in_len) {
270ae771770SStanislav Sedov uint32_t comb;
271ae771770SStanislav Sedov uint32_t v[2];
272ae771770SStanislav Sedov int cc;
273ae771770SStanislav Sedov
274ae771770SStanislav Sedov v[0] = out[ostarter];
275ae771770SStanislav Sedov v[1] = in[i];
276ae771770SStanislav Sedov
277ae771770SStanislav Sedov cc = _wind_combining_class(in[i]);
278ae771770SStanislav Sedov if (old_cc != cc && (comb = find_composition(v, 2))) {
279ae771770SStanislav Sedov out[ostarter] = comb;
280ae771770SStanislav Sedov } else if (cc == 0) {
281ae771770SStanislav Sedov break;
282ae771770SStanislav Sedov } else {
283ae771770SStanislav Sedov if (o >= *out_len)
284ae771770SStanislav Sedov return WIND_ERR_OVERRUN;
285ae771770SStanislav Sedov out[o++] = in[i];
286ae771770SStanislav Sedov old_cc = cc;
287ae771770SStanislav Sedov }
288ae771770SStanislav Sedov ++i;
289ae771770SStanislav Sedov }
290ae771770SStanislav Sedov }
291ae771770SStanislav Sedov }
292ae771770SStanislav Sedov *out_len = o;
293ae771770SStanislav Sedov return 0;
294ae771770SStanislav Sedov }
295ae771770SStanislav Sedov
296ae771770SStanislav Sedov int
_wind_stringprep_normalize(const uint32_t * in,size_t in_len,uint32_t * out,size_t * out_len)297ae771770SStanislav Sedov _wind_stringprep_normalize(const uint32_t *in, size_t in_len,
298ae771770SStanislav Sedov uint32_t *out, size_t *out_len)
299ae771770SStanislav Sedov {
300ae771770SStanislav Sedov size_t tmp_len;
301ae771770SStanislav Sedov uint32_t *tmp;
302ae771770SStanislav Sedov int ret;
303ae771770SStanislav Sedov
304ae771770SStanislav Sedov if (in_len == 0) {
305ae771770SStanislav Sedov *out_len = 0;
306ae771770SStanislav Sedov return 0;
307ae771770SStanislav Sedov }
308ae771770SStanislav Sedov
309ae771770SStanislav Sedov tmp_len = in_len * 4;
310ae771770SStanislav Sedov if (tmp_len < MAX_LENGTH_CANON)
311ae771770SStanislav Sedov tmp_len = MAX_LENGTH_CANON;
312ae771770SStanislav Sedov tmp = malloc(tmp_len * sizeof(uint32_t));
313ae771770SStanislav Sedov if (tmp == NULL)
314ae771770SStanislav Sedov return ENOMEM;
315ae771770SStanislav Sedov
316ae771770SStanislav Sedov ret = compat_decomp(in, in_len, tmp, &tmp_len);
317ae771770SStanislav Sedov if (ret) {
318ae771770SStanislav Sedov free(tmp);
319ae771770SStanislav Sedov return ret;
320ae771770SStanislav Sedov }
321ae771770SStanislav Sedov canonical_reorder(tmp, tmp_len);
322ae771770SStanislav Sedov ret = combine(tmp, tmp_len, out, out_len);
323ae771770SStanislav Sedov free(tmp);
324ae771770SStanislav Sedov return ret;
325ae771770SStanislav Sedov }
326