xref: /freebsd/crypto/heimdal/lib/wind/normalize.c (revision ae77177087c655fc883075af4f425b37e032cd05)
1*ae771770SStanislav Sedov /*
2*ae771770SStanislav Sedov  * Copyright (c) 2004 Kungliga Tekniska Högskolan
3*ae771770SStanislav Sedov  * (Royal Institute of Technology, Stockholm, Sweden).
4*ae771770SStanislav Sedov  * All rights reserved.
5*ae771770SStanislav Sedov  *
6*ae771770SStanislav Sedov  * Redistribution and use in source and binary forms, with or without
7*ae771770SStanislav Sedov  * modification, are permitted provided that the following conditions
8*ae771770SStanislav Sedov  * are met:
9*ae771770SStanislav Sedov  *
10*ae771770SStanislav Sedov  * 1. Redistributions of source code must retain the above copyright
11*ae771770SStanislav Sedov  *    notice, this list of conditions and the following disclaimer.
12*ae771770SStanislav Sedov  *
13*ae771770SStanislav Sedov  * 2. Redistributions in binary form must reproduce the above copyright
14*ae771770SStanislav Sedov  *    notice, this list of conditions and the following disclaimer in the
15*ae771770SStanislav Sedov  *    documentation and/or other materials provided with the distribution.
16*ae771770SStanislav Sedov  *
17*ae771770SStanislav Sedov  * 3. Neither the name of the Institute nor the names of its contributors
18*ae771770SStanislav Sedov  *    may be used to endorse or promote products derived from this software
19*ae771770SStanislav Sedov  *    without specific prior written permission.
20*ae771770SStanislav Sedov  *
21*ae771770SStanislav Sedov  * THIS SOFTWARE IS PROVIDED BY THE INSTITUTE AND CONTRIBUTORS ``AS IS'' AND
22*ae771770SStanislav Sedov  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23*ae771770SStanislav Sedov  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24*ae771770SStanislav Sedov  * ARE DISCLAIMED.  IN NO EVENT SHALL THE INSTITUTE OR CONTRIBUTORS BE LIABLE
25*ae771770SStanislav Sedov  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26*ae771770SStanislav Sedov  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
27*ae771770SStanislav Sedov  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28*ae771770SStanislav Sedov  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29*ae771770SStanislav Sedov  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30*ae771770SStanislav Sedov  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31*ae771770SStanislav Sedov  * SUCH DAMAGE.
32*ae771770SStanislav Sedov  */
33*ae771770SStanislav Sedov 
34*ae771770SStanislav Sedov #ifdef HAVE_CONFIG_H
35*ae771770SStanislav Sedov #include <config.h>
36*ae771770SStanislav Sedov #endif
37*ae771770SStanislav Sedov #include "windlocl.h"
38*ae771770SStanislav Sedov 
39*ae771770SStanislav Sedov #include <assert.h>
40*ae771770SStanislav Sedov #include <stdlib.h>
41*ae771770SStanislav Sedov #include <errno.h>
42*ae771770SStanislav Sedov #include <stdio.h>
43*ae771770SStanislav Sedov 
44*ae771770SStanislav Sedov #include "roken.h"
45*ae771770SStanislav Sedov 
46*ae771770SStanislav Sedov #include "normalize_table.h"
47*ae771770SStanislav Sedov 
48*ae771770SStanislav Sedov static int
49*ae771770SStanislav Sedov translation_cmp(const void *key, const void *data)
50*ae771770SStanislav Sedov {
51*ae771770SStanislav Sedov     const struct translation *t1 = (const struct translation *)key;
52*ae771770SStanislav Sedov     const struct translation *t2 = (const struct translation *)data;
53*ae771770SStanislav Sedov 
54*ae771770SStanislav Sedov     return t1->key - t2->key;
55*ae771770SStanislav Sedov }
56*ae771770SStanislav Sedov 
57*ae771770SStanislav Sedov enum { s_base  = 0xAC00};
58*ae771770SStanislav Sedov enum { s_count = 11172};
59*ae771770SStanislav Sedov enum { l_base  = 0x1100};
60*ae771770SStanislav Sedov enum { l_count = 19};
61*ae771770SStanislav Sedov enum { v_base  = 0x1161};
62*ae771770SStanislav Sedov enum { v_count = 21};
63*ae771770SStanislav Sedov enum { t_base  = 0x11A7};
64*ae771770SStanislav Sedov enum { t_count = 28};
65*ae771770SStanislav Sedov enum { n_count = v_count * t_count};
66*ae771770SStanislav Sedov 
67*ae771770SStanislav Sedov static int
68*ae771770SStanislav Sedov hangul_decomp(const uint32_t *in, size_t in_len,
69*ae771770SStanislav Sedov 	      uint32_t *out, size_t *out_len)
70*ae771770SStanislav Sedov {
71*ae771770SStanislav Sedov     uint32_t u = *in;
72*ae771770SStanislav Sedov     unsigned s_index;
73*ae771770SStanislav Sedov     unsigned l, v, t;
74*ae771770SStanislav Sedov     unsigned o;
75*ae771770SStanislav Sedov 
76*ae771770SStanislav Sedov     if (u < s_base || u >= s_base + s_count)
77*ae771770SStanislav Sedov 	return 0;
78*ae771770SStanislav Sedov     s_index = u - s_base;
79*ae771770SStanislav Sedov     l = l_base + s_index / n_count;
80*ae771770SStanislav Sedov     v = v_base + (s_index % n_count) / t_count;
81*ae771770SStanislav Sedov     t = t_base + s_index % t_count;
82*ae771770SStanislav Sedov     o = 2;
83*ae771770SStanislav Sedov     if (t != t_base)
84*ae771770SStanislav Sedov 	++o;
85*ae771770SStanislav Sedov     if (*out_len < o)
86*ae771770SStanislav Sedov 	return WIND_ERR_OVERRUN;
87*ae771770SStanislav Sedov     out[0] = l;
88*ae771770SStanislav Sedov     out[1] = v;
89*ae771770SStanislav Sedov     if (t != t_base)
90*ae771770SStanislav Sedov 	out[2] = t;
91*ae771770SStanislav Sedov     *out_len = o;
92*ae771770SStanislav Sedov     return 1;
93*ae771770SStanislav Sedov }
94*ae771770SStanislav Sedov 
95*ae771770SStanislav Sedov static uint32_t
96*ae771770SStanislav Sedov hangul_composition(const uint32_t *in, size_t in_len)
97*ae771770SStanislav Sedov {
98*ae771770SStanislav Sedov     if (in_len < 2)
99*ae771770SStanislav Sedov 	return 0;
100*ae771770SStanislav Sedov     if (in[0] >= l_base && in[0] < l_base + l_count) {
101*ae771770SStanislav Sedov 	unsigned l_index = in[0] - l_base;
102*ae771770SStanislav Sedov 	unsigned v_index;
103*ae771770SStanislav Sedov 
104*ae771770SStanislav Sedov 	if (in[1] < v_base || in[1] >= v_base + v_count)
105*ae771770SStanislav Sedov 	    return 0;
106*ae771770SStanislav Sedov 	v_index = in[1] - v_base;
107*ae771770SStanislav Sedov 	return (l_index * v_count + v_index) * t_count + s_base;
108*ae771770SStanislav Sedov     } else if (in[0] >= s_base && in[0] < s_base + s_count) {
109*ae771770SStanislav Sedov 	unsigned s_index = in[0] - s_base;
110*ae771770SStanislav Sedov 	unsigned t_index;
111*ae771770SStanislav Sedov 
112*ae771770SStanislav Sedov 	if (s_index % t_count != 0)
113*ae771770SStanislav Sedov 	    return 0;
114*ae771770SStanislav Sedov 	if (in[1] < t_base || in[1] >= t_base + t_count)
115*ae771770SStanislav Sedov 	    return 0;
116*ae771770SStanislav Sedov 	t_index = in[1] - t_base;
117*ae771770SStanislav Sedov 	return in[0] + t_index;
118*ae771770SStanislav Sedov     }
119*ae771770SStanislav Sedov     return 0;
120*ae771770SStanislav Sedov }
121*ae771770SStanislav Sedov 
122*ae771770SStanislav Sedov static int
123*ae771770SStanislav Sedov compat_decomp(const uint32_t *in, size_t in_len,
124*ae771770SStanislav Sedov 	      uint32_t *out, size_t *out_len)
125*ae771770SStanislav Sedov {
126*ae771770SStanislav Sedov     unsigned i;
127*ae771770SStanislav Sedov     unsigned o = 0;
128*ae771770SStanislav Sedov 
129*ae771770SStanislav Sedov     for (i = 0; i < in_len; ++i) {
130*ae771770SStanislav Sedov 	struct translation ts = {in[i]};
131*ae771770SStanislav Sedov 	size_t sub_len = *out_len - o;
132*ae771770SStanislav Sedov 	int ret;
133*ae771770SStanislav Sedov 
134*ae771770SStanislav Sedov 	ret = hangul_decomp(in + i, in_len - i,
135*ae771770SStanislav Sedov 			    out + o, &sub_len);
136*ae771770SStanislav Sedov 	if (ret) {
137*ae771770SStanislav Sedov 	    if (ret == WIND_ERR_OVERRUN)
138*ae771770SStanislav Sedov 		return ret;
139*ae771770SStanislav Sedov 	    o += sub_len;
140*ae771770SStanislav Sedov 	} else {
141*ae771770SStanislav Sedov 	    void *s = bsearch(&ts,
142*ae771770SStanislav Sedov 			      _wind_normalize_table,
143*ae771770SStanislav Sedov 			      _wind_normalize_table_size,
144*ae771770SStanislav Sedov 			      sizeof(_wind_normalize_table[0]),
145*ae771770SStanislav Sedov 			      translation_cmp);
146*ae771770SStanislav Sedov 	    if (s != NULL) {
147*ae771770SStanislav Sedov 		const struct translation *t = (const struct translation *)s;
148*ae771770SStanislav Sedov 
149*ae771770SStanislav Sedov 		ret = compat_decomp(_wind_normalize_val_table + t->val_offset,
150*ae771770SStanislav Sedov 				    t->val_len,
151*ae771770SStanislav Sedov 				    out + o, &sub_len);
152*ae771770SStanislav Sedov 		if (ret)
153*ae771770SStanislav Sedov 		    return ret;
154*ae771770SStanislav Sedov 		o += sub_len;
155*ae771770SStanislav Sedov 	    } else {
156*ae771770SStanislav Sedov 		if (o >= *out_len)
157*ae771770SStanislav Sedov 		    return WIND_ERR_OVERRUN;
158*ae771770SStanislav Sedov 		out[o++] = in[i];
159*ae771770SStanislav Sedov 
160*ae771770SStanislav Sedov 	    }
161*ae771770SStanislav Sedov 	}
162*ae771770SStanislav Sedov     }
163*ae771770SStanislav Sedov     *out_len = o;
164*ae771770SStanislav Sedov     return 0;
165*ae771770SStanislav Sedov }
166*ae771770SStanislav Sedov 
167*ae771770SStanislav Sedov static void
168*ae771770SStanislav Sedov swap_char(uint32_t * a, uint32_t * b)
169*ae771770SStanislav Sedov {
170*ae771770SStanislav Sedov     uint32_t t;
171*ae771770SStanislav Sedov     t = *a;
172*ae771770SStanislav Sedov     *a = *b;
173*ae771770SStanislav Sedov     *b = t;
174*ae771770SStanislav Sedov }
175*ae771770SStanislav Sedov 
176*ae771770SStanislav Sedov /* Unicode 5.2.0 D109 Canonical Ordering for a sequence of code points
177*ae771770SStanislav Sedov  * that all have Canonical_Combining_Class > 0 */
178*ae771770SStanislav Sedov static void
179*ae771770SStanislav Sedov canonical_reorder_sequence(uint32_t * a, size_t len)
180*ae771770SStanislav Sedov {
181*ae771770SStanislav Sedov     size_t i, j;
182*ae771770SStanislav Sedov 
183*ae771770SStanislav Sedov     if (len <= 1)
184*ae771770SStanislav Sedov 	return;
185*ae771770SStanislav Sedov 
186*ae771770SStanislav Sedov     for (i = 1; i < len; i++) {
187*ae771770SStanislav Sedov 	for (j = i;
188*ae771770SStanislav Sedov 	     j > 0 &&
189*ae771770SStanislav Sedov 		 _wind_combining_class(a[j]) < _wind_combining_class(a[j-1]);
190*ae771770SStanislav Sedov 	     j--)
191*ae771770SStanislav Sedov 	    swap_char(&a[j], &a[j-1]);
192*ae771770SStanislav Sedov     }
193*ae771770SStanislav Sedov }
194*ae771770SStanislav Sedov 
195*ae771770SStanislav Sedov static void
196*ae771770SStanislav Sedov canonical_reorder(uint32_t *tmp, size_t tmp_len)
197*ae771770SStanislav Sedov {
198*ae771770SStanislav Sedov     size_t i;
199*ae771770SStanislav Sedov 
200*ae771770SStanislav Sedov     for (i = 0; i < tmp_len; ++i) {
201*ae771770SStanislav Sedov 	int cc = _wind_combining_class(tmp[i]);
202*ae771770SStanislav Sedov 	if (cc) {
203*ae771770SStanislav Sedov 	    size_t j;
204*ae771770SStanislav Sedov 	    for (j = i + 1;
205*ae771770SStanislav Sedov 		 j < tmp_len && _wind_combining_class(tmp[j]);
206*ae771770SStanislav Sedov 		 ++j)
207*ae771770SStanislav Sedov 		;
208*ae771770SStanislav Sedov 	    canonical_reorder_sequence(&tmp[i], j - i);
209*ae771770SStanislav Sedov 	    i = j;
210*ae771770SStanislav Sedov 	}
211*ae771770SStanislav Sedov     }
212*ae771770SStanislav Sedov }
213*ae771770SStanislav Sedov 
214*ae771770SStanislav Sedov static uint32_t
215*ae771770SStanislav Sedov find_composition(const uint32_t *in, unsigned in_len)
216*ae771770SStanislav Sedov {
217*ae771770SStanislav Sedov     unsigned short canon_index = 0;
218*ae771770SStanislav Sedov     uint32_t cur;
219*ae771770SStanislav Sedov     unsigned n = 0;
220*ae771770SStanislav Sedov 
221*ae771770SStanislav Sedov     cur = hangul_composition(in, in_len);
222*ae771770SStanislav Sedov     if (cur)
223*ae771770SStanislav Sedov 	return cur;
224*ae771770SStanislav Sedov 
225*ae771770SStanislav Sedov     do {
226*ae771770SStanislav Sedov 	const struct canon_node *c = &_wind_canon_table[canon_index];
227*ae771770SStanislav Sedov 	unsigned i;
228*ae771770SStanislav Sedov 
229*ae771770SStanislav Sedov 	if (n % 5 == 0) {
230*ae771770SStanislav Sedov 	    cur = *in++;
231*ae771770SStanislav Sedov 	    if (in_len-- == 0)
232*ae771770SStanislav Sedov 		return c->val;
233*ae771770SStanislav Sedov 	}
234*ae771770SStanislav Sedov 
235*ae771770SStanislav Sedov 	i = cur >> 16;
236*ae771770SStanislav Sedov 	if (i < c->next_start || i >= c->next_end)
237*ae771770SStanislav Sedov 	    canon_index = 0;
238*ae771770SStanislav Sedov 	else
239*ae771770SStanislav Sedov 	    canon_index =
240*ae771770SStanislav Sedov 		_wind_canon_next_table[c->next_offset + i - c->next_start];
241*ae771770SStanislav Sedov 	if (canon_index != 0) {
242*ae771770SStanislav Sedov 	    cur = (cur << 4) & 0xFFFFF;
243*ae771770SStanislav Sedov 	    ++n;
244*ae771770SStanislav Sedov 	}
245*ae771770SStanislav Sedov     } while (canon_index != 0);
246*ae771770SStanislav Sedov     return 0;
247*ae771770SStanislav Sedov }
248*ae771770SStanislav Sedov 
249*ae771770SStanislav Sedov static int
250*ae771770SStanislav Sedov combine(const uint32_t *in, size_t in_len,
251*ae771770SStanislav Sedov 	uint32_t *out, size_t *out_len)
252*ae771770SStanislav Sedov {
253*ae771770SStanislav Sedov     unsigned i;
254*ae771770SStanislav Sedov     int ostarter;
255*ae771770SStanislav Sedov     unsigned o = 0;
256*ae771770SStanislav Sedov     int old_cc;
257*ae771770SStanislav Sedov 
258*ae771770SStanislav Sedov     for (i = 0; i < in_len;) {
259*ae771770SStanislav Sedov 	while (i < in_len && _wind_combining_class(in[i]) != 0) {
260*ae771770SStanislav Sedov 	    out[o++] = in[i++];
261*ae771770SStanislav Sedov 	}
262*ae771770SStanislav Sedov 	if (i < in_len) {
263*ae771770SStanislav Sedov 	    if (o >= *out_len)
264*ae771770SStanislav Sedov 		return WIND_ERR_OVERRUN;
265*ae771770SStanislav Sedov 	    ostarter = o;
266*ae771770SStanislav Sedov 	    out[o++] = in[i++];
267*ae771770SStanislav Sedov 	    old_cc   = -1;
268*ae771770SStanislav Sedov 
269*ae771770SStanislav Sedov 	    while (i < in_len) {
270*ae771770SStanislav Sedov 		uint32_t comb;
271*ae771770SStanislav Sedov 		uint32_t v[2];
272*ae771770SStanislav Sedov 		int cc;
273*ae771770SStanislav Sedov 
274*ae771770SStanislav Sedov 		v[0] = out[ostarter];
275*ae771770SStanislav Sedov 		v[1] = in[i];
276*ae771770SStanislav Sedov 
277*ae771770SStanislav Sedov 		cc = _wind_combining_class(in[i]);
278*ae771770SStanislav Sedov 		if (old_cc != cc && (comb = find_composition(v, 2))) {
279*ae771770SStanislav Sedov 		    out[ostarter] = comb;
280*ae771770SStanislav Sedov 		} else if (cc == 0) {
281*ae771770SStanislav Sedov 		    break;
282*ae771770SStanislav Sedov 		} else {
283*ae771770SStanislav Sedov 		    if (o >= *out_len)
284*ae771770SStanislav Sedov 			return WIND_ERR_OVERRUN;
285*ae771770SStanislav Sedov 		    out[o++] = in[i];
286*ae771770SStanislav Sedov 		    old_cc   = cc;
287*ae771770SStanislav Sedov 		}
288*ae771770SStanislav Sedov 		++i;
289*ae771770SStanislav Sedov 	    }
290*ae771770SStanislav Sedov 	}
291*ae771770SStanislav Sedov     }
292*ae771770SStanislav Sedov     *out_len = o;
293*ae771770SStanislav Sedov     return 0;
294*ae771770SStanislav Sedov }
295*ae771770SStanislav Sedov 
296*ae771770SStanislav Sedov int
297*ae771770SStanislav Sedov _wind_stringprep_normalize(const uint32_t *in, size_t in_len,
298*ae771770SStanislav Sedov 			   uint32_t *out, size_t *out_len)
299*ae771770SStanislav Sedov {
300*ae771770SStanislav Sedov     size_t tmp_len;
301*ae771770SStanislav Sedov     uint32_t *tmp;
302*ae771770SStanislav Sedov     int ret;
303*ae771770SStanislav Sedov 
304*ae771770SStanislav Sedov     if (in_len == 0) {
305*ae771770SStanislav Sedov 	*out_len = 0;
306*ae771770SStanislav Sedov 	return 0;
307*ae771770SStanislav Sedov     }
308*ae771770SStanislav Sedov 
309*ae771770SStanislav Sedov     tmp_len = in_len * 4;
310*ae771770SStanislav Sedov     if (tmp_len < MAX_LENGTH_CANON)
311*ae771770SStanislav Sedov 	tmp_len = MAX_LENGTH_CANON;
312*ae771770SStanislav Sedov     tmp = malloc(tmp_len * sizeof(uint32_t));
313*ae771770SStanislav Sedov     if (tmp == NULL)
314*ae771770SStanislav Sedov 	return ENOMEM;
315*ae771770SStanislav Sedov 
316*ae771770SStanislav Sedov     ret = compat_decomp(in, in_len, tmp, &tmp_len);
317*ae771770SStanislav Sedov     if (ret) {
318*ae771770SStanislav Sedov 	free(tmp);
319*ae771770SStanislav Sedov 	return ret;
320*ae771770SStanislav Sedov     }
321*ae771770SStanislav Sedov     canonical_reorder(tmp, tmp_len);
322*ae771770SStanislav Sedov     ret = combine(tmp, tmp_len, out, out_len);
323*ae771770SStanislav Sedov     free(tmp);
324*ae771770SStanislav Sedov     return ret;
325*ae771770SStanislav Sedov }
326