xref: /freebsd/crypto/heimdal/lib/wind/normalize.c (revision ed549cb0c53f8438c52593ce811f6fcc812248e9)
1ae771770SStanislav Sedov /*
2ae771770SStanislav Sedov  * Copyright (c) 2004 Kungliga Tekniska Högskolan
3ae771770SStanislav Sedov  * (Royal Institute of Technology, Stockholm, Sweden).
4ae771770SStanislav Sedov  * All rights reserved.
5ae771770SStanislav Sedov  *
6ae771770SStanislav Sedov  * Redistribution and use in source and binary forms, with or without
7ae771770SStanislav Sedov  * modification, are permitted provided that the following conditions
8ae771770SStanislav Sedov  * are met:
9ae771770SStanislav Sedov  *
10ae771770SStanislav Sedov  * 1. Redistributions of source code must retain the above copyright
11ae771770SStanislav Sedov  *    notice, this list of conditions and the following disclaimer.
12ae771770SStanislav Sedov  *
13ae771770SStanislav Sedov  * 2. Redistributions in binary form must reproduce the above copyright
14ae771770SStanislav Sedov  *    notice, this list of conditions and the following disclaimer in the
15ae771770SStanislav Sedov  *    documentation and/or other materials provided with the distribution.
16ae771770SStanislav Sedov  *
17ae771770SStanislav Sedov  * 3. Neither the name of the Institute nor the names of its contributors
18ae771770SStanislav Sedov  *    may be used to endorse or promote products derived from this software
19ae771770SStanislav Sedov  *    without specific prior written permission.
20ae771770SStanislav Sedov  *
21ae771770SStanislav Sedov  * THIS SOFTWARE IS PROVIDED BY THE INSTITUTE AND CONTRIBUTORS ``AS IS'' AND
22ae771770SStanislav Sedov  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23ae771770SStanislav Sedov  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24ae771770SStanislav Sedov  * ARE DISCLAIMED.  IN NO EVENT SHALL THE INSTITUTE OR CONTRIBUTORS BE LIABLE
25ae771770SStanislav Sedov  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26ae771770SStanislav Sedov  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
27ae771770SStanislav Sedov  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28ae771770SStanislav Sedov  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29ae771770SStanislav Sedov  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30ae771770SStanislav Sedov  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31ae771770SStanislav Sedov  * SUCH DAMAGE.
32ae771770SStanislav Sedov  */
33ae771770SStanislav Sedov 
34ae771770SStanislav Sedov #ifdef HAVE_CONFIG_H
35ae771770SStanislav Sedov #include <config.h>
36ae771770SStanislav Sedov #endif
37ae771770SStanislav Sedov #include "windlocl.h"
38ae771770SStanislav Sedov 
39ae771770SStanislav Sedov #include <assert.h>
40ae771770SStanislav Sedov #include <stdlib.h>
41ae771770SStanislav Sedov #include <errno.h>
42ae771770SStanislav Sedov #include <stdio.h>
43ae771770SStanislav Sedov 
44ae771770SStanislav Sedov #include "roken.h"
45ae771770SStanislav Sedov 
46ae771770SStanislav Sedov #include "normalize_table.h"
47ae771770SStanislav Sedov 
48ae771770SStanislav Sedov static int
translation_cmp(const void * key,const void * data)49ae771770SStanislav Sedov translation_cmp(const void *key, const void *data)
50ae771770SStanislav Sedov {
51ae771770SStanislav Sedov     const struct translation *t1 = (const struct translation *)key;
52ae771770SStanislav Sedov     const struct translation *t2 = (const struct translation *)data;
53ae771770SStanislav Sedov 
54ae771770SStanislav Sedov     return t1->key - t2->key;
55ae771770SStanislav Sedov }
56ae771770SStanislav Sedov 
57ae771770SStanislav Sedov enum { s_base  = 0xAC00};
58ae771770SStanislav Sedov enum { s_count = 11172};
59ae771770SStanislav Sedov enum { l_base  = 0x1100};
60ae771770SStanislav Sedov enum { l_count = 19};
61ae771770SStanislav Sedov enum { v_base  = 0x1161};
62ae771770SStanislav Sedov enum { v_count = 21};
63ae771770SStanislav Sedov enum { t_base  = 0x11A7};
64ae771770SStanislav Sedov enum { t_count = 28};
65ae771770SStanislav Sedov enum { n_count = v_count * t_count};
66ae771770SStanislav Sedov 
67ae771770SStanislav Sedov static int
hangul_decomp(const uint32_t * in,size_t in_len,uint32_t * out,size_t * out_len)68ae771770SStanislav Sedov hangul_decomp(const uint32_t *in, size_t in_len,
69ae771770SStanislav Sedov 	      uint32_t *out, size_t *out_len)
70ae771770SStanislav Sedov {
71ae771770SStanislav Sedov     uint32_t u = *in;
72ae771770SStanislav Sedov     unsigned s_index;
73ae771770SStanislav Sedov     unsigned l, v, t;
74ae771770SStanislav Sedov     unsigned o;
75ae771770SStanislav Sedov 
76ae771770SStanislav Sedov     if (u < s_base || u >= s_base + s_count)
77ae771770SStanislav Sedov 	return 0;
78ae771770SStanislav Sedov     s_index = u - s_base;
79ae771770SStanislav Sedov     l = l_base + s_index / n_count;
80ae771770SStanislav Sedov     v = v_base + (s_index % n_count) / t_count;
81ae771770SStanislav Sedov     t = t_base + s_index % t_count;
82ae771770SStanislav Sedov     o = 2;
83ae771770SStanislav Sedov     if (t != t_base)
84ae771770SStanislav Sedov 	++o;
85ae771770SStanislav Sedov     if (*out_len < o)
86ae771770SStanislav Sedov 	return WIND_ERR_OVERRUN;
87ae771770SStanislav Sedov     out[0] = l;
88ae771770SStanislav Sedov     out[1] = v;
89ae771770SStanislav Sedov     if (t != t_base)
90ae771770SStanislav Sedov 	out[2] = t;
91ae771770SStanislav Sedov     *out_len = o;
92ae771770SStanislav Sedov     return 1;
93ae771770SStanislav Sedov }
94ae771770SStanislav Sedov 
95ae771770SStanislav Sedov static uint32_t
hangul_composition(const uint32_t * in,size_t in_len)96ae771770SStanislav Sedov hangul_composition(const uint32_t *in, size_t in_len)
97ae771770SStanislav Sedov {
98ae771770SStanislav Sedov     if (in_len < 2)
99ae771770SStanislav Sedov 	return 0;
100ae771770SStanislav Sedov     if (in[0] >= l_base && in[0] < l_base + l_count) {
101ae771770SStanislav Sedov 	unsigned l_index = in[0] - l_base;
102ae771770SStanislav Sedov 	unsigned v_index;
103ae771770SStanislav Sedov 
104ae771770SStanislav Sedov 	if (in[1] < v_base || in[1] >= v_base + v_count)
105ae771770SStanislav Sedov 	    return 0;
106ae771770SStanislav Sedov 	v_index = in[1] - v_base;
107ae771770SStanislav Sedov 	return (l_index * v_count + v_index) * t_count + s_base;
108ae771770SStanislav Sedov     } else if (in[0] >= s_base && in[0] < s_base + s_count) {
109ae771770SStanislav Sedov 	unsigned s_index = in[0] - s_base;
110ae771770SStanislav Sedov 	unsigned t_index;
111ae771770SStanislav Sedov 
112ae771770SStanislav Sedov 	if (s_index % t_count != 0)
113ae771770SStanislav Sedov 	    return 0;
114ae771770SStanislav Sedov 	if (in[1] < t_base || in[1] >= t_base + t_count)
115ae771770SStanislav Sedov 	    return 0;
116ae771770SStanislav Sedov 	t_index = in[1] - t_base;
117ae771770SStanislav Sedov 	return in[0] + t_index;
118ae771770SStanislav Sedov     }
119ae771770SStanislav Sedov     return 0;
120ae771770SStanislav Sedov }
121ae771770SStanislav Sedov 
122ae771770SStanislav Sedov static int
compat_decomp(const uint32_t * in,size_t in_len,uint32_t * out,size_t * out_len)123ae771770SStanislav Sedov compat_decomp(const uint32_t *in, size_t in_len,
124ae771770SStanislav Sedov 	      uint32_t *out, size_t *out_len)
125ae771770SStanislav Sedov {
126ae771770SStanislav Sedov     unsigned i;
127ae771770SStanislav Sedov     unsigned o = 0;
128ae771770SStanislav Sedov 
129ae771770SStanislav Sedov     for (i = 0; i < in_len; ++i) {
130ae771770SStanislav Sedov 	struct translation ts = {in[i]};
131ae771770SStanislav Sedov 	size_t sub_len = *out_len - o;
132ae771770SStanislav Sedov 	int ret;
133ae771770SStanislav Sedov 
134ae771770SStanislav Sedov 	ret = hangul_decomp(in + i, in_len - i,
135ae771770SStanislav Sedov 			    out + o, &sub_len);
136ae771770SStanislav Sedov 	if (ret) {
137ae771770SStanislav Sedov 	    if (ret == WIND_ERR_OVERRUN)
138ae771770SStanislav Sedov 		return ret;
139ae771770SStanislav Sedov 	    o += sub_len;
140ae771770SStanislav Sedov 	} else {
141ae771770SStanislav Sedov 	    void *s = bsearch(&ts,
142ae771770SStanislav Sedov 			      _wind_normalize_table,
143ae771770SStanislav Sedov 			      _wind_normalize_table_size,
144ae771770SStanislav Sedov 			      sizeof(_wind_normalize_table[0]),
145ae771770SStanislav Sedov 			      translation_cmp);
146ae771770SStanislav Sedov 	    if (s != NULL) {
147ae771770SStanislav Sedov 		const struct translation *t = (const struct translation *)s;
148ae771770SStanislav Sedov 
149ae771770SStanislav Sedov 		ret = compat_decomp(_wind_normalize_val_table + t->val_offset,
150ae771770SStanislav Sedov 				    t->val_len,
151ae771770SStanislav Sedov 				    out + o, &sub_len);
152ae771770SStanislav Sedov 		if (ret)
153ae771770SStanislav Sedov 		    return ret;
154ae771770SStanislav Sedov 		o += sub_len;
155ae771770SStanislav Sedov 	    } else {
156ae771770SStanislav Sedov 		if (o >= *out_len)
157ae771770SStanislav Sedov 		    return WIND_ERR_OVERRUN;
158ae771770SStanislav Sedov 		out[o++] = in[i];
159ae771770SStanislav Sedov 
160ae771770SStanislav Sedov 	    }
161ae771770SStanislav Sedov 	}
162ae771770SStanislav Sedov     }
163ae771770SStanislav Sedov     *out_len = o;
164ae771770SStanislav Sedov     return 0;
165ae771770SStanislav Sedov }
166ae771770SStanislav Sedov 
167ae771770SStanislav Sedov static void
swap_char(uint32_t * a,uint32_t * b)168ae771770SStanislav Sedov swap_char(uint32_t * a, uint32_t * b)
169ae771770SStanislav Sedov {
170ae771770SStanislav Sedov     uint32_t t;
171ae771770SStanislav Sedov     t = *a;
172ae771770SStanislav Sedov     *a = *b;
173ae771770SStanislav Sedov     *b = t;
174ae771770SStanislav Sedov }
175ae771770SStanislav Sedov 
176ae771770SStanislav Sedov /* Unicode 5.2.0 D109 Canonical Ordering for a sequence of code points
177ae771770SStanislav Sedov  * that all have Canonical_Combining_Class > 0 */
178ae771770SStanislav Sedov static void
canonical_reorder_sequence(uint32_t * a,size_t len)179ae771770SStanislav Sedov canonical_reorder_sequence(uint32_t * a, size_t len)
180ae771770SStanislav Sedov {
181ae771770SStanislav Sedov     size_t i, j;
182ae771770SStanislav Sedov 
183ae771770SStanislav Sedov     if (len <= 1)
184ae771770SStanislav Sedov 	return;
185ae771770SStanislav Sedov 
186ae771770SStanislav Sedov     for (i = 1; i < len; i++) {
187ae771770SStanislav Sedov 	for (j = i;
188ae771770SStanislav Sedov 	     j > 0 &&
189ae771770SStanislav Sedov 		 _wind_combining_class(a[j]) < _wind_combining_class(a[j-1]);
190ae771770SStanislav Sedov 	     j--)
191ae771770SStanislav Sedov 	    swap_char(&a[j], &a[j-1]);
192ae771770SStanislav Sedov     }
193ae771770SStanislav Sedov }
194ae771770SStanislav Sedov 
195ae771770SStanislav Sedov static void
canonical_reorder(uint32_t * tmp,size_t tmp_len)196ae771770SStanislav Sedov canonical_reorder(uint32_t *tmp, size_t tmp_len)
197ae771770SStanislav Sedov {
198ae771770SStanislav Sedov     size_t i;
199ae771770SStanislav Sedov 
200ae771770SStanislav Sedov     for (i = 0; i < tmp_len; ++i) {
201ae771770SStanislav Sedov 	int cc = _wind_combining_class(tmp[i]);
202ae771770SStanislav Sedov 	if (cc) {
203ae771770SStanislav Sedov 	    size_t j;
204ae771770SStanislav Sedov 	    for (j = i + 1;
205ae771770SStanislav Sedov 		 j < tmp_len && _wind_combining_class(tmp[j]);
206ae771770SStanislav Sedov 		 ++j)
207ae771770SStanislav Sedov 		;
208ae771770SStanislav Sedov 	    canonical_reorder_sequence(&tmp[i], j - i);
209ae771770SStanislav Sedov 	    i = j;
210ae771770SStanislav Sedov 	}
211ae771770SStanislav Sedov     }
212ae771770SStanislav Sedov }
213ae771770SStanislav Sedov 
214ae771770SStanislav Sedov static uint32_t
find_composition(const uint32_t * in,unsigned in_len)215ae771770SStanislav Sedov find_composition(const uint32_t *in, unsigned in_len)
216ae771770SStanislav Sedov {
217ae771770SStanislav Sedov     unsigned short canon_index = 0;
218ae771770SStanislav Sedov     uint32_t cur;
219ae771770SStanislav Sedov     unsigned n = 0;
220ae771770SStanislav Sedov 
221ae771770SStanislav Sedov     cur = hangul_composition(in, in_len);
222ae771770SStanislav Sedov     if (cur)
223ae771770SStanislav Sedov 	return cur;
224ae771770SStanislav Sedov 
225ae771770SStanislav Sedov     do {
226ae771770SStanislav Sedov 	const struct canon_node *c = &_wind_canon_table[canon_index];
227ae771770SStanislav Sedov 	unsigned i;
228ae771770SStanislav Sedov 
229ae771770SStanislav Sedov 	if (n % 5 == 0) {
230ae771770SStanislav Sedov 	    if (in_len-- == 0)
231ae771770SStanislav Sedov 		return c->val;
232*ed549cb0SCy Schubert 	    cur = *in++;
233ae771770SStanislav Sedov 	}
234ae771770SStanislav Sedov 
235ae771770SStanislav Sedov 	i = cur >> 16;
236ae771770SStanislav Sedov 	if (i < c->next_start || i >= c->next_end)
237ae771770SStanislav Sedov 	    canon_index = 0;
238ae771770SStanislav Sedov 	else
239ae771770SStanislav Sedov 	    canon_index =
240ae771770SStanislav Sedov 		_wind_canon_next_table[c->next_offset + i - c->next_start];
241ae771770SStanislav Sedov 	if (canon_index != 0) {
242ae771770SStanislav Sedov 	    cur = (cur << 4) & 0xFFFFF;
243ae771770SStanislav Sedov 	    ++n;
244ae771770SStanislav Sedov 	}
245ae771770SStanislav Sedov     } while (canon_index != 0);
246ae771770SStanislav Sedov     return 0;
247ae771770SStanislav Sedov }
248ae771770SStanislav Sedov 
249ae771770SStanislav Sedov static int
combine(const uint32_t * in,size_t in_len,uint32_t * out,size_t * out_len)250ae771770SStanislav Sedov combine(const uint32_t *in, size_t in_len,
251ae771770SStanislav Sedov 	uint32_t *out, size_t *out_len)
252ae771770SStanislav Sedov {
253ae771770SStanislav Sedov     unsigned i;
254ae771770SStanislav Sedov     int ostarter;
255ae771770SStanislav Sedov     unsigned o = 0;
256ae771770SStanislav Sedov     int old_cc;
257ae771770SStanislav Sedov 
258ae771770SStanislav Sedov     for (i = 0; i < in_len;) {
259ae771770SStanislav Sedov 	while (i < in_len && _wind_combining_class(in[i]) != 0) {
260ae771770SStanislav Sedov 	    out[o++] = in[i++];
261ae771770SStanislav Sedov 	}
262ae771770SStanislav Sedov 	if (i < in_len) {
263ae771770SStanislav Sedov 	    if (o >= *out_len)
264ae771770SStanislav Sedov 		return WIND_ERR_OVERRUN;
265ae771770SStanislav Sedov 	    ostarter = o;
266ae771770SStanislav Sedov 	    out[o++] = in[i++];
267ae771770SStanislav Sedov 	    old_cc   = -1;
268ae771770SStanislav Sedov 
269ae771770SStanislav Sedov 	    while (i < in_len) {
270ae771770SStanislav Sedov 		uint32_t comb;
271ae771770SStanislav Sedov 		uint32_t v[2];
272ae771770SStanislav Sedov 		int cc;
273ae771770SStanislav Sedov 
274ae771770SStanislav Sedov 		v[0] = out[ostarter];
275ae771770SStanislav Sedov 		v[1] = in[i];
276ae771770SStanislav Sedov 
277ae771770SStanislav Sedov 		cc = _wind_combining_class(in[i]);
278ae771770SStanislav Sedov 		if (old_cc != cc && (comb = find_composition(v, 2))) {
279ae771770SStanislav Sedov 		    out[ostarter] = comb;
280ae771770SStanislav Sedov 		} else if (cc == 0) {
281ae771770SStanislav Sedov 		    break;
282ae771770SStanislav Sedov 		} else {
283ae771770SStanislav Sedov 		    if (o >= *out_len)
284ae771770SStanislav Sedov 			return WIND_ERR_OVERRUN;
285ae771770SStanislav Sedov 		    out[o++] = in[i];
286ae771770SStanislav Sedov 		    old_cc   = cc;
287ae771770SStanislav Sedov 		}
288ae771770SStanislav Sedov 		++i;
289ae771770SStanislav Sedov 	    }
290ae771770SStanislav Sedov 	}
291ae771770SStanislav Sedov     }
292ae771770SStanislav Sedov     *out_len = o;
293ae771770SStanislav Sedov     return 0;
294ae771770SStanislav Sedov }
295ae771770SStanislav Sedov 
296ae771770SStanislav Sedov int
_wind_stringprep_normalize(const uint32_t * in,size_t in_len,uint32_t * out,size_t * out_len)297ae771770SStanislav Sedov _wind_stringprep_normalize(const uint32_t *in, size_t in_len,
298ae771770SStanislav Sedov 			   uint32_t *out, size_t *out_len)
299ae771770SStanislav Sedov {
300ae771770SStanislav Sedov     size_t tmp_len;
301ae771770SStanislav Sedov     uint32_t *tmp;
302ae771770SStanislav Sedov     int ret;
303ae771770SStanislav Sedov 
304ae771770SStanislav Sedov     if (in_len == 0) {
305ae771770SStanislav Sedov 	*out_len = 0;
306ae771770SStanislav Sedov 	return 0;
307ae771770SStanislav Sedov     }
308ae771770SStanislav Sedov 
309ae771770SStanislav Sedov     tmp_len = in_len * 4;
310ae771770SStanislav Sedov     if (tmp_len < MAX_LENGTH_CANON)
311ae771770SStanislav Sedov 	tmp_len = MAX_LENGTH_CANON;
312ae771770SStanislav Sedov     tmp = malloc(tmp_len * sizeof(uint32_t));
313ae771770SStanislav Sedov     if (tmp == NULL)
314ae771770SStanislav Sedov 	return ENOMEM;
315ae771770SStanislav Sedov 
316ae771770SStanislav Sedov     ret = compat_decomp(in, in_len, tmp, &tmp_len);
317ae771770SStanislav Sedov     if (ret) {
318ae771770SStanislav Sedov 	free(tmp);
319ae771770SStanislav Sedov 	return ret;
320ae771770SStanislav Sedov     }
321ae771770SStanislav Sedov     canonical_reorder(tmp, tmp_len);
322ae771770SStanislav Sedov     ret = combine(tmp, tmp_len, out, out_len);
323ae771770SStanislav Sedov     free(tmp);
324ae771770SStanislav Sedov     return ret;
325ae771770SStanislav Sedov }
326