xref: /freebsd/crypto/heimdal/lib/wind/normalize.c (revision 2e3507c25e42292b45a5482e116d278f5515d04d)
1 /*
2  * Copyright (c) 2004 Kungliga Tekniska Högskolan
3  * (Royal Institute of Technology, Stockholm, Sweden).
4  * All rights reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  *
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  *
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * 3. Neither the name of the Institute nor the names of its contributors
18  *    may be used to endorse or promote products derived from this software
19  *    without specific prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE INSTITUTE AND CONTRIBUTORS ``AS IS'' AND
22  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24  * ARE DISCLAIMED.  IN NO EVENT SHALL THE INSTITUTE OR CONTRIBUTORS BE LIABLE
25  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
27  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31  * SUCH DAMAGE.
32  */
33 
34 #ifdef HAVE_CONFIG_H
35 #include <config.h>
36 #endif
37 #include "windlocl.h"
38 
39 #include <assert.h>
40 #include <stdlib.h>
41 #include <errno.h>
42 #include <stdio.h>
43 
44 #include "roken.h"
45 
46 #include "normalize_table.h"
47 
48 static int
49 translation_cmp(const void *key, const void *data)
50 {
51     const struct translation *t1 = (const struct translation *)key;
52     const struct translation *t2 = (const struct translation *)data;
53 
54     return t1->key - t2->key;
55 }
56 
57 enum { s_base  = 0xAC00};
58 enum { s_count = 11172};
59 enum { l_base  = 0x1100};
60 enum { l_count = 19};
61 enum { v_base  = 0x1161};
62 enum { v_count = 21};
63 enum { t_base  = 0x11A7};
64 enum { t_count = 28};
65 enum { n_count = v_count * t_count};
66 
67 static int
68 hangul_decomp(const uint32_t *in, size_t in_len,
69 	      uint32_t *out, size_t *out_len)
70 {
71     uint32_t u = *in;
72     unsigned s_index;
73     unsigned l, v, t;
74     unsigned o;
75 
76     if (u < s_base || u >= s_base + s_count)
77 	return 0;
78     s_index = u - s_base;
79     l = l_base + s_index / n_count;
80     v = v_base + (s_index % n_count) / t_count;
81     t = t_base + s_index % t_count;
82     o = 2;
83     if (t != t_base)
84 	++o;
85     if (*out_len < o)
86 	return WIND_ERR_OVERRUN;
87     out[0] = l;
88     out[1] = v;
89     if (t != t_base)
90 	out[2] = t;
91     *out_len = o;
92     return 1;
93 }
94 
95 static uint32_t
96 hangul_composition(const uint32_t *in, size_t in_len)
97 {
98     if (in_len < 2)
99 	return 0;
100     if (in[0] >= l_base && in[0] < l_base + l_count) {
101 	unsigned l_index = in[0] - l_base;
102 	unsigned v_index;
103 
104 	if (in[1] < v_base || in[1] >= v_base + v_count)
105 	    return 0;
106 	v_index = in[1] - v_base;
107 	return (l_index * v_count + v_index) * t_count + s_base;
108     } else if (in[0] >= s_base && in[0] < s_base + s_count) {
109 	unsigned s_index = in[0] - s_base;
110 	unsigned t_index;
111 
112 	if (s_index % t_count != 0)
113 	    return 0;
114 	if (in[1] < t_base || in[1] >= t_base + t_count)
115 	    return 0;
116 	t_index = in[1] - t_base;
117 	return in[0] + t_index;
118     }
119     return 0;
120 }
121 
122 static int
123 compat_decomp(const uint32_t *in, size_t in_len,
124 	      uint32_t *out, size_t *out_len)
125 {
126     unsigned i;
127     unsigned o = 0;
128 
129     for (i = 0; i < in_len; ++i) {
130 	struct translation ts = {in[i]};
131 	size_t sub_len = *out_len - o;
132 	int ret;
133 
134 	ret = hangul_decomp(in + i, in_len - i,
135 			    out + o, &sub_len);
136 	if (ret) {
137 	    if (ret == WIND_ERR_OVERRUN)
138 		return ret;
139 	    o += sub_len;
140 	} else {
141 	    void *s = bsearch(&ts,
142 			      _wind_normalize_table,
143 			      _wind_normalize_table_size,
144 			      sizeof(_wind_normalize_table[0]),
145 			      translation_cmp);
146 	    if (s != NULL) {
147 		const struct translation *t = (const struct translation *)s;
148 
149 		ret = compat_decomp(_wind_normalize_val_table + t->val_offset,
150 				    t->val_len,
151 				    out + o, &sub_len);
152 		if (ret)
153 		    return ret;
154 		o += sub_len;
155 	    } else {
156 		if (o >= *out_len)
157 		    return WIND_ERR_OVERRUN;
158 		out[o++] = in[i];
159 
160 	    }
161 	}
162     }
163     *out_len = o;
164     return 0;
165 }
166 
167 static void
168 swap_char(uint32_t * a, uint32_t * b)
169 {
170     uint32_t t;
171     t = *a;
172     *a = *b;
173     *b = t;
174 }
175 
176 /* Unicode 5.2.0 D109 Canonical Ordering for a sequence of code points
177  * that all have Canonical_Combining_Class > 0 */
178 static void
179 canonical_reorder_sequence(uint32_t * a, size_t len)
180 {
181     size_t i, j;
182 
183     if (len <= 1)
184 	return;
185 
186     for (i = 1; i < len; i++) {
187 	for (j = i;
188 	     j > 0 &&
189 		 _wind_combining_class(a[j]) < _wind_combining_class(a[j-1]);
190 	     j--)
191 	    swap_char(&a[j], &a[j-1]);
192     }
193 }
194 
195 static void
196 canonical_reorder(uint32_t *tmp, size_t tmp_len)
197 {
198     size_t i;
199 
200     for (i = 0; i < tmp_len; ++i) {
201 	int cc = _wind_combining_class(tmp[i]);
202 	if (cc) {
203 	    size_t j;
204 	    for (j = i + 1;
205 		 j < tmp_len && _wind_combining_class(tmp[j]);
206 		 ++j)
207 		;
208 	    canonical_reorder_sequence(&tmp[i], j - i);
209 	    i = j;
210 	}
211     }
212 }
213 
214 static uint32_t
215 find_composition(const uint32_t *in, unsigned in_len)
216 {
217     unsigned short canon_index = 0;
218     uint32_t cur;
219     unsigned n = 0;
220 
221     cur = hangul_composition(in, in_len);
222     if (cur)
223 	return cur;
224 
225     do {
226 	const struct canon_node *c = &_wind_canon_table[canon_index];
227 	unsigned i;
228 
229 	if (n % 5 == 0) {
230 	    if (in_len-- == 0)
231 		return c->val;
232 	    cur = *in++;
233 	}
234 
235 	i = cur >> 16;
236 	if (i < c->next_start || i >= c->next_end)
237 	    canon_index = 0;
238 	else
239 	    canon_index =
240 		_wind_canon_next_table[c->next_offset + i - c->next_start];
241 	if (canon_index != 0) {
242 	    cur = (cur << 4) & 0xFFFFF;
243 	    ++n;
244 	}
245     } while (canon_index != 0);
246     return 0;
247 }
248 
249 static int
250 combine(const uint32_t *in, size_t in_len,
251 	uint32_t *out, size_t *out_len)
252 {
253     unsigned i;
254     int ostarter;
255     unsigned o = 0;
256     int old_cc;
257 
258     for (i = 0; i < in_len;) {
259 	while (i < in_len && _wind_combining_class(in[i]) != 0) {
260 	    out[o++] = in[i++];
261 	}
262 	if (i < in_len) {
263 	    if (o >= *out_len)
264 		return WIND_ERR_OVERRUN;
265 	    ostarter = o;
266 	    out[o++] = in[i++];
267 	    old_cc   = -1;
268 
269 	    while (i < in_len) {
270 		uint32_t comb;
271 		uint32_t v[2];
272 		int cc;
273 
274 		v[0] = out[ostarter];
275 		v[1] = in[i];
276 
277 		cc = _wind_combining_class(in[i]);
278 		if (old_cc != cc && (comb = find_composition(v, 2))) {
279 		    out[ostarter] = comb;
280 		} else if (cc == 0) {
281 		    break;
282 		} else {
283 		    if (o >= *out_len)
284 			return WIND_ERR_OVERRUN;
285 		    out[o++] = in[i];
286 		    old_cc   = cc;
287 		}
288 		++i;
289 	    }
290 	}
291     }
292     *out_len = o;
293     return 0;
294 }
295 
296 int
297 _wind_stringprep_normalize(const uint32_t *in, size_t in_len,
298 			   uint32_t *out, size_t *out_len)
299 {
300     size_t tmp_len;
301     uint32_t *tmp;
302     int ret;
303 
304     if (in_len == 0) {
305 	*out_len = 0;
306 	return 0;
307     }
308 
309     tmp_len = in_len * 4;
310     if (tmp_len < MAX_LENGTH_CANON)
311 	tmp_len = MAX_LENGTH_CANON;
312     tmp = malloc(tmp_len * sizeof(uint32_t));
313     if (tmp == NULL)
314 	return ENOMEM;
315 
316     ret = compat_decomp(in, in_len, tmp, &tmp_len);
317     if (ret) {
318 	free(tmp);
319 	return ret;
320     }
321     canonical_reorder(tmp, tmp_len);
322     ret = combine(tmp, tmp_len, out, out_len);
323     free(tmp);
324     return ret;
325 }
326