1 /* 2 * Copyright (c) 2004 Kungliga Tekniska Högskolan 3 * (Royal Institute of Technology, Stockholm, Sweden). 4 * All rights reserved. 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions 8 * are met: 9 * 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 17 * 3. Neither the name of the Institute nor the names of its contributors 18 * may be used to endorse or promote products derived from this software 19 * without specific prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE INSTITUTE AND CONTRIBUTORS ``AS IS'' AND 22 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 24 * ARE DISCLAIMED. IN NO EVENT SHALL THE INSTITUTE OR CONTRIBUTORS BE LIABLE 25 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 26 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 27 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 28 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 29 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 30 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 31 * SUCH DAMAGE. 32 */ 33 34 #ifdef HAVE_CONFIG_H 35 #include <config.h> 36 #endif 37 #include "windlocl.h" 38 39 #include <assert.h> 40 #include <stdlib.h> 41 #include <errno.h> 42 #include <stdio.h> 43 44 #include "roken.h" 45 46 #include "normalize_table.h" 47 48 static int 49 translation_cmp(const void *key, const void *data) 50 { 51 const struct translation *t1 = (const struct translation *)key; 52 const struct translation *t2 = (const struct translation *)data; 53 54 return t1->key - t2->key; 55 } 56 57 enum { s_base = 0xAC00}; 58 enum { s_count = 11172}; 59 enum { l_base = 0x1100}; 60 enum { l_count = 19}; 61 enum { v_base = 0x1161}; 62 enum { v_count = 21}; 63 enum { t_base = 0x11A7}; 64 enum { t_count = 28}; 65 enum { n_count = v_count * t_count}; 66 67 static int 68 hangul_decomp(const uint32_t *in, size_t in_len, 69 uint32_t *out, size_t *out_len) 70 { 71 uint32_t u = *in; 72 unsigned s_index; 73 unsigned l, v, t; 74 unsigned o; 75 76 if (u < s_base || u >= s_base + s_count) 77 return 0; 78 s_index = u - s_base; 79 l = l_base + s_index / n_count; 80 v = v_base + (s_index % n_count) / t_count; 81 t = t_base + s_index % t_count; 82 o = 2; 83 if (t != t_base) 84 ++o; 85 if (*out_len < o) 86 return WIND_ERR_OVERRUN; 87 out[0] = l; 88 out[1] = v; 89 if (t != t_base) 90 out[2] = t; 91 *out_len = o; 92 return 1; 93 } 94 95 static uint32_t 96 hangul_composition(const uint32_t *in, size_t in_len) 97 { 98 if (in_len < 2) 99 return 0; 100 if (in[0] >= l_base && in[0] < l_base + l_count) { 101 unsigned l_index = in[0] - l_base; 102 unsigned v_index; 103 104 if (in[1] < v_base || in[1] >= v_base + v_count) 105 return 0; 106 v_index = in[1] - v_base; 107 return (l_index * v_count + v_index) * t_count + s_base; 108 } else if (in[0] >= s_base && in[0] < s_base + s_count) { 109 unsigned s_index = in[0] - s_base; 110 unsigned t_index; 111 112 if (s_index % t_count != 0) 113 return 0; 114 if (in[1] < t_base || in[1] >= t_base + t_count) 115 return 0; 116 t_index = in[1] - t_base; 117 return in[0] + t_index; 118 } 119 return 0; 120 } 121 122 static int 123 compat_decomp(const uint32_t *in, size_t in_len, 124 uint32_t *out, size_t *out_len) 125 { 126 unsigned i; 127 unsigned o = 0; 128 129 for (i = 0; i < in_len; ++i) { 130 struct translation ts = {in[i]}; 131 size_t sub_len = *out_len - o; 132 int ret; 133 134 ret = hangul_decomp(in + i, in_len - i, 135 out + o, &sub_len); 136 if (ret) { 137 if (ret == WIND_ERR_OVERRUN) 138 return ret; 139 o += sub_len; 140 } else { 141 void *s = bsearch(&ts, 142 _wind_normalize_table, 143 _wind_normalize_table_size, 144 sizeof(_wind_normalize_table[0]), 145 translation_cmp); 146 if (s != NULL) { 147 const struct translation *t = (const struct translation *)s; 148 149 ret = compat_decomp(_wind_normalize_val_table + t->val_offset, 150 t->val_len, 151 out + o, &sub_len); 152 if (ret) 153 return ret; 154 o += sub_len; 155 } else { 156 if (o >= *out_len) 157 return WIND_ERR_OVERRUN; 158 out[o++] = in[i]; 159 160 } 161 } 162 } 163 *out_len = o; 164 return 0; 165 } 166 167 static void 168 swap_char(uint32_t * a, uint32_t * b) 169 { 170 uint32_t t; 171 t = *a; 172 *a = *b; 173 *b = t; 174 } 175 176 /* Unicode 5.2.0 D109 Canonical Ordering for a sequence of code points 177 * that all have Canonical_Combining_Class > 0 */ 178 static void 179 canonical_reorder_sequence(uint32_t * a, size_t len) 180 { 181 size_t i, j; 182 183 if (len <= 1) 184 return; 185 186 for (i = 1; i < len; i++) { 187 for (j = i; 188 j > 0 && 189 _wind_combining_class(a[j]) < _wind_combining_class(a[j-1]); 190 j--) 191 swap_char(&a[j], &a[j-1]); 192 } 193 } 194 195 static void 196 canonical_reorder(uint32_t *tmp, size_t tmp_len) 197 { 198 size_t i; 199 200 for (i = 0; i < tmp_len; ++i) { 201 int cc = _wind_combining_class(tmp[i]); 202 if (cc) { 203 size_t j; 204 for (j = i + 1; 205 j < tmp_len && _wind_combining_class(tmp[j]); 206 ++j) 207 ; 208 canonical_reorder_sequence(&tmp[i], j - i); 209 i = j; 210 } 211 } 212 } 213 214 static uint32_t 215 find_composition(const uint32_t *in, unsigned in_len) 216 { 217 unsigned short canon_index = 0; 218 uint32_t cur; 219 unsigned n = 0; 220 221 cur = hangul_composition(in, in_len); 222 if (cur) 223 return cur; 224 225 do { 226 const struct canon_node *c = &_wind_canon_table[canon_index]; 227 unsigned i; 228 229 if (n % 5 == 0) { 230 if (in_len-- == 0) 231 return c->val; 232 cur = *in++; 233 } 234 235 i = cur >> 16; 236 if (i < c->next_start || i >= c->next_end) 237 canon_index = 0; 238 else 239 canon_index = 240 _wind_canon_next_table[c->next_offset + i - c->next_start]; 241 if (canon_index != 0) { 242 cur = (cur << 4) & 0xFFFFF; 243 ++n; 244 } 245 } while (canon_index != 0); 246 return 0; 247 } 248 249 static int 250 combine(const uint32_t *in, size_t in_len, 251 uint32_t *out, size_t *out_len) 252 { 253 unsigned i; 254 int ostarter; 255 unsigned o = 0; 256 int old_cc; 257 258 for (i = 0; i < in_len;) { 259 while (i < in_len && _wind_combining_class(in[i]) != 0) { 260 out[o++] = in[i++]; 261 } 262 if (i < in_len) { 263 if (o >= *out_len) 264 return WIND_ERR_OVERRUN; 265 ostarter = o; 266 out[o++] = in[i++]; 267 old_cc = -1; 268 269 while (i < in_len) { 270 uint32_t comb; 271 uint32_t v[2]; 272 int cc; 273 274 v[0] = out[ostarter]; 275 v[1] = in[i]; 276 277 cc = _wind_combining_class(in[i]); 278 if (old_cc != cc && (comb = find_composition(v, 2))) { 279 out[ostarter] = comb; 280 } else if (cc == 0) { 281 break; 282 } else { 283 if (o >= *out_len) 284 return WIND_ERR_OVERRUN; 285 out[o++] = in[i]; 286 old_cc = cc; 287 } 288 ++i; 289 } 290 } 291 } 292 *out_len = o; 293 return 0; 294 } 295 296 int 297 _wind_stringprep_normalize(const uint32_t *in, size_t in_len, 298 uint32_t *out, size_t *out_len) 299 { 300 size_t tmp_len; 301 uint32_t *tmp; 302 int ret; 303 304 if (in_len == 0) { 305 *out_len = 0; 306 return 0; 307 } 308 309 tmp_len = in_len * 4; 310 if (tmp_len < MAX_LENGTH_CANON) 311 tmp_len = MAX_LENGTH_CANON; 312 tmp = malloc(tmp_len * sizeof(uint32_t)); 313 if (tmp == NULL) 314 return ENOMEM; 315 316 ret = compat_decomp(in, in_len, tmp, &tmp_len); 317 if (ret) { 318 free(tmp); 319 return ret; 320 } 321 canonical_reorder(tmp, tmp_len); 322 ret = combine(tmp, tmp_len, out, out_len); 323 free(tmp); 324 return ret; 325 } 326