1*ae771770SStanislav Sedov /* 2*ae771770SStanislav Sedov * Copyright (c) 2004 Kungliga Tekniska Högskolan 3*ae771770SStanislav Sedov * (Royal Institute of Technology, Stockholm, Sweden). 4*ae771770SStanislav Sedov * All rights reserved. 5*ae771770SStanislav Sedov * 6*ae771770SStanislav Sedov * Redistribution and use in source and binary forms, with or without 7*ae771770SStanislav Sedov * modification, are permitted provided that the following conditions 8*ae771770SStanislav Sedov * are met: 9*ae771770SStanislav Sedov * 10*ae771770SStanislav Sedov * 1. Redistributions of source code must retain the above copyright 11*ae771770SStanislav Sedov * notice, this list of conditions and the following disclaimer. 12*ae771770SStanislav Sedov * 13*ae771770SStanislav Sedov * 2. Redistributions in binary form must reproduce the above copyright 14*ae771770SStanislav Sedov * notice, this list of conditions and the following disclaimer in the 15*ae771770SStanislav Sedov * documentation and/or other materials provided with the distribution. 16*ae771770SStanislav Sedov * 17*ae771770SStanislav Sedov * 3. Neither the name of the Institute nor the names of its contributors 18*ae771770SStanislav Sedov * may be used to endorse or promote products derived from this software 19*ae771770SStanislav Sedov * without specific prior written permission. 20*ae771770SStanislav Sedov * 21*ae771770SStanislav Sedov * THIS SOFTWARE IS PROVIDED BY THE INSTITUTE AND CONTRIBUTORS ``AS IS'' AND 22*ae771770SStanislav Sedov * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 23*ae771770SStanislav Sedov * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 24*ae771770SStanislav Sedov * ARE DISCLAIMED. IN NO EVENT SHALL THE INSTITUTE OR CONTRIBUTORS BE LIABLE 25*ae771770SStanislav Sedov * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 26*ae771770SStanislav Sedov * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 27*ae771770SStanislav Sedov * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 28*ae771770SStanislav Sedov * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 29*ae771770SStanislav Sedov * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 30*ae771770SStanislav Sedov * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 31*ae771770SStanislav Sedov * SUCH DAMAGE. 32*ae771770SStanislav Sedov */ 33*ae771770SStanislav Sedov 34*ae771770SStanislav Sedov #ifdef HAVE_CONFIG_H 35*ae771770SStanislav Sedov #include <config.h> 36*ae771770SStanislav Sedov #endif 37*ae771770SStanislav Sedov #include "windlocl.h" 38*ae771770SStanislav Sedov 39*ae771770SStanislav Sedov #include <assert.h> 40*ae771770SStanislav Sedov #include <stdlib.h> 41*ae771770SStanislav Sedov #include <errno.h> 42*ae771770SStanislav Sedov #include <stdio.h> 43*ae771770SStanislav Sedov 44*ae771770SStanislav Sedov #include "roken.h" 45*ae771770SStanislav Sedov 46*ae771770SStanislav Sedov #include "normalize_table.h" 47*ae771770SStanislav Sedov 48*ae771770SStanislav Sedov static int 49*ae771770SStanislav Sedov translation_cmp(const void *key, const void *data) 50*ae771770SStanislav Sedov { 51*ae771770SStanislav Sedov const struct translation *t1 = (const struct translation *)key; 52*ae771770SStanislav Sedov const struct translation *t2 = (const struct translation *)data; 53*ae771770SStanislav Sedov 54*ae771770SStanislav Sedov return t1->key - t2->key; 55*ae771770SStanislav Sedov } 56*ae771770SStanislav Sedov 57*ae771770SStanislav Sedov enum { s_base = 0xAC00}; 58*ae771770SStanislav Sedov enum { s_count = 11172}; 59*ae771770SStanislav Sedov enum { l_base = 0x1100}; 60*ae771770SStanislav Sedov enum { l_count = 19}; 61*ae771770SStanislav Sedov enum { v_base = 0x1161}; 62*ae771770SStanislav Sedov enum { v_count = 21}; 63*ae771770SStanislav Sedov enum { t_base = 0x11A7}; 64*ae771770SStanislav Sedov enum { t_count = 28}; 65*ae771770SStanislav Sedov enum { n_count = v_count * t_count}; 66*ae771770SStanislav Sedov 67*ae771770SStanislav Sedov static int 68*ae771770SStanislav Sedov hangul_decomp(const uint32_t *in, size_t in_len, 69*ae771770SStanislav Sedov uint32_t *out, size_t *out_len) 70*ae771770SStanislav Sedov { 71*ae771770SStanislav Sedov uint32_t u = *in; 72*ae771770SStanislav Sedov unsigned s_index; 73*ae771770SStanislav Sedov unsigned l, v, t; 74*ae771770SStanislav Sedov unsigned o; 75*ae771770SStanislav Sedov 76*ae771770SStanislav Sedov if (u < s_base || u >= s_base + s_count) 77*ae771770SStanislav Sedov return 0; 78*ae771770SStanislav Sedov s_index = u - s_base; 79*ae771770SStanislav Sedov l = l_base + s_index / n_count; 80*ae771770SStanislav Sedov v = v_base + (s_index % n_count) / t_count; 81*ae771770SStanislav Sedov t = t_base + s_index % t_count; 82*ae771770SStanislav Sedov o = 2; 83*ae771770SStanislav Sedov if (t != t_base) 84*ae771770SStanislav Sedov ++o; 85*ae771770SStanislav Sedov if (*out_len < o) 86*ae771770SStanislav Sedov return WIND_ERR_OVERRUN; 87*ae771770SStanislav Sedov out[0] = l; 88*ae771770SStanislav Sedov out[1] = v; 89*ae771770SStanislav Sedov if (t != t_base) 90*ae771770SStanislav Sedov out[2] = t; 91*ae771770SStanislav Sedov *out_len = o; 92*ae771770SStanislav Sedov return 1; 93*ae771770SStanislav Sedov } 94*ae771770SStanislav Sedov 95*ae771770SStanislav Sedov static uint32_t 96*ae771770SStanislav Sedov hangul_composition(const uint32_t *in, size_t in_len) 97*ae771770SStanislav Sedov { 98*ae771770SStanislav Sedov if (in_len < 2) 99*ae771770SStanislav Sedov return 0; 100*ae771770SStanislav Sedov if (in[0] >= l_base && in[0] < l_base + l_count) { 101*ae771770SStanislav Sedov unsigned l_index = in[0] - l_base; 102*ae771770SStanislav Sedov unsigned v_index; 103*ae771770SStanislav Sedov 104*ae771770SStanislav Sedov if (in[1] < v_base || in[1] >= v_base + v_count) 105*ae771770SStanislav Sedov return 0; 106*ae771770SStanislav Sedov v_index = in[1] - v_base; 107*ae771770SStanislav Sedov return (l_index * v_count + v_index) * t_count + s_base; 108*ae771770SStanislav Sedov } else if (in[0] >= s_base && in[0] < s_base + s_count) { 109*ae771770SStanislav Sedov unsigned s_index = in[0] - s_base; 110*ae771770SStanislav Sedov unsigned t_index; 111*ae771770SStanislav Sedov 112*ae771770SStanislav Sedov if (s_index % t_count != 0) 113*ae771770SStanislav Sedov return 0; 114*ae771770SStanislav Sedov if (in[1] < t_base || in[1] >= t_base + t_count) 115*ae771770SStanislav Sedov return 0; 116*ae771770SStanislav Sedov t_index = in[1] - t_base; 117*ae771770SStanislav Sedov return in[0] + t_index; 118*ae771770SStanislav Sedov } 119*ae771770SStanislav Sedov return 0; 120*ae771770SStanislav Sedov } 121*ae771770SStanislav Sedov 122*ae771770SStanislav Sedov static int 123*ae771770SStanislav Sedov compat_decomp(const uint32_t *in, size_t in_len, 124*ae771770SStanislav Sedov uint32_t *out, size_t *out_len) 125*ae771770SStanislav Sedov { 126*ae771770SStanislav Sedov unsigned i; 127*ae771770SStanislav Sedov unsigned o = 0; 128*ae771770SStanislav Sedov 129*ae771770SStanislav Sedov for (i = 0; i < in_len; ++i) { 130*ae771770SStanislav Sedov struct translation ts = {in[i]}; 131*ae771770SStanislav Sedov size_t sub_len = *out_len - o; 132*ae771770SStanislav Sedov int ret; 133*ae771770SStanislav Sedov 134*ae771770SStanislav Sedov ret = hangul_decomp(in + i, in_len - i, 135*ae771770SStanislav Sedov out + o, &sub_len); 136*ae771770SStanislav Sedov if (ret) { 137*ae771770SStanislav Sedov if (ret == WIND_ERR_OVERRUN) 138*ae771770SStanislav Sedov return ret; 139*ae771770SStanislav Sedov o += sub_len; 140*ae771770SStanislav Sedov } else { 141*ae771770SStanislav Sedov void *s = bsearch(&ts, 142*ae771770SStanislav Sedov _wind_normalize_table, 143*ae771770SStanislav Sedov _wind_normalize_table_size, 144*ae771770SStanislav Sedov sizeof(_wind_normalize_table[0]), 145*ae771770SStanislav Sedov translation_cmp); 146*ae771770SStanislav Sedov if (s != NULL) { 147*ae771770SStanislav Sedov const struct translation *t = (const struct translation *)s; 148*ae771770SStanislav Sedov 149*ae771770SStanislav Sedov ret = compat_decomp(_wind_normalize_val_table + t->val_offset, 150*ae771770SStanislav Sedov t->val_len, 151*ae771770SStanislav Sedov out + o, &sub_len); 152*ae771770SStanislav Sedov if (ret) 153*ae771770SStanislav Sedov return ret; 154*ae771770SStanislav Sedov o += sub_len; 155*ae771770SStanislav Sedov } else { 156*ae771770SStanislav Sedov if (o >= *out_len) 157*ae771770SStanislav Sedov return WIND_ERR_OVERRUN; 158*ae771770SStanislav Sedov out[o++] = in[i]; 159*ae771770SStanislav Sedov 160*ae771770SStanislav Sedov } 161*ae771770SStanislav Sedov } 162*ae771770SStanislav Sedov } 163*ae771770SStanislav Sedov *out_len = o; 164*ae771770SStanislav Sedov return 0; 165*ae771770SStanislav Sedov } 166*ae771770SStanislav Sedov 167*ae771770SStanislav Sedov static void 168*ae771770SStanislav Sedov swap_char(uint32_t * a, uint32_t * b) 169*ae771770SStanislav Sedov { 170*ae771770SStanislav Sedov uint32_t t; 171*ae771770SStanislav Sedov t = *a; 172*ae771770SStanislav Sedov *a = *b; 173*ae771770SStanislav Sedov *b = t; 174*ae771770SStanislav Sedov } 175*ae771770SStanislav Sedov 176*ae771770SStanislav Sedov /* Unicode 5.2.0 D109 Canonical Ordering for a sequence of code points 177*ae771770SStanislav Sedov * that all have Canonical_Combining_Class > 0 */ 178*ae771770SStanislav Sedov static void 179*ae771770SStanislav Sedov canonical_reorder_sequence(uint32_t * a, size_t len) 180*ae771770SStanislav Sedov { 181*ae771770SStanislav Sedov size_t i, j; 182*ae771770SStanislav Sedov 183*ae771770SStanislav Sedov if (len <= 1) 184*ae771770SStanislav Sedov return; 185*ae771770SStanislav Sedov 186*ae771770SStanislav Sedov for (i = 1; i < len; i++) { 187*ae771770SStanislav Sedov for (j = i; 188*ae771770SStanislav Sedov j > 0 && 189*ae771770SStanislav Sedov _wind_combining_class(a[j]) < _wind_combining_class(a[j-1]); 190*ae771770SStanislav Sedov j--) 191*ae771770SStanislav Sedov swap_char(&a[j], &a[j-1]); 192*ae771770SStanislav Sedov } 193*ae771770SStanislav Sedov } 194*ae771770SStanislav Sedov 195*ae771770SStanislav Sedov static void 196*ae771770SStanislav Sedov canonical_reorder(uint32_t *tmp, size_t tmp_len) 197*ae771770SStanislav Sedov { 198*ae771770SStanislav Sedov size_t i; 199*ae771770SStanislav Sedov 200*ae771770SStanislav Sedov for (i = 0; i < tmp_len; ++i) { 201*ae771770SStanislav Sedov int cc = _wind_combining_class(tmp[i]); 202*ae771770SStanislav Sedov if (cc) { 203*ae771770SStanislav Sedov size_t j; 204*ae771770SStanislav Sedov for (j = i + 1; 205*ae771770SStanislav Sedov j < tmp_len && _wind_combining_class(tmp[j]); 206*ae771770SStanislav Sedov ++j) 207*ae771770SStanislav Sedov ; 208*ae771770SStanislav Sedov canonical_reorder_sequence(&tmp[i], j - i); 209*ae771770SStanislav Sedov i = j; 210*ae771770SStanislav Sedov } 211*ae771770SStanislav Sedov } 212*ae771770SStanislav Sedov } 213*ae771770SStanislav Sedov 214*ae771770SStanislav Sedov static uint32_t 215*ae771770SStanislav Sedov find_composition(const uint32_t *in, unsigned in_len) 216*ae771770SStanislav Sedov { 217*ae771770SStanislav Sedov unsigned short canon_index = 0; 218*ae771770SStanislav Sedov uint32_t cur; 219*ae771770SStanislav Sedov unsigned n = 0; 220*ae771770SStanislav Sedov 221*ae771770SStanislav Sedov cur = hangul_composition(in, in_len); 222*ae771770SStanislav Sedov if (cur) 223*ae771770SStanislav Sedov return cur; 224*ae771770SStanislav Sedov 225*ae771770SStanislav Sedov do { 226*ae771770SStanislav Sedov const struct canon_node *c = &_wind_canon_table[canon_index]; 227*ae771770SStanislav Sedov unsigned i; 228*ae771770SStanislav Sedov 229*ae771770SStanislav Sedov if (n % 5 == 0) { 230*ae771770SStanislav Sedov cur = *in++; 231*ae771770SStanislav Sedov if (in_len-- == 0) 232*ae771770SStanislav Sedov return c->val; 233*ae771770SStanislav Sedov } 234*ae771770SStanislav Sedov 235*ae771770SStanislav Sedov i = cur >> 16; 236*ae771770SStanislav Sedov if (i < c->next_start || i >= c->next_end) 237*ae771770SStanislav Sedov canon_index = 0; 238*ae771770SStanislav Sedov else 239*ae771770SStanislav Sedov canon_index = 240*ae771770SStanislav Sedov _wind_canon_next_table[c->next_offset + i - c->next_start]; 241*ae771770SStanislav Sedov if (canon_index != 0) { 242*ae771770SStanislav Sedov cur = (cur << 4) & 0xFFFFF; 243*ae771770SStanislav Sedov ++n; 244*ae771770SStanislav Sedov } 245*ae771770SStanislav Sedov } while (canon_index != 0); 246*ae771770SStanislav Sedov return 0; 247*ae771770SStanislav Sedov } 248*ae771770SStanislav Sedov 249*ae771770SStanislav Sedov static int 250*ae771770SStanislav Sedov combine(const uint32_t *in, size_t in_len, 251*ae771770SStanislav Sedov uint32_t *out, size_t *out_len) 252*ae771770SStanislav Sedov { 253*ae771770SStanislav Sedov unsigned i; 254*ae771770SStanislav Sedov int ostarter; 255*ae771770SStanislav Sedov unsigned o = 0; 256*ae771770SStanislav Sedov int old_cc; 257*ae771770SStanislav Sedov 258*ae771770SStanislav Sedov for (i = 0; i < in_len;) { 259*ae771770SStanislav Sedov while (i < in_len && _wind_combining_class(in[i]) != 0) { 260*ae771770SStanislav Sedov out[o++] = in[i++]; 261*ae771770SStanislav Sedov } 262*ae771770SStanislav Sedov if (i < in_len) { 263*ae771770SStanislav Sedov if (o >= *out_len) 264*ae771770SStanislav Sedov return WIND_ERR_OVERRUN; 265*ae771770SStanislav Sedov ostarter = o; 266*ae771770SStanislav Sedov out[o++] = in[i++]; 267*ae771770SStanislav Sedov old_cc = -1; 268*ae771770SStanislav Sedov 269*ae771770SStanislav Sedov while (i < in_len) { 270*ae771770SStanislav Sedov uint32_t comb; 271*ae771770SStanislav Sedov uint32_t v[2]; 272*ae771770SStanislav Sedov int cc; 273*ae771770SStanislav Sedov 274*ae771770SStanislav Sedov v[0] = out[ostarter]; 275*ae771770SStanislav Sedov v[1] = in[i]; 276*ae771770SStanislav Sedov 277*ae771770SStanislav Sedov cc = _wind_combining_class(in[i]); 278*ae771770SStanislav Sedov if (old_cc != cc && (comb = find_composition(v, 2))) { 279*ae771770SStanislav Sedov out[ostarter] = comb; 280*ae771770SStanislav Sedov } else if (cc == 0) { 281*ae771770SStanislav Sedov break; 282*ae771770SStanislav Sedov } else { 283*ae771770SStanislav Sedov if (o >= *out_len) 284*ae771770SStanislav Sedov return WIND_ERR_OVERRUN; 285*ae771770SStanislav Sedov out[o++] = in[i]; 286*ae771770SStanislav Sedov old_cc = cc; 287*ae771770SStanislav Sedov } 288*ae771770SStanislav Sedov ++i; 289*ae771770SStanislav Sedov } 290*ae771770SStanislav Sedov } 291*ae771770SStanislav Sedov } 292*ae771770SStanislav Sedov *out_len = o; 293*ae771770SStanislav Sedov return 0; 294*ae771770SStanislav Sedov } 295*ae771770SStanislav Sedov 296*ae771770SStanislav Sedov int 297*ae771770SStanislav Sedov _wind_stringprep_normalize(const uint32_t *in, size_t in_len, 298*ae771770SStanislav Sedov uint32_t *out, size_t *out_len) 299*ae771770SStanislav Sedov { 300*ae771770SStanislav Sedov size_t tmp_len; 301*ae771770SStanislav Sedov uint32_t *tmp; 302*ae771770SStanislav Sedov int ret; 303*ae771770SStanislav Sedov 304*ae771770SStanislav Sedov if (in_len == 0) { 305*ae771770SStanislav Sedov *out_len = 0; 306*ae771770SStanislav Sedov return 0; 307*ae771770SStanislav Sedov } 308*ae771770SStanislav Sedov 309*ae771770SStanislav Sedov tmp_len = in_len * 4; 310*ae771770SStanislav Sedov if (tmp_len < MAX_LENGTH_CANON) 311*ae771770SStanislav Sedov tmp_len = MAX_LENGTH_CANON; 312*ae771770SStanislav Sedov tmp = malloc(tmp_len * sizeof(uint32_t)); 313*ae771770SStanislav Sedov if (tmp == NULL) 314*ae771770SStanislav Sedov return ENOMEM; 315*ae771770SStanislav Sedov 316*ae771770SStanislav Sedov ret = compat_decomp(in, in_len, tmp, &tmp_len); 317*ae771770SStanislav Sedov if (ret) { 318*ae771770SStanislav Sedov free(tmp); 319*ae771770SStanislav Sedov return ret; 320*ae771770SStanislav Sedov } 321*ae771770SStanislav Sedov canonical_reorder(tmp, tmp_len); 322*ae771770SStanislav Sedov ret = combine(tmp, tmp_len, out, out_len); 323*ae771770SStanislav Sedov free(tmp); 324*ae771770SStanislav Sedov return ret; 325*ae771770SStanislav Sedov } 326