1 /*- 2 * Copyright (c) 2003, 2005 Ryuichiro Imura 3 * All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 17 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 24 * SUCH DAMAGE. 25 */ 26 27 #include <sys/cdefs.h> 28 __FBSDID("$FreeBSD$"); 29 30 #include <sys/param.h> 31 #include <sys/kernel.h> 32 #include <sys/systm.h> 33 #include <sys/malloc.h> 34 #include <sys/iconv.h> 35 36 #include "iconv_converter_if.h" 37 38 /* 39 * "UCS" converter 40 */ 41 42 #define KICONV_UCS_COMBINE 0x1 43 #define KICONV_UCS_FROM_UTF8 0x2 44 #define KICONV_UCS_TO_UTF8 0x4 45 #define KICONV_UCS_FROM_LE 0x8 46 #define KICONV_UCS_TO_LE 0x10 47 #define KICONV_UCS_FROM_UTF16 0x20 48 #define KICONV_UCS_TO_UTF16 0x40 49 #define KICONV_UCS_UCS4 0x80 50 51 #define ENCODING_UTF16 "UTF-16BE" 52 #define ENCODING_UTF8 "UTF-8" 53 54 static struct { 55 const char *name; 56 int from_flag, to_flag; 57 } unicode_family[] = { 58 { "UTF-8", KICONV_UCS_FROM_UTF8, KICONV_UCS_TO_UTF8 }, 59 { "UCS-2LE", KICONV_UCS_FROM_LE, KICONV_UCS_TO_LE }, 60 { "UTF-16BE", KICONV_UCS_FROM_UTF16, KICONV_UCS_TO_UTF16 }, 61 { "UTF-16LE", KICONV_UCS_FROM_UTF16|KICONV_UCS_FROM_LE, 62 KICONV_UCS_TO_UTF16|KICONV_UCS_TO_LE }, 63 { NULL, 0, 0 } 64 }; 65 66 static uint32_t utf8_to_ucs4(const char *src, size_t *utf8width, size_t srclen); 67 static u_char *ucs4_to_utf8(uint32_t ucs4, char * dst, size_t *utf8width, size_t dstlen); 68 static uint32_t encode_surrogate(uint32_t code); 69 static uint32_t decode_surrogate(const u_char *ucs); 70 71 #ifdef MODULE_DEPEND 72 MODULE_DEPEND(iconv_ucs, libiconv, 2, 2, 2); 73 #endif 74 75 /* 76 * UCS converter instance 77 */ 78 struct iconv_ucs { 79 KOBJ_FIELDS; 80 int convtype; 81 struct iconv_cspair * d_csp; 82 struct iconv_cspair * d_cspf; 83 void * f_ctp; 84 void * t_ctp; 85 void * ctype; 86 }; 87 88 static int 89 iconv_ucs_open(struct iconv_converter_class *dcp, 90 struct iconv_cspair *csp, struct iconv_cspair *cspf, void **dpp) 91 { 92 struct iconv_ucs *dp; 93 int i; 94 const char *from, *to; 95 96 dp = (struct iconv_ucs *)kobj_create((struct kobj_class*)dcp, M_ICONV, M_WAITOK); 97 to = csp->cp_to; 98 from = cspf ? cspf->cp_from : csp->cp_from; 99 100 dp->convtype = 0; 101 102 if (cspf) 103 dp->convtype |= KICONV_UCS_COMBINE; 104 for (i = 0; unicode_family[i].name; i++) { 105 if (strcasecmp(from, unicode_family[i].name) == 0) 106 dp->convtype |= unicode_family[i].from_flag; 107 if (strcasecmp(to, unicode_family[i].name) == 0) 108 dp->convtype |= unicode_family[i].to_flag; 109 } 110 if (strcmp(ENCODING_UNICODE, ENCODING_UTF16) == 0) 111 dp->convtype |= KICONV_UCS_UCS4; 112 else 113 dp->convtype &= ~KICONV_UCS_UCS4; 114 115 dp->f_ctp = dp->t_ctp = NULL; 116 if (dp->convtype & KICONV_UCS_COMBINE) { 117 if ((dp->convtype & KICONV_UCS_FROM_UTF8) == 0 && 118 (dp->convtype & KICONV_UCS_FROM_LE) == 0) { 119 iconv_open(ENCODING_UNICODE, from, &dp->f_ctp); 120 } 121 if ((dp->convtype & KICONV_UCS_TO_UTF8) == 0 && 122 (dp->convtype & KICONV_UCS_TO_LE) == 0) { 123 iconv_open(to, ENCODING_UNICODE, &dp->t_ctp); 124 } 125 } 126 127 dp->ctype = NULL; 128 if (dp->convtype & (KICONV_UCS_FROM_UTF8 | KICONV_UCS_TO_UTF8)) 129 iconv_open(KICONV_WCTYPE_NAME, ENCODING_UTF8, &dp->ctype); 130 131 dp->d_csp = csp; 132 if (dp->convtype & (KICONV_UCS_FROM_UTF8 | KICONV_UCS_FROM_LE)) { 133 if (cspf) { 134 dp->d_cspf = cspf; 135 cspf->cp_refcount++; 136 } else 137 csp->cp_refcount++; 138 } 139 if (dp->convtype & (KICONV_UCS_TO_UTF8 | KICONV_UCS_TO_LE)) 140 csp->cp_refcount++; 141 *dpp = (void*)dp; 142 return 0; 143 } 144 145 static int 146 iconv_ucs_close(void *data) 147 { 148 struct iconv_ucs *dp = data; 149 150 if (dp->f_ctp) 151 iconv_close(dp->f_ctp); 152 if (dp->t_ctp) 153 iconv_close(dp->t_ctp); 154 if (dp->ctype) 155 iconv_close(dp->ctype); 156 if (dp->d_cspf) 157 dp->d_cspf->cp_refcount--; 158 else if (dp->convtype & (KICONV_UCS_FROM_UTF8 | KICONV_UCS_FROM_LE)) 159 dp->d_csp->cp_refcount--; 160 if (dp->convtype & (KICONV_UCS_TO_UTF8 | KICONV_UCS_TO_LE)) 161 dp->d_csp->cp_refcount--; 162 kobj_delete((struct kobj*)data, M_ICONV); 163 return 0; 164 } 165 166 static int 167 iconv_ucs_conv(void *d2p, const char **inbuf, 168 size_t *inbytesleft, char **outbuf, size_t *outbytesleft, 169 int convchar, int casetype) 170 { 171 struct iconv_ucs *dp = (struct iconv_ucs*)d2p; 172 int ret = 0, i; 173 size_t in, on, ir, or, inlen, outlen, ucslen; 174 const char *src, *p; 175 char *dst; 176 u_char ucs[4], *q; 177 uint32_t code; 178 179 if (inbuf == NULL || *inbuf == NULL || outbuf == NULL || *outbuf == NULL) 180 return 0; 181 ir = in = *inbytesleft; 182 or = on = *outbytesleft; 183 src = *inbuf; 184 dst = *outbuf; 185 186 while (ir > 0 && or > 0) { 187 188 /* 189 * The first half of conversion. 190 * (convert any code into ENCODING_UNICODE) 191 */ 192 code = 0; 193 p = src; 194 if (dp->convtype & KICONV_UCS_FROM_UTF8) { 195 /* convert UTF-8 to ENCODING_UNICODE */ 196 inlen = 0; 197 code = utf8_to_ucs4(p, &inlen, ir); 198 if (code == 0) { 199 ret = -1; 200 break; 201 } 202 203 if (casetype == KICONV_FROM_LOWER && dp->ctype) { 204 code = towlower(code, dp->ctype); 205 } else if (casetype == KICONV_FROM_UPPER && dp->ctype) { 206 code = towupper(code, dp->ctype); 207 } 208 209 if ((code >= 0xd800 && code < 0xe000) || code >= 0x110000 ) { 210 /* reserved for utf-16 surrogate pair */ 211 /* invalid unicode */ 212 ret = -1; 213 break; 214 } 215 216 if (inlen == 4) { 217 if (dp->convtype & KICONV_UCS_UCS4) { 218 ucslen = 4; 219 code = encode_surrogate(code); 220 } else { 221 /* can't handle with ucs-2 */ 222 ret = -1; 223 break; 224 } 225 } else { 226 ucslen = 2; 227 } 228 229 /* save UCS-4 into ucs[] */ 230 for (q = ucs, i = ucslen - 1 ; i >= 0 ; i--) 231 *q++ = (code >> (i << 3)) & 0xff; 232 233 } else if (dp->convtype & KICONV_UCS_COMBINE && dp->f_ctp) { 234 /* convert local code to ENCODING_UNICODE */ 235 ucslen = 4; 236 inlen = ir; 237 q = ucs; 238 ret = iconv_convchr_case(dp->f_ctp, &p, &inlen, (char **)&q, 239 &ucslen, casetype & (KICONV_FROM_LOWER | KICONV_FROM_UPPER)); 240 if (ret) 241 break; 242 inlen = ir - inlen; 243 ucslen = 4 - ucslen; 244 245 } else { 246 /* src code is a proper subset of ENCODING_UNICODE */ 247 q = ucs; 248 if (dp->convtype & KICONV_UCS_FROM_LE) { 249 *q = *(p + 1); 250 *(q + 1) = *p; 251 p += 2; 252 } else { 253 *q = *p++; 254 *(q + 1) = *p++; 255 } 256 if ((*q & 0xfc) == 0xd8) { 257 if (dp->convtype & KICONV_UCS_UCS4 && 258 dp->convtype & KICONV_UCS_FROM_UTF16) { 259 inlen = ucslen = 4; 260 } else { 261 /* invalid unicode */ 262 ret = -1; 263 break; 264 } 265 } else { 266 inlen = ucslen = 2; 267 } 268 if (ir < inlen) { 269 ret = -1; 270 break; 271 } 272 if (ucslen == 4) { 273 q += 2; 274 if (dp->convtype & KICONV_UCS_FROM_LE) { 275 *q = *(p + 1); 276 *(q + 1) = *p; 277 } else { 278 *q = *p++; 279 *(q + 1) = *p; 280 } 281 if ((*q & 0xfc) != 0xdc) { 282 /* invalid unicode */ 283 ret = -1; 284 break; 285 } 286 } 287 } 288 289 /* 290 * The second half of conversion. 291 * (convert ENCODING_UNICODE into any code) 292 */ 293 p = ucs; 294 if (dp->convtype & KICONV_UCS_TO_UTF8) { 295 q = (u_char *)dst; 296 if (ucslen == 4 && dp->convtype & KICONV_UCS_UCS4) { 297 /* decode surrogate pair */ 298 code = decode_surrogate(p); 299 } else { 300 code = (ucs[0] << 8) | ucs[1]; 301 } 302 303 if (casetype == KICONV_LOWER && dp->ctype) { 304 code = towlower(code, dp->ctype); 305 } else if (casetype == KICONV_UPPER && dp->ctype) { 306 code = towupper(code, dp->ctype); 307 } 308 309 outlen = 0; 310 if (ucs4_to_utf8(code, q, &outlen, or) == NULL) { 311 ret = -1; 312 break; 313 } 314 315 src += inlen; 316 ir -= inlen; 317 dst += outlen; 318 or -= outlen; 319 320 } else if (dp->convtype & KICONV_UCS_COMBINE && dp->t_ctp) { 321 ret = iconv_convchr_case(dp->t_ctp, &p, &ucslen, &dst, 322 &or, casetype & (KICONV_LOWER | KICONV_UPPER)); 323 if (ret) 324 break; 325 326 src += inlen; 327 ir -= inlen; 328 329 } else { 330 /* dst code is a proper subset of ENCODING_UNICODE */ 331 if (or < ucslen) { 332 ret = -1; 333 break; 334 } 335 src += inlen; 336 ir -= inlen; 337 or -= ucslen; 338 if (dp->convtype & KICONV_UCS_TO_LE) { 339 *dst++ = *(p + 1); 340 *dst++ = *p; 341 p += 2; 342 } else { 343 *dst++ = *p++; 344 *dst++ = *p++; 345 } 346 if (ucslen == 4) { 347 if ((dp->convtype & KICONV_UCS_UCS4) == 0 || 348 (dp->convtype & KICONV_UCS_TO_UTF16) == 0) { 349 ret = -1; 350 break; 351 } 352 if (dp->convtype & KICONV_UCS_TO_LE) { 353 *dst++ = *(p + 1); 354 *dst++ = *p; 355 } else { 356 *dst++ = *p++; 357 *dst++ = *p; 358 } 359 } 360 } 361 362 if (convchar == 1) 363 break; 364 } 365 366 *inbuf += in - ir; 367 *outbuf += on - or; 368 *inbytesleft -= in - ir; 369 *outbytesleft -= on - or; 370 return (ret); 371 } 372 373 static int 374 iconv_ucs_init(struct iconv_converter_class *dcp) 375 { 376 int error; 377 378 error = iconv_add(ENCODING_UNICODE, ENCODING_UNICODE, ENCODING_UTF8); 379 if (error) 380 return (error); 381 error = iconv_add(ENCODING_UNICODE, ENCODING_UTF8, ENCODING_UNICODE); 382 if (error) 383 return (error); 384 return (0); 385 } 386 387 static int 388 iconv_ucs_done(struct iconv_converter_class *dcp) 389 { 390 return (0); 391 } 392 393 static const char * 394 iconv_ucs_name(struct iconv_converter_class *dcp) 395 { 396 return (ENCODING_UNICODE); 397 } 398 399 static kobj_method_t iconv_ucs_methods[] = { 400 KOBJMETHOD(iconv_converter_open, iconv_ucs_open), 401 KOBJMETHOD(iconv_converter_close, iconv_ucs_close), 402 KOBJMETHOD(iconv_converter_conv, iconv_ucs_conv), 403 KOBJMETHOD(iconv_converter_init, iconv_ucs_init), 404 KOBJMETHOD(iconv_converter_done, iconv_ucs_done), 405 KOBJMETHOD(iconv_converter_name, iconv_ucs_name), 406 {0, 0} 407 }; 408 409 KICONV_CONVERTER(ucs, sizeof(struct iconv_ucs)); 410 411 static uint32_t 412 utf8_to_ucs4(const char *src, size_t *utf8width, size_t srclen) 413 { 414 size_t i, w = 0; 415 uint32_t ucs4 = 0; 416 417 /* 418 * get leading 1 byte from utf-8 419 */ 420 if ((*src & 0x80) == 0) { 421 /* 422 * leading 1 bit is "0" 423 * utf-8: 0xxxxxxx 424 * ucs-4: 00000000 00000000 00000000 0xxxxxxx 425 */ 426 w = 1; 427 /* get trailing 7 bits */ 428 ucs4 = *src & 0x7f; 429 } else if ((*src & 0xe0) == 0xc0) { 430 /* 431 * leading 3 bits are "110" 432 * utf-8: 110xxxxx 10yyyyyy 433 * ucs-4: 00000000 00000000 00000xxx xxyyyyyy 434 */ 435 w = 2; 436 /* get trailing 5 bits */ 437 ucs4 = *src & 0x1f; 438 } else if ((*src & 0xf0) == 0xe0) { 439 /* 440 * leading 4 bits are "1110" 441 * utf-8: 1110xxxx 10yyyyyy 10zzzzzz 442 * ucs-4: 00000000 00000000 xxxxyyyy yyzzzzzz 443 */ 444 w = 3; 445 /* get trailing 4 bits */ 446 ucs4 = *src & 0x0f; 447 } else if ((*src & 0xf8) == 0xf0) { 448 /* 449 * leading 5 bits are "11110" 450 * utf-8: 11110www 10xxxxxx 10yyyyyy 10zzzzzz 451 * ucs-4: 00000000 000wwwxx xxxxyyyy yyzzzzzz 452 */ 453 w = 4; 454 /* get trailing 3 bits */ 455 ucs4 = *src & 0x07; 456 } else { 457 /* out of utf-16 range or having illegal bits */ 458 return (0); 459 } 460 461 if (srclen < w) 462 return (0); 463 464 /* 465 * get left parts from utf-8 466 */ 467 for (i = 1 ; i < w ; i++) { 468 if ((*(src + i) & 0xc0) != 0x80) { 469 /* invalid: leading 2 bits are not "10" */ 470 return (0); 471 } 472 /* concatenate trailing 6 bits into ucs4 */ 473 ucs4 <<= 6; 474 ucs4 |= *(src + i) & 0x3f; 475 } 476 477 *utf8width = w; 478 return (ucs4); 479 } 480 481 static u_char * 482 ucs4_to_utf8(uint32_t ucs4, char *dst, size_t *utf8width, size_t dstlen) 483 { 484 u_char lead, *p; 485 size_t i, w; 486 487 /* 488 * determine utf-8 width and leading bits 489 */ 490 if (ucs4 < 0x80) { 491 w = 1; 492 lead = 0; /* "0" */ 493 } else if (ucs4 < 0x800) { 494 w = 2; 495 lead = 0xc0; /* "11" */ 496 } else if (ucs4 < 0x10000) { 497 w = 3; 498 lead = 0xe0; /* "111" */ 499 } else if (ucs4 < 0x200000) { 500 w = 4; 501 lead = 0xf0; /* "1111" */ 502 } else { 503 return (NULL); 504 } 505 506 if (dstlen < w) 507 return (NULL); 508 509 /* 510 * construct utf-8 511 */ 512 p = dst; 513 for (i = w - 1 ; i >= 1 ; i--) { 514 /* get trailing 6 bits and put it with leading bit as "1" */ 515 *(p + i) = (ucs4 & 0x3f) | 0x80; 516 ucs4 >>= 6; 517 } 518 *p = ucs4 | lead; 519 520 *utf8width = w; 521 522 return (p); 523 } 524 525 static uint32_t 526 encode_surrogate(register uint32_t code) 527 { 528 return ((((code - 0x10000) << 6) & 0x3ff0000) | 529 ((code - 0x10000) & 0x3ff) | 0xd800dc00); 530 } 531 532 static uint32_t 533 decode_surrogate(register const u_char *ucs) 534 { 535 return ((((ucs[0] & 0x3) << 18) | (ucs[1] << 10) | 536 ((ucs[2] & 0x3) << 8) | ucs[3]) + 0x10000); 537 } 538 539