1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause 3 * 4 * Copyright (c) 2003, 2005 Ryuichiro Imura 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 26 * SUCH DAMAGE. 27 */ 28 29 #include <sys/param.h> 30 #include <sys/kernel.h> 31 #include <sys/systm.h> 32 #include <sys/malloc.h> 33 #include <sys/iconv.h> 34 35 #include "iconv_converter_if.h" 36 37 /* 38 * "UCS" converter 39 */ 40 41 #define KICONV_UCS_COMBINE 0x1 42 #define KICONV_UCS_FROM_UTF8 0x2 43 #define KICONV_UCS_TO_UTF8 0x4 44 #define KICONV_UCS_FROM_LE 0x8 45 #define KICONV_UCS_TO_LE 0x10 46 #define KICONV_UCS_FROM_UTF16 0x20 47 #define KICONV_UCS_TO_UTF16 0x40 48 #define KICONV_UCS_UCS4 0x80 49 50 #define ENCODING_UTF16 "UTF-16BE" 51 #define ENCODING_UTF8 "UTF-8" 52 53 static struct { 54 const char *name; 55 int from_flag, to_flag; 56 } unicode_family[] = { 57 { "UTF-8", KICONV_UCS_FROM_UTF8, KICONV_UCS_TO_UTF8 }, 58 { "UCS-2LE", KICONV_UCS_FROM_LE, KICONV_UCS_TO_LE }, 59 { "UTF-16BE", KICONV_UCS_FROM_UTF16, KICONV_UCS_TO_UTF16 }, 60 { "UTF-16LE", KICONV_UCS_FROM_UTF16|KICONV_UCS_FROM_LE, 61 KICONV_UCS_TO_UTF16|KICONV_UCS_TO_LE }, 62 { NULL, 0, 0 } 63 }; 64 65 static uint32_t utf8_to_ucs4(const char *src, size_t *utf8width, size_t srclen); 66 static u_char *ucs4_to_utf8(uint32_t ucs4, char * dst, size_t *utf8width, size_t dstlen); 67 static uint32_t encode_surrogate(uint32_t code); 68 static uint32_t decode_surrogate(const u_char *ucs); 69 70 #ifdef MODULE_DEPEND 71 MODULE_DEPEND(iconv_ucs, libiconv, 2, 2, 2); 72 #endif 73 74 /* 75 * UCS converter instance 76 */ 77 struct iconv_ucs { 78 KOBJ_FIELDS; 79 int convtype; 80 struct iconv_cspair * d_csp; 81 struct iconv_cspair * d_cspf; 82 void * f_ctp; 83 void * t_ctp; 84 void * ctype; 85 }; 86 87 static int 88 iconv_ucs_open(struct iconv_converter_class *dcp, 89 struct iconv_cspair *csp, struct iconv_cspair *cspf, void **dpp) 90 { 91 struct iconv_ucs *dp; 92 int i; 93 const char *from, *to; 94 95 dp = (struct iconv_ucs *)kobj_create((struct kobj_class*)dcp, M_ICONV, M_WAITOK); 96 to = csp->cp_to; 97 from = cspf ? cspf->cp_from : csp->cp_from; 98 99 dp->convtype = 0; 100 101 if (cspf) 102 dp->convtype |= KICONV_UCS_COMBINE; 103 for (i = 0; unicode_family[i].name; i++) { 104 if (strcasecmp(from, unicode_family[i].name) == 0) 105 dp->convtype |= unicode_family[i].from_flag; 106 if (strcasecmp(to, unicode_family[i].name) == 0) 107 dp->convtype |= unicode_family[i].to_flag; 108 } 109 if (strcmp(ENCODING_UNICODE, ENCODING_UTF16) == 0) 110 dp->convtype |= KICONV_UCS_UCS4; 111 else 112 dp->convtype &= ~KICONV_UCS_UCS4; 113 114 dp->f_ctp = dp->t_ctp = NULL; 115 if (dp->convtype & KICONV_UCS_COMBINE) { 116 if ((dp->convtype & KICONV_UCS_FROM_UTF8) == 0 && 117 (dp->convtype & KICONV_UCS_FROM_LE) == 0) { 118 iconv_open(ENCODING_UNICODE, from, &dp->f_ctp); 119 } 120 if ((dp->convtype & KICONV_UCS_TO_UTF8) == 0 && 121 (dp->convtype & KICONV_UCS_TO_LE) == 0) { 122 iconv_open(to, ENCODING_UNICODE, &dp->t_ctp); 123 } 124 } 125 126 dp->ctype = NULL; 127 if (dp->convtype & (KICONV_UCS_FROM_UTF8 | KICONV_UCS_TO_UTF8)) 128 iconv_open(KICONV_WCTYPE_NAME, ENCODING_UTF8, &dp->ctype); 129 130 dp->d_csp = csp; 131 if (dp->convtype & (KICONV_UCS_FROM_UTF8 | KICONV_UCS_FROM_LE)) { 132 if (cspf) { 133 dp->d_cspf = cspf; 134 cspf->cp_refcount++; 135 } else 136 csp->cp_refcount++; 137 } 138 if (dp->convtype & (KICONV_UCS_TO_UTF8 | KICONV_UCS_TO_LE)) 139 csp->cp_refcount++; 140 *dpp = (void*)dp; 141 return 0; 142 } 143 144 static int 145 iconv_ucs_close(void *data) 146 { 147 struct iconv_ucs *dp = data; 148 149 if (dp->f_ctp) 150 iconv_close(dp->f_ctp); 151 if (dp->t_ctp) 152 iconv_close(dp->t_ctp); 153 if (dp->ctype) 154 iconv_close(dp->ctype); 155 if (dp->d_cspf) 156 dp->d_cspf->cp_refcount--; 157 else if (dp->convtype & (KICONV_UCS_FROM_UTF8 | KICONV_UCS_FROM_LE)) 158 dp->d_csp->cp_refcount--; 159 if (dp->convtype & (KICONV_UCS_TO_UTF8 | KICONV_UCS_TO_LE)) 160 dp->d_csp->cp_refcount--; 161 kobj_delete((struct kobj*)data, M_ICONV); 162 return 0; 163 } 164 165 static int 166 iconv_ucs_conv(void *d2p, const char **inbuf, 167 size_t *inbytesleft, char **outbuf, size_t *outbytesleft, 168 int convchar, int casetype) 169 { 170 struct iconv_ucs *dp = (struct iconv_ucs*)d2p; 171 int ret = 0, i; 172 size_t in, on, ir, or, inlen, outlen, ucslen; 173 const char *src, *p; 174 char *dst; 175 u_char ucs[4], *q; 176 uint32_t code; 177 178 if (inbuf == NULL || *inbuf == NULL || outbuf == NULL || *outbuf == NULL) 179 return 0; 180 ir = in = *inbytesleft; 181 or = on = *outbytesleft; 182 src = *inbuf; 183 dst = *outbuf; 184 185 while (ir > 0 && or > 0) { 186 /* 187 * The first half of conversion. 188 * (convert any code into ENCODING_UNICODE) 189 */ 190 code = 0; 191 p = src; 192 if (dp->convtype & KICONV_UCS_FROM_UTF8) { 193 /* convert UTF-8 to ENCODING_UNICODE */ 194 inlen = 0; 195 code = utf8_to_ucs4(p, &inlen, ir); 196 if (code == 0) { 197 ret = -1; 198 break; 199 } 200 201 if (casetype == KICONV_FROM_LOWER && dp->ctype) { 202 code = towlower(code, dp->ctype); 203 } else if (casetype == KICONV_FROM_UPPER && dp->ctype) { 204 code = towupper(code, dp->ctype); 205 } 206 207 if ((code >= 0xd800 && code < 0xe000) || code >= 0x110000 ) { 208 /* reserved for utf-16 surrogate pair */ 209 /* invalid unicode */ 210 ret = -1; 211 break; 212 } 213 214 if (inlen == 4) { 215 if (dp->convtype & KICONV_UCS_UCS4) { 216 ucslen = 4; 217 code = encode_surrogate(code); 218 } else { 219 /* can't handle with ucs-2 */ 220 ret = -1; 221 break; 222 } 223 } else { 224 ucslen = 2; 225 } 226 227 /* save UCS-4 into ucs[] */ 228 for (q = ucs, i = ucslen - 1 ; i >= 0 ; i--) 229 *q++ = (code >> (i << 3)) & 0xff; 230 231 } else if (dp->convtype & KICONV_UCS_COMBINE && dp->f_ctp) { 232 /* convert local code to ENCODING_UNICODE */ 233 ucslen = 4; 234 inlen = ir; 235 q = ucs; 236 ret = iconv_convchr_case(dp->f_ctp, &p, &inlen, (char **)&q, 237 &ucslen, casetype & (KICONV_FROM_LOWER | KICONV_FROM_UPPER)); 238 if (ret) 239 break; 240 inlen = ir - inlen; 241 ucslen = 4 - ucslen; 242 243 } else { 244 /* src code is a proper subset of ENCODING_UNICODE */ 245 q = ucs; 246 if (dp->convtype & KICONV_UCS_FROM_LE) { 247 *q = *(p + 1); 248 *(q + 1) = *p; 249 p += 2; 250 } else { 251 *q = *p++; 252 *(q + 1) = *p++; 253 } 254 if ((*q & 0xfc) == 0xd8) { 255 if (dp->convtype & KICONV_UCS_UCS4 && 256 dp->convtype & KICONV_UCS_FROM_UTF16) { 257 inlen = ucslen = 4; 258 } else { 259 /* invalid unicode */ 260 ret = -1; 261 break; 262 } 263 } else { 264 inlen = ucslen = 2; 265 } 266 if (ir < inlen) { 267 ret = -1; 268 break; 269 } 270 if (ucslen == 4) { 271 q += 2; 272 if (dp->convtype & KICONV_UCS_FROM_LE) { 273 *q = *(p + 1); 274 *(q + 1) = *p; 275 } else { 276 *q = *p++; 277 *(q + 1) = *p; 278 } 279 if ((*q & 0xfc) != 0xdc) { 280 /* invalid unicode */ 281 ret = -1; 282 break; 283 } 284 } 285 } 286 287 /* 288 * The second half of conversion. 289 * (convert ENCODING_UNICODE into any code) 290 */ 291 p = ucs; 292 if (dp->convtype & KICONV_UCS_TO_UTF8) { 293 q = (u_char *)dst; 294 if (ucslen == 4 && dp->convtype & KICONV_UCS_UCS4) { 295 /* decode surrogate pair */ 296 code = decode_surrogate(p); 297 } else { 298 code = (ucs[0] << 8) | ucs[1]; 299 } 300 301 if (casetype == KICONV_LOWER && dp->ctype) { 302 code = towlower(code, dp->ctype); 303 } else if (casetype == KICONV_UPPER && dp->ctype) { 304 code = towupper(code, dp->ctype); 305 } 306 307 outlen = 0; 308 if (ucs4_to_utf8(code, q, &outlen, or) == NULL) { 309 ret = -1; 310 break; 311 } 312 313 src += inlen; 314 ir -= inlen; 315 dst += outlen; 316 or -= outlen; 317 318 } else if (dp->convtype & KICONV_UCS_COMBINE && dp->t_ctp) { 319 ret = iconv_convchr_case(dp->t_ctp, &p, &ucslen, &dst, 320 &or, casetype & (KICONV_LOWER | KICONV_UPPER)); 321 if (ret) 322 break; 323 324 src += inlen; 325 ir -= inlen; 326 327 } else { 328 /* dst code is a proper subset of ENCODING_UNICODE */ 329 if (or < ucslen) { 330 ret = -1; 331 break; 332 } 333 src += inlen; 334 ir -= inlen; 335 or -= ucslen; 336 if (dp->convtype & KICONV_UCS_TO_LE) { 337 *dst++ = *(p + 1); 338 *dst++ = *p; 339 p += 2; 340 } else { 341 *dst++ = *p++; 342 *dst++ = *p++; 343 } 344 if (ucslen == 4) { 345 if ((dp->convtype & KICONV_UCS_UCS4) == 0 || 346 (dp->convtype & KICONV_UCS_TO_UTF16) == 0) { 347 ret = -1; 348 break; 349 } 350 if (dp->convtype & KICONV_UCS_TO_LE) { 351 *dst++ = *(p + 1); 352 *dst++ = *p; 353 } else { 354 *dst++ = *p++; 355 *dst++ = *p; 356 } 357 } 358 } 359 360 if (convchar == 1) 361 break; 362 } 363 364 *inbuf += in - ir; 365 *outbuf += on - or; 366 *inbytesleft -= in - ir; 367 *outbytesleft -= on - or; 368 return (ret); 369 } 370 371 static int 372 iconv_ucs_init(struct iconv_converter_class *dcp) 373 { 374 int error; 375 376 error = iconv_add(ENCODING_UNICODE, ENCODING_UNICODE, ENCODING_UTF8); 377 if (error) 378 return (error); 379 error = iconv_add(ENCODING_UNICODE, ENCODING_UTF8, ENCODING_UNICODE); 380 if (error) 381 return (error); 382 return (0); 383 } 384 385 static int 386 iconv_ucs_done(struct iconv_converter_class *dcp) 387 { 388 return (0); 389 } 390 391 static const char * 392 iconv_ucs_name(struct iconv_converter_class *dcp) 393 { 394 return (ENCODING_UNICODE); 395 } 396 397 static kobj_method_t iconv_ucs_methods[] = { 398 KOBJMETHOD(iconv_converter_open, iconv_ucs_open), 399 KOBJMETHOD(iconv_converter_close, iconv_ucs_close), 400 KOBJMETHOD(iconv_converter_conv, iconv_ucs_conv), 401 KOBJMETHOD(iconv_converter_init, iconv_ucs_init), 402 KOBJMETHOD(iconv_converter_done, iconv_ucs_done), 403 KOBJMETHOD(iconv_converter_name, iconv_ucs_name), 404 {0, 0} 405 }; 406 407 KICONV_CONVERTER(ucs, sizeof(struct iconv_ucs)); 408 409 static uint32_t 410 utf8_to_ucs4(const char *src, size_t *utf8width, size_t srclen) 411 { 412 size_t i, w = 0; 413 uint32_t ucs4 = 0; 414 415 /* 416 * get leading 1 byte from utf-8 417 */ 418 if ((*src & 0x80) == 0) { 419 /* 420 * leading 1 bit is "0" 421 * utf-8: 0xxxxxxx 422 * ucs-4: 00000000 00000000 00000000 0xxxxxxx 423 */ 424 w = 1; 425 /* get trailing 7 bits */ 426 ucs4 = *src & 0x7f; 427 } else if ((*src & 0xe0) == 0xc0) { 428 /* 429 * leading 3 bits are "110" 430 * utf-8: 110xxxxx 10yyyyyy 431 * ucs-4: 00000000 00000000 00000xxx xxyyyyyy 432 */ 433 w = 2; 434 /* get trailing 5 bits */ 435 ucs4 = *src & 0x1f; 436 } else if ((*src & 0xf0) == 0xe0) { 437 /* 438 * leading 4 bits are "1110" 439 * utf-8: 1110xxxx 10yyyyyy 10zzzzzz 440 * ucs-4: 00000000 00000000 xxxxyyyy yyzzzzzz 441 */ 442 w = 3; 443 /* get trailing 4 bits */ 444 ucs4 = *src & 0x0f; 445 } else if ((*src & 0xf8) == 0xf0) { 446 /* 447 * leading 5 bits are "11110" 448 * utf-8: 11110www 10xxxxxx 10yyyyyy 10zzzzzz 449 * ucs-4: 00000000 000wwwxx xxxxyyyy yyzzzzzz 450 */ 451 w = 4; 452 /* get trailing 3 bits */ 453 ucs4 = *src & 0x07; 454 } else { 455 /* out of utf-16 range or having illegal bits */ 456 return (0); 457 } 458 459 if (srclen < w) 460 return (0); 461 462 /* 463 * get left parts from utf-8 464 */ 465 for (i = 1 ; i < w ; i++) { 466 if ((*(src + i) & 0xc0) != 0x80) { 467 /* invalid: leading 2 bits are not "10" */ 468 return (0); 469 } 470 /* concatenate trailing 6 bits into ucs4 */ 471 ucs4 <<= 6; 472 ucs4 |= *(src + i) & 0x3f; 473 } 474 475 *utf8width = w; 476 return (ucs4); 477 } 478 479 static u_char * 480 ucs4_to_utf8(uint32_t ucs4, char *dst, size_t *utf8width, size_t dstlen) 481 { 482 u_char lead, *p; 483 size_t i, w; 484 485 /* 486 * determine utf-8 width and leading bits 487 */ 488 if (ucs4 < 0x80) { 489 w = 1; 490 lead = 0; /* "0" */ 491 } else if (ucs4 < 0x800) { 492 w = 2; 493 lead = 0xc0; /* "11" */ 494 } else if (ucs4 < 0x10000) { 495 w = 3; 496 lead = 0xe0; /* "111" */ 497 } else if (ucs4 < 0x200000) { 498 w = 4; 499 lead = 0xf0; /* "1111" */ 500 } else { 501 return (NULL); 502 } 503 504 if (dstlen < w) 505 return (NULL); 506 507 /* 508 * construct utf-8 509 */ 510 p = dst; 511 for (i = w - 1 ; i >= 1 ; i--) { 512 /* get trailing 6 bits and put it with leading bit as "1" */ 513 *(p + i) = (ucs4 & 0x3f) | 0x80; 514 ucs4 >>= 6; 515 } 516 *p = ucs4 | lead; 517 518 *utf8width = w; 519 520 return (p); 521 } 522 523 static uint32_t 524 encode_surrogate(uint32_t code) 525 { 526 return ((((code - 0x10000) << 6) & 0x3ff0000) | 527 ((code - 0x10000) & 0x3ff) | 0xd800dc00); 528 } 529 530 static uint32_t 531 decode_surrogate(const u_char *ucs) 532 { 533 return ((((ucs[0] & 0x3) << 18) | (ucs[1] << 10) | 534 ((ucs[2] & 0x3) << 8) | ucs[3]) + 0x10000); 535 } 536