1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD 3 * 4 * Copyright (c) 2003, 2005 Ryuichiro Imura 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 26 * SUCH DAMAGE. 27 */ 28 29 #include <sys/cdefs.h> 30 __FBSDID("$FreeBSD$"); 31 32 #include <sys/param.h> 33 #include <sys/kernel.h> 34 #include <sys/systm.h> 35 #include <sys/malloc.h> 36 #include <sys/iconv.h> 37 38 #include "iconv_converter_if.h" 39 40 /* 41 * "UCS" converter 42 */ 43 44 #define KICONV_UCS_COMBINE 0x1 45 #define KICONV_UCS_FROM_UTF8 0x2 46 #define KICONV_UCS_TO_UTF8 0x4 47 #define KICONV_UCS_FROM_LE 0x8 48 #define KICONV_UCS_TO_LE 0x10 49 #define KICONV_UCS_FROM_UTF16 0x20 50 #define KICONV_UCS_TO_UTF16 0x40 51 #define KICONV_UCS_UCS4 0x80 52 53 #define ENCODING_UTF16 "UTF-16BE" 54 #define ENCODING_UTF8 "UTF-8" 55 56 static struct { 57 const char *name; 58 int from_flag, to_flag; 59 } unicode_family[] = { 60 { "UTF-8", KICONV_UCS_FROM_UTF8, KICONV_UCS_TO_UTF8 }, 61 { "UCS-2LE", KICONV_UCS_FROM_LE, KICONV_UCS_TO_LE }, 62 { "UTF-16BE", KICONV_UCS_FROM_UTF16, KICONV_UCS_TO_UTF16 }, 63 { "UTF-16LE", KICONV_UCS_FROM_UTF16|KICONV_UCS_FROM_LE, 64 KICONV_UCS_TO_UTF16|KICONV_UCS_TO_LE }, 65 { NULL, 0, 0 } 66 }; 67 68 static uint32_t utf8_to_ucs4(const char *src, size_t *utf8width, size_t srclen); 69 static u_char *ucs4_to_utf8(uint32_t ucs4, char * dst, size_t *utf8width, size_t dstlen); 70 static uint32_t encode_surrogate(uint32_t code); 71 static uint32_t decode_surrogate(const u_char *ucs); 72 73 #ifdef MODULE_DEPEND 74 MODULE_DEPEND(iconv_ucs, libiconv, 2, 2, 2); 75 #endif 76 77 /* 78 * UCS converter instance 79 */ 80 struct iconv_ucs { 81 KOBJ_FIELDS; 82 int convtype; 83 struct iconv_cspair * d_csp; 84 struct iconv_cspair * d_cspf; 85 void * f_ctp; 86 void * t_ctp; 87 void * ctype; 88 }; 89 90 static int 91 iconv_ucs_open(struct iconv_converter_class *dcp, 92 struct iconv_cspair *csp, struct iconv_cspair *cspf, void **dpp) 93 { 94 struct iconv_ucs *dp; 95 int i; 96 const char *from, *to; 97 98 dp = (struct iconv_ucs *)kobj_create((struct kobj_class*)dcp, M_ICONV, M_WAITOK); 99 to = csp->cp_to; 100 from = cspf ? cspf->cp_from : csp->cp_from; 101 102 dp->convtype = 0; 103 104 if (cspf) 105 dp->convtype |= KICONV_UCS_COMBINE; 106 for (i = 0; unicode_family[i].name; i++) { 107 if (strcasecmp(from, unicode_family[i].name) == 0) 108 dp->convtype |= unicode_family[i].from_flag; 109 if (strcasecmp(to, unicode_family[i].name) == 0) 110 dp->convtype |= unicode_family[i].to_flag; 111 } 112 if (strcmp(ENCODING_UNICODE, ENCODING_UTF16) == 0) 113 dp->convtype |= KICONV_UCS_UCS4; 114 else 115 dp->convtype &= ~KICONV_UCS_UCS4; 116 117 dp->f_ctp = dp->t_ctp = NULL; 118 if (dp->convtype & KICONV_UCS_COMBINE) { 119 if ((dp->convtype & KICONV_UCS_FROM_UTF8) == 0 && 120 (dp->convtype & KICONV_UCS_FROM_LE) == 0) { 121 iconv_open(ENCODING_UNICODE, from, &dp->f_ctp); 122 } 123 if ((dp->convtype & KICONV_UCS_TO_UTF8) == 0 && 124 (dp->convtype & KICONV_UCS_TO_LE) == 0) { 125 iconv_open(to, ENCODING_UNICODE, &dp->t_ctp); 126 } 127 } 128 129 dp->ctype = NULL; 130 if (dp->convtype & (KICONV_UCS_FROM_UTF8 | KICONV_UCS_TO_UTF8)) 131 iconv_open(KICONV_WCTYPE_NAME, ENCODING_UTF8, &dp->ctype); 132 133 dp->d_csp = csp; 134 if (dp->convtype & (KICONV_UCS_FROM_UTF8 | KICONV_UCS_FROM_LE)) { 135 if (cspf) { 136 dp->d_cspf = cspf; 137 cspf->cp_refcount++; 138 } else 139 csp->cp_refcount++; 140 } 141 if (dp->convtype & (KICONV_UCS_TO_UTF8 | KICONV_UCS_TO_LE)) 142 csp->cp_refcount++; 143 *dpp = (void*)dp; 144 return 0; 145 } 146 147 static int 148 iconv_ucs_close(void *data) 149 { 150 struct iconv_ucs *dp = data; 151 152 if (dp->f_ctp) 153 iconv_close(dp->f_ctp); 154 if (dp->t_ctp) 155 iconv_close(dp->t_ctp); 156 if (dp->ctype) 157 iconv_close(dp->ctype); 158 if (dp->d_cspf) 159 dp->d_cspf->cp_refcount--; 160 else if (dp->convtype & (KICONV_UCS_FROM_UTF8 | KICONV_UCS_FROM_LE)) 161 dp->d_csp->cp_refcount--; 162 if (dp->convtype & (KICONV_UCS_TO_UTF8 | KICONV_UCS_TO_LE)) 163 dp->d_csp->cp_refcount--; 164 kobj_delete((struct kobj*)data, M_ICONV); 165 return 0; 166 } 167 168 static int 169 iconv_ucs_conv(void *d2p, const char **inbuf, 170 size_t *inbytesleft, char **outbuf, size_t *outbytesleft, 171 int convchar, int casetype) 172 { 173 struct iconv_ucs *dp = (struct iconv_ucs*)d2p; 174 int ret = 0, i; 175 size_t in, on, ir, or, inlen, outlen, ucslen; 176 const char *src, *p; 177 char *dst; 178 u_char ucs[4], *q; 179 uint32_t code; 180 181 if (inbuf == NULL || *inbuf == NULL || outbuf == NULL || *outbuf == NULL) 182 return 0; 183 ir = in = *inbytesleft; 184 or = on = *outbytesleft; 185 src = *inbuf; 186 dst = *outbuf; 187 188 while (ir > 0 && or > 0) { 189 190 /* 191 * The first half of conversion. 192 * (convert any code into ENCODING_UNICODE) 193 */ 194 code = 0; 195 p = src; 196 if (dp->convtype & KICONV_UCS_FROM_UTF8) { 197 /* convert UTF-8 to ENCODING_UNICODE */ 198 inlen = 0; 199 code = utf8_to_ucs4(p, &inlen, ir); 200 if (code == 0) { 201 ret = -1; 202 break; 203 } 204 205 if (casetype == KICONV_FROM_LOWER && dp->ctype) { 206 code = towlower(code, dp->ctype); 207 } else if (casetype == KICONV_FROM_UPPER && dp->ctype) { 208 code = towupper(code, dp->ctype); 209 } 210 211 if ((code >= 0xd800 && code < 0xe000) || code >= 0x110000 ) { 212 /* reserved for utf-16 surrogate pair */ 213 /* invalid unicode */ 214 ret = -1; 215 break; 216 } 217 218 if (inlen == 4) { 219 if (dp->convtype & KICONV_UCS_UCS4) { 220 ucslen = 4; 221 code = encode_surrogate(code); 222 } else { 223 /* can't handle with ucs-2 */ 224 ret = -1; 225 break; 226 } 227 } else { 228 ucslen = 2; 229 } 230 231 /* save UCS-4 into ucs[] */ 232 for (q = ucs, i = ucslen - 1 ; i >= 0 ; i--) 233 *q++ = (code >> (i << 3)) & 0xff; 234 235 } else if (dp->convtype & KICONV_UCS_COMBINE && dp->f_ctp) { 236 /* convert local code to ENCODING_UNICODE */ 237 ucslen = 4; 238 inlen = ir; 239 q = ucs; 240 ret = iconv_convchr_case(dp->f_ctp, &p, &inlen, (char **)&q, 241 &ucslen, casetype & (KICONV_FROM_LOWER | KICONV_FROM_UPPER)); 242 if (ret) 243 break; 244 inlen = ir - inlen; 245 ucslen = 4 - ucslen; 246 247 } else { 248 /* src code is a proper subset of ENCODING_UNICODE */ 249 q = ucs; 250 if (dp->convtype & KICONV_UCS_FROM_LE) { 251 *q = *(p + 1); 252 *(q + 1) = *p; 253 p += 2; 254 } else { 255 *q = *p++; 256 *(q + 1) = *p++; 257 } 258 if ((*q & 0xfc) == 0xd8) { 259 if (dp->convtype & KICONV_UCS_UCS4 && 260 dp->convtype & KICONV_UCS_FROM_UTF16) { 261 inlen = ucslen = 4; 262 } else { 263 /* invalid unicode */ 264 ret = -1; 265 break; 266 } 267 } else { 268 inlen = ucslen = 2; 269 } 270 if (ir < inlen) { 271 ret = -1; 272 break; 273 } 274 if (ucslen == 4) { 275 q += 2; 276 if (dp->convtype & KICONV_UCS_FROM_LE) { 277 *q = *(p + 1); 278 *(q + 1) = *p; 279 } else { 280 *q = *p++; 281 *(q + 1) = *p; 282 } 283 if ((*q & 0xfc) != 0xdc) { 284 /* invalid unicode */ 285 ret = -1; 286 break; 287 } 288 } 289 } 290 291 /* 292 * The second half of conversion. 293 * (convert ENCODING_UNICODE into any code) 294 */ 295 p = ucs; 296 if (dp->convtype & KICONV_UCS_TO_UTF8) { 297 q = (u_char *)dst; 298 if (ucslen == 4 && dp->convtype & KICONV_UCS_UCS4) { 299 /* decode surrogate pair */ 300 code = decode_surrogate(p); 301 } else { 302 code = (ucs[0] << 8) | ucs[1]; 303 } 304 305 if (casetype == KICONV_LOWER && dp->ctype) { 306 code = towlower(code, dp->ctype); 307 } else if (casetype == KICONV_UPPER && dp->ctype) { 308 code = towupper(code, dp->ctype); 309 } 310 311 outlen = 0; 312 if (ucs4_to_utf8(code, q, &outlen, or) == NULL) { 313 ret = -1; 314 break; 315 } 316 317 src += inlen; 318 ir -= inlen; 319 dst += outlen; 320 or -= outlen; 321 322 } else if (dp->convtype & KICONV_UCS_COMBINE && dp->t_ctp) { 323 ret = iconv_convchr_case(dp->t_ctp, &p, &ucslen, &dst, 324 &or, casetype & (KICONV_LOWER | KICONV_UPPER)); 325 if (ret) 326 break; 327 328 src += inlen; 329 ir -= inlen; 330 331 } else { 332 /* dst code is a proper subset of ENCODING_UNICODE */ 333 if (or < ucslen) { 334 ret = -1; 335 break; 336 } 337 src += inlen; 338 ir -= inlen; 339 or -= ucslen; 340 if (dp->convtype & KICONV_UCS_TO_LE) { 341 *dst++ = *(p + 1); 342 *dst++ = *p; 343 p += 2; 344 } else { 345 *dst++ = *p++; 346 *dst++ = *p++; 347 } 348 if (ucslen == 4) { 349 if ((dp->convtype & KICONV_UCS_UCS4) == 0 || 350 (dp->convtype & KICONV_UCS_TO_UTF16) == 0) { 351 ret = -1; 352 break; 353 } 354 if (dp->convtype & KICONV_UCS_TO_LE) { 355 *dst++ = *(p + 1); 356 *dst++ = *p; 357 } else { 358 *dst++ = *p++; 359 *dst++ = *p; 360 } 361 } 362 } 363 364 if (convchar == 1) 365 break; 366 } 367 368 *inbuf += in - ir; 369 *outbuf += on - or; 370 *inbytesleft -= in - ir; 371 *outbytesleft -= on - or; 372 return (ret); 373 } 374 375 static int 376 iconv_ucs_init(struct iconv_converter_class *dcp) 377 { 378 int error; 379 380 error = iconv_add(ENCODING_UNICODE, ENCODING_UNICODE, ENCODING_UTF8); 381 if (error) 382 return (error); 383 error = iconv_add(ENCODING_UNICODE, ENCODING_UTF8, ENCODING_UNICODE); 384 if (error) 385 return (error); 386 return (0); 387 } 388 389 static int 390 iconv_ucs_done(struct iconv_converter_class *dcp) 391 { 392 return (0); 393 } 394 395 static const char * 396 iconv_ucs_name(struct iconv_converter_class *dcp) 397 { 398 return (ENCODING_UNICODE); 399 } 400 401 static kobj_method_t iconv_ucs_methods[] = { 402 KOBJMETHOD(iconv_converter_open, iconv_ucs_open), 403 KOBJMETHOD(iconv_converter_close, iconv_ucs_close), 404 KOBJMETHOD(iconv_converter_conv, iconv_ucs_conv), 405 KOBJMETHOD(iconv_converter_init, iconv_ucs_init), 406 KOBJMETHOD(iconv_converter_done, iconv_ucs_done), 407 KOBJMETHOD(iconv_converter_name, iconv_ucs_name), 408 {0, 0} 409 }; 410 411 KICONV_CONVERTER(ucs, sizeof(struct iconv_ucs)); 412 413 static uint32_t 414 utf8_to_ucs4(const char *src, size_t *utf8width, size_t srclen) 415 { 416 size_t i, w = 0; 417 uint32_t ucs4 = 0; 418 419 /* 420 * get leading 1 byte from utf-8 421 */ 422 if ((*src & 0x80) == 0) { 423 /* 424 * leading 1 bit is "0" 425 * utf-8: 0xxxxxxx 426 * ucs-4: 00000000 00000000 00000000 0xxxxxxx 427 */ 428 w = 1; 429 /* get trailing 7 bits */ 430 ucs4 = *src & 0x7f; 431 } else if ((*src & 0xe0) == 0xc0) { 432 /* 433 * leading 3 bits are "110" 434 * utf-8: 110xxxxx 10yyyyyy 435 * ucs-4: 00000000 00000000 00000xxx xxyyyyyy 436 */ 437 w = 2; 438 /* get trailing 5 bits */ 439 ucs4 = *src & 0x1f; 440 } else if ((*src & 0xf0) == 0xe0) { 441 /* 442 * leading 4 bits are "1110" 443 * utf-8: 1110xxxx 10yyyyyy 10zzzzzz 444 * ucs-4: 00000000 00000000 xxxxyyyy yyzzzzzz 445 */ 446 w = 3; 447 /* get trailing 4 bits */ 448 ucs4 = *src & 0x0f; 449 } else if ((*src & 0xf8) == 0xf0) { 450 /* 451 * leading 5 bits are "11110" 452 * utf-8: 11110www 10xxxxxx 10yyyyyy 10zzzzzz 453 * ucs-4: 00000000 000wwwxx xxxxyyyy yyzzzzzz 454 */ 455 w = 4; 456 /* get trailing 3 bits */ 457 ucs4 = *src & 0x07; 458 } else { 459 /* out of utf-16 range or having illegal bits */ 460 return (0); 461 } 462 463 if (srclen < w) 464 return (0); 465 466 /* 467 * get left parts from utf-8 468 */ 469 for (i = 1 ; i < w ; i++) { 470 if ((*(src + i) & 0xc0) != 0x80) { 471 /* invalid: leading 2 bits are not "10" */ 472 return (0); 473 } 474 /* concatenate trailing 6 bits into ucs4 */ 475 ucs4 <<= 6; 476 ucs4 |= *(src + i) & 0x3f; 477 } 478 479 *utf8width = w; 480 return (ucs4); 481 } 482 483 static u_char * 484 ucs4_to_utf8(uint32_t ucs4, char *dst, size_t *utf8width, size_t dstlen) 485 { 486 u_char lead, *p; 487 size_t i, w; 488 489 /* 490 * determine utf-8 width and leading bits 491 */ 492 if (ucs4 < 0x80) { 493 w = 1; 494 lead = 0; /* "0" */ 495 } else if (ucs4 < 0x800) { 496 w = 2; 497 lead = 0xc0; /* "11" */ 498 } else if (ucs4 < 0x10000) { 499 w = 3; 500 lead = 0xe0; /* "111" */ 501 } else if (ucs4 < 0x200000) { 502 w = 4; 503 lead = 0xf0; /* "1111" */ 504 } else { 505 return (NULL); 506 } 507 508 if (dstlen < w) 509 return (NULL); 510 511 /* 512 * construct utf-8 513 */ 514 p = dst; 515 for (i = w - 1 ; i >= 1 ; i--) { 516 /* get trailing 6 bits and put it with leading bit as "1" */ 517 *(p + i) = (ucs4 & 0x3f) | 0x80; 518 ucs4 >>= 6; 519 } 520 *p = ucs4 | lead; 521 522 *utf8width = w; 523 524 return (p); 525 } 526 527 static uint32_t 528 encode_surrogate(uint32_t code) 529 { 530 return ((((code - 0x10000) << 6) & 0x3ff0000) | 531 ((code - 0x10000) & 0x3ff) | 0xd800dc00); 532 } 533 534 static uint32_t 535 decode_surrogate(const u_char *ucs) 536 { 537 return ((((ucs[0] & 0x3) << 18) | (ucs[1] << 10) | 538 ((ucs[2] & 0x3) << 8) | ucs[3]) + 0x10000); 539 } 540 541