1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause 3 * 4 * Copyright (c) 2003, 2005 Ryuichiro Imura 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 26 * SUCH DAMAGE. 27 */ 28 29 #include <sys/cdefs.h> 30 __FBSDID("$FreeBSD$"); 31 32 #include <sys/param.h> 33 #include <sys/kernel.h> 34 #include <sys/systm.h> 35 #include <sys/malloc.h> 36 #include <sys/iconv.h> 37 38 #include "iconv_converter_if.h" 39 40 /* 41 * "UCS" converter 42 */ 43 44 #define KICONV_UCS_COMBINE 0x1 45 #define KICONV_UCS_FROM_UTF8 0x2 46 #define KICONV_UCS_TO_UTF8 0x4 47 #define KICONV_UCS_FROM_LE 0x8 48 #define KICONV_UCS_TO_LE 0x10 49 #define KICONV_UCS_FROM_UTF16 0x20 50 #define KICONV_UCS_TO_UTF16 0x40 51 #define KICONV_UCS_UCS4 0x80 52 53 #define ENCODING_UTF16 "UTF-16BE" 54 #define ENCODING_UTF8 "UTF-8" 55 56 static struct { 57 const char *name; 58 int from_flag, to_flag; 59 } unicode_family[] = { 60 { "UTF-8", KICONV_UCS_FROM_UTF8, KICONV_UCS_TO_UTF8 }, 61 { "UCS-2LE", KICONV_UCS_FROM_LE, KICONV_UCS_TO_LE }, 62 { "UTF-16BE", KICONV_UCS_FROM_UTF16, KICONV_UCS_TO_UTF16 }, 63 { "UTF-16LE", KICONV_UCS_FROM_UTF16|KICONV_UCS_FROM_LE, 64 KICONV_UCS_TO_UTF16|KICONV_UCS_TO_LE }, 65 { NULL, 0, 0 } 66 }; 67 68 static uint32_t utf8_to_ucs4(const char *src, size_t *utf8width, size_t srclen); 69 static u_char *ucs4_to_utf8(uint32_t ucs4, char * dst, size_t *utf8width, size_t dstlen); 70 static uint32_t encode_surrogate(uint32_t code); 71 static uint32_t decode_surrogate(const u_char *ucs); 72 73 #ifdef MODULE_DEPEND 74 MODULE_DEPEND(iconv_ucs, libiconv, 2, 2, 2); 75 #endif 76 77 /* 78 * UCS converter instance 79 */ 80 struct iconv_ucs { 81 KOBJ_FIELDS; 82 int convtype; 83 struct iconv_cspair * d_csp; 84 struct iconv_cspair * d_cspf; 85 void * f_ctp; 86 void * t_ctp; 87 void * ctype; 88 }; 89 90 static int 91 iconv_ucs_open(struct iconv_converter_class *dcp, 92 struct iconv_cspair *csp, struct iconv_cspair *cspf, void **dpp) 93 { 94 struct iconv_ucs *dp; 95 int i; 96 const char *from, *to; 97 98 dp = (struct iconv_ucs *)kobj_create((struct kobj_class*)dcp, M_ICONV, M_WAITOK); 99 to = csp->cp_to; 100 from = cspf ? cspf->cp_from : csp->cp_from; 101 102 dp->convtype = 0; 103 104 if (cspf) 105 dp->convtype |= KICONV_UCS_COMBINE; 106 for (i = 0; unicode_family[i].name; i++) { 107 if (strcasecmp(from, unicode_family[i].name) == 0) 108 dp->convtype |= unicode_family[i].from_flag; 109 if (strcasecmp(to, unicode_family[i].name) == 0) 110 dp->convtype |= unicode_family[i].to_flag; 111 } 112 if (strcmp(ENCODING_UNICODE, ENCODING_UTF16) == 0) 113 dp->convtype |= KICONV_UCS_UCS4; 114 else 115 dp->convtype &= ~KICONV_UCS_UCS4; 116 117 dp->f_ctp = dp->t_ctp = NULL; 118 if (dp->convtype & KICONV_UCS_COMBINE) { 119 if ((dp->convtype & KICONV_UCS_FROM_UTF8) == 0 && 120 (dp->convtype & KICONV_UCS_FROM_LE) == 0) { 121 iconv_open(ENCODING_UNICODE, from, &dp->f_ctp); 122 } 123 if ((dp->convtype & KICONV_UCS_TO_UTF8) == 0 && 124 (dp->convtype & KICONV_UCS_TO_LE) == 0) { 125 iconv_open(to, ENCODING_UNICODE, &dp->t_ctp); 126 } 127 } 128 129 dp->ctype = NULL; 130 if (dp->convtype & (KICONV_UCS_FROM_UTF8 | KICONV_UCS_TO_UTF8)) 131 iconv_open(KICONV_WCTYPE_NAME, ENCODING_UTF8, &dp->ctype); 132 133 dp->d_csp = csp; 134 if (dp->convtype & (KICONV_UCS_FROM_UTF8 | KICONV_UCS_FROM_LE)) { 135 if (cspf) { 136 dp->d_cspf = cspf; 137 cspf->cp_refcount++; 138 } else 139 csp->cp_refcount++; 140 } 141 if (dp->convtype & (KICONV_UCS_TO_UTF8 | KICONV_UCS_TO_LE)) 142 csp->cp_refcount++; 143 *dpp = (void*)dp; 144 return 0; 145 } 146 147 static int 148 iconv_ucs_close(void *data) 149 { 150 struct iconv_ucs *dp = data; 151 152 if (dp->f_ctp) 153 iconv_close(dp->f_ctp); 154 if (dp->t_ctp) 155 iconv_close(dp->t_ctp); 156 if (dp->ctype) 157 iconv_close(dp->ctype); 158 if (dp->d_cspf) 159 dp->d_cspf->cp_refcount--; 160 else if (dp->convtype & (KICONV_UCS_FROM_UTF8 | KICONV_UCS_FROM_LE)) 161 dp->d_csp->cp_refcount--; 162 if (dp->convtype & (KICONV_UCS_TO_UTF8 | KICONV_UCS_TO_LE)) 163 dp->d_csp->cp_refcount--; 164 kobj_delete((struct kobj*)data, M_ICONV); 165 return 0; 166 } 167 168 static int 169 iconv_ucs_conv(void *d2p, const char **inbuf, 170 size_t *inbytesleft, char **outbuf, size_t *outbytesleft, 171 int convchar, int casetype) 172 { 173 struct iconv_ucs *dp = (struct iconv_ucs*)d2p; 174 int ret = 0, i; 175 size_t in, on, ir, or, inlen, outlen, ucslen; 176 const char *src, *p; 177 char *dst; 178 u_char ucs[4], *q; 179 uint32_t code; 180 181 if (inbuf == NULL || *inbuf == NULL || outbuf == NULL || *outbuf == NULL) 182 return 0; 183 ir = in = *inbytesleft; 184 or = on = *outbytesleft; 185 src = *inbuf; 186 dst = *outbuf; 187 188 while (ir > 0 && or > 0) { 189 /* 190 * The first half of conversion. 191 * (convert any code into ENCODING_UNICODE) 192 */ 193 code = 0; 194 p = src; 195 if (dp->convtype & KICONV_UCS_FROM_UTF8) { 196 /* convert UTF-8 to ENCODING_UNICODE */ 197 inlen = 0; 198 code = utf8_to_ucs4(p, &inlen, ir); 199 if (code == 0) { 200 ret = -1; 201 break; 202 } 203 204 if (casetype == KICONV_FROM_LOWER && dp->ctype) { 205 code = towlower(code, dp->ctype); 206 } else if (casetype == KICONV_FROM_UPPER && dp->ctype) { 207 code = towupper(code, dp->ctype); 208 } 209 210 if ((code >= 0xd800 && code < 0xe000) || code >= 0x110000 ) { 211 /* reserved for utf-16 surrogate pair */ 212 /* invalid unicode */ 213 ret = -1; 214 break; 215 } 216 217 if (inlen == 4) { 218 if (dp->convtype & KICONV_UCS_UCS4) { 219 ucslen = 4; 220 code = encode_surrogate(code); 221 } else { 222 /* can't handle with ucs-2 */ 223 ret = -1; 224 break; 225 } 226 } else { 227 ucslen = 2; 228 } 229 230 /* save UCS-4 into ucs[] */ 231 for (q = ucs, i = ucslen - 1 ; i >= 0 ; i--) 232 *q++ = (code >> (i << 3)) & 0xff; 233 234 } else if (dp->convtype & KICONV_UCS_COMBINE && dp->f_ctp) { 235 /* convert local code to ENCODING_UNICODE */ 236 ucslen = 4; 237 inlen = ir; 238 q = ucs; 239 ret = iconv_convchr_case(dp->f_ctp, &p, &inlen, (char **)&q, 240 &ucslen, casetype & (KICONV_FROM_LOWER | KICONV_FROM_UPPER)); 241 if (ret) 242 break; 243 inlen = ir - inlen; 244 ucslen = 4 - ucslen; 245 246 } else { 247 /* src code is a proper subset of ENCODING_UNICODE */ 248 q = ucs; 249 if (dp->convtype & KICONV_UCS_FROM_LE) { 250 *q = *(p + 1); 251 *(q + 1) = *p; 252 p += 2; 253 } else { 254 *q = *p++; 255 *(q + 1) = *p++; 256 } 257 if ((*q & 0xfc) == 0xd8) { 258 if (dp->convtype & KICONV_UCS_UCS4 && 259 dp->convtype & KICONV_UCS_FROM_UTF16) { 260 inlen = ucslen = 4; 261 } else { 262 /* invalid unicode */ 263 ret = -1; 264 break; 265 } 266 } else { 267 inlen = ucslen = 2; 268 } 269 if (ir < inlen) { 270 ret = -1; 271 break; 272 } 273 if (ucslen == 4) { 274 q += 2; 275 if (dp->convtype & KICONV_UCS_FROM_LE) { 276 *q = *(p + 1); 277 *(q + 1) = *p; 278 } else { 279 *q = *p++; 280 *(q + 1) = *p; 281 } 282 if ((*q & 0xfc) != 0xdc) { 283 /* invalid unicode */ 284 ret = -1; 285 break; 286 } 287 } 288 } 289 290 /* 291 * The second half of conversion. 292 * (convert ENCODING_UNICODE into any code) 293 */ 294 p = ucs; 295 if (dp->convtype & KICONV_UCS_TO_UTF8) { 296 q = (u_char *)dst; 297 if (ucslen == 4 && dp->convtype & KICONV_UCS_UCS4) { 298 /* decode surrogate pair */ 299 code = decode_surrogate(p); 300 } else { 301 code = (ucs[0] << 8) | ucs[1]; 302 } 303 304 if (casetype == KICONV_LOWER && dp->ctype) { 305 code = towlower(code, dp->ctype); 306 } else if (casetype == KICONV_UPPER && dp->ctype) { 307 code = towupper(code, dp->ctype); 308 } 309 310 outlen = 0; 311 if (ucs4_to_utf8(code, q, &outlen, or) == NULL) { 312 ret = -1; 313 break; 314 } 315 316 src += inlen; 317 ir -= inlen; 318 dst += outlen; 319 or -= outlen; 320 321 } else if (dp->convtype & KICONV_UCS_COMBINE && dp->t_ctp) { 322 ret = iconv_convchr_case(dp->t_ctp, &p, &ucslen, &dst, 323 &or, casetype & (KICONV_LOWER | KICONV_UPPER)); 324 if (ret) 325 break; 326 327 src += inlen; 328 ir -= inlen; 329 330 } else { 331 /* dst code is a proper subset of ENCODING_UNICODE */ 332 if (or < ucslen) { 333 ret = -1; 334 break; 335 } 336 src += inlen; 337 ir -= inlen; 338 or -= ucslen; 339 if (dp->convtype & KICONV_UCS_TO_LE) { 340 *dst++ = *(p + 1); 341 *dst++ = *p; 342 p += 2; 343 } else { 344 *dst++ = *p++; 345 *dst++ = *p++; 346 } 347 if (ucslen == 4) { 348 if ((dp->convtype & KICONV_UCS_UCS4) == 0 || 349 (dp->convtype & KICONV_UCS_TO_UTF16) == 0) { 350 ret = -1; 351 break; 352 } 353 if (dp->convtype & KICONV_UCS_TO_LE) { 354 *dst++ = *(p + 1); 355 *dst++ = *p; 356 } else { 357 *dst++ = *p++; 358 *dst++ = *p; 359 } 360 } 361 } 362 363 if (convchar == 1) 364 break; 365 } 366 367 *inbuf += in - ir; 368 *outbuf += on - or; 369 *inbytesleft -= in - ir; 370 *outbytesleft -= on - or; 371 return (ret); 372 } 373 374 static int 375 iconv_ucs_init(struct iconv_converter_class *dcp) 376 { 377 int error; 378 379 error = iconv_add(ENCODING_UNICODE, ENCODING_UNICODE, ENCODING_UTF8); 380 if (error) 381 return (error); 382 error = iconv_add(ENCODING_UNICODE, ENCODING_UTF8, ENCODING_UNICODE); 383 if (error) 384 return (error); 385 return (0); 386 } 387 388 static int 389 iconv_ucs_done(struct iconv_converter_class *dcp) 390 { 391 return (0); 392 } 393 394 static const char * 395 iconv_ucs_name(struct iconv_converter_class *dcp) 396 { 397 return (ENCODING_UNICODE); 398 } 399 400 static kobj_method_t iconv_ucs_methods[] = { 401 KOBJMETHOD(iconv_converter_open, iconv_ucs_open), 402 KOBJMETHOD(iconv_converter_close, iconv_ucs_close), 403 KOBJMETHOD(iconv_converter_conv, iconv_ucs_conv), 404 KOBJMETHOD(iconv_converter_init, iconv_ucs_init), 405 KOBJMETHOD(iconv_converter_done, iconv_ucs_done), 406 KOBJMETHOD(iconv_converter_name, iconv_ucs_name), 407 {0, 0} 408 }; 409 410 KICONV_CONVERTER(ucs, sizeof(struct iconv_ucs)); 411 412 static uint32_t 413 utf8_to_ucs4(const char *src, size_t *utf8width, size_t srclen) 414 { 415 size_t i, w = 0; 416 uint32_t ucs4 = 0; 417 418 /* 419 * get leading 1 byte from utf-8 420 */ 421 if ((*src & 0x80) == 0) { 422 /* 423 * leading 1 bit is "0" 424 * utf-8: 0xxxxxxx 425 * ucs-4: 00000000 00000000 00000000 0xxxxxxx 426 */ 427 w = 1; 428 /* get trailing 7 bits */ 429 ucs4 = *src & 0x7f; 430 } else if ((*src & 0xe0) == 0xc0) { 431 /* 432 * leading 3 bits are "110" 433 * utf-8: 110xxxxx 10yyyyyy 434 * ucs-4: 00000000 00000000 00000xxx xxyyyyyy 435 */ 436 w = 2; 437 /* get trailing 5 bits */ 438 ucs4 = *src & 0x1f; 439 } else if ((*src & 0xf0) == 0xe0) { 440 /* 441 * leading 4 bits are "1110" 442 * utf-8: 1110xxxx 10yyyyyy 10zzzzzz 443 * ucs-4: 00000000 00000000 xxxxyyyy yyzzzzzz 444 */ 445 w = 3; 446 /* get trailing 4 bits */ 447 ucs4 = *src & 0x0f; 448 } else if ((*src & 0xf8) == 0xf0) { 449 /* 450 * leading 5 bits are "11110" 451 * utf-8: 11110www 10xxxxxx 10yyyyyy 10zzzzzz 452 * ucs-4: 00000000 000wwwxx xxxxyyyy yyzzzzzz 453 */ 454 w = 4; 455 /* get trailing 3 bits */ 456 ucs4 = *src & 0x07; 457 } else { 458 /* out of utf-16 range or having illegal bits */ 459 return (0); 460 } 461 462 if (srclen < w) 463 return (0); 464 465 /* 466 * get left parts from utf-8 467 */ 468 for (i = 1 ; i < w ; i++) { 469 if ((*(src + i) & 0xc0) != 0x80) { 470 /* invalid: leading 2 bits are not "10" */ 471 return (0); 472 } 473 /* concatenate trailing 6 bits into ucs4 */ 474 ucs4 <<= 6; 475 ucs4 |= *(src + i) & 0x3f; 476 } 477 478 *utf8width = w; 479 return (ucs4); 480 } 481 482 static u_char * 483 ucs4_to_utf8(uint32_t ucs4, char *dst, size_t *utf8width, size_t dstlen) 484 { 485 u_char lead, *p; 486 size_t i, w; 487 488 /* 489 * determine utf-8 width and leading bits 490 */ 491 if (ucs4 < 0x80) { 492 w = 1; 493 lead = 0; /* "0" */ 494 } else if (ucs4 < 0x800) { 495 w = 2; 496 lead = 0xc0; /* "11" */ 497 } else if (ucs4 < 0x10000) { 498 w = 3; 499 lead = 0xe0; /* "111" */ 500 } else if (ucs4 < 0x200000) { 501 w = 4; 502 lead = 0xf0; /* "1111" */ 503 } else { 504 return (NULL); 505 } 506 507 if (dstlen < w) 508 return (NULL); 509 510 /* 511 * construct utf-8 512 */ 513 p = dst; 514 for (i = w - 1 ; i >= 1 ; i--) { 515 /* get trailing 6 bits and put it with leading bit as "1" */ 516 *(p + i) = (ucs4 & 0x3f) | 0x80; 517 ucs4 >>= 6; 518 } 519 *p = ucs4 | lead; 520 521 *utf8width = w; 522 523 return (p); 524 } 525 526 static uint32_t 527 encode_surrogate(uint32_t code) 528 { 529 return ((((code - 0x10000) << 6) & 0x3ff0000) | 530 ((code - 0x10000) & 0x3ff) | 0xd800dc00); 531 } 532 533 static uint32_t 534 decode_surrogate(const u_char *ucs) 535 { 536 return ((((ucs[0] & 0x3) << 18) | (ucs[1] << 10) | 537 ((ucs[2] & 0x3) << 8) | ucs[3]) + 0x10000); 538 } 539