1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #include <sys/types.h> 27 #include <sys/param.h> 28 #include <sys/sysmacros.h> 29 #include <sys/systm.h> 30 #include <sys/debug.h> 31 #include <sys/kmem.h> 32 #include <sys/sunddi.h> 33 #include <sys/byteorder.h> 34 #include <sys/errno.h> 35 #include <sys/modctl.h> 36 #include <sys/u8_textprep.h> 37 #include <sys/kiconv.h> 38 #include <sys/kiconv_cck_common.h> 39 #include <sys/kiconv_tc.h> 40 #include <sys/kiconv_big5_utf8.h> 41 #include <sys/kiconv_euctw_utf8.h> 42 #include <sys/kiconv_hkscs_utf8.h> 43 #include <sys/kiconv_cp950hkscs_utf8.h> 44 #include <sys/kiconv_utf8_big5.h> 45 #include <sys/kiconv_utf8_euctw.h> 46 #include <sys/kiconv_utf8_cp950hkscs.h> 47 #include <sys/kiconv_utf8_hkscs.h> 48 49 /* 4 HKSCS-2004 code points map to 2 Unicode code points separately. */ 50 static uchar_t hkscs_special_sequence[][4] = { 51 { 0xc3, 0x8a, 0xcc, 0x84 }, /* 0x8862 */ 52 { 0xc3, 0x8a, 0xcc, 0x8c }, /* 0x8864 */ 53 { 0xc3, 0xaa, 0xcc, 0x84 }, /* 0x88a3 */ 54 { 0xc3, 0xaa, 0xcc, 0x8c } /* 0x88a5 */ 55 }; 56 57 /* 4 Unicode code point pair map to 1 HKSCS-2004 code point. */ 58 static uint32_t ucs_special_sequence[] = { 59 0x8866, /* U+00ca */ 60 0x8862, /* U+00ca U+0304 */ 61 0x8864, /* U+00ca U+030c */ 62 0x88a7, /* U+00ea */ 63 0x88a3, /* U+00ea U+0304 */ 64 0x88a5 /* U+00ea U+030c */ 65 }; 66 67 typedef int8_t (*kiconv_big5toutf8_t)(uint32_t value, uchar_t *ob, 68 uchar_t *obtail, size_t *ret_val); 69 70 static int8_t utf8_to_big5(uint32_t utf8, uchar_t **inbuf, uchar_t *ibtail, 71 uchar_t *ob, uchar_t *obtail, size_t *ret_val); 72 static int8_t utf8_to_euctw(uint32_t utf8, uchar_t **inbuf, uchar_t *ibtail, 73 uchar_t *ob, uchar_t *obtail, size_t *ret_val); 74 static int8_t utf8_to_cp950hkscs(uint32_t utf8, uchar_t **inbuf, 75 uchar_t *ibtail, uchar_t *ob, uchar_t *obtail, size_t *ret_val); 76 static int8_t utf8_to_big5hkscs(uint32_t utf8, uchar_t **inbuf, uchar_t *ibtail, 77 uchar_t *ob, uchar_t *obtail, size_t *ret_val); 78 static int8_t big5_to_utf8(uint32_t big5_val, uchar_t *ob, uchar_t *obtail, 79 size_t *ret_val); 80 static int8_t big5hkscs_to_utf8(uint32_t hkscs_val, uchar_t *ob, 81 uchar_t *obtail, size_t *ret_val); 82 static int8_t cp950hkscs_to_utf8(uint32_t hkscs_val, uchar_t *ob, 83 uchar_t *obtail, size_t *ret_val); 84 static int8_t euctw_to_utf8(size_t plane_no, uint32_t euctw_val, 85 uchar_t *ob, uchar_t *obtail, size_t *ret_val); 86 static uint32_t get_unicode_from_UDA(size_t plane_no, uchar_t byte1, 87 uchar_t byte2); 88 89 #define KICONV_TC_BIG5 (0x01) 90 #define KICONV_TC_BIG5HKSCS (0x02) 91 #define KICONV_TC_CP950HKSCS (0x03) 92 #define KICONV_TC_EUCTW (0x04) 93 #define KICONV_TC_MAX_MAGIC_ID (0x04) 94 95 static void * 96 open_fr_big5() 97 { 98 return ((void *)KICONV_TC_BIG5); 99 } 100 101 static void * 102 open_fr_big5hkscs() 103 { 104 return ((void *)KICONV_TC_BIG5HKSCS); 105 } 106 107 static void * 108 open_fr_cp950hkscs() 109 { 110 return ((void *)KICONV_TC_CP950HKSCS); 111 } 112 113 static void * 114 open_fr_euctw() 115 { 116 return ((void *)KICONV_TC_EUCTW); 117 } 118 119 static int 120 close_fr_tc(void *s) 121 { 122 if ((uintptr_t)s > KICONV_TC_MAX_MAGIC_ID) 123 return (EBADF); 124 125 return (0); 126 } 127 128 /* 129 * Common convertor from BIG5/HKSCS(BIG5-HKSCS or CP950-HKSCS) to UTF-8. 130 */ 131 static size_t 132 kiconv_fr_big5_common(void *kcd, char **inbuf, size_t *inbytesleft, 133 char **outbuf, size_t *outbytesleft, int *errno, 134 kiconv_big5toutf8_t ptr_big5touf8) 135 { 136 uchar_t *ib; 137 uchar_t *ob; 138 uchar_t *ibtail; 139 uchar_t *obtail; 140 size_t ret_val; 141 int8_t sz; 142 uint32_t big5_val; 143 144 /* Check on the kiconv code conversion descriptor. */ 145 if (kcd == NULL || kcd == (void *)-1) { 146 *errno = EBADF; 147 return ((size_t)-1); 148 } 149 150 /* If this is a state reset request, process and return. */ 151 if (inbuf == NULL || *inbuf == NULL) { 152 return (0); 153 } 154 155 ret_val = 0; 156 ib = (uchar_t *)*inbuf; 157 ob = (uchar_t *)*outbuf; 158 ibtail = ib + *inbytesleft; 159 obtail = ob + *outbytesleft; 160 161 while (ib < ibtail) { 162 if (KICONV_IS_ASCII(*ib)) { 163 if (ob >= obtail) { 164 KICONV_SET_ERRNO_AND_BREAK(E2BIG); 165 } 166 167 *ob++ = *ib++; 168 continue; 169 } 170 171 /* 172 * Issue EILSEQ error if the first byte is not a 173 * valid BIG5/HKSCS leading byte. 174 */ 175 if (! KICONV_TC_IS_BIG5_1st_BYTE(*ib)) { 176 KICONV_SET_ERRNO_AND_BREAK(EILSEQ); 177 } 178 179 /* 180 * Issue EINVAL error if input buffer has an incomplete 181 * character at the end of the buffer. 182 */ 183 if (ibtail - ib < 2) { 184 KICONV_SET_ERRNO_AND_BREAK(EINVAL); 185 } 186 187 /* 188 * Issue EILSEQ error if the remaining bytes is not 189 * a valid BIG5/HKSCS byte. 190 */ 191 if (! KICONV_TC_IS_BIG5_2nd_BYTE(*(ib + 1))) { 192 KICONV_SET_ERRNO_AND_BREAK(EILSEQ); 193 } 194 195 /* Now we have a valid BIG5/HKSCS character. */ 196 big5_val = (uint32_t)(*ib) << 8 | *(ib + 1); 197 sz = ptr_big5touf8(big5_val, ob, obtail, &ret_val); 198 199 if (sz < 0) { 200 KICONV_SET_ERRNO_AND_BREAK(E2BIG); 201 } 202 203 ib += 2; 204 ob += sz; 205 } 206 207 *inbuf = (char *)ib; 208 *inbytesleft = ibtail - ib; 209 *outbuf = (char *)ob; 210 *outbytesleft = obtail - ob; 211 212 return (ret_val); 213 } 214 215 /* 216 * String based Common convertor from BIG5/HKSCS(BIG5-HKSCS or CP950-HKSCS) 217 * to UTF-8. 218 */ 219 static size_t 220 kiconvstr_fr_big5_common(uchar_t *ib, size_t *inlen, uchar_t *ob, 221 size_t *outlen, int flag, int *errno, 222 kiconv_big5toutf8_t ptr_big5touf8) 223 { 224 uchar_t *oldib; 225 uchar_t *ibtail; 226 uchar_t *obtail; 227 size_t ret_val; 228 int8_t sz; 229 uint32_t big5_val; 230 boolean_t do_not_ignore_null; 231 232 ret_val = 0; 233 ibtail = ib + *inlen; 234 obtail = ob + *outlen; 235 do_not_ignore_null = ((flag & KICONV_IGNORE_NULL) == 0); 236 237 while (ib < ibtail) { 238 if (*ib == '\0' && do_not_ignore_null) 239 break; 240 241 if (KICONV_IS_ASCII(*ib)) { 242 if (ob >= obtail) { 243 KICONV_SET_ERRNO_AND_BREAK(E2BIG); 244 } 245 246 *ob++ = *ib++; 247 continue; 248 } 249 250 oldib = ib; 251 252 if (! KICONV_TC_IS_BIG5_1st_BYTE(*ib)) { 253 KICONV_SET_ERRNO_WITH_FLAG(1, EILSEQ); 254 } 255 256 if (ibtail - ib < 2) { 257 KICONV_SET_ERRNO_WITH_FLAG(1, EINVAL); 258 } 259 260 if (! KICONV_TC_IS_BIG5_2nd_BYTE(*(ib + 1))) { 261 KICONV_SET_ERRNO_WITH_FLAG(2, EILSEQ); 262 } 263 264 big5_val = *ib++; 265 big5_val = (big5_val << 8) | *ib++; 266 sz = ptr_big5touf8(big5_val, ob, obtail, &ret_val); 267 268 if (sz < 0) { 269 ib = oldib; 270 KICONV_SET_ERRNO_AND_BREAK(E2BIG); 271 } 272 273 ob += sz; 274 continue; 275 276 REPLACE_INVALID: 277 if (obtail - ob < KICONV_UTF8_REPLACEMENT_CHAR_LEN) { 278 ib = oldib; 279 KICONV_SET_ERRNO_AND_BREAK(E2BIG); 280 } 281 282 *ob++ = KICONV_UTF8_REPLACEMENT_CHAR1; 283 *ob++ = KICONV_UTF8_REPLACEMENT_CHAR2; 284 *ob++ = KICONV_UTF8_REPLACEMENT_CHAR3; 285 ret_val++; 286 } 287 288 *inlen = ibtail - ib; 289 *outlen = obtail - ob; 290 291 return (ret_val); 292 } 293 294 /* 295 * Encoding convertor from BIG5 to UTF-8. 296 */ 297 static size_t 298 kiconv_fr_big5(void *kcd, char **inbuf, size_t *inbytesleft, char **outbuf, 299 size_t *outbytesleft, int *errno) 300 { 301 return (kiconv_fr_big5_common(kcd, inbuf, inbytesleft, outbuf, 302 outbytesleft, errno, big5_to_utf8)); 303 } 304 305 /* 306 * String based encoding convertor from BIG5 to UTF-8. 307 */ 308 static size_t 309 kiconvstr_fr_big5(char *inarray, size_t *inlen, char *outarray, 310 size_t *outlen, int flag, int *errno) 311 { 312 return (kiconvstr_fr_big5_common((uchar_t *)inarray, inlen, 313 (uchar_t *)outarray, outlen, flag, errno, 314 big5_to_utf8)); 315 } 316 317 /* 318 * Encoding convertor from BIG5-HKSCS to UTF-8. 319 */ 320 static size_t 321 kiconv_fr_big5hkscs(void *kcd, char **inbuf, size_t *inbytesleft, 322 char **outbuf, size_t *outbytesleft, int *errno) 323 { 324 return kiconv_fr_big5_common(kcd, inbuf, inbytesleft, outbuf, 325 outbytesleft, errno, big5hkscs_to_utf8); 326 } 327 328 /* 329 * String based encoding convertor from BIG5-HKSCS to UTF-8. 330 */ 331 static size_t 332 kiconvstr_fr_big5hkscs(char *inarray, size_t *inlen, char *outarray, 333 size_t *outlen, int flag, int *errno) 334 { 335 return kiconvstr_fr_big5_common((uchar_t *)inarray, inlen, 336 (uchar_t *)outarray, outlen, flag, errno, big5hkscs_to_utf8); 337 } 338 339 /* 340 * Encoding convertor from CP950-HKSCS to UTF-8. 341 */ 342 static size_t 343 kiconv_fr_cp950hkscs(void *kcd, char **inbuf, size_t *inbytesleft, 344 char **outbuf, size_t *outbytesleft, int *errno) 345 { 346 return kiconv_fr_big5_common(kcd, inbuf, inbytesleft, outbuf, 347 outbytesleft, errno, cp950hkscs_to_utf8); 348 } 349 350 /* 351 * String based encoding convertor from CP950-HKSCS to UTF-8. 352 */ 353 static size_t 354 kiconvstr_fr_cp950hkscs(char *inarray, size_t *inlen, char *outarray, 355 size_t *outlen, int flag, int *errno) 356 { 357 return kiconvstr_fr_big5_common((uchar_t *)inarray, inlen, 358 (uchar_t *)outarray, outlen, flag, errno, cp950hkscs_to_utf8); 359 } 360 361 /* 362 * Encoding convertor from EUC-TW to UTF-8. 363 */ 364 static size_t 365 kiconv_fr_euctw(void *kcd, char **inbuf, size_t *inbytesleft, 366 char **outbuf, size_t *outbytesleft, int *errno) 367 { 368 uchar_t *ib; 369 uchar_t *ob; 370 uchar_t *ibtail; 371 uchar_t *obtail; 372 uchar_t *oldib; 373 size_t ret_val; 374 size_t plane_no; 375 int8_t sz; 376 uint32_t euctw_val; 377 boolean_t isplane1; 378 379 /* Check on the kiconv code conversion descriptor. */ 380 if (kcd == NULL || kcd == (void *)-1) { 381 *errno = EBADF; 382 return ((size_t)-1); 383 } 384 385 /* If this is a state reset request, process and return. */ 386 if (inbuf == NULL || *inbuf == NULL) { 387 return (0); 388 } 389 390 ret_val = 0; 391 ib = (uchar_t *)*inbuf; 392 ob = (uchar_t *)*outbuf; 393 ibtail = ib + *inbytesleft; 394 obtail = ob + *outbytesleft; 395 396 while (ib < ibtail) { 397 if (KICONV_IS_ASCII(*ib)) { 398 if (ob >= obtail) { 399 KICONV_SET_ERRNO_AND_BREAK(E2BIG); 400 } 401 402 *ob++ = *ib++; 403 continue; 404 } 405 406 /* 407 * Issue EILSEQ error if the first byte is not a 408 * valid EUC-TW leading byte. 409 */ 410 if (! KICONV_TC_IS_EUCTW_1st_BYTE(*ib)) { 411 KICONV_SET_ERRNO_AND_BREAK(EILSEQ); 412 } 413 414 isplane1 = (*ib == KICONV_TC_EUCTW_MBYTE) ? 415 B_FALSE : B_TRUE; 416 417 /* 418 * Issue EINVAL error if input buffer has an incomplete 419 * character at the end of the buffer. 420 */ 421 if (ibtail - ib < (isplane1 ? 2 : 4)) { 422 KICONV_SET_ERRNO_AND_BREAK(EINVAL); 423 } 424 425 oldib = ib; 426 plane_no = isplane1 ? 1 : *(ib + 1) - KICONV_TC_EUCTW_PMASK; 427 428 /* 429 * Issue EILSEQ error if the remaining bytes are not 430 * valid EUC-TW bytes. 431 */ 432 if (! KICONV_TC_IS_VALID_EUCTW_SEQ(ib)) { 433 KICONV_SET_ERRNO_AND_BREAK(EILSEQ); 434 } 435 436 if (! isplane1) 437 ib += 2; 438 439 /* Now we have a valid EUC-TW character. */ 440 euctw_val = *ib++; 441 euctw_val = (euctw_val << 8) | *ib++; 442 sz = euctw_to_utf8(plane_no, euctw_val, ob, obtail, &ret_val); 443 444 if (sz < 0) { 445 ib = oldib; 446 KICONV_SET_ERRNO_AND_BREAK(E2BIG); 447 } 448 449 ob += sz; 450 } 451 452 *inbuf = (char *)ib; 453 *inbytesleft = ibtail - ib; 454 *outbuf = (char *)ob; 455 *outbytesleft = obtail - ob; 456 457 return (ret_val); 458 } 459 460 /* 461 * String based encoding convertor from EUC-TW to UTF-8. 462 */ 463 static size_t 464 kiconvstr_fr_euctw(char *inarray, size_t *inlen, char *outarray, 465 size_t *outlen, int flag, int *errno) 466 { 467 uchar_t *ib; 468 uchar_t *ob; 469 uchar_t *ibtail; 470 uchar_t *obtail; 471 uchar_t *oldib; 472 size_t ret_val; 473 size_t plane_no; 474 int8_t sz; 475 uint32_t euctw_val; 476 boolean_t isplane1; 477 boolean_t do_not_ignore_null; 478 479 ret_val = 0; 480 ib = (uchar_t *)inarray; 481 ob = (uchar_t *)outarray; 482 ibtail = ib + *inlen; 483 obtail = ob + *outlen; 484 do_not_ignore_null = ((flag & KICONV_IGNORE_NULL) == 0); 485 486 while (ib < ibtail) { 487 if (*ib == '\0' && do_not_ignore_null) 488 break; 489 490 if (KICONV_IS_ASCII(*ib)) { 491 if (ob >= obtail) { 492 KICONV_SET_ERRNO_AND_BREAK(E2BIG); 493 } 494 495 *ob++ = *ib++; 496 continue; 497 } 498 499 oldib = ib; 500 501 if (! KICONV_TC_IS_EUCTW_1st_BYTE(*ib)) { 502 KICONV_SET_ERRNO_WITH_FLAG(1, EILSEQ); 503 } 504 505 isplane1 = (*ib == KICONV_TC_EUCTW_MBYTE) ? 506 B_FALSE : B_TRUE; 507 508 if (ibtail - ib < (isplane1 ? 2 : 4)) { 509 if (flag & KICONV_REPLACE_INVALID) { 510 ib = ibtail; 511 goto REPLACE_INVALID; 512 } 513 514 KICONV_SET_ERRNO_AND_BREAK(EINVAL); 515 } 516 517 plane_no = isplane1 ? 1 : *(ib + 1) - KICONV_TC_EUCTW_PMASK; 518 519 if (! KICONV_TC_IS_VALID_EUCTW_SEQ(ib)) { 520 KICONV_SET_ERRNO_WITH_FLAG(isplane1 ? 2 : 4, EILSEQ); 521 } 522 523 if (! isplane1) 524 ib += 2; 525 526 euctw_val = *ib++; 527 euctw_val = (euctw_val << 8) | *ib++; 528 sz = euctw_to_utf8(plane_no, euctw_val, ob, obtail, &ret_val); 529 530 if (sz < 0) { 531 ib = oldib; 532 KICONV_SET_ERRNO_AND_BREAK(E2BIG); 533 } 534 535 ob += sz; 536 continue; 537 538 REPLACE_INVALID: 539 if (obtail - ob < KICONV_UTF8_REPLACEMENT_CHAR_LEN) { 540 ib = oldib; 541 KICONV_SET_ERRNO_AND_BREAK(E2BIG); 542 } 543 544 *ob++ = KICONV_UTF8_REPLACEMENT_CHAR1; 545 *ob++ = KICONV_UTF8_REPLACEMENT_CHAR2; 546 *ob++ = KICONV_UTF8_REPLACEMENT_CHAR3; 547 ret_val++; 548 } 549 550 *inlen = ibtail - ib; 551 *outlen = obtail - ob; 552 553 return (ret_val); 554 } 555 556 /* 557 * Encoding convertor from UTF-8 to BIG5. 558 */ 559 static size_t 560 kiconv_to_big5(void *kcd, char **inbuf, size_t *inbytesleft, 561 char **outbuf, size_t *outbytesleft, int *errno) 562 { 563 return kiconv_utf8_to_cck(kcd, inbuf, inbytesleft, outbuf, 564 outbytesleft, errno, utf8_to_big5); 565 } 566 567 /* 568 * String based encoding convertor from UTF-8 to BIG5. 569 */ 570 static size_t 571 kiconvstr_to_big5(char *inarray, size_t *inlen, char *outarray, 572 size_t *outlen, int flag, int *errno) 573 { 574 return kiconvstr_utf8_to_cck((uchar_t *)inarray, inlen, 575 (uchar_t *)outarray, outlen, flag, errno, utf8_to_big5); 576 } 577 578 /* 579 * Encoding convertor from UTF-8 to EUC-TW. 580 */ 581 static size_t 582 kiconv_to_euctw(void *kcd, char **inbuf, size_t *inbytesleft, 583 char **outbuf, size_t *outbytesleft, int *errno) 584 { 585 return kiconv_utf8_to_cck(kcd, inbuf, inbytesleft, outbuf, 586 outbytesleft, errno, utf8_to_euctw); 587 } 588 589 /* 590 * String based encoding convertor from UTF-8 to EUC-TW. 591 */ 592 static size_t 593 kiconvstr_to_euctw(char *inarray, size_t *inlen, char *outarray, 594 size_t *outlen, int flag, int *errno) 595 { 596 return kiconvstr_utf8_to_cck((uchar_t *)inarray, inlen, 597 (uchar_t *)outarray, outlen, flag, errno, utf8_to_euctw); 598 } 599 600 /* 601 * Encoding convertor from UTF-8 to CP950HKSCS. 602 */ 603 static size_t 604 kiconv_to_cp950hkscs(void *kcd, char **inbuf, size_t *inbytesleft, 605 char **outbuf, size_t *outbytesleft, int *errno) 606 { 607 return kiconv_utf8_to_cck(kcd, inbuf, inbytesleft, outbuf, 608 outbytesleft, errno, utf8_to_cp950hkscs); 609 } 610 611 /* 612 * String based encoding convertor from UTF-8 to CP950HKSCS. 613 */ 614 static size_t 615 kiconvstr_to_cp950hkscs(char *inarray, size_t *inlen, char *outarray, 616 size_t *outlen, int flag, int *errno) 617 { 618 return kiconvstr_utf8_to_cck((uchar_t *)inarray, inlen, 619 (uchar_t *)outarray, outlen, flag, errno, utf8_to_cp950hkscs); 620 } 621 622 /* 623 * Encoding convertor from UTF-8 to BIG5HKSCS(HKSCS-2004). 624 */ 625 static size_t 626 kiconv_to_big5hkscs(void *kcd, char **inbuf, size_t *inbytesleft, 627 char **outbuf, size_t *outbytesleft, int *errno) 628 { 629 return kiconv_utf8_to_cck(kcd, inbuf, inbytesleft, outbuf, 630 outbytesleft, errno, utf8_to_big5hkscs); 631 } 632 633 /* 634 * String based encoding convertor from UTF-8 to BIG5HKSCS(HKSCS-2004). 635 */ 636 static size_t 637 kiconvstr_to_big5hkscs(char *inarray, size_t *inlen, char *outarray, 638 size_t *outlen, int flag, int *errno) 639 { 640 return kiconvstr_utf8_to_cck((uchar_t *)inarray, inlen, 641 (uchar_t *)outarray, outlen, flag, errno, utf8_to_big5hkscs); 642 } 643 644 /* 645 * Common convertor from single BIG5/CP950-HKSCS character to UTF-8. 646 * Return: > 0 - Converted successfully 647 * = -1 - E2BIG 648 */ 649 static int8_t 650 big5_to_utf8_common(uint32_t big5_val, uchar_t *ob, uchar_t *obtail, 651 size_t *ret_val, kiconv_table_array_t *table, size_t nitems) 652 { 653 size_t index; 654 int8_t sz; 655 uchar_t *u8; 656 657 index = kiconv_binsearch(big5_val, table, nitems); 658 u8 = table[index].u8; 659 sz = u8_number_of_bytes[u8[0]]; 660 661 if (obtail - ob < sz) { 662 *ret_val = (size_t)-1; 663 return (-1); 664 } 665 666 if (index == 0) 667 (*ret_val)++; /* Non-identical conversion */ 668 669 for (index = 0; index < sz; index++) 670 *ob++ = u8[index]; 671 672 return (sz); 673 } 674 675 /* 676 * Convert single BIG5 character to UTF-8. 677 */ 678 static int8_t 679 big5_to_utf8(uint32_t big5_val, uchar_t *ob, uchar_t *obtail, size_t *ret_val) 680 { 681 return (big5_to_utf8_common(big5_val, ob, obtail, ret_val, 682 kiconv_big5_utf8, KICONV_BIG5_UTF8_MAX)); 683 } 684 685 /* 686 * Convert single CP950-HKSCS character to UTF-8. 687 */ 688 static int8_t 689 cp950hkscs_to_utf8(uint32_t hkscs_val, uchar_t *ob, uchar_t *obtail, 690 size_t *ret_val) 691 { 692 return (big5_to_utf8_common(hkscs_val, ob, obtail, ret_val, 693 kiconv_cp950hkscs_utf8, KICONV_CP950HKSCS_UTF8_MAX)); 694 } 695 696 /* 697 * Calculate unicode value for some CNS planes which fall in Unicode 698 * UDA range. 699 */ 700 static uint32_t 701 get_unicode_from_UDA(size_t plane_no, uchar_t b1, uchar_t b2) 702 { 703 /* 704 * CNS Plane 15 is pre-allocated, so need move Plane 16 to back 15 705 * to compute the Unicode value. 706 */ 707 if (plane_no == 16) 708 --plane_no; 709 710 /* 0xF0000 + (plane_no - 12) * 8836 + (b1 - 0xA1) * 94 + (b2 - 0xA1) */ 711 return (8836 * plane_no + 94 * b1 + b2 + 0xD2611); 712 } 713 714 /* 715 * Convert single EUC-TW character to UTF-8. 716 * Return: > 0 - Converted successfully 717 * = -1 - E2BIG 718 */ 719 static int8_t 720 euctw_to_utf8(size_t plane_no, uint32_t euctw_val, uchar_t *ob, 721 uchar_t *obtail, size_t *ret_val) 722 { 723 uint32_t u32; 724 size_t index; 725 int8_t sz; 726 uchar_t udc[4]; 727 uchar_t *u8; 728 729 switch (plane_no) { 730 case 1: 731 index = kiconv_binsearch(euctw_val, kiconv_cns1_utf8, 732 KICONV_CNS1_UTF8_MAX); 733 u8 = kiconv_cns1_utf8[index].u8; 734 break; 735 case 2: 736 index = kiconv_binsearch(euctw_val, kiconv_cns2_utf8, 737 KICONV_CNS2_UTF8_MAX); 738 u8 = kiconv_cns2_utf8[index].u8; 739 break; 740 case 3: 741 index = kiconv_binsearch(euctw_val, kiconv_cns3_utf8, 742 KICONV_CNS3_UTF8_MAX); 743 u8 = kiconv_cns3_utf8[index].u8; 744 break; 745 case 4: 746 index = kiconv_binsearch(euctw_val, kiconv_cns4_utf8, 747 KICONV_CNS4_UTF8_MAX); 748 u8 = kiconv_cns4_utf8[index].u8; 749 break; 750 case 5: 751 index = kiconv_binsearch(euctw_val, kiconv_cns5_utf8, 752 KICONV_CNS5_UTF8_MAX); 753 u8 = kiconv_cns5_utf8[index].u8; 754 break; 755 case 6: 756 index = kiconv_binsearch(euctw_val, kiconv_cns6_utf8, 757 KICONV_CNS6_UTF8_MAX); 758 u8 = kiconv_cns6_utf8[index].u8; 759 break; 760 case 7: 761 index = kiconv_binsearch(euctw_val, kiconv_cns7_utf8, 762 KICONV_CNS7_UTF8_MAX); 763 u8 = kiconv_cns7_utf8[index].u8; 764 break; 765 case 12: 766 case 13: 767 case 14: 768 case 16: 769 u32 = get_unicode_from_UDA(plane_no, 770 (euctw_val & 0xFF00) >> 8, euctw_val & 0xFF); 771 /* 772 * As U+F0000 <= u32 <= U+F8A0F, so its UTF-8 sequence 773 * will occupy 4 bytes. 774 */ 775 udc[0] = 0xF3; 776 udc[1] = (uchar_t)(0x80 | (u32 & 0x03F000) >> 12); 777 udc[2] = (uchar_t)(0x80 | (u32 & 0x000FC0) >> 6); 778 udc[3] = (uchar_t)(0x80 | (u32 & 0x00003F)); 779 u8 = udc; 780 index = 1; 781 break; 782 case 15: 783 index = kiconv_binsearch(euctw_val, kiconv_cns15_utf8, 784 KICONV_CNS15_UTF8_MAX); 785 u8 = kiconv_cns15_utf8[index].u8; 786 break; 787 default: 788 index = 0; 789 u8 = kiconv_cns1_utf8[index].u8; 790 } 791 792 sz = u8_number_of_bytes[u8[0]]; 793 if (obtail - ob < sz) { 794 *ret_val = (size_t)-1; 795 return (-1); 796 } 797 798 if (index == 0) 799 (*ret_val)++; 800 801 for (index = 0; index < sz; index++) 802 *ob++ = u8[index]; 803 804 return (sz); 805 } 806 807 /* 808 * Convert single HKSCS character to UTF-8. 809 * Return: > 0 - Converted successfully 810 * = -1 - E2BIG 811 */ 812 static int8_t 813 big5hkscs_to_utf8(uint32_t hkscs_val, uchar_t *ob, uchar_t *obtail, 814 size_t *ret_val) 815 { 816 size_t index; 817 int8_t sz; 818 uchar_t *u8; 819 820 index = kiconv_binsearch(hkscs_val, kiconv_hkscs_utf8, 821 KICONV_HKSCS_UTF8_MAX); 822 u8 = kiconv_hkscs_utf8[index].u8; 823 824 /* 825 * Single HKSCS-2004 character may map to 2 Unicode 826 * code points. 827 */ 828 if (u8[0] == 0xFF) { 829 u8 = hkscs_special_sequence[u8[1]]; 830 sz = 4; 831 } else { 832 sz = u8_number_of_bytes[u8[0]]; 833 } 834 835 if (obtail - ob < sz) { 836 *ret_val = (size_t)-1; 837 return (-1); 838 } 839 840 if (index == 0) 841 (*ret_val)++; /* Non-identical conversion. */ 842 843 for (index = 0; index < sz; index++) 844 *ob++ = u8[index]; 845 846 return (sz); 847 } 848 849 /* 850 * Convert single UTF-8 character to EUC-TW. 851 * Return: > 0 - Converted successfully 852 * = -1 - E2BIG 853 */ 854 /* ARGSUSED */ 855 static int8_t 856 utf8_to_euctw(uint32_t utf8, uchar_t **inbuf, uchar_t *ibtail, 857 uchar_t *ob, uchar_t *obtail, size_t *ret_val) 858 { 859 size_t index; 860 size_t plane_no; 861 uchar_t byte1; 862 uchar_t byte2; 863 864 if (utf8 >= KICONV_TC_UDA_UTF8_START && 865 utf8 <= KICONV_TC_UDA_UTF8_END) { 866 /* 867 * Calculate EUC-TW code if utf8 is in Unicode 868 * Private Plane 15. 869 */ 870 index = (((utf8 & 0x7000000) >> 6) | ((utf8 & 0x3F0000) >> 4) | 871 ((utf8 & 0x3F00) >> 2) | (utf8 & 0x3F)) - 872 KICONV_TC_UDA_UCS4_START; 873 plane_no = 12 + index / 8836; 874 byte1 = 0xA1 + (index % 8836) / 94; 875 byte2 = 0xA1 + index % 94; 876 877 /* CNS Plane 15 is pre-allocated, so place it into Plane 16. */ 878 if (plane_no == 15) 879 plane_no = 16; 880 } else { 881 uint32_t euctw_val; 882 883 index = kiconv_binsearch(utf8, kiconv_utf8_euctw, 884 KICONV_UTF8_EUCTW_MAX); 885 886 if (index == 0) { 887 if (ob >= obtail) { 888 *ret_val = (size_t)-1; 889 return (-1); 890 } 891 892 *ob++ = KICONV_ASCII_REPLACEMENT_CHAR; 893 (*ret_val)++; 894 895 return (1); 896 } 897 898 euctw_val = kiconv_utf8_euctw[index].value; 899 byte1 = (euctw_val & 0xFF00) >> 8; 900 byte2 = euctw_val & 0xFF; 901 plane_no = euctw_val >> 16; 902 } 903 904 if (obtail - ob < (plane_no == 1 ? 2 : 4)) { 905 *ret_val = (size_t)-1; 906 return (-1); 907 } 908 909 if (plane_no != 1) { 910 *ob++ = KICONV_TC_EUCTW_MBYTE; 911 *ob++ = KICONV_TC_EUCTW_PMASK + plane_no; 912 } 913 914 *ob++ = byte1; 915 *ob = byte2; 916 917 return (plane_no == 1 ? 2 : 4); 918 } 919 920 /* 921 * Convert single UTF-8 character to BIG5-HKSCS 922 * Return: > 0 - Converted successfully 923 * = -1 - E2BIG 924 */ 925 static int8_t 926 utf8_to_big5hkscs(uint32_t utf8, uchar_t **inbuf, uchar_t *ibtail, 927 uchar_t *ob, uchar_t *obtail, size_t *ret_val) 928 { 929 size_t index; 930 int8_t hkscslen; 931 uint32_t hkscscode; 932 boolean_t special_sequence = B_FALSE; 933 934 index = kiconv_binsearch(utf8, kiconv_utf8_hkscs, 935 KICONV_UTF8_HKSCS_MAX); 936 hkscscode = kiconv_utf8_hkscs[index].value; 937 938 /* 939 * There are 4 special code points in HKSCS-2004 which mapped 940 * to 2 UNICODE code points. 941 */ 942 if ((int32_t)hkscscode < 0) { 943 size_t special_index = (-(int32_t)hkscscode - 1) * 3; 944 945 /* Check the following 2 bytes. */ 946 if (ibtail - *inbuf >= 2 && **inbuf == 0xcc && 947 (*(*inbuf + 1) == 0x84 || *(*inbuf + 1) == 0x8c)) { 948 special_index += (*(*inbuf + 1) == 0x84 ? 1 : 2); 949 special_sequence = B_TRUE; 950 } 951 952 hkscscode = ucs_special_sequence[special_index]; 953 } 954 955 hkscslen = (hkscscode <= 0xFF) ? 1 : 2; 956 if (obtail - ob < hkscslen) { 957 *ret_val = (size_t)-1; 958 return (-1); 959 } 960 961 if (index == 0) 962 (*ret_val)++; 963 964 if (hkscslen > 1) 965 *ob++ = (uchar_t)(hkscscode >> 8); 966 *ob = (uchar_t)(hkscscode & 0xFF); 967 968 if (special_sequence) { /* Advance for special sequence */ 969 (*inbuf) += 2; 970 } 971 972 return (hkscslen); 973 } 974 975 /* 976 * Common convertor for UTF-8 to BIG5/CP950-HKSCS. 977 * Return: > 0 - Converted successfully 978 * = -1 - E2BIG 979 */ 980 static int8_t 981 utf8_to_big5_common(uint32_t utf8, uchar_t *ob, uchar_t *obtail, 982 size_t *ret_val, kiconv_table_t *table, size_t nitems) 983 { 984 size_t index; 985 int8_t big5len; 986 uint32_t big5code; 987 988 index = kiconv_binsearch(utf8, table, nitems); 989 big5code = table[index].value; 990 big5len = (big5code <= 0xFF) ? 1 : 2; 991 992 if (obtail - ob < big5len) { 993 *ret_val = (size_t)-1; 994 return (-1); 995 } 996 997 if (index == 0) 998 (*ret_val)++; 999 1000 if (big5len > 1) 1001 *ob++ = (uchar_t)(big5code >> 8); 1002 *ob = (uchar_t)(big5code & 0xFF); 1003 1004 return (big5len); 1005 } 1006 1007 /* 1008 * Convert single UTF-8 character to BIG5. 1009 */ 1010 /* ARGSUSED */ 1011 static int8_t 1012 utf8_to_big5(uint32_t utf8, uchar_t **inbuf, uchar_t *ibtail, 1013 uchar_t *ob, uchar_t *obtail, size_t *ret_val) 1014 { 1015 return (utf8_to_big5_common(utf8, ob, obtail, ret_val, 1016 kiconv_utf8_big5, KICONV_UTF8_BIG5_MAX)); 1017 } 1018 1019 /* 1020 * Convert single UTF-8 character to CP950-HKSCS for Windows compatibility. 1021 */ 1022 /* ARGSUSED */ 1023 static int8_t 1024 utf8_to_cp950hkscs(uint32_t utf8, uchar_t **inbuf, uchar_t *ibtail, 1025 uchar_t *ob, uchar_t *obtail, size_t *ret_val) 1026 { 1027 return (utf8_to_big5_common(utf8, ob, obtail, ret_val, 1028 kiconv_utf8_cp950hkscs, KICONV_UTF8_CP950HKSCS)); 1029 } 1030 1031 static kiconv_ops_t kiconv_tc_ops_tbl[] = { 1032 { 1033 "big5", "utf-8", kiconv_open_to_cck, kiconv_to_big5, 1034 kiconv_close_to_cck, kiconvstr_to_big5 1035 }, 1036 { 1037 "utf-8", "big5", open_fr_big5, kiconv_fr_big5, 1038 close_fr_tc, kiconvstr_fr_big5 1039 }, 1040 1041 { 1042 "big5-hkscs", "utf-8", kiconv_open_to_cck, kiconv_to_big5hkscs, 1043 kiconv_close_to_cck, kiconvstr_to_big5hkscs 1044 }, 1045 { 1046 "utf-8", "big5-hkscs", open_fr_big5hkscs, kiconv_fr_big5hkscs, 1047 close_fr_tc, kiconvstr_fr_big5hkscs 1048 }, 1049 1050 { 1051 "euc-tw", "utf-8", kiconv_open_to_cck, kiconv_to_euctw, 1052 kiconv_close_to_cck, kiconvstr_to_euctw 1053 }, 1054 { 1055 "utf-8", "euc-tw", open_fr_euctw, kiconv_fr_euctw, 1056 close_fr_tc, kiconvstr_fr_euctw 1057 }, 1058 1059 { 1060 "cp950-hkscs", "utf-8", kiconv_open_to_cck, 1061 kiconv_to_cp950hkscs, kiconv_close_to_cck, 1062 kiconvstr_to_cp950hkscs 1063 }, 1064 { 1065 "utf-8", "cp950-hkscs", open_fr_cp950hkscs, 1066 kiconv_fr_cp950hkscs, close_fr_tc, kiconvstr_fr_cp950hkscs 1067 }, 1068 }; 1069 1070 static kiconv_module_info_t kiconv_tc_info = { 1071 "kiconv_tc", /* module name */ 1072 sizeof (kiconv_tc_ops_tbl) / sizeof (kiconv_tc_ops_tbl[0]), 1073 kiconv_tc_ops_tbl, 1074 0, 1075 NULL, 1076 NULL, 1077 0 1078 }; 1079 1080 static struct modlkiconv modlkiconv_tc = { 1081 &mod_kiconvops, 1082 "kiconv Traditional Chinese module 1.0", 1083 &kiconv_tc_info 1084 }; 1085 1086 static struct modlinkage modlinkage = { 1087 MODREV_1, 1088 (void *)&modlkiconv_tc, 1089 NULL 1090 }; 1091 1092 int 1093 _init(void) 1094 { 1095 int err; 1096 1097 err = mod_install(&modlinkage); 1098 if (err) 1099 cmn_err(CE_WARN, "kiconv_tc: failed to load kernel module"); 1100 1101 return (err); 1102 } 1103 1104 int 1105 _fini(void) 1106 { 1107 int err; 1108 1109 /* 1110 * If this module is being used, then, we cannot remove the module. 1111 * The following checking will catch pretty much all usual cases. 1112 * 1113 * Any remaining will be catached by the kiconv_unregister_module() 1114 * during mod_remove() at below. 1115 */ 1116 if (kiconv_module_ref_count(KICONV_MODULE_ID_TC)) 1117 return (EBUSY); 1118 1119 err = mod_remove(&modlinkage); 1120 if (err) 1121 cmn_err(CE_WARN, "kiconv_tc: failed to remove kernel module"); 1122 1123 return (err); 1124 } 1125 1126 int 1127 _info(struct modinfo *modinfop) 1128 { 1129 return (mod_info(&modlinkage, modinfop)); 1130 } 1131