1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #pragma ident "%Z%%M% %I% %E% SMI" 27 28 #include <sys/types.h> 29 #include <sys/param.h> 30 #include <sys/sysmacros.h> 31 #include <sys/systm.h> 32 #include <sys/debug.h> 33 #include <sys/kmem.h> 34 #include <sys/sunddi.h> 35 #include <sys/byteorder.h> 36 #include <sys/errno.h> 37 #include <sys/modctl.h> 38 #include <sys/u8_textprep.h> 39 #include <sys/kiconv.h> 40 #include <sys/kiconv_cck_common.h> 41 #include <sys/kiconv_tc.h> 42 #include <sys/kiconv_big5_utf8.h> 43 #include <sys/kiconv_euctw_utf8.h> 44 #include <sys/kiconv_hkscs_utf8.h> 45 #include <sys/kiconv_cp950hkscs_utf8.h> 46 #include <sys/kiconv_utf8_big5.h> 47 #include <sys/kiconv_utf8_euctw.h> 48 #include <sys/kiconv_utf8_cp950hkscs.h> 49 #include <sys/kiconv_utf8_hkscs.h> 50 51 /* 4 HKSCS-2004 code points map to 2 Unicode code points separately. */ 52 static uchar_t hkscs_special_sequence[][4] = { 53 { 0xc3, 0x8a, 0xcc, 0x84 }, /* 0x8862 */ 54 { 0xc3, 0x8a, 0xcc, 0x8c }, /* 0x8864 */ 55 { 0xc3, 0xaa, 0xcc, 0x84 }, /* 0x88a3 */ 56 { 0xc3, 0xaa, 0xcc, 0x8c } /* 0x88a5 */ 57 }; 58 59 /* 4 Unicode code point pair map to 1 HKSCS-2004 code point. */ 60 static uint32_t ucs_special_sequence[] = { 61 0x8866, /* U+00ca */ 62 0x8862, /* U+00ca U+0304 */ 63 0x8864, /* U+00ca U+030c */ 64 0x88a7, /* U+00ea */ 65 0x88a3, /* U+00ea U+0304 */ 66 0x88a5 /* U+00ea U+030c */ 67 }; 68 69 typedef int8_t (*kiconv_big5toutf8_t)(uint32_t value, uchar_t *ob, 70 uchar_t *obtail, size_t *ret_val); 71 72 static int8_t utf8_to_big5(uint32_t utf8, uchar_t **inbuf, uchar_t *ibtail, 73 uchar_t *ob, uchar_t *obtail, size_t *ret_val); 74 static int8_t utf8_to_euctw(uint32_t utf8, uchar_t **inbuf, uchar_t *ibtail, 75 uchar_t *ob, uchar_t *obtail, size_t *ret_val); 76 static int8_t utf8_to_cp950hkscs(uint32_t utf8, uchar_t **inbuf, 77 uchar_t *ibtail, uchar_t *ob, uchar_t *obtail, size_t *ret_val); 78 static int8_t utf8_to_big5hkscs(uint32_t utf8, uchar_t **inbuf, uchar_t *ibtail, 79 uchar_t *ob, uchar_t *obtail, size_t *ret_val); 80 static int8_t big5_to_utf8(uint32_t big5_val, uchar_t *ob, uchar_t *obtail, 81 size_t *ret_val); 82 static int8_t big5hkscs_to_utf8(uint32_t hkscs_val, uchar_t *ob, 83 uchar_t *obtail, size_t *ret_val); 84 static int8_t cp950hkscs_to_utf8(uint32_t hkscs_val, uchar_t *ob, 85 uchar_t *obtail, size_t *ret_val); 86 static int8_t euctw_to_utf8(size_t plane_no, uint32_t euctw_val, 87 uchar_t *ob, uchar_t *obtail, size_t *ret_val); 88 static uint32_t get_unicode_from_UDA(size_t plane_no, uchar_t byte1, 89 uchar_t byte2); 90 91 #define KICONV_TC_BIG5 (0x01) 92 #define KICONV_TC_BIG5HKSCS (0x02) 93 #define KICONV_TC_CP950HKSCS (0x03) 94 #define KICONV_TC_EUCTW (0x04) 95 #define KICONV_TC_MAX_MAGIC_ID (0x04) 96 97 static void * 98 open_fr_big5() 99 { 100 return ((void *)KICONV_TC_BIG5); 101 } 102 103 static void * 104 open_fr_big5hkscs() 105 { 106 return ((void *)KICONV_TC_BIG5HKSCS); 107 } 108 109 static void * 110 open_fr_cp950hkscs() 111 { 112 return ((void *)KICONV_TC_CP950HKSCS); 113 } 114 115 static void * 116 open_fr_euctw() 117 { 118 return ((void *)KICONV_TC_EUCTW); 119 } 120 121 static int 122 close_fr_tc(void *s) 123 { 124 if ((uintptr_t)s > KICONV_TC_MAX_MAGIC_ID) 125 return (EBADF); 126 127 return (0); 128 } 129 130 /* 131 * Common convertor from BIG5/HKSCS(BIG5-HKSCS or CP950-HKSCS) to UTF-8. 132 */ 133 static size_t 134 kiconv_fr_big5_common(void *kcd, char **inbuf, size_t *inbytesleft, 135 char **outbuf, size_t *outbytesleft, int *errno, 136 kiconv_big5toutf8_t ptr_big5touf8) 137 { 138 uchar_t *ib; 139 uchar_t *ob; 140 uchar_t *ibtail; 141 uchar_t *obtail; 142 size_t ret_val; 143 int8_t sz; 144 uint32_t big5_val; 145 146 /* Check on the kiconv code conversion descriptor. */ 147 if (kcd == NULL || kcd == (void *)-1) { 148 *errno = EBADF; 149 return ((size_t)-1); 150 } 151 152 /* If this is a state reset request, process and return. */ 153 if (inbuf == NULL || *inbuf == NULL) { 154 return (0); 155 } 156 157 ret_val = 0; 158 ib = (uchar_t *)*inbuf; 159 ob = (uchar_t *)*outbuf; 160 ibtail = ib + *inbytesleft; 161 obtail = ob + *outbytesleft; 162 163 while (ib < ibtail) { 164 if (KICONV_IS_ASCII(*ib)) { 165 if (ob >= obtail) { 166 KICONV_SET_ERRNO_AND_BREAK(E2BIG); 167 } 168 169 *ob++ = *ib++; 170 continue; 171 } 172 173 /* 174 * Issue EILSEQ error if the first byte is not a 175 * valid BIG5/HKSCS leading byte. 176 */ 177 if (! KICONV_TC_IS_BIG5_1st_BYTE(*ib)) { 178 KICONV_SET_ERRNO_AND_BREAK(EILSEQ); 179 } 180 181 /* 182 * Issue EINVAL error if input buffer has an incomplete 183 * character at the end of the buffer. 184 */ 185 if (ibtail - ib < 2) { 186 KICONV_SET_ERRNO_AND_BREAK(EINVAL); 187 } 188 189 /* 190 * Issue EILSEQ error if the remaining bytes is not 191 * a valid BIG5/HKSCS byte. 192 */ 193 if (! KICONV_TC_IS_BIG5_2nd_BYTE(*(ib + 1))) { 194 KICONV_SET_ERRNO_AND_BREAK(EILSEQ); 195 } 196 197 /* Now we have a valid BIG5/HKSCS character. */ 198 big5_val = (uint32_t)(*ib) << 8 | *(ib + 1); 199 sz = ptr_big5touf8(big5_val, ob, obtail, &ret_val); 200 201 if (sz < 0) { 202 KICONV_SET_ERRNO_AND_BREAK(E2BIG); 203 } 204 205 ib += 2; 206 ob += sz; 207 } 208 209 *inbuf = (char *)ib; 210 *inbytesleft = ibtail - ib; 211 *outbuf = (char *)ob; 212 *outbytesleft = obtail - ob; 213 214 return (ret_val); 215 } 216 217 /* 218 * String based Common convertor from BIG5/HKSCS(BIG5-HKSCS or CP950-HKSCS) 219 * to UTF-8. 220 */ 221 static size_t 222 kiconvstr_fr_big5_common(uchar_t *ib, size_t *inlen, uchar_t *ob, 223 size_t *outlen, int flag, int *errno, 224 kiconv_big5toutf8_t ptr_big5touf8) 225 { 226 uchar_t *oldib; 227 uchar_t *ibtail; 228 uchar_t *obtail; 229 size_t ret_val; 230 int8_t sz; 231 uint32_t big5_val; 232 boolean_t do_not_ignore_null; 233 234 ret_val = 0; 235 ibtail = ib + *inlen; 236 obtail = ob + *outlen; 237 do_not_ignore_null = ((flag & KICONV_IGNORE_NULL) == 0); 238 239 while (ib < ibtail) { 240 if (*ib == '\0' && do_not_ignore_null) 241 break; 242 243 if (KICONV_IS_ASCII(*ib)) { 244 if (ob >= obtail) { 245 KICONV_SET_ERRNO_AND_BREAK(E2BIG); 246 } 247 248 *ob++ = *ib++; 249 continue; 250 } 251 252 oldib = ib; 253 254 if (! KICONV_TC_IS_BIG5_1st_BYTE(*ib)) { 255 KICONV_SET_ERRNO_WITH_FLAG(1, EILSEQ); 256 } 257 258 if (ibtail - ib < 2) { 259 KICONV_SET_ERRNO_WITH_FLAG(1, EINVAL); 260 } 261 262 if (! KICONV_TC_IS_BIG5_2nd_BYTE(*(ib + 1))) { 263 KICONV_SET_ERRNO_WITH_FLAG(2, EILSEQ); 264 } 265 266 big5_val = *ib++; 267 big5_val = (big5_val << 8) | *ib++; 268 sz = ptr_big5touf8(big5_val, ob, obtail, &ret_val); 269 270 if (sz < 0) { 271 ib = oldib; 272 KICONV_SET_ERRNO_AND_BREAK(E2BIG); 273 } 274 275 ob += sz; 276 continue; 277 278 REPLACE_INVALID: 279 if (obtail - ob < KICONV_UTF8_REPLACEMENT_CHAR_LEN) { 280 ib = oldib; 281 KICONV_SET_ERRNO_AND_BREAK(E2BIG); 282 } 283 284 *ob++ = KICONV_UTF8_REPLACEMENT_CHAR1; 285 *ob++ = KICONV_UTF8_REPLACEMENT_CHAR2; 286 *ob++ = KICONV_UTF8_REPLACEMENT_CHAR3; 287 ret_val++; 288 } 289 290 *inlen = ibtail - ib; 291 *outlen = obtail - ob; 292 293 return (ret_val); 294 } 295 296 /* 297 * Encoding convertor from BIG5 to UTF-8. 298 */ 299 static size_t 300 kiconv_fr_big5(void *kcd, char **inbuf, size_t *inbytesleft, char **outbuf, 301 size_t *outbytesleft, int *errno) 302 { 303 return (kiconv_fr_big5_common(kcd, inbuf, inbytesleft, outbuf, 304 outbytesleft, errno, big5_to_utf8)); 305 } 306 307 /* 308 * String based encoding convertor from BIG5 to UTF-8. 309 */ 310 static size_t 311 kiconvstr_fr_big5(char *inarray, size_t *inlen, char *outarray, 312 size_t *outlen, int flag, int *errno) 313 { 314 return (kiconvstr_fr_big5_common((uchar_t *)inarray, inlen, 315 (uchar_t *)outarray, outlen, flag, errno, 316 big5_to_utf8)); 317 } 318 319 /* 320 * Encoding convertor from BIG5-HKSCS to UTF-8. 321 */ 322 static size_t 323 kiconv_fr_big5hkscs(void *kcd, char **inbuf, size_t *inbytesleft, 324 char **outbuf, size_t *outbytesleft, int *errno) 325 { 326 return kiconv_fr_big5_common(kcd, inbuf, inbytesleft, outbuf, 327 outbytesleft, errno, big5hkscs_to_utf8); 328 } 329 330 /* 331 * String based encoding convertor from BIG5-HKSCS to UTF-8. 332 */ 333 static size_t 334 kiconvstr_fr_big5hkscs(char *inarray, size_t *inlen, char *outarray, 335 size_t *outlen, int flag, int *errno) 336 { 337 return kiconvstr_fr_big5_common((uchar_t *)inarray, inlen, 338 (uchar_t *)outarray, outlen, flag, errno, big5hkscs_to_utf8); 339 } 340 341 /* 342 * Encoding convertor from CP950-HKSCS to UTF-8. 343 */ 344 static size_t 345 kiconv_fr_cp950hkscs(void *kcd, char **inbuf, size_t *inbytesleft, 346 char **outbuf, size_t *outbytesleft, int *errno) 347 { 348 return kiconv_fr_big5_common(kcd, inbuf, inbytesleft, outbuf, 349 outbytesleft, errno, cp950hkscs_to_utf8); 350 } 351 352 /* 353 * String based encoding convertor from CP950-HKSCS to UTF-8. 354 */ 355 static size_t 356 kiconvstr_fr_cp950hkscs(char *inarray, size_t *inlen, char *outarray, 357 size_t *outlen, int flag, int *errno) 358 { 359 return kiconvstr_fr_big5_common((uchar_t *)inarray, inlen, 360 (uchar_t *)outarray, outlen, flag, errno, cp950hkscs_to_utf8); 361 } 362 363 /* 364 * Encoding convertor from EUC-TW to UTF-8. 365 */ 366 static size_t 367 kiconv_fr_euctw(void *kcd, char **inbuf, size_t *inbytesleft, 368 char **outbuf, size_t *outbytesleft, int *errno) 369 { 370 uchar_t *ib; 371 uchar_t *ob; 372 uchar_t *ibtail; 373 uchar_t *obtail; 374 uchar_t *oldib; 375 size_t ret_val; 376 size_t plane_no; 377 int8_t sz; 378 uint32_t euctw_val; 379 boolean_t isplane1; 380 381 /* Check on the kiconv code conversion descriptor. */ 382 if (kcd == NULL || kcd == (void *)-1) { 383 *errno = EBADF; 384 return ((size_t)-1); 385 } 386 387 /* If this is a state reset request, process and return. */ 388 if (inbuf == NULL || *inbuf == NULL) { 389 return (0); 390 } 391 392 ret_val = 0; 393 ib = (uchar_t *)*inbuf; 394 ob = (uchar_t *)*outbuf; 395 ibtail = ib + *inbytesleft; 396 obtail = ob + *outbytesleft; 397 398 while (ib < ibtail) { 399 if (KICONV_IS_ASCII(*ib)) { 400 if (ob >= obtail) { 401 KICONV_SET_ERRNO_AND_BREAK(E2BIG); 402 } 403 404 *ob++ = *ib++; 405 continue; 406 } 407 408 /* 409 * Issue EILSEQ error if the first byte is not a 410 * valid EUC-TW leading byte. 411 */ 412 if (! KICONV_TC_IS_EUCTW_1st_BYTE(*ib)) { 413 KICONV_SET_ERRNO_AND_BREAK(EILSEQ); 414 } 415 416 isplane1 = (*ib == KICONV_TC_EUCTW_MBYTE) ? 417 B_FALSE : B_TRUE; 418 419 /* 420 * Issue EINVAL error if input buffer has an incomplete 421 * character at the end of the buffer. 422 */ 423 if (ibtail - ib < (isplane1 ? 2 : 4)) { 424 KICONV_SET_ERRNO_AND_BREAK(EINVAL); 425 } 426 427 oldib = ib; 428 plane_no = isplane1 ? 1 : *(ib + 1) - KICONV_TC_EUCTW_PMASK; 429 430 /* 431 * Issue EILSEQ error if the remaining bytes are not 432 * valid EUC-TW bytes. 433 */ 434 if (! KICONV_TC_IS_VALID_EUCTW_SEQ(ib)) { 435 KICONV_SET_ERRNO_AND_BREAK(EILSEQ); 436 } 437 438 if (! isplane1) 439 ib += 2; 440 441 /* Now we have a valid EUC-TW character. */ 442 euctw_val = *ib++; 443 euctw_val = (euctw_val << 8) | *ib++; 444 sz = euctw_to_utf8(plane_no, euctw_val, ob, obtail, &ret_val); 445 446 if (sz < 0) { 447 ib = oldib; 448 KICONV_SET_ERRNO_AND_BREAK(E2BIG); 449 } 450 451 ob += sz; 452 } 453 454 *inbuf = (char *)ib; 455 *inbytesleft = ibtail - ib; 456 *outbuf = (char *)ob; 457 *outbytesleft = obtail - ob; 458 459 return (ret_val); 460 } 461 462 /* 463 * String based encoding convertor from EUC-TW to UTF-8. 464 */ 465 static size_t 466 kiconvstr_fr_euctw(char *inarray, size_t *inlen, char *outarray, 467 size_t *outlen, int flag, int *errno) 468 { 469 uchar_t *ib; 470 uchar_t *ob; 471 uchar_t *ibtail; 472 uchar_t *obtail; 473 uchar_t *oldib; 474 size_t ret_val; 475 size_t plane_no; 476 int8_t sz; 477 uint32_t euctw_val; 478 boolean_t isplane1; 479 boolean_t do_not_ignore_null; 480 481 ret_val = 0; 482 ib = (uchar_t *)inarray; 483 ob = (uchar_t *)outarray; 484 ibtail = ib + *inlen; 485 obtail = ob + *outlen; 486 do_not_ignore_null = ((flag & KICONV_IGNORE_NULL) == 0); 487 488 while (ib < ibtail) { 489 if (*ib == '\0' && do_not_ignore_null) 490 break; 491 492 if (KICONV_IS_ASCII(*ib)) { 493 if (ob >= obtail) { 494 KICONV_SET_ERRNO_AND_BREAK(E2BIG); 495 } 496 497 *ob++ = *ib++; 498 continue; 499 } 500 501 oldib = ib; 502 503 if (! KICONV_TC_IS_EUCTW_1st_BYTE(*ib)) { 504 KICONV_SET_ERRNO_WITH_FLAG(1, EILSEQ); 505 } 506 507 isplane1 = (*ib == KICONV_TC_EUCTW_MBYTE) ? 508 B_FALSE : B_TRUE; 509 510 if (ibtail - ib < (isplane1 ? 2 : 4)) { 511 if (flag & KICONV_REPLACE_INVALID) { 512 ib = ibtail; 513 goto REPLACE_INVALID; 514 } 515 516 KICONV_SET_ERRNO_AND_BREAK(EINVAL); 517 } 518 519 plane_no = isplane1 ? 1 : *(ib + 1) - KICONV_TC_EUCTW_PMASK; 520 521 if (! KICONV_TC_IS_VALID_EUCTW_SEQ(ib)) { 522 KICONV_SET_ERRNO_WITH_FLAG(isplane1 ? 2 : 4, EILSEQ); 523 } 524 525 if (! isplane1) 526 ib += 2; 527 528 euctw_val = *ib++; 529 euctw_val = (euctw_val << 8) | *ib++; 530 sz = euctw_to_utf8(plane_no, euctw_val, ob, obtail, &ret_val); 531 532 if (sz < 0) { 533 ib = oldib; 534 KICONV_SET_ERRNO_AND_BREAK(E2BIG); 535 } 536 537 ob += sz; 538 continue; 539 540 REPLACE_INVALID: 541 if (obtail - ob < KICONV_UTF8_REPLACEMENT_CHAR_LEN) { 542 ib = oldib; 543 KICONV_SET_ERRNO_AND_BREAK(E2BIG); 544 } 545 546 *ob++ = KICONV_UTF8_REPLACEMENT_CHAR1; 547 *ob++ = KICONV_UTF8_REPLACEMENT_CHAR2; 548 *ob++ = KICONV_UTF8_REPLACEMENT_CHAR3; 549 ret_val++; 550 } 551 552 *inlen = ibtail - ib; 553 *outlen = obtail - ob; 554 555 return (ret_val); 556 } 557 558 /* 559 * Encoding convertor from UTF-8 to BIG5. 560 */ 561 static size_t 562 kiconv_to_big5(void *kcd, char **inbuf, size_t *inbytesleft, 563 char **outbuf, size_t *outbytesleft, int *errno) 564 { 565 return kiconv_utf8_to_cck(kcd, inbuf, inbytesleft, outbuf, 566 outbytesleft, errno, utf8_to_big5); 567 } 568 569 /* 570 * String based encoding convertor from UTF-8 to BIG5. 571 */ 572 static size_t 573 kiconvstr_to_big5(char *inarray, size_t *inlen, char *outarray, 574 size_t *outlen, int flag, int *errno) 575 { 576 return kiconvstr_utf8_to_cck((uchar_t *)inarray, inlen, 577 (uchar_t *)outarray, outlen, flag, errno, utf8_to_big5); 578 } 579 580 /* 581 * Encoding convertor from UTF-8 to EUC-TW. 582 */ 583 static size_t 584 kiconv_to_euctw(void *kcd, char **inbuf, size_t *inbytesleft, 585 char **outbuf, size_t *outbytesleft, int *errno) 586 { 587 return kiconv_utf8_to_cck(kcd, inbuf, inbytesleft, outbuf, 588 outbytesleft, errno, utf8_to_euctw); 589 } 590 591 /* 592 * String based encoding convertor from UTF-8 to EUC-TW. 593 */ 594 static size_t 595 kiconvstr_to_euctw(char *inarray, size_t *inlen, char *outarray, 596 size_t *outlen, int flag, int *errno) 597 { 598 return kiconvstr_utf8_to_cck((uchar_t *)inarray, inlen, 599 (uchar_t *)outarray, outlen, flag, errno, utf8_to_euctw); 600 } 601 602 /* 603 * Encoding convertor from UTF-8 to CP950HKSCS. 604 */ 605 static size_t 606 kiconv_to_cp950hkscs(void *kcd, char **inbuf, size_t *inbytesleft, 607 char **outbuf, size_t *outbytesleft, int *errno) 608 { 609 return kiconv_utf8_to_cck(kcd, inbuf, inbytesleft, outbuf, 610 outbytesleft, errno, utf8_to_cp950hkscs); 611 } 612 613 /* 614 * String based encoding convertor from UTF-8 to CP950HKSCS. 615 */ 616 static size_t 617 kiconvstr_to_cp950hkscs(char *inarray, size_t *inlen, char *outarray, 618 size_t *outlen, int flag, int *errno) 619 { 620 return kiconvstr_utf8_to_cck((uchar_t *)inarray, inlen, 621 (uchar_t *)outarray, outlen, flag, errno, utf8_to_cp950hkscs); 622 } 623 624 /* 625 * Encoding convertor from UTF-8 to BIG5HKSCS(HKSCS-2004). 626 */ 627 static size_t 628 kiconv_to_big5hkscs(void *kcd, char **inbuf, size_t *inbytesleft, 629 char **outbuf, size_t *outbytesleft, int *errno) 630 { 631 return kiconv_utf8_to_cck(kcd, inbuf, inbytesleft, outbuf, 632 outbytesleft, errno, utf8_to_big5hkscs); 633 } 634 635 /* 636 * String based encoding convertor from UTF-8 to BIG5HKSCS(HKSCS-2004). 637 */ 638 static size_t 639 kiconvstr_to_big5hkscs(char *inarray, size_t *inlen, char *outarray, 640 size_t *outlen, int flag, int *errno) 641 { 642 return kiconvstr_utf8_to_cck((uchar_t *)inarray, inlen, 643 (uchar_t *)outarray, outlen, flag, errno, utf8_to_big5hkscs); 644 } 645 646 /* 647 * Common convertor from single BIG5/CP950-HKSCS character to UTF-8. 648 * Return: > 0 - Converted successfully 649 * = -1 - E2BIG 650 */ 651 static int8_t 652 big5_to_utf8_common(uint32_t big5_val, uchar_t *ob, uchar_t *obtail, 653 size_t *ret_val, kiconv_table_array_t *table, size_t nitems) 654 { 655 size_t index; 656 int8_t sz; 657 uchar_t *u8; 658 659 index = kiconv_binsearch(big5_val, table, nitems); 660 u8 = table[index].u8; 661 sz = u8_number_of_bytes[u8[0]]; 662 663 if (obtail - ob < sz) { 664 *ret_val = (size_t)-1; 665 return (-1); 666 } 667 668 if (index == 0) 669 (*ret_val)++; /* Non-identical conversion */ 670 671 for (index = 0; index < sz; index++) 672 *ob++ = u8[index]; 673 674 return (sz); 675 } 676 677 /* 678 * Convert single BIG5 character to UTF-8. 679 */ 680 static int8_t 681 big5_to_utf8(uint32_t big5_val, uchar_t *ob, uchar_t *obtail, size_t *ret_val) 682 { 683 return (big5_to_utf8_common(big5_val, ob, obtail, ret_val, 684 kiconv_big5_utf8, KICONV_BIG5_UTF8_MAX)); 685 } 686 687 /* 688 * Convert single CP950-HKSCS character to UTF-8. 689 */ 690 static int8_t 691 cp950hkscs_to_utf8(uint32_t hkscs_val, uchar_t *ob, uchar_t *obtail, 692 size_t *ret_val) 693 { 694 return (big5_to_utf8_common(hkscs_val, ob, obtail, ret_val, 695 kiconv_cp950hkscs_utf8, KICONV_CP950HKSCS_UTF8_MAX)); 696 } 697 698 /* 699 * Calculate unicode value for some CNS planes which fall in Unicode 700 * UDA range. 701 */ 702 static uint32_t 703 get_unicode_from_UDA(size_t plane_no, uchar_t b1, uchar_t b2) 704 { 705 /* 706 * CNS Plane 15 is pre-allocated, so need move Plane 16 to back 15 707 * to compute the Unicode value. 708 */ 709 if (plane_no == 16) 710 --plane_no; 711 712 /* 0xF0000 + (plane_no - 12) * 8836 + (b1 - 0xA1) * 94 + (b2 - 0xA1) */ 713 return (8836 * plane_no + 94 * b1 + b2 + 0xD2611); 714 } 715 716 /* 717 * Convert single EUC-TW character to UTF-8. 718 * Return: > 0 - Converted successfully 719 * = -1 - E2BIG 720 */ 721 static int8_t 722 euctw_to_utf8(size_t plane_no, uint32_t euctw_val, uchar_t *ob, 723 uchar_t *obtail, size_t *ret_val) 724 { 725 uint32_t u32; 726 size_t index; 727 int8_t sz; 728 uchar_t udc[4]; 729 uchar_t *u8; 730 731 switch (plane_no) { 732 case 1: 733 index = kiconv_binsearch(euctw_val, kiconv_cns1_utf8, 734 KICONV_CNS1_UTF8_MAX); 735 u8 = kiconv_cns1_utf8[index].u8; 736 break; 737 case 2: 738 index = kiconv_binsearch(euctw_val, kiconv_cns2_utf8, 739 KICONV_CNS2_UTF8_MAX); 740 u8 = kiconv_cns2_utf8[index].u8; 741 break; 742 case 3: 743 index = kiconv_binsearch(euctw_val, kiconv_cns3_utf8, 744 KICONV_CNS3_UTF8_MAX); 745 u8 = kiconv_cns3_utf8[index].u8; 746 break; 747 case 4: 748 index = kiconv_binsearch(euctw_val, kiconv_cns4_utf8, 749 KICONV_CNS4_UTF8_MAX); 750 u8 = kiconv_cns4_utf8[index].u8; 751 break; 752 case 5: 753 index = kiconv_binsearch(euctw_val, kiconv_cns5_utf8, 754 KICONV_CNS5_UTF8_MAX); 755 u8 = kiconv_cns5_utf8[index].u8; 756 break; 757 case 6: 758 index = kiconv_binsearch(euctw_val, kiconv_cns6_utf8, 759 KICONV_CNS6_UTF8_MAX); 760 u8 = kiconv_cns6_utf8[index].u8; 761 break; 762 case 7: 763 index = kiconv_binsearch(euctw_val, kiconv_cns7_utf8, 764 KICONV_CNS7_UTF8_MAX); 765 u8 = kiconv_cns7_utf8[index].u8; 766 break; 767 case 12: 768 case 13: 769 case 14: 770 case 16: 771 u32 = get_unicode_from_UDA(plane_no, 772 (euctw_val & 0xFF00) >> 8, euctw_val & 0xFF); 773 /* 774 * As U+F0000 <= u32 <= U+F8A0F, so its UTF-8 sequence 775 * will occupy 4 bytes. 776 */ 777 udc[0] = 0xF3; 778 udc[1] = (uchar_t)(0x80 | (u32 & 0x03F000) >> 12); 779 udc[2] = (uchar_t)(0x80 | (u32 & 0x000FC0) >> 6); 780 udc[3] = (uchar_t)(0x80 | (u32 & 0x00003F)); 781 u8 = udc; 782 index = 1; 783 break; 784 case 15: 785 index = kiconv_binsearch(euctw_val, kiconv_cns15_utf8, 786 KICONV_CNS15_UTF8_MAX); 787 u8 = kiconv_cns15_utf8[index].u8; 788 break; 789 default: 790 index = 0; 791 u8 = kiconv_cns1_utf8[index].u8; 792 } 793 794 sz = u8_number_of_bytes[u8[0]]; 795 if (obtail - ob < sz) { 796 *ret_val = (size_t)-1; 797 return (-1); 798 } 799 800 if (index == 0) 801 (*ret_val)++; 802 803 for (index = 0; index < sz; index++) 804 *ob++ = u8[index]; 805 806 return (sz); 807 } 808 809 /* 810 * Convert single HKSCS character to UTF-8. 811 * Return: > 0 - Converted successfully 812 * = -1 - E2BIG 813 */ 814 static int8_t 815 big5hkscs_to_utf8(uint32_t hkscs_val, uchar_t *ob, uchar_t *obtail, 816 size_t *ret_val) 817 { 818 size_t index; 819 int8_t sz; 820 uchar_t *u8; 821 822 index = kiconv_binsearch(hkscs_val, kiconv_hkscs_utf8, 823 KICONV_HKSCS_UTF8_MAX); 824 u8 = kiconv_hkscs_utf8[index].u8; 825 826 /* 827 * Single HKSCS-2004 character may map to 2 Unicode 828 * code points. 829 */ 830 if (u8[0] == 0xFF) { 831 u8 = hkscs_special_sequence[u8[1]]; 832 sz = 4; 833 } else { 834 sz = u8_number_of_bytes[u8[0]]; 835 } 836 837 if (obtail - ob < sz) { 838 *ret_val = (size_t)-1; 839 return (-1); 840 } 841 842 if (index == 0) 843 (*ret_val)++; /* Non-identical conversion. */ 844 845 for (index = 0; index < sz; index++) 846 *ob++ = u8[index]; 847 848 return (sz); 849 } 850 851 /* 852 * Convert single UTF-8 character to EUC-TW. 853 * Return: > 0 - Converted successfully 854 * = -1 - E2BIG 855 */ 856 /* ARGSUSED */ 857 static int8_t 858 utf8_to_euctw(uint32_t utf8, uchar_t **inbuf, uchar_t *ibtail, 859 uchar_t *ob, uchar_t *obtail, size_t *ret_val) 860 { 861 size_t index; 862 size_t plane_no; 863 uchar_t byte1; 864 uchar_t byte2; 865 866 if (utf8 >= KICONV_TC_UDA_UTF8_START && 867 utf8 <= KICONV_TC_UDA_UTF8_END) { 868 /* 869 * Calculate EUC-TW code if utf8 is in Unicode 870 * Private Plane 15. 871 */ 872 index = (((utf8 & 0x7000000) >> 6) | ((utf8 & 0x3F0000) >> 4) | 873 ((utf8 & 0x3F00) >> 2) | (utf8 & 0x3F)) - 874 KICONV_TC_UDA_UCS4_START; 875 plane_no = 12 + index / 8836; 876 byte1 = 0xA1 + (index % 8836) / 94; 877 byte2 = 0xA1 + index % 94; 878 879 /* CNS Plane 15 is pre-allocated, so place it into Plane 16. */ 880 if (plane_no == 15) 881 plane_no = 16; 882 } else { 883 uint32_t euctw_val; 884 885 index = kiconv_binsearch(utf8, kiconv_utf8_euctw, 886 KICONV_UTF8_EUCTW_MAX); 887 888 if (index == 0) { 889 if (ob >= obtail) { 890 *ret_val = (size_t)-1; 891 return (-1); 892 } 893 894 *ob++ = KICONV_ASCII_REPLACEMENT_CHAR; 895 (*ret_val)++; 896 897 return (1); 898 } 899 900 euctw_val = kiconv_utf8_euctw[index].value; 901 byte1 = (euctw_val & 0xFF00) >> 8; 902 byte2 = euctw_val & 0xFF; 903 plane_no = euctw_val >> 16; 904 } 905 906 if (obtail - ob < (plane_no == 1 ? 2 : 4)) { 907 *ret_val = (size_t)-1; 908 return (-1); 909 } 910 911 if (plane_no != 1) { 912 *ob++ = KICONV_TC_EUCTW_MBYTE; 913 *ob++ = KICONV_TC_EUCTW_PMASK + plane_no; 914 } 915 916 *ob++ = byte1; 917 *ob = byte2; 918 919 return (plane_no == 1 ? 2 : 4); 920 } 921 922 /* 923 * Convert single UTF-8 character to BIG5-HKSCS 924 * Return: > 0 - Converted successfully 925 * = -1 - E2BIG 926 */ 927 static int8_t 928 utf8_to_big5hkscs(uint32_t utf8, uchar_t **inbuf, uchar_t *ibtail, 929 uchar_t *ob, uchar_t *obtail, size_t *ret_val) 930 { 931 size_t index; 932 int8_t hkscslen; 933 uint32_t hkscscode; 934 boolean_t special_sequence = B_FALSE; 935 936 index = kiconv_binsearch(utf8, kiconv_utf8_hkscs, 937 KICONV_UTF8_HKSCS_MAX); 938 hkscscode = kiconv_utf8_hkscs[index].value; 939 940 /* 941 * There are 4 special code points in HKSCS-2004 which mapped 942 * to 2 UNICODE code points. 943 */ 944 if ((int32_t)hkscscode < 0) { 945 size_t special_index = (-(int32_t)hkscscode - 1) * 3; 946 947 /* Check the following 2 bytes. */ 948 if (ibtail - *inbuf >= 2 && **inbuf == 0xcc && 949 (*(*inbuf + 1) == 0x84 || *(*inbuf + 1) == 0x8c)) { 950 special_index += (*(*inbuf + 1) == 0x84 ? 1 : 2); 951 special_sequence = B_TRUE; 952 } 953 954 hkscscode = ucs_special_sequence[special_index]; 955 } 956 957 hkscslen = (hkscscode <= 0xFF) ? 1 : 2; 958 if (obtail - ob < hkscslen) { 959 *ret_val = (size_t)-1; 960 return (-1); 961 } 962 963 if (index == 0) 964 (*ret_val)++; 965 966 if (hkscslen > 1) 967 *ob++ = (uchar_t)(hkscscode >> 8); 968 *ob = (uchar_t)(hkscscode & 0xFF); 969 970 if (special_sequence) { /* Advance for special sequence */ 971 (*inbuf) += 2; 972 } 973 974 return (hkscslen); 975 } 976 977 /* 978 * Common convertor for UTF-8 to BIG5/CP950-HKSCS. 979 * Return: > 0 - Converted successfully 980 * = -1 - E2BIG 981 */ 982 static int8_t 983 utf8_to_big5_common(uint32_t utf8, uchar_t *ob, uchar_t *obtail, 984 size_t *ret_val, kiconv_table_t *table, size_t nitems) 985 { 986 size_t index; 987 int8_t big5len; 988 uint32_t big5code; 989 990 index = kiconv_binsearch(utf8, table, nitems); 991 big5code = table[index].value; 992 big5len = (big5code <= 0xFF) ? 1 : 2; 993 994 if (obtail - ob < big5len) { 995 *ret_val = (size_t)-1; 996 return (-1); 997 } 998 999 if (index == 0) 1000 (*ret_val)++; 1001 1002 if (big5len > 1) 1003 *ob++ = (uchar_t)(big5code >> 8); 1004 *ob = (uchar_t)(big5code & 0xFF); 1005 1006 return (big5len); 1007 } 1008 1009 /* 1010 * Convert single UTF-8 character to BIG5. 1011 */ 1012 /* ARGSUSED */ 1013 static int8_t 1014 utf8_to_big5(uint32_t utf8, uchar_t **inbuf, uchar_t *ibtail, 1015 uchar_t *ob, uchar_t *obtail, size_t *ret_val) 1016 { 1017 return (utf8_to_big5_common(utf8, ob, obtail, ret_val, 1018 kiconv_utf8_big5, KICONV_UTF8_BIG5_MAX)); 1019 } 1020 1021 /* 1022 * Convert single UTF-8 character to CP950-HKSCS for Windows compatibility. 1023 */ 1024 /* ARGSUSED */ 1025 static int8_t 1026 utf8_to_cp950hkscs(uint32_t utf8, uchar_t **inbuf, uchar_t *ibtail, 1027 uchar_t *ob, uchar_t *obtail, size_t *ret_val) 1028 { 1029 return (utf8_to_big5_common(utf8, ob, obtail, ret_val, 1030 kiconv_utf8_cp950hkscs, KICONV_UTF8_CP950HKSCS)); 1031 } 1032 1033 static kiconv_ops_t kiconv_tc_ops_tbl[] = { 1034 { 1035 "big5", "utf-8", kiconv_open_to_cck, kiconv_to_big5, 1036 kiconv_close_to_cck, kiconvstr_to_big5 1037 }, 1038 { 1039 "utf-8", "big5", open_fr_big5, kiconv_fr_big5, 1040 close_fr_tc, kiconvstr_fr_big5 1041 }, 1042 1043 { 1044 "big5-hkscs", "utf-8", kiconv_open_to_cck, kiconv_to_big5hkscs, 1045 kiconv_close_to_cck, kiconvstr_to_big5hkscs 1046 }, 1047 { 1048 "utf-8", "big5-hkscs", open_fr_big5hkscs, kiconv_fr_big5hkscs, 1049 close_fr_tc, kiconvstr_fr_big5hkscs 1050 }, 1051 1052 { 1053 "euc-tw", "utf-8", kiconv_open_to_cck, kiconv_to_euctw, 1054 kiconv_close_to_cck, kiconvstr_to_euctw 1055 }, 1056 { 1057 "utf-8", "euc-tw", open_fr_euctw, kiconv_fr_euctw, 1058 close_fr_tc, kiconvstr_fr_euctw 1059 }, 1060 1061 { 1062 "cp950-hkscs", "utf-8", kiconv_open_to_cck, 1063 kiconv_to_cp950hkscs, kiconv_close_to_cck, 1064 kiconvstr_to_cp950hkscs 1065 }, 1066 { 1067 "utf-8", "cp950-hkscs", open_fr_cp950hkscs, 1068 kiconv_fr_cp950hkscs, close_fr_tc, kiconvstr_fr_cp950hkscs 1069 }, 1070 }; 1071 1072 static kiconv_module_info_t kiconv_tc_info = { 1073 "kiconv_tc", /* module name */ 1074 sizeof (kiconv_tc_ops_tbl) / sizeof (kiconv_tc_ops_tbl[0]), 1075 kiconv_tc_ops_tbl, 1076 0, 1077 NULL, 1078 NULL, 1079 0 1080 }; 1081 1082 static struct modlkiconv modlkiconv_tc = { 1083 &mod_kiconvops, 1084 "kiconv Traditional Chinese module 1.0", 1085 &kiconv_tc_info 1086 }; 1087 1088 static struct modlinkage modlinkage = { 1089 MODREV_1, 1090 (void *)&modlkiconv_tc, 1091 NULL 1092 }; 1093 1094 int 1095 _init(void) 1096 { 1097 int err; 1098 1099 err = mod_install(&modlinkage); 1100 if (err) 1101 cmn_err(CE_WARN, "kiconv_tc: failed to load kernel module"); 1102 1103 return (err); 1104 } 1105 1106 int 1107 _fini(void) 1108 { 1109 int err; 1110 1111 /* 1112 * If this module is being used, then, we cannot remove the module. 1113 * The following checking will catch pretty much all usual cases. 1114 * 1115 * Any remaining will be catached by the kiconv_unregister_module() 1116 * during mod_remove() at below. 1117 */ 1118 if (kiconv_module_ref_count(KICONV_MODULE_ID_TC)) 1119 return (EBUSY); 1120 1121 err = mod_remove(&modlinkage); 1122 if (err) 1123 cmn_err(CE_WARN, "kiconv_tc: failed to remove kernel module"); 1124 1125 return (err); 1126 } 1127 1128 int 1129 _info(struct modinfo *modinfop) 1130 { 1131 return (mod_info(&modlinkage, modinfop)); 1132 } 1133