1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #include <sys/types.h> 27 #include <sys/param.h> 28 #include <sys/sysmacros.h> 29 #include <sys/systm.h> 30 #include <sys/debug.h> 31 #include <sys/kmem.h> 32 #include <sys/sunddi.h> 33 #include <sys/byteorder.h> 34 #include <sys/errno.h> 35 #include <sys/modctl.h> 36 #include <sys/kiconv.h> 37 #include <sys/u8_textprep.h> 38 #include <sys/kiconv_cck_common.h> 39 #include <sys/kiconv_sc.h> 40 #include <sys/kiconv_gb18030_utf8.h> 41 #include <sys/kiconv_gb2312_utf8.h> 42 #include <sys/kiconv_utf8_gb18030.h> 43 #include <sys/kiconv_utf8_gb2312.h> 44 45 static int8_t gb2312_to_utf8(uchar_t byte1, uchar_t byte2, uchar_t *ob, 46 uchar_t *obtail, size_t *ret_val); 47 static int8_t gbk_to_utf8(uint32_t gbk_val, uchar_t *ob, uchar_t *obtail, 48 size_t *ret_val, boolean_t isgbk4); 49 static int8_t utf8_to_gb2312(uint32_t utf8, uchar_t **inbuf, uchar_t *ibtail, 50 uchar_t *ob, uchar_t *obtail, size_t *ret); 51 static int8_t utf8_to_gbk(uint32_t utf8, uchar_t **inbuf, uchar_t *ibtail, 52 uchar_t *ob, uchar_t *obtail, size_t *ret); 53 static int8_t utf8_to_gb18030(uint32_t utf8, uchar_t **inbuf, uchar_t *ibtail, 54 uchar_t *ob, uchar_t *obtail, size_t *ret); 55 56 #define KICONV_SC_GB18030 (0x01) 57 #define KICONV_SC_GBK (0x02) 58 #define KICONV_SC_EUCCN (0x03) 59 #define KICONV_SC_MAX_MAGIC_ID (0x03) 60 61 static void * 62 open_fr_gb18030() 63 { 64 return ((void *)KICONV_SC_GB18030); 65 } 66 67 static void * 68 open_fr_gbk() 69 { 70 return ((void *)KICONV_SC_GBK); 71 } 72 73 static void * 74 open_fr_euccn() 75 { 76 return ((void *)KICONV_SC_EUCCN); 77 } 78 79 static int 80 close_fr_sc(void *s) 81 { 82 if ((uintptr_t)s > KICONV_SC_MAX_MAGIC_ID) 83 return (EBADF); 84 85 return (0); 86 } 87 88 /* 89 * Encoding convertor from UTF-8 to GB18030. 90 */ 91 size_t 92 kiconv_to_gb18030(void *kcd, char **inbuf, size_t *inbytesleft, 93 char **outbuf, size_t *outbytesleft, int *errno) 94 { 95 96 return kiconv_utf8_to_cck(kcd, inbuf, inbytesleft, outbuf, 97 outbytesleft, errno, utf8_to_gb18030); 98 } 99 100 /* 101 * String based encoding convertor from UTF-8 to GB18030. 102 */ 103 size_t 104 kiconvstr_to_gb18030(char *inarray, size_t *inlen, char *outarray, 105 size_t *outlen, int flag, int *errno) 106 { 107 return kiconvstr_utf8_to_cck((uchar_t *)inarray, inlen, 108 (uchar_t *)outarray, outlen, flag, errno, utf8_to_gb18030); 109 } 110 111 /* 112 * Encoding convertor from GB18030 to UTF-8. 113 */ 114 size_t 115 kiconv_fr_gb18030(void *kcd, char **inbuf, size_t *inbytesleft, 116 char **outbuf, size_t *outbytesleft, int *errno) 117 { 118 uchar_t *ib; 119 uchar_t *ob; 120 uchar_t *ibtail; 121 uchar_t *obtail; 122 size_t ret_val; 123 int8_t sz; 124 uint32_t gb_val; 125 boolean_t isgbk4; 126 127 /* Check on the kiconv code conversion descriptor. */ 128 if (kcd == NULL || kcd == (void *)-1) { 129 *errno = EBADF; 130 return ((size_t)-1); 131 } 132 133 /* If this is a state reset request, process and return. */ 134 if (inbuf == NULL || *inbuf == NULL) { 135 return (0); 136 } 137 138 ret_val = 0; 139 ib = (uchar_t *)*inbuf; 140 ob = (uchar_t *)*outbuf; 141 ibtail = ib + *inbytesleft; 142 obtail = ob + *outbytesleft; 143 144 while (ib < ibtail) { 145 if (KICONV_IS_ASCII(*ib)) { 146 if (ob >= obtail) { 147 KICONV_SET_ERRNO_AND_BREAK(E2BIG); 148 } 149 150 *ob++ = *ib++; 151 continue; 152 } 153 154 /* 155 * Issue EILSEQ error if the first byte is not a 156 * valid GB18030 leading byte. 157 */ 158 if (! KICONV_SC_IS_GBK_1st_BYTE(*ib)) { 159 KICONV_SET_ERRNO_AND_BREAK(EILSEQ); 160 } 161 162 isgbk4 = (ibtail - ib < 2) ? B_FALSE : 163 KICONV_SC_IS_GB18030_2nd_BYTE(*(ib + 1)); 164 165 if (isgbk4) { 166 if (ibtail - ib < 4) { 167 KICONV_SET_ERRNO_AND_BREAK(EINVAL); 168 } 169 170 if (! (KICONV_SC_IS_GB18030_2nd_BYTE(*(ib + 1)) && 171 KICONV_SC_IS_GB18030_3rd_BYTE(*(ib + 2)) && 172 KICONV_SC_IS_GB18030_4th_BYTE(*(ib + 3)))) { 173 KICONV_SET_ERRNO_AND_BREAK(EILSEQ); 174 } 175 176 gb_val = (uint32_t)(*ib) << 24 | 177 (uint32_t)(*(ib + 1)) << 16 | 178 (uint32_t)(*(ib + 2)) << 8 | *(ib + 3); 179 } else { 180 if (ibtail - ib < 2) { 181 KICONV_SET_ERRNO_AND_BREAK(EINVAL); 182 } 183 184 if (! KICONV_SC_IS_GBK_2nd_BYTE(*(ib + 1))) { 185 KICONV_SET_ERRNO_AND_BREAK(EILSEQ); 186 } 187 188 gb_val = (uint32_t)(*ib) << 8 | *(ib + 1); 189 } 190 191 sz = gbk_to_utf8(gb_val, ob, obtail, &ret_val, isgbk4); 192 if (sz < 0) { 193 KICONV_SET_ERRNO_AND_BREAK(E2BIG); 194 } 195 196 ib += isgbk4 ? 4 : 2; 197 ob += sz; 198 } 199 200 *inbuf = (char *)ib; 201 *inbytesleft = ibtail - ib; 202 *outbuf = (char *)ob; 203 *outbytesleft = obtail - ob; 204 205 return (ret_val); 206 } 207 208 /* 209 * String based encoding convertor from GB18030 to UTF-8. 210 */ 211 size_t 212 kiconvstr_fr_gb18030(char *inarray, size_t *inlen, char *outarray, 213 size_t *outlen, int flag, int *errno) 214 { 215 uchar_t *ib; 216 uchar_t *ob; 217 uchar_t *ibtail; 218 uchar_t *obtail; 219 uchar_t *oldib; 220 size_t ret_val; 221 int8_t sz; 222 uint32_t gb_val; 223 boolean_t isgbk4; 224 boolean_t do_not_ignore_null; 225 226 ret_val = 0; 227 ib = (uchar_t *)inarray; 228 ob = (uchar_t *)outarray; 229 ibtail = ib + *inlen; 230 obtail = ob + *outlen; 231 do_not_ignore_null = ((flag & KICONV_IGNORE_NULL) == 0); 232 233 while (ib < ibtail) { 234 if (*ib == '\0' && do_not_ignore_null) 235 break; 236 237 if (KICONV_IS_ASCII(*ib)) { 238 if (ob >= obtail) { 239 KICONV_SET_ERRNO_AND_BREAK(E2BIG); 240 } 241 242 *ob++ = *ib++; 243 continue; 244 } 245 246 oldib = ib; 247 248 if (! KICONV_SC_IS_GBK_1st_BYTE(*ib)) { 249 KICONV_SET_ERRNO_WITH_FLAG(1, EILSEQ); 250 } 251 252 isgbk4 = (ibtail - ib < 2) ? B_FALSE : 253 KICONV_SC_IS_GB18030_2nd_BYTE(*(ib + 1)); 254 255 if (isgbk4) { 256 if (ibtail - ib < 4) { 257 if (flag & KICONV_REPLACE_INVALID) { 258 ib = ibtail; 259 goto REPLACE_INVALID; 260 } 261 262 KICONV_SET_ERRNO_AND_BREAK(EINVAL); 263 } 264 265 if (! (KICONV_SC_IS_GB18030_2nd_BYTE(*(ib + 1)) && 266 KICONV_SC_IS_GB18030_3rd_BYTE(*(ib + 2)) && 267 KICONV_SC_IS_GB18030_4th_BYTE(*(ib + 3)))) { 268 KICONV_SET_ERRNO_WITH_FLAG(4, EILSEQ); 269 } 270 271 gb_val = (uint32_t)(*ib) << 24 | 272 (uint32_t)(*(ib + 1)) << 16 | 273 (uint32_t)(*(ib + 2)) << 8 | *(ib + 3); 274 } else { 275 if (ibtail - ib < 2) { 276 if (flag & KICONV_REPLACE_INVALID) { 277 ib = ibtail; 278 goto REPLACE_INVALID; 279 } 280 281 KICONV_SET_ERRNO_AND_BREAK(EINVAL); 282 } 283 284 if (! KICONV_SC_IS_GBK_2nd_BYTE(*(ib + 1))) { 285 KICONV_SET_ERRNO_WITH_FLAG(2, EILSEQ); 286 } 287 288 gb_val = (uint32_t)(*ib) << 8 | *(ib + 1); 289 } 290 291 sz = gbk_to_utf8(gb_val, ob, obtail, &ret_val, isgbk4); 292 if (sz < 0) { 293 KICONV_SET_ERRNO_AND_BREAK(E2BIG); 294 } 295 296 ib += isgbk4 ? 4 : 2; 297 ob += sz; 298 continue; 299 300 REPLACE_INVALID: 301 if (obtail - ob < KICONV_UTF8_REPLACEMENT_CHAR_LEN) { 302 ib = oldib; 303 KICONV_SET_ERRNO_AND_BREAK(E2BIG); 304 } 305 306 *ob++ = KICONV_UTF8_REPLACEMENT_CHAR1; 307 *ob++ = KICONV_UTF8_REPLACEMENT_CHAR2; 308 *ob++ = KICONV_UTF8_REPLACEMENT_CHAR3; 309 ret_val++; 310 } 311 312 *inlen = ibtail - ib; 313 *outlen = obtail - ob; 314 315 return (ret_val); 316 } 317 318 /* 319 * Encoding convertor from UTF-8 to GBK. 320 */ 321 size_t 322 kiconv_to_gbk(void *kcd, char **inbuf, size_t *inbytesleft, 323 char **outbuf, size_t *outbytesleft, int *errno) 324 { 325 326 return kiconv_utf8_to_cck(kcd, inbuf, inbytesleft, outbuf, 327 outbytesleft, errno, utf8_to_gbk); 328 } 329 330 /* 331 * String based encoding convertor from UTF-8 to GBK. 332 */ 333 size_t 334 kiconvstr_to_gbk(char *inarray, size_t *inlen, char *outarray, 335 size_t *outlen, int flag, int *errno) 336 { 337 return kiconvstr_utf8_to_cck((uchar_t *)inarray, inlen, 338 (uchar_t *)outarray, outlen, flag, errno, utf8_to_gbk); 339 } 340 341 /* 342 * Encoding convertor from GBK to UTF-8. 343 */ 344 size_t 345 kiconv_fr_gbk(void *kcd, char **inbuf, size_t *inbytesleft, 346 char **outbuf, size_t *outbytesleft, int *errno) 347 { 348 uchar_t *ib; 349 uchar_t *ob; 350 uchar_t *ibtail; 351 uchar_t *obtail; 352 size_t ret_val; 353 int8_t sz; 354 uint32_t gb_val; 355 356 /* Check on the kiconv code conversion descriptor. */ 357 if (kcd == NULL || kcd == (void *)-1) { 358 *errno = EBADF; 359 return ((size_t)-1); 360 } 361 362 /* If this is a state reset request, process and return. */ 363 if (inbuf == NULL || *inbuf == NULL) { 364 return (0); 365 } 366 367 ret_val = 0; 368 ib = (uchar_t *)*inbuf; 369 ob = (uchar_t *)*outbuf; 370 ibtail = ib + *inbytesleft; 371 obtail = ob + *outbytesleft; 372 373 while (ib < ibtail) { 374 if (KICONV_IS_ASCII(*ib)) { 375 if (ob >= obtail) { 376 KICONV_SET_ERRNO_AND_BREAK(E2BIG); 377 } 378 379 *ob++ = *ib++; 380 continue; 381 } 382 383 /* 384 * Issue EILSEQ error if the first byte is not a 385 * valid GBK leading byte. 386 */ 387 if (! KICONV_SC_IS_GBK_1st_BYTE(*ib)) { 388 KICONV_SET_ERRNO_AND_BREAK(EILSEQ); 389 } 390 391 /* 392 * Issue EINVAL error if input buffer has an incomplete 393 * character at the end of the buffer. 394 */ 395 if (ibtail - ib < 2) { 396 KICONV_SET_ERRNO_AND_BREAK(EINVAL); 397 } 398 399 /* 400 * Issue EILSEQ error if the remaining byte is not 401 * a valid GBK byte. 402 */ 403 if (! KICONV_SC_IS_GBK_2nd_BYTE(*(ib + 1))) { 404 KICONV_SET_ERRNO_AND_BREAK(EILSEQ); 405 } 406 407 /* Now we have a valid GBK character. */ 408 gb_val = (uint32_t)(*ib) << 8 | *(ib + 1); 409 sz = gbk_to_utf8(gb_val, ob, obtail, &ret_val, B_FALSE); 410 411 if (sz < 0) { 412 KICONV_SET_ERRNO_AND_BREAK(E2BIG); 413 } 414 415 ib += 2; 416 ob += sz; 417 } 418 419 *inbuf = (char *)ib; 420 *inbytesleft = ibtail - ib; 421 *outbuf = (char *)ob; 422 *outbytesleft = obtail - ob; 423 424 return (ret_val); 425 } 426 427 /* 428 * String based encoding convertor from GBK to UTF-8. 429 */ 430 size_t 431 kiconvstr_fr_gbk(char *inarray, size_t *inlen, char *outarray, 432 size_t *outlen, int flag, int *errno) 433 { 434 uchar_t *ib; 435 uchar_t *ob; 436 uchar_t *ibtail; 437 uchar_t *obtail; 438 uchar_t *oldib; 439 size_t ret_val; 440 int8_t sz; 441 uint32_t gb_val; 442 boolean_t do_not_ignore_null; 443 444 ret_val = 0; 445 ib = (uchar_t *)inarray; 446 ob = (uchar_t *)outarray; 447 ibtail = ib + *inlen; 448 obtail = ob + *outlen; 449 do_not_ignore_null = ((flag & KICONV_IGNORE_NULL) == 0); 450 451 while (ib < ibtail) { 452 if (*ib == '\0' && do_not_ignore_null) 453 break; 454 455 if (KICONV_IS_ASCII(*ib)) { 456 if (ob >= obtail) { 457 KICONV_SET_ERRNO_AND_BREAK(E2BIG); 458 } 459 460 *ob++ = *ib++; 461 continue; 462 } 463 464 oldib = ib; 465 466 if (! KICONV_SC_IS_GBK_1st_BYTE(*ib)) { 467 KICONV_SET_ERRNO_WITH_FLAG(1, EILSEQ); 468 } 469 470 if (ibtail - ib < 2) { 471 KICONV_SET_ERRNO_WITH_FLAG(1, EINVAL); 472 } 473 474 if (! KICONV_SC_IS_GBK_2nd_BYTE(*(ib + 1))) { 475 KICONV_SET_ERRNO_WITH_FLAG(2, EILSEQ); 476 } 477 478 gb_val = (uint32_t)(*ib << 8) | *(ib + 1); 479 sz = gbk_to_utf8(gb_val, ob, obtail, &ret_val, B_FALSE); 480 481 if (sz < 0) { 482 KICONV_SET_ERRNO_AND_BREAK(E2BIG); 483 } 484 485 ib += 2; 486 ob += sz; 487 continue; 488 489 REPLACE_INVALID: 490 if (obtail - ob < KICONV_UTF8_REPLACEMENT_CHAR_LEN) { 491 ib = oldib; 492 KICONV_SET_ERRNO_AND_BREAK(E2BIG); 493 } 494 495 *ob++ = KICONV_UTF8_REPLACEMENT_CHAR1; 496 *ob++ = KICONV_UTF8_REPLACEMENT_CHAR2; 497 *ob++ = KICONV_UTF8_REPLACEMENT_CHAR3; 498 ret_val++; 499 } 500 501 *inlen = ibtail - ib; 502 *outlen = obtail - ob; 503 504 return (ret_val); 505 } 506 507 /* 508 * Encoding convertor from UTF-8 to EUC-CN. 509 */ 510 size_t 511 kiconv_to_euccn(void *kcd, char **inbuf, size_t *inbytesleft, 512 char **outbuf, size_t *outbytesleft, int *errno) 513 { 514 return kiconv_utf8_to_cck(kcd, inbuf, inbytesleft, outbuf, 515 outbytesleft, errno, utf8_to_gb2312); 516 } 517 518 /* 519 * String based encoding convertor from UTF-8 to EUC-CN. 520 */ 521 size_t 522 kiconvstr_to_euccn(char *inarray, size_t *inlen, char *outarray, 523 size_t *outlen, int flag, int *errno) 524 { 525 return kiconvstr_utf8_to_cck((uchar_t *)inarray, inlen, 526 (uchar_t *)outarray, outlen, flag, errno, utf8_to_gb2312); 527 } 528 529 /* 530 * Encoding converto from EUC-CN to UTF-8 code. 531 */ 532 size_t 533 kiconv_fr_euccn(void *kcd, char **inbuf, size_t *inbytesleft, 534 char **outbuf, size_t *outbytesleft, int *errno) 535 { 536 uchar_t *ib; 537 uchar_t *ob; 538 uchar_t *ibtail; 539 uchar_t *obtail; 540 size_t ret_val; 541 int8_t sz; 542 543 /* Check on the kiconv code conversion descriptor. */ 544 if (kcd == NULL || kcd == (void *)-1) { 545 *errno = EBADF; 546 return ((size_t)-1); 547 } 548 549 /* If this is a state reset request, process and return. */ 550 if (inbuf == NULL || *inbuf == NULL) { 551 return (0); 552 } 553 554 ret_val = 0; 555 ib = (uchar_t *)*inbuf; 556 ob = (uchar_t *)*outbuf; 557 ibtail = ib + *inbytesleft; 558 obtail = ob + *outbytesleft; 559 560 while (ib < ibtail) { 561 if (KICONV_IS_ASCII(*ib)) { 562 if (ob >= obtail) { 563 KICONV_SET_ERRNO_AND_BREAK(E2BIG); 564 } 565 566 *ob++ = *ib++; 567 continue; 568 } 569 570 /* 571 * Issue EILSEQ error if the first byte is not a 572 * valid GB2312 leading byte. 573 */ 574 if (! KICONV_SC_IS_GB2312_BYTE(*ib)) { 575 KICONV_SET_ERRNO_AND_BREAK(EILSEQ); 576 } 577 578 /* 579 * Issue EINVAL error if input buffer has an incomplete 580 * character at the end of the buffer. 581 */ 582 if (ibtail - ib < 2) { 583 KICONV_SET_ERRNO_AND_BREAK(EINVAL); 584 } 585 586 /* 587 * Issue EILSEQ error if the remaining byte is not 588 * a valid GB2312 byte. 589 */ 590 if (! KICONV_SC_IS_GB2312_BYTE(*(ib + 1))) { 591 KICONV_SET_ERRNO_AND_BREAK(EILSEQ); 592 } 593 594 /* Now we have a valid GB2312 character */ 595 sz = gb2312_to_utf8(*ib, *(ib + 1), ob, obtail, &ret_val); 596 if (sz < 0) { 597 KICONV_SET_ERRNO_AND_BREAK(E2BIG); 598 } 599 600 ib += 2; 601 ob += sz; 602 } 603 604 *inbuf = (char *)ib; 605 *inbytesleft = ibtail - ib; 606 *outbuf = (char *)ob; 607 *outbytesleft = obtail - ob; 608 609 return (ret_val); 610 } 611 612 /* 613 * String based encoding convertor from EUC-CN to UTF-8. 614 */ 615 size_t 616 kiconvstr_fr_euccn(char *inarray, size_t *inlen, char *outarray, 617 size_t *outlen, int flag, int *errno) 618 { 619 uchar_t *ib; 620 uchar_t *ob; 621 uchar_t *ibtail; 622 uchar_t *obtail; 623 uchar_t *oldib; 624 size_t ret_val; 625 int8_t sz; 626 boolean_t do_not_ignore_null; 627 628 ret_val = 0; 629 ib = (uchar_t *)inarray; 630 ob = (uchar_t *)outarray; 631 ibtail = ib + *inlen; 632 obtail = ob + *outlen; 633 do_not_ignore_null = ((flag & KICONV_IGNORE_NULL) == 0); 634 635 while (ib < ibtail) { 636 if (*ib == '\0' && do_not_ignore_null) 637 break; 638 639 if (KICONV_IS_ASCII(*ib)) { 640 if (ob >= obtail) { 641 KICONV_SET_ERRNO_AND_BREAK(E2BIG); 642 } 643 644 *ob++ = *ib++; 645 continue; 646 } 647 648 oldib = ib; 649 650 if (! KICONV_SC_IS_GB2312_BYTE(*ib)) { 651 KICONV_SET_ERRNO_WITH_FLAG(1, EILSEQ); 652 } 653 654 if (ibtail - ib < 2) { 655 KICONV_SET_ERRNO_WITH_FLAG(1, EINVAL); 656 } 657 658 if (! KICONV_SC_IS_GB2312_BYTE(*(ib + 1))) { 659 KICONV_SET_ERRNO_WITH_FLAG(2, EILSEQ); 660 } 661 662 sz = gb2312_to_utf8(*ib, *(ib + 1), ob, obtail, &ret_val); 663 if (sz < 0) { 664 KICONV_SET_ERRNO_AND_BREAK(E2BIG); 665 } 666 667 ib += 2; 668 ob += sz; 669 continue; 670 671 REPLACE_INVALID: 672 if (obtail - ob < KICONV_UTF8_REPLACEMENT_CHAR_LEN) { 673 ib = oldib; 674 KICONV_SET_ERRNO_AND_BREAK(E2BIG); 675 } 676 677 *ob++ = KICONV_UTF8_REPLACEMENT_CHAR1; 678 *ob++ = KICONV_UTF8_REPLACEMENT_CHAR2; 679 *ob++ = KICONV_UTF8_REPLACEMENT_CHAR3; 680 ret_val++; 681 } 682 683 *inlen = ibtail - ib; 684 *outlen = obtail - ob; 685 686 return (ret_val); 687 } 688 689 /* 690 * Convert single GB2312 character to UTF-8. 691 * Return: > 0 - Converted successfully 692 * = -1 - E2BIG 693 */ 694 static int8_t 695 gb2312_to_utf8(uchar_t b1, uchar_t b2, uchar_t *ob, uchar_t *obtail, 696 size_t *ret_val) 697 { 698 size_t index; 699 int8_t sz; 700 uchar_t *u8; 701 702 /* index = (b1 - KICONV_EUC_START) * 94 + b2 - KICONV_EUC_START; */ 703 index = b1 * 94 + b2 - 0x3BBF; 704 705 if (index >= KICONV_GB2312_UTF8_MAX) 706 index = KICONV_GB2312_UTF8_MAX - 1; /* Map to 0xEFBFBD */ 707 708 u8 = kiconv_gb2312_utf8[index]; 709 sz = u8_number_of_bytes[u8[0]]; 710 711 if (obtail - ob < sz) { 712 *ret_val = (size_t)-1; 713 return (-1); 714 } 715 716 for (index = 0; index < sz; index++) 717 *ob++ = u8[index]; 718 719 /* 720 * As kiconv_gb2312_utf8 contain muliple KICONV_UTF8_REPLACEMENT_CHAR 721 * elements, so need to ckeck more. 722 */ 723 if (sz == KICONV_UTF8_REPLACEMENT_CHAR_LEN && 724 u8[0] == KICONV_UTF8_REPLACEMENT_CHAR1 && 725 u8[1] == KICONV_UTF8_REPLACEMENT_CHAR2 && 726 u8[2] == KICONV_UTF8_REPLACEMENT_CHAR3) 727 (*ret_val)++; 728 729 return (sz); 730 } 731 732 /* 733 * Convert single GB18030 or GBK character to UTF-8. 734 * Return: > 0 - Converted successfully 735 * = -1 - E2BIG 736 */ 737 static int8_t 738 gbk_to_utf8(uint32_t gbk_val, uchar_t *ob, uchar_t *obtail, size_t *ret_val, 739 boolean_t isgbk4) 740 { 741 size_t index; 742 int8_t sz; 743 uchar_t u8array[4]; 744 uchar_t *u8; 745 746 if (isgbk4) { 747 if (gbk_val >= KICONV_SC_PLANE1_GB18030_START) { 748 uint32_t u32; 749 750 /* 751 * u32 = ((gbk_val >> 24) - 0x90) * 12600 + 752 * (((gbk_val & 0xFF0000) >> 16) - 0x30) * 1260 + 753 * (((gbk_val & 0xFF00) >> 8) - 0x81) * 10 + 754 * (gbk_val & 0xFF - 0x30)+ 755 * KICONV_SC_PLANE1_UCS4_START; 756 */ 757 u32 = (gbk_val >> 24) * 12600 + 758 ((gbk_val & 0xFF0000) >> 16) * 1260 + 759 ((gbk_val & 0xFF00) >> 8) * 10 + 760 (gbk_val & 0xFF) - 0x1BA0FA; 761 u8array[0] = (uchar_t)(0xF0 | ((u32 & 0x1C0000) >> 18)); 762 u8array[1] = (uchar_t)(0x80 | ((u32 & 0x03F000) >> 12)); 763 u8array[2] = (uchar_t)(0x80 | ((u32 & 0x000FC0) >> 6)); 764 u8array[3] = (uchar_t)(0x80 | (u32 & 0x00003F)); 765 u8 = u8array; 766 index = 1; 767 } else { 768 index = kiconv_binsearch(gbk_val, 769 kiconv_gbk4_utf8, KICONV_GBK4_UTF8_MAX); 770 u8 = kiconv_gbk4_utf8[index].u8; 771 } 772 } else { 773 index = kiconv_binsearch(gbk_val, 774 kiconv_gbk_utf8, KICONV_GBK_UTF8_MAX); 775 u8 = kiconv_gbk_utf8[index].u8; 776 } 777 778 sz = u8_number_of_bytes[u8[0]]; 779 if (obtail - ob < sz) { 780 *ret_val = (size_t)-1; 781 return (-1); 782 } 783 784 if (index == 0) 785 (*ret_val)++; /* Non-identical conversion */ 786 787 for (index = 0; index < sz; index++) 788 *ob++ = u8[index]; 789 790 return (sz); 791 } 792 793 /* 794 * Convert single UTF-8 character to GB18030. 795 * Return: > 0 - Converted successfully 796 * = -1 - E2BIG 797 */ 798 /* ARGSUSED */ 799 static int8_t 800 utf8_to_gb18030(uint32_t utf8, uchar_t **inbuf, uchar_t *ibtail, 801 uchar_t *ob, uchar_t *obtail, size_t *ret) 802 { 803 size_t index; 804 int8_t gbklen; 805 uint32_t gbkcode; 806 807 if (utf8 >= KICONV_SC_PLANE1_UTF8_START) { 808 /* Four bytes GB18030 [0x90308130, 0xe339fe39] handling. */ 809 uint32_t u32; 810 811 u32 = (((utf8 & 0x07000000) >> 6) | ((utf8 & 0x3F0000) >> 4) | 812 ((utf8 & 0x3F00) >> 2) | (utf8 & 0x3F)) - 813 KICONV_SC_PLANE1_UCS4_START; 814 gbkcode = ((u32 / 12600 + 0x90) << 24) | 815 (((u32 % 12600) / 1260 + 0x30) << 16) | 816 (((u32 % 1260) / 10 + 0x81) << 8) | (u32 % 10 + 0x30); 817 gbklen = 4; 818 index = 1; 819 } else { 820 index = kiconv_binsearch(utf8, kiconv_utf8_gb18030, 821 KICONV_UTF8_GB18030_MAX); 822 gbkcode = kiconv_utf8_gb18030[index].value; 823 KICONV_SC_GET_GB_LEN(gbkcode, gbklen); 824 } 825 826 if (obtail - ob < gbklen) { 827 *ret = (size_t)-1; 828 return (-1); 829 } 830 831 if (index == 0) 832 (*ret)++; /* Non-identical conversion */ 833 834 if (gbklen == 2) { 835 *ob++ = (uchar_t)(gbkcode >> 8); 836 } else if (gbklen == 4) { 837 *ob++ = (uchar_t)(gbkcode >> 24); 838 *ob++ = (uchar_t)(gbkcode >> 16); 839 *ob++ = (uchar_t)(gbkcode >> 8); 840 } 841 *ob = (uchar_t)(gbkcode & 0xFF); 842 843 return (gbklen); 844 } 845 846 /* 847 * Convert single UTF-8 character to GBK. 848 * Return: > 0 - Converted successfully 849 * = -1 - E2BIG 850 */ 851 /* ARGSUSED */ 852 static int8_t 853 utf8_to_gbk(uint32_t utf8, uchar_t **inbuf, uchar_t *ibtail, 854 uchar_t *ob, uchar_t *obtail, size_t *ret) 855 { 856 size_t index; 857 int8_t gbklen; 858 uint32_t gbkcode; 859 860 index = kiconv_binsearch(utf8, kiconv_utf8_gb18030, 861 KICONV_UTF8_GB18030_MAX); 862 gbkcode = kiconv_utf8_gb18030[index].value; 863 KICONV_SC_GET_GB_LEN(gbkcode, gbklen); 864 865 /* GBK and GB18030 share the same table, so check the length. */ 866 if (gbklen == 4) { 867 index = 0; 868 gbkcode = kiconv_utf8_gb18030[index].value; 869 gbklen = 1; 870 } 871 872 if (obtail - ob < gbklen) { 873 *ret = (size_t)-1; 874 return (-1); 875 } 876 877 if (index == 0) 878 (*ret)++; /* Non-identical conversion */ 879 880 if (gbklen > 1) 881 *ob++ = (uchar_t)(gbkcode >> 8); 882 *ob = (uchar_t)(gbkcode & 0xFF); 883 884 return (gbklen); 885 } 886 887 /* 888 * Convert single UTF-8 character to GB2312. 889 * Return: > 0 - Converted successfully 890 * = -1 - E2BIG 891 */ 892 /* ARGSUSED */ 893 static int8_t 894 utf8_to_gb2312(uint32_t utf8, uchar_t **inbuf, uchar_t *intail, 895 uchar_t *ob, uchar_t *obtail, size_t *ret) 896 { 897 size_t index; 898 int8_t gblen; 899 uint32_t gbcode; 900 901 index = kiconv_binsearch(utf8, kiconv_utf8_gb2312, 902 KICONV_UTF8_GB2312_MAX); 903 gbcode = kiconv_utf8_gb2312[index].value; 904 gblen = (gbcode <= 0xFF) ? 1 : 2; 905 906 if (obtail - ob < gblen) { 907 *ret = (size_t)-1; 908 return (-1); 909 } 910 911 if (index == 0) 912 (*ret)++; 913 914 if (gblen > 1) 915 *ob++ = (uchar_t)(gbcode >> 8); 916 *ob = (uchar_t)(gbcode & 0xFF); 917 918 return (gblen); 919 } 920 921 static kiconv_ops_t kiconv_sc_ops_tbl[] = { 922 { 923 "gb18030", "utf-8", kiconv_open_to_cck, kiconv_to_gb18030, 924 kiconv_close_to_cck, kiconvstr_to_gb18030 925 }, 926 { 927 "utf-8", "gb18030", open_fr_gb18030, kiconv_fr_gb18030, 928 close_fr_sc, kiconvstr_fr_gb18030 929 }, 930 { 931 "gbk", "utf-8", kiconv_open_to_cck, kiconv_to_gbk, 932 kiconv_close_to_cck, kiconvstr_to_gbk 933 }, 934 { 935 "utf-8", "gbk", open_fr_gbk, kiconv_fr_gbk, 936 close_fr_sc, kiconvstr_fr_gbk 937 }, 938 { 939 "euccn", "utf-8", kiconv_open_to_cck, kiconv_to_euccn, 940 kiconv_close_to_cck, kiconvstr_to_euccn 941 }, 942 { 943 "utf-8", "euccn", open_fr_euccn, kiconv_fr_euccn, 944 close_fr_sc, kiconvstr_fr_euccn 945 }, 946 }; 947 948 static kiconv_module_info_t kiconv_sc_info = { 949 "kiconv_sc", /* module name */ 950 sizeof (kiconv_sc_ops_tbl) / sizeof (kiconv_sc_ops_tbl[0]), 951 kiconv_sc_ops_tbl, 952 0, 953 NULL, 954 NULL, 955 0 956 }; 957 958 static struct modlkiconv modlkiconv_sc = { 959 &mod_kiconvops, 960 "kiconv Simplified Chinese module 1.0", 961 &kiconv_sc_info 962 }; 963 964 static struct modlinkage modlinkage = { 965 MODREV_1, 966 (void *)&modlkiconv_sc, 967 NULL 968 }; 969 970 int 971 _init(void) 972 { 973 int err; 974 975 err = mod_install(&modlinkage); 976 if (err) 977 cmn_err(CE_WARN, "kiconv_sc: failed to load kernel module"); 978 979 return (err); 980 } 981 982 int 983 _fini(void) 984 { 985 int err; 986 987 /* 988 * If this module is being used, then, we cannot remove the module. 989 * The following checking will catch pretty much all usual cases. 990 * 991 * Any remaining will be catached by the kiconv_unregister_module() 992 * during mod_remove() at below. 993 */ 994 if (kiconv_module_ref_count(KICONV_MODULE_ID_SC)) 995 return (EBUSY); 996 997 err = mod_remove(&modlinkage); 998 if (err) 999 cmn_err(CE_WARN, "kiconv_sc: failed to remove kernel module"); 1000 1001 return (err); 1002 } 1003 1004 int 1005 _info(struct modinfo *modinfop) 1006 { 1007 return (mod_info(&modlinkage, modinfop)); 1008 } 1009