1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 #pragma ident "%Z%%M% %I% %E% SMI" 28 29 #include <sys/types.h> 30 #include <sys/param.h> 31 #include <sys/sysmacros.h> 32 #include <sys/systm.h> 33 #include <sys/debug.h> 34 #include <sys/kmem.h> 35 #include <sys/sunddi.h> 36 #include <sys/byteorder.h> 37 #include <sys/errno.h> 38 #include <sys/modctl.h> 39 #include <sys/u8_textprep.h> 40 #include <sys/kiconv.h> 41 #include <sys/kiconv_cck_common.h> 42 #include <sys/kiconv_ko.h> 43 #include <sys/kiconv_uhc_utf8.h> 44 #include <sys/kiconv_utf8_uhc.h> 45 #include <sys/kiconv_euckr_utf8.h> 46 #include <sys/kiconv_utf8_euckr.h> 47 48 static int8_t utf8_to_euckr(uint32_t utf8, uchar_t **inbuf, uchar_t *ibtail, 49 uchar_t *ob, uchar_t *obtail, size_t *ret_val); 50 static int8_t utf8_to_uhc(uint32_t utf8, uchar_t **inbuf, uchar_t *ibtail, 51 uchar_t *ob, uchar_t *obtail, size_t *ret_val); 52 static int8_t ko_to_utf8(uint32_t ko_val, uchar_t *ob, uchar_t *obtail, 53 size_t *ret_val, kiconv_table_array_t *table, size_t nitems); 54 55 56 #define KICONV_KO_EUCKR (0x01) 57 #define KICONV_KO_UHC (0x02) 58 #define KICONV_KO_MAX_MAGIC_ID (0x02) 59 60 static void * 61 open_fr_euckr() 62 { 63 return ((void *)KICONV_KO_EUCKR); 64 } 65 66 static void * 67 open_fr_uhc() 68 { 69 return ((void *)KICONV_KO_UHC); 70 } 71 72 static int 73 close_fr_ko(void *s) 74 { 75 if ((uintptr_t)s > KICONV_KO_MAX_MAGIC_ID) 76 return (EBADF); 77 78 return (0); 79 } 80 81 /* 82 * Encoding convertor from EUC-KR to UTF-8. 83 */ 84 static size_t 85 kiconv_fr_euckr(void *kcd, char **inbuf, size_t *inbufleft, 86 char **outbuf, size_t *outbufleft, int *errno) 87 { 88 uchar_t *ib; 89 uchar_t *ob; 90 uchar_t *ibtail; 91 uchar_t *obtail; 92 size_t ret_val; 93 int8_t sz; 94 uint32_t euckr_val; 95 96 /* Check on the kiconv code conversion descriptor. */ 97 if (kcd == NULL || kcd == (void *)-1) { 98 *errno = EBADF; 99 return ((size_t)-1); 100 } 101 102 /* If this is a state reset request, process and return. */ 103 if (inbuf == NULL || *inbuf == NULL) { 104 return (0); 105 } 106 107 ret_val = 0; 108 ib = (uchar_t *)*inbuf; 109 ob = (uchar_t *)*outbuf; 110 ibtail = ib + *inbufleft; 111 obtail = ob + *outbufleft; 112 113 while (ib < ibtail) { 114 if (KICONV_IS_ASCII(*ib)) { 115 if (ob >= obtail) { 116 KICONV_SET_ERRNO_AND_BREAK(E2BIG); 117 } 118 119 *ob++ = *ib++; 120 continue; 121 } 122 123 /* 124 * Issue EILSEQ error if the first byte is not a 125 * valid EUC-KR leading byte. 126 */ 127 if (! KICONV_KO_IS_EUCKR_BYTE(*ib)) { 128 KICONV_SET_ERRNO_AND_BREAK(EILSEQ); 129 } 130 131 /* 132 * Issue EINVAL error if input buffer has an incomplete 133 * character at the end of the buffer. 134 */ 135 if (ibtail - ib < 2) { 136 KICONV_SET_ERRNO_AND_BREAK(EINVAL); 137 } 138 139 /* 140 * Issue EILSEQ error if the remaining byte is not 141 * a valid EUC-KR byte. 142 */ 143 if (! KICONV_KO_IS_EUCKR_BYTE(*(ib + 1))) { 144 KICONV_SET_ERRNO_AND_BREAK(EILSEQ); 145 } 146 147 euckr_val = (uint32_t)(*ib) << 8 | *(ib + 1); 148 sz = ko_to_utf8(euckr_val, ob, obtail, &ret_val, 149 kiconv_euckr_utf8, KICONV_EUCKR_UTF8_MAX); 150 151 if (sz < 0) { 152 KICONV_SET_ERRNO_AND_BREAK(E2BIG); 153 } 154 155 ib += 2; 156 ob += sz; 157 } 158 159 *inbuf = (char *)ib; 160 *inbufleft = ibtail - ib; 161 *outbuf = (char *)ob; 162 *outbufleft = obtail - ob; 163 164 return (ret_val); 165 } 166 167 /* 168 * String based encoding convertor from EUC-KR to UTF-8. 169 */ 170 static size_t 171 kiconvstr_fr_euckr(char *inarray, size_t *inlen, char *outarray, 172 size_t *outlen, int flag, int *errno) 173 { 174 uchar_t *ib; 175 uchar_t *ob; 176 uchar_t *ibtail; 177 uchar_t *obtail; 178 uchar_t *oldib; 179 size_t ret_val; 180 int8_t sz; 181 uint32_t euckr_val; 182 boolean_t do_not_ignore_null; 183 184 ret_val = 0; 185 ib = (uchar_t *)inarray; 186 ob = (uchar_t *)outarray; 187 ibtail = ib + *inlen; 188 obtail = ob + *outlen; 189 do_not_ignore_null = ((flag & KICONV_IGNORE_NULL) == 0); 190 191 while (ib < ibtail) { 192 if (*ib == '\0' && do_not_ignore_null) 193 break; 194 195 if (KICONV_IS_ASCII(*ib)) { 196 if (ob >= obtail) { 197 KICONV_SET_ERRNO_AND_BREAK(E2BIG); 198 } 199 200 *ob++ = *ib++; 201 continue; 202 } 203 204 oldib = ib; 205 206 if (! KICONV_KO_IS_EUCKR_BYTE(*ib)) { 207 KICONV_SET_ERRNO_WITH_FLAG(1, EILSEQ); 208 } 209 210 if (ibtail - ib < 2) { 211 KICONV_SET_ERRNO_WITH_FLAG(1, EINVAL); 212 } 213 214 if (! KICONV_KO_IS_EUCKR_BYTE(*(ib + 1))) { 215 KICONV_SET_ERRNO_WITH_FLAG(2, EILSEQ); 216 } 217 218 euckr_val = *ib++; 219 euckr_val = (euckr_val << 8) | *ib++; 220 sz = ko_to_utf8(euckr_val, ob, obtail, &ret_val, 221 kiconv_euckr_utf8, KICONV_EUCKR_UTF8_MAX); 222 223 if (sz < 0) { 224 ib = oldib; 225 KICONV_SET_ERRNO_AND_BREAK(E2BIG); 226 } 227 228 ob += sz; 229 continue; 230 231 REPLACE_INVALID: 232 if (obtail - ob < KICONV_UTF8_REPLACEMENT_CHAR_LEN) { 233 ib = oldib; 234 KICONV_SET_ERRNO_AND_BREAK(E2BIG); 235 } 236 237 *ob++ = KICONV_UTF8_REPLACEMENT_CHAR1; 238 *ob++ = KICONV_UTF8_REPLACEMENT_CHAR2; 239 *ob++ = KICONV_UTF8_REPLACEMENT_CHAR3; 240 ret_val++; 241 } 242 243 *inlen = ibtail - ib; 244 *outlen = obtail - ob; 245 246 return (ret_val); 247 } 248 249 /* 250 * Encoding convertor from Unified Hangul Code to UTF-8. 251 */ 252 static size_t 253 kiconv_fr_uhc(void *kcd, char **inbuf, size_t *inbufleft, 254 char **outbuf, size_t *outbufleft, int *errno) 255 { 256 uchar_t *ib; 257 uchar_t *ob; 258 uchar_t *ibtail; 259 uchar_t *obtail; 260 size_t ret_val; 261 int8_t sz; 262 uint32_t uhc_val; 263 264 /* Check on the kiconv code conversion descriptor. */ 265 if (kcd == NULL || kcd == (void *)-1) { 266 *errno = EBADF; 267 return ((size_t)-1); 268 } 269 270 /* If this is a state reset request, process and return. */ 271 if (inbuf == NULL || *inbuf == NULL) { 272 return (0); 273 } 274 275 ret_val = 0; 276 ib = (uchar_t *)*inbuf; 277 ob = (uchar_t *)*outbuf; 278 ibtail = ib + *inbufleft; 279 obtail = ob + *outbufleft; 280 281 while (ib < ibtail) { 282 if (KICONV_IS_ASCII(*ib)) { 283 if (ob >= obtail) { 284 KICONV_SET_ERRNO_AND_BREAK(E2BIG); 285 } 286 287 *ob++ = *ib++; 288 continue; 289 } 290 291 /* 292 * Issue EILSEQ error if the first byte is not a 293 * valid UHC leading byte. 294 */ 295 if (! KICONV_KO_IS_UHC_1st_BYTE(*ib)) { 296 KICONV_SET_ERRNO_AND_BREAK(EILSEQ); 297 } 298 299 /* 300 * Issue EINVAL error if input buffer has an incomplete 301 * character at the end of the buffer. 302 */ 303 if (ibtail - ib < 2) { 304 KICONV_SET_ERRNO_AND_BREAK(EINVAL); 305 } 306 307 /* 308 * Issue EILSEQ error if the remaining byte is not 309 * a valid UHC byte. 310 */ 311 if (! KICONV_KO_IS_UHC_2nd_BYTE(*(ib + 1))) { 312 KICONV_SET_ERRNO_AND_BREAK(EILSEQ); 313 } 314 315 uhc_val = (uint32_t)(*ib) << 8 | *(ib + 1); 316 sz = ko_to_utf8(uhc_val, ob, obtail, &ret_val, 317 kiconv_uhc_utf8, KICONV_UHC_UTF8_MAX); 318 319 if (sz < 0) { 320 KICONV_SET_ERRNO_AND_BREAK(E2BIG); 321 } 322 323 ib += 2; 324 ob += sz; 325 } 326 327 *inbuf = (char *)ib; 328 *inbufleft = ibtail - ib; 329 *outbuf = (char *)ob; 330 *outbufleft = obtail - ob; 331 332 return (ret_val); 333 } 334 335 /* 336 * String based encoding convertor from Unified Hangul Code to UTF-8. 337 */ 338 static size_t 339 kiconvstr_fr_uhc(char *inarray, size_t *inlen, char *outarray, 340 size_t *outlen, int flag, int *errno) 341 { 342 uchar_t *ib; 343 uchar_t *ob; 344 uchar_t *ibtail; 345 uchar_t *obtail; 346 uchar_t *oldib; 347 size_t ret_val; 348 int8_t sz; 349 uint32_t uhc_val; 350 boolean_t do_not_ignore_null; 351 352 ret_val = 0; 353 ib = (uchar_t *)inarray; 354 ob = (uchar_t *)outarray; 355 ibtail = ib + *inlen; 356 obtail = ob + *outlen; 357 do_not_ignore_null = ((flag & KICONV_IGNORE_NULL) == 0); 358 359 while (ib < ibtail) { 360 if (*ib == '\0' && do_not_ignore_null) 361 break; 362 363 if (KICONV_IS_ASCII(*ib)) { 364 if (ob >= obtail) { 365 KICONV_SET_ERRNO_AND_BREAK(E2BIG); 366 } 367 368 *ob++ = *ib++; 369 continue; 370 } 371 372 oldib = ib; 373 374 if (! KICONV_KO_IS_UHC_1st_BYTE(*ib)) { 375 KICONV_SET_ERRNO_WITH_FLAG(1, EILSEQ); 376 } 377 378 if (ibtail - ib < 2) { 379 KICONV_SET_ERRNO_WITH_FLAG(1, EINVAL); 380 } 381 382 if (! KICONV_KO_IS_UHC_2nd_BYTE(*(ib + 1))) { 383 KICONV_SET_ERRNO_WITH_FLAG(2, EILSEQ); 384 } 385 386 uhc_val = *ib++; 387 uhc_val = (uhc_val << 8) | *ib++; 388 sz = ko_to_utf8(uhc_val, ob, obtail, &ret_val, 389 kiconv_uhc_utf8, KICONV_UHC_UTF8_MAX); 390 391 if (sz < 0) { 392 ib = oldib; 393 KICONV_SET_ERRNO_AND_BREAK(E2BIG); 394 } 395 396 ob += sz; 397 continue; 398 399 REPLACE_INVALID: 400 if (obtail - ob < KICONV_UTF8_REPLACEMENT_CHAR_LEN) { 401 ib = oldib; 402 KICONV_SET_ERRNO_AND_BREAK(E2BIG); 403 } 404 405 *ob++ = KICONV_UTF8_REPLACEMENT_CHAR1; 406 *ob++ = KICONV_UTF8_REPLACEMENT_CHAR2; 407 *ob++ = KICONV_UTF8_REPLACEMENT_CHAR3; 408 ret_val++; 409 } 410 411 *inlen = ibtail - ib; 412 *outlen = obtail - ob; 413 414 return (ret_val); 415 } 416 417 /* 418 * Encoding convertor from UTF-8 to EUC-KR. 419 */ 420 static size_t 421 kiconv_to_euckr(void *kcd, char **inbuf, size_t *inbytesleft, 422 char **outbuf, size_t *outbytesleft, int *errno) 423 { 424 return (kiconv_utf8_to_cck(kcd, inbuf, inbytesleft, outbuf, 425 outbytesleft, errno, utf8_to_euckr)); 426 } 427 428 /* 429 * Encoding convertor from UTF-8 to Unified Hangul Code. 430 */ 431 static size_t 432 kiconv_to_uhc(void *kcd, char **inbuf, size_t *inbytesleft, 433 char **outbuf, size_t *outbytesleft, int *errno) 434 { 435 return (kiconv_utf8_to_cck(kcd, inbuf, inbytesleft, outbuf, 436 outbytesleft, errno, utf8_to_uhc)); 437 } 438 439 /* 440 * String based encoding convertor from UTF-8 to EUC-KR. 441 */ 442 static size_t 443 kiconvstr_to_euckr(char *inarray, size_t *inlen, char *outarray, 444 size_t *outlen, int flag, int *errno) 445 { 446 return kiconvstr_utf8_to_cck((uchar_t *)inarray, inlen, 447 (uchar_t *)outarray, outlen, flag, errno, utf8_to_euckr); 448 } 449 450 /* 451 * String based encoding convertor from UTF-8 to Unified Hangul Code. 452 */ 453 static size_t 454 kiconvstr_to_uhc(char *inarray, size_t *inlen, char *outarray, 455 size_t *outlen, int flag, int *errno) 456 { 457 return kiconvstr_utf8_to_cck((uchar_t *)inarray, inlen, 458 (uchar_t *)outarray, outlen, flag, errno, utf8_to_uhc); 459 } 460 461 /* 462 * Convert an UTF-8 character to a character of ko encodings 463 * (EUC-KR or UHC). 464 */ 465 static int8_t 466 utf8_to_ko(uint32_t utf8, uchar_t *ob, uchar_t *obtail, size_t *ret_val, 467 kiconv_table_t *table, size_t nitems) 468 { 469 size_t index; 470 size_t kocode; 471 int8_t kolen; 472 473 if (KICONV_KO_IS_UDC_IN_UTF8(utf8)) { 474 /* User Definable Area handing. */ 475 kocode = (((utf8 & 0xF0000) >> 4) | ((utf8 & 0x3F00) >> 2) | 476 (utf8 & 0x3F)) - KICONV_KO_UDA_UCS4_START; 477 if (kocode < KICONV_KO_UDA_RANGE) { 478 kocode = (KICONV_KO_UDA_EUC_SEG1 << 8) | 479 (kocode + KICONV_KO_UDA_OFFSET_START); 480 } else { 481 /* 0x43 = 0xA1 - 0x5E */ 482 kocode = (KICONV_KO_UDA_EUC_SEG2 << 8) | 483 (kocode + 0x43); 484 } 485 486 index = 1; 487 } else { 488 index = kiconv_binsearch(utf8, table, nitems); 489 kocode = table[index].value; 490 } 491 492 kolen = (kocode <= 0xFF) ? 1 : 2; 493 494 if (obtail - ob < kolen) { 495 *ret_val = (size_t)-1; 496 return (-1); 497 } 498 499 if (index == 0) 500 (*ret_val)++; 501 502 if (kolen > 1) 503 *ob++ = (uchar_t)(kocode >> 8); 504 *ob = (uchar_t)(kocode & 0xFF); 505 506 return (kolen); 507 } 508 509 /* 510 * Convert an UTF-8 character to Unified Hangual Code. 511 */ 512 /* ARGSUSED */ 513 static int8_t 514 utf8_to_uhc(uint32_t utf8, uchar_t **inbuf, uchar_t *ibtail, 515 uchar_t *ob, uchar_t *obtail, size_t *ret_val) 516 { 517 return (utf8_to_ko(utf8, ob, obtail, ret_val, kiconv_utf8_uhc, 518 KICONV_UTF8_UHC_MAX)); 519 } 520 521 /* 522 * Convert an UTF-8 character to EUC-KR. 523 */ 524 /* ARGSUSED */ 525 static int8_t 526 utf8_to_euckr(uint32_t utf8, uchar_t **inbuf, uchar_t *ibtail, 527 uchar_t *ob, uchar_t *obtail, size_t *ret_val) 528 { 529 return (utf8_to_ko(utf8, ob, obtail, ret_val, kiconv_utf8_euckr, 530 KICONV_UTF8_EUCKR_MAX)); 531 } 532 533 /* 534 * Convert a single ko encoding (EUC-KR or UHC) character to UTF-8. 535 */ 536 static int8_t 537 ko_to_utf8(uint32_t ko_val, uchar_t *ob, uchar_t *obtail, size_t *ret_val, 538 kiconv_table_array_t *table, size_t nitems) 539 { 540 size_t index; 541 int8_t sz; 542 uchar_t udc[3]; 543 uchar_t *u8; 544 545 if (KICONV_KO_IS_UDC_IN_EUC(ko_val)) { 546 /* UDA(User Definable Area) handling. */ 547 uint32_t u32; 548 549 u32 = (ko_val & 0xFF) + (((ko_val & 0xFF00) == 0xC900) ? 550 KICONV_KO_UDA_OFFSET_1 : KICONV_KO_UDA_OFFSET_2); 551 udc[0] = 0xEF; 552 udc[1] = (uchar_t)(0x80 | (u32 & 0x00000FC0) >> 6); 553 udc[2] = (uchar_t)(0x80 | (u32 & 0x0000003F)); 554 u8 = udc; 555 index = 1; 556 } else { 557 index = kiconv_binsearch(ko_val, table, nitems); 558 u8 = table[index].u8; 559 } 560 561 sz = u8_number_of_bytes[u8[0]]; 562 563 if (obtail - ob < sz) { 564 *ret_val = (size_t)-1; 565 return (-1); 566 } 567 568 if (index == 0) 569 (*ret_val)++; /* Non-identical conversion */ 570 571 for (index = 0; index < sz; index++) 572 *ob++ = u8[index]; 573 574 return (sz); 575 } 576 577 static kiconv_ops_t kiconv_ko_ops_tbl[] = { 578 { 579 "euc-kr", "utf-8", kiconv_open_to_cck, kiconv_to_euckr, 580 kiconv_close_to_cck, kiconvstr_to_euckr 581 }, 582 { 583 "utf-8", "euc-kr", open_fr_euckr, kiconv_fr_euckr, 584 close_fr_ko, kiconvstr_fr_euckr 585 }, 586 { 587 "unifiedhangul", "utf-8", kiconv_open_to_cck, kiconv_to_uhc, 588 kiconv_close_to_cck, kiconvstr_to_uhc 589 }, 590 { 591 "utf-8", "unifiedhangul", open_fr_uhc, kiconv_fr_uhc, 592 close_fr_ko, kiconvstr_fr_uhc 593 } 594 }; 595 596 static kiconv_module_info_t kiconv_ko_info = { 597 "kiconv_ko", /* module name */ 598 sizeof (kiconv_ko_ops_tbl) / sizeof (kiconv_ko_ops_tbl[0]), 599 kiconv_ko_ops_tbl, 600 0, 601 NULL, 602 NULL, 603 0 604 }; 605 606 static struct modlkiconv modlkiconv_ko = { 607 &mod_kiconvops, 608 "kiconv korean module 1.0", 609 &kiconv_ko_info 610 }; 611 612 static struct modlinkage modlinkage = { 613 MODREV_1, 614 (void *)&modlkiconv_ko, 615 NULL 616 }; 617 618 int 619 _init(void) 620 { 621 int err; 622 623 err = mod_install(&modlinkage); 624 if (err) 625 cmn_err(CE_WARN, "kiconv_ko: failed to load kernel module"); 626 627 return (err); 628 } 629 630 int 631 _fini(void) 632 { 633 int err; 634 635 /* 636 * If this module is being used, then, we cannot remove the module. 637 * The following checking will catch pretty much all usual cases. 638 * 639 * Any remaining will be catached by the kiconv_unregister_module() 640 * during mod_remove() at below. 641 */ 642 if (kiconv_module_ref_count(KICONV_MODULE_ID_KO)) 643 return (EBUSY); 644 645 err = mod_remove(&modlinkage); 646 if (err) 647 cmn_err(CE_WARN, "kiconv_ko: failed to remove kernel module"); 648 649 return (err); 650 } 651 652 int 653 _info(struct modinfo *modinfop) 654 { 655 return (mod_info(&modlinkage, modinfop)); 656 } 657