1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright(c) 1998 Sun Microsystems, Inc. 23 * All right reserved. 24 */ 25 26 #include <stdio.h> 27 #include <errno.h> 28 #include <stdlib.h> 29 #include <sys/types.h> 30 #include <public_struc.h> 31 #include <unicode_gb2312.h> 32 #include <unicode_cns11643_CN.h> 33 #ifdef DEBUG 34 #include <fcntl.h> 35 #include <sys/stat.h> 36 #endif 37 #include "common_defs.h" 38 39 #define SI 0x0f 40 #define SO 0x0e 41 #define SS2 0x4e 42 #define SS3 0x4f 43 #define ESC 0x1b 44 #define MSB 0x80 45 #define MSB_OFF 0x7f 46 47 #define NON_ID_CHAR1 0x21 48 #define NON_ID_CHAR2 0x75 49 50 typedef struct _icv_state { 51 short _ustate; 52 short _istate; 53 short _gstate; 54 char _keepc[6]; 55 int _errno; 56 } _iconv_st; 57 58 enum _USTATE { U0, U1, U2, U3, U4, U5, U6, U7 }; 59 enum _ISTATE { IN, OUT }; 60 enum _GSTATE { G0, G1, G2 }; 61 62 int binary_search(unsigned long key, table_t *table, int tab_len); 63 64 /* 65 * Open; called from iconv_open() 66 */ 67 void * _icv_open() { 68 _iconv_st * st; 69 if ((st = (_iconv_st *)malloc(sizeof(_iconv_st))) == NULL) { 70 errno = ENOMEM; 71 return (void *)-1; 72 } 73 74 st->_ustate = U0; 75 st->_istate = IN; 76 st->_gstate = -1; 77 st->_errno = 0; 78 79 return (void *)st; 80 } 81 82 /* 83 * Close; called from iconv_close() 84 */ 85 86 void _icv_close(_iconv_st *st) { 87 if (st == NULL) 88 errno = EBADF; 89 else 90 free(st); 91 } 92 93 /* 94 * Actual conversion; called from iconv() 95 */ 96 97 size_t _icv_iconv(_iconv_st *st, char **inbuf, size_t *inbytesleft, 98 char **outbuf, size_t *outbytesleft) { 99 char c1 = '\0', c2 = '\0'; 100 int n = 0; 101 unsigned long key; 102 unsigned long gbk; 103 int index; 104 short new_state; 105 106 #ifdef DEBUG 107 fprintf(stderr, "in length is %d\toutlength is %d\n", 108 *inbytesleft, *outbytesleft); 109 #endif 110 if (st == NULL) { 111 errno = EBADF; 112 return ((size_t)-1); 113 } 114 115 if (inbuf == NULL || *inbuf == NULL) { /* Reset request. */ 116 st->_ustate = U0; 117 st->_istate = IN; 118 st->_gstate = G0; 119 st->_errno = 0; 120 return ((size_t)0); 121 } 122 123 errno = 0; 124 while (*inbytesleft > 0 && *outbytesleft > 0) { 125 126 uchar_t first_byte; 127 128 switch (st->_ustate) { 129 case U0: 130 if ((**inbuf & MSB) == 0) { /* ASCII */ 131 if (st->_istate == OUT) { 132 if (*outbytesleft < 2) { 133 #ifdef DEBUG 134 fprintf(stderr, "11111 outbytesleft is %d\n", *outbytesleft); 135 #endif 136 errno = E2BIG; 137 return (size_t) -1; 138 } 139 st->_istate = IN; 140 **outbuf = SI; 141 (*outbuf)++; 142 (*outbytesleft)--; 143 } 144 if (*outbytesleft < 1) { 145 #ifdef DEBUG 146 fprintf(stderr, "22222 outbytesleft is %d\n", *outbytesleft); 147 #endif 148 errno = E2BIG; 149 return (size_t) -1; 150 } 151 **outbuf = **inbuf; 152 (*outbuf)++; 153 (*outbytesleft)--; 154 } else { /* Chinese charactor */ 155 if ((**inbuf & 0xe0) == 0xc0) { /* 2-byte unicode 0xc2..0xdf */ 156 157 /* invalid sequence if the first char is either 0xc0 or 0xc1 */ 158 if ( number_of_bytes_in_utf8_char[((uchar_t)**inbuf)] == ICV_TYPE_ILLEGAL_CHAR ) 159 st->_errno = errno = EILSEQ; 160 else { 161 st->_ustate = U1; 162 st->_keepc[0] = **inbuf; 163 } 164 } else if ((**inbuf & 0xf0) == 0xe0) { /* 3-bytes unicode */ 165 st->_ustate = U2; 166 st->_keepc[0] = **inbuf; 167 } else { 168 169 /* four bytes of UTF-8 sequences */ 170 if ( number_of_bytes_in_utf8_char[((uchar_t)**inbuf)] == ICV_TYPE_ILLEGAL_CHAR ) 171 st->_errno = errno = EILSEQ; 172 else 173 { 174 st->_ustate = U5; 175 st->_keepc[0] = **inbuf; 176 } 177 #ifdef DEBUG 178 fprintf(stderr, "state = %d, keepc is %x\n", st->_ustate, st->_keepc[0]); 179 #endif 180 } 181 } 182 break; 183 184 case U1: /* 2-byte unicode */ 185 if ((**inbuf & 0xc0) == 0x80) { /* 2nd byte is 1xxxxxxx */ 186 st->_ustate = U4; 187 st->_keepc[1] = **inbuf; 188 c1 = (st->_keepc[0] & 0x1c)>>2; 189 c2 = ((st->_keepc[0] & 0x03) << 6) | \ 190 (st->_keepc[1] & 0x3f); 191 continue; 192 } else { 193 st->_errno = errno = EILSEQ; 194 #ifdef DEBUG 195 fprintf(stderr, "state = %d, keepc is %x\n", st->_ustate, st->_keepc[0]); 196 #endif 197 } 198 break; 199 200 case U2: /* 3-byte unicode - 2nd byte */ 201 first_byte = st->_keepc[0]; 202 203 /* if the first byte is 0xed, it is illegal sequence if the second 204 * one is between 0xa0 and 0xbf because surrogate section is ill-formed 205 */ 206 if (((uchar_t)**inbuf) < valid_min_2nd_byte[first_byte] || 207 ((uchar_t)**inbuf) > valid_max_2nd_byte[first_byte] ) 208 st->_errno = errno = EILSEQ; 209 else { 210 st->_ustate = U3; 211 st->_keepc[1] = **inbuf; 212 } 213 break; 214 215 case U3: /* 3-byte unicode - 3th byte */ 216 if ((**inbuf & 0xc0) == 0x80) { 217 st->_ustate = U4; 218 st->_keepc[2] = **inbuf; 219 c1 = ((st->_keepc[0] & 0x0f) << 4) | \ 220 ((st->_keepc[1] & 0x3c) >> 2); 221 c2 = ((st->_keepc[1] & 0x03) << 6) | \ 222 (st->_keepc[2] & 0x3f); 223 continue; 224 } else { 225 st->_errno = errno = EILSEQ; 226 #ifdef DEBUG 227 fprintf(stderr, "state = %d, keepc is %x\n", st->_ustate, st->_keepc[0]); 228 #endif 229 } 230 break; 231 232 case U4: /* Generate iso2022 sequence */ 233 key = ((c1 & 0xff) << 8) | (c2 & 0xff); 234 235 /* 0xFFFE and 0xFFFF should not be allowed */ 236 if ( key == 0xFFFE || key == 0xFFFF ) { 237 st->_errno = errno = EILSEQ; 238 break; 239 } 240 241 if ((index = binary_search(key, unicode_gb_tab, UNICODEMAX)) != -1) { /* GB code set */ 242 gbk = unicode_gb_tab[index].value; 243 if (st->_gstate != G0) { 244 if (*outbytesleft < 7) { 245 #ifdef DEBUG 246 fprintf(stderr, "33333 outbytesleft is %d\n", *outbytesleft); 247 #endif 248 errno = E2BIG; 249 return ((size_t)-1); 250 } 251 st->_istate = OUT; 252 st->_gstate = G0; 253 **outbuf = ESC; 254 *(*outbuf + 1) = '$'; 255 *(*outbuf + 2) = ')'; 256 *(*outbuf + 3) = 'A'; 257 *(*outbuf + 4) = SO; 258 *(*outbuf + 5) = (gbk & 0xff00) >> 8; 259 *(*outbuf + 6) = gbk & 0xff; 260 n = 7; 261 } else if (st->_istate == IN) { 262 if (*outbytesleft < 3) { 263 #ifdef DEBUG 264 fprintf(stderr, "44444outbytesleft is %d\n", *outbytesleft); 265 #endif 266 errno = E2BIG; 267 return ((size_t) -1); 268 } 269 st->_istate = OUT; 270 **(outbuf) = SO; 271 *(*outbuf + 1) = (gbk & 0xff00) >> 8; 272 *(*outbuf + 2) = gbk & 0xff; 273 n = 3; 274 } else { 275 if ( *outbytesleft < 2 ) { 276 errno = E2BIG; 277 return ((size_t)-1); 278 } 279 280 **outbuf = (gbk & 0xff00) >> 8; 281 *(*outbuf + 1) = gbk & 0xff; 282 n = 2; 283 } 284 } else if ((index = binary_search(key, utf_cns_tab, MAX_UTF_NUM)) != -1) { 285 gbk = utf_cns_tab[index].value; 286 new_state = ((gbk >> 16 ) & 0xff) - 0x20; 287 if (new_state == G2 || new_state == G1) { 288 if (st->_gstate != new_state) { 289 if (*outbytesleft < 7) { 290 #ifdef DEBUG 291 fprintf(stderr, "55555 outbytesleft is %d\n", *outbytesleft); 292 #endif 293 errno = E2BIG; 294 return (size_t) -1; 295 } 296 **outbuf = ESC; 297 *(*outbuf + 1) = '$'; 298 *(*outbuf + 2) = ')'; 299 *(*outbuf + 3) = 'G' + new_state - 1; 300 st->_istate = OUT; 301 st->_gstate = new_state; 302 *(*outbuf + 4) = SO; 303 *(*outbuf + 5) = (gbk & 0xff00) >> 8; 304 *(*outbuf + 6) = gbk & 0xff; 305 n = 7; 306 } else if (st->_istate == IN) { 307 if (*outbytesleft < 3) { 308 #ifdef DEBUG 309 fprintf(stderr, "66666 outbytesleft is %d\n", *outbytesleft); 310 #endif 311 errno = E2BIG; 312 return (size_t) -1; 313 } 314 st->_istate = OUT; 315 **outbuf = SO; 316 *(*outbuf + 1) = (gbk & 0xff00) >> 8; 317 *(*outbuf + 2) = gbk & 0xff; 318 n = 3; 319 } else { 320 if (*outbytesleft < 2) { 321 #ifdef DEBUG 322 fprintf(stderr, "77777 outbytesleft is %d\n", *outbytesleft); 323 #endif 324 errno = E2BIG; 325 return (size_t) -1; 326 } 327 **outbuf = (gbk & 0xff00) >> 8; 328 *(*outbuf + 1) = gbk & 0xff; 329 n = 2; 330 } 331 } else if (new_state > G2) { 332 if (st->_gstate != G0) { 333 if (*outbytesleft < 7) { 334 #ifdef DEBUG 335 fprintf(stderr, " 888888 outbytesleft is %d\n", *outbytesleft); 336 #endif 337 errno = E2BIG; 338 return (size_t) -1; 339 } 340 st->_gstate = G0; 341 st->_istate = OUT; 342 **outbuf = ESC; 343 *(*outbuf + 1) = '$'; 344 *(*outbuf + 2) = ')'; 345 *(*outbuf + 3) = 'A'; 346 *(*outbuf + 4) = SO; 347 *(*outbuf + 5) = NON_ID_CHAR1; 348 *(*outbuf + 6) = NON_ID_CHAR2; 349 n = 7; 350 } else if (st->_istate == IN) { 351 if (*outbytesleft < 3) { 352 #ifdef DEBUG 353 fprintf(stderr, "99999 outbytesleft is %d\n", *outbytesleft); 354 #endif 355 errno = E2BIG; 356 return (size_t) -1; 357 } 358 st->_gstate = G0; 359 st->_istate = OUT; 360 **outbuf = SO; 361 *(*outbuf + 1) = NON_ID_CHAR1; 362 *(*outbuf + 2) = NON_ID_CHAR2; 363 n = 3; 364 } else { 365 if (*outbytesleft < 2) { 366 #ifdef DEBUG 367 fprintf(stderr, "aaaaaaoutbytesleft is %d\n", *outbytesleft); 368 #endif 369 errno = E2BIG; 370 return (size_t) -1; 371 } 372 **outbuf = NON_ID_CHAR1; 373 *(*outbuf + 1) = NON_ID_CHAR2; 374 n = 2; 375 } 376 } 377 } else { /* Non-GB & Non-Big5 */ 378 if (st->_gstate != G0) { 379 if (*outbytesleft < 7) { 380 errno = E2BIG; 381 return (size_t) -1; 382 } 383 st->_gstate = G0; 384 st->_istate = OUT; 385 **outbuf = ESC; 386 *(*outbuf + 1) = '$'; 387 *(*outbuf + 2) = ')'; 388 *(*outbuf + 3) = 'A'; 389 *(*outbuf + 4) = SO; 390 *(*outbuf + 5) = NON_ID_CHAR1; 391 *(*outbuf + 6) = NON_ID_CHAR2; 392 n = 7; 393 } else if (st->_istate == IN) { 394 if(*outbytesleft < 3) { 395 errno = E2BIG; 396 return (size_t) -1; 397 } 398 st->_istate = OUT; 399 st->_gstate = G0; 400 **outbuf = SO; 401 *(*outbuf + 1) = NON_ID_CHAR1; 402 *(*outbuf + 2) = NON_ID_CHAR2; 403 n = 3; 404 } else { 405 /* add sanity check to avoid segment error */ 406 if (*outbytesleft < 2) { 407 errno = E2BIG; 408 return (size_t) -1; 409 } 410 **outbuf = NON_ID_CHAR1; 411 *(*outbuf + 1) = NON_ID_CHAR2; 412 n = 2; 413 } 414 } 415 /* 416 n = gen_undef(st, *outbuf, *outbytesleft); 417 fprintf(stderr, "gen_undef return %d\n", n ); 418 } 419 */ 420 if (n > 0) { 421 (*outbuf) += n; 422 (*outbytesleft) -= n; 423 } else { 424 #ifdef DEBUG 425 fprintf(stderr, "bbbbb outbytesleft is %d\n", *outbytesleft); 426 #endif 427 errno = E2BIG; 428 return ((size_t)-1); 429 } 430 st->_ustate = U0; 431 break; 432 433 case U5: 434 first_byte = st->_keepc[0]; 435 436 /* if the first byte is 0xf0, it is illegal sequence if 437 * the second one is between 0x80 and 0x8f 438 * for Four-Byte UTF: U+10000..U+10FFFF 439 */ 440 if (((uchar_t)**inbuf) < valid_min_2nd_byte[first_byte] || 441 ((uchar_t)**inbuf) > valid_max_2nd_byte[first_byte] ) 442 st->_errno = errno = EILSEQ; 443 else { 444 st->_ustate = U6; 445 st->_keepc[1] = **inbuf; 446 } 447 break; 448 case U6: 449 if ((**inbuf & 0xc0) == 0x80) /* 0x80..0xbf */ 450 { 451 st->_ustate = U7; 452 st->_keepc[2] = **inbuf; 453 } 454 else 455 st->_errno = errno = EILSEQ; 456 break; 457 case U7: 458 if ((**inbuf & 0xc0) == 0x80) /* 0x80..0xbf */ 459 { /* skip it to simplify */ 460 st->_ustate = U0; 461 } 462 else 463 st->_errno = errno = EILSEQ; 464 break; 465 default: 466 st->_errno = errno = EILSEQ; 467 #ifdef DEBUG 468 fprintf(stderr, "WHY HERE\n"); 469 #endif 470 st->_ustate = U0; /* reset state */ 471 break; 472 } /* end of switc */ 473 if (st->_errno) 474 break; 475 (*inbuf)++; 476 (*inbytesleft)--; 477 } 478 479 if (errno) 480 return ((size_t)-1); 481 482 if (*inbytesleft == 0 && st->_ustate != U0) 483 { 484 errno = EINVAL; 485 return ((size_t) -1); 486 } 487 488 if (*inbytesleft > 0 && *outbytesleft == 0) { 489 #ifdef DEBUG 490 fprintf(stderr, "cccccc outbytesleft is %d\n", *outbytesleft); 491 #endif 492 errno = E2BIG; 493 return ((size_t)-1); 494 } 495 return ((size_t)(*inbytesleft)); 496 } 497 498 /* 499 * gen_undef(); Called when a char non-gb and non-big5 found. 500 */ 501 int gen_undef(_iconv_st * st, char * outbuf, int bytes) { 502 if (st->_gstate != G0) { 503 if (bytes < 7) { 504 #ifdef DEBUG 505 fprintf(stderr, "in gen outbytesleft is %d\n", bytes); 506 #endif 507 errno = st->_errno = E2BIG; 508 return -1; 509 } 510 st->_gstate = G0; 511 st->_istate = OUT; 512 *outbuf = ESC; 513 *(outbuf + 1) = '$'; 514 *(outbuf + 2) = ')'; 515 *(outbuf + 3) = 'A'; 516 *(outbuf + 4) = SO; 517 *(outbuf + 5) = NON_ID_CHAR1; 518 *(outbuf + 6) = NON_ID_CHAR2; 519 return 7; 520 } 521 if (st->_istate == IN) { 522 if (bytes < 3) { 523 #ifdef DEBUG 524 fprintf(stderr, "in gen outbytesleft is %d\n", bytes); 525 #endif 526 errno = st->_errno = E2BIG; 527 return -1; 528 } 529 st->_istate = OUT; 530 *outbuf = SO; 531 *(outbuf + 1) = NON_ID_CHAR1; 532 *(outbuf + 2) = NON_ID_CHAR2; 533 return 3; 534 } 535 if (bytes < 2) { 536 #ifdef DEBUG 537 fprintf(stderr, "in gen outbytesleft is %d\n", bytes); 538 #endif 539 errno = st->_errno = E2BIG; 540 return -1; 541 } 542 *outbuf = NON_ID_CHAR1; 543 *(outbuf + 1) = NON_ID_CHAR2; 544 return 2; 545 } 546 547 /* 548 * binary_search(); 549 */ 550 int binary_search(unsigned long key, table_t *table, int tab_len) { 551 int i, low, high; 552 553 for (low = 0, high = tab_len-1; low < high; ) { 554 if (table[low].key == key) 555 return low; 556 if (table[high].key == key) 557 return high; 558 i = (low + high) >> 1; 559 if (table[i].key == key) 560 return i; 561 if (table[i].key < key) 562 low = i + 1; 563 else 564 high = i - 1; 565 } 566 return -1; 567 } 568 569 #ifdef DEBUG 570 main(int argc, char ** argv) { 571 _iconv_st * st; 572 int fd; 573 char * in_str; 574 char * out_str; 575 char * tmp_in; 576 char * tmp_out; 577 unsigned int in_len; 578 unsigned int out_len; 579 580 struct stat s; 581 582 if (argc < 2) { 583 fprintf(stderr, "Usage: %s input\n", argv[0]); 584 exit(-1); 585 } 586 587 if (stat(argv[1], &s) == -1) { 588 perror("stat"); 589 exit(-1); 590 } 591 592 if ((fd = open(argv[1], O_RDONLY)) == -1) { 593 perror("open"); 594 exit(-1); 595 } 596 597 tmp_in = in_str = (char *) malloc(1024); 598 tmp_out = out_str = (char *) malloc(1024); 599 if (!in_str || !out_str) { 600 perror("malloc"); 601 exit(-3); 602 free(in_str); 603 free(out_str); 604 } 605 in_len = s.st_size; 606 out_len = s.st_size << 2; 607 st = _icv_open(); 608 if (st == (_iconv_st *) -1) { 609 perror("_icv_open"); 610 free(in_str); 611 free(out_str); 612 exit(-3); 613 } 614 615 while (1) { 616 in_len = 1024; 617 out_len = 1024; 618 in_str = tmp_in; 619 out_str = tmp_out; 620 621 if (!read(fd, in_str, in_len)) 622 exit(0); 623 624 if (_icv_iconv(st, &in_str, &in_len, &out_str, &out_len) == -1) { 625 perror("icv_iconv"); 626 fprintf(stderr, "\ninbytesleft = %d\n", in_len); 627 exit(-2); 628 } 629 fprintf(stderr, "Result is in len %d, out len %d\n", in_len, 630 out_len); 631 if (write(1, tmp_out, 4096 - out_len) == -1) { 632 perror("write"); 633 } 634 } /* end of while */ 635 636 free(tmp_in); 637 free(tmp_out); 638 close(fd); 639 _icv_close(st); 640 } 641 #endif 642