1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 1997, by Sun Microsystems, Inc. 23 * All rights reserved. 24 */ 25 26 27 /* 28 Converts From: Taiwanese BIG5 encoding 29 Converts To: ISO2022-CN-EXT encoding. 30 31 NOTE: This file was created using vi editor with tabstop set to 4. 32 To view this file correctly set tabstop appropriately. 33 e.g. for vi use command ESC:se ts=4 34 */ 35 36 #include <stdio.h> 37 #include <stdlib.h> 38 #include <errno.h> 39 #include "big5_cns11643.h" /* Big5 to CNS 11643 mapping table */ 40 41 #define MSB 0x80 /* The most significant bit */ 42 #define ONEBYTE 0xff /* The right most byte */ 43 44 #define SI 0x0f /* shift in */ 45 #define SO 0x0e /* shift out */ 46 #define SS2 0x4e /* SS2 low byte. High byte is ESC */ 47 #define SS3 0x4f /* SS3 low byte. 
High byte is ESC */
#define ESC 0x1b /* The Escape character */
#define NON_ID_CHAR '_' /* Substitute this for all unidentified characters */

/*
 * GET_PLANEC() - Gets the corresponding ISO assigned plane character for
 * the CNS11643 plane. plane_char[] is indexed directly by plane number:
 * plane 1 yields 'G', plane 2 yields 'H', ... plane 16 yields 'V'. Index 0
 * holds a '0' placeholder. The macro performs no bounds checking.
 */
static const char plane_char[] = "0GHIJKLMNOPQRSTUV";
#define GET_PLANEC(i) (plane_char[(i)])

/*
 * Conversion state carried between calls to the converter. The plane fields
 * hold -1 while no designation is in effect.
 */
typedef struct _icv_state {
	char keepc[2];		/* Save the received bytes here */
	short cstate;		/* Current state the state machine is in.
						   These states are C0 or C1 */
	char ishiftfunc;	/* The currently active shift function SI or SO
						   in the output ISO buffer */
	int iSOplane;		/* The current CNS11643 plane which is
						   assigned to the SOdesignation in the output
						   ISO buffer. Only CNS11643 plane 1 can be
						   assigned to SOdesignation */
	int iSS2plane;		/* The current CNS11643 plane which is
						   assigned to the SS2designation in the output
						   ISO buffer. Only CNS11643 plane 2 can be
						   assigned to SS2designation */
	int iSS3plane;		/* The current CNS11643 plane which is
						   assigned to the SS3designation in the output
						   ISO buffer. All CNS11643 planes >= 3 are
						   assigned to SS3designation */
	size_t nonidcount;	/* Keeps track of skipped input bytes in conversion */
	int _errno;			/* Internal error number */
} _iconv_st;

/* States of the input state machine: C0 = expecting the first byte of a
   character, C1 = expecting the second byte of a Big5 character */
enum _CSTATE { C0, C1 };

/* Helpers private to this file */
static int isbig5(unsigned char*);
static int hascns(char*);
static int ascii_to_iso(char, _iconv_st*, char**, size_t*);
static int big5_to_iso(int, _iconv_st*, char**, size_t*);
static int getcnsbytes(int, char*, int*);
static int binsearch(unsigned long, table_t[], int);


/*
 * _icv_open: Called from iconv_open. Allocates and initializes _iconv_st
 *			  structure. Returns pointer to the structure as (void *).
91 */ 92 93 94 void * 95 _icv_open() 96 { 97 _iconv_st *st; 98 99 #ifdef DEBUG 100 fprintf(stderr, "_icv_open(): Come into!\n"); 101 #endif 102 /* Allocate */ 103 if ((st = (_iconv_st *) malloc(sizeof(_iconv_st))) == NULL){ 104 errno = ENOMEM; 105 #ifdef DEBUG 106 fprintf(stderr, "Error\n"); 107 #endif 108 return ((void *) -1); 109 } 110 111 /* Initialize */ 112 st->cstate = C0; 113 st->ishiftfunc = SI; 114 st->iSOplane = -1; 115 st->iSS2plane = -1; 116 st->iSS3plane = -1; 117 st->nonidcount = 0; 118 st->_errno = 0; 119 120 #ifdef DEBUG 121 fprintf(stderr, "====== _icv_open(): Big5 --> ISO2022-CN-EXT =====\n"); 122 #endif 123 124 /* Return struct */ 125 return ((void *) st); 126 } 127 128 129 130 /* 131 * _icv_close: Called from iconv_close(). Frees the _iconv_st structure as 132 * pointed by the argument. 133 */ 134 135 void 136 _icv_close(_iconv_st *st) 137 { 138 if (st == NULL) 139 errno = EBADF; 140 else 141 free(st); 142 } 143 144 /* 145 * _icv_iconv: Called from iconv(). Does the convertion from BIG5 to 146 * ISO2022-CN-EXT. 
147 */ 148 /*======================================================= 149 * 150 * State Machine for interpreting Big-5 code 151 * 152 *======================================================= 153 * 154 * 1st C 155 * +--------> C0 ----------> C1 156 * | ascii | 2nd C | 157 * ^ v v 158 * +----<-----+-----<--------+ 159 * 160 *=======================================================*/ 161 size_t 162 _icv_iconv(_iconv_st *st, char **inbuf, size_t *inbytesleft, 163 char **outbuf, size_t *outbytesleft) 164 { 165 166 int n, idx; 167 168 #ifdef DEBUG 169 fprintf(stderr, "=== _icv_iconv(): Big5 --> ISO2022-CN-EXT =====\n"); 170 #endif 171 172 if (st == NULL) { 173 errno = EBADF; 174 return ((size_t) -1); 175 } 176 177 if (inbuf == NULL || *inbuf == NULL || 178 inbytesleft == NULL || *inbytesleft == 0) { /* Reset request */ 179 if (st->ishiftfunc == SO) { 180 if (outbytesleft && *outbytesleft >= 1 && outbuf && *outbuf) { 181 **outbuf = SI; 182 (*outbuf)++; 183 (*outbytesleft)--; 184 } else { 185 errno = E2BIG; 186 return((size_t) -1); 187 } 188 } 189 st->cstate = C0; 190 st->ishiftfunc = SI; 191 st->iSOplane = -1; 192 st->iSS2plane = -1; 193 st->iSS3plane = -1; 194 st->nonidcount = 0; 195 st->_errno = 0; 196 return ((size_t) 0); 197 } 198 199 st->_errno = 0; 200 errno = 0; 201 202 /* Before we use *inbytesleft or *outbytesleft we should confirm that 203 inbytesleft and outbytesleft are non-NULL. I am considering inbytesleft 204 or *inbytesleft having 0 value as a reset request. I am considering 205 outbytesleft having 0 value as no space in output buffer. Also, here 206 itself I am verifying that outbuf and *outbuf should be non-NULL pointers 207 so I do not have to worry about them being NULL below in the conversion 208 sub-routines. 
I also confirm here that *outbytesleft should be > 0 before 209 we can continue further */ 210 211 if (outbytesleft == NULL || *outbytesleft == 0 || 212 outbuf == NULL || *outbuf == NULL){ 213 errno = E2BIG; 214 return ((size_t)-1); 215 } 216 217 /* A state machine for interpreting Big-5 code */ 218 while (*inbytesleft > 0 && *outbytesleft > 0) { 219 switch (st->cstate) { 220 case C0: 221 if (**inbuf & MSB) { /* May have got the first byte ofa BIG5 code */ 222 223 st->keepc[0] = **inbuf; /*Save byte */ 224 st->cstate = C1; /* Go to the next state where 225 the next BIG5 byte is recieved */ 226 st->nonidcount += 1;/* Until we have verified that this and 227 the next byte make a valid BIG5 code 228 we shall consider this as an 229 unidentified byte */ 230 } else if (**inbuf == ESC || **inbuf == SI || **inbuf == SO){ 231 232 /* We should not process these ASCII control codes as these 233 have special significance in the output ISO encoding. 234 Instead we will output NON_ID_CHAR and continue processing */ 235 236 n = ascii_to_iso(NON_ID_CHAR, st, outbuf, outbytesleft); 237 if (n < 0) /* Insufficient space in the outbuf */ 238 return ((size_t)-1); /* The errno etc. 
are set in ascii_to_iso */ 239 st->nonidcount += 1; 240 } else { /* Got ASCII code */ 241 n = ascii_to_iso(**inbuf, st, outbuf, outbytesleft); 242 if (n < 0) /* Insufficient space in the outbuf */ 243 return ((size_t)-1); 244 } 245 break; 246 247 case C1: 248 st->keepc[1] = (**inbuf); 249 if (isbig5((unsigned char*) st->keepc) == 0) { 250 if ((idx = hascns(st->keepc)) >= 0){ 251 n = big5_to_iso(idx, st, outbuf, outbytesleft); 252 if (n < 0) /* Insufficient space in the outbuf */ 253 return ((size_t)-1); 254 st->nonidcount -= 1; /* The first byte of this big5 saved in 255 state C0 is confirmed valid BIG5 High 256 byte and is processed correctly */ 257 258 } else { /* Valid BIG5 but has no CNS encoding */ 259 /* We will output the NON_ID_CHAR character */ 260 n = ascii_to_iso(NON_ID_CHAR, st, outbuf, outbytesleft); 261 if (n < 0) /* Insufficient space in the outbuf */ 262 return ((size_t)-1); 263 n = ascii_to_iso(NON_ID_CHAR, st, outbuf, outbytesleft); 264 if (n < 0) /* Insufficient space in the outbuf */ 265 return ((size_t)-1); 266 st->nonidcount -= 1; /* Include the 2nd byte also as 267 unidentified byte */ 268 } 269 } else { /* Input character is not BIG5 encoding */ 270 st->nonidcount += 1; 271 st->_errno = errno = EILSEQ; /* This will cause the code to 272 break out of while loop below 273 to return to the caller */ 274 275 } 276 st->cstate = C0; /* Go to the initial state */ 277 break; 278 279 default: /* Should never come here */ 280 fprintf(stderr, 281 "_icv_iconv():Big5-->ISO2022-CN-EXT: Should not have come here\n"); 282 st->_errno = errno = EILSEQ; 283 st->cstate = C0; 284 break; 285 286 } /* end switch */ 287 288 (*inbuf)++; 289 (*inbytesleft)--; 290 291 if (st->_errno) 292 break; /* Break out of while loop */ 293 294 if (errno) /* We set st->_errno before we set errno. 
If errno is set 295 somewhere else we handle that here */ 296 return ((size_t)-1); 297 298 } /* end while */ 299 300 /* We now have to handle the case where we have successfully processed the 301 previous input character which exhausted the output buffer. This is handled 302 by the while loop. However, since there are more input characters that 303 haven't been processed yet, we need to set the errno appropriately and 304 return -1. */ 305 if (*inbytesleft > 0 && *outbytesleft == 0) { 306 errno = E2BIG; 307 return ((size_t)-1); 308 } 309 310 return (*inbytesleft + st->nonidcount); 311 312 } 313 314 315 /* 316 * Big-5 encoding range: 317 * High byte: 0xA1 - 0xFE (94 encoding space) 318 * Low byte: 0x40 - 0x7E, 0xA1 - 0xFE (157 encoding space) 319 * Plane #1: 0xA140 - 0xC8FE (6280 encoding space) 320 * Plane #2: 0xC940 - 0xFEFE (8478 encoding space) 321 * Total: 94 * 157 = 14,758 (14758 encoding space) 322 */ 323 static int isbig5(unsigned char *twobytes) 324 { 325 if (twobytes[0] >= 0xa1 && twobytes[0] <= 0xfe) 326 if ((twobytes[1] >= 0x40 && twobytes[1] <= 0x7e) || 327 (twobytes[1] >= 0xa1 && twobytes[1] <= 0xfe)) 328 return (0); 329 return(-1); 330 } 331 332 333 /* 334 * hascns() : checks whether we have a CNS 11643 code for the big5 character 335 * code. If exists returns the index of the big5 character in the 336 * big5 to CNS table else returns -1. 337 */ 338 static int hascns(char* big5mbchar) 339 { 340 341 int idx; 342 unsigned long big5code; 343 344 big5code = (unsigned long) ((big5mbchar[0] & ONEBYTE) << 8) + 345 (big5mbchar[1] & ONEBYTE); 346 347 idx = binsearch(big5code, big5_cns_tab, MAX_BIG5_NUM); 348 349 return (idx); /* binsearch returns -1 if not found, else index */ 350 } 351 352 353 /* ascii_to_iso() : If required, outputs the SI shift function. Outputs the 354 * character. If there is insufficient space in the output 355 * buffer, it flags the error and returns -1. On success it 356 * returns 0. 
357 */ 358 static int ascii_to_iso(char c, _iconv_st *st, char **outbuf, 359 size_t *outbytesleft) 360 { 361 if (st->ishiftfunc != SI){ 362 **outbuf = SI; 363 (*outbuf)++; 364 (*outbytesleft)--; 365 st->ishiftfunc = SI; 366 367 if (*outbytesleft < 1){ /* Do we now have space for ASCII character?*/ 368 st->_errno = errno = E2BIG; 369 return (-1); 370 } 371 } 372 373 **outbuf = c; 374 (*outbuf)++; 375 (*outbytesleft)--; 376 377 /* Each line in ISO is expected to have the character set information 378 for the Chinese characters in that line. This facilitates text 379 scrollling. Hence, on encountering newline reset designations to 380 unknown */ 381 if (c == '\n'){ 382 st->iSOplane = -1; 383 st->iSS2plane = -1; 384 st->iSS3plane = -1; 385 } 386 387 return (0); 388 389 } 390 391 392 393 /* big5_to_iso() : Converts the Big5 code, for which the index idx in 394 * the big5 to cns table is provided as an argument, to 395 * its corresponding ISO2022-CN-EXT code. This may 396 * require outputting of SO shift function and/or 397 * the designations. In case we do not have sufficient 398 * space in the outbuf to to do the convertion we flag error 399 * and return -1 400 */ 401 static int big5_to_iso(int idx, _iconv_st *st, char **outbuf, 402 size_t *outbytesleft) 403 { 404 405 char cnsbytes[2]; 406 int cnsplane; 407 int ret; 408 409 ret = getcnsbytes(idx, cnsbytes, &cnsplane); 410 if (ret < 0){ 411 /* This means that the cnscode is invalid. Should have been taken 412 care of in function hascns() and thus this code should never come 413 here. 
We catch this by the error message below */ 414 fprintf(stderr, 415 "big5_to_iso():Big5->ISO2022-CN-EXT:gencnsbyte() rejected cnscode\n"); 416 st->_errno = errno = EILSEQ; 417 return (0); 418 } 419 420 switch (cnsplane) { 421 case 1: 422 if (st->iSOplane != cnsplane){ /* Is SODESIGNATION set to this plane?*/ 423 /* Output Escape sequence to set the SODESIGNATION to plane 1 */ 424 /* Before that check that we have space in outbuf for it */ 425 if (*outbytesleft < 4){ 426 st->_errno = errno = E2BIG; 427 return (-1); 428 } 429 430 **outbuf = ESC; 431 *(*outbuf+1) = '$'; 432 *(*outbuf+2) = ')'; 433 *(*outbuf+3) = GET_PLANEC(cnsplane); 434 (*outbuf) += 4; 435 (*outbytesleft) -= 4; 436 st->iSOplane = cnsplane; 437 } 438 439 /* Check the current shift function whether it is SO. If not 440 set the SO shift function after confirming that you have 441 space for it. */ 442 if (st->ishiftfunc != SO){ 443 if (*outbytesleft < 1){ 444 st->_errno = errno = E2BIG; 445 return (-1); 446 } 447 448 **outbuf = SO; 449 (*outbuf)++; 450 (*outbytesleft)--; 451 st->ishiftfunc = SO; 452 } 453 break; 454 455 case 2: 456 if (st->iSS2plane != cnsplane){ /* Is SS2DESIGNATION set tothis plane ? 
*/ 457 /* Output escape sequence to set SS2DESIGNATION to plane 2 */ 458 /* Before that check that we have space in outbuf for it */ 459 if (*outbytesleft < 4){ 460 st->_errno = errno = E2BIG; 461 return (-1); 462 } 463 464 **outbuf = ESC; 465 *(*outbuf+1) = '$'; 466 *(*outbuf+2) = '*'; 467 *(*outbuf+3) = GET_PLANEC(cnsplane); 468 (*outbuf) += 4; 469 (*outbytesleft) -= 4; 470 st->iSS2plane = cnsplane; 471 } 472 473 /* Output the SS2 shift function only when we have sufficient space 474 for the 2 cns code bytes also */ 475 if (*outbytesleft < 4){ 476 st->_errno = errno = E2BIG; 477 return (-1); 478 } 479 480 **outbuf = ESC; 481 *(*outbuf+1) = SS2; 482 (*outbuf) += 2; 483 (*outbytesleft) -= 2; 484 485 break; 486 487 case 3: 488 case 4: 489 case 5: 490 case 6: 491 case 7: 492 case 12: 493 case 14: 494 case 15: 495 case 16: 496 if (st->iSS3plane != cnsplane){ /* Is SS3DESIGNATION set tothis plane? */ 497 /* Output escape sequence to set SS3DESIGNATION to cnsplane */ 498 /* Before that check that we have space in outbuf for it */ 499 if (*outbytesleft < 4){ 500 st->_errno = errno = E2BIG; 501 return (-1); 502 } 503 504 **outbuf = ESC; 505 *(*outbuf+1) = '$'; 506 *(*outbuf+2) = '+'; 507 *(*outbuf+3) = GET_PLANEC(cnsplane); 508 (*outbuf) += 4; 509 (*outbytesleft) -= 4; 510 st->iSS3plane = cnsplane; 511 512 } 513 514 /* Output the SS3 shift function only when we have sufficient space 515 for the 2 cns code bytes also */ 516 if (*outbytesleft < 4){ 517 st->_errno = errno = E2BIG; 518 return (-1); 519 } 520 521 **outbuf = ESC; 522 *(*outbuf+1) = SS3; 523 (*outbuf) += 2; 524 (*outbytesleft) -= 2; 525 526 break; 527 528 default: /* Should have been taken care of in caller of this funcion */ 529 530 /* This means that the cnscode is invalid. Should have been taken 531 care of in function hascns() and thus this code should never 532 come here. 
We catch this by the error message below */ 533 fprintf(stderr, "big5_to_iso():Big5->ISO2022-CN-EXT:Rejecting cnscode\n"); 534 st->_errno = errno = EILSEQ; 535 return (0); 536 537 break; 538 539 } /* end switch */ 540 541 /* Output the cns code */ 542 if (*outbytesleft < 2){ 543 st->_errno = errno = E2BIG; 544 return (-1); 545 } 546 547 **outbuf = cnsbytes[0]; 548 *(*outbuf+1) = cnsbytes[1]; 549 (*outbuf) += 2; 550 (*outbytesleft) -= 2; 551 552 553 return (0); 554 555 } 556 557 558 static int getcnsbytes(int idx, char *cnsbytes, int *cnsplane) 559 { 560 561 unsigned long cnscode; 562 unsigned long val; 563 int plane; 564 565 cnscode = big5_cns_tab[idx].value; 566 567 plane = (int) (cnscode >> 16); 568 switch (plane) { 569 case 0x21: /* 0x8EA1 - G */ 570 case 0x22: /* 0x8EA2 - H */ 571 case 0x23: /* 0x8EA3 - I */ 572 case 0x24: /* 0x8EA4 - J */ 573 case 0x25: /* 0x8EA5 - K */ 574 case 0x26: /* 0x8EA6 - L */ 575 case 0x27: /* 0x8EA7 - M */ 576 case 0x28: /* 0x8EA8 - N */ 577 case 0x29: /* 0x8EA9 - O */ 578 case 0x2a: /* 0x8EAA - P */ 579 case 0x2b: /* 0x8EAB - Q */ 580 case 0x2c: /* 0x8EAC - R */ 581 case 0x2d: /* 0x8EAD - S */ 582 case 0x2f: /* 0x8EAF - U */ 583 case 0x30: /* 0x8EB0 - V */ 584 *cnsplane = plane - 0x20; /* so that we can use GET_PLANEC() */ 585 break; 586 587 case 0x2e: /* 0x8EAE - T */ 588 *cnsplane = 3; /* CNS 11643-1992. Why is this returning 3? */ 589 break; 590 591 default: 592 return (-1); /* Should not have happened */ 593 break; 594 } 595 596 val = cnscode & 0xffff; 597 cnsbytes[0] = (val & 0xff00) >> 8; 598 cnsbytes[1] = val & 0xff; 599 600 return (0); 601 602 } 603 604 605 /* binsearch: find x in v[0] <= v[1] <= ... 
<= v[n-1] */
/*
 * Binary search over the first n entries of v, which must be ordered by
 * ascending key. Returns the index of an entry whose key equals x, or -1
 * when there is none (including n <= 0).
 */
static int binsearch(unsigned long x, table_t v[], int n)
{
	int low, high, mid;

	low = 0;
	high = n - 1;
	while (low <= high) {
		/* Same midpoint as (low + high) / 2, but the sum cannot overflow
		   a signed int for large n */
		mid = low + (high - low) / 2;
		if (x < v[mid].key)
			high = mid - 1;
		else if (x > v[mid].key)
			low = mid + 1;
		else	/* found match */
			return mid;
	}
	return (-1);	/* no match */
}