1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright (c) 1995, by Sun Microsystems, Inc. 24 * All rights reserved. 25 */ 26 27 #include <stdio.h> 28 #include <stdlib.h> 29 #include <sys/types.h> 30 #include <errno.h> 31 #include "unicode_cns11643_TW.h" /* UTF8 to CNS 11643 mapping table */ 32 #include "common_defs.h" 33 34 #define MSB 0x80 /* most significant bit */ 35 #define MBYTE 0x8e /* multi-byte (4 byte character) */ 36 #define PMASK 0xa0 /* plane number mask */ 37 #define ONEBYTE 0xff /* right most byte */ 38 39 #define SI 0x0f /* shift in */ 40 #define SO 0x0e /* shift out */ 41 #define ESC 0x1b /* escape */ 42 43 /* static const char plane_char[] = "0GH23456789:;<=>?"; */ 44 static const char plane_char[] = "0GHIJKLMNOPQRSTUV"; 45 46 #define GET_PLANEC(i) (plane_char[i]) 47 48 #define NON_ID_CHAR '?' /* non-identified character */ 49 50 typedef struct _icv_state { 51 char keepc[6]; /* maximum # byte of UTF8 code */ 52 short cstate; 53 short istate; 54 short ustate; 55 int _errno; /* internal errno */ 56 } _iconv_st; 57 58 enum _CSTATE { C0, C1 }; 59 enum _ISTATE { IN, OUT }; 60 enum _USTATE { U0, U1, U2, U3, U4, U5, U6, U7 }; 61 62 63 static int get_plane_no_by_utf(const char, const char, int *, unsigned long *); 64 static int utf8_to_iso(int, int, unsigned long, char *, size_t); 65 static int binsearch(unsigned long, utf_cns[], int); 66 67 /* 68 * Open; called from iconv_open() 69 */ 70 void * 71 _icv_open() 72 { 73 _iconv_st *st; 74 75 if ((st = (_iconv_st *)malloc(sizeof(_iconv_st))) == NULL) { 76 errno = ENOMEM; 77 return ((void *) -1); 78 } 79 80 st->cstate = C0; 81 st->istate = IN; 82 st->ustate = U0; 83 st->_errno = 0; 84 85 #ifdef DEBUG 86 fprintf(stderr, "========== iconv(): UTF2 --> ISO2022-7 ==========\n"); 87 #endif 88 89 return ((void *) st); 90 } 91 92 93 /* 94 * Close; called from iconv_close() 95 */ 96 void 97 _icv_close(_iconv_st *st) 98 { 99 if (!st) 100 errno = EBADF; 101 else 102 free(st); 103 } 104 105 106 /* 107 * Actual conversion; called from iconv() 108 */ 109 /*========================================================= 110 * 111 * State Machine for interpreting UTF8 code 112 * 113 *========================================================= 114 * 2nd byte 3rd byte 4th byte 115 * +----->------->------->U5------>U6--------->U7 116 * | | 117 * | 3 byte unicode | 118 * +----->------->-------+ | 119 * | | | 120 * ^ v | 121 * | 2 byte U2 ---> U3 | 122 * | unicode v | 123 * +------> U0 -------> U1 +-------->U4---+ 124 * ^ ascii | | ^ | 125 * | | +-------->--------->--------+ | 126 * | v v 127 * +----<---+-----<------------<------------<------------+ 128 * 129 *=========================================================*/ 130 size_t 131 _icv_iconv(_iconv_st *st, char **inbuf, size_t *inbytesleft, 132 char **outbuf, size_t *outbytesleft) 133 { 134 char c1 = '\0', c2 = '\0'; 135 int plane_no, n, unidx; 136 /* pre_plane_no: need to be static when re-entry occurs on errno set */ 137 static int pre_plane_no = -1; /* previous plane number */ 138 unsigned long cnscode; 139 140 if (st == NULL) { 141 errno = EBADF; 142 return ((size_t) -1); 143 } 144 145 if (inbuf == NULL || *inbuf == NULL) { /* Reset request. */ 146 st->cstate = C0; 147 st->istate = IN; 148 st->ustate = U0; 149 st->_errno = 0; 150 return ((size_t) 0); 151 } 152 153 #ifdef DEBUG 154 fprintf(stderr, "=== (Re-entry) iconv(): UTF-8 --> ISO 2022-7 ===\n"); 155 fprintf(stderr, "st->cstate=%d\tst->istate=%d\tst->_errno=%d\tplane_no=%d\n", 156 st->cstate, st->istate, st->_errno, plane_no); 157 #endif 158 st->_errno = 0; /* reset internal errno */ 159 errno = 0; /* reset external errno */ 160 161 /* a state machine for interpreting UTF8 code */ 162 while (*inbytesleft > 0 && *outbytesleft > 0) { 163 164 uchar_t first_byte; 165 166 switch (st->ustate) { 167 case U0: /* assuming ASCII in the beginning */ 168 if ((**inbuf & MSB) == 0) { /* ASCII */ 169 if (st->istate == OUT) { 170 st->cstate = C0; 171 st->istate = IN; 172 **outbuf = SI; 173 (*outbuf)++; 174 (*outbytesleft)--; 175 if (*outbytesleft <= 0) { 176 errno = E2BIG; 177 return((size_t) -1); 178 } 179 } 180 **outbuf = **inbuf; 181 (*outbuf)++; 182 (*outbytesleft)--; 183 } else { /* Chinese character */ 184 if ((**inbuf & 0xe0) == 0xc0) { /* 2 byte unicode 0xc2..0xdf */ 185 186 /* invalid sequence if the first byte is either 0xc0 or 0xc1 */ 187 if ( number_of_bytes_in_utf8_char[((uchar_t)**inbuf)] == ICV_TYPE_ILLEGAL_CHAR ) 188 st->_errno = errno = EILSEQ; 189 else { 190 st->ustate = U1; 191 st->keepc[0] = **inbuf; 192 } 193 } else if ((**inbuf & 0xf0) == 0xe0) { /* 3 byte 0xe0..0xef */ 194 st->ustate = U2; 195 st->keepc[0] = **inbuf; 196 } else { 197 /* four bytes of UTF-8 sequences */ 198 if ( number_of_bytes_in_utf8_char[((uchar_t)**inbuf)] == ICV_TYPE_ILLEGAL_CHAR ) 199 st->_errno = errno = EILSEQ; 200 else 201 { 202 st->ustate = U5; 203 st->keepc[0] = **inbuf; 204 } 205 } 206 } 207 break; 208 case U1: /* 2 byte unicode */ 209 if ((**inbuf & 0xc0) == 0x80) { 210 st->ustate = U4; 211 st->keepc[1] = **inbuf; 212 c1 = (st->keepc[0]&0x1c)>>2; 213 c2 = ((st->keepc[0]&0x03)<<6) | ((**inbuf)&0x3f); 214 #ifdef DEBUG 215 fprintf(stderr, "UTF8: %02x%02x --> ", 216 st->keepc[0]&ONEBYTE, st->keepc[1]&ONEBYTE); 217 #endif 218 continue; /* should not advance *inbuf */ 219 } else { 220 st->_errno = errno = EILSEQ; 221 } 222 break; 223 case U2: /* 3 byte unicode - 2nd byte */ 224 225 first_byte = st->keepc[0]; 226 227 /* if the first byte is 0xed, it is illegal sequence if the second 228 * one is between 0xa0 and 0xbf because surrogate section is ill-formed 229 */ 230 if (((uchar_t)**inbuf) < valid_min_2nd_byte[first_byte] || 231 ((uchar_t)**inbuf) > valid_max_2nd_byte[first_byte] ) 232 st->_errno = errno = EILSEQ; 233 else { 234 st->ustate = U3; 235 st->keepc[1] = **inbuf; 236 } 237 break; 238 case U3: /* 3 byte unicode - 3rd byte */ 239 if ((**inbuf & 0xc0) == 0x80) { 240 st->ustate = U4; 241 st->keepc[2] = **inbuf; 242 c1 = ((st->keepc[0]&0x0f)<<4) | 243 ((st->keepc[1]&0x3c)>>2); 244 c2 = ((st->keepc[1]&0x03)<<6) | ((**inbuf)&0x3f); 245 #ifdef DEBUG 246 fprintf(stderr, "UTF8: %02x%02x%02x --> ", st->keepc[0]&ONEBYTE, 247 st->keepc[1]&ONEBYTE, **inbuf&ONEBYTE); 248 #endif 249 continue; /* should not advance *inbuf */ 250 } else { 251 st->_errno = errno = EILSEQ; 252 } 253 break; 254 case U4: 255 plane_no = get_plane_no_by_utf(c1, c2, &unidx, &cnscode); 256 if (plane_no == -2) 257 { /* unicode is either 0xFFFE or 0xFFFF */ 258 st->_errno = errno = EILSEQ; 259 break; 260 } 261 262 if (plane_no > 0) { /* legal unicode; illegal CNS */ 263 if ((st->istate == IN) || (pre_plane_no != plane_no)) { 264 if ((st->cstate == C0) || 265 (pre_plane_no != plane_no)) { 266 /* change plane # in Chinese mode */ 267 if (st->cstate == C1) { 268 **outbuf = SI; 269 (*outbuf)++; 270 (*outbytesleft)--; 271 } 272 if (*outbytesleft < 4) { 273 st->_errno = errno = E2BIG; 274 return((size_t) -1); 275 } 276 pre_plane_no = plane_no; 277 st->cstate = C1; 278 **outbuf = ESC; 279 *(*outbuf+1) = '$'; 280 *(*outbuf+2) = ')'; 281 *(*outbuf+3) = GET_PLANEC(plane_no); 282 #ifdef DEBUG 283 fprintf(stderr, "\n\t\t\t\tESC $ ) %c\t", *(*outbuf+3)); 284 #endif 285 (*outbuf) += 4; 286 (*outbytesleft) -= 4; 287 if (*outbytesleft <= 0) { 288 st->_errno = errno = E2BIG; 289 return((size_t) -1); 290 } 291 } 292 st->istate = OUT; 293 **outbuf = SO; 294 (*outbuf)++; 295 (*outbytesleft)--; 296 } 297 }/* get_plane_no OK */ 298 299 n = utf8_to_iso(plane_no, unidx, cnscode, 300 *outbuf, *outbytesleft); 301 if (n > 0) { 302 (*outbuf) += n; 303 (*outbytesleft) -= n; 304 } else { 305 st->_errno = errno; 306 return((size_t) -1); 307 } 308 st->ustate = U0; 309 st->_errno = 0; 310 break; 311 case U5: 312 313 first_byte = st->keepc[0]; 314 315 /* if the first byte is 0xed, it is illegal sequence if the second 316 * one is between 0xa0 and 0xbf because surrogate section is ill-formed 317 */ 318 if (((uchar_t)**inbuf) < valid_min_2nd_byte[first_byte] || 319 ((uchar_t)**inbuf) > valid_max_2nd_byte[first_byte] ) 320 st->_errno = errno = EILSEQ; 321 else { 322 st->ustate = U6; 323 st->keepc[1] = **inbuf; 324 } 325 break; 326 case U6: 327 if ((**inbuf & 0xc0) == MSB) /* 0x80..0xbf */ 328 { 329 st->ustate = U7; 330 st->keepc[2] = **inbuf; 331 } 332 else 333 st->_errno = errno = EILSEQ; 334 break; 335 case U7: 336 if ((**inbuf & 0xc0) == MSB) /* 0x80..0xbf */ 337 { /* skip it to simplify */ 338 st->ustate = U0; 339 st->_errno = 0; 340 } 341 else 342 st->_errno = errno = EILSEQ; 343 break; 344 default: /* should never come here */ 345 st->_errno = errno = EILSEQ; 346 st->ustate = U0; /* reset state */ 347 break; 348 } 349 350 if (st->_errno) { 351 #ifdef DEBUG 352 fprintf(stderr, "!!!!!\tst->_errno = %d\tst->ustate = %d\n", 353 st->_errno, st->ustate); 354 #endif 355 break; 356 } 357 (*inbuf)++; 358 (*inbytesleft)--; 359 } 360 361 if (errno) 362 return((size_t) -1); 363 364 if (*inbytesleft == 0 && st->ustate != U0) { 365 errno = EINVAL; 366 return ((size_t) -1); 367 } 368 369 if (*inbytesleft > 0 && *outbytesleft == 0) { 370 errno = E2BIG; 371 return((size_t) -1); 372 } 373 return (*inbytesleft); 374 } 375 376 377 /* 378 * Get plane number by UTF8 code; i.e. plane #1 returns 1, #2 returns 2, etc. 379 * Returns -1 on error conditions and return -2 due to illegal sequence 380 * 381 * Since binary search of the UTF8 to CNS table is necessary, might as well 382 * return index and CNS code matching to the unicode. 383 */ 384 static int get_plane_no_by_utf(const char c1, const char c2, 385 int *unidx, unsigned long *cnscode) 386 { 387 int ret; 388 unsigned long unicode; 389 390 unicode = (unsigned long) ((c1 & ONEBYTE) << 8) + (c2 & ONEBYTE); 391 /* the 0xfffe and 0xffff should not be allowed */ 392 if ( unicode == 0xFFFE || unicode == 0xFFFF ) return -2; 393 394 *unidx = binsearch(unicode, utf_cns_tab, MAX_UTF_NUM); 395 if ((*unidx) >= 0) 396 *cnscode = utf_cns_tab[*unidx].cnscode; 397 else 398 return(0); /* match from UTF8 to CNS not found */ 399 #ifdef DEBUG 400 fprintf(stderr, "Unicode=%04x, idx=%5d, CNS=%x ", unicode, *unidx, *cnscode); 401 #endif 402 403 ret = (int) (*cnscode >> 16); 404 switch (ret) { 405 case 0x21: /* 0x8EA1 - G */ 406 case 0x22: /* 0x8EA2 - H */ 407 case 0x23: /* 0x8EA3 - I */ 408 case 0x24: /* 0x8EA4 - J */ 409 case 0x25: /* 0x8EA5 - K */ 410 case 0x26: /* 0x8EA6 - L */ 411 case 0x27: /* 0x8EA7 - M */ 412 case 0x28: /* 0x8EA8 - N */ 413 case 0x29: /* 0x8EA9 - O */ 414 case 0x2a: /* 0x8EAA - P */ 415 case 0x2b: /* 0x8EAB - Q */ 416 case 0x2c: /* 0x8EAC - R */ 417 case 0x2d: /* 0x8EAD - S */ 418 case 0x2f: /* 0x8EAF - U */ 419 case 0x30: /* 0x8EB0 - V */ 420 return (ret - 0x20); /* so that we can use GET_PLANEC() */ 421 case 0x2e: /* 0x8EAE - T */ 422 return (3); /* CNS 11643-1992 */ 423 default: 424 return (-1); 425 } 426 } 427 428 429 /* 430 * ISO/IEC 10646 (Unicode) --> ISO 2022-7 431 * Unicode --> UTF8 (FSS-UTF) 432 * (File System Safe Universal Character Set Transformation Format) 433 * Return: > 0 - converted with enough space in output buffer 434 * = 0 - no space in outbuf 435 */ 436 static int utf8_to_iso(int plane_no, int unidx, unsigned long cnscode, 437 char *buf, size_t buflen) 438 { 439 unsigned long val; /* CNS 11643 value */ 440 #ifdef DEBUG 441 char cns_str[5]; 442 #endif 443 444 if (buflen < 2) { 445 errno = E2BIG; 446 return(0); 447 } 448 449 450 if (unidx < 0) { /* no match from UTF8 to CNS 11643 */ 451 *buf = *(buf+1) = NON_ID_CHAR; 452 return(2); 453 } else { 454 val = cnscode & 0xffff; 455 *buf = (val & 0xff00) >> 8; 456 *(buf+1) = val & 0xff; 457 } 458 #ifdef DEBUG 459 fprintf(stderr, "\t%02x%02x\t", *buf, *(buf+1)); 460 #endif 461 462 #ifdef DEBUG 463 switch (plane_no) { 464 case 1: 465 cns_str[0] = *buf | MSB; 466 cns_str[1] = *(buf+1) | MSB; 467 cns_str[2] = cns_str[3] = cns_str[4] = NULL; 468 break; 469 case 2: 470 case 3: 471 case 4: 472 case 5: 473 case 6: 474 case 7: 475 case 8: 476 case 9: 477 case 10: 478 case 11: 479 case 12: 480 case 13: 481 case 14: 482 case 15: 483 case 16: 484 cns_str[0] = MBYTE; 485 cns_str[1] = (char) PMASK + plane_no; 486 cns_str[2] = (char) *buf | MSB; 487 cns_str[3] = (char) *(buf+1) | MSB; 488 cns_str[4] = NULL; 489 break; 490 } 491 492 fprintf(stderr, "#%d ->%s<-\n", plane_no, cns_str); 493 #endif 494 return(2); 495 } 496 497 498 /* binsearch: find x in v[0] <= v[1] <= ... <= v[n-1] */ 499 static int binsearch(unsigned long x, utf_cns v[], int n) 500 { 501 int low, high, mid; 502 503 low = 0; 504 high = n - 1; 505 while (low <= high) { 506 mid = (low + high) / 2; 507 if (x < v[mid].unicode) 508 high = mid - 1; 509 else if (x > v[mid].unicode) 510 low = mid + 1; 511 else /* found match */ 512 return mid; 513 } 514 return (-1); /* no match */ 515 } 516