1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 1995, by Sun Microsystems, Inc. 23 * All rights reserved. 24 */ 25 26 #include <stdio.h> 27 #include <stdlib.h> 28 #include <sys/types.h> 29 #include <sys/isa_defs.h> 30 #include <errno.h> 31 #include "common_defs.h" 32 #include "cns11643_unicode_TW.h" /* CNS 11643 to UTF8 mapping table */ 33 34 #define MSB 0x80 /* most significant bit */ 35 #define MBYTE 0x8e /* multi-byte (4 byte character) */ 36 #define PMASK 0xa0 /* plane number mask */ 37 #define ONEBYTE 0xff /* right most byte */ 38 #define MSB_OFF 0x7f /* mask off MBS */ 39 #define VALID_EUC_BYTE(v) (((uchar_t)v) >= 0xA1 && ((uchar_t)v) <= 0xFE) 40 41 /* non-identified character */ 42 #define UTF8_NON_ID_CHAR1 0xEF 43 #define UTF8_NON_ID_CHAR2 0xBF 44 #define UTF8_NON_ID_CHAR3 0xBD 45 46 47 typedef struct _icv_state { 48 char keepc[4]; /* maximum # byte of CNS11643 code */ 49 short cstate; /* state machine id */ 50 int _errno; /* internal errno */ 51 boolean little_endian; 52 boolean bom_written; 53 } _iconv_st; 54 55 enum _CSTATE { C0, C1, C2, C3 }; 56 57 static int get_plane_no_by_char(const char); 58 static int cns_to_utf8(int, _iconv_st *, char*, size_t, int *); 59 static int binsearch(unsigned long, cns_utf[], int); 60 static uint_t getUnicodeFromUDA(int, uchar_t, uchar_t); 61 62 63 /* 64 * Open; called from iconv_open() 65 */ 66 void * 67 _icv_open() 68 { 69 _iconv_st *st; 70 71 if ((st = (_iconv_st *)malloc(sizeof(_iconv_st))) == NULL) { 72 errno = ENOMEM; 73 return ((void *) -1); 74 } 75 76 st->cstate = C0; 77 st->_errno = 0; 78 st->little_endian = false; 79 st->bom_written = false; 80 #if defined(UCS_2LE) 81 st->little_endian = true; 82 st->bom_written = true; 83 #endif 84 return ((void *) st); 85 } 86 87 88 /* 89 * Close; called from iconv_close() 90 */ 91 void 92 _icv_close(_iconv_st *st) 93 { 94 if (!st) 95 errno = EBADF; 96 else 97 free(st); 98 } 99 100 101 /* 102 * Actual conversion; called from iconv() 103 */ 104 /*======================================================= 105 * 106 * State Machine for interpreting CNS 11643 code 107 * 108 *======================================================= 109 * 110 * plane 2 - 16 111 * 1st C 2nd C 3rd C 112 * +------> C0 -----> C1 -----------> C2 -----> C3 113 * | ascii | plane 1 | 4th C | 114 * ^ v 2nd C v v 115 * +----<---+-----<----+-------<---------<-------+ 116 * 117 *=======================================================*/ 118 size_t 119 _icv_iconv(_iconv_st *st, char **inbuf, size_t *inbytesleft, 120 char **outbuf, size_t *outbytesleft) 121 { 122 int plane_no = 0, n; 123 int uconv_num = 0; 124 125 #ifdef DEBUG 126 fprintf(stderr, "========== iconv(): CNS11643 --> UTF2 ==========\n"); 127 #endif 128 if (st == NULL) { 129 errno = EBADF; 130 return ((size_t) -1); 131 } 132 133 if (inbuf == NULL || *inbuf == NULL) { /* Reset request. */ 134 st->cstate = C0; 135 st->_errno = 0; 136 return ((size_t) 0); 137 } 138 139 st->_errno = 0; /* reset internal errno */ 140 errno = 0; /* reset external errno */ 141 142 /* a state machine for interpreting CNS 11643 code */ 143 while (*inbytesleft > 0 && *outbytesleft > 0) { 144 switch (st->cstate) { 145 case C0: /* assuming ASCII in the beginning */ 146 if (**inbuf & MSB) { 147 if (((uchar_t)**inbuf) == MBYTE || VALID_EUC_BYTE(**inbuf)) { 148 st->keepc[0] = (**inbuf); 149 st->cstate = C1; 150 } else 151 st->_errno = errno = EILSEQ; 152 } else { /* real ASCII */ 153 /* 154 * Code conversion for UCS-2LE to support Samba 155 */ 156 if (st->little_endian) { 157 if (!st->bom_written) { 158 if (*outbytesleft < 4) 159 errno = E2BIG; 160 else { 161 *(*outbuf)++ = (uchar_t)0xff; 162 *(*outbuf)++ = (uchar_t)0xfe; 163 *outbytesleft -= 2; 164 165 st->bom_written = true; 166 } 167 } 168 169 if (*outbytesleft < 2) 170 errno = E2BIG; 171 else { 172 *(*outbuf)++ = **inbuf; 173 *(*outbuf)++ = (uchar_t)0x0; 174 *outbytesleft -= 2; 175 } 176 } else { 177 **outbuf = **inbuf; 178 (*outbuf)++; 179 (*outbytesleft)--; 180 } 181 } 182 break; 183 case C1: /* Chinese characters: 2nd byte */ 184 if (((uchar_t)st->keepc[0]) == MBYTE) { 185 plane_no = get_plane_no_by_char(**inbuf); 186 if (plane_no == -1) { /* illegal plane */ 187 st->_errno = errno = EILSEQ; 188 } else { 189 st->keepc[1] = (**inbuf); 190 st->cstate = C2; 191 } 192 } else { 193 if (VALID_EUC_BYTE(**inbuf)) { /* plane #1 */ 194 int uconv_num_internal = 0; 195 196 st->keepc[1] = (**inbuf); 197 st->keepc[2] = st->keepc[3] = NULL; 198 n = cns_to_utf8(1, st, *outbuf, 199 *outbytesleft, &uconv_num_internal); 200 if (n > 0) { 201 (*outbuf) += n; 202 (*outbytesleft) -= n; 203 204 uconv_num += uconv_num_internal; 205 206 st->cstate = C0; 207 } else { /* don't reset state */ 208 st->_errno = errno = E2BIG; 209 } 210 } else { /* input char doesn't belong 211 * to the input code set 212 */ 213 st->_errno = errno = EILSEQ; 214 } 215 } 216 break; 217 case C2: /* plane #2 - #16 (4 bytes): get 3nd byte */ 218 if (VALID_EUC_BYTE(**inbuf)) { /* 3rd byte */ 219 st->keepc[2] = (**inbuf); 220 st->cstate = C3; 221 } else { 222 st->_errno = errno = EILSEQ; 223 } 224 break; 225 case C3: /* plane #2 - #16 (4 bytes): get 4th byte */ 226 if (VALID_EUC_BYTE(**inbuf)) { /* 4th byte */ 227 int uconv_num_internal = 0; 228 229 st->keepc[3] = (**inbuf); 230 n = cns_to_utf8(plane_no, st, *outbuf, 231 *outbytesleft, &uconv_num_internal); 232 if (n > 0) { 233 (*outbuf) += n; 234 (*outbytesleft) -= n; 235 236 uconv_num += uconv_num_internal; 237 238 st->cstate = C0; /* reset state */ 239 } else { /* don't reset state */ 240 st->_errno = errno = E2BIG; 241 } 242 } else { 243 st->_errno = errno = EILSEQ; 244 } 245 break; 246 default: /* should never come here */ 247 st->_errno = errno = EILSEQ; 248 st->cstate = C0; /* reset state */ 249 break; 250 } 251 252 if (st->_errno) { 253 #ifdef DEBUG 254 fprintf(stderr, "!!!!!\tst->_errno = %d\tst->cstate = %d\n", 255 st->_errno, st->cstate); 256 #endif 257 break; 258 } 259 260 (*inbuf)++; 261 (*inbytesleft)--; 262 } 263 264 if (*inbytesleft == 0 && st->cstate != C0) 265 errno = EINVAL; 266 267 if (*inbytesleft > 0 && *outbytesleft == 0) 268 errno = E2BIG; 269 270 if (errno) { 271 /* 272 * if error, *inbuf points to the byte following the last byte 273 * successfully used in the conversion. 274 */ 275 *inbuf -= (st->cstate - C0); 276 *inbytesleft += (st->cstate - C0); 277 st->cstate = C0; 278 return ((size_t) -1); 279 } 280 281 return uconv_num; 282 } 283 284 285 /* 286 * Get plane number by char; i.e. 0xa2 returns 2, 0xae returns 14, etc. 287 * Returns -1 on error conditions 288 */ 289 static int get_plane_no_by_char(const char inbuf) 290 { 291 int ret; 292 unsigned char uc = (unsigned char) inbuf; 293 294 ret = uc - PMASK; 295 switch (ret) { 296 case 1: /* 0x8EA1 */ 297 case 2: /* 0x8EA2 */ 298 case 3: /* 0x8EA3 */ 299 case 4: /* 0x8EA4 */ 300 case 5: /* 0x8EA5 */ 301 case 6: /* 0x8EA6 */ 302 case 7: /* 0x8EA7 */ 303 case 12: /* 0x8EAC */ 304 case 13: /* 0x8EAD */ 305 case 14: /* 0x8EAE */ 306 case 15: /* 0x8EAF */ 307 case 16: /* 0x8EB0 */ 308 return (ret); 309 default: 310 return (-1); 311 } 312 } 313 314 315 /* 316 * CNS 11643 code --> ISO/IEC 10646 (Unicode) 317 * Unicode --> UTF8 (FSS-UTF) 318 * (File System Safe Universal Character Set Transformation Format) 319 * Return: > 0 - converted with enough space in output buffer 320 * = 0 - no space in outbuf 321 */ 322 static int cns_to_utf8(int plane_no, _iconv_st *st, char *buf, size_t buflen, int *uconv_num) 323 { 324 char cns_str[3]; 325 unsigned long cns_val; /* MSB mask off CNS 11643 value */ 326 int unidx; /* Unicode index */ 327 unsigned long uni_val = 0; /* Unicode */ 328 char *keepc = st->keepc; 329 330 #ifdef DEBUG 331 fprintf(stderr, "%s %d ", keepc, plane_no); 332 #endif 333 if (plane_no == 1) { 334 cns_str[0] = keepc[0] & MSB_OFF; 335 cns_str[1] = keepc[1] & MSB_OFF; 336 } else { 337 cns_str[0] = keepc[2] & MSB_OFF; 338 cns_str[1] = keepc[3] & MSB_OFF; 339 } 340 cns_val = (cns_str[0] << 8) + cns_str[1]; 341 #ifdef DEBUG 342 fprintf(stderr, "%x\t", cns_val); 343 #endif 344 345 switch (plane_no) { 346 case 1: 347 unidx = binsearch(cns_val, cns1_utf_tab, MAX_CNS1_NUM); 348 if (unidx >= 0) 349 uni_val = cns1_utf_tab[unidx].unicode; 350 break; 351 case 2: 352 unidx = binsearch(cns_val, cns2_utf_tab, MAX_CNS2_NUM); 353 if (unidx >= 0) 354 uni_val = cns2_utf_tab[unidx].unicode; 355 break; 356 case 3: 357 unidx = binsearch(cns_val, cns3_utf_tab, MAX_CNS3_NUM); 358 if (unidx >= 0) 359 uni_val = cns3_utf_tab[unidx].unicode; 360 break; 361 case 4: 362 unidx = binsearch(cns_val, cns4_utf_tab, MAX_CNS4_NUM); 363 if (unidx >= 0) 364 uni_val = cns4_utf_tab[unidx].unicode; 365 break; 366 case 5: 367 unidx = binsearch(cns_val, cns5_utf_tab, MAX_CNS5_NUM); 368 if (unidx >= 0) 369 uni_val = cns5_utf_tab[unidx].unicode; 370 break; 371 case 6: 372 unidx = binsearch(cns_val, cns6_utf_tab, MAX_CNS6_NUM); 373 if (unidx >= 0) 374 uni_val = cns6_utf_tab[unidx].unicode; 375 break; 376 case 7: 377 unidx = binsearch(cns_val, cns7_utf_tab, MAX_CNS7_NUM); 378 if (unidx >= 0) 379 uni_val = cns7_utf_tab[unidx].unicode; 380 break; 381 case 12: 382 case 13: 383 case 14: 384 case 16: 385 uni_val = getUnicodeFromUDA(plane_no, (uchar_t)keepc[2], (uchar_t)keepc[3]); 386 unidx = 1; /* deceit the following if statement */ 387 break; 388 case 15: 389 unidx = binsearch(cns_val, cns15_utf_tab, MAX_CNS15_NUM); 390 if (unidx >= 0) 391 uni_val = cns15_utf_tab[unidx].unicode; 392 break; 393 default: 394 unidx = -1; /* no mapping from CNS to UTF8 */ 395 break; 396 } 397 398 #ifdef DEBUG 399 fprintf(stderr, "unidx = %d, unicode = %x\t", unidx, uni_val); 400 #endif 401 402 /* 403 * Code version for UCS-2LE to support Samba 404 */ 405 if (st->little_endian) { 406 int size = 0; 407 408 if (unidx < 0 || uni_val > 0x00ffff ) { 409 uni_val = ICV_CHAR_UCS2_REPLACEMENT; 410 *uconv_num = 1; 411 } 412 413 if (!st->bom_written) { 414 if (buflen < 4) 415 return 0; 416 417 *(buf + size++) = (uchar_t)0xff; 418 *(buf + size++) = (uchar_t)0xfe; 419 st->bom_written = true; 420 } 421 422 if (buflen < 2) 423 return 0; 424 425 *(buf + size++) = (uchar_t)(uni_val & 0xff); 426 *(buf + size++) = (uchar_t)((uni_val >> 8) & 0xff); 427 428 return size; 429 } 430 431 if (unidx >= 0) { /* do Unicode to UTF8 conversion */ 432 if (uni_val >= 0x0080 && uni_val <= 0x07ff) { 433 if (buflen < 2) { 434 #ifdef DEBUG 435 fprintf(stderr, "outbuf overflow in cns_to_utf8()!!\n"); 436 #endif 437 errno = E2BIG; 438 return(0); 439 } 440 *buf = (char)((uni_val >> 6) & 0x1f) | 0xc0; 441 *(buf+1) = (char)(uni_val & 0x3f) | 0x80; 442 #ifdef DEBUG 443 fprintf(stderr, "%x %x\n", *buf&ONEBYTE, *(buf+1)&ONEBYTE); 444 #endif 445 return(2); 446 } 447 if (uni_val >= 0x0800 && uni_val <= 0xffff) { 448 if (buflen < 3) { 449 #ifdef DEBUG 450 fprintf(stderr, "outbuf overflow in cns_to_utf8()!!\n"); 451 #endif 452 errno = E2BIG; 453 return(0); 454 } 455 *buf = (char)((uni_val >> 12) & 0xf) | 0xe0; 456 *(buf+1) = (char)((uni_val >>6) & 0x3f) | 0x80; 457 *(buf+2) = (char)(uni_val & 0x3f) | 0x80; 458 #ifdef DEBUG 459 fprintf(stderr, "%x %x %x\n", *buf&ONEBYTE, *(buf+1)&ONEBYTE, *(buf+2)&ONEBYTE); 460 #endif 461 return(3); 462 } 463 if (uni_val >= 0x10000 && uni_val <= 0x10ffff) { 464 if (buflen < 4) { 465 errno = E2BIG; 466 return(0); 467 } 468 469 *buf = (char)((uni_val >> 18) & 0x7) | 0xf0; 470 *(buf+1) = (char)((uni_val >> 12) & 0x3f) | 0x80; 471 *(buf+2) = (char)((uni_val >>6) & 0x3f) | 0x80; 472 *(buf+3) = (char)(uni_val & 0x3f) | 0x80; 473 return(4); 474 } 475 } 476 477 /* can't find a match in CNS --> UTF8 table or illegal UTF8 code */ 478 if (buflen < 3) { 479 #ifdef DEBUG 480 fprintf(stderr, "outbuf overflow in cns_to_utf8()!!\n"); 481 #endif 482 errno = E2BIG; 483 return(0); 484 } 485 486 *(unsigned char*) buf = UTF8_NON_ID_CHAR1; 487 *(unsigned char*) (buf+1) = UTF8_NON_ID_CHAR2; 488 *(unsigned char*) (buf+2) = UTF8_NON_ID_CHAR3; 489 490 /* non-identical conversion */ 491 *uconv_num = 1; 492 493 #ifdef DEBUG 494 fprintf(stderr, "%c %c %c\n", *buf, *(buf+1), *(buf+2)); 495 #endif 496 return(3); 497 } 498 499 static uint_t 500 getUnicodeFromUDA(int plane_no, uchar_t byte1, uchar_t byte2) 501 { 502 uint_t ucs4, disp; 503 504 /* compact into consecutive Unicode value for CNS plane 16 */ 505 if ( plane_no == 16 ) --plane_no; 506 507 disp = (plane_no - 12) * 8836 + (byte1 - 0xA1) * 94 + ( byte2 - 0xA1); 508 return (ucs4 = (0xf << 16) | (disp & 0xffff)); 509 } 510 511 /* binsearch: find x in v[0] <= v[1] <= ... <= v[n-1] */ 512 static int binsearch(unsigned long x, cns_utf v[], int n) 513 { 514 int low, high, mid; 515 516 low = 0; 517 high = n - 1; 518 while (low <= high) { 519 mid = (low + high) / 2; 520 if (x < v[mid].cnscode) 521 high = mid - 1; 522 else if (x > v[mid].cnscode) 523 low = mid + 1; 524 else /* found match */ 525 return mid; 526 } 527 return (-1); /* no match */ 528 } 529