1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright (c) 1995, by Sun Microsystems, Inc. 24 * All rights reserved. 25 */ 26 27 #include <stdio.h> 28 #include <stdlib.h> 29 #include <sys/types.h> 30 #include <sys/isa_defs.h> 31 #include <errno.h> 32 #include "unicode_big5.h" /* UTF8 to Big-5 mapping table */ 33 #include "common_defs.h" 34 35 #define MSB 0x80 /* most significant bit */ 36 #define ONEBYTE 0xff /* right most byte */ 37 38 #define NON_ID_CHAR '?' /* non-identified character */ 39 40 typedef struct _icv_state { 41 char keepc[6]; /* maximum # byte of UTF8 code */ 42 short ustate; 43 int _errno; /* internal errno */ 44 boolean little_endian; 45 boolean bom_written; 46 } _iconv_st; 47 48 enum _USTATE { U0, U1, U2, U3, U4, U5, U6, U7 }; 49 50 static int get_big5_by_utf(uint_t, int *, unsigned long *); 51 static int utf8_to_big5(int, unsigned long, char *, size_t, int *); 52 static int binsearch(unsigned long, utf_big5[], int); 53 54 55 /* 56 * Open; called from iconv_open() 57 */ 58 void * 59 _icv_open() 60 { 61 _iconv_st *st; 62 63 if ((st = (_iconv_st *)malloc(sizeof(_iconv_st))) == NULL) { 64 errno = ENOMEM; 65 return ((void *) -1); 66 } 67 68 st->ustate = U0; 69 st->_errno = 0; 70 st->little_endian = false; 71 st->bom_written = false; 72 #if defined(UCS_2LE) 73 st->little_endian = true; 74 st->bom_written = true; 75 #endif 76 return ((void *) st); 77 } 78 79 80 /* 81 * Close; called from iconv_close() 82 */ 83 void 84 _icv_close(_iconv_st *st) 85 { 86 if (!st) 87 errno = EBADF; 88 else 89 free(st); 90 } 91 92 93 /* 94 * Actual conversion; called from iconv() 95 */ 96 /*========================================================= 97 * 98 * State Machine for interpreting UTF8 code 99 * 100 *========================================================= 101 * 2nd byte 3rd byte 4th byte 102 * +----->------->------>U5----->U6------------>U7 103 * | | 104 * | 3 byte unicode | 105 * +----->------->-------+ | 106 * | | | 107 * ^ v | 108 * | 2 byte U2 ---> U3 | 109 * | unicode v | 110 * +------> U0 -------> U1 +-------->U4---+ 111 * ^ ascii | | ^ | 112 * | | +-------->--------->--------+ | 113 * | v v 114 * +----<---+-----<------------<------------<------------+ 115 * 116 *=========================================================*/ 117 size_t 118 _icv_iconv(_iconv_st *st, char **inbuf, size_t *inbytesleft, 119 char **outbuf, size_t *outbytesleft) 120 { 121 int n, unidx; 122 unsigned long big5code; 123 int uconv_num = 0; 124 int utf8_len = 0; 125 uint_t ucs; 126 127 #ifdef DEBUG 128 fprintf(stderr, "========== iconv(): UTF2 --> Big-5 ==========\n"); 129 #endif 130 if (st == NULL) { 131 errno = EBADF; 132 return ((size_t) -1); 133 } 134 135 if (inbuf == NULL || *inbuf == NULL) { /* Reset request. */ 136 st->ustate = U0; 137 st->_errno = 0; 138 return ((size_t) 0); 139 } 140 141 st->_errno = 0; /* reset internal errno */ 142 errno = 0; /* reset external errno */ 143 144 /* a state machine for interpreting UTF8 code */ 145 while (*inbytesleft > 0 && *outbytesleft > 0) { 146 147 uchar_t first_byte; 148 int uconv_num_internal = 0; 149 150 switch (st->ustate) { 151 case U0: /* assuming ASCII in the beginning */ 152 /* 153 * Code converion for UCS-2LE to support Samba 154 */ 155 if (st->little_endian) { 156 st->ustate = U1; 157 st->keepc[0] = **inbuf; 158 } 159 else if ((**inbuf & MSB) == 0) { /* ASCII */ 160 **outbuf = **inbuf; 161 (*outbuf)++; 162 (*outbytesleft)--; 163 } else { /* Chinese character */ 164 if ((**inbuf & 0xe0) == 0xc0) { /* 2 byte unicode 0xc2..0xdf */ 165 166 /* invalid sequence if the first char is either 0xc0 or 0xc1 */ 167 if ( number_of_bytes_in_utf8_char[((uchar_t)**inbuf)] == ICV_TYPE_ILLEGAL_CHAR ) 168 st->_errno = errno = EILSEQ; 169 else { 170 st->ustate = U1; 171 st->keepc[0] = **inbuf; 172 } 173 } else if ((**inbuf & 0xf0) == 0xe0) { /* 3 byte 0xe0..0xef */ 174 st->ustate = U2; 175 st->keepc[0] = **inbuf; 176 } else { 177 /* four bytes of UTF-8 sequences */ 178 if ( number_of_bytes_in_utf8_char[((uchar_t)**inbuf)] == ICV_TYPE_ILLEGAL_CHAR ) 179 st->_errno = errno = EILSEQ; 180 else { 181 st->ustate = U5; 182 st->keepc[0] = **inbuf; 183 } 184 } 185 } 186 break; 187 case U1: /* 2 byte unicode */ 188 if ((**inbuf & 0xc0) == MSB || st->little_endian) { 189 utf8_len = 2; 190 st->keepc[1] = **inbuf; 191 192 /* 193 * Code conversion for UCS-2LE to support Samba 194 */ 195 if (st->little_endian) { 196 /* 197 * It's ASCII 198 */ 199 if (st->keepc[1] == 0 && (st->keepc[0] & 0x80) == 0) { 200 *(*outbuf)++ = st->keepc[0]; 201 (*outbytesleft)--; 202 st->ustate = U0; 203 break; 204 } 205 206 ucs = ((st->keepc[1] & 0xff) << 8) | (st->keepc[0] & 0xff); 207 208 } else 209 convert_utf8_to_ucs4((uchar_t*)(&st->keepc[0]), utf8_len, &ucs); 210 211 st->ustate = U4; 212 #ifdef DEBUG 213 fprintf(stderr, "UTF8: %02x%02x --> ", 214 st->keepc[0]&ONEBYTE, st->keepc[1]&ONEBYTE); 215 #endif 216 continue; /* should not advance *inbuf */ 217 } else { 218 st->_errno = errno = EILSEQ; 219 } 220 break; 221 case U2: /* 3 byte unicode - 2nd byte */ 222 223 first_byte = st->keepc[0]; 224 225 /* if the first byte is 0xed, it is illegal sequence if the second 226 * one is between 0xa0 and 0xbf because surrogate section is ill-formed 227 */ 228 if (((uchar_t)**inbuf) < valid_min_2nd_byte[first_byte] || 229 ((uchar_t)**inbuf) > valid_max_2nd_byte[first_byte] ) 230 st->_errno = errno = EILSEQ; 231 else { 232 st->ustate = U3; 233 st->keepc[1] = **inbuf; 234 } 235 break; 236 case U3: /* 3 byte unicode - 3rd byte */ 237 if ((**inbuf & 0xc0) == MSB) { 238 st->ustate = U4; 239 utf8_len = 3; 240 st->keepc[2] = **inbuf; 241 242 convert_utf8_to_ucs4((uchar_t*)(&st->keepc[0]), utf8_len, &ucs); 243 #ifdef DEBUG 244 fprintf(stderr, "UTF8: %02x%02x%02x --> ", st->keepc[0]&ONEBYTE, 245 st->keepc[1]&ONEBYTE, **inbuf&ONEBYTE); 246 #endif 247 continue; /* should not advance *inbuf */ 248 } else { 249 st->_errno = errno = EILSEQ; 250 } 251 break; 252 case U4: 253 254 n = get_big5_by_utf(ucs, &unidx, &big5code); 255 256 if ( n == -1 ) 257 { /* unicode is either 0xfffe or 0xffff */ 258 st->_errno = errno = EILSEQ; 259 break; 260 } 261 262 /* comment the following lines out to ignore the non-Big5 characters 263 if (n != 0) { * legal unicode;illegal Big5 * 264 st->_errno = errno = EILSEQ; 265 break; 266 } 267 */ 268 269 n = utf8_to_big5(unidx, big5code, 270 *outbuf, *outbytesleft, &uconv_num_internal); 271 if (n > 0) { 272 (*outbuf) += n; 273 (*outbytesleft) -= n; 274 275 uconv_num += uconv_num_internal; 276 277 st->ustate = U0; 278 } else { 279 st->_errno = errno = E2BIG; 280 } 281 break; 282 case U5: 283 284 first_byte = st->keepc[0]; 285 286 /* if the first byte is 0xf0, it is illegal sequence if 287 * the second one is between 0x80 and 0x8f 288 * for Four-Byte UTF: U+10000..U+10FFFF 289 */ 290 if (((uchar_t)**inbuf) < valid_min_2nd_byte[first_byte] || 291 ((uchar_t)**inbuf) > valid_max_2nd_byte[first_byte] ) 292 st->_errno = errno = EILSEQ; 293 else 294 { 295 st->ustate = U6; 296 st->keepc[1] = **inbuf; 297 } 298 break; 299 case U6: 300 if ((**inbuf & 0xc0) == MSB) /* 0x80..0xbf */ 301 { 302 st->ustate = U7; 303 st->keepc[2] = **inbuf; 304 } 305 else 306 st->_errno = errno = EILSEQ; 307 break; 308 case U7: 309 if ((**inbuf & 0xc0) == MSB) /* 0x80..0xbf */ 310 { /* replace with double NON_ID_CHARs */ 311 312 utf8_len = 4; 313 st->keepc[3] = **inbuf; 314 315 convert_utf8_to_ucs4((uchar_t*)(&st->keepc[0]), utf8_len, &ucs); 316 317 st->ustate = U4; 318 continue; 319 320 #if 0 321 if ( *outbytesleft < 2 ) 322 st->_errno = errno = E2BIG; 323 else 324 { 325 **outbuf = NON_ID_CHAR; 326 *(*outbuf+1) = NON_ID_CHAR; 327 (*outbytesleft) -= 2; 328 329 uconv_num++; 330 331 st->ustate = U0; 332 } 333 #endif 334 } 335 else 336 st->_errno = errno = EILSEQ; 337 break; 338 default: /* should never come here */ 339 st->_errno = errno = EILSEQ; 340 st->ustate = U0; /* reset state */ 341 break; 342 } 343 344 if (st->_errno) { 345 #ifdef DEBUG 346 fprintf(stderr, "!!!!!\tst->_errno = %d\tst->ustate = %d\n", 347 st->_errno, st->ustate); 348 #endif 349 break; 350 } 351 352 (*inbuf)++; 353 (*inbytesleft)--; 354 355 } 356 357 if (*inbytesleft == 0 && st->ustate != U0) 358 errno = EINVAL; 359 360 if (*inbytesleft > 0 && *outbytesleft == 0) 361 errno = E2BIG; 362 363 if (errno) { 364 int num_reversed_bytes = 0; 365 366 switch (st->ustate) 367 { 368 case U1: 369 num_reversed_bytes = 1; 370 break; 371 case U2: 372 num_reversed_bytes = 1; 373 break; 374 case U3: 375 num_reversed_bytes = 2; 376 break; 377 case U4: 378 num_reversed_bytes = utf8_len - 1; 379 break; 380 case U5: 381 num_reversed_bytes = 1; 382 break; 383 case U6: 384 num_reversed_bytes = 2; 385 break; 386 case U7: 387 num_reversed_bytes = 3; 388 break; 389 } 390 391 /* 392 * if error, *inbuf points to the byte following the last byte 393 * successfully used in the conversion. 394 */ 395 *inbuf -= num_reversed_bytes; 396 *inbytesleft += num_reversed_bytes; 397 st->ustate = U0; 398 return ((size_t) -1); 399 } 400 401 return uconv_num; 402 } 403 404 /* 405 * Match Big-5 code by UTF8 code; 406 * Return: = 0 - match from Unicode to Big-5 found 407 * = 1 - match from Unicode to Big-5 NOT found 408 * =-1 - illegal sequence 409 * 410 * Since binary search of the UTF8 to Big-5 table is necessary, might as well 411 * return index and Big-5 code matching to the unicode. 412 */ 413 static int get_big5_by_utf(uint_t ucs, int *unidx, unsigned long *big5code) 414 { 415 /* 0xfffe and 0xffff should not be allowed */ 416 if ( ucs == 0xFFFE || ucs == 0xFFFF ) return -1; 417 418 *unidx = binsearch(ucs, utf_big5_tab, MAX_BIG5_NUM); 419 if ((*unidx) >= 0) 420 *big5code = utf_big5_tab[*unidx].big5code; 421 else 422 return(1); /* match from UTF8 to Big-5 not found */ 423 #ifdef DEBUG 424 fprintf(stderr, "Unicode=%04x, idx=%5d, Big-5=%x ", ucs, *unidx, *big5code); 425 #endif 426 427 return(0); 428 } 429 430 431 /* 432 * ISO/IEC 10646 (Unicode) --> Big-5 433 * Unicode --> UTF8 (FSS-UTF) 434 * (File System Safe Universal Character Set Transformation Format) 435 * Return: > 0 - converted with enough space in output buffer 436 * = 0 - no space in outbuf 437 */ 438 static int utf8_to_big5(int unidx, unsigned long big5code, char *buf, size_t buflen, int *uconv_num) 439 { 440 unsigned long val; /* Big-5 value */ 441 char c1, c2, big5_str[3]; 442 443 if (buflen < 2) { 444 errno = E2BIG; 445 return(0); 446 } 447 448 if (unidx < 0) { /* no match from UTF8 to Big-5 */ 449 *buf = *(buf+1) = NON_ID_CHAR; 450 451 /* non-identical conversion */ 452 *uconv_num = 1; 453 454 } else { 455 val = big5code & 0xffff; 456 c1 = (char) ((val & 0xff00) >> 8); 457 c2 = (char) (val & 0xff); 458 459 *buf = big5_str[0] = c1; 460 *(buf+1) = big5_str[1] = c2; 461 big5_str[2] = NULL; 462 } 463 464 #ifdef DEBUG 465 fprintf(stderr, "\t->%x %x<-\n", *buf, *(buf+1)); 466 #endif 467 468 return(2); 469 } 470 471 472 /* binsearch: find x in v[0] <= v[1] <= ... <= v[n-1] */ 473 static int binsearch(unsigned long x, utf_big5 v[], int n) 474 { 475 int low, high, mid; 476 477 low = 0; 478 high = n - 1; 479 while (low <= high) { 480 mid = (low + high) / 2; 481 if (x < v[mid].unicode) 482 high = mid - 1; 483 else if (x > v[mid].unicode) 484 low = mid + 1; 485 else /* found match */ 486 return mid; 487 } 488 return (-1); /* no match */ 489 } 490