1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright (c) 1995, by Sun Microsystems, Inc. 24 * All rights reserved. 25 */ 26 27 #include <stdio.h> 28 #include <stdlib.h> 29 #include <sys/types.h> 30 #include <sys/isa_defs.h> 31 #include <errno.h> 32 #include "common_defs.h" 33 #include "big5_unicode.h" /* Big-5 to Unicode mapping table */ 34 35 #define MSB 0x80 /* most significant bit */ 36 #define MBYTE 0x8e /* multi-byte (4 byte character) */ 37 #define PMASK 0xa0 /* plane number mask */ 38 #define ONEBYTE 0xff /* right most byte */ 39 40 /* non-identified character */ 41 #define UTF8_NON_ID_CHAR1 0xEF 42 #define UTF8_NON_ID_CHAR2 0xBF 43 #define UTF8_NON_ID_CHAR3 0xBD 44 45 46 typedef struct _icv_state { 47 char keepc[2]; /* maximum # byte of Big-5 code */ 48 short cstate; /* state machine id */ 49 int _errno; /* internal errno */ 50 boolean little_endian; 51 boolean bom_written; 52 }_iconv_st; 53 54 enum _CSTATE { C0, C1 }; 55 56 static int big5_2nd_byte(char); 57 static int big5_to_utf8(_iconv_st *, char*, size_t, int *); 58 static int binsearch(unsigned long, big5_utf[], int); 59 60 61 /* 62 * Open; called from iconv_open() 63 */ 64 void * 65 _icv_open() 66 { 67 _iconv_st *st; 68 69 if ((st = (_iconv_st *)malloc(sizeof(_iconv_st))) == NULL) { 70 errno = ENOMEM; 71 return ((void *) -1); 72 } 73 74 st->cstate = C0; 75 st->_errno = 0; 76 st->little_endian = false; 77 st->bom_written = false; 78 #if defined(UCS_2LE) 79 st->little_endian = true; 80 st->bom_written = true; 81 #endif 82 return ((void *) st); 83 } 84 85 86 /* 87 * Close; called from iconv_close() 88 */ 89 void 90 _icv_close(_iconv_st *st) 91 { 92 if (!st) 93 errno = EBADF; 94 else 95 free(st); 96 } 97 98 99 /* 100 * Actual conversion; called from iconv() 101 */ 102 /*======================================================= 103 * 104 * State Machine for interpreting Big-5 code 105 * 106 *======================================================= 107 * 108 * 1st C 109 * +--------> C0 ----------> C1 110 * | ascii | 2nd C | 111 * ^ v v 112 * +----<-----+-----<--------+ 113 * 114 *=======================================================*/ 115 /* 116 * Big-5 encoding range: 117 * High byte: 0xA1 - 0xFE ( 94 encoding space) 118 * Low byte: 0x40 - 0x7E, 0xA1 - 0xFE ( 157 encoding space) 119 * Plane #1: 0xA140 - 0xC8FE ( 6280 encoding space) 120 * Plane #2: 0xC940 - 0xFEFE ( 8478 encoding space) 121 * Total: 94 * 157 = 14,758 (14758 encoding space) 122 */ 123 size_t 124 _icv_iconv(_iconv_st *st, char **inbuf, size_t *inbytesleft, 125 char **outbuf, size_t *outbytesleft) 126 { 127 int n; 128 int uconv_num = 0; 129 130 #ifdef DEBUG 131 fprintf(stderr, "========== iconv(): Big-5 --> UTF2 ==========\n"); 132 #endif 133 if (st == NULL) { 134 errno = EBADF; 135 return ((size_t) -1); 136 } 137 138 if (inbuf == NULL || *inbuf == NULL) { /* Reset request. */ 139 st->cstate = C0; 140 st->_errno = 0; 141 return ((size_t) 0); 142 } 143 144 st->_errno = 0; /* reset internal errno */ 145 errno = 0; /* reset external errno */ 146 147 /* a state machine for interpreting CNS 11643 code */ 148 while (*inbytesleft > 0 && *outbytesleft > 0) { 149 switch (st->cstate) { 150 case C0: /* assuming ASCII in the beginning */ 151 if (**inbuf & MSB) { 152 st->keepc[0] = (**inbuf); 153 st->cstate = C1; 154 } else { /* real ASCII */ 155 if (st->little_endian) { 156 if (!st->bom_written) { 157 if (*outbytesleft < 4) 158 errno = E2BIG; 159 else { 160 *(*outbuf)++ = (uchar_t)0xff; 161 *(*outbuf)++ = (uchar_t)0xfe; 162 *outbytesleft -= 2; 163 164 st->bom_written = true; 165 } 166 } 167 168 if (*outbytesleft < 2) 169 return E2BIG; 170 else { 171 *(*outbuf)++ = **inbuf; 172 *(*outbuf)++ = (uchar_t)0x0; 173 *outbytesleft -= 2; 174 } 175 } else { 176 **outbuf = **inbuf; 177 (*outbuf)++; 178 (*outbytesleft)--; 179 } 180 } 181 break; 182 case C1: /* Chinese characters: 2nd byte */ 183 if (big5_2nd_byte(**inbuf) == 0) { 184 int uconv_num_internal = 0; 185 186 st->keepc[1] = (**inbuf); 187 n = big5_to_utf8(st, *outbuf, 188 *outbytesleft, &uconv_num_internal); 189 if (n > 0) { 190 (*outbuf) += n; 191 (*outbytesleft) -= n; 192 193 uconv_num += uconv_num_internal; 194 195 st->cstate = C0; 196 } else { /* don't reset state */ 197 st->_errno = errno = E2BIG; 198 } 199 } else { /* input char doesn't belong 200 * to the input code set 201 */ 202 st->_errno = errno = EILSEQ; 203 } 204 break; 205 default: /* should never come here */ 206 st->_errno = errno = EILSEQ; 207 st->cstate = C0; /* reset state */ 208 break; 209 } 210 211 if (st->_errno) { 212 #ifdef DEBUG 213 fprintf(stderr, "!!!!!\tst->_errno = %d\tst->cstate = %d\n", 214 st->_errno, st->cstate); 215 #endif 216 break; 217 } 218 219 (*inbuf)++; 220 (*inbytesleft)--; 221 } 222 223 if (*inbytesleft == 0 && st->cstate != C0) 224 errno = EINVAL; 225 226 if (*inbytesleft > 0 && *outbytesleft == 0) 227 errno = E2BIG; 228 229 if (errno) { 230 /* 231 * if error, *inbuf points to the byte following the last byte 232 * successfully used in the conversion. 233 */ 234 *inbuf -= (st->cstate - C0); 235 *inbytesleft += (st->cstate - C0); 236 st->cstate = C0; 237 return ((size_t) -1); 238 } 239 240 return uconv_num; 241 } 242 243 244 /* 245 * Test whether inbuf is a valid character for 2nd byte Big-5 code 246 * Return: = 0 - valid Big-5 2nd byte 247 * = 1 - invalid Big-5 2nd byte 248 */ 249 static int big5_2nd_byte(char inbuf) 250 { 251 unsigned int buf = (unsigned int) (inbuf & ONEBYTE); 252 253 if ((buf >= 0x40) && (buf <= 0x7E)) 254 return (0); 255 if ((buf >= 0xA1) && (buf <= 0xFE)) 256 return (0); 257 return(1); 258 } 259 260 #ifdef UDC_SUPPORT 261 typedef struct _udc_sect { 262 unsigned int start, end, count; 263 } UDC; 264 265 UDC udc[] = { 266 { 0xFA40, 0xFEFE, 0x311 } 267 }; 268 269 #define UDC_START_UNICODE 0xF0000 270 271 static int 272 ifUDC(UDC *udc, unsigned int code) 273 { 274 int i; 275 276 for (i=0; i < 1; ++i) 277 if (code >= udc[i].start && code <= udc[i].end) 278 { 279 unsigned char c1, c2, leading_c1; 280 281 c1 = (unsigned char)(code >> 8); 282 c2 = (unsigned char)code; 283 leading_c1 = (unsigned char) (udc[i].start >> 8); 284 285 return UDC_START_UNICODE + (i ? udc[i-1].count : 0) + \ 286 (c1 - leading_c1) * 157 + ((c2 <= 0x7E) ? (c2 - 0x40) : ((c2 - 0x40) - (0xA1 - 0x7F))); 287 } 288 289 return 0; 290 } 291 #endif 292 293 /* 294 * Big-5 code --> ISO/IEC 10646 (Unicode) 295 * Unicode --> UTF8 (FSS-UTF) 296 * (File System Safe Universal Character Set Transformation Format) 297 * Return: > 0 - converted with enough space in output buffer 298 * = 0 - no space in outbuf 299 */ 300 static int big5_to_utf8(_iconv_st *st, char *buf, size_t buflen, int *uconv_num) 301 { 302 unsigned long big5_val; /* Big-5 value */ 303 int unidx = 0; /* Unicode index */ 304 unsigned long uni_val = 0; /* Unicode */ 305 char *keepc = st->keepc; 306 307 big5_val = ((keepc[0]&ONEBYTE) << 8) + (keepc[1]&ONEBYTE); 308 #ifdef DEBUG 309 fprintf(stderr, "%x\t", big5_val); 310 #endif 311 312 #ifdef UDC_SUPPORT 313 if ((uni_val = ifUDC(udc, big5_val)) == 0) { 314 #endif 315 unidx = binsearch(big5_val, big5_utf_tab, MAX_BIG5_NUM); 316 if (unidx >= 0) 317 318 uni_val = big5_utf_tab[unidx].unicode; 319 #ifdef UDC_SUPPORT 320 } 321 #endif 322 #ifdef DEBUG 323 fprintf(stderr, "unidx = %d, unicode = %x\t", unidx, uni_val); 324 #endif 325 326 /* 327 * Code conversion for UCS-2LE to support Samba 328 */ 329 if (st->little_endian) { 330 int size = 0; 331 332 if (unidx < 0 || uni_val > 0x00ffff ) { 333 uni_val = ICV_CHAR_UCS2_REPLACEMENT; 334 *uconv_num = 1; 335 } 336 337 if (!st->bom_written) { 338 if (buflen < 4) 339 return 0; 340 341 *(buf + size++) = (uchar_t)0xff; 342 *(buf + size++) = (uchar_t)0xfe; 343 st->bom_written = true; 344 } 345 346 if (buflen < 2) 347 return 0; 348 349 *(buf + size++) = (uchar_t)(uni_val & 0xff); 350 *(buf + size++) = (uchar_t)((uni_val >> 8) & 0xff); 351 352 return size; 353 } 354 355 if (unidx >= 0) { /* do Unicode to UTF8 conversion */ 356 if (uni_val >= 0x0080 && uni_val <= 0x07ff) { 357 if (buflen < 2) { 358 #ifdef DEBUG 359 fprintf(stderr, "outbuf overflow in big5_to_utf8()!!\n"); 360 #endif 361 errno = E2BIG; 362 return(0); 363 } 364 *buf = (char)((uni_val >> 6) & 0x1f) | 0xc0; 365 *(buf+1) = (char)(uni_val & 0x3f) | 0x80; 366 #ifdef DEBUG 367 fprintf(stderr, "%x %x\n", *buf&ONEBYTE, *(buf+1)&ONEBYTE); 368 #endif 369 return(2); 370 } 371 if (uni_val >= 0x0800 && uni_val <= 0xffff) { 372 if (buflen < 3) { 373 #ifdef DEBUG 374 fprintf(stderr, "outbuf overflow in big5_to_utf8()!!\n"); 375 #endif 376 errno = E2BIG; 377 return(0); 378 } 379 *buf = (char)((uni_val >> 12) & 0xf) | 0xe0; 380 *(buf+1) = (char)((uni_val >>6) & 0x3f) | 0x80; 381 *(buf+2) = (char)(uni_val & 0x3f) | 0x80; 382 #ifdef DEBUG 383 fprintf(stderr, "%x %x %x\n", *buf&ONEBYTE, *(buf+1)&ONEBYTE, *(buf+2)&ONEBYTE); 384 #endif 385 return(3); 386 } 387 if (uni_val >= 0x10000 && uni_val <= 0x10ffff) { 388 if (buflen < 4) { 389 errno = E2BIG; 390 return 0; 391 } 392 393 *buf = (char) ((uni_val >> 18 ) & 0x7) | 0xf0; 394 *(buf+1) = (char) ((uni_val >> 12) & 0x3f) | 0x80; 395 *(buf+2) = (char) ((uni_val >> 6) & 0x3f) | 0x80; 396 *(buf+3) = (char) (uni_val & 0x3f) | 0x80; 397 398 return 4; 399 } 400 } 401 402 /* can't find a match in Big-5 --> UTF8 table or illegal UTF8 code */ 403 if (buflen < 3) { 404 #ifdef DEBUG 405 fprintf(stderr, "outbuf overflow in big5_to_utf8()!!\n"); 406 #endif 407 errno = E2BIG; 408 return(0); 409 } 410 411 *(unsigned char*) buf = UTF8_NON_ID_CHAR1; 412 *(unsigned char*)(buf+1) = UTF8_NON_ID_CHAR2; 413 *(unsigned char*)(buf+2) = UTF8_NON_ID_CHAR3; 414 415 /* non-identical conversion */ 416 *uconv_num = 1; 417 418 #ifdef DEBUG 419 fprintf(stderr, "%c %c %c\n", *buf, *(buf+1), *(buf+2)); 420 #endif 421 return(3); 422 } 423 424 425 /* binsearch: find x in v[0] <= v[1] <= ... <= v[n-1] */ 426 static int binsearch(unsigned long x, big5_utf v[], int n) 427 { 428 int low, high, mid; 429 430 low = 0; 431 high = n - 1; 432 while (low <= high) { 433 mid = (low + high) / 2; 434 if (x < v[mid].big5code) 435 high = mid - 1; 436 else if (x > v[mid].big5code) 437 low = mid + 1; 438 else /* found match */ 439 return mid; 440 } 441 return (-1); /* no match */ 442 } 443