1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright (c) 2000, by Sun Microsystems, Inc. 24 * All rights reserved. 25 */ 26 27 #include <stdio.h> 28 #include <stdlib.h> 29 #include <sys/types.h> 30 #include <sys/isa_defs.h> 31 #include <errno.h> 32 #include "common_defs.h" 33 #include "big5hk_unicode.h" /* HKSCS to Unicode mapping table */ 34 35 #define MSB 0x80 /* most significant bit */ 36 #define MBYTE 0x8e /* multi-byte (4 byte character) */ 37 #define PMASK 0xa0 /* plane number mask */ 38 #define ONEBYTE 0xff /* right most byte */ 39 40 /* non-identified character */ 41 #define UTF8_NON_ID_CHAR1 0xEF 42 #define UTF8_NON_ID_CHAR2 0xBF 43 #define UTF8_NON_ID_CHAR3 0xBD 44 45 46 typedef struct _icv_state { 47 char keepc[2]; /* maximum # byte of HKSCS code */ 48 short cstate; /* state machine id */ 49 int _errno; /* internal errno */ 50 boolean little_endian; 51 boolean bom_written; 52 }_iconv_st; 53 54 enum _CSTATE { C0, C1 }; 55 56 static int hkscs_2nd_byte(char); 57 static int hkscs_to_utf8(_iconv_st *, char*, size_t, int *); 58 static int binsearch(unsigned long, hkscs_utf[], int); 59 60 61 /* 62 * Open; called from iconv_open() 63 */ 64 void * 65 _icv_open() 66 { 67 _iconv_st *st; 68 69 if ((st = (_iconv_st *)malloc(sizeof(_iconv_st))) == NULL) { 70 errno = ENOMEM; 71 return ((void *) -1); 72 } 73 74 st->cstate = C0; 75 st->_errno = 0; 76 st->little_endian = false; 77 st->bom_written = false; 78 #if defined(UCS_2LE) 79 st->little_endian = true; 80 st->bom_written = true; 81 #endif 82 return ((void *) st); 83 } 84 85 86 /* 87 * Close; called from iconv_close() 88 */ 89 void 90 _icv_close(_iconv_st *st) 91 { 92 if (!st) 93 errno = EBADF; 94 else 95 free(st); 96 } 97 98 99 /* 100 * Actual conversion; called from iconv() 101 */ 102 /*======================================================= 103 * 104 * State Machine for interpreting HKSCS code 105 * 106 *======================================================= 107 * 108 * 1st C 109 * +--------> C0 ----------> C1 110 * | ascii | 2nd C | 111 * ^ v v 112 * +----<-----+-----<--------+ 113 * 114 *=======================================================*/ 115 /* 116 * HKSCS encoding range: 117 * High byte: 0x81 - 0xFE 118 * Low byte: 0x40 - 0x7E, 0xA1 - 0xFE 119 * 120 * For HKSCS: 121 * 0x8140 - 0x8DFE ( 641 encoding space) 122 * 0x8E40 - 0xA0FE ( 2898 encoding space) 123 * 0xC6A1 - 0xC8FE ( 359 encoding space) 124 * 0xF9D6 - 0xF9FE ( 41 encoding space) 125 * 0xFA40 - 0xFEFE ( 763 encoding space) 126 * Total: 4702 127 * For BIG5 128 * 0xA140 - 0xC8FE 129 * 0xC940 - 0xFEFE 130 */ 131 size_t 132 _icv_iconv(_iconv_st *st, char **inbuf, size_t *inbytesleft, 133 char **outbuf, size_t *outbytesleft) 134 { 135 int n; 136 int uconv_num = 0; 137 138 #ifdef DEBUG 139 fprintf(stderr, "========== iconv(): HKSCS --> UTF2 ==========\n"); 140 #endif 141 if (st == NULL) { 142 errno = EBADF; 143 return ((size_t) -1); 144 } 145 146 if (inbuf == NULL || *inbuf == NULL) { /* Reset request. */ 147 st->cstate = C0; 148 st->_errno = 0; 149 return ((size_t) 0); 150 } 151 152 st->_errno = 0; /* reset internal errno */ 153 errno = 0; /* reset external errno */ 154 155 /* a state machine for interpreting CNS 11643 code */ 156 while (*inbytesleft > 0 && *outbytesleft > 0) { 157 switch (st->cstate) { 158 case C0: /* assuming ASCII in the beginning */ 159 if (**inbuf & MSB) { 160 st->keepc[0] = (**inbuf); 161 st->cstate = C1; 162 } else { /* real ASCII */ 163 /* 164 * code conversion for UCS-2LE to support Samba 165 */ 166 if (st->little_endian) { 167 if (!st->bom_written) { 168 if (*outbytesleft < 4) 169 errno = E2BIG; 170 else { 171 *(*outbuf)++ = (uchar_t)0xff; 172 *(*outbuf)++ = (uchar_t)0xfe; 173 *outbytesleft -= 2; 174 175 st->bom_written = true; 176 } 177 } 178 179 if (*outbytesleft < 2) 180 errno = E2BIG; 181 else { 182 *(*outbuf)++ = **inbuf; 183 *(*outbuf)++ = (uchar_t)0x0; 184 *outbytesleft -= 2; 185 } 186 } else { 187 **outbuf = **inbuf; 188 (*outbuf)++; 189 (*outbytesleft)--; 190 } 191 } 192 break; 193 case C1: /* Chinese characters: 2nd byte */ 194 if (hkscs_2nd_byte(**inbuf) == 0) { 195 int uconv_num_internal = 0; 196 197 st->keepc[1] = (**inbuf); 198 n = hkscs_to_utf8(st, *outbuf, 199 *outbytesleft, &uconv_num_internal); 200 if (n > 0) { 201 (*outbuf) += n; 202 (*outbytesleft) -= n; 203 204 uconv_num += uconv_num_internal; 205 206 st->cstate = C0; 207 } else { /* don't reset state */ 208 st->_errno = errno = E2BIG; 209 } 210 } else { /* input char doesn't belong 211 * to the input code set 212 */ 213 st->_errno = errno = EILSEQ; 214 } 215 break; 216 default: /* should never come here */ 217 st->_errno = errno = EILSEQ; 218 st->cstate = C0; /* reset state */ 219 break; 220 } 221 222 if (st->_errno) { 223 #ifdef DEBUG 224 fprintf(stderr, "!!!!!\tst->_errno = %d\tst->cstate = %d\n", 225 st->_errno, st->cstate); 226 #endif 227 break; 228 } 229 230 (*inbuf)++; 231 (*inbytesleft)--; 232 } 233 234 if (*inbytesleft == 0 && st->cstate != C0) 235 errno = EINVAL; 236 237 if (*inbytesleft > 0 && *outbytesleft == 0) 238 errno = E2BIG; 239 240 if (errno) { 241 /* 242 * if error, *inbuf points to the byte following the last byte 243 * successfully used in the conversion. 244 */ 245 *inbuf -= (st->cstate - C0); 246 *inbytesleft += (st->cstate - C0); 247 st->cstate = C0; 248 return ((size_t) -1); 249 } 250 251 return uconv_num; 252 } 253 254 255 /* 256 * Test whether inbuf is a valid character for 2nd byte HKSCS code 257 * Return: = 0 - valid HKSCS 2nd byte 258 * = 1 - invalid HKSCS 2nd byte 259 */ 260 static int hkscs_2nd_byte(char inbuf) 261 { 262 unsigned int buf = (unsigned int) (inbuf & ONEBYTE); 263 264 if ((buf >= 0x40) && (buf <= 0x7E)) 265 return (0); 266 if ((buf >= 0xA1) && (buf <= 0xFE)) 267 return (0); 268 269 return(1); 270 } 271 272 #ifdef UDC_SUPPORT 273 typedef struct _udc_sect { 274 unsigned int start, end, count; 275 } UDC; 276 277 UDC udc[] = { 278 { 0x8140, 0x84FE, 0x274 } 279 }; 280 281 #define START_UNICODE 0xF0000 282 283 static int 284 ifUDC(UDC *udc, unsigned int code) 285 { 286 int i; 287 288 for (i=0; i < 1; ++i) 289 if (code >= udc[i].start && code <= udc[i].end) 290 { 291 unsigned char c1, c2, leading_c1; 292 293 c1 = (unsigned char)(code >> 8); 294 c2 = (unsigned char)code; 295 leading_c1 = (unsigned char) (udc[i].start >> 8); 296 297 return START_UNICODE + (i ? udc[i-1].count : 0) + \ 298 (c1 - leading_c1) * 157 + ((c2 <= 0x7E) ? (c2 - 0x40) : ((c2 - 0x40) - (0xA1 - 0x7F))); 299 } 300 301 return 0; 302 } 303 #endif 304 305 /* 306 * HKSCS code --> ISO/IEC 10646 (Unicode) 307 * Unicode --> UTF8 (FSS-UTF) 308 * (File System Safe Universal Character Set Transformation Format) 309 * Return: > 0 - converted with enough space in output buffer 310 * = 0 - no space in outbuf 311 */ 312 static int hkscs_to_utf8(_iconv_st *st, char *buf, size_t buflen, int *uconv_num) 313 { 314 unsigned long hkscs_val; /* HKSCS value */ 315 int unidx = 0; /* Unicode index */ 316 unsigned long uni_val = 0; /* Unicode */ 317 char *keepc = st->keepc; 318 319 hkscs_val = ((keepc[0]&ONEBYTE) << 8) + (keepc[1]&ONEBYTE); 320 #ifdef DEBUG 321 fprintf(stderr, "%x\t", hkscs_val); 322 #endif 323 324 #ifdef UDC_SUPPORT 325 if ((uni_val = ifUDC(udc, hkscs_val)) == 0) { 326 #endif 327 unidx = binsearch(hkscs_val, hkscs_utf_tab, MAX_HKSCS_NUM); 328 if (unidx >= 0) 329 uni_val = hkscs_utf_tab[unidx].unicode; 330 #ifdef UDC_SUPPORT 331 } 332 #endif 333 #ifdef DEBUG 334 fprintf(stderr, "unidx = %d, unicode = %x\t", unidx, uni_val); 335 #endif 336 337 /* 338 * Code version for UCS-2LE to support Samba 339 */ 340 if (st->little_endian) { 341 int size = 0; 342 343 if (unidx < 0 || uni_val > 0x00ffff ) { 344 uni_val = ICV_CHAR_UCS2_REPLACEMENT; 345 *uconv_num = 1; 346 } 347 348 if (!st->bom_written) { 349 if (buflen < 4) 350 return 0; 351 352 *(buf + size++) = (uchar_t)0xff; 353 *(buf + size++) = (uchar_t)0xfe; 354 st->bom_written = true; 355 } 356 357 if (buflen < 2) 358 return 0; 359 360 *(buf + size++) = (uchar_t)(uni_val & 0xff); 361 *(buf + size++) = (uchar_t)((uni_val >> 8) & 0xff); 362 363 return size; 364 } 365 366 if (unidx >= 0) { /* do Unicode to UTF8 conversion */ 367 if (uni_val >= 0x0080 && uni_val <= 0x07ff) { 368 if (buflen < 2) { 369 #ifdef DEBUG 370 fprintf(stderr, "outbuf overflow in hkscs_to_utf8()!!\n"); 371 #endif 372 errno = E2BIG; 373 return(0); 374 } 375 *buf = (char)((uni_val >> 6) & 0x1f) | 0xc0; 376 *(buf+1) = (char)(uni_val & 0x3f) | 0x80; 377 #ifdef DEBUG 378 fprintf(stderr, "%x %x\n", *buf&ONEBYTE, *(buf+1)&ONEBYTE); 379 #endif 380 return(2); 381 } 382 if (uni_val >= 0x0800 && uni_val <= 0xffff) { 383 if (buflen < 3) { 384 #ifdef DEBUG 385 fprintf(stderr, "outbuf overflow in hkscs_to_utf8()!!\n"); 386 #endif 387 errno = E2BIG; 388 return(0); 389 } 390 *buf = (char)((uni_val >> 12) & 0xf) | 0xe0; 391 *(buf+1) = (char)((uni_val >>6) & 0x3f) | 0x80; 392 *(buf+2) = (char)(uni_val & 0x3f) | 0x80; 393 #ifdef DEBUG 394 fprintf(stderr, "%x %x %x\n", *buf&ONEBYTE, *(buf+1)&ONEBYTE, *(buf+2)&ONEBYTE); 395 #endif 396 return(3); 397 } 398 if (uni_val >= 0x10000 && uni_val <= 0x10ffff) { 399 if (buflen < 4) 400 { 401 errno = E2BIG; 402 return 0; 403 } 404 *buf = (char)((uni_val >> 18) & 0x7) | 0xf0; 405 *(buf+1) = (char)((uni_val >> 12) & 0x3f) | 0x80; 406 *(buf+2) = (char)((uni_val >>6) & 0x3f) | 0x80; 407 *(buf+3) = (char)(uni_val & 0x3f) | 0x80; 408 return(4); 409 } 410 } 411 412 /* can't find a match in HKSCS --> UTF8 table or illegal UTF8 code */ 413 if (buflen < 3) { 414 #ifdef DEBUG 415 fprintf(stderr, "outbuf overflow in hkscs_to_utf8()!!\n"); 416 #endif 417 errno = E2BIG; 418 return(0); 419 } 420 421 *(unsigned char*) buf = UTF8_NON_ID_CHAR1; 422 *(unsigned char*)(buf+1) = UTF8_NON_ID_CHAR2; 423 *(unsigned char*)(buf+2) = UTF8_NON_ID_CHAR3; 424 425 /* non-identical conversion */ 426 *uconv_num = 1; 427 428 #ifdef DEBUG 429 fprintf(stderr, "%c %c %c\n", *buf, *(buf+1), *(buf+2)); 430 #endif 431 return(3); 432 } 433 434 435 /* binsearch: find x in v[0] <= v[1] <= ... <= v[n-1] */ 436 static int binsearch(unsigned long x, hkscs_utf v[], int n) 437 { 438 int low, high, mid; 439 440 low = 0; 441 high = n - 1; 442 while (low <= high) { 443 mid = (low + high) / 2; 444 if (x < v[mid].hkscscode) 445 high = mid - 1; 446 else if (x > v[mid].hkscscode) 447 low = mid + 1; 448 else /* found match */ 449 return mid; 450 } 451 return (-1); /* no match */ 452 } 453