1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright (c) 1995, by Sun Microsystems, Inc. 24 * All rights reserved. 25 */ 26 27 #include <stdio.h> 28 #include <stdlib.h> 29 #include <errno.h> 30 #include "cns11643_unicode_TW.h" /* CNS 11643 to UTF8 mapping table */ 31 32 #define MSB 0x80 /* most significant bit */ 33 #define MBYTE 0x8e /* multi-byte (4 byte character) */ 34 #define PMASK 0xa0 /* plane number mask */ 35 #define ONEBYTE 0xff /* right most byte */ 36 #define MSB_OFF 0x7f /* mask off MBS */ 37 38 #define SI 0x0f /* shift in */ 39 #define SO 0x0e /* shift out */ 40 #define ESC 0x1b /* escape */ 41 42 /* 43 * static const char plane_char[] = "0GH23456789:;<=>?"; 44 * static const char plane_char[] = "0GHIJKLMNOPQRSTUV"; 45 * #define GET_PLANEC(i) (plane_char[i]) 46 */ 47 48 /* non-identified character */ 49 #define UTF8_NON_ID_CHAR1 0xEF 50 #define UTF8_NON_ID_CHAR2 0xBF 51 #define UTF8_NON_ID_CHAR3 0xBD 52 53 typedef struct _icv_state { 54 char keepc[4]; /* maximum # byte of CNS11643 code */ 55 short cstate; /* state machine id */ 56 int plane_no; /* plane number for Chinese character */ 57 int _errno; /* internal errno */ 58 } _iconv_st; 59 60 enum _CSTATE { C0, C1, C2, C3, C4, C5, C6, C7 }; 61 62 63 static int get_plane_no_by_iso(const char); 64 static int iso_to_utf8(int, char[], char*, size_t); 65 static int binsearch(unsigned long, cns_utf[], int); 66 67 68 /* 69 * Open; called from iconv_open() 70 */ 71 void * 72 _icv_open() 73 { 74 _iconv_st *st; 75 76 if ((st = (_iconv_st *)malloc(sizeof(_iconv_st))) == NULL) { 77 errno = ENOMEM; 78 return ((void *) -1); 79 } 80 81 st->cstate = C0; 82 st->plane_no = 0; 83 st->_errno = 0; 84 85 return ((void *) st); 86 } 87 88 89 /* 90 * Close; called from iconv_close() 91 */ 92 void 93 _icv_close(_iconv_st *st) 94 { 95 if (!st) 96 errno = EBADF; 97 else 98 free(st); 99 } 100 101 102 /* 103 * Actual conversion; called from iconv() 104 */ 105 /*========================================================================= 106 * 107 * State Machine for interpreting ISO 2022-7 code 108 * 109 *========================================================================= 110 * 111 * plane 2 - 16 112 * +---------->-------+ 113 * plane ^ | 114 * ESC $ ) number SO | plane 1 v 115 * +-> C0 ----> C1 ---> C2 ---> C3 ------> C4 --> C5 -------> C6 C7 116 * | | ascii | ascii | ascii | ascii | SI | | | | 117 * +----------------------------+ <-----+------+ +------<---+------+ 118 * ^ | 119 * | ascii v 120 * +---------<-------------<---------+ 121 * 122 *=========================================================================*/ 123 size_t 124 _icv_iconv(_iconv_st *st, char **inbuf, size_t *inbytesleft, 125 char **outbuf, size_t *outbytesleft) 126 { 127 int n; 128 129 #ifdef DEBUG 130 fprintf(stderr, "========== iconv(): ISO2022-7 --> UTF2 ==========\n"); 131 #endif 132 if (st == NULL) { 133 errno = EBADF; 134 return ((size_t) -1); 135 } 136 137 if (inbuf == NULL || *inbuf == NULL) { /* Reset request. */ 138 st->cstate = C0; 139 st->_errno = 0; 140 return ((size_t) 0); 141 } 142 143 st->_errno = 0; /* reset internal errno */ 144 errno = 0; /* reset external errno */ 145 146 /* a state machine for interpreting ISO 2022-7 code */ 147 while (*inbytesleft > 0 && *outbytesleft > 0) { 148 switch (st->cstate) { 149 case C0: /* assuming ASCII in the beginning */ 150 if (**inbuf == ESC) { 151 st->cstate = C1; 152 } else { /* real ASCII */ 153 **outbuf = **inbuf; 154 (*outbuf)++; 155 (*outbytesleft)--; 156 } 157 break; 158 case C1: /* got ESC, expecting $ */ 159 if (**inbuf == '$') { 160 st->cstate = C2; 161 } else { 162 **outbuf = ESC; 163 (*outbuf)++; 164 (*outbytesleft)--; 165 st->cstate = C0; 166 st->_errno = 0; 167 continue; /* don't advance inbuf */ 168 } 169 break; 170 case C2: /* got $, expecting ) */ 171 if (**inbuf == ')') { 172 st->cstate = C3; 173 } else { 174 if (*outbytesleft < 2) { 175 st->_errno = errno = E2BIG; 176 return((size_t)-1); 177 } 178 **outbuf = ESC; 179 *(*outbuf+1) = '$'; 180 (*outbuf) += 2; 181 (*outbytesleft) -= 2; 182 st->cstate = C0; 183 st->_errno = 0; 184 continue; /* don't advance inbuf */ 185 } 186 break; 187 case C3: /* got ) expecting G,H,I,...,V */ 188 st->plane_no = get_plane_no_by_iso(**inbuf); 189 if (st->plane_no > 0 ) { /* plane #1 - #16 */ 190 st->cstate = C4; 191 } else { 192 if (*outbytesleft < 3) { 193 st->_errno = errno = E2BIG; 194 return((size_t)-1); 195 } 196 **outbuf = ESC; 197 *(*outbuf+1) = '$'; 198 *(*outbuf+2) = ')'; 199 (*outbuf) += 3; 200 (*outbytesleft) -= 3; 201 st->cstate = C0; 202 st->_errno = 0; 203 continue; /* don't advance inbuf */ 204 } 205 break; 206 case C4: /* SI (Shift In) */ 207 if (**inbuf == ESC) { 208 st->cstate = C1; 209 break; 210 } 211 if (**inbuf == SO) { 212 #ifdef DEBUG 213 fprintf(stderr, "<-------------- SO -------------->\n"); 214 #endif 215 st->cstate = C5; 216 } else { /* ASCII */ 217 **outbuf = **inbuf; 218 (*outbuf)++; 219 (*outbytesleft)--; 220 st->cstate = C0; 221 st->_errno = 0; 222 } 223 break; 224 case C5: /* SO (Shift Out) */ 225 if (**inbuf == SI) { 226 #ifdef DEBUG 227 fprintf(stderr, ">-------------- SI --------------<\n"); 228 #endif 229 st->cstate = C4; 230 } else { /* 1st Chinese character */ 231 if (st->plane_no == 1) { 232 st->keepc[0] = (char) (**inbuf | MSB); 233 st->cstate = C6; 234 } else { /* plane #1 - #16 */ 235 st->keepc[0] = (char) MBYTE; 236 st->keepc[1] = (char) (PMASK + 237 st->plane_no); 238 st->keepc[2] = (char) (**inbuf | MSB); 239 st->cstate = C7; 240 } 241 } 242 break; 243 case C6: /* plane #1: 2nd Chinese character */ 244 st->keepc[1] = (char) (**inbuf | MSB); 245 st->keepc[2] = st->keepc[3] = NULL; 246 n = iso_to_utf8(1, st->keepc, *outbuf, 247 *outbytesleft); 248 if (n > 0) { 249 (*outbuf) += n; 250 (*outbytesleft) -= n; 251 } else { 252 st->_errno = errno; 253 return((size_t)-1); 254 } 255 st->cstate = C5; 256 break; 257 case C7: /* 4th Chinese character */ 258 st->keepc[3] = (char) (**inbuf | MSB); 259 n = iso_to_utf8(st->plane_no, st->keepc, *outbuf, 260 *outbytesleft); 261 if (n > 0) { 262 (*outbuf) += n; 263 (*outbytesleft) -= n; 264 } else { 265 st->_errno = errno; 266 return((size_t)-1); 267 } 268 st->cstate = C5; 269 break; 270 default: /* should never come here */ 271 st->_errno = errno = EILSEQ; 272 st->cstate = C0; /* reset state */ 273 break; 274 } 275 276 (*inbuf)++; 277 (*inbytesleft)--; 278 279 if (st->_errno) { 280 #ifdef DEBUG 281 fprintf(stderr, "!!!!!\tst->_errno = %d\tst->cstate = %d\tinbuf=%x\n", 282 st->_errno, st->cstate, **inbuf); 283 #endif 284 break; 285 } 286 if (errno) 287 return((size_t)-1); 288 } 289 290 if (*inbytesleft > 0 && *outbytesleft == 0) { 291 errno = E2BIG; 292 return((size_t)-1); 293 } 294 return (*inbytesleft); 295 } 296 297 298 /* 299 * Get plane number by ISO plane char; i.e. 'G' returns 1, 'H' returns 2, etc. 300 * Returns -1 on error conditions 301 */ 302 static int get_plane_no_by_iso(const char inbuf) 303 { 304 int ret; 305 unsigned char uc = (unsigned char) inbuf; 306 307 if (uc == '0') /* plane #0 */ 308 return(0); 309 310 ret = uc - 'F'; 311 switch (ret) { 312 case 1: /* 0x8EA1 - G */ 313 case 2: /* 0x8EA2 - H */ 314 case 3: /* 0x8EA3 - I */ 315 case 4: /* 0x8EA4 - J */ 316 case 5: /* 0x8EA5 - K */ 317 case 6: /* 0x8EA6 - L */ 318 case 7: /* 0x8EA7 - M */ 319 case 8: /* 0x8EA8 - N */ 320 case 9: /* 0x8EA9 - O */ 321 case 10: /* 0x8EAA - P */ 322 case 11: /* 0x8EAB - Q */ 323 case 12: /* 0x8EAC - R */ 324 case 13: /* 0x8EAD - S */ 325 case 14: /* 0x8EAE - T */ 326 case 15: /* 0x8EAF - U */ 327 case 16: /* 0x8EB0 - V */ 328 return (ret); 329 default: 330 return (-1); 331 } 332 } 333 334 335 /* 336 * ISO 2022-7 code --> ISO/IEC 10646 (Unicode) 337 * Unicode --> UTF8 (FSS-UTF) 338 * (File System Safe Universal Character Set Transformation Format) 339 * Return: > 0 - converted with enough space in output buffer 340 * = 0 - no space in outbuf 341 */ 342 static int iso_to_utf8(int plane_no, char keepc[], char *buf, size_t buflen) 343 { 344 char iso_str[3]; 345 unsigned long iso_val; /* ISO 2022-7 value */ 346 int unidx; /* Unicode index */ 347 unsigned long uni_val; /* Unicode */ 348 349 #ifdef DEBUG 350 fprintf(stderr, "%s %d ", keepc, plane_no); 351 #endif 352 if (plane_no == 1) { 353 iso_str[0] = keepc[0] & MSB_OFF; 354 iso_str[1] = keepc[1] & MSB_OFF; 355 } else { 356 iso_str[0] = keepc[2] & MSB_OFF; 357 iso_str[1] = keepc[3] & MSB_OFF; 358 } 359 iso_val = (iso_str[0] << 8) + iso_str[1]; 360 #ifdef DEBUG 361 fprintf(stderr, "%x\t", iso_val); 362 #endif 363 364 switch (plane_no) { 365 case 1: 366 unidx = binsearch(iso_val, cns1_utf_tab, MAX_CNS1_NUM); 367 if (unidx >= 0) 368 uni_val = cns1_utf_tab[unidx].unicode; 369 break; 370 case 2: 371 unidx = binsearch(iso_val, cns2_utf_tab, MAX_CNS2_NUM); 372 if (unidx >= 0) 373 uni_val = cns2_utf_tab[unidx].unicode; 374 break; 375 case 3: 376 case 14: 377 unidx = binsearch(iso_val, cns3_utf_tab, MAX_CNS3_NUM); 378 if (unidx >= 0) 379 uni_val = cns3_utf_tab[unidx].unicode; 380 break; 381 default: 382 unidx = -1; /* no mapping from CNS to UTF8 */ 383 break; 384 } 385 386 #ifdef DEBUG 387 fprintf(stderr, "unidx = %d, unicode = %x\t", unidx, uni_val); 388 #endif 389 390 if (unidx >= 0) { /* do Unicode to UTF8 conversion */ 391 if (uni_val > 0x0080 && uni_val <= 0x07ff) { 392 if (buflen < 2) { 393 errno = E2BIG; 394 return(0); 395 } 396 *buf = (char)((uni_val >> 6) & 0x1f) | 0xc0; 397 *(buf+1) = (char)(uni_val & 0x3f) | 0x80; 398 #ifdef DEBUG 399 fprintf(stderr, "%x %x\n", *buf&ONEBYTE, *(buf+1)&ONEBYTE); 400 #endif 401 return(2); 402 } 403 if (uni_val > 0x0800 && uni_val <= 0xffff) { 404 if (buflen < 3) { 405 errno = E2BIG; 406 return(0); 407 } 408 *buf = (char)((uni_val >> 12) & 0xf) | 0xe0; 409 *(buf+1) = (char)((uni_val >>6) & 0x3f) | 0x80; 410 *(buf+2) = (char)(uni_val & 0x3f) | 0x80; 411 #ifdef DEBUG 412 fprintf(stderr, "%x %x %x\n", *buf&ONEBYTE, *(buf+1)&ONEBYTE, *(buf+2)&ONEBYTE); 413 #endif 414 return(3); 415 } 416 } 417 418 /* can't find a match in CNS --> UTF8 table or illegal UTF8 code */ 419 if (buflen < 3) { 420 errno = E2BIG; 421 return(0); 422 } 423 424 *(unsigned char*) buf = UTF8_NON_ID_CHAR1; 425 *(unsigned char*) (buf+1) = UTF8_NON_ID_CHAR2; 426 *(unsigned char*) (buf+2) = UTF8_NON_ID_CHAR3; 427 428 #ifdef DEBUG 429 fprintf(stderr, "%c %c %c\n", *buf, *(buf+1), *(buf+2)); 430 #endif 431 return(3); 432 } 433 434 435 /* binsearch: find x in v[0] <= v[1] <= ... <= v[n-1] */ 436 static int binsearch(unsigned long x, cns_utf v[], int n) 437 { 438 int low, high, mid; 439 440 low = 0; 441 high = n - 1; 442 while (low <= high) { 443 mid = (low + high) / 2; 444 if (x < v[mid].cnscode) 445 high = mid - 1; 446 else if (x > v[mid].cnscode) 447 low = mid + 1; 448 else /* found match */ 449 return mid; 450 } 451 return (-1); /* no match */ 452 } 453