1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright (c) 1995, by Sun Microsystems, Inc. 24 * All rights reserved. 25 */ 26 27 #include <stdio.h> 28 #include <stdlib.h> 29 #include <sys/types.h> 30 #include <errno.h> 31 #include "unicode_big5p.h" /* UTF8 to Big-5 Plus mapping table */ 32 #include "common_defs.h" 33 34 #define MSB 0x80 /* most significant bit */ 35 #define ONEBYTE 0xff /* right most byte */ 36 37 #define NON_ID_CHAR '?' /* non-identified character */ 38 39 typedef struct _icv_state { 40 char keepc[6]; /* maximum # byte of UTF8 code */ 41 short ustate; 42 int _errno; /* internal errno */ 43 } _iconv_st; 44 45 enum _USTATE { U0, U1, U2, U3, U4, U5, U6, U7 }; 46 47 static int get_big5p_by_utf(char, char, int *, unsigned long *); 48 static int utf8_to_big5p(int, unsigned long, char *, size_t); 49 static int binsearch(unsigned long, utf_big5p[], int); 50 51 52 /* 53 * Open; called from iconv_open() 54 */ 55 void * 56 _icv_open() 57 { 58 _iconv_st *st; 59 60 if ((st = (_iconv_st *)malloc(sizeof(_iconv_st))) == NULL) { 61 errno = ENOMEM; 62 return ((void *) -1); 63 } 64 65 st->ustate = U0; 66 st->_errno = 0; 67 68 return ((void *) st); 69 } 70 71 72 /* 73 * Close; called from iconv_close() 74 */ 75 void 76 _icv_close(_iconv_st *st) 77 { 78 if (!st) 79 errno = EBADF; 80 else 81 free(st); 82 } 83 84 85 /* 86 * Actual conversion; called from iconv() 87 */ 88 /*========================================================= 89 * 90 * State Machine for interpreting UTF8 code 91 * 92 *========================================================= 93 * 94 * 2nd byte 3rd byte 4th byte 95 * +----->------->------->U5------>U6--------->U7 96 * | | 97 * | 3 byte unicode | 98 * +----->------->-------+ | 99 * | | | 100 * ^ v | 101 * | 2 byte U2 ---> U3 | 102 * | unicode v | 103 * +------> U0 -------> U1 +-------->U4---+ 104 * ^ ascii | | ^ | 105 * | | +-------->--------->--------+ | 106 * | v v 107 * +----<---+-----<------------<------------<------------+ 108 * 109 *=========================================================*/ 110 size_t 111 _icv_iconv(_iconv_st *st, char **inbuf, size_t *inbytesleft, 112 char **outbuf, size_t *outbytesleft) 113 { 114 char c1 = '\0', c2 = '\0'; 115 int n, unidx; 116 unsigned long big5pcode; 117 118 #ifdef DEBUG 119 fprintf(stderr, "========== iconv(): UTF2 --> Big-5 Plus ==========\n"); 120 #endif 121 if (st == NULL) { 122 errno = EBADF; 123 return ((size_t) -1); 124 } 125 126 if (inbuf == NULL || *inbuf == NULL) { /* Reset request. */ 127 st->ustate = U0; 128 st->_errno = 0; 129 return ((size_t) 0); 130 } 131 132 st->_errno = 0; /* reset internal errno */ 133 errno = 0; /* reset external errno */ 134 135 /* a state machine for interpreting UTF8 code */ 136 while (*inbytesleft > 0 && *outbytesleft > 0) { 137 138 uchar_t first_byte; 139 140 switch (st->ustate) { 141 case U0: /* assuming ASCII in the beginning */ 142 if ((**inbuf & MSB) == 0) { /* ASCII */ 143 **outbuf = **inbuf; 144 (*outbuf)++; 145 (*outbytesleft)--; 146 } else { /* Chinese character */ 147 if ((**inbuf & 0xe0) == 0xc0) { /* 2 byte unicode 0xc2..0xdf */ 148 149 /* invalid sequence if the first char is either 0xc0 or 0xc1 */ 150 if ( number_of_bytes_in_utf8_char[((uchar_t)**inbuf)] == ICV_TYPE_ILLEGAL_CHAR ) 151 st->_errno = errno = EILSEQ; 152 else { 153 st->ustate = U1; 154 st->keepc[0] = **inbuf; 155 } 156 } else if ((**inbuf & 0xf0) == 0xe0) { /* 3 byte 0xe0..0xef */ 157 st->ustate = U2; 158 st->keepc[0] = **inbuf; 159 } else { 160 /* currently the 16 planes are supported */ 161 if ( number_of_bytes_in_utf8_char[((uchar_t)**inbuf)] == ICV_TYPE_ILLEGAL_CHAR ) 162 st->_errno = errno = EILSEQ; 163 else 164 { 165 st->ustate = U5; 166 st->keepc[0] = **inbuf; 167 } 168 } 169 } 170 break; 171 case U1: /* 2 byte unicode */ 172 if ((**inbuf & 0xc0) == MSB) { 173 st->ustate = U4; 174 st->keepc[1] = **inbuf; 175 c1 = (st->keepc[0]&0x1c)>>2; 176 c2 = ((st->keepc[0]&0x03)<<6) | ((**inbuf)&0x3f); 177 #ifdef DEBUG 178 fprintf(stderr, "UTF8: %02x%02x --> ", 179 st->keepc[0]&ONEBYTE, st->keepc[1]&ONEBYTE); 180 #endif 181 continue; /* should not advance *inbuf */ 182 } else { 183 st->_errno = errno = EILSEQ; 184 } 185 break; 186 case U2: /* 3 byte unicode - 2nd byte */ 187 188 first_byte = st->keepc[0]; 189 190 /* if the first byte is 0xed, it is illegal sequence if the second 191 * one is between 0xa0 and 0xbf because the surrogate section is ill-formed 192 */ 193 if (((uchar_t)**inbuf) < valid_min_2nd_byte[first_byte] || 194 ((uchar_t)**inbuf) > valid_max_2nd_byte[first_byte] ) 195 st->_errno = errno = EILSEQ; 196 else { 197 st->ustate = U3; 198 st->keepc[1] = **inbuf; 199 } 200 break; 201 case U3: /* 3 byte unicode - 3rd byte */ 202 if ((**inbuf & 0xc0) == MSB) { 203 st->ustate = U4; 204 st->keepc[2] = **inbuf; 205 c1 = ((st->keepc[0]&0x0f)<<4) | 206 ((st->keepc[1]&0x3c)>>2); 207 c2 = ((st->keepc[1]&0x03)<<6) | ((**inbuf)&0x3f); 208 #ifdef DEBUG 209 fprintf(stderr, "UTF8: %02x%02x%02x --> ", st->keepc[0]&ONEBYTE, 210 st->keepc[1]&ONEBYTE, **inbuf&ONEBYTE); 211 #endif 212 continue; /* should not advance *inbuf */ 213 } else { 214 st->_errno = errno = EILSEQ; 215 } 216 break; 217 case U4: 218 n = get_big5p_by_utf(c1, c2, &unidx, &big5pcode); 219 if ( n == -1 ) { /* unicode is either 0xfffe or 0xffff */ 220 st->_errno = errno = EILSEQ; 221 break; 222 } 223 224 /* comment the following lines to ignore no Big5 plus characters 225 if (n != 0) { 226 st->_errno = errno = EILSEQ; 227 break; 228 } 229 */ 230 231 n = utf8_to_big5p(unidx, big5pcode, 232 *outbuf, *outbytesleft); 233 if (n > 0) { 234 (*outbuf) += n; 235 (*outbytesleft) -= n; 236 237 st->ustate = U0; 238 } else { 239 st->_errno = errno = E2BIG; 240 } 241 break; 242 case U5: 243 first_byte = st->keepc[0]; 244 245 /* if the first byte is 0xf0, it is illegal sequence if 246 * the second one is between 0x80 and 0x8f 247 * for Four-Byte UTF: U+10000..U+10FFFF 248 */ 249 if (((uchar_t)**inbuf) < valid_min_2nd_byte[first_byte] || 250 ((uchar_t)**inbuf) > valid_max_2nd_byte[first_byte] ) 251 st->_errno = errno = EILSEQ; 252 else 253 { 254 st->ustate = U6; 255 st->keepc[1] = **inbuf; 256 } 257 break; 258 case U6: 259 if ((**inbuf & 0xc0) == MSB) /* 0x80..0xbf */ 260 { 261 st->ustate = U7; 262 st->keepc[2] = **inbuf; 263 } 264 else 265 st->_errno = errno = EILSEQ; 266 break; 267 case U7: 268 if ((**inbuf & 0xc0) == MSB) /* 0x80..0xbf */ 269 { /* skip it */ 270 st->ustate = U0; 271 } 272 else 273 st->_errno = errno = EILSEQ; 274 break; 275 default: /* should never come here */ 276 st->_errno = errno = EILSEQ; 277 st->ustate = U0; /* reset state */ 278 break; 279 } 280 281 if (st->_errno) { 282 #ifdef DEBUG 283 fprintf(stderr, "!!!!!\tst->_errno = %d\tst->ustate = %d\n", 284 st->_errno, st->ustate); 285 #endif 286 break; 287 } 288 289 (*inbuf)++; 290 (*inbytesleft)--; 291 } 292 293 if (errno) return ((size_t) -1); 294 295 if (*inbytesleft == 0 && st->ustate != U0) { 296 errno = EINVAL; 297 return ((size_t) -1); 298 } 299 300 if (*inbytesleft > 0 && *outbytesleft == 0) { 301 errno = E2BIG; 302 return((size_t) -1); 303 } 304 return (*inbytesleft); 305 } 306 307 308 /* 309 * Match Big-5 Plus code by UTF8 code; 310 * Return: = 0 - match from Unicode to Big-5 Plus found 311 * = 1 - match from Unicode to Big-5 Plus NOT found 312 * =-1 - illegal sequence 313 * 314 * Since binary search of the UTF8 to Big-5 Plus table is necessary, might as well 315 * return index and Big-5 Plus code matching to the unicode. 316 */ 317 static int get_big5p_by_utf(char c1, char c2, int *unidx, unsigned long *big5pcode) 318 { 319 unsigned long unicode; 320 321 unicode = (unsigned long) ((c1 & ONEBYTE) << 8) + (c2 & ONEBYTE); 322 /* 0xfffe and 0xffff should not be allowed */ 323 if ( unicode == 0xFFFE || unicode == 0xFFFF ) return -1; 324 325 *unidx = binsearch(unicode, utf_big5p_tab, MAX_BIG5P_NUM); 326 if ((*unidx) >= 0) 327 *big5pcode = utf_big5p_tab[*unidx].big5pcode; 328 else 329 return(1); /* match from UTF8 to Big-5 Plus not found */ 330 #ifdef DEBUG 331 fprintf(stderr, "Unicode=%04x, idx=%5d, Big-5 Plus=%x ", unicode, *unidx, *big5pcode); 332 #endif 333 334 return(0); 335 } 336 337 338 /* 339 * ISO/IEC 10646 (Unicode) --> Big-5 Plus 340 * Unicode --> UTF8 (FSS-UTF) 341 * (File System Safe Universal Character Set Transformation Format) 342 * Return: > 0 - converted with enough space in output buffer 343 * = 0 - no space in outbuf 344 */ 345 static int utf8_to_big5p(int unidx, unsigned long big5pcode, char *buf, size_t buflen) 346 { 347 unsigned long val; /* Big-5 Plus value */ 348 char c1, c2, big5p_str[3]; 349 350 if (buflen < 2) { 351 errno = E2BIG; 352 return(0); 353 } 354 355 if (unidx < 0) { /* no match from UTF8 to Big-5 Plus */ 356 *buf = *(buf+1) = NON_ID_CHAR; 357 } else { 358 val = big5pcode & 0xffff; 359 c1 = (char) ((val & 0xff00) >> 8); 360 c2 = (char) (val & 0xff); 361 362 *buf = big5p_str[0] = c1; 363 *(buf+1) = big5p_str[1] = c2; 364 big5p_str[2] = NULL; 365 } 366 367 #ifdef DEBUG 368 fprintf(stderr, "\t->%x %x<-\n", *buf, *(buf+1)); 369 #endif 370 371 return(2); 372 } 373 374 375 /* binsearch: find x in v[0] <= v[1] <= ... <= v[n-1] */ 376 static int binsearch(unsigned long x, utf_big5p v[], int n) 377 { 378 int low, high, mid; 379 380 low = 0; 381 high = n - 1; 382 while (low <= high) { 383 mid = (low + high) / 2; 384 if (x < v[mid].unicode) 385 high = mid - 1; 386 else if (x > v[mid].unicode) 387 low = mid + 1; 388 else /* found match */ 389 return mid; 390 } 391 return (-1); /* no match */ 392 } 393