1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright (c) 1997, by Sun Microsystems, Inc. 24 * All rights reserved. 25 */ 26 27 #include <stdio.h> 28 #include <stdlib.h> 29 #include <errno.h> 30 #include <sys/types.h> 31 32 #include "tab_lookup.h" /* table lookup data types */ 33 34 #define MSB 0x80 /* most significant bit */ 35 #define ONEBYTE 0xff /* right most byte */ 36 37 enum _USTATE { U0, U1, U11, U2, U3, U4 }; 38 39 40 int get_ibm_by_utf(_icv_state *st, char c1, char c2, int *unidx, 41 unsigned long *ibm_code); 42 43 int bisearch(unsigned long val, _icv_state *st, int n); 44 45 int utf8_to_ibm(int unidx, unsigned long ibm_code, char *buf, 46 size_t buflen, _icv_state *st); 47 48 /* 49 * Actual conversion; called from iconv() 50 * Input is UTF-8 data. 51 * first convert to UCS2 52 */ 53 size_t 54 _icv_iconv(_icv_state *st, char **inbuf, size_t *inbytesleft, 55 char **outbuf, size_t *outbytesleft) 56 { 57 /* 58 * Actual conversion; called from iconv() 59 */ 60 /*========================================================= 61 * 62 * State Machine for interpreting UTF8 code 63 * 64 *========================================================= 65 * 66 * 3 byte unicode 67 * +----->------->-------+ 68 * | | 69 * ^ v 70 * | 2 byte U2 ---> U3 71 * | unicode v 72 * +------> U0 -------> U1 +-------->U4---+ 73 * ^ ascii | | ^ | 74 * | | +-------->--------->--------+ | 75 * | v v 76 * +----<---+-----<------------<------------<------------+ 77 * 78 * +----<---+-----<------------<------------<------------+ 79 * 80 *=========================================================*/ 81 82 char c1 = '\0', c2 = '\0'; 83 int n, unidx; 84 unsigned long ibm_code; 85 86 #ifdef DEBUG 87 fprintf(stderr, "========== iconv(): UTF8 --> IBM ==========\n"); 88 #endif 89 90 if (st == NULL) { 91 errno = EBADF; 92 return ((size_t) -1); 93 } 94 95 if (inbuf == NULL || *inbuf == NULL) { /* Reset request. */ 96 st->ustate = U0; 97 st->_errno = 0; 98 st->shift = SHIFT_IN; 99 return ((size_t) 0); 100 } 101 102 st->_errno = 0; /* reset internal errno */ 103 errno = 0; /* reset external errno */ 104 105 /* a state machine for interpreting UTF8 code */ 106 while (*inbytesleft > 0 && *outbytesleft > 0) { 107 switch (st->ustate) { 108 case U0: 109 /* it is ascii, convert it immediately */ 110 if ((**inbuf & MSB) == 0) { /* ASCII */ 111 st->ustate = U4; 112 st->keepc[0] = **inbuf; 113 c1 = 0x0; 114 c2 = **inbuf; 115 continue; 116 } else { /* Chinese character */ 117 if ((**inbuf & 0xe0) == 0xc0) { /* 2 byte unicode */ 118 st->ustate = U1; 119 st->keepc[0] = **inbuf; 120 } else if ((**inbuf & 0xf0) == 0xe0) { /* 3 byte */ 121 st->ustate = U2; 122 st->keepc[0] = **inbuf; 123 } else { /* illegal unicode */ 124 /* st->_errno = errno = EINVAL; */ 125 /* possible UNICODE ko_KR-UTF8 */ 126 c1 =st->keepc[0] = **inbuf; 127 st->ustate = U11; 128 break; 129 } 130 } 131 break; 132 case U1: /* 2 byte unicode */ 133 if ((**inbuf & 0xc0) == MSB) { 134 st->ustate = U4; 135 st->keepc[1] = **inbuf; 136 c1 = (st->keepc[0]&0x1c)>>2; 137 c2 = ((st->keepc[0]&0x03)<<6) | ((**inbuf)&0x3f); 138 #ifdef DEBUG 139 fprintf(stderr, "UTF8: %02x%02x --> ", 140 st->keepc[0]&ONEBYTE, st->keepc[1]&ONEBYTE); 141 #endif 142 continue; /* should not advance *inbuf */ 143 } else { 144 st->_errno = errno = EINVAL; 145 } 146 break; 147 case U11: /* 3 byte unicode - 2nd byte */ 148 c2 =st->keepc[1] = **inbuf; 149 st->ustate = U4; 150 continue; 151 break; 152 case U2: /* 3 byte unicode - 2nd byte */ 153 if ((**inbuf & 0xc0) == MSB) { 154 st->ustate = U3; 155 st->keepc[1] = **inbuf; 156 } else { 157 st->_errno = errno = EINVAL; 158 } 159 break; 160 case U3: /* 3 byte unicode - 3rd byte */ 161 if ((**inbuf & 0xc0) == MSB) { 162 st->ustate = U4; 163 st->keepc[2] = **inbuf; 164 c1 = ((st->keepc[0]&0x0f)<<4) | 165 ((st->keepc[1]&0x3c)>>2); 166 c2 = ((st->keepc[1]&0x03)<<6) | ((**inbuf)&0x3f); 167 #ifdef DEBUG 168 fprintf(stderr, "UTF8: %02x%02x%02x --> ", st->keepc[0]&ONEBYTE, 169 st->keepc[1]&ONEBYTE, **inbuf&ONEBYTE); 170 #endif 171 continue; /* should not advance *inbuf */ 172 } else { 173 st->_errno = errno = EINVAL; 174 } 175 break; 176 case U4: 177 n = get_ibm_by_utf(st, c1, c2, &unidx, &ibm_code); 178 if (n != 0) { /* legal unicode;illegal Big5 */ 179 st->_errno = errno = EILSEQ; 180 break; 181 } 182 183 n = utf8_to_ibm(unidx, ibm_code, 184 *outbuf, *outbytesleft, st); 185 if (n > 0) { 186 (*outbuf) += n; 187 (*outbytesleft) -= n; 188 } else { 189 st->_errno = errno; 190 return((size_t)-1); 191 } 192 st->ustate = U0; 193 st->_errno = 0; 194 break; 195 default: /* should never come here */ 196 st->_errno = errno = EILSEQ; 197 st->ustate = U0; /* reset state */ 198 break; 199 } 200 201 (*inbuf)++; 202 (*inbytesleft)--; 203 204 if (st->_errno) { 205 #ifdef DEBUG 206 fprintf(stderr, "!!!!!\tst->_errno = %d\tst->ustate = %d\n", 207 st->_errno, st->ustate); 208 #endif 209 break; 210 } 211 212 if (errno) 213 return((size_t)-1); 214 } 215 216 if (*outbytesleft == 0) { 217 errno = E2BIG; 218 return((size_t)-1); 219 } 220 return (*inbytesleft); 221 } 222 223 224 /* 225 * Match IBM code by UTF8 code; 226 * Return: = 0 - match from Unicode to IBM found 227 * = 1 - match from Unicode to IBM NOT found 228 * 229 * Since binary search of the UTF8 to IBM table is necessary, might as well 230 * return index and IBM code matching to the unicode. 231 */ 232 int get_ibm_by_utf(st, c1, c2, unidx, ibm_code) 233 _icv_state *st; 234 char c1, c2; 235 int *unidx; 236 unsigned long *ibm_code; 237 { 238 unsigned long unicode; 239 240 unicode = (unsigned long) ((c1 & ONEBYTE) << 8) + (c2 & ONEBYTE); 241 *unidx = bisearch(unicode, st, st->table_size); 242 if ((*unidx) >= 0) 243 { 244 if ( st->left_to_right ) 245 *ibm_code = st->table[*unidx].right_code; 246 else 247 *ibm_code = st->table[*unidx].left_code; 248 } 249 #ifdef DEBUG 250 fprintf(stderr, "Unicode=%04x, idx=%5d, IBM=%x ", unicode, *unidx, *ibm_code); 251 #endif 252 253 return(0); 254 } 255 256 257 /* 258 * ISO/IEC 10646 (Unicode) --> IBM 259 * Unicode --> UTF8 (FSS-UTF) 260 * (File System Safe Universal Character Set Transformation Format) 261 * Return: > 0 - converted with enough space in output buffer 262 * = 0 - no space in outbuf 263 */ 264 int utf8_to_ibm(unidx, ibm_code, buf, buflen, st) 265 int unidx; 266 unsigned long ibm_code; 267 char *buf; 268 size_t buflen; 269 _icv_state *st; 270 271 { 272 unsigned long val; /* IBM value */ 273 char c1, c2, ibm_str[3]; 274 275 if (unidx < 0) /* no match from UTF8 to IBM */ 276 ibm_code = (unsigned long)NON_ID_CHAR; 277 278 { 279 val = ibm_code & 0xffff; 280 c1 = (char) ((val & 0xff00) >> 8); 281 c2 = (char) (val & 0xff); 282 } 283 284 /* it is single byte ascii */ 285 if ( c1 == 0x0 ) { 286 if ( st->shift == SHIFT_OUT ) { 287 if (buflen < 2) { 288 errno = E2BIG; 289 return 0; 290 } 291 *buf = SHIFT_IN; 292 *(buf+1) = c2; 293 st->shift = SHIFT_IN; 294 return 2; 295 } 296 if (buflen < 1) { 297 errno = E2BIG; 298 return 0; 299 } 300 *buf = c2; 301 return 1; 302 } 303 304 /* it is the first two bytes character */ 305 if ( st->shift == SHIFT_IN ) { 306 if (buflen < 3) { 307 errno = E2BIG; 308 return 0; 309 } 310 *buf = SHIFT_OUT; 311 st->shift = SHIFT_OUT; 312 *(buf+1) = c1; 313 *(buf+2) = c2; 314 return 3; 315 } 316 317 *buf = ibm_str[0] = c1; 318 *(buf+1) = ibm_str[1] = c2; 319 ibm_str[2] = NULL; 320 321 #ifdef DEBUG 322 fprintf(stderr, "\t->%x %x<-\n", *buf, *(buf+1)); 323 #endif 324 325 326 if (buflen < 2) { 327 errno = E2BIG; 328 return(0); 329 } 330 331 return(2); 332 } 333