1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright (c) 1997, by Sun Microsystems, Inc. 24 * All rights reserved. 25 */ 26 27 #include <stdio.h> 28 #include <stdlib.h> 29 #include <errno.h> 30 #include <sys/types.h> 31 32 #include "tab_lookup.h" /* table lookup data types */ 33 34 #define MSB 0x80 /* most significant bit */ 35 #define ONEBYTE 0xff /* right most byte */ 36 37 enum _USTATE { U0, U1, U11, U2, U3, U4 }; 38 39 40 41 42 /* 43 * Actual conversion; called from iconv() 44 * Input is UTF-8 data. 45 * first convert to UCS2 46 */ 47 size_t 48 _icv_iconv(_icv_state *st, char **inbuf, size_t *inbytesleft, 49 char **outbuf, size_t *outbytesleft) 50 { 51 /* 52 * Actual conversion; called from iconv() 53 */ 54 /*========================================================= 55 * 56 * State Machine for interpreting UTF8 code 57 * 58 *========================================================= 59 * 60 * 3 byte unicode 61 * +----->------->-------+ 62 * | | 63 * ^ v 64 * | 2 byte U2 ---> U3 65 * | unicode v 66 * +------> U0 -------> U1 +-------->U4---+ 67 * ^ ascii | | ^ | 68 * | | +-------->--------->--------+ | 69 * | v v 70 * +----<---+-----<------------<------------<------------+ 71 * 72 * +----<---+-----<------------<------------<------------+ 73 * 74 *=========================================================*/ 75 76 char c1, c2; 77 int n, unidx; 78 unsigned long ibm_code; 79 80 #ifdef DEBUG 81 fprintf(stderr, "========== iconv(): UTF8 --> IBM ==========\n"); 82 #endif 83 84 if (st == NULL) { 85 errno = EBADF; 86 return ((size_t) -1); 87 } 88 89 if (inbuf == NULL || *inbuf == NULL) { /* Reset request. */ 90 st->ustate = U0; 91 st->_errno = 0; 92 return ((size_t) 0); 93 } 94 95 st->_errno = 0; /* reset internal errno */ 96 errno = 0; /* reset external errno */ 97 98 /* a state machine for interpreting UTF8 code */ 99 while (*inbytesleft > 0 && *outbytesleft > 0) { 100 switch (st->ustate) { 101 case U0: /* assuming ASCII in the beginning */ 102 if ((**inbuf & MSB) == 0) { /* ASCII */ 103 **outbuf = **inbuf; 104 (*outbuf)++; 105 (*outbytesleft)--; 106 } else { /* Chinese character */ 107 if ((**inbuf & 0xe0) == 0xc0) { /* 2 byte unicode */ 108 st->ustate = U1; 109 st->keepc[0] = **inbuf; 110 } else if ((**inbuf & 0xf0) == 0xe0) { /* 3 byte */ 111 st->ustate = U2; 112 st->keepc[0] = **inbuf; 113 } else { /* illegal unicode */ 114 /* st->_errno = errno = EINVAL; */ 115 /* possible UNICODE ko_KR-UTF8 */ 116 c1 =st->keepc[0] = **inbuf; 117 st->ustate = U11; 118 break; 119 } 120 } 121 break; 122 case U1: /* 2 byte unicode */ 123 if ((**inbuf & 0xc0) == MSB) { 124 st->ustate = U4; 125 st->keepc[1] = **inbuf; 126 c1 = (st->keepc[0]&0x1c)>>2; 127 c2 = ((st->keepc[0]&0x03)<<6) | ((**inbuf)&0x3f); 128 #ifdef DEBUG 129 fprintf(stderr, "UTF8: %02x%02x --> ", 130 st->keepc[0]&ONEBYTE, st->keepc[1]&ONEBYTE); 131 #endif 132 continue; /* should not advance *inbuf */ 133 } else { 134 st->_errno = errno = EINVAL; 135 } 136 break; 137 case U11: /* 3 byte unicode - 2nd byte */ 138 c2 =st->keepc[1] = **inbuf; 139 st->ustate = U4; 140 continue; 141 break; 142 case U2: /* 3 byte unicode - 2nd byte */ 143 if ((**inbuf & 0xc0) == MSB) { 144 st->ustate = U3; 145 st->keepc[1] = **inbuf; 146 } else { 147 st->_errno = errno = EINVAL; 148 } 149 break; 150 case U3: /* 3 byte unicode - 3rd byte */ 151 if ((**inbuf & 0xc0) == MSB) { 152 st->ustate = U4; 153 st->keepc[2] = **inbuf; 154 c1 = ((st->keepc[0]&0x0f)<<4) | 155 ((st->keepc[1]&0x3c)>>2); 156 c2 = ((st->keepc[1]&0x03)<<6) | ((**inbuf)&0x3f); 157 #ifdef DEBUG 158 fprintf(stderr, "UTF8: %02x%02x%02x --> ", st->keepc[0]&ONEBYTE, 159 st->keepc[1]&ONEBYTE, **inbuf&ONEBYTE); 160 #endif 161 continue; /* should not advance *inbuf */ 162 } else { 163 st->_errno = errno = EINVAL; 164 } 165 break; 166 case U4: 167 n = get_ibm_by_utf(st, c1, c2, &unidx, &ibm_code); 168 if (n != 0) { /* legal unicode;illegal Big5 */ 169 st->_errno = errno = EILSEQ; 170 break; 171 } 172 173 n = utf8_to_ibm(unidx, ibm_code, 174 *outbuf, *outbytesleft); 175 if (n > 0) { 176 (*outbuf) += n; 177 (*outbytesleft) -= n; 178 } else { 179 st->_errno = errno; 180 return((size_t)-1); 181 } 182 st->ustate = U0; 183 st->_errno = 0; 184 break; 185 default: /* should never come here */ 186 st->_errno = errno = EILSEQ; 187 st->ustate = U0; /* reset state */ 188 break; 189 } 190 191 (*inbuf)++; 192 (*inbytesleft)--; 193 194 if (st->_errno) { 195 #ifdef DEBUG 196 fprintf(stderr, "!!!!!\tst->_errno = %d\tst->ustate = %d\n", 197 st->_errno, st->ustate); 198 #endif 199 break; 200 } 201 202 if (errno) 203 return((size_t)-1); 204 } 205 206 if (*outbytesleft == 0) { 207 errno = E2BIG; 208 return((size_t)-1); 209 } 210 return (*inbytesleft); 211 } 212 213 214 /* 215 * Match IBM code by UTF8 code; 216 * Return: = 0 - match from Unicode to IBM found 217 * = 1 - match from Unicode to IBM NOT found 218 * 219 * Since binary search of the UTF8 to IBM table is necessary, might as well 220 * return index and IBM code matching to the unicode. 221 */ 222 int get_ibm_by_utf(st, c1, c2, unidx, ibm_code) 223 _icv_state *st; 224 char c1, c2; 225 int *unidx; 226 unsigned long *ibm_code; 227 { 228 unsigned long unicode; 229 230 unicode = (unsigned long) ((c1 & ONEBYTE) << 8) + (c2 & ONEBYTE); 231 *unidx = bisearch(unicode, st, st->table_size); 232 if ((*unidx) >= 0) 233 { 234 if ( st->left_to_right ) 235 *ibm_code = st->table[*unidx].right_code; 236 else 237 *ibm_code = st->table[*unidx].left_code; 238 } 239 else 240 ; /* match from UTF8 to IBM not found */ 241 #ifdef DEBUG 242 fprintf(stderr, "Unicode=%04x, idx=%5d, IBM=%x ", unicode, *unidx, *ibm_code); 243 #endif 244 245 return(0); 246 } 247 248 249 /* 250 * ISO/IEC 10646 (Unicode) --> IBM 251 * Unicode --> UTF8 (FSS-UTF) 252 * (File System Safe Universal Character Set Transformation Format) 253 * Return: > 0 - converted with enough space in output buffer 254 * = 0 - no space in outbuf 255 */ 256 int utf8_to_ibm(unidx, ibm_code, buf, buflen) 257 int unidx; 258 unsigned long ibm_code; 259 char *buf; 260 size_t buflen; 261 262 { 263 unsigned long val; /* IBM value */ 264 char c1, c2, ibm_str[3]; 265 266 if (unidx < 0) /* no match from UTF8 to IBM */ 267 ibm_code = (unsigned long)NON_ID_CHAR; 268 269 { 270 val = ibm_code & 0xffff; 271 c1 = (char) ((val & 0xff00) >> 8); 272 c2 = (char) (val & 0xff); 273 } 274 275 *buf = ibm_str[0] = c1; 276 *(buf+1) = ibm_str[1] = c2; 277 ibm_str[2] = NULL; 278 279 #ifdef DEBUG 280 fprintf(stderr, "\t->%x %x<-\n", *buf, *(buf+1)); 281 #endif 282 283 284 if (buflen < 2) { 285 errno = E2BIG; 286 return(0); 287 } 288 289 return(2); 290 } 291