1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright(c) 1998 Sun Microsystems, Inc. 23 */ 24 25 #include <stdio.h> 26 #include <errno.h> 27 #include <stdlib.h> 28 #include <sys/types.h> 29 #include <unicode_gb2312.h> 30 #include "common_defs.h" 31 32 #define SI 0x0f 33 #define SO 0x0e 34 #define ESC 0x1b 35 #define MSB 0x80 36 37 #define NON_ID_CHAR '?' 38 39 typedef struct _icv_state { 40 short _ustate; 41 short _istate; 42 short _gstate; 43 char _cbuf[3]; 44 } _iconv_st; 45 46 enum _USTATE { U0, U1, U2, U3, U4, U5, U6 }; 47 enum _ISTATE { IN, OUT }; 48 enum _GSTATE { G0, G1 }; 49 50 int unicode_to_iso(char in_byte1, char in_byte2, char *buf, int buflen); 51 52 /* 53 * Open; called from iconv_open() 54 */ 55 void * 56 _icv_open() 57 { 58 _iconv_st *st; 59 60 if ((st = (_iconv_st *)malloc(sizeof(_iconv_st))) == NULL) { 61 errno = ENOMEM; 62 return ((void *) -1); 63 } 64 65 st->_ustate = U0; 66 st->_istate = IN; 67 st->_gstate = G0; 68 69 return ((void *)st); 70 } 71 72 73 /* 74 * Close; called from iconv_close() 75 */ 76 void 77 _icv_close(_iconv_st *st) 78 { 79 if (st == NULL) 80 errno = EBADF; 81 else 82 free(st); 83 } 84 85 86 /* 87 * Actual conversion; called from iconv() 88 */ 89 size_t 90 _icv_iconv(_iconv_st *st, char **inbuf, size_t*inbytesleft, 91 char **outbuf, size_t*outbytesleft) 92 { 93 char c1, c2; 94 int n; 95 96 if (st == NULL) { 97 errno = EBADF; 98 return ((size_t)-1); 99 } 100 101 if (inbuf == NULL || *inbuf == NULL) { /* Reset request. */ 102 st->_ustate = U0; 103 st->_istate = IN; 104 st->_gstate = G0; 105 return ((size_t)0); 106 } 107 108 errno = 0; 109 110 while (*inbytesleft > 0 && *outbytesleft > 0) { 111 112 uchar_t first_byte; 113 114 switch (st->_ustate) { 115 case U0: 116 if ((**inbuf & MSB) == 0) { /* ASCII */ 117 if (st->_istate == OUT) { 118 st->_istate = IN; 119 **outbuf = SI; 120 (*outbuf)++, (*outbytesleft)--; 121 if (*outbytesleft <= 0) { 122 errno = E2BIG; 123 return ((size_t)-1); 124 } 125 } 126 **outbuf = **inbuf; 127 (*outbuf)++, (*outbytesleft)--; 128 } else { 129 if ((**inbuf & 0xe0) == 0xc0) { /* 0xc2..0xdf */ 130 131 /* invalid sequence if the first char is either 0xc0 or 0xc1 */ 132 if ( number_of_bytes_in_utf8_char[((uchar_t)**inbuf)] == ICV_TYPE_ILLEGAL_CHAR ) 133 { 134 errno = EILSEQ; 135 break; 136 } 137 else 138 { 139 st->_ustate = U1; 140 st->_cbuf[0] = **inbuf; 141 } 142 } else if ((**inbuf & 0xf0) == 0xe0) { /* 0xe0..0xef */ 143 st->_ustate = U2; 144 st->_cbuf[0] = **inbuf; 145 } else { 146 /* four bytes of UTF-8 sequences */ 147 if ( number_of_bytes_in_utf8_char[((uchar_t)**inbuf)] == ICV_TYPE_ILLEGAL_CHAR ) 148 { 149 errno = EILSEQ; 150 break; 151 } 152 else { 153 st->_ustate = U4; 154 st->_cbuf[0] = **inbuf; 155 } 156 } 157 if (st->_istate == IN) { 158 if (st->_gstate == G0) { 159 if (*outbytesleft < 4) { 160 errno = E2BIG; 161 return ((size_t)-1); 162 } 163 st->_gstate = G1; 164 **outbuf = ESC; 165 *(*outbuf+1) = '$'; 166 *(*outbuf+2) = ')'; 167 *(*outbuf+3) = 'A'; 168 (*outbuf) += 4, (*outbytesleft) -= 4; 169 if (*outbytesleft <= 0) { 170 errno = E2BIG; 171 return ((size_t)-1); 172 } 173 } 174 st->_istate = OUT; 175 **outbuf = SO; 176 (*outbuf)++, (*outbytesleft)--; 177 } 178 } 179 break; 180 case U1: 181 if ((**inbuf & 0xc0) == MSB) { /* two-byte UTF */ 182 c1 = (st->_cbuf[0]&0x1c)>>2; 183 c2 = ((st->_cbuf[0]&0x03)<<6) | ((**inbuf)&0x3f); 184 n = unicode_to_iso(c1, c2, *outbuf, *outbytesleft); 185 if (n > 0) { 186 (*outbuf) += n, (*outbytesleft) -= n; 187 } else { 188 errno = E2BIG; 189 return ((size_t)-1); 190 } 191 st->_ustate = U0; 192 } else { 193 errno = EILSEQ; 194 } 195 break; 196 case U2: 197 198 first_byte = st->_cbuf[0]; 199 200 /* if the first byte is 0xed, it is illegal sequence if the second 201 * one is one between 0xa0 and 0xbf because surrogate section is ill-formed 202 */ 203 if (((uchar_t)**inbuf) < valid_min_2nd_byte[first_byte] || 204 ((uchar_t)**inbuf) > valid_max_2nd_byte[first_byte] ) 205 errno = EILSEQ; 206 else { 207 st->_ustate = U3; 208 st->_cbuf[1] = **inbuf; 209 } 210 break; 211 case U3: 212 if ((**inbuf & 0xc0) == MSB) { /* three-byte UTF */ 213 c1 = ((st->_cbuf[0]&0x0f)<<4) | ((st->_cbuf[1]&0x3c)>>2); 214 c2 = ((st->_cbuf[1]&0x03)<<6) | ((**inbuf)&0x3f); 215 n = unicode_to_iso(c1, c2, *outbuf, *outbytesleft); 216 if (n > 0) { 217 (*outbuf) += n, (*outbytesleft) -= n; 218 } else if ( n == -1 ) { 219 errno = EILSEQ; /* unicode is either 0xfffe or 0xffff */ 220 } else { 221 errno = E2BIG; 222 return ((size_t)-1); 223 } 224 st->_ustate = U0; 225 } else { 226 errno = EILSEQ; 227 } 228 break; 229 case U4: 230 first_byte = st->_cbuf[0]; 231 232 /* if the first byte is 0xf0, it is illegal sequence if 233 * the second one is between 0x80 and 0x8f 234 * for Four-Byte UTF: U+10000..U+10FFFF 235 */ 236 if (((uchar_t)**inbuf) < valid_min_2nd_byte[first_byte] || 237 ((uchar_t)**inbuf) > valid_max_2nd_byte[first_byte] ) 238 errno = EILSEQ; 239 else { 240 st->_ustate = U5; 241 st->_cbuf[1] = **inbuf; 242 } 243 break; 244 case U5: 245 if ((**inbuf & 0xc0) == MSB) /* 0x80..0xbf */ 246 { 247 st->_ustate = U6; 248 st->_cbuf[2] = **inbuf; 249 } 250 else 251 errno = EILSEQ; 252 break; 253 case U6: 254 if ((**inbuf & 0xc0) == MSB) /* 0x80..0xbf */ 255 { 256 /* replace with double NON_ID_CHARs */ 257 if ( *outbytesleft < 2 ) 258 errno = E2BIG; 259 else 260 { 261 **outbuf = NON_ID_CHAR; 262 *(*outbuf+1) = NON_ID_CHAR; 263 (*outbytesleft) -= 2; 264 265 st->_ustate = U0; 266 } 267 } 268 else 269 errno = EILSEQ; 270 break; 271 } 272 273 if (errno) 274 return ((size_t)-1); 275 276 (*inbuf)++; (*inbytesleft)--; 277 } 278 279 if (*inbytesleft == 0 && st->_ustate != U0) { 280 errno = EINVAL; 281 return ((size_t) -1); 282 } 283 284 if (*inbytesleft > 0 && *outbytesleft == 0) { 285 errno = E2BIG; 286 return ((size_t)-1); 287 } 288 return ((size_t)(*inbytesleft)); 289 } 290 291 292 int unicode_to_iso(in_byte1, in_byte2, buf, buflen) 293 char in_byte1, in_byte2; 294 char *buf; 295 int buflen; 296 { 297 int gb, unicode; 298 int i, l, h; 299 300 if (buflen < 2) 301 return 0; 302 unicode = ((in_byte1 & 0xff) << 8) + (in_byte2 & 0xff); 303 304 /* 0xfffe and 0xffff should not be allowed */ 305 if ( unicode == 0xFFFE || unicode == 0xFFFF ) return -1; 306 307 for (l = 0, h = UNICODEMAX; l < h; ) { 308 if (unicode_gb_tab[l].key == unicode) { 309 i = l; 310 break; 311 } 312 if (unicode_gb_tab[h].key == unicode) { 313 i = h; 314 break; 315 } 316 i = (l + h) / 2; 317 if (unicode_gb_tab[i].key == unicode) 318 break; 319 if (unicode_gb_tab[i].key < unicode) 320 l = i + 1; 321 else h = i - 1; 322 } 323 if (unicode == unicode_gb_tab[i].key) { 324 gb = unicode_gb_tab[i].value; 325 *buf = (gb & 0xff00) >> 8; 326 *(buf+1) = gb & 0xff; 327 } else { 328 *buf = *(buf+1) = NON_ID_CHAR; 329 } 330 return 2; 331 } 332