1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright(c) 1998 Sun Microsystems, Inc. 23 * All rights reserved. 24 */ 25 26 #include <stdio.h> 27 #include <errno.h> 28 #include <stdlib.h> 29 #include <sys/types.h> 30 #include "unicode_gb2312.h" 31 #include "common_defs.h" 32 33 #define MSB 0x80 34 #define NON_ID_CHAR '?' 35 36 typedef struct _icv_state { 37 short _ustate; 38 short saved_ustate; 39 char _cbuf[3]; 40 } _iconv_st; 41 42 enum _USTATE { U0, U1, U2, U3, U4, U5, U6 }; 43 44 int unicode_to_gb_to_hz(char in_byte1, char in_byte2, char *buf, int buflen); 45 46 /* 47 * Open; called from iconv_open() 48 */ 49 void * 50 _icv_open() 51 { 52 _iconv_st *st; 53 54 if ((st = (_iconv_st *)malloc(sizeof(_iconv_st))) == NULL) { 55 errno = ENOMEM; 56 return ((void *) -1); 57 } 58 59 st->_ustate = U0; 60 st->saved_ustate = U0; 61 return ((void *)st); 62 } 63 64 65 /* 66 * Close; called from iconv_close() 67 */ 68 void 69 _icv_close(_iconv_st *st) 70 { 71 if (st == NULL) 72 errno = EBADF; 73 else 74 free(st); 75 } 76 77 78 /* 79 * Actual conversion; called from iconv() 80 */ 81 size_t 82 _icv_iconv(_iconv_st *st, char **inbuf, size_t*inbytesleft, 83 char **outbuf, size_t*outbytesleft) 84 { 85 char c1, c2; 86 int n; 87 88 if (st == NULL) { 89 errno = EBADF; 90 return ((size_t)-1); 91 } 92 93 if (inbuf == NULL || *inbuf == NULL) { /* Reset request. */ 94 st->_ustate = U0; 95 return ((size_t)0); 96 } 97 98 errno = 0; 99 while (*inbytesleft > 0 && *outbytesleft > 0) { 100 101 uchar_t first_byte; 102 103 switch (st->_ustate) { 104 case U0: 105 if (**inbuf & MSB && st->saved_ustate ==U0) { 106 if(*outbytesleft >=2) { 107 **outbuf = '~'; 108 *(*outbuf+1) = '{'; 109 (*outbuf) += 2, (*outbytesleft) -= 2; 110 } else { 111 errno = E2BIG; 112 return (size_t)-1; 113 } 114 } 115 if ((**inbuf & MSB) == 0) { /* ASCII */ 116 if (st->saved_ustate == U1 || st->saved_ustate == U3) 117 { 118 if(*outbytesleft >=2) { 119 **outbuf = '~'; 120 *(*outbuf+1) = '}'; 121 (*outbuf) += 2, (*outbytesleft) -= 2; 122 }else { 123 errno = E2BIG; 124 return (size_t)-1; 125 } 126 } 127 st->saved_ustate = U0; 128 if(*outbytesleft >=1) { 129 **outbuf = **inbuf; 130 (*outbuf)++; (*outbytesleft)--; 131 }else { 132 errno = E2BIG; 133 return (size_t)-1; 134 } 135 if (**inbuf == '~') { 136 if(*outbytesleft >=1) { 137 **outbuf = '~'; 138 (*outbuf)++, (*outbytesleft)--; 139 }else { 140 errno = E2BIG; 141 return (size_t)-1; 142 } 143 } 144 } else if ((**inbuf & 0xe0) == 0xc0) { /* 0xc2..0xbf */ 145 146 /* invalid sequence if the first char is either 0xc0 or 0xc1 */ 147 if ( number_of_bytes_in_utf8_char[((uchar_t)**inbuf)] == ICV_TYPE_ILLEGAL_CHAR ) 148 errno = EILSEQ; 149 else { 150 st->_ustate = U1; 151 st->_cbuf[0] = **inbuf; 152 } 153 } else if ((**inbuf & 0xf0) == 0xe0) { /* 0xe0..0xef */ 154 st->_ustate = U2; 155 st->_cbuf[0] = **inbuf; 156 } else { 157 /* four bytes of UTF-8 sequences */ 158 if ( number_of_bytes_in_utf8_char[((uchar_t)**inbuf)] == ICV_TYPE_ILLEGAL_CHAR ) 159 errno = EILSEQ; 160 else 161 { 162 st->_ustate = U4; 163 st->_cbuf[0] = **inbuf; 164 } 165 } 166 break; 167 case U1: 168 if ((**inbuf & 0xc0) == MSB) { /* Two-byte UTF */ 169 c1 = (st->_cbuf[0]&0x1c)>>2; 170 c2 = ((st->_cbuf[0]&0x03)<<6) | ((**inbuf)&0x3f); 171 n = unicode_to_gb_to_hz(c1, c2, *outbuf, *outbytesleft); 172 if (n > 0) { 173 (*outbuf) += n, (*outbytesleft) -= n; 174 } else { 175 errno = E2BIG; 176 return ((size_t) -1); 177 } 178 st->saved_ustate = U1; 179 st->_ustate = U0; 180 } else { 181 errno = EILSEQ; 182 } 183 break; 184 case U2: 185 st->saved_ustate = U2; 186 187 first_byte = st->_cbuf[0]; 188 189 /* if the first byte is 0xed, it is illegal sequence if the second 190 * one is between 0xa0 and 0xbf because surrogate section is ill-formed 191 */ 192 if (((uchar_t)**inbuf) < valid_min_2nd_byte[first_byte] || 193 ((uchar_t)**inbuf) > valid_max_2nd_byte[first_byte] ) 194 errno = EILSEQ; 195 else { 196 st->_ustate = U3; 197 st->_cbuf[1] = **inbuf; 198 } 199 break; 200 case U3: 201 if ((**inbuf & 0xc0) == MSB) { /* Three-byte UTF */ 202 c1 = ((st->_cbuf[0]&0x0f)<<4) | ((st->_cbuf[1]&0x3c)>>2); 203 c2 = ((st->_cbuf[1]&0x03)<<6) | ((**inbuf)&0x3f); 204 n = unicode_to_gb_to_hz(c1, c2, *outbuf, *outbytesleft); 205 if (n > 0) { 206 (*outbuf) += n, (*outbytesleft) -= n; 207 } else if ( n == -1 ) { /* unicode is either 0xFFFE or 0xFFFF */ 208 errno = EILSEQ; 209 } else { 210 errno = E2BIG; 211 return ((size_t)-1); 212 } 213 st->saved_ustate = U3; 214 st->_ustate = U0; 215 } else { 216 errno = EILSEQ; 217 break; 218 } 219 break; 220 case U4: 221 222 first_byte = st->_cbuf[0]; 223 224 /* if the first byte is 0xf0, it is illegal sequence if 225 * the second one is between 0x80 and 0x8f 226 * for Four-Byte UTF: U+10000..U+10FFFF 227 */ 228 if (((uchar_t)**inbuf) < valid_min_2nd_byte[first_byte] || 229 ((uchar_t)**inbuf) > valid_max_2nd_byte[first_byte] ) 230 errno = EILSEQ; 231 else 232 { 233 st->_ustate = U5; 234 st->_cbuf[1] = **inbuf; 235 st->saved_ustate = U4; 236 } 237 break; 238 case U5: 239 if ((**inbuf & 0xc0) == MSB) /* 0x80..0xbf */ 240 { 241 st->_ustate = U6; 242 st->_cbuf[2] = **inbuf; 243 st->saved_ustate = U5; 244 } 245 else 246 errno = EILSEQ; 247 break; 248 case U6: 249 if ((**inbuf & 0xc0) == MSB) /* 0x80..0xbf */ 250 { 251 /* replace with double NON_ID_CHARs */ 252 if ( *outbytesleft < 2 ) 253 errno = E2BIG; 254 else 255 { 256 **outbuf = NON_ID_CHAR; 257 *(*outbuf+1) = NON_ID_CHAR; 258 (*outbytesleft) -= 2; 259 260 st->_ustate = U0; 261 st->saved_ustate = U6; 262 } 263 } 264 else 265 errno = EILSEQ; 266 break; 267 } 268 269 if (errno) 270 return ((size_t)-1); 271 (*inbuf)++; (*inbytesleft)--; 272 } 273 274 if (*inbytesleft == 0 && st->_ustate != U0) 275 { 276 errno = EINVAL; 277 return ((size_t) -1); 278 } 279 280 if (*inbytesleft > 0 && *outbytesleft == 0) { 281 errno = E2BIG; 282 return ((size_t)-1); 283 } 284 return ((size_t)(*inbytesleft)); 285 } 286 287 /* return value: 0 - no enough space to hold the HZ-GB-2312 code 288 * -1 - illegal sequence 289 * >0 - buffer length 290 */ 291 int unicode_to_gb_to_hz(in_byte1, in_byte2, buf, buflen) 292 char in_byte1, in_byte2; 293 char *buf; 294 int buflen; 295 { 296 int gb, unicode; 297 int i, l, h; 298 299 if (buflen < 2) 300 return 0; 301 unicode = ((in_byte1 & 0xff) << 8) + (in_byte2 & 0xff); 302 303 /* 0xfffe and 0xffff should not be allowed */ 304 if ( unicode == 0xFFFE || unicode == 0xFFFF ) return -1; 305 306 for (l = 0, h = UNICODEMAX; l < h; ) { 307 if (unicode_gb_tab[l].key == unicode) { 308 i = l; 309 break; 310 } 311 if (unicode_gb_tab[h].key == unicode) { 312 i = h; 313 break; 314 } 315 i = (l + h) / 2; 316 if (unicode_gb_tab[i].key == unicode) 317 break; 318 if (unicode_gb_tab[i].key < unicode) 319 l = i + 1; 320 else h = i - 1; 321 } 322 if (unicode == unicode_gb_tab[i].key) { 323 gb = unicode_gb_tab[i].value; 324 *buf = ((gb & 0xff00) >> 8) & 0x7f; 325 *(buf+1) = (gb & 0xff) & 0x7f; 326 } else { 327 *buf = NON_ID_CHAR; 328 *(buf+1) = NON_ID_CHAR; 329 } 330 return 2; 331 } 332