1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 2008, by Sun Microsystems, Inc. 23 * All rights reserved. 24 */ 25 26 #include <stdio.h> 27 #include <errno.h> 28 #include <stdlib.h> 29 #include <sys/types.h> 30 #define __NEED_UNI_2_VISCII__ 31 #include <unicode_viscii.h> /* Unicode to viscii mapping table */ 32 #include "common_defs.h" 33 34 #define MSB 0x80 /* most significant bit */ 35 #define ONEBYTE 0xff /* right most byte */ 36 37 #define NON_ID_CHAR '?' /* non-identified character */ 38 39 40 41 typedef struct _icv_state { 42 char keepc[6]; /* maximum # byte of UTF8 code */ 43 short ustate; 44 int _errno; /* internal errno */ 45 } _iconv_st; 46 47 enum _USTATE { U0, U1, U2, U3, U4, U5, U6, U7 }; 48 49 50 /* 51 * Open; called from iconv_open() 52 */ 53 void * 54 _icv_open() 55 { 56 _iconv_st *st; 57 58 if ((st = (_iconv_st *)malloc(sizeof(_iconv_st))) == NULL) { 59 errno = ENOMEM; 60 return ((void *) -1); 61 } 62 63 st->ustate = U0; 64 st->_errno = 0; 65 return ((void *) st); 66 } 67 68 69 /* 70 * Close; called from iconv_close() 71 */ 72 void 73 _icv_close(_iconv_st *st) 74 { 75 if (!st) 76 errno = EBADF; 77 else 78 free(st); 79 } 80 81 82 /* 83 * Actual conversion; called from iconv() 84 */ 85 /*========================================================= 86 * 87 * State Machine for interpreting UTF8 code 88 * 89 *========================================================= 90 * 4 byte unicode 91 * +----->------->------------> U5 -----> U6-------> U7---+ 92 * | | 93 * | 3 byte unicode | 94 * +----->------->-------+ | 95 * | | | 96 * ^ v | 97 * | 2 byte U2 ---> U3 | 98 * | unicode v | 99 * +------> U0 -------> U1 +-------->U4---+ | 100 * ^ ascii | | ^ | | 101 * | | +-------->--------->--------+ | | 102 * | v v V 103 * +----<---+-----<------------<------------<------------+---------+ 104 * 105 *=========================================================*/ 106 size_t 107 _icv_iconv(_iconv_st *st, char **inbuf, size_t *inbytesleft, 108 char **outbuf, size_t *outbytesleft) 109 { 110 char c1 = '\0', c2 = '\0'; 111 int uconv_num = 0; 112 unsigned long uni = 0; 113 int utf8_len = 0; 114 115 #ifdef DEBUG 116 fprintf(stderr, "========== iconv(): UTF2 --> GBK2K ==========\n"); 117 #endif 118 if (st == NULL) { 119 errno = EBADF; 120 return ((size_t) -1); 121 } 122 123 if (inbuf == NULL || *inbuf == NULL) { /* Reset request. */ 124 st->ustate = U0; 125 st->_errno = 0; 126 return ((size_t) 0); 127 } 128 129 st->_errno = 0; /* reset internal errno */ 130 errno = 0; /* reset external errno */ 131 132 /* a state machine for interpreting UTF8 code */ 133 while (*inbytesleft > 0 && *outbytesleft > 0) { 134 135 uchar_t first_byte; 136 unsigned short ch = 0; 137 switch (st->ustate) { 138 case U0: 139 /* 140 * assuming ASCII in the beginning 141 */ 142 if ((**inbuf & MSB) == 0) { /* ASCII */ 143 **outbuf = **inbuf; 144 (*outbuf)++; 145 (*outbytesleft)--; 146 } else { 147 if ((**inbuf & 0xe0) == 0xc0) { 148 /* 2 byte unicode 0xc0..0xdf */ 149 /* invalid sequence if the first char is either 0xc0 or 0xc1 */ 150 if ( number_of_bytes_in_utf8_char[((uchar_t)**inbuf)] == ICV_TYPE_ILLEGAL_CHAR ) 151 st->_errno = errno = EILSEQ; 152 else { 153 st->ustate = U1; 154 st->keepc[0] = **inbuf; 155 } 156 } else if ((**inbuf & 0xf0) == 0xe0) { /* 3 byte 0xe0..0xf0 */ 157 st->ustate = U2; 158 st->keepc[0] = **inbuf; 159 } else { 160 /* four bytes of UTF-8 sequences */ 161 if ( number_of_bytes_in_utf8_char[((uchar_t)**inbuf)] == ICV_TYPE_ILLEGAL_CHAR ) 162 st->_errno = errno = EILSEQ; 163 else { 164 st->ustate = U5; 165 st->keepc[0] = **inbuf; 166 } 167 } 168 } 169 break; 170 case U1: 171 /* 2 byte utf-8 encoding */ 172 if ((**inbuf & 0xc0) == MSB) { 173 utf8_len = 2; 174 st->keepc[1] = **inbuf; 175 176 c1 = (st->keepc[0]&0x1c)>>2; 177 c2 = ((st->keepc[0]&0x03)<<6) | ((st->keepc[1])&0x3f); 178 st->ustate = U4; 179 #ifdef DEBUG 180 fprintf(stderr, "UTF8: %02x%02x --> ", 181 st->keepc[0]&ONEBYTE, st->keepc[1]&ONEBYTE); 182 #endif 183 continue; /* should not advance *inbuf */ 184 } else { 185 st->_errno = errno = EILSEQ; 186 } 187 break; 188 case U2: 189 /* 3 byte unicode - 2nd byte */ 190 first_byte = (uchar_t)st->keepc[0]; 191 /* if the first byte is 0xed, it is illegal sequence if the second 192 * one is between 0xa0 and 0xbf because surrogate section is ill-formed 193 */ 194 if (((uchar_t)**inbuf) < valid_min_2nd_byte[first_byte] || 195 ((uchar_t)**inbuf) > valid_max_2nd_byte[first_byte] ) 196 st->_errno = errno = EILSEQ; 197 else { 198 st->ustate = U3; 199 st->keepc[1] = **inbuf; 200 } 201 break; 202 case U3: 203 /* 3 byte unicode - 3rd byte */ 204 if ((**inbuf & 0xc0) == MSB) { 205 st->ustate = U4; 206 utf8_len = 3; 207 st->keepc[2] = **inbuf; 208 c1 = ((st->keepc[0]&0x0f)<<4) | 209 ((st->keepc[1]&0x3c)>>2); 210 c2 = ((st->keepc[1]&0x03)<<6) | ((**inbuf)&0x3f); 211 #ifdef DEBUG 212 fprintf(stderr, "UTF8: %02x%02x%02x --> ", st->keepc[0]&ONEBYTE, 213 st->keepc[1]&ONEBYTE, **inbuf&ONEBYTE); 214 #endif 215 continue; /* should not advance *inbuf */ 216 } else { 217 st->_errno = errno = EILSEQ; 218 } 219 break; 220 case U4: 221 uni = (unsigned long) ((c1 & ONEBYTE) << 8) + (c2 & ONEBYTE); 222 if (!uni_2_viscii(uni, (unsigned char*)&ch)) { 223 **outbuf = NON_ID_CHAR; 224 uconv_num += utf8_len; 225 } else { 226 **outbuf = ch; 227 } 228 (*outbuf)++; 229 (*outbytesleft)--; 230 st->ustate = U0; 231 break; 232 case U5: 233 first_byte = st->keepc[0]; 234 235 /* if the first byte is 0xf0, it is illegal sequence if 236 * the second one is between 0x80 and 0x8f 237 * for Four-Byte UTF: U+10000..U+10FFFF 238 * */ 239 if (((uchar_t)**inbuf) < valid_min_2nd_byte[first_byte] || 240 ((uchar_t)**inbuf) > valid_max_2nd_byte[first_byte] ) 241 st->_errno = errno = EILSEQ; 242 else { 243 st->ustate = U6; 244 st->keepc[1] = **inbuf; 245 } 246 break; 247 case U6: 248 if ((**inbuf & 0xc0) == MSB) { 249 /* 0x80..0xbf */ 250 st->ustate = U7; 251 st->keepc[2] = **inbuf; 252 } else 253 st->_errno = errno = EILSEQ; 254 break; 255 case U7: 256 if ((**inbuf & 0xc0) == MSB) { 257 /* 0x80..0xbf */ 258 /* replace with double NON_ID_CHARs */ 259 if ( *outbytesleft < 1 ) 260 st->_errno = errno = E2BIG; 261 else { 262 **outbuf = NON_ID_CHAR; 263 (*outbytesleft) -= 1; 264 uconv_num++; 265 st->ustate = U0; 266 } 267 } else 268 st->_errno = errno = EILSEQ; 269 break; 270 default: 271 /* should never come here */ 272 st->_errno = errno = EILSEQ; 273 st->ustate = U0; /* reset state */ 274 break; 275 } 276 277 if (st->_errno) { 278 #ifdef DEBUG 279 fprintf(stderr, "!!!!!\tst->_errno = %d\tst->ustate = %d\n", 280 st->_errno, st->ustate); 281 #endif 282 break; 283 } 284 285 (*inbuf)++; 286 (*inbytesleft)--; 287 } 288 289 if (*inbytesleft == 0 && st->ustate != U0) 290 errno = EINVAL; 291 292 if (*inbytesleft > 0 && *outbytesleft == 0) 293 errno = E2BIG; 294 295 if (errno) { 296 int num_reversed_bytes = 0; 297 298 switch (st->ustate) 299 { 300 case U1: 301 num_reversed_bytes = 1; 302 break; 303 case U2: 304 num_reversed_bytes = 1; 305 break; 306 case U3: 307 num_reversed_bytes = 2; 308 break; 309 case U4: 310 num_reversed_bytes = utf8_len - 1; 311 break; 312 case U5: 313 num_reversed_bytes = 1; 314 break; 315 case U6: 316 num_reversed_bytes = 2; 317 break; 318 case U7: 319 num_reversed_bytes = 3; 320 break; 321 } 322 323 /* 324 * if error, *inbuf points to the byte following the last byte 325 * successfully used in conversion. 326 */ 327 *inbuf -= num_reversed_bytes; 328 *inbytesleft += num_reversed_bytes; 329 st->ustate = U0; 330 331 return ((size_t) -1); 332 } 333 334 return uconv_num; 335 } 336