1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 1994 by Sun Microsystems, Inc. 23 */ 24 25 26 #include <stdlib.h> 27 #include <errno.h> 28 #include "hangulcode.h" 29 #include "ktable.h" 30 #include "utf_johap92.h" 31 #include "common_defs.h" 32 33 #define MSB 0x80 /* mask for most-significant-bit */ 34 typedef enum _USTATE {U0 = 0, U1, U2, U3, U4, U5, U6,UX} USTATE; 35 36 typedef struct _icv_state { 37 unsigned char _buffer[6]; 38 USTATE _ustate; 39 unsigned short _count; 40 int _errno; 41 } _iconv_st; 42 43 /**** _ I C V _ O P E N ****/ 44 45 void* _icv_open() 46 { 47 _iconv_st *st; 48 if((st = (_iconv_st *) malloc(sizeof(_iconv_st))) == NULL){ 49 errno = ENOMEM; 50 return ((void *) -1); 51 } 52 st->_ustate = U0; 53 st->_errno = 0; 54 st->_count = 0; 55 /* 56 RESET_CONV_DESC(); 57 */ 58 return ((void *) st); 59 } /* end of int _icv_open(). */ 60 61 62 /**** _ I C V _ C L O S E ****/ 63 64 void _icv_close(_iconv_st* st) 65 { 66 if(!st) 67 errno = EBADF; 68 else 69 free(st); 70 } /* end of void _icv_close(int*). */ 71 72 73 /**** _ I C V _ I C O N V ****/ 74 75 size_t _icv_iconv(_iconv_st* st, char** inbuf, size_t* inbufleft, 76 char** outbuf, size_t* outbufleft) 77 { 78 size_t ret_val = 0; 79 unsigned char* ib; 80 unsigned char* ob; 81 unsigned char* ibtail; 82 unsigned char* obtail; 83 84 hcode_type utf8_code, johap92_code; 85 86 if(st == NULL){ 87 errno = EBADF; 88 return ((size_t) -1); 89 } 90 91 if (!inbuf || !(*inbuf)){ 92 st->_ustate = U0; 93 st->_errno = 0; 94 return((size_t)0); 95 } 96 97 st->_errno = 0; 98 errno = 0; 99 100 ib = (unsigned char*)*inbuf; 101 ob = (unsigned char*)*outbuf; 102 ibtail = ib + *inbufleft; 103 obtail = ob + *outbufleft; 104 105 106 while (ib < ibtail) 107 { 108 unsigned char first_byte; 109 switch(st->_ustate){ 110 case U0: /* begining of new utf-8 char sequence */ 111 if((*ib & MSB) == 0){ /* MSB is off, so ASCII */ 112 if(ob >= obtail){ 113 errno = E2BIG; 114 ret_val = (size_t) -1; 115 break; 116 } 117 *ob++ = *ib++; 118 119 } else { /* Now, begining of UTF-8 */ 120 if((*ib & 0xe0) == 0xc0){ 121 /* 2-byte utf-8 */ 122 /* true if *ib is (0xc0 ~ 0xdf) */ 123 /* but, need to filter out the range */ 124 /* 0xc0 ~ 0xc1 */ 125 126 if(number_of_bytes_in_utf8_char[(unsigned char) *ib] == 127 ICV_TYPE_ILLEGAL_CHAR) 128 st->_errno = errno = EILSEQ; 129 else { 130 st->_ustate = U1; 131 st->_buffer[0] = *ib; 132 } 133 } else if((*ib & 0xf0) == 0xe0){ 134 /* 3 byte utf-8 */ 135 /* if *ib is (0xe0 ~ 0xef) */ 136 st->_ustate = U2; 137 st->_buffer[0] = *ib; 138 } else { 139 /* 4 byte utf-8 */ 140 /* true if *ib is (0xf0 ~ 0xff) */ 141 /* but, need to screen out the range */ 142 /* 0xf5 ~ 0xff */ 143 if(number_of_bytes_in_utf8_char[(unsigned char) *ib] == 144 ICV_TYPE_ILLEGAL_CHAR) 145 st->_errno = errno = EILSEQ; 146 else { 147 st->_ustate = U4; 148 st->_buffer[0] = *ib; 149 150 } 151 } 152 st->_count++; 153 ib++; 154 } 155 break; 156 case U1: /* we are getting 2nd byte of 2byte utf-8 */ 157 /* convert it right here */ 158 if((*ib & 0xc0) == MSB){ 159 st->_ustate = UX; 160 st->_buffer[1] = *ib; 161 st->_count++; 162 continue;/* Now, we gotta do the real conversion*/ 163 /* becuase we just came to an the last */ 164 /* byte of utf-8 character */ 165 } else { 166 ib++; 167 st->_errno = errno = EILSEQ; 168 ret_val = (size_t) -1; 169 break; 170 } 171 break; 172 case U2: /* 2nd byte of 3byte utf-8 */ 173 first_byte = (unsigned char) st->_buffer[0]; 174 /* basic utf-8 validity check first... */ 175 if((*ib & 0xc0) == MSB){ 176 /* if okay, then what about the range of this byte? */ 177 /* if the first byte is 0xed, it is illegal sequence */ 178 /* if the second one is between 0xa0 and 0xbf */ 179 /* because surrogate section is ill-formed */ 180 181 if((unsigned char)*ib < valid_min_2nd_byte[first_byte] || 182 (unsigned char)*ib > valid_max_2nd_byte[first_byte]){ 183 st->_errno = errno = EILSEQ; 184 } else { 185 st->_ustate = U3; 186 st->_buffer[1] = *ib; 187 st->_count++; 188 } 189 190 } else { 191 st->_errno = errno = EILSEQ; 192 } 193 ib++; 194 break; 195 case U3: /* 3rd byte of 3byte utf-8 */ 196 if((*ib & 0xc0) == MSB){ 197 st->_ustate = UX; 198 st->_buffer[2] = *ib; 199 st->_count++; 200 continue;/* Now, we gotta do the real conversion*/ 201 /* becuase we just came to an the last */ 202 /* byte of utf-8 character */ 203 } else { 204 st->_errno = errno = EILSEQ; 205 ret_val = (size_t) -1; 206 ib++; 207 break; 208 } 209 break; 210 case U4: /* 2nd byte of 4byte utf-8 */ 211 first_byte = st->_buffer[0]; 212 if((*ib & 0xc0) == MSB){ 213 if((unsigned char)*ib < valid_min_2nd_byte[first_byte] || 214 (unsigned char)*ib > valid_max_2nd_byte[first_byte]){ 215 st->_errno = errno = EILSEQ; 216 } else { 217 st->_ustate = U5; 218 st->_buffer[1] = *ib; 219 st->_count++; 220 } 221 } else { 222 st->_errno = errno = EILSEQ; 223 } 224 ib++; 225 break; 226 case U5: /* 3rd byte of 4byte utf-8 */ 227 if((*ib & 0xc0) == MSB){ 228 st->_ustate = U6; 229 st->_buffer[2] = *ib; 230 st->_count++; 231 } else { 232 st->_errno = errno = EILSEQ; 233 } 234 ib++; 235 break; 236 case U6: /* 4th byte of 4byte utf-8 */ 237 if((*ib & 0xc0) == MSB){ 238 if((obtail - ob) < 2){ 239 st->_errno = errno = E2BIG; 240 } else { 241 *ob++ = NON_ID_CHAR; 242 *ob++ = NON_ID_CHAR; 243 st->_ustate = U0; 244 } 245 } else { 246 st->_errno = errno = EILSEQ; 247 } 248 ib++; 249 break; 250 case UX: 251 /******************************************************* 252 * convert valid utf-8 sequence gathered in the 253 * st->_buffer to euc 254 *******************************************************/ 255 utf8_code.code = 0; 256 switch(st->_count){ 257 case 2: /* 2byte utf-8 code */ 258 utf8_code.byte.byte3 = st->_buffer[0]; 259 utf8_code.byte.byte4 = st->_buffer[1]; 260 break; 261 case 3: /* 3byte utf-8 code */ 262 utf8_code.byte.byte2 = st->_buffer[0]; 263 utf8_code.byte.byte3 = st->_buffer[1]; 264 utf8_code.byte.byte4 = st->_buffer[2]; 265 break; 266 } 267 unsigned short _utf8_to_jahap92(utf_code.code) 268 269 if (euc_code.code != 0) { 270 /* If find something -> EUC code */ 271 *ob++ = euc_code.byte.byte3; 272 *ob++ = euc_code.byte.byte4; 273 } 274 else 275 { 276 /* Let's assume the code is not identifiable */ 277 if ((obtail - ob) < 2) 278 { 279 errno = E2BIG; 280 ret_val = (size_t)-1; 281 } 282 *ob++ = NON_IDENTICAL; 283 *ob++ = NON_IDENTICAL; 284 ret_val += 2; 285 } 286 st->_ustate = U0; 287 st->_count = 0; 288 ib++; 289 break; 290 default: /* You are not supposed to get here... */ 291 /* But, just only for the integrity */ 292 st->_errno = errno = EILSEQ; 293 st->_ustate = U0; 294 st->_count = 0; 295 break; 296 297 } 298 if(st->_errno){ 299 #ifdef DEBUG 300 fprintf(stderr, "st->_errno=%d\tst->_ustate=%d\n", st->_errno, st->_ustate); 301 #endif /* DEBUG */ 302 break; 303 } 304 305 } 306 if(errno) return ((size_t) -1); 307 308 *inbuf = (char*)ib; 309 *inbufleft = ibtail - ib; 310 *outbuf = (char*)ob; 311 *outbufleft = obtail - ob; 312 313 return(ret_val); 314 } /* end of size_t _icv_iconv(int*, char**, size_t*, char**, size_t*).*/ 315 316 317 318 319 320 321 322 323 324 unsigned short _utf8_to_jahap92(unsigned long utf_code) 325 { 326 int low, mid, high; 327 low = 0, high = MAX_U2J92_NUM; 328 while(low < high){ 329 mid = (low + high)/2; 330 if(utf8_to_johap92_tbl[mid].utf8 = utf_code){ 331 break; 332 } else if(utf8_to_johap92_tbl[mid].utf8 > utf_code){ 333 high = mid - 1; 334 } else if(utf8_to_johap92_tbl[mid].utf8 < utf_code){ 335 low = mid + 1; 336 } 337 } 338 } 339