1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright(c) 2001 Sun Microsystems, Inc. 23 * All rights reserved. 24 */ 25 26 #include <stdio.h> 27 #include <errno.h> 28 #include <stdlib.h> 29 #include <sys/types.h> 30 #include <sys/isa_defs.h> 31 #include <gb18030_unicode.h> /* GBK to Unicode mapping table */ 32 #include "common_defs.h" 33 34 #define MSB 0x80 /* most significant bit */ 35 #define ONEBYTE 0xff /* right most byte */ 36 #define GBK_LEN_MAX 4 37 38 #define INVALID_BYTE(v) ( (v) == 0x80 || (v) == 0xff ) 39 #define gbk4_2nd_byte(v) ( (v) >= 0x30 && (v) <= 0x39 ) 40 #define gbk4_3rd_byte(v) ( (v) >= 0x81 && (v) <= 0xfe ) 41 #define gbk4_4th_byte(v) gbk4_2nd_byte(v) 42 43 #define UTF8_NON_ID_CHAR1 0xEF /* non-identified character */ 44 #define UTF8_NON_ID_CHAR2 0xBF 45 #define UTF8_NON_ID_CHAR3 0xBD 46 47 #if defined UCS_2LE 48 #define output_char unichr_to_ucs_2le 49 #elif defined UCS_2BE 50 #define output_char unichr_to_ucs_2be 51 #elif defined UCS_4LE 52 #define output_char unichr_to_ucs_4le 53 #elif defined UCS_4BE 54 #define output_char unichr_to_ucs_4be 55 #else 56 #define output_char unichr_to_utf8 57 #endif 58 59 typedef struct _icv_state { 60 char keepc[GBK_LEN_MAX]; /* maximum # byte of GBK2K code */ 61 short cstate; /* state machine id */ 62 int _errno; /* internal errno */ 63 boolean bom_written; 64 } _iconv_st; 65 66 enum _CSTATE { C0, C1, C2, C3 }; 67 68 static unsigned long gbk_to_unicode (_iconv_st *); 69 70 static int binsearch(unsigned long x, table_t v[], int n); 71 static int gbk_2nd_byte(char inbuf); 72 73 #include "uni_common.c" 74 75 /* 76 * Open; called from iconv_open() 77 */ 78 void * 79 _icv_open() 80 { 81 _iconv_st *st; 82 83 if ((st = (_iconv_st *)malloc(sizeof(_iconv_st))) == NULL) { 84 errno = ENOMEM; 85 return ((void *) -1); 86 } 87 88 st->cstate = C0; 89 st->_errno = 0; 90 #if defined(UCS_2LE) || defined(UCS_2BE) || defined(UCS_4LE) || defined(UCS_4BE) 91 st->bom_written = true; 92 #else 93 st->bom_written = false; 94 #endif 95 return ((void *) st); 96 } 97 98 99 /* 100 * Close; called from iconv_close() 101 */ 102 void 103 _icv_close(_iconv_st *st) 104 { 105 if (!st) 106 errno = EBADF; 107 else 108 free(st); 109 } 110 111 112 /* 113 * Actual conversion; called from iconv() 114 */ 115 /*======================================================= 116 * 117 * State Machine for interpreting GBK code 118 * 119 *======================================================= 120 * 121 * 3rd C 122 * C2--------> C3 123 * ^ | 124 * 2nd C | 4th C | 125 * 1st C | | 126 * +--------> C0 ----------> C1 | 127 * | ascii | 2nd C | | 128 * ^ v v V 129 * +----<-----+-----<--------+-----<------+ 130 * 131 *=======================================================*/ 132 /* 133 * GBK2 encoding range (2 byte area): 134 * High byte: 0x81 - 0xFE ( 126 encoding space) 135 * Low byte: 0x40 - 0x7E, 0x80 - 0xFE ( 190 encoding space) 136 * Total: 126 * 190 = 23,940 (23940 encoding space) 137 * 138 * GBK4 encoding range (4 byte area): 139 * The First byte: 0x81 - 0xFE 140 * The Second byte: 0x30 - 0x39 141 * The Third byte: 0x81 - 0xFE 142 * The fourth byte: 0x30 - 0x39 143 */ 144 145 size_t 146 _icv_iconv(_iconv_st *st, char **inbuf, size_t *inbytesleft, 147 char **outbuf, size_t *outbytesleft) 148 { 149 int n; 150 int uconv_num = 0; 151 152 if (st == NULL) { 153 errno = EBADF; 154 return ((size_t) -1); 155 } 156 157 if (inbuf == NULL || *inbuf == NULL) { /* Reset request. */ 158 st->cstate = C0; 159 st->_errno = 0; 160 return ((size_t) 0); 161 } 162 163 st->_errno = 0; /* reset internal errno */ 164 errno = 0; /* reset external errno */ 165 166 /* a state machine for interpreting GBK code */ 167 while (*inbytesleft > 0 && *outbytesleft > 0) { 168 switch (st->cstate) { 169 case C0: /* assuming ASCII in the beginning */ 170 if (**inbuf & MSB) { 171 if ( INVALID_BYTE((unsigned char)**inbuf) ) { 172 st->_errno = errno = EILSEQ; 173 } else { 174 st->keepc[0] = (**inbuf); 175 st->cstate = C1; 176 } 177 } else { /* real ASCII */ 178 int uconv_num_internal = 0; 179 n = output_char (st, **inbuf, *outbuf, 180 *outbytesleft, &uconv_num_internal); 181 if (n > 0) { 182 (*outbuf) += n; 183 (*outbytesleft) -= n; 184 } 185 } 186 break; 187 case C1: /* GBK2 characters: 2nd byte */ 188 if (gbk_2nd_byte(**inbuf) == 0) { 189 int uconv_num_internal = 0; 190 191 st->keepc[1] = (**inbuf); 192 st->keepc[2] = st->keepc[3] = 0; 193 194 n = output_char (st, gbk_to_unicode (st), *outbuf, 195 *outbytesleft, &uconv_num_internal); 196 if (n > 0) { 197 (*outbuf) += n; 198 (*outbytesleft) -= n; 199 200 uconv_num += uconv_num_internal; 201 202 st->cstate = C0; 203 } else { /* don't reset state */ 204 st->_errno = errno = E2BIG; 205 } 206 207 } else if ( gbk4_2nd_byte((unsigned char)**inbuf) ) { 208 st->keepc[1] = **inbuf; 209 st->cstate = C2; 210 } else { /* input char doesn't belong 211 * to the input code set 212 */ 213 st->_errno = errno = EILSEQ; 214 } 215 break; 216 case C2: 217 if ( gbk4_3rd_byte((unsigned char)**inbuf) ) { 218 st->keepc[2] = **inbuf; 219 st->cstate = C3; 220 } else { 221 st->_errno = errno = EILSEQ; 222 } 223 break; 224 case C3: 225 if ( gbk4_4th_byte((unsigned char)**inbuf) ) { 226 int uconv_num_internal = 0; 227 228 st->keepc[3] = **inbuf; 229 230 n = output_char (st, gbk_to_unicode (st), *outbuf, 231 *outbytesleft, &uconv_num_internal); 232 233 if ( n > 0 ) { 234 (*outbuf) += n; 235 (*outbytesleft) -= n; 236 237 uconv_num += uconv_num_internal; 238 239 st->cstate = C0; 240 } else { 241 st->_errno = errno = E2BIG; 242 } 243 } else { 244 st->_errno = errno = EILSEQ; 245 } 246 break; 247 default: /* should never come here */ 248 st->_errno = errno = EILSEQ; 249 st->cstate = C0; /* reset state */ 250 break; 251 } 252 253 if (st->_errno) { 254 break; 255 } 256 257 (*inbuf)++; 258 (*inbytesleft)--; 259 } 260 261 if (*inbytesleft == 0 && st->cstate != C0) 262 errno = EINVAL; 263 264 if (*inbytesleft > 0 && *outbytesleft == 0) 265 errno = E2BIG; 266 267 if (errno) { 268 /* 269 * if error, *inbuf points to the byte following the last byte 270 * successfully used in the conversion. 271 */ 272 *inbuf -= (st->cstate - C0); 273 *inbytesleft += (st->cstate - C0); 274 st->cstate = C0; 275 return ((size_t) -1); 276 } 277 278 return uconv_num; 279 } 280 281 282 /* 283 * Test whether inbuf is a valid character for 2nd byte GBK code 284 * Return: = 0 - valid GBK2 2nd byte 285 * = 1 - invalid GBK2 2nd byte 286 */ 287 static int gbk_2nd_byte(char inbuf) 288 { 289 unsigned int buf = (unsigned int) (inbuf & ONEBYTE); 290 291 if ((buf >= 0x40) && (buf <= 0x7E)) 292 return (0); 293 if ((buf >= 0x80) && (buf <= 0xFE)) 294 return (0); 295 return(1); 296 } 297 298 static unsigned long gbk_to_unicode (st) 299 _iconv_st *st; 300 { 301 unsigned long gbk_val; /* GBK value */ 302 int unidx; /* Unicode index */ 303 unsigned long uni_val = 0xffffffff; /* Unicode */ 304 int isgbk4 = 1; 305 char *keepc = st->keepc; 306 307 if ( keepc[2] == 0 && keepc[3] == 0 ) 308 isgbk4 = 0; 309 310 if ( ! isgbk4 ) { 311 gbk_val = ((keepc[0]&ONEBYTE) << 8) + (keepc[1]&ONEBYTE); 312 } else { 313 int i; 314 315 gbk_val = keepc[0] & ONEBYTE; 316 for ( i = 1; i < GBK_LEN_MAX; ++i ) 317 gbk_val = (gbk_val << 8) + (keepc[i] & ONEBYTE); 318 } 319 320 if ( isgbk4 ) { 321 unidx = binsearch(gbk_val, gbk4_unicode_tab, GBK4MAX); 322 if ( unidx >= 0 ) uni_val = gbk4_unicode_tab[unidx].value; 323 } else { 324 unidx = binsearch(gbk_val, gbk_unicode_tab, GBKMAX); 325 if ( unidx >= 0 ) uni_val = gbk_unicode_tab[unidx].value; 326 } 327 328 return uni_val; 329 } 330 331 /* binsearch: find x in v[0] <= v[1] <= ... <= v[n-1] */ 332 static int binsearch(unsigned long x, table_t v[], int n) 333 { 334 int low, high, mid; 335 336 low = 0; 337 high = n - 1; 338 while (low <= high) { 339 mid = (high - low) / 2 + low; 340 if (x < v[mid].key) 341 high = mid - 1; 342 else if (x > v[mid].key) 343 low = mid + 1; 344 else /* found match */ 345 return mid; 346 } 347 return (-1); /* no match */ 348 } 349 350 /* 351 vi:ts=8:ai:expandtab 352 */ 353