1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright(c) 1998 Sun Microsystems, Inc. 23 * All rights reserved. 24 */ 25 #include <stdio.h> 26 #include <stdlib.h> 27 #include <errno.h> 28 #include <sys/types.h> 29 #include <sys/isa_defs.h> 30 #include <gb2312_unicode.h> 31 #include "common_defs.h" 32 #define MSB 0x80 33 34 #define UTF8_NON_ID_CHAR1 0xEF 35 #define UTF8_NON_ID_CHAR2 0xBF 36 #define UTF8_NON_ID_CHAR3 0xBD 37 38 #define EUC_BYTE1_LOWER 0xA1 39 #define EUC_BYTE1_UPPER 0xFE 40 #define EUC_BYTE2_LOWER EUC_BYTE1_LOWER 41 #define EUC_BYTE2_UPPER EUC_BYTE1_UPPER 42 43 #define UCHAR unsigned char 44 45 typedef struct _icv_state { 46 char _lastc; 47 short _gstate; 48 boolean little_endian; 49 boolean bom_written; 50 } _iconv_st; 51 52 enum _GSTATE { G0, G1 }; 53 54 static int is_valid_gb2312(UCHAR, UCHAR); 55 int 56 gb_to_unicode(_iconv_st *st, char in_byte2, char *buf, int buflen, int *uconv_num); 57 58 /* 59 * Open; called from iconv_open() 60 */ 61 void * 62 _icv_open() 63 { 64 _iconv_st *st; 65 66 if ((st = (_iconv_st *)malloc(sizeof(_iconv_st))) == NULL) { 67 errno = ENOMEM; 68 return ((void *) -1); 69 } 70 71 st->_gstate = G0; 72 st->little_endian = false; 73 st->bom_written = false; 74 #if defined(UCS_2LE) 75 st->little_endian = true; 76 st->bom_written = true; 77 #endif 78 return ((void *)st); 79 } 80 81 82 /* 83 * Close; called from iconv_close() 84 */ 85 void 86 _icv_close(_iconv_st *st) 87 { 88 if (st == NULL) 89 errno = EBADF; 90 else 91 free(st); 92 } 93 94 95 /* 96 * Actual conversion; called from iconv() 97 */ 98 size_t 99 _icv_iconv(_iconv_st *st, char **inbuf, size_t*inbytesleft, 100 char **outbuf, size_t*outbytesleft) 101 { 102 int n; 103 int uconv_num = 0; 104 105 if (st == NULL) { 106 errno = EBADF; 107 return (size_t)-1; 108 } 109 if (inbuf == NULL || *inbuf == NULL) { /* Reset request. */ 110 st->_gstate = G0; 111 return (size_t)0; 112 } 113 114 errno = 0; 115 116 while (*inbytesleft > 0 && *outbytesleft > 0) { 117 switch (st->_gstate) { 118 case G0: 119 if ( **inbuf & MSB ) { 120 st->_lastc = **inbuf; 121 st->_gstate = G1; 122 } else { /* ASCII */ 123 /* 124 * code conversion for UCS-2LE to support Samba 125 */ 126 if (st->little_endian) { 127 if (!st->bom_written) { 128 if (*outbytesleft < 4) 129 errno = E2BIG; 130 else { 131 *(*outbuf)++ = (uchar_t)0xff; 132 *(*outbuf)++ = (uchar_t)0xfe; 133 134 st->bom_written = true; 135 *outbytesleft -= 2; 136 } 137 } 138 139 if (*outbytesleft < 2) 140 errno = E2BIG; 141 else { 142 *(*outbuf)++ = **inbuf; 143 *(*outbuf)++ = (uchar_t)0x0; 144 *outbytesleft -= 2; 145 } 146 } else { 147 **outbuf = **inbuf; 148 (*outbuf)++, (*outbytesleft)--; 149 } 150 } 151 break; 152 case G1: 153 if (**inbuf & MSB ) { 154 int uconv_num_internal = 0; 155 156 /* bugfix - 4669831 iconv from zh_CN.euc to UTF-8 dumps core on Intel. */ 157 if ( !is_valid_gb2312((UCHAR)st->_lastc, (UCHAR)**inbuf)) 158 { 159 errno = EILSEQ; 160 break; 161 } 162 163 n = gb_to_unicode(st, **inbuf, *outbuf, 164 *outbytesleft, &uconv_num_internal); 165 if (n > 0) { 166 (*outbuf) += n, (*outbytesleft) -= n; 167 168 uconv_num += uconv_num_internal; 169 170 st->_gstate = G0; 171 } else { 172 errno = E2BIG; 173 } 174 } else { 175 errno = EILSEQ; 176 } 177 break; 178 } 179 180 if (errno) break; 181 182 (*inbuf)++, (*inbytesleft)--; 183 } 184 185 if (*inbytesleft == 0 && st->_gstate != G0) 186 errno = EINVAL; 187 188 if (*inbytesleft > 0 && *outbytesleft == 0) 189 errno = E2BIG; 190 191 if (errno) { 192 /* 193 * if error, *inbuf points to the byte following the last byte 194 * successfully used in the conversion. 195 */ 196 *inbuf -= (st->_gstate - G0); 197 *inbytesleft += (st->_gstate - G0); 198 st->_gstate = G0; 199 return ((size_t) -1); 200 } 201 202 return uconv_num; 203 } 204 205 static int 206 is_valid_gb2312(UCHAR byte1, UCHAR byte2) 207 { 208 if ( (byte1 < EUC_BYTE1_LOWER || byte1 > EUC_BYTE1_UPPER) || 209 (byte2 < EUC_BYTE2_LOWER || byte2 > EUC_BYTE2_UPPER) ) { 210 return 0; 211 } 212 213 return 1; 214 } 215 216 217 /* 218 * return: > 0 - converted with enough space 219 * = 0 - no space in outbuf 220 */ 221 int 222 gb_to_unicode(st, in_byte2, buf, buflen, uconv_num) 223 _iconv_st *st; 224 char in_byte2; 225 char *buf; 226 int buflen; 227 int *uconv_num; 228 { 229 int idx; 230 int unicode; 231 char in_byte1 = st->_lastc; 232 233 idx = (((in_byte1 & 0xff) - 0xa1) * 94) + (in_byte2 & 0xff) - 0xa1; 234 /* 235 * code conversion for UCS-2LE to support samba in Solaris 236 */ 237 if (st->little_endian) { 238 int size = 0; 239 240 if (idx < 0 || idx >= GBMAX) { 241 unicode = ICV_CHAR_UCS2_REPLACEMENT; 242 *uconv_num = 1; 243 } else 244 unicode = Unicode[idx]; 245 246 if (!st->bom_written) { 247 if (buflen < 4) 248 return 0; 249 250 *(buf + size++) = (uchar_t)0xff; 251 *(buf + size++) = (uchar_t)0xfe; 252 st->bom_written = true; 253 } 254 255 if (buflen < 2) 256 return 0; 257 258 *(buf + size++) = (uchar_t)(unicode & 0xff); 259 *(buf + size++) = (uchar_t)((unicode >> 8) & 0xff); 260 261 return size; 262 } 263 264 /* bugfix - 4669831 iconv from zh_CN.euc to UTF-8 dumps core on Intel. */ 265 if (idx >= 0 && idx < GBMAX ) { 266 unicode = Unicode[idx]; 267 if (unicode >= 0x0080 && unicode <= 0x07ff) { 268 if ( buflen < 2 ) 269 return 0; 270 *buf = ((unicode >> 6) & 0x1f) | 0xc0; 271 *(buf+1) = (unicode & 0x3f) | MSB; 272 return 2; 273 } 274 if (unicode >= 0x0800 && unicode <= 0xffff) { 275 if ( buflen < 3 ) 276 return 0; 277 *buf = ((unicode >> 12) & 0x0f) | 0xe0; 278 *(buf+1) = ((unicode >> 6) & 0x3f) | MSB; 279 *(buf+2) = (unicode & 0x3f) | MSB; 280 return 3; 281 } 282 } 283 if ( buflen < 3 ) 284 return 0; 285 286 *buf = UTF8_NON_ID_CHAR1; 287 *(buf+1) = UTF8_NON_ID_CHAR2; 288 *(buf+2) = UTF8_NON_ID_CHAR3; 289 290 /* non-identical conversion */ 291 *uconv_num = 1; 292 293 return 3; 294 } 295