1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright(c) 1998 Sun Microsystems, Inc.
23 * All rights reserved.
24 */
25 #include <stdio.h>
26 #include <stdlib.h>
27 #include <errno.h>
28 #include <sys/types.h>
29 #include <sys/isa_defs.h>
30 #include <gb2312_unicode.h>
31 #include "common_defs.h"
/* High bit of a byte: clear for ASCII, set on both bytes of an EUC pair. */
#define	MSB			0x80

/*
 * Three-byte UTF-8 sequence (0xEF 0xBF 0xBD, i.e. U+FFFD) emitted when an
 * input character has no usable mapping.
 */
#define	UTF8_NON_ID_CHAR1	0xEF
#define	UTF8_NON_ID_CHAR2	0xBF
#define	UTF8_NON_ID_CHAR3	0xBD

/* Inclusive range accepted for each byte of a two-byte EUC character. */
#define	EUC_BYTE1_LOWER		0xA1
#define	EUC_BYTE1_UPPER		0xFE
#define	EUC_BYTE2_LOWER		EUC_BYTE1_LOWER
#define	EUC_BYTE2_UPPER		EUC_BYTE1_UPPER

/* Shorthand used when a byte must be treated as unsigned. */
#define	UCHAR			unsigned char
44
45 typedef struct _icv_state {
46 char _lastc;
47 short _gstate;
48 boolean little_endian;
49 boolean bom_written;
50 } _iconv_st;
51
/*
 * Shift states of the converter.  The numeric distance from G0 is also
 * used as the number of input bytes to push back on error, so G1 must
 * stay exactly one above G0.
 */
enum _GSTATE {
	G0 = 0,		/* between characters */
	G1 = 1		/* first byte of a two-byte character consumed */
};
53
54 static int is_valid_gb2312(UCHAR, UCHAR);
55 int
56 gb_to_unicode(_iconv_st *st, char in_byte2, char *buf, int buflen, int *uconv_num);
57
58 /*
59 * Open; called from iconv_open()
60 */
61 void *
_icv_open()62 _icv_open()
63 {
64 _iconv_st *st;
65
66 if ((st = (_iconv_st *)malloc(sizeof(_iconv_st))) == NULL) {
67 errno = ENOMEM;
68 return ((void *) -1);
69 }
70
71 st->_gstate = G0;
72 st->little_endian = false;
73 st->bom_written = false;
74 #if defined(UCS_2LE)
75 st->little_endian = true;
76 st->bom_written = true;
77 #endif
78 return ((void *)st);
79 }
80
81
82 /*
83 * Close; called from iconv_close()
84 */
85 void
_icv_close(_iconv_st * st)86 _icv_close(_iconv_st *st)
87 {
88 if (st == NULL)
89 errno = EBADF;
90 else
91 free(st);
92 }
93
94
95 /*
96 * Actual conversion; called from iconv()
97 */
98 size_t
_icv_iconv(_iconv_st * st,char ** inbuf,size_t * inbytesleft,char ** outbuf,size_t * outbytesleft)99 _icv_iconv(_iconv_st *st, char **inbuf, size_t*inbytesleft,
100 char **outbuf, size_t*outbytesleft)
101 {
102 int n;
103 int uconv_num = 0;
104
105 if (st == NULL) {
106 errno = EBADF;
107 return (size_t)-1;
108 }
109 if (inbuf == NULL || *inbuf == NULL) { /* Reset request. */
110 st->_gstate = G0;
111 return (size_t)0;
112 }
113
114 errno = 0;
115
116 while (*inbytesleft > 0 && *outbytesleft > 0) {
117 switch (st->_gstate) {
118 case G0:
119 if ( **inbuf & MSB ) {
120 st->_lastc = **inbuf;
121 st->_gstate = G1;
122 } else { /* ASCII */
123 /*
124 * code conversion for UCS-2LE to support Samba
125 */
126 if (st->little_endian) {
127 if (!st->bom_written) {
128 if (*outbytesleft < 4)
129 errno = E2BIG;
130 else {
131 *(*outbuf)++ = (uchar_t)0xff;
132 *(*outbuf)++ = (uchar_t)0xfe;
133
134 st->bom_written = true;
135 *outbytesleft -= 2;
136 }
137 }
138
139 if (*outbytesleft < 2)
140 errno = E2BIG;
141 else {
142 *(*outbuf)++ = **inbuf;
143 *(*outbuf)++ = (uchar_t)0x0;
144 *outbytesleft -= 2;
145 }
146 } else {
147 **outbuf = **inbuf;
148 (*outbuf)++, (*outbytesleft)--;
149 }
150 }
151 break;
152 case G1:
153 if (**inbuf & MSB ) {
154 int uconv_num_internal = 0;
155
156 /* bugfix - 4669831 iconv from zh_CN.euc to UTF-8 dumps core on Intel. */
157 if ( !is_valid_gb2312((UCHAR)st->_lastc, (UCHAR)**inbuf))
158 {
159 errno = EILSEQ;
160 break;
161 }
162
163 n = gb_to_unicode(st, **inbuf, *outbuf,
164 *outbytesleft, &uconv_num_internal);
165 if (n > 0) {
166 (*outbuf) += n, (*outbytesleft) -= n;
167
168 uconv_num += uconv_num_internal;
169
170 st->_gstate = G0;
171 } else {
172 errno = E2BIG;
173 }
174 } else {
175 errno = EILSEQ;
176 }
177 break;
178 }
179
180 if (errno) break;
181
182 (*inbuf)++, (*inbytesleft)--;
183 }
184
185 if (*inbytesleft == 0 && st->_gstate != G0)
186 errno = EINVAL;
187
188 if (*inbytesleft > 0 && *outbytesleft == 0)
189 errno = E2BIG;
190
191 if (errno) {
192 /*
193 * if error, *inbuf points to the byte following the last byte
194 * successfully used in the conversion.
195 */
196 *inbuf -= (st->_gstate - G0);
197 *inbytesleft += (st->_gstate - G0);
198 st->_gstate = G0;
199 return ((size_t) -1);
200 }
201
202 return uconv_num;
203 }
204
205 static int
is_valid_gb2312(UCHAR byte1,UCHAR byte2)206 is_valid_gb2312(UCHAR byte1, UCHAR byte2)
207 {
208 if ( (byte1 < EUC_BYTE1_LOWER || byte1 > EUC_BYTE1_UPPER) ||
209 (byte2 < EUC_BYTE2_LOWER || byte2 > EUC_BYTE2_UPPER) ) {
210 return 0;
211 }
212
213 return 1;
214 }
215
216
217 /*
218 * return: > 0 - converted with enough space
219 * = 0 - no space in outbuf
220 */
221 int
gb_to_unicode(st,in_byte2,buf,buflen,uconv_num)222 gb_to_unicode(st, in_byte2, buf, buflen, uconv_num)
223 _iconv_st *st;
224 char in_byte2;
225 char *buf;
226 int buflen;
227 int *uconv_num;
228 {
229 int idx;
230 int unicode;
231 char in_byte1 = st->_lastc;
232
233 idx = (((in_byte1 & 0xff) - 0xa1) * 94) + (in_byte2 & 0xff) - 0xa1;
234 /*
235 * code conversion for UCS-2LE to support samba in Solaris
236 */
237 if (st->little_endian) {
238 int size = 0;
239
240 if (idx < 0 || idx >= GBMAX) {
241 unicode = ICV_CHAR_UCS2_REPLACEMENT;
242 *uconv_num = 1;
243 } else
244 unicode = Unicode[idx];
245
246 if (!st->bom_written) {
247 if (buflen < 4)
248 return 0;
249
250 *(buf + size++) = (uchar_t)0xff;
251 *(buf + size++) = (uchar_t)0xfe;
252 st->bom_written = true;
253 }
254
255 if (buflen < 2)
256 return 0;
257
258 *(buf + size++) = (uchar_t)(unicode & 0xff);
259 *(buf + size++) = (uchar_t)((unicode >> 8) & 0xff);
260
261 return size;
262 }
263
264 /* bugfix - 4669831 iconv from zh_CN.euc to UTF-8 dumps core on Intel. */
265 if (idx >= 0 && idx < GBMAX ) {
266 unicode = Unicode[idx];
267 if (unicode >= 0x0080 && unicode <= 0x07ff) {
268 if ( buflen < 2 )
269 return 0;
270 *buf = ((unicode >> 6) & 0x1f) | 0xc0;
271 *(buf+1) = (unicode & 0x3f) | MSB;
272 return 2;
273 }
274 if (unicode >= 0x0800 && unicode <= 0xffff) {
275 if ( buflen < 3 )
276 return 0;
277 *buf = ((unicode >> 12) & 0x0f) | 0xe0;
278 *(buf+1) = ((unicode >> 6) & 0x3f) | MSB;
279 *(buf+2) = (unicode & 0x3f) | MSB;
280 return 3;
281 }
282 }
283 if ( buflen < 3 )
284 return 0;
285
286 *buf = UTF8_NON_ID_CHAR1;
287 *(buf+1) = UTF8_NON_ID_CHAR2;
288 *(buf+2) = UTF8_NON_ID_CHAR3;
289
290 /* non-identical conversion */
291 *uconv_num = 1;
292
293 return 3;
294 }
295