1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright(c) 1998 Sun Microsystems, Inc.
23 */
24
25 #include <stdio.h>
26 #include <errno.h>
27 #include <stdlib.h>
28 #include <sys/types.h>
29 #include <unicode_gb2312.h>
30 #include "common_defs.h"
31
/* Control bytes written to the output by _icv_iconv(). */
32 #define SI 0x0f /* written before an ASCII byte while _istate is OUT */
33 #define SO 0x0e /* written before a multi-byte character while _istate is IN */
34 #define ESC 0x1b /* first byte of the ESC $ ) A sequence written while _gstate is G0 */
35 #define MSB 0x80 /* high bit; clear for ASCII, also used to recognise 10xxxxxx bytes */
36
/* Substitute byte written (as a pair) for characters that cannot be mapped. */
37 #define NON_ID_CHAR '?'
38
/* Per-descriptor conversion state; allocated by _icv_open(). */
39 typedef struct _icv_state {
40 short _ustate; /* position inside the current UTF-8 sequence (enum _USTATE) */
41 short _istate; /* output shift state (enum _ISTATE) */
42 short _gstate; /* whether ESC $ ) A has been written (enum _GSTATE) */
43 char _cbuf[3]; /* bytes of the UTF-8 sequence collected so far */
44 } _iconv_st;
45
/*
 * UTF-8 decoder states:
 *   U0          expecting the first byte of a character
 *   U1          expecting the 2nd byte of a two-byte sequence
 *   U2, U3      expecting the 2nd / 3rd byte of a three-byte sequence
 *   U4, U5, U6  expecting the 2nd / 3rd / 4th byte of a four-byte sequence
 */
46 enum _USTATE { U0, U1, U2, U3, U4, U5, U6 };
/* IN: initial state, ASCII output; OUT: SO has been written, SI not yet. */
47 enum _ISTATE { IN, OUT };
/* G0: ESC $ ) A not yet written since open/reset; G1: already written. */
48 enum _GSTATE { G0, G1 };
49
/* Table lookup helper used by _icv_iconv(); defined at the end of this file. */
50 int unicode_to_iso(char in_byte1, char in_byte2, char *buf, int buflen);
51
52 /*
53 * Open; called from iconv_open()
54 */
55 void *
_icv_open()56 _icv_open()
57 {
58 _iconv_st *st;
59
60 if ((st = (_iconv_st *)malloc(sizeof(_iconv_st))) == NULL) {
61 errno = ENOMEM;
62 return ((void *) -1);
63 }
64
65 st->_ustate = U0;
66 st->_istate = IN;
67 st->_gstate = G0;
68
69 return ((void *)st);
70 }
71
72
73 /*
74 * Close; called from iconv_close()
75 */
76 void
_icv_close(_iconv_st * st)77 _icv_close(_iconv_st *st)
78 {
79 if (st == NULL)
80 errno = EBADF;
81 else
82 free(st);
83 }
84
85
86 /*
87 * Actual conversion; called from iconv()
88 */
89 size_t
_icv_iconv(_iconv_st * st,char ** inbuf,size_t * inbytesleft,char ** outbuf,size_t * outbytesleft)90 _icv_iconv(_iconv_st *st, char **inbuf, size_t*inbytesleft,
91 char **outbuf, size_t*outbytesleft)
92 {
93 char c1, c2;
94 int n;
95
96 if (st == NULL) {
97 errno = EBADF;
98 return ((size_t)-1);
99 }
100
101 if (inbuf == NULL || *inbuf == NULL) { /* Reset request. */
102 st->_ustate = U0;
103 st->_istate = IN;
104 st->_gstate = G0;
105 return ((size_t)0);
106 }
107
108 errno = 0;
109
110 while (*inbytesleft > 0 && *outbytesleft > 0) {
111
112 uchar_t first_byte;
113
114 switch (st->_ustate) {
115 case U0:
116 if ((**inbuf & MSB) == 0) { /* ASCII */
117 if (st->_istate == OUT) {
118 st->_istate = IN;
119 **outbuf = SI;
120 (*outbuf)++, (*outbytesleft)--;
121 if (*outbytesleft <= 0) {
122 errno = E2BIG;
123 return ((size_t)-1);
124 }
125 }
126 **outbuf = **inbuf;
127 (*outbuf)++, (*outbytesleft)--;
128 } else {
129 if ((**inbuf & 0xe0) == 0xc0) { /* 0xc2..0xdf */
130
131 /* invalid sequence if the first char is either 0xc0 or 0xc1 */
132 if ( number_of_bytes_in_utf8_char[((uchar_t)**inbuf)] == ICV_TYPE_ILLEGAL_CHAR )
133 {
134 errno = EILSEQ;
135 break;
136 }
137 else
138 {
139 st->_ustate = U1;
140 st->_cbuf[0] = **inbuf;
141 }
142 } else if ((**inbuf & 0xf0) == 0xe0) { /* 0xe0..0xef */
143 st->_ustate = U2;
144 st->_cbuf[0] = **inbuf;
145 } else {
146 /* four bytes of UTF-8 sequences */
147 if ( number_of_bytes_in_utf8_char[((uchar_t)**inbuf)] == ICV_TYPE_ILLEGAL_CHAR )
148 {
149 errno = EILSEQ;
150 break;
151 }
152 else {
153 st->_ustate = U4;
154 st->_cbuf[0] = **inbuf;
155 }
156 }
157 if (st->_istate == IN) {
158 if (st->_gstate == G0) {
159 if (*outbytesleft < 4) {
160 errno = E2BIG;
161 return ((size_t)-1);
162 }
163 st->_gstate = G1;
164 **outbuf = ESC;
165 *(*outbuf+1) = '$';
166 *(*outbuf+2) = ')';
167 *(*outbuf+3) = 'A';
168 (*outbuf) += 4, (*outbytesleft) -= 4;
169 if (*outbytesleft <= 0) {
170 errno = E2BIG;
171 return ((size_t)-1);
172 }
173 }
174 st->_istate = OUT;
175 **outbuf = SO;
176 (*outbuf)++, (*outbytesleft)--;
177 }
178 }
179 break;
180 case U1:
181 if ((**inbuf & 0xc0) == MSB) { /* two-byte UTF */
182 c1 = (st->_cbuf[0]&0x1c)>>2;
183 c2 = ((st->_cbuf[0]&0x03)<<6) | ((**inbuf)&0x3f);
184 n = unicode_to_iso(c1, c2, *outbuf, *outbytesleft);
185 if (n > 0) {
186 (*outbuf) += n, (*outbytesleft) -= n;
187 } else {
188 errno = E2BIG;
189 return ((size_t)-1);
190 }
191 st->_ustate = U0;
192 } else {
193 errno = EILSEQ;
194 }
195 break;
196 case U2:
197
198 first_byte = st->_cbuf[0];
199
200 /* if the first byte is 0xed, it is illegal sequence if the second
201 * one is one between 0xa0 and 0xbf because surrogate section is ill-formed
202 */
203 if (((uchar_t)**inbuf) < valid_min_2nd_byte[first_byte] ||
204 ((uchar_t)**inbuf) > valid_max_2nd_byte[first_byte] )
205 errno = EILSEQ;
206 else {
207 st->_ustate = U3;
208 st->_cbuf[1] = **inbuf;
209 }
210 break;
211 case U3:
212 if ((**inbuf & 0xc0) == MSB) { /* three-byte UTF */
213 c1 = ((st->_cbuf[0]&0x0f)<<4) | ((st->_cbuf[1]&0x3c)>>2);
214 c2 = ((st->_cbuf[1]&0x03)<<6) | ((**inbuf)&0x3f);
215 n = unicode_to_iso(c1, c2, *outbuf, *outbytesleft);
216 if (n > 0) {
217 (*outbuf) += n, (*outbytesleft) -= n;
218 } else if ( n == -1 ) {
219 errno = EILSEQ; /* unicode is either 0xfffe or 0xffff */
220 } else {
221 errno = E2BIG;
222 return ((size_t)-1);
223 }
224 st->_ustate = U0;
225 } else {
226 errno = EILSEQ;
227 }
228 break;
229 case U4:
230 first_byte = st->_cbuf[0];
231
232 /* if the first byte is 0xf0, it is illegal sequence if
233 * the second one is between 0x80 and 0x8f
234 * for Four-Byte UTF: U+10000..U+10FFFF
235 */
236 if (((uchar_t)**inbuf) < valid_min_2nd_byte[first_byte] ||
237 ((uchar_t)**inbuf) > valid_max_2nd_byte[first_byte] )
238 errno = EILSEQ;
239 else {
240 st->_ustate = U5;
241 st->_cbuf[1] = **inbuf;
242 }
243 break;
244 case U5:
245 if ((**inbuf & 0xc0) == MSB) /* 0x80..0xbf */
246 {
247 st->_ustate = U6;
248 st->_cbuf[2] = **inbuf;
249 }
250 else
251 errno = EILSEQ;
252 break;
253 case U6:
254 if ((**inbuf & 0xc0) == MSB) /* 0x80..0xbf */
255 {
256 /* replace with double NON_ID_CHARs */
257 if ( *outbytesleft < 2 )
258 errno = E2BIG;
259 else
260 {
261 **outbuf = NON_ID_CHAR;
262 *(*outbuf+1) = NON_ID_CHAR;
263 (*outbytesleft) -= 2;
264
265 st->_ustate = U0;
266 }
267 }
268 else
269 errno = EILSEQ;
270 break;
271 }
272
273 if (errno)
274 return ((size_t)-1);
275
276 (*inbuf)++; (*inbytesleft)--;
277 }
278
279 if (*inbytesleft == 0 && st->_ustate != U0) {
280 errno = EINVAL;
281 return ((size_t) -1);
282 }
283
284 if (*inbytesleft > 0 && *outbytesleft == 0) {
285 errno = E2BIG;
286 return ((size_t)-1);
287 }
288 return ((size_t)(*inbytesleft));
289 }
290
291
unicode_to_iso(in_byte1,in_byte2,buf,buflen)292 int unicode_to_iso(in_byte1, in_byte2, buf, buflen)
293 char in_byte1, in_byte2;
294 char *buf;
295 int buflen;
296 {
297 int gb, unicode;
298 int i, l, h;
299
300 if (buflen < 2)
301 return 0;
302 unicode = ((in_byte1 & 0xff) << 8) + (in_byte2 & 0xff);
303
304 /* 0xfffe and 0xffff should not be allowed */
305 if ( unicode == 0xFFFE || unicode == 0xFFFF ) return -1;
306
307 for (l = 0, h = UNICODEMAX; l < h; ) {
308 if (unicode_gb_tab[l].key == unicode) {
309 i = l;
310 break;
311 }
312 if (unicode_gb_tab[h].key == unicode) {
313 i = h;
314 break;
315 }
316 i = (l + h) / 2;
317 if (unicode_gb_tab[i].key == unicode)
318 break;
319 if (unicode_gb_tab[i].key < unicode)
320 l = i + 1;
321 else h = i - 1;
322 }
323 if (unicode == unicode_gb_tab[i].key) {
324 gb = unicode_gb_tab[i].value;
325 *buf = (gb & 0xff00) >> 8;
326 *(buf+1) = gb & 0xff;
327 } else {
328 *buf = *(buf+1) = NON_ID_CHAR;
329 }
330 return 2;
331 }
332