1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright(c) 1998 Sun Microsystems, Inc.
23 * All rights reserved.
24 */
25
26 #include <stdio.h>
27 #include <errno.h>
28 #include <stdlib.h>
29 #include <sys/types.h>
30 #include "unicode_gb2312.h"
31 #include "common_defs.h"
32
33 #define MSB 0x80
34 #define NON_ID_CHAR '?'
35
/*
 * Per-descriptor conversion state, allocated by _icv_open() and released
 * by _icv_close().
 */
struct _icv_state {
	short	_ustate;	/* current UTF-8 decoder state (enum _USTATE) */
	short	saved_ustate;	/* last state recorded by the converter; */
				/* consulted when choosing "~{" / "~}" */
	char	_cbuf[3];	/* leading bytes of the UTF-8 sequence in progress */
};

typedef struct _icv_state _iconv_st;
41
/* States of the UTF-8 decoder driven by _icv_iconv(). */
enum _USTATE {
	U0,	/* expecting the first byte of a character */
	U1,	/* expecting the 2nd (last) byte of a two-byte sequence */
	U2,	/* expecting the 2nd byte of a three-byte sequence */
	U3,	/* expecting the 3rd (last) byte of a three-byte sequence */
	U4,	/* expecting the 2nd byte of a four-byte sequence */
	U5,	/* expecting the 3rd byte of a four-byte sequence */
	U6	/* expecting the 4th (last) byte of a four-byte sequence */
};
43
/*
 * Forward declaration; the definition is at the end of this file.
 * Takes the high and low byte of a 16-bit Unicode value and writes a
 * two-byte result into buf (buflen is the space available in buf).
 */
int unicode_to_gb_to_hz(char in_byte1, char in_byte2, char *buf, int buflen);
45
46 /*
47 * Open; called from iconv_open()
48 */
49 void *
_icv_open()50 _icv_open()
51 {
52 _iconv_st *st;
53
54 if ((st = (_iconv_st *)malloc(sizeof(_iconv_st))) == NULL) {
55 errno = ENOMEM;
56 return ((void *) -1);
57 }
58
59 st->_ustate = U0;
60 st->saved_ustate = U0;
61 return ((void *)st);
62 }
63
64
65 /*
66 * Close; called from iconv_close()
67 */
68 void
_icv_close(_iconv_st * st)69 _icv_close(_iconv_st *st)
70 {
71 if (st == NULL)
72 errno = EBADF;
73 else
74 free(st);
75 }
76
77
78 /*
79 * Actual conversion; called from iconv()
80 */
81 size_t
_icv_iconv(_iconv_st * st,char ** inbuf,size_t * inbytesleft,char ** outbuf,size_t * outbytesleft)82 _icv_iconv(_iconv_st *st, char **inbuf, size_t*inbytesleft,
83 char **outbuf, size_t*outbytesleft)
84 {
85 char c1, c2;
86 int n;
87
88 if (st == NULL) {
89 errno = EBADF;
90 return ((size_t)-1);
91 }
92
93 if (inbuf == NULL || *inbuf == NULL) { /* Reset request. */
94 st->_ustate = U0;
95 return ((size_t)0);
96 }
97
98 errno = 0;
99 while (*inbytesleft > 0 && *outbytesleft > 0) {
100
101 uchar_t first_byte;
102
103 switch (st->_ustate) {
104 case U0:
105 if (**inbuf & MSB && st->saved_ustate ==U0) {
106 if(*outbytesleft >=2) {
107 **outbuf = '~';
108 *(*outbuf+1) = '{';
109 (*outbuf) += 2, (*outbytesleft) -= 2;
110 } else {
111 errno = E2BIG;
112 return (size_t)-1;
113 }
114 }
115 if ((**inbuf & MSB) == 0) { /* ASCII */
116 if (st->saved_ustate == U1 || st->saved_ustate == U3)
117 {
118 if(*outbytesleft >=2) {
119 **outbuf = '~';
120 *(*outbuf+1) = '}';
121 (*outbuf) += 2, (*outbytesleft) -= 2;
122 }else {
123 errno = E2BIG;
124 return (size_t)-1;
125 }
126 }
127 st->saved_ustate = U0;
128 if(*outbytesleft >=1) {
129 **outbuf = **inbuf;
130 (*outbuf)++; (*outbytesleft)--;
131 }else {
132 errno = E2BIG;
133 return (size_t)-1;
134 }
135 if (**inbuf == '~') {
136 if(*outbytesleft >=1) {
137 **outbuf = '~';
138 (*outbuf)++, (*outbytesleft)--;
139 }else {
140 errno = E2BIG;
141 return (size_t)-1;
142 }
143 }
144 } else if ((**inbuf & 0xe0) == 0xc0) { /* 0xc2..0xbf */
145
146 /* invalid sequence if the first char is either 0xc0 or 0xc1 */
147 if ( number_of_bytes_in_utf8_char[((uchar_t)**inbuf)] == ICV_TYPE_ILLEGAL_CHAR )
148 errno = EILSEQ;
149 else {
150 st->_ustate = U1;
151 st->_cbuf[0] = **inbuf;
152 }
153 } else if ((**inbuf & 0xf0) == 0xe0) { /* 0xe0..0xef */
154 st->_ustate = U2;
155 st->_cbuf[0] = **inbuf;
156 } else {
157 /* four bytes of UTF-8 sequences */
158 if ( number_of_bytes_in_utf8_char[((uchar_t)**inbuf)] == ICV_TYPE_ILLEGAL_CHAR )
159 errno = EILSEQ;
160 else
161 {
162 st->_ustate = U4;
163 st->_cbuf[0] = **inbuf;
164 }
165 }
166 break;
167 case U1:
168 if ((**inbuf & 0xc0) == MSB) { /* Two-byte UTF */
169 c1 = (st->_cbuf[0]&0x1c)>>2;
170 c2 = ((st->_cbuf[0]&0x03)<<6) | ((**inbuf)&0x3f);
171 n = unicode_to_gb_to_hz(c1, c2, *outbuf, *outbytesleft);
172 if (n > 0) {
173 (*outbuf) += n, (*outbytesleft) -= n;
174 } else {
175 errno = E2BIG;
176 return ((size_t) -1);
177 }
178 st->saved_ustate = U1;
179 st->_ustate = U0;
180 } else {
181 errno = EILSEQ;
182 }
183 break;
184 case U2:
185 st->saved_ustate = U2;
186
187 first_byte = st->_cbuf[0];
188
189 /* if the first byte is 0xed, it is illegal sequence if the second
190 * one is between 0xa0 and 0xbf because surrogate section is ill-formed
191 */
192 if (((uchar_t)**inbuf) < valid_min_2nd_byte[first_byte] ||
193 ((uchar_t)**inbuf) > valid_max_2nd_byte[first_byte] )
194 errno = EILSEQ;
195 else {
196 st->_ustate = U3;
197 st->_cbuf[1] = **inbuf;
198 }
199 break;
200 case U3:
201 if ((**inbuf & 0xc0) == MSB) { /* Three-byte UTF */
202 c1 = ((st->_cbuf[0]&0x0f)<<4) | ((st->_cbuf[1]&0x3c)>>2);
203 c2 = ((st->_cbuf[1]&0x03)<<6) | ((**inbuf)&0x3f);
204 n = unicode_to_gb_to_hz(c1, c2, *outbuf, *outbytesleft);
205 if (n > 0) {
206 (*outbuf) += n, (*outbytesleft) -= n;
207 } else if ( n == -1 ) { /* unicode is either 0xFFFE or 0xFFFF */
208 errno = EILSEQ;
209 } else {
210 errno = E2BIG;
211 return ((size_t)-1);
212 }
213 st->saved_ustate = U3;
214 st->_ustate = U0;
215 } else {
216 errno = EILSEQ;
217 break;
218 }
219 break;
220 case U4:
221
222 first_byte = st->_cbuf[0];
223
224 /* if the first byte is 0xf0, it is illegal sequence if
225 * the second one is between 0x80 and 0x8f
226 * for Four-Byte UTF: U+10000..U+10FFFF
227 */
228 if (((uchar_t)**inbuf) < valid_min_2nd_byte[first_byte] ||
229 ((uchar_t)**inbuf) > valid_max_2nd_byte[first_byte] )
230 errno = EILSEQ;
231 else
232 {
233 st->_ustate = U5;
234 st->_cbuf[1] = **inbuf;
235 st->saved_ustate = U4;
236 }
237 break;
238 case U5:
239 if ((**inbuf & 0xc0) == MSB) /* 0x80..0xbf */
240 {
241 st->_ustate = U6;
242 st->_cbuf[2] = **inbuf;
243 st->saved_ustate = U5;
244 }
245 else
246 errno = EILSEQ;
247 break;
248 case U6:
249 if ((**inbuf & 0xc0) == MSB) /* 0x80..0xbf */
250 {
251 /* replace with double NON_ID_CHARs */
252 if ( *outbytesleft < 2 )
253 errno = E2BIG;
254 else
255 {
256 **outbuf = NON_ID_CHAR;
257 *(*outbuf+1) = NON_ID_CHAR;
258 (*outbytesleft) -= 2;
259
260 st->_ustate = U0;
261 st->saved_ustate = U6;
262 }
263 }
264 else
265 errno = EILSEQ;
266 break;
267 }
268
269 if (errno)
270 return ((size_t)-1);
271 (*inbuf)++; (*inbytesleft)--;
272 }
273
274 if (*inbytesleft == 0 && st->_ustate != U0)
275 {
276 errno = EINVAL;
277 return ((size_t) -1);
278 }
279
280 if (*inbytesleft > 0 && *outbytesleft == 0) {
281 errno = E2BIG;
282 return ((size_t)-1);
283 }
284 return ((size_t)(*inbytesleft));
285 }
286
287 /* return value: 0 - no enough space to hold the HZ-GB-2312 code
288 * -1 - illegal sequence
289 * >0 - buffer length
290 */
unicode_to_gb_to_hz(in_byte1,in_byte2,buf,buflen)291 int unicode_to_gb_to_hz(in_byte1, in_byte2, buf, buflen)
292 char in_byte1, in_byte2;
293 char *buf;
294 int buflen;
295 {
296 int gb, unicode;
297 int i, l, h;
298
299 if (buflen < 2)
300 return 0;
301 unicode = ((in_byte1 & 0xff) << 8) + (in_byte2 & 0xff);
302
303 /* 0xfffe and 0xffff should not be allowed */
304 if ( unicode == 0xFFFE || unicode == 0xFFFF ) return -1;
305
306 for (l = 0, h = UNICODEMAX; l < h; ) {
307 if (unicode_gb_tab[l].key == unicode) {
308 i = l;
309 break;
310 }
311 if (unicode_gb_tab[h].key == unicode) {
312 i = h;
313 break;
314 }
315 i = (l + h) / 2;
316 if (unicode_gb_tab[i].key == unicode)
317 break;
318 if (unicode_gb_tab[i].key < unicode)
319 l = i + 1;
320 else h = i - 1;
321 }
322 if (unicode == unicode_gb_tab[i].key) {
323 gb = unicode_gb_tab[i].value;
324 *buf = ((gb & 0xff00) >> 8) & 0x7f;
325 *(buf+1) = (gb & 0xff) & 0x7f;
326 } else {
327 *buf = NON_ID_CHAR;
328 *(buf+1) = NON_ID_CHAR;
329 }
330 return 2;
331 }
332