1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright(c) 1998 Sun Microsystems, Inc.
23 * All rights reserved.
24 */
25 #include <stdio.h>
26 #include <errno.h>
27 #include <stdlib.h>
28 #include <sys/types.h>
29 #include <sys/isa_defs.h>
30 #include <unicode_gb2312.h>
31 #include "common_defs.h"
32
33 #define MSB 0x80
34 #define NON_ID_CHAR '?'
35
36 typedef struct _icv_state {
37 short _ustate;
38 char _cbuf[3];
39 boolean little_endian;
40 boolean bom_written;
41 } _iconv_st;
42
43 enum _USTATE { U0, U1, U2, U3, U4, U5, U6 };
44
45 int unicode_to_gb(char, char, char *, int, int *);
46
47 /*
48 * Open; called from iconv_open()
49 */
50 void *
_icv_open()51 _icv_open()
52 {
53 _iconv_st *st;
54
55 if ((st = (_iconv_st *)malloc(sizeof(_iconv_st))) == NULL) {
56 errno = ENOMEM;
57 return ((void *) -1);
58 }
59
60 st->_ustate = U0;
61 st->little_endian = false;
62 st->bom_written = false;
63 #if defined(UCS_2LE)
64 st->little_endian = true;
65 st->bom_written = true;
66 #endif
67 return ((void *)st);
68 }
69
70
71 /*
72 * Close; called from iconv_close()
73 */
74 void
_icv_close(_iconv_st * st)75 _icv_close(_iconv_st *st)
76 {
77 if (st == NULL)
78 errno = EBADF;
79 else
80 free(st);
81 }
82
83
84 /*
85 * Actual conversion; called from iconv()
86 */
87 size_t
_icv_iconv(_iconv_st * st,char ** inbuf,size_t * inbytesleft,char ** outbuf,size_t * outbytesleft)88 _icv_iconv(_iconv_st *st, char **inbuf, size_t*inbytesleft,
89 char **outbuf, size_t*outbytesleft)
90 {
91 char c1, c2;
92 int n;
93 int uconv_num = 0;
94
95 if (st == NULL) {
96 errno = EBADF;
97 return ((size_t)-1);
98 }
99
100 if (inbuf == NULL || *inbuf == NULL) { /* Reset request. */
101 st->_ustate = U0;
102 return ((size_t)0);
103 }
104
105 errno = 0;
106
107 while (*inbytesleft > 0 && *outbytesleft > 0) {
108
109 uchar_t first_byte;
110
111 switch (st->_ustate) {
112 case U0:
113 /*
114 * Code converion for UCS-2LE to support Samba
115 */
116 if (st->little_endian) {
117 st->_ustate = U1;
118 st->_cbuf[0] = **inbuf;
119 }
120 else if ((**inbuf & MSB) == 0) { /* ASCII */
121 **outbuf = **inbuf;
122 (*outbuf)++; (*outbytesleft)--;
123 } else if ((**inbuf & 0xe0) == 0xc0) { /* 0xc2..0xdf */
124
125 /* invalid sequence if the first byte is either 0xc0 or 0xc1 */
126 if ( number_of_bytes_in_utf8_char[((uchar_t)**inbuf)] == ICV_TYPE_ILLEGAL_CHAR )
127 errno = EILSEQ;
128 else {
129 st->_ustate = U1;
130 st->_cbuf[0] = **inbuf;
131 }
132 } else if ((**inbuf & 0xf0) == 0xe0) { /* 0xe0..0xef */
133 st->_ustate = U2;
134 st->_cbuf[0] = **inbuf;
135 } else {
136 /* four bytes of UTF-8 sequence */
137 if ( number_of_bytes_in_utf8_char[((uchar_t)**inbuf)] == ICV_TYPE_ILLEGAL_CHAR )
138 errno = EILSEQ;
139 else
140 {
141 st->_ustate = U4;
142 st->_cbuf[0] = **inbuf;
143 }
144 }
145 break;
146 case U1:
147 if ((**inbuf & 0xc0) == MSB || st->little_endian) { /* Two-byte UTF */
148 int uconv_num_internal = 0;
149
150 /*
151 * Code conversion for UCS-2LE to support Samba
152 */
153 if (st->little_endian) {
154 c1 = **inbuf;
155 c2 = st->_cbuf[0];
156
157 /*
158 * It's ASCII
159 */
160 if (c1 == 0 && (c2 & MSB) == 0) {
161 *(*outbuf)++ = c2;
162 (*outbytesleft) --;
163 st->_ustate = U0;
164 break;
165 }
166 } else {
167 c1 = (st->_cbuf[0]&0x1c)>>2;
168 c2 = ((st->_cbuf[0]&0x03)<<6) | ((**inbuf)&0x3f);
169 }
170 n = unicode_to_gb(c1, c2, *outbuf, *outbytesleft, &uconv_num_internal);
171 if (n > 0) {
172 (*outbuf) += n, (*outbytesleft) -= n;
173
174 uconv_num += uconv_num_internal;
175
176 st->_ustate = U0;
177 } else if (n == 0) {
178 errno = E2BIG;
179 } else { /* n == -1 if unicode is either FFFE or 0xFFFF */
180 errno = EILSEQ;
181 }
182 } else {
183 errno = EILSEQ;
184 }
185 break;
186 case U2:
187
188 first_byte = st->_cbuf[0];
189
190 /* if the first byte is 0xed, it is illegal sequence if the second
191 * one is between 0xa0 and 0xbf because surrogate section is ill-formed
192 */
193 if (((uchar_t)**inbuf) < valid_min_2nd_byte[first_byte] ||
194 ((uchar_t)**inbuf) > valid_max_2nd_byte[first_byte] )
195 errno = EILSEQ;
196 else
197 {
198 st->_ustate = U3;
199 st->_cbuf[1] = **inbuf;
200 }
201 break;
202 case U3:
203 if ((**inbuf & 0xc0) == MSB) { /* Three-byte UTF */
204 int uconv_num_internal = 0;
205
206 c1 = ((st->_cbuf[0]&0x0f)<<4) | ((st->_cbuf[1]&0x3c)>>2);
207 c2 = ((st->_cbuf[1]&0x03)<<6) | ((**inbuf)&0x3f);
208 n = unicode_to_gb(c1, c2, *outbuf, *outbytesleft, &uconv_num_internal);
209 if (n > 0) {
210 (*outbuf) += n, (*outbytesleft) -= n;
211
212 uconv_num += uconv_num_internal;
213
214 st->_ustate = U0;
215 } else if ( n == 0 ) {
216 errno = E2BIG;
217 } else { /* n == -1 if unicode is either 0xFFFE or 0xFFFF */
218 errno = EILSEQ;
219 }
220 } else {
221 errno = EILSEQ;
222 }
223 break;
224 case U4:
225
226 first_byte = st->_cbuf[0];
227
228 /* if the first byte is 0xf0, it is illegal sequence if
229 * the second one is between 0x80 and 0x8f
230 * for Four-Byte UTF: U+10000..U+10FFFF
231 */
232 if (((uchar_t)**inbuf) < valid_min_2nd_byte[first_byte] ||
233 ((uchar_t)**inbuf) > valid_max_2nd_byte[first_byte] )
234 errno = EILSEQ;
235 else
236 {
237 st->_ustate = U5;
238 st->_cbuf[1] = **inbuf;
239 }
240 break;
241 case U5:
242 if ((**inbuf & 0xc0) == MSB) /* 0x80..0xbf */
243 {
244 st->_ustate = U6;
245 st->_cbuf[2] = **inbuf;
246 }
247 else
248 errno = EILSEQ;
249 break;
250 case U6:
251 if ((**inbuf & 0xc0) == MSB) /* 0x80..0xbf */
252 {
253 /* all gb2312 characters are in Unicode Plane 0
254 * so replace these other 16 planes with 0x3f3f
255 */
256 if ( *outbytesleft < 2 )
257 errno = E2BIG;
258 else
259 {
260 **outbuf = NON_ID_CHAR;
261 *(*outbuf+1) = NON_ID_CHAR;
262 (*outbytesleft) -= 2;
263
264 uconv_num++;
265
266 st->_ustate = U0;
267 }
268 }
269 else
270 errno = EILSEQ;
271 break;
272 }
273
274 if (errno) break;
275
276 (*inbuf)++; (*inbytesleft)--;
277 }
278
279 if (*inbytesleft == 0 && st->_ustate != U0)
280 errno = EINVAL;
281
282 if (*inbytesleft > 0 && *outbytesleft == 0)
283 errno = E2BIG;
284
285 if (errno) {
286 int num_reversed_bytes = 0;
287
288 switch (st->_ustate)
289 {
290 case U1:
291 num_reversed_bytes = 1;
292 break;
293 case U2:
294 num_reversed_bytes = 1;
295 break;
296 case U3:
297 num_reversed_bytes = 2;
298 break;
299 case U4:
300 num_reversed_bytes = 1;
301 break;
302 case U5:
303 num_reversed_bytes = 2;
304 break;
305 case U6:
306 num_reversed_bytes = 3;
307 break;
308 }
309
310 /*
311 * if error, *inbuf points to the byte following the last byte
312 * successfully used in conversion.
313 */
314 *inbuf -= num_reversed_bytes;
315 *inbytesleft += num_reversed_bytes;
316 st->_ustate = U0;
317
318 return ((size_t)-1);
319 }
320
321 return uconv_num;
322 }
323
324 /* return values: 0 - no enough space to hold the GB2312 code
325 * -1 - illegal sequence
326 * >0 - buffer length
327 */
unicode_to_gb(char in_byte1,char in_byte2,char * buf,int buflen,int * uconv_num)328 int unicode_to_gb(char in_byte1, char in_byte2, char *buf, int buflen, int *uconv_num)
329 {
330 int gb, unicode;
331 int i, l, h;
332
333 if (buflen < 2)
334 return 0;
335 unicode = ((in_byte1 & 0xff) << 8) + (in_byte2 & 0xff);
336 /* 0xfffe and 0xffff should not be allowed */
337 if ( unicode == 0xFFFE || unicode == 0xFFFF ) return -1;
338
339 for (l = 0, h = UNICODEMAX; l < h; ) {
340 if (unicode_gb_tab[l].key == unicode) {
341 i = l;
342 break;
343 }
344 if (unicode_gb_tab[h].key == unicode) {
345 i = h;
346 break;
347 }
348 i = (l + h) / 2;
349 if (unicode_gb_tab[i].key == unicode)
350 break;
351 if (unicode_gb_tab[i].key < unicode)
352 l = i + 1;
353 else h = i - 1;
354 }
355 if (unicode == unicode_gb_tab[i].key) {
356 gb = unicode_gb_tab[i].value;
357 *buf = ((gb & 0xff00) >> 8) | MSB;
358 *(buf+1) = (gb & 0xff) | MSB;
359 } else {
360 *buf = NON_ID_CHAR;
361 *(buf+1) = NON_ID_CHAR;
362
363 /* non-identical conversion */
364 *uconv_num = 1;
365 }
366
367 return 2;
368 }
369