1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (c) 2008, by Sun Microsystems, Inc.
23 * All rights reserved.
24 */
25
26 #include <stdio.h>
27 #include <errno.h>
28 #include <stdlib.h>
29 #include <sys/types.h>
30 #define __NEED_UNI_2_VISCII__
31 #include <unicode_viscii.h> /* Unicode to viscii mapping table */
32 #include "common_defs.h"
33
/* Bit masks and the substitution character used by the converter. */
#define	MSB		0x80	/* top bit of a byte; set on every non-ASCII UTF-8 byte */
#define	ONEBYTE		0xff	/* mask that keeps only the low-order byte */

#define	NON_ID_CHAR	'?'	/* written when a character cannot be converted */
38
39
40
/*
 * Conversion state kept for one open conversion descriptor.
 * It is allocated by _icv_open() and released by _icv_close().
 */
struct _icv_state {
	char	keepc[6];	/* bytes of the UTF-8 character being assembled */
	short	ustate;		/* current state of the UTF-8 state machine */
	int	_errno;		/* internal copy of the last error number */
};

typedef struct _icv_state _iconv_st;
46
/* States of the UTF-8 decoding state machine driven by _icv_iconv(). */
enum _USTATE {
	U0 = 0,		/* initial state: expecting ASCII or a lead byte */
	U1 = 1,		/* 2-byte sequence: expecting the 2nd byte */
	U2 = 2,		/* 3-byte sequence: expecting the 2nd byte */
	U3 = 3,		/* 3-byte sequence: expecting the 3rd byte */
	U4 = 4,		/* 2/3-byte character complete: convert and emit it */
	U5 = 5,		/* 4-byte sequence: expecting the 2nd byte */
	U6 = 6,		/* 4-byte sequence: expecting the 3rd byte */
	U7 = 7		/* 4-byte sequence: expecting the 4th byte */
};
48
49
50 /*
51 * Open; called from iconv_open()
52 */
53 void *
_icv_open()54 _icv_open()
55 {
56 _iconv_st *st;
57
58 if ((st = (_iconv_st *)malloc(sizeof(_iconv_st))) == NULL) {
59 errno = ENOMEM;
60 return ((void *) -1);
61 }
62
63 st->ustate = U0;
64 st->_errno = 0;
65 return ((void *) st);
66 }
67
68
69 /*
70 * Close; called from iconv_close()
71 */
72 void
_icv_close(_iconv_st * st)73 _icv_close(_iconv_st *st)
74 {
75 if (!st)
76 errno = EBADF;
77 else
78 free(st);
79 }
80
81
82 /*
83 * Actual conversion; called from iconv()
84 */
85 /*=========================================================
86 *
87 * State Machine for interpreting UTF8 code
88 *
89 *=========================================================
90 * 4 byte unicode
91 * +----->------->------------> U5 -----> U6-------> U7---+
92 * | |
93 * | 3 byte unicode |
94 * +----->------->-------+ |
95 * | | |
96 * ^ v |
97 * | 2 byte U2 ---> U3 |
98 * | unicode v |
99 * +------> U0 -------> U1 +-------->U4---+ |
100 * ^ ascii | | ^ | |
101 * | | +-------->--------->--------+ | |
102 * | v v V
103 * +----<---+-----<------------<------------<------------+---------+
104 *
105 *=========================================================*/
106 size_t
_icv_iconv(_iconv_st * st,char ** inbuf,size_t * inbytesleft,char ** outbuf,size_t * outbytesleft)107 _icv_iconv(_iconv_st *st, char **inbuf, size_t *inbytesleft,
108 char **outbuf, size_t *outbytesleft)
109 {
110 char c1 = '\0', c2 = '\0';
111 int uconv_num = 0;
112 unsigned long uni = 0;
113 int utf8_len = 0;
114
115 #ifdef DEBUG
116 fprintf(stderr, "========== iconv(): UTF2 --> GBK2K ==========\n");
117 #endif
118 if (st == NULL) {
119 errno = EBADF;
120 return ((size_t) -1);
121 }
122
123 if (inbuf == NULL || *inbuf == NULL) { /* Reset request. */
124 st->ustate = U0;
125 st->_errno = 0;
126 return ((size_t) 0);
127 }
128
129 st->_errno = 0; /* reset internal errno */
130 errno = 0; /* reset external errno */
131
132 /* a state machine for interpreting UTF8 code */
133 while (*inbytesleft > 0 && *outbytesleft > 0) {
134
135 uchar_t first_byte;
136 unsigned short ch = 0;
137 switch (st->ustate) {
138 case U0:
139 /*
140 * assuming ASCII in the beginning
141 */
142 if ((**inbuf & MSB) == 0) { /* ASCII */
143 **outbuf = **inbuf;
144 (*outbuf)++;
145 (*outbytesleft)--;
146 } else {
147 if ((**inbuf & 0xe0) == 0xc0) {
148 /* 2 byte unicode 0xc0..0xdf */
149 /* invalid sequence if the first char is either 0xc0 or 0xc1 */
150 if ( number_of_bytes_in_utf8_char[((uchar_t)**inbuf)] == ICV_TYPE_ILLEGAL_CHAR )
151 st->_errno = errno = EILSEQ;
152 else {
153 st->ustate = U1;
154 st->keepc[0] = **inbuf;
155 }
156 } else if ((**inbuf & 0xf0) == 0xe0) { /* 3 byte 0xe0..0xf0 */
157 st->ustate = U2;
158 st->keepc[0] = **inbuf;
159 } else {
160 /* four bytes of UTF-8 sequences */
161 if ( number_of_bytes_in_utf8_char[((uchar_t)**inbuf)] == ICV_TYPE_ILLEGAL_CHAR )
162 st->_errno = errno = EILSEQ;
163 else {
164 st->ustate = U5;
165 st->keepc[0] = **inbuf;
166 }
167 }
168 }
169 break;
170 case U1:
171 /* 2 byte utf-8 encoding */
172 if ((**inbuf & 0xc0) == MSB) {
173 utf8_len = 2;
174 st->keepc[1] = **inbuf;
175
176 c1 = (st->keepc[0]&0x1c)>>2;
177 c2 = ((st->keepc[0]&0x03)<<6) | ((st->keepc[1])&0x3f);
178 st->ustate = U4;
179 #ifdef DEBUG
180 fprintf(stderr, "UTF8: %02x%02x --> ",
181 st->keepc[0]&ONEBYTE, st->keepc[1]&ONEBYTE);
182 #endif
183 continue; /* should not advance *inbuf */
184 } else {
185 st->_errno = errno = EILSEQ;
186 }
187 break;
188 case U2:
189 /* 3 byte unicode - 2nd byte */
190 first_byte = (uchar_t)st->keepc[0];
191 /* if the first byte is 0xed, it is illegal sequence if the second
192 * one is between 0xa0 and 0xbf because surrogate section is ill-formed
193 */
194 if (((uchar_t)**inbuf) < valid_min_2nd_byte[first_byte] ||
195 ((uchar_t)**inbuf) > valid_max_2nd_byte[first_byte] )
196 st->_errno = errno = EILSEQ;
197 else {
198 st->ustate = U3;
199 st->keepc[1] = **inbuf;
200 }
201 break;
202 case U3:
203 /* 3 byte unicode - 3rd byte */
204 if ((**inbuf & 0xc0) == MSB) {
205 st->ustate = U4;
206 utf8_len = 3;
207 st->keepc[2] = **inbuf;
208 c1 = ((st->keepc[0]&0x0f)<<4) |
209 ((st->keepc[1]&0x3c)>>2);
210 c2 = ((st->keepc[1]&0x03)<<6) | ((**inbuf)&0x3f);
211 #ifdef DEBUG
212 fprintf(stderr, "UTF8: %02x%02x%02x --> ", st->keepc[0]&ONEBYTE,
213 st->keepc[1]&ONEBYTE, **inbuf&ONEBYTE);
214 #endif
215 continue; /* should not advance *inbuf */
216 } else {
217 st->_errno = errno = EILSEQ;
218 }
219 break;
220 case U4:
221 uni = (unsigned long) ((c1 & ONEBYTE) << 8) + (c2 & ONEBYTE);
222 if (!uni_2_viscii(uni, (unsigned char*)&ch)) {
223 **outbuf = NON_ID_CHAR;
224 uconv_num += utf8_len;
225 } else {
226 **outbuf = ch;
227 }
228 (*outbuf)++;
229 (*outbytesleft)--;
230 st->ustate = U0;
231 break;
232 case U5:
233 first_byte = st->keepc[0];
234
235 /* if the first byte is 0xf0, it is illegal sequence if
236 * the second one is between 0x80 and 0x8f
237 * for Four-Byte UTF: U+10000..U+10FFFF
238 * */
239 if (((uchar_t)**inbuf) < valid_min_2nd_byte[first_byte] ||
240 ((uchar_t)**inbuf) > valid_max_2nd_byte[first_byte] )
241 st->_errno = errno = EILSEQ;
242 else {
243 st->ustate = U6;
244 st->keepc[1] = **inbuf;
245 }
246 break;
247 case U6:
248 if ((**inbuf & 0xc0) == MSB) {
249 /* 0x80..0xbf */
250 st->ustate = U7;
251 st->keepc[2] = **inbuf;
252 } else
253 st->_errno = errno = EILSEQ;
254 break;
255 case U7:
256 if ((**inbuf & 0xc0) == MSB) {
257 /* 0x80..0xbf */
258 /* replace with double NON_ID_CHARs */
259 if ( *outbytesleft < 1 )
260 st->_errno = errno = E2BIG;
261 else {
262 **outbuf = NON_ID_CHAR;
263 (*outbytesleft) -= 1;
264 uconv_num++;
265 st->ustate = U0;
266 }
267 } else
268 st->_errno = errno = EILSEQ;
269 break;
270 default:
271 /* should never come here */
272 st->_errno = errno = EILSEQ;
273 st->ustate = U0; /* reset state */
274 break;
275 }
276
277 if (st->_errno) {
278 #ifdef DEBUG
279 fprintf(stderr, "!!!!!\tst->_errno = %d\tst->ustate = %d\n",
280 st->_errno, st->ustate);
281 #endif
282 break;
283 }
284
285 (*inbuf)++;
286 (*inbytesleft)--;
287 }
288
289 if (*inbytesleft == 0 && st->ustate != U0)
290 errno = EINVAL;
291
292 if (*inbytesleft > 0 && *outbytesleft == 0)
293 errno = E2BIG;
294
295 if (errno) {
296 int num_reversed_bytes = 0;
297
298 switch (st->ustate)
299 {
300 case U1:
301 num_reversed_bytes = 1;
302 break;
303 case U2:
304 num_reversed_bytes = 1;
305 break;
306 case U3:
307 num_reversed_bytes = 2;
308 break;
309 case U4:
310 num_reversed_bytes = utf8_len - 1;
311 break;
312 case U5:
313 num_reversed_bytes = 1;
314 break;
315 case U6:
316 num_reversed_bytes = 2;
317 break;
318 case U7:
319 num_reversed_bytes = 3;
320 break;
321 }
322
323 /*
324 * if error, *inbuf points to the byte following the last byte
325 * successfully used in conversion.
326 */
327 *inbuf -= num_reversed_bytes;
328 *inbytesleft += num_reversed_bytes;
329 st->ustate = U0;
330
331 return ((size_t) -1);
332 }
333
334 return uconv_num;
335 }
336