1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright (c) 1997, by Sun Microsystems, Inc.
24 * All rights reserved.
25 */
26
27 #include <stdio.h>
28 #include <stdlib.h>
29 #include <errno.h>
30 #include <sys/types.h>
31
32 #include "tab_lookup.h" /* table lookup data types */
33
#define MSB 0x80 /* most significant bit */
#define ONEBYTE 0xff /* right most byte */

/*
 * States of the UTF-8 decoder in _icv_iconv():
 *   U0  - expecting the first byte of a character
 *   U1  - lead byte of a 2-byte sequence seen; expecting its 2nd byte
 *   U11 - first byte was neither ASCII nor a 2-/3-byte lead byte; the next
 *         byte is taken unchecked as the low byte of the code
 *   U2  - lead byte of a 3-byte sequence seen; expecting its 2nd byte
 *   U3  - expecting the 3rd byte of a 3-byte sequence
 *   U4  - a complete code has been collected; look it up and emit it
 */
enum _USTATE { U0, U1, U11, U2, U3, U4 };


/*
 * Look up the 16-bit code (c1 << 8 | c2) in the table held by st; stores the
 * table index in *unidx (negative when there is no match) and, on a match,
 * the mapped code in *ibm_code.
 */
int get_ibm_by_utf(_icv_state *st, char c1, char c2, int *unidx,
	unsigned long *ibm_code);

/*
 * Not defined in this part of the file.  Presumably a binary search of the
 * n-entry table in st for val that returns the matching index, or a negative
 * value when val is absent -- verify against the definition.
 */
int bisearch(unsigned long val, _icv_state *st, int n);

/*
 * Write the IBM code for one character (plus any shift byte required by
 * st->shift) into buf; returns the number of bytes written, or 0 when buflen
 * is too small.
 */
int utf8_to_ibm(int unidx, unsigned long ibm_code, char *buf,
	size_t buflen, _icv_state *st);
47
48 /*
49 * Actual conversion; called from iconv()
50 * Input is UTF-8 data.
51 * first convert to UCS2
52 */
53 size_t
_icv_iconv(_icv_state * st,char ** inbuf,size_t * inbytesleft,char ** outbuf,size_t * outbytesleft)54 _icv_iconv(_icv_state *st, char **inbuf, size_t *inbytesleft,
55 char **outbuf, size_t *outbytesleft)
56 {
57 /*
58 * Actual conversion; called from iconv()
59 */
60 /*=========================================================
61 *
62 * State Machine for interpreting UTF8 code
63 *
64 *=========================================================
65 *
66 * 3 byte unicode
67 * +----->------->-------+
68 * | |
69 * ^ v
70 * | 2 byte U2 ---> U3
71 * | unicode v
72 * +------> U0 -------> U1 +-------->U4---+
73 * ^ ascii | | ^ |
74 * | | +-------->--------->--------+ |
75 * | v v
76 * +----<---+-----<------------<------------<------------+
77 *
78 * +----<---+-----<------------<------------<------------+
79 *
80 *=========================================================*/
81
82 char c1 = '\0', c2 = '\0';
83 int n, unidx;
84 unsigned long ibm_code;
85
86 #ifdef DEBUG
87 fprintf(stderr, "========== iconv(): UTF8 --> IBM ==========\n");
88 #endif
89
90 if (st == NULL) {
91 errno = EBADF;
92 return ((size_t) -1);
93 }
94
95 if (inbuf == NULL || *inbuf == NULL) { /* Reset request. */
96 st->ustate = U0;
97 st->_errno = 0;
98 st->shift = SHIFT_IN;
99 return ((size_t) 0);
100 }
101
102 st->_errno = 0; /* reset internal errno */
103 errno = 0; /* reset external errno */
104
105 /* a state machine for interpreting UTF8 code */
106 while (*inbytesleft > 0 && *outbytesleft > 0) {
107 switch (st->ustate) {
108 case U0:
109 /* it is ascii, convert it immediately */
110 if ((**inbuf & MSB) == 0) { /* ASCII */
111 st->ustate = U4;
112 st->keepc[0] = **inbuf;
113 c1 = 0x0;
114 c2 = **inbuf;
115 continue;
116 } else { /* Chinese character */
117 if ((**inbuf & 0xe0) == 0xc0) { /* 2 byte unicode */
118 st->ustate = U1;
119 st->keepc[0] = **inbuf;
120 } else if ((**inbuf & 0xf0) == 0xe0) { /* 3 byte */
121 st->ustate = U2;
122 st->keepc[0] = **inbuf;
123 } else { /* illegal unicode */
124 /* st->_errno = errno = EINVAL; */
125 /* possible UNICODE ko_KR-UTF8 */
126 c1 =st->keepc[0] = **inbuf;
127 st->ustate = U11;
128 break;
129 }
130 }
131 break;
132 case U1: /* 2 byte unicode */
133 if ((**inbuf & 0xc0) == MSB) {
134 st->ustate = U4;
135 st->keepc[1] = **inbuf;
136 c1 = (st->keepc[0]&0x1c)>>2;
137 c2 = ((st->keepc[0]&0x03)<<6) | ((**inbuf)&0x3f);
138 #ifdef DEBUG
139 fprintf(stderr, "UTF8: %02x%02x --> ",
140 st->keepc[0]&ONEBYTE, st->keepc[1]&ONEBYTE);
141 #endif
142 continue; /* should not advance *inbuf */
143 } else {
144 st->_errno = errno = EINVAL;
145 }
146 break;
147 case U11: /* 3 byte unicode - 2nd byte */
148 c2 =st->keepc[1] = **inbuf;
149 st->ustate = U4;
150 continue;
151 break;
152 case U2: /* 3 byte unicode - 2nd byte */
153 if ((**inbuf & 0xc0) == MSB) {
154 st->ustate = U3;
155 st->keepc[1] = **inbuf;
156 } else {
157 st->_errno = errno = EINVAL;
158 }
159 break;
160 case U3: /* 3 byte unicode - 3rd byte */
161 if ((**inbuf & 0xc0) == MSB) {
162 st->ustate = U4;
163 st->keepc[2] = **inbuf;
164 c1 = ((st->keepc[0]&0x0f)<<4) |
165 ((st->keepc[1]&0x3c)>>2);
166 c2 = ((st->keepc[1]&0x03)<<6) | ((**inbuf)&0x3f);
167 #ifdef DEBUG
168 fprintf(stderr, "UTF8: %02x%02x%02x --> ", st->keepc[0]&ONEBYTE,
169 st->keepc[1]&ONEBYTE, **inbuf&ONEBYTE);
170 #endif
171 continue; /* should not advance *inbuf */
172 } else {
173 st->_errno = errno = EINVAL;
174 }
175 break;
176 case U4:
177 n = get_ibm_by_utf(st, c1, c2, &unidx, &ibm_code);
178 if (n != 0) { /* legal unicode;illegal Big5 */
179 st->_errno = errno = EILSEQ;
180 break;
181 }
182
183 n = utf8_to_ibm(unidx, ibm_code,
184 *outbuf, *outbytesleft, st);
185 if (n > 0) {
186 (*outbuf) += n;
187 (*outbytesleft) -= n;
188 } else {
189 st->_errno = errno;
190 return((size_t)-1);
191 }
192 st->ustate = U0;
193 st->_errno = 0;
194 break;
195 default: /* should never come here */
196 st->_errno = errno = EILSEQ;
197 st->ustate = U0; /* reset state */
198 break;
199 }
200
201 (*inbuf)++;
202 (*inbytesleft)--;
203
204 if (st->_errno) {
205 #ifdef DEBUG
206 fprintf(stderr, "!!!!!\tst->_errno = %d\tst->ustate = %d\n",
207 st->_errno, st->ustate);
208 #endif
209 break;
210 }
211
212 if (errno)
213 return((size_t)-1);
214 }
215
216 if (*outbytesleft == 0) {
217 errno = E2BIG;
218 return((size_t)-1);
219 }
220 return (*inbytesleft);
221 }
222
223
224 /*
225 * Match IBM code by UTF8 code;
226 * Return: = 0 - match from Unicode to IBM found
227 * = 1 - match from Unicode to IBM NOT found
228 *
229 * Since binary search of the UTF8 to IBM table is necessary, might as well
230 * return index and IBM code matching to the unicode.
231 */
get_ibm_by_utf(st,c1,c2,unidx,ibm_code)232 int get_ibm_by_utf(st, c1, c2, unidx, ibm_code)
233 _icv_state *st;
234 char c1, c2;
235 int *unidx;
236 unsigned long *ibm_code;
237 {
238 unsigned long unicode;
239
240 unicode = (unsigned long) ((c1 & ONEBYTE) << 8) + (c2 & ONEBYTE);
241 *unidx = bisearch(unicode, st, st->table_size);
242 if ((*unidx) >= 0)
243 {
244 if ( st->left_to_right )
245 *ibm_code = st->table[*unidx].right_code;
246 else
247 *ibm_code = st->table[*unidx].left_code;
248 }
249 #ifdef DEBUG
250 fprintf(stderr, "Unicode=%04x, idx=%5d, IBM=%x ", unicode, *unidx, *ibm_code);
251 #endif
252
253 return(0);
254 }
255
256
257 /*
258 * ISO/IEC 10646 (Unicode) --> IBM
259 * Unicode --> UTF8 (FSS-UTF)
260 * (File System Safe Universal Character Set Transformation Format)
261 * Return: > 0 - converted with enough space in output buffer
262 * = 0 - no space in outbuf
263 */
utf8_to_ibm(int unidx,unsigned long ibm_code,char * buf,size_t buflen,_icv_state * st)264 int utf8_to_ibm(int unidx, unsigned long ibm_code, char *buf, size_t buflen,
265 _icv_state *st)
266 {
267 unsigned long val; /* IBM value */
268 char c1, c2, ibm_str[3];
269
270 if (unidx < 0) /* no match from UTF8 to IBM */
271 ibm_code = (unsigned long)NON_ID_CHAR;
272
273 {
274 val = ibm_code & 0xffff;
275 c1 = (char) ((val & 0xff00) >> 8);
276 c2 = (char) (val & 0xff);
277 }
278
279 /* it is single byte ascii */
280 if ( c1 == 0x0 ) {
281 if ( st->shift == SHIFT_OUT ) {
282 if (buflen < 2) {
283 errno = E2BIG;
284 return 0;
285 }
286 *buf = SHIFT_IN;
287 *(buf+1) = c2;
288 st->shift = SHIFT_IN;
289 return 2;
290 }
291 if (buflen < 1) {
292 errno = E2BIG;
293 return 0;
294 }
295 *buf = c2;
296 return 1;
297 }
298
299 /* it is the first two bytes character */
300 if ( st->shift == SHIFT_IN ) {
301 if (buflen < 3) {
302 errno = E2BIG;
303 return 0;
304 }
305 *buf = SHIFT_OUT;
306 st->shift = SHIFT_OUT;
307 *(buf+1) = c1;
308 *(buf+2) = c2;
309 return 3;
310 }
311
312 *buf = ibm_str[0] = c1;
313 *(buf+1) = ibm_str[1] = c2;
314 ibm_str[2] = '\0';
315
316 #ifdef DEBUG
317 fprintf(stderr, "\t->%x %x<-\n", *buf, *(buf+1));
318 #endif
319
320
321 if (buflen < 2) {
322 errno = E2BIG;
323 return(0);
324 }
325
326 return(2);
327 }
328