1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright (c) 1997, by Sun Microsystems, Inc.
24 * All rights reserved.
25 */
26
27 #include <stdio.h>
28 #include <stdlib.h>
29 #include <errno.h>
30 #include <sys/types.h>
31
32 #include "tab_lookup.h" /* table lookup data types */
33
/* Bit masks applied to individual bytes. */
#define	MSB	0x80	/* most significant bit */
#define	ONEBYTE	0xff	/* right most byte */

/*
 * States of the UTF-8 decoder driven by _icv_iconv().
 * The enumerator order (and therefore the numeric values) is significant
 * to anything that stores a state value, so it must not be changed.
 */
enum _USTATE {
	U0,	/* between characters; ASCII is copied through directly */
	U1,	/* lead byte of a 2-byte sequence seen; expect its 2nd byte */
	U11,	/* lead byte that is neither a 2- nor 3-byte lead seen; */
		/* the next byte is taken as-is */
	U2,	/* lead byte of a 3-byte sequence seen; expect its 2nd byte */
	U3,	/* 2nd byte of a 3-byte sequence seen; expect its 3rd byte */
	U4	/* sequence complete; look it up and emit the result */
};
38
39
40
41
42 /*
43 * Actual conversion; called from iconv()
44 * Input is UTF-8 data.
45 * first convert to UCS2
46 */
47 size_t
_icv_iconv(_icv_state * st,char ** inbuf,size_t * inbytesleft,char ** outbuf,size_t * outbytesleft)48 _icv_iconv(_icv_state *st, char **inbuf, size_t *inbytesleft,
49 char **outbuf, size_t *outbytesleft)
50 {
51 /*
52 * Actual conversion; called from iconv()
53 */
54 /*=========================================================
55 *
56 * State Machine for interpreting UTF8 code
57 *
58 *=========================================================
59 *
60 * 3 byte unicode
61 * +----->------->-------+
62 * | |
63 * ^ v
64 * | 2 byte U2 ---> U3
65 * | unicode v
66 * +------> U0 -------> U1 +-------->U4---+
67 * ^ ascii | | ^ |
68 * | | +-------->--------->--------+ |
69 * | v v
70 * +----<---+-----<------------<------------<------------+
71 *
72 * +----<---+-----<------------<------------<------------+
73 *
74 *=========================================================*/
75
76 char c1, c2;
77 int n, unidx;
78 unsigned long ibm_code;
79
80 #ifdef DEBUG
81 fprintf(stderr, "========== iconv(): UTF8 --> IBM ==========\n");
82 #endif
83
84 if (st == NULL) {
85 errno = EBADF;
86 return ((size_t) -1);
87 }
88
89 if (inbuf == NULL || *inbuf == NULL) { /* Reset request. */
90 st->ustate = U0;
91 st->_errno = 0;
92 return ((size_t) 0);
93 }
94
95 st->_errno = 0; /* reset internal errno */
96 errno = 0; /* reset external errno */
97
98 /* a state machine for interpreting UTF8 code */
99 while (*inbytesleft > 0 && *outbytesleft > 0) {
100 switch (st->ustate) {
101 case U0: /* assuming ASCII in the beginning */
102 if ((**inbuf & MSB) == 0) { /* ASCII */
103 **outbuf = **inbuf;
104 (*outbuf)++;
105 (*outbytesleft)--;
106 } else { /* Chinese character */
107 if ((**inbuf & 0xe0) == 0xc0) { /* 2 byte unicode */
108 st->ustate = U1;
109 st->keepc[0] = **inbuf;
110 } else if ((**inbuf & 0xf0) == 0xe0) { /* 3 byte */
111 st->ustate = U2;
112 st->keepc[0] = **inbuf;
113 } else { /* illegal unicode */
114 /* st->_errno = errno = EINVAL; */
115 /* possible UNICODE ko_KR-UTF8 */
116 c1 =st->keepc[0] = **inbuf;
117 st->ustate = U11;
118 break;
119 }
120 }
121 break;
122 case U1: /* 2 byte unicode */
123 if ((**inbuf & 0xc0) == MSB) {
124 st->ustate = U4;
125 st->keepc[1] = **inbuf;
126 c1 = (st->keepc[0]&0x1c)>>2;
127 c2 = ((st->keepc[0]&0x03)<<6) | ((**inbuf)&0x3f);
128 #ifdef DEBUG
129 fprintf(stderr, "UTF8: %02x%02x --> ",
130 st->keepc[0]&ONEBYTE, st->keepc[1]&ONEBYTE);
131 #endif
132 continue; /* should not advance *inbuf */
133 } else {
134 st->_errno = errno = EINVAL;
135 }
136 break;
137 case U11: /* 3 byte unicode - 2nd byte */
138 c2 =st->keepc[1] = **inbuf;
139 st->ustate = U4;
140 continue;
141 break;
142 case U2: /* 3 byte unicode - 2nd byte */
143 if ((**inbuf & 0xc0) == MSB) {
144 st->ustate = U3;
145 st->keepc[1] = **inbuf;
146 } else {
147 st->_errno = errno = EINVAL;
148 }
149 break;
150 case U3: /* 3 byte unicode - 3rd byte */
151 if ((**inbuf & 0xc0) == MSB) {
152 st->ustate = U4;
153 st->keepc[2] = **inbuf;
154 c1 = ((st->keepc[0]&0x0f)<<4) |
155 ((st->keepc[1]&0x3c)>>2);
156 c2 = ((st->keepc[1]&0x03)<<6) | ((**inbuf)&0x3f);
157 #ifdef DEBUG
158 fprintf(stderr, "UTF8: %02x%02x%02x --> ", st->keepc[0]&ONEBYTE,
159 st->keepc[1]&ONEBYTE, **inbuf&ONEBYTE);
160 #endif
161 continue; /* should not advance *inbuf */
162 } else {
163 st->_errno = errno = EINVAL;
164 }
165 break;
166 case U4:
167 n = get_ibm_by_utf(st, c1, c2, &unidx, &ibm_code);
168 if (n != 0) { /* legal unicode;illegal Big5 */
169 st->_errno = errno = EILSEQ;
170 break;
171 }
172
173 n = utf8_to_ibm(unidx, ibm_code,
174 *outbuf, *outbytesleft);
175 if (n > 0) {
176 (*outbuf) += n;
177 (*outbytesleft) -= n;
178 } else {
179 st->_errno = errno;
180 return((size_t)-1);
181 }
182 st->ustate = U0;
183 st->_errno = 0;
184 break;
185 default: /* should never come here */
186 st->_errno = errno = EILSEQ;
187 st->ustate = U0; /* reset state */
188 break;
189 }
190
191 (*inbuf)++;
192 (*inbytesleft)--;
193
194 if (st->_errno) {
195 #ifdef DEBUG
196 fprintf(stderr, "!!!!!\tst->_errno = %d\tst->ustate = %d\n",
197 st->_errno, st->ustate);
198 #endif
199 break;
200 }
201
202 if (errno)
203 return((size_t)-1);
204 }
205
206 if (*outbytesleft == 0) {
207 errno = E2BIG;
208 return((size_t)-1);
209 }
210 return (*inbytesleft);
211 }
212
213
214 /*
215 * Match IBM code by UTF8 code;
216 * Return: = 0 - match from Unicode to IBM found
217 * = 1 - match from Unicode to IBM NOT found
218 *
219 * Since binary search of the UTF8 to IBM table is necessary, might as well
220 * return index and IBM code matching to the unicode.
221 */
get_ibm_by_utf(st,c1,c2,unidx,ibm_code)222 int get_ibm_by_utf(st, c1, c2, unidx, ibm_code)
223 _icv_state *st;
224 char c1, c2;
225 int *unidx;
226 unsigned long *ibm_code;
227 {
228 unsigned long unicode;
229
230 unicode = (unsigned long) ((c1 & ONEBYTE) << 8) + (c2 & ONEBYTE);
231 *unidx = bisearch(unicode, st, st->table_size);
232 if ((*unidx) >= 0)
233 {
234 if ( st->left_to_right )
235 *ibm_code = st->table[*unidx].right_code;
236 else
237 *ibm_code = st->table[*unidx].left_code;
238 }
239 else
240 ; /* match from UTF8 to IBM not found */
241 #ifdef DEBUG
242 fprintf(stderr, "Unicode=%04x, idx=%5d, IBM=%x ", unicode, *unidx, *ibm_code);
243 #endif
244
245 return(0);
246 }
247
248
249 /*
250 * ISO/IEC 10646 (Unicode) --> IBM
251 * Unicode --> UTF8 (FSS-UTF)
252 * (File System Safe Universal Character Set Transformation Format)
253 * Return: > 0 - converted with enough space in output buffer
254 * = 0 - no space in outbuf
255 */
utf8_to_ibm(unidx,ibm_code,buf,buflen)256 int utf8_to_ibm(unidx, ibm_code, buf, buflen)
257 int unidx;
258 unsigned long ibm_code;
259 char *buf;
260 size_t buflen;
261
262 {
263 unsigned long val; /* IBM value */
264 char c1, c2, ibm_str[3];
265
266 if (unidx < 0) /* no match from UTF8 to IBM */
267 ibm_code = (unsigned long)NON_ID_CHAR;
268
269 {
270 val = ibm_code & 0xffff;
271 c1 = (char) ((val & 0xff00) >> 8);
272 c2 = (char) (val & 0xff);
273 }
274
275 *buf = ibm_str[0] = c1;
276 *(buf+1) = ibm_str[1] = c2;
277 ibm_str[2] = NULL;
278
279 #ifdef DEBUG
280 fprintf(stderr, "\t->%x %x<-\n", *buf, *(buf+1));
281 #endif
282
283
284 if (buflen < 2) {
285 errno = E2BIG;
286 return(0);
287 }
288
289 return(2);
290 }
291